diff --git a/.gitignore b/.gitignore
index 1c9730a5ad..6aae076a49 100644
--- a/.gitignore
+++ b/.gitignore
@@ -9,6 +9,7 @@ build/
 .pydevproject
 Makefile
 .test_env/
+third_party/
 
 *~
 bazel-*
diff --git a/.gitmodules b/.gitmodules
deleted file mode 100644
index f635e65784..0000000000
--- a/.gitmodules
+++ /dev/null
@@ -1,3 +0,0 @@
-[submodule "warp-ctc"]
-	path = warp-ctc
-	url = https://github.com/baidu-research/warp-ctc.git
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index b9902a863d..a6e45028eb 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -2,7 +2,7 @@
     sha: c25201a00e6b0514370501050cf2a8538ac12270
     hooks:
     -   id: remove-crlf
-        files: (?!.*warp-ctc)^.*$
+        files: (?!.*third_party)^.*$
 -   repo: https://github.com/reyoung/mirrors-yapf.git
     sha: v0.13.2
     hooks:
@@ -15,7 +15,7 @@
     -   id: check-merge-conflict
     -   id: check-symlinks
     -   id: detect-private-key
-        files: (?!.*warp-ctc)^.*$
+        files: (?!.*third_party)^.*$
     -   id: end-of-file-fixer
 -   repo: https://github.com/PaddlePaddle/clang-format-pre-commit-hook.git
     sha: 28c0ea8a67a3e2dbbf4822ef44e85b63a0080a29
diff --git a/.travis.yml b/.travis.yml
index 7de4ec7fc5..0705baa1ac 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,5 +1,9 @@
 language: cpp
-cache: ccache
+cache:
+  directories:
+    - $HOME/third_party
+    - $HOME/.ccache
+    - $HOME/.cache/pip
 sudo: required
 dist: trusty
 os:
@@ -21,28 +25,21 @@ addons:
     packages:
       - gcc-4.8
       - g++-4.8
-      - wget
       - git
       - build-essential
       - libatlas-base-dev
       - python
       - python-pip
       - python2.7-dev
-      - m4
-      - libprotobuf-dev
-      - doxygen
-      - protobuf-compiler
-      - python-protobuf
       - python-numpy
       - python-wheel
-      - libgoogle-glog-dev
-      - libgflags-dev
-      - libgtest-dev
       - curl
-      - lcov
-      - graphviz
       - swig
+      - graphviz
       - clang-format-3.8
+      - automake
+      - libtool
+      - ccache
 before_install:
   - |
     if [ ${JOB} == "BUILD_AND_TEST" ]; then
@@ -55,10 +52,9 @@ before_install:
         fi
       fi
     fi
-  - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then sudo paddle/scripts/travis/before_install.linux.sh; fi
   - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then paddle/scripts/travis/before_install.osx.sh; fi
   - if [[ "$JOB" == "PRE_COMMIT" ]]; then sudo ln -s /usr/bin/clang-format-3.8 /usr/bin/clang-format; fi
-  - pip install wheel protobuf sphinx recommonmark virtualenv numpy sphinx_rtd_theme pre-commit
+  - pip install numpy wheel protobuf sphinx recommonmark sphinx_rtd_theme virtualenv pre-commit requests==2.9.2 LinkChecker
 script:
   - paddle/scripts/travis/main.sh
 notifications:
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 0a44e56719..4b0682c4fe 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,173 +1,99 @@
-cmake_minimum_required(VERSION 2.8)
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+# http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+cmake_minimum_required(VERSION 3.0)
 
 project(paddle CXX C)
 
 set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake")
 set(PROJ_ROOT ${CMAKE_SOURCE_DIR})
-include(package)
-find_package(SWIG 2.0)
-find_package(CUDA QUIET)
-find_package(Protobuf REQUIRED)
-
-# Check protobuf library version.
-execute_process(COMMAND ${PROTOBUF_PROTOC_EXECUTABLE} --version
-	OUTPUT_VARIABLE PROTOBUF_VERSION)
-string(REPLACE "libprotoc " "" PROTOBUF_VERSION ${PROTOBUF_VERSION})
-
-set(PROTOBUF_3 OFF)
-if (${PROTOBUF_VERSION} VERSION_GREATER "3.0.0" OR ${PROTOBUF_VERSION} VERSION_EQUAL "3.0.0")
-    set(PROTOBUF_3 ON)
-endif()
 
-find_package(PythonLibs 2.7 REQUIRED)
-find_package(PythonInterp 2.7 REQUIRED)
-find_package(ZLIB REQUIRED)
-find_package(NumPy REQUIRED)
-find_package(Threads REQUIRED)
-find_package(AVX QUIET)
-find_package(Glog)
-find_package(Gflags QUIET)
-find_package(GTest)
 find_package(Sphinx)
-find_package(Doxygen)
-include(cblas)
-find_program(M4_EXECUTABLE m4)
-###################### Configurations ###########################
-option(WITH_DSO "Compile PaddlePaddle with dynamic linked libraries" ON)
-option(WITH_GPU "Compile PaddlePaddle with gpu" ${CUDA_FOUND})
-option(WITH_DOUBLE "Compile PaddlePaddle with double precision, otherwise use single precision" OFF)
-option(WITH_AVX "Compile PaddlePaddle with avx intrinsics" ${AVX_FOUND})
-option(WITH_PYTHON "Compile PaddlePaddle with python interpreter" ON)
-option(WITH_STYLE_CHECK "Style Check for PaddlePaddle" ${PYTHONINTERP_FOUND})
-option(WITH_RDMA "Compile PaddlePaddle with rdma support" OFF)
-option(WITH_GLOG "Compile PaddlePaddle use glog, otherwise use a log implement internally" ${LIBGLOG_FOUND})
-option(WITH_GFLAGS "Compile PaddlePaddle use gflags, otherwise use a flag implement internally" ${GFLAGS_FOUND})
-option(WITH_TIMER "Compile PaddlePaddle use timer" OFF)
-option(WITH_PROFILER "Compile PaddlePaddle use gpu profiler" OFF)
-option(WITH_TESTING "Compile and run unittest for PaddlePaddle" ${GTEST_FOUND})
-option(WITH_DOC "Compile PaddlePaddle with documentation" OFF)
-option(WITH_SWIG_PY "Compile PaddlePaddle with py PaddlePaddle prediction api" ${SWIG_FOUND})
-option(ON_TRAVIS "Running test on travis-ci or not." OFF)
-option(ON_COVERALLS "Generating code coverage data on coveralls or not." OFF)
-option(COVERALLS_UPLOAD "Uploading the generated coveralls json." ON)
-
-
-include(cpplint)
-include(ccache)
-if(WITH_RDMA)
-  include(rdma)
-endif()
-include(util)
-include(flags)
-include(cudnn)
-include(FindPythonModule)
-include(check_packages)
-include(swig)
-include(coveralls)
-
-# Set PaddlePaddle version to Git tag name or Git commit ID.
+find_package(CUDA QUIET)
 find_package(Git REQUIRED)
-# version.cmake will get the current PADDLE_VERSION
-include(version)
-add_definitions(-DPADDLE_VERSION=${PADDLE_VERSION})
-
-if(NOT WITH_GPU)
-    add_definitions(-DPADDLE_ONLY_CPU)
-    add_definitions(-DHPPL_STUB_FUNC)
-
-    list(APPEND CMAKE_CXX_SOURCE_FILE_EXTENSIONS cu)
-else()
-    if(${CUDA_VERSION_MAJOR} VERSION_LESS 7)
-        message(FATAL_ERROR "Paddle need CUDA >= 7.0 to compile")
-    endif()
-
-    if(NOT CUDNN_FOUND)
-        message(FATAL_ERROR "Paddle need cudnn to compile")
-    endif()
-
-    if(WITH_AVX)
-        set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler ${AVX_FLAG}")
-    else(WITH_AVX)
-        set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler ${SSE3_FLAG}")
-    endif(WITH_AVX)
-
-    # Include cuda and cudnn
-    include_directories(${CUDNN_INCLUDE_DIR})
-    include_directories(${CUDA_TOOLKIT_INCLUDE})
-endif(NOT WITH_GPU)
-
-if(WITH_DSO)
-    add_definitions(-DPADDLE_USE_DSO)
-endif(WITH_DSO)
-
-if(WITH_DOUBLE)
-    add_definitions(-DPADDLE_TYPE_DOUBLE)
-    set(ACCURACY double)
-else(WITH_DOUBLE)
-    set(ACCURACY float)
-endif(WITH_DOUBLE)
-
-if(NOT WITH_TIMER)
-    add_definitions(-DPADDLE_DISABLE_TIMER)
-endif(NOT WITH_TIMER)
-
-if(NOT WITH_PROFILER)
-    add_definitions(-DPADDLE_DISABLE_PROFILER)
-endif(NOT WITH_PROFILER)
-
-if(WITH_AVX)
-    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${AVX_FLAG}")
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${AVX_FLAG}")
-else(WITH_AVX)
-    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${SSE3_FLAG}")
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SSE3_FLAG}")
-endif(WITH_AVX)
-
-if(WITH_PYTHON)
-    include_directories(${PYTHON_INCLUDE_DIR})
-    include_directories(${PYTHON_NUMPY_INCLUDE_DIR})
-else(WITH_PYTHON)
-    add_definitions(-DPADDLE_NO_PYTHON)
-endif(WITH_PYTHON)
-
-if(WITH_RDMA)
-  include_directories("${RDMA_INC_DIR}")
-else(WITH_RDMA)
-  add_definitions(-DPADDLE_DISABLE_RDMA)
-endif(WITH_RDMA)
-
-if(WITH_GLOG)
-    add_definitions(-DPADDLE_USE_GLOG)
-    include_directories(${LIBGLOG_INCLUDE_DIR})
-endif()
+find_package(Threads REQUIRED)
 
-if(WITH_GFLAGS)
-    add_definitions(-DPADDLE_USE_GFLAGS)
-    add_definitions(-DGFLAGS_NS=${GFLAGS_NAMESPACE})
-    include_directories(${GFLAGS_INCLUDE_DIRS})
+include(system)
+include(simd)
+
+################################ Configurations #######################################
+option(WITH_GPU         "Compile PaddlePaddle with NVIDIA GPU"          ${CUDA_FOUND})
+option(WITH_AVX         "Compile PaddlePaddle with AVX intrinsics"      ${AVX_FOUND})
+option(WITH_DSO         "Compile PaddlePaddle with dynamic linked CUDA" ON)
+option(WITH_TESTING     "Compile PaddlePaddle with unit testing"        ON)
+option(WITH_SWIG_PY     "Compile PaddlePaddle with inference api"       ON)
+option(WITH_STYLE_CHECK "Compile PaddlePaddle with style check"         ON)
+option(WITH_PYTHON      "Compile PaddlePaddle with python interpreter"  ON)
+option(WITH_DOUBLE      "Compile PaddlePaddle with double precision"    OFF)
+option(WITH_RDMA        "Compile PaddlePaddle with RDMA support"        OFF)
+option(WITH_TIMER       "Compile PaddlePaddle with stats timer"         OFF)
+option(WITH_PROFILER    "Compile PaddlePaddle with GPU profiler"        OFF)
+option(WITH_DOC         "Compile PaddlePaddle with documentation"       OFF)
+option(ON_COVERALLS     "Compile PaddlePaddle with code coverage"       OFF)
+option(COVERALLS_UPLOAD "Package code coverage data to coveralls"       OFF)
+option(ON_TRAVIS        "Exclude special unit test on Travis CI"        OFF)
+
+# CMAKE_BUILD_TYPE
+if(NOT CMAKE_BUILD_TYPE)
+    set(CMAKE_BUILD_TYPE "RelWithDebInfo" CACHE STRING
+      "Choose the type of build, options are: Debug Release RelWithDebInfo MinSizeRel"
+      FORCE)
 endif()
 
-if(WITH_TESTING)
-    enable_testing()
-    include_directories(${GTEST_INCLUDE_DIRS})
-endif()
+set(THIRD_PARTY_PATH "${PROJ_ROOT}/third_party" CACHE PATH
+  "A path setting third party libraries download & build directories.")  # PATH (not STRING): this cache entry is a directory
+########################################################################################
+
+include(external/zlib)      # download, build, install zlib
+include(external/gflags)    # download, build, install gflags
+include(external/glog)      # download, build, install glog
+include(external/gtest)     # download, build, install gtest
+include(external/protobuf)  # download, build, install protobuf
+include(external/python)    # download, build, install python
+include(external/openblas)  # download, build, install openblas
+include(external/swig)      # download, build, install swig
+include(external/warpctc)   # download, build, install warpctc
+
+include(package)            # set paddle packages
+include(cpplint)            # set paddle c++ style
+include(ccache)             # set ccache for compilation
+include(util)               # set unittest and link libs
+include(rdma)               # set rdma libraries
+include(flags)              # set paddle compile flags
+include(cudnn)              # set cudnn libraries
+include(version)            # set PADDLE_VERSION
+include(coveralls)          # set code coverage
+
+include(configure)          # add paddle env configuration
 
-include_directories("${CBLAS_INC_DIR}")
 include_directories("${PROJ_ROOT}")
 include_directories("${PROJ_ROOT}/paddle/cuda/include")
-include_directories(${PROTOBUF_INCLUDE_DIRS})
 include_directories("${CMAKE_CURRENT_BINARY_DIR}/proto")
-if(EXISTS "${PROJ_ROOT}/paddle/internals/CMakeLists.txt")
-    set(PADDLE_WITH_INTERNAL ON)
-    include(paddle/internals/CMakeLists.txt)
-else()
-    set(PADDLE_WITH_INTERNAL OFF)
-    set(INTERNAL_PROTO_PATH "")
-endif()
+
+set(EXTERNAL_LIBS
+    # have not include gtest here.
+    ${GFLAGS_LIBRARIES}
+    ${GLOG_LIBRARIES}
+    ${CBLAS_LIBRARIES}
+    ${PROTOBUF_LIBRARY}
+    ${ZLIB_LIBRARIES}
+)
+
 add_subdirectory(proto)
 add_subdirectory(paddle)
 add_subdirectory(python)
+
 if(WITH_DOC)
     add_subdirectory(doc)
-    add_subdirectory(doc_cn)
 endif()
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
new file mode 100644
index 0000000000..0d4bb973ae
--- /dev/null
+++ b/CONTRIBUTING.md
@@ -0,0 +1 @@
+./doc/howto/dev/contribute_to_paddle_en.md
diff --git a/WORKSPACE b/WORKSPACE
deleted file mode 100644
index 0b8299905a..0000000000
--- a/WORKSPACE
+++ /dev/null
@@ -1,15 +0,0 @@
-# External dependency to Google protobuf.
-http_archive(
-    name="protobuf",
-    url="http://github.com/google/protobuf/archive/v3.1.0.tar.gz",
-    sha256="0a0ae63cbffc274efb573bdde9a253e3f32e458c41261df51c5dbc5ad541e8f7",
-    strip_prefix="protobuf-3.1.0", )
-
-# External dependency to gtest 1.7.0.  This method comes from
-# https://www.bazel.io/versions/master/docs/tutorial/cpp.html.
-new_http_archive(
-    name="gtest",
-    url="https://github.com/google/googletest/archive/release-1.7.0.zip",
-    sha256="b58cb7547a28b2c718d1e38aee18a3659c9e3ff52440297e965f5edffe34b6d0",
-    build_file="third_party/gtest.BUILD",
-    strip_prefix="googletest-release-1.7.0", )
diff --git a/cmake/FindSphinx.cmake b/cmake/FindSphinx.cmake
index 6702f45a16..d319442ef1 100644
--- a/cmake/FindSphinx.cmake
+++ b/cmake/FindSphinx.cmake
@@ -72,6 +72,7 @@ function( Sphinx_add_target target_name builder conf cache source destination )
     ${source}
     ${destination}
     COMMENT "Generating sphinx documentation: ${builder}"
+    COMMAND ln -sf ${destination}/index_*.html ${destination}/index.html
     )
 
   set_property(
@@ -143,4 +144,4 @@ function( Sphinx_add_targets target_base_name conf source base_destination )
 
     add_dependencies( ${target_base_name}_linkcheck ${_dependencies} )
   endif()
-endfunction()
\ No newline at end of file
+endfunction()
diff --git a/cmake/cblas.cmake b/cmake/cblas.cmake
index 685334c658..4e1ae7dc81 100644
--- a/cmake/cblas.cmake
+++ b/cmake/cblas.cmake
@@ -13,6 +13,7 @@
 # system paths.
 #
 
+set(CBLAS_FOUND OFF)
 
 ## Find MKL First.
 set(MKL_ROOT $ENV{MKL_ROOT} CACHE PATH "Folder contains MKL")
@@ -35,11 +36,12 @@ find_library(MKL_INTEL_LP64 NAMES mkl_intel_lp64 PATHS
 if(MKL_INCLUDE_DIR AND MKL_CORE_LIB AND MKL_SEQUENTIAL_LIB AND MKL_INTEL_LP64)
   set(CBLAS_PROVIDER MKL)
   set(CBLAS_INC_DIR ${MKL_INCLUDE_DIR})
-  set(CBLAS_LIBS ${MKL_INTEL_LP64}
+  set(CBLAS_LIBRARIES ${MKL_INTEL_LP64}
           ${MKL_SEQUENTIAL_LIB}
           ${MKL_CORE_LIB})
   add_definitions(-DPADDLE_USE_MKL)
-  message(STATUS "Found MKL (include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBS})")
+  message(STATUS "Found MKL (include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})")
+  set(CBLAS_FOUND ON)
   return() # return file.
 endif()
 
@@ -68,9 +70,10 @@ find_library(ATLAS_LIB NAMES lapack_atlas liblapack_atlas.so.3
 if(ATLAS_INC_DIR AND ATLAS_CBLAS_LIB AND ATLAS_LIB)
   set(CBLAS_PROVIDER ATLAS)
   set(CBLAS_INC_DIR ${ATLAS_INC_DIR} ${ATLAS_CLAPACK_INC_DIR})
-  set(CBLAS_LIBS ${ATLAS_LIB} ${ATLAS_CBLAS_LIB})
+  set(CBLAS_LIBRARIES ${ATLAS_LIB} ${ATLAS_CBLAS_LIB})
   add_definitions(-DPADDLE_USE_ATLAS)  
-  message(STATUS "Found Atlas (include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBS})")
+  message(STATUS "Found Atlas (include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})")
+  set(CBLAS_FOUND ON)
   return()
 endif()
 
@@ -98,8 +101,9 @@ find_library(OPENBLAS_LIB NAMES openblas
 if(OPENBLAS_INC_DIR AND OPENBLAS_LIB)
   set(CBLAS_PROVIDER OPENBLAS)
   set(CBLAS_INC_DIR ${OPENBLAS_INC_DIR})
-  set(CBLAS_LIBS ${OPENBLAS_LIB})
-  message(STATUS "Found OpenBlas (include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBS})")
+  set(CBLAS_LIBRARIES ${OPENBLAS_LIB})
+  message(STATUS "Found OpenBlas (include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})")
+  set(CBLAS_FOUND ON)
   return()
 endif()
 
@@ -130,9 +134,7 @@ find_library(REFERENCE_CBLAS_LIBRARY NAMES cblas PATHS
 if (REFERENCE_CBLAS_INCLUDE_DIR AND REFERENCE_CBLAS_LIBRARY)
   set(CBLAS_PROVIDER REFERENCE)
   set(CBLAS_INC_DIR ${REFERENCE_CBLAS_INCLUDE_DIR})
-  set(CBLAS_LIBS ${REFERENCE_CBLAS_LIBRARY})
-  return()
+  set(CBLAS_LIBRARIES ${REFERENCE_CBLAS_LIBRARY})
+  message(STATUS "Found reference-cblas (include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})")
+  set(CBLAS_FOUND ON)  # report success to the including file, as the MKL/ATLAS/OpenBLAS branches do
 endif()
-
-message(FATAL_ERROR "CBlas must be set. Paddle support MKL, ATLAS, OpenBlas, reference-cblas."
-  " Try set MKL_ROOT, ATLAS_ROOT, OPENBLAS_ROOT or REFERENCE_CBLAS_ROOT.")
diff --git a/cmake/check_packages.cmake b/cmake/check_packages.cmake
deleted file mode 100644
index 0688745541..0000000000
--- a/cmake/check_packages.cmake
+++ /dev/null
@@ -1,44 +0,0 @@
-# Check package for each cmake option
-
-if(WITH_GPU)
-  find_package(CUDA REQUIRED)  # CUDA is required when use gpu
-endif()
-
-if(WITH_PYTHON)
-  find_package(PythonLibs 2.6 REQUIRED)
-  find_package(PythonInterp REQUIRED)
-  find_package(NumPy REQUIRED)
-endif()
-
-if(WITH_STYLE_CHECK)
-  find_package(PythonInterp REQUIRED)
-endif()
-
-if(WITH_GLOG)
-  find_package(Glog REQUIRED)
-endif()
-
-if(WITH_GFLAGS)
-  find_package(Gflags REQUIRED)
-endif()
-
-if(WITH_TESTING)
-  find_package(GTest REQUIRED)
-endif()
-
-if(WITH_DOC)
-  find_package(Sphinx REQUIRED)
-  find_package(Doxygen REQUIRED)
-  find_python_module(recommonmark REQUIRED)
-endif()
-
-if(WITH_SWIG_PY)
-  if(NOT SWIG_FOUND)
-    message(FATAL_ERROR "SWIG is not found. Please install swig or disable WITH_SWIG_PY")
-  endif()
-  find_python_module(wheel REQUIRED)  # package wheel
-endif()
-
-if(NOT M4_EXECUTABLE)
-  message(FATAL_ERROR "Paddle need m4 to generate proto file.")
-endif()
diff --git a/cmake/configure.cmake b/cmake/configure.cmake
new file mode 100644
index 0000000000..0bb016201d
--- /dev/null
+++ b/cmake/configure.cmake
@@ -0,0 +1,68 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+# http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Translate the WITH_* build options into compile definitions and compiler flags.
+if(NOT WITH_PYTHON)
+    add_definitions(-DPADDLE_NO_PYTHON)
+endif()
+
+if(WITH_DSO)
+    add_definitions(-DPADDLE_USE_DSO)
+endif()
+
+if(WITH_DOUBLE)
+    add_definitions(-DPADDLE_TYPE_DOUBLE)
+endif()
+
+if(NOT WITH_TIMER)
+    add_definitions(-DPADDLE_DISABLE_TIMER)
+endif()
+
+if(NOT WITH_PROFILER)
+    add_definitions(-DPADDLE_DISABLE_PROFILER)
+endif()
+
+# Choose the SIMD flag once; it is reused for nvcc and for the host compilers.
+if(WITH_AVX)
+    set(paddle_simd_flag "${AVX_FLAG}")
+else()
+    set(paddle_simd_flag "${SSE3_FLAG}")
+endif()
+
+if(WITH_GPU)
+    find_package(CUDA REQUIRED)
+
+    if(${CUDA_VERSION_MAJOR} VERSION_LESS 7)
+        message(FATAL_ERROR "Paddle need CUDA >= 7.0 to compile")
+    endif()
+
+    if(NOT CUDNN_FOUND)
+        message(FATAL_ERROR "Paddle need cudnn to compile")
+    endif()
+
+    # Hand the SIMD flag to nvcc's host compiler.
+    set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler ${paddle_simd_flag}")
+
+    # CUDA and cuDNN headers.
+    include_directories(${CUDNN_INCLUDE_DIR} ${CUDA_TOOLKIT_INCLUDE})
+else()
+    # CPU-only build; also register "cu" as a C++ source extension.
+    add_definitions(-DPADDLE_ONLY_CPU)
+    add_definitions(-DHPPL_STUB_FUNC)
+    list(APPEND CMAKE_CXX_SOURCE_FILE_EXTENSIONS cu)
+endif()
+
+# The host C and C++ compilers receive the same SIMD flag.
+set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${paddle_simd_flag}")
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${paddle_simd_flag}")
diff --git a/cmake/cpplint.cmake b/cmake/cpplint.cmake
index 241af9a083..38c636b30e 100644
--- a/cmake/cpplint.cmake
+++ b/cmake/cpplint.cmake
@@ -53,7 +53,7 @@ macro(add_style_check_target TARGET_NAME)
             if(LINT MATCHES ON)
                 add_custom_command(TARGET ${TARGET_NAME}
                     PRE_BUILD
-                    COMMAND "${PYTHON_EXECUTABLE}" "${PROJ_ROOT}/paddle/scripts/cpplint.py"
+                    COMMAND env ${py_env} "${PYTHON_EXECUTABLE}" "${PROJ_ROOT}/paddle/scripts/cpplint.py"
                                 "--filter=${STYLE_FILTER}" ${filename}
                     WORKING_DIRECTORY ${CMAKE_CURRENT_LIST_DIR})
             endif()
diff --git a/cmake/external/gflags.cmake b/cmake/external/gflags.cmake
new file mode 100644
index 0000000000..2a49d76eb3
--- /dev/null
+++ b/cmake/external/gflags.cmake
@@ -0,0 +1,39 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+# http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+include(ExternalProject)
+
+set(GFLAGS_SOURCES_DIR ${THIRD_PARTY_PATH}/gflags)
+set(GFLAGS_INSTALL_DIR ${THIRD_PARTY_PATH}/install/gflags)
+set(GFLAGS_INCLUDE_DIR "${GFLAGS_INSTALL_DIR}/include" CACHE PATH "gflags include directory." FORCE)
+if(NOT WIN32)
+    set(GFLAGS_LIBRARIES "${GFLAGS_INSTALL_DIR}/lib/libgflags.a" CACHE FILEPATH "GFLAGS_LIBRARIES" FORCE)
+else()
+    set(GFLAGS_LIBRARIES "${GFLAGS_INSTALL_DIR}/lib/gflags.lib" CACHE FILEPATH "GFLAGS_LIBRARIES" FORCE)
+endif()
+include_directories(${GFLAGS_INCLUDE_DIR})
+
+# Download gflags and install it into GFLAGS_INSTALL_DIR (position-independent code, tests off).
+ExternalProject_Add(
+    gflags
+    ${EXTERNAL_PROJECT_LOG_ARGS}
+    PREFIX          ${GFLAGS_SOURCES_DIR}
+    GIT_REPOSITORY  "https://github.com/gflags/gflags.git"
+    UPDATE_COMMAND  ""
+    CMAKE_ARGS      -DCMAKE_INSTALL_PREFIX=${GFLAGS_INSTALL_DIR}
+                    -DCMAKE_POSITION_INDEPENDENT_CODE=ON
+                    -DBUILD_TESTING=OFF
+)
+
+list(APPEND external_project_dependencies gflags)
diff --git a/cmake/external/glog.cmake b/cmake/external/glog.cmake
new file mode 100644
index 0000000000..71e20c8527
--- /dev/null
+++ b/cmake/external/glog.cmake
@@ -0,0 +1,41 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+# http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+include(ExternalProject)
+
+set(GLOG_SOURCES_DIR ${THIRD_PARTY_PATH}/glog)
+set(GLOG_INSTALL_DIR ${THIRD_PARTY_PATH}/install/glog)
+set(GLOG_INCLUDE_DIR "${GLOG_INSTALL_DIR}/include" CACHE PATH "glog include directory." FORCE)
+include_directories(${GLOG_INCLUDE_DIR})
+
+if(NOT WIN32)
+    set(GLOG_LIBRARIES "${GLOG_INSTALL_DIR}/lib/libglog.a" CACHE FILEPATH "glog library." FORCE)
+else()
+    set(GLOG_LIBRARIES "${GLOG_INSTALL_DIR}/lib/libglog.lib" CACHE FILEPATH "glog library." FORCE)
+endif()
+
+# Download glog and install it into GLOG_INSTALL_DIR (gflags support and tests switched off).
+ExternalProject_Add(
+    glog
+    ${EXTERNAL_PROJECT_LOG_ARGS}
+    PREFIX          ${GLOG_SOURCES_DIR}
+    GIT_REPOSITORY  "https://github.com/google/glog.git"
+    UPDATE_COMMAND  ""
+    CMAKE_ARGS      -DCMAKE_INSTALL_PREFIX=${GLOG_INSTALL_DIR}
+                    -DCMAKE_POSITION_INDEPENDENT_CODE=ON
+                    -DWITH_GFLAGS=OFF
+                    -DBUILD_TESTING=OFF
+)
+
+list(APPEND external_project_dependencies glog)
diff --git a/cmake/external/gtest.cmake b/cmake/external/gtest.cmake
new file mode 100644
index 0000000000..11d829a9e2
--- /dev/null
+++ b/cmake/external/gtest.cmake
@@ -0,0 +1,51 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+# http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if(NOT WITH_TESTING)
+    return()  # leave this file early: gtest is only set up when WITH_TESTING is on
+endif()
+
+enable_testing()
+include(ExternalProject)
+
+# Checkout/build tree and install prefix, both under THIRD_PARTY_PATH.
+set(GTEST_SOURCES_DIR ${THIRD_PARTY_PATH}/gtest)
+set(GTEST_INSTALL_DIR ${THIRD_PARTY_PATH}/install/gtest)
+set(GTEST_INCLUDE_DIR "${GTEST_INSTALL_DIR}/include" CACHE PATH "gtest include directory." FORCE)
+include_directories(${GTEST_INCLUDE_DIR})
+
+if(NOT WIN32)
+    set(GTEST_LIBRARIES "${GTEST_INSTALL_DIR}/lib/libgtest.a" CACHE FILEPATH "gtest libraries." FORCE)
+    set(GTEST_MAIN_LIBRARIES "${GTEST_INSTALL_DIR}/lib/libgtest_main.a" CACHE FILEPATH "gtest main libraries." FORCE)
+else()
+    set(GTEST_LIBRARIES "${GTEST_INSTALL_DIR}/lib/gtest.lib" CACHE FILEPATH "gtest libraries." FORCE)
+    set(GTEST_MAIN_LIBRARIES "${GTEST_INSTALL_DIR}/lib/gtest_main.lib" CACHE FILEPATH "gtest main libraries." FORCE)
+endif()
+
+# Download googletest release-1.8.0 and install it into GTEST_INSTALL_DIR.
+ExternalProject_Add(
+    gtest
+    ${EXTERNAL_PROJECT_LOG_ARGS}
+    PREFIX          ${GTEST_SOURCES_DIR}
+    GIT_REPOSITORY  "https://github.com/google/googletest.git"
+    GIT_TAG         "release-1.8.0"
+    UPDATE_COMMAND  ""
+    CMAKE_ARGS      -DCMAKE_INSTALL_PREFIX:PATH=${GTEST_INSTALL_DIR}
+                    -DCMAKE_POSITION_INDEPENDENT_CODE=ON
+                    -DBUILD_GMOCK=ON
+                    -Dgtest_disable_pthreads=ON
+                    -Dgtest_force_shared_crt=ON
+)
+
+list(APPEND external_project_dependencies gtest)
diff --git a/cmake/external/openblas.cmake b/cmake/external/openblas.cmake
new file mode 100644
index 0000000000..0e8c29c831
--- /dev/null
+++ b/cmake/external/openblas.cmake
@@ -0,0 +1,47 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+# http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+INCLUDE(cblas)  # probes for an installed BLAS (MKL, ATLAS, OpenBLAS, reference cblas) and sets CBLAS_FOUND
+
+IF(NOT ${CBLAS_FOUND})
+    MESSAGE(FATAL_ERROR "Please install OpenBlas, MKL or ATLAS.")  # NOTE(review): FATAL_ERROR stops configuration, so the fallback below never runs - confirm this is intentional
+    INCLUDE(ExternalProject)
+
+    SET(CBLAS_SOURCES_DIR ${THIRD_PARTY_PATH}/openblas)
+    SET(CBLAS_INSTALL_DIR ${THIRD_PARTY_PATH}/install/openblas)
+    SET(CBLAS_INC_DIR "${CBLAS_INSTALL_DIR}/include" CACHE PATH "openblas include directory." FORCE)
+
+    IF(WIN32)
+        SET(CBLAS_LIBRARIES "${CBLAS_INSTALL_DIR}/lib/openblas.lib" CACHE FILEPATH "openblas library." FORCE)
+    ELSE(WIN32)
+        SET(CBLAS_LIBRARIES "${CBLAS_INSTALL_DIR}/lib/libopenblas.a" CACHE FILEPATH "openblas library" FORCE)
+    ENDIF(WIN32)
+
+    ExternalProject_Add(
+        openblas
+        ${EXTERNAL_PROJECT_LOG_ARGS}
+        URL                 "https://github.com/xianyi/OpenBLAS/archive/v0.2.19.tar.gz"
+        PREFIX              ${CBLAS_SOURCES_DIR}
+        INSTALL_DIR         ${CBLAS_INSTALL_DIR}
+        BUILD_IN_SOURCE     1
+        CONFIGURE_COMMAND   ""
+        BUILD_COMMAND       make CC=${CMAKE_C_COMPILER} FC=${CMAKE_Fortran_COMPILER}  # NOTE(review): project() enables only CXX and C, so FC is presumably empty - verify
+        INSTALL_COMMAND     make install PREFIX=<INSTALL_DIR>
+        UPDATE_COMMAND      ""
+    )
+
+    LIST(APPEND external_project_dependencies openblas)
+ENDIF()
+
+INCLUDE_DIRECTORIES(${CBLAS_INC_DIR})  # CBLAS_INC_DIR is set by cblas.cmake for whichever provider was found
diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake
new file mode 100644
index 0000000000..c0cf2719f9
--- /dev/null
+++ b/cmake/external/protobuf.cmake
@@ -0,0 +1,62 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+# http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+INCLUDE(ExternalProject)
+
+SET(PROTOBUF_SOURCES_DIR ${THIRD_PARTY_PATH}/protobuf)
+SET(PROTOBUF_INSTALL_DIR ${THIRD_PARTY_PATH}/install/protobuf)
+SET(PROTOBUF_INCLUDE_DIR "${PROTOBUF_INSTALL_DIR}/include" CACHE PATH "protobuf include directory." FORCE)
+
+INCLUDE_DIRECTORIES(${PROTOBUF_INCLUDE_DIR})
+
+IF(WIN32)  # cache the paths where the protobuf external project is expected to install its artifacts
+  SET(PROTOBUF_LITE_LIBRARY
+        "${PROTOBUF_INSTALL_DIR}/lib/libprotobuf-lite.lib" CACHE FILEPATH "protobuf lite library." FORCE)
+  SET(PROTOBUF_LIBRARY
+        "${PROTOBUF_INSTALL_DIR}/lib/libprotobuf.lib" CACHE FILEPATH "protobuf library." FORCE)
+  SET(PROTOBUF_PROTOC_LIBRARY
+        "${PROTOBUF_INSTALL_DIR}/lib/libprotoc.lib" CACHE FILEPATH "protoc library." FORCE)
+  SET(PROTOBUF_PROTOC_EXECUTABLE "${PROTOBUF_INSTALL_DIR}/bin/protoc.exe" CACHE FILEPATH "protobuf executable." FORCE)
+ELSE(WIN32)
+  IF("${HOST_SYSTEM}" STREQUAL "centos")  # quoted so an empty or unset HOST_SYSTEM cannot break IF() parsing
+    SET(LIB "lib64")  # library directory name used when HOST_SYSTEM is "centos"
+  ELSE()
+    SET(LIB "lib")
+  ENDIF()
+  SET(PROTOBUF_LITE_LIBRARY
+        "${PROTOBUF_INSTALL_DIR}/${LIB}/libprotobuf-lite.a" CACHE FILEPATH "protobuf lite library." FORCE)
+  SET(PROTOBUF_LIBRARY
+        "${PROTOBUF_INSTALL_DIR}/${LIB}/libprotobuf.a" CACHE FILEPATH "protobuf library." FORCE)
+  SET(PROTOBUF_PROTOC_LIBRARY
+        "${PROTOBUF_INSTALL_DIR}/${LIB}/libprotoc.a" CACHE FILEPATH "protoc library." FORCE)
+  SET(PROTOBUF_PROTOC_EXECUTABLE "${PROTOBUF_INSTALL_DIR}/bin/protoc" CACHE FILEPATH "protobuf executable." FORCE)
+ENDIF(WIN32)
+
+ExternalProject_Add(
+  protobuf
+  ${EXTERNAL_PROJECT_LOG_ARGS}
+  PREFIX          ${PROTOBUF_SOURCES_DIR}
+  UPDATE_COMMAND  ""
+  DEPENDS         zlib
+  GIT_REPOSITORY  "https://github.com/google/protobuf.git"
+  GIT_TAG         "9f75c5aa851cd877fb0d93ccc31b8567a6706546"
+  CONFIGURE_COMMAND
+    ${CMAKE_COMMAND} ${PROTOBUF_SOURCES_DIR}/src/protobuf/cmake
+    -Dprotobuf_BUILD_TESTS=OFF
+    -DCMAKE_POSITION_INDEPENDENT_CODE=ON
+    -DCMAKE_BUILD_TYPE=Release
+    -DCMAKE_INSTALL_PREFIX=${PROTOBUF_INSTALL_DIR}
+)
+
+LIST(APPEND external_project_dependencies protobuf)
diff --git a/cmake/external/python.cmake b/cmake/external/python.cmake
new file mode 100644
index 0000000000..29247d5c3d
--- /dev/null
+++ b/cmake/external/python.cmake
@@ -0,0 +1,226 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+# http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+INCLUDE(ExternalProject)
+INCLUDE(python_module)
+
+FIND_PACKAGE(PythonInterp 2.7)
+FIND_PACKAGE(PythonLibs 2.7)
+
+SET(py_env "")
+
+IF(PYTHONLIBS_FOUND AND PYTHONINTERP_FOUND)
+    find_python_module(pip REQUIRED)
+    find_python_module(numpy REQUIRED)
+    find_python_module(wheel REQUIRED)
+    find_python_module(google.protobuf REQUIRED)
+    FIND_PACKAGE(NumPy REQUIRED)
+    IF("${PY_GOOGLE.PROTOBUF_VERSION}" VERSION_LESS "3.0.0")  # quoted: an undetected (empty) version is rejected here instead of breaking IF()
+        MESSAGE(FATAL_ERROR "Found Python Protobuf ${PY_GOOGLE.PROTOBUF_VERSION} < 3.0.0, "
+        "please use pip to upgrade protobuf.")
+    ENDIF()
+ELSE(PYTHONLIBS_FOUND AND PYTHONINTERP_FOUND)
+    ##################################### PYTHON ########################################
+    SET(PYTHON_SOURCES_DIR ${THIRD_PARTY_PATH}/python)
+    SET(PYTHON_INSTALL_DIR ${THIRD_PARTY_PATH}/install/python)
+    SET(_python_DIR ${PYTHON_INSTALL_DIR})
+
+    IF(UNIX)  # Unix-style layout of ${PYTHON_INSTALL_DIR}; every value is forced into the cache
+        SET(PYTHON_FOUND ON)
+        SET(PYTHON_INCLUDE_DIR "${PYTHON_INSTALL_DIR}/include/python2.7" CACHE PATH "Python include dir" FORCE)
+        SET(PYTHON_LIBRARIES "${PYTHON_INSTALL_DIR}/lib/libpython2.7.a" CACHE FILEPATH "Python library" FORCE)
+        SET(PYTHON_EXECUTABLE ${PYTHON_INSTALL_DIR}/bin/python CACHE FILEPATH "Python executable" FORCE)
+        SET(PY_SITE_PACKAGES_PATH "${PYTHON_INSTALL_DIR}/lib/python2.7/site-packages" CACHE PATH "Python site-packages path" FORCE)
+    ELSEIF(WIN32)  # Windows layout of the same install tree
+        SET(PYTHON_FOUND ON)
+        SET(PYTHON_INCLUDE_DIR "${PYTHON_INSTALL_DIR}/include" CACHE PATH "Python include dir" FORCE)
+        SET(PYTHON_LIBRARIES "${PYTHON_INSTALL_DIR}/libs/python27.lib" CACHE FILEPATH "Python library" FORCE)
+        SET(PYTHON_EXECUTABLE "${PYTHON_INSTALL_DIR}/bin/python.exe" CACHE FILEPATH "Python executable" FORCE)
+        SET(PY_SITE_PACKAGES_PATH "${PYTHON_INSTALL_DIR}/Lib/site-packages" CACHE PATH "Python site-packages path" FORCE)
+    ELSE()
+        MESSAGE(FATAL_ERROR "Unknown system !")  # neither UNIX nor WIN32: abort configuration
+    ENDIF()
+
+    IF(APPLE)  # extra option handed to the python external project on Apple hosts only
+        LIST(APPEND EXTERNAL_PROJECT_OPTIONAL_CMAKE_ARGS
+            -DCMAKE_BUILD_WITH_INSTALL_RPATH:BOOL=ON
+            )
+    ENDIF()
+
+    SET(EXTERNAL_PROJECT_OPTIONAL_CMAKE_CACHE_ARGS)  # SET without a value: clears the variable before it is filled below
+
+    # Force Python build to "Release".
+    IF(CMAKE_CONFIGURATION_TYPES)  # non-empty when the generator is multi-configuration
+        SET(SAVED_CMAKE_CFG_INTDIR ${CMAKE_CFG_INTDIR})  # NOTE(review): the saved value is never restored in this file
+        SET(CMAKE_CFG_INTDIR "Release")
+    ELSE()
+        LIST(APPEND EXTERNAL_PROJECT_OPTIONAL_CMAKE_CACHE_ARGS
+            -DCMAKE_BUILD_TYPE:STRING=Release
+            )
+    ENDIF()
+
+    ExternalProject_Add(python  # python-cmake-buildsystem checkout configured for PYTHON_VERSION 2.7.12
+        ${EXTERNAL_PROJECT_LOG_ARGS}
+        GIT_REPOSITORY    "https://github.com/python-cmake-buildsystem/python-cmake-buildsystem.git"  # NOTE(review): no GIT_TAG, checkout is not pinned
+        PREFIX            ${PYTHON_SOURCES_DIR}
+        UPDATE_COMMAND    ""
+        CMAKE_ARGS        -DPYTHON_VERSION=2.7.12
+        CMAKE_ARGS        -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
+        CMAKE_ARGS        -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
+        CMAKE_CACHE_ARGS  # typed -D<var>:<type>=<value> entries; zlib comes from the bundled zlib build
+            -DCMAKE_INSTALL_PREFIX:PATH=${PYTHON_INSTALL_DIR}
+            -DBUILD_LIBPYTHON_SHARED:BOOL=OFF
+            -DUSE_SYSTEM_LIBRARIES:BOOL=OFF
+            -DZLIB_ROOT:FILEPATH=${ZLIB_ROOT}
+            -DZLIB_INCLUDE_DIR:PATH=${ZLIB_INCLUDE_DIR}
+            -DZLIB_LIBRARY:FILEPATH=${ZLIB_LIBRARIES}
+            -DDOWNLOAD_SOURCES:BOOL=ON
+            -DINSTALL_WINDOWS_TRADITIONAL:BOOL=OFF
+            ${EXTERNAL_PROJECT_OPTIONAL_CMAKE_CACHE_ARGS}
+            ${EXTERNAL_PROJECT_OPTIONAL_CMAKE_ARGS}
+        DEPENDS zlib
+    )
+
+    SET(py_env  # VAR=value assignments passed through `env` to every setup.py call below
+        PATH=${PYTHON_INSTALL_DIR}/bin  # NOTE(review): replaces PATH instead of extending it - confirm the compiler is still found
+        PYTHONHOME=${PYTHON_INSTALL_DIR}
+        PYTHONPATH=${PYTHON_INSTALL_DIR}/lib:${PYTHON_INSTALL_DIR}/lib/python2.7:${PY_SITE_PACKAGES_PATH})
+    ####################################################################################
+
+    ##################################### SETUPTOOLS ###################################
+    SET(SETUPTOOLS_SOURCES_DIR ${PYTHON_SOURCES_DIR}/setuptools)
+    ExternalProject_Add(setuptools  # setuptools 18.3.2 source archive, installed with the bundled interpreter
+        ${EXTERNAL_PROJECT_LOG_ARGS}
+        PREFIX              ${SETUPTOOLS_SOURCES_DIR}
+        URL                 "https://pypi.python.org/packages/source/s/setuptools/setuptools-18.3.2.tar.gz"
+        BUILD_IN_SOURCE     1
+        PATCH_COMMAND       ""
+        UPDATE_COMMAND      ""
+        CONFIGURE_COMMAND   ""
+        INSTALL_COMMAND     ""  # nothing left to do: the build step below already runs "setup.py install"
+        BUILD_COMMAND       env ${py_env} ${PYTHON_EXECUTABLE} setup.py install
+        DEPENDS             python zlib
+    )
+    #####################################################################################
+
+    ##################################### SIX ###########################################
+    SET(SIX_SOURCES_DIR ${PYTHON_SOURCES_DIR}/six)
+    ExternalProject_Add(six  # six 1.10.0 source archive, installed with the bundled interpreter
+        ${EXTERNAL_PROJECT_LOG_ARGS}
+        PREFIX              ${SIX_SOURCES_DIR}
+        URL                 https://pypi.python.org/packages/source/s/six/six-1.10.0.tar.gz
+        BUILD_IN_SOURCE     1
+        PATCH_COMMAND       ""
+        UPDATE_COMMAND      ""
+        CONFIGURE_COMMAND   ""
+        INSTALL_COMMAND     ""  # nothing left to do: the build step below already runs "setup.py install"
+        BUILD_COMMAND       env ${py_env} ${PYTHON_EXECUTABLE} setup.py install
+        DEPENDS             python setuptools  # ordered after the interpreter and setuptools targets
+    )
+    #####################################################################################
+
+    ##################################### CYTHON ########################################
+    SET(CYTHON_SOURCES_DIR ${PYTHON_SOURCES_DIR}/cython)
+    ExternalProject_Add(cython  # Cython 0.25.2 release archive, installed with the bundled interpreter
+        ${EXTERNAL_PROJECT_LOG_ARGS}
+        PREFIX                ${CYTHON_SOURCES_DIR}
+        URL                   https://github.com/cython/cython/archive/0.25.2.tar.gz
+        # No GIT_TAG: it only applies to GIT_REPOSITORY downloads; the archive URL already pins 0.25.2.
+        BUILD_IN_SOURCE       1
+        CONFIGURE_COMMAND     ""
+        PATCH_COMMAND         ""
+        UPDATE_COMMAND        ""
+        INSTALL_COMMAND       ""  # nothing left to do: the build step below already runs "setup.py install"
+        BUILD_COMMAND         env ${py_env} ${PYTHON_EXECUTABLE} setup.py install
+        DEPENDS               python
+    )
+    ####################################################################################
+
+    ##################################### NUMPY ########################################
+    SET(NUMPY_SOURCES_DIR ${PYTHON_SOURCES_DIR}/numpy)
+    SET(NUMPY_VERSION "1.11.3")
+    SET(NUMPY_TAG_VERSION "v${NUMPY_VERSION}")  # git tag derived from the version so the two cannot drift apart
+
+    SET(EGG_NAME "")
+    SET(PYTHON_NUMPY_INCLUDE_DIR "")
+    IF(WIN32)
+        SET(EGG_NAME "numpy-${NUMPY_VERSION}-py2.7-${HOST_SYSTEM}.egg")
+    ELSE(WIN32)
+        IF(APPLE)
+            SET(EGG_NAME "numpy-${NUMPY_VERSION}-py2.7-${HOST_SYSTEM}-${MACOS_VERSION}")
+        ELSE(APPLE)
+            # Egg name without the architecture part; candidate suffixes are appended in the loop below.
+            SET(EGG_NAME "numpy-${NUMPY_VERSION}-py2.7-linux")
+        ENDIF(APPLE)
+
+        FOREACH(suffix x86_64 intel fat64 fat32 universal)
+            LIST(APPEND PYTHON_NUMPY_INCLUDE_DIR ${PY_SITE_PACKAGES_PATH}/${EGG_NAME}-${suffix}.egg/numpy/core/include)
+        ENDFOREACH()
+    ENDIF(WIN32)
+
+    ExternalProject_Add(numpy  # numpy checked out at ${NUMPY_TAG_VERSION}, built and installed with the bundled interpreter
+        ${EXTERNAL_PROJECT_LOG_ARGS}
+        GIT_REPOSITORY      https://github.com/numpy/numpy.git
+        GIT_TAG             ${NUMPY_TAG_VERSION}
+        CONFIGURE_COMMAND   ""
+        UPDATE_COMMAND      ""
+        PREFIX              ${NUMPY_SOURCES_DIR}
+        BUILD_COMMAND       env ${py_env} ${PYTHON_EXECUTABLE} setup.py build
+        INSTALL_COMMAND     env ${py_env} ${PYTHON_EXECUTABLE} setup.py install
+        BUILD_IN_SOURCE     1
+        DEPENDS             python setuptools cython  # ordered after the interpreter, setuptools and cython targets
+    )
+    ####################################################################################
+
+    ##################################### WHEEL ########################################
+    SET(WHEEL_SOURCES_DIR ${PYTHON_SOURCES_DIR}/wheel)
+    ExternalProject_Add(wheel  # wheel 0.29.0 source archive, installed with the bundled interpreter
+        ${EXTERNAL_PROJECT_LOG_ARGS}
+        URL                 https://pypi.python.org/packages/source/w/wheel/wheel-0.29.0.tar.gz
+        PREFIX              ${WHEEL_SOURCES_DIR}
+        CONFIGURE_COMMAND   ""
+        UPDATE_COMMAND      ""
+        BUILD_COMMAND       ""  # no separate build step; "setup.py install" runs in the install step
+        INSTALL_COMMAND     env ${py_env} ${PYTHON_EXECUTABLE} setup.py install
+        BUILD_IN_SOURCE     1
+        DEPENDS             python setuptools
+    )
+    ####################################################################################
+
+    ################################### PROTOBUF #######################################
+    SET(PY_PROTOBUF_SOURCES_DIR ${PYTHON_SOURCES_DIR}/protobuf)
+    ExternalProject_Add(python-protobuf  # protobuf 3.1.0.post1 Python package, built and installed with the bundled interpreter
+        ${EXTERNAL_PROJECT_LOG_ARGS}
+        URL                   https://pypi.python.org/packages/e0/b0/0a1b364fe8a7d177b4b7d4dca5b798500dc57a7273b93cca73931b305a6a/protobuf-3.1.0.post1.tar.gz
+        URL_MD5               38b5fb160c768d2f8444d0c6d637ff91  # checksum the downloaded archive is verified against
+        PREFIX                ${PY_PROTOBUF_SOURCES_DIR}
+        BUILD_IN_SOURCE       1
+        PATCH_COMMAND         ""
+        CONFIGURE_COMMAND     ""
+        BUILD_COMMAND         env ${py_env} ${PYTHON_EXECUTABLE} setup.py build
+        INSTALL_COMMAND       env ${py_env} ${PYTHON_EXECUTABLE} setup.py install
+        DEPENDS               python setuptools six  # ordered after the interpreter, setuptools and six targets
+    )
+    ####################################################################################
+
+    LIST(APPEND external_project_dependencies python setuptools six cython wheel python-protobuf numpy)
+
+ENDIF(PYTHONLIBS_FOUND AND PYTHONINTERP_FOUND)
+
+INCLUDE_DIRECTORIES(${PYTHON_INCLUDE_DIR})  # set by FindPythonLibs or by the bundled-Python branch above
+INCLUDE_DIRECTORIES(${PYTHON_NUMPY_INCLUDE_DIR})
+
+MESSAGE("[Paddle] Python Executable: ${PYTHON_EXECUTABLE}")
+MESSAGE("[Paddle] Python Include: ${PYTHON_INCLUDE_DIR}")  # report the variable that is actually added to the include path
+MESSAGE("[Paddle] Python Libraries: ${PYTHON_LIBRARIES}")
diff --git a/cmake/external/swig.cmake b/cmake/external/swig.cmake
new file mode 100644
index 0000000000..63e8bd2546
--- /dev/null
+++ b/cmake/external/swig.cmake
@@ -0,0 +1,74 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+# http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+FIND_PACKAGE(SWIG)
+
+IF(NOT SWIG_FOUND)
+    # build swig as an external project
+    INCLUDE(ExternalProject)
+
+    SET(SWIG_SOURCES_DIR ${THIRD_PARTY_PATH}/swig)
+    SET(SWIG_INSTALL_DIR ${THIRD_PARTY_PATH}/install/swig)
+    SET(SWIG_TARGET_VERSION "3.0.2")  # release of the pre-built Windows package downloaded below
+    SET(SWIG_DOWNLOAD_SRC_MD5 "62f9b0d010cef36a13a010dc530d0d41")
+    SET(SWIG_DOWNLOAD_WIN_MD5 "3f18de4fc09ab9abb0d3be37c11fbc8f")
+
+    IF(WIN32)
+        # swig.exe available as pre-built binary on Windows:
+        ExternalProject_Add(swig
+            URL                 http://prdownloads.sourceforge.net/swig/swigwin-${SWIG_TARGET_VERSION}.zip
+            URL_MD5             ${SWIG_DOWNLOAD_WIN_MD5}
+            SOURCE_DIR          ${SWIG_SOURCES_DIR}
+            CONFIGURE_COMMAND   ""
+            BUILD_COMMAND       ""
+            INSTALL_COMMAND     ""
+            UPDATE_COMMAND      ""
+        )
+        SET(SWIG_DIR ${SWIG_SOURCES_DIR} CACHE FILEPATH "SWIG Directory" FORCE)
+        SET(SWIG_EXECUTABLE ${SWIG_SOURCES_DIR}/swig.exe  CACHE FILEPATH "SWIG Executable" FORCE)
+    ELSE(WIN32)
+        # From PCRE configure
+        ExternalProject_Add(pcre
+            ${EXTERNAL_PROJECT_LOG_ARGS}
+            GIT_REPOSITORY https://github.com/svn2github/pcre.git
+            PREFIX ${SWIG_SOURCES_DIR}/pcre
+            CMAKE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${SWIG_INSTALL_DIR}/pcre
+        )
+
+        # swig uses bison find it by cmake and pass it down
+        FIND_PACKAGE(BISON)
+
+        # From SWIG configure; SWIG_GIT_VERSION is the release that is checked out, built and installed here.
+        SET(SWIG_GIT_VERSION "3.0.10")
+        ExternalProject_Add(swig
+            GIT_REPOSITORY      https://github.com/swig/swig.git
+            GIT_TAG             rel-${SWIG_GIT_VERSION}
+            PREFIX              ${SWIG_SOURCES_DIR}
+            CONFIGURE_COMMAND   cd ${SWIG_SOURCES_DIR}/src/swig && ./autogen.sh &&  # one configure step: autogen, then configure
+            env "PCRE_LIBS=${SWIG_INSTALL_DIR}/pcre/lib/libpcre.a ${SWIG_INSTALL_DIR}/pcre/lib/libpcrecpp.a ${SWIG_INSTALL_DIR}/pcre/lib/libpcreposix.a"
+            ./configure
+                --prefix=${SWIG_INSTALL_DIR}
+                --with-pcre-prefix=${SWIG_INSTALL_DIR}/pcre
+            BUILD_COMMAND   cd ${SWIG_SOURCES_DIR}/src/swig && make
+            INSTALL_COMMAND cd ${SWIG_SOURCES_DIR}/src/swig && make install
+            UPDATE_COMMAND  ""
+            DEPENDS pcre
+        )
+
+        SET(SWIG_DIR ${SWIG_INSTALL_DIR}/share/swig/${SWIG_GIT_VERSION})  # must name the version installed above
+        SET(SWIG_EXECUTABLE ${SWIG_INSTALL_DIR}/bin/swig)
+    ENDIF(WIN32)
+
+    LIST(APPEND external_project_dependencies swig)
+ENDIF(NOT SWIG_FOUND)
diff --git a/cmake/external/warpctc.cmake b/cmake/external/warpctc.cmake
new file mode 100644
index 0000000000..f5e4b3e1eb
--- /dev/null
+++ b/cmake/external/warpctc.cmake
@@ -0,0 +1,60 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+# http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+INCLUDE(ExternalProject)
+
+# Source and install locations of the warp-ctc external project; cached paths are forced on every configure.
+set(WARPCTC_SOURCES_DIR ${THIRD_PARTY_PATH}/warpctc)
+set(WARPCTC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/warpctc)
+set(WARPCTC_INCLUDE_DIR "${WARPCTC_INSTALL_DIR}/include" CACHE PATH "Warp-ctc Directory" FORCE)
+set(WARPCTC_LIB_DIR "${WARPCTC_INSTALL_DIR}/lib" CACHE PATH "Warp-ctc Library Directory" FORCE)
+
+include_directories(${WARPCTC_INCLUDE_DIR})
+
+if(NOT WIN32)
+    # Shared library suffix: dylib on Apple hosts, so on every other non-Windows host.
+    set(_warpctc_SHARED_SUFFIX so)
+    if(APPLE)
+        set(_warpctc_SHARED_SUFFIX dylib)
+    endif()
+
+    set(WARPCTC_LIBRARIES
+        "${WARPCTC_INSTALL_DIR}/lib/libwarpctc.${_warpctc_SHARED_SUFFIX}" CACHE FILEPATH "Warp-ctc Library" FORCE)
+else()
+    set(WARPCTC_LIBRARIES
+        "${WARPCTC_INSTALL_DIR}/lib/warpctc.dll" CACHE FILEPATH "Warp-ctc Library" FORCE)
+endif()
+
+# USE_OMP is OFF for Clang and AppleClang and ON for any other compiler; it is forwarded as WITH_OMP below.
+set(USE_OMP ON)
+if(CMAKE_CXX_COMPILER_ID STREQUAL "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" )
+    set(USE_OMP OFF)
+endif()
+
+ExternalProject_Add(
+    warpctc
+    ${EXTERNAL_PROJECT_LOG_ARGS}
+    GIT_REPOSITORY  "https://github.com/gangliao/warp-ctc.git"
+    PREFIX          ${WARPCTC_SOURCES_DIR}
+    UPDATE_COMMAND  ""
+    CMAKE_ARGS      -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
+    CMAKE_ARGS      -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
+    CMAKE_ARGS      -DCMAKE_INSTALL_PREFIX=${WARPCTC_INSTALL_DIR}
+    CMAKE_ARGS      -DWITH_GPU=${WITH_GPU}
+    CMAKE_ARGS      -DWITH_OMP=${USE_OMP}
+    CMAKE_ARGS      -DWITH_TORCH=OFF
+    CMAKE_ARGS      -DBUILD_SHARED=ON
+)
+
+list(APPEND external_project_dependencies warpctc)
diff --git a/cmake/external/zlib.cmake b/cmake/external/zlib.cmake
new file mode 100644
index 0000000000..47fa8817fb
--- /dev/null
+++ b/cmake/external/zlib.cmake
@@ -0,0 +1,43 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+# http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+INCLUDE(ExternalProject)
+
+SET(ZLIB_SOURCES_DIR ${THIRD_PARTY_PATH}/zlib)
+SET(ZLIB_INSTALL_DIR ${THIRD_PARTY_PATH}/install/zlib)
+SET(ZLIB_ROOT ${ZLIB_INSTALL_DIR} CACHE PATH "zlib root directory." FORCE)  # a directory, hence PATH rather than FILEPATH
+SET(ZLIB_INCLUDE_DIR "${ZLIB_INSTALL_DIR}/include" CACHE PATH "zlib include directory." FORCE)
+
+IF(WIN32)  # static library file name inside ${ZLIB_INSTALL_DIR}/lib for each platform
+  SET(ZLIB_LIBRARIES "${ZLIB_INSTALL_DIR}/lib/zlibstatic.lib" CACHE FILEPATH "zlib library." FORCE)
+ELSE(WIN32)
+  SET(ZLIB_LIBRARIES "${ZLIB_INSTALL_DIR}/lib/libz.a" CACHE FILEPATH "zlib library." FORCE)
+ENDIF(WIN32)
+
+INCLUDE_DIRECTORIES(${ZLIB_INCLUDE_DIR})
+
+ExternalProject_Add(  # zlib v1.2.8 configured as a static, position-independent library
+    zlib
+    ${EXTERNAL_PROJECT_LOG_ARGS}
+    GIT_REPOSITORY  "https://github.com/madler/zlib.git"
+    GIT_TAG         "v1.2.8"
+    PREFIX          ${ZLIB_SOURCES_DIR}
+    UPDATE_COMMAND  ""
+    CMAKE_ARGS      -DCMAKE_INSTALL_PREFIX=${ZLIB_INSTALL_DIR}
+    CMAKE_ARGS      -DBUILD_SHARED_LIBS=OFF
+    CMAKE_ARGS      -DCMAKE_POSITION_INDEPENDENT_CODE=ON
+    CMAKE_ARGS      -DCMAKE_MACOSX_RPATH=ON
+)
+
+LIST(APPEND external_project_dependencies zlib)
diff --git a/cmake/flags.cmake b/cmake/flags.cmake
index 0983d83b73..0d1ef5cd84 100644
--- a/cmake/flags.cmake
+++ b/cmake/flags.cmake
@@ -3,12 +3,6 @@ include(CheckCXXCompilerFlag)
 include(CheckCCompilerFlag)
 include(CheckCXXSymbolExists)
 
-if(NOT CMAKE_BUILD_TYPE)
-    set(CMAKE_BUILD_TYPE "RelWithDebInfo" CACHE STRING 
-        "Choose the type of build, options are: Debug Release RelWithDebInfo MinSizeRel"
-        FORCE)
-endif()
-
 function(CheckCompilerCXX11Flag)
     if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
         if(${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 4.8)
diff --git a/cmake/FindPythonModule.cmake b/cmake/python_module.cmake
similarity index 72%
rename from cmake/FindPythonModule.cmake
rename to cmake/python_module.cmake
index 2eb3441428..1412b7f7f2 100644
--- a/cmake/FindPythonModule.cmake
+++ b/cmake/python_module.cmake
@@ -26,5 +26,18 @@ function(find_python_module module)
     if(NOT PY_${module_upper}_FOUND AND ${module}_FIND_REQUIRED)
         message(FATAL_ERROR "python module ${module} is not found")
     endif()
+
+    execute_process(COMMAND "${PYTHON_EXECUTABLE}" "-c"  # ask the interpreter for <module>.__version__
+        "import sys, ${module}; sys.stdout.write(${module}.__version__)"
+        OUTPUT_VARIABLE _${module}_version
+        RESULT_VARIABLE _${module}_status
+        ERROR_QUIET
+        OUTPUT_STRIP_TRAILING_WHITESPACE)
+    if(NOT _${module}_status)  # exit status 0: the import and the attribute lookup succeeded
+        set(PY_${module_upper}_VERSION "${_${module}_version}" CACHE STRING
+            "Version of Python module ${module}" FORCE)  # FORCE: refresh the cached version after a module upgrade
+    endif(NOT _${module}_status)
+
     set(PY_${module_upper}_FOUND ${PY_${module_upper}_FOUND} PARENT_SCOPE)
+    set(PY_${module_upper}_VERSION ${PY_${module_upper}_VERSION} PARENT_SCOPE)
 endfunction(find_python_module)
diff --git a/cmake/rdma.cmake b/cmake/rdma.cmake
index e9a4da79aa..9ff1a77cac 100644
--- a/cmake/rdma.cmake
+++ b/cmake/rdma.cmake
@@ -5,72 +5,76 @@
 # svn co https://svn.baidu.com/sys/ip/trunk/rdma/thirdparty rdma/
 # we use static output in svn repositories to avoid implict bugs from not standard runtime env.
 
-set(RDMA_ROOT $ENV{RDMA_ROOT} CACHE PATH "Folder contains RDMA sock library and thirdparty library")
+if(WITH_RDMA)
+  set(RDMA_ROOT $ENV{RDMA_ROOT} CACHE PATH "Folder contains RDMA sock library and thirdparty library")
 
-function(generate_rdma_links)
-  #redirect to current DIR to isolate the pollution from system runtime environment
-  #it can benifits unified control for different gcc environment. 
-  #e.g, by default gcc48 did not refer /usr/lib64 which could contain low version
-  #runtime libraries that will crash process while loading it. That redirect trick
-  #can fix it.
-  execute_process(
-    COMMAND mkdir -p librdma
-    COMMAND ln -s -f /usr/lib64/libibverbs.so.1.0.0 librdma/libibverbs.so.1
-    COMMAND ln -s -f /usr/lib64/libibverbs.so.1.0.0 librdma/libibverbs.so
-    COMMAND ln -s -f /usr/lib64/librdmacm.so.1.0.0 librdma/librdmacm.so.1
-    COMMAND ln -s -f /usr/lib64/librdmacm.so.1.0.0 librdma/librdmacm.so 
-    WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
-  )
-endfunction(generate_rdma_links)
-
-
-#check and set headers
-find_path(RDMA_INC_SXISOCK sxi_sock.h PATHS ${RDMA_ROOT}/sockrdmav1/output/include)
-find_path(RDMA_INC_XIO libxio.h PATHS ${RDMA_ROOT}/thirdparty/output/accelio)
-find_path(RDMA_INC_EVENT event2 PATHS ${RDMA_ROOT}/thirdparty/output/libevent)
-find_path(RDMA_INC_NUMA numa.h PATHS ${RDMA_ROOT}/thirdparty/output/libnuma)
-
-#check and set libs
-find_library(RDMA_LIB_SXISOCK NAMES sxisock PATHS ${RDMA_ROOT}/sockrdmav1/output)
-find_library(RDMA_LIB_XIO NAMES xio PATHS ${RDMA_ROOT}/thirdparty/output/accelio)
-find_library(RDMA_LIB_EVENT NAMES event PATHS ${RDMA_ROOT}/thirdparty/output/libevent)
-find_library(RDMA_LIB_EVENT_CORE NAMES event_core PATHS ${RDMA_ROOT}/thirdparty/output/libevent)
-find_library(RDMA_LIB_EVENT_EXTRA NAMES event_extra PATHS ${RDMA_ROOT}/thirdparty/output/libevent)
-find_library(RDMA_LIB_EVENT_PTHREADS NAMES event_pthreads PATHS ${RDMA_ROOT}/thirdparty/output/libevent)
-find_library(RDMA_LIB_NUMA NAMES numa PATHS ${RDMA_ROOT}/thirdparty/output/libnuma)
-
-if(
-    RDMA_INC_SXISOCK AND
-    RDMA_INC_XIO AND
-    RDMA_INC_EVENT AND
-    RDMA_INC_NUMA AND
-    RDMA_LIB_SXISOCK AND 
-    RDMA_LIB_XIO AND
-    RDMA_LIB_EVENT AND
-    RDMA_LIB_EVENT_CORE AND
-    RDMA_LIB_EVENT_EXTRA AND
-    RDMA_LIB_EVENT_PTHREADS AND
-    RDMA_LIB_NUMA
+  function(generate_rdma_links)
+    #redirect to current DIR to isolate the pollution from system runtime environment
+    #it can benifits unified control for different gcc environment. 
+    #e.g, by default gcc48 did not refer /usr/lib64 which could contain low version
+    #runtime libraries that will crash process while loading it. That redirect trick
+    #can fix it.
+    execute_process(
+      COMMAND mkdir -p librdma
+      COMMAND ln -s -f /usr/lib64/libibverbs.so.1.0.0 librdma/libibverbs.so.1
+      COMMAND ln -s -f /usr/lib64/libibverbs.so.1.0.0 librdma/libibverbs.so
+      COMMAND ln -s -f /usr/lib64/librdmacm.so.1.0.0 librdma/librdmacm.so.1
+      COMMAND ln -s -f /usr/lib64/librdmacm.so.1.0.0 librdma/librdmacm.so 
+      WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
     )
+  endfunction(generate_rdma_links)
 
-  set(RDMA_INC_DIR 
-    ${RDMA_INC_SXISOCK} 
-    ${RDMA_INC_XIO}
-    ${RDMA_INC_EVENT}
-    ${RDMA_INC_NUMA})
-  set(RDMA_LIBS  
-    ${RDMA_LIB_SXISOCK} 
-    ${RDMA_LIB_XIO} 
-    ${RDMA_LIB_EVENT} 
-    ${RDMA_LIB_EVENT_CORE} 
-    ${RDMA_LIB_EVENT_EXTRA} 
-    ${RDMA_LIB_EVENT_PTHREADS} 
-    ${RDMA_LIB_NUMA} 
-    )
-  set(RDMA_LD_FLAGS "-L./librdma -libverbs -lrdmacm -Xlinker -rpath ./librdma")
-  return()
-endif()
+  #check and set headers
+  find_path(RDMA_INC_SXISOCK sxi_sock.h PATHS ${RDMA_ROOT}/sockrdmav1/output/include)
+  find_path(RDMA_INC_XIO libxio.h PATHS ${RDMA_ROOT}/thirdparty/output/accelio)
+  find_path(RDMA_INC_EVENT event2 PATHS ${RDMA_ROOT}/thirdparty/output/libevent)
+  find_path(RDMA_INC_NUMA numa.h PATHS ${RDMA_ROOT}/thirdparty/output/libnuma)
+
+  #check and set libs
+  find_library(RDMA_LIB_SXISOCK NAMES sxisock PATHS ${RDMA_ROOT}/sockrdmav1/output)
+  find_library(RDMA_LIB_XIO NAMES xio PATHS ${RDMA_ROOT}/thirdparty/output/accelio)
+  find_library(RDMA_LIB_EVENT NAMES event PATHS ${RDMA_ROOT}/thirdparty/output/libevent)
+  find_library(RDMA_LIB_EVENT_CORE NAMES event_core PATHS ${RDMA_ROOT}/thirdparty/output/libevent)
+  find_library(RDMA_LIB_EVENT_EXTRA NAMES event_extra PATHS ${RDMA_ROOT}/thirdparty/output/libevent)
+  find_library(RDMA_LIB_EVENT_PTHREADS NAMES event_pthreads PATHS ${RDMA_ROOT}/thirdparty/output/libevent)
+  find_library(RDMA_LIB_NUMA NAMES numa PATHS ${RDMA_ROOT}/thirdparty/output/libnuma)
 
-#if this module is not called, RDMA_INC_DIR RDMA_LIBS will be null, so top module always refer this variable
+  if(
+      RDMA_INC_SXISOCK AND
+      RDMA_INC_XIO AND
+      RDMA_INC_EVENT AND
+      RDMA_INC_NUMA AND
+      RDMA_LIB_SXISOCK AND 
+      RDMA_LIB_XIO AND
+      RDMA_LIB_EVENT AND
+      RDMA_LIB_EVENT_CORE AND
+      RDMA_LIB_EVENT_EXTRA AND
+      RDMA_LIB_EVENT_PTHREADS AND
+      RDMA_LIB_NUMA
+      )
 
-message(FATAL_ERROR, "RDMA libraries are not found, try to set RDMA_ROOT or check all related libraries.")
+    set(RDMA_INC_DIR 
+      ${RDMA_INC_SXISOCK} 
+      ${RDMA_INC_XIO}
+      ${RDMA_INC_EVENT}
+      ${RDMA_INC_NUMA})
+    set(RDMA_LIBS  
+      ${RDMA_LIB_SXISOCK} 
+      ${RDMA_LIB_XIO} 
+      ${RDMA_LIB_EVENT} 
+      ${RDMA_LIB_EVENT_CORE} 
+      ${RDMA_LIB_EVENT_EXTRA} 
+      ${RDMA_LIB_EVENT_PTHREADS} 
+      ${RDMA_LIB_NUMA} 
+      )
+    set(RDMA_LD_FLAGS "-L./librdma -libverbs -lrdmacm -Xlinker -rpath ./librdma")
+    include_directories("${RDMA_INC_DIR}")
+  else()
+    # WITH_RDMA is on but at least one of the headers/libraries searched above is missing: stop configuring.
+    message(FATAL_ERROR "RDMA libraries are not found, try to set RDMA_ROOT or check all related libraries.")
+  endif()
+else(WITH_RDMA)
+  set(RDMA_LIBS "")
+  set(RDMA_LD_FLAGS "")
+  add_definitions(-DPADDLE_DISABLE_RDMA)
+endif(WITH_RDMA)
diff --git a/cmake/FindAVX.cmake b/cmake/simd.cmake
similarity index 100%
rename from cmake/FindAVX.cmake
rename to cmake/simd.cmake
diff --git a/cmake/swig.cmake b/cmake/swig.cmake
deleted file mode 100644
index 97e87aa947..0000000000
--- a/cmake/swig.cmake
+++ /dev/null
@@ -1,15 +0,0 @@
-function(generate_python_api target_name)
-    add_custom_command(OUTPUT ${PROJ_ROOT}/paddle/py_paddle/swig_paddle.py
-                              ${PROJ_ROOT}/paddle/Paddle_wrap.cxx
-                              ${PROJ_ROOT}/paddle/Paddle_wrap.h
-        COMMAND swig -python -c++ -outcurrentdir -I../ api/Paddle.swig
-                && mv ${PROJ_ROOT}/paddle/swig_paddle.py ${PROJ_ROOT}/paddle/py_paddle/swig_paddle.py
-        DEPENDS ${PROJ_ROOT}/paddle/api/Paddle.swig
-                ${PROJ_ROOT}/paddle/api/PaddleAPI.h
-        WORKING_DIRECTORY ${PROJ_ROOT}/paddle
-        COMMENT "Generate Python API from swig")
-    add_custom_target(${target_name} ALL DEPENDS
-                ${PROJ_ROOT}/paddle/Paddle_wrap.cxx
-                ${PROJ_ROOT}/paddle/Paddle_wrap.h
-                ${PROJ_ROOT}/paddle/py_paddle/swig_paddle.py)
-endfunction(generate_python_api)
diff --git a/cmake/system.cmake b/cmake/system.cmake
new file mode 100644
index 0000000000..788db404eb
--- /dev/null
+++ b/cmake/system.cmake
@@ -0,0 +1,53 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+# http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+IF(WIN32)  # HOST_SYSTEM: win32, macosx, centos, debian or ubuntu
+    SET(HOST_SYSTEM "win32")
+ELSE(WIN32)
+    IF(APPLE)
+        EXECUTE_PROCESS(COMMAND sw_vers -productVersion OUTPUT_VARIABLE MACOSX_VERSION OUTPUT_STRIP_TRAILING_WHITESPACE)
+        STRING(REGEX MATCH "[0-9]+\\.[0-9]+" VERSION "${MACOSX_VERSION}")  # keep <major>.<minor>; the dot is matched literally
+        SET(MACOS_VERSION ${VERSION})
+        SET(HOST_SYSTEM "macosx")
+    ELSE(APPLE)
+        IF(EXISTS "/etc/issue")
+            FILE(READ "/etc/issue" LINUX_ISSUE)
+            IF(LINUX_ISSUE MATCHES "CentOS")
+                SET(HOST_SYSTEM "centos")
+            ELSEIF(LINUX_ISSUE MATCHES "Debian")
+                SET(HOST_SYSTEM "debian")
+            ELSEIF(LINUX_ISSUE MATCHES "Ubuntu")
+                SET(HOST_SYSTEM "ubuntu")
+            ENDIF()  # NOTE: any other distribution leaves HOST_SYSTEM unset
+        ENDIF(EXISTS "/etc/issue")
+    ENDIF(APPLE)
+ENDIF(WIN32)
+
+# query number of logical cores
+CMAKE_HOST_SYSTEM_INFORMATION(RESULT CPU_CORES QUERY NUMBER_OF_LOGICAL_CORES)
+
+MARK_AS_ADVANCED(HOST_SYSTEM CPU_CORES)
+
+MESSAGE(STATUS "Found Paddle host system: ${HOST_SYSTEM}")
+MESSAGE(STATUS "Found Paddle host system's CPU: ${CPU_CORES} cores")
+
+# Logging options spliced into the ExternalProject_Add() calls of the external dependencies.
+SET(EXTERNAL_PROJECT_LOG_ARGS
+    LOG_DOWNLOAD    0     # 0: the download step is not wrapped in a logging script
+    LOG_UPDATE      1     # Wrap update in script to log output
+    LOG_CONFIGURE   1     # Wrap configure in script to log output
+    LOG_BUILD       1     # Wrap build in script to log output
+    LOG_TEST        1     # Wrap test in script to log output
+    LOG_INSTALL     1     # Wrap install in script to log output
+)
diff --git a/cmake/util.cmake b/cmake/util.cmake
index 11641f6064..24ad5c815c 100644
--- a/cmake/util.cmake
+++ b/cmake/util.cmake
@@ -24,7 +24,7 @@ function(target_circle_link_libraries TARGET_NAME)
                 list(APPEND libsInArgn ${arg})
             endif()
         endforeach()
-        if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
+        if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang" OR "${CMAKE_CXX_COMPILER_ID}" STREQUAL "AppleClang")
             list(APPEND LIBS "-undefined dynamic_lookup")
         endif()
         list(REVERSE libsInArgn)
@@ -65,7 +65,7 @@ endmacro()
 # link_paddle_exe
 # add paddle library for a paddle executable, such as trainer, pserver.
 #
-# It will handle WITH_PYTHON/WITH_GLOG etc.
+# It will handle WITH_PYTHON etc.
 function(link_paddle_exe TARGET_NAME)
     if(WITH_RDMA)
         generate_rdma_links()
@@ -81,21 +81,10 @@ function(link_paddle_exe TARGET_NAME)
         set(METRIC_LIBS "")
     endif()
 
-    if(PADDLE_WITH_INTERNAL)
-        set(INTERAL_LIBS paddle_internal_gserver paddle_internal_parameter)
-        target_circle_link_libraries(${TARGET_NAME}
-            ARCHIVE_START
-            paddle_internal_gserver
-            paddle_internal_owlqn
-            ARCHIVE_END
-            paddle_internal_parameter)
-    else()
-        set(INTERAL_LIBS "")
-    endif()
-
     target_circle_link_libraries(${TARGET_NAME}
         ARCHIVE_START
         paddle_gserver
+        paddle_function
         ${METRIC_LIBS}
         ARCHIVE_END
         paddle_pserver
@@ -107,39 +96,23 @@ function(link_paddle_exe TARGET_NAME)
         paddle_proto
         paddle_cuda
         ${METRIC_LIBS}
-        ${PROTOBUF_LIBRARY}
+        ${EXTERNAL_LIBS}
         ${CMAKE_THREAD_LIBS_INIT}
-        ${CBLAS_LIBS}
-        ${ZLIB_LIBRARIES}
-        ${INTERAL_LIBS}
-        ${CMAKE_DL_LIBS})
+        ${CMAKE_DL_LIBS}
+        ${RDMA_LD_FLAGS}
+        ${RDMA_LIBS})
 
-    if(WITH_RDMA)
-        target_link_libraries(${TARGET_NAME}
-            ${RDMA_LD_FLAGS}
-            ${RDMA_LIBS})
-    endif()
-    
     if(WITH_PYTHON)
         target_link_libraries(${TARGET_NAME}
-            ${PYTHON_LIBRARIES})
-    endif()
-
-    if(WITH_GLOG)
-        target_link_libraries(${TARGET_NAME}
-            ${LIBGLOG_LIBRARY})
-    endif()
-
-    if(WITH_GFLAGS)
-        target_link_libraries(${TARGET_NAME}
-            ${GFLAGS_LIBRARIES})
+            ${PYTHON_LIBRARIES} util)
     endif()
 
     if(WITH_GPU)
-        if(NOT WITH_DSO OR WITH_METRIC) 
+        target_link_libraries(${TARGET_NAME} ${CUDA_CUDART_LIBRARY})
+        if(NOT WITH_DSO OR WITH_METRIC)
             target_link_libraries(${TARGET_NAME}
                 ${CUDNN_LIBRARY}
-                ${CUDA_curand_LIBRARY}) 
+                ${CUDA_curand_LIBRARY})
             CUDA_ADD_CUBLAS_TO_TARGET(${TARGET_NAME})
         endif()
 
@@ -149,10 +122,7 @@ function(link_paddle_exe TARGET_NAME)
         endif()
     endif()
 
-    if(NOT WITH_DSO)
-        target_link_libraries(${TARGET_NAME}
-            ${WARPCTC_LIBRARY})
-    endif()
+    add_dependencies(${TARGET_NAME} ${external_project_dependencies})
 endfunction()
 
 # link_paddle_test
@@ -161,8 +131,10 @@ endfunction()
 # Rest Arguemnts: not used.
 function(link_paddle_test TARGET_NAME)
     link_paddle_exe(${TARGET_NAME})
-    target_link_libraries(${TARGET_NAME} ${GTEST_MAIN_LIBRARIES}
-        ${GTEST_LIBRARIES})
+    target_link_libraries(${TARGET_NAME}
+                          paddle_test_main
+                          paddle_test_util
+                          ${GTEST_LIBRARIES})
 endfunction()
 
 # add_unittest_without_exec
@@ -206,5 +178,5 @@ function(create_resources res_file output)
     # Convert hex data for C compatibility
     string(REGEX REPLACE "([0-9a-f][0-9a-f])" "0x\\1," filedata ${filedata})
     # Append data to output file
-    file(APPEND ${output} "const unsigned char ${filename}[] = {${filedata}};\nconst unsigned ${filename}_size = sizeof(${filename});\n")
+    file(APPEND ${output} "const unsigned char ${filename}[] = {${filedata}0};\nconst unsigned ${filename}_size = sizeof(${filename});\n")
 endfunction()
diff --git a/cmake/version.cmake b/cmake/version.cmake
index a0518e07e8..ac1583a24c 100644
--- a/cmake/version.cmake
+++ b/cmake/version.cmake
@@ -21,4 +21,5 @@ while ("${PADDLE_VERSION}" STREQUAL "")
   endif()
 endwhile()
 
+add_definitions(-DPADDLE_VERSION=${PADDLE_VERSION})
 message(STATUS "Paddle version is ${PADDLE_VERSION}")
diff --git a/demo/gan/data/download_cifar.sh b/demo/gan/data/download_cifar.sh
index ae24ef2b7f..bbadc7c10c 100755
--- a/demo/gan/data/download_cifar.sh
+++ b/demo/gan/data/download_cifar.sh
@@ -1,3 +1,4 @@
+#!/bin/bash
 # Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/demo/gan/data/get_mnist_data.sh b/demo/gan/data/get_mnist_data.sh
old mode 100644
new mode 100755
diff --git a/demo/gan/gan_conf_image.py b/demo/gan/gan_conf_image.py
index f89a4e706c..c469227994 100644
--- a/demo/gan/gan_conf_image.py
+++ b/demo/gan/gan_conf_image.py
@@ -87,9 +87,9 @@ def conv_bn(input,
     print(imgSize, output_x, stride, filter_size, padding)
 
     if trans:
-        nameApx = "_conv"
-    else:
         nameApx = "_convt"
+    else:
+        nameApx = "_conv"
 
     if bn:
         conv = img_conv_layer(
diff --git a/demo/image_classification/data/download_cifar.sh b/demo/image_classification/data/download_cifar.sh
index 52e82d0d98..532178d627 100755
--- a/demo/image_classification/data/download_cifar.sh
+++ b/demo/image_classification/data/download_cifar.sh
@@ -1,3 +1,4 @@
+#!/bin/bash
 # Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/demo/image_classification/image_provider.py b/demo/image_classification/image_provider.py
index 87eed5eebd..6a315ff094 100644
--- a/demo/image_classification/image_provider.py
+++ b/demo/image_classification/image_provider.py
@@ -21,7 +21,7 @@ from paddle.trainer.PyDataProvider2 import *
 
 #
 # {'img_size': 32,
-# 'settings': <paddle.trainer.PyDataProviderWrapper.Cls instance at 0x7fea27cb6050>,
+# 'settings': a global object,
 # 'color': True,
 # 'mean_img_size': 32,
 # 'meta': './data/cifar-out/batches/batches.meta',
@@ -50,10 +50,10 @@ def hook(settings, img_size, mean_img_size, num_classes, color, meta, use_jpeg,
 
     settings.logger.info('Image size: %s', settings.img_size)
     settings.logger.info('Meta path: %s', settings.meta_path)
-    settings.input_types = [
-        dense_vector(settings.img_raw_size),  # image feature
-        integer_value(settings.num_classes)
-    ]  # labels
+    settings.input_types = {
+        'image': dense_vector(settings.img_raw_size),
+        'label': integer_value(settings.num_classes)
+    }
 
     settings.logger.info('DataProvider Initialization finished')
 
@@ -83,4 +83,7 @@ def processData(settings, file_list):
                         img, settings.img_mean, settings.img_size,
                         settings.is_train, settings.color)
                     label = data['labels'][i]
-                    yield img_feat.astype('float32'), int(label)
+                    yield {
+                        'image': img_feat.astype('float32'),
+                        'label': int(label)
+                    }
diff --git a/demo/image_classification/train.sh b/demo/image_classification/train.sh
index 6fc11caf1c..e45bd47ad5 100755
--- a/demo/image_classification/train.sh
+++ b/demo/image_classification/train.sh
@@ -27,5 +27,6 @@ paddle train \
 --num_passes=300 \
 --save_dir=$output \
 2>&1 | tee $log
+paddle usage -l $log -e $? -n "image_classification_train" >/dev/null 2>&1
 
 python -m paddle.utils.plotcurve -i $log > plot.png
diff --git a/demo/introduction/.gitignore b/demo/introduction/.gitignore
new file mode 100644
index 0000000000..c54f3f9480
--- /dev/null
+++ b/demo/introduction/.gitignore
@@ -0,0 +1,5 @@
+dataprovider.pyc
+empty.list
+train.log
+output
+train.list
diff --git a/demo/introduction/dataprovider.py b/demo/introduction/dataprovider.py
index 03c920cc34..5b48aad040 100644
--- a/demo/introduction/dataprovider.py
+++ b/demo/introduction/dataprovider.py
@@ -17,8 +17,10 @@ import random
 
 
 # define data types of input: 2 real numbers
-@provider(input_types=[dense_vector(1), dense_vector(1)], use_seq=False)
+@provider(
+    input_types={'x': dense_vector(1),
+                 'y': dense_vector(1)}, use_seq=False)
 def process(settings, input_file):
     for i in xrange(2000):
         x = random.random()
-        yield [x], [2 * x + 0.3]
+        yield {'x': [x], 'y': [2 * x + 0.3]}
diff --git a/demo/introduction/train.sh b/demo/introduction/train.sh
index b7bbb90ddd..2ce6446d7c 100755
--- a/demo/introduction/train.sh
+++ b/demo/introduction/train.sh
@@ -19,3 +19,4 @@ paddle train \
     --save_dir=./output \
     --num_passes=30 \
     2>&1 |tee 'train.log'
+paddle usage -l "train.log" -e $? -n "introduction" >/dev/null 2>&1
diff --git a/demo/introduction/trainer_config.py b/demo/introduction/trainer_config.py
index 41cebcf6e1..ecafe955f9 100644
--- a/demo/introduction/trainer_config.py
+++ b/demo/introduction/trainer_config.py
@@ -15,11 +15,8 @@
 from paddle.trainer_config_helpers import *
 
 # 1. read data. Suppose you saved above python code as dataprovider.py
-data_file = 'empty.list'
-with open(data_file, 'w') as f:
-    f.writelines(' ')
 define_py_data_sources2(
-    train_list=data_file,
+    train_list=['no_matter.txt'],
     test_list=None,
     module='dataprovider',
     obj='process',
diff --git a/demo/mnist/.gitignore b/demo/mnist/.gitignore
index 810910fd5c..8bd9837523 100644
--- a/demo/mnist/.gitignore
+++ b/demo/mnist/.gitignore
@@ -4,3 +4,4 @@ mnist_vgg_model
 plot.png
 train.log
 *pyc
+.ipynb_checkpoints
diff --git a/demo/mnist/api_train.py b/demo/mnist/api_train.py
new file mode 100644
index 0000000000..f301da382f
--- /dev/null
+++ b/demo/mnist/api_train.py
@@ -0,0 +1,205 @@
+"""
+A very basic example of how to use the current raw SWIG API to train an MNIST network.
+
+The current implementation uses raw SWIG, which means API calls are passed \
+directly to the C++ side of Paddle.
+
+The user API could be simpler and more carefully designed.
+"""
+import py_paddle.swig_paddle as api
+from py_paddle import DataProviderConverter
+import paddle.trainer.PyDataProvider2 as dp
+import numpy as np
+import random
+from mnist_util import read_from_mnist
+from paddle.trainer_config_helpers import *
+
+
+def optimizer_config():
+    settings(
+        learning_rate=1e-4,
+        learning_method=AdamOptimizer(),
+        batch_size=1000,
+        model_average=ModelAverage(average_window=0.5),
+        regularization=L2Regularization(rate=0.5))
+
+
+def network_config():
+    imgs = data_layer(name='pixel', size=784)
+    hidden1 = fc_layer(input=imgs, size=200)
+    hidden2 = fc_layer(input=hidden1, size=200)
+    inference = fc_layer(input=hidden2, size=10, act=SoftmaxActivation())
+    cost = classification_cost(
+        input=inference, label=data_layer(
+            name='label', size=10))
+    outputs(cost)
+
+
+def init_parameter(network):
+    assert isinstance(network, api.GradientMachine)
+    for each_param in network.getParameters():
+        assert isinstance(each_param, api.Parameter)
+        array_size = len(each_param)
+        array = np.random.uniform(-1.0, 1.0, array_size).astype('float32')
+        each_param.getBuf(api.PARAMETER_VALUE).copyFromNumpyArray(array)
+
+
+def generator_to_batch(generator, batch_size):
+    ret_val = list()
+    for each_item in generator:
+        ret_val.append(each_item)
+        if len(ret_val) == batch_size:
+            yield ret_val
+            ret_val = list()
+    if len(ret_val) != 0:
+        yield ret_val
+
+
+class BatchPool(object):
+    def __init__(self, generator, batch_size):
+        self.data = list(generator)
+        self.batch_size = batch_size
+
+    def __call__(self):
+        random.shuffle(self.data)
+        for offset in xrange(0, len(self.data), self.batch_size):
+            limit = min(offset + self.batch_size, len(self.data))
+            yield self.data[offset:limit]
+
+
+def input_order_converter(generator):
+    for each_item in generator:
+        yield each_item['pixel'], each_item['label']
+
+
+def main():
+    api.initPaddle("-use_gpu=false", "-trainer_count=4")  # use 4 cpu cores
+
+    # get enable_types for each optimizer.
+    # enable_types = [value, gradient, momentum, etc]
+    # For each optimizer(SGD, Adam), GradientMachine should enable different
+    # buffers.
+    opt_config_proto = parse_optimizer_config(optimizer_config)
+    opt_config = api.OptimizationConfig.createFromProto(opt_config_proto)
+    _temp_optimizer_ = api.ParameterOptimizer.create(opt_config)
+    enable_types = _temp_optimizer_.getParameterTypes()
+
+    # Create Simple Gradient Machine.
+    model_config = parse_network_config(network_config)
+    m = api.GradientMachine.createFromConfigProto(
+        model_config, api.CREATE_MODE_NORMAL, enable_types)
+
+    # This type check is not functionally needed; it only enables IDE type
+    # hints (e.g. in PyCharm).
+    assert isinstance(m, api.GradientMachine)
+
+    # Initialize Parameter by numpy.
+    init_parameter(network=m)
+
+    # Create Local Updater. Local means not run in cluster.
+    # For a cluster training, here we can change to createRemoteUpdater
+    # in future.
+    updater = api.ParameterUpdater.createLocalUpdater(opt_config)
+    assert isinstance(updater, api.ParameterUpdater)
+
+    # Initialize ParameterUpdater.
+    updater.init(m)
+
+    # DataProviderConverter is a utility that converts Python objects to Paddle
+    # C++ input. The input format is the same as Paddle's DataProvider.
+    converter = DataProviderConverter(
+        input_types=[dp.dense_vector(784), dp.integer_value(10)])
+
+    train_file = './data/raw_data/train'
+    test_file = './data/raw_data/t10k'
+
+    # start gradient machine.
+    # the gradient machine must be started before invoking forward/backward.
+    # not just for training, but also for inference.
+    m.start()
+
+    # evaluator can print error rate, etc. It is a C++ class.
+    batch_evaluator = m.makeEvaluator()
+    test_evaluator = m.makeEvaluator()
+
+    # Get Train Data.
+    # The training data is stored in a data pool. The current implementation
+    # does not care about memory or speed; it is just a very naive one.
+    train_data_generator = input_order_converter(read_from_mnist(train_file))
+    train_data = BatchPool(train_data_generator, 512)
+
+    # outArgs holds the neural network's forward result. It is not useful at
+    # this point; it is just passed to gradient_machine.forward.
+    outArgs = api.Arguments.createArguments(0)
+
+    for pass_id in xrange(2):  # we train 2 passes.
+        updater.startPass()
+
+        for batch_id, data_batch in enumerate(train_data()):
+            # data_batch is input images.
+            # here, for online learning, we could get data_batch from network.
+
+            # Start update one batch.
+            pass_type = updater.startBatch(len(data_batch))
+
+            # Start BatchEvaluator.
+            # batch_evaluator can be used between start/finish.
+            batch_evaluator.start()
+
+            # forwardBackward is a shortcut for forward and backward.
+            # It is sometimes faster than invoking forward/backward separately,
+            # because in GradientMachine, it may be async.
+            m.forwardBackward(converter(data_batch), outArgs, pass_type)
+
+            for each_param in m.getParameters():
+                updater.update(each_param)
+
+            # Get cost. We use numpy to calculate total cost for this batch.
+            cost_vec = outArgs.getSlotValue(0)
+            cost_vec = cost_vec.copyToNumpyMat()
+            cost = cost_vec.sum() / len(data_batch)
+
+            # Make the evaluator work.
+            m.eval(batch_evaluator)
+
+            # Print logs.
+            print 'Pass id', pass_id, 'Batch id', batch_id, 'with cost=', \
+                cost, batch_evaluator
+
+            batch_evaluator.finish()
+            # Finish batch.
+            #  * will clear gradient.
+            #  * ensure all values should be updated.
+            updater.finishBatch(cost)
+
+        # testing stage. use test data set to test current network.
+        updater.apply()
+        test_evaluator.start()
+        test_data_generator = input_order_converter(read_from_mnist(test_file))
+        for data_batch in generator_to_batch(test_data_generator, 512):
+            # in testing stage, only forward is needed.
+            m.forward(converter(data_batch), outArgs, api.PASS_TEST)
+            m.eval(test_evaluator)
+
+        # print error rate for test data set
+        print 'Pass', pass_id, ' test evaluator: ', test_evaluator
+        test_evaluator.finish()
+        updater.restore()
+
+        updater.catchUpWith()
+        params = m.getParameters()
+        for each_param in params:
+            assert isinstance(each_param, api.Parameter)
+            value = each_param.getBuf(api.PARAMETER_VALUE)
+            value = value.copyToNumpyArray()
+
+            # Here, we could save the parameter anywhere we want.
+            print each_param.getName(), value
+
+        updater.finishPass()
+
+    m.finish()
+
+
+if __name__ == '__main__':
+    main()
diff --git a/demo/mnist/mnist_provider.py b/demo/mnist/mnist_provider.py
index 6df4676da3..888cfef1e7 100644
--- a/demo/mnist/mnist_provider.py
+++ b/demo/mnist/mnist_provider.py
@@ -1,31 +1,12 @@
 from paddle.trainer.PyDataProvider2 import *
+from mnist_util import read_from_mnist
 
 
 # Define a py data provider
 @provider(
     input_types={'pixel': dense_vector(28 * 28),
-                 'label': integer_value(10)})
+                 'label': integer_value(10)},
+    cache=CacheType.CACHE_PASS_IN_MEM)
 def process(settings, filename):  # settings is not used currently.
-    imgf = filename + "-images-idx3-ubyte"
-    labelf = filename + "-labels-idx1-ubyte"
-    f = open(imgf, "rb")
-    l = open(labelf, "rb")
-
-    f.read(16)
-    l.read(8)
-
-    # Define number of samples for train/test
-    if "train" in filename:
-        n = 60000
-    else:
-        n = 10000
-
-    for i in range(n):
-        label = ord(l.read(1))
-        pixels = []
-        for j in range(28 * 28):
-            pixels.append(float(ord(f.read(1))) / 255.0)
-        yield {"pixel": pixels, 'label': label}
-
-    f.close()
-    l.close()
+    for each in read_from_mnist(filename):
+        yield each
diff --git a/demo/mnist/mnist_util.py b/demo/mnist/mnist_util.py
new file mode 100644
index 0000000000..3fd88ae7ed
--- /dev/null
+++ b/demo/mnist/mnist_util.py
@@ -0,0 +1,30 @@
+import numpy
+
+__all__ = ['read_from_mnist']
+
+
+def read_from_mnist(filename):
+    imgf = filename + "-images-idx3-ubyte"
+    labelf = filename + "-labels-idx1-ubyte"
+    f = open(imgf, "rb")
+    l = open(labelf, "rb")
+
+    f.read(16)
+    l.read(8)
+
+    # Define number of samples for train/test
+    if "train" in filename:
+        n = 60000
+    else:
+        n = 10000
+
+    images = numpy.fromfile(
+        f, 'ubyte', count=n * 28 * 28).reshape((n, 28 * 28)).astype('float32')
+    images = images / 255.0 * 2.0 - 1.0
+    labels = numpy.fromfile(l, 'ubyte', count=n).astype("int")
+
+    for i in xrange(n):
+        yield {"pixel": images[i, :], 'label': labels[i]}
+
+    f.close()
+    l.close()
diff --git a/demo/mnist/train.sh b/demo/mnist/train.sh
index da90cd749a..ca2b1ad9eb 100755
--- a/demo/mnist/train.sh
+++ b/demo/mnist/train.sh
@@ -27,5 +27,6 @@ paddle train \
 --num_passes=100 \
 --save_dir=$output \
 2>&1 | tee $log
+paddle usage -l $log -e $? -n "mnist_train" >/dev/null 2>&1
 
 python -m paddle.utils.plotcurve -i $log > plot.png
diff --git a/demo/model_zoo/embedding/pre_DictAndModel.sh b/demo/model_zoo/embedding/pre_DictAndModel.sh
index f97ef26107..f61c65a935 100755
--- a/demo/model_zoo/embedding/pre_DictAndModel.sh
+++ b/demo/model_zoo/embedding/pre_DictAndModel.sh
@@ -14,9 +14,19 @@
 # limitations under the License.
 set -e
 set -x
+BASE_URL='http://paddlepaddle.cdn.bcebos.com/model_zoo/embedding'
 
-# download the dictionary and pretrained model 
-for file in baidu.dict model_32.emb model_64.emb model_128.emb model_256.emb
-do 
-  wget http://paddlepaddle.bj.bcebos.com/model_zoo/embedding/$file
+DOWNLOAD_ITEMS=(baidu.dict model_32.emb model_64.emb model_128.emb model_256.emb)
+ITEM_MD5=(fa03a12321eaab6c30a8fcc9442eaea3
+          f88c8325ee6da6187f1080e8fe66c1cd
+          927cf70f27f860aff1a5703ebf7f1584
+	  a52e43655cd25d279777ed509a1ae27b
+	  b92c67fe9ff70fea53596080e351ac80)
+
+for ((i=0; i<${#ITEM_MD5[@]}; i++))
+do
+  FILENAME=${DOWNLOAD_ITEMS[${i}]}
+  REAL_MD5=`wget ${BASE_URL}/${FILENAME} -O - | tee ${FILENAME} | md5sum | cut -d ' ' -f 1`
+  EXPECTED_MD5=${ITEM_MD5[${i}]}
+  [ "${EXPECTED_MD5}" = "${REAL_MD5}" ]
 done
diff --git a/demo/quick_start/.gitignore b/demo/quick_start/.gitignore
index d6bc73105b..f71662563f 100644
--- a/demo/quick_start/.gitignore
+++ b/demo/quick_start/.gitignore
@@ -8,6 +8,8 @@ data/test.list
 data/test.txt
 data/train.list
 data/train.txt
+data/pred.list
+data/pred.txt
 dataprovider_copy_1.py
 train.log
 output
diff --git a/demo/quick_start/api_predict.py b/demo/quick_start/api_predict.py
new file mode 100755
index 0000000000..9bdffe1006
--- /dev/null
+++ b/demo/quick_start/api_predict.py
@@ -0,0 +1,147 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os, sys
+import numpy as np
+from optparse import OptionParser
+from py_paddle import swig_paddle, DataProviderConverter
+from paddle.trainer.PyDataProvider2 import sparse_binary_vector
+from paddle.trainer.config_parser import parse_config
+"""
+Usage: run following command to show help message.
+  python api_predict.py -h
+"""
+
+
+class QuickStartPrediction():
+    def __init__(self, train_conf, dict_file, model_dir=None, label_file=None):
+        """
+        train_conf: trainer configure.
+        dict_file: word dictionary file name.
+        model_dir: directory of model.
+        """
+        self.train_conf = train_conf
+        self.dict_file = dict_file
+        self.word_dict = {}
+        self.dict_dim = self.load_dict()
+        self.model_dir = model_dir
+        if model_dir is None:
+            self.model_dir = os.path.dirname(train_conf)
+
+        self.label = None
+        if label_file is not None:
+            self.load_label(label_file)
+
+        conf = parse_config(train_conf, "is_predict=1")
+        self.network = swig_paddle.GradientMachine.createFromConfigProto(
+            conf.model_config)
+        self.network.loadParameters(self.model_dir)
+        input_types = [sparse_binary_vector(self.dict_dim)]
+        self.converter = DataProviderConverter(input_types)
+
+    def load_dict(self):
+        """
+        Load dictionary from self.dict_file.
+        """
+        for line_count, line in enumerate(open(self.dict_file, 'r')):
+            self.word_dict[line.strip().split('\t')[0]] = line_count
+        return len(self.word_dict)
+
+    def load_label(self, label_file):
+        """
+        Load label.
+        """
+        self.label = {}
+        for v in open(label_file, 'r'):
+            self.label[int(v.split('\t')[1])] = v.split('\t')[0]
+
+    def get_index(self, data):
+        """
+        transform word into integer index according to the dictionary.
+        """
+        words = data.strip().split()
+        word_slot = [self.word_dict[w] for w in words if w in self.word_dict]
+        return word_slot
+
+    def batch_predict(self, data_batch):
+        input = self.converter(data_batch)
+        output = self.network.forwardTest(input)
+        prob = output[0]["id"].tolist()
+        print("predicting labels is:")
+        print prob
+
+
+def option_parser():
+    usage = "python predict.py -n config -w model_dir -d dictionary -i input_file "
+    parser = OptionParser(usage="usage: %s [options]" % usage)
+    parser.add_option(
+        "-n",
+        "--tconf",
+        action="store",
+        dest="train_conf",
+        help="network config")
+    parser.add_option(
+        "-d",
+        "--dict",
+        action="store",
+        dest="dict_file",
+        help="dictionary file")
+    parser.add_option(
+        "-b",
+        "--label",
+        action="store",
+        dest="label",
+        default=None,
+        help="dictionary file")
+    parser.add_option(
+        "-c",
+        "--batch_size",
+        type="int",
+        action="store",
+        dest="batch_size",
+        default=1,
+        help="the batch size for prediction")
+    parser.add_option(
+        "-w",
+        "--model",
+        action="store",
+        dest="model_path",
+        default=None,
+        help="model path")
+    return parser.parse_args()
+
+
+def main():
+    options, args = option_parser()
+    train_conf = options.train_conf
+    batch_size = options.batch_size
+    dict_file = options.dict_file
+    model_path = options.model_path
+    label = options.label
+    swig_paddle.initPaddle("--use_gpu=0")
+    predict = QuickStartPrediction(train_conf, dict_file, model_path, label)
+
+    batch = []
+    labels = []
+    for line in sys.stdin:
+        [label, text] = line.split("\t")
+        labels.append(int(label))
+        batch.append([predict.get_index(text)])
+    print("labels is:")
+    print labels
+    predict.batch_predict(batch)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/demo/quick_start/api_predict.sh b/demo/quick_start/api_predict.sh
new file mode 100755
index 0000000000..4d9aa9e885
--- /dev/null
+++ b/demo/quick_start/api_predict.sh
@@ -0,0 +1,30 @@
+#!/bin/bash
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+set -e
+
+# Note: the default model is pass-00001; you should make sure the model path
+# exists or change the model path.
+# Only tested on trainer_config.lr.py.
+model=output/model/pass-00001/
+config=trainer_config.lr.py
+label=data/labels.list
+dict=data/dict.txt
+batch_size=20
+head -n$batch_size data/test.txt | python api_predict.py \
+     --tconf=$config\
+     --model=$model \
+     --label=$label \
+     --dict=$dict \
+     --batch_size=$batch_size
diff --git a/demo/quick_start/cluster/cluster_train.sh b/demo/quick_start/cluster/cluster_train.sh
new file mode 100755
index 0000000000..a7b1f01064
--- /dev/null
+++ b/demo/quick_start/cluster/cluster_train.sh
@@ -0,0 +1,45 @@
+#!/bin/bash
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+set -e
+
+# pserver.sh should be run before running this script.
+bin_dir=$(cd `dirname $0`; pwd)
+home_dir=$(cd "${bin_dir}/.."; pwd)
+source "$bin_dir/env.sh"
+
+model_dir="$bin_dir/output"
+log_file="$bin_dir/train.log"
+
+pushd "$home_dir"
+cfg=trainer_config.lr.py
+paddle train \
+  --start_pserver=false \
+  --config=$cfg \
+  --save_dir=${model_dir} \
+  --trainer_count=4 \
+  --local=0 \
+  --log_period=100 \
+  --num_passes=15 \
+  --use_gpu=false \
+  --show_parameter_stats_period=100 \
+  --test_all_data_in_one_period=1 \
+  --num_gradient_servers=1 \
+  --nics=`get_nics` \
+  --port=7164 \
+  --ports_num=1 \
+  --pservers="127.0.0.1" \
+  --comment="paddle_trainer" \
+  2>&1 | tee "$log_file"
+popd
diff --git a/demo/quick_start/cluster/env.sh b/demo/quick_start/cluster/env.sh
new file mode 100644
index 0000000000..a404993835
--- /dev/null
+++ b/demo/quick_start/cluster/env.sh
@@ -0,0 +1,28 @@
+#!/bin/bash
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+set -e
+
+function get_nics() {
+  machine=`uname -s`
+  local nics=""
+  if [ "$machine" == "Linux" ]; then
+    nics="lo"
+  elif [ "$machine" == "Darwin" ]; then
+    nics="lo0"
+  else
+    nics="unsupport"
+  fi
+  echo $nics
+}
diff --git a/demo/quick_start/cluster/pserver.sh b/demo/quick_start/cluster/pserver.sh
new file mode 100755
index 0000000000..b187c1d9b9
--- /dev/null
+++ b/demo/quick_start/cluster/pserver.sh
@@ -0,0 +1,26 @@
+#!/bin/bash
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+set -e
+bin_dir=$(cd `dirname $0`; pwd)
+source "$bin_dir/env.sh"
+
+paddle pserver \
+  --nics=`get_nics` \
+  --port=7164 \
+  --ports_num=1 \
+  --ports_num_for_sparse=1 \
+  --num_gradient_servers=1 \
+  --comment="paddle_pserver" \
+  2>&1 | tee 'pserver.log'
diff --git a/demo/quick_start/dataprovider_bow.py b/demo/quick_start/dataprovider_bow.py
index 8e651d77bf..2745495586 100644
--- a/demo/quick_start/dataprovider_bow.py
+++ b/demo/quick_start/dataprovider_bow.py
@@ -31,16 +31,16 @@ def initializer(settings, dictionary, **kwargs):
 
     # setting.input_types specifies what the data types the data provider
     # generates.
-    settings.input_types = [
+    settings.input_types = {
         # The first input is a sparse_binary_vector,
         # which means each dimension of the vector is either 0 or 1. It is the
         # bag-of-words (BOW) representation of the texts.
-        sparse_binary_vector(len(dictionary)),
+        'word': sparse_binary_vector(len(dictionary)),
         # The second input is an integer. It represents the category id of the
         # sample. 2 means there are two labels in the dataset.
         # (1 for positive and 0 for negative)
-        integer_value(2)
-    ]
+        'label': integer_value(2)
+    }
 
 
 # Delaring a data provider. It has an initializer 'data_initialzer'.
@@ -67,12 +67,12 @@ def process(settings, file_name):
             # Return the features for the current comment. The first is a list
             # of ids representing a 0-1 binary sparse vector of the text,
             # the second is the integer id of the label.
-            yield word_vector, int(label)
+            yield {'word': word_vector, 'label': int(label)}
 
 
 def predict_initializer(settings, dictionary, **kwargs):
     settings.word_dict = dictionary
-    settings.input_types = [sparse_binary_vector(len(dictionary))]
+    settings.input_types = {'word': sparse_binary_vector(len(dictionary))}
 
 
 # Declaring a data provider for prediction. The difference with process
@@ -83,4 +83,4 @@ def process_predict(settings, file_name):
         for line in f:
             comment = line.strip().split()
             word_vector = [settings.word_dict.get(w, UNK_IDX) for w in comment]
-            yield word_vector
+            yield {'word': word_vector}
diff --git a/demo/quick_start/dataprovider_emb.py b/demo/quick_start/dataprovider_emb.py
index b010253a8a..ddfa3ce9b7 100755
--- a/demo/quick_start/dataprovider_emb.py
+++ b/demo/quick_start/dataprovider_emb.py
@@ -19,13 +19,13 @@ UNK_IDX = 0
 
 def initializer(settings, dictionary, **kwargs):
     settings.word_dict = dictionary
-    settings.input_types = [
+    settings.input_types = {
         # Define the type of the first input as sequence of integer.
         # The value of the integers range from 0 to len(dictrionary)-1
-        integer_value_sequence(len(dictionary)),
+        'word': integer_value_sequence(len(dictionary)),
         # Define the second input for label id
-        integer_value(2)
-    ]
+        'label': integer_value(2)
+    }
 
 
 @provider(init_hook=initializer, cache=CacheType.CACHE_PASS_IN_MEM)
@@ -35,15 +35,12 @@ def process(settings, file_name):
             label, comment = line.strip().split('\t')
             words = comment.split()
             word_slot = [settings.word_dict.get(w, UNK_IDX) for w in words]
-            yield word_slot, int(label)
+            yield {'word': word_slot, 'label': int(label)}
 
 
 def predict_initializer(settings, dictionary, **kwargs):
     settings.word_dict = dictionary
-    settings.input_types = [
-        integer_value(
-            len(dictionary), seq_type=SequenceType.SEQUENCE)
-    ]
+    settings.input_types = {'word': integer_value_sequence(len(dictionary))}
 
 
 @provider(init_hook=predict_initializer, should_shuffle=False)
@@ -52,4 +49,4 @@ def process_predict(settings, file_name):
         for line in f:
             comment = line.strip().split()
             word_slot = [settings.word_dict.get(w, UNK_IDX) for w in comment]
-            yield word_slot
+            yield {'word': word_slot}
diff --git a/demo/quick_start/predict.sh b/demo/quick_start/predict.sh
index f02e5038e9..e47c2dd01f 100755
--- a/demo/quick_start/predict.sh
+++ b/demo/quick_start/predict.sh
@@ -26,5 +26,7 @@ paddle train \
     --init_model_path=$model \
     --config_args=is_predict=1 \
     --predict_output_dir=. \
+2>&1 | tee 'predict.log'
+paddle usage -l 'predict.log' -e $? -n "quick_start_predict_${cfg}" >/dev/null 2>&1
 
 mv rank-00000 result.txt
diff --git a/demo/quick_start/train.sh b/demo/quick_start/train.sh
index e3595fce75..01697fed48 100755
--- a/demo/quick_start/train.sh
+++ b/demo/quick_start/train.sh
@@ -31,3 +31,4 @@ paddle train \
   --show_parameter_stats_period=100 \
   --test_all_data_in_one_period=1 \
   2>&1 | tee 'train.log'
+paddle usage -l "train.log" -e $? -n "quick_start_${cfg}" >/dev/null 2>&1
diff --git a/demo/recommendation/common_utils.py b/demo/recommendation/common_utils.py
index d4fbdad1d7..c20c652866 100755
--- a/demo/recommendation/common_utils.py
+++ b/demo/recommendation/common_utils.py
@@ -17,13 +17,14 @@ from paddle.trainer.PyDataProvider2 import *
 def meta_to_header(meta, name):
     metas = meta[name]['__meta__']['raw_meta']
     for each_meta in metas:
+        slot_name = each_meta.get('name', '%s_id' % name)
         if each_meta['type'] == 'id':
-            yield integer_value(each_meta['max'])
+            yield slot_name, integer_value(each_meta['max'])
         elif each_meta['type'] == 'embedding':
             is_seq = each_meta['seq'] == 'sequence'
-            yield integer_value(
+            yield slot_name, integer_value(
                 len(each_meta['dict']),
                 seq_type=SequenceType.SEQUENCE
                 if is_seq else SequenceType.NO_SEQUENCE)
         elif each_meta['type'] == 'one_hot_dense':
-            yield dense_vector(len(each_meta['dict']))
+            yield slot_name, dense_vector(len(each_meta['dict']))
diff --git a/demo/recommendation/dataprovider.py b/demo/recommendation/dataprovider.py
index 80c62d7561..c4ff96d80e 100755
--- a/demo/recommendation/dataprovider.py
+++ b/demo/recommendation/dataprovider.py
@@ -16,6 +16,14 @@ from paddle.trainer.PyDataProvider2 import *
 import common_utils  # parse
 
 
+def __list_to_map__(lst):
+    ret_val = dict()
+    for each in lst:
+        k, v = each
+        ret_val[k] = v
+    return ret_val
+
+
 def hook(settings, meta, **kwargs):
     """
     Init hook is invoked before process data. It will set obj.slots and store
@@ -34,12 +42,16 @@ def hook(settings, meta, **kwargs):
     #    second part is user features.
     #    final part is rating score.
     # header is a list of [USE_SEQ_OR_NOT?, SlotType]
-    headers = list(common_utils.meta_to_header(meta, 'movie'))
-    headers.extend(list(common_utils.meta_to_header(meta, 'user')))
-    headers.append(dense_vector(1))  # Score
+    movie_headers = list(common_utils.meta_to_header(meta, 'movie'))
+    settings.movie_names = [h[0] for h in movie_headers]
+    headers = movie_headers
+    user_headers = list(common_utils.meta_to_header(meta, 'user'))
+    settings.user_names = [h[0] for h in user_headers]
+    headers.extend(user_headers)
+    headers.append(("rating", dense_vector(1)))  # Score
 
     # slot types.
-    settings.input_types = headers
+    settings.input_types = __list_to_map__(headers)
     settings.meta = meta
 
 
@@ -57,20 +69,20 @@ def process(settings, filename):
             movie_meta = settings.meta['movie'][movie_id]
             user_meta = settings.meta['user'][user_id]
 
-            outputs = [movie_id - 1]
+            outputs = [('movie_id', movie_id - 1)]
 
             # Then add movie features
-            for each_meta in movie_meta:
-                outputs.append(each_meta)
+            for i, each_meta in enumerate(movie_meta):
+                outputs.append((settings.movie_names[i + 1], each_meta))
 
             # Then add user id.
-            outputs.append(user_id - 1)
+            outputs.append(('user_id', user_id - 1))
 
             # Then add user features.
-            for each_meta in user_meta:
-                outputs.append(each_meta)
+            for i, each_meta in enumerate(user_meta):
+                outputs.append((settings.user_names[i + 1], each_meta))
 
             # Finally, add score
-            outputs.append([score])
+            outputs.append(('rating', [score]))
             # Return data to paddle
-            yield outputs
+            yield __list_to_map__(outputs)
diff --git a/demo/recommendation/evaluate.py b/demo/recommendation/evaluate.py
new file mode 100755
index 0000000000..3afa7a1e9d
--- /dev/null
+++ b/demo/recommendation/evaluate.py
@@ -0,0 +1,37 @@
+#!/usr/bin/python
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import sys
+import re
+import math
+
+
+def get_best_pass(log_filename):
+    with open(log_filename, 'r') as f:
+        text = f.read()
+        pattern = re.compile('Test.*? cost=([0-9]+\.[0-9]+).*?pass-([0-9]+)',
+                             re.S)
+        results = re.findall(pattern, text)
+        sorted_results = sorted(results, key=lambda result: float(result[0]))
+        return sorted_results[0]
+
+
+log_filename = sys.argv[1]
+log = get_best_pass(log_filename)
+predict_error = math.sqrt(float(log[0])) / 2
+print 'Best pass is %s, error is %s, which means predict get error as %f' % (
+    log[1], log[0], predict_error)
+
+evaluate_pass = "output/pass-%s" % log[1]
+print "evaluating from pass %s" % evaluate_pass
diff --git a/demo/recommendation/prediction.py b/demo/recommendation/prediction.py
index 191120188e..8ad993eab3 100755
--- a/demo/recommendation/prediction.py
+++ b/demo/recommendation/prediction.py
@@ -34,8 +34,8 @@ if __name__ == '__main__':
     network.loadParameters(model_path)
     with open('./data/meta.bin', 'rb') as f:
         meta = pickle.load(f)
-        headers = list(meta_to_header(meta, 'movie'))
-        headers.extend(list(meta_to_header(meta, 'user')))
+        headers = [h[1] for h in meta_to_header(meta, 'movie')]
+        headers.extend([h[1] for h in meta_to_header(meta, 'user')])
         cvt = DataProviderConverter(headers)
         while True:
             movie_id = int(raw_input("Input movie_id: "))
diff --git a/demo/recommendation/preprocess.sh b/demo/recommendation/preprocess.sh
index e121e47019..eeb81ce3cb 100755
--- a/demo/recommendation/preprocess.sh
+++ b/demo/recommendation/preprocess.sh
@@ -14,6 +14,15 @@
 # limitations under the License.
 set -e
 
+UNAME_STR=`uname`
+
+if [[ ${UNAME_STR} == 'Linux' ]]; then
+	SHUF_PROG='shuf'
+else
+	SHUF_PROG='gshuf'
+fi
+
+
 cd "$(dirname "$0")"
 delimiter='::'
 dir=ml-1m
@@ -25,7 +34,7 @@ python meta_generator.py $dir meta.bin --config=meta_config.json
 echo 'split train/test file'
 python split.py $dir/ratings.dat --delimiter=${delimiter} --test_ratio=0.1
 echo 'shuffle train file'
-shuf $dir/ratings.dat.train > ratings.dat.train
+${SHUF_PROG} $dir/ratings.dat.train > ratings.dat.train
 cp $dir/ratings.dat.test .
 echo "./data/ratings.dat.train" > train.list
 echo "./data/ratings.dat.test" > test.list
diff --git a/demo/recommendation/run.sh b/demo/recommendation/run.sh
index e341d1cc7a..22aef55608 100755
--- a/demo/recommendation/run.sh
+++ b/demo/recommendation/run.sh
@@ -22,3 +22,4 @@ paddle train \
     --log_period=100 \
     --dot_period=1 \
     --num_passes=50  2>&1 | tee 'log.txt'
+paddle usage -l log.txt -e $? -n "recommendation" >/dev/null 2>&1
diff --git a/demo/semantic_role_labeling/.gitignore b/demo/semantic_role_labeling/.gitignore
index cd90ca7bbe..65c9b674c7 100644
--- a/demo/semantic_role_labeling/.gitignore
+++ b/demo/semantic_role_labeling/.gitignore
@@ -8,3 +8,7 @@ data/test.wsj.seq_pair
 data/test.wsj.words
 data/tgt.dict
 output
+data/emb
+data/targetDict.txt
+data/verbDict.txt
+data/wordDict.txt
diff --git a/demo/semantic_role_labeling/data/extract_dict_feature.py b/demo/semantic_role_labeling/data/extract_dict_feature.py
index a02a49a86e..da44111976 100644
--- a/demo/semantic_role_labeling/data/extract_dict_feature.py
+++ b/demo/semantic_role_labeling/data/extract_dict_feature.py
@@ -43,13 +43,13 @@ def extract_dict_features(pair_file, feature_file):
             mark[verb_index] = 1
             ctx_0 = sentence_list[verb_index]
 
-            if verb_index < len(labels_list) - 2:
+            if verb_index < len(labels_list) - 1:
                 mark[verb_index + 1] = 1
                 ctx_p1 = sentence_list[verb_index + 1]
             else:
                 ctx_p1 = 'eos'
 
-            if verb_index < len(labels_list) - 3:
+            if verb_index < len(labels_list) - 2:
                 mark[verb_index + 2] = 1
                 ctx_p2 = sentence_list[verb_index + 2]
             else:
diff --git a/demo/semantic_role_labeling/data/get_data.sh b/demo/semantic_role_labeling/data/get_data.sh
old mode 100644
new mode 100755
diff --git a/demo/semantic_role_labeling/dataprovider.py b/demo/semantic_role_labeling/dataprovider.py
index 042cd4e7a9..360c57ea62 100644
--- a/demo/semantic_role_labeling/dataprovider.py
+++ b/demo/semantic_role_labeling/dataprovider.py
@@ -43,7 +43,7 @@ def get_batch_size(yeild_data):
     init_hook=hook,
     should_shuffle=True,
     calc_batch_size=get_batch_size,
-    can_over_batch_size=False,
+    can_over_batch_size=True,
     cache=CacheType.CACHE_PASS_IN_MEM)
 def process(settings, file_name):
     with open(file_name, 'r') as fdata:
diff --git a/demo/semantic_role_labeling/test.sh b/demo/semantic_role_labeling/test.sh
index 11d9d6a19c..095bbff2ea 100755
--- a/demo/semantic_role_labeling/test.sh
+++ b/demo/semantic_role_labeling/test.sh
@@ -38,3 +38,4 @@ paddle train \
   --config_args=is_test=1 \
   --test_all_data_in_one_period=1 \
 2>&1 | tee 'test.log'
+paddle usage -l test.log -e $? -n "semantic_role_labeling_test" >/dev/null 2>&1
diff --git a/demo/semantic_role_labeling/train.sh b/demo/semantic_role_labeling/train.sh
index 9354e72f46..eee14010d7 100755
--- a/demo/semantic_role_labeling/train.sh
+++ b/demo/semantic_role_labeling/train.sh
@@ -27,3 +27,4 @@ paddle train \
   --load_missing_parameter_strategy=rand \
   --test_all_data_in_one_period=1 \
   2>&1 | tee 'train.log'
+paddle usage -l train.log -e $? -n "semantic_role_labeling_train" >/dev/null 2>&1
diff --git a/demo/sentiment/test.sh b/demo/sentiment/test.sh
index 8af827c338..85c4f3ccfc 100755
--- a/demo/sentiment/test.sh
+++ b/demo/sentiment/test.sh
@@ -37,3 +37,4 @@ paddle train --config=$net_conf \
              --trainer_count=4 \
              --config_args=is_test=1 \
              2>&1 | tee 'test.log'
+paddle usage -l test.log -e $? -n "sentiment_test" >/dev/null 2>&1
diff --git a/demo/sentiment/train.sh b/demo/sentiment/train.sh
index 5ce8bf4b99..14620f733b 100755
--- a/demo/sentiment/train.sh
+++ b/demo/sentiment/train.sh
@@ -27,3 +27,4 @@ paddle train --config=$config \
              --show_parameter_stats_period=100 \
              --test_all_data_in_one_period=1 \
              2>&1 | tee 'train.log'
+paddle usage -l train.log -e $? -n "sentiment_train" >/dev/null 2>&1
diff --git a/demo/seqToseq/paraphrase/train.sh b/demo/seqToseq/paraphrase/train.sh
index 33a42f6eff..9bb6dbdb1d 100755
--- a/demo/seqToseq/paraphrase/train.sh
+++ b/demo/seqToseq/paraphrase/train.sh
@@ -27,3 +27,4 @@ paddle train \
     --log_period=10 \
     --dot_period=5 \
     2>&1 | tee 'paraphrase/train.log'
+paddle usage -l 'paraphrase/train.log' -e $? -n "seqToseq_paraphrase_train" >/dev/null 2>&1
diff --git a/demo/seqToseq/translation/gen.sh b/demo/seqToseq/translation/gen.sh
index a700ae2134..64b78f5e96 100755
--- a/demo/seqToseq/translation/gen.sh
+++ b/demo/seqToseq/translation/gen.sh
@@ -24,3 +24,4 @@ paddle train \
     --test_pass=12 \
     --trainer_count=1 \
     2>&1 | tee 'translation/gen.log'
+paddle usage -l 'translation/gen.log' -e $? -n "seqToseq_translation_gen" >/dev/null 2>&1
diff --git a/demo/seqToseq/translation/train.sh b/demo/seqToseq/translation/train.sh
index bdece693e5..b0ec9854b1 100755
--- a/demo/seqToseq/translation/train.sh
+++ b/demo/seqToseq/translation/train.sh
@@ -25,3 +25,4 @@ paddle train \
 --log_period=10 \
 --dot_period=5 \
 2>&1 | tee 'translation/train.log'
+paddle usage -l 'translation/train.log' -e $? -n "seqToseq_translation_train" >/dev/null 2>&1
diff --git a/demo/sequence_tagging/linear_crf.py b/demo/sequence_tagging/linear_crf.py
index 736b580bb8..0624b17787 100644
--- a/demo/sequence_tagging/linear_crf.py
+++ b/demo/sequence_tagging/linear_crf.py
@@ -74,7 +74,8 @@ sum_evaluator(
 
 chunk_evaluator(
     name="chunk_f1",
-    input=[crf_decoding, chunk],
+    input=crf_decoding,
+    label=chunk,
     chunk_scheme="IOB",
     num_chunk_types=11, )
 
diff --git a/demo/sequence_tagging/rnn_crf.py b/demo/sequence_tagging/rnn_crf.py
index ad1e7b68e7..b9b41b2433 100644
--- a/demo/sequence_tagging/rnn_crf.py
+++ b/demo/sequence_tagging/rnn_crf.py
@@ -112,7 +112,8 @@ sum_evaluator(
 
 chunk_evaluator(
     name="chunk_f1",
-    input=[crf_decoding, chunk],
+    input=crf_decoding,
+    label=chunk,
     chunk_scheme="IOB",
     num_chunk_types=11, )
 
diff --git a/demo/sequence_tagging/train.sh b/demo/sequence_tagging/train.sh
index 9a706b98d8..37e196c842 100755
--- a/demo/sequence_tagging/train.sh
+++ b/demo/sequence_tagging/train.sh
@@ -7,4 +7,6 @@ paddle train \
        --dot_period=10 \
        --log_period=1000 \
        --test_period=0 \
-       --num_passes=10
+       --num_passes=10 \
+2>&1 | tee 'train.log'
+paddle usage -l 'train.log' -e $? -n "sequence_tagging_train" >/dev/null 2>&1
diff --git a/demo/sequence_tagging/train_linear.sh b/demo/sequence_tagging/train_linear.sh
index 597b5afea9..ad6e2d8ee7 100755
--- a/demo/sequence_tagging/train_linear.sh
+++ b/demo/sequence_tagging/train_linear.sh
@@ -7,3 +7,5 @@ paddle train \
        --log_period=10000 \
        --test_period=0 \
-       --num_passes=10
+       --num_passes=10 \
+2>&1 | tee 'train_linear.log'
+paddle usage -l 'train_linear.log' -e $? -n "sequence_tagging_train_linear" >/dev/null 2>&1
diff --git a/demo/traffic_prediction/README b/demo/traffic_prediction/README
new file mode 100644
index 0000000000..4c95188583
--- /dev/null
+++ b/demo/traffic_prediction/README
@@ -0,0 +1,7 @@
+run by:
+cd ./data
+sh get_data.sh
+cd ..
+sh train.sh
+sh predict.sh
+
diff --git a/demo/traffic_prediction/data/get_data.sh b/demo/traffic_prediction/data/get_data.sh
new file mode 100755
index 0000000000..f2fa548d47
--- /dev/null
+++ b/demo/traffic_prediction/data/get_data.sh
@@ -0,0 +1,34 @@
+#!/bin/bash
+# Copyright (c) 2016 PaddlePaddle Authors, Inc. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -e
+set -x
+
+DIR="$( cd "$(dirname "$0")" ; pwd -P )"
+cd "$DIR"
+
+#download the dataset
+echo "Downloading traffic data..."
+wget http://paddlepaddle.cdn.bcebos.com/demo/traffic/traffic_data.tar.gz
+
+#extract package
+echo "Unzipping..."
+tar -zxvf traffic_data.tar.gz
+
+echo "data/speeds.csv" > train.list
+echo "data/speeds.csv" > test.list
+echo "data/speeds.csv" > pred.list
+
+echo "Done."
diff --git a/demo/traffic_prediction/dataprovider.py b/demo/traffic_prediction/dataprovider.py
new file mode 100644
index 0000000000..c7883b6950
--- /dev/null
+++ b/demo/traffic_prediction/dataprovider.py
@@ -0,0 +1,82 @@
+# Copyright (c) 2016 PaddlePaddle Authors, Inc. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle.trainer.PyDataProvider2 import *
+import sys
+import numpy as np
+TERM_NUM = 24
+FORECASTING_NUM = 24
+LABEL_VALUE_NUM = 4
+
+
+def initHook(settings, file_list, **kwargs):
+    """
+    Init hook is invoked before processing data. It sets settings.pool_size and settings.input_types (the slot types).
+
+    :param settings: global object. It will be passed to the process routine.
+    :type settings: object
+    :param file_list: the meta file object, which is passed from trainer_config.py, but unused in this function.
+    :param kwargs: unused other arguments.
+    """
+    del kwargs  #unused 
+
+    settings.pool_size = sys.maxint
+    #Use a time series of the past as the feature.
+    #Dense_vector's expression form is [float,float,...,float]
+    settings.input_types = [dense_vector(TERM_NUM)]
+    #There are next FORECASTING_NUM fragments you need predict.
+    #Every predicted condition at time point has four states.
+    for i in range(FORECASTING_NUM):
+        settings.input_types.append(integer_value(LABEL_VALUE_NUM))
+
+
+@provider(
+    init_hook=initHook, cache=CacheType.CACHE_PASS_IN_MEM, should_shuffle=True)
+def process(settings, file_name):
+    with open(file_name) as f:
+        #abandon fields name
+        f.next()
+        for row_num, line in enumerate(f):
+            speeds = map(int, line.rstrip('\r\n').split(",")[1:])
+            # Get the max index.
+            end_time = len(speeds)
+            # Scanning and generating samples
+            for i in range(TERM_NUM, end_time - FORECASTING_NUM):
+                # For dense slot
+                pre_spd = map(float, speeds[i - TERM_NUM:i])
+
+                # Integer labels to predict; subtract 1 from every raw value so that labels start from 0.
+                fol_spd = [j - 1 for j in speeds[i:i + FORECASTING_NUM]]
+
+                # Predicting label is missing, abandon the sample.
+                if -1 in fol_spd:
+                    continue
+                yield [pre_spd] + fol_spd
+
+
+def predict_initHook(settings, file_list, **kwargs):
+    settings.pool_size = sys.maxint
+    settings.input_types = [dense_vector(TERM_NUM)]
+
+
+@provider(init_hook=predict_initHook, should_shuffle=False)
+def process_predict(settings, file_name):
+    with open(file_name) as f:
+        #abandon fields name
+        f.next()
+        for row_num, line in enumerate(f):
+            speeds = map(int, line.rstrip('\r\n').split(","))
+            end_time = len(speeds)
+            pre_spd = map(float, speeds[end_time - TERM_NUM:end_time])
+            yield pre_spd
diff --git a/demo/traffic_prediction/gen_result.py b/demo/traffic_prediction/gen_result.py
new file mode 100644
index 0000000000..3da70b3031
--- /dev/null
+++ b/demo/traffic_prediction/gen_result.py
@@ -0,0 +1,61 @@
+# Copyright (c) 2016 PaddlePaddle Authors, Inc. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+res = []
+with open('./rank-00000') as f:
+    for line in f:
+        pred = map(int, line.strip('\r\n;').split(";"))
+        #raw prediction range from 0 to 3
+        res.append([i + 1 for i in pred])
+
+file_name = open('./data/pred.list').read().strip('\r\n')
+
+FORECASTING_NUM = 24
+header = [
+    'id',
+    '201604200805',
+    '201604200810',
+    '201604200815',
+    '201604200820',
+    '201604200825',
+    '201604200830',
+    '201604200835',
+    '201604200840',
+    '201604200845',
+    '201604200850',
+    '201604200855',
+    '201604200900',
+    '201604200905',
+    '201604200910',
+    '201604200915',
+    '201604200920',
+    '201604200925',
+    '201604200930',
+    '201604200935',
+    '201604200940',
+    '201604200945',
+    '201604200950',
+    '201604200955',
+    '201604201000',
+]
+###################
+## To CSV format ##
+###################
+with open(file_name) as f:
+    f.next()
+    print ','.join(header)
+    for row_num, line in enumerate(f):
+        fields = line.rstrip('\r\n').split(',')
+        linkid = fields[0]
+        print linkid + ',' + ','.join(map(str, res[row_num]))
diff --git a/demo/traffic_prediction/predict.sh b/demo/traffic_prediction/predict.sh
new file mode 100755
index 0000000000..cec35dce11
--- /dev/null
+++ b/demo/traffic_prediction/predict.sh
@@ -0,0 +1,30 @@
+#!/bin/bash
+# Copyright (c) 2016 PaddlePaddle Authors, Inc. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+set -e
+
+cfg=trainer_config.py
+# pass choice 
+model="output/pass-00000"
+paddle train \
+    --config=$cfg \
+    --use_gpu=false \
+    --job=test \
+    --init_model_path=$model \
+    --config_args=is_predict=1 \
+    --predict_output_dir=. 
+
+python gen_result.py > result.txt
+
+rm -rf rank-00000
diff --git a/demo/traffic_prediction/train.sh b/demo/traffic_prediction/train.sh
new file mode 100755
index 0000000000..48dfc5604f
--- /dev/null
+++ b/demo/traffic_prediction/train.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+# Copyright (c) 2016 PaddlePaddle Authors, Inc. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+set -e
+
+cfg=trainer_config.py
+paddle train \
+  --config=$cfg \
+  --save_dir=./output \
+  --trainer_count=4 \
+  --log_period=1000 \
+  --dot_period=10 \
+  --num_passes=10 \
+  --use_gpu=false \
+  --show_parameter_stats_period=3000 \
+  2>&1 | tee 'train.log'
diff --git a/demo/traffic_prediction/trainer_config.py b/demo/traffic_prediction/trainer_config.py
new file mode 100755
index 0000000000..52d678624a
--- /dev/null
+++ b/demo/traffic_prediction/trainer_config.py
@@ -0,0 +1,52 @@
+# Copyright (c) 2016 PaddlePaddle Authors, Inc. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from paddle.trainer_config_helpers import *
+
+################################### DATA Configuration #############################################
+is_predict = get_config_arg('is_predict', bool, False)
+trn = './data/train.list' if not is_predict else None
+tst = './data/test.list' if not is_predict else './data/pred.list'
+process = 'process' if not is_predict else 'process_predict'
+define_py_data_sources2(
+    train_list=trn, test_list=tst, module="dataprovider", obj=process)
+################################### Parameter Configuration ########################################
+TERM_NUM = 24
+FORECASTING_NUM = 24
+emb_size = 16
+batch_size = 128 if not is_predict else 1
+settings(
+    batch_size=batch_size,
+    learning_rate=1e-3,
+    learning_method=RMSPropOptimizer())
+################################### Algorithm Configuration ########################################
+
+output_label = []
+
+link_encode = data_layer(name='link_encode', size=TERM_NUM)
+for i in xrange(FORECASTING_NUM):
+    # Each task share same weight.
+    link_param = ParamAttr(
+        name='_link_vec.w', initial_max=1.0, initial_min=-1.0)
+    link_vec = fc_layer(input=link_encode, size=emb_size, param_attr=link_param)
+    score = fc_layer(input=link_vec, size=4, act=SoftmaxActivation())
+    if is_predict:
+        maxid = maxid_layer(score)
+        output_label.append(maxid)
+    else:
+        # Multi-task training.
+        label = data_layer(name='label_%dmin' % ((i + 1) * 5), size=4)
+        cls = classification_cost(
+            input=score, name="cost_%dmin" % ((i + 1) * 5), label=label)
+        output_label.append(cls)
+outputs(output_label)
diff --git a/doc/CMakeLists.txt b/doc/CMakeLists.txt
index efcf8b0ad3..6fa42fd0c7 100644
--- a/doc/CMakeLists.txt
+++ b/doc/CMakeLists.txt
@@ -7,25 +7,50 @@ if(NOT DEFINED SPHINX_THEME_DIR)
 endif()
 
 # configured documentation tools and intermediate build results
-set(BINARY_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/_build")
+set(BINARY_BUILD_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_build")
 
 # Sphinx cache with pickled ReST documents
-set(SPHINX_CACHE_DIR "${CMAKE_CURRENT_BINARY_DIR}/_doctrees")
+set(SPHINX_CACHE_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_doctrees")
 
-# HTML output directory
-set(SPHINX_HTML_DIR "${CMAKE_CURRENT_BINARY_DIR}/html")
+# HTML output directory
+set(SPHINX_HTML_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/html")
 
 configure_file(
-    "${CMAKE_CURRENT_SOURCE_DIR}/conf.py.in"
-    "${BINARY_BUILD_DIR}/conf.py"
+    "${CMAKE_CURRENT_SOURCE_DIR}/templates/conf.py.en.in"
+    "${BINARY_BUILD_DIR_EN}/conf.py"
     @ONLY)
 
 sphinx_add_target(paddle_docs
                   html
-                  ${BINARY_BUILD_DIR}
-                  ${SPHINX_CACHE_DIR}
+                  ${BINARY_BUILD_DIR_EN}
+                  ${SPHINX_CACHE_DIR_EN}
                   ${CMAKE_CURRENT_SOURCE_DIR}
-                  ${SPHINX_HTML_DIR})
+                  ${SPHINX_HTML_DIR_EN})
 
 add_dependencies(paddle_docs
   gen_proto_py)
+
+
+# configured documentation tools and intermediate build results
+set(BINARY_BUILD_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/_build")
+
+# Sphinx cache with pickled ReST documents
+set(SPHINX_CACHE_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/_doctrees")
+
+# HTML output directory
+set(SPHINX_HTML_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/html")
+
+configure_file(
+    "${CMAKE_CURRENT_SOURCE_DIR}/templates/conf.py.cn.in"
+    "${BINARY_BUILD_DIR_CN}/conf.py"
+    @ONLY)
+
+sphinx_add_target(paddle_docs_cn
+                  html
+                  ${BINARY_BUILD_DIR_CN}
+                  ${SPHINX_CACHE_DIR_CN}
+                  ${CMAKE_CURRENT_SOURCE_DIR}
+                  ${SPHINX_HTML_DIR_CN})
+
+add_dependencies(paddle_docs_cn
+  gen_proto_py)
diff --git a/doc/about/index_cn.md b/doc/about/index_cn.md
new file mode 100644
index 0000000000..3bf030004d
--- /dev/null
+++ b/doc/about/index_cn.md
@@ -0,0 +1,11 @@
+关于PaddlePaddle
+================
+
+PaddlePaddle是一个最早由百度科学家和工程师共同研发的并行分布式深度学习平台，兼备易用性、高效性、灵活性和可扩展性，目前已被百度内部多个产品线广泛使用。
+PaddlePaddle目前已经开放源码, 但是远未完善，我们希望能在这个基础上不断的改进、扩展和延伸。
+同时我们希望广大开发者积极提供反馈和贡献源代码，建立一个活跃的开源社区。
+
+致谢
+--------
+
+在此，特别感谢PaddlePaddle的[所有贡献者](https://github.com/PaddlePaddle/Paddle/graphs/contributors)。
diff --git a/doc/about/index_en.rst b/doc/about/index_en.rst
index 8a372d2bc2..065c430cde 100644
--- a/doc/about/index_en.rst
+++ b/doc/about/index_en.rst
@@ -11,4 +11,4 @@ We hope to build an active open source community both by providing feedback and
 Credits
 --------
 
-We owe many thanks to `all contributors and developers <https://github.com/PaddlePaddle/Paddle/blob/develop/authors>`_ of PaddlePaddle!
+We owe many thanks to `all contributors and developers <https://github.com/PaddlePaddle/Paddle/graphs/contributors>`_ of PaddlePaddle!
diff --git a/doc_cn/ui/data_provider/dataprovider.rst b/doc/api/data_provider/dataprovider_cn.rst
similarity index 81%
rename from doc_cn/ui/data_provider/dataprovider.rst
rename to doc/api/data_provider/dataprovider_cn.rst
index e6796429a7..d08c6b3efa 100644
--- a/doc_cn/ui/data_provider/dataprovider.rst
+++ b/doc/api/data_provider/dataprovider_cn.rst
@@ -1,13 +1,15 @@
+.. _api_dataprovider:
+
 DataProvider的介绍
 ==================
 
-DataProvider是PaddlePaddle负责提供数据的模块。其作用是将数据传入内存或显存，让神经网络可以进行训练或预测。用户可以通过简单使用Python接口 `PyDataProvider2 <pydataprovider2.html>`_ ，来自定义传数据的过程。如果有更复杂的使用，或者需要更高的效率，用户也可以在C++端自定义一个 ``DataProvider`` 。
+DataProvider是PaddlePaddle负责提供数据的模块。其作用是将数据传入内存或显存，让神经网络可以进行训练或预测。用户可以通过简单使用Python接口 :ref:`api_pydataprovider2` ，来自定义传数据的过程。如果有更复杂的使用，或者需要更高的效率，用户也可以在C++端自定义一个 ``DataProvider`` 。
 
 PaddlePaddle需要用户在网络配置（trainer_config.py）中定义使用哪种DataProvider，并且在DataProvider中实现如何访问训练文件列表（train.list）或测试文件列表（test.list）。
 
-- train.list和test.list存放在本地（推荐直接存放到训练目录，以相对路径引用)。一般情况下，两者均为纯文本文件，其中每一行对应一个数据文件地址：
-  
-  - 如果数据文件存于本地磁盘，这个地址则为它的绝对路径或相对路径(相对于PaddlePaddle程序运行时的路径)。
-  - 地址也可以为hdfs文件路径，或者数据库连接路径等。
-  - 由于这个地址会被DataProvider使用，因此，如何解析该地址也是用户自定义DataProvider时需要考虑的地方。
+- train.list和test.list存放在本地（推荐直接存放到训练目录，以相对路径引用)。一般情况下，两者均为纯文本文件，其中每一行对应一个数据文件地址：
+  
+  - 如果数据文件存于本地磁盘，这个地址则为它的绝对路径或相对路径(相对于PaddlePaddle程序运行时的路径)。
+  - 地址也可以为hdfs文件路径，或者数据库连接路径等。
+  - 由于这个地址会被DataProvider使用，因此，如何解析该地址也是用户自定义DataProvider时需要考虑的地方。
 - 如果没有设置test.list，或设置为None，那么在训练过程中不会执行测试操作；否则，会根据命令行参数指定的测试方式，在训练过程中进行测试，从而防止过拟合。
diff --git a/doc/api/data_provider/index_en.rst b/doc/api/data_provider/dataprovider_en.rst
similarity index 100%
rename from doc/api/data_provider/index_en.rst
rename to doc/api/data_provider/dataprovider_en.rst
diff --git a/doc_cn/ui/data_provider/pydataprovider2.rst b/doc/api/data_provider/pydataprovider2_cn.rst
similarity index 95%
rename from doc_cn/ui/data_provider/pydataprovider2.rst
rename to doc/api/data_provider/pydataprovider2_cn.rst
index dce373118c..8f9db31cfb 100644
--- a/doc_cn/ui/data_provider/pydataprovider2.rst
+++ b/doc/api/data_provider/pydataprovider2_cn.rst
@@ -1,227 +1,229 @@
-PyDataProvider2的使用
-=====================
-
-PyDataProvider2是PaddlePaddle使用Python提供数据的推荐接口。该接口使用多线程读取数据，并提供了简单的Cache功能；同时可以使用户只关注如何从文件中读取每一条数据，而不用关心数据如何传输，如何存储等等。
-
-..  contents::
-
-MNIST的使用场景
----------------
-
-我们以MNIST手写识别为例，来说明PyDataProvider2的简单使用场景。
-
-样例数据
-++++++++
-
-MNIST是一个包含有70,000张灰度图片的数字分类数据集。样例数据 ``mnist_train.txt`` 如下：
-
-..  literalinclude:: mnist_train.txt
-
-其中每行数据代表一张图片，行内使用 ``;`` 分成两部分。第一部分是图片的标签，为0-9中的一个数字；第二部分是28*28的图片像素灰度值。 对应的 ``train.list`` 即为这个数据文件的名字：
-
-..  literalinclude:: train.list
-
-dataprovider的使用
-++++++++++++++++++
-
-..  literalinclude:: mnist_provider.dict.py
-
-- 首先，引入PaddlePaddle的PyDataProvider2包。
-- 其次，定义一个Python的 `Decorator <http://www.learnpython.org/en/Decorators>`_ `@provider`_ 。用于将下一行的数据输入函数标记成一个PyDataProvider2，同时设置它的input_types属性。
-  
-  - `input_types`_：设置这个PyDataProvider2返回什么样的数据。本例根据网络配置中 ``data_layer`` 的名字，显式指定返回的是一个28*28维的稠密浮点数向量和一个[0-9]的10维整数标签。
-
-    ..  literalinclude:: mnist_config.py
-         :lines: 9-10
-
-  - 注意：如果用户不显示指定返回数据的对应关系，那么PaddlePaddle会根据layer的声明顺序，来确定对应关系。但这个关系可能不正确，所以推荐使用显式指定的方式来设置input_types。
-- 最后，实现数据输入函数（如本例的 ``process`` 函数）。
-
-  - 该函数的功能是：打开文本文件，读取每一行，将行中的数据转换成与input_types一致的格式，然后返回给PaddlePaddle进程。注意，
-    
-    - 返回的顺序需要和input_types中定义的顺序一致。
-    - 返回时，必须使用Python关键词 ``yield`` ，相关概念是 ``generator`` 。
-    - 一次yield调用，返回一条完整的样本。如果想为一个数据文件返回多条样本，只需要在函数中调用多次yield即可（本例中使用for循环进行多次调用）。
-  
-  - 该函数具有两个参数：
-  
-    - settings：在本例中没有使用，具体可以参考 `init_hook`_ 中的说明。
-    - filename：为 ``train.list`` 或 ``test.list`` 中的一行，即若干数据文件路径的某一个。
-
-网络配置中的调用
-++++++++++++++++
-
-在网络配置里，只需要一行代码就可以调用这个PyDataProvider2，如，
-
-..  literalinclude:: mnist_config.py
-     :lines: 1-7
-
-训练数据是 ``train.list`` ，没有测试数据，调用的PyDataProvider2是 ``mnist_provider`` 模块中的 ``process`` 函数。
-
-小结
-+++++
-
-至此，简单的PyDataProvider2样例就说明完毕了。对用户来说，仅需要知道如何从 **一个文件** 中读取 **一条样本** ，就可以将数据传送给PaddlePaddle了。而PaddlePaddle则会帮用户做以下工作：
-
-* 将数据组合成Batch进行训练
-* 对训练数据进行Shuffle
-* 多线程的数据读取
-* 缓存训练数据到内存(可选)
-* CPU->GPU双缓存
-
-是不是很简单呢？
-
-时序模型的使用场景
-------------------
-样例数据
-++++++++
-
-时序模型是指数据的某一维度是一个序列形式，即包含时间步信息。所谓时间步信息，不一定和时间有关系，只是说明数据的顺序是重要的。例如，文本信息就是一个序列数据。
-
-本例采用英文情感分类的数据，即将一段英文文本数据，分类成正面情绪和负面情绪两类(用0和1表示)。样例数据 ``sentimental_train.txt`` 如下：
-
-..  literalinclude:: sentimental_train.txt
-
-dataprovider的使用
-++++++++++++++++++
-
-相对MNIST而言，这个dataprovider较复杂，主要原因是增加了初始化机制 `init_hook`_。本例的 ``on_init`` 函数就是根据该机制配置的，它会在dataprovider创建的时候执行。
-
-- 其中 ``input_types`` 和在 `@provider`_ 中配置的效果一致。本例中的输入特征是词ID的序列，因此使用 ``integer_value_sequence`` 类型来设置。
-- 将 ``dictionary`` 存入settings对象，在 ``process`` 函数中使用。 dictionary是从网络配置中传入的dict对象，即一个将单词字符串映射到单词ID的字典。
-
-..  literalinclude:: sentimental_provider.py
-
-网络配置中的调用
-++++++++++++++++
-
-调用这个PyDataProvider2的方法，基本上和MNIST样例一致，除了
-
-* 在配置中需要读取外部字典。
-* 在声明DataProvider的时候传入dictionary作为参数。
-
-..  literalinclude:: sentimental_config.py
-     :emphasize-lines: 12-14
-
-参考(Reference)
----------------
-
-@provider
-+++++++++
-
-``@provider`` 是一个Python的 `Decorator`_ ，可以将某一个函数标记成一个PyDataProvider2。如果不了解 `Decorator`_ 是什么也没关系，只需知道这是一个标记属性的方法就可以了。它包含的属性参数如下:
-
-*  input_types：数据输入格式。具体的格式说明，请参考 `input_types`_ 。
-*  should_shuffle：是不是要对数据做Shuffle。训练时默认shuffle，测试时默认不shuffle。
-*  min_pool_size：设置内存中最小暂存的数据条数，也是PaddlePaddle所能够保证的shuffle粒度。如果为-1，则会预先读取全部数据到内存中。
-*  pool_size： 设置内存中暂存的数据条数。如果为-1（默认），则不在乎内存暂存多少条数据。如果设置，则推荐大于训练时batch size的值，并且在内存足够的情况下越大越好。
-*  can_over_batch_size：是否允许暂存略微多余pool_size的数据。由于这样做可以避免很多死锁问题，一般推荐设置成True。
-*  calc_batch_size：可以传入一个函数，用于自定义每条数据的batch size（默认为1）。
-*  cache： 数据缓存的策略，具体请参考 `cache`_ 。
-*  init_hook：初始化时调用的函数，具体请参考 `init_hook`_ 。
-*  check：如果为true，会根据input_types检查数据的合法性。
-*  check_fail_continue：如果为true，那么当check出数据不合法时，会扔到这条数据，继续训练或预测。（对check=false的情况，没有作用）
-
-input_types
-+++++++++++
-
-PaddlePaddle的数据包括四种主要类型，和三种序列模式。
-
-四种数据类型：
-
-* dense_vector：稠密的浮点数向量。
-* sparse_binary_vector：稀疏的01向量，即大部分值为0，但有值的地方必须为1。
-* sparse_float_vector：稀疏的向量，即大部分值为0，但有值的部分可以是任何浮点数。
-* integer：整数标签。
-
-三种序列模式：
-
-* SequenceType.NO_SEQUENCE：不是一条序列
-* SequenceType.SEQUENCE：是一条时间序列
-* SequenceType.SUB_SEQUENCE： 是一条时间序列，且序列的每一个元素还是一个时间序列。
-
-不同的数据类型和序列模式返回的格式不同，列表如下：
-
-+----------------------+---------------------+-----------------------------------+------------------------------------------------+
-|                      | NO_SEQUENCE         | SEQUENCE                          |  SUB_SEQUENCE                                  |
-+======================+=====================+===================================+================================================+
-| dense_vector         | [f, f, ...]         | [[f, ...], [f, ...], ...]         | [[[f, ...], ...], [[f, ...], ...],...]         |
-+----------------------+---------------------+-----------------------------------+------------------------------------------------+
-| sparse_binary_vector | [i, i, ...]         | [[i, ...], [i, ...], ...]         | [[[i, ...], ...], [[i, ...], ...],...]         |
-+----------------------+---------------------+-----------------------------------+------------------------------------------------+
-| sparse_float_vector  | [(i,f), (i,f), ...] | [[(i,f), ...], [(i,f), ...], ...] | [[[(i,f), ...], ...], [[(i,f), ...], ...],...] |
-+----------------------+---------------------+-----------------------------------+------------------------------------------------+
-| integer_value        |  i                  | [i, i, ...]                       | [[i, ...], [i, ...], ...]                      |
-+----------------------+---------------------+-----------------------------------+------------------------------------------------+
-
-其中，f代表一个浮点数，i代表一个整数。
-
-注意：对sparse_binary_vector和sparse_float_vector，PaddlePaddle存的是有值位置的索引。例如，
-
-- 对一个5维非序列的稀疏01向量 ``[0, 1, 1, 0, 0]`` ，类型是sparse_binary_vector，返回的是 ``[1, 2]`` 。
-- 对一个5维非序列的稀疏浮点向量 ``[0, 0.5, 0.7, 0, 0]`` ，类型是sparse_float_vector，返回的是 ``[(1, 0.5), (2, 0.7)]`` 。
-
-init_hook
-+++++++++
-
-init_hook可以传入一个函数。该函数在初始化的时候会被调用，其参数如下:
-
-* 第一个参数是settings对象，它和数据传入函数的第一个参数（如本例中 ``process`` 函数的 ``settings`` 参数）必须一致。该对象具有以下两个属性：
-    * settings.input_types：数据输入格式，具体请参考 `input_types`_ 。
-    * settings.logger：一个logging对象。
-* 其他参数使用 ``kwargs`` （key word arguments）传入，包括以下两种：
-    * PaddlePaddle定义的参数: 1）is_train：bool型参数，表示用于训练或预测；2）file_list：所有文件列表。
-    * 用户定义的参数：使用args在网络配置中设置。
-
-注意：PaddlePaddle保留添加参数的权力，因此init_hook尽量使用 ``**kwargs`` 来接受不使用的函数以保证兼容性。
-
-cache
-+++++
-
-PyDataProvider2提供了两种简单的Cache策略：
-
-* CacheType.NO_CACHE：不缓存任何数据，每次都会从python端读取数据
-* CacheType.CACHE_PASS_IN_MEM：第一个pass会从python端读取数据，剩下的pass会直接从内存里
-  读取数据。 
-
-
-注意事项
---------
-
-可能的内存泄露问题
-++++++++++++++++++
-
-PaddlePaddle将train.list中的每一行都传递给process函数，从而生成多个generator。当训练数据非常多时，就会生成非常多的generator。
-
-虽然每个generator在没有调用的时候，是几乎不占内存的；但当调用过一次后，generator便会存下当前的上下文(Context)，而这个Context可能会非常大。并且，generator至少需要调用两次才会知道是否停止。所以，即使process函数里面只有一个yield，也需要两次随机选择到相同generator的时候，才会释放该段内存。
-
-..  code-block:: python
-
-    def func():
-        yield 0
-
-    f = func()  # 创建generator
-    tmp = next(f)  # 调用一次，返回0
-    tmp = next(f)  # 调用第二次的时候，才会Stop Iteration
-
-由于顺序调用这些generator不会出现上述问题，因此有两种解决方案：
-
-1. **最佳推荐**：将样本的地址放入另一个文本文件，train.list写入那个文本文件的地址。即不要将每一个样本都放入train.list。
-2. 在generator的上下文中尽量留下非常少的变量引用，例如
-
-..  code-block:: python
-
-    def real_process(fn):
-        # ... read from fn
-        return result   # 当函数返回的时候，python可以解除掉内部变量的引用。
-
-    def process(fn):
-        yield real_process(fn)
-
-注意：这个问题是PyDataProvider读数据时候的逻辑问题，很难整体修正。
-
-内存不够用的情况
-++++++++++++++++
-
-PyDataProvider2会尽可能多的使用内存。因此，对于内存较小的机器，推荐使用 ``pool_size`` 变量来设置内存中暂存的数据条。具体请参考 `@provider`_ 中的说明。
-
+..  _api_pydataprovider2:
+
+PyDataProvider2的使用
+=====================
+
+PyDataProvider2是PaddlePaddle使用Python提供数据的推荐接口。该接口使用多线程读取数据，并提供了简单的Cache功能；同时可以使用户只关注如何从文件中读取每一条数据，而不用关心数据如何传输，如何存储等等。
+
+..  contents::
+
+MNIST的使用场景
+---------------
+
+我们以MNIST手写识别为例，来说明PyDataProvider2的简单使用场景。
+
+样例数据
+++++++++
+
+MNIST是一个包含有70,000张灰度图片的数字分类数据集。样例数据 ``mnist_train.txt`` 如下：
+
+..  literalinclude:: src/mnist_train.txt
+
+其中每行数据代表一张图片，行内使用 ``;`` 分成两部分。第一部分是图片的标签，为0-9中的一个数字；第二部分是28*28的图片像素灰度值。 对应的 ``train.list`` 即为这个数据文件的名字：
+
+..  literalinclude:: src/train.list
+
+dataprovider的使用
+++++++++++++++++++
+
+..  literalinclude:: src/mnist_provider.dict.py
+
+- 首先，引入PaddlePaddle的PyDataProvider2包。
+- 其次，定义一个Python的 `Decorator <http://www.learnpython.org/en/Decorators>`_ `@provider`_ 。用于将下一行的数据输入函数标记成一个PyDataProvider2，同时设置它的input_types属性。
+  
+  - `input_types`_：设置这个PyDataProvider2返回什么样的数据。本例根据网络配置中 ``data_layer`` 的名字，显式指定返回的是一个28*28维的稠密浮点数向量和一个[0-9]的10维整数标签。
+
+    ..  literalinclude:: src/mnist_config.py
+         :lines: 9-10
+
+  - 注意：如果用户不显式指定返回数据的对应关系，那么PaddlePaddle会根据layer的声明顺序，来确定对应关系。但这个关系可能不正确，所以推荐使用显式指定的方式来设置input_types。
+- 最后，实现数据输入函数（如本例的 ``process`` 函数）。
+
+  - 该函数的功能是：打开文本文件，读取每一行，将行中的数据转换成与input_types一致的格式，然后返回给PaddlePaddle进程。注意，
+    
+    - 返回的顺序需要和input_types中定义的顺序一致。
+    - 返回时，必须使用Python关键词 ``yield`` ，相关概念是 ``generator`` 。
+    - 一次yield调用，返回一条完整的样本。如果想为一个数据文件返回多条样本，只需要在函数中调用多次yield即可（本例中使用for循环进行多次调用）。
+  
+  - 该函数具有两个参数：
+  
+    - settings：在本例中没有使用，具体可以参考 `init_hook`_ 中的说明。
+    - filename：为 ``train.list`` 或 ``test.list`` 中的一行，即若干数据文件路径的某一个。
+
+网络配置中的调用
+++++++++++++++++
+
+在网络配置里，只需要一行代码就可以调用这个PyDataProvider2，如，
+
+..  literalinclude:: src/mnist_config.py
+     :lines: 1-7
+
+训练数据是 ``train.list`` ，没有测试数据，调用的PyDataProvider2是 ``mnist_provider`` 模块中的 ``process`` 函数。
+
+小结
++++++
+
+至此，简单的PyDataProvider2样例就说明完毕了。对用户来说，仅需要知道如何从 **一个文件** 中读取 **一条样本** ，就可以将数据传送给PaddlePaddle了。而PaddlePaddle则会帮用户做以下工作：
+
+* 将数据组合成Batch进行训练
+* 对训练数据进行Shuffle
+* 多线程的数据读取
+* 缓存训练数据到内存(可选)
+* CPU->GPU双缓存
+
+是不是很简单呢？
+
+时序模型的使用场景
+------------------
+样例数据
+++++++++
+
+时序模型是指数据的某一维度是一个序列形式，即包含时间步信息。所谓时间步信息，不一定和时间有关系，只是说明数据的顺序是重要的。例如，文本信息就是一个序列数据。
+
+本例采用英文情感分类的数据，即将一段英文文本数据，分类成正面情绪和负面情绪两类(用0和1表示)。样例数据 ``sentimental_train.txt`` 如下：
+
+..  literalinclude:: src/sentimental_train.txt
+
+dataprovider的使用
+++++++++++++++++++
+
+相对MNIST而言，这个dataprovider较复杂，主要原因是增加了初始化机制 `init_hook`_。本例的 ``on_init`` 函数就是根据该机制配置的，它会在dataprovider创建的时候执行。
+
+- 其中 ``input_types`` 和在 `@provider`_ 中配置的效果一致。本例中的输入特征是词ID的序列，因此使用 ``integer_value_sequence`` 类型来设置。
+- 将 ``dictionary`` 存入settings对象，在 ``process`` 函数中使用。 dictionary是从网络配置中传入的dict对象，即一个将单词字符串映射到单词ID的字典。
+
+..  literalinclude:: src/sentimental_provider.py
+
+网络配置中的调用
+++++++++++++++++
+
+调用这个PyDataProvider2的方法，基本上和MNIST样例一致，除了
+
+* 在配置中需要读取外部字典。
+* 在声明DataProvider的时候传入dictionary作为参数。
+
+..  literalinclude:: src/sentimental_config.py
+     :emphasize-lines: 12-14
+
+参考(Reference)
+---------------
+
+@provider
++++++++++
+
+``@provider`` 是一个Python的 `Decorator`_ ，可以将某一个函数标记成一个PyDataProvider2。如果不了解 `Decorator`_ 是什么也没关系，只需知道这是一个标记属性的方法就可以了。它包含的属性参数如下:
+
+*  input_types：数据输入格式。具体的格式说明，请参考 `input_types`_ 。
+*  should_shuffle：是不是要对数据做Shuffle。训练时默认shuffle，测试时默认不shuffle。
+*  min_pool_size：设置内存中最小暂存的数据条数，也是PaddlePaddle所能够保证的shuffle粒度。如果为-1，则会预先读取全部数据到内存中。
+*  pool_size： 设置内存中暂存的数据条数。如果为-1（默认），则不在乎内存暂存多少条数据。如果设置，则推荐大于训练时batch size的值，并且在内存足够的情况下越大越好。
+*  can_over_batch_size：是否允许暂存略微多于pool_size的数据。由于这样做可以避免很多死锁问题，一般推荐设置成True。
+*  calc_batch_size：可以传入一个函数，用于自定义每条数据的batch size（默认为1）。
+*  cache： 数据缓存的策略，具体请参考 `cache`_ 。
+*  init_hook：初始化时调用的函数，具体请参考 `init_hook`_ 。
+*  check：如果为true，会根据input_types检查数据的合法性。
+*  check_fail_continue：如果为true，那么当check出数据不合法时，会扔掉这条数据，继续训练或预测。（对check=false的情况，没有作用）
+
+input_types
++++++++++++
+
+PaddlePaddle的数据包括四种主要类型，和三种序列模式。
+
+四种数据类型：
+
+* dense_vector：稠密的浮点数向量。
+* sparse_binary_vector：稀疏的01向量，即大部分值为0，但有值的地方必须为1。
+* sparse_float_vector：稀疏的向量，即大部分值为0，但有值的部分可以是任何浮点数。
+* integer_value：整数标签。
+
+三种序列模式：
+
+* SequenceType.NO_SEQUENCE：不是一条序列
+* SequenceType.SEQUENCE：是一条时间序列
+* SequenceType.SUB_SEQUENCE： 是一条时间序列，且序列的每一个元素还是一个时间序列。
+
+不同的数据类型和序列模式返回的格式不同，列表如下：
+
++----------------------+---------------------+-----------------------------------+------------------------------------------------+
+|                      | NO_SEQUENCE         | SEQUENCE                          |  SUB_SEQUENCE                                  |
++======================+=====================+===================================+================================================+
+| dense_vector         | [f, f, ...]         | [[f, ...], [f, ...], ...]         | [[[f, ...], ...], [[f, ...], ...],...]         |
++----------------------+---------------------+-----------------------------------+------------------------------------------------+
+| sparse_binary_vector | [i, i, ...]         | [[i, ...], [i, ...], ...]         | [[[i, ...], ...], [[i, ...], ...],...]         |
++----------------------+---------------------+-----------------------------------+------------------------------------------------+
+| sparse_float_vector  | [(i,f), (i,f), ...] | [[(i,f), ...], [(i,f), ...], ...] | [[[(i,f), ...], ...], [[(i,f), ...], ...],...] |
++----------------------+---------------------+-----------------------------------+------------------------------------------------+
+| integer_value        |  i                  | [i, i, ...]                       | [[i, ...], [i, ...], ...]                      |
++----------------------+---------------------+-----------------------------------+------------------------------------------------+
+
+其中，f代表一个浮点数，i代表一个整数。
+
+注意：对sparse_binary_vector和sparse_float_vector，PaddlePaddle存的是有值位置的索引。例如，
+
+- 对一个5维非序列的稀疏01向量 ``[0, 1, 1, 0, 0]`` ，类型是sparse_binary_vector，返回的是 ``[1, 2]`` 。
+- 对一个5维非序列的稀疏浮点向量 ``[0, 0.5, 0.7, 0, 0]`` ，类型是sparse_float_vector，返回的是 ``[(1, 0.5), (2, 0.7)]`` 。
+
+init_hook
++++++++++
+
+init_hook可以传入一个函数。该函数在初始化的时候会被调用，其参数如下:
+
+* 第一个参数是settings对象，它和数据传入函数的第一个参数（如本例中 ``process`` 函数的 ``settings`` 参数）必须一致。该对象具有以下两个属性：
+    * settings.input_types：数据输入格式，具体请参考 `input_types`_ 。
+    * settings.logger：一个logging对象。
+* 其他参数使用 ``kwargs`` （key word arguments）传入，包括以下两种：
+    * PaddlePaddle定义的参数: 1）is_train：bool型参数，表示用于训练或预测；2）file_list：所有文件列表。
+    * 用户定义的参数：使用args在网络配置中设置。
+
+注意：PaddlePaddle保留添加参数的权利，因此init_hook尽量使用 ``**kwargs`` 来接收不使用的参数以保证兼容性。
+
+cache
++++++
+
+PyDataProvider2提供了两种简单的Cache策略：
+
+* CacheType.NO_CACHE：不缓存任何数据，每次都会从python端读取数据
+* CacheType.CACHE_PASS_IN_MEM：第一个pass会从python端读取数据，剩下的pass会直接从内存里
+  读取数据。 
+
+
+注意事项
+--------
+
+可能的内存泄露问题
+++++++++++++++++++
+
+PaddlePaddle将train.list中的每一行都传递给process函数，从而生成多个generator。当训练数据非常多时，就会生成非常多的generator。
+
+虽然每个generator在没有调用的时候，是几乎不占内存的；但当调用过一次后，generator便会存下当前的上下文(Context)，而这个Context可能会非常大。并且，generator至少需要调用两次才会知道是否停止。所以，即使process函数里面只有一个yield，也需要两次随机选择到相同generator的时候，才会释放该段内存。
+
+..  code-block:: python
+
+    def func():
+        yield 0
+
+    f = func()  # 创建generator
+    tmp = next(f)  # 调用一次，返回0
+    tmp = next(f)  # 调用第二次的时候，才会抛出StopIteration
+
+由于顺序调用这些generator不会出现上述问题，因此有两种解决方案：
+
+1. **最佳推荐**：将样本的地址放入另一个文本文件，train.list写入那个文本文件的地址。即不要将每一个样本都放入train.list。
+2. 在generator的上下文中尽量留下非常少的变量引用，例如
+
+..  code-block:: python
+
+    def real_process(fn):
+        # ... read from fn
+        return result   # 当函数返回的时候，python可以解除掉内部变量的引用。
+
+    def process(fn):
+        yield real_process(fn)
+
+注意：这个问题是PyDataProvider读数据时候的逻辑问题，很难整体修正。
+
+内存不够用的情况
+++++++++++++++++
+
+PyDataProvider2会尽可能多的使用内存。因此，对于内存较小的机器，推荐使用 ``pool_size`` 变量来设置内存中暂存的数据条数。具体请参考 `@provider`_ 中的说明。
+
diff --git a/doc/api/data_provider/pydataprovider2_en.rst b/doc/api/data_provider/pydataprovider2_en.rst
index 50e8b0d329..30357be325 100644
--- a/doc/api/data_provider/pydataprovider2_en.rst
+++ b/doc/api/data_provider/pydataprovider2_en.rst
@@ -1,4 +1,4 @@
-..  _api_pydataprovider2_en:
+..  _api_pydataprovider2:
 
 PyDataProvider2
 ===============
@@ -24,18 +24,18 @@ of 28 x 28 pixels.
 
 A small part of the original data as an example is shown as below:
 
-.. literalinclude:: ../../../doc_cn/ui/data_provider/mnist_train.txt
+.. literalinclude:: src/mnist_train.txt
 
 Each line of the data contains two parts, separated by :code:`;`. The first part is
 label of an image. The second part contains 28x28 pixel float values.
 
 Just write path of the above data into train.list. It looks like this:
 
-.. literalinclude:: ../../../doc_cn/ui/data_provider/train.list
+.. literalinclude:: src/train.list
 
 The corresponding dataprovider is shown as below:
 
-.. literalinclude:: ../../../doc_cn/ui/data_provider/mnist_provider.py
+.. literalinclude:: src/mnist_provider.dict.py
 
 The first line imports PyDataProvider2 package.
 The main function is the process function, that has two parameters.
@@ -74,7 +74,7 @@ sample by using keywords :code:`yield`.
 Only a few lines of codes need to be added into the training configuration file,
 you can take this as an example.
 
-.. literalinclude:: ../../../doc_cn/ui/data_provider/mnist_config.py
+.. literalinclude:: src/mnist_config.py
 
 Here we specify training data by :code:`train.list`, and no testing data is specified.
 The method which actually provide data is :code:`process`.
@@ -83,7 +83,7 @@ User also can use another style to provide data, which defines the
 :code:`data_layer`'s name explicitly when `yield`. For example,
 the :code:`dataprovider` is shown as below.
 
-.. literalinclude:: ../../../doc_cn/ui/data_provider/mnist_provider.dict.py
+.. literalinclude:: src/mnist_provider.dict.py
    :linenos:
 
 If user did't give the :code:`data_layer`'s name, PaddlePaddle will use
@@ -104,7 +104,7 @@ And PaddlePadle will do all of the rest things\:
 
 Is this cool?
 
-..  _api_pydataprovider2_en_sequential_model:
+..  _api_pydataprovider2_sequential_model:
 
 DataProvider for the sequential model
 -------------------------------------
@@ -121,11 +121,11 @@ negative sentiment (marked by 0 and 1 respectively).
 
 A small part of the original data as an example can be found in the path below:
 
-.. literalinclude:: ../../../doc_cn/ui/data_provider/sentimental_train.txt
+.. literalinclude:: src/sentimental_train.txt
 
 The corresponding data provider can be found in the path below:
 
-.. literalinclude:: ../../../doc_cn/ui/data_provider/sentimental_provider.py
+.. literalinclude:: src/sentimental_provider.py
 
 This data provider for sequential model is a little more complex than that
 for MINST dataset.
@@ -143,7 +143,7 @@ initialized. The :code:`on_init` function has the following parameters:
 To pass these parameters into DataProvider, the following lines should be added
 into trainer configuration file.
 
-.. literalinclude:: ../../../doc_cn/ui/data_provider/sentimental_config.py
+.. literalinclude:: src/sentimental_config.py
 
 The definition is basically same as MNIST example, except:
 * Load dictionary in this configuration
diff --git a/doc_cn/ui/data_provider/mnist_config.py b/doc/api/data_provider/src/mnist_config.py
similarity index 100%
rename from doc_cn/ui/data_provider/mnist_config.py
rename to doc/api/data_provider/src/mnist_config.py
diff --git a/doc_cn/ui/data_provider/mnist_provider.dict.py b/doc/api/data_provider/src/mnist_provider.dict.py
similarity index 100%
rename from doc_cn/ui/data_provider/mnist_provider.dict.py
rename to doc/api/data_provider/src/mnist_provider.dict.py
diff --git a/doc_cn/ui/data_provider/mnist_train.txt b/doc/api/data_provider/src/mnist_train.txt
similarity index 100%
rename from doc_cn/ui/data_provider/mnist_train.txt
rename to doc/api/data_provider/src/mnist_train.txt
diff --git a/doc_cn/ui/data_provider/sentimental_config.py b/doc/api/data_provider/src/sentimental_config.py
similarity index 100%
rename from doc_cn/ui/data_provider/sentimental_config.py
rename to doc/api/data_provider/src/sentimental_config.py
diff --git a/doc_cn/ui/data_provider/sentimental_provider.py b/doc/api/data_provider/src/sentimental_provider.py
similarity index 100%
rename from doc_cn/ui/data_provider/sentimental_provider.py
rename to doc/api/data_provider/src/sentimental_provider.py
diff --git a/doc_cn/ui/data_provider/sentimental_train.txt b/doc/api/data_provider/src/sentimental_train.txt
similarity index 100%
rename from doc_cn/ui/data_provider/sentimental_train.txt
rename to doc/api/data_provider/src/sentimental_train.txt
diff --git a/doc_cn/ui/data_provider/train.list b/doc/api/data_provider/src/train.list
similarity index 100%
rename from doc_cn/ui/data_provider/train.list
rename to doc/api/data_provider/src/train.list
diff --git a/doc/api/index_cn.rst b/doc/api/index_cn.rst
new file mode 100644
index 0000000000..3718cd73a2
--- /dev/null
+++ b/doc/api/index_cn.rst
@@ -0,0 +1,37 @@
+API中文手册
+============
+
+DataProvider API
+----------------
+
+..  toctree::
+    :maxdepth: 1
+
+    data_provider/dataprovider_cn.rst
+    data_provider/pydataprovider2_cn.rst
+
+..  _api_trainer_config:
+
+Model Config API
+----------------
+
+..  toctree::
+    :maxdepth: 1
+
+    trainer_config_helpers/optimizers.rst
+    trainer_config_helpers/data_sources.rst
+    trainer_config_helpers/layers.rst
+    trainer_config_helpers/activations.rst 
+    trainer_config_helpers/poolings.rst
+    trainer_config_helpers/networks.rst
+    trainer_config_helpers/evaluators.rst
+    trainer_config_helpers/attrs.rst
+
+
+Applications API
+----------------
+
+..  toctree::
+    :maxdepth: 1
+
+    predict/swig_py_paddle_cn.rst
diff --git a/doc/api/index_en.rst b/doc/api/index_en.rst
index 6fdee9f928..10c297a71d 100644
--- a/doc/api/index_en.rst
+++ b/doc/api/index_en.rst
@@ -7,7 +7,7 @@ DataProvider API
 ..  toctree::
     :maxdepth: 1
 
-    data_provider/index_en.rst
+    data_provider/dataprovider_en.rst
     data_provider/pydataprovider2_en.rst
 
 ..  _api_trainer_config:
diff --git a/doc/api/predict/predict_sample.py b/doc/api/predict/src/predict_sample.py
similarity index 100%
rename from doc/api/predict/predict_sample.py
rename to doc/api/predict/src/predict_sample.py
diff --git a/doc_cn/ui/predict/swig_py_paddle.rst b/doc/api/predict/swig_py_paddle_cn.rst
similarity index 97%
rename from doc_cn/ui/predict/swig_py_paddle.rst
rename to doc/api/predict/swig_py_paddle_cn.rst
index 05f25345c5..42f333dba2 100644
--- a/doc_cn/ui/predict/swig_py_paddle.rst
+++ b/doc/api/predict/swig_py_paddle_cn.rst
@@ -1,3 +1,5 @@
+.. _api_swig_py_paddle:
+
 基于Python的预测
 ================
 
@@ -34,7 +36,7 @@ PaddlePaddle使用swig对常用的预测接口进行了封装，通过编译会
 
 如下是一段使用mnist model来实现手写识别的预测代码。完整的代码见 ``src_root/doc/ui/predict/predict_sample.py`` 。mnist model可以通过 ``src_root\demo\mnist`` 目录下的demo训练出来。
 
-..  literalinclude:: ../../../doc/ui/predict/predict_sample.py
+..  literalinclude:: src/predict_sample.py
     :language: python
     :lines: 15-18,121-136
 
diff --git a/doc/api/predict/swig_py_paddle_en.rst b/doc/api/predict/swig_py_paddle_en.rst
index 8b145e5b30..1c628e6971 100644
--- a/doc/api/predict/swig_py_paddle_en.rst
+++ b/doc/api/predict/swig_py_paddle_en.rst
@@ -13,7 +13,7 @@ Here is a sample python script that shows the typical prediction process for the
 MNIST classification problem. A complete sample code could be found at
 :code:`src_root/doc/ui/predict/predict_sample.py`.
 
-..  literalinclude:: ./predict_sample.py
+..  literalinclude:: src/predict_sample.py
     :language: python
     :lines: 15-18,90-100,101-104
 
@@ -23,7 +23,7 @@ python's :code:`help()` function. Let's walk through the above python script:
 
 * At the beginning, use :code:`swig_paddle.initPaddle()` to initialize
   PaddlePaddle with command line arguments, for more about command line arguments
-  see :ref:`cmd_detail_introduction_en` .
+  see :ref:`cmd_detail_introduction` .
 * Parse the configuration file that is used in training with :code:`parse_config()`.
   Because data to predict with always have no label, and output of prediction work
   normally is the output layer rather than the cost layer, so you should modify
@@ -36,7 +36,7 @@ python's :code:`help()` function. Let's walk through the above python script:
     - Note: As swig_paddle can only accept C++ matrices, we offer a utility
       class DataProviderConverter that can accept the same input data with
       PyDataProvider2, for more information please refer to document
-      of :ref:`api_pydataprovider2_en` .
+      of :ref:`api_pydataprovider2` .
 * Do the prediction with :code:`forwardTest()`, which takes the converted
   input data and outputs the activations of the output layer.
 
diff --git a/doc/api/trainer_config_helpers/evaluators.rst b/doc/api/trainer_config_helpers/evaluators.rst
index d6a79c13e2..11dc735164 100644
--- a/doc/api/trainer_config_helpers/evaluators.rst
+++ b/doc/api/trainer_config_helpers/evaluators.rst
@@ -1,3 +1,5 @@
+..  _api_trainer_config_helpers_evaluators:
+
 ==========
 Evaluators
 ==========
diff --git a/doc/api/trainer_config_helpers/layers.rst b/doc/api/trainer_config_helpers/layers.rst
index 52a6cfb120..4e429650e5 100644
--- a/doc/api/trainer_config_helpers/layers.rst
+++ b/doc/api/trainer_config_helpers/layers.rst
@@ -187,6 +187,8 @@ get_output_layer
 Mixed Layer
 ===========
 
+..  _api_trainer_config_helpers_layers_mixed_layer:
+
 mixed_layer
 -----------
 ..  automodule:: paddle.trainer_config_helpers.layers
@@ -255,12 +257,16 @@ pooling_layer
     :members: pooling_layer
     :noindex:
 
+..  _api_trainer_config_helpers_layers_last_seq:
+
 last_seq
 --------
 ..  automodule:: paddle.trainer_config_helpers.layers
     :members: last_seq
     :noindex:
 
+..  _api_trainer_config_helpers_layers_first_seq:
+
 first_seq
 ---------
 ..  automodule:: paddle.trainer_config_helpers.layers
@@ -282,6 +288,8 @@ block_expand_layer
     :members: block_expand_layer
     :noindex:
 
+..  _api_trainer_config_helpers_layers_expand_layer:
+
 expand_layer
 ------------
 ..  automodule:: paddle.trainer_config_helpers.layers
@@ -374,6 +382,8 @@ sampling_id_layer
     :members: sampling_id_layer
     :noindex:
 
+..  _api_trainer_config_helpers_layers_cost_layers:
+
 Cost Layers
 ===========
 
diff --git a/doc/api/trainer_config_helpers/networks.rst b/doc/api/trainer_config_helpers/networks.rst
index e13c368051..edb53acbf0 100644
--- a/doc/api/trainer_config_helpers/networks.rst
+++ b/doc/api/trainer_config_helpers/networks.rst
@@ -36,6 +36,8 @@ img_conv_group
     :members: img_conv_group
     :noindex:
 
+..  _api_trainer_config_helpers_network_simple_img_conv_pool:
+
 simple_img_conv_pool
 --------------------
 ..  automodule:: paddle.trainer_config_helpers.networks
diff --git a/doc/api/trainer_config_helpers/optimizers.rst b/doc/api/trainer_config_helpers/optimizers.rst
index 7ca4e34156..d2f4958c92 100644
--- a/doc/api/trainer_config_helpers/optimizers.rst
+++ b/doc/api/trainer_config_helpers/optimizers.rst
@@ -1,3 +1,5 @@
+..  _api_trainer_config_helpers_optimizers:
+
 ==========
 Optimizers
 ==========
@@ -50,6 +52,8 @@ RMSPropOptimizer
     :members: RMSPropOptimizer
     :noindex:
 
+..  _api_trainer_config_helpers_optimizers_settings:
+
 settings
 ========
 ..  automodule:: paddle.trainer_config_helpers.optimizers
diff --git a/doc_cn/faq/index.rst b/doc/faq/index_cn.rst
similarity index 93%
rename from doc_cn/faq/index.rst
rename to doc/faq/index_cn.rst
index df8f1308cb..6d5367177d 100644
--- a/doc_cn/faq/index.rst
+++ b/doc/faq/index_cn.rst
@@ -1,5 +1,5 @@
 ####################
-PaddlePaddle常见问题
+FAQ
 ####################
 
 ..  contents::
@@ -33,10 +33,9 @@ PyDataProvider使用的是异步加载，同时在内存里直接随即选取数
 个内存池实际上决定了shuffle的粒度。所以，如果将这个内存池减小，又要保证数据是随机的，
 那么最好将数据文件在每次读取之前做一次shuffle。可能的代码为
 
-..  literalinclude:: reduce_min_pool_size.py
+..  literalinclude:: src/reduce_min_pool_size.py
 
-这样做可以极大的减少内存占用，并且可能会加速训练过程，详细文档参考 `这里
-<../ui/data_provider/pydataprovider2.html#provider>`_ 。
+这样做可以极大的减少内存占用，并且可能会加速训练过程，详细文档参考 :ref:`api_pydataprovider2` 。
 
 神经元激活内存
 ++++++++++++++
@@ -73,10 +72,10 @@ PaddlePaddle支持非常多的优化算法(Optimizer)，不同的优化算法需
 减少数据载入的耗时
 ++++++++++++++++++
 
-使用 :code:`pydataprovider`时，可以减少缓存池的大小，同时设置内存缓存功能，即可以极大的加速数据载入流程。
+使用\ :code:`pydataprovider`\ 时，可以减少缓存池的大小，同时设置内存缓存功能，即可以极大的加速数据载入流程。
 :code:`DataProvider` 缓存池的减小，和之前减小通过减小缓存池来减小内存占用的原理一致。
 
-..  literalinclude:: reduce_min_pool_size.py
+..  literalinclude:: src/reduce_min_pool_size.py
 
 同时 :code:`@provider` 接口有一个 :code:`cache` 参数来控制缓存方法，将其设置成 :code:`CacheType.CACHE_PASS_IN_MEM` 的话，会将第一个 :code:`pass` (过完所有训练数据即为一个pass)生成的数据缓存在内存里，在之后的 :code:`pass` 中，不会再从 :code:`python` 端读取数据，而是直接从内存的缓存里读取数据。这也会极大减少数据读入的耗时。
 
@@ -90,13 +89,12 @@ PaddlePaddle支持Sparse的训练，sparse训练需要训练特征是 :code:`spa
 
 使用一个词前两个词和后两个词，来预测这个中间的词。这个任务的DataProvider为\:
 
-..  literalinclude:: word2vec_dataprovider.py
+..  literalinclude:: src/word2vec_dataprovider.py
 
 这个任务的配置为\:
 
-..  literalinclude:: word2vec_config.py
+..  literalinclude:: src/word2vec_config.py
 
-更多关于sparse训练的内容请参考 `sparse训练的文档 <TBD>`_
 
 利用更多的计算资源
 ++++++++++++++++++
@@ -104,17 +102,20 @@ PaddlePaddle支持Sparse的训练，sparse训练需要训练特征是 :code:`spa
 利用更多的计算资源可以分为一下几个方式来进行\:
 
 * 单机CPU训练
+
   * 使用多线程训练。设置命令行参数 :code:`trainer_count`。
 
 * 单机GPU训练
+
   * 使用显卡训练。设置命令行参数 :code:`use_gpu`。
   * 使用多块显卡训练。设置命令行参数 :code:`use_gpu` 和 :code:`trainer_count` 。
 
 * 多机训练
-  * 具体的多机训练方法参考  `多机训练文档 <../ui/data_provider/pydataprovider2.html#provider>`_ 。
+
+  * 请参考 :ref:`cluster_train` 。
 
 
-3. 遇到“非法指令”或者是“illegal instruction” 
+3. 遇到“非法指令”或者是“illegal instruction”
 --------------------------------------------
 
 PaddlePaddle使用avx SIMD指令提高cpu执行效率，因此错误的使用二进制发行版可能会导致这种错误，请选择正确的版本。
@@ -141,7 +142,7 @@ PaddlePaddle使用avx SIMD指令提高cpu执行效率，因此错误的使用二
 
 ..  code-block:: python
 
-    hidden = fc_layer(input=ipt, param_attr=ParamAttr(initial_max=1.0, initial_min=-1.0), 
+    hidden = fc_layer(input=ipt, param_attr=ParamAttr(initial_max=1.0, initial_min=-1.0),
                       bias_attr=ParamAttr(initial_mean=1.0, initial_std=0.0))
 
 上述代码将bias全部初始化为1.0, 同时将参数初始化为 :code:`[1.0, -1.0]` 的均匀分布。
@@ -157,8 +158,8 @@ PaddlePaddle的参数使用名字 :code:`name` 作为参数的ID，相同名字
 
 这里 :code:`hidden_a` 和 :code:`hidden_b` 使用了同样的parameter和bias。并且softmax层的两个输入也使用了同样的参数 :code:`softmax_param`。
 
-7. *-cp27mu-linux_x86_64.whl is not a supported wheel on this platform.
------------------------------------------------------------------------
+7. \*-cp27mu-linux_x86_64.whl is not a supported wheel on this platform.
+------------------------------------------------------------------------
 
 出现这个问题的主要原因是，系统编译wheel包的时候，使用的 :code:`wheel` 包是最新的，
 而系统中的 :code:`pip` 包比较老。具体的解决方法是，更新 :code:`pip` 包并重新编译PaddlePaddle。
@@ -191,14 +192,14 @@ PaddlePaddle的参数使用名字 :code:`name` 作为参数的ID，相同名字
     41 - test_config_parser (Failed)
     42 - test_swig_api (Failed)
     43 - layers_test (Failed)
-    
+
 并且查询PaddlePaddle单元测试的日志，提示：
 
 ..  code-block:: bash
-    
+
     paddle package is already in your PYTHONPATH. But unittest need a clean environment.
     Please uninstall paddle package before start unittest. Try to 'pip uninstall paddle'.
-    
+
 解决办法是：
 
 * 卸载PaddlePaddle包 :code:`pip uninstall paddle`, 清理掉老旧的PaddlePaddle安装包，使得单元测试有一个干净的环境。如果PaddlePaddle包已经在python的site-packages里面，单元测试会引用site-packages里面的python包，而不是源码目录里 :code:`/python` 目录下的python包。同时，即便设置 :code:`PYTHONPATH` 到 :code:`/python` 也没用，因为python的搜索路径是优先已经安装的python包。
@@ -220,18 +221,18 @@ PaddlePaddle的参数使用名字 :code:`name` 作为参数的ID，相同名字
 
 
 10. CMake源码编译, 找到的PythonLibs和PythonInterp版本不一致
-----------------------------------------------------------
+----------------------------------------------------------------
 
 这是目前CMake寻找Python的逻辑存在缺陷，如果系统安装了多个Python版本，CMake找到的Python库和Python解释器版本可能有不一致现象，导致编译PaddlePaddle失败。正确的解决方法是，
 用户强制指定特定的Python版本，具体操作如下：
 
     ..  code-block:: bash
-        
+
         cmake .. -DPYTHON_EXECUTABLE=<exc_path> -DPYTHON_LIBRARY=<lib_path>  -DPYTHON_INCLUDE_DIR=<inc_path>
 
 用户需要指定本机上Python的路径：``<exc_path>``, ``<lib_path>``, ``<inc_path>``
 
-10. A protocol message was rejected because it was too big
+11. A protocol message was rejected because it was too big
 ----------------------------------------------------------
 
 如果在训练NLP相关模型时，出现以下错误：
@@ -239,7 +240,7 @@ PaddlePaddle的参数使用名字 :code:`name` 作为参数的ID，相同名字
 ..  code-block:: bash
 
     [libprotobuf ERROR google/protobuf/io/coded_stream.cc:171] A protocol message was rejected because it was too big (more than 67108864 bytes).  To increase the limit (or to disable these warnings), see CodedInputStream::SetTotalBytesLimit() in google/protobuf/io/coded_stream.h.
-    F1205 14:59:50.295174 14703 TrainerConfigHelper.cpp:59] Check failed: m->conf.ParseFromString(configProtoStr) 
+    F1205 14:59:50.295174 14703 TrainerConfigHelper.cpp:59] Check failed: m->conf.ParseFromString(configProtoStr)
 
 可能的原因是：传给dataprovider的某一个args过大，一般是由于直接传递大字典导致的。错误的define_py_data_sources2类似：
 
diff --git a/doc_cn/faq/reduce_min_pool_size.py b/doc/faq/src/reduce_min_pool_size.py
similarity index 100%
rename from doc_cn/faq/reduce_min_pool_size.py
rename to doc/faq/src/reduce_min_pool_size.py
diff --git a/doc_cn/faq/word2vec_config.py b/doc/faq/src/word2vec_config.py
similarity index 100%
rename from doc_cn/faq/word2vec_config.py
rename to doc/faq/src/word2vec_config.py
diff --git a/doc_cn/faq/word2vec_dataprovider.py b/doc/faq/src/word2vec_dataprovider.py
similarity index 100%
rename from doc_cn/faq/word2vec_dataprovider.py
rename to doc/faq/src/word2vec_dataprovider.py
diff --git a/doc_cn/introduction/index.rst b/doc/getstarted/basic_usage/index_cn.rst
similarity index 80%
rename from doc_cn/introduction/index.rst
rename to doc/getstarted/basic_usage/index_cn.rst
index c996f5f4ac..d01cdaaeb7 100644
--- a/doc_cn/introduction/index.rst
+++ b/doc/getstarted/basic_usage/index_cn.rst
@@ -1,16 +1,16 @@
-简介
-====
+经典的线性回归任务
+==================
 
 PaddlePaddle是源于百度的一个深度学习平台。这份简短的介绍将向你展示如何利用PaddlePaddle来解决一个经典的线性回归问题。
 
-1. 一个经典的任务
------------------
+任务简介
+--------
 
 我们展示如何用PaddlePaddle解决 `单变量的线性回归 <https://www.baidu.com/s?wd=单变量线性回归>`_ 问题。线性回归的输入是一批点 `(x, y)` ，其中 `y = wx + b + ε`， 而 ε 是一个符合高斯分布的随机变量。线性回归的输出是从这批点估计出来的参数 `w` 和 `b` 。
 
 一个例子是房产估值。我们假设房产的价格（y）是其大小（x）的一个线性函数，那么我们可以通过收集市场上房子的大小和价格，用来估计线性函数的参数w 和 b。
 
-2. 准备数据
+准备数据
 -----------
 
 假设变量 `x` 和 `y` 的真实关系为： `y = 2x + 0.3 + ε`，这里展示如何使用观测数据来拟合这一线性关系。首先，Python代码将随机产生2000个观测点，作为线性回归的输入。下面脚本符合PaddlePaddle期待的读取数据的Python程序的模式。
@@ -28,7 +28,7 @@ PaddlePaddle是源于百度的一个深度学习平台。这份简短的介绍
             x = random.random()
             yield [x], [2*x+0.3]
 
-3. 训练模型
+训练模型
 -----------
 
 为了还原 `y = 2x + 0.3`，我们先从一条随机的直线 `y' = wx + b` 开始，然后利用观测数据调整 `w` 和 `b` 使得 `y'` 和 `y` 的差距不断减小，最终趋于接近。这个过程就是模型的训练过程，而 `w` 和 `b` 就是模型的参数，即我们的训练目标。
@@ -58,6 +58,7 @@ PaddlePaddle是源于百度的一个深度学习平台。这份简短的介绍
     cost = regression_cost(input= ȳ, label=y)
     outputs(cost)
 
+
 这段简短的配置展示了PaddlePaddle的基本用法：
 
 - 第一部分定义了数据输入。一般情况下，PaddlePaddle先从一个文件列表里获得数据文件地址，然后交给用户自定义的函数（例如上面的 `process`函数）进行读入和预处理从而得到真实输入。本文中由于输入数据是随机生成的不需要读输入文件，所以放一个空列表（`empty.list`）即可。
@@ -65,10 +66,10 @@ PaddlePaddle是源于百度的一个深度学习平台。这份简短的介绍
 - 第二部分主要是选择学习算法，它定义了模型参数改变的规则。PaddlePaddle提供了很多优秀的学习算法，这里使用一个基于momentum的随机梯度下降(SGD)算法，该算法每批量(batch)读取12个采样数据进行随机梯度计算来更新更新。
 
 - 最后一部分是神经网络的配置。由于PaddlePaddle已经实现了丰富的网络层，所以很多时候你需要做的只是定义正确的网络层并把它们连接起来。这里使用了三种网络单元：
-	
-	- **数据层**：数据层 `data_layer` 是神经网络的入口，它读入数据并将它们传输到接下来的网络层。这里数据层有两个，分别对应于变量 `x` 和 `y`。
-	- **全连接层**：全连接层 `fc_layer` 是基础的计算单元，这里利用它建模变量之间的线性关系。计算单元是神经网络的核心，PaddlePaddle支持大量的计算单元和任意深度的网络连接，从而可以拟合任意的函数来学习复杂的数据关系。
-	- **回归误差代价层**：回归误差代价层 `regression_cost` 是众多误差代价函数层的一种，它们在训练过程作为网络的出口，用来计算模型的误差，是模型参数优化的目标函数。
+    
+    - **数据层**：数据层 `data_layer` 是神经网络的入口，它读入数据并将它们传输到接下来的网络层。这里数据层有两个，分别对应于变量 `x` 和 `y`。
+    - **全连接层**：全连接层 `fc_layer` 是基础的计算单元，这里利用它建模变量之间的线性关系。计算单元是神经网络的核心，PaddlePaddle支持大量的计算单元和任意深度的网络连接，从而可以拟合任意的函数来学习复杂的数据关系。
+    - **回归误差代价层**：回归误差代价层 `regression_cost` 是众多误差代价函数层的一种，它们在训练过程作为网络的出口，用来计算模型的误差，是模型参数优化的目标函数。
 
 定义了网络结构并保存为 `trainer_config.py` 之后，运行以下训练命令：
 
@@ -78,7 +79,7 @@ PaddlePaddle是源于百度的一个深度学习平台。这份简短的介绍
 
 PaddlePaddle将在观测数据集上迭代训练30轮，并将每轮的模型结果存放在 `./output` 路径下。从输出日志可以看到，随着轮数增加误差代价函数的输出在不断的减小，这意味着模型在训练数据上不断的改进，直到逼近真实解：` y = 2x + 0.3 `
 
-4. 模型检验
+模型检验
 -----------
 
 训练完成后，我们希望能够检验模型的好坏。一种常用的做法是用学习的模型对另外一组测试数据进行预测，评价预测的效果。在这个例子中，由于已经知道了真实答案，我们可以直接观察模型的参数是否符合预期来进行检验。
@@ -99,16 +100,9 @@ PaddlePaddle将每个模型参数作为一个numpy数组单独存为一个文件
     # w=1.999743, b=0.300137
 
 .. image:: ./parameters.png
-	 :align: center
-	 :scale: 80 %
+     :align: center
+     :scale: 80 %
 
 从图中可以看到，虽然 `w` 和 `b` 都使用随机值初始化，但在起初的几轮训练中它们都在快速逼近真实值，并且后续仍在不断改进，使得最终得到的模型几乎与真实模型一致。
 
 这样，我们用PaddlePaddle解决了单变量线性回归问题， 包括数据输入、模型训练和最后的结果验证。
-
-5. 推荐后续阅读
----------------
-
-- `安装/编译 <../build_and_install/index.html>`_ ：PaddlePaddle的安装与编译文档。
-- `快速入门 <../demo/quick_start/index.html>`_ ：使用商品评论分类任务，系统性的介绍如何一步步改进，最终得到产品级的深度模型。
-- `示例 <../demo/index.html>`_ ：各种实用案例，涵盖图像、文本、推荐等多个领域。
\ No newline at end of file
diff --git a/doc/getstarted/basic_usage/index_en.rst b/doc/getstarted/basic_usage/index_en.rst
index 4ffadc68ee..c10b897d42 100644
--- a/doc/getstarted/basic_usage/index_en.rst
+++ b/doc/getstarted/basic_usage/index_en.rst
@@ -1,15 +1,15 @@
-Basic Usage
-=============
+Simple Linear Regression
+========================
 
 PaddlePaddle is a deep learning platform open-sourced by Baidu. With PaddlePaddle, you can easily train a classic neural network within a couple lines of configuration, or you can build sophisticated models that provide state-of-the-art performance on difficult learning tasks like sentiment analysis, machine translation, image caption and so on.
 
-1. A Classic Problem
----------------------
+Problem Background
+------------------
 
 Now, to give you a hint of what using PaddlePaddle looks like, let's start with a fundamental learning problem - `simple linear regression <https://en.wikipedia.org/wiki/Simple_linear_regression>`_: you have observed a set of two-dimensional data points of ``X`` and ``Y``, where ``X`` is an explanatory variable and ``Y`` is corresponding dependent variable, and you want to recover the underlying correlation between ``X`` and ``Y``. Linear regression can be used in many practical scenarios. For example, ``X`` can be a variable about house size, and ``Y`` a variable about house price. You can build a model that captures relationship between them by observing real estate markets.
 
-2. Prepare the Data
---------------------
+Prepare the Data
+-----------------
 
 Suppose the true relationship can be characterized as ``Y = 2X + 0.3``, let's see how to recover this pattern only from observed data. Here is a piece of python code that feeds synthetic data to PaddlePaddle. The code is pretty self-explanatory, the only extra thing you need to add for PaddlePaddle is a definition of input data types.
 
@@ -26,8 +26,8 @@ Suppose the true relationship can be characterized as ``Y = 2X + 0.3``, let's se
                 x = random.random()
                 yield [x], [2*x+0.3]
 
-3. Train a NeuralNetwork
--------------------------
+Train a Neural Network
+----------------------
 
 To recover this relationship between ``X`` and ``Y``, we use a neural network with one layer of linear activation units and a square error cost layer. Don't worry if you are not familiar with these terminologies, it's just saying that we are starting from a random line ``Y' = wX + b`` , then we gradually adapt ``w`` and ``b`` to minimize the difference between ``Y'`` and ``Y``. Here is what it looks like in PaddlePaddle:
 
@@ -73,8 +73,8 @@ Now that everything is ready, you can train the network with a simple command li
 This means that PaddlePaddle will train this network on the synthectic dataset for 30 passes, and save all the models under path ``./output``. You will see from the messages printed out during training phase that the model cost is decreasing as time goes by, which indicates we are getting a closer guess.
 
 
-4. Evaluate the Model
------------------------
+Evaluate the Model
+-------------------
 
 Usually, a different dataset that left out during training phase should be used to evalute the models. However, we are lucky enough to know the real answer: ``w=2, b=0.3``, thus a better option is to check out model parameters directly.
 
diff --git a/doc/getstarted/build_and_install/build_from_source_en.md b/doc/getstarted/build_and_install/build_from_source_en.md
index 5db871d59a..6954be3b2b 100644
--- a/doc/getstarted/build_and_install/build_from_source_en.md
+++ b/doc/getstarted/build_and_install/build_from_source_en.md
@@ -11,32 +11,21 @@ You can download PaddlePaddle from the [github source](https://github.com/Paddle
 ```bash
 git clone https://github.com/PaddlePaddle/Paddle paddle
 cd paddle
-git submodule update --init --recursive
 ```
-
-If you already have a local PaddlePaddle repo and have not initialized the submodule, your local submodule folder will be empty. You can simply run the last line of the above codes in your PaddlePaddle home directory to initialize your submodule folder.
-
-If you have already initialized your submodule and you would like to sync with the upstream submodule repo, you can run the following command
-```
-git submodule update --remote
-```
-
 ## <span id="requirements">Requirements</span>
 
 To compile the source code, your computer must be equipped with the following dependencies.
 
 - **Compiler**: GCC >= 4.8 or Clang >= 3.3 (AppleClang >= 5.1)
-- **CMake**: version >= 2.8
+- **CMake**: version >= 3.0 (at least CMake 3.4 on Mac OS X)
 - **BLAS**: MKL, OpenBlas or ATLAS
-- **Protocol Buffers**: version >= 2.4, **Note: 3.x is not supported**
-- **Python**: only python 2.7 is supported currently
 
 **Note:** For CUDA 7.0 and CUDA 7.5, GCC 5.0 and up are not supported!
 For CUDA 8.0, GCC versions later than 5.3 are not supported!
 
 ### Options
 
-PaddlePaddle supports some build options. To enable it, first you need to install the related libraries. 
+PaddlePaddle supports some build options. 
 
 <html>
 <table> 
@@ -47,14 +36,21 @@ PaddlePaddle supports some build options. To enable it, first you need to instal
 </tr>
 </thead>
 <tbody>
-<tr><td class="left">WITH_GPU</td><td class="left">Compile with GPU mode.</td></tr>
-<tr><td class="left">WITH_DOUBLE</td><td class="left">Compile with double precision floating-point, default: single precision.</td></tr>
-<tr><td class="left">WITH_GLOG</td><td class="left">Compile with glog. If not found, default: an internal log implementation.</td></tr>
-<tr><td class="left">WITH_GFLAGS</td><td class="left">Compile with gflags. If not found, default: an internal flag implementation.</td></tr>
-<tr><td class="left">WITH_TESTING</td><td class="left">Compile with gtest for PaddlePaddle's unit testing.</td></tr>
-<tr><td class="left">WITH_DOC</td><td class="left">	Compile to generate PaddlePaddle's docs, default: disabled (OFF).</td></tr>
-<tr><td class="left">WITH_SWIG_PY</td><td class="left">Compile with python predict API, default: disabled (OFF).</td></tr>
-<tr><td class="left">WITH_STYLE_CHECK</td><td class="left">Compile with code style check, default: enabled (ON).</td></tr>
+<tr><td class="left">WITH_GPU</td><td class="left">Compile PaddlePaddle with NVIDIA GPU</td></tr>
+<tr><td class="left">WITH_AVX</td><td class="left">Compile PaddlePaddle with AVX intrinsics</td></tr>
+<tr><td class="left">WITH_DSO</td><td class="left">Compile PaddlePaddle with dynamically linked CUDA</td></tr>
+<tr><td class="left">WITH_TESTING</td><td class="left">Compile PaddlePaddle with unit testing</td></tr>
+<tr><td class="left">WITH_SWIG_PY</td><td class="left">Compile PaddlePaddle with inference API</td></tr>
+<tr><td class="left">WITH_STYLE_CHECK</td><td class="left">Compile PaddlePaddle with style check</td></tr>
+<tr><td class="left">WITH_PYTHON</td><td class="left">Compile PaddlePaddle with python interpreter</td></tr>
+<tr><td class="left">WITH_DOUBLE</td><td class="left">Compile PaddlePaddle with double precision</td></tr>
+<tr><td class="left">WITH_RDMA</td><td class="left">Compile PaddlePaddle with RDMA support</td></tr>
+<tr><td class="left">WITH_TIMER</td><td class="left">Compile PaddlePaddle with stats timer</td></tr>
+<tr><td class="left">WITH_PROFILER</td><td class="left">Compile PaddlePaddle with GPU profiler</td></tr>
+<tr><td class="left">WITH_DOC</td><td class="left">Compile PaddlePaddle with documentation</td></tr>
+<tr><td class="left">ON_COVERALLS</td><td class="left">Compile PaddlePaddle with code coverage</td></tr>
+<tr><td class="left">COVERALLS_UPLOAD</td><td class="left">Package code coverage data to coveralls</td></tr>
+<tr><td class="left">ON_TRAVIS</td><td class="left">Exclude special unit test on Travis CI</td></tr>
 </tbody>
 </table>
 </html>
@@ -66,18 +62,15 @@ PaddlePaddle supports some build options. To enable it, first you need to instal
 
 As a simple example, consider the following:  
 
-1. **Python Dependencies(optional)**
+1. **BLAS Dependencies(optional)**
   
-    To compile PaddlePaddle with python predict API, make sure swig installed and set `-DWITH_SWIG_PY=ON` as follows:
+    PaddlePaddle searches for BLAS in the system's default paths. You can also point it at a specific MKL, OpenBLAS or ATLAS installation via `MKL_ROOT`, `OPENBLAS_ROOT` or `ATLAS_ROOT`.
 
     ```bash
-    # install swig on ubuntu
-    sudo apt-get install swig
-    # install swig on Mac OS X
-    brew install swig
-
-    # active swig in cmake
-    cmake .. -DWITH_SWIG_PY=ON
+    # specify MKL
+    cmake .. -DMKL_ROOT=<mkl_path>
+    # or specify OpenBLAS
+    cmake .. -DOPENBLAS_ROOT=<openblas_path>
     ```
 
 2. **Doc Dependencies(optional)**
@@ -106,17 +99,9 @@ As a simple example, consider the following:
     ```bash
     # necessary
     sudo apt-get update
-    sudo apt-get install -y g++ make cmake swig build-essential libatlas-base-dev python python-pip libpython-dev m4 libprotobuf-dev protobuf-compiler python-protobuf python-numpy git
-    # optional
-    sudo apt-get install libgoogle-glog-dev
-    sudo apt-get install libgflags-dev
-    sudo apt-get install libgtest-dev
-    sudo pip install wheel
-    pushd /usr/src/gtest
-    cmake .
-    make
-    sudo cp *.a /usr/lib
-    popd
+    sudo apt-get install -y g++ make cmake build-essential libatlas-base-dev python python-pip libpython-dev git
+    sudo pip install wheel numpy
+    sudo pip install 'protobuf>=3.0.0'
     ```
   
 - **GPU Dependencies (optional)**
@@ -151,51 +136,17 @@ As usual, the best option is to create build folder under paddle project directo
 
 ```bash
 mkdir build && cd build
-cmake ..
-```
-
-CMake first check PaddlePaddle's dependencies in system default path. After installing some optional
-libraries, corresponding build option will be set automatically (for instance, glog, gtest and gflags).
-If still not found, you can manually set it based on CMake error information from your screen.
-
-As a simple example, consider the following:
+``` 
 
-- **Only CPU with swig**
-
-  ```bash
-  cmake  .. -DWITH_GPU=OFF -DWITH_SWIG_PY=ON
-  ```
-- **GPU with swig**
-
-  ```bash
-  cmake .. -DWITH_GPU=ON -DWITH_SWIG_PY=ON
-  ```
-
-- **GPU with doc and swig**
-
-  ```bash
-  cmake .. -DWITH_GPU=ON -DWITH_DOC=ON -DWITH_SWIG_PY=ON
-  ``` 
-
-Finally, you can build PaddlePaddle:
+Finally, you can build and install PaddlePaddle:
 
 ```bash
 # you can add build option here, such as:    
-cmake .. -DWITH_GPU=ON -DCMAKE_INSTALL_PREFIX=<path to install> -DWITH_SWIG_PY=ON
+cmake .. -DCMAKE_INSTALL_PREFIX=<path to install>
 # please use sudo make install, if you want to install PaddlePaddle into the system
 make -j `nproc` && make install
 # set PaddlePaddle installation path in ~/.bashrc
 export PATH=<path to install>/bin:$PATH
-```
-
-If you set `WITH_SWIG_PY=ON`, related python dependencies also need to be installed.
-Otherwise, PaddlePaddle will automatically install python dependencies
-at first time when user run paddle commands, such as `paddle version`, `paddle train`.
-It may require sudo privileges:
-
-```bash
-# you can run
+# install PaddlePaddle Python modules.
 sudo pip install <path to install>/opt/paddle/share/wheels/*.whl
-# or just run 
-sudo paddle version
 ```
diff --git a/doc_cn/build_and_install/cmake/compile_options.rst b/doc/getstarted/build_and_install/cmake/build_from_source_cn.rst
similarity index 94%
rename from doc_cn/build_and_install/cmake/compile_options.rst
rename to doc/getstarted/build_and_install/cmake/build_from_source_cn.rst
index f345ead2bf..be0c1ffa45 100644
--- a/doc_cn/build_and_install/cmake/compile_options.rst
+++ b/doc/getstarted/build_and_install/cmake/build_from_source_cn.rst
@@ -1,43 +1,43 @@
-PaddlePaddle的编译选项
-======================
-
-PaddlePaddle的编译选项，包括生成CPU/GPU二进制文件、链接何种BLAS库等。用户可在调用cmake的时候设置它们，详细的cmake使用方法可以参考 `官方文档 <https://cmake.org/cmake-tutorial>`_ 。
-
-Bool型的编译选项
-----------------
-用户可在cmake的命令行中，通过使用 ``-D`` 命令设置该类编译选项，例如
-
-..  code-block:: bash
-
-    cmake .. -DWITH_GPU=OFF
-
-..  csv-table:: Bool型的编译选项
-    :widths: 1, 7, 2
-    :file: compile_options.csv
-
-BLAS/CUDA/Cudnn的编译选项
---------------------------
-BLAS
-+++++
-
-PaddlePaddle支持以下任意一种BLAS库：`MKL <https://software.intel.com/en-us/intel-mkl>`_ ，`ATLAS <http://math-atlas.sourceforge.net/>`_ ，`OpenBlAS <http://www.openblas.net/>`_ 和 `REFERENCE BLAS <http://www.netlib.org/blas/>`_ 。
-
-..  csv-table:: BLAS路径相关的编译选项
-    :widths: 1, 2, 7
-    :file: cblas_settings.csv
-
-CUDA/Cudnn
-+++++++++++
-
-PaddlePaddle可以使用cudnn v2之后的任何一个版本来编译运行，但尽量请保持编译和运行使用的cudnn是同一个版本。 我们推荐使用最新版本的cudnn v5.1。
-
-编译选项的设置
-++++++++++++++
-
-PaddePaddle通过编译时指定路径来实现引用各种BLAS/CUDA/Cudnn库。cmake编译时，首先在系统路径(/usr/lib\:/usr/local/lib)中搜索这几个库，同时也会读取相关路径变量来进行搜索。 通过使用 ``-D`` 命令可以设置，例如 
-
-..  code-block:: bash
-
-    cmake .. -DMKL_ROOT=/opt/mkl/ -DCUDNN_ROOT=/opt/cudnnv5
-
-注意：这几个编译选项的设置，只在第一次cmake的时候有效。如果之后想要重新设置，推荐清理整个编译目录（``rm -rf``）后，再指定。
\ No newline at end of file
+PaddlePaddle的编译选项
+======================
+
+PaddlePaddle的编译选项，包括生成CPU/GPU二进制文件、链接何种BLAS库等。用户可在调用cmake的时候设置它们，详细的cmake使用方法可以参考 `官方文档 <https://cmake.org/cmake-tutorial>`_ 。
+
+Bool型的编译选项
+----------------
+用户可在cmake的命令行中，通过使用 ``-D`` 命令设置该类编译选项，例如
+
+..  code-block:: bash
+
+    cmake .. -DWITH_GPU=OFF
+
+..  csv-table:: Bool型的编译选项
+    :widths: 1, 7, 2
+    :file: compile_options.csv
+
+BLAS/CUDA/Cudnn的编译选项
+--------------------------
+BLAS
++++++
+
+PaddlePaddle支持以下任意一种BLAS库：`MKL <https://software.intel.com/en-us/intel-mkl>`_ ，`ATLAS <http://math-atlas.sourceforge.net/>`_ ，`OpenBLAS <http://www.openblas.net/>`_ 和 `REFERENCE BLAS <http://www.netlib.org/blas/>`_ 。
+
+..  csv-table:: BLAS路径相关的编译选项
+    :widths: 1, 2, 7
+    :file: cblas_settings.csv
+
+CUDA/Cudnn
++++++++++++
+
+PaddlePaddle可以使用cudnn v2之后的任何一个版本来编译运行，但请尽量保持编译和运行使用的cudnn是同一个版本。 我们推荐使用最新版本的cudnn v5.1。
+
+编译选项的设置
+++++++++++++++
+
+PaddlePaddle通过编译时指定路径来实现引用各种BLAS/CUDA/Cudnn库。cmake编译时，首先在系统路径(/usr/lib\:/usr/local/lib)中搜索这几个库，同时也会读取相关路径变量来进行搜索。 通过使用 ``-D`` 命令可以设置，例如：
+
+..  code-block:: bash
+
+    cmake .. -DMKL_ROOT=/opt/mkl/ -DCUDNN_ROOT=/opt/cudnnv5
+
+注意：这几个编译选项的设置，只在第一次cmake的时候有效。如果之后想要重新设置，推荐清理整个编译目录（``rm -rf``）后，再指定。
diff --git a/doc_cn/build_and_install/cmake/cblas_settings.csv b/doc/getstarted/build_and_install/cmake/cblas_settings.csv
similarity index 100%
rename from doc_cn/build_and_install/cmake/cblas_settings.csv
rename to doc/getstarted/build_and_install/cmake/cblas_settings.csv
diff --git a/doc_cn/build_and_install/cmake/compile_options.csv b/doc/getstarted/build_and_install/cmake/compile_options.csv
similarity index 65%
rename from doc_cn/build_and_install/cmake/compile_options.csv
rename to doc/getstarted/build_and_install/cmake/compile_options.csv
index 12b45eebb2..463b825470 100644
--- a/doc_cn/build_and_install/cmake/compile_options.csv
+++ b/doc/getstarted/build_and_install/cmake/compile_options.csv
@@ -1,14 +1,12 @@
-选项,说明,默认值
-WITH_GPU,是否支持GPU。,取决于是否寻找到CUDA工具链
-WITH_DOUBLE,是否使用双精度浮点数。,否
-WITH_DSO,是否运行时动态加载CUDA动态库，而非静态加载CUDA动态库。,是
-WITH_AVX,是否编译含有AVX指令集的PaddlePaddle二进制文件,是
-WITH_PYTHON,是否内嵌PYTHON解释器。方便今后的嵌入式移植工作。,是
-WITH_STYLE_CHECK,是否编译时进行代码风格检查,是
-WITH_RDMA,是否开启RDMA,否
-WITH_GLOG,是否开启GLOG。如果不开启，则会使用一个简化版的日志，同时方便今后的嵌入式移植工作。,取决于是否寻找到GLOG
-WITH_GFLAGS,是否使用GFLAGS。如果不开启，则会使用一个简化版的命令行参数解析器，同时方便今后的嵌入式移植工作。,取决于是否寻找到GFLAGS
-WITH_TIMER,是否开启计时功能。如果开启会导致运行略慢，打印的日志变多，但是方便调试和测Benchmark,否
-WITH_TESTING,是否开启单元测试,取决于是否寻找到GTEST
-WITH_DOC,是否编译中英文文档,否
+选项,说明,默认值
+WITH_GPU,是否支持GPU。,取决于是否寻找到CUDA工具链
+WITH_DOUBLE,是否使用双精度浮点数。,否
+WITH_DSO,是否运行时动态加载CUDA动态库，而非静态加载CUDA动态库。,是
+WITH_AVX,是否编译含有AVX指令集的PaddlePaddle二进制文件,是
+WITH_PYTHON,是否内嵌PYTHON解释器。方便今后的嵌入式移植工作。,是
+WITH_STYLE_CHECK,是否编译时进行代码风格检查,是
+WITH_RDMA,是否开启RDMA,否
+WITH_TIMER,是否开启计时功能。如果开启会导致运行略慢，打印的日志变多，但是方便调试和测Benchmark,否
+WITH_TESTING,是否开启单元测试,取决于是否寻找到GTEST
+WITH_DOC,是否编译中英文文档,否
 WITH_SWIG_PY,是否编译PYTHON的SWIG接口，该接口可用于预测和定制化训练,取决于是否寻找到SWIG
\ No newline at end of file
diff --git a/doc_cn/build_and_install/install/docker_install.rst b/doc/getstarted/build_and_install/docker_install_cn.rst
similarity index 93%
rename from doc_cn/build_and_install/install/docker_install.rst
rename to doc/getstarted/build_and_install/docker_install_cn.rst
index 40339659be..35234e0eb3 100644
--- a/doc_cn/build_and_install/install/docker_install.rst
+++ b/doc/getstarted/build_and_install/docker_install_cn.rst
@@ -111,7 +111,24 @@ cuda相关的Driver和设备映射进container中，脚本类似于
 
 简单的含有ssh的Dockerfile如下：
 
-..  literalinclude:: paddle_ssh.Dockerfile
+..  code-block:: bash
+
+    FROM paddledev/paddle:cpu-latest
+
+    MAINTAINER PaddlePaddle dev team <paddle-dev@baidu.com>
+
+    RUN apt-get update
+    RUN apt-get install -y openssh-server
+    RUN mkdir /var/run/sshd
+    RUN echo 'root:root' | chpasswd
+
+    RUN sed -ri 's/^PermitRootLogin\s+.*/PermitRootLogin yes/' /etc/ssh/sshd_config
+    RUN sed -ri 's/UsePAM yes/#UsePAM yes/g' /etc/ssh/sshd_config
+
+    EXPOSE 22
+
+    CMD    ["/usr/sbin/sshd", "-D"]
+
 
 使用该Dockerfile构建出镜像，然后运行这个container即可。相关命令为\:
 
diff --git a/doc/getstarted/build_and_install/docker_install_en.rst b/doc/getstarted/build_and_install/docker_install_en.rst
index 8df7e063a1..51a1a11674 100644
--- a/doc/getstarted/build_and_install/docker_install_en.rst
+++ b/doc/getstarted/build_and_install/docker_install_en.rst
@@ -9,6 +9,79 @@ Please be aware that you will need to change `Dockers settings
 of your hardware resource on Mac OS X and Windows.
 
 
+Development Using Docker
+------------------------
+
+Developers can work on PaddlePaddle using Docker.  This allows
+developers to work on different platforms -- Linux, Mac OS X, and
+Windows -- in a consistent way.
+
+The general development workflow with Docker and CMake is as follows:
+
+1. Get the source code of Paddle:
+
+   .. code-block:: bash
+
+      git clone https://github.com/PaddlePaddle/Paddle.git
+
+
+2. Build a development Docker image :code:`paddle:dev` from the source
+   code.  This image contains all the development tools and
+   dependencies of PaddlePaddle.
+
+   .. code-block:: bash
+
+      cd Paddle
+      docker build -t paddle:dev -f paddle/scripts/docker/Dockerfile .
+
+   Sometimes docker build might suffer from a slow network connection to the official Ubuntu apt-source servers. In such cases, we can specify an apt-source mirror server that is geographically nearer to us with :code:`--build-arg UBUNTU_MIRROR`. The following example specifies an apt-source server that responds fast in China.
+
+   .. code-block:: bash
+
+      docker build \
+       --build-arg UBUNTU_MIRROR="http://mirrors.163.com" \
+       -t paddle:dev \
+       -f paddle/scripts/docker/Dockerfile .
+
+
+3. Run the image as a container and mount the local source code
+   directory into the container.  This allows us to change the code on
+   the host and build it within the container.
+
+   .. code-block:: bash
+
+      docker run       \
+       -d              \
+       --name paddle   \
+       -p 2022:22      \
+       -v $PWD:/paddle \
+       paddle:dev
+
+   where :code:`-d` runs the container in the background,
+   :code:`--name paddle` names the container so that an Nginx container
+   can later serve documents from it, :code:`-p 2022:22` allows us to SSH
+   into this container, and :code:`-v $PWD:/paddle` shares the source code
+   on the host with the container.
+
+4. SSH into the container:
+
+   .. code-block:: bash
+
+      ssh root@localhost -p 2022
+
+5. We can edit the source code in the container or on the host.  Then
+   we can build using cmake:
+
+   .. code-block:: bash
+
+      cd /paddle # where paddle source code has been mounted into the container
+      mkdir -p build
+      cd build
+      cmake -DWITH_TESTING=ON ..
+      make -j `nproc`
+      CTEST_OUTPUT_ON_FAILURE=1 ctest
+
+
 CPU-only and GPU Images
 -----------------------
 
@@ -17,7 +90,7 @@ CPU-only one and a CUDA GPU one.  We do so by configuring
 `dockerhub.com <https://hub.docker.com/r/paddledev/paddle/>`_
 automatically runs the following commands:
 
-.. code-block:: base
+.. code-block:: bash
 
    docker build -t paddle:cpu -f paddle/scripts/docker/Dockerfile .
    docker build -t paddle:gpu -f paddle/scripts/docker/Dockerfile.gpu .
@@ -77,9 +150,8 @@ source code:
 .. code-block:: bash
 
    cd ~
-   git clone github.com/PaddlePaddle/Paddle
+   git clone https://github.com/PaddlePaddle/Paddle.git
    cd Paddle
-   git submodule update --init --recursive
    docker build --build-arg WITH_AVX=OFF -t paddle:cpu-noavx -f paddle/scripts/docker/Dockerfile .
    docker build --build-arg WITH_AVX=OFF -t paddle:gpu-noavx -f paddle/scripts/docker/Dockerfile.gpu .
 
@@ -93,7 +165,7 @@ generated using `woboq code browser
 for users to browse and understand the C++ source code.
 
 As long as we give the Paddle Docker container a name, we can run an
-additional nginx Docker container to serve the volume from the Paddle
+additional Nginx Docker container to serve the volume from the Paddle
 container:
 
 .. code-block:: bash
@@ -104,78 +176,3 @@ container:
 
 Then we can direct our Web browser to the HTML version of source code
 at http://localhost:8088/paddle/
-
-
-Development Using Docker
-------------------------
-
-Develpers can work on PaddlePaddle using Docker.  This allows
-developers to work on different platforms -- Linux, Mac OS X, and
-Windows -- in a consistent way.
-
-The general development workflow with Docker and Bazel is as follows:
-
-1. Get the source code of Paddle:
-
-   .. code-block:: bash
-
-      git clone --recursive https://github.com/paddlepaddle/paddle
-
-
-2. Build a development Docker image :code:`paddle:dev` from the source
-   code.  This image contains all the development tools and
-   dependencies of PaddlePaddle.
-
-
-   .. code-block:: bash
-
-      cd paddle
-      docker build -t paddle:dev -f paddle/scripts/docker/Dockerfile .
-
-
-3. Run the image as a container and mounting local source code
-   directory into the container.  This allows us to change the code on
-   the host and build it within the container.
-
-   .. code-block:: bash
-
-      docker run       \
-       -d              \
-       --name paddle   \
-       -p 2022:22      \
-       -v $PWD:/paddle \
-       -v $HOME/.cache/bazel:/root/.cache/bazel \
-       paddle:dev
-
-   where :code:`-d` makes the container running in background,
-   :code:`--name paddle` allows us to run a nginx container to serve
-   documents in this container, :code:`-p 2022:22` allows us to SSH
-   into this container, :code:`-v $PWD:/paddle` shares the source code
-   on the host with the container, :code:`-v
-   $HOME/.cache/bazel:/root/.cache/bazel` shares Bazel cache on the
-   host with the container.
-
-4. SSH into the container:
-
-   .. code-block:: bash
-
-      ssh root@localhost -p 2022
-
-5. We can edit the source code in the container or on this host.  Then
-   we can build using cmake
-
-   .. code-block:: bash
-
-      cd /paddle # where paddle source code has been mounted into the container
-      mkdir -p build
-      cd build
-      cmake -DWITH_TESTING=ON ..
-      make -j `nproc`
-      CTEST_OUTPUT_ON_FAILURE=1 ctest
-
-   or Bazel in the container:
-
-   .. code-block:: bash
-
-      cd /paddle
-      bazel test ...
diff --git a/doc/getstarted/build_and_install/index_cn.rst b/doc/getstarted/build_and_install/index_cn.rst
new file mode 100644
index 0000000000..a24df6c518
--- /dev/null
+++ b/doc/getstarted/build_and_install/index_cn.rst
@@ -0,0 +1,29 @@
+安装与编译
+==========
+
+.. _install_steps:
+
+安装流程
+++++++++
+
+PaddlePaddle提供数个预编译的二进制来进行安装，包括Docker镜像，ubuntu的deb安装包等。我们推荐使用Docker镜像来部署环境，同时欢迎贡献更多的安装包。
+
+.. toctree::
+   :maxdepth: 1
+   
+   docker_install_cn.rst 
+   ubuntu_install_cn.rst
+
+
+
+编译流程
+++++++++
+
+..  warning::
+
+    编译流程主要推荐高级用户查看，普通用户请走安装流程。
+
+..  toctree::
+    :maxdepth: 1
+
+    cmake/build_from_source_cn.rst
diff --git a/doc_cn/build_and_install/install/ubuntu_install.rst b/doc/getstarted/build_and_install/ubuntu_install_cn.rst
similarity index 71%
rename from doc_cn/build_and_install/install/ubuntu_install.rst
rename to doc/getstarted/build_and_install/ubuntu_install_cn.rst
index 4500d6e0b0..d02d9c63bb 100644
--- a/doc_cn/build_and_install/install/ubuntu_install.rst
+++ b/doc/getstarted/build_and_install/ubuntu_install_cn.rst
@@ -38,7 +38,18 @@ PaddlePaddle提供了ubuntu 14.04 deb安装包。
 
 安装完成后，可以使用命令 :code:`paddle version` 查看安装后的paddle 版本:
 
-..  literalinclude:: paddle_version.txt
+..  code-block:: shell
+
+    PaddlePaddle 0.8.0b1, compiled with
+        with_avx: ON
+        with_gpu: OFF
+        with_double: OFF
+        with_python: ON
+        with_rdma: OFF
+        with_metric_learning:
+        with_timer: OFF
+        with_predict_sdk:
+
 
 可能遇到的问题
 --------------
@@ -48,9 +59,9 @@ libcudart.so/libcudnn.so找不到
 
 安装完成后，运行 :code:`paddle train` 报错\:
 
-.. 	code-block:: shell
+..  code-block:: shell
 
-	  0831 12:36:04.151525  1085 hl_dso_loader.cc:70] Check failed: nullptr != *dso_handle For Gpu version of PaddlePaddle, it couldn't find CUDA library: libcudart.so Please make sure you already specify its path.Note: for training data on Cpu using Gpu version of PaddlePaddle,you must specify libcudart.so via LD_LIBRARY_PATH.
+      0831 12:36:04.151525  1085 hl_dso_loader.cc:70] Check failed: nullptr != *dso_handle For Gpu version of PaddlePaddle, it couldn't find CUDA library: libcudart.so Please make sure you already specify its path.Note: for training data on Cpu using Gpu version of PaddlePaddle,you must specify libcudart.so via LD_LIBRARY_PATH.
 
 原因是未设置cuda运行时环境变量。 如果使用GPU版本的PaddlePaddle，请安装CUDA 7.5 和CUDNN 5到本地环境中，并设置：
 
diff --git a/doc/getstarted/index_cn.rst b/doc/getstarted/index_cn.rst
new file mode 100644
index 0000000000..c6a4d3121c
--- /dev/null
+++ b/doc/getstarted/index_cn.rst
@@ -0,0 +1,8 @@
+新手入门
+============
+
+..  toctree::
+  :maxdepth: 2
+
+  build_and_install/index_cn.rst
+  basic_usage/index_cn.rst
diff --git a/doc/howto/cluster/cluster_train_en.md b/doc/howto/cluster/cluster_train_en.md
deleted file mode 100644
index 1de34a6a99..0000000000
--- a/doc/howto/cluster/cluster_train_en.md
+++ /dev/null
@@ -1,156 +0,0 @@
-# How to Run Distributed Training
-
-In this article, we explain how to run distributed Paddle training jobs on clusters.  We will create the distributed version of the single-process training example, [recommendation](https://github.com/baidu/Paddle/tree/develop/demo/recommendation).
-
-[Scripts](https://github.com/baidu/Paddle/tree/develop/paddle/scripts/cluster_train) used in this article launch distributed jobs via SSH.  They also work as a reference for users running more sophisticated cluster management systems like MPI and Kubernetes.
-
-## Prerequisite
-
-1. Aforementioned scripts use a Python library [fabric](http://www.fabfile.org/) to run SSH commands.  We can use `pip` to install fabric:
-
-   ```bash
-   pip install fabric
-   ```
-
-1. We need to install PaddlePaddle on all nodes in the cluster.  To enable GPUs, we need to install CUDA in `/usr/local/cuda`; otherwise Paddle would report errors at runtime.
-
-1. Set the `ROOT_DIR` variable in [`cluster_train/conf.py`] on all nodes.  For convenience, we often create a Unix user `paddle` on all nodes and set `ROOT_DIR=/home/paddle`.  In this way, we can write public SSH keys into `/home/paddle/.ssh/authorized_keys` so that user `paddle` can SSH to all nodes without password.
-
-## Prepare Job Workspace
-
-We refer to the directory where we put dependent libraries, config files, etc., as *workspace*.
-
-These ```train/test``` data should be prepared before launching cluster job. To  satisfy the requirement that train/test data are placed in different directory from workspace, PADDLE refers train/test data according to index file named as ```train.list/test.list``` which are used in model config file. So the train/test data also contains train.list/test.list two list file. All local training demo already provides scripts to help you create these two files,  and all nodes in cluster job will handle files with same logical code in normal condition.
-
-Generally, you can use same model file from local training for cluster training. What you should have in mind that, the ```batch_size``` set in ```setting``` function in model file means batch size in ```each``` node of cluster job instead of total batch size if synchronization SGD was used.
-
-Following steps are based on demo/recommendation demo in demo directory.
-
-You just go through demo/recommendation tutorial doc until ```Train``` section, and at last you will get train/test data and model configuration file. Finaly, just use demo/recommendation as workspace for cluster training.
-
-At last your workspace should look like as follow:
-```
-.
-|-- common_utils.py
-|-- data
-|   |-- config.json
-|   |-- config_generator.py
-|   |-- meta.bin
-|   |-- meta_config.json
-|   |-- meta_generator.py
-|   |-- ml-1m
-|   |-- ml_data.sh
-|   |-- ratings.dat.test
-|   |-- ratings.dat.train
-|   |-- split.py
-|   |-- test.list
-|   `-- train.list
-|-- dataprovider.py
-|-- evaluate.sh
-|-- prediction.py
-|-- preprocess.sh
-|-- requirements.txt
-|-- run.sh
-`-- trainer_config.py
-```
-Not all of these files are needed for cluster training, but it's not necessary to remove useless files.
-
-```trainer_config.py```
-Indicates the model config file.
-
-```train.list``` and ```test.list```
-File index. It stores all relative or absolute file paths of all train/test data at current node.
-
-```dataprovider.py```
-used to read train/test samples. It's same as local training.
-
-```data```
-all files in data directory are refered by train.list/test.list which are refered by data provider.
-
-
-## Prepare Cluster Job Configuration
-
-The options below must be carefully set in cluster_train/conf.py
-
-```HOSTS```  all nodes hostname or ip that will run cluster job. You can also append user and ssh port with hostname, such as root@192.168.100.17:9090.
-
-```ROOT_DIR``` workspace ROOT directory for placing JOB workspace directory
-
-```PADDLE_NIC``` the NIC(Network Interface Card) interface name for cluster communication channel, such as eth0 for ethternet, ib0 for infiniband.
-
-```PADDLE_PORT``` port number for cluster commnunication channel
-
-```PADDLE_PORTS_NUM``` the number of port used for cluster communication channle. if the number of cluster nodes is small(less than 5~6nodes), recommend you set it to larger, such as 2 ~ 8, for better network performance.
-
-```PADDLE_PORTS_NUM_FOR_SPARSE``` the number of port used for sparse updater cluster commnunication channel. if sparse remote update is used, set it like ```PADDLE_PORTS_NUM```
-
-```LD_LIBRARY_PATH``` set addtional LD_LIBRARY_PATH for cluster job. You can use it to set CUDA libraries path.
-
-Default Configuration as follow:
-
-```python
-HOSTS = [
-        "root@192.168.100.17",
-        "root@192.168.100.18",
-        ]
-
-'''
-workspace configuration
-'''
-
-#root dir for workspace
-ROOT_DIR = "/home/paddle"
-
-'''
-network configuration
-'''
-#pserver nics
-PADDLE_NIC = "eth0"
-#pserver port
-PADDLE_PORT = 7164
-#pserver ports num
-PADDLE_PORTS_NUM = 2
-#pserver sparse ports num
-PADDLE_PORTS_NUM_FOR_SPARSE = 2
-
-#environments setting for all processes in cluster job
-LD_LIBRARY_PATH="/usr/local/cuda/lib64:/usr/lib64"
-```
-
-### Launching Cluster Job
-```paddle.py``` provides automatical scripts to start all PaddlePaddle cluster processes in different nodes. By default, all command line options can set as ```paddle.py``` command options and ```paddle.py``` will transparently and automatically set these options to PaddlePaddle lower level processes.
-
-```paddle.py```provides two distinguished command option for easy job launching.
-
-```job_dispatch_package```  set it with local ```workspace```directory, it will be dispatched to all nodes set in conf.py. It could be helpful for frequent hacking workspace files, otherwise frequent mulit-nodes workspace deployment could make your crazy.
-```job_workspace```  set it with already deployed workspace directory, ```paddle.py``` will skip dispatch stage to directly launch cluster job with all nodes. It could help to reduce heavy
-dispatch latency.
-
-```cluster_train/run.sh``` provides command line sample to run ```demo/recommendation``` cluster job, just modify ```job_dispatch_package``` and ```job_workspace``` with your defined directory, then:
-```
-sh run.sh
-```
-
-The cluster Job will start in several seconds.
-
-### Kill Cluster Job
-```paddle.py``` can capture ```Ctrl + C``` SIGINT signal to automatically kill all processes launched by it. So just stop ```paddle.py``` to kill cluster job. You should mannally kill job if program crashed.
-
-### Check Cluster Training Result
-Check log in $workspace/log for details, each node owns same log structure.
-
-```paddle_trainer.INFO```
-It provides almost all interal output log for training,  same as local training. Check runtime model convergence here.
-
-```paddle_pserver2.INFO```
-It provides pserver running log, which could help to diagnose distributed error.
-
-```server.log```
-It provides stderr and stdout of pserver process. Check error log if training crashs.
-
-```train.log```
-It provides stderr and stdout of trainer process. Check error log if training crashs.
-
-### Check Model Output
-After one pass finished, model files will be writed in ```output``` directory in node 0.
-```nodefile``` in workspace indicates the node id of current cluster job.
diff --git a/doc/howto/cmd_parameter/index_en.md b/doc/howto/cmd_parameter/index_en.md
deleted file mode 100644
index fb658f2aa5..0000000000
--- a/doc/howto/cmd_parameter/index_en.md
+++ /dev/null
@@ -1,8 +0,0 @@
-```eval_rst
-..  _cmd_line_index_en:
-```
-# How to Set Command-line Parameters
-
-* [Use Case](use_case_en.md)
-* [Arguments](arguments_en.md)
-* [Detailed Descriptions](detail_introduction_en.md)
diff --git a/doc/howto/deep_model/index_en.rst b/doc/howto/deep_model/index_en.rst
deleted file mode 100644
index 00a45641e6..0000000000
--- a/doc/howto/deep_model/index_en.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-How to Configure Deep Models
-============================
-
-..  toctree::
-  :maxdepth: 1
-
-  rnn/rnn_en.rst
diff --git a/doc_cn/algorithm/rnn/hierarchical-layer.rst b/doc/howto/deep_model/rnn/hierarchical_layer_cn.rst
similarity index 88%
rename from doc_cn/algorithm/rnn/hierarchical-layer.rst
rename to doc/howto/deep_model/rnn/hierarchical_layer_cn.rst
index a9906b8b9c..943b1d4bb8 100644
--- a/doc_cn/algorithm/rnn/hierarchical-layer.rst
+++ b/doc/howto/deep_model/rnn/hierarchical_layer_cn.rst
@@ -22,7 +22,7 @@
 pooling_layer
 ==============
 
-pooling_layer 的使用示例如下，详细见 `pooling_layer`_ 配置API。
+pooling_layer 的使用示例如下，详细见 :ref:`api_trainer_config_helpers_layers_pooling_layer` 配置API。
 
 ..	code-block:: bash
 
@@ -47,7 +47,7 @@ pooling_layer 的使用示例如下，详细见 `pooling_layer`_ 配置API。
 last_seq 和 first_seq
 =====================
 
-last_seq 的使用示例如下（ `first_seq`_ 类似），详细见 `last_seq`_ 配置API。
+last_seq 的使用示例如下（ :ref:`api_trainer_config_helpers_layers_first_seq` 类似），详细见 :ref:`api_trainer_config_helpers_layers_last_seq` 配置API。
 
 ..	code-block:: bash
 
@@ -68,7 +68,7 @@ last_seq 的使用示例如下（ `first_seq`_ 类似），详细见 `last_seq`_
 expand_layer
 ============
 
-expand_layer 的使用示例如下，详细见 `expand_layer`_ 配置API。
+expand_layer 的使用示例如下，详细见 :ref:`api_trainer_config_helpers_layers_expand_layer` 配置API。
 
 ..	code-block:: bash
 
@@ -87,9 +87,3 @@ expand_layer 的使用示例如下，详细见 `expand_layer`_ 配置API。
   - 作用：一个单层序列经过运算扩展成一个双层序列
   - 输入：layer1必须是一个单层序列，是待扩展的数据；layer2 必须是一个双层序列，提供扩展的长度信息
   - 输出：一个双层序列，序列中含有元素的数目同 layer2 一致。要求单层序列含有元素的数目（0层序列）和双层序列含有subseq 的数目一致。单层序列第i个元素（0层序列），被扩展为一个单层序列，构成了输出双层序列的第i个 subseq 。
-
-
-.. _pooling_layer: ../../../doc/ui/api/trainer_config_helpers/layers.html#pooling-layer
-.. _last_seq: ../../../doc/ui/api/trainer_config_helpers/layers.html#last-seq
-.. _first_seq: ../../../doc/ui/api/trainer_config_helpers/layers.html#first-seq
-.. _expand_layer: ../../../doc/ui/api/trainer_config_helpers/layers.html#expand-layer
diff --git a/doc_cn/algorithm/rnn/hrnn_rnn_api_compare.rst b/doc/howto/deep_model/rnn/hrnn_rnn_api_compare_cn.rst
similarity index 91%
rename from doc_cn/algorithm/rnn/hrnn_rnn_api_compare.rst
rename to doc/howto/deep_model/rnn/hrnn_rnn_api_compare_cn.rst
index 9baa0b5780..96e52b910a 100644
--- a/doc_cn/algorithm/rnn/hrnn_rnn_api_compare.rst
+++ b/doc/howto/deep_model/rnn/hrnn_rnn_api_compare_cn.rst
@@ -24,18 +24,18 @@
 
 - 本例中的原始数据一共有10个样本。每个样本由两部分组成，一个label（此处都为2）和一个已经分词后的句子。这个数据也被单层RNN网络直接使用。
 
-..  literalinclude:: ../../../paddle/gserver/tests/Sequence/tour_train_wdseg
+..  literalinclude:: ../../../../paddle/gserver/tests/Sequence/tour_train_wdseg
     :language: text
 
 
 - 双层序列数据一共有4个样本。 每个样本间用空行分开，整体数据和原始数据完全一样。但于双层序列的LSTM来说，第一个样本同时encode两条数据成两个向量。这四条数据同时处理的句子数量为\ :code:`[2, 3, 2, 3]`\ 。
 
-..  literalinclude:: ../../../paddle/gserver/tests/Sequence/tour_train_wdseg.nest
+..  literalinclude:: ../../../../paddle/gserver/tests/Sequence/tour_train_wdseg.nest
     :language: text
 
 其次，对于两种不同的输入数据类型，不同DataProvider对比如下(`sequenceGen.py <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/gserver/tests/sequenceGen.py>`_)\：
 
-..  literalinclude:: ../../../paddle/gserver/tests/sequenceGen.py
+..  literalinclude:: ../../../../paddle/gserver/tests/sequenceGen.py
     :language: python
     :lines: 21-39
     :linenos:
@@ -43,10 +43,11 @@
 - 这是普通的单层时间序列的DataProvider代码，其说明如下：
   
   * DataProvider共返回两个数据，分别是words和label。即上述代码中的第19行。
-  - words是原始数据中的每一句话，所对应的词表index数组。它是integer_value_sequence类型的，即整数数组。words即为这个数据中的单层时间序列。
-  - label是原始数据中对于每一句话的分类标签，它是integer_value类型的。
 
-..  literalinclude:: ../../../paddle/gserver/tests/sequenceGen.py
+    - words是原始数据中的每一句话，所对应的词表index数组。它是integer_value_sequence类型的，即整数数组。words即为这个数据中的单层时间序列。
+    - label是原始数据中对于每一句话的分类标签，它是integer_value类型的。
+
+..  literalinclude:: ../../../../paddle/gserver/tests/sequenceGen.py
     :language: python
     :lines: 42-71
     :linenos:
@@ -63,7 +64,7 @@
 
 首先，我们看一下单层RNN的配置。代码中9-15行(高亮部分)即为单层RNN序列的使用代码。这里使用了PaddlePaddle预定义好的RNN处理函数。在这个函数中，RNN对于每一个时间步通过了一个LSTM网络。
 
-..  literalinclude:: ../../../paddle/gserver/tests/sequence_layer_group.conf
+..  literalinclude:: ../../../../paddle/gserver/tests/sequence_layer_group.conf
     :language: python
     :lines: 38-63
     :linenos:
@@ -84,7 +85,7 @@
 
 * 至此，\ :code:`lstm_last`\ 便和单层RNN配置中的\ :code:`lstm_last`\ 具有相同的结果了。
 
-..  literalinclude:: ../../../paddle/gserver/tests/sequence_nest_layer_group.conf
+..  literalinclude:: ../../../../paddle/gserver/tests/sequence_nest_layer_group.conf
     :language: python
     :lines: 38-64
     :linenos:
@@ -106,7 +107,7 @@
 
 - 单层RNN：过了一个很简单的recurrent_group。每一个时间步，当前的输入y和上一个时间步的输出rnn_state做了一个全链接。
 
-..  literalinclude:: ../../../paddle/gserver/tests/sequence_rnn.conf
+..  literalinclude:: ../../../../paddle/gserver/tests/sequence_rnn.conf
     :language: python
     :lines: 36-48
 
@@ -115,7 +116,7 @@
   - 内层inner_step的recurrent_group和单层序列的几乎一样。除了boot_layer=outer_mem，表示将外层的outer_mem作为内层memory的初始状态。外层outer_step中，outer_mem是一个子句的最后一个向量，即整个双层group是将前一个子句的最后一个向量，作为下一个子句memory的初始状态。
   - 从输入数据上看，单双层序列的句子是一样的，只是双层序列将其又做了子序列划分。因此双层序列的配置中，必须将前一个子句的最后一个元素，作为boot_layer传给下一个子句的memory，才能保证和单层序列的配置中“每个时间步都用了上一个时间步的输出结果”一致。
 
-..  literalinclude:: ../../../paddle/gserver/tests/sequence_nest_rnn.conf
+..  literalinclude:: ../../../../paddle/gserver/tests/sequence_nest_rnn.conf
     :language: python
     :lines: 39-66
 
@@ -151,14 +152,14 @@
 
 * 单层RNN\:
 
-..  literalinclude:: ../../../paddle/gserver/tests/sequence_rnn_multi_unequalength_inputs.py
+..  literalinclude:: ../../../../paddle/gserver/tests/sequence_rnn_multi_unequalength_inputs.py
     :language: python
     :lines: 42-59
     :linenos:
 
 * 双层RNN\ \:
 
-..  literalinclude:: ../../../paddle/gserver/tests/sequence_nest_rnn_multi_unequalength_inputs.py
+..  literalinclude:: ../../../../paddle/gserver/tests/sequence_nest_rnn_multi_unequalength_inputs.py
     :language: python
     :lines: 41-80
     :linenos:
@@ -181,11 +182,11 @@ Memory
 
 Memory是PaddlePaddle实现RNN时候使用的一个概念。RNN即时间递归神经网络，通常要求时间步之间具有一些依赖性，即当前时间步下的神经网络依赖前一个时间步神经网络中某一个神经元输出。如下图所示。
 
-..  graphviz:: glossary_rnn.dot
+..  graphviz:: src/glossary_rnn.dot
 
 上图中虚线的连接，即是跨越时间步的网络连接。PaddlePaddle在实现RNN的时候，将这种跨越时间步的连接用一个特殊的神经网络单元实现。这个神经网络单元就叫Memory。Memory可以缓存上一个时刻某一个神经元的输出，然后在下一个时间步输入给另一个神经元。使用Memory的RNN实现便如下图所示。
 
-..  graphviz:: glossary_rnn_with_memory.dot
+..  graphviz:: src/glossary_rnn_with_memory.dot
 
 使用这种方式，PaddlePaddle可以比较简单的判断哪些输出是应该跨越时间步的，哪些不是。
 
diff --git a/doc/howto/deep_model/rnn/index_cn.rst b/doc/howto/deep_model/rnn/index_cn.rst
new file mode 100644
index 0000000000..9ecab5594c
--- /dev/null
+++ b/doc/howto/deep_model/rnn/index_cn.rst
@@ -0,0 +1,10 @@
+RNN相关模型
+===========
+
+..  toctree::
+  :maxdepth: 1
+
+  rnn_config_cn.rst
+  recurrent_group_cn.md
+  hierarchical_layer_cn.rst
+  hrnn_rnn_api_compare_cn.rst
diff --git a/doc/howto/deep_model/rnn/index_en.rst b/doc/howto/deep_model/rnn/index_en.rst
new file mode 100644
index 0000000000..7adc79873d
--- /dev/null
+++ b/doc/howto/deep_model/rnn/index_en.rst
@@ -0,0 +1,7 @@
+RNN Models
+==========
+
+..  toctree::
+  :maxdepth: 1
+
+  rnn_config_en.rst
diff --git a/doc_cn/algorithm/rnn/rnn-tutorial.md b/doc/howto/deep_model/rnn/recurrent_group_cn.md
similarity index 97%
rename from doc_cn/algorithm/rnn/rnn-tutorial.md
rename to doc/howto/deep_model/rnn/recurrent_group_cn.md
index 9e488b0d51..06dc9e089a 100644
--- a/doc_cn/algorithm/rnn/rnn-tutorial.md
+++ b/doc/howto/deep_model/rnn/recurrent_group_cn.md
@@ -1,96 +1,96 @@
-# Recurrent Group教程
-
-## 概述
-
-序列数据是自然语言处理任务面对的一种主要输入数据类型。
-
-一句话是由词语构成的序列，多句话进一步构成了段落。因此，段落可以看作是一个嵌套的双层的序列，这个序列的每个元素又是一个序列。
-
-双层序列是PaddlePaddle支持的一种非常灵活的数据组织方式，帮助我们更好地描述段落、多轮对话等更为复杂的语言数据。基于双层序列输入，我们可以设计搭建一个灵活的、层次化的RNN，分别从词语和句子级别编码输入数据，同时也能够引入更加复杂的记忆机制，更好地完成一些复杂的语言理解任务。
-
-在PaddlePaddle中，`recurrent_group`是一种任意复杂的RNN单元，用户只需定义RNN在一个时间步内完成的计算，PaddlePaddle负责完成信息和误差在时间序列上的传播。
-
-更进一步，`recurrent_group`同样可以扩展到双层序列的处理上。通过两个嵌套的`recurrent_group`分别定义子句级别和词语级别上需要完成的运算，最终实现一个层次化的复杂RNN。
-
-目前，在PaddlePaddle中，能够对双向序列进行处理的有`recurrent_group`和部分Layer，具体可参考文档：<a href = "hierarchical-layer.html">支持双层序列作为输入的Layer</a>。
- 
-## 相关概念
-
-### 基本原理
-`recurrent_group` 是PaddlePaddle支持的一种任意复杂的RNN单元。使用者只需要关注于设计RNN在一个时间步之内完成的计算，PaddlePaddle负责完成信息和梯度在时间序列上的传播。
-
-PaddlePaddle中，`recurrent_group`的一个简单调用如下：
-
-``` python
-recurrent_group(step, input, reverse)
-```
-- step：一个可调用的函数，定义一个时间步之内RNN单元完成的计算
-- input：输入，必须是一个单层序列，或者一个双层序列
-- reverse：是否以逆序处理输入序列
- 
-使用`recurrent_group`的核心是设计step函数的计算逻辑。step函数内部可以自由组合PaddlePaddle支持的各种layer，完成任意的运算逻辑。`recurrent_group` 的输入（即input）会成为step函数的输入，由于step 函数只关注于RNN一个时间步之内的计算，在这里`recurrent_group`替我们完成了原始输入数据的拆分。
-
-### 输入
-`recurrent_group`处理的输入序列主要分为以下三种类型：
- 
-- **数据输入**：一个双层序列进入`recurrent_group`会被拆解为一个单层序列，一个单层序列进入`recurrent_group`会被拆解为非序列，然后交给step函数，这一过程对用户是完全透明的。可以有以下两种：1）通过data_layer拿到的用户输入；2）其它layer的输出。
-		
-- **只读Memory输入**：`StaticInput` 定义了一个只读的Memory，由`StaticInput`指定的输入不会被`recurrent_group`拆解，`recurrent_group` 循环展开的每个时间步总是能够引用所有输入，可以是一个非序列，或者一个单层序列。
-	  
-- **序列生成任务的输入**：`GeneratedInput`只用于在序列生成任务中指定输入数据。
-
-### 输入示例
-
-序列生成任务大多遵循encoder-decoer架构，encoder和decoder可以是能够处理序列的任意神经网络单元，而RNN是最流行的选择。
-
-给定encoder输出和当前词，decoder每次预测产生下一个最可能的词语。在这种结构中，decoder接受两个输入：
-    
-- 要生成的目标序列：是decoder的数据输入，也是decoder循环展开的依据，`recurrent_group`会对这类输入进行拆解。
-
-- encoder输出，可以是一个非序列，或者一个单层序列：是一个unbounded memory，decoder循环展开的每一个时间步会引用全部结果，不应该被拆解，这种类型的输入必须通过`StaticInput`指定。关于Unbounded Memory的更多讨论请参考论文 [Neural Turning Machine](https://arxiv.org/abs/1410.5401)。
-		
-在序列生成任务中，decoder RNN总是引用上一时刻预测出的词的词向量，作为当前时刻输入。`GeneratedInput`自动完成这一过程。
-		 
-### 输出
-`step`函数必须返回一个或多个Layer的输出，这个Layer的输出会作为整个`recurrent_group` 最终的输出结果。在输出的过程中，`recurrent_group` 会将每个时间步的输出拼接，这个过程对用户也是透明的。
-
-### memory
-memory只能在`recurrent_group`中定义和使用。memory不能独立存在，必须指向一个PaddlePaddle定义的Layer。引用memory得到这layer上一时刻输出，因此，可以将memory理解为一个时延操作。
-
-可以显示地指定一个layer的输出用于初始化memory。不指定时，memory默认初始化为0。
-
-## 双层RNN介绍
-`recurrent_group`帮助我们完成对输入序列的拆分，对输出的合并，以及计算逻辑在序列上的循环展开。
-
-利用这种特性，两个嵌套的`recurrent_group`能够处理双层序列，实现词语和句子两个级别的双层RNN结构。
-
-- 单层（word-level）RNN：每个状态（state）对应一个词（word）。
-- 双层（sequence-level）RNN：一个双层RNN由多个单层RNN组成，每个单层RNN（即双层RNN的每个状态）对应一个子句（subseq）。
-
-为了描述方便，下文以NLP任务为例，将含有子句（subseq）的段落定义为一个双层序列，将含有词语的句子定义为一个单层序列，那么0层序列即为一个词语。
-
-## 双层RNN的使用
-
-### 训练流程的使用方法
-使用 `recurrent_group`需要遵循以下约定：
- 
-- **单进单出**：输入和输出都是单层序列。
-  - 如果有多个输入，不同输入序列含有的词语数必须严格相等。
-  - 输出一个单层序列，输出序列的词语数和输入序列一致。
-  - memory：在step函数中定义 memory指向一个layer，通过引用memory得到这个layer上一个时刻输出，形成recurrent 连接。memory的is_seq参数必须为false。如果没有定义memory，每个时间步之内的运算是独立的。
-  - boot_layer：memory的初始状态，默认初始状为0，memory的is_seq参数必须为false。
- 
-- **双进双出**：输入和输出都是双层序列。
-  - 如果有多个输入序列，不同输入含有的子句（subseq）数必须严格相等，但子句含有的词语数可以不相等。
-  - 输出一个双层序列，子句（subseq）数、子句的单词数和指定的一个输入序列一致，默认为第一个输入。
-  - memory：在step函数中定义memory，指向一个layer，通过引用memory得到这个layer上一个时刻的输出，形成recurrent连接。定义在外层`recurrent_group` step函数中的memory，能够记录上一个subseq 的状态，可以是一个单层序列（只作为read-only memory），也可以是一个词语。如果没有定义memory，那么 subseq 之间的运算是独立的。
-  - boot_layer：memory 初始状态，可以是一个单层序列（只作为read-only memory）或一个向量。默认不设置，即初始状态为0。
-
-- **双进单出**：目前还未支持，会报错"In hierachical RNN, all out links should be from sequences now"。
- 
-
-### 生成流程的使用方法
-使用`beam_search`需要遵循以下约定：
-
-- 单层RNN：从一个word生成下一个word。
+# Recurrent Group教程
+
+## 概述
+
+序列数据是自然语言处理任务面对的一种主要输入数据类型。
+
+一句话是由词语构成的序列，多句话进一步构成了段落。因此，段落可以看作是一个嵌套的双层的序列，这个序列的每个元素又是一个序列。
+
+双层序列是PaddlePaddle支持的一种非常灵活的数据组织方式，帮助我们更好地描述段落、多轮对话等更为复杂的语言数据。基于双层序列输入，我们可以设计搭建一个灵活的、层次化的RNN，分别从词语和句子级别编码输入数据，同时也能够引入更加复杂的记忆机制，更好地完成一些复杂的语言理解任务。
+
+在PaddlePaddle中，`recurrent_group`是一种任意复杂的RNN单元，用户只需定义RNN在一个时间步内完成的计算，PaddlePaddle负责完成信息和误差在时间序列上的传播。
+
+更进一步，`recurrent_group`同样可以扩展到双层序列的处理上。通过两个嵌套的`recurrent_group`分别定义子句级别和词语级别上需要完成的运算，最终实现一个层次化的复杂RNN。
+
+目前，在PaddlePaddle中，能够对双层序列进行处理的有`recurrent_group`和部分Layer，具体可参考文档：<a href = "hierarchical_layer_cn.html">支持双层序列作为输入的Layer</a>。
+ 
+## 相关概念
+
+### 基本原理
+`recurrent_group` 是PaddlePaddle支持的一种任意复杂的RNN单元。使用者只需要关注于设计RNN在一个时间步之内完成的计算，PaddlePaddle负责完成信息和梯度在时间序列上的传播。
+
+PaddlePaddle中，`recurrent_group`的一个简单调用如下：
+
+``` python
+recurrent_group(step, input, reverse)
+```
+- step：一个可调用的函数，定义一个时间步之内RNN单元完成的计算
+- input：输入，必须是一个单层序列，或者一个双层序列
+- reverse：是否以逆序处理输入序列
+ 
+使用`recurrent_group`的核心是设计step函数的计算逻辑。step函数内部可以自由组合PaddlePaddle支持的各种layer，完成任意的运算逻辑。`recurrent_group` 的输入（即input）会成为step函数的输入，由于step 函数只关注于RNN一个时间步之内的计算，在这里`recurrent_group`替我们完成了原始输入数据的拆分。
+
+### 输入
+`recurrent_group`处理的输入序列主要分为以下三种类型：
+ 
+- **数据输入**：一个双层序列进入`recurrent_group`会被拆解为一个单层序列，一个单层序列进入`recurrent_group`会被拆解为非序列，然后交给step函数，这一过程对用户是完全透明的。可以有以下两种：1）通过data_layer拿到的用户输入；2）其它layer的输出。
+		
+- **只读Memory输入**：`StaticInput` 定义了一个只读的Memory，由`StaticInput`指定的输入不会被`recurrent_group`拆解，`recurrent_group` 循环展开的每个时间步总是能够引用所有输入，可以是一个非序列，或者一个单层序列。
+	  
+- **序列生成任务的输入**：`GeneratedInput`只用于在序列生成任务中指定输入数据。
+
+### 输入示例
+
+序列生成任务大多遵循encoder-decoder架构，encoder和decoder可以是能够处理序列的任意神经网络单元，而RNN是最流行的选择。
+
+给定encoder输出和当前词，decoder每次预测产生下一个最可能的词语。在这种结构中，decoder接受两个输入：
+    
+- 要生成的目标序列：是decoder的数据输入，也是decoder循环展开的依据，`recurrent_group`会对这类输入进行拆解。
+
+- encoder输出，可以是一个非序列，或者一个单层序列：是一个unbounded memory，decoder循环展开的每一个时间步会引用全部结果，不应该被拆解，这种类型的输入必须通过`StaticInput`指定。关于Unbounded Memory的更多讨论请参考论文 [Neural Turing Machine](https://arxiv.org/abs/1410.5401)。
+		
+在序列生成任务中，decoder RNN总是引用上一时刻预测出的词的词向量，作为当前时刻输入。`GeneratedInput`自动完成这一过程。
+		 
+### 输出
+`step`函数必须返回一个或多个Layer的输出，这个Layer的输出会作为整个`recurrent_group` 最终的输出结果。在输出的过程中，`recurrent_group` 会将每个时间步的输出拼接，这个过程对用户也是透明的。
+
+### memory
+memory只能在`recurrent_group`中定义和使用。memory不能独立存在，必须指向一个PaddlePaddle定义的Layer。引用memory得到这个layer上一时刻的输出，因此，可以将memory理解为一个时延操作。
+
+可以显式地指定一个layer的输出用于初始化memory。不指定时，memory默认初始化为0。
+
+## 双层RNN介绍
+`recurrent_group`帮助我们完成对输入序列的拆分，对输出的合并，以及计算逻辑在序列上的循环展开。
+
+利用这种特性，两个嵌套的`recurrent_group`能够处理双层序列，实现词语和句子两个级别的双层RNN结构。
+
+- 单层（word-level）RNN：每个状态（state）对应一个词（word）。
+- 双层（sequence-level）RNN：一个双层RNN由多个单层RNN组成，每个单层RNN（即双层RNN的每个状态）对应一个子句（subseq）。
+
+为了描述方便，下文以NLP任务为例，将含有子句（subseq）的段落定义为一个双层序列，将含有词语的句子定义为一个单层序列，那么0层序列即为一个词语。
+
+## 双层RNN的使用
+
+### 训练流程的使用方法
+使用 `recurrent_group`需要遵循以下约定：
+ 
+- **单进单出**：输入和输出都是单层序列。
+  - 如果有多个输入，不同输入序列含有的词语数必须严格相等。
+  - 输出一个单层序列，输出序列的词语数和输入序列一致。
+  - memory：在step函数中定义 memory指向一个layer，通过引用memory得到这个layer上一个时刻输出，形成recurrent 连接。memory的is_seq参数必须为false。如果没有定义memory，每个时间步之内的运算是独立的。
+  - boot_layer：memory的初始状态，默认初始状态为0，memory的is_seq参数必须为false。
+ 
+- **双进双出**：输入和输出都是双层序列。
+  - 如果有多个输入序列，不同输入含有的子句（subseq）数必须严格相等，但子句含有的词语数可以不相等。
+  - 输出一个双层序列，子句（subseq）数、子句的单词数和指定的一个输入序列一致，默认为第一个输入。
+  - memory：在step函数中定义memory，指向一个layer，通过引用memory得到这个layer上一个时刻的输出，形成recurrent连接。定义在外层`recurrent_group` step函数中的memory，能够记录上一个subseq 的状态，可以是一个单层序列（只作为read-only memory），也可以是一个词语。如果没有定义memory，那么 subseq 之间的运算是独立的。
+  - boot_layer：memory 初始状态，可以是一个单层序列（只作为read-only memory）或一个向量。默认不设置，即初始状态为0。
+
+- **双进单出**：目前还未支持，会报错"In hierachical RNN, all out links should be from sequences now"。
+ 
+
+### 生成流程的使用方法
+使用`beam_search`需要遵循以下约定：
+
+- 单层RNN：从一个word生成下一个word。
 - 双层RNN：即把单层RNN生成后的subseq给拼接成一个新的双层seq。从语义上看，也不存在一个subseq直接生成下一个subseq的情况。
diff --git a/doc/howto/deep_model/rnn/rnn_config_cn.rst b/doc/howto/deep_model/rnn/rnn_config_cn.rst
new file mode 100644
index 0000000000..ac2bd0775f
--- /dev/null
+++ b/doc/howto/deep_model/rnn/rnn_config_cn.rst
@@ -0,0 +1,278 @@
+RNN配置
+========
+
+本教程将指导你如何在 PaddlePaddle
+中配置循环神经网络（RNN）。PaddlePaddle
+高度支持灵活和高效的循环神经网络配置。 在本教程中，您将了解如何：
+
+-  准备用来学习循环神经网络的序列数据。
+-  配置循环神经网络架构。
+-  使用学习完成的循环神经网络模型生成序列。
+
+我们将使用 vanilla 循环神经网络和 sequence to sequence
+模型来指导你完成这些步骤。sequence to sequence
+模型的代码可以在\ ``demo/seqToseq``\ 找到。
+
+准备序列数据
+------------
+
+PaddlePaddle
+不需要对序列数据进行任何预处理，例如填充。唯一需要做的是将相应类型设置为输入。例如，以下代码段定义了三个输入。
+它们都是序列，它们的大小是\ ``src_dict``\ ，\ ``trg_dict``\ 和\ ``trg_dict``\ ：
+
+.. code:: python
+
+    settings.input_types = [
+      integer_value_sequence(len(settings.src_dict)),
+      integer_value_sequence(len(settings.trg_dict)),
+      integer_value_sequence(len(settings.trg_dict))]
+
+在\ ``process``\ 函数中，每个\ ``yield``\ 函数将返回三个整数列表。每个整数列表被视为一个整数序列：
+
+.. code:: python
+
+    yield src_ids, trg_ids, trg_ids_next
+
+有关如何编写数据提供程序的更多细节描述，请参考 :ref:`api_pydataprovider2` 。完整的数据提供文件在
+``demo/seqToseq/dataprovider.py``\ 。
+
+配置循环神经网络架构
+--------------------
+
+简单门控循环神经网络(Gated Recurrent Neural Network)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+循环神经网络在每个时间步骤顺序地处理序列。下面列出了 LSTM 的架构的示例。
+
+.. image:: ../../../tutorials/sentiment_analysis/src/bi_lstm.jpg
+      :align: center
+
+一般来说，循环网络从 :math:`t=1` 到 :math:`t=T` 或者反向地从 :math:`t=T` 到 :math:`t=1` 执行以下操作。
+
+.. math::
+
+    x_{t+1} = f_x(x_t), y_t = f_y(x_t)
+
+其中 :math:`f_x(.)` 称为\ **单步函数**\ （即单时间步执行的函数，step
+function），而 :math:`f_y(.)` 称为\ **输出函数**\ 。在 vanilla
+循环神经网络中，单步函数和输出函数都非常简单。然而，PaddlePaddle
+可以通过修改这两个函数来实现复杂的网络配置。我们将使用 sequence to
+sequence
+模型演示如何配置复杂的循环神经网络模型。在本节中，我们将使用简单的
+vanilla
+循环神经网络作为使用\ ``recurrent_group``\ 配置简单循环神经网络的例子。
+注意，如果你只需要使用简单的RNN，GRU或LSTM，那么推荐使用\ ``grumemory``\ 和\ ``lstmemory``\ ，因为它们的计算效率比\ ``recurrent_group``\ 更高。
+
+对于 vanilla RNN，在每个时间步长，\ **单步函数**\ 为：
+
+.. math::
+
+    x_{t+1} = W_x x_t + W_i I_t + b
+
+其中 :math:`x_t` 是RNN状态，并且 :math:`I_t` 是输入，:math:`W_x` 和
+:math:`W_i` 分别是RNN状态和输入的变换矩阵。:math:`b` 是偏差。它的\ **输出函数**\ 只需要 :math:`x_t` 作为输出。
+
+``recurrent_group``\ 是构建循环神经网络的最重要的工具。
+它定义了\ **单步函数**\ ，\ **输出函数**\ 和循环神经网络的输入。注意，这个函数的\ ``step``\ 参数需要实现\ ``step function``\ （单步函数）和\ ``output function``\ （输出函数）：
+
+.. code:: python
+
+    def simple_rnn(input,
+                   size=None,
+                   name=None,
+                   reverse=False,
+                   rnn_bias_attr=None,
+                   act=None,
+                   rnn_layer_attr=None):
+        def __rnn_step__(ipt):
+           out_mem = memory(name=name, size=size)
+           rnn_out = mixed_layer(input = [full_matrix_projection(ipt),
+                                          full_matrix_projection(out_mem)],
+                                 name = name,
+                                 bias_attr = rnn_bias_attr,
+                                 act = act,
+                                 layer_attr = rnn_layer_attr,
+                                 size = size)
+           return rnn_out
+        return recurrent_group(name='%s_recurrent_group' % name,
+                               step=__rnn_step__,
+                               reverse=reverse,
+                               input=input)
+
+PaddlePaddle
+使用“Memory”（记忆模块）实现单步函数。\ **Memory**\ 是在PaddlePaddle中构造循环神经网络时最重要的概念。
+Memory是在单步函数中循环使用的状态，例如 :math:`x_{t+1} = f_x(x_t)` 。
+一个Memory包含\ **输出**\ 和\ **输入**\ 。当前时间步处的Memory的输出作为下一时间步Memory的输入。Memory也可以具有\ **boot
+layer(引导层)**\ ，其输出被用作Memory的初始值。
+在我们的例子中，门控循环单元的输出被用作输出Memory。请注意，\ ``rnn_out``\ 层的名称与\ ``out_mem``\ 的名称相同。这意味着\ ``rnn_out``
+(:math:`x_{t+1}`)的输出被用作\ ``out_mem``\ Memory的\ **输出**\ 。
+
+Memory也可以是序列。在这种情况下，在每个时间步中，我们有一个序列作为循环神经网络的状态。这在构造非常复杂的循环神经网络时是有用的。
+其他高级功能包括定义多个Memory，以及使用子序列来定义分级循环神经网络架构。
+
+我们在函数的结尾返回\ ``rnn_out``\ 。 这意味着 ``rnn_out``
+层的输出被用作门控循环神经网络的\ **输出**\ 函数。
+
+Sequence to Sequence Model with Attention
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+我们将使用 sequence to sequence model with attention
+作为例子演示如何配置复杂的循环神经网络模型。该模型的说明如下图所示。
+
+.. image:: ../../../tutorials/text_generation/encoder-decoder-attention-model.png
+      :align: center
+
+在这个模型中，源序列 :math:`S = \{s_1, \dots, s_T\}` 
+用双向门控循环神经网络编码。双向门控循环神经网络的隐藏状态
+:math:`H_S = \{H_1, \dots, H_T\}` 被称为
+*编码向量*\ 。解码器是门控循环神经网络。当解读每一个 :math:`y_t` 时,
+这个门控循环神经网络生成一系列权重  :math:`W_S^t = \{W_1^t, \dots, W_T^t\}` ,
+用于计算编码向量的加权和。加权和用来生成 :math:`y_t` 。
+
+模型的编码器部分如下所示。它调用\ ``grumemory``\ 来表示门控循环神经网络。如果网络架构简单，那么推荐使用这种方式来配置循环神经网络，因为它比
+``recurrent_group``
+更快。我们已经实现了大多数常用的循环神经网络架构，可以参考 :ref:`api_trainer_config_helpers_layers` 了解更多细节。
+
+我们还将编码向量投射到 ``decoder_size``
+维空间。这通过获得反向循环网络的第一个实例，并将其投射到
+``decoder_size`` 维空间完成：
+
+.. code:: python
+
+    # 定义源语句的数据层
+    src_word_id = data_layer(name='source_language_word', size=source_dict_dim)
+    # 计算每个词的词向量
+    src_embedding = embedding_layer(
+        input=src_word_id,
+        size=word_vector_dim,
+        param_attr=ParamAttr(name='_source_language_embedding'))
+    # 应用前向循环神经网络
+    src_forward = grumemory(input=src_embedding, size=encoder_size)
+    # 应用反向递归神经网络（reverse=True表示反向循环神经网络）
+    src_backward = grumemory(input=src_embedding,
+                              size=encoder_size,
+                              reverse=True)
+    # 将循环神经网络的前向和反向部分混合在一起
+    encoded_vector = concat_layer(input=[src_forward, src_backward])
+
+    # 投射编码向量到 decoder_size
+    encoder_proj = mixed_layer(input = [full_matrix_projection(encoded_vector)],
+                               size = decoder_size)
+
+    # 计算反向RNN的第一个实例
+    backward_first = first_seq(input=src_backward)
+
+    # 投射反向RNN的第一个实例到 decoder size
+    decoder_boot = mixed_layer(input=[full_matrix_projection(backward_first)], size=decoder_size, act=TanhActivation())
+
+解码器使用 ``recurrent_group`` 来定义循环神经网络。单步函数和输出函数在
+``gru_decoder_with_attention`` 中定义：
+
+.. code:: python
+
+    group_inputs=[StaticInput(input=encoded_vector,is_seq=True),
+                  StaticInput(input=encoder_proj,is_seq=True)]
+    trg_embedding = embedding_layer(
+        input=data_layer(name='target_language_word',
+                         size=target_dict_dim),
+        size=word_vector_dim,
+        param_attr=ParamAttr(name='_target_language_embedding'))
+    group_inputs.append(trg_embedding)
+
+    # 对于配备有注意力机制的解码器，在训练中，
+    # 目标向量（groundtruth）是数据输入，
+    # 而源序列的编码向量可以被无边界的memory访问
+    # StaticInput 意味着不同时间步的输入都是相同的值，
+    # 否则它以一个序列输入，不同时间步的输入是不同的。
+    # 所有输入序列应该有相同的长度。
+    decoder = recurrent_group(name=decoder_group_name,
+                              step=gru_decoder_with_attention,
+                              input=group_inputs)
+
+单步函数的实现如下所示。首先，它定义解码网络的\ **Memory**\ 。然后定义
+attention，门控循环单元单步函数和输出函数：
+
+.. code:: python
+
+    def gru_decoder_with_attention(enc_vec, enc_proj, current_word):
+        # 定义解码器的Memory
+        # Memory的输出定义在 gru_step 内
+        # 注意 gru_step 应该与它的Memory名字相同
+        decoder_mem = memory(name='gru_decoder',
+                             size=decoder_size,
+                             boot_layer=decoder_boot)
+        # 计算 attention 加权编码向量
+        context = simple_attention(encoded_sequence=enc_vec,
+                                   encoded_proj=enc_proj,
+                                   decoder_state=decoder_mem)
+        # 混合当前词向量和attention加权编码向量
+        decoder_inputs = mixed_layer(input = [full_matrix_projection(context),
+                                              full_matrix_projection(current_word)],
+                                     size = decoder_size * 3)
+        # 定义门控循环单元循环神经网络单步函数
+        gru_step = gru_step_layer(name='gru_decoder',
+                                  input=decoder_inputs,
+                                  output_mem=decoder_mem,
+                                  size=decoder_size)
+        # 定义输出函数
+        out = mixed_layer(input=[full_matrix_projection(input=gru_step)],
+                          size=target_dict_dim,
+                          bias_attr=True,
+                          act=SoftmaxActivation())
+        return out
+
+生成序列
+--------
+
+训练模型后，我们可以使用它来生成序列。通常的做法是使用\ **beam search**
+生成序列。以下代码片段定义 beam search 算法。注意，\ ``beam_search``
+函数假设 ``step`` 的输出函数返回的是下一个时刻输出词的 softmax
+归一化概率向量。我们对模型进行了以下更改。
+
+-  使用 ``GeneratedInput`` 来表示 trg\_embedding。 ``GeneratedInput``
+   将上一时间步所生成的词的向量来作为当前时间步的输入。
+-  使用 ``beam_search`` 函数。这个函数需要设置：
+
+   -  ``bos_id``: 开始标记。每个句子都以开始标记开头。
+   -  ``eos_id``: 结束标记。每个句子都以结束标记结尾。
+   -  ``beam_size``: beam search 算法中的beam大小。
+   -  ``max_length``: 生成序列的最大长度。
+
+-  使用 ``seqtext_printer_evaluator``
+   根据索引矩阵和字典打印文本。这个函数需要设置：
+
+   -  ``id_input``: 数据的整数ID，用于标识生成的文件中的相应输出。
+   -  ``dict_file``: 用于将词ID转换为词的字典文件。
+   -  ``result_file``: 生成结果文件的路径。
+
+代码如下：
+
+.. code:: python
+
+    group_inputs=[StaticInput(input=encoded_vector,is_seq=True),
+                  StaticInput(input=encoder_proj,is_seq=True)]
+    # 在生成时，解码器基于编码源序列和最后生成的目标词预测下一目标词。
+    # 编码源序列（编码器输出）必须由只读Memory的 StaticInput 指定。
+    # 这里， GeneratedInput 自动获取上一个生成的词，并在最开始初始化为起始词，如 <s>。
+    trg_embedding = GeneratedInput(
+        size=target_dict_dim,
+        embedding_name='_target_language_embedding',
+        embedding_size=word_vector_dim)
+    group_inputs.append(trg_embedding)
+    beam_gen = beam_search(name=decoder_group_name,
+                           step=gru_decoder_with_attention,
+                           input=group_inputs,
+                           bos_id=0, # Beginning token.
+                           eos_id=1, # End of sentence token.
+                           beam_size=beam_size,
+                           max_length=max_length)
+
+    seqtext_printer_evaluator(input=beam_gen,
+                              id_input=data_layer(name="sent_id", size=1),
+                              dict_file=trg_dict_path,
+                              result_file=gen_trans_file)
+    outputs(beam_gen)
+
+注意，这种生成技术只用于类似解码器的生成过程。如果你正在处理序列标记任务，请参阅 :ref:`semantic_role_labeling` 了解更多详细信息。
+
+完整的配置文件在\ ``demo/seqToseq/seqToseq_net.py``\ 。
diff --git a/doc/howto/deep_model/rnn/rnn_en.rst b/doc/howto/deep_model/rnn/rnn_config_en.rst
similarity index 98%
rename from doc/howto/deep_model/rnn/rnn_en.rst
rename to doc/howto/deep_model/rnn/rnn_config_en.rst
index b4c0c8bb4c..73f5d5371f 100644
--- a/doc/howto/deep_model/rnn/rnn_en.rst
+++ b/doc/howto/deep_model/rnn/rnn_config_en.rst
@@ -30,7 +30,7 @@ Then at the :code:`process` function, each :code:`yield` function will return th
     yield src_ids, trg_ids, trg_ids_next
 
 
-For more details description of how to write a data provider, please refer to :ref:`api_pydataprovider2_en` . The full data provider file is located at :code:`demo/seqToseq/dataprovider.py`.
+For more details description of how to write a data provider, please refer to :ref:`api_pydataprovider2` . The full data provider file is located at :code:`demo/seqToseq/dataprovider.py`.
 
 ===============================================
 Configure Recurrent Neural Network Architecture
@@ -42,8 +42,8 @@ Simple Gated Recurrent Neural Network
 
 Recurrent neural network process a sequence at each time step sequentially. An example of the architecture of LSTM is listed below.
 
-.. image:: ../../../tutorials/sentiment_analysis/bi_lstm.jpg
-	 :align: center
+.. image:: ../../../tutorials/sentiment_analysis/src/bi_lstm.jpg
+     :align: center
 
 Generally speaking, a recurrent network perform the following operations from :math:`t=1` to :math:`t=T`, or reversely from :math:`t=T` to :math:`t=1`.
 
@@ -102,7 +102,7 @@ Sequence to Sequence Model with Attention
 We will use the sequence to sequence model with attention as an example to demonstrate how you can configure complex recurrent neural network models. An illustration of the sequence to sequence model with attention is shown in the following figure.
 
 .. image:: ../../../tutorials/text_generation/encoder-decoder-attention-model.png
- 	 :align: center
+      :align: center
 
 In this model, the source sequence :math:`S = \{s_1, \dots, s_T\}` is encoded with a bidirectional gated recurrent neural networks. The hidden states of the bidirectional gated recurrent neural network :math:`H_S = \{H_1, \dots, H_T\}` is called *encoder vector* The decoder is a gated recurrent neural network. When decoding each token :math:`y_t`, the gated recurrent neural network generates a set of weights :math:`W_S^t = \{W_1^t, \dots, W_T^t\}`, which are used to compute a weighted sum of the encoder vector. The weighted sum of the encoder vector is utilized to condition the generation of the token :math:`y_t`.
 
@@ -246,6 +246,6 @@ The code is listed below:
     outputs(beam_gen)
 
 
-Notice that this generation technique is only useful for decoder like generation process. If you are working on sequence tagging tasks, please refer to :ref:`semantic_role_labeling_en` for more details.
+Notice that this generation technique is only useful for decoder like generation process. If you are working on sequence tagging tasks, please refer to :ref:`semantic_role_labeling` for more details.
 
 The full configuration file is located at :code:`demo/seqToseq/seqToseq_net.py`.
diff --git a/doc_cn/algorithm/rnn/glossary_rnn.dot b/doc/howto/deep_model/rnn/src/glossary_rnn.dot
similarity index 100%
rename from doc_cn/algorithm/rnn/glossary_rnn.dot
rename to doc/howto/deep_model/rnn/src/glossary_rnn.dot
diff --git a/doc_cn/algorithm/rnn/glossary_rnn_with_memory.dot b/doc/howto/deep_model/rnn/src/glossary_rnn_with_memory.dot
similarity index 100%
rename from doc_cn/algorithm/rnn/glossary_rnn_with_memory.dot
rename to doc/howto/deep_model/rnn/src/glossary_rnn_with_memory.dot
diff --git a/doc_cn/algorithm/rnn/simple_full_hierarchical_recurrent.dot b/doc/howto/deep_model/rnn/src/simple_full_hierarchical_recurrent.dot
similarity index 100%
rename from doc_cn/algorithm/rnn/simple_full_hierarchical_recurrent.dot
rename to doc/howto/deep_model/rnn/src/simple_full_hierarchical_recurrent.dot
diff --git a/doc_cn/algorithm/rnn/simple_full_recurrent.dot b/doc/howto/deep_model/rnn/src/simple_full_recurrent.dot
similarity index 100%
rename from doc_cn/algorithm/rnn/simple_full_recurrent.dot
rename to doc/howto/deep_model/rnn/src/simple_full_recurrent.dot
diff --git a/doc/howto/new_layer/FullyConnected.jpg b/doc/howto/dev/FullyConnected.jpg
similarity index 100%
rename from doc/howto/new_layer/FullyConnected.jpg
rename to doc/howto/dev/FullyConnected.jpg
diff --git a/doc/howto/dev/contribute_to_paddle_cn.md b/doc/howto/dev/contribute_to_paddle_cn.md
new file mode 100644
index 0000000000..ee1b3213ea
--- /dev/null
+++ b/doc/howto/dev/contribute_to_paddle_cn.md
@@ -0,0 +1,130 @@
+# 如何贡献代码
+
+我们真诚地感谢您的贡献，欢迎通过 GitHub 的 fork 和 pull request 流程来提交代码。
+ 
+## 代码要求
+- 你的代码必须完全遵守 [doxygen](http://www.stack.nl/~dimitri/doxygen/) 的样式。
+- 确保编译器选项 WITH\_STYLE\_CHECK 已打开，并且编译能通过代码样式检查。
+- 所有代码必须具有单元测试。
+- 通过所有单元测试。
+
+以下教程将指导您提交代码。
+ 
+## [Fork](https://help.github.com/articles/fork-a-repo/)
+ 
+跳转到[PaddlePaddle](https://github.com/PaddlePaddle/Paddle) GitHub首页，然后单击 `Fork` 按钮。
+
+## 克隆（Clone）
+
+Paddle 目前使用[git流分支模型](http://nvie.com/posts/a-successful-git-branching-model/)进行开发，测试，发行和维护。
+**develop** 是主分支，其他用户分支是特征分支（feature branches）。
+
+一旦你创建了一个fork，你可以使用你最喜欢的 git 客户端克隆你的仓库（repo）或只是直接在命令行输入：
+
+```shell
+# 克隆 fork 到本地
+git clone --branch develop https://github.com/USERNAME/Paddle.git
+```
+如果你的仓库不包含 **develop** 分支，你只需自己创建它。
+
+```shell
+git clone https://github.com/USERNAME/Paddle.git Paddle
+cd Paddle
+git checkout -b develop  # 创建 develop 分支
+git remote add upstream https://github.com/PaddlePaddle/Paddle.git  # 添加 upstream 到 baidu/Paddle
+git pull upstream develop  # 更新 upstream
+```
+
+然后你可以通过做一个本地开发分支开始开发
+
+```shell
+git checkout -b MY_COOL_STUFF_BRANCH
+```
+
+## 使用 `pre-commit` 钩子
+
+Paddle 开发人员使用 [pre-commit](http://pre-commit.com/) 工具来管理git预提交钩子。 它可以帮助我们格式化源代码（cpp，python），在提交前检查一些基本事宜（每个文件只有一个 EOL，
+git 中不要添加大文件）。 `pre-commit` 测试是 Travis-CI 中单元测试的一部分，不满足钩子
+的 PR 不能提交代码到 Paddle。
+
+你可以通过 `pip install pre-commit` 安装 [pre-commit](http://pre-commit.com/)，
+目前 Paddle 使用 `clang-format` 来调整C/C++源代码格式。请确保 clang-format 版本在3.8以上。
+
+然后只需在 Paddle clone 目录中运行 `pre-commit install` 。当你
+提交你的代码时，pre-commit 钩子会检查本地代码是否存在
+不适合提交的东西，等等。
+
+## 提交（Commit）
+
+提交你的代码：
+
+```shell
+# 显示工作树状态
+git status
+# 添加修改过的文件
+git add xx
+env EDITOR=vim git commit  # 你可以用 vim/nano/emacs 写下你的注释
+```
+提交信息的第一行是标题，其他行可以添加一些细节（如果有必要的话）。
+
+## 保持 Fork 状态最新
+
+在发起 pull request 之前，你应该从最新的 PaddlePaddle 同步代码。
+为此，你需要首先添加远程（remote）：
+
+```shell
+# 观察当前远程仓库配置
+git remote -v
+# 添加上游（upstream）仓库
+git remote add upstream https://github.com/PaddlePaddle/Paddle.git
+# 验证新的 upstream
+git remote -v
+```
+
+用最新的 upstream 更新你的 fork：
+
+```shell
+git pull --rebase upstream develop
+```
+如果本地没有提交，git 将简单地执行快进。但是，如果你一直在做一些改变（绝大多数情况下不应该），你可能要处理冲突。
+
+现在，你的本地主分支与上游修改的一致并是最新的。
+
+## 推送（Push）到 GitHub
+
+```shell
+# 在 GitHub 上 push 你的仓库
+git push -u origin MY_COOL_STUFF_BRANCH  # 创建远程分支 MY_COOL_STUFF_BRANCH 到 origin.
+```
+
+## 拉取请求（Pull Request）
+
+转到 GitHub上 你 fork 的页面，选择你的开发分支并单击 **pull request 按钮**。
+
+## 使用最新版本更新你的 pull 请求
+
+在代码审查（code review）期间，由于 baidu/Paddle 中新的提交导致你的 pull 请求可能会失效。如果没有冲突，GitHub允许自动更新。 你可以点击 pull request 页面中的“更新分支（Update Branch）”按钮。 但是如果存在代码冲突，你需要手动进行更新。你需要在本地仓库执行如下命令：
+
+```shell
+git checkout MY_COOL_STUFF_BRANCH
+git pull upstream develop
+# 你可能需要根据git提示解决冲突
+# 创建并测试你的代码
+git push origin MY_COOL_STUFF_BRANCH
+```
+现在你的 Pull Request 是最新的了。
+
+## 修改你的 pull request
+
+当根据审阅者的意见修改 pull 请求时，请使用“git commit”而不是“git commit --amend”来提交更改，以便审阅者可以看到新的请求和旧的请求之间的区别。
+
+可能的命令是
+
+```shell
+git checkout MY_COOL_STUFF_BRANCH
+git pull upstream develop   # 将本地更新到最新的代码库
+# 可能会发生一些冲突
+# 开始开发吧！
+env EDITOR=vim git commit  # 添加修改日志
+git push origin MY_COOL_STUFF_BRANCH
+```
diff --git a/doc/howto/contribute_to_paddle_en.md b/doc/howto/dev/contribute_to_paddle_en.md
similarity index 81%
rename from doc/howto/contribute_to_paddle_en.md
rename to doc/howto/dev/contribute_to_paddle_en.md
index 1decc91d62..9b0d3e83c0 100644
--- a/doc/howto/contribute_to_paddle_en.md
+++ b/doc/howto/dev/contribute_to_paddle_en.md
@@ -1,8 +1,8 @@
-# How to Contribute Code
+# Contribute Code
 
 We sincerely appreciate your contributions. You can use fork and pull request
-workflow to merge your code. 
- 
+workflow to merge your code.
+
 ## Code Requirements
 - Your code must be fully documented by
   [doxygen](http://www.stack.nl/~dimitri/doxygen/) style.
@@ -12,11 +12,11 @@ workflow to merge your code.
 - Pass all unit tests.
 
 The following tutorial guides you into submitting your contibution.
- 
+
 ## [Creating a Fork](https://help.github.com/articles/fork-a-repo/)
- 
+
 Just head over to the GitHub page and click the "Fork" button.
-It's just that simple. 
+It's just that simple.
 
 ## Clone
 
@@ -25,7 +25,7 @@ The **develop** is the main branch, and other user's branches are feature branch
 
 Once you've created a fork, you can use your favorite git client to clone your
 repo or just head straight to the command line:
- 
+
 ```shell
 # Clone your fork to your local machine
 git clone --branch develop https://github.com/USERNAME/Paddle.git
@@ -38,7 +38,6 @@ cd Paddle
 git checkout -b develop  # create develop branch.
 git remote add upstream https://github.com/PaddlePaddle/Paddle.git  # add upstream to baidu/Paddle
 git pull upstream develop  # update to upstream
-git submodule update --init --recursive
 ```
 
 Then you can start to develop by making a local developement branch
@@ -47,6 +46,22 @@ Then you can start to develop by making a local developement branch
 git checkout -b MY_COOL_STUFF_BRANCH
 ```
 
+## Using `pre-commit` hook
+
+Paddle developers use [pre-commit](http://pre-commit.com/) tool to manage git
+pre-commit hooks. It can help us format source codes (cpp, python), check some
+basic thing before commit (only one EOL for each file, do not add a huge file
+in git). `pre-commit` tests are a part of unit tests in Travis-CI now; any
+PR that doesn't pass the hooks cannot be merged into Paddle.
+
+To use [pre-commit](http://pre-commit.com/), you should install it by
+`pip install pre-commit`, and currently, Paddle uses `clang-format` to format
+c/cpp sources. Please make sure clang-format 3.8+ is installed.
+
+Then just run `pre-commit install` in your Paddle clone directory. When you
+commit your code, the pre-commit hook will check the local code if there is
+anything not suitable to commit, and so on.
+
 ## Commit
 
 Commit your changes by following command lines:
@@ -83,7 +98,7 @@ git pull --rebase upstream develop
 
 If there are no unique commits locally, git will simply perform a fast-forward.
 However, if you have been making changes (in the vast majority of cases you
-probably shouldn't be), you may have to deal with conflicts. 
+probably shouldn't be), you may have to deal with conflicts.
 
 Now, your local master branch is up-to-date with everything modified upstream.
 
diff --git a/doc/howto/dev/new_layer_cn.rst b/doc/howto/dev/new_layer_cn.rst
new file mode 100644
index 0000000000..9489a921c7
--- /dev/null
+++ b/doc/howto/dev/new_layer_cn.rst
@@ -0,0 +1,389 @@
+================
+实现新的网络层
+================
+
+这份教程展示了如何在PaddlePaddle中实现一个自定义的网络层。在这里我们使用全连接层作为例子来展示实现新网络层所需要的四个步骤。
+
+1. 推导该层前向和后向传递的方程。
+2. 实现该层的C++类。
+3. 增加梯度检测的单元测试，以保证梯度的正确计算。
+4. 封装该层的Python接口。
+
+推导方程
+================
+
+首先我们需要推导该网络层的 *前向传播* 和 *后向传播* 的方程。前向传播给定输入，计算输出。后向传播给定输出的梯度，计算输入和参数的梯度。
+
+下图是一个全连接层的示意图。在全连接层中，每个输出节点都连接到所有的输入节点上。
+
+..  image:: FullyConnected.jpg
+    :align: center
+    :scale: 60 %
+
+一个网络层的前向传播部分把输入转化为相应的输出。
+全连接层以一个维度为 :math:`D_i` 的稠密向量作为输入，使用一个尺度为 :math:`D_i \times D_o` 的变换矩阵 :math:`W` 把 :math:`x` 映射到一个维度为 :math:`D_o` 的向量，并在乘积结果上再加上维度为 :math:`D_o` 的偏置向量 :math:`b` 。
+
+.. math::
+
+   y = f(W^T x + b)
+
+其中 :math:`f(.)` 是一个非线性的 *激活方程* ，例如sigmoid， tanh，以及Relu。
+
+变换矩阵 :math:`W` 和偏置向量 :math:`b`  是该网络层的 *参数* 。一个网络层的参数是在 *反向传播* 时被训练的。反向传播根据输出的梯度，分别计算每个参数的梯度，以及输入的梯度。优化器则用链式法则来对每个参数计算损失函数的梯度。
+
+假设损失函数是 :math:`c(y)` ，那么
+
+.. math::
+
+   \frac{\partial c(y)}{\partial x} = \frac{\partial c(y)}{\partial y} \frac{\partial y}{\partial x}
+
+假设 :math:`z = W^T x + b` ，那么
+
+.. math::
+
+   \frac{\partial y}{\partial z} = \frac{\partial f(z)}{\partial z}
+
+PaddlePaddle的base layer类可以自动计算上面的导数。
+
+因此，对全连接层来说，我们需要计算：
+
+.. math::
+
+   \frac{\partial z}{\partial x} = W, \frac{\partial z_j}{\partial W_{ij}} = x_i, \frac{\partial z}{\partial b} = \mathbf 1
+
+其中 :math:`\mathbf 1` 是一个全1的向量， :math:`W_{ij}` 是矩阵 :math:`W` 第i行第j列的数值， :math:`z_j` 是向量 :math:`z` 的第j个值， :math:`x_i` 是向量 :math:`x` 的第i个值。
+
+最后我们使用链式法则计算 :math:`\frac{\partial c(y)}{\partial x}` 以及 :math:`\frac{\partial c(y)}{\partial W}` 。计算的细节将在下面的小节给出。
+
+实现C++类
+===================
+
+一个网络层的C++类需要实现初始化，前向和后向。全连接层的实现位于 :code:`paddle/gserver/layers/FullyConnectedLayer.h` 及 :code:`paddle/gserver/layers/FullyConnectedLayer.cpp` 。这里我们展示一份简化过的代码。
+
+这个类需要继承 :code:`paddle::Layer` 这个基类，并且需要重写基类中的以下几个虚函数：
+
+- 类的构造函数和析构函数。
+- :code:`init` 函数。用于初始化参数和设置。
+- :code:`forward` 。实现网络层的前向传播。
+- :code:`backward` 。实现网络层的后向传播。
+- :code:`prefetch` 。用来从参数服务器预取参数矩阵相应的行。如果网络层不需要远程稀疏更新，则不需要重写该函数。（大多数网络层不需要支持远程稀疏更新）
+
+
+头文件如下：
+
+.. code-block:: c++
+
+    namespace paddle {
+    /**
+     * 全连接层的每个输出都连接到上一层的所有的神经元上。
+     * 它的输入与经过学习的参数做内积并加上偏置（可选）。
+     *
+     * 配置文件接口是fc_layer。
+     */
+
+    class FullyConnectedLayer : public Layer {
+    protected:
+      WeightList weights_;
+      std::unique_ptr<Weight> biases_;
+
+    public:
+      explicit FullyConnectedLayer(const LayerConfig& config)
+          : Layer(config) {}
+      ~FullyConnectedLayer() {}
+
+      bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+
+      Weight& getWeight(int idx) { return *weights_[idx]; }
+
+      void prefetch();
+      void forward(PassType passType);
+      void backward(const UpdateCallback& callback = nullptr);
+    };
+    }  // namespace paddle
+
+头文件中把参数定义为类的成员变量。我们使用 :code:`Weight` 类作为参数的抽象，它支持多线程更新。该类的实现细节在“实现细节”中详细介绍。
+
+- :code:`weights_` 是存有一系列变换矩阵的权重。在当前的实现方式下，网络层可以有多个输入。因此，它可能有不止一个权重。每个权重对应一个输入。
+- :code:`biases_` 是存有偏置向量的权重。
+
+全连接层没有网络层配置的超参数。如果一个网络层需要配置的话，通常的做法是将配置存于 :code:`LayerConfig& config` 中，并在类构建函数中把它放入一个类成员变量里。
+
+下面的代码片段实现了 :code:`init` 函数。
+
+- 首先，所有的 :code:`init` 函数必须先调用基类中的函数 :code:`Layer::init(layerMap, parameterMap);` 。该语句会为每个层初始化其所需要的变量和连接。
+- 之后初始化所有的权重矩阵 :math:`W` 。当前的实现方式下，网络层可以有多个输入。因此，它可能有不止一个权重。
+- 最后，初始化偏置向量。
+
+
+.. code-block:: c++
+
+    bool FullyConnectedLayer::init(const LayerMap& layerMap,
+                                   const ParameterMap& parameterMap) {
+      /* 初始化父类 */
+      Layer::init(layerMap, parameterMap);
+
+      /* 初始化权重表 */
+      CHECK(inputLayers_.size() == parameters_.size());
+      for (size_t i = 0; i < inputLayers_.size(); i++) {
+        // 获得参数尺寸
+        size_t height = inputLayers_[i]->getSize();
+        size_t width = getSize();
+
+        // 新建一个权重
+        if (parameters_[i]->isSparse()) {
+          CHECK_LE(parameters_[i]->getSize(), width * height);
+        } else {
+          CHECK_EQ(parameters_[i]->getSize(), width * height);
+        }
+        Weight* w = new Weight(height, width, parameters_[i]);
+
+        // 将新建的权重加入权重表
+        weights_.emplace_back(w);
+      }
+
+      /* 初始化biases_ */
+      if (biasParameter_.get() != NULL) {
+        biases_ = std::unique_ptr<Weight>(new Weight(1, getSize(), biasParameter_));
+      }
+
+      return true;
+    }
+
+实现前向传播的部分有下面几个步骤。
+
+- 每个层在其 :code:`forward` 函数的开头必须调用 :code:`Layer::forward(passType);` 。
+- 之后使用 :code:`reserveOutput(batchSize, size);` 为输出分配内存。由于我们支持训练数据有不同的批次大小，所以这一步是必要的。 :code:`reserveOutput`  会相应地改变输出的尺寸。为了保证效率，如果需要扩大矩阵，我们会重新分配内存；如果需要缩减矩阵，我们会继续使用现有的内存块。
+- 之后使用矩阵运算函数来计算 :math:`\sum_i W_i x + b` 。 :code:`getInput(i).value` 返回第i个输入矩阵。每个输入都是一个 :math:`batchSize \times dim` 的矩阵，每行表示一个批次中的单个输入。对于我们支持的全部矩阵操作，请参考 :code:`paddle/math/Matrix.h` 和 :code:`paddle/math/BaseMatrix.h` 。
+- 最终，使用 :code:`forwardActivation();` 进行激活操作。这会自动进行网络配置中声明的激活操作。
+
+
+.. code-block:: c++
+
+    void FullyConnectedLayer::forward(PassType passType) {
+      Layer::forward(passType);
+
+      /* 若有必要，为output_申请内存 */
+      int batchSize = getInput(0).getBatchSize();
+      int size = getSize();
+
+      {
+        // 设置输出的尺寸
+        reserveOutput(batchSize, size);
+      }
+
+      MatrixPtr outV = getOutputValue();
+
+      // 对每个输入乘上变换矩阵
+      for (size_t i = 0; i != inputLayers_.size(); ++i) {
+        auto input = getInput(i);
+        CHECK(input.value) << "The input of 'fc' layer must be matrix";
+        i == 0 ? outV->mul(input.value, weights_[i]->getW(), 1, 0)
+               : outV->mul(input.value, weights_[i]->getW(), 1, 1);
+      }
+
+      /* 加上偏置向量 */
+      if (biases_.get() != NULL) {
+        outV->addBias(*(biases_->getW()), 1);
+      }
+
+      /* 激活 */ {
+        forwardActivation();
+      }
+    }
+
+实现后向传播的部分有下面几个步骤。
+
+- :code:`backwardActivation()` 计算激活函数的梯度。通过 :code:`getOutputGrad()` 来获得输出的梯度，调用该函数后，梯度会就地（不使用额外空间）乘上输出的梯度。
+- 计算偏置的梯度。注意，我们使用 :code:`biases_->getWGrad()` 来得到某个特定参数的梯度矩阵。在一个参数的梯度被更新后，**必须**要调用 :code:`getParameterPtr()->incUpdate(callback);` 。这用于在多线程和多机上更新参数。
+- 最后，计算转换矩阵和输入的梯度，并对相应的参数调用 :code:`incUpdate` 。PaddlePaddle可以通过该机制判断是否已经收集齐所有的梯度，从而可以做一些与计算重叠的工作（例如，网络通信）。
+
+
+.. code-block:: c++
+
+    void FullyConnectedLayer::backward(const UpdateCallback& callback) {
+      /* 对激活求导 */ {
+        backwardActivation();
+      }
+
+      if (biases_ && biases_->getWGrad()) {
+        biases_->getWGrad()->collectBias(*getOutputGrad(), 1);
+
+        biases_->getParameterPtr()->incUpdate(callback);
+      }
+
+      bool syncFlag = hl_get_sync_flag();
+
+      for (size_t i = 0; i != inputLayers_.size(); ++i) {
+        /* 计算当前层权重的梯度 */
+        if (weights_[i]->getWGrad()) {
+          MatrixPtr input_T = getInputValue(i)->getTranspose();
+          MatrixPtr oGrad = getOutputGrad();
+          {
+            weights_[i]->getWGrad()->mul(input_T, oGrad, 1, 1);
+          }
+        }
+
+
+        /* 计算输入层的偏差 */
+        MatrixPtr preGrad = getInputGrad(i);
+        if (NULL != preGrad) {
+          MatrixPtr weights_T = weights_[i]->getW()->getTranspose();
+          preGrad->mul(getOutputGrad(), weights_T, 1, 1);
+        }
+
+        {
+          weights_[i]->getParameterPtr()->incUpdate(callback);
+        }
+      }
+    }
+
+:code:`prefetch` 函数指出了在训练时需要从参数服务器取出的行。仅在远程稀疏训练时有效。使用远程稀疏方式训练时，完整的参数矩阵被分布在不同的参数服务器上。当网络层用一个批次做训练时，该批次的输入中仅有一个子集是非零的。因此，该层仅需要这些非零样本位置所对应的变换矩阵的那些行。 :code:`prefetch` 表明了这些行的标号。
+
+大多数层不需要远程稀疏训练函数。这种情况下不需要重写该函数。
+
+.. code-block:: c++
+
+    void FullyConnectedLayer::prefetch() {
+      for (size_t i = 0; i != inputLayers_.size(); ++i) {
+        auto* sparseParam =
+            dynamic_cast<SparsePrefetchRowCpuMatrix*>(weights_[i]->getW().get());
+        if (sparseParam) {
+          MatrixPtr input = getInputValue(i);
+          sparseParam->addRows(input);
+        }
+      }
+    }
+
+最后，使用 :code:`REGISTER_LAYER(fc, FullyConnectedLayer);` 来注册该层。 :code:`fc` 是该层的标识符， :code:`FullyConnectedLayer` 是该层的类名。
+
+.. code-block:: c++
+
+    namespace paddle {
+    REGISTER_LAYER(fc, FullyConnectedLayer);
+    }
+
+若 :code:`cpp` 被放在 :code:`paddle/gserver/layers` 目录下，其会自动被加入编译列表。
+
+
+写梯度检查单元测试
+===============================
+
+写梯度检查单元测试是一个验证新实现的层是否正确的相对简单的办法。梯度检查单元测试通过有限差分法来验证一个层的梯度。首先对输入做一个小的扰动 :math:`\Delta x` ，然后观察到输出的变化为 :math:`\Delta y` ，那么，梯度就可以通过这个方程计算得到 :math:`\frac{\Delta y}{\Delta x }` 。之后，再用这个梯度去和 :code:`backward` 函数得到的梯度去对比，以保证梯度计算的正确性。需要注意的是梯度检查仅仅验证了梯度的计算，并不保证 :code:`forward` 和 :code:`backward` 函数的实现是正确的。你需要一些更复杂的单元测试来保证你实现的网络层是正确的。
+
+所有网络层的梯度检查单测都位于 :code:`paddle/gserver/tests/test_LayerGrad.cpp` 。我们建议你在写新网络层时把测试代码放入新的文件中。下面列出了全连接层的梯度检查单元测试。它包含以下几步：
+
++ 生成网络层配置。网络层配置包含以下几项：
+   - 偏置参数的大小。（例子中是4096）
+   - 层的类型。（例子中是fc）
+   - 层的大小。（例子中是4096）
+   - 激活的类型。（例子中是softmax）
+   - dropout的比例。（例子中是0.1）
++ 配置网络层的输入。在这个例子里，我们仅有一个输入。
+   - 输入的类型（ :code:`INPUT_DATA` ），可以是以下几种：
+       - :code:`INPUT_DATA` ：稠密向量。
+       - :code:`INPUT_LABEL` ：整数。
+       - :code:`INPUT_DATA_TARGET` ：稠密向量，但不用于计算梯度。
+       - :code:`INPUT_SEQUENCE_DATA` ：含有序列信息的稠密向量。
+       - :code:`INPUT_HASSUB_SEQUENCE_DATA` ：含有序列信息和子序列信息的稠密向量。
+       - :code:`INPUT_SEQUENCE_LABEL` ：含有序列信息的整数。
+       - :code:`INPUT_SPARSE_NON_VALUE_DATA` ：0-1稀疏数据。
+       - :code:`INPUT_SPARSE_FLOAT_VALUE_DATA` ：浮点稀疏数据。
+   - 输入的名字。（例子中是 :code:`layer_0` ）
+   - 输入的大小。（例子中是8192）
+   - 非零数字的个数，仅对稀疏数据有效。
+   - 稀疏数据的格式，仅对稀疏数据有效。
++ 对每个输入，都需要调用一次 :code:`config.layerConfig.add_inputs();` 。
++ 调用 :code:`testLayerGrad` 来做梯度检查。它包含以下参数。
+   - 层和输入的配置。（例子中是 :code:`config` ）
+   - 网络层的类型。（例子中是 :code:`fc` ）
+   - 梯度检查的输入数据的批次大小。（例子中是100）
+   - 输入是否是转置的。大多数层需要设置为 :code:`false` 。（例子中是 :code:`false` ）
+   - 是否使用权重。有些层或者激活需要做归一化以保证它们的输出的和是一个常数。例如，softmax激活的输出的和总是1。在这种情况下，我们不能通过常规的梯度检查的方式来计算梯度。因此我们采用输出的加权和（非常数）来计算梯度。（例子中是 :code:`true` ，因为全连接层的激活可以是softmax）
+
+.. code-block:: c++
+
+    void testFcLayer(string format, size_t nnz) {
+      // Create layer configuration.
+      TestConfig config;
+      config.biasSize = 4096;
+      config.layerConfig.set_type("fc");
+      config.layerConfig.set_size(4096);
+      config.layerConfig.set_active_type("softmax");
+      config.layerConfig.set_drop_rate(0.1);
+      // Setup inputs.
+      config.inputDefs.push_back(
+          {INPUT_DATA, "layer_0", 8192, nnz, ParaSparse(format)});
+        config.layerConfig.add_inputs();
+      LOG(INFO) << config.inputDefs[0].sparse.sparse << " "
+                << config.inputDefs[0].sparse.format;
+      for (auto useGpu : {false, true}) {
+        testLayerGrad(config, "fc", 100, /* trans */ false, useGpu,
+                      /* weight */ true);
+      }
+    }
+
+如果你要为了测试而增加新的文件，例如 :code:`paddle/gserver/tests/testFCGrad.cpp` ，你需要把该文件加入 :code:`paddle/gserver/tests/CMakeLists.txt` 中。下面给出了一个例子。当你执行命令 :code:`make tests` 时，所有的单测都会被执行一次。注意，有些层可能需要高精度来保证梯度检查单测正确执行。你需要在配置cmake时将 :code:`WITH_DOUBLE` 设置为 :code:`ON` 。
+
+.. code-block:: cmake
+
+    add_unittest_without_exec(test_FCGrad
+        test_FCGrad.cpp
+        LayerGradUtil.cpp
+        TestUtil.cpp)
+
+    add_test(NAME test_FCGrad
+        COMMAND test_FCGrad)
+
+
+实现python封装
+========================
+
+python封装的实现使得我们可以在配置文件中使用新实现的网络层。所有的python封装都在 :code:`python/paddle/trainer/config_parser.py` 中。全连接层python封装的例子中包含下面几步：
+
+- 所有的Python封装都使用 :code:`@config_layer('fc')` 这样的装饰器。网络层的标识符为 :code:`fc` 。
+- 实现构造函数 :code:`__init__` 。
+	- 它首先调用基构造函数 :code:`super(FCLayer, self).__init__(name, 'fc', size, inputs=inputs, **xargs)` 。 :code:`FCLayer` 是Python封装的类名。 :code:`fc` 是网络层的标识符。为了封装能够正确工作，这些名字必须要写对。
+	- 之后，计算变换矩阵的大小和格式（是否稀疏）。
+
+.. code-block:: python
+
+    @config_layer('fc')
+    class FCLayer(LayerBase):
+        def __init__(
+                self,
+                name,
+                size,
+                inputs,
+                bias=True,
+                **xargs):
+            super(FCLayer, self).__init__(name, 'fc', size, inputs=inputs, **xargs)
+            for input_index in xrange(len(self.inputs)):
+                input_layer = self.get_input_layer(input_index)
+                psize = self.config.size * input_layer.size
+                dims = [input_layer.size, self.config.size]
+                format = self.inputs[input_index].format
+                sparse = format == "csr" or format == "csc"
+                if sparse:
+                    psize = self.inputs[input_index].nnz
+                self.create_input_parameter(input_index, psize, dims, sparse, format)
+            self.create_bias_parameter(bias, self.config.size)
+
+在网络配置中，网络层的细节可以通过下面这些代码片段来指定。这个类的参数包括：
+
+- :code:`name` 是网络层实例的名字标识符。
+- :code:`type` 是网络层的类型，通过网络层的标识符来指定。
+- :code:`size` 是网络层输出的大小。
+- :code:`bias` 表明这个层的一个实例是否需要偏置。
+- :code:`inputs` 说明这个层的输入，输入是由一个list中的网络层实例的名字组成的。
+
+.. code-block:: python
+
+    Layer(
+        name = "fc1",
+        type = "fc",
+        size = 64,
+        bias = True,
+        inputs = [Input("pool3")]
+    )
+
+我们建议你为你的Python封装实现一个“助手”，使得搭模型时更方便。具体可以参考 :code:`python/paddle/trainer_config_helpers/layers.py` 。
diff --git a/doc/howto/new_layer/index_en.rst b/doc/howto/dev/new_layer_en.rst
similarity index 98%
rename from doc/howto/new_layer/index_en.rst
rename to doc/howto/dev/new_layer_en.rst
index 922bda5b0d..46481f5ead 100644
--- a/doc/howto/new_layer/index_en.rst
+++ b/doc/howto/dev/new_layer_en.rst
@@ -1,6 +1,6 @@
-=======================
-How to Write New Layers
-=======================
+================
+Write New Layers
+================
 
 This tutorial will guide you to write customized layers in PaddlePaddle. We will utilize fully connected layer as an example to guide you through the following steps for writing a new layer.
 
@@ -209,7 +209,6 @@ The implementation of the backward part has the following steps.
       if (biases_ && biases_->getWGrad()) {
         biases_->getWGrad()->collectBias(*getOutputGrad(), 1);
 
-        /* Increasing the number of gradient */
         biases_->getParameterPtr()->incUpdate(callback);
       }
 
@@ -297,7 +296,7 @@ All the gradient check unit tests are located in :code:`paddle/gserver/tests/tes
 + each inputs needs to call :code:`config.layerConfig.add_inputs();` once.
 + call :code:`testLayerGrad` to perform gradient checks. It has the following arguments.
    - layer and input configurations. (:code:`config` in our example)
-   - type of the input. (:code:`fc` in our example)
+   - type of the layer. (:code:`fc` in our example)
    - batch size of the gradient check. (100 in our example)
    - whether the input is transpose. Most layers need to set it to :code:`false`. (:code:`false` in our example)
    - whether to use weights. Some layers or activations perform normalization so that the sum of their output is a constant. For example, the sum of output of a softmax activation is one. In this case, we cannot correctly compute the gradients using regular gradient check techniques. A weighted sum of the output, which is not a constant, is utilized to compute the gradients. (:code:`true` in our example, because the activation of a fully connected layer can be softmax)
@@ -310,7 +309,7 @@ All the gradient check unit tests are located in :code:`paddle/gserver/tests/tes
       config.biasSize = 4096;
       config.layerConfig.set_type("fc");
       config.layerConfig.set_size(4096);
-      config.layerConfig.set_active_type("sigmoid");
+      config.layerConfig.set_active_type("softmax");
       config.layerConfig.set_drop_rate(0.1);
       // Setup inputs.
       config.inputDefs.push_back(
diff --git a/doc_cn/howto/how_to_write_docs/index.rst b/doc/howto/dev/write_docs_cn.rst
similarity index 90%
rename from doc_cn/howto/how_to_write_docs/index.rst
rename to doc/howto/dev/write_docs_cn.rst
index a1f983b340..5051a89230 100644
--- a/doc_cn/howto/how_to_write_docs/index.rst
+++ b/doc/howto/dev/write_docs_cn.rst
@@ -1,6 +1,6 @@
-###############################
-如何贡献/修改PaddlePaddle的文档
-###############################
+##################
+如何贡献/修改文档
+##################
 
 PaddlePaddle的文档包括英文文档 ``doc`` 和中文文档 ``doc_cn`` 两个部分。文档都是通过 `cmake`_ 驱动 `sphinx`_ 编译生成，生成后的文档分别存储在编译目录的 ``doc`` 和 ``doc_cn`` 两个子目录下。
 
@@ -51,4 +51,4 @@ TBD
 
 
 ..	_cmake: https://cmake.org/
-..	_sphinx: http://www.sphinx-doc.org/en/1.4.8/
\ No newline at end of file
+..	_sphinx: http://www.sphinx-doc.org/en/1.4.8/
diff --git a/doc/howto/index_cn.rst b/doc/howto/index_cn.rst
new file mode 100644
index 0000000000..bd3d0ec292
--- /dev/null
+++ b/doc/howto/index_cn.rst
@@ -0,0 +1,39 @@
+进阶指南
+========
+
+使用说明
+--------
+
+..  toctree::
+  :maxdepth: 1
+
+  usage/cmd_parameter/index_cn.rst
+  usage/concepts/use_concepts_cn.rst
+  usage/cluster/cluster_train_cn.md
+  usage/k8s/k8s_cn.md
+  usage/k8s/k8s_distributed_cn.md
+
+开发标准
+--------
+
+..  toctree::
+  :maxdepth: 1
+
+  dev/write_docs_cn.rst
+  dev/contribute_to_paddle_cn.md
+
+模型配置
+--------
+
+..  toctree::
+  :maxdepth: 1
+
+  deep_model/rnn/index_cn.rst
+
+性能优化
+--------
+
+..  toctree::
+  :maxdepth: 1
+
+  optimization/gpu_profiling_cn.rst
diff --git a/doc/howto/index_en.rst b/doc/howto/index_en.rst
index bd64c5b1fb..1fbfcd260b 100644
--- a/doc/howto/index_en.rst
+++ b/doc/howto/index_en.rst
@@ -7,9 +7,10 @@ Usage
 ..  toctree::
   :maxdepth: 1
 
-  cmd_parameter/index_en.md
-  deep_model/index_en.rst
-  cluster/cluster_train_en.md
+  usage/cmd_parameter/index_en.rst
+  usage/cluster/cluster_train_en.md
+  usage/k8s/k8s_en.md
+  usage/k8s/k8s_aws_en.md
 
 Development
 ------------
@@ -17,8 +18,16 @@ Development
 ..  toctree::
   :maxdepth: 1
 
-  new_layer/index_en.rst
-  contribute_to_paddle_en.md
+  dev/new_layer_en.rst
+  dev/contribute_to_paddle_en.md
+
+Configuration
+-------------
+
+..  toctree::
+  :maxdepth: 1
+
+  deep_model/rnn/index_en.rst
 
 Optimization
 -------------
@@ -26,4 +35,4 @@ Optimization
 ..  toctree::
   :maxdepth: 1
 
-  optimization/index_en.rst
+  optimization/gpu_profiling_en.rst
diff --git a/doc/howto/optimization/gpu_profiling_cn.rst b/doc/howto/optimization/gpu_profiling_cn.rst
new file mode 100644
index 0000000000..e2b0b0396e
--- /dev/null
+++ b/doc/howto/optimization/gpu_profiling_cn.rst
@@ -0,0 +1,242 @@
+==================
+GPU性能分析与调优
+==================
+
+..  contents::
+
+此教程将向您分步介绍如何使用内置的定时工具、 **nvprof** 或 **nvvp** 来运行性能分析和调优。
+
+- 什么是性能分析？
+- 为什么需要性能分析？
+- 如何进行性能分析？
+- 性能分析工具介绍
+- 详细教程
+- 性能分析小技巧
+
+什么是性能分析？
+================
+在软件工程的范畴里，性能分析（Profiling）是一个动态程序分析的术语，它可以指测量一个程序的空间（内存）复杂度或时间复杂度，
+也可以说是某些特定指令的使用情况，或者是函数调用的频率和耗时等。通常情况下，分析得到的信息用于协助进行程序的优化。
+
+简单来说，性能分析工具是用于给应用程序的性能做定量分析的。如果想很好的理解程序的行为，那程序分析工具是必不可少的利器。简单的性能分析，可以告诉您某个操作到底花了多长时间？而更深入的分析，甚至能解释为什么某个操作花了很长时间？
+
+为什么需要性能分析？
+============================
+训练好一个深层神经网络通常要耗费非常长的时间，所以性能也就逐步变成了深度学习领域最重要的指标。
+而优化性能的首要任务，是需要了解哪些步骤拖慢了整体。
+如果某一块根本就不怎么耗时，那也就不需要急着优化性能啦！
+
+如何进行性能分析？
+========================
+为了达到性能最优，您可以采用下面五个步骤：
+
+- 对代码进行性能分析
+- 找到运行慢的部分
+- 找到运行慢的原因
+- 修改成更快的版本
+- 再次对代码进行性能分析
+
+Usually, a processor has two key performance limits: floating-point throughput and
+memory throughput. A GPU also needs a high degree of parallelism to fulfill its potential.
+This is why GPUs can be so fast.
+
+通常情况下，处理器有两个关键性能限制：一个是浮点计算量，另一个是内存操作量。
+GPU则还需要高并行性，才能发挥其全部能力。这正是它们速度快的原因。
+
+性能分析工具介绍
+======================
+就通常的GPU性能分析来说，市面上已经有NVIDIA或第三方提供的众多工具。
+
+**nvprof** 是Nvidia性能分析工具， **nvvp** 则是带GUI的Nvidia可视化性能分析工具。
+在这个教程中，我们主要会介绍nvprof和nvvp。
+
+:code:`test_GpuProfiler` from :code:`paddle/math/tests` directory will be used to evaluate
+above profilers.
+
+:code:`paddle/math/tests` 目录中的 :code:`test_GpuProfiler` 就是用于展示上述分析工具的用法。
+
+.. literalinclude:: ../../../paddle/math/tests/test_GpuProfiler.cpp
+   :language: c++
+   :lines: 137-151
+   :linenos:
+
+上述的代码片段包含了两种方法，您可以任意使用一个或两个来对感兴趣的代码段做性能分析。
+
+1. :code:`REGISTER_TIMER_INFO` 是一个内置的定时器封装，可以用来计算CPU函数或cuda内核的时间消耗。
+
+2. :code:`REGISTER_GPU_PROFILER` is a general purpose wrapper object of :code:`cudaProfilerStart` and :code:`cudaProfilerStop` to avoid
+program crashes when CPU version of PaddlePaddle invokes them.
+
+3. :code:`REGISTER_GPU_PROFILER` 是一个封装对象，封装了 :code:`cudaProfilerStart` 和 :code:`cudaProfilerStop` 两个操作；同时其内部实现可以避免纯CPU版本PaddlePaddle在执行本语句时发生崩溃。
+
+您会在接下来的部分中获得更多的细节介绍。
+
+详细教程
+============
+
+内置定时器
+------------
+
+如果想要启用PaddlePaddle的内置定时器，您首先需要在相关代码段中加入 :code:`REGISTER_TIMER_INFO`。
+接下来就可以使用 :code:`printStatus` 或者 :code:`printAllStatus` 函数来将信息输出到界面中。
+下面举个简单的例子：
+
+1. 加入 :code:`REGISTER_TIMER_INFO` 和 :code:`printAllStatus` 函数（如高亮部分）。
+
+    .. literalinclude:: ../../../paddle/math/tests/test_GpuProfiler.cpp
+        :language: c++
+        :lines: 137-151
+        :emphasize-lines: 8-12,14
+        :linenos:
+
+2. cmake配置中将 **WITH_TIMER** 打开，重新编译PaddlePaddle。
+
+    .. code-block:: bash
+
+        cmake .. -DWITH_TIMER=ON
+        make
+
+3. 执行您的代码，并观察结果（如高亮部分）。
+
+    .. code-block:: bash
+        :emphasize-lines: 1,12-15
+
+        > ./paddle/math/tests/test_GpuProfiler
+        I1117 11:13:42.313065 2522362816 Util.cpp:155] commandline: ./paddle/math/tests/test_GpuProfiler
+        I1117 11:13:42.845065 2522362816 Util.cpp:130] Calling runInitFunctions
+        I1117 11:13:42.845208 2522362816 Util.cpp:143] Call runInitFunctions done.
+        [==========] Running 1 test from 1 test case.
+        [----------] Global test environment set-up.
+        [----------] 1 test from Profiler
+        [ RUN      ] Profiler.BilinearFwdBwd
+        I1117 11:13:42.845310 2522362816 test_GpuProfiler.cpp:114] Enable GPU Profiler Stat: [testBilinearFwdBwd] "numSamples = 10, channels = 16, im
+        gSizeX = 64, imgSizeY = 64"
+        I1117 11:13:42.850154 2522362816 ThreadLocal.cpp:37] thread use undeterministic rand seed:20659751
+        I1117 11:13:42.981501 2522362816 Stat.cpp:130] ======= StatSet: [GlobalStatInfo] status ======
+        I1117 11:13:42.981539 2522362816 Stat.cpp:133] Stat=testBilinearFwdBwd     total=136.141    avg=136.141    max=136.141    min=136.141   count=1
+        I1117 11:13:42.981572 2522362816 Stat.cpp:141] ======= BarrierStatSet status ======
+        I1117 11:13:42.981575 2522362816 Stat.cpp:154] --------------------------------------------------
+        [       OK ] Profiler.BilinearFwdBwd (136 ms)
+        [----------] 1 test from Profiler (136 ms total)
+
+        [----------] Global test environment tear-down
+        [==========] 1 test from 1 test case ran. (136 ms total)
+        [  PASSED  ] 1 test.
+
+nvprof 工具
+----------------
+
+要使用命令行分析工具 **nvprof**，您按如下步骤操作即可：
+
+1. 将 :code:`REGISTER_GPU_PROFILER` 函数加到代码中（参考强调部分）。
+
+    .. literalinclude:: ../../../paddle/math/tests/test_GpuProfiler.cpp
+        :language: c++
+        :lines: 137-151
+        :emphasize-lines: 6-7
+        :linenos:
+
+2. cmake中将 **WITH_PROFILER** 配置打开，重新编译PaddlePaddle。
+
+    .. code-block:: bash
+
+        cmake .. -DWITH_PROFILER=ON
+        make
+
+3. 使用 **nvprof** 来分析执行文件。
+
+    .. code-block:: bash
+
+        nvprof  ./paddle/math/tests/test_GpuProfiler
+
+然后，您就能获得如下的分析结果：
+
+.. code-block:: bash
+
+    ==78544== Profiling application: ./paddle/math/tests/test_GpuProfiler
+    ==78544== Profiling result:
+    Time(%)     Time     Calls       Avg       Min       Max  Name
+    27.60%  9.6305ms         5  1.9261ms  3.4560us  6.4035ms  [CUDA memcpy HtoD]
+    26.07%  9.0957ms         1  9.0957ms  9.0957ms  9.0957ms  KeBilinearInterpBw
+    23.78%  8.2977ms         1  8.2977ms  8.2977ms  8.2977ms  KeBilinearInterpFw
+    22.55%  7.8661ms         2  3.9330ms  1.5798ms  6.2863ms  [CUDA memcpy DtoH]
+
+    ==78544== API calls:
+    Time(%)     Time     Calls       Avg       Min       Max  Name
+    46.85%  682.28ms         8  85.285ms  12.639us  682.03ms  cudaStreamCreateWithFlags
+    39.83%  580.00ms         4  145.00ms     302ns  550.27ms  cudaFree
+    9.82%   143.03ms         9  15.892ms  8.7090us  142.78ms  cudaStreamCreate
+    1.23%   17.983ms         7  2.5690ms  23.210us  6.4563ms  cudaMemcpy
+    1.23%   17.849ms         2  8.9247ms  8.4726ms  9.3768ms  cudaStreamSynchronize
+    0.66%   9.5969ms         7  1.3710ms  288.43us  2.4279ms  cudaHostAlloc
+    0.13%   1.9530ms        11  177.54us  7.6810us  591.06us  cudaMalloc
+    0.07%   1.0424ms         8  130.30us  1.6970us  453.72us  cudaGetDevice
+    0.04%   527.90us        40  13.197us     525ns  253.99us  cudaEventCreateWithFlags
+    0.03%   435.73us       348  1.2520us     124ns  42.704us  cuDeviceGetAttribute
+    0.03%   419.36us         1  419.36us  419.36us  419.36us  cudaGetDeviceCount
+    0.02%   260.75us         2  130.38us  129.32us  131.43us  cudaGetDeviceProperties
+    0.02%   222.32us         2  111.16us  106.94us  115.39us  cudaLaunch
+    0.01%   214.06us         4  53.514us  28.586us  77.655us  cuDeviceGetName
+    0.01%   115.45us         4  28.861us  9.8250us  44.526us  cuDeviceTotalMem
+    0.01%   83.988us         4  20.997us     578ns  77.760us  cudaSetDevice
+    0.00%   38.918us         1  38.918us  38.918us  38.918us  cudaEventCreate
+    0.00%   34.573us        31  1.1150us     279ns  12.784us  cudaDeviceGetAttribute
+    0.00%   17.767us         1  17.767us  17.767us  17.767us  cudaProfilerStart
+    0.00%   15.228us         2  7.6140us  3.5460us  11.682us  cudaConfigureCall
+    0.00%   14.536us         2  7.2680us  1.1490us  13.387us  cudaGetLastError
+    0.00%   8.6080us        26     331ns     173ns     783ns  cudaSetupArgument
+    0.00%   5.5470us         6     924ns     215ns  2.6780us  cuDeviceGet
+    0.00%   5.4090us         6     901ns     328ns  3.3320us  cuDeviceGetCount
+    0.00%   4.1770us         3  1.3920us  1.0630us  1.8300us  cuDriverGetVersion
+    0.00%   3.4650us         3  1.1550us  1.0810us  1.2680us  cuInit
+    0.00%      830ns         1     830ns     830ns     830ns  cudaRuntimeGetVersion
+
+
+nvvp 工具
+--------------
+
+如果想使用可视化的分析器 **nvvp**，您可以导入 :code:`nvprof -o ...` 的输出，或者从工具的界面里运行您的应用。
+
+**备注: nvvp 也支持CPU的性能分析** （需在nvvp界面中选上才能开启）
+
+..  image:: nvvp1.png
+    :align: center
+    :scale: 33%
+
+从内核函数的角度， **nvvp** 可以精确说明一个长耗时操作的具体原因。
+同时，如下图所示， **nvvp** 的内核block使用情况、寄存器使用情况和共享内存使用情况能让我们对GPU的整体使用有更好的理解。
+
+
+..  image:: nvvp2.png
+    :align: center
+    :scale: 33%
+
+而从应用的角度， **nvvp** 可以帮您提供一些定位性能瓶颈的建议。
+例如，下图中就展示了一些关于内存数据迁徙和计算资源利用率的建议，为您做性能调优提供了方向。
+
+..  image:: nvvp3.png
+    :align: center
+    :scale: 33%
+
+..  image:: nvvp4.png
+    :align: center
+    :scale: 33%
+
+性能分析小技巧
+==================
+
+- 开始阶段，从 **nvprof** 和 **nvvp** 的输出信息入手是个不错的选择。
+- 接下来可以考虑下时间线的分析。
+- 如果真想挖掘内核深处的某个秘密，您最好先确认：这一块的耗时比例真的太高，值得深入分析。
+- 可能的情况下，试着让输出的分析数据和理论值对应。
+
+    1) 例如，如果我知道内核花了10ms来移动1GB数据，那我会期望分析工具统计到速度是100GB/s。
+    2) 若有不一致之处，很有可能实际应用就是没有按照您的预期情况运行。
+- 了解您的硬件：如果您的GPU理论可以达到6 TFLOPs（6万亿次浮点运算每秒），而当前已经有5.5 TFLOPs了，那估计这里的潜力就没啥好挖的了……
+
+性能分析是性能优化的关键一步。有的时候简简单单的改变就能在性能上产生明显的优化效果！
+当然，具体情况因人而异。
+
+参考资料
+===========
+Jeremy Appleyard, `GPU Profiling for Deep Learning <http://www.robots.ox.ac.uk/~seminars/seminars/Extra/2015_10_08_JeremyAppleyard.pdf>`_, 2015
diff --git a/doc/howto/optimization/gpu_profiling_en.rst b/doc/howto/optimization/gpu_profiling_en.rst
index 40ba698f4e..ed208ceaf7 100644
--- a/doc/howto/optimization/gpu_profiling_en.rst
+++ b/doc/howto/optimization/gpu_profiling_en.rst
@@ -1,5 +1,8 @@
-Profiling on PaddlePaddle
-=========================
+====================
+Tune GPU Performance 
+====================
+
+..  contents::
 
 This tutorial will guide you step-by-step through how to conduct profiling and performance tuning using built-in timer, **nvprof** and **nvvp**.
 
@@ -49,11 +52,11 @@ For general GPU profiling, a bunch of tools are provided from both NVIDIA and th
 In this tutorial, we will focus on nvprof and nvvp.
 
 :code:`test_GpuProfiler` from :code:`paddle/math/tests` directory will be used to evaluate
-above profilers. 
+above profilers.
 
 .. literalinclude:: ../../../paddle/math/tests/test_GpuProfiler.cpp
    :language: c++
-   :lines: 111-124
+   :lines: 137-151
    :linenos:
 
 The above code snippet includes two methods, you can use any of them to profile the regions of interest.
@@ -79,8 +82,8 @@ As a simple example, consider the following:
 
     .. literalinclude:: ../../../paddle/math/tests/test_GpuProfiler.cpp
         :language: c++
-        :lines: 111-124
-        :emphasize-lines: 8-10,13
+        :lines: 137-151
+        :emphasize-lines: 8-12,14
         :linenos:
 
 2. Configure cmake with **WITH_TIMER** and recompile PaddlePaddle.
@@ -90,31 +93,31 @@ As a simple example, consider the following:
         cmake .. -DWITH_TIMER=ON
         make
 
-3. Execute your code and observe the results (see the emphasize-lines). 
+3. Execute your code and observe the results (see the emphasize-lines).
 
     .. code-block:: bash
         :emphasize-lines: 1,12-15
 
-        > ./paddle/math/tests/test_GpuProfiler                                                                             
-        I1117 11:13:42.313065 2522362816 Util.cpp:155] commandline: ./paddle/math/tests/test_GpuProfiler                                             
-        I1117 11:13:42.845065 2522362816 Util.cpp:130] Calling runInitFunctions                                                                      
-        I1117 11:13:42.845208 2522362816 Util.cpp:143] Call runInitFunctions done.                                                                   
-        [==========] Running 1 test from 1 test case.                                                                                                
-        [----------] Global test environment set-up.                                                                                                 
-        [----------] 1 test from Profiler                                                                                                            
-        [ RUN      ] Profiler.BilinearFwdBwd                                                                                                         
+        > ./paddle/math/tests/test_GpuProfiler
+        I1117 11:13:42.313065 2522362816 Util.cpp:155] commandline: ./paddle/math/tests/test_GpuProfiler
+        I1117 11:13:42.845065 2522362816 Util.cpp:130] Calling runInitFunctions
+        I1117 11:13:42.845208 2522362816 Util.cpp:143] Call runInitFunctions done.
+        [==========] Running 1 test from 1 test case.
+        [----------] Global test environment set-up.
+        [----------] 1 test from Profiler
+        [ RUN      ] Profiler.BilinearFwdBwd
         I1117 11:13:42.845310 2522362816 test_GpuProfiler.cpp:114] Enable GPU Profiler Stat: [testBilinearFwdBwd] "numSamples = 10, channels = 16, im
-        gSizeX = 64, imgSizeY = 64"                                                                                                                  
-        I1117 11:13:42.850154 2522362816 ThreadLocal.cpp:37] thread use undeterministic rand seed:20659751                                           
-        I1117 11:13:42.981501 2522362816 Stat.cpp:130] ======= StatSet: [GlobalStatInfo] status ======                                               
-        I1117 11:13:42.981539 2522362816 Stat.cpp:133] Stat=testBilinearFwdBwd     total=136.141    avg=136.141    max=136.141    min=136.141   count=1                                                                                                                                  
-        I1117 11:13:42.981572 2522362816 Stat.cpp:141] ======= BarrierStatSet status ======                                                          
-        I1117 11:13:42.981575 2522362816 Stat.cpp:154] --------------------------------------------------                                            
-        [       OK ] Profiler.BilinearFwdBwd (136 ms)                                                                                                
-        [----------] 1 test from Profiler (136 ms total)                                                                                             
-                                                                                                                                                    
-        [----------] Global test environment tear-down                                                                                               
-        [==========] 1 test from 1 test case ran. (136 ms total)                                                                                     
+        gSizeX = 64, imgSizeY = 64"
+        I1117 11:13:42.850154 2522362816 ThreadLocal.cpp:37] thread use undeterministic rand seed:20659751
+        I1117 11:13:42.981501 2522362816 Stat.cpp:130] ======= StatSet: [GlobalStatInfo] status ======
+        I1117 11:13:42.981539 2522362816 Stat.cpp:133] Stat=testBilinearFwdBwd     total=136.141    avg=136.141    max=136.141    min=136.141   count=1
+        I1117 11:13:42.981572 2522362816 Stat.cpp:141] ======= BarrierStatSet status ======
+        I1117 11:13:42.981575 2522362816 Stat.cpp:154] --------------------------------------------------
+        [       OK ] Profiler.BilinearFwdBwd (136 ms)
+        [----------] 1 test from Profiler (136 ms total)
+
+        [----------] Global test environment tear-down
+        [==========] 1 test from 1 test case ran. (136 ms total)
         [  PASSED  ] 1 test.
 
 nvprof profiler
@@ -126,7 +129,7 @@ To use this command line profiler **nvprof**, you can simply issue the following
 
     .. literalinclude:: ../../../paddle/math/tests/test_GpuProfiler.cpp
         :language: c++
-        :lines: 111-124
+        :lines: 137-151
         :emphasize-lines: 6-7
         :linenos:
 
@@ -147,42 +150,42 @@ Then, you can get the following profiling result:
 
 .. code-block:: bash
 
-    ==78544== Profiling application: ./paddle/math/tests/test_GpuProfiler                                                                                                      
-    ==78544== Profiling result:                                                                                                                                                
-    Time(%)     Time     Calls       Avg       Min       Max  Name                                                                                                            
-    27.60%  9.6305ms         5  1.9261ms  3.4560us  6.4035ms  [CUDA memcpy HtoD]                                                                                              
-    26.07%  9.0957ms         1  9.0957ms  9.0957ms  9.0957ms  KeBilinearInterpBw                                                                                            
-    23.78%  8.2977ms         1  8.2977ms  8.2977ms  8.2977ms  KeBilinearInterpFw                                                                                        
-    22.55%  7.8661ms         2  3.9330ms  1.5798ms  6.2863ms  [CUDA memcpy DtoH]                                                                                              
-                                                                                                                                                                            
-    ==78544== API calls:                                                                                                                                                       
-    Time(%)     Time     Calls       Avg       Min       Max  Name                                                                                                            
-    46.85%  682.28ms         8  85.285ms  12.639us  682.03ms  cudaStreamCreateWithFlags                                                                                       
-    39.83%  580.00ms         4  145.00ms     302ns  550.27ms  cudaFree                                                                                                        
-    9.82%   143.03ms         9  15.892ms  8.7090us  142.78ms  cudaStreamCreate                                                                                                
-    1.23%   17.983ms         7  2.5690ms  23.210us  6.4563ms  cudaMemcpy                                                                                                      
-    1.23%   17.849ms         2  8.9247ms  8.4726ms  9.3768ms  cudaStreamSynchronize                                                                                           
-    0.66%   9.5969ms         7  1.3710ms  288.43us  2.4279ms  cudaHostAlloc                                                                                                   
-    0.13%   1.9530ms        11  177.54us  7.6810us  591.06us  cudaMalloc                                                                                                      
-    0.07%   1.0424ms         8  130.30us  1.6970us  453.72us  cudaGetDevice                                                                                                   
-    0.04%   527.90us        40  13.197us     525ns  253.99us  cudaEventCreateWithFlags                                                                                        
-    0.03%   435.73us       348  1.2520us     124ns  42.704us  cuDeviceGetAttribute                                                                                            
-    0.03%   419.36us         1  419.36us  419.36us  419.36us  cudaGetDeviceCount                                                                                              
-    0.02%   260.75us         2  130.38us  129.32us  131.43us  cudaGetDeviceProperties                                                                                         
-    0.02%   222.32us         2  111.16us  106.94us  115.39us  cudaLaunch                                                                                                      
-    0.01%   214.06us         4  53.514us  28.586us  77.655us  cuDeviceGetName                                                                                                 
-    0.01%   115.45us         4  28.861us  9.8250us  44.526us  cuDeviceTotalMem                                                                                                
-    0.01%   83.988us         4  20.997us     578ns  77.760us  cudaSetDevice                                                                                                   
-    0.00%   38.918us         1  38.918us  38.918us  38.918us  cudaEventCreate                                                                                                 
-    0.00%   34.573us        31  1.1150us     279ns  12.784us  cudaDeviceGetAttribute                                                                                          
-    0.00%   17.767us         1  17.767us  17.767us  17.767us  cudaProfilerStart                                                                                               
-    0.00%   15.228us         2  7.6140us  3.5460us  11.682us  cudaConfigureCall                                                                                               
-    0.00%   14.536us         2  7.2680us  1.1490us  13.387us  cudaGetLastError                                                                                                
-    0.00%   8.6080us        26     331ns     173ns     783ns  cudaSetupArgument                                                                                               
-    0.00%   5.5470us         6     924ns     215ns  2.6780us  cuDeviceGet                                                                                                     
-    0.00%   5.4090us         6     901ns     328ns  3.3320us  cuDeviceGetCount                                                                                                
-    0.00%   4.1770us         3  1.3920us  1.0630us  1.8300us  cuDriverGetVersion                                                                                              
-    0.00%   3.4650us         3  1.1550us  1.0810us  1.2680us  cuInit                                                                                                          
+    ==78544== Profiling application: ./paddle/math/tests/test_GpuProfiler
+    ==78544== Profiling result:
+    Time(%)     Time     Calls       Avg       Min       Max  Name
+    27.60%  9.6305ms         5  1.9261ms  3.4560us  6.4035ms  [CUDA memcpy HtoD]
+    26.07%  9.0957ms         1  9.0957ms  9.0957ms  9.0957ms  KeBilinearInterpBw
+    23.78%  8.2977ms         1  8.2977ms  8.2977ms  8.2977ms  KeBilinearInterpFw
+    22.55%  7.8661ms         2  3.9330ms  1.5798ms  6.2863ms  [CUDA memcpy DtoH]
+
+    ==78544== API calls:
+    Time(%)     Time     Calls       Avg       Min       Max  Name
+    46.85%  682.28ms         8  85.285ms  12.639us  682.03ms  cudaStreamCreateWithFlags
+    39.83%  580.00ms         4  145.00ms     302ns  550.27ms  cudaFree
+    9.82%   143.03ms         9  15.892ms  8.7090us  142.78ms  cudaStreamCreate
+    1.23%   17.983ms         7  2.5690ms  23.210us  6.4563ms  cudaMemcpy
+    1.23%   17.849ms         2  8.9247ms  8.4726ms  9.3768ms  cudaStreamSynchronize
+    0.66%   9.5969ms         7  1.3710ms  288.43us  2.4279ms  cudaHostAlloc
+    0.13%   1.9530ms        11  177.54us  7.6810us  591.06us  cudaMalloc
+    0.07%   1.0424ms         8  130.30us  1.6970us  453.72us  cudaGetDevice
+    0.04%   527.90us        40  13.197us     525ns  253.99us  cudaEventCreateWithFlags
+    0.03%   435.73us       348  1.2520us     124ns  42.704us  cuDeviceGetAttribute
+    0.03%   419.36us         1  419.36us  419.36us  419.36us  cudaGetDeviceCount
+    0.02%   260.75us         2  130.38us  129.32us  131.43us  cudaGetDeviceProperties
+    0.02%   222.32us         2  111.16us  106.94us  115.39us  cudaLaunch
+    0.01%   214.06us         4  53.514us  28.586us  77.655us  cuDeviceGetName
+    0.01%   115.45us         4  28.861us  9.8250us  44.526us  cuDeviceTotalMem
+    0.01%   83.988us         4  20.997us     578ns  77.760us  cudaSetDevice
+    0.00%   38.918us         1  38.918us  38.918us  38.918us  cudaEventCreate
+    0.00%   34.573us        31  1.1150us     279ns  12.784us  cudaDeviceGetAttribute
+    0.00%   17.767us         1  17.767us  17.767us  17.767us  cudaProfilerStart
+    0.00%   15.228us         2  7.6140us  3.5460us  11.682us  cudaConfigureCall
+    0.00%   14.536us         2  7.2680us  1.1490us  13.387us  cudaGetLastError
+    0.00%   8.6080us        26     331ns     173ns     783ns  cudaSetupArgument
+    0.00%   5.5470us         6     924ns     215ns  2.6780us  cuDeviceGet
+    0.00%   5.4090us         6     901ns     328ns  3.3320us  cuDeviceGetCount
+    0.00%   4.1770us         3  1.3920us  1.0630us  1.8300us  cuDriverGetVersion
+    0.00%   3.4650us         3  1.1550us  1.0810us  1.2680us  cuInit
     0.00%      830ns         1     830ns     830ns     830ns  cudaRuntimeGetVersion
 
 
diff --git a/doc/howto/optimization/index_en.rst b/doc/howto/optimization/index_en.rst
deleted file mode 100644
index 1e2f16b5da..0000000000
--- a/doc/howto/optimization/index_en.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-How to Tune GPU Performance
-===========================
-
-.. toctree::
-  :maxdepth: 3
-
-  gpu_profiling_en.rst
diff --git a/doc/howto/usage/cluster/cluster_train_cn.md b/doc/howto/usage/cluster/cluster_train_cn.md
new file mode 100644
index 0000000000..acdcfa1c00
--- /dev/null
+++ b/doc/howto/usage/cluster/cluster_train_cn.md
@@ -0,0 +1,159 @@
+```eval_rst
+.. _cluster_train:
+```
+
+# 运行分布式训练
+
+在本文中，我们将阐释如何在集群上运行分布式 Paddle 训练作业。我们将以[推荐系统](https://github.com/baidu/Paddle/tree/develop/demo/recommendation)为例，创建这个单进程训练示例的分布式版本。
+
+在本文中使用的[脚本](https://github.com/baidu/Paddle/tree/develop/paddle/scripts/cluster_train)通过 SSH 运行分布式作业。 它们还可以供那些运行更复杂的集群管理系统（如 MPI 和 [Kubernetes](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/howto/usage/k8s) ）的用户参考。
+
+## 前提条件
+
+1. 上述脚本使用 Python 库 [fabric](http://www.fabfile.org/) 来运行 SSH 命令。 我们使用 `pip` 来安装 fabric:
+
+   ```bash
+   pip install fabric
+   ```
+
+2. 我们需要在集群的所有节点上安装 PaddlePaddle。 如果要启用GPU，需要在 `/usr/local/cuda` 中安装 CUDA; 否则 Paddle 将在运行时报错。
+
+3. 在 `cluster_train/conf.py` 中设置 `ROOT_DIR`， 该 ROOT_DIR 要在所有节点上存在。为了方便起见，我们通常在所有节点上创建一个 Unix 用户 `paddle`，并设置 `ROOT_DIR=/home/paddle`。这样，我们可以将 SSH 公钥写入 `/home/paddle/.ssh/authorized_keys`，以便用户 `paddle` 可以 SSH 到所有节点而不用密码。
+
+## 准备工作空间
+
+我们将放置依赖库、配置等文件的目录视为 *工作空间（workspace）*。
+
+这些 `train/test` 数据应该在启动集群作业之前准备好。 为了满足训练/测试数据可以放置在与工作空间不同的目录中的要求，PADDLE 根据在模型配置文件中使用的名为 `train.list/test.list` 的索引文件引用训练/测试数据，所以训练/测试数据也包含 train.list/test.list 两个列表文件。所有本地训练 demo 已经提供了脚本来帮助您创建这两个文件，并且在正常情况下，集群作业中的所有节点将使用相同的逻辑代码处理这些文件。
+
+通常，你可以使用本地训练中的相同模型文件进行集群训练。请记住，在模型文件的 `setting`函数中设置的 `batch_size` 表示在集群作业**每个**节点中的 batch 大小，而不是使用同步 SGD 的总 batch 大小。
+
+以下步骤基于 demo 目录中的 [demo/recommendation](https://github.com/PaddlePaddle/Paddle/tree/develop/demo/recommendation)。
+
+你只需完成 demo/recommendation 教程文档到 `Train` 的部分，之后你会得到训练/测试数据和模型配置文件。最后，只需使用 demo/recommendation 作为集群训练的工作空间。
+
+最后，你的工作空间应如下所示：
+```
+.
+|-- common_utils.py
+|-- data
+|   |-- config.json
+|   |-- config_generator.py
+|   |-- meta.bin
+|   |-- meta_config.json
+|   |-- meta_generator.py
+|   |-- ml-1m
+|   |-- ml_data.sh
+|   |-- ratings.dat.test
+|   |-- ratings.dat.train
+|   |-- split.py
+|   |-- test.list
+|   `-- train.list
+|-- dataprovider.py
+|-- evaluate.sh
+|-- prediction.py
+|-- preprocess.sh
+|-- requirements.txt
+|-- run.sh
+`-- trainer_config.py
+```
+虽然集群训练并不需要其中的所有文件，但是也没有必要删除无用的文件。
+
+`trainer_config.py`
+表示模型配置文件。
+
+`train.list` 和 `test.list`
+文件索引。它存储当前节点所有训练/测试数据的所有相对或绝对文件路径。
+
+`dataprovider.py`
+用于读取训练/测试样本。这与本地训练相同。
+
+`data`
+数据目录中的所有文件被 train.list/test.list 引用。
+
+
+## 准备集群作业配置
+
+以下选项必须在 cluster_train/conf.py 中认真设置
+
+`HOSTS`  所有节点运行集群作业的主机名或 IP 。你还可以将用户和 ssh 端口附加到主机名上，例如 root@192.168.100.17:9090。
+
+`ROOT_DIR` 用于放置 JOB 工作空间目录的工作空间 ROOT 目录
+
+`PADDLE_NIC` 集群通信通道的 NIC(Network Interface Card, 网络接口卡) 接口名称，例如以太网的 eth0，infiniband 的 ib0。
+
+`PADDLE_PORT` 集群通信通道的端口号
+
+`PADDLE_PORTS_NUM` 用于集群通信通道的端口数。 如果集群节点数量少（少于5〜6个节点），建议将其设置为较大，如2〜8，以获得更好的网络性能。
+
+`PADDLE_PORTS_NUM_FOR_SPARSE` 用于 sparse remote updater 集群通信信道的端口数。如果使用 sparse remote update，则可以像 `PADDLE_PORTS_NUM` 一样设置。
+
+`LD_LIBRARY_PATH` 为集群作业设置额外的 LD_LIBRARY_PATH。你可以使用它来设置 CUDA 库的路径。
+
+默认配置如下：
+
+```python
+HOSTS = [
+        "root@192.168.100.17",
+        "root@192.168.100.18",
+        ]
+
+'''
+工作空间配置
+'''
+
+#工作空间根目录
+ROOT_DIR = "/home/paddle"
+
+'''
+网络配置
+'''
+#pserver NIC
+PADDLE_NIC = "eth0"
+#pserver 端口
+PADDLE_PORT = 7164
+#pserver 端口数
+PADDLE_PORTS_NUM = 2
+#pserver sparse ports num
+PADDLE_PORTS_NUM_FOR_SPARSE = 2
+
+#集群作业中所有进程的环境设置
+LD_LIBRARY_PATH="/usr/local/cuda/lib64:/usr/lib64"
+```
+
+### 启动集群作业
+`paddle.py` 提供了自动化脚本来启动不同节点中的所有 PaddlePaddle 集群进程。默认情况下，所有命令行选项可以设置为```paddle.py``` 命令选项并且 `paddle.py` 将透明、自动地将这些选项应用到 PaddlePaddle 底层进程。
+
+`paddle.py` 为方便作业启动提供了两个独特的命令选项。
+
+`job_dispatch_package`  设为本地 `workspace` 目录，它将被分发到 conf.py 中设置的所有节点。  它有助于帮助频繁修改和访问工作区文件的用户减少负担，否则频繁的多节点工作空间部署可能会很麻烦。
+`job_workspace`  设为已部署的工作空间目录，`paddle.py` 将跳过分发阶段直接启动所有节点的集群作业。它可以帮助减少分发延迟。
+
+`cluster_train/run.sh` 提供了命令样例来运行 `demo/recommendation` 集群工作，只需用你定义的目录修改 `job_dispatch_package` 和 `job_workspace`，然后：
+```
+sh run.sh
+```
+
+集群作业将会在几秒后启动。
+
+### 终止集群作业
+`paddle.py`能获取`Ctrl + C` SIGINT 信号来自动终止它启动的所有进程。只需中断 `paddle.py` 任务来终止集群作业。如果程序崩溃你也可以手动终止。
+
+### 检查集群训练结果
+详细信息请检查 $workspace/log 里的日志，每一个节点都有相同的日志结构。
+
+`paddle_trainer.INFO`
+提供几乎所有训练的内部输出日志，与本地训练相同。这里检验运行时间模型的收敛。
+
+`paddle_pserver2.INFO`
+提供 pserver 运行日志，有助于诊断分布式错误。
+
+`server.log`
+提供 pserver 进程的 stderr 和 stdout。训练失败时可以检查错误日志。
+
+`train.log`
+提供训练过程的 stderr 和 stdout。训练失败时可以检查错误日志。
+
+### 检查模型输出
+运行完成后，模型文件将被写入节点 0 的 `output` 目录中。
+工作空间中的 `nodefile` 表示当前集群作业的节点 ID。
diff --git a/doc/howto/usage/cluster/cluster_train_en.md b/doc/howto/usage/cluster/cluster_train_en.md
new file mode 100644
index 0000000000..30963dcd92
--- /dev/null
+++ b/doc/howto/usage/cluster/cluster_train_en.md
@@ -0,0 +1,156 @@
+# Run Distributed Training
+
+In this article, we explain how to run distributed Paddle training jobs on clusters.  We will create the distributed version of the single-process training example, [recommendation](https://github.com/baidu/Paddle/tree/develop/demo/recommendation).
+
+[Scripts](https://github.com/baidu/Paddle/tree/develop/paddle/scripts/cluster_train) used in this article launch distributed jobs via SSH.  They also work as a reference for users running more sophisticated cluster management systems like MPI and [Kubernetes](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/howto/usage/cluster/k8s).
+
+## Prerequisite
+
+1. Aforementioned scripts use a Python library [fabric](http://www.fabfile.org/) to run SSH commands.  We can use `pip` to install fabric:
+
+   ```bash
+   pip install fabric
+   ```
+
+1. We need to install PaddlePaddle on all nodes in the cluster.  To enable GPUs, we need to install CUDA in `/usr/local/cuda`; otherwise Paddle would report errors at runtime.
+
+1. Set the `ROOT_DIR` variable in `cluster_train/conf.py` on all nodes.  For convenience, we often create a Unix user `paddle` on all nodes and set `ROOT_DIR=/home/paddle`.  In this way, we can write public SSH keys into `/home/paddle/.ssh/authorized_keys` so that user `paddle` can SSH to all nodes without a password.
+
+## Prepare Job Workspace
+
+We refer to the directory where we put dependent libraries, config files, etc., as *workspace*.
+
+The `train/test` data should be prepared before launching the cluster job. To satisfy the requirement that train/test data may be placed in a directory different from the workspace, PaddlePaddle locates train/test data through index files named `train.list` and `test.list`, which are referenced in the model config file. So the train/test data also includes these two list files, `train.list` and `test.list`. All local training demos already provide scripts to help you create these two files, and under normal conditions all nodes in the cluster job handle the files with the same code logic.
+
+Generally, you can use the same model file from local training for cluster training. What you should keep in mind is that the `batch_size` set in the `setting` function in the model file means the batch size on `each` node of the cluster job, instead of the total batch size, if synchronous SGD is used.
+
+Following steps are based on [demo/recommendation](https://github.com/PaddlePaddle/Paddle/tree/develop/demo/recommendation) demo in demo directory.
+
+Just go through the demo/recommendation tutorial doc until the `Train` section, and at the end you will get the train/test data and the model configuration file. Finally, just use demo/recommendation as the workspace for cluster training.
+
+In the end, your workspace should look as follows:
+```
+.
+|-- common_utils.py
+|-- data
+|   |-- config.json
+|   |-- config_generator.py
+|   |-- meta.bin
+|   |-- meta_config.json
+|   |-- meta_generator.py
+|   |-- ml-1m
+|   |-- ml_data.sh
+|   |-- ratings.dat.test
+|   |-- ratings.dat.train
+|   |-- split.py
+|   |-- test.list
+|   `-- train.list
+|-- dataprovider.py
+|-- evaluate.sh
+|-- prediction.py
+|-- preprocess.sh
+|-- requirements.txt
+|-- run.sh
+`-- trainer_config.py
+```
+Not all of these files are needed for cluster training, but it's not necessary to remove useless files.
+
+`trainer_config.py`
+Indicates the model config file.
+
+`train.list` and `test.list`
+File index. It stores all relative or absolute file paths of all train/test data at current node.
+
+`dataprovider.py`
+Used to read train/test samples. It's the same as in local training.
+
+`data`
+All files in the data directory are referred to by train.list/test.list, which are in turn referred to by the data provider.
+
+
+## Prepare Cluster Job Configuration
+
+The options below must be carefully set in cluster_train/conf.py
+
+`HOSTS`  the hostnames or IPs of all nodes that will run the cluster job. You can also add a user and SSH port to the hostname, such as root@192.168.100.17:9090.
+
+`ROOT_DIR` workspace ROOT directory for placing JOB workspace directory
+
+`PADDLE_NIC` the NIC (Network Interface Card) interface name for the cluster communication channel, such as eth0 for ethernet, ib0 for infiniband.
+
+`PADDLE_PORT` port number for the cluster communication channel
+
+`PADDLE_PORTS_NUM` the number of ports used for the cluster communication channel. If the number of cluster nodes is small (less than 5~6 nodes), we recommend setting it to a larger value, such as 2 ~ 8, for better network performance.
+
+`PADDLE_PORTS_NUM_FOR_SPARSE` the number of ports used for the sparse updater cluster communication channel. If sparse remote update is used, set it like `PADDLE_PORTS_NUM`.
+
+`LD_LIBRARY_PATH` set additional LD_LIBRARY_PATH for the cluster job. You can use it to set the CUDA libraries path.
+
+The default configuration is as follows:
+
+```python
+HOSTS = [
+        "root@192.168.100.17",
+        "root@192.168.100.18",
+        ]
+
+'''
+workspace configuration
+'''
+
+#root dir for workspace
+ROOT_DIR = "/home/paddle"
+
+'''
+network configuration
+'''
+#pserver nics
+PADDLE_NIC = "eth0"
+#pserver port
+PADDLE_PORT = 7164
+#pserver ports num
+PADDLE_PORTS_NUM = 2
+#pserver sparse ports num
+PADDLE_PORTS_NUM_FOR_SPARSE = 2
+
+#environments setting for all processes in cluster job
+LD_LIBRARY_PATH="/usr/local/cuda/lib64:/usr/lib64"
+```
+
+### Launching Cluster Job
+`paddle.py` provides automated scripts to start all PaddlePaddle cluster processes on different nodes. By default, all command line options can be set as `paddle.py` command options, and `paddle.py` will transparently and automatically pass these options to the lower-level PaddlePaddle processes.
+
+`paddle.py` provides two distinct command options for easy job launching.
+
+`job_dispatch_package`  set it to the local `workspace` directory; it will be dispatched to all nodes set in conf.py. It is helpful when you frequently modify workspace files; otherwise frequent multi-node workspace deployment could drive you crazy.
+`job_workspace`  set it to an already deployed workspace directory; `paddle.py` will skip the dispatch stage and directly launch the cluster job on all nodes. It helps to reduce heavy
+dispatch latency.
+
+`cluster_train/run.sh` provides a sample command line to run the `demo/recommendation` cluster job. Just modify `job_dispatch_package` and `job_workspace` to point to your own directories, then run:
+```
+sh run.sh
+```
+
+The cluster job will start in several seconds.
+
+### Kill Cluster Job
+`paddle.py` can capture the `Ctrl + C` SIGINT signal to automatically kill all processes launched by it. So just stop `paddle.py` to kill the cluster job. You should manually kill the job if the program crashed.
+
+### Check Cluster Training Result
+Check the logs in $workspace/log for details; each node has the same log structure.
+
+`paddle_trainer.INFO`
+It provides almost all internal output logs for training, the same as in local training. Check runtime model convergence here.
+
+`paddle_pserver2.INFO`
+It provides pserver running log, which could help to diagnose distributed error.
+
+`server.log`
+It provides stderr and stdout of the pserver process. Check the error log if training crashes.
+
+`train.log`
+It provides stderr and stdout of the trainer process. Check the error log if training crashes.
+
+### Check Model Output
+After one pass finishes, model files will be written to the `output` directory on node 0.
+`nodefile` in workspace indicates the node id of current cluster job.
diff --git a/doc/howto/usage/cmd_parameter/arguments_cn.md b/doc/howto/usage/cmd_parameter/arguments_cn.md
new file mode 100644
index 0000000000..833e21dd19
--- /dev/null
+++ b/doc/howto/usage/cmd_parameter/arguments_cn.md
@@ -0,0 +1,409 @@
+# 参数概述
+
+虽然Paddle看起来包含了众多参数，但是大部分参数是为开发者提供的，或者已经在集群提交环境中自动设置，因此用户并不需要关心它们。在此，根据这些参数的使用场合，我们将它们划分为不同的类别。例如，`通用`类别中的参数可用于所有场合。某些参数只可用于特定的层中，而有些参数需要在集群多机训练中使用等。
+
+<html>
+<table border="2" frame="border">
+<thead>
+<tr>
+<th scope="col" class="left"></th>
+<th scope="col" class="left">参数</th>
+<th scope="col" class="left">本地训练</th>
+<th scope="col" class="left">集群训练</th>
+<th scope="col" class="left">本地测试</th>
+<th scope="col" class="left">集群测试</th>
+</tr>
+</thead>
+
+<tbody>
+<tr>
+<td class="left" rowspan="9">通用</td>
+<td class="left">job</td>
+<td class="left">√</td><td class="left">√</td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">use_gpu</td>
+<td class="left">√</td><td class="left">√</td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">local</td>
+<td class="left">√</td><td class="left">√</td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">config</td>
+<td class="left">√</td><td class="left">√</td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">config_args</td>
+<td class="left">√</td><td class="left">√</td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">num_passes</td>
+<td class="left">√</td><td class="left">√</td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">trainer_count</td>
+<td class="left">√</td><td class="left">√</td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">version</td>
+<td class="left">√</td><td class="left">√</td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">show_layer_stat</td>
+<td class="left">√</td><td class="left">√</td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left" rowspan="15">训练</td><td class="left">dot_period</td>
+<td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">test_period</td>
+<td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">saving_period</td>
+<td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">show_parameter_stats_period</td>
+<td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">init_model_path</td>
+<td class="left">√</td><td class="left">√</td><td class="left">√</td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">load_missing_parameter_strategy</td>
+<td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">saving_period_by_batches</td>
+<td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">use_old_updater</td>
+<td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">enable_grad_share</td>
+<td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">grad_share_block_num</td>
+<td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">log_error_clipping</td>
+<td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">log_clipping</td>
+<td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">save_only_one</td>
+<td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">allow_inefficient_sparse_update</td>
+<td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">start_pass</td>
+<td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">训练/测试</td><td class="left">save_dir</td>
+<td class="left">√</td><td class="left">√</td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left" rowspan = "2">训练过程中测试</td><td class="left">test_period</td>
+<td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">average_test_period</td>
+<td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left" rowspan = "5">测试</td><td class="left">model_list</td>
+<td class="left"></td><td class="left"></td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">test_wait</td>
+<td class="left"></td><td class="left"></td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">test_pass</td>
+<td class="left"></td><td class="left"></td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">predict_output_dir</td>
+<td class="left"></td><td class="left"></td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">distribute_test</td>
+<td class="left"></td><td class="left"></td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">Auc/正负对验证(PnpairValidation)</td><td class="left">predict_file</td>
+<td class="left"></td><td class="left"></td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left" rowspan = "6">GPU</td><td class="left">gpu_id</td>
+<td class="left">√</td><td class="left">√</td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">parallel_nn</td>
+<td class="left">√</td><td class="left">√</td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">allow_only_one_model_on_one_gpu</td>
+<td class="left">√</td><td class="left">√</td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">cudnn_dir</td>
+<td class="left">√</td><td class="left">√</td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">cuda_dir</td>
+<td class="left">√</td><td class="left">√</td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">cudnn_conv_workspace_limit_in_mb</td>
+<td class="left">√</td><td class="left">√</td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left" rowspan = "4">递归神经网络(RNN)</td>
+<td class="left">beam_size</td>
+<td class="left"></td><td class="left"></td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">rnn_use_batch</td>
+<td class="left">√</td><td class="left">√</td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">prev_batch_state</td>
+<td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">diy_beam_search_prob_so</td>
+<td class="left"></td><td class="left"></td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left" rowspan = "2">度量学习(metric learning)</td><td class="left">external</td>
+<td class="left">√</td><td class="left">√</td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">data_server_port</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left" rowspan = "16">参数服务器(PServer)</td><td class="left">start_pserver</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">pservers</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">port</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">ports_num</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">ports_num_for_sparse</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">nics</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">rdma_tcp</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">small_messages</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">loadsave_parameters_in_pserver</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">log_period_server</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">pserver_num_threads</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">sock_send_buf_size</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">sock_recv_buf_size</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">num_gradient_servers</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">parameter_block_size</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">parameter_block_size_for_sparse</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left" rowspan = "3">异步随机梯度下降(Async SGD)</td><td class="left">async_count</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">async_lagged_ratio_min</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">async_lagged_ratio_default</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left" rowspan = "8">性能调优(Performance Tuning)</td><td class="left">log_barrier_abstract</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">log_barrier_lowest_nodes</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">log_barrier_show_log</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">check_sparse_distribution_batches</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">check_sparse_distribution_ratio</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">check_sparse_distribution_unbalance_degree</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">check_sparse_distribution_in_pserver</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">show_check_sparse_distribution_log</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">数据提供器(Data Provider)</td><td class="left">memory_threshold_on_load_data</td>
+<td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left" rowspan = "2">随机数</td><td class="left">seed</td>
+<td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">thread_local_rand_use_global_seed</td>
+<td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">单元测试</td><td class="left">checkgrad_eps</td>
+<td class="left"></td><td class="left"></td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">矩阵/向量</td><td class="left">enable_parallel_vector</td>
+<td class="left">√</td><td class="left">√</td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+</tbody>
+
+</table>
+</html>
diff --git a/doc/howto/cmd_parameter/arguments_en.md b/doc/howto/usage/cmd_parameter/arguments_en.md
similarity index 100%
rename from doc/howto/cmd_parameter/arguments_en.md
rename to doc/howto/usage/cmd_parameter/arguments_en.md
diff --git a/doc/howto/usage/cmd_parameter/detail_introduction_cn.md b/doc/howto/usage/cmd_parameter/detail_introduction_cn.md
new file mode 100644
index 0000000000..dbf7c6f00b
--- /dev/null
+++ b/doc/howto/usage/cmd_parameter/detail_introduction_cn.md
@@ -0,0 +1,336 @@
+# 细节描述
+
+## 通用
+
+* `--job`
+  - 工作模式，包括: **train, test, checkgrad**，其中checkgrad主要为开发者使用，使用者不需要关心。
+  - 类型: string (默认: train)
+
+* `--config`
+  - 用于指定网络配置文件。
+  - 类型: string (默认: null).
+
+* `--use_gpu`
+  - 训练过程是否使用GPU，设置为true使用GPU模式，否则使用CPU模式。
+  - 类型: bool (默认: 1).
+
+* `--local`
+  - 训练过程是否为本地模式，设置为true使用本地训练或者使用集群上的一个节点，否则使用多机训练。
+  - 类型: bool (默认: 1).
+
+* `--trainer_count`
+  - 指定一台机器上使用的线程数。例如，trainer_count = 4, 意思是在GPU模式下使用4个GPU，或者在CPU模式下使用4个线程。每个线程（或GPU）分配到当前数据块样本数的四分之一。也就是说，如果在训练配置中设置batch_size为512，每个线程分配到128个样本用于训练。
+  - 类型: int32 (默认: 1).
+
+* `--num_passes`
+  - 当模式为`--job=train`时, 该参数的意思是训练num_passes轮。每轮会将数据集中的所有训练样本使用一次。当模式为`--job=test`时，意思是使用第test_pass个模型到第 num_passes-1 个模型测试数据。
+  - 类型: int32 (默认: 100).
+
+* `--config_args`
+  - 传递给配置文件的参数。格式: key1=value1,key2=value2.
+  - 类型: string (默认: null).
+
+* `--version`
+  - 是否打印版本信息。
+  - 类型: bool (默认: 0).
+
+* `--show_layer_stat`
+  - 是否显示**每个批次数据**中每层的数值统计.
+  - 类型: bool (默认: 0).
+
+## 训练
+
+* `--log_period`
+  - 每log_period个批次打印日志进度.
+  - 类型: int32 (默认: 100).
+
+* `--dot_period`
+  - 每dot_period个批次输出符号'.'.
+  - 类型: int32 (默认: 1).
+
+* `--saving_period`
+  - 每saving_period轮保存训练参数.
+  - 类型: int32 (默认: 1).
+
+* `--save_dir`
+  - 保存模型参数的目录，需要明确指定，但不需要提前创建。
+  - 类型: string (默认: null).
+
+* `--start_pass`
+  - 从start_pass轮开始训练，会加载上一轮的参数。
+  - 类型: int32 (默认: 0).
+
+* `--show_parameter_stats_period`
+  - 在训练过程中每show_parameter_stats_period个批次输出参数统计。默认不显示。
+  - 类型: int32 (默认: 0).
+
+* `--save_only_one`
+  - 只保存最后一轮的参数，而之前的参数将会被删除。
+  - 类型: bool (默认: 0).
+
+* `--load_missing_parameter_strategy`
+  - 当模型参数不存在时，指定加载的方式。目前支持fail/rand/zero三种操作.
+    - `fail`: 程序直接退出.
+    - `rand`: 根据网络配置中的**initial\_strategy**采用均匀分布或者高斯分布初始化。均匀分布的范围是: **[mean - std, mean + std]**, 其中mean和std是训练配置中的参数.
+    - `zero`: 所有参数置为零.
+  - 类型: string (默认: fail).
+
+* `--init_model_path`
+   - 初始化模型的路径。如果设置该参数，start\_pass将不起作用。同样也可以在测试模式中指定模型路径。
+   - 类型: string (默认: null).
+
+* `--saving_period_by_batches`
+   - 在一轮中每saving_period_by_batches个批次保存一次参数。
+   - 类型: int32 (默认: 0).
+
+* `--log_error_clipping`
+  - 当在网络层配置中设置**error_clipping_threshold**时，该参数指示是否打印错误截断日志。如果为true，**每批次**的反向传播将会打印日志信息。该截断会影响**输出的梯度**.
+  - 类型: bool (默认: 0).
+
+* `--log_clipping`
+  - 当在训练配置中设置**gradient_clipping_threshold**时，该参数指示是否打印日志截断信息。该截断会影响**权重更新的梯度**.
+  - 类型: bool (默认: 0).
+
+* `--use_old_updater`
+  - 是否使用旧的RemoteParameterUpdater。 默认使用ConcurrentRemoteParameterUpdater，主要为开发者使用，使用者通常无需关心.
+  - 类型: bool (默认: 0).
+
+* `--enable_grad_share`
+  - 启用梯度参数的阈值，在多CPU训练时共享该参数.
+  - 类型: int32 (默认: 100 \* 1024 \* 1024).
+
+* `--grad_share_block_num`
+  - 梯度参数的分块数目，在多CPU训练时共享该参数.
+  - 类型: int32 (默认: 64).
+
+## 测试
+
+* `--test_pass`
+  - 加载test_pass轮的模型用于测试.
+  - 类型: int32 (默认: -1).
+
+* `--test_period`
+   - 如果为0，每轮结束时对所有测试数据进行测试；如果不为0，每test_period个批次对所有测试数据进行测试.
+  - 类型: int32 (默认: 0).
+
+* `--test_wait`
+  - 指示当指定轮的测试模型不存在时，是否需要等待该轮模型参数。如果在训练期间同时发起另外一个进程进行测试，可以使用该参数.
+  - 类型: bool (默认: 0).
+
+* `--model_list`
+  - 测试时指定的存储模型列表的文件.
+  - 类型: string (默认: "", null).
+
+* `--predict_output_dir`
+  - 保存网络层输出结果的目录。该参数在网络配置的Outputs()中指定，默认为null，意思是不保存结果。在测试阶段，如果你想要保存某些层的特征图，请指定该目录。需要注意的是，网络层的输出是经过激活函数之后的值.
+  - 类型: string (默认: "", null).
+
+* `--average_test_period`
+  - 使用`average_test_period`个批次的参数平均值进行测试。该参数必须能被FLAGS_log_period整除，默认为0，意思是不使用平均参数执行测试.
+  - 类型: int32 (默认: 0).
+
+* `--distribute_test`
+  - 在分布式环境中测试，将多台机器的测试结果合并.
+  - 类型: bool (默认: 0).
+
+* `--predict_file`
+  - 保存预测结果的文件名。该参数默认为null，意思是不保存结果。目前该参数仅用于AucValidationLayer和PnpairValidationLayer层，每轮都会保存预测结果.
+  - 类型: string (默认: "", null).
+
+## GPU
+
+* `--gpu_id`
+  - 指示使用哪个GPU核.
+  - 类型: int32 (默认: 0).
+
+* `--allow_only_one_model_on_one_gpu`
+  - 如果为true，一个GPU设备上不允许配置多个模型.
+  - 类型: bool (默认: 1).
+
+* `--parallel_nn`
+  - 指示是否使用多线程来计算一个神经网络。如果为false，设置gpu_id指定使用哪个GPU核（训练配置中的设备属性将会无效）。如果为true，GPU核在训练配置中指定（gpu_id无效）.
+  - 类型: bool (默认: 0).
+
+* `--cudnn_dir`
+  - 选择路径来动态加载NVIDIA CuDNN库，例如，/usr/local/cuda/lib64. [默认]: LD_LIBRARY_PATH
+  - 类型: string (默认: "", null)
+
+* `--cuda_dir`
+  - 选择路径来动态加载NVIDIA CUDA库，例如，/usr/local/cuda/lib64. [默认]: LD_LIBRARY_PATH
+  - 类型: string (默认: "", null)
+
+* `--cudnn_conv_workspace_limit_in_mb`
+  - 指定cuDNN的最大工作空间容限，单位是MB，默认为4096MB=4GB. 
+  - 类型: int32 (默认: 4096MB=4GB)
+
+## 自然语言处理(NLP): RNN/LSTM/GRU
+* `--rnn_use_batch`
+  - 指示在简单的RecurrentLayer层的计算中是否使用批处理方法.
+  - 类型: bool (默认: 0).
+
+* `--prev_batch_state`
+  - 标识是否为连续的batch计算.
+  - 类型: bool (默认: 0).
+
+* `--beam_size`
+  - 集束搜索使用广度优先搜索的方式构建查找树。在树的每一层上，都会产生当前层状态的所有继承结果，按启发式损失的大小递增排序。然而，每层上只能保存固定数目个最好的状态，该数目是提前定义好的，称之为集束大小.
+  - 类型: int32 (默认: 1).
+
+* `--diy_beam_search_prob_so`
+  - 用户可以自定义beam search的方法，编译成动态库，供PaddlePaddle加载。 该参数用于指定动态库路径.
+  - 类型: string (默认: "", null).
+
+## 度量学习(Metric Learning)
+* `--external`
+   - 指示是否使用外部机器进行度量学习.
+   - 类型: bool (默认: 0).
+
+* `--data_server_port`
+  - 数据服务器(data server)的监听端口，主要用在度量学习中.
+  - 类型: int32 (默认: 21134).
+
+## 数据支持(DataProvider)
+
+* `--memory_threshold_on_load_data`
+  - 内存容限阈值，当超过该阈值时，停止加载数据.
+  - 类型: double (默认: 1.0).
+
+## 单元测试
+
+* `--checkgrad_eps`
+  - 使用checkgrad模式时的参数变化大小.
+  - 类型: double (默认: 1e-05).
+
+## 参数服务器和分布式通信
+
+* `--start_pserver`
+  - 指示是否开启参数服务器(parameter server).
+  - 类型: bool (默认: 0).
+
+* `--pservers`
+  - 参数服务器的IP地址，以逗号间隔.
+  - 类型: string (默认: "127.0.0.1").
+
+* `--port`
+  - 参数服务器的监听端口.
+  - 类型: int32 (默认: 20134).
+
+* `--ports_num`
+  - 发送参数的端口号，根据默认端口号递增.
+  - 类型: int32 (默认: 1).
+
+* `--trainer_id`
+  - 在分布式训练中，每个训练节点必须指定一个唯一的id号，从0到num_trainers-1。0号训练节点是主训练节点。使用者无需关心这个参数.
+  - 类型: int32 (默认: 0).
+
+* `--num_gradient_servers`
+  - 梯度服务器的数量，该参数在集群提交环境中自动设置.
+  - 类型: int32 (默认: 1).
+
+* `--small_messages`
+  - 如果消息数据太小，建议将该参数设为true，启动快速应答，无延迟.
+  - 类型: bool (默认: 0).
+
+* `--sock_send_buf_size`
+  - 限制套接字发送缓冲区的大小。如果仔细设置的话，可以有效减小网络的阻塞.
+  - 类型: int32 (默认: 1024 \* 1024 \* 40).
+
+* `--sock_recv_buf_size`
+  - 限制套接字接收缓冲区的大小.
+  - 类型: int32 (默认: 1024 \* 1024 \* 40).
+
+* `--parameter_block_size`
+  - 参数服务器的参数分块大小。如果未设置，将会自动计算出一个合适的值.
+  - 类型: int32 (默认: 0).
+
+* `--parameter_block_size_for_sparse`
+  - 参数服务器稀疏更新的参数分块大小。如果未设置，将会自动计算出一个合适的值.
+  - 类型: int32 (默认: 0).
+
+* `--log_period_server`
+  - 在参数服务器终端每log_period_server个批次打印日志进度.
+  - 类型: int32 (默认: 500).
+
+* `--loadsave_parameters_in_pserver`
+  - 在参数服务器上加载和保存参数，只有当设置了sparse_remote_update参数时才有效.
+  - 类型: bool (默认: 0).
+
+* `--pserver_num_threads`
+  - 同步执行操作的线程数.
+  - 类型: int32 (默认: 1).
+
+* `--ports_num_for_sparse`
+  - 发送参数的端口号，根据默认值递增(port + ports_num)，用于稀疏训练中.
+  - 类型: int32 (默认: 0).
+
+* `--nics`
+  - 参数服务器的网络设备名称，已经在集群提交环境中完成设置.
+  - 类型: string (默认: "xgbe0,xgbe1").
+
+* `--rdma_tcp`
+  - 使用rdma还是tcp传输协议，该参数已经在集群提交环境中完成设置.
+  - 类型: string (默认: "tcp").
+
+## 异步随机梯度下降(Async SGD)
+* `--async_count`
+  - 定义异步训练的长度，如果为0，则使用同步训练.
+  - 类型: int32 (默认: 0).
+
+* `--async_lagged_ratio_min`
+  - 控制`config_.async_lagged_grad_discard_ratio()`的最小值.
+  - 类型: double (默认: 1.0).
+
+* `--async_lagged_ratio_default`
+  - 如果在网络配置中未设置async_lagged_grad_discard_ratio，则使用该参数作为默认值.
+  - 类型: double (默认: 1.5).
+
+## 性能调优(Performance Tuning)
+
+* `--log_barrier_abstract`
+  - 如果为true，则显示阻隔性能的摘要信息.
+  - 类型: bool (默认: 1).
+
+* `--log_barrier_show_log`
+  - 如果为true，则总会显示阻隔摘要信息，即使间隔很小.
+  - 类型: bool (默认: 0).
+
+* `--log_barrier_lowest_nodes`
+  - 最少显示多少个节点.
+  - 类型: int32 (默认: 5).
+
+* `--check_sparse_distribution_in_pserver`
+  - 指示是否检查所有参数服务器上的稀疏参数的分布是均匀的.
+  - 类型: bool (默认: 0).
+
+* `--show_check_sparse_distribution_log`
+  - 指示是否显示参数服务器上的稀疏参数分布的日志细节.
+  - 类型: bool (默认: 0).
+
+* `--allow_inefficient_sparse_update`
+  - 指示是否允许低效率的稀疏更新.
+  - 类型: bool (默认: 0).
+
+* `--check_sparse_distribution_batches`
+  - 每运行多少个批次执行一次稀疏参数分布的检查.
+  - 类型: int32 (默认: 100).
+
+* `--check_sparse_distribution_ratio`
+  - 如果检查到分配在不同参数服务器上的参数的分布不均匀次数大于check_sparse_distribution_ratio *  check_sparse_distribution_batches次，程序停止.
+  - 类型: double (默认: 0.6).
+
+* `--check_sparse_distribution_unbalance_degree`
+  - 不同参数服务器上数据大小的最大值与最小值的比率.
+  - 类型: double (默认: 2).
+
+## 矩阵/向量/随机数
+* `--enable_parallel_vector`
+  - 启动并行向量的阈值.
+  - 类型: int32 (默认: 0).
+
+* `--seed`
+  - 随机数的种子。设为0时使用srand(time).
+  - 类型: int32 (默认: 1)
+
+* `--thread_local_rand_use_global_seed`
+  - 是否将全局种子应用于本地线程的随机数.
+  - 类型: bool (默认: 0).
diff --git a/doc/howto/cmd_parameter/detail_introduction_en.md b/doc/howto/usage/cmd_parameter/detail_introduction_en.md
similarity index 95%
rename from doc/howto/cmd_parameter/detail_introduction_en.md
rename to doc/howto/usage/cmd_parameter/detail_introduction_en.md
index 82136b7d4f..aa69a3bd54 100644
--- a/doc/howto/cmd_parameter/detail_introduction_en.md
+++ b/doc/howto/usage/cmd_parameter/detail_introduction_en.md
@@ -1,5 +1,5 @@
 ```eval_rst
-..  _cmd_detail_introduction_en:
+..  _cmd_detail_introduction:
 ```
 
 # Detail Description
@@ -73,7 +73,7 @@
   - type: bool (default: 0).
 
 * `--load_missing_parameter_strategy`
-  - Specify the loading operation when model file is missing. Now support fail/rand/zere three operations.
+  - Specify the loading operation when model file is missing. Now support fail/rand/zero three operations.
     - `fail`: program will exit.
     - `rand`: uniform or normal distribution according to **initial\_strategy** in network config. Uniform range is: **[mean - std, mean + std]**, where mean and std are configures in trainer config.
     - `zero`: all parameters are zero.
@@ -118,11 +118,11 @@
   - type: int32 (default: 0).
 
 * `--test_wait`
-  - Whether to wait for parameter per pass if not exist. If set test_data_path in submitting environment of cluster, it will launch one process to perfom testing, so we need to set test_wait=1. Note that in the cluster submitting environment, this argument has been set True by default.
+  - Whether to wait for parameter per pass if not exist. It can be used when the user launches another process to perform testing during the training process.
   - type: bool (default: 0).
 
 * `--model_list`
-  - File that saves the model list when testing. It was set automatically when using cluster submitting environment after setting model_path.
+  - File that saves the model list when testing. 
   - type: string (default: "", null).
 
 * `--predict_output_dir`
@@ -212,7 +212,7 @@
   - type: bool (default: 0).
 
 * `--pservers`
-  - Comma separated IP addresses of pservers. It is set automatically in cluster submitting environment.
+  - Comma separated IP addresses of pservers.
   - type: string (default: "127.0.0.1").
 
 * `--port`
diff --git a/doc/howto/usage/cmd_parameter/index_cn.rst b/doc/howto/usage/cmd_parameter/index_cn.rst
new file mode 100644
index 0000000000..4c87298211
--- /dev/null
+++ b/doc/howto/usage/cmd_parameter/index_cn.rst
@@ -0,0 +1,11 @@
+..  _cmd_line_index:
+
+设置命令行参数
+===============
+
+..  toctree::
+  :maxdepth: 1
+
+  use_case_cn.md
+  arguments_cn.md
+  detail_introduction_cn.md
diff --git a/doc/howto/usage/cmd_parameter/index_en.rst b/doc/howto/usage/cmd_parameter/index_en.rst
new file mode 100644
index 0000000000..0e3c72d27a
--- /dev/null
+++ b/doc/howto/usage/cmd_parameter/index_en.rst
@@ -0,0 +1,11 @@
+..  _cmd_line_index:
+
+Set Command-line Parameters
+===========================
+
+..  toctree::
+  :maxdepth: 1
+
+  use_case_en.md
+  arguments_en.md
+  detail_introduction_en.md
diff --git a/doc/howto/usage/cmd_parameter/use_case_cn.md b/doc/howto/usage/cmd_parameter/use_case_cn.md
new file mode 100644
index 0000000000..db8c39d950
--- /dev/null
+++ b/doc/howto/usage/cmd_parameter/use_case_cn.md
@@ -0,0 +1,182 @@
+# 使用案例
+
+## 本地训练
+
+本地训练的实验，诸如图像分类，自然语言处理等，通常都会使用下面这些命令行参数。
+
+```
+paddle train \
+  --use_gpu=1/0 \                        #1:GPU,0:CPU(默认为1)
+  --config=network_config \
+  --save_dir=output \
+  --trainer_count=COUNT \                #(默认为1)
+  --test_period=M \                      #(默认为0) 
+  --num_passes=N \                       #(默认为100)
+  --log_period=K \                       #(默认为100)
+  --dot_period=1000 \                    #(默认为1)
+  #[--show_parameter_stats_period=100] \ #(默认为0)
+  #[--saving_period_by_batches=200] \    #(默认为0)
+```
+根据你的任务，可以选择是否使用参数`show_parameter_stats_period`和`saving_period_by_batches`。
+
+### 1) 将命令参数传给网络配置
+
+`config_args`是一个很有用的参数，用于将参数传递给网络配置。
+
+```
+--config_args=generating=1,beam_size=5,layer_num=10 \
+```
+`get_config_arg`可用于在网络配置中解析这些参数，如下所示：
+
+```
+generating = get_config_arg('generating', bool, False)
+beam_size = get_config_arg('beam_size', int, 3)
+layer_num = get_config_arg('layer_num', int, 8)
+```
+
+`get_config_arg`:
+
+```
+get_config_arg(name, type, default_value)
+```
+- name: `--config_args`中指定的名字
+- type: 值类型，包括bool, int, str, float等
+- default_value: 默认值
+
+### 2) 使用模型初始化网络
+
+增加如下参数：
+
+```
+--init_model_path=model_path
+--load_missing_parameter_strategy=rand
+```
+
+## 本地测试
+
+方法一：
+
+```
+paddle train --job=test \
+             --use_gpu=1/0 \ 
+             --config=network_config \
+             --trainer_count=COUNT \ 
+             --init_model_path=model_path \
+```
+- 使用init\_model\_path指定测试的模型
+- 只能测试单个模型
+
+方法二：
+
+```
+paddle train --job=test \
+             --use_gpu=1/0 \ 
+             --config=network_config \
+             --trainer_count=COUNT \ 
+             --model_list=model.list \
+```
+- 使用model_list指定测试的模型列表
+- 可以测试多个模型，文件model.list如下所示：
+
+```
+./alexnet_pass1
+./alexnet_pass2
+```
+
+方法三：
+
+```
+paddle train --job=test \
+             --use_gpu=1/0 \
+             --config=network_config \
+             --trainer_count=COUNT \
+             --save_dir=model \
+             --test_pass=M \
+             --num_passes=N \
+```
+这种方式必须使用Paddle存储的模型路径格式，如：`model/pass-%5d`。测试的模型包括从第M轮到第N-1轮存储的所有模型。例如，M=12，N=14这种写法将会测试模型`model/pass-00012`和`model/pass-00013`。
+
+## 稀疏训练
+
+当输入是维度很高的稀疏数据时，通常使用稀疏训练来加速计算过程。例如，输入数据的字典维数是1百万，但是每个样本仅包含几个词。在Paddle中，稀疏矩阵的乘积应用于前向传播过程，而稀疏更新在反向传播之后的权重更新时进行。
+
+### 1) 本地训练
+
+用户需要在网络配置中指定**sparse\_update=True**。请参照网络配置的文档了解更详细的信息。
+
+### 2) 集群训练
+
+在集群上训练一个稀疏模型需要加上下面的参数。同时用户需要在网络配置中指定**sparse\_remote\_update=True**。请参照网络配置的文档了解更详细的信息。
+
+```
+--ports_num_for_sparse=1    #(默认为0)
+```
+
+## parallel_nn
+用户可以设置`parallel_nn`来混合使用GPU和CPU计算网络层的参数。也就是说，你可以将网络配置成某些层使用GPU计算，而其他层使用CPU计算。另一种方式是将网络层划分到不同的GPU上去计算，这样可以减小GPU内存，或者采用并行计算来加速某些层的更新。
+
+如果你想使用这些特性，你需要在网络配置中指定设备的ID号(表示为deviceId)，并且加上下面的命令行参数:
+
+```
+--parallel_nn=true
+```
+### 案例一：GPU和CPU混合使用
+请看下面的例子：
+
+```
+#command line:
+paddle train --use_gpu=true --parallel_nn=true --trainer_count=COUNT
+
+default_device(0)
+
+fc1=fc_layer(...)
+fc2=fc_layer(...)
+fc3=fc_layer(...,layer_attr=ExtraAttr(device=-1))
+
+```
+- default_device(0): 设置默认设备号为0。这意味着除了指定device=-1的层之外，其他所有层都会使用GPU计算，每层使用的GPU号依赖于参数trainer\_count和gpu\_id(默认为0)。在此，fc1和fc2层在GPU上计算。
+
+- device=-1: fc3层使用CPU计算。
+
+- trainer_count:
+  - trainer_count=1: 如果未设置gpu\_id，那么fc1和fc2层将会使用第1个GPU来计算。否则使用gpu\_id指定的GPU。
+
+  - trainer_count>1: 在trainer\_count个GPU上使用数据并行来计算某一层。例如，trainer\_count=2意味着0号和1号GPU将会使用数据并行来计算fc1和fc2层。
+
+### 案例二：在不同设备上指定层
+
+```
+#command line:
+paddle train --use_gpu=true --parallel_nn=true --trainer_count=COUNT
+
+#network:
+fc2=fc_layer(input=l1, layer_attr=ExtraAttr(device=0), ...)
+fc3=fc_layer(input=l1, layer_attr=ExtraAttr(device=1), ...)
+fc4=fc_layer(input=fc2, layer_attr=ExtraAttr(device=-1), ...)
+```
+在本例中，我们假设一台机器上有4个GPU。
+
+- trainer_count=1:
+  - 使用0号GPU计算fc2层。
+  - 使用1号GPU计算fc3层。
+  - 使用CPU计算fc4层。
+
+- trainer_count=2:
+  - 使用0号和1号GPU计算fc2层。
+  - 使用2号和3号GPU计算fc3层。
+  - 使用CPU两线程计算fc4层。
+
+- trainer_count=4:
+  - 运行失败（注意到我们已经假设机器上有4个GPU），因为参数`allow_only_one_model_on_one_gpu`默认设置为真。
+
+**当`device!=-1`时设备ID号的分配：**
+
+```
+(deviceId + gpu_id + threadId * numLogicalDevices_) % numDevices_
+
+deviceId:             在层中指定
+gpu_id:               默认为0
+threadId:             线程ID号，范围: 0,1,..., trainer_count-1
+numDevices_:          机器的设备(GPU)数目
+numLogicalDevices_:   min(max(deviceId + 1), numDevices_)
+```
diff --git a/doc/howto/cmd_parameter/use_case_en.md b/doc/howto/usage/cmd_parameter/use_case_en.md
similarity index 91%
rename from doc/howto/cmd_parameter/use_case_en.md
rename to doc/howto/usage/cmd_parameter/use_case_en.md
index 4d7bb33f36..e287f0c4b9 100644
--- a/doc/howto/cmd_parameter/use_case_en.md
+++ b/doc/howto/usage/cmd_parameter/use_case_en.md
@@ -134,14 +134,14 @@ fc2=fc_layer(...)
 fc3=fc_layer(...,layer_attr=ExtraAttr(device=-1))
 
 ```
-- default_device(0): set default device ID to 0. This means that except the layers with device=-1, all layers will use a GPU, and the specific GPU used for each layer depends on trainer\_count and gpu\_id (0 by default). Here, layer l1 and l2 are computed on the GPU.
+- default_device(0): set default device ID to 0. This means that except the layers with device=-1, all layers will use a GPU, and the specific GPU used for each layer depends on trainer\_count and gpu\_id (0 by default). Here, layer fc1 and fc2 are computed on the GPU.
 
-- device=-1: use the CPU for layer l3.
+- device=-1: use the CPU for layer fc3.
 
 - trainer_count:
-  - trainer_count=1: if gpu\_id is not set, then use the first GPU to compute layers l1 and l2. Otherwise use the GPU with gpu\_id.
+  - trainer_count=1: if gpu\_id is not set, then use the first GPU to compute layers fc1 and fc2. Otherwise use the GPU with gpu\_id.
 
-  - trainer_count>1: use trainer\_count GPUs to compute one layer using data parallelism. For example, trainer\_count=2 means that GPUs 0 and 1 will use data parallelism to compute layer l1 and l2.
+  - trainer_count>1: use trainer\_count GPUs to compute one layer using data parallelism. For example, trainer\_count=2 means that GPUs 0 and 1 will use data parallelism to compute layer fc1 and fc2.
 
 ### Case 2: Specify Layers in Different Devices
 
@@ -157,14 +157,14 @@ fc4=fc_layer(input=fc2, layer_attr=ExtraAttr(device=-1), ...)
 In this case, we assume that there are 4 GPUs in one machine.
 
 - trainer_count=1:
-  - Use GPU 0 to compute layer l2.
-  - Use GPU 1 to compute layer l3.
-  - Use CPU to compute layer l4.
+  - Use GPU 0 to compute layer fc2.
+  - Use GPU 1 to compute layer fc3.
+  - Use CPU to compute layer fc4.
 
 - trainer_count=2:
-  - Use GPU 0 and 1 to compute layer l2.
-  - Use GPU 2 and 3 to compute layer l3.
-  - Use CPU to compute l4 in two threads.
+  - Use GPU 0 and 1 to compute layer fc2.
+  - Use GPU 2 and 3 to compute layer fc3.
+  - Use CPU to compute fc4 in two threads.
 
 - trainer_count=4:
   - It will fail (note, we have assumed that there are 4 GPUs in machine), because argument `allow_only_one_model_on_one_gpu` is true by default.
diff --git a/doc_cn/concepts/pserver_topology.dot b/doc/howto/usage/concepts/src/pserver_topology.dot
similarity index 100%
rename from doc_cn/concepts/pserver_topology.dot
rename to doc/howto/usage/concepts/src/pserver_topology.dot
diff --git a/doc_cn/concepts/trainer_config.py b/doc/howto/usage/concepts/src/trainer_config.py
similarity index 100%
rename from doc_cn/concepts/trainer_config.py
rename to doc/howto/usage/concepts/src/trainer_config.py
diff --git a/doc_cn/concepts/use_concepts.rst b/doc/howto/usage/concepts/use_concepts_cn.rst
similarity index 62%
rename from doc_cn/concepts/use_concepts.rst
rename to doc/howto/usage/concepts/use_concepts_cn.rst
index 2d27e29fac..fa334bcbb9 100644
--- a/doc_cn/concepts/use_concepts.rst
+++ b/doc/howto/usage/concepts/use_concepts_cn.rst
@@ -1,6 +1,6 @@
-#########################
-PaddlePaddle 基本使用概念
-#########################
+############
+基本使用概念
+############
 
 PaddlePaddle是一个深度学习框架，支持单机模式和多机模式。
 
@@ -8,36 +8,36 @@ PaddlePaddle是一个深度学习框架，支持单机模式和多机模式。
 
 本文首先介绍trainer进程中的一些使用概念，然后介绍pserver进程中概念。
 
-..	contents::
+..    contents::
 
 系统框图
 ========
 
 下图描述了用户使用框图，PaddlePaddle的trainer进程里内嵌了Python解释器，trainer进程可以利用这个解释器执行Python脚本，Python脚本里定义了模型配置、训练算法、以及数据读取函数。其中，数据读取程序往往定义在一个单独Python脚本文件里，被称为数据提供器（DataProvider），通常是一个Python函数。模型配置、训练算法通常定义在另一单独Python文件中, 称为训练配置文件。下面将分别介绍这两部分。
 
-..	graphviz:: 
-
-	digraph pp_process {
-		rankdir=LR;
-		config_file [label="用户神经网络配置"];
-		subgraph cluster_pp {
-			style=filled;
-			color=lightgrey;
-			node [style=filled, color=white, shape=box];
-			label = "PaddlePaddle C++";
-			py [label="Python解释器"];
-		}
-		data_provider [label="用户数据解析"];
-		config_file -> py;
-		py -> data_provider [dir="back"];
-	}
+..    graphviz:: 
+
+    digraph pp_process {
+        rankdir=LR;
+        config_file [label="用户神经网络配置"];
+        subgraph cluster_pp {
+            style=filled;
+            color=lightgrey;
+            node [style=filled, color=white, shape=box];
+            label = "PaddlePaddle C++";
+            py [label="Python解释器"];
+        }
+        data_provider [label="用户数据解析"];
+        config_file -> py;
+        py -> data_provider [dir="back"];
+    }
 
 数据提供器
 ==========
 
 DataProvider是PaddlePaddle系统的数据提供器，将用户的原始数据转换成系统可以识别的数据类型。每当系统需要新的数据训练时, trainer进程会调用DataProvider函数返回数据。当所有数据读取完一轮后，DataProvider返回空数据，通知系统一轮数据读取结束，并且系统每一轮训练开始时会重置DataProvider。需要注意的是，DataProvider是被系统调用，而不是新数据驱动系统，一些随机化噪声添加都应该在DataProvider中完成。
 
-在不同的应用里，训练数据的格式往往各不相同。因此，为了用户能够灵活的处理数据，我们提供了Python处理数据的接口，称为 `PyDataProvider`_ 。在 ``PyDataProvider`` 中，系统C++模块接管了shuffle、处理batch、GPU和CPU通信、双缓冲、异步读取等问题，一些情况下(如：``min_pool_size=0``)需要Python接口里处理shuffle，可以参考 `PyDataProvider`_ 的相关文档继续深入了解。
+在不同的应用里，训练数据的格式往往各不相同。因此，为了用户能够灵活的处理数据，我们提供了Python处理数据的接口，称为 ``PyDataProvider`` 。在 ``PyDataProvider`` 中，系统C++模块接管了shuffle、处理batch、GPU和CPU通信、双缓冲、异步读取等问题，一些情况下(如：``min_pool_size=0``)需要Python接口里处理shuffle，可以参考 :ref:`api_pydataprovider2` 继续深入了解。
 
 
 训练配置文件
@@ -47,24 +47,24 @@ DataProvider是PaddlePaddle系统的数据提供器，将用户的原始数据
 
 一个简单的训练配置文件为：
 
-..  literalinclude:: trainer_config.py
+..  literalinclude:: src/trainer_config.py
     :linenos:
 
-文件开头 ``from paddle.trainer_config_helpers import *`` ，是因为PaddlePaddle配置文件与C++模块通信的最基础协议是protobuf，为了避免用户直接写复杂的protobuf string，我们为用户定以Python接口来配置网络，该Python代码可以生成protobuf包，这就是`trainer_config_helpers`_的作用。因此，在文件的开始，需要import这些函数。 这个包里面包含了模型配置需要的各个模块。
+文件开头 ``from paddle.trainer_config_helpers import *`` ，是因为PaddlePaddle配置文件与C++模块通信的最基础协议是protobuf，为了避免用户直接写复杂的protobuf string，我们为用户定义Python接口来配置网络，该Python代码可以生成protobuf包，这就是 :ref:`api_trainer_config` 的作用。因此，在文件的开始，需要import这些函数。 这个包里面包含了模型配置需要的各个模块。
 
 下面分别介绍数据源配置、优化算法配置、网络结构配置这三部分该概念。
 
 数据源配置
 ----------
 
-使用 `PyDataProvider`_ 的函数 ``define_py_data_sources2`` 配置数据源。``define_py_data_sources2`` 里通过train_list和test_list指定是训练文件列表和测试文件列表。 如果传入字符串的话，是指一个数据列表文件。这个数据列表文件中包含的是每一个训练或者测试文件的路径。如果传入一个list的话，则会默认生成一个list文件，再传入给train.list或者test.list。
+使用 ``PyDataProvider2`` 的函数 ``define_py_data_sources2`` 配置数据源。``define_py_data_sources2`` 里通过train_list和test_list指定是训练文件列表和测试文件列表。 如果传入字符串的话，是指一个数据列表文件。这个数据列表文件中包含的是每一个训练或者测试文件的路径。如果传入一个list的话，则会默认生成一个list文件，再传入给train.list或者test.list。
 
-``module`` 和 ``obj`` 指定了DataProvider的文件名和返回数据的函数名。更详细的使用，请参考 `PyDataProvider`_ 。
+``module`` 和 ``obj`` 指定了DataProvider的文件名和返回数据的函数名。更详细的使用，请参考 :ref:`api_pydataprovider2` 。
 
 优化算法配置
 ------------
 
-通过 `settings`_ 接口设置神经网络所使用的训练参数和 `优化算法`_ ，包括学习率、batch_size、优化算法、正则方法等，具体的使用方法请参考 `settings`_ 文档。
+通过 :ref:`api_trainer_config_helpers_optimizers_settings` 接口设置神经网络所使用的训练参数和 :ref:`api_trainer_config_helpers_optimizers` ，包括学习率、batch_size、优化算法、正则方法等，具体的使用方法请参考 :ref:`api_trainer_config_helpers_optimizers_settings` 文档。
 
 网络结构配置
 ------------
@@ -82,14 +82,13 @@ DataProvider是PaddlePaddle系统的数据提供器，将用户的原始数据
  
   这个配置文件网络由 ``data_layer`` 、 ``simple_img_conv_pool`` 、 ``fc_layer`` 组成。
 
-  - `data_layer`_  ： 通常每个配置文件都会包括 ``data_layer`` ，定义输入数据大小。
-  - `simple_img_conv_pool`_ ：是一个组合层，包括了图像的卷积 (convolution)和池化(pooling)。
-  - `fc_layer`_ ：全连接层，激活函数为Softmax，这里也可叫分类层。
+  - :ref:`api_trainer_config_helpers_layers_data_layer`  ： 通常每个配置文件都会包括 ``data_layer`` ，定义输入数据大小。
+  - :ref:`api_trainer_config_helpers_network_simple_img_conv_pool` ：是一个组合层，包括了图像的卷积 (convolution)和池化(pooling)。
+  - :ref:`api_trainer_config_helpers_layers_fc_layer` ：全连接层，激活函数为Softmax，这里也可叫分类层。
 
-  
 - 损失函数和评估器：损失函数即为网络的优化目标，评估器可以评价模型结果。
 
-  PaddlePaddle包括很多损失函数和评估起，详细可以参考 `损失函数层`_ 和 `评估器`_ 。这里 ``classification_cost`` 默认使用多类交叉熵损失函数和分类错误率统计评估器。
+  PaddlePaddle包括很多损失函数和评估器，详细可以参考 :ref:`api_trainer_config_helpers_layers_cost_layers` 和 :ref:`api_trainer_config_helpers_evaluators` 。这里 ``classification_cost`` 默认使用多类交叉熵损失函数和分类错误率统计评估器。
   
 - ``outputs``: 标记网络输出的函数为 ``outputs`` 。
 
@@ -100,13 +99,13 @@ DataProvider是PaddlePaddle系统的数据提供器，将用户的原始数据
 
 例如，和 ``fc_layer`` 同样功能的 ``mixed_layer`` 是:
 
-..	code-block:: python
+..    code-block:: python
    
-   	data = data_layer(name='data', size=200)
-   	with mixed_layer(size=200) as out:
-   		out += full_matrix_projection(input=data)
+       data = data_layer(name='data', size=200)
+       with mixed_layer(size=200) as out:
+           out += full_matrix_projection(input=data)
 
-PaddlePaddle 可以使用 ``mixed layer`` 配置出非常复杂的网络，甚至可以直接配置一个完整的LSTM。用户可以参考 `mixed_layer`_ 的相关文档进行配置。
+PaddlePaddle 可以使用 ``mixed layer`` 配置出非常复杂的网络，甚至可以直接配置一个完整的LSTM。用户可以参考 :ref:`api_trainer_config_helpers_layers_mixed_layer` 的相关文档进行配置。
 
 
 分布式训练
@@ -114,13 +113,13 @@ PaddlePaddle 可以使用 ``mixed layer`` 配置出非常复杂的网络，甚
 
 PaddlePaddle多机采用经典的 Parameter Server 架构对多个节点的 trainer 进行同步。多机训练的经典拓扑结构如下\:
 
-..	graphviz:: pserver_topology.dot
+..    graphviz:: src/pserver_topology.dot
 
 图中每个灰色方块是一台机器，在每个机器中，先使用命令 ``paddle pserver`` 启动一个pserver进程，并指定端口号，可能的参数是\:
 
-..	code-block:: bash
+..    code-block:: bash
 
-	paddle pserver --port=5000 --num_gradient_servers=4 --tcp_rdma='tcp' --nics='eth0'
+    paddle pserver --port=5000 --num_gradient_servers=4 --tcp_rdma='tcp' --nics='eth0'
 
 * ``--port=5000`` : 指定 pserver 进程端口是 5000 。
 * ``--gradient_servers=4`` : 有四个训练进程(PaddlePaddle 将 trainer 也称作 GradientServer ，因为其为负责提供Gradient) 。
@@ -128,9 +127,9 @@ PaddlePaddle多机采用经典的 Parameter Server 架构对多个节点的 trai
 
 启动之后 pserver 进程之后，需要启动 trainer 训练进程，在各个机器上运行如下命令\:
 
-..	code-block:: bash
+..    code-block:: bash
 
-	paddle train --port=5000 --pservers=192.168.100.101,192.168.100.102,192.168.100.103,192.168.100.104 --config=...
+    paddle train --port=5000 --pservers=192.168.100.101,192.168.100.102,192.168.100.103,192.168.100.104 --config=...
 
 对于简单的多机协同训练使用上述方式即可。另外，pserver/train 通常在高级情况下，还需要设置下面两个参数\：
 
@@ -138,18 +137,3 @@ PaddlePaddle多机采用经典的 Parameter Server 架构对多个节点的 trai
 * --ports_num_for_sparse\: 一个pserver进程共绑定多少端口用来做稀疏更新，默认是0。
 
 使用手工指定端口数量，是因为Paddle的网络通信中，使用了 int32 作为消息长度，比较容易在大模型下溢出。所以，在 pserver 进程中可以启动多个子线程去接受 trainer 的数据，这样单个子线程的长度就不会溢出了。但是这个值不可以调的过大，因为增加这个值，对性能尤其是内存占用有一定的开销，另外稀疏更新的端口如果太大的话，很容易导致某一个参数服务器没有分配到任何参数。
-
-详细的说明可以参考，使用 `集群训练Paddle`_ 。
-
-
-..  _PyDataProvider: ../ui/data_provider/pydataprovider2.html
-.. _settings: ../../doc/ui/api/trainer_config_helpers/optimizers.html#settings
-.. _优化算法: ../../doc/ui/api/trainer_config_helpers/optimizers.html#optimizers
-.. _trainer_config_helper: ../../doc/ui/api/trainer_config_helpers/index.html
-.. _data_layer: ../../doc/ui/api/trainer_config_helpers/layers.html#data-layer
-.. _simple_img_conv_pool: ../../doc/ui/api/trainer_config_helpers/networks.html#simple-img-conv-pool
-.. _fc_layer: ../../doc/ui/api/trainer_config_helpers/layers.html#fc-layer
-.. _损失函数层: ../../doc/ui/api/trainer_config_helpers/layers.html#cost-layers
-.. _评估器: ../../doc/ui/api/trainer_config_helpers/evaluators.html
-.. _mixed_layer: ../../doc/ui/api/trainer_config_helpers/layers.html#mixed-layer
-..  _集群训练Paddle: ../cluster/index.html
diff --git a/doc/howto/usage/k8s/k8s_aws_en.md b/doc/howto/usage/k8s/k8s_aws_en.md
new file mode 100644
index 0000000000..b04bfba590
--- /dev/null
+++ b/doc/howto/usage/k8s/k8s_aws_en.md
@@ -0,0 +1,666 @@
+# Kubernetes on AWS
+
+## Create AWS Account and IAM Account
+
+To use AWS, we need to sign up for an AWS account on Amazon's Web site.
+An AWS account allows us to log in to the AWS Console Web interface to
+create IAM users and user groups. Usually, we create a user group with
+privileges required to run PaddlePaddle, and we create users for
+those who are going to run PaddlePaddle and add these users into the
+group. IAM users can identify themselves using passwords and tokens,
+where passwords allow users to log in to the AWS Console, and tokens
+make it easy for users to submit and inspect jobs from the command
+line.
+
+To sign up for an AWS account, please
+follow
+[this guide](http://docs.aws.amazon.com/lambda/latest/dg/setting-up.html).
+To create users and user groups under an AWS account, please
+follow
+[this guide](http://docs.aws.amazon.com/IAM/latest/UserGuide/id_users_create.html).
+
+Please be aware that this tutorial needs the following privileges in
+the user group:
+
+- AmazonEC2FullAccess
+- AmazonS3FullAccess
+- AmazonRoute53FullAccess
+- AmazonRoute53DomainsFullAccess
+- AmazonElasticFileSystemFullAccess
+- AmazonVPCFullAccess
+- IAMUserSSHKeys
+- IAMFullAccess
+- NetworkAdministrator
+
+
+By the time we write this tutorial, we noticed that Chinese AWS users
+might suffer from authentication problems when running this tutorial.
+Our solution is that we create a VM instance with the default Amazon
+AMI and in the same zone as our cluster runs, so we can SSH to this VM
+instance as a tunneling server and control our cluster and jobs from
+it.
+
+
+## PaddlePaddle on AWS
+
+Here we will show you step by step on how to run PaddlePaddle training on AWS cluster.
+
+
+### Download kube-aws and kubectl
+
+#### kube-aws
+
+Import the CoreOS Application Signing Public Key:
+
+```
+gpg2 --keyserver pgp.mit.edu --recv-key FC8A365E
+```
+
+Validate the key fingerprint:
+
+```
+gpg2 --fingerprint FC8A365E
+```
+The correct key fingerprint is `18AD 5014 C99E F7E3 BA5F 6CE9 50BD D3E0 FC8A 365E`
+
+Go to the [releases](https://github.com/coreos/kube-aws/releases) and download the latest release tarball and detached signature (.sig) for your architecture.
+
+Validate the tarball's GPG signature:
+
+```
+PLATFORM=linux-amd64
+ # Or
+PLATFORM=darwin-amd64
+
+gpg2 --verify kube-aws-${PLATFORM}.tar.gz.sig kube-aws-${PLATFORM}.tar.gz
+```
+
+Extract the binary:
+
+```
+tar zxvf kube-aws-${PLATFORM}.tar.gz
+```
+
+Add kube-aws to your path:
+
+```
+mv ${PLATFORM}/kube-aws /usr/local/bin
+```
+
+
+#### kubectl
+
+Go to the [releases](https://github.com/kubernetes/kubernetes/releases) and download the latest release tarball.
+
+Extract the tarball and then prepend the Kubernetes binaries directory to PATH:
+
+```
+export PATH=<path/to/kubernetes-directory>/platforms/linux/amd64:$PATH
+
+```
+
+User credentials and security tokens will be generated later in user directory, not in `~/.kube/config`, they will be necessary to use the CLI or the HTTP Basic Auth.
+
+
+### Configure AWS Credentials
+
+First check out [this](http://docs.aws.amazon.com/cli/latest/userguide/installing.html) for installing the AWS command line interface, if you use ec2 instance with default amazon AMI, the cli tool has already been installed on your machine.
+
+
+And then configure your AWS account information:
+
+```
+aws configure
+
+```
+
+
+Fill in the required fields (You can get your AWS access key id and AWS secret access key by following [this](http://docs.aws.amazon.com/cli/latest/userguide/cli-chap-getting-started.html) instruction):
+
+
+```
+AWS Access Key ID: YOUR_ACCESS_KEY_ID
+AWS Secret Access Key: YOUR_SECRET_ACCESS_KEY
+Default region name: us-west-2
+Default output format: json
+
+```
+
+Test that your credentials work by describing any instances you may already have running on your account:
+
+```
+aws ec2 describe-instances
+```
+
+### Define Cluster Parameters
+
+#### EC2 key pair
+
+The keypair that will authenticate SSH access to your EC2 instances. The public half of this key pair will be configured on each CoreOS node.
+
+After creating a key pair, you will use the name you gave the keys to configure the cluster. Key pairs are only available to EC2 instances in the same region. More info in the [EC2 Keypair docs](http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ec2-key-pairs.html).
+
+#### KMS key
+
+Amazon KMS keys are used to encrypt and decrypt cluster TLS assets. If you already have a KMS Key that you would like to use, you can skip creating a new key and provide the Arn string for your existing key.
+
+You can create a KMS key in the AWS console, or with the aws command line tool:
+
+```
+$ aws kms --region=us-west-2 create-key --description="kube-aws assets"
+{
+    "KeyMetadata": {
+        "CreationDate": 1458235139.724,
+        "KeyState": "Enabled",
+        "Arn": "arn:aws:kms:us-west-2:xxxxxxxxx:key/xxxxxxxxxxxxxxxxxxx",
+        "AWSAccountId": "xxxxxxxxxxxxx",
+        "Enabled": true,
+        "KeyUsage": "ENCRYPT_DECRYPT",
+        "KeyId": "xxxxxxxxx",
+        "Description": "kube-aws assets"
+    }
+}
+```
+
+You will use the `KeyMetadata.Arn` string to identify your KMS key in the init step.
+
+And then you need to add several inline policies in your user permission.
+
+kms inline policy:
+
+```
+{
+    "Version": "2012-10-17",
+    "Statement": [
+        {
+            "Sid": "Stmt1482205552000",
+            "Effect": "Allow",
+            "Action": [
+                "kms:Decrypt",
+                "kms:Encrypt"
+            ],
+            "Resource": [
+                "arn:aws:kms:*:xxxxxxxxx:key/*"
+            ]
+        }
+    ]
+}
+```
+cloudformation inline policy:
+```
+{
+    "Version": "2012-10-17",
+    "Statement": [
+        {
+            "Sid": "Stmt1482205746000",
+            "Effect": "Allow",
+            "Action": [
+                "cloudformation:CreateStack",
+                "cloudformation:UpdateStack",
+                "cloudformation:DeleteStack",
+                "cloudformation:DescribeStacks",
+                "cloudformation:DescribeStackResource",
+                "cloudformation:GetTemplate"
+            ],
+            "Resource": [
+                "arn:aws:cloudformation:us-west-2:xxxxxxxxx:stack/YOUR_CLUSTER_NAME/*"
+            ]
+        }
+    ]
+}
+```
+
+
+#### External DNS name
+
+When the cluster is created, the controller will expose the TLS-secured API on a public IP address. You will need to create an A record for the external DNS hostname you want to point to this IP address. You can find the API external IP address after the cluster is created by invoking kube-aws status.
+
+#### S3 bucket
+
+You need to create an S3 bucket before starting up the Kubernetes cluster.
+
+#### Initialize an asset directory
+
+Create a directory on your local machine to hold the generated assets:
+
+```
+$ mkdir my-cluster
+$ cd my-cluster
+```
+
+Initialize the cluster CloudFormation stack with the KMS Arn, key pair name, and DNS name from the previous step:
+
+```
+$ kube-aws init \
+--cluster-name=my-cluster-name \
+--external-dns-name=my-cluster-endpoint \
+--region=us-west-1 \
+--availability-zone=us-west-1c \
+--key-name=key-pair-name \
+--kms-key-arn="arn:aws:kms:us-west-2:xxxxxxxxxx:key/xxxxxxxxxxxxxxxxxxx"
+```
+
+There will now be a cluster.yaml file in the asset directory. This is the main configuration file for your cluster.
+
+#### Render contents of the asset directory
+
+In the simplest case, you can have kube-aws generate both your TLS identities and certificate authority for you.
+
+```
+$ kube-aws render credentials --generate-ca
+```
+
+The next command generates the default set of cluster assets in your asset directory.
+
+```
+$ kube-aws render stack
+```
+
+Here's what the directory structure looks like:
+
+```
+$ tree
+.
+├── cluster.yaml
+├── credentials
+│   ├── admin-key.pem
+│   ├── admin.pem
+│   ├── apiserver-key.pem
+│   ├── apiserver.pem
+│   ├── ca-key.pem
+│   ├── ca.pem
+│   ├── worker-key.pem
+│   ├── worker.pem
+│   ├── etcd-key.pem
+│   ├── etcd.pem
+│   ├── etcd-client-key.pem
+│   └── etcd-client.pem
+├── kubeconfig
+├── stack-template.json
+└── userdata
+    ├── cloud-config-controller
+    └── cloud-config-worker
+```
+
+These assets (templates and credentials) are used to create, update and interact with your Kubernetes cluster.
+
+
+### Kubernetes Cluster Start Up
+
+#### Create the instances defined in the CloudFormation template
+
+Now for the exciting part, creating your cluster:
+
+```
+$ kube-aws up --s3-uri s3://<your-bucket-name>/<prefix>
+```
+
+#### Configure DNS
+
+You can invoke `kube-aws status` to get the cluster API endpoint after cluster creation, if necessary. This command can take a while. And then dig the load balancer hostname to get the ip address, use this ip to setup an A record for your external dns name.
+
+#### Access the cluster
+
+Once the API server is running, you should see:
+
+```
+$ kubectl --kubeconfig=kubeconfig get nodes
+NAME                                       STATUS                     AGE
+ip-10-0-0-xxx.us-west-1.compute.internal   Ready                      5m
+ip-10-0-0-xxx.us-west-1.compute.internal   Ready                      5m
+ip-10-0-0-xx.us-west-1.compute.internal    Ready,SchedulingDisabled   5m
+```
+
+
+### Setup PaddlePaddle Environment on AWS
+
+Now, we've created a cluster with following network capability:
+
+1. All Kubernetes nodes can communicate with each other.
+
+1. All Docker containers on Kubernetes nodes can communicate with each other.
+
+1. All Kubernetes nodes can communicate with all Docker containers on Kubernetes nodes.
+
+1. All other traffic loads from outside of Kubernetes nodes cannot reach to the Docker containers on Kubernetes nodes except for creating the services for containers.
+
+
+For sharing the training data across all the Kubernetes nodes, we use EFS (Elastic File System) in AWS. Ceph might be a better solution, but it requires high version of Linux kernel that might not be stable enough at this moment. We haven't automated the EFS setup at this moment, so please do the following steps:
+
+
+1. Make sure you added AmazonElasticFileSystemFullAccess policy in your group.
+
+1. Create the Elastic File System in AWS console, and attach the new VPC with it.
+<center>![](src/create_efs.png)</center>
+
+
+1. Modify the Kubernetes security group under ec2/Security Groups, add additional inbound policy "All TCP TCP 0 - 65535 0.0.0.0/0" for Kubernetes default VPC security group. 
+<center>![](src/add_security_group.png)</center>
+
+
+1. Follow the EC2 mount instruction to mount the disk onto all the Kubernetes nodes, we recommend to mount EFS disk onto ~/efs.
+<center>![](src/efs_mount.png)</center>
+
+
+Before starting the training, you should place your user config and divided training data onto EFS. When the training start, each task will copy related files from EFS into container, and it will also write the training results back onto EFS, we will show you how to place the data later in this article.
+
+
+
+### Core Concept of PaddlePaddle Training on AWS
+
+Now we've set up a 3-node distributed Kubernetes cluster, and on each node we've attached the EFS volume. In this training demo, we will create three Kubernetes pods and schedule them on the 3 nodes. Each pod contains a PaddlePaddle container. When the container gets created, it will start the pserver and trainer processes, load the training data from the EFS volume and start the distributed training task.
+
+#### Use Kubernetes Job
+
+We use a Kubernetes job to represent one run of distributed training. After the job finishes, Kubernetes will destroy the job's containers and release all related resources.
+
+We can write a yaml file to describe the Kubernetes job. The file contains lots of configuration information, for example PaddlePaddle's node number, `paddle pserver` open port number, the network card info, etc. This information is passed into the container as environment variables for the processes to use.
+
+For one run of distributed training, the user first decides the number of PaddlePaddle nodes, then uploads the pre-divided training data and configuration file onto the EFS volume, and finally creates the Kubernetes job yaml file and submits it to the Kubernetes cluster to start the training job.
+
+#### Create PaddlePaddle Node
+
+After Kubernetes master gets the request, it will parse the yaml file and create several pods (defined by PaddlePaddle's node number), Kubernetes will allocate these pods onto cluster's node. A pod represents a PaddlePaddle node, when pod is successfully allocated onto one physical/virtual machine, Kubernetes will startup the container in the pod, and this container will use the environment variables in yaml file and start up `paddle pserver` and `paddle trainer` processes.
+
+
+#### Start up Training
+
+After the container gets started, it starts up the distributed training by using scripts. The `paddle train` process needs to know the other nodes' IP addresses and its own trainer_id. Since PaddlePaddle currently doesn't have the ability to do service discovery, in the start up script each node will use the job pod's name to query all the pod info from the Kubernetes apiserver (the apiserver's endpoint is an environment variable in the container by default).
+
+With pod information, we can assign each pod a unique trainer_id. Here we sort all the pods by pod's ip, and assign the index to each PaddlePaddle node as its trainer_id. The workflow of starting up the script is as follows:
+
+1. Query the api server to get pod information, and assign the trainer_id by sorting the ip.
+1. Copy the training data from EFS sharing volume into container.
+1. Parse the `paddle pserver` and `paddle trainer` startup parameters from environment variables, and then start up the processes.
+1. PaddlePaddle will automatically write the result onto the PaddlePaddle node with trainer_id:0, we set the output path to be the EFS volume to save the result data.
+
+
+### Start PaddlePaddle Training Demo on AWS
+
+Now we'll start a PaddlePaddle training demo on AWS, steps are as follows:
+
+1. Build PaddlePaddle Docker image.
+1. Divide the training data file and upload it onto the EFS sharing volume.
+1. Create the training job yaml file, and start up the job.
+1. Check the result after training.
+
+#### Build PaddlePaddle Docker Image
+
+The PaddlePaddle Docker image needs to provide the runtime environment for `paddle pserver` and `paddle train`, so a container using this image should have two main functions:
+
+1. Copy the training data into container.
+1. Generate the startup parameter for `paddle pserver` and `paddle train` process, and startup the training.
+
+
+The official `paddledev/paddle:cpu-latest` image already includes the PaddlePaddle binary but lacks the above functionalities, so we will add a startup script on top of this image to do the work above. The detailed Dockerfile is as follows:
+
+```
+FROM paddledev/paddle:cpu-latest
+
+MAINTAINER zjsxzong89@gmail.com
+
+COPY start.sh /root/
+COPY start_paddle.py /root/
+CMD ["bash", "-c", "/root/start.sh"]
+```
+
+At this point, we will copy our `start.sh` and `start_paddle.py` file into container, and then exec `start_paddle.py` script to start up the training, all the steps like assigning trainer_id, getting other nodes' ip are implemented in `start_paddle.py`.
+
+`start_paddle.py` will start parsing the parameters.
+
+```
+parser = argparse.ArgumentParser(prog="start_paddle.py",
+                                     description='simple tool for k8s')
+    args, train_args_list = parser.parse_known_args()
+    train_args = refine_unknown_args(train_args_list)
+    train_args_dict = dict(zip(train_args[:-1:2], train_args[1::2]))
+    podlist = getPodList()
+```
+
+It then uses the function `getPodList()` to query all the pod information for the job name through the Kubernetes API server. When all the pods are in the running status, it uses `getIdMap(podlist)` to get the trainer_id.
+
+```
+    podlist = getPodList()
+    # need to wait until all pods are running
+    while not isPodAllRunning(podlist):
+        time.sleep(10)
+        podlist = getPodList()
+    idMap = getIdMap(podlist)
+```
+
+In function `getIdMap(podlist)`, we use podlist to get the ip address for each pod and sort them, use the index as the trainer_id.
+
+```
+def getIdMap(podlist):
+    '''
+    generate tainer_id by ip
+    '''
+    ips = []
+    for pod in podlist["items"]:
+        ips.append(pod["status"]["podIP"])
+    ips.sort()
+    idMap = {}
+    for i in range(len(ips)):
+        idMap[ips[i]] = i
+    return idMap
+```
+
+After getting `idMap`, we use function `startPaddle(idMap, train_args_dict)` to generate `paddle pserver` and `paddle train` start up parameters and then start up the processes.
+
+In function `startPaddle`, the most important work is to generate `paddle pserver` and `paddle train` start up parameters. For example, `paddle train` parameter parsing, we will get parameters like `PADDLE_NIC`, `PADDLE_PORT`, `PADDLE_PORTS_NUM`, and get the `trainer_id` from `idMap`.
+
+```
+    program = 'paddle train'
+    args = " --nics=" + PADDLE_NIC
+    args += " --port=" + str(PADDLE_PORT)
+    args += " --ports_num=" + str(PADDLE_PORTS_NUM)
+    args += " --comment=" + "paddle_process_by_paddle"
+    ip_string = ""
+    for ip in idMap.keys():
+        ip_string += (ip + ",")
+    ip_string = ip_string.rstrip(",")
+    args += " --pservers=" + ip_string
+    args_ext = ""
+    for key, value in train_args_dict.items():
+        args_ext += (' --' + key + '=' + value)
+    localIP = socket.gethostbyname(socket.gethostname())
+    trainerId = idMap[localIP]
+    args += " " + args_ext + " --trainer_id=" + \
+        str(trainerId) + " --save_dir=" + JOB_PATH_OUTPUT
+```
+
+Use `docker build` to build the Docker Image:
+
+```
+docker build -t your_repo/paddle:mypaddle .
+```
+
+And then push the built image onto docker registry.
+
+```
+docker push  your_repo/paddle:mypaddle
+```
+
+#### Upload Training Data File
+
+Here we will use PaddlePaddle's official recommendation demo as the content for this training. We put the training data file into a directory named after the job name, which is located in the EFS sharing volume. The tree structure of the directory looks like:
+
+```
+efs
+└── paddle-cluster-job
+    ├── data
+    │   ├── 0
+    │   │
+    │   ├── 1
+    │   │
+    │   └── 2
+    ├── output
+    └── recommendation
+```
+
+The `paddle-cluster-job` directory is named after the job for this training. This training includes 3 PaddlePaddle nodes; we store the pre-divided data under the `paddle-cluster-job/data` directory, where directories 0, 1, 2 each correspond to one node's trainer_id. The training data is in the recommendation directory, and the training results and logs will be in the output directory.
+
+
+#### Create Kubernetes Job
+
+Kubernetes uses a yaml file to describe job details, and the command line tool is then used to create the job in the Kubernetes cluster.
+
+In yaml file, we describe the Docker image we use for this training, the node number we need to startup, the volume mounting information and all the necessary parameters we need for `paddle pserver` and `paddle train` processes.
+
+The yaml file content is as follows:
+
+```
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: paddle-cluster-job
+spec:
+  parallelism: 3
+  completions: 3
+  template:
+    metadata:
+      name: paddle-cluster-job
+    spec:
+      volumes:
+      - name: jobpath
+        hostPath:
+          path: /home/admin/efs
+      containers:
+      - name: trainer
+        image: drinkcode/paddle:k8s-job
+        command: ["bin/bash",  "-c", "/root/start.sh"]
+        env:
+        - name: JOB_NAME
+          value: paddle-cluster-job
+        - name: JOB_PATH
+          value: /home/jobpath
+        - name: JOB_NAMESPACE
+          value: default
+        - name: TRAIN_CONFIG_DIR
+          value: recommendation
+        - name: CONF_PADDLE_NIC
+          value: eth0
+        - name: CONF_PADDLE_PORT
+          value: "7164"
+        - name: CONF_PADDLE_PORTS_NUM
+          value: "2"
+        - name: CONF_PADDLE_PORTS_NUM_SPARSE
+          value: "2"
+        - name: CONF_PADDLE_GRADIENT_NUM
+          value: "3"
+        volumeMounts:
+        - name: jobpath
+          mountPath: /home/jobpath
+        ports:
+        - name: jobport
+          hostPort: 30001
+          containerPort: 30001
+      restartPolicy: Never
+
+```
+
+In the yaml file, the metadata's name is the job's name. `parallelism, completions` mean this job will simultaneously start up 3 PaddlePaddle nodes, and this job will be finished when there are 3 finished pods. For the data store volume, we declare the volume jobpath; it mounts /home/admin/efs on the host machine into the container at the path /home/jobpath. So in the container, /home/jobpath actually stores the data on the EFS sharing volume.
+
+`env` field represents container's environment variables, we pass the PaddlePaddle parameters into containers by using the `env` field.
+
+`JOB_PATH` represents the sharing volume path, `JOB_NAME` represents the job name, and `TRAIN_CONFIG_DIR` represents the training data file directory; we can use these three parameters to get the file path for this training.
+
+`CONF_PADDLE_NIC` represents `paddle pserver` process's `--nics` parameters, the NIC name.
+
+`CONF_PADDLE_PORT` represents the `paddle pserver` process's `--port` parameter, and `CONF_PADDLE_PORTS_NUM` represents the `--ports_num` parameter.
+
+`CONF_PADDLE_PORTS_NUM_SPARSE` represents the sparse updated port number, `--ports_num_for_sparse` parameter.
+
+`CONF_PADDLE_GRADIENT_NUM` represents the training node number, `--num_gradient_servers` parameter.
+
+After we create the yaml file, we can use Kubernetes command line tool to create the job onto the cluster.
+
+```
+kubectl create -f job.yaml
+```
+
+After we execute the above command, Kubernetes will create 3 pods and then pull the PaddlePaddle image, then start up the containers for training.
+
+
+
+#### Check Training Results
+
+During the training, we can see the logs and models on the EFS sharing volume; the output directory contains the training results. (Caution: the node_0, node_1, node_2 directories represent the PaddlePaddle node and trainer_id, not the Kubernetes node)
+
+```
+[root@paddle-kubernetes-node0 output]# tree -d
+.
+├── node_0
+│   ├── server.log
+│   └── train.log
+├── node_1
+│   ├── server.log
+│   └── train.log
+├── node_2
+......
+├── pass-00002
+│   ├── done
+│   ├── ___embedding_0__.w0
+│   ├── ___embedding_1__.w0
+......
+```
+
+We can always check the container training status through logs, for example:
+
+```
+[root@paddle-kubernetes-node0 node_0]# cat train.log
+I1116 09:10:17.123121    50 Util.cpp:155] commandline:
+ /usr/local/bin/../opt/paddle/bin/paddle_trainer
+    --nics=eth0 --port=7164
+    --ports_num=2 --comment=paddle_process_by_paddle
+    --pservers=192.168.129.66,192.168.223.143,192.168.129.71
+    --ports_num_for_sparse=2 --config=./trainer_config.py
+    --trainer_count=4 --num_passes=10 --use_gpu=0 
+    --log_period=50 --dot_period=10 --saving_period=1 
+    --local=0 --trainer_id=0
+    --save_dir=/home/jobpath/paddle-cluster-job/output
+I1116 09:10:17.123440    50 Util.cpp:130] Calling runInitFunctions
+I1116 09:10:17.123764    50 Util.cpp:143] Call runInitFunctions done.
+[WARNING 2016-11-16 09:10:17,227 default_decorators.py:40] please use keyword arguments in paddle config.
+[INFO 2016-11-16 09:10:17,239 networks.py:1282] The input order is [movie_id, title, genres, user_id, gender, age, occupation, rating]
+[INFO 2016-11-16 09:10:17,239 networks.py:1289] The output order is [__regression_cost_0__]
+I1116 09:10:17.392917    50 Trainer.cpp:170] trainer mode: Normal
+I1116 09:10:17.613910    50 PyDataProvider2.cpp:257] loading dataprovider dataprovider::process
+I1116 09:10:17.680917    50 PyDataProvider2.cpp:257] loading dataprovider dataprovider::process
+I1116 09:10:17.681543    50 GradientMachine.cpp:134] Initing parameters..
+I1116 09:10:18.012390    50 GradientMachine.cpp:141] Init parameters done.
+I1116 09:10:18.018641    50 ParameterClient2.cpp:122] pserver 0 192.168.129.66:7164
+I1116 09:10:18.018950    50 ParameterClient2.cpp:122] pserver 1 192.168.129.66:7165
+I1116 09:10:18.019069    50 ParameterClient2.cpp:122] pserver 2 192.168.223.143:7164
+I1116 09:10:18.019492    50 ParameterClient2.cpp:122] pserver 3 192.168.223.143:7165
+I1116 09:10:18.019716    50 ParameterClient2.cpp:122] pserver 4 192.168.129.71:7164
+I1116 09:10:18.019836    50 ParameterClient2.cpp:122] pserver 5 192.168.129.71:7165
+```
+
+It'll take around 8 hours to finish this PaddlePaddle recommendation training demo on three 2-core, 8 GB EC2 machines (m3.large).
+
+
+### Kubernetes Cluster Tear Down
+
+
+If you want to tear down the whole Kubernetes cluster, make sure to *delete* the EFS volume first (otherwise, you will get stuck on the following steps), and then use the following command:
+
+```
+kube-aws destroy
+```
+It's an async call, it might take 5 min to tear down the whole cluster.
+
+If you created any Kubernetes Services of type LoadBalancer, you must delete these first, as the CloudFormation cannot be fully destroyed if any externally-managed resources still exist.
+
+
+
+## For Experts with Kubernetes and AWS
+
+Sometimes we might need to create or manage the cluster on AWS manually with limited privileges, so here we will explain more on what’s going on with the Kubernetes setup script.
+
+### Some Presumptions
+
+* Instances run on CoreOS, using the official AMI.
+* Kubernetes nodes use instance storage; no EBS volume gets mounted. Etcd runs on an additional node.
+* For networking, we use Flannel network at this moment, we will use Calico solution later on.
+* When you create a service with Type=LoadBalancer, Kubernetes will create an ELB, and create a security group for the ELB.
diff --git a/doc_cn/build_and_install/paddle_on_kubernetes.md b/doc/howto/usage/k8s/k8s_cn.md
similarity index 99%
rename from doc_cn/build_and_install/paddle_on_kubernetes.md
rename to doc/howto/usage/k8s/k8s_cn.md
index f8c9f19a9f..ab07cb9cd5 100644
--- a/doc_cn/build_and_install/paddle_on_kubernetes.md
+++ b/doc/howto/usage/k8s/k8s_cn.md
@@ -1,4 +1,4 @@
-# Paddle On Kubernetes：单机训练
+# Kubernetes单机训练
 
 在这篇文档里，我们介绍如何在 Kubernetes 集群上启动一个单机使用CPU的Paddle训练作业。在下一篇中，我们将介绍如何启动分布式训练作业。
 
diff --git a/doc_cn/cluster/k8s/distributed_training_on_kubernetes.md b/doc/howto/usage/k8s/k8s_distributed_cn.md
similarity index 95%
rename from doc_cn/cluster/k8s/distributed_training_on_kubernetes.md
rename to doc/howto/usage/k8s/k8s_distributed_cn.md
index 64f8fd4b43..2063b98ca8 100644
--- a/doc_cn/cluster/k8s/distributed_training_on_kubernetes.md
+++ b/doc/howto/usage/k8s/k8s_distributed_cn.md
@@ -1,5 +1,4 @@
-
-# PaddlePaddle on Kubernetes：分布式训练
+# Kubernetes分布式训练
 
 前一篇文章介绍了如何在Kubernetes集群上启动一个单机PaddlePaddle训练作业 (Job)。在这篇文章里，我们介绍如何在Kubernetes集群上进行分布式PaddlePaddle训练作业。关于PaddlePaddle的分布式训练，文章 [Cluster Training](https://github.com/baidu/Paddle/blob/develop/doc/cluster/opensource/cluster_train.md)介绍了一种通过SSH远程分发任务，进行分布式训练的方法，与此不同的是，本文将介绍在Kubernetes容器管理平台上快速构建PaddlePaddle容器集群，进行分布式训练的方案。
 
@@ -23,7 +22,7 @@
 
 首先，我们需要拥有一个Kubernetes集群，在这个集群中所有node与pod都可以互相通信。关于Kubernetes集群搭建，可以参考[官方文档](http://kubernetes.io/docs/getting-started-guides/kubeadm/)，在以后的文章中我们也会介绍AWS上搭建的方案。本文假设大家能找到几台物理机，并且可以按照官方文档在上面部署Kubernetes。在本文的环境中，Kubernetes集群中所有node都挂载了一个[MFS](http://moosefs.org/)（Moose filesystem，一种分布式文件系统）共享目录，我们通过这个目录来存放训练文件与最终输出的模型。关于MFS的安装部署，可以参考[MooseFS documentation](https://moosefs.com/documentation.html)。在训练之前，用户将配置与训练数据切分好放在MFS目录中，训练时，程序从此目录拷贝文件到容器内进行训练，将结果保存到此目录里。整体的结构图如下：
 
-![paddle on kubernetes结构图](k8s-paddle-arch.png)
+![paddle on kubernetes结构图](src/k8s-paddle-arch.png)
 
 上图描述了一个3节点的分布式训练场景，Kubernetes集群的每个node上都挂载了一个MFS目录，这个目录可以通过volume的形式挂载到容器中。Kubernetes为这次训练创建了3个pod并且调度到了3个node上运行，每个pod包含一个PaddlePaddle容器。在容器创建后，会启动pserver与trainer进程，读取volume中的数据进行这次分布式训练。
 
@@ -83,7 +82,7 @@ COPY start_paddle.py /root/
 CMD ["bash"," -c","/root/start.sh"]
 ```
 
-[`start.sh`](start.sh)文件拷贝训练文件到容器内，然后执行[`start_paddle.py`](start_paddle.py)脚本启动训练，前文提到的获取其他节点IP地址，分配`trainer_id`等都在`start_paddle.py`脚本中完成。
+[start.sh](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/usage/k8s/src/start.sh)文件拷贝训练文件到容器内，然后执行[start_paddle.py](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/usage/k8s/src/start_paddle.py)脚本启动训练，前文提到的获取其他节点IP地址，分配`trainer_id`等都在`start_paddle.py`脚本中完成。
 
 `start_paddle.py`脚本开始时，会先进行参数的初始化与解析。
 
@@ -160,6 +159,8 @@ docker build -t your_repo/paddle:mypaddle .
 docker push  your_repo/paddle:mypaddle
 ```
 
+注意上述命令中`your_repo`表示读者所使用的Docker镜像仓库地址，读者需要替换成自己使用的仓库地址。下文使用`your_repo/paddle:mypaddle`这个地址来表示此步骤所构建出的镜像。
+
 ### 上传训练文件
 
 本文使用PaddlePaddle官方的[recommendation demo](http://www.paddlepaddle.org/doc/demo/index.html#recommendation)作为这次训练的内容，我们将训练文件与数据放在一个job name命名的目录中，上传到MFS共享存储。完成后MFS上的文件内容大致如下：
@@ -245,6 +246,8 @@ spec:
 
 `CONF_PADDLE_GRADIENT_NUM`表示训练节点数量，即`--num_gradient_servers`参数
 
+这些参数的具体描述，读者可以查看[这里](http://www.paddlepaddle.org/doc/ui/cmd_argument/detail_introduction.html#parameter-server-and-distributed-communication)。
+
 编写完YAML文件后，可以使用Kubernetes的命令行工具创建job。
 
 ```bash
diff --git a/doc/howto/usage/k8s/k8s_en.md b/doc/howto/usage/k8s/k8s_en.md
new file mode 100644
index 0000000000..0c3ab05b70
--- /dev/null
+++ b/doc/howto/usage/k8s/k8s_en.md
@@ -0,0 +1,201 @@
+# Paddle On Kubernetes
+
+>In this article, we will introduce how to run Paddle training job on single CPU machine using Kubernetes. In next article, we will introduce how to run Paddle training job on distributed cluster.
+
+## Build Docker Image
+
+In distributed Kubernetes cluster, we will use Ceph or other shared storage system for storing training related data so that all processes in Paddle training can retrieve data from Ceph. In this example, we will only demo training job on single machine. In order to simplify the requirement of the environment, we will directly put training data into Paddle's Docker Image, so we need to create a Paddle Docker image that already includes the training data.
+
+Paddle's [Quick Start Tutorial](http://www.paddlepaddle.org/doc/demo/quick_start/index_en.html) introduces how to download and train data by using script from Paddle's source code.
+And `paddledev/paddle:cpu-demo-latest` image has the Paddle source code and demo. (Caution: Default Paddle image `paddledev/paddle:cpu-latest` doesn't include the source code, Paddle's different versions of image can be referred here: [Docker installation guide](http://www.paddlepaddle.org/doc/build/docker_install.html)), so we run this container and download the training data, and then commit the whole container to be a new Docker image.
+  
+### Run Docker Container
+
+```
+$ docker run --name quick_start_data -it paddledev/paddle:cpu-demo-latest
+```
+
+### Download Training Data
+
+Go into the `/root/paddle/demo/quick_start/data` directory and use `get_data.sh` to download the training data.
+Then go into the `/root/paddle/demo/quick_start` directory and use `preprocess.sh` to pre-process the training data.
+
+```
+$ root@fbd1f2bb71f4:~/paddle/demo/quick_start/data# ./get_data.sh
+
+Downloading Amazon Electronics reviews data...
+--2016-10-31 01:33:43--  http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Electronics_5.json.gz
+Resolving snap.stanford.edu (snap.stanford.edu)... 171.64.75.80
+Connecting to snap.stanford.edu (snap.stanford.edu)|171.64.75.80|:80... connected.
+HTTP request sent, awaiting response... 200 OK
+Length: 495854086 (473M) [application/x-gzip]
+Saving to: 'reviews_Electronics_5.json.gz'
+
+ 10% [=======>                                         ] 874,279     64.7KB/s  eta 2h 13m
+
+```
+
+### Modify Startup Script
+
+After downloading the data, modify the `/root/paddle/demo/quick_start/train.sh` file so that its contents are as follows (one more cd cmd):
+```
+set -e
+cd /root/paddle/demo/quick_start
+cfg=trainer_config.lr.py
+#cfg=trainer_config.emb.py
+#cfg=trainer_config.cnn.py
+#cfg=trainer_config.lstm.py
+#cfg=trainer_config.bidi-lstm.py
+#cfg=trainer_config.db-lstm.py
+paddle train \
+  --config=$cfg \
+  --save_dir=./output \
+  --trainer_count=4 \
+  --log_period=20 \
+  --num_passes=15 \
+  --use_gpu=false \
+  --show_parameter_stats_period=100 \
+  --test_all_data_in_one_period=1 \
+  2>&1 | tee 'train.log'
+```
+
+### Commit Docker Image
+
+```
+$ docker commit quick_start_data mypaddle/paddle:quickstart
+```
+
+## Use Kubernetes For Training
+
+>We will use a Kubernetes job for the training process. The following steps show how to do the training with Kubernetes.
+
+### Create Yaml Files
+
+The output in the container will be destroyed when the job finishes (the container stops running), so we need to mount a volume out to the local disk when creating the container to store the training result. Using our previously created image, we can create a [Kubernetes Job](http://kubernetes.io/docs/user-guide/jobs/#what-is-a-job); the yaml contents are as follows:
+
+```
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: quickstart
+spec:
+  parallelism: 1
+  completions: 1
+  template:
+    metadata:
+      name: quickstart
+    spec:
+      volumes:
+      - name: output
+        hostPath: 
+          path: /home/work/paddle_output     
+      containers:
+      - name: pi
+        image: mypaddle/paddle:quickstart
+        command: ["bin/bash",  "-c", "/root/paddle/demo/quick_start/train.sh"]
+        volumeMounts:
+        - name: output
+          mountPath: /root/paddle/demo/quick_start/output
+      restartPolicy: Never
+```
+
+### Start Paddle Job
+
+Using the above yaml file to start the Kubernetes job.
+
+```
+$ kubectl  create -f paddle.yaml
+```
+
+Get the detailed status of the job:
+
+```
+$ kubectl  get job
+NAME         DESIRED   SUCCESSFUL   AGE
+quickstart   1         0            58s
+
+$ kubectl  describe job quickstart
+Name:		quickstart
+Namespace:	default
+Image(s):	registry.baidu.com/public/paddle:cpu-demo-latest
+Selector:	controller-uid=f120da72-9f18-11e6-b363-448a5b355b84
+Parallelism:	1
+Completions:	1
+Start Time:	Mon, 31 Oct 2016 11:20:16 +0800
+Labels:		controller-uid=f120da72-9f18-11e6-b363-448a5b355b84,job-name=quickstart
+Pods Statuses:	0 Running / 1 Succeeded / 0 Failed
+Volumes:
+  output:
+    Type:	HostPath (bare host directory volume)
+    Path:	/home/work/paddle_output
+Events:
+  FirstSeen	LastSeen	Count	From			SubobjectPath	Type		Reason			Message
+  ---------	--------	-----	----			-------------	--------	------			-------
+  1m		1m		1	{job-controller }			Normal		SuccessfulCreate	Created pod: quickstart-fa0wx
+```
+
+### Get Training Result
+
+We can use kubectl command to take a look at the status of related pod.
+
+```
+$ kubectl  describe pod quickstart-fa0wx
+Name:		quickstart-fa0wx
+Namespace:	default
+Node:		paddle-demo-let02/10.206.202.44
+Start Time:	Mon, 31 Oct 2016 11:20:17 +0800
+Labels:		controller-uid=f120da72-9f18-11e6-b363-448a5b355b84,job-name=quickstart
+Status:		Succeeded
+IP:		10.0.0.9
+Controllers:	Job/quickstart
+Containers:
+  quickstart:
+    Container ID:	docker://b8561f5c79193550d64fa47418a9e67ebdd71546186e840f88de5026b8097465
+    Image:		registry.baidu.com/public/paddle:cpu-demo-latest
+    Image ID:		docker://18e457ce3d362ff5f3febf8e7f85ffec852f70f3b629add10aed84f930a68750
+    Port:
+    Command:
+      bin/bash
+      -c
+      /root/paddle/demo/quick_start/train.sh
+    QoS Tier:
+      cpu:		BestEffort
+      memory:		BestEffort
+    State:		Terminated
+      Reason:		Completed
+      Exit Code:	0
+      Started:		Mon, 31 Oct 2016 11:20:20 +0800
+      Finished:		Mon, 31 Oct 2016 11:21:46 +0800
+    Ready:		False
+    Restart Count:	0
+    Environment Variables:
+Conditions:
+  Type		Status
+  Ready 	False
+Volumes:
+  output:
+    Type:	HostPath (bare host directory volume)
+    Path:	/home/work/paddle_output
+```
+
+We can also ssh to Kubernetes node to take a look at the training result.
+
+```
+[root@paddle-demo-let02 paddle_output]# ll
+total 60
+drwxr-xr-x 2 root root 4096 Oct 31 11:20 pass-00000
+drwxr-xr-x 2 root root 4096 Oct 31 11:20 pass-00001
+drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00002
+drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00003
+drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00004
+drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00005
+drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00006
+drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00007
+drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00008
+drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00009
+drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00010
+drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00011
+drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00012
+drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00013
+drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00014
+```
diff --git a/doc_cn/cluster/k8s/Dockerfile b/doc/howto/usage/k8s/src/Dockerfile
similarity index 100%
rename from doc_cn/cluster/k8s/Dockerfile
rename to doc/howto/usage/k8s/src/Dockerfile
diff --git a/doc/howto/usage/k8s/src/add_security_group.png b/doc/howto/usage/k8s/src/add_security_group.png
new file mode 100644
index 0000000000..50eed4c657
Binary files /dev/null and b/doc/howto/usage/k8s/src/add_security_group.png differ
diff --git a/doc/howto/usage/k8s/src/create_efs.png b/doc/howto/usage/k8s/src/create_efs.png
new file mode 100644
index 0000000000..f4d448d151
Binary files /dev/null and b/doc/howto/usage/k8s/src/create_efs.png differ
diff --git a/doc/howto/usage/k8s/src/efs_mount.png b/doc/howto/usage/k8s/src/efs_mount.png
new file mode 100644
index 0000000000..0f9e3cab98
Binary files /dev/null and b/doc/howto/usage/k8s/src/efs_mount.png differ
diff --git a/doc_cn/cluster/k8s/job.yaml b/doc/howto/usage/k8s/src/job.yaml
similarity index 100%
rename from doc_cn/cluster/k8s/job.yaml
rename to doc/howto/usage/k8s/src/job.yaml
diff --git a/doc_cn/cluster/k8s/k8s-paddle-arch.png b/doc/howto/usage/k8s/src/k8s-paddle-arch.png
similarity index 100%
rename from doc_cn/cluster/k8s/k8s-paddle-arch.png
rename to doc/howto/usage/k8s/src/k8s-paddle-arch.png
diff --git a/doc/howto/usage/k8s/src/managed_policy.png b/doc/howto/usage/k8s/src/managed_policy.png
new file mode 100644
index 0000000000..c7ecda555b
Binary files /dev/null and b/doc/howto/usage/k8s/src/managed_policy.png differ
diff --git a/doc_cn/cluster/k8s/start.sh b/doc/howto/usage/k8s/src/start.sh
similarity index 100%
rename from doc_cn/cluster/k8s/start.sh
rename to doc/howto/usage/k8s/src/start.sh
diff --git a/doc_cn/cluster/k8s/start_paddle.py b/doc/howto/usage/k8s/src/start_paddle.py
similarity index 100%
rename from doc_cn/cluster/k8s/start_paddle.py
rename to doc/howto/usage/k8s/src/start_paddle.py
diff --git a/doc/index_cn.rst b/doc/index_cn.rst
new file mode 100644
index 0000000000..460fedb565
--- /dev/null
+++ b/doc/index_cn.rst
@@ -0,0 +1,11 @@
+PaddlePaddle 文档
+======================
+
+..  toctree::
+  :maxdepth: 1
+
+  getstarted/index_cn.rst
+  tutorials/index_cn.md
+  howto/index_cn.rst
+  api/index_cn.rst
+  faq/index_cn.rst
diff --git a/doc/index.rst b/doc/index_en.rst
similarity index 88%
rename from doc/index.rst
rename to doc/index_en.rst
index c107239438..1d9cca7de7 100644
--- a/doc/index.rst
+++ b/doc/index_en.rst
@@ -8,4 +8,5 @@ PaddlePaddle Documentation
   tutorials/index_en.md
   howto/index_en.rst
   api/index_en.rst
-  about/index_en.rst 
+  about/index_en.rst
+ 
\ No newline at end of file
diff --git a/doc_cn/conf.py.in b/doc/templates/conf.py.cn.in
similarity index 98%
rename from doc_cn/conf.py.in
rename to doc/templates/conf.py.cn.in
index 4f3afb814f..418d718fbd 100644
--- a/doc_cn/conf.py.in
+++ b/doc/templates/conf.py.cn.in
@@ -62,7 +62,7 @@ source_suffix = ['.rst', '.md', '.Rmd']
 source_encoding = 'utf-8'
 
 # The master toctree document.
-master_doc = 'index'
+master_doc = 'index_cn'
 
 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.
@@ -79,7 +79,7 @@ language = 'zh_CN'
 
 # List of patterns, relative to source directory, that match files and
 # directories to ignore when looking for source files.
-exclude_patterns = ['_build']
+exclude_patterns = ['_build', '**/*_en*', '*_en*']
 
 # The reST default role (used for this markup: `text`) to use for all
 # documents.
diff --git a/doc/conf.py.in b/doc/templates/conf.py.en.in
similarity index 97%
rename from doc/conf.py.in
rename to doc/templates/conf.py.en.in
index 01d156e887..e96c25cb75 100644
--- a/doc/conf.py.in
+++ b/doc/templates/conf.py.en.in
@@ -63,7 +63,7 @@ source_suffix = ['.rst', '.md', '.Rmd']
 source_encoding = 'utf-8'
 
 # The master toctree document.
-master_doc = 'index'
+master_doc = 'index_en'
 
 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.
@@ -80,7 +80,7 @@ language = None
 
 # List of patterns, relative to source directory, that match files and
 # directories to ignore when looking for source files.
-exclude_patterns = ['_build']
+exclude_patterns = ['_build', '**/*_cn*', '*_cn*']
 
 # The reST default role (used for this markup: `text`) to use for all
 # documents.
@@ -144,6 +144,6 @@ def setup(app):
     # no c++ API for now
     app.add_config_value('recommonmark_config', {
             'url_resolver': lambda url: github_doc_root + url,
-	    'enable_eval_rst': True,
+        'enable_eval_rst': True,
             }, True)
     app.add_transform(AutoStructify)
diff --git a/doc/tutorials/embedding_model/index_cn.md b/doc/tutorials/embedding_model/index_cn.md
new file mode 100644
index 0000000000..fe800308d8
--- /dev/null
+++ b/doc/tutorials/embedding_model/index_cn.md
@@ -0,0 +1,138 @@
+# 中文词向量模型的使用 #
+----------
+本文档介绍如何在PaddlePaddle平台上,使用预训练的标准格式词向量模型。
+
+在此感谢 @lipeng 提出的代码需求，并给出的相关模型格式的定义。
+
+## 介绍 ###
+### 中文字典 ###
+我们的字典使用内部的分词工具对百度知道和百度百科的语料进行分词后产生。分词风格如下： "《红楼梦》"将被分为 "《"，"红楼梦"，"》"，和 "《红楼梦》"。字典采用UTF8编码，输出有2列：词本身和词频。字典共包含 3206325个词和3个特殊标记：
+  - `<s>`: 分词序列的开始
+  - `<e>`: 分词序列的结束
+  - `<unk>`: 未知词
+
+### 中文词向量的预训练模型 ###
+遵循文章 [A Neural Probabilistic Language Model](http://www.jmlr.org/papers/volume3/bengio03a/bengio03a.pdf)中介绍的方法，模型采用 n-gram 语言模型，结构如下图：6元上下文作为输入层->全连接层->softmax层 。对应于字典，我们预训练得到4种不同维度的词向量，分别为：32维、64维、128维和256维。
+<center>![](./neural-n-gram-model.png)</center>
+<center>Figure 1. neural-n-gram-model</center>
+
+### 下载和数据抽取 ###
+运行以下的命令下载和获取我们的字典和预训练模型：
+
+    cd $PADDLE_ROOT/demo/model_zoo/embedding
+    ./pre_DictAndModel.sh
+
+## 中文短语改写的例子 ##
+以下示范如何使用预训练的中文字典和词向量进行短语改写。
+
+### 数据的准备和预处理 ###
+首先，运行以下的命令下载数据集。该数据集（utf8编码）包含20个训练样例，5个测试样例和2个生成式样例。
+
+    cd $PADDLE_ROOT/demo/seqToseq/data
+    ./paraphrase_data.sh
+
+第二步，将数据处理成规范格式，在训练数集上训练生成词向量字典（数据将保存在 `$PADDLE_SOURCE_ROOT/demo/seqToseq/data/pre-paraphrase`）:
+
+    cd $PADDLE_ROOT/demo/seqToseq/
+    python preprocess.py -i data/paraphrase [--mergeDict]
+
+- 其中，如果使用`--mergeDict`选项，源语言短语和目标语言短语的字典将被合并（源语言和目标语言共享相同的编码字典）。本实例中，源语言和目标语言都是相同的语言，因此可以使用该选项。
+
+
+### 使用用户指定的词向量字典 ###
+使用如下命令，从预训练模型中，根据用户指定的字典，抽取对应的词向量构成新的词表:
+    cd $PADDLE_ROOT/demo/model_zoo/embedding
+    python extract_para.py --preModel PREMODEL --preDict PREDICT --usrModel USRMODEL --usrDict USRDICT -d DIM
+
+- `--preModel PREMODEL`: 预训练词向量字典模型的路径
+- `--preDict PREDICT`:  预训练模型使用的字典的路径
+- `--usrModel USRMODEL`: 抽取出的新词表的保存路径
+- `--usrDict USRDICT`: 用户指定新的字典的路径，用于构成新的词表
+- `-d DIM`: 参数（词向量）的维度
+
+此处，你也可以简单的运行以下的命令：
+
+    cd $PADDLE_ROOT/demo/seqToseq/data/
+    ./paraphrase_model.sh
+
+运行成功以后，你将会看到以下的模型结构：
+
+    paraphrase_model
+    |--- _source_language_embedding
+    |--- _target_language_embedding
+
+### 在PaddlePaddle平台训练模型 ###
+首先，配置模型文件，配置如下（可以参考保存在 `demo/seqToseq/paraphrase/train.conf`的配置）:
+
+    from seqToseq_net import *
+    is_generating = False
+
+    ################## Data Definition #####################
+    train_conf = seq_to_seq_data(data_dir = "./data/pre-paraphrase",
+                                 job_mode = job_mode)
+
+    ############## Algorithm Configuration ##################
+    settings(
+          learning_method = AdamOptimizer(),
+          batch_size = 50,
+          learning_rate = 5e-4)
+
+    ################# Network configure #####################
+    gru_encoder_decoder(train_conf, is_generating, word_vector_dim = 32)
+
+这个配置与`demo/seqToseq/translation/train.conf` 基本相同
+
+然后，使用以下命令进行模型训练:
+
+    cd $PADDLE_SOURCE_ROOT/demo/seqToseq/paraphrase
+    ./train.sh
+
+其中，`train.sh` 与`demo/seqToseq/translation/train.sh` 基本相同，只有2个配置不一样:
+
+- `--init_model_path`: 初始化模型的路径配置为`data/paraphrase_model`
+- `--load_missing_parameter_strategy`：如果参数模型文件缺失，除词向量模型外的参数将使用正态分布随机初始化
+
+如果用户想要了解详细的数据集的格式、模型的结构和训练过程，请查看 [Text generation Tutorial](../text_generation/index_cn.md).
+
+## 可选功能 ##
+###  观测词向量
+PaddlePaddle 平台为想观测词向量的用户提供了将二进制词向量模型转换为文本模型的功能:
+
+    cd $PADDLE_ROOT/demo/model_zoo/embedding
+    python paraconvert.py --b2t -i INPUT -o OUTPUT -d DIM
+
+- `-i INPUT`: 输入的（二进制）词向量模型名称
+- `-o OUTPUT`: 输出的文本模型名称
+- `-d DIM`: （词向量）参数维度
+
+运行完以上命令，用户可以在输出的文本模型中看到:
+
+    0,4,32156096
+    -0.7845433,1.1937413,-0.1704215,0.4154715,0.9566584,-0.5558153,-0.2503305, ......
+    0.0000909,0.0009465,-0.0008813,-0.0008428,0.0007879,0.0000183,0.0001984, ......
+    ......
+
+- 其中，第一行是`PaddlePaddle` 输出文件的格式说明，包含3个属性：
+  - `PaddlePaddle`的版本号，本例中为0
+  - 浮点数占用的字节数，本例中为4
+  - 总计的参数个数，本例中为32,156,096
+- 其余行是（词向量）参数行（假设词向量维度为32）
+  - 每行打印32个参数以','分隔
+  - 共有32,156,096/32 = 1,004,877行，也就是说，模型共包含1,004,877个被向量化的词
+
+### 词向量模型的修正
+`PaddlePaddle` 为想修正词向量模型的用户提供了将文本词向量模型转换为二进制模型的命令:
+
+    cd $PADDLE_ROOT/demo/model_zoo/embedding
+    python paraconvert.py --t2b -i INPUT -o OUTPUT
+
+- `-i INPUT`: 输入的文本词向量模型名称
+- `-o OUTPUT`: 输出的二进制词向量模型名称
+
+请注意，输入的文本格式如下:
+
+    -0.7845433,1.1937413,-0.1704215,0.4154715,0.9566584,-0.5558153,-0.2503305, ......
+    0.0000909,0.0009465,-0.0008813,-0.0008428,0.0007879,0.0000183,0.0001984, ......
+    ......
+- 输入文本中没有头部（格式说明）行
+- （输入文本）每行存储一个词，以逗号','分隔
diff --git a/doc/tutorials/gan/gan.png b/doc/tutorials/gan/gan.png
new file mode 100644
index 0000000000..0eafd7cb49
Binary files /dev/null and b/doc/tutorials/gan/gan.png differ
diff --git a/doc/tutorials/gan/index_en.md b/doc/tutorials/gan/index_en.md
new file mode 100644
index 0000000000..ac9ed37b22
--- /dev/null
+++ b/doc/tutorials/gan/index_en.md
@@ -0,0 +1,137 @@
+# Generative Adversarial Networks (GAN) 
+
+This demo implements GAN training described in the original [GAN paper](https://arxiv.org/abs/1406.2661) and deep convolutional generative adversarial networks [DCGAN paper](https://arxiv.org/abs/1511.06434).
+
+The high-level structure of a GAN is shown in Figure 1 below. It is composed of two major parts: a generator and a discriminator, both of which are based on neural networks. The generator takes in some kind of noise with a known distribution and transforms it into an image. The discriminator takes in an image and determines whether it is artificially generated by the generator or a real image. The generator and the discriminator are thus in a competitive game in which the generator tries to generate images that look as real as possible in order to fool the discriminator, while the discriminator tries to distinguish between real and fake images.
+
+<center>![](./gan.png)</center>
+<p align="center">
+    Figure 1. GAN-Model-Structure
+    <a href="https://ishmaelbelghazi.github.io/ALI/">figure credit</a>
+</p>
+
+The generator and discriminator take turns being trained using SGD. The objective function of the generator is to have its generated images classified as real by the discriminator, and the objective function of the discriminator is to correctly classify real and fake images. When the GAN model is trained to converge to the equilibrium state, the generator will transform the given noise distribution into the distribution of real images, and the discriminator will not be able to distinguish between real and fake images at all.
+
+## Implementation of GAN Model Structure
+Since a GAN model involves multiple neural networks, it requires the use of the Paddle Python API. The code walk-through below can therefore also partially serve as an introduction to the usage of the Paddle Python API.
+
+There are three networks defined in gan_conf.py, namely **generator_training**, **discriminator_training** and **generator**. The relationship to the model structure we defined above is that **discriminator_training** is the discriminator, **generator** is the generator, and **generator_training** combines the generator and the discriminator, since training the generator requires the discriminator to provide the loss function. This relationship is described in the following code:
+```python
+if is_generator_training:
+    noise = data_layer(name="noise", size=noise_dim)
+    sample = generator(noise)
+
+if is_discriminator_training:
+    sample = data_layer(name="sample", size=sample_dim)
+
+if is_generator_training or is_discriminator_training:
+    label = data_layer(name="label", size=1)
+    prob = discriminator(sample)
+    cost = cross_entropy(input=prob, label=label)
+    classification_error_evaluator(
+        input=prob, label=label, name=mode + '_error')
+    outputs(cost)
+
+if is_generator:
+    noise = data_layer(name="noise", size=noise_dim)
+    outputs(generator(noise))
+```
+
+In order to train the networks defined in gan_conf.py, one first needs to initialize a Paddle environment, parse the config, create GradientMachine from the config and create trainer from GradientMachine as done in the code chunk below:
+```python
+import py_paddle.swig_paddle as api
+# init paddle environment
+api.initPaddle('--use_gpu=' + use_gpu, '--dot_period=10',
+               '--log_period=100', '--gpu_id=' + args.gpu_id,
+               '--save_dir=' + "./%s_params/" % data_source)
+
+# Parse config
+gen_conf = parse_config(conf, "mode=generator_training,data=" + data_source)
+dis_conf = parse_config(conf, "mode=discriminator_training,data=" + data_source)
+generator_conf = parse_config(conf, "mode=generator,data=" + data_source)
+
+# Create GradientMachine
+dis_training_machine = api.GradientMachine.createFromConfigProto(
+dis_conf.model_config)
+gen_training_machine = api.GradientMachine.createFromConfigProto(
+gen_conf.model_config)
+generator_machine = api.GradientMachine.createFromConfigProto(
+generator_conf.model_config)
+
+# Create trainer
+dis_trainer = api.Trainer.create(dis_conf, dis_training_machine)
+gen_trainer = api.Trainer.create(gen_conf, gen_training_machine)
+```
+
+In order to balance the strength between generator and discriminator, we schedule to train whichever one is performing worse by comparing their loss function value. The loss function value can be calculated by a forward pass through the GradientMachine.
+```python
+def get_training_loss(training_machine, inputs):
+    outputs = api.Arguments.createArguments(0)
+    training_machine.forward(inputs, outputs, api.PASS_TEST)
+    loss = outputs.getSlotValue(0).copyToNumpyMat()
+    return numpy.mean(loss)
+```
+
+After training one network, one needs to sync the new parameters to the other networks. The code below demonstrates one example of such use case:
+```python
+# Train the gen_training
+gen_trainer.trainOneDataBatch(batch_size, data_batch_gen)
+
+# Copy the parameters from gen_training to dis_training and generator
+copy_shared_parameters(gen_training_machine,
+dis_training_machine)
+copy_shared_parameters(gen_training_machine, generator_machine)
+```
+
+
+## A Toy Example 
+With the infrastructure explained above, we can now walk you through a toy example of generating a two-dimensional uniform distribution from 10-dimensional Gaussian noise.
+
+The Gaussian noises are generated using the code below:
+```python
+def get_noise(batch_size, noise_dim):
+    return numpy.random.normal(size=(batch_size, noise_dim)).astype('float32')
+```
+
+The real samples (2-D uniform) are generated using the code below:
+```python
+# synthesize 2-D uniform data in gan_trainer.py:114
+def load_uniform_data():
+    data = numpy.random.rand(1000000, 2).astype('float32')
+    return data
+```
+
+The generator and discriminator network are built using fully-connected layer and batch_norm layer, and are defined in gan_conf.py. 
+
+To train the GAN model, one can use the command below. The flag -d specifies the training data (cifar, mnist or uniform) and flag --useGpu specifies whether to use gpu for training (0 is cpu, 1 is gpu).  
+```bash
+$python gan_trainer.py -d uniform --useGpu 1
+```
+The generated samples can be found in ./uniform_samples/ and one example is shown below as Figure 2. One can see that it roughly recovers the 2D uniform distribution. 
+
+<center>![](./uniform_sample.png)</center>
+<p align="center">
+    Figure 2. Uniform Sample
+</p>
+
+## MNIST Example
+### Data preparation
+To download the MNIST data, one can use the following commands:
+```bash
+$cd data/
+$./get_mnist_data.sh
+```
+
+### Model description
+Following the DCGAN paper (https://arxiv.org/abs/1511.06434), we use convolution/convolution-transpose layers in the discriminator/generator networks to better deal with images. The details of the network structures are defined in gan_conf_image.py.
+
+### Training the model
+To train the GAN model on mnist data, one can use the following command:
+```bash
+$python gan_trainer.py -d mnist --useGpu 1
+```
+The generated sample images can be found at ./mnist_samples/ and one example is shown below as Figure 3. 
+<center>![](./mnist_sample.png)</center>
+<p align="center">
+    Figure 3. MNIST Sample
+</p>
diff --git a/doc/tutorials/gan/mnist_sample.png b/doc/tutorials/gan/mnist_sample.png
new file mode 100644
index 0000000000..f9c7bf7ddd
Binary files /dev/null and b/doc/tutorials/gan/mnist_sample.png differ
diff --git a/doc/tutorials/gan/uniform_sample.png b/doc/tutorials/gan/uniform_sample.png
new file mode 100644
index 0000000000..e716c48e78
Binary files /dev/null and b/doc/tutorials/gan/uniform_sample.png differ
diff --git a/doc/tutorials/image_classification/index_cn.md b/doc/tutorials/image_classification/index_cn.md
new file mode 100644
index 0000000000..87f465522a
--- /dev/null
+++ b/doc/tutorials/image_classification/index_cn.md
@@ -0,0 +1,205 @@
+图像分类教程
+==========
+
+在本教程中，我们将使用CIFAR-10数据集训练一个卷积神经网络，并使用这个神经网络来对图片进行分类。如下图所示，卷积神经网络可以辨识图片中的主体，并给出分类结果。
+<center>![Image Classification](./image_classification.png)</center>
+
+## 数据准备
+首先下载CIFAR-10数据集。下面是CIFAR-10数据集的官方网址：
+
+<https://www.cs.toronto.edu/~kriz/cifar.html>
+
+我们准备了一个脚本，可以用于从官方网站上下载CIFAR-10数据集，转为jpeg文件并存入特定的目录。使用这个脚本前请确认已经安装了pillow及相关依赖模块。可以参照下面的命令进行安装：
+
+1. 安装pillow
+
+```bash
+sudo apt-get install libjpeg-dev
+pip install pillow
+```
+
+2. 下载数据集
+
+```bash
+cd demo/image_classification/data/
+sh download_cifar.sh
+```
+
+CIFAR-10数据集包含60000张32x32的彩色图片。图片分为10类，每个类包含6000张。其中50000张图片作为训练集，10000张作为测试集。
+
+下图展示了所有的图片类别，每个类别中随机抽取了10张图片。
+<center>![Image Classification](./cifar.png)</center>
+
+脚本运行完成后，我们应当会得到一个名为cifar-out的文件夹，其下子文件夹的结构如下
+
+
+```
+train
+---airplane
+---automobile
+---bird
+---cat
+---deer
+---dog
+---frog
+---horse
+---ship
+---truck
+test
+---airplane
+---automobile
+---bird
+---cat
+---deer
+---dog
+---frog
+---horse
+---ship
+---truck
+```
+
+cifar-out下包含`train`和`test`两个文件夹，其中分别包含了CIFAR-10中的训练集和测试集。这两个文件夹下各自有10个子文件夹，每个子文件夹下存储相应分类的图片。将图片按照上述结构存储好之后，我们就可以着手对分类模型进行训练了。
+
+## 预处理
+数据下载之后，还需要进行预处理，将数据转换为Paddle的格式。我们可以通过如下命令进行预处理工作：
+
+```
+cd demo/image_classification/
+sh preprocess.sh
+```
+
+其中`preprocess.sh` 调用 `./demo/image_classification/preprocess.py` 对图片进行预处理
+```sh
+export PYTHONPATH=$PYTHONPATH:../../
+data_dir=./data/cifar-out
+python preprocess.py -i $data_dir -s 32 -c 1
+```
+
+`./demo/image_classification/preprocess.py` 使用如下参数：
+
+- `-i` 或 `--input` 给出输入数据所在路径；
+- `-s` 或 `--size` 给出图片尺寸；
+- `-c` 或 `--color` 标示图片是彩色图或灰度图
+
+## 模型训练
+在开始训练之前，我们需要先创建一个模型配置文件。下面我们给出了一个配置示例。**注意**，这里列出的配置和`vgg_16_cifar.py`文件稍有差别，因为该文件可适用于预测。
+
+```python
+from paddle.trainer_config_helpers import *
+data_dir='data/cifar-out/batches/'
+meta_path=data_dir+'batches.meta'
+args = {'meta':meta_path, 'mean_img_size': 32,
+        'img_size': 32, 'num_classes': 10,
+        'use_jpeg': 1, 'color': "color"}
+define_py_data_sources2(train_list=data_dir+"train.list",
+                        test_list=data_dir+'test.list',
+                        module='image_provider',
+                        obj='processData',
+                        args=args)
+settings(
+    batch_size = 128,
+    learning_rate = 0.1 / 128.0,
+    learning_method = MomentumOptimizer(0.9),
+    regularization = L2Regularization(0.0005 * 128))
+
+img = data_layer(name='image', size=3*32*32)
+lbl = data_layer(name="label", size=10)
+# small_vgg is predefined in trainer_config_helpers.network
+predict = small_vgg(input_image=img, num_channels=3)
+outputs(classification_cost(input=predict, label=lbl))
+```
+
+在第一行中我们载入用于定义网络的函数。
+```python
+from paddle.trainer_config_helpers import *
+```
+
+之后定义的`define_py_data_sources2`使用Python数据提供器，其中 `args`将在`image_provider.py`进行使用，该文件负责产生图片数据并传递给Paddle系统
+ - `meta`: 训练集平均值。
+ - `mean_img_size`: 平均特征图的高度及宽度。
+ - `img_size`：输入图片的高度及宽度。
+ - `num_classes`：类别个数。
+ - `use_jpeg`：处理过程中数据存储格式。
+ - `color`：标示是否为彩色图片。
+ 
+ `settings`用于设置训练算法。在下面的例子中，learning rate被设置为0.1除以batch size，而weight decay则为0.0005乘以batch size。
+ 
+ ```python
+settings(
+    batch_size = 128,
+    learning_rate = 0.1 / 128.0,
+    learning_method = MomentumOptimizer(0.9),
+    regularization = L2Regularization(0.0005 * 128)
+)
+```
+
+`small_vgg`定义了网络结构。这里我们使用的是一个小的VGG网络。关于VGG卷积神经网络的描述可以参考：[http://www.robots.ox.ac.uk/~vgg/research/very_deep/](http://www.robots.ox.ac.uk/~vgg/research/very_deep/)。
+```python
+# small_vgg is predefined in trainer_config_helpers.network
+predict = small_vgg(input_image=img, num_channels=3)
+```
+配置创建完毕后，可以运行脚本train.sh来训练模型。
+
+```bash
+config=vgg_16_cifar.py
+output=./cifar_vgg_model
+log=train.log
+
+paddle train \
+--config=$config \
+--dot_period=10 \
+--log_period=100 \
+--test_all_data_in_one_period=1 \
+--use_gpu=1 \
+--save_dir=$output \
+2>&1 | tee $log
+
+python -m paddle.utils.plotcurve -i $log > plot.png
+```
+- 这里我们使用的是GPU模式进行训练。如果你没有GPU环境，可以设置`use_gpu=0`。
+- `./demo/image_classification/vgg_16_cifar.py`是网络和数据配置文件。各项参数的详细说明可以在命令行参数相关文档中找到。
+- 脚本`plotcurve.py`依赖于python的`matplotlib`模块。因此如果这个脚本运行失败，也许是因为需要安装`matplotlib`。
+在训练完成后，训练及测试误差曲线图会被`plotcurve.py`脚本保存在 `plot.png`中。下面是一个误差曲线图的示例：
+
+<center>![Training and testing curves.](./plot.png)</center>
+
+## 预测
+在训练完成后，模型及参数会被保存在路径`./cifar_vgg_model/pass-%05d`下。例如第300个pass的模型会被保存在`./cifar_vgg_model/pass-00299`。
+
+要对一个图片进行分类预测，我们可以使用`predict.sh`，该脚本将输出预测分类的标签：
+
+```
+sh predict.sh
+```
+
+predict.sh:
+```
+model=cifar_vgg_model/pass-00299/
+image=data/cifar-out/test/airplane/seaplane_s_000978.png
+use_gpu=1
+python prediction.py $model $image $use_gpu
+```
+
+## 练习
+在CUB-200数据集上使用VGG模型训练一个鸟类图片分类模型。相关的鸟类数据集可以从如下地址下载，其中包含了200种鸟类的照片（主要来自北美洲）。
+
+<http://www.vision.caltech.edu/visipedia/CUB-200.html>
+
+
+
+
+## 细节探究
+### 卷积神经网络
+卷积神经网络是一种使用卷积层的前向神经网络，很适合构建用于理解图片内容的模型。一个典型的神经网络如下图所示：
+
+![Convolutional Neural Network](./lenet.png)
+
+一个卷积神经网络包含如下层：
+
+- 卷积层：通过卷积操作从图片或特征图中提取特征
+- 池化层：使用max-pooling对特征图下采样
+- 全连接层：使输入层到隐藏层的神经元是全部连接的。
+
+卷积神经网络在图片分类上有着惊人的性能，这是因为它发掘出了图片的两类重要信息：局部关联性质和空间不变性质。通过交替使用卷积和池化处理， 卷积神经网络能够很好的表示这两类信息。
+
+关于如何定义网络中的层，以及如何在层之间进行连接，请参考Layer文档。
diff --git a/doc/tutorials/image_classification/index_en.md b/doc/tutorials/image_classification/index_en.md
index 29cfc99702..60c81a6a53 100644
--- a/doc/tutorials/image_classification/index_en.md
+++ b/doc/tutorials/image_classification/index_en.md
@@ -147,7 +147,7 @@ for classification. A description of VGG network can be found here [http://www.r
 # small_vgg is predined in trainer_config_helpers.network
 predict = small_vgg(input_image=img, num_channels=3)
 ```
-After writing the config, we can train the model by running the script train.sh. Notice that the following script assumes the you run the script in the `./demo/image_classification` folder. If you run the script in a different folder, you need to change the paths of the scripts and the configuration files accordingly.
+After writing the config, we can train the model by running the script train.sh.
 
 ```bash
 config=vgg_16_cifar.py
diff --git a/doc/tutorials/image_classification/src/cifar.png b/doc/tutorials/image_classification/src/cifar.png
new file mode 100644
index 0000000000..f54a0c5883
Binary files /dev/null and b/doc/tutorials/image_classification/src/cifar.png differ
diff --git a/doc/tutorials/image_classification/src/image_classification.png b/doc/tutorials/image_classification/src/image_classification.png
new file mode 100644
index 0000000000..14f2558050
Binary files /dev/null and b/doc/tutorials/image_classification/src/image_classification.png differ
diff --git a/doc/tutorials/image_classification/src/lenet.png b/doc/tutorials/image_classification/src/lenet.png
new file mode 100644
index 0000000000..1e6f2b32ba
Binary files /dev/null and b/doc/tutorials/image_classification/src/lenet.png differ
diff --git a/doc/tutorials/image_classification/src/plot.png b/doc/tutorials/image_classification/src/plot.png
new file mode 100644
index 0000000000..a31f99791c
Binary files /dev/null and b/doc/tutorials/image_classification/src/plot.png differ
diff --git a/doc/tutorials/imagenet_model/resnet_model_cn.md b/doc/tutorials/imagenet_model/resnet_model_cn.md
new file mode 100644
index 0000000000..82ec9d70b3
--- /dev/null
+++ b/doc/tutorials/imagenet_model/resnet_model_cn.md
@@ -0,0 +1,284 @@
+# Model Zoo - ImageNet #
+
+[ImageNet](http://www.image-net.org/) 是通用物体分类领域一个众所周知的数据库。本教程提供了一个用于ImageNet上的卷积分类网络模型。
+
+## ResNet 介绍
+
+论文 [Deep Residual Learning for Image Recognition](http://arxiv.org/abs/1512.03385) 中提出的ResNet网络结构在2015年ImageNet大规模视觉识别竞赛(ILSVRC 2015)的分类任务中赢得了第一名。他们提出残差学习的框架来简化网络的训练，所构建网络结构的深度比之前使用的网络有大幅度的提高。下图展示的是基于残差的连接方式。左图构造网络模块的方式被用于34层的网络中，而右图的瓶颈连接模块用于50层，101层和152层的网络结构中。
+
+<center>![resnet_block](./resnet_block.jpg)</center>
+<center>图 1. ResNet 网络模块</center>
+
+本教程中我们给出了三个ResNet模型，这些模型都是由原作者提供的模型<https://github.com/KaimingHe/deep-residual-networks>转换过来的。我们使用PaddlePaddle在ILSVRC的验证集共50,000幅图像上测试了模型的分类错误率，其中输入图像的颜色通道顺序为**BGR**，保持宽高比缩放到短边为256，只截取中心方形的图像区域。分类错误率和模型大小由下表给出。
+<center>
+<table border="2" cellspacing="0" cellpadding="6" rules="all" frame="border">
+<colgroup>
+<col  class="left" />
+<col  class="left" />
+<col  class="left" />
+</colgroup>
+<thead>
+<tr>
+<th scope="col" class="left">ResNet</th>
+<th scope="col" class="left">Top-1</th>
+<th scope="col" class="left">Model Size</th>
+</tr>
+</thead>
+
+<tbody>
+<tr>
+<td class="left">ResNet-50</td>
+<td class="left">24.9%</td>
+<td class="left">99M</td>
+</tr>
+<tr>
+<td class="left">ResNet-101</td>
+<td class="left">23.7%</td>
+<td class="left">173M</td>
+</tr>
+<tr>
+<td class="left">ResNet-152</td>
+<td class="left">23.2%</td>
+<td class="left">234M</td>
+</tr>
+</tbody>
+
+</table></center>
+<br>
+
+## ResNet 模型
+
+50层，101层和152层的网络配置文件可参照```demo/model_zoo/resnet/resnet.py```。你也可以通过在命令行参数中增加一个参数如```--config_args=layer_num=50```来指定网络层的数目。
+
+### 网络可视化
+
+你可以通过执行下面的命令来得到ResNet网络的结构可视化图。该脚本会生成一个dot文件，然后可以转换为图片。需要安装graphviz来转换dot文件为图片。
+
+```
+cd demo/model_zoo/resnet
+./net_diagram.sh
+```
+
+### 模型下载
+
+```
+cd demo/model_zoo/resnet
+./get_model.sh
+```
+你可以执行上述命令来下载所有的模型和均值文件，如果下载成功，这些文件将会被保存在```demo/model_zoo/resnet/model```路径下。
+
+```
+mean_meta_224  resnet_101  resnet_152  resnet_50
+```
+   * resnet_50: 50层网络模型。
+   * resnet_101: 101层网络模型。
+   * resnet_152: 152层网络模型。
+   * mean\_meta\_224: 均值图像文件，图像大小为3 x 224 x 224，颜色通道顺序为**BGR**。你也可以使用这三个值: 103.939, 116.779, 123.68。
+
+### 参数信息
+
+* **卷积层权重**
+
+  由于每个卷积层后面连接的是batch normalization层，因此该层中没有偏置(bias)参数，并且只有一个权重。
+  形状: `(Co, ky, kx, Ci)`
+   * Co: 输出特征图的通道数目
+   * ky: 滤波器核在垂直方向上的尺寸
+   * kx: 滤波器核在水平方向上的尺寸
+   * Ci: 输入特征图的通道数目
+
+  二维矩阵: (Co * ky * kx, Ci), 行优先次序存储。
+
+* **全连接层权重**
+
+  二维矩阵: (输入层尺寸, 本层尺寸), 行优先次序存储。
+
+* **[Batch Normalization](<http://arxiv.org/abs/1502.03167>) 层权重**
+
+本层有四个参数，实际上只有.w0和.wbias是需要学习的参数，另外两个分别是滑动均值和方差。在测试阶段它们将会被加载到模型中。下表展示了batch normalization层的参数。
+<center>
+<table border="2" cellspacing="0" cellpadding="6" rules="all" frame="border">
+<colgroup>
+<col  class="left" />
+<col  class="left" />
+<col  class="left" />
+</colgroup>
+<thead>
+<tr>
+<th scope="col" class="left">参数名</th>
+<th scope="col" class="left">尺寸</th>
+<th scope="col" class="left">含义</th>
+</tr>
+</thead>
+
+<tbody>
+<tr>
+<td class="left">_res2_1_branch1_bn.w0</td>
+<td class="left">256</td>
+<td class="left">gamma, 缩放参数</td>
+</tr>
+<tr>
+<td class="left">_res2_1_branch1_bn.w1</td>
+<td class="left">256</td>
+<td class="left">特征图均值</td>
+</tr>
+<tr>
+<td class="left">_res2_1_branch1_bn.w2</td>
+<td class="left">256</td>
+<td class="left">特征图方差</td>
+</tr>
+<tr>
+<td class="left">_res2_1_branch1_bn.wbias</td>
+<td class="left">256</td>
+<td class="left">beta, 偏置参数</td>
+</tr>
+</tbody>
+
+</table></center>
+<br>
+
+### 参数读取
+
+使用者可以使用下面的Python脚本来读取参数值:
+
+```
+import sys
+import numpy as np
+
+def load(file_name):
+    with open(file_name, 'rb') as f:
+        f.read(16) # skip header for float type.
+        return np.fromfile(f, dtype=np.float32)
+
+if __name__=='__main__':
+    weight = load(sys.argv[1])
+```
+
+或者直接使用下面的shell命令:
+
+```
+od -j 16 -f _res2_1_branch1_bn.w0
+```
+
+## 特征提取
+
+我们提供了C++和Python接口来提取特征。下面的例子使用了`demo/model_zoo/resnet/example`中的数据，详细地展示了整个特征提取的过程。
+
+### C++接口
+
+首先，在配置文件中的`define_py_data_sources2`里指定图像数据列表，具体请参照示例`demo/model_zoo/resnet/resnet.py`。
+
+```
+    train_list = 'train.list' if not is_test else None
+    # mean.meta is mean file of ImageNet dataset.
+    # mean.meta size : 3 x 224 x 224.
+    # If you use three mean value, set like:
+    # "mean_value:103.939,116.779,123.68;"
+    args={
+        'mean_meta': "model/mean_meta_224/mean.meta",
+        'image_size': 224, 'crop_size': 224,
+        'color': True,'swap_channel:': [2, 1, 0]}
+    define_py_data_sources2(train_list,
+                           'example/test.list',
+                           module="example.image_list_provider",
+                           obj="processData",
+                           args=args)
+```
+
+第二步，在`resnet.py`文件中指定要提取特征的网络层的名字。例如，
+
+```
+Outputs("res5_3_branch2c_conv", "res5_3_branch2c_bn")
+```
+
+第三步，在`extract_fea_c++.sh`文件中指定模型路径和输出的目录，然后执行下面的命令。
+
+```
+cd demo/model_zoo/resnet
+./extract_fea_c++.sh
+```
+
+如果执行成功，特征将会存到`fea_output/rank-00000`文件中，如下所示。同时你可以使用`load_feature.py`文件中的`load_feature_c`接口来加载该文件。
+
+```
+-0.115318 -0.108358 ... -0.087884;-1.27664 ... -1.11516 -2.59123;
+-0.126383 -0.116248 ... -0.00534909;-1.42593 ... -1.04501 -1.40769;
+```
+
+* 每行存储的是一个样本的特征。其中，第一行存的是图像`example/dog.jpg`的特征，第二行存的是图像`example/cat.jpg`的特征。
+* 不同层的特征由分号`;`隔开，并且它们的顺序与`Outputs()`中指定的层顺序一致。这里，左边是`res5_3_branch2c_conv`层的特征，右边是`res5_3_branch2c_bn`层特征。
+
+### Python接口
+
+示例`demo/model_zoo/resnet/classify.py`中展示了如何使用Python来提取特征。下面的例子同样使用了`./example/test.list`中的数据。执行的命令如下：
+
+```
+cd demo/model_zoo/resnet
+./extract_fea_py.sh
+```
+
+extract_fea_py.sh:
+
+```
+python classify.py \
+     --job=extract \
+     --conf=resnet.py\
+     --use_gpu=1 \
+     --mean=model/mean_meta_224/mean.meta \
+     --model=model/resnet_50 \
+     --data=./example/test.list \
+     --output_layer="res5_3_branch2c_conv,res5_3_branch2c_bn" \
+     --output_dir=features
+
+```
+* \--job=extract:              指定工作模式来提取特征。
+* \--conf=resnet.py:           网络配置文件。
+* \--use_gpu=1:                指定是否使用GPU。
+* \--model=model/resnet_50:    模型路径。
+* \--data=./example/test.list: 数据列表。
+* \--output_layer="xxx,xxx":   指定提取特征的层。
+* \--output_dir=features:      输出目录。
+
+如果运行成功，你将会看到特征存储在`features/batch_0`文件中，该文件是由cPickle产生的。你可以使用`load_feature.py`中的`load_feature_py`接口来打开该文件，它将返回如下的字典：
+
+```
+{
+'cat.jpg': {'res5_3_branch2c_conv': array([[-0.12638293, -0.116248  , -0.11883899, ..., -0.00895038, 0.01994277, -0.00534909]], dtype=float32), 'res5_3_branch2c_bn': array([[-1.42593431, -1.28918779, -1.32414699, ..., -1.45933616, -1.04501402, -1.40769434]], dtype=float32)},
+'dog.jpg': {'res5_3_branch2c_conv': array([[-0.11531784, -0.10835785, -0.08809858, ...,0.0055237, 0.01505112, -0.08788397]], dtype=float32), 'res5_3_branch2c_bn': array([[-1.27663755, -1.18272924, -0.90937918, ..., -1.25178063, -1.11515927, -2.59122872]], dtype=float32)}
+}
+```
+
+仔细观察，这些特征值与上述使用C++接口提取的结果是一致的。
+
+## 预测
+
+`classify.py`文件也可以用于对样本进行预测。我们提供了一个示例脚本`predict.sh`，它使用50层的ResNet模型来对`example/test.list`中的数据进行预测。
+
+```
+cd demo/model_zoo/resnet
+./predict.sh
+```
+
+predict.sh调用了`classify.py`:
+
+```
+python classify.py \
+     --job=predict \
+     --conf=resnet.py\
+     --multi_crop \
+     --model=model/resnet_50 \
+     --use_gpu=1 \
+     --data=./example/test.list
+```
+* \--job=predict:              指定工作模式进行预测。
+* \--conf=resnet.py:           网络配置文件。
+* \--multi_crop:               使用10个裁剪图像块，预测概率取平均。
+* \--use_gpu=1:                指定是否使用GPU。
+* \--model=model/resnet_50:    模型路径。
+* \--data=./example/test.list: 数据列表。
+
+如果运行成功，你将会看到如下结果，其中156和282是这些图像的分类标签。
+
+```
+Label of example/dog.jpg is: 156
+Label of example/cat.jpg is: 282
+```
diff --git a/doc/tutorials/imagenet_model/resnet_model_en.md b/doc/tutorials/imagenet_model/resnet_model_en.md
index 5403ab9f17..478ad06193 100644
--- a/doc/tutorials/imagenet_model/resnet_model_en.md
+++ b/doc/tutorials/imagenet_model/resnet_model_en.md
@@ -52,7 +52,7 @@ See ```demo/model_zoo/resnet/resnet.py```. This config contains network of 50, 1
 
 ### Network Visualization
 
-You can get a diagram of ResNet network by running the following commands. The script generates dot file and then converts dot file to PNG file, which uses installed draw_dot tool in our server. If you can not access the server, just install graphviz to convert dot file.
+You can get a diagram of ResNet network by running the following commands. The script generates a dot file and then converts it to a PNG file, which requires graphviz to be installed.
 
 ```
 cd demo/model_zoo/resnet
@@ -138,7 +138,7 @@ There are four parameters in this layer. In fact, only .w0 and .wbias are the le
 
 ### Parameter Observation
 
-Users who want to observe the parameters can use python to read:
+Users who want to observe the parameters can use Python to read:
 
 ```
 import sys
@@ -209,7 +209,7 @@ If successful, features are saved in `fea_output/rank-00000` as follows. And you
 
 ### Python Interface
 
-`demo/model_zoo/resnet/classify.py` is an example to show how to use python to extract features. Following example still uses data of `./example/test.list`. Command is as follows:
+`demo/model_zoo/resnet/classify.py` is an example to show how to use Python to extract features. Following example still uses data of `./example/test.list`. Command is as follows:
 
 ```
 cd demo/model_zoo/resnet
@@ -238,8 +238,6 @@ python classify.py \
 * \--output_layer="xxx,xxx":   specify layers to extract features.
 * \--output_dir=features:      output diretcoty.
 
-Note, since the convolution layer in these ResNet models is suitable for the cudnn implementation which only support GPU. It not support CPU mode because of compatibility issue and we will fix later.
-
 If run successfully, you will see features saved in `features/batch_0`, this file is produced with cPickle. You can use `load_feature_py` interface in `load_feature.py` to open the file, and it returns a dictionary as follows:
 
 ```
diff --git a/doc/tutorials/index_cn.md b/doc/tutorials/index_cn.md
new file mode 100644
index 0000000000..6a27004d58
--- /dev/null
+++ b/doc/tutorials/index_cn.md
@@ -0,0 +1,13 @@
+# 完整教程
+
+* [快速入门](quick_start/index_cn.rst)
+* [个性化推荐](rec/ml_regression_cn.rst)
+* [图像分类](image_classification/index_cn.md)
+* [情感分析](sentiment_analysis/index_cn.md)
+* [语义角色标注](semantic_role_labeling/index_cn.md)
+* [机器翻译](text_generation/index_cn.md)
+
+## 常用模型
+
+* [ResNet模型](imagenet_model/resnet_model_cn.md)
+* [词向量模型](embedding_model/index_cn.md)
diff --git a/doc/tutorials/index_en.md b/doc/tutorials/index_en.md
index 97de356665..77331a703b 100644
--- a/doc/tutorials/index_en.md
+++ b/doc/tutorials/index_en.md
@@ -1,22 +1,13 @@
 # TUTORIALS
-There are serveral examples and demos here.
-
-## [Quick Start](quick_start/index_en.md)
-
-## Image
+There are several examples and demos here.
 
+* [Quick Start](quick_start/index_en.md)
+* [MovieLens Regression](rec/ml_regression_en.rst)
 * [Image Classification](image_classification/index_en.md)
-
-## NLP
-
 * [Sentiment Analysis](sentiment_analysis/index_en.md)
-* [Text Generation](text_generation/index_en.md)
 * [Semantic Role Labeling](semantic_role_labeling/index_en.md)
-
-## Recommendation
-
-* [MovieLens Dataset](rec/ml_dataset_en.md)
-* [MovieLens Regression](rec/ml_regression_en.rst)
+* [Text Generation](text_generation/index_en.md)
+* [Image Auto-Generation](gan/index_en.md)
 
 ## Model Zoo
 * [ImageNet: ResNet](imagenet_model/resnet_model_en.md)
diff --git a/doc_cn/demo/quick_start/index.rst b/doc/tutorials/quick_start/index_cn.rst
similarity index 82%
rename from doc_cn/demo/quick_start/index.rst
rename to doc/tutorials/quick_start/index_cn.rst
index 0536936dc4..d565fcf95e 100644
--- a/doc_cn/demo/quick_start/index.rst
+++ b/doc/tutorials/quick_start/index_cn.rst
@@ -1,5 +1,6 @@
-PaddlePaddle快速入门教程
-========================
+=============
+快速入门教程
+=============
 
 我们将以 `文本分类问题 <https://en.wikipedia.org/wiki/Document_classification>`_ 为例,
 介绍PaddlePaddle的基本使用方法。
@@ -7,7 +8,7 @@ PaddlePaddle快速入门教程
 安装
 ====
 
-请参考 `安装教程 <../../build_and_install/index.html>`_ 安装PaddlePaddle。
+请参考 :ref:`install_steps` 安装PaddlePaddle。
 
 使用概述
 ========
@@ -21,7 +22,7 @@ PaddlePaddle快速入门教程
 
 使用PaddlePaddle, 每一个任务流程都可以被划分为如下五个步骤。
 
-    ..  image:: Pipeline.jpg
+    ..  image:: src/Pipeline_cn.jpg
         :align: center
         :scale: 80%
 
@@ -59,7 +60,7 @@ PaddlePaddle快速入门教程
 Python脚本读取数据
 ------------------
 
-`DataProvider <../../ui/data_provider/index.html>`_ 是PaddlePaddle负责提供数据的模块。``DataProvider`` 主要职责在于将训练数据传入内存或者显存，让模型能够得到训练更新，其包括两个函数：
+``DataProvider`` 是PaddlePaddle负责提供数据的模块，主要职责在于将训练数据传入内存或者显存，让模型能够得到训练更新，其包括两个函数：
 
 * initializer：PaddlePaddle会在调用读取数据的Python脚本之前，先调用initializer函数。在下面例子里，我们在initialzier函数里初始化词表，并且在随后的读取数据过程中填充词表。
 * process：PaddlePaddle调用process函数来读取数据。每次读取一条数据后，process函数会用yield语句输出这条数据，从而能够被PaddlePaddle 捕获 (harvest)。
@@ -72,6 +73,7 @@ Python脚本读取数据
      :linenos:
      :emphasize-lines: 8,33
 
+详细内容请参见 :ref:`api_dataprovider` 。
 
 配置中的数据加载定义
 --------------------
@@ -92,19 +94,19 @@ Python脚本读取数据
 - obj="process": 指定生成数据的函数
 - args={"dictionary": word_dict}: 额外的参数，这里指定词典
 
-更详细数据格式和用例请参考 `PyDataProvider2 <../../ui/data_provider/pydataprovider2.html>`_ 。
+更详细数据格式和用例请参考 :ref:`api_pydataprovider2` 。
 
 模型网络结构
 ============
 
 本小节我们将介绍模型网络结构。
 
-    ..  image:: PipelineNetwork.jpg
+    ..  image:: src/PipelineNetwork_cn.jpg
         :align: center
         :scale: 80%
 
 
-我们将以最基本的逻辑回归网络作为起点，并逐渐展示更加深入的功能。更详细的网络配置连接请参考 `Layer文档 <../../../doc/layer.html>`_ 。
+我们将以最基本的逻辑回归网络作为起点，并逐渐展示更加深入的功能。更详细的网络配置连接请参考 :ref:`api_trainer_config_helpers_layers` 。
 所有配置都能在 `源代码 <https://github.com/PaddlePaddle/Paddle>`_ 的 ``demo/quick_start`` 目录下找到。
 
 逻辑回归模型
@@ -112,7 +114,7 @@ Python脚本读取数据
 
 具体流程如下:
 
-    ..  image:: NetLR.jpg
+    ..  image:: src/NetLR_cn.jpg
         :align: center
         :scale: 80%
 
@@ -147,9 +149,9 @@ Python脚本读取数据
 **效果总结**：我们将在后面介绍训练和预测流程的脚本。在此为方便对比不同网络结构，我们总结了各个网络的复杂度和效果。
 
     =====================  ===============================  =================
-    网络名称	                    参数数量                    错误率
+    网络名称                        参数数量                    错误率
     =====================  ===============================  =================
-    逻辑回归	                  252 KB                       8.652 %
+    逻辑回归                      252 KB                       8.652 %
     =====================  ===============================  =================
 
 词向量模型
@@ -176,7 +178,7 @@ embedding模型需要稍微改变提供数据的Python脚本，即 ``dataprovide
 
 该模型依然使用逻辑回归分类网络的框架， 只是将句子用连续向量表示替换为用稀疏向量表示， 即对第三步进行替换。句子表示的计算更新为两步：
 
-..  image:: NetContinuous.jpg
+..  image:: src/NetContinuous_cn.jpg
     :align: center
     :scale: 80%
 
@@ -197,9 +199,9 @@ embedding模型需要稍微改变提供数据的Python脚本，即 ``dataprovide
 **效果总结：**
 
     =====================  ===============================  ==================
-    网络名称	                    参数数量                    错误率
+    网络名称                        参数数量                    错误率
     =====================  ===============================  ==================
-    词向量模型	                  15 MB                       8.484 %
+    词向量模型                      15 MB                       8.484 %
     =====================  ===============================  ==================
 
 卷积模型
@@ -207,7 +209,7 @@ embedding模型需要稍微改变提供数据的Python脚本，即 ``dataprovide
 
 卷积网络是一种特殊的从词向量表示到句子表示的方法， 也就是将词向量模型进一步演化为三个新步骤。
 
-..  image:: NetConv.jpg
+..  image:: src/NetConv_cn.jpg
     :align: center
     :scale: 80%
 
@@ -230,15 +232,15 @@ embedding模型需要稍微改变提供数据的Python脚本，即 ``dataprovide
 **效果总结：**
 
     =====================  ===============================  ========================
-    网络名称	                    参数数量                    错误率
+    网络名称                        参数数量                    错误率
     =====================  ===============================  ========================
-    卷积模型	                  16 MB                       5.628 %
+    卷积模型                      16 MB                       5.628 %
     =====================  ===============================  ========================
 
 时序模型
 ----------
 
-..  image:: NetRNN.jpg
+..  image:: src/NetRNN_cn.jpg
     :align: center
     :scale: 80%
 
@@ -260,9 +262,9 @@ embedding模型需要稍微改变提供数据的Python脚本，即 ``dataprovide
 本次试验，我们采用单层LSTM模型，并使用了Dropout，**效果总结：**
 
     =====================  ===============================  =========================
-    网络名称	                    参数数量                    错误率
+    网络名称                        参数数量                    错误率
     =====================  ===============================  =========================
-    时序模型	                  16 MB                       4.812 %
+    时序模型                      16 MB                       4.812 %
     =====================  ===============================  =========================
 
 优化算法
@@ -284,7 +286,7 @@ Momentum, RMSProp，AdaDelta，AdaGrad，ADAM，Adamax等，这里采用Adam优
 
 在数据加载和网络配置完成之后， 我们就可以训练模型了。
 
-..  image:: PipelineTrain.jpg
+..  image:: src/PipelineTrain_cn.jpg
     :align: center
     :scale: 80%
 
@@ -294,7 +296,7 @@ Momentum, RMSProp，AdaDelta，AdaGrad，ADAM，Adamax等，这里采用Adam优
 
         ./train.sh
 
-``train.sh``中包含了训练模型的基本命令。训练时所需设置的主要参数如下：
+``train.sh`` 中包含了训练模型的基本命令。训练时所需设置的主要参数如下：
 
     .. code-block:: bash
 
@@ -305,19 +307,19 @@ Momentum, RMSProp，AdaDelta，AdaGrad，ADAM，Adamax等，这里采用Adam优
         --num_passes=15 \
         --use_gpu=false
 
-这里只简单介绍了单机训练，如何进行分布式训练，可以参考教程 `分布式训练 <../../cluster/index.html>`_ 。
+这里只简单介绍了单机训练，如何进行分布式训练，请参考 :ref:`cluster_train` 。
 
 预测
 =====
 
 当模型训练好了之后，我们就可以进行预测了。
 
-..  image:: PipelineTest.jpg
+..  image:: src/PipelineTest_cn.jpg
     :align: center
     :scale: 80%
 
 之前配置文件中 ``test.list`` 指定的数据将会被测试，这里直接通过预测脚本 ``predict.sh`` 进行预测,
-更详细的说明，可以参考 `Python API预测 <../../ui/predict/swig_py_paddle.html>`_ 教程。
+更详细的说明，请参考 :ref:`api_swig_py_paddle` 。
 
     .. code-block:: bash
 
@@ -348,12 +350,12 @@ Momentum, RMSProp，AdaDelta，AdaGrad，ADAM，Adamax等，这里采用Adam优
 对于Amazon-Elec测试集(25k), 如下表格，展示了上述网络模型的训练效果:
 
     =====================  ===============================  =============  ==================================
-    网络名称	                   参数数量                    错误率          配置文件
+    网络名称                       参数数量                    错误率          配置文件
     =====================  ===============================  =============  ==================================
-    逻辑回归模型	                  252 KB                     8.652%          trainer_config.lr.py
-    词向量模型      	               15 MB                      8.484%         trainer_config.emb.py
+    逻辑回归模型                      252 KB                     8.652%          trainer_config.lr.py
+    词向量模型                         15 MB                      8.484%         trainer_config.emb.py
     卷积模型                        16 MB                     5.628%          trainer_config.cnn.py
-    时序模型 	                    16 MB                     4.812%          trainer_config.lstm.py
+    时序模型                         16 MB                     4.812%          trainer_config.lstm.py
     =====================  ===============================  =============  ==================================
 
 
@@ -372,7 +374,7 @@ Momentum, RMSProp，AdaDelta，AdaGrad，ADAM，Adamax等，这里采用Adam优
 
 默认一个pass保存一次模型，也可以通过saving_period_by_batches设置每隔多少batch保存一次模型。
 可以通过show_parameter_stats_period设置打印参数信息等。
-其他参数请参考 `命令行参数文档 <../../ui/index.html#command-line-argument>`_ 。
+其他参数请参考 命令行参数文档（链接待补充）。
 
 输出日志
 ---------
@@ -384,12 +386,12 @@ Momentum, RMSProp，AdaDelta，AdaGrad，ADAM，Adamax等，这里采用Adam优
 模型训练会看到类似上面这样的日志信息，详细的参数解释，请参考如下表格：
 
     ===========================================  ==============================================================
-    名称	                                         解释
+    名称                                             解释
     ===========================================  ==============================================================
-    Batch=20	                                  表示过了20个batch
-    samples=2560	                              表示过了2560个样本
-    AvgCost	                                      每个pass的第0个batch到当前batch所有样本的平均cost
-    CurrentCost	                                  当前log_period个batch所有样本的平均cost
-    Eval: classification_error_evaluator	      每个pass的第0个batch到当前batch所有样本的平均分类错误率
-    CurrentEval: classification_error_evaluator	  当前log_period个batch所有样本的平均分类错误率
+    Batch=20                                      表示过了20个batch
+    samples=2560                                  表示过了2560个样本
+    AvgCost                                          每个pass的第0个batch到当前batch所有样本的平均cost
+    CurrentCost                                      当前log_period个batch所有样本的平均cost
+    Eval: classification_error_evaluator          每个pass的第0个batch到当前batch所有样本的平均分类错误率
+    CurrentEval: classification_error_evaluator      当前log_period个batch所有样本的平均分类错误率
     ===========================================  ==============================================================
diff --git a/doc/tutorials/quick_start/index_en.md b/doc/tutorials/quick_start/index_en.md
index 29637293fa..70dec2eb2a 100644
--- a/doc/tutorials/quick_start/index_en.md
+++ b/doc/tutorials/quick_start/index_en.md
@@ -32,7 +32,7 @@ The monitor breaks down two months after purchase.
 the classifier should output “negative“.
 
 To build your text classification system, your code will need to perform five steps:
-<center> ![](./Pipeline_en.jpg) </center>
+<center> ![](./src/Pipeline_en.jpg) </center>
 
   - Preprocess data into a standardized format.
   - Provide data to the learning model.
@@ -159,15 +159,15 @@ define_py_data_sources2(train_list='data/train.list',
 You can refer to the following link for more detailed examples and data formats: <a href = "../../api/data_provider/pydataprovider2_en.html">PyDataProvider2</a>.
 
 ## Network Architecture
-You will describe four kinds of network architectures in this section.
-<center> ![](./PipelineNetwork_en.jpg) </center>
+We will describe four kinds of network architectures in this section.
+<center> ![](./src/PipelineNetwork_en.jpg) </center>
 
 First, you will build a logistic regression model. Later, you will also get chance to build other more powerful network architectures.
 For more detailed documentation, you could refer to: <a href = "../../api/trainer_config_helpers/layers.html">layer documentation</a>. All configuration files are in `demo/quick_start` directory.
 
 ### Logistic Regression
 The architecture is illustrated in the following picture:
-<center> ![](./NetLR_en.png) </center>
+<center> ![](./src/NetLR_en.png) </center>
 
 - You need define the data for text features. The size of the data layer is the number of words in the dictionary.
 
@@ -182,10 +182,10 @@ label = data_layer(name="label", size=label_dim)
 ```
 
 - It uses logistic regression model to classify the vector, and it will output the classification error during training.
-	- Each layer has an *input* argument that specifies its input layer. Some layers can have multiple input layers. You can use a list of the input layers as input in that case.
-	- *size* for each layer means the number of neurons of the layer.
-	- *act_type* means activation function applied to the output of each neuron independently.
-	- Some layers can have additional special inputs. For example, `classification_cost` needs ground truth label as input to compute classification loss and error.
+    - Each layer has an *input* argument that specifies its input layer. Some layers can have multiple input layers. You can use a list of the input layers as input in that case.
+    - *size* for each layer means the number of neurons of the layer.
+    - *act_type* means activation function applied to the output of each neuron independently.
+    - Some layers can have additional special inputs. For example, `classification_cost` needs ground truth label as input to compute classification loss and error.
 ```python
 # Define a fully connected layer with logistic activation (also called softmax activation).
 output = fc_layer(input=word,
@@ -240,7 +240,7 @@ def process(settings, file_name):
 ```
 
 This model is very similar to the framework of logistic regression, but it uses word embedding vectors instead of a sparse vectors to represent words.
-<center> ![](./NetContinuous_en.png) </center>
+<center> ![](./src/NetContinuous_en.png) </center>
 
 - It can look up the dense word embedding vector in the dictionary  (its words embedding vector is `word_dim`). The input is a sequence of N words, the output is N word_dim dimensional vectors.
 
@@ -283,7 +283,7 @@ The performance is summarized in the following table:
 
 ### Convolutional Neural Network Model
 Convolutional neural network converts a sequence of word embeddings into a sentence representation using temporal convolutions. You will transform the fully connected layer of the word embedding model to 3 new sub-steps.
-<center> ![](./NetConv_en.png) </center>
+<center> ![](./src/NetConv_en.png) </center>
 
 
 Text convolution has 3 steps:
@@ -295,8 +295,8 @@ Text convolution has 3 steps:
 # context_len means convolution kernel size.
 # context_start means the start of the convolution. It can be negative. In that case, zero padding is applied.
 text_conv = sequence_conv_pool(input=emb,
-	                           context_start=k,
-	                           context_len=2 * k + 1)
+                               context_start=k,
+                               context_len=2 * k + 1)
 ```
 
 The performance is summarized in the following table：
@@ -324,7 +324,7 @@ The performance is summarized in the following table：
 <br>
 
 ### Recurrent Model
-<center> ![](./NetRNN_en.png) </center>
+<center> ![](./src/NetRNN_en.png) </center>
 
 You can use Recurrent neural network as our time sequence model, including simple RNN model, GRU model, and LSTM model。
 
@@ -378,7 +378,7 @@ settings(batch_size=128,
 
 ## Training Model
 After completing data preparation and network architecture specification, you will run the training script.
-<center> ![](./PipelineTrain_en.png) </center>
+<center> ![](./src/PipelineTrain_en.png) </center>
 
 Training script: our training script is in `train.sh` file. The training arguments are listed below:
 
@@ -391,11 +391,11 @@ paddle train \
 --use_gpu=false
 ```
 
-We do not provide examples on how to train on clusters here. If you want to train on clusters, please follow the <a href = "../../howto/cluster/cluster_train_en.html">distributed training</a> documentation or other demos for more details.
+We do not provide examples on how to train on clusters here. If you want to train on clusters, please follow the <a href = "../../howto/usage/cluster/cluster_train_en.html">distributed training</a> documentation or other demos for more details.
 
 ## Inference
 You can use the trained model to perform prediction on the dataset with no labels. You can also evaluate the model on dataset with labels to obtain its test accuracy.
-<center> ![](./PipelineTest_en.png) </center>
+<center> ![](./src/PipelineTest_en.png) </center>
 
 The test script is listed below. PaddlePaddle can evaluate a model on the data with labels specified in `test.list`.
 
@@ -509,7 +509,7 @@ The scripts of data downloading, network configurations, and training scrips are
 * \--config_args：Other configuration arguments.
 * \--init_model_path：The path of the initial model parameter.
 
-By default, the trainer will save model every pass. You can also specify `saving_period_by_batches` to set the frequency of batch saving. You can use `show_parameter_stats_period` to print the statistics of the parameters, which are very useful for tuning parameters. Other command line arguments can be found in <a href = "../../howto/cmd_parameter/index_en.html">command line argument documentation</a>。
+By default, the trainer will save model every pass. You can also specify `saving_period_by_batches` to set the frequency of batch saving. You can use `show_parameter_stats_period` to print the statistics of the parameters, which are very useful for tuning parameters. Other command line arguments can be found in <a href = "../../howto/usage/cmd_parameter/index_en.html">command line argument documentation</a>.
 
 ### Log
 
diff --git a/doc_cn/demo/quick_start/NetContinuous.jpg b/doc/tutorials/quick_start/src/NetContinuous_cn.jpg
similarity index 100%
rename from doc_cn/demo/quick_start/NetContinuous.jpg
rename to doc/tutorials/quick_start/src/NetContinuous_cn.jpg
diff --git a/doc/tutorials/quick_start/NetContinuous_en.png b/doc/tutorials/quick_start/src/NetContinuous_en.png
similarity index 100%
rename from doc/tutorials/quick_start/NetContinuous_en.png
rename to doc/tutorials/quick_start/src/NetContinuous_en.png
diff --git a/doc_cn/demo/quick_start/NetConv.jpg b/doc/tutorials/quick_start/src/NetConv_cn.jpg
similarity index 100%
rename from doc_cn/demo/quick_start/NetConv.jpg
rename to doc/tutorials/quick_start/src/NetConv_cn.jpg
diff --git a/doc/tutorials/quick_start/NetConv_en.png b/doc/tutorials/quick_start/src/NetConv_en.png
similarity index 100%
rename from doc/tutorials/quick_start/NetConv_en.png
rename to doc/tutorials/quick_start/src/NetConv_en.png
diff --git a/doc_cn/demo/quick_start/NetLR.jpg b/doc/tutorials/quick_start/src/NetLR_cn.jpg
similarity index 100%
rename from doc_cn/demo/quick_start/NetLR.jpg
rename to doc/tutorials/quick_start/src/NetLR_cn.jpg
diff --git a/doc/tutorials/quick_start/NetLR_en.png b/doc/tutorials/quick_start/src/NetLR_en.png
similarity index 100%
rename from doc/tutorials/quick_start/NetLR_en.png
rename to doc/tutorials/quick_start/src/NetLR_en.png
diff --git a/doc_cn/demo/quick_start/NetRNN.jpg b/doc/tutorials/quick_start/src/NetRNN_cn.jpg
similarity index 100%
rename from doc_cn/demo/quick_start/NetRNN.jpg
rename to doc/tutorials/quick_start/src/NetRNN_cn.jpg
diff --git a/doc/tutorials/quick_start/NetRNN_en.png b/doc/tutorials/quick_start/src/NetRNN_en.png
similarity index 100%
rename from doc/tutorials/quick_start/NetRNN_en.png
rename to doc/tutorials/quick_start/src/NetRNN_en.png
diff --git a/doc_cn/demo/quick_start/PipelineNetwork.jpg b/doc/tutorials/quick_start/src/PipelineNetwork_cn.jpg
similarity index 100%
rename from doc_cn/demo/quick_start/PipelineNetwork.jpg
rename to doc/tutorials/quick_start/src/PipelineNetwork_cn.jpg
diff --git a/doc/tutorials/quick_start/PipelineNetwork_en.jpg b/doc/tutorials/quick_start/src/PipelineNetwork_en.jpg
similarity index 100%
rename from doc/tutorials/quick_start/PipelineNetwork_en.jpg
rename to doc/tutorials/quick_start/src/PipelineNetwork_en.jpg
diff --git a/doc_cn/demo/quick_start/PipelineTest.jpg b/doc/tutorials/quick_start/src/PipelineTest_cn.jpg
similarity index 100%
rename from doc_cn/demo/quick_start/PipelineTest.jpg
rename to doc/tutorials/quick_start/src/PipelineTest_cn.jpg
diff --git a/doc/tutorials/quick_start/PipelineTest_en.png b/doc/tutorials/quick_start/src/PipelineTest_en.png
similarity index 100%
rename from doc/tutorials/quick_start/PipelineTest_en.png
rename to doc/tutorials/quick_start/src/PipelineTest_en.png
diff --git a/doc_cn/demo/quick_start/PipelineTrain.jpg b/doc/tutorials/quick_start/src/PipelineTrain_cn.jpg
similarity index 100%
rename from doc_cn/demo/quick_start/PipelineTrain.jpg
rename to doc/tutorials/quick_start/src/PipelineTrain_cn.jpg
diff --git a/doc/tutorials/quick_start/PipelineTrain_en.png b/doc/tutorials/quick_start/src/PipelineTrain_en.png
similarity index 100%
rename from doc/tutorials/quick_start/PipelineTrain_en.png
rename to doc/tutorials/quick_start/src/PipelineTrain_en.png
diff --git a/doc_cn/demo/quick_start/Pipeline.jpg b/doc/tutorials/quick_start/src/Pipeline_cn.jpg
similarity index 100%
rename from doc_cn/demo/quick_start/Pipeline.jpg
rename to doc/tutorials/quick_start/src/Pipeline_cn.jpg
diff --git a/doc/tutorials/quick_start/Pipeline_en.jpg b/doc/tutorials/quick_start/src/Pipeline_en.jpg
similarity index 100%
rename from doc/tutorials/quick_start/Pipeline_en.jpg
rename to doc/tutorials/quick_start/src/Pipeline_en.jpg
diff --git a/doc/tutorials/rec/ml_dataset_cn.md b/doc/tutorials/rec/ml_dataset_cn.md
new file mode 100644
index 0000000000..2207a776f0
--- /dev/null
+++ b/doc/tutorials/rec/ml_dataset_cn.md
@@ -0,0 +1,105 @@
+```eval_rst
+.. _demo_ml_dataset:
+
+```
+
+# MovieLens数据集
+
+[MovieLens 数据集](http://grouplens.org/datasets/movielens/)由GroupLens Research实验室搜集整理。
+该数据集包含一些用户信息、电影信息以及电影评分\[1-5\]。根据数据量规模，该数据集有很多不同的版本。
+我们用[MovieLens 百万数据集](http://files.grouplens.org/datasets/movielens/ml-1m.zip)作为示例数据
+集，其中包含6,000位用户对4,000部电影的1,000,000条评价。该数据集于2003年2月发布。
+
+## 数据集特征
+
+在[ml-1m 数据集](http://files.grouplens.org/datasets/movielens/ml-1m.zip)中有许多的特征。
+在[ml-1m 数据集](http://files.grouplens.org/datasets/movielens/ml-1m.zip)中的这些数据文件(含有".dat"的后缀)实际上是CSV文件，
+分隔符为"::"。以下我们翻译数据集网站中README文件的描述:
+
+### 评分文件描述(ratings.dat)
+
+
+所有的评分数据都包含在"ratings.dat"文件中，遵循如下的格式:
+
+用户ID::电影ID::评分::时间戳
+
+- 用户ID范围从1到6040
+- 电影ID范围从1到3952
+- 评分被调整为5星的规模(只允许整数的星级)
+- 时间戳表示为从1970-01-01(UTC)以来的秒数，与time(2)的返回值一致
+- 每位用户至少有20条评分
+
+### 用户文件描述(users.dat)
+
+所有的用户信息都包含在"users.dat"文件中，遵循如下的格式:
+
+用户ID::性别::年龄::职业::邮编
+
+所有的人口统计学信息由用户自愿提供，没有进行正确性的检查。只有含有人
+口统计学信息的用户才被包含在数据集中。
+
+- 性别，用"M"表示男性，"F"表示女性
+- 年龄从下列列表范围中选取:
+
+	*   1:	"18岁以下"
+	*  18:	"18-24岁"
+	*  25:	"25-34岁"
+	*  35:	"35-44岁"
+	*  45:	"45-49岁"
+	*  50:	"50-55岁"
+	*  56:	"56+"
+
+- 职业从下面所列中选择:
+
+	*   0:  "其他"或不确定
+	*   1:  "学术/教育工作者"
+	*   2:  "艺术家"
+	*   3:  "文书工作/管理员"
+	*   4:  "大学生/研究生"
+	*   5:  "客户服务"
+	*   6:  "医生/医疗保健"
+	*   7:  "行政工作/管理人员"
+	*   8:  "农民"
+	*   9:  "操持家务者"
+	*  10:  "高中毕业生"
+	*  11:  "律师"
+	*  12:  "程序员"
+	*  13:  "退休人员"
+	*  14:  "销售/市场"
+	*  15:  "科学家"
+	*  16:  "自由职业者"
+	*  17:  "技术员/工程师"
+	*  18:  "推销员/手工艺者"
+	*  19:  "无业人士"
+	*  20:  "作家"
+
+### 电影文件描述(movies.dat)
+
+所有的电影信息都包含在"movies.dat"文件中，遵循如下的格式:
+
+电影ID::电影名称::电影类型
+
+- 电影名称（包括发行时间）与IMDB网站提供的一致
+- 电影类型如符合多种用管道符号|分割，选自下列类型:
+
+	*	动作片
+	*	冒险片
+	*	动画片
+	*	儿童片
+	*	喜剧片
+	*	犯罪片
+	*	纪录片
+	*	戏剧
+	*	奇幻片
+	*	黑色电影
+	*	恐怖片
+	*	音乐剧
+	*	悬疑片
+	*	浪漫片
+	*	科幻片
+	*	惊险电影
+	*	战争片
+	*	西部片
+
+- 由于意外的副本记录和测试记录，有些电影ID可能与实际电影不相符合
+- 电影大部分是手工输入数据，因此可能会有一些错误和不一致发生
diff --git a/doc/tutorials/rec/ml_dataset_en.md b/doc/tutorials/rec/ml_dataset_en.md
index dc11a5e060..25dea5c4af 100644
--- a/doc/tutorials/rec/ml_dataset_en.md
+++ b/doc/tutorials/rec/ml_dataset_en.md
@@ -1,6 +1,5 @@
 ```eval_rst
-..  _demo_ml_dataset_en:
-
+..  _demo_ml_dataset:
 ```
 
 # MovieLens Dataset
diff --git a/doc/tutorials/rec/ml_regression_cn.rst b/doc/tutorials/rec/ml_regression_cn.rst
new file mode 100644
index 0000000000..9278c9f603
--- /dev/null
+++ b/doc/tutorials/rec/ml_regression_cn.rst
@@ -0,0 +1,349 @@
+MovieLens数据集评分回归模型
+===========================
+
+这里我们在MovieLens数据集上描述一种 **余弦相似度回归** 任务。
+该示例将展示paddle如何进行词向量嵌入，处理相似度回归，针对文本
+的单词级别的卷积神经网络，以及paddle如何处理多种类型的输入。
+需要注意的是，该模型网络只是用于进行demo展示paddle如何工作，而
+没有进行结构的微调。
+
+
+**我们非常欢迎您用PADDLEPADDLE构建更好的示例，如果您有好的建议来
+让这个示例变得更好，希望能让我们知晓。**
+
+数据准备
+`````````
+下载并解压数据集
+'''''''''''''''''
+这里我们使用 :ref:`demo_ml_dataset` 。
+要下载和解压数据集，只需要简单的运行下面的命令即可。
+
+.. code-block:: bash
+
+	cd demo/recommendation/data
+	./ml_data.sh
+
+:code:`demo/recommendation/data/ml-1m` 的目录结构为:
+
+.. code-block:: text
+
+	+--ml-1m
+		+--- movies.dat 	# 电影特征
+		+--- ratings.dat 	# 评分
+		+--- users.dat 		# 用户特征
+		+--- README 		# 数据集描述
+
+字段配置文件
+'''''''''''''
+**字段配置文件** 用来具体说明数据集的字段和文件格式，
+例如，说明每个特征文件具体字段是 **什么** 类型。
+
+ml-1m的字段配置文件在目录 :code:`demo/recommendation/data/config.json` 中。
+其具体说明了字段类型和文件名称:
+
+1) 用户文件中有四种类型的字段\: 编号，性别，年龄和职业；
+
+2) 文件名称为"users.dat"，文件的分隔符为"::"。
+
+.. include:: ../../../demo/recommendation/data/config.json
+   :code: json
+   :literal:
+
+准备数据
+`````````
+你需要安装python的第三方库。
+**强烈推荐使用VIRTUALENV来创造一个干净的python环境。**
+
+.. code-block:: bash
+
+	pip install -r requirements.txt
+
+预处理数据一般的命令为:
+
+.. code-block:: bash
+
+	cd demo/recommendation
+	./preprocess.sh
+
+下面介绍预处理过程具体的步骤。
+
+提取电影或用户的特征并生成python对象
+'''''''''''''''''''''''''''''''''''''
+
+在movielens 1m数据集中，电影和用户有许多的特征。
+评分文件的每一行仅仅提供电影或用户的编号来代表相应的电影或用户。
+我们首先处理电影或用户的特征文件，然后用pickle命令将特征( **Meta** )对象存储为文件。
+
+Meta配置文件
+.............
+
+**Meta配置文件** 用来具体描述 **如何** 解析数据集中的每一个字段。
+该文件可以从字段配置文件生成，或是手动编辑生成。文件的格式可以
+为json或yaml格式。解析器能通过文件的扩展名自动识别文件的格式。
+
+要将字段配置文件转化为meta配置文件，只需要运行：
+
+.. code-block:: bash
+
+	cd demo/recommendation/data
+	python config_generator.py config.json > meta_config.json
+
+生成的meta配置文件如下所示：
+
+.. include:: ../../../demo/recommendation/data/meta_config.json
+	:code: json
+	:literal:
+
+在meta文件中有两种特征\: 电影和用户。
+
+* 在电影文件movies.dat中
+	* 我们仅用"::"来分隔每一行
+	* pos 0 代表编号
+	* pos 1 特征：
+		* name是电影名
+		* 利用正则表达式来解析该特征
+		* 基于字母的词嵌入特征
+		* 是序列
+	* pos 2 特征：
+		* name是体裁
+		* type是one hot稠密向量
+		* dictionary由解析自动生成，每一个key由'|'分隔
+* 在用户文件users.dat中
+	* 我们仅用"::"来分隔每一行
+	* pos 0 代表编号
+	* pos 1 特征：
+		* name是性别
+		* 简单的基于字母的词嵌入
+	* pos 2 特征：
+		* name是年龄
+		* 是整个的词嵌入
+		* 嵌入编号会根据单词排序
+	* pos 3 特征：
+		* name是职业
+		* 简单的整个词嵌入
+
+
+Meta文件
+''''''''
+
+有了meta配置文件之后，我们可以生成 **Meta文件** ，该文件是python的pickle对象，
+存储着电影或用户信息。可以运行下面的命令来生成。
+
+.. code-block:: bash
+
+	python meta_generator.py ml-1m meta.bin --config=meta_config.json
+
+meta文件 :code:`meta.bin` 的结构如下：
+
+.. code-block:: text
+
+    +--+ movie
+    |      +--+ __meta__
+    |      |       +--+ raw_meta  # 每个特征的meta配置。列表
+    |      |       |       +
+    |      |       |       |     # 编号字段，我们用编号作为key 
+    |      |       |       +--+ {'count': 3883, 'max': 3952, 'is_key': True, 'type': 'id', 'min': 1}
+    |      |       |       |
+    |      |       |       |     # 电影名字段，嵌入特征字典
+    |      |       |       +--+ {'dict': [ ... ], 'type': 'embedding', 'name': 'title', 'seq': 'sequence'}
+    |      |       |       |
+    |      |       |       |     # 体裁字段，体裁字典
+    |      |       |       +--+ {'dict': [ ... ], 'type': 'one_hot_dense', 'name': 'genres'}
+    |      |       |
+    |      |       +--+ feature_map [1, 2] # a list for raw_meta index for feature field.
+    |      |                               # it means there are 2 features for each key.
+    |      |                               #    * 0 offset of feature is raw_meta[1], Title.
+    |      |                               #    * 1 offset of feature is raw_meta[2], Genres.
+    |      |
+    |      +--+ 1 # 电影1的特征
+    |      |    +
+    |      |    +---+ [[...], [...]] # title ids, genres dense vector
+    |      |
+    |      +--+ 2
+    |      |
+    |      +--+ ...
+    |
+    +--- user
+           +--+ __meta__
+           |       +
+           |       +--+ raw_meta
+           |       |       +
+           |       |       +--+ id field as user
+           |       |       |
+           |       |       +--+ {'dict': ['F', 'M'], 'type': 'embedding', 'name': 'gender', 'seq': 'no_sequence'}
+           |       |       |
+           |       |       +--+ {'dict': ['1', '18', '25', '35', '45', '50', '56'], 'type': 'embedding', 'name': 'age', 'seq': 'no_sequence'}
+           |       |       |
+           |       |       +--+ {'dict': [...], 'type': 'embedding', 'name': 'occupation', 'seq': 'no_sequence'}
+           |       |
+           |       +--+ feature_map [1, 2, 3]
+           |
+           +--+ 1 # 用户1的特征
+           |
+           +--+ 2
+           +--+ ...
+
+
+分割训练/测试文件
+''''''''''''''''''
+
+我们将 :code:`ml-1m/ratings.dat` 文件分割为训练和测试文件。分割文件的方法是：对于每位用户，我们将评分分成两部分。
+这样的话，测试文件中的每位用户在训练文件中也都有一些评分信息。
+
+用 :code:`split.py` 来分离训练和测试文件。
+
+.. code-block:: bash
+
+	python split.py ml-1m/ratings.dat --delimiter="::" --test_ratio=0.1
+
+这样就会生成两个文件：:code:`ml-1m/ratings.dat.train` 和 :code:`ml-1m/ratings.dat.test` 。
+将他们移动到目录 :code:`data` ，然后进行随机打乱，再为paddle的训练过程提供文件列表。
+
+..  code-block:: bash
+
+    shuf ml-1m/ratings.dat.train > ratings.dat.train
+    cp ml-1m/ratings.dat.test .
+    echo "./data/ratings.dat.train" > train.list
+    echo "./data/ratings.dat.test" > test.list
+
+
+神经网络结构配置
+`````````````````
+
+训练器配置文件
+'''''''''''''''
+
+网络结构如下图所示：
+
+..  image:: rec_regression_network.png
+    :align: center
+    :alt: rec_regression_network
+
+该示例的神经网络配置文件 :code:`trainer_config.py` 如下所示：
+
+..  literalinclude:: ../../../demo/recommendation/trainer_config.py
+    :language: python
+    :lines: 15-
+
+在文件 :code:`trainer_config.py` 中，我们仅仅是将每个特征种类映射到一个特征向量中，以下
+展示了如何将每个特征映射到一个向量。
+
+* :code:`id` \: 仅仅是简单的嵌入，然后添加一个全连接层。
+* :code:`embedding` \:
+    - 如果是序列，则先做嵌入，然后再做一次文本卷积网络操作，
+      然后得到平均采样的结果。
+    - 如果不是序列，则先做嵌入，然后添加一个全连接层。
+* :code:`one_hot_dense` \:
+    - 仅仅是两个全连接层。
+
+然后我们利用多输入的 :code:`fc_layer` 全连接层将电影的每个特征结合成一个电影特征，
+并且对用户的特征做同样的操作，也得到一个用户特征。然后我们求这两个特征的余弦相似度。
+
+在这些网络中，我们用以下的一些 :ref:`api_trainer_config` 中的接口。
+
+*  数据层， :ref:`api_trainer_config_helpers_layers_data_layer`
+*  全连接层， :ref:`api_trainer_config_helpers_layers_fc_layer`
+*  嵌入层， :ref:`api_trainer_config_helpers_layers_embedding_layer`
+*  文本投影层， :ref:`api_trainer_config_helpers_layers_context_projection`
+*  采样层， :ref:`api_trainer_config_helpers_layers_pooling_layer`
+*  余弦相似度层， :ref:`api_trainer_config_helpers_layers_cos_sim`
+*  文本卷积采样层， :ref:`api_trainer_config_helpers_network_text_conv_pool`
+*  声明Python数据源， :ref:`api_trainer_config_helpers_data_sources` 
+
+数据提供脚本
+'''''''''''''
+
+..  literalinclude:: ../../../demo/recommendation/dataprovider.py
+    :language: python
+    :lines: 15-
+
+数据提供脚本仅仅是读取meta.bin和评分文件，生成训练需要的样本。
+在脚本 :code:`dataprovider.py` 中，我们需要设置：
+
+* obj.slots\: 特征的类型和维度。
+* use_seq\: :code:`dataprovider.py` 中的数据是否为序列模式。
+* process\: 返回数据的每一条样本给 :code:`paddle` 。
+
+数据提供脚本的细节文档可以参考 :ref:`api_pydataprovider2` 。
+
+训练
+````
+
+准备好数据，配置了网络，编写好数据提供脚本后，现在我们可以开始paddle训练了。
+
+代码 :code:`run.sh` 如下：
+
+..  literalinclude:: ../../../demo/recommendation/run.sh
+    :language: bash
+    :lines: 16-
+
+该脚本仅仅是开始一个paddle训练过程，将日志写入文件 :code:`log.txt` ，然后
+打印在屏幕上。
+
+脚本 :code:`run.sh` 中的每一行命令，请参考页面 :ref:`cmd_line_index` 。
+这些参数的简短介绍如下：
+
+*  config\: 告诉paddle哪个文件是神经网络的配置文件。
+*  save_dir\: 告诉paddle将模型保存在 :code:`./output` 中。
+*  use_gpu\: 是否使用GPU，默认为不使用。
+*  trainer_count\: 一台机器上面的线程数量。
+*  test_all_data_in_one_period\: 每一个测试周期测试一次所有数据。否则，
+   每个测试周期测试 :code:`batch_size` 批次的数据。
+*  log_period\: 在训练了 :code:`log_period` 批次后打印日志。
+*  dot_period\: 在每训练 :code:`dot_period` 个批次后打印一个 :code:`.` 。
+*  num_passes\: 训练至多 :code:`num_passes` 轮。
+
+如果训练过程启动成功的话，输出应该类似如下：
+
+..  code-block:: text
+
+    I0601 08:07:22.832059 10549 TrainerInternal.cpp:157]  Batch=100 samples=160000 AvgCost=4.13494 CurrentCost=4.13494 Eval:  CurrentEval:
+
+    I0601 08:07:50.672627 10549 TrainerInternal.cpp:157]  Batch=200 samples=320000 AvgCost=3.80957 CurrentCost=3.48421 Eval:  CurrentEval:
+
+    I0601 08:08:18.877369 10549 TrainerInternal.cpp:157]  Batch=300 samples=480000 AvgCost=3.68145 CurrentCost=3.42519 Eval:  CurrentEval:
+
+    I0601 08:08:46.863963 10549 TrainerInternal.cpp:157]  Batch=400 samples=640000 AvgCost=3.6007 CurrentCost=3.35847 Eval:  CurrentEval:
+
+    I0601 08:09:15.413025 10549 TrainerInternal.cpp:157]  Batch=500 samples=800000 AvgCost=3.54811 CurrentCost=3.33773 Eval:  CurrentEval:
+    I0601 08:09:36.058670 10549 TrainerInternal.cpp:181]  Pass=0 Batch=565 samples=902826 AvgCost=3.52368 Eval:
+    I0601 08:09:46.215489 10549 Tester.cpp:101]  Test samples=97383 cost=3.32155 Eval:
+    I0601 08:09:46.215966 10549 GradientMachine.cpp:132] Saving parameters to ./output/model/pass-00000
+    I0601 08:09:46.233397 10549 ParamUtil.cpp:99] save dir ./output/model/pass-00000
+    I0601 08:09:46.233438 10549 Util.cpp:209] copy trainer_config.py to ./output/model/pass-00000
+    I0601 08:09:46.233541 10549 ParamUtil.cpp:147] fileName trainer_config.py
+
+模型被保存在 :code:`output/` 目录中。你可以在任何时候用 :code:`Ctrl-C` 来停止训练。
+
+模型评估和预测
+```````````````
+
+在训练了几个轮次以后，你可以对模型进行评估，得到最好轮次下的模型。运行下面命令即可：
+
+.. code-block:: bash
+
+    ./evaluate.sh 
+
+你将看到如下的信息：
+
+.. code-block:: text
+
+    Best pass is 00009,  error is 3.06949, which means predict get error as 0.875998002281
+    evaluating from pass output/pass-00009
+
+然后，你可以预测任何用户对于任何一部电影的评价，运行下面命令即可：
+
+..  code-block:: bash
+
+    python prediction.py 'output/pass-00009/'
+
+预测程序将读取用户的输入，然后输出预测分数。用户预测的命令行界面如下：
+
+..  code-block:: text
+
+    Input movie_id: 9
+    Input user_id: 4
+    Prediction Score is 2.56
+    Input movie_id: 8
+    Input user_id: 2
+    Prediction Score is 3.13
diff --git a/doc/tutorials/rec/ml_regression_en.rst b/doc/tutorials/rec/ml_regression_en.rst
index 6346090a84..993b9a516f 100644
--- a/doc/tutorials/rec/ml_regression_en.rst
+++ b/doc/tutorials/rec/ml_regression_en.rst
@@ -16,7 +16,7 @@ Data Preparation
 ````````````````
 Download and extract dataset
 ''''''''''''''''''''''''''''
-We use :ref:`demo_ml_dataset_en` here. 
+We use :ref:`demo_ml_dataset` here. 
 To download and unzip the dataset, simply run the following commands.
 
 ..  code-block:: bash
@@ -36,7 +36,7 @@ And the directory structure of :code:`demo/recommendation/data/ml-1m` is:
 
 Field config file
 '''''''''''''''''
-**Field config file** is used to specific the fields dataset and file format,
+**Field config file** is used to specify the fields of the dataset and the file format,
 i.e, specific **WHAT** type it is in each feature file.
 
 The field config file of ml-1m shows in :code:`demo/recommendation/data/config.json`.
@@ -188,7 +188,7 @@ Split Training/Testing files
 We split :code:`ml-1m/ratings.dat` into a training and testing file. The way to split file is for each user, we split the
 rating by two parts. So each user in testing file will have some rating information in training file.
 
-Use separate.py to separate the training and testing file.
+Use :code:`split.py` to separate the training and testing files.
 
 ..  code-block:: bash
 
@@ -217,7 +217,7 @@ The network structure shows below.
     :align: center
     :alt: rec_regression_network
 
-The demo's neural network config file "trainer_config.py" show as below.
+The demo's neural network config file :code:`trainer_config.py` is shown below.
 
 ..  literalinclude:: ../../../demo/recommendation/trainer_config.py
     :language: python
@@ -239,7 +239,7 @@ Then we combine each features of movie into one movie feature by a
 get one user feature. Then we calculate the cosine similarity of these two
 features.
 
-In these network, we use several api in :ref:`api_trainer_config` . There are
+In these networks, we use several APIs in :ref:`api_trainer_config` . There are
 
 *  Data Layer, :ref:`api_trainer_config_helpers_layers_data_layer`
 *  Fully Connected Layer, :ref:`api_trainer_config_helpers_layers_fc_layer`
@@ -264,26 +264,26 @@ In this :code:`dataprovider.py`, we should set\:
 * use_seq\: Whether this :code:`dataprovider.py` in sequence mode or not.
 * process\: Return each sample of data to :code:`paddle`.
 
-The data provider details document see :ref:`api_pydataprovider2_en`.
+For details about the data provider, see :ref:`api_pydataprovider2`.
 
 Train
 `````
 
 After prepare data, config network, writting data provider, now we can run paddle training.
 
-The run.sh is shown as follow:
+The :code:`run.sh` script is shown as follows:
 
 ..  literalinclude:: ../../../demo/recommendation/run.sh
     :language: bash
     :lines: 16-
 
-It just start a paddle training process, write the log to `log.txt`,
+It just start a paddle training process, write the log to :code:`log.txt`,
 then print it on screen.
 
-Each command line argument in :code:`run.sh`, please refer to the :ref:`cmd_line_index_en` page. The short description of these arguments is shown as follow.
+For each command line argument in :code:`run.sh`, please refer to the :ref:`cmd_line_index` page. A short description of these arguments is given below.
 
 *  config\: Tell paddle which file is neural network configuration.
-*  save_dir\: Tell paddle save model into './output'
+*  save_dir\: Tell paddle to save the model into :code:`./output`.
 *  use_gpu\: Use gpu or not. Default is false.
 *  trainer_count\: The compute thread in one machine.
 *  test_all_data_in_one_period\: Test All Data during one test period. Otherwise,
diff --git a/doc/tutorials/semantic_role_labeling/index_cn.md b/doc/tutorials/semantic_role_labeling/index_cn.md
index c7e0a78f50..f6061766c0 100644
--- a/doc/tutorials/semantic_role_labeling/index_cn.md
+++ b/doc/tutorials/semantic_role_labeling/index_cn.md
@@ -149,7 +149,7 @@ paddle train \
 
 训练后，模型将保存在目录`output`中。 我们的训练曲线如下：
 <center>
-![pic](./curve.jpg)
+![pic](./src/curve.jpg)
 </center>
 
 ### 测试
diff --git a/doc/tutorials/semantic_role_labeling/index_en.md b/doc/tutorials/semantic_role_labeling/index_en.md
index bdd12c0d9a..92d7c63483 100644
--- a/doc/tutorials/semantic_role_labeling/index_en.md
+++ b/doc/tutorials/semantic_role_labeling/index_en.md
@@ -1,5 +1,5 @@
 ```eval_rst
-..  _semantic_role_labeling_en:
+..  _semantic_role_labeling:
 ```
 
 # Semantic Role labeling Tutorial #
@@ -45,13 +45,13 @@ Unlike Bidirectional-LSTM that used in Sentiment Analysis demo,  the DB-LSTM ado
 
 The following figure shows a temporal expanded 2-layer DB-LSTM network.
 <center>
-![pic](./network_arch.png)
+![pic](./src/network_arch.png)
 </center>
 
 ### Features
 Two input features play an essential role in this pipeline: predicate (pred) and argument (argu). Two other features: predicate context (ctx-p) and region mark (mr) are also adopted. Because a single predicate word can not exactly describe the predicate information, especially when the same words appear more than one times in a sentence. With the predicate context, the ambiguity can be largely eliminated. Similarly, we use region mark m<sub>r</sub> = 1 to denote the argument position if it locates in the predicate context region, or m<sub>r</sub> = 0 if does not. These four simple features are all we need for our SRL system. Features of one sample with context size set to 1 is showed as following[2]:
 <center>
-![pic](./feature.jpg)
+![pic](./src/feature.jpg)
 </center>
 
 In this sample, the coresponding labelled sentence is:
@@ -152,7 +152,7 @@ paddle train \
 
 After training, the models  will be saved in directory `output`. Our training curve is as following:
 <center>
-![pic](./curve.jpg)
+![pic](./src/curve.jpg)
 </center>
 
 ### Run testing
diff --git a/doc/tutorials/semantic_role_labeling/curve.jpg b/doc/tutorials/semantic_role_labeling/src/curve.jpg
similarity index 100%
rename from doc/tutorials/semantic_role_labeling/curve.jpg
rename to doc/tutorials/semantic_role_labeling/src/curve.jpg
diff --git a/doc/tutorials/semantic_role_labeling/src/feature.jpg b/doc/tutorials/semantic_role_labeling/src/feature.jpg
new file mode 100644
index 0000000000..0e3310e4ac
Binary files /dev/null and b/doc/tutorials/semantic_role_labeling/src/feature.jpg differ
diff --git a/doc/tutorials/semantic_role_labeling/src/network_arch.png b/doc/tutorials/semantic_role_labeling/src/network_arch.png
new file mode 100644
index 0000000000..4ae7864212
Binary files /dev/null and b/doc/tutorials/semantic_role_labeling/src/network_arch.png differ
diff --git a/doc_cn/demo/sentiment_analysis/sentiment_analysis.md b/doc/tutorials/sentiment_analysis/index_cn.md
similarity index 96%
rename from doc_cn/demo/sentiment_analysis/sentiment_analysis.md
rename to doc/tutorials/sentiment_analysis/index_cn.md
index ba307e97e3..1323ec1a6a 100644
--- a/doc_cn/demo/sentiment_analysis/sentiment_analysis.md
+++ b/doc/tutorials/sentiment_analysis/index_cn.md
@@ -1,325 +1,325 @@
-# 情感分析教程
-
-情感分析有许多应用场景。 一个基本的应用场景是区分给定文本的褒贬两极性，给定的文本可以是一个文档、句子、或者是一个小的文本片段。 一个简单的例子如：把用户在购物网站、旅游网站、团购网站（亚马逊、天猫、淘宝等）上发表的评论分成正面评论和负面评论两类。
-
-情感分析也常用于基于大量评论和个人博客来监控社会媒体。 例如，研究人员分析了几个关于消费者信心和政治观点的调查，结果发现它们与同时期的Twitter消息中的情绪词频率相关 [1]。 另一个例子是通过分析每日Twitter博客的文本内容来预测股票变动 [2]。
-
-另一方面，抓取产品的用户评论并分析他们的情感，有助于理解用户对不同公司，不同产品，甚至不同竞争对手产品的偏好。
-
-本教程将指导您完成长期短期记忆（LSTM）网络的训练过程，以分类来自[大型电影评论数据集](http://ai.stanford.edu/~amaas/data/sentiment/)（有时称为[互联网电影数据库 (IMDB)](http://ai.stanford.edu/~amaas/papers/wvSent_acl2011.pdf)）的句子的情感 。 此数据集包含电影评论及其相关联的类别标签，即正面和负面。
-
-## 数椐准备
-
-### IMDB 数椐介绍
-
-训练模型之前, 我们需要预处理数椐并构建一个字典。 首先, 你可以使用下面的脚本下载 IMDB 数椐集和[Moses](http://www.statmt.org/moses/)工具, 这是一个基于统计的机器翻译系统. 我们提供了一个数据预处理脚本，它不仅能够处理IMDB数据，还能处理其他用户自定义的数据。 为了使用提前编写的脚本，需要将标记的训练和测试样本移动到另一个路径，这已经在`get_imdb.sh`中完成。
-
-```
-cd demo/sentiment/data
-./get_imdb.sh
-```
-如果数椐获取成功，你将在目录```./demo/sentiment/data```中看到下面的文件：
-
-```
-aclImdb  get_imdb.sh  imdb  mosesdecoder-master
-```
-
-* aclImdb: 从外部网站上下载的原始数椐集。
-* imdb: 仅包含训练和测试数椐集。
-* mosesdecoder-master: Moses 工具。
-
-IMDB数据集包含25,000个已标注过的高极性电影评论用于训练，25,000个用于测试。负面的评论的得分小于等于4,正面的评论的得大于等于7，总评分10分。 运行完脚本 `./get_imdb.sh`后, 我们可以看到在目录 `aclImdb`中的数椐集的结构如下：
-
-```
-imdbEr.txt  imdb.vocab  README  test  train
-```
-* train: 训练数椐集。
-* test : 测试数椐集。
-* imdb.vocab: 字典文件。
-* imdbEr.txt: 字典imdb.vocab中每个切分单词的预期评级。
-* README: 数椐说明文档。
-
-测试集和训练集目录包含下面的文件:
-
-```
-labeledBow.feat  neg  pos  unsup  unsupBow.feat  urls_neg.txt  urls_pos.txt  urls_unsup.txt
-```
-
-* pos: 正面评价样本，包含12,500个txt文件，每个文件是一个电影评论。
-* neg: 负面评价样本，包含12,500个txt文件，每个文件是一个电影评论。
-* unsup: 未标记的评价样本，包含50,000个txt文件。
-* urls_xx.txt: 每个评论的网址。
-* xxBow.feat: 用于统计词频的Bow模型特征。
-
-### IMDB 数椐准备
-
-在这个例子中，我们只使用已经标注过的训练集和测试集，且默认在训练集上构建字典，而不使用IMDB数椐集中的imdb.vocab做为字典。训练集已经做了随机打乱排序而测试集没有。 Moses 工具中的脚本`tokenizer.perl` 用于切分单单词和标点符号。执行下面的命令就可以预处理数椐。
-
-```
-cd demo/sentiment/
-./preprocess.sh
-```
-preprocess.sh:
-
-```
-data_dir="./data/imdb"
-python preprocess.py -i data_dir
-```
-
-* data_dir: 输入数椐所在目录。
-* preprocess.py: 预处理脚本。
-
-运行成功后目录`demo/sentiment/data/pre-imdb` 结构如下:
-
-```
-dict.txt  labels.list  test.list  test_part_000  train.list  train_part_000
-```
-* test\_part\_000 and train\_part\_000: 所有标记的测试集和训练集， 训练集已经随机打乱。
-* train.list and test.list: 训练集和测试集文件列表。
-* dict.txt: 利用训练集生成的字典。
-* labels.txt: neg  0, pos 1, 含义：标签0表示负面的评论，标签1表示正面的评论。
-
-### 用户自定义数椐预处理
-
-如果你执行其它的用情感分析来分类文本的任务，可以按如下的结构来准备数椐. 我们提供了脚本来构建字典和预处理数椐。所以你只用按下面的结构来组织数椐就行了。
-
-```
-dataset
-|----train
-|    |----class1
-|    |    |----text_files
-|    |----class2
-|    |    |----text_files
-|    |    ...
-|----test
-|    |----class1
-|    |    |----text_files
-|    |----class2
-|    |    |----text_files
-|    |    ...
-```
-* dataset: 一级目录。
-* train, test: 二级目录。
-* class1,class2,...: 三级目录。
-* text_files: 文本格式的实例文件。
-
-所有同目录下的文本实例文件都是同级别的。 每个文本文件包含一个或者多个实例，每一行表示一个实例。 为了充分的随机打乱训练集, 在预处理含有多行数椐的文本文件时参数设置稍有不同, 执行`preprocess.sh`脚本时需要加上`-m True`参数。 tokenizer.perl 默认用来切分单记和标点符号，如果你不需要这个操作，在运行`preprocess.sh`时加上`-t False`参数即可。
-
-## 训练模型
-
-在这步任务中,我们使用了循环神经网络（RNN）的 LSTM 架构来训练情感分析模型。 引入LSTM模型主要是为了克服消失梯度的问题。 LSTM网络类似于具有隐藏层的标准循环神经网络, 但是隐藏层中的每个普通节点被一个记忆单元替换。 每个记忆单元包含四个主要的元素: 输入门, 具有自循环连接的神经元，忘记门和输出门。 更多的细节可以在文献中找到[4]。 LSTM架构的最大优点是它可以在长时间间隔内记忆信息，而没有短时记忆的损失。在有新的单词来临的每一个时间步骤内，存储在记忆单元区块的历史信息被更新用来迭代的学习单词以合理的序列程现。
-
-<center>![LSTM](../../../doc/demo/sentiment_analysis/lstm.png)</center>
-<center>图表 1. LSTM [3]</center>
-
-情感分析是自然语言理解中最典型的问题之一。 它的目的是预测在一个序列中表达的情感态度。 通常, ，仅仅是一些关键词，如形容词和副词，在预测序列或段落的情感中起主要作用。然而有些评论上下文非常长，例如 IMDB的数椐集。 我们只所以使用LSTM来执行这个任务是因为其改进的设计并且具有门机制。 首先，它能够从词级到具有可变上下文长度的上下文级别来总结表示。 第二，它可以在句子级别利用可扩展的上下文, 而大多数方法只是利用n-gram级别的知识。第三，它直接学习段落表示，而不是组合上下文级别信息。
-
-在本演示中，我们提供两个网络，即双向LSTM和三层堆叠LSTM。
-
-#### 双向LSTM
-
-图2是双向LSTM网络，后面连全连接层和softmax层。
-
-<center>![BiLSTM](../../../doc/demo/sentiment_analysis/bi_lstm.jpg)</center>
-<center>图 2. Bidirectional-LSTM </center>
-
-#### Stacked-LSTM
-图3是三层LSTM结构。图的底部是word embedding(对文档处理后形成的单词向量)。 接下来，连接三个LSTM隐藏层，并且第二个是反向LSTM。然后提取隐藏LSTM层的所有时间步长的最大词向量作为整个序列的表示。 最后，使用具有softmax激活的全连接前馈层来执行分类任务。 更多内容可查看参考文献 [5]。
-
-<center>![StackedLSTM](../../../doc/demo/sentiment_analysis/stacked_lstm.jpg)</center>
-<center>图 3. Stacked-LSTM for sentiment analysis </center>
-
-**配置**
-
-进入`demo/sentiment` 目录 , `trainer_config.py` 是一个配置文件的例子, 其中包含算法和网络配置。第一行从`sentiment_net.py`中导出预定义的网络。
-
-trainer_config.py:
-
-```python
-from sentiment_net import *
-
-data_dir  = "./data/pre-imdb"
-# whether this config is used for test
-is_test = get_config_arg('is_test', bool, False)
-# whether this config is used for prediction
-is_predict = get_config_arg('is_predict', bool, False)
-dict_dim, class_dim = sentiment_data(data_dir, is_test, is_predict)
-
-################## Algorithm Config #####################
-
-settings(
-  batch_size=128,
-  learning_rate=2e-3,
-  learning_method=AdamOptimizer(),
-  regularization=L2Regularization(8e-4),
-  gradient_clipping_threshold=25
-)
-
-#################### Network Config ######################
-stacked_lstm_net(dict_dim, class_dim=class_dim,
-                 stacked_num=3, is_predict=is_predict)
-#bidirectional_lstm_net(dict_dim, class_dim=class_dim, is_predict=is_predict)
-```
-
-* **数椐定义**:
-   * get\_config\_arg(): 获取通过 `--config_args=xx` 设置的命令行参数。
-   * 定义训练数椐和测试数椐提供者, 这里使用了PaddlePaddle的Python接口来加载数椐。想了解更多细节可以参考PyDataProvider部分的文档
-
-* **算法配置**:
-   * 使用随机梯度下降（sgd）算法。
-   * 使用 adam 优化。
-   * 设置batch size大小为128。
-   * 设置平均sgd窗口。
-   * 设置全局学习率。
-* **网络配置**:
-   * dict_dim: 获取字典维度。
-   * class_dim: 设置类别数，IMDB有两个标签，即正面评价标签和负面评价标签。
-   * `stacked_lstm_net`: 预定义网络如图3所示，默认情况下使用此网络
-   * `bidirectional_lstm_net`: 预定义网络，如图2所示。
-
-**训练**
-
-首先安装PaddlePaddle。 然后使用下面的脚本 `train.sh` 来开启本地的训练。
-
-```
-cd demo/sentiment/
-./train.sh
-```
-
-train.sh:
-
-```
-config=trainer_config.py
-output=./model_output
-paddle train --config=$config \
-             --save_dir=$output \
-             --job=train \
-             --use_gpu=false \
-             --trainer_count=4 \
-             --num_passes=10 \
-             --log_period=20 \
-             --dot_period=20 \
-             --show_parameter_stats_period=100 \
-             --test_all_data_in_one_period=1 \
-             2>&1 | tee 'train.log'
-```
-
-* \--config=$config: 设置网络配置。
-* \--save\_dir=$output: 设置输出路径以保存训练完成的模型。
-* \--job=train: 设置工作模式为训练。
-* \--use\_gpu=false: 使用CPU训练，如果你安装GPU版本的PaddlePaddle，并想使用GPU来训练设置为true。
-* \--trainer\_count=4:设置线程数（或GPU个数）。
-* \--num\_passes=15: 设置pass，PaddlePaddle中的一个pass意味着对数据集中的所有样本进行一次训练。
-* \--log\_period=20: 每20个batch打印一次日志。
-* \--show\_parameter\_stats\_period=100: 每100个batch打印一次统计信息。
-* \--test\_all_data\_in\_one\_period=1: 每次测试都测试所有数据。
-
-如果运行成功，输出日志保存在路径 `demo/sentiment/train.log`中，模型保存在目录`demo/sentiment/model_output/`中。  输出日志说明如下：
-
-```
-Batch=20 samples=2560 AvgCost=0.681644 CurrentCost=0.681644 Eval: classification_error_evaluator=0.36875  CurrentEval: classification_error_evaluator=0.36875
-...
-Pass=0 Batch=196 samples=25000 AvgCost=0.418964 Eval: classification_error_evaluator=0.1922
-Test samples=24999 cost=0.39297 Eval: classification_error_evaluator=0.149406
-```
-- Batch=xx: 表示训练了xx个Batch。
-- samples=xx: 表示训练了xx个样本。。
-- AvgCost=xx: 从第0个batch到当前batch的平均损失。
-- CurrentCost=xx: 最新log_period个batch处理的当前损失。
-- Eval: classification\_error\_evaluator=xx: 表示第0个batch到当前batch的分类错误。
-- CurrentEval: classification\_error\_evaluator: 最新log_period个batch的分类错误。
-- Pass=0: 通过所有训练集一次称为一遍。 0表示第一次经过训练集。
-
-默认情况下，我们使用`stacked_lstm_net`网络，当传递相同的样本数时，它的收敛速度比`bidirectional_lstm_net`快。如果要使用双向LSTM，只需删除最后一行中的注释并把“stacked_lstm_net”注释掉。
-
-## 测试模型
-
-测试模型是指使用训练出的模型评估已标记的验证集。
-
-```
-cd demo/sentiment
-./test.sh
-```
-
-test.sh:
-
-```bash
-function get_best_pass() {
-  cat $1  | grep -Pzo 'Test .*\n.*pass-.*' | \
-  sed  -r 'N;s/Test.* error=([0-9]+\.[0-9]+).*\n.*pass-([0-9]+)/\1 \2/g' | \
-  sort | head -n 1
-}
-
-log=train.log
-LOG=`get_best_pass $log`
-LOG=(${LOG})
-evaluate_pass="model_output/pass-${LOG[1]}"
-
-echo 'evaluating from pass '$evaluate_pass
-
-model_list=./model.list
-touch $model_list | echo $evaluate_pass > $model_list
-net_conf=trainer_config.py
-paddle train --config=$net_conf \
-             --model_list=$model_list \
-             --job=test \
-             --use_gpu=false \
-             --trainer_count=4 \
-             --config_args=is_test=1 \
-             2>&1 | tee 'test.log'
-```
-
-函数`get_best_pass`依据分类错误率获得最佳模型进行测试。 在本示例中，我们默认使用IMDB的测试数据集作为验证。 与训练不同，它需要在这里指定`--job = test`和模型路径，即`--model_list = $model_list`。如果运行成功，日志将保存在“demo / sentiment / test.log”的路径中。例如，在我们的测试中，最好的模型是`model_output / pass-00002`，分类误差是0.115645，如下：
-
-```
-Pass=0 samples=24999 AvgCost=0.280471 Eval: classification_error_evaluator=0.115645
-```
-
-## 预测
-
-`predict.py`脚本提供了一个预测接口。在使用它之前请安装PaddlePaddle的python api。 预测IMDB的未标记评论的一个实例如下：
-
-```
-cd demo/sentiment
-./predict.sh
-```
-predict.sh:
-
-```
-#Note the default model is pass-00002, you shold make sure the model path
-#exists or change the mode path.
-model=model_output/pass-00002/
-config=trainer_config.py
-label=data/pre-imdb/labels.list
-cat ./data/aclImdb/test/pos/10007_10.txt | python predict.py \
-     --tconf=$config\
-     --model=$model \
-     --label=$label \
-     --dict=./data/pre-imdb/dict.txt \
-     --batch_size=1
-```
-
-* `cat ./data/aclImdb/test/pos/10007_10.txt` : 输入预测样本。
-* `predict.py` : 预测接口脚本。
-* `--tconf=$config` : 设置网络配置。
-* `--model=$model` : 设置模型路径。
-* `--label=$label` : 设置标签类别字典，这个字典是整数标签和字符串标签的一个对应。
-* `--dict=data/pre-imdb/dict.txt` : 设置字典文件。
-* `--batch_size=1` : 设置batch size。
-
-注意应该确保默认模型路径`model_output / pass-00002`存在或更改为其它模型路径。
-
-本示例的预测结果：
-
-```
-Loading parameters from model_output/pass-00002/
-./data/aclImdb/test/pos/10014_7.txt: predicting label is pos
-```
-我们真诚地感谢您的关注，并欢迎您来参与贡献。
-
-## 参考文档
-[1] Brendan O'Connor, Ramnath Balasubramanyan, Bryan R. Routledge, and Noah A. Smith. 2010. [From Tweets to Polls: Linking Text Sentiment to Public Opinion Time Series](http://homes.cs.washington.edu/~nasmith/papers/oconnor+balasubramanyan+routledge+smith.icwsm10.pdf). In ICWSM-2010. <br>
-[2] Johan Bollen, Huina Mao, Xiaojun Zeng. 2011. [Twitter mood predicts the stock market](http://arxiv.org/abs/1010.3003), Journal of Computational Science.<br>
-[3] Alex Graves, Marcus Liwicki, Santiago Fernan- dez, Roman Bertolami, Horst Bunke, and Ju ̈rgen Schmidhuber. 2009. [A novel connectionist system for unconstrained handwriting recognition. IEEE Transactions on Pattern Analysis and Machine In- telligence](http://www.cs.toronto.edu/~graves/tpami_2009.pdf), 31(5):855–868.<br>
-[4] Zachary C. Lipton, [A Critical Review of Recurrent Neural Networks for Sequence Learning](http://arxiv.org/abs/1506.00019v1), arXiv:1506.00019. <br>
-[5] Jie Zhou and Wei Xu; [End-to-end Learning of Semantic Role Labeling Using Recurrent Neural Networks](http://www.aclweb.org/anthology/P/P15/P15-1109.pdf); ACL-IJCNLP 2015. <br>
+# 情感分析教程
+
+情感分析有许多应用场景。 一个基本的应用场景是区分给定文本的褒贬两极性，给定的文本可以是一个文档、句子、或者是一个小的文本片段。 一个简单的例子如：把用户在购物网站、旅游网站、团购网站（亚马逊、天猫、淘宝等）上发表的评论分成正面评论和负面评论两类。
+
+情感分析也常用于基于大量评论和个人博客来监控社会媒体。 例如，研究人员分析了几个关于消费者信心和政治观点的调查，结果发现它们与同时期的Twitter消息中的情绪词频率相关 [1]。 另一个例子是通过分析每日Twitter博客的文本内容来预测股票变动 [2]。
+
+另一方面，抓取产品的用户评论并分析他们的情感，有助于理解用户对不同公司，不同产品，甚至不同竞争对手产品的偏好。
+
+本教程将指导您完成长期短期记忆（LSTM）网络的训练过程，以分类来自[大型电影评论数据集](http://ai.stanford.edu/~amaas/data/sentiment/)（有时称为[互联网电影数据库 (IMDB)](http://ai.stanford.edu/~amaas/papers/wvSent_acl2011.pdf)）的句子的情感 。 此数据集包含电影评论及其相关联的类别标签，即正面和负面。
+
+## 数据准备
+
+### IMDB 数据介绍
+
+训练模型之前, 我们需要预处理数据并构建一个字典。 首先, 你可以使用下面的脚本下载 IMDB 数据集和[Moses](http://www.statmt.org/moses/)工具, 这是一个基于统计的机器翻译系统. 我们提供了一个数据预处理脚本，它不仅能够处理IMDB数据，还能处理其他用户自定义的数据。 为了使用提前编写的脚本，需要将标记的训练和测试样本移动到另一个路径，这已经在`get_imdb.sh`中完成。
+
+```
+cd demo/sentiment/data
+./get_imdb.sh
+```
+如果数据获取成功，你将在目录```./demo/sentiment/data```中看到下面的文件：
+
+```
+aclImdb  get_imdb.sh  imdb  mosesdecoder-master
+```
+
+* aclImdb: 从外部网站上下载的原始数据集。
+* imdb: 仅包含训练和测试数据集。
+* mosesdecoder-master: Moses 工具。
+
+IMDB数据集包含25,000个已标注过的高极性电影评论用于训练，25,000个用于测试。负面的评论的得分小于等于4，正面的评论的得分大于等于7，总评分10分。 运行完脚本 `./get_imdb.sh`后, 我们可以看到在目录 `aclImdb`中的数据集的结构如下：
+
+```
+imdbEr.txt  imdb.vocab  README  test  train
+```
+* train: 训练数据集。
+* test : 测试数据集。
+* imdb.vocab: 字典文件。
+* imdbEr.txt: 字典imdb.vocab中每个切分单词的预期评级。
+* README: 数据说明文档。
+
+测试集和训练集目录包含下面的文件:
+
+```
+labeledBow.feat  neg  pos  unsup  unsupBow.feat  urls_neg.txt  urls_pos.txt  urls_unsup.txt
+```
+
+* pos: 正面评价样本，包含12,500个txt文件，每个文件是一个电影评论。
+* neg: 负面评价样本，包含12,500个txt文件，每个文件是一个电影评论。
+* unsup: 未标记的评价样本，包含50,000个txt文件。
+* urls_xx.txt: 每个评论的网址。
+* xxBow.feat: 用于统计词频的Bow模型特征。
+
+### IMDB 数据准备
+
+在这个例子中，我们只使用已经标注过的训练集和测试集，且默认在训练集上构建字典，而不使用IMDB数据集中的imdb.vocab作为字典。训练集已经做了随机打乱排序而测试集没有。 Moses 工具中的脚本`tokenizer.perl` 用于切分单词和标点符号。执行下面的命令就可以预处理数据。
+
+```
+cd demo/sentiment/
+./preprocess.sh
+```
+preprocess.sh:
+
+```
+data_dir="./data/imdb"
+python preprocess.py -i $data_dir
+```
+
+* data_dir: 输入数据所在目录。
+* preprocess.py: 预处理脚本。
+
+运行成功后目录`demo/sentiment/data/pre-imdb` 结构如下:
+
+```
+dict.txt  labels.list  test.list  test_part_000  train.list  train_part_000
+```
+* test\_part\_000 and train\_part\_000: 所有标记的测试集和训练集， 训练集已经随机打乱。
+* train.list and test.list: 训练集和测试集文件列表。
+* dict.txt: 利用训练集生成的字典。
+* labels.list: neg  0, pos 1, 含义：标签0表示负面的评论，标签1表示正面的评论。
+
+### 用户自定义数据预处理
+
+如果你执行其它的用情感分析来分类文本的任务，可以按如下的结构来准备数据。 我们提供了脚本来构建字典和预处理数据。所以你只用按下面的结构来组织数据就行了。
+
+```
+dataset
+|----train
+|    |----class1
+|    |    |----text_files
+|    |----class2
+|    |    |----text_files
+|    |    ...
+|----test
+|    |----class1
+|    |    |----text_files
+|    |----class2
+|    |    |----text_files
+|    |    ...
+```
+* dataset: 一级目录。
+* train, test: 二级目录。
+* class1,class2,...: 三级目录。
+* text_files: 文本格式的实例文件。
+
+所有同目录下的文本实例文件都是同级别的。 每个文本文件包含一个或者多个实例，每一行表示一个实例。 为了充分地随机打乱训练集, 在预处理含有多行数据的文本文件时参数设置稍有不同, 执行`preprocess.sh`脚本时需要加上`-m True`参数。 tokenizer.perl 默认用来切分单词和标点符号，如果你不需要这个操作，在运行`preprocess.sh`时加上`-t False`参数即可。
+
+## 训练模型
+
+在这步任务中,我们使用了循环神经网络（RNN）的 LSTM 架构来训练情感分析模型。 引入LSTM模型主要是为了克服消失梯度的问题。 LSTM网络类似于具有隐藏层的标准循环神经网络, 但是隐藏层中的每个普通节点被一个记忆单元替换。 每个记忆单元包含四个主要的元素: 输入门, 具有自循环连接的神经元，忘记门和输出门。 更多的细节可以在文献中找到[4]。 LSTM架构的最大优点是它可以在长时间间隔内记忆信息，而没有短时记忆的损失。在有新的单词来临的每一个时间步骤内，存储在记忆单元区块的历史信息被更新用来迭代地学习单词以合理的序列呈现。
+
+<center>![LSTM](src/lstm.png)</center>
+<center>图表 1. LSTM [3]</center>
+
+情感分析是自然语言理解中最典型的问题之一。 它的目的是预测在一个序列中表达的情感态度。 通常，仅仅是一些关键词，如形容词和副词，在预测序列或段落的情感中起主要作用。然而有些评论上下文非常长，例如 IMDB的数据集。 我们之所以使用LSTM来执行这个任务是因为其改进的设计并且具有门机制。 首先，它能够从词级到具有可变上下文长度的上下文级别来总结表示。 第二，它可以在句子级别利用可扩展的上下文, 而大多数方法只是利用n-gram级别的知识。第三，它直接学习段落表示，而不是组合上下文级别信息。
+
+在本演示中，我们提供两个网络，即双向LSTM和三层堆叠LSTM。
+
+#### 双向LSTM
+
+图2是双向LSTM网络，后面连全连接层和softmax层。
+
+<center>![BiLSTM](src/bi_lstm.jpg)</center>
+<center>图 2. Bidirectional-LSTM </center>
+
+#### Stacked-LSTM
+图3是三层LSTM结构。图的底部是word embedding(对文档处理后形成的单词向量)。 接下来，连接三个LSTM隐藏层，并且第二个是反向LSTM。然后提取隐藏LSTM层的所有时间步长的最大词向量作为整个序列的表示。 最后，使用具有softmax激活的全连接前馈层来执行分类任务。 更多内容可查看参考文献 [5]。
+
+<center>![StackedLSTM](src/stacked_lstm.jpg)</center>
+<center>图 3. Stacked-LSTM for sentiment analysis </center>
+
+**配置**
+
+进入`demo/sentiment` 目录 , `trainer_config.py` 是一个配置文件的例子, 其中包含算法和网络配置。第一行从`sentiment_net.py`中导出预定义的网络。
+
+trainer_config.py:
+
+```python
+from sentiment_net import *
+
+data_dir  = "./data/pre-imdb"
+# whether this config is used for test
+is_test = get_config_arg('is_test', bool, False)
+# whether this config is used for prediction
+is_predict = get_config_arg('is_predict', bool, False)
+dict_dim, class_dim = sentiment_data(data_dir, is_test, is_predict)
+
+################## Algorithm Config #####################
+
+settings(
+  batch_size=128,
+  learning_rate=2e-3,
+  learning_method=AdamOptimizer(),
+  regularization=L2Regularization(8e-4),
+  gradient_clipping_threshold=25
+)
+
+#################### Network Config ######################
+stacked_lstm_net(dict_dim, class_dim=class_dim,
+                 stacked_num=3, is_predict=is_predict)
+#bidirectional_lstm_net(dict_dim, class_dim=class_dim, is_predict=is_predict)
+```
+
+* **数据定义**:
+   * get\_config\_arg(): 获取通过 `--config_args=xx` 设置的命令行参数。
+   * 定义训练数据和测试数据提供者, 这里使用了PaddlePaddle的Python接口来加载数据。想了解更多细节可以参考PyDataProvider部分的文档
+
+* **算法配置**:
+   * 使用随机梯度下降（sgd）算法。
+   * 使用 adam 优化。
+   * 设置batch size大小为128。
+   * 设置平均sgd窗口。
+   * 设置全局学习率。
+* **网络配置**:
+   * dict_dim: 获取字典维度。
+   * class_dim: 设置类别数，IMDB有两个标签，即正面评价标签和负面评价标签。
+   * `stacked_lstm_net`: 预定义网络如图3所示，默认情况下使用此网络
+   * `bidirectional_lstm_net`: 预定义网络，如图2所示。
+
+**训练**
+
+首先安装PaddlePaddle。 然后使用下面的脚本 `train.sh` 来开启本地的训练。
+
+```
+cd demo/sentiment/
+./train.sh
+```
+
+train.sh:
+
+```
+config=trainer_config.py
+output=./model_output
+paddle train --config=$config \
+             --save_dir=$output \
+             --job=train \
+             --use_gpu=false \
+             --trainer_count=4 \
+             --num_passes=10 \
+             --log_period=20 \
+             --dot_period=20 \
+             --show_parameter_stats_period=100 \
+             --test_all_data_in_one_period=1 \
+             2>&1 | tee 'train.log'
+```
+
+* \--config=$config: 设置网络配置。
+* \--save\_dir=$output: 设置输出路径以保存训练完成的模型。
+* \--job=train: 设置工作模式为训练。
+* \--use\_gpu=false: 使用CPU训练，如果你安装GPU版本的PaddlePaddle，并想使用GPU来训练设置为true。
+* \--trainer\_count=4:设置线程数（或GPU个数）。
+* \--num\_passes=10: 设置pass，PaddlePaddle中的一个pass意味着对数据集中的所有样本进行一次训练。
+* \--log\_period=20: 每20个batch打印一次日志。
+* \--show\_parameter\_stats\_period=100: 每100个batch打印一次统计信息。
+* \--test\_all_data\_in\_one\_period=1: 每次测试都测试所有数据。
+
+如果运行成功，输出日志保存在路径 `demo/sentiment/train.log`中，模型保存在目录`demo/sentiment/model_output/`中。  输出日志说明如下：
+
+```
+Batch=20 samples=2560 AvgCost=0.681644 CurrentCost=0.681644 Eval: classification_error_evaluator=0.36875  CurrentEval: classification_error_evaluator=0.36875
+...
+Pass=0 Batch=196 samples=25000 AvgCost=0.418964 Eval: classification_error_evaluator=0.1922
+Test samples=24999 cost=0.39297 Eval: classification_error_evaluator=0.149406
+```
+- Batch=xx: 表示训练了xx个Batch。
+- samples=xx: 表示训练了xx个样本。
+- AvgCost=xx: 从第0个batch到当前batch的平均损失。
+- CurrentCost=xx: 最新log_period个batch处理的当前损失。
+- Eval: classification\_error\_evaluator=xx: 表示第0个batch到当前batch的分类错误。
+- CurrentEval: classification\_error\_evaluator: 最新log_period个batch的分类错误。
+- Pass=0: 通过所有训练集一次称为一遍。 0表示第一次经过训练集。
+
+默认情况下，我们使用`stacked_lstm_net`网络，当传递相同的样本数时，它的收敛速度比`bidirectional_lstm_net`快。如果要使用双向LSTM，只需删除最后一行中的注释并把“stacked_lstm_net”注释掉。
+
+## 测试模型
+
+测试模型是指使用训练出的模型评估已标记的验证集。
+
+```
+cd demo/sentiment
+./test.sh
+```
+
+test.sh:
+
+```bash
+function get_best_pass() {
+  cat $1  | grep -Pzo 'Test .*\n.*pass-.*' | \
+  sed  -r 'N;s/Test.* error=([0-9]+\.[0-9]+).*\n.*pass-([0-9]+)/\1 \2/g' | \
+  sort | head -n 1
+}
+
+log=train.log
+LOG=`get_best_pass $log`
+LOG=(${LOG})
+evaluate_pass="model_output/pass-${LOG[1]}"
+
+echo 'evaluating from pass '$evaluate_pass
+
+model_list=./model.list
+touch $model_list | echo $evaluate_pass > $model_list
+net_conf=trainer_config.py
+paddle train --config=$net_conf \
+             --model_list=$model_list \
+             --job=test \
+             --use_gpu=false \
+             --trainer_count=4 \
+             --config_args=is_test=1 \
+             2>&1 | tee 'test.log'
+```
+
+函数`get_best_pass`依据分类错误率获得最佳模型进行测试。 在本示例中，我们默认使用IMDB的测试数据集作为验证。 与训练不同，它需要在这里指定`--job=test`和模型路径，即`--model_list=$model_list`。如果运行成功，日志将保存在“demo/sentiment/test.log”的路径中。例如，在我们的测试中，最好的模型是`model_output/pass-00002`，分类误差是0.115645，如下：
+
+```
+Pass=0 samples=24999 AvgCost=0.280471 Eval: classification_error_evaluator=0.115645
+```
+
+## 预测
+
+`predict.py`脚本提供了一个预测接口。在使用它之前请安装PaddlePaddle的python api。 预测IMDB的未标记评论的一个实例如下：
+
+```
+cd demo/sentiment
+./predict.sh
+```
+predict.sh:
+
+```
+#Note the default model is pass-00002, you shold make sure the model path
+#exists or change the mode path.
+model=model_output/pass-00002/
+config=trainer_config.py
+label=data/pre-imdb/labels.list
+cat ./data/aclImdb/test/pos/10007_10.txt | python predict.py \
+     --tconf=$config\
+     --model=$model \
+     --label=$label \
+     --dict=./data/pre-imdb/dict.txt \
+     --batch_size=1
+```
+
+* `cat ./data/aclImdb/test/pos/10007_10.txt` : 输入预测样本。
+* `predict.py` : 预测接口脚本。
+* `--tconf=$config` : 设置网络配置。
+* `--model=$model` : 设置模型路径。
+* `--label=$label` : 设置标签类别字典，这个字典是整数标签和字符串标签的一个对应。
+* `--dict=data/pre-imdb/dict.txt` : 设置字典文件。
+* `--batch_size=1` : 设置batch size。
+
+注意应该确保默认模型路径`model_output/pass-00002`存在或更改为其它模型路径。
+
+本示例的预测结果：
+
+```
+Loading parameters from model_output/pass-00002/
+./data/aclImdb/test/pos/10014_7.txt: predicting label is pos
+```
+我们真诚地感谢您的关注，并欢迎您来参与贡献。
+
+## 参考文档
+[1] Brendan O'Connor, Ramnath Balasubramanyan, Bryan R. Routledge, and Noah A. Smith. 2010. [From Tweets to Polls: Linking Text Sentiment to Public Opinion Time Series](http://homes.cs.washington.edu/~nasmith/papers/oconnor+balasubramanyan+routledge+smith.icwsm10.pdf). In ICWSM-2010. <br>
+[2] Johan Bollen, Huina Mao, Xiaojun Zeng. 2011. [Twitter mood predicts the stock market](http://arxiv.org/abs/1010.3003), Journal of Computational Science.<br>
+[3] Alex Graves, Marcus Liwicki, Santiago Fernan- dez, Roman Bertolami, Horst Bunke, and Ju ̈rgen Schmidhuber. 2009. [A novel connectionist system for unconstrained handwriting recognition. IEEE Transactions on Pattern Analysis and Machine In- telligence](http://www.cs.toronto.edu/~graves/tpami_2009.pdf), 31(5):855–868.<br>
+[4] Zachary C. Lipton, [A Critical Review of Recurrent Neural Networks for Sequence Learning](http://arxiv.org/abs/1506.00019v1), arXiv:1506.00019. <br>
+[5] Jie Zhou and Wei Xu; [End-to-end Learning of Semantic Role Labeling Using Recurrent Neural Networks](http://www.aclweb.org/anthology/P/P15/P15-1109.pdf); ACL-IJCNLP 2015. <br>
diff --git a/doc/tutorials/sentiment_analysis/src/bi_lstm.jpg b/doc/tutorials/sentiment_analysis/src/bi_lstm.jpg
new file mode 100644
index 0000000000..adec1606d6
Binary files /dev/null and b/doc/tutorials/sentiment_analysis/src/bi_lstm.jpg differ
diff --git a/doc/tutorials/sentiment_analysis/src/lstm.png b/doc/tutorials/sentiment_analysis/src/lstm.png
new file mode 100644
index 0000000000..aaf1fc690d
Binary files /dev/null and b/doc/tutorials/sentiment_analysis/src/lstm.png differ
diff --git a/doc/tutorials/sentiment_analysis/src/stacked_lstm.jpg b/doc/tutorials/sentiment_analysis/src/stacked_lstm.jpg
new file mode 100644
index 0000000000..4239055050
Binary files /dev/null and b/doc/tutorials/sentiment_analysis/src/stacked_lstm.jpg differ
diff --git a/doc/tutorials/text_generation/index_cn.md b/doc/tutorials/text_generation/index_cn.md
new file mode 100644
index 0000000000..41a87b926d
--- /dev/null
+++ b/doc/tutorials/text_generation/index_cn.md
@@ -0,0 +1,339 @@
+# 文本生成教程 #
+
+在语言生成领域中，“序列到序列”（sequence to sequence）的方法已被证明是一种强大的模型。它可以被应用于进行机器翻译（machine translation）、query改写（query rewriting）、图像描述（image captioning）等等。
+
+本篇教程将会指导你通过训练一个“序列到序列”的神经网络机器翻译（NMT）模型来将法语翻译成英语。
+
+我们遵循 [Neural Machine Translation by Jointly Learning to Align and Translate](http://arxiv.org/abs/1409.0473) 这篇文章，其中详细说明了模型架构，以及在WMT-14数据集上得到良好表现的训练过程。本篇教程在PaddlePaddle中重现了这一良好的训练结果。
+
+我们感谢@caoying的pull request，其中定义了模型架构和solver配置。
+
+## 数据准备 ##
+### 下载与解压缩 ###
+从该链接 [http://www-lium.univ-lemans.fr/~schwenk/cslm\_joint\_paper/](http://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/) 下载WMT-14数据集，然后解压，并将Develop和Test数据分别放入不同的文件夹。
+
+- **Train data**: [bitexts (选择过后的)](http://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/data/bitexts.tgz)
+- **Develop and Test data**: [dev 与 test 数据](http://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/data/dev+test.tgz)
+
+在Linux下，只需要简单地运行以下命令。否则你需要自己下载、解压、拆分到不同文件夹、并且分别重命名文件后缀。
+
+```bash
+cd demo/seqToseq/data
+./wmt14_data.sh
+```
+
+我们会发现数据集 `wmt14` 中包含如下表所示的3个文件夹。
+<table border="2" cellspacing="0" cellpadding="6" rules="all" frame="border">
+<colgroup>
+<col  class="left" />
+<col  class="left" />
+<col  class="left" />
+<col  class="left" />
+</colgroup>
+
+<thead>
+<tr>
+<th scope="col" class="left">folder name</th>
+<th scope="col" class="left">French-English parallel corpora file</th>
+<th scope="col" class="left">number of total file</th>
+<th scope="col" class="left">size</th>
+</tr>
+</thead>
+
+<tbody>
+<tr>
+<td class="left">train_data</td>
+<td class="left">ccb2_pc30.src, ccb2_pc30.trg, etc</td>
+<td class="left">12</td>
+<td class="left">3.55G</td>
+</tr>
+
+<tr>
+<td class="left">test_data</td>
+<td class="left">ntst1213.src, ntst1213.trg</td>
+<td class="left">2</td>
+<td class="left">1636k</td>
+</tr>
+
+<tr>
+<td class="left">gen_data</td>
+<td class="left">ntst14.src, ntst14.trg</td>
+<td class="left">2</td>
+<td class="left">864k</td>
+</tr>
+</tbody>
+</table>
+<br/>
+
+- 每个文件夹都包含法语到英语的平行语料库
+- **XXX.src** 是原始法语文件；**XXX.trg** 是目标英语文件
+- **XXX.src** 和 **XXX.trg** 的行数应该一致
+- 每行都是一个法语或者英语的句子
+- **XXX.src** 和 **XXX.trg** 中任意第i行的句子之间都有着一一对应的关系
+
+### 用户自定义数据集 ###
+
+如果你想进行诸如语义转述（Paraphrasing）等其他“序列到序列”的任务，你只需要按照如下方式组织数据，并将它们放在`demo/seqToseq/data`目录下：
+
+    dataset
+      train
+        file1.src file1.trg
+        file2.src file2.trg
+        ......
+      test
+        file1.src file1.trg
+        file2.src file2.trg
+        ......
+      gen
+        file1.src file1.trg
+        file2.src file2.trg
+        ......
+  
+- 一级目录：数据集文件夹名称
+- 二级目录：train、test和gen这三个文件夹是固定的
+- 三级目录：源语言到目标语言的平行语料库文件
+  - **XXX.src** 是源语言的文件，**XXX.trg** 是目标语言的文件
+  - 文件中的每行都必须是一个句子
+  - **XXX.src** 和 **XXX.trg** 中任意第i行的句子之间都必须有着一一对应的关系
+
+## 数据预处理 ##
+### 预处理工作流程 ###
+- 将每个源语言到目标语言的平行语料库文件合并为一个文件：
+  - 合并每个 **XXX.src** 和 **XXX.trg** 文件为 **XXX**
+  - **XXX** 中的第i行 = **XXX.src** 中的第i行 + '\t' + **XXX.trg**中的第i行
+- 创建训练数据的“源字典”和“目标字典”，每个字典都有DICTSIZE个单词，包括：
+  - 词频最高的（DICTSIZE - 3）个单词
+  - 3个特殊符号
+  - `<s>`：序列的开始
+  - `<e>`：序列的结束
+  - `<unk>`：未包含在字典中的单词
+
+### 预处理命令和结果
+对数据集进行预处理的基本命令是：
+
+```bash
+cd demo/seqToseq/
+python preprocess.py -i INPUT [-d DICTSIZE] [-m]
+```
+
+- `-i INPUT`：输入的原始数据集路径
+- `-d DICTSIZE`：指定的字典单词数，如果没有设置，字典会包含输入数据集中的所有单词
+- `-m --mergeDict`：合并 “源字典”和“目标字典”，使得两个字典有相同的上下文
+
+你将会看到如下消息：
+
+    concat parallel corpora for dataset
+    build source dictionary for train data
+    build target dictionary for train data
+    dictionary size is XXX
+
+然后你只需要运行以下命令：
+
+```bash
+python preprocess.py -i data/wmt14 -d 30000
+```
+
+这将花费数分钟的时间，并且将预处理好的数据集存放在`demo/seqToseq/data/pre-wmt14`目录下。目录结构如下：
+
+    train test gen train.list test.list gen.list src.dict trg.dict
+
+- **train, test, gen**：分别包含了法语到英语的平行语料库的训练数据、测试数据和生成数据。文件夹中的每个文件的每一行包含两部分，首先是法语序列，然后是对应的英语序列。
+- **train.list, test.list, gen.list**：分别为train，test，gen文件夹中的文件列表
+- **src.dict, trg.dict**：源（法语）/目标（英语）字典，每个字典包含总共30000个单词：29997个最高频单词和3个特殊符号
+
+## 模型训练 ##
+### 简介 ###
+
+神经网络机器翻译（NMT）旨在建立一个可以被协同调至最优翻译效果的单神经元网络。近期提出的NMT模型通常都属于编解码模型（encoder–decoder models）的一种。编解码模型将一个源语句编码为一个定长的向量，然后解码器通过这个向量生成一个目标语句。
+
+在这个任务中，我们使用了一个编解码模型的扩展，它同时学习排列(align)与翻译。每当模型在翻译过程中生成了一个单词，它就会在源语句中搜索出最相关信息的位置的集合。解码器根据上下文向量预测出一个目标单词，这个向量与源中搜索出的位置和所有之前生成的目标单词有关。如想了解更多详细的解释，可以参考 [Neural Machine Translation by Jointly Learning to Align and Translate](http://arxiv.org/abs/1409.0473)。
+
+这个模型对于编解码模型来说，最不同的特色是它并没有将输入语句编码为一个单独的定长向量。相反，它将输入语句编码为向量的序列，其中每个向量对应输入语句中的一个元素。然后在解码被翻译的语句时，会自适应地从这些向量中选择一个子集出来。这使得NMT模型得以解放出来，不必再将任意长度源语句中的所有信息压缩至一个定长的向量中。该模型在长语句翻译的场景下效果提升更加明显，在任意长度语句翻译的场景下都可以观察到其效果的提升。
+<center>![](./encoder-decoder-attention-model.png)</center>
+<center>Figure 1. Encoder-Decoder-Attention-Model</center>
+
+### 使用PaddlePaddle训练模型 ###
+我们在训练之前需要创建一个模型配置文件，这里是一个例子`demo/seqToseq/translation/train.conf`。前三行import了定义network，job_mode和attention_mode的python函数。
+
+```python
+from seqToseq_net import *
+is_generating = False
+
+### Data Definiation
+train_conf = seq_to_seq_data(data_dir = "./data/pre-wmt14",
+                             is_generating = is_generating)
+
+### Algorithm Configuration
+settings(
+    learning_method = AdamOptimizer(),
+    batch_size = 50,
+    learning_rate = 5e-4)
+
+### Network Architecture
+gru_encoder_decoder(train_conf, is_generating)
+```
+
+1. **Data Definiation**：在示例中我们定义了一个序列到序列的训练和测试数据。它返回train_conf作为配置，其输入参数如下：
+  - data_dir：训练数据和测试数据的目录
+  - is_generating：这个配置是否用来生成，这里设置为False
+2. **Algorithm Configuration**：在示例中我们使用SGD训练算法（默认），和ADAM学习方法，指定batch_size为50，learning_rate为5e-4
+3. **Network Architecture**：在示例中我们使用attention版本的GRU编解码网络。它包括了一个双向的GRU作为编码器和解码器，它模拟了解码翻译过程中在源语句中的搜索。
+
+### 训练模型的命令与结果 ###
+写完模型配置之后，我们可以通过以下命令来训练模型：
+
+```bash
+cd demo/seqToseq/translation
+./train.sh
+```
+
+`train.sh` 的内容如下所示：
+
+```bash
+paddle train \
+--config='translation/train.conf' \
+--save_dir='translation/model' \
+--use_gpu=false \
+--num_passes=16 \
+--show_parameter_stats_period=100 \
+--trainer_count=4 \
+--log_period=10 \
+--dot_period=5 \
+2>&1 | tee 'translation/train.log'
+```
+- config: 设置神经网络的配置文件
+- save_dir: 设置保存模型的输出路径
+- use_gpu: 是否使用GPU训练，这里设置为使用CPU
+- num_passes: 设置passes的数量。paddle中的一个pass表示将训练数据集中的所有样本训练一次
+- show_parameter_stats_period: 这里每隔100个batch显示一次参数统计信息
+- trainer_count: 设置CPU线程数或者GPU设备数
+- log_period: 这里每隔10个batch打印一次日志
+- dot_period: 这里每隔5个batch打印一个点"."
+
+训练的损失函数默认每隔10个batch打印一次，你将会看到如下消息：
+
+    I0719 19:16:45.952062 15563 TrainerInternal.cpp:160]  Batch=10 samples=500 AvgCost=198.475 CurrentCost=198.475 Eval: classification_error_evaluator=0.737155  CurrentEval: classification_error_evaluator=0.737155
+    I0719 19:17:56.707319 15563 TrainerInternal.cpp:160]  Batch=20 samples=1000 AvgCost=157.479 CurrentCost=116.483 Eval: classification_error_evaluator=0.698392  CurrentEval: classification_error_evaluator=0.659065
+    .....
+- AvgCost：从第0个batch到当前batch的平均cost
+- CurrentCost：当前batch的cost
+- classification\_error\_evaluator(Eval)：从第0个评估到当前评估中，每个单词的预测错误率
+- classification\_error\_evaluator(CurrentEval)：当前评估中，每个单词的预测错误率
+
+当classification\_error\_evaluator的值低于0.35时，模型就训练成功了。
+
+## 文本生成 ##
+### 简介 ###
+
+一般而言，NMT模型以源语句的编码为条件，并根据当前目标单词来预测下一个目标单词。在训练过程中，当前单词总是已知的真值（ground truth）；相比之下，在生成过程中，当前单词是解码器上一时间步的输出，它可以从PaddlePaddle的内存（memory）中获取。
+
+而且，我们使用集束搜索（Beam Search）来生成序列。集束搜索使用广度优先搜索来构建搜索树。对于树的每一层，生成当前层的所有后继状态，并将它们按照启发代价（heuristic cost）升序排列。但是这种方法在每层只保存预设数量的最优状态（这个数量称为beam size）。
+
+### 预训练的模型 ###
+我们在拥有50个节点的集群中训练模型，每个节点有两个6核CPU。我们在5天里训练了16个pass，其中每条pass花费了7个小时。model_dir中有16个子目录，每个里面都包含202MB的全部的模型参数。然后我们发现pass-00012的模型有着最高的BLEU值27.77（参考文献[BLEU: a Method for Automatic Evaluation of Machine Translation](http://www.aclweb.org/anthology/P02-1040.pdf)）。要下载解压这个模型，只需在linux下运行如下命令：
+
+```bash
+cd demo/seqToseq/data
+./wmt14_model.sh
+```
+
+### 使用PaddlePaddle生成模型 ###
+在翻译法语句子之前，我们需要创建模型配置文件。这里是一个例子`demo/seqToseq/translation/gen.conf`。前三行import了定义network，job_mode和attention_mode的python函数。
+
+```python
+from seqToseq_net import *
+is_generating = True
+
+################## Data Definiation #####################
+gen_conf = seq_to_seq_data(data_dir = "./data/pre-wmt14",
+                           is_generating = is_generating,
+                           gen_result = "./translation/gen_result")
+
+############## Algorithm Configuration ##################
+settings(
+  learning_method = AdamOptimizer(),
+  batch_size = 1,
+  learning_rate = 0)
+
+################# Network configure #####################
+gru_encoder_decoder(gen_conf, is_generating)
+```
+
+1. **Data Definiation**：在示例中我们定义了一个序列到序列的生成数据。它返回gen_conf作为配置，其输入参数如下：
+  - data_dir：生成数据的目录
+  - is_generating：这个配置是否用来生成，这里设置为True
+  - gen_result：保存生成结果的文件
+2. **Algorithm Configuration**：在生成过程中我们使用SGD训练算法，并指定batch_size为1（每次生成1个序列），learning_rate为0
+3. **Network Architecture**：本质上与训练模型一样
+
+### 生成模型的命令与结果 ###
+写完模型配置之后，我们可以通过以下命令来进行从法语到英语的文本翻译：
+
+```bash
+cd demo/seqToseq/translation
+./gen.sh
+```
+
+ `gen.sh` 的内容如下所示。与训练模型不同的是，这里有一些不同的参数需要指定：
+
+```bash
+paddle train \
+--job=test \
+--config='translation/gen.conf' \
+--save_dir='data/wmt14_model' \
+--use_gpu=true \
+--num_passes=13 \
+--test_pass=12 \
+--trainer_count=1 \
+2>&1 | tee 'translation/gen.log'
+```
+- job：设置任务的模式为测试
+- save_dir：存储模型的路径
+- num_passes 和 test_pass：从test_pass到（num_passes - 1）加载模型参数，这里只加载 `data/wmt14_model/pass-00012`
+
+你将会看到这样的消息：
+
+    I0706 14:48:31.178915 31441 GradientMachine.cpp:143] Loading parameters from data/wmt14_model/pass-00012
+    I0706 14:48:40.012039 31441 Tester.cpp:125]  Batch=100 samples=100 AvgCost=0
+    I0706 14:48:48.898632 31441 Tester.cpp:125]  Batch=200 samples=200 AvgCost=0
+    ...
+
+然后在`demo/seqToseq/translation/gen_result`中的生成结果如下所示：
+
+    0
+    0       -11.1314         The <unk> <unk> about the width of the seats while large controls are at stake <e>
+    1       -11.1519         The <unk> <unk> on the width of the seats while large controls are at stake <e>
+    2       -11.5988         The <unk> <unk> about the width of the seats while large controls are at stake . <e>
+
+    1
+    0       -24.4149         The dispute is between the major aircraft manufacturers about the width of the tourist seats on the <unk> flights , paving the way for a <unk> confrontation during the month of the Dubai <unk> . <e>
+    1       -26.9524         The dispute is between the major aircraft manufacturers about the width of the tourist seats on the <unk> flights , paving the way for a <unk> confrontation during the month of Dubai &apos; s <unk> . <e>
+    2       -27.9574         The dispute is between the major aircraft manufacturers about the width of the tourist seats on the <unk> flights , paving the way for a <unk> confrontation during the month of Dubai &apos; s Dubai <unk> . <e>
+    ...
+
+- 这是集束搜索的结果，其中beam size是3
+- 第一行的“0”和第6行的“1”表示生成数据的序列id
+- 其他六行列出了集束搜索的结果
+  - 第二列是集束搜索的得分（从大到小）
+  - 第三列是生成的英语序列
+- 有两个特殊标识：
+  - `<e>`：序列的结尾
+  - `<unk>`：不包含在字典中的单词
+
+### BLEU评估 ###
+对机器翻译的人工评估工作很广泛但也很昂贵。一篇论文 [BLEU: a Method for Automatic Evaluation of Machine Translation](http://www.aclweb.org/anthology/P02-1040.pdf) 展示了一种方法，当需要快速或者频繁的评估时，使用自动的替补来替代经验丰富的人工评判。[Moses](http://www.statmt.org/moses/) 是一个统计学的机器翻译系统，我们使用其中的 [multi-bleu.perl](https://github.com/moses-smt/mosesdecoder/blob/master/scripts/generic/multi-bleu.perl) 来做BLEU评估。运行以下命令来下载这个脚本：
+
+```bash
+cd demo/seqToseq/translation
+./moses_bleu.sh
+```
+
+由于标准的翻译结果已经下载到这里`data/wmt14/gen/ntst14.trg`，我们可以运行以下命令来做BLEU评估。
+
+```bash
+cd demo/seqToseq/translation
+./eval_bleu.sh FILE BEAMSIZE
+```
+
+- FILE：生成的结果文件
+- BEAMSIZE：集束搜索中的扩展广度
diff --git a/doc/tutorials/text_generation/index_en.md b/doc/tutorials/text_generation/index_en.md
index d63f5cb607..5d8e667c20 100644
--- a/doc/tutorials/text_generation/index_en.md
+++ b/doc/tutorials/text_generation/index_en.md
@@ -260,8 +260,8 @@ gru_encoder_decoder(gen_conf, is_generating)
 
 1. **Data Definiation**: We defines an SeqToSeq gen data in our example. It returns gen_conf as the configuration, following is its input arguments:
    - data\_dir: directory of gen data
-   - is\_generating: whether this config is used for generating, here is false
-   - gen\_result: file to store the generation result
+   - is\_generating: whether this config is used for generating, here is true
+   - gen\_result: file to store the generation result
 2. **Algorithm Configuration**: We use SGD traing algorithm in generation, and specify batch_size as 1 (each time generate one sequence), and learning rate as 0.
 3. **Network Architecture**: Essentially the same as the training model.
 
diff --git a/doc_cn/CMakeLists.txt b/doc_cn/CMakeLists.txt
deleted file mode 100644
index 314b34525c..0000000000
--- a/doc_cn/CMakeLists.txt
+++ /dev/null
@@ -1,31 +0,0 @@
-if(NOT DEFINED SPHINX_THEME)
-    set(SPHINX_THEME default)
-endif()
-
-if(NOT DEFINED SPHINX_THEME_DIR)
-    set(SPHINX_THEME_DIR)
-endif()
-
-# configured documentation tools and intermediate build results
-set(BINARY_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/_build")
-
-# Sphinx cache with pickled ReST documents
-set(SPHINX_CACHE_DIR "${CMAKE_CURRENT_BINARY_DIR}/_doctrees")
-
-# HTML output directory
-set(SPHINX_HTML_DIR "${CMAKE_CURRENT_BINARY_DIR}/html")
-
-configure_file(
-    "${CMAKE_CURRENT_SOURCE_DIR}/conf.py.in"
-    "${BINARY_BUILD_DIR}/conf.py"
-    @ONLY)
-
-sphinx_add_target(paddle_docs_cn
-                  html
-                  ${BINARY_BUILD_DIR}
-                  ${SPHINX_CACHE_DIR}
-                  ${CMAKE_CURRENT_SOURCE_DIR}
-                  ${SPHINX_HTML_DIR})
-
-add_dependencies(paddle_docs_cn
-  gen_proto_py)
diff --git a/doc_cn/algorithm/rnn/hrnn_demo.rst b/doc_cn/algorithm/rnn/hrnn_demo.rst
deleted file mode 100644
index 96396ff105..0000000000
--- a/doc_cn/algorithm/rnn/hrnn_demo.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-..	_algo_hrnn_demo:
-
-#################
-双层RNN的使用示例
-#################
-
-TBD
\ No newline at end of file
diff --git a/doc_cn/build_and_install/cmake/index.rst b/doc_cn/build_and_install/cmake/index.rst
deleted file mode 100644
index e2a12c5001..0000000000
--- a/doc_cn/build_and_install/cmake/index.rst
+++ /dev/null
@@ -1,8 +0,0 @@
-使用cmake编译PaddlePaddle
-=========================
-
-..  toctree::
-    
-    install_deps.rst
-    compile_options.rst
-    make_and_install.rst
diff --git a/doc_cn/build_and_install/cmake/install_deps.rst b/doc_cn/build_and_install/cmake/install_deps.rst
deleted file mode 100644
index 7fa4665a95..0000000000
--- a/doc_cn/build_and_install/cmake/install_deps.rst
+++ /dev/null
@@ -1,4 +0,0 @@
-安装编译PaddlePaddle需要的依赖
-==============================
-
-参见 `安装编译依赖 <../../../doc/build/build_from_source.html#install-dependencies>`_
diff --git a/doc_cn/build_and_install/cmake/make_and_install.rst b/doc_cn/build_and_install/cmake/make_and_install.rst
deleted file mode 100644
index 212b9c9352..0000000000
--- a/doc_cn/build_and_install/cmake/make_and_install.rst
+++ /dev/null
@@ -1,4 +0,0 @@
-make和make install
-==================
-
-参见 `make和make install <../../../doc/build/build_from_source.html#build-and-install>`_
diff --git a/doc_cn/build_and_install/index.rst b/doc_cn/build_and_install/index.rst
deleted file mode 100644
index 48163fb36e..0000000000
--- a/doc_cn/build_and_install/index.rst
+++ /dev/null
@@ -1,27 +0,0 @@
-编译与安装
-========================
-
-安装
-++++
-
-PaddlePaddle提供数个预编译的二进制来进行安装，包括Docker镜像，ubuntu的deb安装包等。我们推荐使用Docker镜像来部署环境，同时欢迎贡献更多的安装包。
-
-.. toctree::
-   :maxdepth: 1
-   
-   install/docker_install.rst 
-   install/ubuntu_install.rst
-
-
-
-编译
-++++
-
-..  warning::
-
-	编译选项主要推荐高级用户查看，普通用户请走安装流程。
-
-.. toctree::
-   :maxdepth: 1
-
-   cmake/index.rst
diff --git a/doc_cn/build_and_install/install/paddle_ssh.Dockerfile b/doc_cn/build_and_install/install/paddle_ssh.Dockerfile
deleted file mode 100644
index 7cb947bddf..0000000000
--- a/doc_cn/build_and_install/install/paddle_ssh.Dockerfile
+++ /dev/null
@@ -1,15 +0,0 @@
-FROM paddledev/paddle:cpu-latest
-
-MAINTAINER PaddlePaddle dev team <paddle-dev@baidu.com>
-
-RUN apt-get update
-RUN apt-get install -y openssh-server
-RUN mkdir /var/run/sshd
-RUN echo 'root:root' | chpasswd
-
-RUN sed -ri 's/^PermitRootLogin\s+.*/PermitRootLogin yes/' /etc/ssh/sshd_config
-RUN sed -ri 's/UsePAM yes/#UsePAM yes/g' /etc/ssh/sshd_config
-
-EXPOSE 22
-
-CMD    ["/usr/sbin/sshd", "-D"]
diff --git a/doc_cn/build_and_install/install/paddle_version.txt b/doc_cn/build_and_install/install/paddle_version.txt
deleted file mode 100644
index a80873303f..0000000000
--- a/doc_cn/build_and_install/install/paddle_version.txt
+++ /dev/null
@@ -1,11 +0,0 @@
-PaddlePaddle 0.8.0b1, compiled with
-    with_avx: ON
-    with_gpu: OFF
-    with_double: OFF
-    with_python: ON
-    with_rdma: OFF
-    with_glog: ON
-    with_gflags: ON
-    with_metric_learning:
-    with_timer: OFF
-    with_predict_sdk:
diff --git a/doc_cn/cluster/index.rst b/doc_cn/cluster/index.rst
deleted file mode 100644
index 25313a9635..0000000000
--- a/doc_cn/cluster/index.rst
+++ /dev/null
@@ -1,11 +0,0 @@
-集群训练
-========
-
-* `集群训练 <../../doc/cluster/index.html>`_
-
-.. toctree::
-    :maxdepth: 2
-    :glob:
-
-    集群训练(对内) <internal/index.md>
-
diff --git a/doc_cn/concepts/nn.rst b/doc_cn/concepts/nn.rst
deleted file mode 100644
index f4d2cf490d..0000000000
--- a/doc_cn/concepts/nn.rst
+++ /dev/null
@@ -1,3 +0,0 @@
-TBD
-
-目前正在书写中。敬请期待。
\ No newline at end of file
diff --git a/doc_cn/concepts/program_concepts.rst b/doc_cn/concepts/program_concepts.rst
deleted file mode 100644
index af5bbdac26..0000000000
--- a/doc_cn/concepts/program_concepts.rst
+++ /dev/null
@@ -1,4 +0,0 @@
-TBD
-###
-
-目前正在书写中。敬请期待。
\ No newline at end of file
diff --git a/doc_cn/demo/index.rst b/doc_cn/demo/index.rst
deleted file mode 100644
index e15e839f93..0000000000
--- a/doc_cn/demo/index.rst
+++ /dev/null
@@ -1,26 +0,0 @@
-使用示例
-========
-
-图像
-''''
-
-* `图像分类 <../../doc/demo/image_classification/index.html>`_
-
-自然语言处理
-''''''''''''
-
-* `情感分析 <sentiment_analysis/index.html>`_
-* `文本生成 <../../doc/demo/text_generation/index.html>`_
-* `词性标注 <../../doc/demo/semantic_role_labeling/index.html>`_
-
-推荐
-''''
-
-* `MovieLens数据集 <../../doc/demo/rec/ml_dataset.html>`_
-* `MovieLens评分回归 <../../doc/demo/rec/ml_regression.html>`_
-
-常用模型
-''''''''
-
-* `ImageNet: ResNet <../../doc/demo/imagenet_model/resnet_model.html>`_
-* `Embedding: Chinese Word <../../doc/demo/embedding_model/index.html>`_
diff --git a/doc_cn/demo/quick_start/index.md b/doc_cn/demo/quick_start/index.md
deleted file mode 100644
index 4a6e07ee1f..0000000000
--- a/doc_cn/demo/quick_start/index.md
+++ /dev/null
@@ -1,543 +0,0 @@
-# PaddlePaddle快速入门教程
-
-我们以文本分类问题作为背景，介绍PaddlePaddle使用流程和常用的网络基础单元的配置方法。
-
-## 安装(Install)
-
-首先请参考<a href = "../../build_and_install/index.html">安装教程</a>安装PaddlePaddle。
-
-## 使用概述(Overview)
-
-**文本分类问题**：对于给定的一条文本， 我们从提前给定的类别集合中选择其所属类
-别。比如通过用户对电子商务网站评论，评估产品的质量：
-
-- 这个显示器很棒！ （好评）
-- 用了两个月之后这个显示器屏幕碎了。（差评）
-
-每一个任务流程都可以分为如下5个基础部分。
-<center> ![](./Pipeline.jpg) </center>
-
-1. 数据格式准备
-    - 每行保存一条样本，类别Id 和文本信息用Tab间隔， 文本中的单词用空格分隔（如果不切词，则字与字之间用空格分隔），例如：```类别Id ‘\t’ 这 个 显 示 器 很 棒 ！```
-2. 数据向模型传送
-    - PaddlePaddle可以读取Python写的传输数据脚本，所有字符都将转换为连续整数表示的Id传给模型
-3. 网络结构（由易到难展示4种不同的网络配置）
-    - 逻辑回归模型
-    - 词向量模型
-    - 卷积模型
-    - 时序模型
-    - 优化算法
-4. 训练模型
-5. 预测
-
-## 数据格式准备(Data Preparation)
-在本问题中，我们使用[Amazon电子产品评论数据](http://jmcauley.ucsd.edu/data/amazon/)，
-将评论分为好评(正样本)和差评(负样本)两类。[源码](https://github.com/PaddlePaddle/Paddle)的`demo/quick_start`里提供了下载已经预处理数据的脚本（如果想从最原始的数据处理，可以使用脚本 `./demo/quick_start/data/proc_from_raw_data/get_data.sh`）。
-
-```bash
-cd demo/quick_start
-./data/get_data.sh
-```
-
-## 数据向模型传送(Transfer Data to Model)
-
-### Python数据加载脚本(Data Provider Script)
-
-下面dataprovider_bow.py文件给出了完整例子，主要包括两部分：
-
-* initalizer： 定义文本信息、类别Id的数据类型。
-* process： yield文本信息和类别Id，和initalizer里定义顺序一致。
-
-```python
-from paddle.trainer.PyDataProvider2 import *
-
-# id of the word not in dictionary
-UNK_IDX = 0
-
-# initializer is called by the framework during initialization.
-# It allows the user to describe the data types and setup the
-# necessary data structure for later use.
-# `settings` is an object. initializer need to properly fill settings.input_types.
-# initializer can also store other data structures needed to be used at process().
-# In this example, dictionary is stored in settings.
-# `dictionay` and `kwargs` are arguments passed from trainer_config.lr.py
-def initializer(settings, dictionary, **kwargs):
-    # Put the word dictionary into settings
-    settings.word_dict = dictionary
-
-    # setting.input_types specifies what the data types the data provider
-    # generates.
-    settings.input_types = [
-        # The first input is a sparse_binary_vector,
-        # which means each dimension of the vector is either 0 or 1. It is the
-        # bag-of-words (BOW) representation of the texts.
-        sparse_binary_vector(len(dictionary)),
-        # The second input is an integer. It represents the category id of the
-        # sample. 2 means there are two labels in the dataset.
-        # (1 for positive and 0 for negative)
-        integer_value(2)]
-
-# Delaring a data provider. It has an initializer 'data_initialzer'.
-# It will cache the generated data of the first pass in memory, so that
-# during later pass, no on-the-fly data generation will be needed.
-# `setting` is the same object used by initializer()
-# `file_name` is the name of a file listed train_list or test_list file given
-# to define_py_data_sources2(). See trainer_config.lr.py.
-@provider(init_hook=initializer, cache=CacheType.CACHE_PASS_IN_MEM)
-def process(settings, file_name):
-    # Open the input data file.
-    with open(file_name, 'r') as f:
-        # Read each line.
-        for line in f:
-            # Each line contains the label and text of the comment, separated by \t.
-            label, comment = line.strip().split('\t')
-
-            # Split the words into a list.
-            words = comment.split()
-
-            # convert the words into a list of ids by looking them up in word_dict.
-            word_vector = [settings.word_dict.get(w, UNK_IDX) for w in words]
-
-            # Return the features for the current comment. The first is a list
-            # of ids representing a 0-1 binary sparse vector of the text,
-            # the second is the integer id of the label.
-            yield word_vector, int(label)
-```
-
-### 配置中的数据加载定义(Data Provider in Configure)
-
-在模型配置中利用`define_py_data_sources2`加载数据：
-
-```python
-from paddle.trainer_config_helpers import *
-
-file = "data/dict.txt"
-word_dict = dict()
-with open(dict_file, 'r') as f:
-    for i, line in enumerate(f):
-        w = line.strip().split()[0]
-        word_dict[w] = i
-# define the data sources for the model.
-# We need to use different process for training and prediction.
-# For training, the input data includes both word IDs and labels.
-# For prediction, the input data only includs word Ids.
-define_py_data_sources2(train_list='data/train.list',
-                        test_list='data/test.list',
-                        module="dataprovider_bow",
-                        obj="process",
-                        args={"dictionary": word_dict})
-```
-* data/train.list,data/test.list: 指定训练、测试数据
-* module="dataprovider": 数据处理Python文件名
-* obj="process": 指定生成数据的函数
-* args={"dictionary": word_dict}: 额外的参数，这里指定词典
-
-更详细数据格式和用例请参考<a href = "../../ui/data_provider/pydataprovider2.html">
-PyDataProvider2</a>。
-
-## 网络结构(Network Architecture)
-本节我们将专注于网络结构的介绍。
-<center> ![](./PipelineNetwork.jpg) </center>
-
-我们将以基本的逻辑回归网络作为起点，并逐渐展示更加深入的功能。更详细的网络配置
-连接请参考<a href = "../../../doc/layer.html">Layer文档</a>。
-所有配置在[源码](https://github.com/PaddlePaddle/Paddle)`demo/quick_start`目录，首先列举逻辑回归网络。
-
-### 逻辑回归模型(Logistic Regression)
-
-流程如下：
-<center> ![](./NetLR.jpg) </center>
-
-- 获取利用one-hot vector表示的每个单词，维度是词典大小
-
-```python
-word = data_layer(name="word",  size=word_dim)
-```
-
-- 获取该条样本类别Id，维度是类别个数。
-
-```python
-label = data_layer(name="label", size=label_dim)
-```
-
-- 利用逻辑回归模型对该向量进行分类，同时会计算分类准确率
-
-```python
-# Define a fully connected layer with logistic activation (also called softmax activation).
-output = fc_layer(input=word,
-                  size=label_dim,
-                  act_type=SoftmaxActivation())
-# Define cross-entropy classification loss and error.
-classification_cost(input=output, label=label)
-```
-
- - input: 除过data层，每个层都有一个或多个input,多个input以list方式输入
- - size: 该层神经元个数
- - act_type: 激活函数类型
-
-效果总结：我们将在后面介绍训练和预测的流程的脚本。在此为方便对比不同网络结构，
-我们随时总结了各个网络的复杂度和效果。
-
-<html>
-<center>
-<table border="2" cellspacing="0" cellpadding="6" rules="all" frame="border">
-
-<thead>
-<th scope="col" class="left">网络名称</th>
-<th scope="col" class="left">参数数量</th>
-<th scope="col" class="left">错误率</th>
-</tr>
-</thead>
-
-<tbody>
-<tr>
-<td class="left">逻辑回归</td>
-<td class="left">252 KB</td>
-<td class="left">8.652%</td>
-</tr>
-
-</tbody>
-</table></center>
-</html>
-<br>
-
-### 词向量模型(Word Vector)
-
-embedding模型需要稍微改变数据提供的脚本，即`dataprovider_emb.py`，词向量模型、
-卷积模型、时序模型均使用该脚本。其中文本输入类型定义为整数时序类型integer_value_sequence。
-
-```
-def initializer(settings, dictionary, **kwargs):
-    settings.word_dict = dictionary
-    settings.input_types = [
-        # Define the type of the first input as sequence of integer.
-        # The value of the integers range from 0 to len(dictrionary)-1
-        integer_value_sequence(len(dictionary)),
-        # Define the second input for label id
-        integer_value(2)]
-
-@provider(init_hook=initializer)
-def process(settings, file_name):
-    ...
-    # omitted, it is same as the data provider for LR model
-```
-
-该模型依然是使用逻辑回归分类网络的框架， 只是将句子利用连续向量表示替换稀疏
-向量表示， 即对第3步进行替换。句子表示的计算更新为2步：
-<center> ![](./NetContinuous.jpg) </center>
-
-- 利用单词Id查找对应的该单词的连续表示向量(维度为word_dim)， 输入N个单词，输出为N个word_dim维度向量
-
-```python
-emb = embedding_layer(input=word, size=word_dim)
-```
-
-- 将该句话包含的所有单词向量求平均得到句子的表示
-
-```python
-avg = pooling_layer(input=emb, pooling_type=AvgPooling())
-```
-
-其它部分和逻辑回归网络结构一致。
-效果总结：
-
-<html>
-<center>
-<table border="2" cellspacing="0" cellpadding="6" rules="all" frame="border">
-
-<thead>
-<th scope="col" class="left">网络名称</th>
-<th scope="col" class="left">参数数量</th>
-<th scope="col" class="left">错误率</th>
-</tr>
-</thead>
-
-<tbody>
-<tr>
-<td class="left">词向量模型</td>
-<td class="left">15 MB</td>
-<td class="left">8.484%</td>
-</tr>
-
-</tbody>
-</table>
-</html></center>
-<br>
-
-### 卷积模型(Convolution)
-卷积网络是一种特殊的从词向量表示到句子表示的方法， 也就是将词向量模型额步
-骤3-2进行进一步演化， 变为3个新的子步骤。
-<center> ![](./NetConv.jpg) </center>
-
-文本卷积分为三个步骤：
-1. 获取每个单词左右各k个近邻， 拼接成一个新的向量表示；
-2. 对该表示进行非线性变换 （例如Sigmoid变换）, 成为维度为hidden_dim的新的向量；
-3. 在每个维度上取出在该句话新的向量集合上该维度的最大值作为最后的句子表示向量。 这3个子步骤可配置为:
-
-```python
-text_conv = sequence_conv_pool(input=emb,
-	                           context_start=k,
-	                           context_len=2 * k + 1)
-```
-
-效果总结：
-
-<html>
-<center>
-<table border="2" cellspacing="0" cellpadding="6" rules="all" frame="border">
-
-<thead>
-<th scope="col" class="left">网络名称</th>
-<th scope="col" class="left">参数数量</th>
-<th scope="col" class="left">错误率</th>
-</tr>
-</thead>
-
-<tbody>
-<tr>
-<td class="left">卷积模型</td>
-<td class="left">16 MB</td>
-<td class="left">5.628%</td>
-</tr>
-
-</tbody>
-</table></center>
-<br>
-
-### 时序模型(Time Sequence)
-<center> ![](./NetRNN.jpg) </center>
-
-时序模型即为RNN模型, 包括简单的RNN模型、GRU模型、LSTM模型等。
-
-- GRU模型配置：
-
-```python
-gru = simple_gru(input=emb, size=gru_size)
-```
-
-- LSTM模型配置：
-
-```python
-lstm = simple_lstm(input=emb, size=lstm_size)
-```
-
-针对本问题，我们采用单层LSTM模型，并使用了Dropout，效果总结：
-
-<html>
-<center>
-<table border="2" cellspacing="0" cellpadding="6" rules="all" frame="border">
-
-<thead>
-<th scope="col" class="left">网络名称</th>
-<th scope="col" class="left">参数数量</th>
-<th scope="col" class="left">错误率</th>
-</tr>
-</thead>
-
-<tbody>
-<tr>
-<td class="left">时序模型</td>
-<td class="left">16 MB</td>
-<td class="left">4.812%</td>
-</tr>
-
-</tbody>
-</table></center>
-</html>
-<br>
-
-## 优化算法(Optimization Algorithm)
-<a href = "../../../doc/ui/trainer_config_helpers_api.html#module-paddle.trainer_config_helpers.optimizers">优化算法</a>包括
-Momentum, RMSProp，AdaDelta，AdaGrad，ADAM，Adamax等，这里采用Adam优化方法，加了L2正则和梯度截断。
-
-```python
-settings(batch_size=128,
-         learning_rate=2e-3,
-         learning_method=AdamOptimizer(),
-         regularization=L2Regularization(8e-4),
-         gradient_clipping_threshold=25)
-```
-
-## 训练模型(Training Model)
-在完成了数据和网络结构搭建之后， 我们进入到训练部分。
-<center> ![](./PipelineTrain.jpg) </center>
-
-训练脚本：我们将训练的命令行保存在了 `train.sh`文件中。训练时所需设置的主要参数如下：
-
-```bash
-paddle train \
---config=trainer_config.py \
---log_period=20 \
---save_dir=./output \
---num_passes=15 \
---use_gpu=false
-```
-这里没有介绍多机分布式训练，可以参考<a href = "../../cluster/index.html">分布式训练</a>的demo学习如何进行多机训练。
-
-## 预测(Prediction)
-可以使用训练好的模型评估带有label的验证集，也可以预测没有label的测试集。
-<center> ![](./PipelineTest.jpg) </center>
-
-测试脚本如下，将会测试配置文件中test.list指定的数据。
-
-```bash
-paddle train \
---use_gpu=false \
---job=test \
---init_model_path=./output/pass-0000x
-```
-
-可以参考<a href = "../../ui/predict/swig_py_paddle.html">Python API预测</a>
-教程，或其他<a href = "../../demo/index.html">demo</a>的Python预测过程。也可以通过如下方式预测。
-
-预测脚本(`predict.sh`)：
-
-```bash
-model="output/pass-00003"
-paddle train \
-    --config=trainer_config.lstm.py \
-    --use_gpu=false \
-    --job=test \
-    --init_model_path=$model \
-    --config_args=is_predict=1 \
-    --predict_output_dir=. \
-
-mv rank-00000 result.txt
-```
-这里以`output/pass-00003`为例进行预测，用户可以根据训练log选择test结果最好的模型来预测。与训练网络配置不同的是：无需label相关的层，指定outputs输出概率层(softmax输出)，
-指定batch_size=1，数据传输无需label数据，预测数据指定test_list的位置。
-
-预测结果以文本的形式保存在`result.txt`中，一行为一个样本，格式如下：
-
-```
-预测ID;ID为0的概率 ID为1的概率
-预测ID;ID为0的概率 ID为1的概率
-```
-
-```
-is_predict = get_config_arg('is_predict', bool, False)
-trn = 'data/train.list' if not is_predict else None
-tst = 'data/test.list' if not is_predict else 'data/pred.list'
-obj = 'process' if not is_predict else 'process_pre'
-batch_size = 128 if not is_predict else 1
-if is_predict:
-    maxid = maxid_layer(output)
-    outputs([maxid,output])
-else:
-    label = data_layer(name="label", size=2)
-    cls = classification_cost(input=output, label=label)
-    outputs(cls)
-```
-
-## 总体效果总结(Summary)
-这些流程中的数据下载、网络配置、训练脚本在`/demo/quick_start`目录，我们在此总
-结上述网络结构在Amazon-Elec测试集(25k)上的效果:
-
-<center>
-<table border="2" cellspacing="0" cellpadding="6" rules="all" frame="border">
-
-<thead>
-<th scope="col" class="left">网络名称</th>
-<th scope="col" class="left">参数数量</th>
-<th scope="col" class="left">错误率</th>
-<th scope="col" class="left">配置文件</th>
-</tr>
-</thead>
-
-<tbody>
-<tr>
-<td class="left">逻辑回归模型</td>
-<td class="left"> 252KB </td>
-<td class="left">8.652%</td>
-<td class="left">trainer_config.lr.py</td>
-</tr>
-
-<tr>
-<td class="left">词向量模型</td>
-<td class="left"> 15MB </td>
-<td class="left"> 8.484%</td>
-<td class="left">trainer_config.emb.py</td>
-</tr>
-
-<tr>
-<td class="left">卷积模型</td>
-<td class="left"> 16MB </td>
-<td class="left"> 5.628%</td>
-<td class="left">trainer_config.cnn.py</td>
-</tr>
-
-<tr>
-<td class="left">时序模型</td>
-<td class="left"> 16MB </td>
-<td class="left"> 4.812%</td>
-<td class="left">trainer_config.lstm.py</td>
-</tr>
-
-</tbody>
-</table>
-</center>
-<br>
-
-## 附录(Appendix)
-### 命令行参数(Command Line Argument)
-
-* \--config：网络配置
-* \--save_dir：模型存储路径
-* \--log_period：每隔多少batch打印一次日志
-* \--num_passes：训练轮次，一个pass表示过一遍所有训练样本
-* \--config_args：命令指定的参数会传入网络配置中。
-* \--init_model_path：指定初始化模型路径，可用在测试或训练时指定初始化模型。
-
-默认一个pass保存一次模型，也可以通过saving_period_by_batches设置每隔多少batch保存一次模型。
-可以通过show_parameter_stats_period设置打印参数信息等。
-其他参数请参考<a href = "../../ui/index.html#command-line-argument">令行参数文档</a>。
-
-### 输出日志(Log)
-
-```
-TrainerInternal.cpp:160]  Batch=20 samples=2560 AvgCost=0.628761 CurrentCost=0.628761 Eval: classification_error_evaluator=0.304297  CurrentEval: classification_error_evaluator=0.304297
-```
-模型训练会看到这样的日志，详细的参数解释如下面表格：
-<center>
-<table border="2" cellspacing="0" cellpadding="6" rules="all" frame="border">
-
-<thead>
-<th scope="col" class="left">名称</th>
-<th scope="col" class="left">解释</th>
-</tr>
-</thead>
-
-<tr>
-<td class="left">Batch=20</td>
-<td class="left"> 表示过了20个batch </td>
-</tr>
-
-<tr>
-<td class="left">samples=2560</td>
-<td class="left"> 表示过了2560个样本 </td>
-</tr>
-
-<tr>
-<td class="left">AvgCost</td>
-<td class="left"> 每个pass的第0个batch到当前batch所有样本的平均cost </td>
-</tr>
-
-<tr>
-<td class="left">CurrentCost</td>
-<td class="left"> 当前log_period个batch所有样本的平均cost </td>
-</tr>
-
-<tr>
-<td class="left">Eval: classification_error_evaluator</td>
-<td class="left"> 每个pass的第0个batch到当前batch所有样本的平均分类错误率 </td>
-</tr>
-
-<tr>
-<td class="left">CurrentEval: classification_error_evaluator</td>
-<td class="left"> 当前log_period个batch所有样本的平均分类错误率 </td>
-</tr>
-
-</tbody>
-</table>
-</center>
-<br>
diff --git a/doc_cn/demo/sentiment_analysis/index.rst b/doc_cn/demo/sentiment_analysis/index.rst
deleted file mode 100644
index 9d7972b219..0000000000
--- a/doc_cn/demo/sentiment_analysis/index.rst
+++ /dev/null
@@ -1,8 +0,0 @@
-情感分析教程
-===========================
-
-.. toctree::
-    :maxdepth: 3
-    :glob:
-
-    Training Locally <sentiment_analysis.md>
\ No newline at end of file
diff --git a/doc_cn/howto/build_docker_image.rst b/doc_cn/howto/build_docker_image.rst
deleted file mode 100644
index 46ba07d9ad..0000000000
--- a/doc_cn/howto/build_docker_image.rst
+++ /dev/null
@@ -1,35 +0,0 @@
-构建PaddlePaddle的Docker Image
-==============================
-PaddlePaddle的Docker Image构建源码放置在 ``${源码根目录}/paddle/scripts/docker/`` 目录下。该目录有三类文件：
-
-- Dockerfile：Docker Image的描述文件，包括构建步骤、各种参数和维护人员等。
-  
-  - 一共维护了12个Dockerfile，Dockerfile.m4是它们的模板。
-  - PaddlePaddle中所有的Image都基于ubuntu 14.04。
-
-- build.sh：Docker Image的构建脚本，使用方式见下一小节。
-- generate.sh：通过Dockerfile.m4模板生成不同的Dockerfile。
-
-使用脚本构建Docker Image
-------------------------
-
-进入源码目录，执行 ``docker build`` 命令，即可在本地编译出PaddlePaddle的镜像。简单的使用样例为
-
-..  code-block:: bash
-
-    cd ${源码根目录}/paddle/scripts/docker/
-    docker build --build-arg LOWEST_DL_SPEED=50K \
-                 --build-arg WITH_GPU=ON \
-                 --tag  paddle_gpu:latest .
-
-其中，``--build-arg`` 传入的配置参数包括:
-
-- LOWEST\_DL\_SPEED\: 在多线程下载过程中，设置下载线程的最低速度。
-
-  - 默认单位是Bytes，但可以传入10K、10M、或10G等这样的单位。
-  - 如果小于这个速度，那么这个线程将会关闭。当所有的线程都关闭了，那么下载进程将会重启。
--  WITH\_GPU\: ON or OFF，是否开启GPU功能。注意，
-  - **编译** PaddlePaddle的GPU版本 **不一定** 要在具有GPU的机器上进行。
-  - **运行** PaddlePaddle的GPU版本 **一定** 要在具有GPU的机器上运行。
-
-注意：所有Image的构建在Docker 1.12版本测试通过, 低于1.12的版本并没有测试。原因是旧版本可能缺乏 ``--build-arg`` 参数，从而不能在运行编译命令的时候接受参数。
diff --git a/doc_cn/index.rst b/doc_cn/index.rst
deleted file mode 100644
index 88a9f79fd2..0000000000
--- a/doc_cn/index.rst
+++ /dev/null
@@ -1,32 +0,0 @@
-PaddlePaddle文档
-================
-
-使用指南
---------
-* `介绍 <introduction/index.html>`_
-* `快速入门 <demo/quick_start/index.html>`_
-* `基本使用概念 <concepts/use_concepts.html>`_
-* `编译与安装 <build_and_install/index.html>`_
-* `用户接口 <ui/index.html>`_
-* `使用示例 <demo/index.html>`_
-* `模型配置 <../doc/ui/api/trainer_config_helpers/index.html>`_
-* `集群训练 <cluster/index.html>`_
-
-开发指南
---------
-* `新写Layer <../doc/dev/new_layer/index.html>`_
-* `如何贡献文档 <howto/how_to_write_docs/index.html>`_
-* `如何构建Docker Image <howto/build_docker_image.html>`_
-
-算法教程
---------
-
-* `Recurrent Group教程 <algorithm/rnn/rnn-tutorial.html>`_
-* `单层RNN示例 <../doc/algorithm/rnn/rnn.html>`_
-* :ref:`algo_hrnn_rnn_api_compare`
-* `支持双层序列作为输入的Layer <algorithm/rnn/hierarchical-layer.html>`_
-
-常见问题
---------
-
-* `常见问题 <faq/index.html>`_
diff --git a/doc_cn/introduction/parameters.png b/doc_cn/introduction/parameters.png
deleted file mode 100644
index 2ec6748095..0000000000
Binary files a/doc_cn/introduction/parameters.png and /dev/null differ
diff --git a/doc_cn/ui/cmd/index.rst b/doc_cn/ui/cmd/index.rst
deleted file mode 100644
index 31a8b8a79f..0000000000
--- a/doc_cn/ui/cmd/index.rst
+++ /dev/null
@@ -1,20 +0,0 @@
-命令
-====
-
-安装好PaddlePaddle后，在命令行直接敲击 ``paddle`` 或 ``paddle --help`` 会显示如下一些命令。
-
-* ``train`` Start a paddle_trainer
-    启动一个PaddlePaddle训练进程。 ``paddle train`` 可以通过命令行参数 ``-local=true`` 启动一个单机的训练进程；也可以和 ``paddle pserver`` 一起使用启动多机的分布式训练进程。
-* ``pserver`` Start a paddle_pserver_main
-    在多机分布式训练下启动PaddlePaddle的parameter server进程。
-* ``version`` Print paddle version
-    用于打印当前PaddlePaddle的版本和编译选项相关信息。常见的输出格式如下：1）第一行说明了PaddlePaddle的版本信息；2）第二行开始说明了一些主要的编译选项，具体意义可以参考 `编译参数选项文件 <../../build_and_install/cmake/compile_options.html>`_ 。
-
-    ..  literalinclude:: paddle_version.txt
-
-* ``merge_model`` Start a paddle_merge_model
-    用于将PaddlePaddle的模型参数文件和模型配置文件打包成一个文件，方便做部署分发。
-* ``dump_config`` Dump the trainer config as proto string
-    用于将PaddlePaddle的模型配置文件以proto string的格式打印出来。
-* ``make_diagram``
-    使用graphviz对PaddlePaddle的模型配置文件进行绘制。
\ No newline at end of file
diff --git a/doc_cn/ui/cmd/paddle_version.txt b/doc_cn/ui/cmd/paddle_version.txt
deleted file mode 100644
index 33e2e4de7c..0000000000
--- a/doc_cn/ui/cmd/paddle_version.txt
+++ /dev/null
@@ -1,11 +0,0 @@
-PaddlePaddle 0.8.0b, compiled with
-    with_avx: ON
-    with_gpu: ON
-    with_double: OFF
-    with_python: ON
-    with_rdma: OFF
-    with_glog: ON
-    with_gflags: ON
-    with_metric_learning: OFF
-    with_timer: OFF
-    with_predict_sdk: OFF
diff --git a/doc_cn/ui/index.rst b/doc_cn/ui/index.rst
deleted file mode 100644
index ff36c9adb6..0000000000
--- a/doc_cn/ui/index.rst
+++ /dev/null
@@ -1,32 +0,0 @@
-########
-用户接口
-########
-
-数据提供
-========
-
-..  toctree::
-    :maxdepth: 1
-
-    data_provider/dataprovider.rst
-    data_provider/pydataprovider2.rst
-
-命令及命令行参数
-================
-
-..  toctree::
-    :maxdepth: 1
-
-    cmd/index.rst
-
-* `参数用例 <../../doc/ui/cmd_argument/use_case.html>`_
-* `参数分类 <../../doc/ui/cmd_argument/argument_outline.html>`_
-* `参数描述 <../../doc/ui/cmd_argument/detail_introduction.html>`_
-
-预测
-=======
-
-..  toctree::
-    :maxdepth: 1
-
-    predict/swig_py_paddle.rst
diff --git a/paddle/CMakeLists.txt b/paddle/CMakeLists.txt
index fb3af8ea92..503024cff3 100644
--- a/paddle/CMakeLists.txt
+++ b/paddle/CMakeLists.txt
@@ -1,5 +1,7 @@
 add_subdirectory(cuda)
+add_subdirectory(function)
 add_subdirectory(utils)
+add_subdirectory(testing)
 add_subdirectory(math)
 add_subdirectory(parameter)
 add_subdirectory(gserver)
diff --git a/paddle/api/Arguments.cpp b/paddle/api/Arguments.cpp
index 0cafbd896e..41beed38a8 100644
--- a/paddle/api/Arguments.cpp
+++ b/paddle/api/Arguments.cpp
@@ -137,6 +137,10 @@ void Arguments::setSlotSequenceDim(size_t idx, IVector* vec) throw(RangeError) {
   a.cpuSequenceDims = m->cast<paddle::IVector>(vec->getSharedPtr());
 }
 
+float Arguments::sumCosts() const {  // forwards to paddle::Argument
+  return paddle::Argument::sumCosts(m->outputs);
+}
+
 int64_t Arguments::getBatchSize(size_t idx) const throw(RangeError) {
   auto& a = m->getArg(idx);
   return a.getBatchSize();
diff --git a/paddle/api/CMakeLists.txt b/paddle/api/CMakeLists.txt
index 9b2d122a09..6e8fcd114d 100644
--- a/paddle/api/CMakeLists.txt
+++ b/paddle/api/CMakeLists.txt
@@ -1,10 +1,30 @@
+FUNCTION(generate_python_api target_name)
+    ADD_CUSTOM_COMMAND(OUTPUT ${PROJ_ROOT}/paddle/py_paddle/swig_paddle.py
+                              ${PROJ_ROOT}/paddle/Paddle_wrap.cxx
+                              ${PROJ_ROOT}/paddle/Paddle_wrap.h
+        COMMAND ${SWIG_EXECUTABLE} -python -c++ -outcurrentdir -I../ api/Paddle.swig
+                && mv ${PROJ_ROOT}/paddle/swig_paddle.py ${PROJ_ROOT}/paddle/py_paddle/swig_paddle.py
+        DEPENDS ${PROJ_ROOT}/paddle/api/Paddle.swig
+                ${PROJ_ROOT}/paddle/api/PaddleAPI.h
+                ${external_project_dependencies}
+        WORKING_DIRECTORY ${PROJ_ROOT}/paddle
+        COMMENT "Generate Python API from swig")
+    ADD_CUSTOM_TARGET(${target_name} ALL DEPENDS
+                ${PROJ_ROOT}/paddle/Paddle_wrap.cxx
+                ${PROJ_ROOT}/paddle/Paddle_wrap.h
+                ${PROJ_ROOT}/paddle/py_paddle/swig_paddle.py
+                ${external_project_dependencies})
+ENDFUNCTION(generate_python_api)
+
 set(API_SOURCES
     Arguments.cpp
     ConfigParser.cpp
+    Evaluator.cpp
     GradientMachine.cpp
     Matrix.cpp
     Parameter.cpp
     ParameterOptimizer.cpp
+    ParameterUpdater.cpp
     SequenceGenerator.cpp
     Trainer.cpp
     Util.cpp
@@ -17,22 +37,18 @@ add_library(paddle_api STATIC
         ${API_SOURCES})
 add_dependencies(paddle_api gen_proto_cpp)
 
+list(LENGTH "${GFLAGS_LIBRARIES}" GFLAGS_LIBRARIES_LENGTH)
 
-if(WITH_GFLAGS)
-  list(LENGTH "${GFLAGS_LIBRARIES}" GFLAGS_LIBRARIES_LENGTH)
-
-  if(${GFLAGS_LIBRARIES_LENGTH} EQUAL 0 AND TARGET "${GFLAGS_LIBRARIES}")
-    # Because gflags compiled by cmake, so it is imported by cmake target,
-    # not a real library path. Get the real library path here.
-    message(STATUS "GFLAGS Libraries is ${GFLAGS_LIBRARIES}")
-    get_target_property(GFLAGS_LOCATION ${GFLAGS_LIBRARIES} LOCATION)
-    message(STATUS "GFLAGS Target location is ${GFLAGS_LOCATION}")
-  else()
-    set(GFLAGS_LOCATION ${GFLAGS_LIBRARIES})
-  endif()
+if(${GFLAGS_LIBRARIES_LENGTH} EQUAL 0 AND TARGET "${GFLAGS_LIBRARIES}")
+# gflags is built by CMake, so GFLAGS_LIBRARIES names an imported CMake target,
+# not a real library path. Resolve the actual library path here.
+message(STATUS "GFLAGS Libraries is ${GFLAGS_LIBRARIES}")
+get_target_property(GFLAGS_LOCATION ${GFLAGS_LIBRARIES} LOCATION)
+message(STATUS "GFLAGS Target location is ${GFLAGS_LOCATION}")
+else()
+set(GFLAGS_LOCATION ${GFLAGS_LIBRARIES})
 endif()
 
-
 configure_file(
     paddle_api_config.py.in
     ${PROJ_ROOT}/paddle/api/paddle_api_config.py
@@ -44,12 +60,13 @@ file(GLOB PY_PADDLE_PYTHON_FILES ${PROJ_ROOT}/paddle/py_paddle/*.py)
 
 # TODO(yuyang18) : make wheel name calculated by cmake
 add_custom_command(OUTPUT ${PROJ_ROOT}/paddle/dist/.timestamp
-    COMMAND ${PYTHON_EXECUTABLE} setup.py  bdist_wheel
+    COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel
     COMMAND ${CMAKE_COMMAND} -E touch dist/.timestamp
     COMMAND rm -rf py_paddle.egg-info build
     WORKING_DIRECTORY ${PROJ_ROOT}/paddle
     DEPENDS python_swig_sources
             paddle_parameter
+            paddle_function
             paddle_math
             paddle_utils
             paddle_gserver
@@ -57,7 +74,7 @@ add_custom_command(OUTPUT ${PROJ_ROOT}/paddle/dist/.timestamp
             paddle_trainer
             paddle_api
             paddle_cuda
-	    ${PY_PADDLE_PYTHON_FILES}
+        ${PY_PADDLE_PYTHON_FILES}
 )
 
 install(DIRECTORY ${PROJ_ROOT}/paddle/dist/
@@ -66,7 +83,30 @@ install(DIRECTORY ${PROJ_ROOT}/paddle/dist/
 
 add_custom_target(python_api_wheel ALL DEPENDS
   ${PROJ_ROOT}/paddle/dist/.timestamp)
+add_dependencies(python_api_wheel python_swig_sources
+  paddle_parameter
+  paddle_math
+  paddle_utils
+  paddle_gserver
+  paddle_pserver
+  paddle_trainer
+  paddle_api
+  paddle_cuda)
 
 if(WITH_TESTING)
+    IF(NOT PY_PIP_FOUND)
+        SET(PIP_SOURCES_DIR ${PYTHON_SOURCES_DIR}/pip)
+        ExternalProject_Add(pip
+            ${EXTERNAL_PROJECT_LOG_ARGS}
+            GIT_REPOSITORY      https://github.com/pypa/pip.git
+            GIT_TAG             9.0.1
+            PREFIX              ${PIP_SOURCES_DIR}
+            CONFIGURE_COMMAND   ""
+            BUILD_COMMAND       ""
+            INSTALL_COMMAND     env ${py_env} ${PYTHON_EXECUTABLE} setup.py install
+            BUILD_IN_SOURCE     1
+            DEPENDS python setuptools python_api_wheel
+        )
+    ENDIF()
     add_subdirectory(test)
 endif()
diff --git a/paddle/utils/DisableCopy.h b/paddle/api/Evaluator.cpp
similarity index 60%
rename from paddle/utils/DisableCopy.h
rename to paddle/api/Evaluator.cpp
index 41de98bbde..c30e098763 100644
--- a/paddle/utils/DisableCopy.h
+++ b/paddle/api/Evaluator.cpp
@@ -11,13 +11,19 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
+#include <sstream>
+#include "PaddleAPI.h"
+#include "PaddleAPIPrivate.h"
 
-#pragma once
+Evaluator::Evaluator() : m(new EvaluatorPrivate()) {}
+Evaluator::~Evaluator() { delete m; }
 
-/**
- * Disable copy macro.
- */
-#define DISABLE_COPY(CLASS_NAME)                \
-  CLASS_NAME(CLASS_NAME &&) = delete;           \
-  CLASS_NAME(const CLASS_NAME &other) = delete; \
-  CLASS_NAME &operator=(const CLASS_NAME &other) = delete
+void Evaluator::start() { m->rawPtr->start(); }
+
+void Evaluator::finish() { m->rawPtr->finish(); }
+
+std::string Evaluator::toString() {
+  std::ostringstream sout;
+  m->rawPtr->printStats(sout);
+  return sout.str();
+}
diff --git a/paddle/api/GradientMachine.cpp b/paddle/api/GradientMachine.cpp
index 297eaa19bb..66115f8293 100644
--- a/paddle/api/GradientMachine.cpp
+++ b/paddle/api/GradientMachine.cpp
@@ -64,6 +64,18 @@ GradientMachine* GradientMachine::createByModelConfig(
   return GradientMachine::createFromPaddleModelPtr(confPtr, mode, types);
 }
 
+void GradientMachine::start() { m->machine->start(); }
+
+void GradientMachine::finish() { m->machine->finish(); }
+
+void GradientMachine::onPassEnd() { m->machine->onPassEnd(); }
+
+void GradientMachine::prefetch(const Arguments& inArgs) {
+  auto& in =
+      m->cast<std::vector<paddle::Argument>>(inArgs.getInternalArgumentsPtr());
+  m->machine->prefetch(in);
+}
+
 void GradientMachine::forward(const Arguments& inArgs,
                               Arguments* outArgs,
                               PassType passType) {
@@ -158,3 +170,13 @@ SequenceGenerator* GradientMachine::asSequenceGenerator(
   r->setBeamSize(beam_size);
   return r;
 }
+
+Evaluator* GradientMachine::makeEvaluator() {
+  auto ev = new Evaluator();
+  ev->m->rawPtr = m->machine->makeEvaluator();
+  return ev;
+}
+
+void GradientMachine::eval(Evaluator* evaluator) {
+  m->machine->eval(evaluator->m->rawPtr);
+}
diff --git a/paddle/api/Paddle.swig b/paddle/api/Paddle.swig
index 9194a6371b..068ba286c0 100644
--- a/paddle/api/Paddle.swig
+++ b/paddle/api/Paddle.swig
@@ -96,7 +96,9 @@ namespace std {
 %rename(__getitem__) Vector::get;
 %rename(__setitem__) Vector::set;
 %rename(__len__) Vector::getSize;
+%rename(__len__) Parameter::getSize;
 %rename(__call__) ParameterTraverseCallback::apply;
+%rename(__repr__) Evaluator::toString;
 
 %apply (float* INPLACE_ARRAY2, int DIM1, int DIM2) { 
   (float* data, int dim1, int dim2) 
@@ -167,6 +169,7 @@ namespace std {
 %newobject GradientMachine::asSequenceGenerator;
 %newobject GradientMachine::getParameter;
 %newobject GradientMachine::getLayerOutput;
+%newobject GradientMachine::makeEvaluator;
 %newobject TrainerConfig::createFromTrainerConfigFile;
 %newobject TrainerConfig::getModelConfig;
 %newobject TrainerConfig::getOptimizationConfig;
@@ -174,6 +177,8 @@ namespace std {
 %newobject Parameter::getConfig;
 %newobject ParameterOptimizer::create;
 %newobject ParameterOptimizer::needSpecialTraversal;
+%newobject ParameterUpdater::createLocalUpdater;
+%newobject ParameterUpdater::createRemoteUpdater;
 
 %feature("director") UpdateCallback;
 %feature("autodoc", 1); // To generate method stub, for code hint in ide
@@ -193,4 +198,4 @@ namespace std {
 %ignore OptimizationConfigPrivate;
 %ignore ParameterTraverseCallbackPrivate;
 %include "utils/GlobalConstants.h"
-%include "api/PaddleAPI.h"
\ No newline at end of file
+%include "api/PaddleAPI.h"
diff --git a/paddle/api/PaddleAPI.h b/paddle/api/PaddleAPI.h
index 84a66719c3..f5af8b0035 100644
--- a/paddle/api/PaddleAPI.h
+++ b/paddle/api/PaddleAPI.h
@@ -19,16 +19,12 @@ limitations under the License. */
 #include <stdexcept>
 #include <string>
 #include <vector>
+#include "paddle/utils/Common.h"
 #include "paddle/utils/GlobalConstants.h"
-#include "paddle/utils/TypeDefs.h"
 
 /// Import PaddlePaddle's enumeration into global namespace.
 using namespace paddle::enumeration_wrapper;  // NOLINT
 
-#define DISABLE_COPY_AND_ASSIGN(classname) \
-  classname(const classname& other);       \
-  classname& operator=(const classname& other)
-
 /**
  * @brief Initialize paddle.
  *
@@ -102,7 +98,7 @@ const size_t NO_SPARSE_ID = -1UL;
 struct MatrixPrivate;
 class Matrix {
   Matrix();  // User Cannot Create Matrix.
-  DISABLE_COPY_AND_ASSIGN(Matrix);
+  DISABLE_COPY(Matrix);
   static Matrix* createByPaddleMatrixPtr(void* sharedPtr);
 
 public:
@@ -242,7 +238,7 @@ private:
 
 struct VectorPrivate;
 class Vector {
-  DISABLE_COPY_AND_ASSIGN(Vector);
+  DISABLE_COPY(Vector);
   Vector();
   static Vector* createByPaddleVectorPtr(void* ptr);
 
@@ -322,7 +318,7 @@ private:
 struct IVectorPrivate;
 class IVector {
   IVector();
-  DISABLE_COPY_AND_ASSIGN(IVector);
+  DISABLE_COPY(IVector);
   static IVector* createByPaddleVectorPtr(void* ptr);
 
 public:
@@ -402,7 +398,7 @@ struct ArgumentsPrivate;
 class Arguments {
 private:
   Arguments();  // Internal Create.
-  DISABLE_COPY_AND_ASSIGN(Arguments);
+  DISABLE_COPY(Arguments);
 
 public:
   /**
@@ -454,6 +450,8 @@ public:
                                         IVector* vec) throw(RangeError);
   void setSlotSequenceDim(size_t idx, IVector* vec) throw(RangeError);
 
+  float sumCosts() const;
+
 private:
   static Arguments* createByPaddleArgumentVector(void* ptr);
   void* getInternalArgumentsPtr() const;
@@ -472,7 +470,7 @@ enum GradientMatchineCreateMode {
 
 struct ParameterConfigPrivate;
 class ParameterConfig {
-  DISABLE_COPY_AND_ASSIGN(ParameterConfig);
+  DISABLE_COPY(ParameterConfig);
   ParameterConfig();
 
   /**
@@ -502,7 +500,7 @@ private:
 
 struct OptimizationConfigPrivate;
 class OptimizationConfig {
-  DISABLE_COPY_AND_ASSIGN(OptimizationConfig);
+  DISABLE_COPY(OptimizationConfig);
   OptimizationConfig();
 
 public:
@@ -519,6 +517,7 @@ private:
 
   friend class TrainerConfig;
   friend class ParameterOptimizer;
+  friend class ParameterUpdater;
   friend class Trainer;
 };
 
@@ -526,7 +525,7 @@ struct ParameterPrivate;
 class Parameter {
 private:
   Parameter();
-  DISABLE_COPY_AND_ASSIGN(Parameter);
+  DISABLE_COPY(Parameter);
 
 public:
   virtual ~Parameter();
@@ -549,6 +548,12 @@ public:
   ParameterConfig* getConfig();
   void setValueUpdated();
 
+  bool save(const std::string& filename) const;
+
+  bool load(const std::string& filename) const;
+
+  size_t getSize() const;
+
 private:
   static Parameter* createFromRawPtr(void* ptr);
   static Parameter* createFromSharedPtr(void* ptr);
@@ -557,6 +562,7 @@ private:
   ParameterPrivate* m;
   friend class UpdateCallbackWrapper;
   friend class GradientMachine;
+  friend class ParameterUpdater;
 };
 
 struct ModelConfigPrivate;
@@ -568,7 +574,7 @@ struct ModelConfigPrivate;
 class ModelConfig {
 private:
   ModelConfig();
-  DISABLE_COPY_AND_ASSIGN(ModelConfig);
+  DISABLE_COPY(ModelConfig);
 
 public:
   virtual ~ModelConfig();
@@ -589,7 +595,7 @@ struct TrainerConfigPrivate;
 class TrainerConfig {
 private:
   TrainerConfig();
-  DISABLE_COPY_AND_ASSIGN(TrainerConfig);
+  DISABLE_COPY(TrainerConfig);
 
 public:
   virtual ~TrainerConfig();
@@ -629,7 +635,7 @@ public:
 
 struct ParameterTraverseCallbackPrivate;
 class ParameterTraverseCallback {
-  DISABLE_COPY_AND_ASSIGN(ParameterTraverseCallback);
+  DISABLE_COPY(ParameterTraverseCallback);
   ParameterTraverseCallback();
 
 public:
@@ -651,7 +657,7 @@ private:
  */
 struct ParameterOptimizerPrivate;
 class ParameterOptimizer {
-  DISABLE_COPY_AND_ASSIGN(ParameterOptimizer);
+  DISABLE_COPY(ParameterOptimizer);
   ParameterOptimizer();
 
 public:
@@ -683,12 +689,12 @@ private:
 };
 
 class SequenceGenerator;
-
+class Evaluator;
 struct GradientMachinePrivate;
 class GradientMachine {
 private:
   GradientMachine();
-  DISABLE_COPY_AND_ASSIGN(GradientMachine);
+  DISABLE_COPY(GradientMachine);
 
 public:
   virtual ~GradientMachine();
@@ -714,6 +720,23 @@ public:
       GradientMatchineCreateMode mode = CREATE_MODE_NORMAL,
       const std::vector<int>& parameterTypes = defaultParamTypes);
 
+  /**
+   * @brief finish
+   */
+  void finish();
+
+  void start();
+
+  /**
+   * Prefetch row ids of sparse parameter.
+   */
+  void prefetch(const Arguments& inArgs);
+
+  /**
+   * Do something when a training pass ends.
+   */
+  void onPassEnd();
+
   /**
    * The forward stage of GradientMachine.
    *
@@ -761,6 +784,10 @@ public:
       size_t max_length = 100UL,
       size_t beam_size = -1UL);
 
+  Evaluator* makeEvaluator();
+
+  void eval(Evaluator* evaluator);
+
 private:
   GradientMachinePrivate* m;
 
@@ -772,6 +799,111 @@ private:
   // Not to use c++ 11 init-list, so we use static var as function default arg.
   static std::vector<int> defaultParamTypes;
   friend class Trainer;
+  friend class ParameterUpdater;
+};
+
+struct ParameterUpdaterPrivate;
+class ParameterUpdater {
+private:
+  ParameterUpdater();
+  DISABLE_COPY(ParameterUpdater);  // m is deleted in the destructor
+public:
+  static ParameterUpdater* createLocalUpdater(OptimizationConfig* config);
+  static ParameterUpdater* createRemoteUpdater(OptimizationConfig* config,
+                                               int passCount);
+  ~ParameterUpdater();
+
+  /**
+   * @brief initialize Parameter Updater by GradientMachine.
+   * @param gm
+   */
+  void init(const GradientMachine& gm);
+
+  /**
+   * @brief begin of a training/testing of one pass.
+   */
+  void startPass();
+
+  /**
+   * @brief end of a training/testing of one pass.
+   */
+  void finishPass();
+
+  /**
+   * @brief begin of a training/testing of one batch.
+   * @param batchSize the batch's size
+   * @return PassType, mostly will be training.
+   */
+  PassType startBatch(size_t batchSize);
+
+  /**
+   * @brief end of a training/testing of one batch
+   * @param cost current batch cost.
+   */
+  void finishBatch(float cost);
+
+  /**
+   * @brief update a parameter (by local optimizer or by cluster pserver)
+   * @param param
+   */
+  void update(Parameter* param);
+
+  /**
+   * @brief restore the average parameter.
+   * @note It is only used in AverageOptimizer. Restore will get the current
+   * PARAMETER_VALUE back.
+   */
+  void restore();
+
+  /**
+   * @brief apply. Store the average parameter.
+   * @note It is only used in AverageOptimizer. Apply will store the current
+   * PARAMETER_VALUE to buffer, calculate current Average Parameter, and save
+   * it to PARAMETER_VALUE.
+   */
+  void apply();
+
+  /**
+   * @brief catchUpWith The Regularization will be delayed in many situations(
+   * pserver, local sparse). Catch Up means catch the regularization up, apply
+   * regularization to all params.
+   */
+  void catchUpWith();
+
+private:
+  ParameterUpdaterPrivate* m;
+};
+
+struct EvaluatorPrivate;
+class Evaluator {
+private:
+  Evaluator();
+  DISABLE_COPY(Evaluator);
+
+public:
+  ~Evaluator();
+
+  /**
+   * @brief begin an evaluate stage.
+   */
+  void start();
+
+  /**
+   * @brief end an evaluate stage.
+   */
+  void finish();
+
+  /**
+   * @brief toString returns the evaluation result as a string.
+   *
+   * __repr__ method in python
+   */
+  std::string toString();
+
+private:
+  EvaluatorPrivate* m;
+
+  friend class GradientMachine;
 };
 
 struct TrainerPrivate;
@@ -780,7 +912,7 @@ private:
   TrainerPrivate* m;
   Trainer();
   Trainer(TrainerConfig* optConfig, GradientMachine* gm);
-  DISABLE_COPY_AND_ASSIGN(Trainer);
+  DISABLE_COPY(Trainer);
 
 public:
   virtual ~Trainer();
@@ -846,7 +978,7 @@ public:
 
 struct SequenceGeneratorPrivate;
 class SequenceGenerator {
-  DISABLE_COPY_AND_ASSIGN(SequenceGenerator);
+  DISABLE_COPY(SequenceGenerator);
   SequenceGenerator();
 
 public:
diff --git a/paddle/api/PaddleAPIPrivate.h b/paddle/api/PaddleAPIPrivate.h
index d2b56fc41c..f41352bfec 100644
--- a/paddle/api/PaddleAPIPrivate.h
+++ b/paddle/api/PaddleAPIPrivate.h
@@ -11,12 +11,14 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-
+#pragma once
+#include <memory>
+#include "PaddleAPI.h"
+#include "paddle/gserver/evaluators/Evaluator.h"
 #include "paddle/gserver/gradientmachines/GradientMachine.h"
+#include "paddle/parameter/ParameterUpdaterBase.h"
 #include "paddle/trainer/TrainerConfigHelper.h"
 
-#pragma once
-
 struct GradientMachinePrivate {
   std::shared_ptr<paddle::GradientMachine> machine;
 
@@ -65,3 +67,31 @@ struct ArgumentsPrivate {
     return *(std::shared_ptr<T>*)(rawPtr);
   }
 };
+
+struct ParameterUpdaterPrivate {
+  std::unique_ptr<paddle::ParameterUpdater> updater;
+};
+
+struct ParameterPrivate {
+  std::shared_ptr<paddle::Parameter> sharedPtr;
+  paddle::Parameter* rawPtr;  // rawPtr is only used in ParameterUpdater;
+                              // in other situations sharedPtr should
+                              // hold the value.
+
+  ParameterPrivate() : sharedPtr(nullptr), rawPtr(nullptr) {}
+
+  paddle::Parameter* getPtr() {
+    if (sharedPtr) {
+      return sharedPtr.get();
+    } else {
+      return rawPtr;
+    }
+  }
+};
+
+struct EvaluatorPrivate {
+  paddle::Evaluator* rawPtr;
+
+  EvaluatorPrivate() : rawPtr(nullptr) {}
+  ~EvaluatorPrivate() { delete rawPtr; }
+};
diff --git a/paddle/api/Parameter.cpp b/paddle/api/Parameter.cpp
index 4eed00a84a..19f7a898d6 100644
--- a/paddle/api/Parameter.cpp
+++ b/paddle/api/Parameter.cpp
@@ -14,21 +14,7 @@ limitations under the License. */
 
 #include "paddle/parameter/Parameter.h"
 #include "PaddleAPI.h"
-
-struct ParameterPrivate {
-  std::shared_ptr<paddle::Parameter> sharedPtr;
-  paddle::Parameter* rawPtr;
-
-  ParameterPrivate() : sharedPtr(nullptr), rawPtr(nullptr) {}
-
-  paddle::Parameter* getPtr() {
-    if (sharedPtr) {
-      return sharedPtr.get();
-    } else {
-      return rawPtr;
-    }
-  }
-};
+#include "PaddleAPIPrivate.h"
 
 Parameter::Parameter() : m(new ParameterPrivate()) {}
 
@@ -70,3 +56,13 @@ ParameterConfig* Parameter::getConfig() {
 size_t Parameter::getID() const { return m->getPtr()->getID(); }
 
 void Parameter::setValueUpdated() { m->getPtr()->setValueUpdated(); }
+
+bool Parameter::save(const std::string& filename) const {
+  return m->getPtr()->save(filename);
+}
+
+bool Parameter::load(const std::string& filename) const {
+  return m->getPtr()->load(filename);
+}
+
+size_t Parameter::getSize() const { return m->getPtr()->getSize(); }
diff --git a/paddle/api/ParameterUpdater.cpp b/paddle/api/ParameterUpdater.cpp
new file mode 100644
index 0000000000..75b0ae7cb6
--- /dev/null
+++ b/paddle/api/ParameterUpdater.cpp
@@ -0,0 +1,66 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "PaddleAPI.h"
+
+#include "PaddleAPIPrivate.h"
+#include "paddle/trainer/RemoteParameterUpdater.h"
+#include "paddle/trainer/ThreadParameterUpdater.h"
+
+ParameterUpdater::ParameterUpdater() : m(new ParameterUpdaterPrivate()) {}
+
+ParameterUpdater *ParameterUpdater::createLocalUpdater(
+    OptimizationConfig *config) {
+  auto updater = new ParameterUpdater();
+  updater->m->updater.reset(
+      new paddle::SgdThreadUpdater(config->m->getConfig()));
+  return updater;
+}
+
+ParameterUpdater *ParameterUpdater::createRemoteUpdater(
+    OptimizationConfig *config, int passCount) {
+  auto updater = new ParameterUpdater();
+  updater->m->updater.reset(new paddle::RemoteParameterUpdater(
+      config->m->getConfig(), passCount, nullptr));
+  return updater;
+}
+
+ParameterUpdater::~ParameterUpdater() { delete m; }
+
+void ParameterUpdater::init(const GradientMachine &gm) {
+  m->updater->init(gm.m->machine->getNonStaticParameters());
+}
+
+void ParameterUpdater::startPass() { m->updater->startPass(); }
+
+void ParameterUpdater::finishPass() { m->updater->finishPass(); }
+
+PassType ParameterUpdater::startBatch(size_t batchSize) {
+  return m->updater->startBatch((int64_t)batchSize);
+}
+
+void ParameterUpdater::finishBatch(float cost) {
+  m->updater->finishBatch(cost);
+}
+
+void ParameterUpdater::update(Parameter *param) {
+  auto paddleParam = param->m->getPtr();
+  m->updater->update(paddleParam);
+}
+
+void ParameterUpdater::restore() { m->updater->restore(); }
+
+void ParameterUpdater::apply() { m->updater->apply(); }
+
+void ParameterUpdater::catchUpWith() { m->updater->catchUpWith(); }
diff --git a/paddle/api/Trainer.cpp b/paddle/api/Trainer.cpp
index 59b47d4b1c..d83dc380be 100644
--- a/paddle/api/Trainer.cpp
+++ b/paddle/api/Trainer.cpp
@@ -27,9 +27,9 @@ limitations under the License. */
 
 using paddle::real;
 
-P_DECLARE_string(config);
-P_DECLARE_string(init_model_path);
-P_DECLARE_int32(start_pass);
+DECLARE_string(config);
+DECLARE_string(init_model_path);
+DECLARE_int32(start_pass);
 
 struct TrainerPrivate : public paddle::Trainer {
   bool _trainOneBatch(size_t batchSize);
diff --git a/paddle/api/Util.cpp b/paddle/api/Util.cpp
index c3f739568f..54d67aa62f 100644
--- a/paddle/api/Util.cpp
+++ b/paddle/api/Util.cpp
@@ -15,12 +15,11 @@ limitations under the License. */
 #include "PaddleAPI.h"
 
 #include "paddle/parameter/Parameter.h"
-#include "paddle/utils/Excepts.h"
+#include "paddle/utils/Common.h"
 #include "paddle/utils/Flags.h"
 #include "paddle/utils/PythonUtil.h"
 #include "paddle/utils/Util.h"
 
-#include <fenv.h>
 #include <algorithm>
 #include <iostream>
 #include <iterator>
diff --git a/paddle/api/Vector.cpp b/paddle/api/Vector.cpp
index 874f2fd044..db8f005929 100644
--- a/paddle/api/Vector.cpp
+++ b/paddle/api/Vector.cpp
@@ -253,7 +253,7 @@ void Vector::copyToNumpyArray(float** view_m_data, int* dim1) {
   *view_m_data = new float[*dim1];
   if (auto cpuVec = dynamic_cast<paddle::CpuVector*>(m->vec.get())) {
     std::memcpy(*view_m_data, cpuVec->getData(), sizeof(float) * (*dim1));
-  } else if (auto gpuVec = dynamic_cast<paddle::CpuVector*>(m->vec.get())) {
+  } else if (auto gpuVec = dynamic_cast<paddle::GpuVector*>(m->vec.get())) {
     hl_memcpy_device2host(
         *view_m_data, gpuVec->getData(), sizeof(float) * (*dim1));
   } else {
diff --git a/paddle/api/paddle_api_config.py.in b/paddle/api/paddle_api_config.py.in
index a2352250c3..82f45ba6cc 100644
--- a/paddle/api/paddle_api_config.py.in
+++ b/paddle/api/paddle_api_config.py.in
@@ -1,19 +1,17 @@
 PADDLE_BUILD_DIR="@CMAKE_CURRENT_BINARY_DIR@/../"
 WITH_GPU="@WITH_GPU@"
-PROTOBUF_LIB="@PROTOBUF_LIBRARY@"
-ZLIB_LIB="@ZLIB_LIBRARIES@"
+PROTOBUF_LIBRARY="@PROTOBUF_LIBRARY@"
+ZLIB_LIBRARIES="@ZLIB_LIBRARIES@"
 CMAKE_THREAD_LIB="@CMAKE_THREAD_LIBS_INIT@"
 CMAKE_DL_LIBS="@CMAKE_DL_LIBS@"
 
 
 WITH_PYTHON="@WITH_PYTHON@"
 PYTHON_LIBRARIES="@PYTHON_LIBRARIES@"
-WITH_GLOG="@WITH_GLOG@"
-LIBGLOG_LIBRARY="@LIBGLOG_LIBRARY@"
-WITH_GFLAGS="@WITH_GFLAGS@"
+GLOG_LIBRARIES="@GLOG_LIBRARIES@"
 GFLAGS_LIBRARIES="@GFLAGS_LIBRARIES@"
 GFLAGS_LOCATION="@GFLAGS_LOCATION@"
-CBLAS_LIBRARIES="@CBLAS_LIBS@"
+CBLAS_LIBRARIES="@CBLAS_LIBRARIES@"
 
-CUDA_LIBRARIES="@CUDA_LIBRARIES@"
+CUDA_LIBRARIES="@CUDA_CUDART_LIBRARY@"
 WITH_COVERALLS="@ON_COVERALLS@"
diff --git a/paddle/api/paddle_ld_flags.py b/paddle/api/paddle_ld_flags.py
index 85cc54700f..ad5dce209b 100644
--- a/paddle/api/paddle_ld_flags.py
+++ b/paddle/api/paddle_ld_flags.py
@@ -30,8 +30,8 @@ try:
         whole_end = ""
 
     LIB_DIRS = [
-        "math", 'utils', 'parameter', "gserver", "api", "cuda", "pserver",
-        "trainer"
+        "math", 'function', 'utils', 'parameter', "gserver", "api", "cuda",
+        "pserver", "trainer"
     ]
     PARENT_LIB_DIRS = ['proto']
 
@@ -40,17 +40,15 @@ try:
             self.paddle_build_dir = PADDLE_BUILD_DIR
             self.paddle_build_dir = os.path.abspath(self.paddle_build_dir)
             self.with_gpu = PaddleLDFlag.cmake_bool(WITH_GPU)
-            self.protolib = PROTOBUF_LIB
-            self.zlib = ZLIB_LIB
+            self.protolib = PROTOBUF_LIBRARY
+            self.zlib = ZLIB_LIBRARIES
             self.thread = CMAKE_THREAD_LIB
             self.dl_libs = CMAKE_DL_LIBS
             self.with_python = PaddleLDFlag.cmake_bool(WITH_PYTHON)
             self.python_libs = PYTHON_LIBRARIES
 
-            self.with_glog = PaddleLDFlag.cmake_bool(WITH_GLOG)
-            self.glog_libs = LIBGLOG_LIBRARY
+            self.glog_libs = GLOG_LIBRARIES
 
-            self.with_gflags = PaddleLDFlag.cmake_bool(WITH_GFLAGS)
             self.with_coverage = PaddleLDFlag.cmake_bool(WITH_COVERALLS)
             self.gflags_libs = GFLAGS_LIBRARIES
             self.gflags_location = GFLAGS_LOCATION
@@ -77,6 +75,7 @@ try:
             libs = [
                 whole_start,
                 "-lpaddle_gserver",
+                "-lpaddle_function",
                 whole_end,
                 "-lpaddle_pserver",
                 "-lpaddle_trainer_lib",
@@ -88,6 +87,8 @@ try:
                 "-lpaddle_cuda",
                 "-lpaddle_api",
                 self.normalize_flag(self.protolib),
+                self.normalize_flag(self.glog_libs),
+                self.normalize_flag(self.gflags_libs),
                 self.normalize_flag(self.zlib),
                 self.normalize_flag(self.thread),
                 self.normalize_flag(self.dl_libs),
@@ -96,10 +97,6 @@ try:
 
             if self.with_python:
                 libs.append(self.normalize_flag(self.python_libs))
-            if self.with_glog:
-                libs.append(self.normalize_flag(self.glog_libs))
-            if self.with_gflags:
-                libs.append(self.normalize_flag(self.gflags_libs))
             if self.with_gpu:
                 libs.append(self.normalize_flag(self.curt))
             if self.with_coverage:
@@ -144,9 +141,12 @@ try:
 
         def c_flag(self):
             if self.with_coverage:
-                return ["-fprofile-arcs", "-ftest-coverage", "-O0", "-g"]
+                return [
+                    "-fprofile-arcs", "-ftest-coverage", "-O0", "-g",
+                    "-std=c++11"
+                ]
             else:
-                return None
+                return ["-std=c++11"]
 except ImportError:
 
     class PaddleLDFlag(object):
diff --git a/paddle/api/test/.gitignore b/paddle/api/test/.gitignore
new file mode 100644
index 0000000000..b7948824a1
--- /dev/null
+++ b/paddle/api/test/.gitignore
@@ -0,0 +1,2 @@
+*.w0
+*.wbias
diff --git a/paddle/api/test/CMakeLists.txt b/paddle/api/test/CMakeLists.txt
index 08a0fe96a0..a2fa623c80 100644
--- a/paddle/api/test/CMakeLists.txt
+++ b/paddle/api/test/CMakeLists.txt
@@ -1,2 +1,2 @@
 add_test(NAME test_swig_api
-    COMMAND bash ${PROJ_ROOT}/paddle/api/test/run_tests.sh)
+    COMMAND bash ${PROJ_ROOT}/paddle/api/test/run_tests.sh ${PYTHON_EXECUTABLE})
diff --git a/paddle/api/test/run_tests.sh b/paddle/api/test/run_tests.sh
index 2f12ba0264..bcf06afa86 100755
--- a/paddle/api/test/run_tests.sh
+++ b/paddle/api/test/run_tests.sh
@@ -20,11 +20,7 @@ popd > /dev/null
 
 cd $SCRIPTPATH
 
-rm -rf .test_env
-virtualenv .test_env
-source .test_env/bin/activate
-
-pip --timeout 600  install ../../dist/*.whl
+$1 -m pip install ../../dist/*.whl
 
 test_list="testArguments.py testGradientMachine.py testMatrix.py  testVector.py testTrain.py testTrainer.py"
 
@@ -33,7 +29,7 @@ export PYTHONPATH=$PWD/../../../python/
 for fn in $test_list
 do
   echo "test $fn"
-  python $fn
+  $1 $fn
   if [ $? -ne 0 ]; then
     exit 1
   fi
diff --git a/paddle/api/test/testArguments.py b/paddle/api/test/testArguments.py
index 8cabecd242..a04a805d7a 100644
--- a/paddle/api/test/testArguments.py
+++ b/paddle/api/test/testArguments.py
@@ -22,6 +22,8 @@ class TestArguments(unittest.TestCase):
         args = swig_paddle.Arguments.createArguments(1)
         args.setSlotValue(0, m)
 
+        self.assertAlmostEqual(27.0, args.sumCosts())
+
         mat = args.getSlotValue(0)
         assert isinstance(mat, swig_paddle.Matrix)
         np_mat = mat.toNumpyMatInplace()
diff --git a/paddle/api/test/testGradientMachine.py b/paddle/api/test/testGradientMachine.py
index b81eafa967..4b705f66ec 100644
--- a/paddle/api/test/testGradientMachine.py
+++ b/paddle/api/test/testGradientMachine.py
@@ -45,6 +45,7 @@ class TestGradientMachine(unittest.TestCase):
             assert isinstance(val, swig_paddle.Vector)
             arr = numpy.full((len(val), ), 0.1, dtype="float32")
             val.copyFromNumpyArray(arr)
+            self.assertTrue(param.save(param.getName()))
             param_config = param.getConfig().toProto()
             assert isinstance(param_config,
                               paddle.proto.ParameterConfig_pb2.ParameterConfig)
@@ -92,6 +93,9 @@ class TestGradientMachine(unittest.TestCase):
 
         self.assertTrue(self.isCalled)
 
+        for param in machine.getParameters():
+            self.assertTrue(param.load(param.getName()))
+
     def test_train_one_pass(self):
         conf_file_path = './testTrainConfig.py'
         trainer_config = swig_paddle.TrainerConfig.createFromTrainerConfigFile(
diff --git a/paddle/cuda/CMakeLists.txt b/paddle/cuda/CMakeLists.txt
index aa1ff4a771..a28ccd6f07 100755
--- a/paddle/cuda/CMakeLists.txt
+++ b/paddle/cuda/CMakeLists.txt
@@ -15,7 +15,6 @@ else()
 endif()
 
 set(CUDA_CXX_WITH_GPU_SOURCES
-    src/hl_cudart_wrap.cc
     src/hl_cuda_cublas.cc
     src/hl_cuda_cudnn.cc
     src/hl_cuda_device.cc)
@@ -88,6 +87,8 @@ else()
                 ${CUDA_CXX_SOURCES})
 endif()
 
+add_dependencies(paddle_cuda ${external_project_dependencies})
+
 add_style_check_target(paddle_cuda
                        ${CUDA_SOURCES}
                        ${CUDA_HEADERS}
diff --git a/paddle/cuda/include/hl_base.h b/paddle/cuda/include/hl_base.h
index 84c5f2d5c9..5b9884b786 100644
--- a/paddle/cuda/include/hl_base.h
+++ b/paddle/cuda/include/hl_base.h
@@ -16,7 +16,31 @@ limitations under the License. */
 #define HL_BASE_H_
 
 #include <cstddef>
-#include "paddle/utils/TypeDefs.h"
+
+#ifdef PADDLE_TYPE_DOUBLE  // real is double: use the double-precision limits
+#define HL_FLOAT_MAX 1.7976931348623157e+308
+#define HL_FLOAT_MIN 2.2250738585072014e-308
+using real = double;
+#else  // real is float: use the single-precision limits
+#define HL_FLOAT_MAX 3.40282347e+38F
+#define HL_FLOAT_MIN 1.17549435e-38F
+using real = float;
+#endif
+
+/**
+ * The maximum input value for exp, used to avoid overflow problem.
+ * currently only used for tanh function.
+ */
+#define EXP_MAX_INPUT 40.0
+
+/**
+ * @brief DIVUP(x, y) is similar to ceil(x / y).
+ * @note  For CUDA, DIVUP will be used to specify
+ *        the size of blockDim.
+ */
+#ifndef DIVUP
+#define DIVUP(x, y) (((x) + (y)-1) / (y))
+#endif
 
 /**
  * HPPL is an internal high performance parallel computing library
@@ -181,46 +205,6 @@ typedef struct {
   size_t nnz;
 } _hl_sparse_matrix_s, *hl_sparse_matrix_s;
 
-#ifndef PADDLE_TYPE_DOUBLE
-/**
- * HPPL data type: real (float or double)
- *
- * if real == float
- *
- * HL_FLOAT_MAX: 3.40282347e+38F
- *
- * HL_FLOAT_MIN: 1.17549435e-38F
- */
-#define HL_FLOAT_MAX 3.40282347e+38F
-/**
- * if real == double
- *
- * HL_FLOAT_MAX: 1.7976931348623157e+308
- *
- * HL_FLOAT_MIN: 2.2250738585072014e-308
- */
-#define HL_FLOAT_MIN 1.17549435e-38F
-#else
-#define HL_FLOAT_MAX 1.7976931348623157e+308
-#define HL_FLOAT_MIN 2.2250738585072014e-308
-#endif
-
-/**
- * The maximum input value for exp, used to avoid overflow problem.
- *
- * Currently only used for tanh function.
- */
-#define EXP_MAX_INPUT 40.0
-
-/**
- * @brief DIVUP(x, y) is similar to ceil(x / y).
- * @note  For CUDA, DIVUP will be used to specify
- *        the size of blockDim.
- */
-#ifndef DIVUP
-#define DIVUP(x, y) (((x) + (y)-1) / (y))
-#endif
-
 #ifdef __NVCC__
 
 #include "cuda_runtime.h"
diff --git a/paddle/cuda/include/hl_cnn.h b/paddle/cuda/include/hl_cnn.h
index 06ee3b3654..c5787630ab 100644
--- a/paddle/cuda/include/hl_cnn.h
+++ b/paddle/cuda/include/hl_cnn.h
@@ -240,62 +240,6 @@ extern void hl_avgpool_backward(const int frameCnt,
                                 real* backGrad,
                                 const int outStride);
 
-/**
- * @brief   Cross-map-respose normalize forward.
- *
- * @param[in]   frameCnt    batch size of input image.
- * @param[in]   in          input data.
- * @param[in]   scale       buffer.
- * @param[out]  out         output data.
- * @param[in]   channels    number of channel.
- * @param[in]   height      image height.
- * @param[in]   width       image width.
- * @param[in]   sizeX       size.
- * @param[in]   alpha       scale.
- * @param[in]   beta        scale.
- *
- */
-extern void hl_CMRNorm_forward(size_t frameCnt,
-                               const real* in,
-                               real* scale,
-                               real* out,
-                               size_t channels,
-                               size_t height,
-                               size_t width,
-                               size_t sizeX,
-                               real alpha,
-                               real beta);
-
-/**
- * @brief   Cross-map-respose normalize backward.
- *
- * @param[in]   frameCnt    batch size of input image.
- * @param[in]   inV         input data.
- * @param[in]   scale       buffer.
- * @param[out]  outV        output value.
- * @param[out]  outDiff     output grad.
- * @param[out]  inDiff      input grad.
- * @param[in]   channels    number of channel.
- * @param[in]   height      image height.
- * @param[in]   width       image width.
- * @param[in]   sizeX       size.
- * @param[in]   alpha       scale.
- * @param[in]   beta        scale.
- *
- */
-extern void hl_CMRNorm_backward(size_t frameCnt,
-                                const real* inV,
-                                const real* scale,
-                                const real* outV,
-                                const real* outDiff,
-                                real* inDiff,
-                                size_t channels,
-                                size_t height,
-                                size_t width,
-                                size_t sizeX,
-                                real alpha,
-                                real beta);
-
 /**
  * @brief   Bilinear interpolation forward.
  *
diff --git a/paddle/cuda/include/hl_dso_loader.h b/paddle/cuda/include/hl_dso_loader.h
index 20c13f21e6..276a07d3c7 100644
--- a/paddle/cuda/include/hl_dso_loader.h
+++ b/paddle/cuda/include/hl_dso_loader.h
@@ -36,14 +36,6 @@ void GetCublasDsoHandle(void** dso_handle);
  */
 void GetCudnnDsoHandle(void** dso_handle);
 
-/**
- * @brief    load the DSO of CUDA Run Time
- *
- * @param    **dso_handle   dso handler
- *
- */
-void GetCudartDsoHandle(void** dso_handle);
-
 /**
  * @brief    load the DSO of CURAND
  *
diff --git a/paddle/cuda/include/hl_sequence.h b/paddle/cuda/include/hl_sequence.h
index 9bcd25b062..9f9d8f972e 100644
--- a/paddle/cuda/include/hl_sequence.h
+++ b/paddle/cuda/include/hl_sequence.h
@@ -48,78 +48,6 @@ extern void hl_max_sequence_forward(real* input,
 extern void hl_max_sequence_backward(
     real* outputGrad, int* index, real* inputGrad, int numSequences, int dim);
 
-/**
- * @brief   Context projection forward.
- *
- * @param[in]   input           input sequence.
- * @param[in]   sequence        sequence index.
- * @param[in]   weightData      padding data.
- * @param[out]  output          output sequence.
- * @param[in]   numSequences    number of sequences.
- * @param[in]   inputDim        input sequence dimension.
- * @param[in]   contextLength   context length.
- * @param[in]   contextStart    context start.
- * @param[in]   beginPad        number of extra timesteps added at the
- * beginning.
- * @param[in]   isPadding       trainable padding.
- *
- */
-extern void hl_context_projection_forward(real* input,
-                                          const int* sequence,
-                                          real* weightData,
-                                          real* output,
-                                          int numSequences,
-                                          int inputDim,
-                                          int contextLength,
-                                          int contextStart,
-                                          int beginPad,
-                                          bool isPadding);
-
-/**
- * @brief   Context projection backward data.
- *
- * @param[in]   outputGrad      output gradient.
- * @param[in]   sequence        sequence index.
- * @param[out]  inputGrad       input gradient.
- * @param[in]   numSequences    number of sequences.
- * @param[in]   inputDim        input sequence dimension.
- * @param[in]   contextLength   context length.
- * @param[in]   contextStart    context start.
- *
- */
-extern void hl_context_projection_backward_data(real* outputGrad,
-                                                const int* sequence,
-                                                real* inputGrad,
-                                                int numSequences,
-                                                int inputDim,
-                                                int contextLength,
-                                                int contextStart);
-
-/**
- * @brief   Context projection backward weight.
- *
- * @param[in]   outputGrad      output gradient.
- * @param[in]   sequence        sequence index.
- * @param[out]  weightGrad      weight gradient.
- * @param[in]   numSequences    number of sequences.
- * @param[in]   weightDim       input sequence dimension.
- * @param[in]   totalPad        number of extra timesteps.
- * @param[in]   contextLength   context length.
- * @param[in]   contextStart    context start.
- * @param[in]   beginPad        number of extra timesteps added at the
- * beginning.
- *
- */
-extern void hl_context_projection_backward_weight(real* outputGrad,
-                                                  const int* sequence,
-                                                  real* weightGrad,
-                                                  int numSequences,
-                                                  int weightDim,
-                                                  int totalPad,
-                                                  int contextLength,
-                                                  int contextStart,
-                                                  int beginPad);
-
 /**
  * @brief   Memory copy from sequence to batch.
  *
diff --git a/paddle/cuda/include/hl_warpctc_wrap.h b/paddle/cuda/include/hl_warpctc_wrap.h
index 79bf6c3db7..7885ae5701 100644
--- a/paddle/cuda/include/hl_warpctc_wrap.h
+++ b/paddle/cuda/include/hl_warpctc_wrap.h
@@ -15,8 +15,8 @@ limitations under the License. */
 #ifndef HL_WARPCTC_WRAP_H_
 #define HL_WARPCTC_WRAP_H_
 
+#include "ctc.h"
 #include "hl_base.h"
-#include "warp-ctc/include/ctc.h"
 
 typedef ctcStatus_t hl_warpctc_status_t;
 typedef ctcOptions hl_warpctc_options_t;
diff --git a/paddle/cuda/include/stub/hl_cnn_stub.h b/paddle/cuda/include/stub/hl_cnn_stub.h
index 52c9787352..039551c6cc 100644
--- a/paddle/cuda/include/stub/hl_cnn_stub.h
+++ b/paddle/cuda/include/stub/hl_cnn_stub.h
@@ -117,30 +117,6 @@ inline void hl_avgpool_backward(const int frameCnt,
                                 real* backGrad,
                                 const int outStride) {}
 
-inline void hl_CMRNorm_forward(size_t frameCnt,
-                               const real* in,
-                               real* scale,
-                               real* out,
-                               size_t channels,
-                               size_t height,
-                               size_t width,
-                               size_t sizeX,
-                               real alpha,
-                               real beta) {}
-
-inline void hl_CMRNorm_backward(size_t frameCnt,
-                                const real* inV,
-                                const real* scale,
-                                const real* outV,
-                                const real* outDiff,
-                                real* inDiff,
-                                size_t channels,
-                                size_t height,
-                                size_t width,
-                                size_t sizeX,
-                                real alpha,
-                                real beta) {}
-
 inline void hl_bilinear_forward(const real* inData,
                                 const size_t inImgH,
                                 const size_t inImgW,
diff --git a/paddle/cuda/include/stub/hl_sequence_stub.h b/paddle/cuda/include/stub/hl_sequence_stub.h
index d6b07556f8..05e51bce9e 100644
--- a/paddle/cuda/include/stub/hl_sequence_stub.h
+++ b/paddle/cuda/include/stub/hl_sequence_stub.h
@@ -27,35 +27,6 @@ inline void hl_max_sequence_forward(real* input,
 inline void hl_max_sequence_backward(
     real* outputGrad, int* index, real* inputGrad, int numSequences, int dim) {}
 
-inline void hl_context_projection_forward(real* input,
-                                          const int* sequence,
-                                          real* weightData,
-                                          real* output,
-                                          int numSequences,
-                                          int inputDim,
-                                          int contextLength,
-                                          int contextStart,
-                                          int beginPad,
-                                          bool isPadding) {}
-
-inline void hl_context_projection_backward_data(real* outputGrad,
-                                                const int* sequence,
-                                                real* inputGrad,
-                                                int numSequences,
-                                                int inputDim,
-                                                int contextLength,
-                                                int contextStart) {}
-
-inline void hl_context_projection_backward_weight(real* outputGrad,
-                                                  const int* sequence,
-                                                  real* weightGrad,
-                                                  int numSequences,
-                                                  int weightDim,
-                                                  int totalPad,
-                                                  int contextLength,
-                                                  int contextStart,
-                                                  int beginPad) {}
-
 inline void hl_sequence2batch_copy(real* batch,
                                    real* sequence,
                                    const int* batchIndex,
diff --git a/paddle/cuda/src/hl_cuda_cnn.cu b/paddle/cuda/src/hl_cuda_cnn.cu
index 0992286f36..b94f4d8fe4 100644
--- a/paddle/cuda/src/hl_cuda_cnn.cu
+++ b/paddle/cuda/src/hl_cuda_cnn.cu
@@ -381,164 +381,6 @@ void hl_avgpool_backward(const int frameCnt, const real* outGrad,
   CHECK_SYNC("hl_avgpool_backward failed");
 }
 
-__global__ void KeCMRNormFillScale(size_t nthreads, const real* in,
-                                   real* scale, size_t channels,
-                                   size_t height, size_t width, size_t size,
-                                   real alpha) {
-  size_t index = threadIdx.x + blockIdx.x * blockDim.x;
-  if (index < nthreads) {
-    // find out the local offset
-    size_t w = index % width;
-    size_t h = (index / width) % height;
-    size_t n = index / width / height;
-    size_t offset = (n * channels * height + h) * width + w;
-    size_t step = height * width;
-    in += offset;
-    scale += offset;
-    size_t head = 0;
-    size_t pre_pad = (size - 1) / 2;
-    size_t post_pad = size - pre_pad - 1;
-    real accum_scale = 0;
-    // fill the scale at [n, :, h, w]
-    // accumulate values
-    while (head < post_pad) {
-      accum_scale += in[head * step] * in[head * step];
-      ++head;
-    }
-    // until we reach size, nothing needs to be subtracted
-    while (head < size) {
-      accum_scale += in[head * step] * in[head * step];
-      scale[(head - post_pad) * step] = 1. + accum_scale * alpha;
-      ++head;
-    }
-    // both add and subtract
-    while (head < channels) {
-      accum_scale += in[head * step] * in[head * step];
-      accum_scale -= in[(head - size) * step] * in[(head - size) * step];
-      scale[(head - post_pad) * step] = 1. + accum_scale * alpha;
-      ++head;
-    }
-    // subtract only
-    while (head < channels + post_pad) {
-      accum_scale -= in[(head - size) * step] * in[(head - size) * step];
-      scale[(head - post_pad) * step] = 1. + accum_scale * alpha;
-      ++head;
-    }
-  }
-}
-
- __global__ void KeCMRNormOutput(size_t nthreads, const real* in,
-                                 const real* scale, real negative_beta,
-                                 real* out) {
-  size_t index = threadIdx.x + blockIdx.x * blockDim.x;
-  if (index < nthreads) {
-    out[index] = in[index] * pow(scale[index], negative_beta);
-  }
-}
-
-void hl_CMRNorm_forward(size_t frameCnt, const real* in, real* scale,
-                        real* out, size_t channels,
-                        size_t height, size_t width, size_t sizeX,
-                        real alpha, real beta) {
-  size_t threadsNum = frameCnt * height * width;
-  size_t blocksX = (threadsNum + 1024 - 1) / 1024;
-  size_t blocksY = 1;
-  dim3 threads(1024, 1);
-  dim3 grid(blocksX, blocksY);
-
-  KeCMRNormFillScale<<<grid, threads, 0, STREAM_DEFAULT>>>
-      (threadsNum, in, scale, channels, height, width, sizeX, alpha);
-
-  threadsNum = frameCnt * height * width *channels;
-  blocksX = (threadsNum + 1024 -1) / 1024;
-  dim3 threads2(1024, 1);
-  dim3 grid2(blocksX, blocksY);
-  KeCMRNormOutput<<<grid2, threads2, 0, STREAM_DEFAULT>>>
-           (threadsNum, in, scale, beta, out);
-  CHECK_SYNC("hl_CMRNorm_forward");
-}
-
-__global__ void KeCMRNormDiff(size_t nthreads, const real* bottom_data,
-                              const real* top_data, const real* scale,
-                              const real* top_diff, size_t channels,
-                              size_t height, size_t width, size_t size,
-                              real negative_beta, real cache_ratio,
-                              real* bottom_diff ) {
-  int index = threadIdx.x + blockIdx.x * blockDim.x;
-  if (index < nthreads) {
-    // find out the local offset
-    size_t w = index % width;
-    size_t h = (index / width) % height;
-    size_t n = index / width / height;
-    size_t offset = (n * channels * height + h) * width + w;
-    size_t step = height * width;
-    bottom_data += offset;
-    top_data += offset;
-    scale += offset;
-    top_diff += offset;
-    bottom_diff += offset;
-    int head = 0;
-    int pre_pad = size - (size + 1) / 2;
-    int post_pad = size - pre_pad - 1;
-    real accum_ratio = 0;
-    // accumulate values
-    while (head < post_pad) {
-      accum_ratio += top_diff[head * step] *
-        top_data[head * step] / scale[head * step];
-      ++head;
-    }
-    // until we reach size, nothing needs to be subtracted
-    while (head < size) {
-      accum_ratio += top_diff[head * step] *
-        top_data[head * step] / scale[head * step];
-      bottom_diff[(head - post_pad) * step] +=
-        top_diff[(head - post_pad) * step] *
-        pow(scale[(head - post_pad) * step], negative_beta) - cache_ratio *
-        bottom_data[(head - post_pad) * step] * accum_ratio;
-      ++head;
-    }
-    // both add and subtract
-    while (head < channels) {
-      accum_ratio += top_diff[head * step] * top_data[head * step] /
-          scale[head * step];
-      accum_ratio -= top_diff[(head - size) * step] *
-          top_data[(head - size) * step] / scale[(head - size) * step];
-      bottom_diff[(head - post_pad) * step] +=
-        top_diff[(head - post_pad) * step] *
-        pow(scale[(head - post_pad) * step], negative_beta) - cache_ratio *
-        bottom_data[(head - post_pad) * step] * accum_ratio;
-      ++head;
-    }
-    // subtract only
-    while (head < channels + post_pad) {
-      accum_ratio -= top_diff[(head - size) * step] *
-          top_data[(head - size) * step] / scale[(head - size) * step];
-      bottom_diff[(head - post_pad) * step] +=
-        top_diff[(head - post_pad) * step] *
-        pow(scale[(head - post_pad) * step], negative_beta) - cache_ratio *
-        bottom_data[(head - post_pad) * step] * accum_ratio;
-      ++head;
-    }
-  }
-}
-
-void hl_CMRNorm_backward(size_t frameCnt, const real* inV,
-                         const real* scale,
-                         const real* outV, const real* outDiff,
-                         real *inDiff, size_t channels,
-                         size_t height, size_t width, size_t sizeX,
-                         real alpha, real beta) {
-  size_t threadsNum = frameCnt * height * width;
-  size_t blocksX = (threadsNum + 1024 - 1) / 1024;
-  size_t blocksY = 1;
-  dim3 threads(1024, 1);
-  dim3 grid(blocksX, blocksY);
-  KeCMRNormDiff <<<grid, threads, 0, STREAM_DEFAULT>>>
-           (threadsNum, inV, outV, scale, outDiff, channels,
-           height, width, sizeX, alpha, beta, inDiff);
-  CHECK_SYNC("hl_CMRNorm_backward");
-}
-
 __global__ void KeBilinearInterpFw(const real* in,
                                    const size_t inImgH,
                                    const size_t inImgW,
diff --git a/paddle/cuda/src/hl_cuda_cudnn.cc b/paddle/cuda/src/hl_cuda_cudnn.cc
index 7111224d59..6198f067ba 100644
--- a/paddle/cuda/src/hl_cuda_cudnn.cc
+++ b/paddle/cuda/src/hl_cuda_cudnn.cc
@@ -14,17 +14,17 @@ limitations under the License. */
 
 #include "hl_cuda_cudnn.h"
 #include <cudnn.h>
+#include <gflags/gflags.h>
 #include <mutex>
 #include "hl_cuda_cudnn.ph"
 #include "hl_dso_loader.h"
 #include "hl_thread.ph"
-#include "paddle/utils/CommandLineParser.h"
 #include "paddle/utils/Logging.h"
 
-P_DEFINE_int32(cudnn_conv_workspace_limit_in_mb,
-               4096,
-               "Specify cuDNN max workspace limit, in units MB, "
-               "4096MB=4GB by default.");
+DEFINE_int32(cudnn_conv_workspace_limit_in_mb,
+             4096,
+             "Specify cuDNN max workspace limit, in units MB, "
+             "4096MB=4GB by default.");
 
 namespace dynload {
 
@@ -175,11 +175,15 @@ void hl_cudnn_init(cudnnHandle_t* cudnn_handle, cudaStream_t stream) {
       << "PaddlePaddle Requirement: "
       << "(header v[2-3] with libcudnn v[2-3]) Or "
       << "(header v4 with libcudnn v4) Or "
-      << "(header v5 with libcudnn v5).";
+      << "(header v5 with libcudnn v5) Or "
+      << "(header v6 with libcudnn v6).";
 
-  CHECK(!(CUDNN_VERSION >= 5000 && CUDA_VERSION < 7050))
+  CHECK(!(CUDNN_VERSION < 6000 && CUDNN_VERSION >= 5000 && CUDA_VERSION < 7050))
       << "cudnn v5 requires cuda version >= 7.5";
 
+  CHECK(!(CUDNN_VERSION >= 6000 && CUDA_VERSION < 8000))
+      << "cudnn v6 requires cuda version >= 8.0";
+
   CHECK_CUDNN(dynload::cudnnCreate(cudnn_handle));
   CHECK_CUDNN(dynload::cudnnSetStream(*cudnn_handle, stream));
 
@@ -610,6 +614,23 @@ void hl_create_convolution_descriptor(hl_convolution_descriptor* conv,
   CHECK_CUDNN(dynload::cudnnCreateConvolutionDescriptor(&hl_conv->desc));
 
   cudnnConvolutionMode_t mode = CUDNN_CROSS_CORRELATION;
+
+#if CUDNN_VERSION >= 6000  // v6+: the call below also passes a data type
+#ifndef PADDLE_TYPE_DOUBLE
+  cudnnDataType_t data_type = CUDNN_DATA_FLOAT;
+#else
+  cudnnDataType_t data_type = CUDNN_DATA_DOUBLE;
+#endif
+  CHECK_CUDNN(dynload::cudnnSetConvolution2dDescriptor(hl_conv->desc,
+                                                       padding_height,
+                                                       padding_width,
+                                                       stride_height,
+                                                       stride_width,
+                                                       1,
+                                                       1,
+                                                       mode,
+                                                       data_type));
+#else  // cuDNN < v6: same call without the data-type argument
   CHECK_CUDNN(dynload::cudnnSetConvolution2dDescriptor(hl_conv->desc,
                                                        padding_height,
                                                        padding_width,
@@ -618,6 +639,7 @@ void hl_create_convolution_descriptor(hl_convolution_descriptor* conv,
                                                        1,
                                                        1,
                                                        mode));
+#endif
 
   hl_conv->input_image = image;
   hl_conv->filter = filter;
@@ -645,6 +667,23 @@ void hl_reset_convolution_descriptor(hl_convolution_descriptor conv,
 
   cudnnConvolutionDescriptor_t conv_desc = GET_CONVOLUTION_DESCRIPTOR(conv);
   cudnnConvolutionMode_t mode = CUDNN_CROSS_CORRELATION;
+
+#if CUDNN_VERSION >= 6000  // v6+: the call below also passes a data type
+#ifndef PADDLE_TYPE_DOUBLE
+  cudnnDataType_t data_type = CUDNN_DATA_FLOAT;
+#else
+  cudnnDataType_t data_type = CUDNN_DATA_DOUBLE;
+#endif
+  CHECK_CUDNN(dynload::cudnnSetConvolution2dDescriptor(conv_desc,
+                                                       padding_height,
+                                                       padding_width,
+                                                       stride_height,
+                                                       stride_width,
+                                                       1,
+                                                       1,
+                                                       mode,
+                                                       data_type));
+#else  // cuDNN < v6: same call without the data-type argument
   CHECK_CUDNN(dynload::cudnnSetConvolution2dDescriptor(conv_desc,
                                                        padding_height,
                                                        padding_width,
@@ -653,6 +692,7 @@ void hl_reset_convolution_descriptor(hl_convolution_descriptor conv,
                                                        1,
                                                        1,
                                                        mode));
+#endif
 
   cudnn_convolution_descriptor hl_conv = (cudnn_convolution_descriptor)conv;
   hl_conv->input_image = image;
diff --git a/paddle/cuda/src/hl_cuda_device.cc b/paddle/cuda/src/hl_cuda_device.cc
index b0bba73594..6dfb12e00b 100644
--- a/paddle/cuda/src/hl_cuda_device.cc
+++ b/paddle/cuda/src/hl_cuda_device.cc
@@ -12,6 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+// clang-format off
+// Because clang-format 4.X and clang-format 3.8+ format the following
+// lines differently, clang-format is disabled for them.
 #include "hl_cuda.h"
 #include <cuda_profiler_api.h>
 #include <string.h>
@@ -20,9 +23,10 @@ limitations under the License. */
 #include <unistd.h>
 #include <mutex>
 #include "hl_cuda.ph"
-#include "hl_dso_loader.h"
 #include "hl_thread.ph"
+#include "hl_dso_loader.h"
 #include "paddle/utils/Logging.h"
+// clang-format on
 
 namespace dynload {
 
@@ -72,78 +76,6 @@ CURAND_RAND_ROUTINE_EACH(DYNAMIC_LOAD_CURAND_WRAP)
 #undef CURAND_RAND_ROUTINE_EACH
 #undef DYNAMIC_LOAD_CURAND_WRAP
 
-std::once_flag cudart_dso_flag;
-void *cudart_dso_handle = nullptr;
-
-/**
- * The following macro definition can generate structs
- * (for each function) to dynamic load cuda routine
- * via operator overloading.
- *
- * note: default dynamic linked libs
- */
-#ifdef PADDLE_USE_DSO
-#define DYNAMIC_LOAD_CUDART_WRAP(__name)                                       \
-  struct DynLoad__##__name {                                                   \
-    template <typename... Args>                                                \
-    auto operator()(Args... args) -> decltype(__name(args...)) {               \
-      using cudart_func = decltype(__name(args...)) (*)(Args...);              \
-      std::call_once(cudart_dso_flag, GetCudartDsoHandle, &cudart_dso_handle); \
-      void *p_##__name = dlsym(cudart_dso_handle, #__name);                    \
-      return reinterpret_cast<cudart_func>(p_##__name)(args...);               \
-    }                                                                          \
-  } __name; /* struct DynLoad__##__name */
-#else
-#define DYNAMIC_LOAD_CUDART_WRAP(__name)                         \
-  struct DynLoad__##__name {                                     \
-    template <typename... Args>                                  \
-    auto operator()(Args... args) -> decltype(__name(args...)) { \
-      return __name(args...);                                    \
-    }                                                            \
-  } __name; /* struct DynLoad__##__name */
-#endif
-
-/* include all needed cuda functions in HPPL */
-// clang-format off
-#define CUDA_ROUTINE_EACH(__macro)        \
-  __macro(cudaMalloc)                     \
-  __macro(cudaHostAlloc)                  \
-  __macro(cudaFree)                       \
-  __macro(cudaFreeHost)                   \
-  __macro(cudaMemcpy)                     \
-  __macro(cudaMemset)                     \
-  __macro(cudaMemcpyAsync)                \
-  __macro(cudaSetDevice)                  \
-  __macro(cudaGetDevice)                  \
-  __macro(cudaGetDeviceCount)             \
-  __macro(cudaGetDeviceProperties)        \
-  __macro(cudaDeviceSynchronize)          \
-  __macro(cudaDeviceCanAccessPeer)        \
-  __macro(cudaDeviceEnablePeerAccess)     \
-  __macro(cudaStreamCreate)               \
-  __macro(cudaStreamDestroy)              \
-  __macro(cudaStreamSynchronize)          \
-  __macro(cudaStreamWaitEvent)            \
-  __macro(cudaEventCreate)                \
-  __macro(cudaEventRecord)                \
-  __macro(cudaEventQuery)                 \
-  __macro(cudaEventDestroy)               \
-  __macro(cudaEventSynchronize)           \
-  __macro(cudaEventElapsedTime)           \
-  __macro(cudaSetDeviceFlags)             \
-  __macro(cudaGetLastError)               \
-  __macro(cudaFuncSetCacheConfig)         \
-  __macro(cudaRuntimeGetVersion)          \
-  __macro(cudaGetErrorString)             \
-  __macro(cudaProfilerStart)              \
-  __macro(cudaProfilerStop)
-// clang-format on
-
-CUDA_ROUTINE_EACH(DYNAMIC_LOAD_CUDART_WRAP)
-
-#undef CUDA_ROUNTINE_EACH
-#undef DYNAMIC_LOAD_CUDART_WRAP
-
 } /* namespace dynload */
 
 /**
@@ -166,11 +98,11 @@ int g_cuda_lib_version = 0;
  * Check build-in cuda function using glog and it **does not**
  * support << operator for more details error info.
  */
-#define CHECK_CUDA(cudaFunc)                                                  \
-  do {                                                                        \
-    cudaError_t cudaStat = cudaFunc;                                          \
-    CHECK_EQ(cudaSuccess, cudaStat) << "Cuda Error: "                         \
-                                    << dynload::cudaGetErrorString(cudaStat); \
+#define CHECK_CUDA(cudaFunc)                                         \
+  do {                                                               \
+    cudaError_t cudaStat = cudaFunc;                                 \
+    CHECK_EQ(cudaSuccess, cudaStat) << "Cuda Error: "                \
+                                    << cudaGetErrorString(cudaStat); \
   } while (0)
 
 /**
@@ -279,13 +211,13 @@ void hl_fini() {
       tmp_stream = (char *)t_device[dev]->stream;
     }
     for (int j = 0; j < NUMBER_OF_THREAD_STREAM; j++) {
-      CHECK_CUDA(dynload::cudaStreamDestroy(t_device[dev]->stream[j]));
+      CHECK_CUDA(cudaStreamDestroy(t_device[dev]->stream[j]));
     }
 
     /* free device memory */
     hl_free_mem_device(t_device[dev]->gpu_mem);
     hl_free_mem_host(t_device[dev]->cpu_mem);
-    CHECK_CUDA(dynload::cudaEventDestroy(t_device[dev]->mem_event));
+    CHECK_CUDA(cudaEventDestroy(t_device[dev]->mem_event));
   }
 
   free(tmp);
@@ -303,7 +235,7 @@ void hl_set_device(int device) {
   CHECK(device >= 0 && device < g_system_device_num && g_device[device])
       << "Device: " << device << " is not specified in startup.";
 
-  CHECK_CUDA(dynload::cudaSetDevice(device));
+  CHECK_CUDA(cudaSetDevice(device));
 
   /* switch thread stream */
   for (int i = 0; i < NUMBER_OF_GLOBAL_STREAM; i++) {
@@ -331,7 +263,7 @@ void hl_set_device(int device) {
 
 int hl_get_device() {
   int device;
-  CHECK_CUDA(dynload::cudaGetDevice(&device));
+  CHECK_CUDA(cudaGetDevice(&device));
   return device;
 }
 
@@ -339,7 +271,7 @@ void *hl_malloc_device(size_t size) {
   void *dest_d;
 
   CHECK(size) << __func__ << ": the size for device memory is 0, please check.";
-  CHECK_CUDA(dynload::cudaMalloc((void **)&dest_d, size));
+  CHECK_CUDA(cudaMalloc((void **)&dest_d, size));
 
   return dest_d;
 }
@@ -347,7 +279,7 @@ void *hl_malloc_device(size_t size) {
 void hl_free_mem_device(void *dest_d) {
   CHECK_NOTNULL(dest_d);
 
-  cudaError_t err = dynload::cudaFree(dest_d);
+  cudaError_t err = cudaFree(dest_d);
   CHECK(cudaSuccess == err || cudaErrorCudartUnloading == err)
       << hl_get_device_error_string();
 }
@@ -356,8 +288,7 @@ void *hl_malloc_host(size_t size) {
   void *dest_h;
 
   CHECK(size) << __func__ << ": the size for device memory is 0, please check.";
-  CHECK_CUDA(
-      dynload::cudaHostAlloc((void **)&dest_h, size, cudaHostAllocDefault));
+  CHECK_CUDA(cudaHostAlloc((void **)&dest_h, size, cudaHostAllocDefault));
 
   return dest_h;
 }
@@ -365,7 +296,7 @@ void *hl_malloc_host(size_t size) {
 void hl_free_mem_host(void *dest_h) {
   CHECK_NOTNULL(dest_h);
 
-  cudaError_t err = dynload::cudaFreeHost(dest_h);
+  cudaError_t err = cudaFreeHost(dest_h);
   CHECK(cudaSuccess == err || cudaErrorCudartUnloading == err)
       << hl_get_device_error_string();
 }
@@ -376,11 +307,11 @@ void hl_memcpy(void *dst, void *src, size_t size) {
   }
   CHECK_NOTNULL(dst);
   CHECK_NOTNULL(src);
-  CHECK_CUDA(dynload::cudaMemcpy(dst, src, size, cudaMemcpyDefault));
+  CHECK_CUDA(cudaMemcpy(dst, src, size, cudaMemcpyDefault));
 }
 
 void hl_memset_device(void *dest_d, int value, size_t size) {
-  CHECK_CUDA(dynload::cudaMemset(dest_d, value, size));
+  CHECK_CUDA(cudaMemset(dest_d, value, size));
 }
 
 void hl_memcpy_host2device(void *dest_d, void *src_h, size_t size) {
@@ -389,7 +320,7 @@ void hl_memcpy_host2device(void *dest_d, void *src_h, size_t size) {
   }
   CHECK_NOTNULL(src_h);
   CHECK_NOTNULL(dest_d);
-  CHECK_CUDA(dynload::cudaMemcpy(dest_d, src_h, size, cudaMemcpyHostToDevice));
+  CHECK_CUDA(cudaMemcpy(dest_d, src_h, size, cudaMemcpyHostToDevice));
 }
 
 void hl_memcpy_device2host(void *dest_h, void *src_d, size_t size) {
@@ -398,7 +329,7 @@ void hl_memcpy_device2host(void *dest_h, void *src_d, size_t size) {
   }
   CHECK_NOTNULL(dest_h);
   CHECK_NOTNULL(src_d);
-  CHECK_CUDA(dynload::cudaMemcpy(dest_h, src_d, size, cudaMemcpyDeviceToHost));
+  CHECK_CUDA(cudaMemcpy(dest_h, src_d, size, cudaMemcpyDeviceToHost));
 }
 
 void hl_memcpy_device2device(void *dest_d, void *src_d, size_t size) {
@@ -407,8 +338,7 @@ void hl_memcpy_device2device(void *dest_d, void *src_d, size_t size) {
   }
   CHECK_NOTNULL(dest_d);
   CHECK_NOTNULL(src_d);
-  CHECK_CUDA(
-      dynload::cudaMemcpy(dest_d, src_d, size, cudaMemcpyDeviceToDevice));
+  CHECK_CUDA(cudaMemcpy(dest_d, src_d, size, cudaMemcpyDeviceToDevice));
 }
 
 void hl_memcpy_async(void *dst, void *src, size_t size, hl_stream_t stream) {
@@ -422,8 +352,7 @@ void hl_memcpy_async(void *dst, void *src, size_t size, hl_stream_t stream) {
   CHECK_LT(stream, HPPL_STREAM_END);
   cu_stream = t_resource.stream[stream];
 
-  CHECK_CUDA(
-      dynload::cudaMemcpyAsync(dst, src, size, cudaMemcpyDefault, cu_stream));
+  CHECK_CUDA(cudaMemcpyAsync(dst, src, size, cudaMemcpyDefault, cu_stream));
 }
 
 void hl_start() {
@@ -434,8 +363,7 @@ void hl_start() {
 
 bool hl_device_can_access_peer(int device, int peerDevice) {
   int canAccessPeer;
-  CHECK_CUDA(
-      dynload::cudaDeviceCanAccessPeer(&canAccessPeer, device, peerDevice));
+  CHECK_CUDA(cudaDeviceCanAccessPeer(&canAccessPeer, device, peerDevice));
 
   if (canAccessPeer == 1) {
     return true;
@@ -445,9 +373,9 @@ bool hl_device_can_access_peer(int device, int peerDevice) {
 }
 
 void hl_device_enable_peer_access(int peerDevice) {
-  cudaError_t err = dynload::cudaDeviceEnablePeerAccess(peerDevice, 0);
+  cudaError_t err = cudaDeviceEnablePeerAccess(peerDevice, 0);
   if (cudaErrorPeerAccessAlreadyEnabled == err) {
-    dynload::cudaGetLastError();
+    cudaGetLastError();
   } else {
     CHECK_CUDA(err);
   }
@@ -458,9 +386,9 @@ void hl_create_global_resources(hl_device_prop device_prop) {
   int device = device_prop->device;
   global_device_resources device_res = device_prop->device_resources;
 
-  CHECK_CUDA(dynload::cudaSetDevice(device));
+  CHECK_CUDA(cudaSetDevice(device));
   /* device properties */
-  CHECK_CUDA(dynload::cudaGetDeviceProperties(&cu_prop, device));
+  CHECK_CUDA(cudaGetDeviceProperties(&cu_prop, device));
 
   device_prop->major = cu_prop.major;
   device_prop->minor = cu_prop.minor;
@@ -469,7 +397,7 @@ void hl_create_global_resources(hl_device_prop device_prop) {
 
   /* create device stream */
   for (int j = 0; j < NUMBER_OF_GLOBAL_STREAM; j++) {
-    CHECK_CUDA(dynload::cudaStreamCreate(&device_res->stream[j]));
+    CHECK_CUDA(cudaStreamCreate(&device_res->stream[j]));
   }
 
   /* cublas init */
@@ -496,18 +424,18 @@ void hl_create_global_resources(hl_device_prop device_prop) {
   device_res->gen_mutex = (pthread_mutex_t *)(malloc(sizeof(pthread_mutex_t)));
   pthread_mutex_init(device_res->gen_mutex, NULL);
 
-  CHECK_CUDA(dynload::cudaRuntimeGetVersion(&g_cuda_lib_version));
+  CHECK_CUDA(cudaRuntimeGetVersion(&g_cuda_lib_version));
 }
 
 int hl_get_cuda_version() { return g_cuda_lib_version; }
 
 void hl_create_thread_resources(int device,
                                 thread_device_resources device_res) {
-  CHECK_CUDA(dynload::cudaSetDevice(device));
+  CHECK_CUDA(cudaSetDevice(device));
 
   /* create thread stream */
   for (int j = 0; j < NUMBER_OF_THREAD_STREAM; j++) {
-    CHECK_CUDA(dynload::cudaStreamCreate(&device_res->stream[j]));
+    CHECK_CUDA(cudaStreamCreate(&device_res->stream[j]));
   }
 
   /* allocation device memory */
@@ -516,14 +444,14 @@ void hl_create_thread_resources(int device,
   /* allocation host memory */
   device_res->cpu_mem = (real *)hl_malloc_host(HPPL_GPU_MEMORY_SIZE);
 
-  CHECK_CUDA(dynload::cudaEventCreate(&device_res->mem_event));
+  CHECK_CUDA(cudaEventCreate(&device_res->mem_event));
 }
 
 void hl_specify_devices_start(int *device, int number) {
   if (hl_start_flag) return;
 
   /* 1. get the number of devices */
-  CHECK_CUDA(dynload::cudaGetDeviceCount(&g_system_device_num));
+  CHECK_CUDA(cudaGetDeviceCount(&g_system_device_num));
   CHECK_NE(g_system_device_num, 0) << "[Start failed] there is no GPU device";
   if (device == NULL) {
     number = g_system_device_num;
@@ -635,7 +563,7 @@ void hl_stream_synchronize(hl_stream_t stream) {
                                     << ": the parameter stream is error.";
 
   cu_stream = t_resource.stream[stream];
-  CHECK_CUDA(dynload::cudaStreamSynchronize(cu_stream));
+  CHECK_CUDA(cudaStreamSynchronize(cu_stream));
 }
 
 void hl_create_event(hl_event_t *event) {
@@ -644,7 +572,7 @@ void hl_create_event(hl_event_t *event) {
   struct _hl_event_st *st_event =
       (struct _hl_event_st *)malloc(sizeof(struct _hl_event_st));
 
-  CHECK_CUDA(dynload::cudaEventCreate(&st_event->cu_event));
+  CHECK_CUDA(cudaEventCreate(&st_event->cu_event));
 
   *event = st_event;
 }
@@ -654,8 +582,7 @@ float hl_event_elapsed_time(hl_event_t start, hl_event_t end) {
   CHECK_NOTNULL(start);
   CHECK_NOTNULL(end);
 
-  CHECK_CUDA(
-      dynload::cudaEventElapsedTime(&time, start->cu_event, end->cu_event));
+  CHECK_CUDA(cudaEventElapsedTime(&time, start->cu_event, end->cu_event));
   return time;
 }
 
@@ -667,7 +594,7 @@ void hl_stream_record_event(hl_stream_t stream, hl_event_t event) {
                                     << ": the parameter stream is error.";
 
   cu_stream = t_resource.stream[stream];
-  CHECK_CUDA(dynload::cudaEventRecord(event->cu_event, cu_stream));
+  CHECK_CUDA(cudaEventRecord(event->cu_event, cu_stream));
 }
 
 void hl_stream_wait_event(hl_stream_t stream, hl_event_t event) {
@@ -678,12 +605,12 @@ void hl_stream_wait_event(hl_stream_t stream, hl_event_t event) {
                                     << ": the parameter stream is error.";
 
   cu_stream = t_resource.stream[stream];
-  CHECK_CUDA(dynload::cudaStreamWaitEvent(cu_stream, event->cu_event, 0));
+  CHECK_CUDA(cudaStreamWaitEvent(cu_stream, event->cu_event, 0));
 }
 
 void hl_destroy_event(hl_event_t event) {
   CHECK_NOTNULL(event);
-  CHECK_CUDA(dynload::cudaEventDestroy(event->cu_event));
+  CHECK_CUDA(cudaEventDestroy(event->cu_event));
 
   free(event);
   event = NULL;
@@ -691,7 +618,7 @@ void hl_destroy_event(hl_event_t event) {
 
 void hl_event_synchronize(hl_event_t event) {
   CHECK_NOTNULL(event);
-  CHECK_CUDA(dynload::cudaEventSynchronize(event->cu_event));
+  CHECK_CUDA(cudaEventSynchronize(event->cu_event));
 }
 
 void hl_get_device_name(char *name, int len, int device) {
@@ -720,24 +647,24 @@ void hl_get_device_compute_capability(int *major, int *minor, int device) {
   *minor = g_device[device]->minor;
 }
 
-int hl_get_device_last_error() { return (int)dynload::cudaGetLastError(); }
+int hl_get_device_last_error() { return (int)cudaGetLastError(); }
 
 const char *hl_get_device_error_string() {
-  cudaError_t err = dynload::cudaGetLastError();
-  return dynload::cudaGetErrorString(err);
+  cudaError_t err = cudaGetLastError();
+  return cudaGetErrorString(err);
 }
 
 const char *hl_get_device_error_string(size_t err) {
-  return dynload::cudaGetErrorString((cudaError_t)err);
+  return cudaGetErrorString((cudaError_t)err);
 }
 
-void hl_device_synchronize() { CHECK_CUDA(dynload::cudaDeviceSynchronize()); }
+void hl_device_synchronize() { CHECK_CUDA(cudaDeviceSynchronize()); }
 void hl_set_device_flags_block() {
-  CHECK_CUDA(dynload::cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync));
+  CHECK_CUDA(cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync));
 }
 
 bool hl_cuda_event_is_ready(hl_event_t event) {
-  cudaError_t err = dynload::cudaEventQuery(event->cu_event);
+  cudaError_t err = cudaEventQuery(event->cu_event);
   CHECK(cudaSuccess == err || cudaErrorNotReady == err);
 
   if (cudaErrorNotReady == err) {
@@ -746,6 +673,6 @@ bool hl_cuda_event_is_ready(hl_event_t event) {
   return true;
 }
 
-void hl_profiler_start() { CHECK_CUDA(dynload::cudaProfilerStart()); }
+void hl_profiler_start() { CHECK_CUDA(cudaProfilerStart()); }
 
-void hl_profiler_end() { CHECK_CUDA(dynload::cudaProfilerStop()); }
+void hl_profiler_end() { CHECK_CUDA(cudaProfilerStop()); }
diff --git a/paddle/cuda/src/hl_cuda_sequence.cu b/paddle/cuda/src/hl_cuda_sequence.cu
index 4e33ac443c..ba823de272 100644
--- a/paddle/cuda/src/hl_cuda_sequence.cu
+++ b/paddle/cuda/src/hl_cuda_sequence.cu
@@ -90,258 +90,6 @@ void hl_max_sequence_backward(real* outputGrad,
   CHECK_SYNC("hl_max_sequence_backward failed");
 }
 
-template <bool padding>
-__global__ void KeContextProjectionForward(real* input,
-                                           const int* sequence,
-                                           real* weightData,
-                                           real* output,
-                                           int inputDim,
-                                           int contextLength,
-                                           int contextStart,
-                                           int beginPad) {
-  int idx = threadIdx.x;
-  int blockSize = blockDim.x;
-  int sequenceId = blockIdx.x;
-  int seqStart = sequence[sequenceId];
-  int seqEnd = sequence[sequenceId+1];
-  real value = 0;
-
-  int instances = seqEnd - seqStart + contextLength - 1;
-  output += seqStart * inputDim * contextLength;
-  input += seqStart * inputDim;
-  for (int k = 0; k <= inputDim / blockSize; k++) {
-    if (idx < inputDim) {
-      for (int i = 0; i < instances; i++) {
-        // i + contextStart;
-        if ((i + contextStart) < 0) {
-          if (padding) {
-            value = weightData[i * inputDim + idx];
-          } else {
-            continue;
-          }
-        } else if ((i + contextStart) >= (seqEnd - seqStart)) {
-          if (padding) {
-            value =
-              weightData[(beginPad + i + contextStart - (seqEnd - seqStart)) *
-                         inputDim + idx];
-          } else {
-            continue;
-          }
-        } else {
-          value = input[(i + contextStart) * inputDim + idx];
-        }
-
-        int outx = (i - contextLength) < 0 ? i : (contextLength - 1);
-        int outy = (i - contextLength) < 0 ? 0 : (i - (contextLength - 1));
-        real* output_r =
-          output + outy * inputDim * contextLength + outx * inputDim;
-        for (int j = outy; j < seqEnd - seqStart; j++) {
-          output_r[idx] += value;
-          if (j - outy == outx) break;
-          output_r += (contextLength - 1) * inputDim;
-        }
-      }
-    }
-    idx += blockSize;
-  }
-}
-
-void hl_context_projection_forward(real* input,
-                                   const int* sequence,
-                                   real* weightData,
-                                   real* output,
-                                   int numSequences,
-                                   int inputDim,
-                                   int contextLength,
-                                   int contextStart,
-                                   int beginPad,
-                                   bool isPadding) {
-  CHECK_NOTNULL(input);
-  CHECK_NOTNULL(sequence);
-  CHECK_NOTNULL(output);
-  CHECK(!isPadding || weightData);
-
-  int blockSize = 128;
-  int blocksX = numSequences;
-  int blocksY = 1;
-  dim3 threads(blockSize, 1);
-  dim3 grid(blocksX, blocksY);
-
-  if (isPadding) {
-    KeContextProjectionForward<true><<< grid, threads, 0, STREAM_DEFAULT >>>
-      (input, sequence, weightData, output, inputDim,
-       contextLength, contextStart, beginPad);
-  } else  {
-    KeContextProjectionForward<false><<< grid, threads, 0, STREAM_DEFAULT >>>
-      (input, sequence, weightData, output, inputDim,
-       contextLength, contextStart, beginPad);
-  }
-  CHECK_SYNC("hl_context_projection_forward failed");
-}
-
-__global__ void KeContextProjectionBackwardData(real* outputGrad,
-                                                const int* sequence,
-                                                real* inputGrad,
-                                                int inputDim,
-                                                int contextLength,
-                                                int contextStart) {
-  int idx = threadIdx.x;
-  int blockSize = blockDim.x;
-  int sequenceId = blockIdx.x;
-  int seqStart = sequence[sequenceId];
-  int seqEnd = sequence[sequenceId+1];
-  real value = 0;
-
-  int instances = seqEnd - seqStart + contextLength - 1;
-  outputGrad += seqStart * inputDim * contextLength;
-  inputGrad += seqStart * inputDim;
-  for (int k = 0; k <= inputDim / blockSize; k++) {
-    if (idx < inputDim) {
-      for (int i = 0; i < instances; i++) {
-        if ((i + contextStart) < 0) {
-          continue;
-        } else if ((i + contextStart) >= (seqEnd - seqStart)) {
-          continue;
-        } else {
-          // value = 0;
-          value = inputGrad[(i + contextStart) * inputDim + idx];
-        }
-
-        int outx = (i - contextLength) < 0 ? i : (contextLength - 1);
-        int outy = (i - contextLength) < 0 ? 0 : (i - (contextLength - 1));
-        real* output_r =
-          outputGrad + outy * inputDim * contextLength + outx * inputDim;
-        for (int j = outy; j < seqEnd - seqStart; j++) {
-          value += output_r[idx];
-          if (j - outy == outx) break;
-          output_r += (contextLength - 1) * inputDim;
-        }
-        inputGrad[(i + contextStart) * inputDim + idx] = value;
-      }
-    }
-    idx += blockSize;
-  }
-}
-
-void hl_context_projection_backward_data(real* outputGrad,
-                                         const int* sequence,
-                                         real* inputGrad,
-                                         int numSequences,
-                                         int inputDim,
-                                         int contextLength,
-                                         int contextStart) {
-  CHECK_NOTNULL(outputGrad);
-  CHECK_NOTNULL(sequence);
-  CHECK_NOTNULL(inputGrad);
-
-  int blockSize = 128;
-  int blocksX = numSequences;
-  int blocksY = 1;
-  dim3 threads(blockSize, 1);
-  dim3 grid(blocksX, blocksY);
-  KeContextProjectionBackwardData<<< grid, threads, 0, STREAM_DEFAULT >>>
-    (outputGrad, sequence, inputGrad, inputDim, contextLength, contextStart);
-  CHECK_SYNC("hl_context_projection_backward_data failed");
-}
-
-template<int THREADS_X, int THREADS_Y>
-__global__ void KeContextProjectionBackwardWeight(real* outputGrad,
-                                                  const int* sequence,
-                                                  real* weightGrad,
-                                                  int numSequences,
-                                                  int weightDim,
-                                                  int contextLength,
-                                                  int contextStart,
-                                                  int beginPad) {
-  __shared__ real sum_s[THREADS_Y][THREADS_X];
-  int padOfBlock = (weightDim + THREADS_X - 1) / THREADS_X;
-  const int idx = threadIdx.x;
-  const int idy = threadIdx.y;
-  int padId = blockIdx.x / padOfBlock;
-  int weightIdx = idx + THREADS_X * (blockIdx.x % padOfBlock);
-  int instanceId;
-  real value = 0;
-  real* output_r;
-
-  sum_s[idy][idx] = 0.0f;
-  if (weightIdx < weightDim) {
-    for (int seqId = idy; seqId < numSequences; seqId += THREADS_Y) {
-      int seqStart = sequence[seqId];
-      int seqEnd = sequence[seqId+1];
-      output_r = outputGrad + seqStart * weightDim * contextLength;
-
-      if (contextStart < 0) {
-        if (padId + contextStart < 0) {
-          instanceId = padId;
-        } else {
-          // beginPad > 0;
-          instanceId = (padId - beginPad) + (seqEnd - seqStart) - contextStart;
-        }
-      } else {
-        if (padId + (seqEnd - seqStart) < contextStart) {
-          continue;
-        } else {
-          // beginPad == 0;
-          instanceId = padId + (seqEnd - seqStart) - contextStart;
-        }
-      }
-
-      int outx = (instanceId - contextLength) < 0 ?
-                 instanceId : (contextLength - 1);
-      int outy = (instanceId - contextLength) < 0 ?
-                 0 : (instanceId - (contextLength - 1));
-      output_r += outy * weightDim * contextLength + outx * weightDim;
-      for (int j = outy; j < seqEnd - seqStart; j++) {
-        value += output_r[weightIdx];
-        if (j - outy == outx) break;
-        output_r += (contextLength - 1) * weightDim;
-      }
-    }
-    sum_s[idy][idx] = value;
-  }
-  __syncthreads();
-
-  for (int stride = THREADS_Y/2; stride > 0; stride = stride/2) {
-    if (idy < stride) {
-      sum_s[idy][idx] += sum_s[idy + stride][idx];
-    }
-    __syncthreads();
-  }
-  __syncthreads();
-
-  if (weightIdx < weightDim) {
-    if (idy == 0) {
-      weightGrad[padId * weightDim + weightIdx] += sum_s[0][idx];
-    }
-  }
-}
-
-void hl_context_projection_backward_weight(real* outputGrad,
-                                           const int* sequence,
-                                           real* weightGrad,
-                                           int numSequences,
-                                           int weightDim,
-                                           int totalPad,
-                                           int contextLength,
-                                           int contextStart,
-                                           int beginPad) {
-  CHECK_NOTNULL(outputGrad);
-  CHECK_NOTNULL(sequence);
-  CHECK_NOTNULL(weightGrad);
-
-  int threadsX = 32;
-  int threadsY = 32;
-  int blocksX = totalPad * ((weightDim + threadsX - 1) / threadsX);
-  dim3 threads(threadsX, threadsY);
-  dim3 grid(blocksX, 1);
-
-  KeContextProjectionBackwardWeight<32, 32>
-    <<< grid, threads, 0, STREAM_DEFAULT >>>
-    (outputGrad, sequence, weightGrad, numSequences, weightDim,
-     contextLength, contextStart, beginPad);
-  CHECK_SYNC("hl_context_projection_backward_weight failed");
-}
-
 template<int blockDimX, int blockDimY, int gridDimX, bool AddRow>
 __global__ void KeMatrixAddRows(real* output,
                                 real* table,
diff --git a/paddle/cuda/src/hl_cudart_wrap.cc b/paddle/cuda/src/hl_cudart_wrap.cc
deleted file mode 100644
index ecc03a729d..0000000000
--- a/paddle/cuda/src/hl_cudart_wrap.cc
+++ /dev/null
@@ -1,200 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifdef PADDLE_USE_DSO
-
-#include <cuda_runtime.h>
-#include <mutex>
-#include "hl_dso_loader.h"
-
-/**
- * cudart wrapper: for dynamic load libcudart.so.
- * When nvcc compile cuda kernels, it will insert
- * some build-in runtime routines, which must be
- * provided by us if PADDLE_USE_DSO is true. If
- * PADDLE_USE_DSO is false, all of them must be
- * ignored to avoid multiple definitions.
- */
-namespace dynload {
-
-extern std::once_flag cudart_dso_flag;
-extern void *cudart_dso_handle;
-
-/**
- * The following macro definition can generate structs
- * (for each function) to dynamic load cuda routine
- * via operator overloading.
- **/
-#define DYNAMIC_LOAD_CUDART_WRAP(__name, __type)                               \
-  struct DynLoad__##__name {                                                   \
-    template <typename... Args>                                                \
-    __type operator()(Args... args) {                                          \
-      typedef __type (*cudartFunc)(Args...);                                   \
-      std::call_once(cudart_dso_flag, GetCudartDsoHandle, &cudart_dso_handle); \
-      void *p_##__name = dlsym(cudart_dso_handle, #__name);                    \
-      return reinterpret_cast<cudartFunc>(p_##__name)(args...);                \
-    }                                                                          \
-  } __name; /* struct DynLoad__##__name */
-
-/* include all needed cuda functions in HPPL */
-// clang-format off
-#define CUDA_ROUTINE_EACH(__macro)          \
-  __macro(cudaLaunch, cudaError_t)          \
-  __macro(cudaSetupArgument, cudaError_t)   \
-  __macro(cudaConfigureCall, cudaError_t)   \
-  __macro(__cudaRegisterFatBinary, void**)  \
-  __macro(__cudaUnregisterFatBinary, void)  \
-  __macro(__cudaRegisterFunction, void)     \
-  __macro(__cudaRegisterVar, void)          \
-  __macro(__cudaRegisterManagedVar, void)   \
-  __macro(__cudaInitModule, char)           \
-  __macro(__cudaRegisterTexture, void)      \
-  __macro(__cudaRegisterSurface, void)
-// clang-format on
-
-CUDA_ROUTINE_EACH(DYNAMIC_LOAD_CUDART_WRAP)
-
-#if CUDART_VERSION >= 7000
-DYNAMIC_LOAD_CUDART_WRAP(cudaLaunchKernel, cudaError_t)
-#endif
-
-#undef CUDA_ROUNTINE_EACH
-
-} /* namespace dynload */
-
-#if CUDART_VERSION >= 7000
-__host__ cudaError_t CUDARTAPI cudaLaunchKernel(const void *func,
-                                                dim3 gridDim,
-                                                dim3 blockDim,
-                                                void **args,
-                                                size_t sharedMem,
-                                                cudaStream_t stream) {
-  return dynload::cudaLaunchKernel(
-      func, gridDim, blockDim, args, sharedMem, stream);
-}
-#endif /* CUDART_VERSION >= 7000 */
-
-__host__ cudaError_t CUDARTAPI cudaLaunch(const void *func) {
-  return dynload::cudaLaunch(func);
-}
-
-__host__ cudaError_t CUDARTAPI cudaSetupArgument(const void *arg,
-                                                 size_t size,
-                                                 size_t offset) {
-  return dynload::cudaSetupArgument(arg, size, offset);
-}
-
-__host__ cudaError_t CUDARTAPI cudaConfigureCall(dim3 gridDim,
-                                                 dim3 blockDim,
-                                                 size_t sharedMem,
-                                                 cudaStream_t stream) {
-  return dynload::cudaConfigureCall(gridDim, blockDim, sharedMem, stream);
-}
-
-extern "C" {
-
-void **CUDARTAPI __cudaRegisterFatBinary(void *fatCubin) {
-  return dynload::__cudaRegisterFatBinary(fatCubin);
-}
-
-void CUDARTAPI __cudaUnregisterFatBinary(void **fatCubinHandle) {
-  return dynload::__cudaUnregisterFatBinary(fatCubinHandle);
-}
-
-void CUDARTAPI __cudaRegisterFunction(void **fatCubinHandle,
-                                      const char *hostFun,
-                                      char *deviceFun,
-                                      const char *deviceName,
-                                      int thread_limit,
-                                      uint3 *tid,
-                                      uint3 *bid,
-                                      dim3 *bDim,
-                                      dim3 *gDim,
-                                      int *wSize) {
-  return dynload::__cudaRegisterFunction(fatCubinHandle,
-                                         hostFun,
-                                         deviceFun,
-                                         deviceName,
-                                         thread_limit,
-                                         tid,
-                                         bid,
-                                         bDim,
-                                         gDim,
-                                         wSize);
-}
-
-void CUDARTAPI __cudaRegisterVar(void **fatCubinHandle,
-                                 char *hostVar,
-                                 char *deviceAddress,
-                                 const char *deviceName,
-                                 int ext,
-                                 int size,
-                                 int constant,
-                                 int global) {
-  return dynload::__cudaRegisterVar(fatCubinHandle,
-                                    hostVar,
-                                    deviceAddress,
-                                    deviceName,
-                                    ext,
-                                    size,
-                                    constant,
-                                    global);
-}
-
-extern void CUDARTAPI __cudaRegisterManagedVar(void **fatCubinHandle,
-                                               void **hostVarPtrAddress,
-                                               char *deviceAddress,
-                                               const char *deviceName,
-                                               int ext,
-                                               int size,
-                                               int constant,
-                                               int global) {
-  return dynload::__cudaRegisterManagedVar(fatCubinHandle,
-                                           hostVarPtrAddress,
-                                           deviceAddress,
-                                           deviceName,
-                                           ext,
-                                           size,
-                                           constant,
-                                           global);
-}
-
-char CUDARTAPI __cudaInitModule(void **fatCubinHandle) {
-  return dynload::__cudaInitModule(fatCubinHandle);
-}
-
-void CUDARTAPI __cudaRegisterTexture(void **fatCubinHandle,
-                                     const struct textureReference *hostVar,
-                                     const void **deviceAddress,
-                                     const char *deviceName,
-                                     int dim,
-                                     int norm,
-                                     int ext) {
-  return dynload::__cudaRegisterTexture(
-      fatCubinHandle, hostVar, deviceAddress, deviceName, dim, norm, ext);
-}
-
-void CUDARTAPI __cudaRegisterSurface(void **fatCubinHandle,
-                                     const struct surfaceReference *hostVar,
-                                     const void **deviceAddress,
-                                     const char *deviceName,
-                                     int dim,
-                                     int ext) {
-  return dynload::__cudaRegisterSurface(
-      fatCubinHandle, hostVar, deviceAddress, deviceName, dim, ext);
-}
-
-} /* extern "C" */
-
-#endif
diff --git a/paddle/cuda/src/hl_dso_loader.cc b/paddle/cuda/src/hl_dso_loader.cc
index f509b89243..53164dd27c 100644
--- a/paddle/cuda/src/hl_dso_loader.cc
+++ b/paddle/cuda/src/hl_dso_loader.cc
@@ -13,24 +13,22 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "hl_dso_loader.h"
-#include "paddle/utils/CommandLineParser.h"
+#include <gflags/gflags.h>
 #include "paddle/utils/Logging.h"
 
-P_DEFINE_string(cudnn_dir,
-                "",
-                "Specify path for loading libcudnn.so. For instance, "
-                "/usr/local/cudnn/lib. If empty [default], dlopen "
-                "will search cudnn from LD_LIBRARY_PATH");
+DEFINE_string(cudnn_dir,
+              "",
+              "Specify path for loading libcudnn.so. For instance, "
+              "/usr/local/cudnn/lib. If empty [default], dlopen "
+              "will search cudnn from LD_LIBRARY_PATH");
 
-P_DEFINE_string(cuda_dir,
-                "",
-                "Specify path for loading cuda library, such as libcublas, "
-                "libcurand. For instance, /usr/local/cuda/lib64. (Note: "
-                "libcudart can not be specified by cuda_dir, since some "
-                "build-in function in cudart already ran before main entry). "
-                "If default, dlopen will search cuda from LD_LIBRARY_PATH");
+DEFINE_string(cuda_dir,
+              "",
+              "Specify path for loading cuda library, such as libcublas, "
+              "libcurand. For instance, /usr/local/cuda/lib64. If default, "
+              "dlopen will search cuda from LD_LIBRARY_PATH");
 
-P_DEFINE_string(warpctc_dir, "", "Specify path for loading libwarpctc.so.");
+DEFINE_string(warpctc_dir, "", "Specify path for loading libwarpctc.so.");
 
 static inline std::string join(const std::string& part1,
                                const std::string& part2) {
@@ -147,14 +145,6 @@ void GetCudnnDsoHandle(void** dso_handle) {
 #endif
 }
 
-void GetCudartDsoHandle(void** dso_handle) {
-#if defined(__APPLE__) || defined(__OSX__)
-  GetDsoHandleFromSearchPath("", "libcudart.dylib", dso_handle);
-#else
-  GetDsoHandleFromSearchPath("", "libcudart.so", dso_handle);
-#endif
-}
-
 void GetCurandDsoHandle(void** dso_handle) {
 #if defined(__APPLE__) || defined(__OSX__)
   GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.dylib", dso_handle);
diff --git a/paddle/cuda/src/hl_warpctc_wrap.cc b/paddle/cuda/src/hl_warpctc_wrap.cc
index 9ae8bc0f22..55b940ca67 100644
--- a/paddle/cuda/src/hl_warpctc_wrap.cc
+++ b/paddle/cuda/src/hl_warpctc_wrap.cc
@@ -29,7 +29,6 @@ void* warpctc_dso_handle = nullptr;
  * false, you need to add the path of libwarp-ctc.so to
  * the linked-libs of paddle or to LD_PRELOAD.
  */
-#ifdef PADDLE_USE_DSO
 #define DYNAMIC_LOAD_WARPCTC_WRAP(__name)                              \
   struct DynLoad__##__name {                                           \
     template <typename... Args>                                        \
@@ -41,15 +40,6 @@ void* warpctc_dso_handle = nullptr;
       return reinterpret_cast<warpctcFunc>(p_##_name)(args...);        \
     }                                                                  \
   } __name;  // struct DynLoad__##__name
-#else
-#define DYNAMIC_LOAD_WARPCTC_WRAP(__name)                        \
-  struct DynLoad__##__name {                                     \
-    template <typename... Args>                                  \
-    auto operator()(Args... args) -> decltype(__name(args...)) { \
-      return __name(args...);                                    \
-    }                                                            \
-  } __name;  // struct DynLoad__##__name
-#endif
 
 // include all needed warp-ctc functions
 DYNAMIC_LOAD_WARPCTC_WRAP(get_warpctc_version)
diff --git a/paddle/function/BufferArg.cpp b/paddle/function/BufferArg.cpp
new file mode 100644
index 0000000000..fde48a73b6
--- /dev/null
+++ b/paddle/function/BufferArg.cpp
@@ -0,0 +1,42 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <glog/logging.h>
+
+#include "BufferArg.h"
+#include "paddle/math/SparseMatrix.h"
+
+namespace paddle {
+
+const SequenceArg& BufferArg::sequence() const {
+  // dynamic_cast to a reference throws std::bad_cast if *this is not one.
+  return dynamic_cast<const SequenceArg&>(*this);
+}
+
+const SparseMatrixArg& BufferArg::sparse() const {
+  // dynamic_cast to a reference throws std::bad_cast if *this is not one.
+  return dynamic_cast<const SparseMatrixArg&>(*this);
+}
+
+SparseMatrixArg::SparseMatrixArg(const CpuSparseMatrix& sparse, ArgType argType)
+    : BufferArg(sparse, argType),
+      row_(reinterpret_cast<void*>(sparse.getRows()), VALUE_TYPE_INT32),
+      col_(reinterpret_cast<void*>(sparse.getCols()), VALUE_TYPE_INT32) {}
+
+SparseMatrixArg::SparseMatrixArg(const GpuSparseMatrix& sparse, ArgType argType)
+    : BufferArg(sparse, argType),
+      row_(reinterpret_cast<void*>(sparse.getRows()), VALUE_TYPE_INT32),
+      col_(reinterpret_cast<void*>(sparse.getCols()), VALUE_TYPE_INT32) {}
+
+}  // namespace paddle
diff --git a/paddle/function/BufferArg.h b/paddle/function/BufferArg.h
new file mode 100644
index 0000000000..12352ba29e
--- /dev/null
+++ b/paddle/function/BufferArg.h
@@ -0,0 +1,273 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <glog/logging.h>
+
+#include "TensorShape.h"
+#include "TensorType.h"
+#include "paddle/math/Matrix.h"
+
+namespace paddle {
+
+enum BufferType {
+  TENSOR_NORMAL = 0,
+  TENSOR_SEQUENCE_ID = 1,
+  TENSOR_SEQUENCE_DATA = 2,
+  TENSOR_SPARSE = 3
+};
+
+enum SparseDataType {
+  SPARSE_NO_VALUE = 0,  // do not need value pointer, all values are 1
+  SPARSE_FLOAT_VALUE = 1
+};
+
+enum SparseDataFormat { SPARSE_CSR_FORMAT = 0, SPARSE_CSC_FORMAT = 1 };
+
+class BufferArg;
+class SequenceArg;
+class SparseMatrixArg;
+typedef std::shared_ptr<BufferArg> BufferArgPtr;
+
+/**
+ * \brief BufferArg used as the argument type of Function.
+ *
+ * The arguments of the Paddle Function have four Buffer types.
+ * 1. BufferArg for a dense Buffer of any dimension.
+ * 2. SequenceIdArg for a Buffer of sequence start positions.
+ * 3. SequenceArg for a Buffer of sequence data.
+ * 4. SparseMatrixArg for a Buffer of sparse matrix.
+ *
+ * There is an ArgType property for the BufferArg used as Function Output.
+ * Whether the result of the Function calculation is assigned to the
+ * output Buffer or added to the output Buffer is determined by the
+ * argType_ property of the output BufferArg.
+ */
+
+// ArgType is only used by output BufferArg.
+// For input argument, argType_ is ignored.
+// For output argument, need to set the argType_ of the BufferArg.
+enum ArgType {
+  UNSPECIFIED = 0,
+  ASSIGN_TO = 1,
+  ADD_TO = 2,
+};
+class BufferArg {
+public:
+  void setArgType(ArgType argType) { argType_ = argType; }
+
+  ArgType getArgType() const { return argType_; }
+
+  // Constructors only wrap an existing buffer; ~BufferArg() never frees buf_.
+  BufferArg(void* buf,
+            ValueType valueType,
+            const TensorShape& shape,
+            ArgType argType = UNSPECIFIED)
+      : buf_(buf), valueType_(valueType), shape_(shape), argType_(argType) {}
+
+  BufferArg(void* buf, ValueType valueType)
+      : buf_(buf), valueType_(valueType) {}
+
+  BufferArg(const Matrix& matrix, ArgType argType = UNSPECIFIED)
+      : buf_(
+            const_cast<void*>(reinterpret_cast<const void*>(matrix.getData()))),
+        valueType_(DataType<real>::value),
+        shape_(2),
+        argType_(argType) {
+    shape_.setDim(0, matrix.getHeight());
+    shape_.setDim(1, matrix.getWidth());
+  }
+
+  BufferArg(const Matrix& matrix,
+            const TensorShape& shape,
+            ArgType argType = UNSPECIFIED)
+      : buf_(
+            const_cast<void*>(reinterpret_cast<const void*>(matrix.getData()))),
+        valueType_(DataType<real>::value),
+        shape_(shape),
+        argType_(argType) {
+    CHECK_EQ(matrix.getElementCnt(), shape.getElements());
+  }
+
+  BufferArg(const Vector& vector, ArgType argType = UNSPECIFIED)
+      : buf_(
+            const_cast<void*>(reinterpret_cast<const void*>(vector.getData()))),
+        valueType_(DataType<real>::value),
+        shape_(1),
+        argType_(argType) {
+    shape_.setDim(0, vector.getSize());
+  }
+
+  BufferArg(const IVector& vector, ArgType argType = UNSPECIFIED)
+      : buf_(
+            const_cast<void*>(reinterpret_cast<const void*>(vector.getData()))),
+        valueType_(VALUE_TYPE_INT32),
+        shape_(1),
+        argType_(argType) {
+    shape_.setDim(0, vector.getSize());
+  }
+
+  template <DeviceType DType>
+  typename Tensor<real, DType>::Matrix matrix() const {
+    CHECK(buf_);
+    CHECK(valueType_ == DataType<real>::value);
+    // CHECK(deviceType_ == DType);
+    CHECK_EQ((size_t)2, shape_.ndims());
+    return typename Tensor<real, DType>::Matrix(
+        reinterpret_cast<real*>(buf_), shape_[0], shape_[1]);
+  }
+
+  template <typename VType, DeviceType DType>
+  typename Tensor<VType, DType>::Vector vector() const {
+    CHECK(buf_);
+    CHECK(valueType_ == DataType<VType>::value);
+    // CHECK(deviceType_ == DType);
+    CHECK_EQ((size_t)1, shape_.ndims());
+    return typename Tensor<VType, DType>::Vector(
+        shape_[0], reinterpret_cast<VType*>(buf_));
+  }
+
+  virtual ~BufferArg() {}
+
+  template <typename T>
+  T* data() const {
+    return reinterpret_cast<T*>(buf_);
+  }
+
+  void* data() const { return buf_; }
+  ValueType valueType() const { return valueType_; }
+  BufferType bufferType() const { return bufferType_; }
+  const TensorShape& shape() const { return shape_; }
+
+  const SequenceArg& sequence() const;
+  const SparseMatrixArg& sparse() const;
+
+protected:
+  void* buf_;
+  ValueType valueType_;
+  TensorShape shape_;
+  BufferType bufferType_ = TENSOR_NORMAL;  // default: plain dense buffer
+  ArgType argType_ = UNSPECIFIED;
+  // leading dimensions. The size is dims_.size()
+  // Dims lds_;
+};
+
+// sequence start positions in a mini-batch of sequences
+// shape_.ndims() == 1
+// valueType_ = int32
+// if a < b then value_.buf_[a] < value_.buf_[b]
+class SequenceIdArg : public BufferArg {
+public:
+  SequenceIdArg(void* buf,
+                const TensorShape& shape,
+                ArgType argType = UNSPECIFIED)
+      : BufferArg(buf, VALUE_TYPE_INT32, shape, argType) { init(); }
+
+  SequenceIdArg(const IVector& vector) : BufferArg(vector) { init(); }
+
+  size_t numSeqs() const { return numSeqs_; }
+
+private:
+  // Shared constructor tail: tag the buffer type and derive numSeqs_.
+  void init() {
+    bufferType_ = TENSOR_SEQUENCE_ID;
+    CHECK_EQ(shape_.ndims(), (size_t)1);
+    CHECK_GE(shape_[0], (size_t)1);  // keeps the unsigned "- 1" from wrapping
+    numSeqs_ = shape_[0] - 1;
+  }
+  size_t numSeqs_;
+};
+
+// sequence data
+class SequenceArg : public BufferArg {
+public:
+  SequenceArg(void* buf,
+              ValueType valueType,
+              const TensorShape& shape,
+              const SequenceIdArg& startPositions,
+              ArgType argType = UNSPECIFIED)
+      : BufferArg(buf, valueType, shape, argType),
+        startPositions_(startPositions) { bufferType_ = TENSOR_SEQUENCE_DATA; }
+
+  SequenceArg(const Matrix& matrix,
+              const IVector& vector,
+              ArgType argType = UNSPECIFIED)
+      : BufferArg(matrix, argType), startPositions_(vector) {
+    bufferType_ = TENSOR_SEQUENCE_DATA;  // so bufferType() reports the kind
+  }
+
+  void* getIdBuf() const { return startPositions_.data(); }
+  size_t numSeqs() const { return startPositions_.numSeqs(); }
+
+private:
+  SequenceIdArg startPositions_;
+};
+
+// sparse matrix
+// valueType_ == float or double
+// shape_.ndims() == 2
+class SparseMatrixArg : public BufferArg {
+public:
+  SparseMatrixArg(void* buf,
+                  ValueType valueType,
+                  const TensorShape& shape,
+                  const BufferArg& row,
+                  const BufferArg& col,
+                  size_t nnz,
+                  SparseDataFormat format,
+                  SparseDataType type,
+                  ArgType argType = UNSPECIFIED)
+      : BufferArg(buf, valueType, shape, argType),
+        row_(row),
+        col_(col),
+        nnz_(nnz),
+        format_(format),
+        type_(type) {
+    bufferType_ = TENSOR_SPARSE;
+    CHECK((valueType == VALUE_TYPE_FLOAT) || (valueType == VALUE_TYPE_DOUBLE));
+    CHECK_EQ(shape_.ndims(), (size_t)2);
+    CHECK_EQ(row_.shape().ndims(), (size_t)1);
+    CHECK_EQ(col_.shape().ndims(), (size_t)1);
+    if (format == SPARSE_CSR_FORMAT) {
+      CHECK_EQ(nnz, col.shape()[0]);
+    } else if (format == SPARSE_CSC_FORMAT) {
+      CHECK_EQ(nnz, row.shape()[0]);
+    }
+  }
+
+  // NOTE: defined in BufferArg.cpp; nnz_, format_ and type_ stay unset there.
+  SparseMatrixArg(const CpuSparseMatrix& sparse, ArgType argType = UNSPECIFIED);
+
+  SparseMatrixArg(const GpuSparseMatrix& sparse, ArgType argType = UNSPECIFIED);
+
+  void* getRowBuf() const { return row_.data(); }
+
+  void* getColBuf() const { return col_.data(); }
+
+  size_t nnz() const { return nnz_; }
+
+  SparseDataFormat dataFormat() const { return format_; }
+
+  SparseDataType dataType() const { return type_; }
+
+private:
+  BufferArg row_;
+  BufferArg col_;
+  size_t nnz_;
+  SparseDataFormat format_;
+  SparseDataType type_;
+};
+
+}  // namespace paddle
diff --git a/paddle/function/BufferArgTest.cpp b/paddle/function/BufferArgTest.cpp
new file mode 100644
index 0000000000..b345597435
--- /dev/null
+++ b/paddle/function/BufferArgTest.cpp
@@ -0,0 +1,91 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "BufferArg.h"
+#include <gtest/gtest.h>
+#include "Function.h"
+#include "paddle/math/MemoryHandle.h"
+#include "paddle/math/SparseMatrix.h"
+
+namespace paddle {
+
+TEST(BufferTest, BufferArg) {
+  TensorShape shape({8, 10});
+  CpuMemoryHandle memory(shape.getElements() *
+                         sizeOfValuType(VALUE_TYPE_FLOAT));
+  BufferArg buffer(memory.getBuf(), VALUE_TYPE_FLOAT, shape);
+  EXPECT_EQ(buffer.data(), memory.getBuf());
+}
+
+TEST(BufferTest, SequenceIdArg) {
+  TensorShape shape({10});
+  CpuMemoryHandle memory(shape.getElements() *
+                         sizeOfValuType(VALUE_TYPE_INT32));
+  SequenceIdArg buffer(memory.getBuf(), shape);
+  EXPECT_EQ(buffer.data(), memory.getBuf());
+  EXPECT_EQ(buffer.numSeqs(), 9);
+}
+
+TEST(BufferTest, asArgument) {
+  MatrixPtr matrix = Matrix::create(100, 200);
+  VectorPtr vector = Vector::create(100, false);
+  CpuSparseMatrix sparse(200, 300, 50);
+
+  // prepare arguments
+  BufferArgs argments;
+  argments.addArg(*matrix);
+  argments.addArg(*vector);
+  argments.addArg(sparse);
+
+  // function
+  auto function = [=](const BufferArgs& inputs) {
+    EXPECT_EQ(inputs.size(), 3);
+
+    // check inputs[0]
+    EXPECT_EQ(inputs[0].shape().ndims(), 2);
+    EXPECT_EQ(inputs[0].shape()[0], 100);
+    EXPECT_EQ(inputs[0].shape()[1], 200);
+    EXPECT_EQ(inputs[0].data(), matrix->getData());
+
+    EXPECT_EQ(inputs[0].matrix<DEVICE_TYPE_CPU>().getHeight(),
+              matrix->getHeight());
+    EXPECT_EQ(inputs[0].matrix<DEVICE_TYPE_CPU>().getWidth(),
+              matrix->getWidth());
+    EXPECT_EQ(inputs[0].matrix<DEVICE_TYPE_CPU>().getData(), matrix->getData());
+
+    // check inputs[1]
+    EXPECT_EQ(inputs[1].shape().ndims(), 1);
+    EXPECT_EQ(inputs[1].shape()[0], 100);
+    EXPECT_EQ(inputs[1].data(), vector->getData());
+    CpuVector inVector = inputs[1].vector<real, DEVICE_TYPE_CPU>();
+    EXPECT_EQ(inVector.getSize(), vector->getSize());
+    EXPECT_EQ(inVector.getData(), vector->getData());
+
+    // check inputs[2]
+    EXPECT_EQ(inputs[2].shape().ndims(), 2);
+    EXPECT_EQ(inputs[2].shape()[0], 200);
+    EXPECT_EQ(inputs[2].shape()[1], 300);
+    EXPECT_EQ(inputs[2].data(), sparse.getData());
+    // CHECK_EQ(inputs[2].sparse().nnz(), 50);
+    // CHECK_EQ(inputs[2].sparse().dataFormat(), SPARSE_CSR_FORMAT);
+    // CHECK_EQ(inputs[2].sparse().dataType(), SPARSE_FLOAT_VALUE);
+    EXPECT_EQ(inputs[2].sparse().getRowBuf(), sparse.getRows());
+    EXPECT_EQ(inputs[2].sparse().getColBuf(), sparse.getCols());
+  };
+
+  // call function
+  function(argments);
+}
+
+}  // namespace paddle
diff --git a/paddle/function/CMakeLists.txt b/paddle/function/CMakeLists.txt
new file mode 100644
index 0000000000..75a2acc55e
--- /dev/null
+++ b/paddle/function/CMakeLists.txt
@@ -0,0 +1,35 @@
+# Sources matched by the *Op globs, plus the core files that are not ops.
+file(GLOB h_files . *Op.h)
+file(GLOB cpp_files . *Op.cpp)
+
+list(APPEND h_files Function.h)
+list(APPEND cpp_files Function.cpp BufferArg.cpp)
+
+# CUDA kernels are globbed and compiled only when WITH_GPU is on.
+if(WITH_GPU)
+    file(GLOB cu_files . *OpGpu.cu)
+    cuda_compile(cu_objs ${cu_files})
+endif()
+
+add_library(paddle_function STATIC ${cpp_files} ${cu_objs})
+add_dependencies(paddle_function ${external_project_dependencies})
+
+# Unit tests are registered only when both GPU and testing are enabled.
+if(WITH_GPU AND WITH_TESTING)
+    # TODO:
+    # file(GLOB test_files . *OpTest.cpp)
+    # add_executable(${test_bin} EXCLUDE_FROM_ALL ${test_files})
+    # add_simple_unittest(CrossMapNormalOpTest)
+    add_simple_unittest(TensorShapeTest)
+    add_simple_unittest(TensorTypeTest)
+    add_simple_unittest(BufferArgTest)
+    add_simple_unittest(FunctionTest)
+    # add_simple_unittest(ContextProjectionOpTest)
+endif()
+
+# Style checks over headers, sources and (GPU builds only) CUDA files.
+add_style_check_target(paddle_function ${h_files})
+add_style_check_target(paddle_function ${cpp_files})
+if(WITH_GPU)
+    add_style_check_target(paddle_function ${cu_files})
+endif()
diff --git a/paddle/function/ContextProjectionOp.cpp b/paddle/function/ContextProjectionOp.cpp
new file mode 100644
index 0000000000..cb448562eb
--- /dev/null
+++ b/paddle/function/ContextProjectionOp.cpp
@@ -0,0 +1,362 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "ContextProjectionOp.h"
+#include "paddle/math/Matrix.h"
+#include "paddle/math/Vector.h"
+
+namespace paddle {
+
+template <>
+void ContextProjectionForward<DEVICE_TYPE_CPU>(CpuMatrix& out_mat,
+                                               const CpuMatrix& input_mat,
+                                               const CpuMatrix& weight_mat,
+                                               const CpuIVector& seq_vec,
+                                               size_t context_length,
+                                               int context_start,
+                                               size_t begin_pad) {
+  const int* starts = seq_vec.getData();
+  const size_t num_sequences = seq_vec.getSize() - 1;
+  for (size_t i = 0; i < num_sequences; ++i) {
+    for (size_t j = 0; j < context_length; ++j) {
+      int begin = starts[i] + context_start + j;
+      int end = starts[i + 1] + context_start + j;
+      int dst_begin = starts[i];
+      int dst_end = starts[i + 1];
+      if (begin < starts[i]) {
+        int64_t pad_size =
+            std::min(starts[i] - begin, starts[i + 1] - starts[i]);
+        MatrixPtr mat = out_mat.subMatrix(starts[i], pad_size);
+        if (weight_mat) {
+          MatrixPtr sub =
+              const_cast<CpuMatrix&>(weight_mat).subMatrix(j, pad_size);
+          mat->addAtOffset(*sub, j * input_mat.getWidth());
+        }
+        dst_begin = starts[i] + pad_size;
+        begin = starts[i];
+      }
+      if (end > starts[i + 1]) {
+        int64_t pad_size =
+            std::min(end - starts[i + 1], starts[i + 1] - starts[i]);
+        MatrixPtr mat = out_mat.subMatrix(starts[i + 1] - pad_size, pad_size);
+        if (weight_mat) {
+          MatrixPtr sub =
+              const_cast<CpuMatrix&>(weight_mat)
+                  .subMatrix(begin_pad + context_start + j - pad_size,
+                             pad_size);
+          mat->addAtOffset(*sub, j * input_mat.getWidth());
+        }
+        dst_end = starts[i + 1] - pad_size;
+        end = starts[i + 1];
+      }
+      if (end <= begin) continue;
+      MatrixPtr src =
+          const_cast<CpuMatrix&>(input_mat).subMatrix(begin, end - begin);
+      MatrixPtr dst = out_mat.subMatrix(dst_begin, dst_end - dst_begin);
+      dst->addAtOffset(*src, j * input_mat.getWidth());
+    }
+  }
+}
+
+/**
+ * \param inputs[0] input value.
+ * \param inputs[1] input weight.
+ * \param inputs[2] input sequence.
+ * \param outputs[0] output value.
+ */
+template <DeviceType Device>
+class ContextProjectionForwardFunc : public FunctionBase {
+public:
+  void init(const FuncConfig& config) override {
+    context_length_ = config.get<size_t>("context_length");
+    context_start_ = config.get<int>("context_start");
+    begin_pad_ = config.get<size_t>("begin_pad");
+  }
+
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK_EQ((size_t)3, inputs.size());
+    CHECK_EQ((size_t)1, outputs.size());
+
+    CHECK(outputs[0].data() && inputs[0].data() && inputs[2].data());
+    CHECK_EQ(outputs[0].shape().ndims(), (size_t)2);
+    CHECK_EQ(inputs[0].shape().ndims(), (size_t)2);
+    CHECK_EQ(inputs[1].shape().ndims(), (size_t)2);
+    CHECK_EQ(inputs[2].shape().ndims(), (size_t)1);
+    /// dim of output = dim of input * context_length
+    CHECK_EQ(outputs[0].shape()[1], inputs[0].shape()[1] * context_length_);
+    /// dim of input == dim of weight
+    CHECK_EQ(inputs[0].shape()[1], inputs[1].shape()[1]);
+    /// input and output has the same batch_size
+    CHECK_EQ(inputs[0].shape()[0], outputs[0].shape()[0]);
+
+    CHECK_EQ(outputs[0].getArgType(), ADD_TO);
+    auto out_mat = outputs[0].matrix<Device>();
+    auto in_mat = inputs[0].matrix<Device>();
+    auto w_mat = !inputs[1].data()
+                     ? typename Tensor<real, Device>::Matrix(nullptr, 0, 0)
+                     : inputs[1].matrix<Device>();
+    auto seq_vec = inputs[2].vector<int, Device>();
+    ContextProjectionForward<Device>(out_mat,
+                                     in_mat,
+                                     w_mat,
+                                     seq_vec,
+                                     context_length_,
+                                     context_start_,
+                                     begin_pad_);
+  }
+
+private:
+  size_t context_length_;
+  int context_start_;
+  size_t begin_pad_;
+};
+
+template <>
+void ContextProjectionBackward<DEVICE_TYPE_CPU>(CpuMatrix& out_grad_mat,
+                                                CpuMatrix& in_grad_mat,
+                                                CpuMatrix& w_grad_mat,
+                                                const CpuIVector& seq_vec,
+                                                size_t context_length,
+                                                int context_start,
+                                                size_t begin_pad,
+                                                bool is_padding,
+                                                size_t total_pad) {
+  size_t input_dim = in_grad_mat ? in_grad_mat.getWidth()
+                                 : w_grad_mat ? w_grad_mat.getWidth() : 0;
+  const int* starts = seq_vec.getData();
+  size_t num_sequences = seq_vec.getSize() - 1;
+  for (size_t i = 0; i < num_sequences; ++i) {
+    for (size_t j = 0; j < context_length; ++j) {
+      int begin = starts[i] + context_start + j;
+      int end = starts[i + 1] + context_start + j;
+      int dst_begin = starts[i];
+      int dst_end = starts[i + 1];
+      if (begin < starts[i]) {
+        int64_t pad_size =
+            std::min(starts[i] - begin, starts[i + 1] - starts[i]);
+        if (is_padding && w_grad_mat) {
+          MatrixPtr mat = out_grad_mat.subMatrix(starts[i], pad_size);
+          MatrixPtr sub = w_grad_mat.subMatrix(j, pad_size);
+          sub->addAtOffset(*mat, j * input_dim);
+        }
+        dst_begin = starts[i] + pad_size;
+        begin = starts[i];
+      }
+      if (end > starts[i + 1]) {
+        int64_t pad_size =
+            std::min(end - starts[i + 1], starts[i + 1] - starts[i]);
+        if (is_padding && w_grad_mat) {
+          MatrixPtr mat =
+              out_grad_mat.subMatrix(starts[i + 1] - pad_size, pad_size);
+          MatrixPtr sub = w_grad_mat.subMatrix(
+              begin_pad + context_start + j - pad_size, pad_size);
+          sub->addAtOffset(*mat, j * input_dim);
+        }
+        dst_end = starts[i + 1] - pad_size;
+        end = starts[i + 1];
+      }
+      if (end <= begin) continue;
+      if (!in_grad_mat) continue;
+      MatrixPtr src = in_grad_mat.subMatrix(begin, end - begin);
+      MatrixPtr dst = out_grad_mat.subMatrix(dst_begin, dst_end - dst_begin);
+      src->addAtOffset(*dst, j * input_dim);
+    }
+  }
+}
+
+/**
+ * \param inputs[0] input grad.
+ * \param inputs[1] weight grad.
+ * \param inputs[2] input sequence.
+ * \param outputs[0] output grad.
+ */
+template <DeviceType Device>
+class ContextProjectionBackwardFunc : public FunctionBase {
+public:
+  void init(const FuncConfig& config) override {
+    context_length_ = config.get<size_t>("context_length");
+    context_start_ = config.get<int>("context_start");
+    begin_pad_ = config.get<size_t>("begin_pad");
+    is_padding_ = config.get<bool>("is_padding");
+    total_pad_ = config.get<size_t>("total_pad");
+  }
+
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK_EQ((size_t)3, inputs.size());
+    CHECK_EQ((size_t)1, outputs.size());
+
+    CHECK(outputs[0].data() && inputs[2].data());
+    CHECK_EQ(outputs[0].shape().ndims(), (size_t)2);
+    CHECK_EQ(inputs[0].shape().ndims(), (size_t)2);
+    CHECK_EQ(inputs[1].shape().ndims(), (size_t)2);
+    CHECK_EQ(inputs[2].shape().ndims(), (size_t)1);
+
+    /// dim of input == dim of weight
+    CHECK_EQ(inputs[0].shape()[1], inputs[1].shape()[1]);
+    /// input and output have the same batch_size
+    CHECK_EQ(inputs[0].shape()[0], outputs[0].shape()[0]);
+    /// dim of output = dim of input * context_length
+    CHECK_EQ(outputs[0].shape()[1], inputs[0].shape()[1] * context_length_);
+
+    CHECK_EQ(outputs[0].getArgType(), ADD_TO);
+
+    auto out_grad_mat = outputs[0].matrix<Device>();
+    auto in_grad_mat =
+        !inputs[0].data() ? typename Tensor<real, Device>::Matrix(nullptr, 0, 0)
+                          : inputs[0].matrix<Device>();
+    auto w_grad_mat = !inputs[1].data()
+                          ? typename Tensor<real, Device>::Matrix(nullptr, 0, 0)
+                          : inputs[1].matrix<Device>();
+    auto seq_vec = inputs[2].vector<int, Device>();
+    ContextProjectionBackward<Device>(out_grad_mat,
+                                      in_grad_mat,
+                                      w_grad_mat,
+                                      seq_vec,
+                                      context_length_,
+                                      context_start_,
+                                      begin_pad_,
+                                      is_padding_,
+                                      total_pad_);
+  }
+
+private:
+  size_t context_length_;
+  int context_start_;
+  size_t begin_pad_;
+  bool is_padding_;
+  size_t total_pad_;
+};
+
+#if 0
+/**
+ * \param inputs[0] input grad.
+ * \param inputs[1] input sequence.
+ * \param outputs[0] output grad.
+ */
+template <DeviceType Device>
+class ContextProjectionBackwardDataFunc : public FunctionBase {
+public:
+  void init(const FuncConfig& config) override {
+    context_length_ = config.get<size_t>("context_length");
+    context_start_ = config.get<int>("context_start");
+  }
+
+  void calc(const Arguments& inputs,
+            const Arguments& outputs,
+            const Arguments& inouts) override {
+    CHECK_EQ(2, static_cast<int>(inputs.size()));
+    CHECK_EQ(1, static_cast<int>(outputs.size()));
+    CHECK_EQ(0, static_cast<int>(inouts.size()));
+    CHECK(inputs[0].getData() && outputs[0].getData() && inputs[1].getData());
+    CHECK_EQ(static_cast<int>(outputs[0].dims_.size()), 2);
+    CHECK_EQ(static_cast<int>(inputs[0].dims_.size()), 2);
+    CHECK_EQ(static_cast<int>(inputs[1].dims_.size()), 1);
+    CHECK_EQ(outputs[0].dims_[1], inputs[0].dims_[1] * context_length_);
+    /// input and output has the same batch_size
+    CHECK_EQ(inputs[0].dims_[0], outputs[0].dims_[0]);
+
+    auto out_grad_mat = std::make_shared<typename MatrixT<Device>::type>(
+        outputs[0].getData(), outputs[0].dims_[0], outputs[0].dims_[1]);
+    const auto in_grad_mat = std::make_shared<typename MatrixT<Device>::type>(
+        inputs[0].getData(), inputs[0].dims_[0], inputs[0].dims_[1]);
+    typename SequenceT<Device>::type seq_vec(
+        inputs[1].dims_[0], reinterpret_cast<int*>(inputs[1].getData()));
+
+    ContextProjectionBackwardData<Device>(out_grad_mat.get(),
+                                          in_grad_mat.get(),
+                                          seq_vec,
+                                          context_length_,
+                                          context_start_);
+  }
+
+private:
+  size_t context_length_;
+  int context_start_;
+};
+
+/**
+ * \param inputs[0] weight grad.
+ * \param inputs[1] input sequence.
+ * \param outputs[0] output grad.
+ */
+template <DeviceType Device>
+class ContextProjectionBackwardWeightFunc : public FunctionBase {
+public:
+  void init(const FuncConfig& config) override {
+    context_length_ = config.get<size_t>("context_length");
+    context_start_ = config.get<int>("context_start");
+    begin_pad_ = config.get<size_t>("begin_pad");
+    total_pad_ = config.get<size_t>("total_pad");
+  }
+
+  void calc(const Arguments& inputs,
+            const Arguments& outputs,
+            const Arguments& inouts) override {
+    CHECK_EQ(2, static_cast<int>(inputs.size()));
+    CHECK_EQ(1, static_cast<int>(outputs.size()));
+    CHECK_EQ(0, static_cast<int>(inouts.size()));
+
+    CHECK(inputs[0].getData() && outputs[0].getData() && inputs[1].getData());
+    CHECK_EQ(static_cast<int>(outputs[0].dims_.size()), 2);
+    CHECK_EQ(static_cast<int>(inputs[0].dims_.size()), 2);
+    CHECK_EQ(static_cast<int>(inputs[1].dims_.size()), 1);
+    CHECK_EQ(outputs[0].dims_[1], inputs[0].dims_[1] * context_length_);
+
+    auto out_grad_mat = std::make_shared<typename MatrixT<Device>::type>(
+        outputs[0].getData(), outputs[0].dims_[0], outputs[0].dims_[1]);
+    auto w_grad_mat = std::make_shared<typename MatrixT<Device>::type>(
+        inputs[0].getData(), inputs[0].dims_[0], inputs[0].dims_[1]);
+    typename SequenceT<Device>::type seq_vec(
+        inputs[1].dims_[0], reinterpret_cast<int*>(inputs[1].getData()));
+
+    ContextProjectionBackwardWeight<Device>(out_grad_mat.get(),
+                                            w_grad_mat.get(),
+                                            seq_vec,
+                                            context_length_,
+                                            context_start_,
+                                            total_pad_,
+                                            begin_pad_);
+  }
+
+private:
+  size_t context_length_;
+  int context_start_;
+  size_t begin_pad_;
+  size_t total_pad_;
+};
+#endif
+
+REGISTER_TYPED_FUNC(ContextProjectionForward,
+                    CPU,
+                    ContextProjectionForwardFunc);
+REGISTER_TYPED_FUNC(ContextProjectionBackward,
+                    CPU,
+                    ContextProjectionBackwardFunc);
+#ifndef PADDLE_ONLY_CPU
+REGISTER_TYPED_FUNC(ContextProjectionForward,
+                    GPU,
+                    ContextProjectionForwardFunc);
+REGISTER_TYPED_FUNC(ContextProjectionBackward,
+                    GPU,
+                    ContextProjectionBackwardFunc);
+#if 0
+REGISTER_TYPED_FUNC(ContextProjectionBackwardData,
+                    GPU,
+                    ContextProjectionBackwardDataFunc);
+REGISTER_TYPED_FUNC(ContextProjectionBackwardWeight,
+                    GPU,
+                    ContextProjectionBackwardWeightFunc);
+#endif
+#endif
+}  // namespace paddle
diff --git a/paddle/function/ContextProjectionOp.h b/paddle/function/ContextProjectionOp.h
new file mode 100644
index 0000000000..a558df5e07
--- /dev/null
+++ b/paddle/function/ContextProjectionOp.h
@@ -0,0 +1,87 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "Function.h"
+
+namespace paddle {
+
+/**
+ * \brief   Context Projection Forward.
+ *
+ * \param[out]  output            output data.
+ * \param[in]   input             input data.
+ * \param[in]   weight            input weight.
+ * \param[in]   sequence          sequence start positions
+ *                                (one more entry than there are sequences).
+ * \param[in]   context_length    consecutive rows for concatenation.
+ * \param[in]   context_start     context start position.
+ * \param[in]   begin_pad         beginning pad position.
+ *
+ */
+template <DeviceType DType>
+void ContextProjectionForward(
+    typename Tensor<real, DType>::Matrix& output,
+    const typename Tensor<real, DType>::Matrix& input,
+    const typename Tensor<real, DType>::Matrix& weight,
+    const typename Tensor<int, DType>::Vector& sequence,
+    size_t context_length,
+    int context_start,
+    size_t begin_pad);
+
+/**
+ * \brief   Context Projection Backward.
+ *
+ * \param[out]  out_grad          output gradient.
+ * \param[in]   in_grad           input gradient.
+ * \param[in]   w_grad            input weight gradient.
+ * \param[in]   seq_vec           sequence start positions.
+ * \param[in]   context_length    consecutive rows for concatenation.
+ * \param[in]   context_start     context start position.
+ * \param[in]   begin_pad         beginning pad position.
+ * \param[in]   is_padding        whether padding 0 or not.
+ * \param[in]   total_pad         presumably total pad rows -- TODO confirm.
+ */
+template <DeviceType DType>
+void ContextProjectionBackward(
+    typename Tensor<real, DType>::Matrix& out_grad,
+    typename Tensor<real, DType>::Matrix& in_grad,
+    typename Tensor<real, DType>::Matrix& w_grad,
+    const typename Tensor<int, DType>::Vector& seq_vec,
+    size_t context_length,
+    int context_start,
+    size_t begin_pad,
+    bool is_padding,
+    size_t total_pad);
+
+template <DeviceType DType>
+void ContextProjectionBackwardData(
+    typename Tensor<real, DType>::Matrix& out_grad,
+    typename Tensor<real, DType>::Matrix& in_grad,
+    const typename Tensor<int, DType>::Vector& sequence,
+    size_t context_length,
+    int context_start);
+
+template <DeviceType DType>
+void ContextProjectionBackwardWeight(
+    typename Tensor<real, DType>::Matrix& out_grad,
+    typename Tensor<real, DType>::Matrix& w_grad,
+    const typename Tensor<int, DType>::Vector& seq_vec,
+    size_t context_length,
+    int context_start,
+    size_t total_pad,
+    size_t begin_pad);
+
+}  // namespace paddle
diff --git a/paddle/function/ContextProjectionOpGpu.cu b/paddle/function/ContextProjectionOpGpu.cu
new file mode 100644
index 0000000000..6a4a01a651
--- /dev/null
+++ b/paddle/function/ContextProjectionOpGpu.cu
@@ -0,0 +1,397 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "hl_base.h"
+#include "ContextProjectionOp.h"
+
+namespace paddle {
+
+template <bool padding>
+__global__ void KeContextProjectionForward(const real* input,
+                                           const int* sequence,
+                                           const real* weight,
+                                           real* output,
+                                           int input_dim,
+                                           int context_length,
+                                           int context_start,
+                                           int begin_pad) {
+  int idx = threadIdx.x;
+  int block_size = blockDim.x;
+  int sequenceId = blockIdx.x;
+  int seq_start = sequence[sequenceId];
+  int seq_end = sequence[sequenceId+1];
+  real value = 0;
+
+  int instances = seq_end - seq_start + context_length - 1;
+  output += seq_start * input_dim * context_length;
+  input += seq_start * input_dim;
+  for (int k = 0; k <= input_dim / block_size; k++) {
+    if (idx < input_dim) {
+      for (int i = 0; i < instances; i++) {
+        // Source row is i + context_start; out-of-range rows are padding.
+        if ((i + context_start) < 0) {
+          if (padding) {
+            value = weight[i * input_dim + idx];
+          } else {
+            continue;
+          }
+        } else if ((i + context_start) >= (seq_end - seq_start)) {
+          if (padding) {
+            value =
+              weight[(begin_pad + i + context_start - (seq_end - seq_start)) *
+                         input_dim + idx];
+          } else {
+            continue;
+          }
+        } else {
+          value = input[(i + context_start) * input_dim + idx];
+        }
+
+        int outx = (i - context_length) < 0 ? i : (context_length - 1);
+        int outy = (i - context_length) < 0 ? 0 : (i - (context_length - 1));
+        real* output_r =
+          output + outy * input_dim * context_length + outx * input_dim;
+        for (int j = outy; j < seq_end - seq_start; j++) {
+          output_r[idx] += value;
+          if (j - outy == outx) break;
+          output_r += (context_length - 1) * input_dim;
+        }
+      }
+    }
+    idx += block_size;
+  }
+}
+
+/**
+ * @brief   Context projection forward.
+ *
+ * @param[in]   input           input sequence.
+ * @param[in]   sequence        sequence index.
+ * @param[in]   weight          padding data.
+ * @param[out]  output          output sequence.
+ * @param[in]   num_sequences    number of sequences.
+ * @param[in]   input_dim        input sequence dimension.
+ * @param[in]   context_length   context length.
+ * @param[in]   context_start    context start.
+ * @param[in]   begin_pad        number of extra timesteps added at the
+ * beginning.
+ *
+ */
+void hl_context_projection_forward(const real* input,
+                                   const int* sequence,
+                                   const real* weight,
+                                   real* output,
+                                   size_t num_sequences,
+                                   size_t input_dim,
+                                   size_t context_length,
+                                   int context_start,
+                                   size_t begin_pad) {
+  CHECK_NOTNULL(input);
+  CHECK_NOTNULL(sequence);
+  CHECK_NOTNULL(output);
+
+  int block_size = 128;
+  int blocks_x = num_sequences;
+  int blocks_y = 1;
+  dim3 threads(block_size, 1);
+  dim3 grid(blocks_x, blocks_y);
+
+  if (weight) {
+    KeContextProjectionForward<true><<< grid, threads, 0, STREAM_DEFAULT >>>
+      (input, sequence, weight, output, input_dim,
+       context_length, context_start, begin_pad);
+  } else  {
+    KeContextProjectionForward<false><<< grid, threads, 0, STREAM_DEFAULT >>>
+      (input, sequence, weight, output, input_dim,
+       context_length, context_start, begin_pad);
+  }
+  CHECK_SYNC("hl_context_projection_forward failed");
+}
+
+template <>
+void ContextProjectionForward<DEVICE_TYPE_GPU>(GpuMatrix& output,
+                                               const GpuMatrix& input,
+                                               const GpuMatrix& weight,
+                                               const GpuIVector& sequence,
+                                               size_t context_length,
+                                               int context_start,
+                                               size_t begin_pad) {
+  hl_context_projection_forward(input.getData(),
+                                sequence.getData(),
+                                weight ? weight.getData() : nullptr,
+                                output.getData(),
+                                sequence.getSize() - 1,
+                                input.getWidth(),
+                                context_length,
+                                context_start,
+                                begin_pad);
+}
+
+__global__ void KeContextProjectionBackwardData(real* out_grad,
+                                                const int* sequence,
+                                                real* in_grad,
+                                                int input_dim,
+                                                int context_length,
+                                                int context_start) {
+  int idx = threadIdx.x;
+  int block_size = blockDim.x;
+  int sequenceId = blockIdx.x;
+  int seq_start = sequence[sequenceId];
+  int seq_end = sequence[sequenceId+1];
+  real value = 0;
+
+  int instances = seq_end - seq_start + context_length - 1;
+  out_grad += seq_start * input_dim * context_length;
+  in_grad += seq_start * input_dim;
+  for (int k = 0; k <= input_dim / block_size; k++) {
+    if (idx < input_dim) {
+      for (int i = 0; i < instances; i++) {
+        if ((i + context_start) < 0) {
+          continue;
+        } else if ((i + context_start) >= (seq_end - seq_start)) {
+          continue;
+        } else {
+          // start from the gradient already stored in in_grad (add-to).
+          value = in_grad[(i + context_start) * input_dim + idx];
+        }
+
+        int outx = (i - context_length) < 0 ? i : (context_length - 1);
+        int outy = (i - context_length) < 0 ? 0 : (i - (context_length - 1));
+        real* output_r =
+          out_grad + outy * input_dim * context_length + outx * input_dim;
+        for (int j = outy; j < seq_end - seq_start; j++) {
+          value += output_r[idx];
+          if (j - outy == outx) break;
+          output_r += (context_length - 1) * input_dim;
+        }
+        in_grad[(i + context_start) * input_dim + idx] = value;
+      }
+    }
+    idx += block_size;
+  }
+}
+
+/**
+ * @brief   Context projection backward data.
+ *
+ * @param[in]   out_grad         output gradient.
+ * @param[in]   sequence         sequence index.
+ * @param[out]  input_grad       input gradient.
+ * @param[in]   num_sequences    number of sequences.
+ * @param[in]   input_dim        input sequence dimension.
+ * @param[in]   context_length   context length.
+ * @param[in]   context_start    context start.
+ *
+ */
+void hl_context_projection_backward_data(real* out_grad,
+                                         const int* sequence,
+                                         real* input_grad,
+                                         size_t num_sequences,
+                                         size_t input_dim,
+                                         size_t context_length,
+                                         int context_start) {
+  CHECK_NOTNULL(out_grad);
+  CHECK_NOTNULL(sequence);
+  CHECK_NOTNULL(input_grad);
+
+  int block_size = 128;
+  int blocks_x = num_sequences;
+  int blocks_y = 1;
+  dim3 threads(block_size, 1);
+  dim3 grid(blocks_x, blocks_y);
+  KeContextProjectionBackwardData<<< grid, threads, 0, STREAM_DEFAULT >>>
+    (out_grad, sequence, input_grad, input_dim, context_length, context_start);
+  CHECK_SYNC("hl_context_projection_backward_data failed");
+}
+
+template <>
+void ContextProjectionBackwardData<DEVICE_TYPE_GPU>(GpuMatrix& out_grad,
+                                                    GpuMatrix& in_grad,
+                                                    const GpuIVector& sequence,
+                                                    size_t context_length,
+                                                    int context_start) {
+  hl_context_projection_backward_data(out_grad.getData(),
+                                      sequence.getData(),
+                                      in_grad.getData(),
+                                      sequence.getSize() - 1,
+                                      in_grad.getWidth(),
+                                      context_length,
+                                      context_start);
+}
+
+template<int THREADS_X, int THREADS_Y>
+__global__ void KeContextProjectionBackwardWeight(real* out_grad,
+                                                  const int* sequence,
+                                                  real* w_grad,
+                                                  int num_sequences,
+                                                  int w_dim,
+                                                  int context_length,
+                                                  int context_start,
+                                                  int begin_pad) {
+  __shared__ real sum_s[THREADS_Y][THREADS_X];
+  int pad_of_block = (w_dim + THREADS_X - 1) / THREADS_X;
+  const int idx = threadIdx.x;
+  const int idy = threadIdx.y;
+  int padId = blockIdx.x / pad_of_block;
+  int weight_idx = idx + THREADS_X * (blockIdx.x % pad_of_block);
+  int instanceId;
+  real value = 0;
+  real* output_r;
+
+  sum_s[idy][idx] = 0.0f;
+  if (weight_idx < w_dim) {
+    for (int seqId = idy; seqId < num_sequences; seqId += THREADS_Y) {
+      int seq_start = sequence[seqId];
+      int seq_end = sequence[seqId+1];
+      output_r = out_grad + seq_start * w_dim * context_length;
+
+      if (context_start < 0) {
+        if (padId + context_start < 0) {
+          instanceId = padId;
+        } else {
+          // end-padding row (begin_pad > 0): instance that read weight[padId].
+          instanceId = (padId - begin_pad) +
+            (seq_end - seq_start) - context_start;
+        }
+      } else {
+        if (padId + (seq_end - seq_start) < context_start) {
+          continue;
+        } else {
+          // begin_pad == 0 is assumed here, so padId is an end-padding row.
+          instanceId = padId + (seq_end - seq_start) - context_start;
+        }
+      }
+
+      int outx = (instanceId - context_length) < 0 ?
+                 instanceId : (context_length - 1);
+      int outy = (instanceId - context_length) < 0 ?
+                 0 : (instanceId - (context_length - 1));
+      output_r += outy * w_dim * context_length + outx * w_dim;
+      for (int j = outy; j < seq_end - seq_start; j++) {
+        value += output_r[weight_idx];
+        if (j - outy == outx) break;
+        output_r += (context_length - 1) * w_dim;
+      }
+    }
+    sum_s[idy][idx] = value;
+  }
+  __syncthreads();
+
+  for (int stride = THREADS_Y/2; stride > 0; stride = stride/2) {
+    if (idy < stride) {
+      sum_s[idy][idx] += sum_s[idy + stride][idx];
+    }
+    __syncthreads();
+  }
+  __syncthreads();
+
+  if (weight_idx < w_dim) {
+    if (idy == 0) {
+      w_grad[padId * w_dim + weight_idx] += sum_s[0][idx];
+    }
+  }
+}
+
+/**
+ * @brief   Context projection backward weight.
+ *
+ * @param[in]   out_grad         output gradient.
+ * @param[in]   sequence         sequence index.
+ * @param[out]  w_grad           weight gradient.
+ * @param[in]   num_sequences    number of sequences.
+ * @param[in]   w_dim            input sequence dimension.
+ * @param[in]   total_pad        number of extra timesteps.
+ * @param[in]   context_length   context length.
+ * @param[in]   context_start    context start.
+ * @param[in]   begin_pad        number of extra timesteps added at the
+ * beginning.
+ *
+ */
+void hl_context_projection_backward_weight(real* out_grad,
+                                           const int* sequence,
+                                           real* w_grad,
+                                           size_t num_sequences,
+                                           size_t w_dim,
+                                           size_t total_pad,
+                                           size_t context_length,
+                                           int context_start,
+                                           size_t begin_pad) {
+  CHECK_NOTNULL(out_grad);
+  CHECK_NOTNULL(sequence);
+  CHECK_NOTNULL(w_grad);
+
+  int threads_x = 32;
+  int threads_y = 32;
+  int blocks_x = total_pad * ((w_dim + threads_x - 1) / threads_x);
+  dim3 threads(threads_x, threads_y);
+  dim3 grid(blocks_x, 1);
+
+  KeContextProjectionBackwardWeight<32, 32>
+    <<< grid, threads, 0, STREAM_DEFAULT >>>
+    (out_grad, sequence, w_grad, num_sequences, w_dim,
+     context_length, context_start, begin_pad);
+  CHECK_SYNC("hl_context_projection_backward_weight failed");
+}
+
+template <>
+void ContextProjectionBackwardWeight<DEVICE_TYPE_GPU>(
+        GpuMatrix& out_grad,
+        GpuMatrix& w_grad,
+        const GpuIVector& seq_vec,
+        size_t context_length,
+        int context_start,
+        size_t total_pad,
+        size_t begin_pad) {
+  hl_context_projection_backward_weight(out_grad.getData(),
+                                        seq_vec.getData(),
+                                        w_grad.getData(),
+                                        seq_vec.getSize() - 1,
+                                        w_grad.getWidth(),
+                                        total_pad,
+                                        context_length,
+                                        context_start,
+                                        begin_pad);
+}
+
+template <>
+void ContextProjectionBackward<DEVICE_TYPE_GPU>(GpuMatrix& out_grad,
+                                                GpuMatrix& in_grad,
+                                                GpuMatrix& w_grad,
+                                                const GpuIVector& sequence,
+                                                size_t context_length,
+                                                int context_start,
+                                                size_t begin_pad,
+                                                bool is_padding,
+                                                size_t total_pad) {
+  if (in_grad) {
+    ContextProjectionBackwardData<DEVICE_TYPE_GPU>(
+        out_grad,
+        in_grad,
+        sequence,
+        context_length,
+        context_start);
+  }
+  if (is_padding && w_grad) {
+    ContextProjectionBackwardWeight<DEVICE_TYPE_GPU>(
+        out_grad,
+        w_grad,
+        sequence,
+        context_length,
+        context_start,
+        total_pad,
+        begin_pad);
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/function/ContextProjectionOpTest.cpp b/paddle/function/ContextProjectionOpTest.cpp
new file mode 100644
index 0000000000..6223d2fd23
--- /dev/null
+++ b/paddle/function/ContextProjectionOpTest.cpp
@@ -0,0 +1,172 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include "FunctionTest.h"
+#include "paddle/math/Matrix.h"
+#include "paddle/testing/TestUtil.h"
+
+using namespace paddle;  // NOLINT
+
+void testMatrixProjectionForward(int context_start,
+                                 size_t context_length,
+                                 bool is_padding,
+                                 size_t batch_size,
+                                 size_t input_dim) {
+  size_t pad = std::max(0, -context_start) +
+               std::max(0, (int)(context_start + context_length - 1));
+  if (pad == 0) is_padding = false;
+
+  FunctionCompare compare("ContextProjectionForward",
+                          FuncConfig()
+                              .set("context_length", context_length)
+                              .set("context_start", context_start)
+                              .set("begin_pad", std::max(0, -context_start)));
+
+  CpuMatrix cpu_in(batch_size, input_dim);
+  cpu_in.randomizeUniform();
+  GpuMatrix gpu_in(batch_size, input_dim);
+  gpu_in.copyFrom(cpu_in);
+  auto cpu_weight =
+      is_padding ? std::make_shared<CpuMatrix>(pad, input_dim) : nullptr;
+  auto gpu_weight =
+      is_padding ? std::make_shared<GpuMatrix>(pad, input_dim) : nullptr;
+  if (is_padding) {
+    cpu_weight->randomizeUniform();
+    gpu_weight->copyFrom(*cpu_weight);
+  }
+  IVectorPtr cpu_seq;
+  generateSequenceStartPositions(batch_size, cpu_seq);
+  IVectorPtr gpu_seq = IVector::create(cpu_seq->getSize(), true);
+  gpu_seq->copyFrom(*cpu_seq);
+
+  CpuMatrix cpu_out(batch_size, input_dim * context_length);
+  GpuMatrix gpu_out(batch_size, input_dim * context_length);
+  cpu_out.randomizeUniform();
+  gpu_out.copyFrom(cpu_out);
+
+  compare.getCpuFunction()->calc(
+      {Tensor(cpu_in.getData(), Dims{batch_size, input_dim}),
+       Tensor(cpu_weight ? cpu_weight->getData() : nullptr,
+              Dims{pad, input_dim}),
+       Tensor(reinterpret_cast<real*>(cpu_seq->getData()),
+              Dims{cpu_seq->getSize()})},
+      {Tensor(cpu_out.getData(), Dims{batch_size, input_dim * context_length})},
+      {});
+  compare.getGpuFunction()->calc(
+      {Tensor(gpu_in.getData(), Dims{batch_size, input_dim}),
+       Tensor(gpu_weight ? gpu_weight->getData() : nullptr,
+              Dims{pad, input_dim}),
+       Tensor(reinterpret_cast<real*>(gpu_seq->getData()),
+              Dims{gpu_seq->getSize()})},
+      {Tensor(gpu_out.getData(), Dims{batch_size, input_dim * context_length})},
+      {});
+
+  autotest::TensorCheckEqual(cpu_out, gpu_out);
+}
+
+void testMatrixProjectionBackward(int context_start,
+                                  int context_length,
+                                  bool is_padding,
+                                  size_t batch_size,
+                                  size_t input_dim) {
+  size_t pad = std::max(0, -context_start) +
+               std::max(0, (int)(context_start + context_length - 1));
+  if (pad == 0) is_padding = false;
+
+  FunctionCompare compare("ContextProjectionBackward",
+                          FuncConfig()
+                              .set("context_length", context_length)
+                              .set("context_start", context_start)
+                              .set("begin_pad", std::max(0, -context_start))
+                              .set("is_padding", is_padding)
+                              .set("total_pad", pad));
+
+  CpuMatrix cpu_in_grad(batch_size, input_dim);
+  cpu_in_grad.randomizeUniform();
+  GpuMatrix gpu_in_grad(batch_size, input_dim);
+  gpu_in_grad.copyFrom(cpu_in_grad);
+
+  CpuMatrix cpu_out_grad(batch_size, input_dim * context_length);
+  cpu_out_grad.randomizeUniform();
+  GpuMatrix gpu_out_grad(batch_size, input_dim * context_length);
+  gpu_out_grad.copyFrom(cpu_out_grad);
+
+  IVectorPtr cpu_seq;
+  generateSequenceStartPositions(batch_size, cpu_seq);
+  IVectorPtr gpu_seq = IVector::create(cpu_seq->getSize(), true);
+  gpu_seq->copyFrom(*cpu_seq);
+
+  auto cpu_w_grad =
+      is_padding ? std::make_shared<CpuMatrix>(pad, input_dim) : nullptr;
+  auto gpu_w_grad =
+      is_padding ? std::make_shared<GpuMatrix>(pad, input_dim) : nullptr;
+  if (is_padding) {
+    cpu_w_grad->randomizeUniform();
+    gpu_w_grad->copyFrom(*cpu_w_grad);
+  }
+
+  compare.getCpuFunction()->calc(
+      {Tensor(cpu_in_grad.getData(), Dims{batch_size, input_dim}),
+       Tensor(cpu_w_grad ? cpu_w_grad->getData() : nullptr,
+              Dims{pad, input_dim}),
+       Tensor(reinterpret_cast<real*>(cpu_seq->getData()),
+              Dims{cpu_seq->getSize()})},
+      {Tensor(cpu_out_grad.getData(),
+              Dims{batch_size, input_dim * context_length})},
+      {});
+
+  compare.getGpuFunction()->calc(
+      {Tensor(gpu_in_grad.getData(), Dims{batch_size, input_dim}),
+       Tensor(gpu_w_grad ? gpu_w_grad->getData() : nullptr,
+              Dims{pad, input_dim}),
+       Tensor(reinterpret_cast<real*>(gpu_seq->getData()),
+              Dims{gpu_seq->getSize()})},
+      {Tensor(gpu_out_grad.getData(),
+              Dims{batch_size, input_dim * context_length})},
+      {});
+
+  autotest::TensorCheckErr(cpu_in_grad, gpu_in_grad);
+  if (is_padding) {
+    autotest::TensorCheckErr(*cpu_w_grad, *gpu_w_grad);
+  }
+}
+
+TEST(ContextProjection, projection) {
+  for (auto context_start : {-5, -3, -1, 0, 3}) {
+    for (auto context_length : {1, 2, 5, 7}) {
+      for (auto trainable_padding : {false, true}) {
+        for (auto batch_size : {1, 2, 5, 20, 100}) {
+          for (auto input_dim : {15, 32, 63, 128, 200}) {
+            VLOG(3) << " context_start=" << context_start
+                    << " context_length=" << context_length
+                    << " trainable_padding=" << trainable_padding
+                    << " batch_size=" << batch_size
+                    << " input_dim=" << input_dim;
+            testMatrixProjectionForward(context_start,
+                                        context_length,
+                                        trainable_padding,
+                                        batch_size,
+                                        input_dim);
+            testMatrixProjectionBackward(context_start,
+                                         context_length,
+                                         trainable_padding,
+                                         batch_size,
+                                         input_dim);
+          }
+        }
+      }
+    }
+  }
+}
diff --git a/paddle/function/CrossMapNormalOp.cpp b/paddle/function/CrossMapNormalOp.cpp
new file mode 100644
index 0000000000..92980c503f
--- /dev/null
+++ b/paddle/function/CrossMapNormalOp.cpp
@@ -0,0 +1,226 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "CrossMapNormalOp.h"
+#include "paddle/math/Vector.h"
+
+namespace paddle {
+
+template <>
+void CrossMapNormal<DEVICE_TYPE_CPU>(real* outputs,
+                                     real* denoms,
+                                     const real* inputs,
+                                     size_t numSamples,
+                                     size_t channels,
+                                     size_t height,
+                                     size_t width,
+                                     size_t size,
+                                     real scale,
+                                     real pow) {
+  size_t oneImage = height * width;
+  size_t oneSample = channels * oneImage;
+
+  CpuVector outputsV(numSamples * oneSample, outputs);
+  CpuVector inputsV(numSamples * oneSample, const_cast<real*>(inputs));
+  CpuVector denomsV(numSamples * oneSample, denoms);
+
+  // f(x) = x * ( 1 + scale * SUM((x)^2) )^(-pow)
+  // x represents inputs
+  // f(x) represents outputs
+  // denoms save the intermediate result for backward
+  denomsV = denomsV.constant(1.0);
+  const int start = -((int)size - 1) / 2;
+  const int end = (int)size + start;
+  for (size_t i = 0; i < numSamples; i++) {
+    real* oneDenom = denoms + i * oneSample;
+    real* oneInput = const_cast<real*>(inputs) + i * oneSample;
+    for (int c = 0; c < (int)channels; c++) {
+      CpuVector denom(oneImage, oneDenom + c * oneImage);
+      for (int s = start; s < end; s++) {
+        if (c + s >= 0 && c + s < (int)channels) {
+          CpuVector input(oneImage, oneInput + (c + s) * oneImage);
+          denom += input.square() * scale;
+        }
+      }
+    }
+  }
+
+  outputsV = inputsV * denomsV.pow(-pow);
+}
+
+template <>
+void CrossMapNormalGrad<DEVICE_TYPE_CPU>(real* inputsGrad,
+                                         const real* inputsValue,
+                                         const real* outputsValue,
+                                         const real* outputsGrad,
+                                         const real* denoms,
+                                         size_t numSamples,
+                                         size_t channels,
+                                         size_t height,
+                                         size_t width,
+                                         size_t size,
+                                         real scale,
+                                         real pow) {
+  size_t oneSample = channels * height * width;
+  std::function<CpuVector(real*, size_t)> oneImage = [=](real* data,
+                                                         size_t offset) {
+    return CpuVector(height * width, data + offset);
+  };
+
+  const int start = -((int)size) / 2;
+  const int end = (int)size + start;
+  const real ratio = -(real)2 * scale * pow;
+  for (size_t i = 0; i < numSamples; i++) {
+    size_t sOffset = i * oneSample;
+    real* oneInputGrad = inputsGrad + sOffset;
+    real* oneInputValue = const_cast<real*>(inputsValue) + sOffset;
+    real* oneDenom = const_cast<real*>(denoms) + sOffset;
+    real* oneOutputGrad = const_cast<real*>(outputsGrad) + sOffset;
+    real* oneOutputValue = const_cast<real*>(outputsValue) + sOffset;
+
+    for (int c = 0; c < (int)channels; c++) {
+      size_t cOffset = c * height * width;
+      CpuVector inputGrad = oneImage(oneInputGrad, cOffset);
+      CpuVector inputValue = oneImage(oneInputValue, cOffset);
+      CpuVector denom = oneImage(oneDenom, cOffset);
+      CpuVector outputGrad = oneImage(oneOutputGrad, cOffset);
+
+      inputGrad = inputGrad + denom.pow(-pow) * outputGrad;
+      for (int s = start; s < end; s++) {
+        if (c + s >= 0 && c + s < (int)channels) {
+          size_t offset = (c + s) * height * width;
+          CpuVector output = oneImage(oneOutputValue, offset);
+          CpuVector outputGrad = oneImage(oneOutputGrad, offset);
+          CpuVector denom = oneImage(oneDenom, offset);
+
+          inputGrad += ((outputGrad * output * ratio) / denom) * inputValue;
+        }
+      }
+    }
+  }
+}
+
+/**
+ * \brief {o_0, o_1} = calc(i_0)
+ *
+ * \param inputs[0] input value.
+ * \param outputs[0] output value.
+ * \param outputs[1] denoms.
+ */
+template <DeviceType Device>
+class CrossMapNormalFunc : public FunctionBase {
+public:
+  void init(const FuncConfig& config) override {
+    size_ = config.get<size_t>("size");
+    scale_ = config.get<real>("scale");
+    pow_ = config.get<real>("pow");
+  }
+
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK_EQ((size_t)1, inputs.size());
+    CHECK_EQ((size_t)2, outputs.size());
+
+    CHECK_EQ(inputs[0].shape().ndims(), (size_t)4);
+    CHECK(inputs[0].shape() == outputs[0].shape());
+    CHECK(inputs[0].shape() == outputs[1].shape());
+
+    CHECK_EQ(outputs[0].getArgType(), ASSIGN_TO);
+    CHECK_EQ(outputs[1].getArgType(), ASSIGN_TO);
+    size_t samples = inputs[0].shape()[0];
+    size_t channels = inputs[0].shape()[1];
+    size_t height = inputs[0].shape()[2];
+    size_t width = inputs[0].shape()[3];
+
+    CrossMapNormal<Device>(outputs[0].data<real>(),
+                           outputs[1].data<real>(),
+                           inputs[0].data<real>(),
+                           samples,
+                           channels,
+                           height,
+                           width,
+                           size_,
+                           scale_,
+                           pow_);
+  }
+
+private:
+  size_t size_;
+  real scale_;
+  real pow_;
+};
+
+/**
+ * \brief {o_0} = calc(i_0, i_1, i_2, i_3)
+ *
+ * \param inputs[0] input value.
+ * \param inputs[1] output value.
+ * \param inputs[2] output grad.
+ * \param inputs[3] denoms.
+ * \param outputs[0] input grad.
+ */
+template <DeviceType Device>
+class CrossMapNormalGradFunc : public FunctionBase {
+public:
+  void init(const FuncConfig& config) override {
+    size_ = config.get<size_t>("size");
+    scale_ = config.get<real>("scale");
+    pow_ = config.get<real>("pow");
+  }
+
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK_EQ((size_t)4, inputs.size());
+    CHECK_EQ((size_t)1, outputs.size());
+
+    CHECK_EQ(inputs[0].shape().ndims(), (size_t)4);
+    CHECK(inputs[0].shape() == inputs[1].shape());
+    CHECK(inputs[0].shape() == inputs[2].shape());
+    CHECK(inputs[0].shape() == inputs[3].shape());
+    CHECK(inputs[0].shape() == outputs[0].shape());
+
+    // TODO(hedaoyuan): need support ASSIGN_TO mode.
+    CHECK_EQ(outputs[0].getArgType(), ADD_TO);
+
+    size_t samples = inputs[0].shape()[0];
+    size_t channels = inputs[0].shape()[1];
+    size_t height = inputs[0].shape()[2];
+    size_t width = inputs[0].shape()[3];
+
+    CrossMapNormalGrad<Device>(outputs[0].data<real>(),
+                               inputs[0].data<real>(),
+                               inputs[1].data<real>(),
+                               inputs[2].data<real>(),
+                               inputs[3].data<real>(),
+                               samples,
+                               channels,
+                               height,
+                               width,
+                               size_,
+                               scale_,
+                               pow_);
+  }
+
+private:
+  size_t size_;
+  real scale_;
+  real pow_;
+};
+
+REGISTER_TYPED_FUNC(CrossMapNormal, CPU, CrossMapNormalFunc);
+REGISTER_TYPED_FUNC(CrossMapNormalGrad, CPU, CrossMapNormalGradFunc);
+#ifndef PADDLE_ONLY_CPU
+REGISTER_TYPED_FUNC(CrossMapNormal, GPU, CrossMapNormalFunc);
+REGISTER_TYPED_FUNC(CrossMapNormalGrad, GPU, CrossMapNormalGradFunc);
+#endif
+
+}  // namespace paddle
diff --git a/paddle/function/CrossMapNormalOp.h b/paddle/function/CrossMapNormalOp.h
new file mode 100644
index 0000000000..b1e401ad0a
--- /dev/null
+++ b/paddle/function/CrossMapNormalOp.h
@@ -0,0 +1,81 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "Function.h"
+
+namespace paddle {
+
+/**
+ * \brief   Cross map response normalization forward.
+ *          The data structure of image data is NCHW.
+ *
+ * \param[out]  outputs     output data.
+ * \param[out]  denoms      denoms buffer (written by the forward pass).
+ * \param[in]   inputs      input data.
+ * \param[in]   numSamples  batch size of input image.
+ * \param[in]   channels    number of channels.
+ * \param[in]   height      image height.
+ * \param[in]   width       image width.
+ * \param[in]   size        number of channels in the normalization window.
+ * \param[in]   scale       factor applied to the windowed sum of squares.
+ * \param[in]   pow         exponent applied to the denoms values.
+ *
+ */
+template <DeviceType Device>
+void CrossMapNormal(real* outputs,
+                    real* denoms,
+                    const real* inputs,
+                    size_t numSamples,
+                    size_t channels,
+                    size_t height,
+                    size_t width,
+                    size_t size,
+                    real scale,
+                    real pow);
+
+/**
+ * \brief   Cross map response normalization backward.
+ *          The data structure of image data is NCHW.
+ *
+ * \param[in,out] inputsGrad     input grad; results are added to it.
+ * \param[in]   inputsValue     input value.
+ * \param[in]   outputsValue    output value.
+ * \param[in]   outputsGrad     output grad.
+ * \param[in]   denoms          denoms buffer.
+ * \param[in]   numSamples      batch size of input image.
+ * \param[in]   channels        number of channels.
+ * \param[in]   height          image height.
+ * \param[in]   width           image width.
+ * \param[in]   size            number of channels in the normalization window.
+ * \param[in]   scale           scale factor.
+ * \param[in]   pow             exponent.
+ *
+ */
+template <DeviceType Device>
+void CrossMapNormalGrad(real* inputsGrad,
+                        const real* inputsValue,
+                        const real* outputsValue,
+                        const real* outputsGrad,
+                        const real* denoms,
+                        size_t numSamples,
+                        size_t channels,
+                        size_t height,
+                        size_t width,
+                        size_t size,
+                        real scale,
+                        real pow);
+
+}  // namespace paddle
diff --git a/paddle/function/CrossMapNormalOpGpu.cu b/paddle/function/CrossMapNormalOpGpu.cu
new file mode 100644
index 0000000000..b33dd10834
--- /dev/null
+++ b/paddle/function/CrossMapNormalOpGpu.cu
@@ -0,0 +1,156 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "hl_base.h"
+#include "CrossMapNormalOp.h"
+
+namespace paddle {
+
+__global__ void KeCMRNormFillScale(size_t imageSize, const real* in,
+                                   real* scale, size_t channels,
+                                   size_t height, size_t width, size_t size,
+                                   real alpha) {
+  const int idx = threadIdx.x + blockIdx.x * blockDim.x;
+  if (idx < imageSize) {  // one thread per (n, h, w) position
+    const int w = idx % width;
+    const int h = (idx / width) % height;
+    const int n = idx / width / height;
+    const int offset = (n * channels * height + h) * width + w;
+    // offset addresses channel 0 of this position in the NCHW buffers.
+    in += offset;
+    scale += offset;
+    const int step = height * width;  // distance between adjacent channels
+    const int pre_pad = (size - 1) / 2;
+    const int post_pad = size - pre_pad - 1;
+    // accum: running sum of squares over a window of `size` channels.
+    real accum = 0;
+    int index = 0;
+    while (index < channels + post_pad) {
+      if (index < channels) {  // channel entering the window
+        accum += in[index * step] * in[index * step];
+      }
+      if (index >= size) {  // channel leaving the window
+        accum -= in[(index - size) * step] * in[(index - size) * step];
+      }
+      if (index >= post_pad) {  // write the value for channel index - post_pad
+        scale[(index - post_pad) * step] = 1. + accum * alpha;
+      }
+      ++index;
+    }
+  }
+}
+
+__global__ void KeCMRNormOutput(size_t inputSize, const real* in,
+                                const real* scale, real negative_beta,
+                                real* out) {
+  // Element-wise: out = in * scale^negative_beta, one thread per element.
+  const int tid = threadIdx.x + blockIdx.x * blockDim.x;
+  if (tid >= inputSize) return;  // threads past the end do nothing
+  out[tid] = in[tid] * pow(scale[tid], negative_beta);
+}
+
+template <>
+void CrossMapNormal<DEVICE_TYPE_GPU>(real* outputs,
+                                     real* denoms,
+                                     const real* inputs,
+                                     size_t numSamples,
+                                     size_t channels,
+                                     size_t height,
+                                     size_t width,
+                                     size_t size,
+                                     real scale,
+                                     real pow) {
+  // Pass 1: one thread per (sample, row, column) fills the denoms buffer.
+  const int kThreads = 1024;
+  const size_t imageSize = numSamples * height * width;
+  const int scaleBlocks = (imageSize + kThreads - 1) / kThreads;
+  KeCMRNormFillScale<<<scaleBlocks, kThreads, 0, STREAM_DEFAULT>>>
+    (imageSize, inputs, denoms, channels, height, width, size, scale);
+
+  // Pass 2: one thread per element computes inputs * denoms^(-pow).
+  const size_t inputSize = imageSize * channels;
+  const int outputBlocks = (inputSize + kThreads - 1) / kThreads;
+  KeCMRNormOutput<<<outputBlocks, kThreads, 0, STREAM_DEFAULT>>>
+    (inputSize, inputs, denoms, -pow, outputs);
+  CHECK_SYNC("CrossMapNormal");
+}
+
+__global__ void KeCMRNormDiff(size_t imageSize, const real* bottom_data,
+                              const real* top_data, const real* scale,
+                              const real* top_diff, size_t channels,
+                              size_t height, size_t width, size_t size,
+                              real negative_beta, real cache_ratio,
+                              real* bottom_diff ) {
+  const int idx = threadIdx.x + blockIdx.x * blockDim.x;
+  if (idx < imageSize) {  // one thread per (n, h, w) position
+    const int w = idx % width;
+    const int h = (idx / width) % height;
+    const int n = idx / width / height;
+    const int offset = (n * channels * height + h) * width + w;
+    bottom_data += offset;  // all five pointers now address channel 0
+    top_data += offset;
+    scale += offset;
+    top_diff += offset;
+    bottom_diff += offset;
+    // The pad split mirrors the forward kernel's (they differ for even size).
+    const int step = height * width;  // distance between adjacent channels
+    const int pre_pad = size - (size + 1) / 2;
+    const int post_pad = size - pre_pad - 1;
+    // accum: windowed sum of top_diff * top_data / scale along channels.
+    int index = 0;
+    real accum = 0;
+    while (index < channels + post_pad) {
+      if (index < channels) {  // channel entering the window
+        accum += top_diff[index * step] * top_data[index * step] /
+          scale[index * step];
+      }
+      if (index >= size) {  // channel leaving the window
+        accum -= top_diff[(index - size) * step] *
+          top_data[(index - size) * step] / scale[(index - size) * step];
+      }
+      if (index >= post_pad) {  // update channel index - post_pad
+        bottom_diff[(index - post_pad) * step] +=  // added to existing value
+          top_diff[(index - post_pad) * step] *
+          pow(scale[(index - post_pad) * step], negative_beta) - cache_ratio *
+          bottom_data[(index - post_pad) * step] * accum;
+      }
+      ++index;
+    }
+  }
+}
+
+template <>
+void CrossMapNormalGrad<DEVICE_TYPE_GPU>(real* inputsGrad,
+                                         const real* inputsValue,
+                                         const real* outputsValue,
+                                         const real* outputsGrad,
+                                         const real* denoms,
+                                         size_t numSamples,
+                                         size_t channels,
+                                         size_t height,
+                                         size_t width,
+                                         size_t size,
+                                         real scale,
+                                         real pow) {
+  // One thread per (sample, row, column); each walks the channel axis.
+  const int kThreads = 1024;
+  const size_t imageSize = numSamples * height * width;
+  const int numBlocks = (imageSize + kThreads - 1) / kThreads;
+  KeCMRNormDiff <<<numBlocks, kThreads, 0, STREAM_DEFAULT>>>
+    (imageSize, inputsValue, outputsValue, denoms, outputsGrad, channels,
+      height, width, size, -pow, 2.0f * pow * scale, inputsGrad);
+  CHECK_SYNC("CrossMapNormalGrad");
+}
+
+}  // namespace paddle
diff --git a/paddle/function/CrossMapNormalOpTest.cpp b/paddle/function/CrossMapNormalOpTest.cpp
new file mode 100644
index 0000000000..d65d9310af
--- /dev/null
+++ b/paddle/function/CrossMapNormalOpTest.cpp
@@ -0,0 +1,75 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include "FunctionTest.h"
+
+namespace paddle {
+
+TEST(CrossMapNormal, real) {
+  for (size_t batch : {5, 32}) {
+    for (size_t maps : {1, 5, 32}) {
+      for (size_t rows : {5, 33, 100}) {
+        for (size_t cols : {5, 32, 96}) {
+          for (size_t window : {1, 2, 3, 5, 7}) {
+            VLOG(3) << " numSamples=" << batch << " channels=" << maps
+                    << " imgSizeH=" << rows << " imgSizeW=" << cols
+                    << " size=" << window;
+
+            FuncConfig config;
+            config.set("size", window);
+            config.set("scale", (real)1.5).set("pow", (real)0.5);
+            FunctionCompare compare("CrossMapNormal", config);
+            Dims dims{batch, maps, rows, cols};
+            // One input tensor and two output tensors, all of shape dims.
+            compare.cmpWithArg({Tensor(nullptr, dims)},
+                               {Tensor(nullptr, dims), Tensor(nullptr, dims)},
+                               {});
+          }
+        }
+      }
+    }
+  }
+}
+
+TEST(CrossMapNormalGrad, real) {
+  for (size_t batch : {5, 32}) {
+    for (size_t maps : {1, 5, 32}) {
+      for (size_t rows : {5, 33, 100}) {
+        for (size_t cols : {5, 32, 96}) {
+          for (size_t window : {1, 2, 3, 5, 7}) {
+            VLOG(3) << " numSamples=" << batch << " channels=" << maps
+                    << " imgSizeH=" << rows << " imgSizeW=" << cols
+                    << " size=" << window;
+
+            FuncConfig config;
+            config.set("size", window);
+            config.set("scale", (real)1.5).set("pow", (real)0.5);
+            FunctionCompare compare("CrossMapNormalGrad", config);
+            Dims dims{batch, maps, rows, cols};
+            // Four input tensors and one output tensor, all of shape dims.
+            compare.cmpWithArg({Tensor(nullptr, dims),
+                                Tensor(nullptr, dims),
+                                Tensor(nullptr, dims),
+                                Tensor(nullptr, dims)},
+                               {Tensor(nullptr, dims)},
+                               {});
+          }
+        }
+      }
+    }
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/function/Function.cpp b/paddle/function/Function.cpp
new file mode 100644
index 0000000000..dbe3a4e9f6
--- /dev/null
+++ b/paddle/function/Function.cpp
@@ -0,0 +1,95 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "Function.h"
+
+namespace paddle {
+
+template <>  // NOTE: no type tag is stored; read with the T used in set().
+size_t FuncConfig::get<size_t>(const std::string& key) const {
+  auto it = valueMap_.find(key);
+  CHECK(it != valueMap_.end()) << "Cannot find value: '" << key << "'";
+  return it->second.s;  // size_t member of the union
+}
+
+template <>
+real FuncConfig::get<real>(const std::string& key) const {
+  auto it = valueMap_.find(key);
+  CHECK(it != valueMap_.end()) << "Cannot find value: '" << key << "'";
+  return it->second.r;  // real member of the union
+}
+
+template <>
+int FuncConfig::get<int>(const std::string& key) const {
+  auto it = valueMap_.find(key);
+  CHECK(it != valueMap_.end()) << "Cannot find value: '" << key << "'";
+  return it->second.i;  // int member of the union
+}
+
+template <>
+bool FuncConfig::get<bool>(const std::string& key) const {
+  auto it = valueMap_.find(key);
+  CHECK(it != valueMap_.end()) << "Cannot find value: '" << key << "'";
+  return it->second.b;  // bool member of the union
+}
+
+template <>
+FuncConfig& FuncConfig::set<size_t>(const std::string& key, size_t v) {
+  CHECK_EQ(static_cast<int>(valueMap_.count(key)), 0) << "Duplicated value: "
+                                                      << key;
+  valueMap_[key].s = v;  // a key may be set only once (checked above)
+  return *this;          // allows chained set() calls
+}
+
+template <>
+FuncConfig& FuncConfig::set<real>(const std::string& key, real v) {
+  CHECK_EQ(static_cast<int>(valueMap_.count(key)), 0) << "Duplicated value: "
+                                                      << key;
+  valueMap_[key].r = v;  // a key may be set only once (checked above)
+  return *this;          // allows chained set() calls
+}
+
+template <>
+FuncConfig& FuncConfig::set<int>(const std::string& key, int v) {
+  CHECK_EQ(static_cast<int>(valueMap_.count(key)), 0) << "Duplicated value: "
+                                                      << key;
+  valueMap_[key].i = v;  // a key may be set only once (checked above)
+  return *this;          // allows chained set() calls
+}
+
+template <>
+FuncConfig& FuncConfig::set<bool>(const std::string& key, bool v) {
+  CHECK_EQ(static_cast<int>(valueMap_.count(key)), 0) << "Duplicated value: "
+                                                      << key;
+  valueMap_[key].b = v;  // a key may be set only once (checked above)
+  return *this;          // allows chained set() calls
+}
+
+// Appends a BufferArg built from `arg` together with the explicit `shape`.
+void BufferArgs::addArg(const Matrix& arg, const TensorShape& shape,
+                        ArgType argType) {
+  args_.emplace_back(std::make_shared<BufferArg>(arg, shape, argType));
+}
+
+void BufferArgs::addArg(const CpuSparseMatrix& arg, ArgType argType) {
+  args_.emplace_back(std::make_shared<SparseMatrixArg>(arg, argType));
+}
+
+void BufferArgs::addArg(const GpuSparseMatrix& arg, ArgType argType) {
+  args_.emplace_back(std::make_shared<SparseMatrixArg>(arg, argType));
+}
+
+ClassRegistrar<FunctionBase> FunctionBase::funcRegistrar_;
+
+}  // namespace paddle
diff --git a/paddle/function/Function.h b/paddle/function/Function.h
new file mode 100644
index 0000000000..249f8f9cfa
--- /dev/null
+++ b/paddle/function/Function.h
@@ -0,0 +1,126 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <map>
+#include <vector>
+#include "BufferArg.h"
+#include "paddle/math/Matrix.h"
+#include "paddle/utils/ClassRegistrar.h"
+
+namespace paddle {
+
+/**
+ * Function configuration: a string-keyed map of scalar settings.
+ * This is the argument type of FunctionBase::init().
+ * A follow-up may move this data structure into a Proto message.
+ */
+class FuncConfig {
+public:
+  union value {
+    size_t s;  // accessed by get<size_t> / set<size_t>
+    real r;    // accessed by get<real> / set<real>
+    int i;     // accessed by get<int> / set<int>
+    bool b;    // accessed by get<bool> / set<bool>
+  };
+
+  template <typename T>
+  T get(const std::string& key) const;  // specialized in Function.cpp
+
+  template <typename T>
+  FuncConfig& set(const std::string& key, T v);  // specialized in Function.cpp
+
+protected:
+  std::map<std::string, value> valueMap_;  // key -> untagged scalar value
+};
+
+/**
+ * Argument type for Function::calc().
+ * A BufferArgs holds a list of BufferArg objects,
+ * because a Function can have multiple inputs and outputs.
+ */
+class BufferArgs {
+public:
+  BufferArgs() {}
+  size_t size() const { return args_.size(); }  // number of arguments added
+
+  // Add an argument to the BufferArgs.
+  // Tensor can be Matrix, Vector, IVector.
+  // For inputs, argType does not need to be given.
+  // For outputs, the argType needs to be specified as ASSIGN_TO or ADD_TO.
+  template <typename Tensor>
+  void addArg(const Tensor& arg, ArgType argType = UNSPECIFIED) {
+    args_.push_back(std::make_shared<BufferArg>(arg, argType));
+  }
+
+  // Add an argument to the BufferArgs and reshape it.
+  //
+  // For example, arg represents an image buffer,
+  // but Matrix can only represent a two-dimensional Tensor,
+  // so an extra argument is needed to describe the shape of the image buffer.
+  void addArg(const Matrix& arg,
+              const TensorShape& shape,
+              ArgType argType = UNSPECIFIED);
+
+  void addArg(const CpuSparseMatrix& arg, ArgType argType = UNSPECIFIED);
+  void addArg(const GpuSparseMatrix& arg, ArgType argType = UNSPECIFIED);
+
+  // Get the num-th argument; CHECK-fails when num is out of range.
+  const BufferArg& operator[](size_t num) const {
+    CHECK_LT(num, args_.size());
+    return *args_[num];
+  }
+
+private:
+  std::vector<BufferArgPtr> args_;  // arguments in the order they were added
+};
+
+/**
+ * \brief Base class for Function.
+ * A concrete Function must override the init and calc interfaces.
+ *
+ * Function inputs are read-only. Function outputs have two modes: ASSIGN_TO
+ * and ADD_TO.
+ * If output.getArgType() == ASSIGN_TO, this is assign mode, and the calculation
+ * result of the Function is assigned to the output BufferArg.
+ * If output.getArgType() == ADD_TO, this is add mode, and the calculation
+ * result of the Function is added to the output BufferArg.
+ *
+ * For example:
+ * ASSIGN_TO: output = Function(inputs)
+ * ADD_TO: output += Function(inputs)
+ * If a Function has more than one output, each output can have its own mode.
+ */
+class FunctionBase {
+public:
+  virtual ~FunctionBase() {}
+
+  virtual void init(const FuncConfig& config) {}  // default: does nothing
+
+  virtual void calc(const BufferArgs& inputs, const BufferArgs& outputs) {}
+
+  static ClassRegistrar<FunctionBase> funcRegistrar_;
+};
+
+#define FUNC_NAME(typeName, deviceName) #typeName "-" #deviceName
+// Register className<DEVICE_TYPE_##deviceName> as "typeName-deviceName".
+#define REGISTER_TYPED_FUNC(typeName, deviceName, className)   \
+  static InitFunction __reg_type_##typeName##deviceName([]() { \
+    FunctionBase::funcRegistrar_                               \
+        .registerClass<className<DEVICE_TYPE_##deviceName>>(   \
+            FUNC_NAME(typeName, deviceName));                  \
+  })
+
+}  // namespace paddle
diff --git a/paddle/function/FunctionTest.cpp b/paddle/function/FunctionTest.cpp
new file mode 100644
index 0000000000..7ce908320a
--- /dev/null
+++ b/paddle/function/FunctionTest.cpp
@@ -0,0 +1,59 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "Function.h"
+#include <gtest/gtest.h>
+
+namespace paddle {
+
+template <DeviceType DType>  // per-device check, specialized below
+void FunctionApi(typename Tensor<real, DType>::Matrix& output,
+                 const typename Tensor<real, DType>::Matrix& input);
+
+template <>
+void FunctionApi<DEVICE_TYPE_CPU>(CpuMatrix& output, const CpuMatrix& input) {
+  EXPECT_EQ(output.getHeight(), 100);  // size of cpuOutput in the test below
+  EXPECT_EQ(output.getWidth(), 200);
+}
+
+template <>
+void FunctionApi<DEVICE_TYPE_GPU>(GpuMatrix& output, const GpuMatrix& input) {
+  EXPECT_EQ(output.getHeight(), 10);  // size of gpuOutput in the test below
+  EXPECT_EQ(output.getWidth(), 20);
+}
+
+template <DeviceType DType>
+void Function(const BufferArgs& arguments) {
+  const auto src = arguments[0].matrix<DType>();  // argument 0: input
+  auto dst = arguments[1].matrix<DType>();        // argument 1: output
+  FunctionApi<DType>(dst, src);
+}
+
+TEST(Function, BufferArgs) {  // sizes must match the FunctionApi checks
+  CpuMatrix cpuInput(100, 200);
+  CpuMatrix cpuOutput(100, 200);
+  BufferArgs cpuArguments;
+  cpuArguments.addArg(cpuInput);
+  cpuArguments.addArg(cpuOutput);
+  Function<DEVICE_TYPE_CPU>(cpuArguments);
+  // Same round trip through the GPU matrix type.
+  GpuMatrix gpuInput(10, 20);
+  GpuMatrix gpuOutput(10, 20);
+  BufferArgs gpuArguments;
+  gpuArguments.addArg(gpuInput);
+  gpuArguments.addArg(gpuOutput);
+  Function<DEVICE_TYPE_GPU>(gpuArguments);
+}
+
+}  // namespace paddle
diff --git a/paddle/function/FunctionTest.h b/paddle/function/FunctionTest.h
new file mode 100644
index 0000000000..32131037f6
--- /dev/null
+++ b/paddle/function/FunctionTest.h
@@ -0,0 +1,109 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "Function.h"
+#include "paddle/math/Vector.h"
+#include "paddle/math/tests/TensorCheck.h"
+
+namespace paddle {
+
+class FunctionCompare {  // runs one Function on CPU and GPU, then compares
+public:
+  FunctionCompare(const std::string& name, const FuncConfig& config)
+      : cpu(FunctionBase::funcRegistrar_.createByType(name + "-CPU")),
+        gpu(FunctionBase::funcRegistrar_.createByType(name + "-GPU")) {
+    cpu->init(config);
+    gpu->init(config);
+  }
+
+  void cmpWithArg(const Arguments& inputs,
+                  const Arguments& outputs,
+                  const Arguments& inouts) {
+    // Build matching CPU and GPU argument lists from the given descriptions.
+    auto initArgs = [=](
+        Arguments& cpuArgs, Arguments& gpuArgs, const Arguments& inArgs) {
+      for (const auto& arg : inArgs) {  // by reference: no per-element copy
+        size_t size = sizeof(real);     // byte size of the whole tensor
+        for (const auto dim : arg.dims_) {
+          size *= dim;
+        }
+        if (arg.getData()) {
+          // todo(tianbing), waste unnecessary mem here
+          cpuMemory.emplace_back(std::make_shared<CpuMemoryHandle>(size));
+          gpuMemory.emplace_back(std::make_shared<GpuMemoryHandle>(size));
+          cpuArgs.emplace_back(Tensor((real*)arg.getData(), arg.dims_));
+          gpuArgs.emplace_back(Tensor((real*)arg.getData(), arg.dims_));
+          // Data was supplied by the caller; it is used as-is.
+        } else {
+          cpuMemory.emplace_back(std::make_shared<CpuMemoryHandle>(size));
+          gpuMemory.emplace_back(std::make_shared<GpuMemoryHandle>(size));
+          cpuArgs.emplace_back(
+              Tensor((real*)cpuMemory.back()->getBuf(), arg.dims_));
+          gpuArgs.emplace_back(
+              Tensor((real*)gpuMemory.back()->getBuf(), arg.dims_));
+          // Fill the CPU buffer with random values and mirror them to the GPU.
+          CpuVector cpuVector(size / sizeof(real),
+                              (real*)cpuArgs.back().getData());
+          GpuVector gpuVector(size / sizeof(real),
+                              (real*)gpuArgs.back().getData());
+          cpuVector.uniform(0.001, 1);
+          gpuVector.copyFrom(cpuVector);
+        }
+      }
+    };
+    initArgs(cpuInputs, gpuInputs, inputs);
+    initArgs(cpuOutputs, gpuOutputs, outputs);
+    initArgs(cpuInouts, gpuInouts, inouts);
+
+    // Run the same calculation on both devices.
+    cpu->calc(cpuInputs, cpuOutputs, cpuInouts);
+    gpu->calc(gpuInputs, gpuOutputs, gpuInouts);
+
+    // Compare outputs and inouts element by element.
+    auto checkArgs = [=](const Arguments& cpuArgs, const Arguments& gpuArgs) {
+      for (size_t i = 0; i < cpuArgs.size(); i++) {
+        const auto& cpuArg = cpuArgs[i];  // distinct names: the members
+        const auto& gpuArg = gpuArgs[i];  // `cpu`/`gpu` are no longer shadowed
+        size_t size = 1;
+        for (auto dim : cpuArg.dims_) {
+          size *= dim;
+        }
+        CpuVector cpuVector(size, (real*)cpuArg.getData());
+        GpuVector gpuVector(size, (real*)gpuArg.getData());
+
+        autotest::TensorCheckErr(cpuVector, gpuVector);
+      }
+    };
+    checkArgs(cpuOutputs, gpuOutputs);
+    checkArgs(cpuInouts, gpuInouts);
+  }
+
+  std::shared_ptr<FunctionBase> getCpuFunction() const { return cpu; }
+
+  std::shared_ptr<FunctionBase> getGpuFunction() const { return gpu; }
+
+protected:
+  std::shared_ptr<FunctionBase> cpu;  // created from the "<name>-CPU" key
+  std::shared_ptr<FunctionBase> gpu;  // created from the "<name>-GPU" key
+  std::vector<CpuMemHandlePtr> cpuMemory;  // keeps the CPU buffers alive
+  std::vector<GpuMemHandlePtr> gpuMemory;  // keeps the GPU buffers alive
+  Arguments cpuInputs;
+  Arguments cpuOutputs;
+  Arguments cpuInouts;
+  Arguments gpuInputs;
+  Arguments gpuOutputs;
+  Arguments gpuInouts;
+};
+
+}  // namespace paddle
diff --git a/paddle/function/TensorShape.h b/paddle/function/TensorShape.h
new file mode 100644
index 0000000000..e491e3f1d6
--- /dev/null
+++ b/paddle/function/TensorShape.h
@@ -0,0 +1,97 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <glog/logging.h>
+
+namespace paddle {
+
+/**
+ * TensorShape represents the shape (rank and extents) of a dense tensor.
+ */
+class TensorShape {
+public:
+  TensorShape() : ndims_(0), nelements_(0) { initDims(0); }
+
+  TensorShape(size_t ndims) : ndims_(ndims), nelements_(1) { initDims(ndims); }
+
+  TensorShape(std::initializer_list<size_t> dims) {
+    ndims_ = dims.size();
+    initDims(ndims_);
+    dims_.assign(dims);  // replaces the padded vector with exactly `dims`
+    numElements();
+  }
+
+  TensorShape(const TensorShape& t)
+      : ndims_(t.ndims_), nelements_(t.nelements_) {
+    initDims(ndims_);
+    dims_.assign(t.dims_.begin(), t.dims_.end());
+  }
+
+  // get the size of specified dimension
+  size_t operator[](size_t dim) const {
+    // dim is unsigned, so only the upper bound can be violated.
+    CHECK_LT(dim, ndims_);
+    return dims_[dim];
+  }
+
+  // set the size of specified dimension
+  void setDim(size_t dim, size_t size) {
+    // dim is unsigned, so only the upper bound can be violated.
+    CHECK_LT(dim, ndims_);
+    dims_[dim] = size;
+    numElements();  // keep the cached element count in sync
+  }
+
+  // number of dimensions of the tensor
+  size_t ndims() const { return ndims_; }
+
+  size_t getElements() const { return nelements_; }
+
+  bool operator==(const TensorShape& t) const {
+    if (ndims() != t.ndims()) return false;
+    for (size_t i = 0; i < ndims(); i++) {
+      if (dims_[i] != t.dims_[i]) return false;
+    }
+
+    return true;
+  }
+
+  bool operator!=(const TensorShape& t) const { return !(*this == t); }
+
+private:
+  // compute number of elements
+  void numElements() {
+    nelements_ = 1;
+    for (size_t n = 0; n < ndims_; n++) {
+      nelements_ *= dims_[n];
+    }
+  }
+
+  // init dims_ with at least four entries, all set to 1
+  void initDims(size_t ndims) {
+    size_t count = ndims < 4 ? 4 : ndims;
+    dims_.assign(count, 1);
+  }
+
+  // number of dimensions
+  // ndims_ may not be equal to dims_.size()
+  size_t ndims_;
+  // number of elements
+  size_t nelements_;
+  std::vector<size_t> dims_;
+};
+
+}  // namespace paddle
diff --git a/paddle/function/TensorShapeTest.cpp b/paddle/function/TensorShapeTest.cpp
new file mode 100644
index 0000000000..45a2e106e7
--- /dev/null
+++ b/paddle/function/TensorShapeTest.cpp
@@ -0,0 +1,53 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "TensorShape.h"
+#include <gtest/gtest.h>
+
+namespace paddle {
+
+TEST(TensorShape, Constructor) {
+  TensorShape t1;  // default: no dimensions, zero elements
+  EXPECT_EQ(t1.ndims(), 0);
+  EXPECT_EQ(t1.getElements(), 0);
+
+  TensorShape t2(3);  // rank only: element count starts at 1
+  EXPECT_EQ(t2.ndims(), 3);
+  EXPECT_EQ(t2.getElements(), 1);
+
+  TensorShape t3({8, 10});  // explicit extents
+  EXPECT_EQ(t3.ndims(), 2);
+  EXPECT_EQ(t3.getElements(), 80);
+
+  TensorShape t4(t3);  // copy keeps rank and element count
+  EXPECT_EQ(t4.ndims(), t3.ndims());
+  EXPECT_EQ(t4.getElements(), t3.getElements());
+
+  TensorShape t5({1, 2, 3, 4, 5});  // more than four dimensions
+  EXPECT_EQ(t5.ndims(), 5);
+  EXPECT_EQ(t5.getElements(), 120);
+}
+
+TEST(TensorShape, GetAndSet) {
+  TensorShape t({1, 2, 3});
+  EXPECT_EQ(t.ndims(), 3);
+  EXPECT_EQ(t.getElements(), 6);
+
+  EXPECT_EQ(t[1], 2);
+  t.setDim(1, 100);  // setDim also recomputes the element count
+  EXPECT_EQ(t.getElements(), 300);
+  EXPECT_EQ(t[1], 100);
+}
+
+}  // namespace paddle
diff --git a/paddle/function/TensorType.h b/paddle/function/TensorType.h
new file mode 100644
index 0000000000..98942cff9e
--- /dev/null
+++ b/paddle/function/TensorType.h
@@ -0,0 +1,121 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/math/Matrix.h"
+
+namespace paddle {
+
+enum ValueType {
+  VALUE_TYPE_INT32 = 0,   // DataType<int>::value
+  VALUE_TYPE_FLOAT = 1,   // DataType<float>::value
+  VALUE_TYPE_DOUBLE = 2,  // DataType<double>::value
+  VALUE_TYPE_BYTE = 3     // no DataType<> mapping in this header
+};
+
+enum DeviceType {
+  DEVICE_TYPE_UNSPECIFIED = 0,  // no detail::MatrixT/VectorT specialization
+  DEVICE_TYPE_CPU = 1,          // selects CpuMatrix / CpuVector / CpuIVector
+  DEVICE_TYPE_GPU = 2           // selects GpuMatrix / GpuVector / GpuIVector
+};
+
+// Size in bytes of one element of valueType; LOG(FATAL)s on unknown values.
+inline int sizeOfValuType(ValueType valueType) {
+  if (valueType == VALUE_TYPE_INT32 || valueType == VALUE_TYPE_FLOAT) {
+    return 4;
+  } else if (valueType == VALUE_TYPE_DOUBLE) {
+    return 8;
+  } else if (valueType == VALUE_TYPE_BYTE) {
+    return 1;  // declared enumerator; previously hit the fatal branch
+  }
+  LOG(FATAL) << "Unknown type: " << valueType;
+  return 0;  // keeps every path returning a value
+}
+
+template <typename T>
+struct DataType;  // only the float, double and int specializations follow
+
+template <>
+struct DataType<float> {
+  static const ValueType value = VALUE_TYPE_FLOAT;
+};
+
+template <>
+struct DataType<double> {
+  static const ValueType value = VALUE_TYPE_DOUBLE;
+};
+
+template <>
+struct DataType<int> {
+  static const ValueType value = VALUE_TYPE_INT32;  // int is treated as INT32
+};
+
+namespace detail {  // type-selection helpers consumed by Tensor<> below
+
+template <typename VType, DeviceType Device>
+struct MatrixT;  // maps (value type, device) to a matrix class via ::type
+
+template <>
+struct MatrixT<real, DEVICE_TYPE_CPU> {
+  using type = CpuMatrix;
+};
+
+template <>
+struct MatrixT<real, DEVICE_TYPE_GPU> {
+  using type = GpuMatrix;
+};
+
+template <>
+struct MatrixT<int, DEVICE_TYPE_CPU> {
+  using type = void;  // Not implemented
+};
+
+template <>
+struct MatrixT<int, DEVICE_TYPE_GPU> {
+  using type = void;  // Not implemented
+};
+
+template <typename VType, DeviceType Device>
+struct VectorT;  // maps (value type, device) to a vector class via ::type
+
+template <>
+struct VectorT<real, DEVICE_TYPE_CPU> {
+  using type = CpuVector;
+};
+
+template <>
+struct VectorT<real, DEVICE_TYPE_GPU> {
+  using type = GpuVector;
+};
+
+template <>
+struct VectorT<int, DEVICE_TYPE_CPU> {
+  using type = CpuIVector;
+};
+
+template <>
+struct VectorT<int, DEVICE_TYPE_GPU> {
+  using type = GpuIVector;
+};
+
+}  // namespace detail
+
+template <typename VType, DeviceType DType>
+struct Tensor {  // exposes the matrix/vector classes chosen in detail
+  using Matrix = typename detail::MatrixT<VType, DType>::type;  // void for int
+  using Vector = typename detail::VectorT<VType, DType>::type;
+};
+
+}  // namespace paddle
diff --git a/paddle/function/TensorTypeTest.cpp b/paddle/function/TensorTypeTest.cpp
new file mode 100644
index 0000000000..e50e46f3e9
--- /dev/null
+++ b/paddle/function/TensorTypeTest.cpp
@@ -0,0 +1,64 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "TensorType.h"
+#include <gtest/gtest.h>
+
+namespace paddle {
+
+TEST(TensorType, Matrix) {
+  Tensor<real, DEVICE_TYPE_CPU>::Matrix matrix(100, 200);
+  EXPECT_EQ(matrix.getHeight(), 100U);
+  EXPECT_EQ(matrix.getWidth(), 200U);
+  EXPECT_EQ(matrix.getElementCnt(), 100U * 200U);
+  EXPECT_EQ(matrix.useGpu(), false);
+  // The GPU instantiation must resolve to a matrix that reports useGpu().
+  Tensor<real, DEVICE_TYPE_GPU>::Matrix testGpu(100, 200);
+  EXPECT_EQ(testGpu.useGpu(), true);
+}
+
+TEST(TensorType, Vector) {
+  Tensor<real, DEVICE_TYPE_CPU>::Vector cpuVector(100);
+  Tensor<real, DEVICE_TYPE_GPU>::Vector gpuVector(100);
+  EXPECT_EQ(cpuVector.useGpu(), false);
+  EXPECT_EQ(gpuVector.useGpu(), true);
+  EXPECT_EQ(cpuVector.getSize(), 100U);
+  EXPECT_EQ(gpuVector.getSize(), 100U);
+  // Same checks for the int-valued vector instantiations.
+  Tensor<int, DEVICE_TYPE_CPU>::Vector cpuIVector(100);
+  Tensor<int, DEVICE_TYPE_GPU>::Vector gpuIVector(100);
+  EXPECT_EQ(cpuIVector.useGpu(), false);
+  EXPECT_EQ(gpuIVector.useGpu(), true);
+  EXPECT_EQ(cpuIVector.getSize(), 100U);
+  EXPECT_EQ(gpuIVector.getSize(), 100U);
+}
+
+TEST(TensorType, EmptyMatrix) {
+  CpuMatrix empty(nullptr, 0, 0);  // built from a null data pointer
+  CpuMatrix nonEmpty(10, 10);
+  EXPECT_EQ(empty.isEmpty(), true);
+  EXPECT_EQ(nonEmpty.isEmpty(), false);
+  CHECK(nonEmpty);  // exercises the matrix's conversion to a boolean
+  auto expectDataMatchesTruthiness = [](const CpuMatrix& m) {
+    if (!m) {
+      EXPECT_EQ(m.getData(), nullptr);
+      return;
+    }
+    EXPECT_NE(m.getData(), nullptr);
+  };
+  expectDataMatchesTruthiness(empty);
+  expectDataMatchesTruthiness(nonEmpty);
+}
+
+}  // namespace paddle
diff --git a/paddle/gserver/CMakeLists.txt b/paddle/gserver/CMakeLists.txt
index a066f80c22..4f92150ec8 100644
--- a/paddle/gserver/CMakeLists.txt
+++ b/paddle/gserver/CMakeLists.txt
@@ -27,16 +27,12 @@ if(NOT WITH_GPU)
     list(REMOVE_ITEM GSERVER_HEADER
         layers/CudnnConvLayer.h
         layers/CudnnPoolLayer.h
-        layers/CudnnBatchNormLayer.h
-        layers/NormProjectionLayer.h
-        layers/NormLayer.h)
+        layers/CudnnBatchNormLayer.h)
 
     list(REMOVE_ITEM GSERVER_SOURCES
         layers/CudnnConvLayer.cpp
         layers/CudnnPoolLayer.cpp
-        layers/CudnnBatchNormLayer.cpp
-        layers/NormProjectionLayer.cpp
-        layers/NormLayer.cpp)
+        layers/CudnnBatchNormLayer.cpp)
     compile_cu_as_cpp(layers/LstmCompute.cu)
     compile_cu_as_cpp(layers/GruCompute.cu)
 endif()
diff --git a/paddle/gserver/dataproviders/DataProvider.h b/paddle/gserver/dataproviders/DataProvider.h
index 9b7f7e36ce..9a2ad7567f 100644
--- a/paddle/gserver/dataproviders/DataProvider.h
+++ b/paddle/gserver/dataproviders/DataProvider.h
@@ -30,11 +30,11 @@ limitations under the License. */
 #include "paddle/math/Vector.h"
 #include "paddle/parameter/Argument.h"
 #include "paddle/utils/ClassRegistrar.h"
+#include "paddle/utils/Common.h"
 #include "paddle/utils/Locks.h"
 #include "paddle/utils/Logging.h"
 #include "paddle/utils/Queue.h"
 #include "paddle/utils/ThreadLocal.h"
-#include "paddle/utils/TypeDefs.h"
 #include "paddle/utils/Util.h"
 
 namespace paddle {
diff --git a/paddle/gserver/dataproviders/ProtoDataProvider.cpp b/paddle/gserver/dataproviders/ProtoDataProvider.cpp
index d16ecca2d9..c6f5cab191 100644
--- a/paddle/gserver/dataproviders/ProtoDataProvider.cpp
+++ b/paddle/gserver/dataproviders/ProtoDataProvider.cpp
@@ -22,9 +22,9 @@ limitations under the License. */
 #include "DataProviderGroup.h"
 #include "paddle/utils/Logging.h"
 
-P_DEFINE_double(memory_threshold_on_load_data,
-                1.0,
-                "stop loading data when memory is not sufficient");
+DEFINE_double(memory_threshold_on_load_data,
+              1.0,
+              "stop loading data when memory is not sufficient");
 
 namespace paddle {
 
diff --git a/paddle/gserver/dataproviders/PyDataProvider.cpp b/paddle/gserver/dataproviders/PyDataProvider.cpp
index 5bdd55309c..b53790e764 100644
--- a/paddle/gserver/dataproviders/PyDataProvider.cpp
+++ b/paddle/gserver/dataproviders/PyDataProvider.cpp
@@ -13,8 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "PyDataProvider.h"
-#include <fenv.h>
-#include "paddle/utils/Excepts.h"
+#include "paddle/utils/Common.h"
 #include "paddle/utils/PythonUtil.h"
 #include "paddle/utils/Util.h"
 
diff --git a/paddle/gserver/dataproviders/PyDataProvider2.cpp b/paddle/gserver/dataproviders/PyDataProvider2.cpp
index 460efc5adc..c26e242534 100644
--- a/paddle/gserver/dataproviders/PyDataProvider2.cpp
+++ b/paddle/gserver/dataproviders/PyDataProvider2.cpp
@@ -252,19 +252,9 @@ private:
     // only for instance will make python reference-count error.
     //
     // So here, we increase reference count manually.
-    if (gModuleClsPtrs_.find((uintptr_t)module.get()) !=
-        gModuleClsPtrs_.end()) {
-      // Multi instance use same module
-      Py_XINCREF(module.get());
-      Py_XINCREF(moduleDict.get());
-    } else {
-      gModuleClsPtrs_.insert((uintptr_t)module.get());
-    }
-    if (gModuleClsPtrs_.find((uintptr_t)cls.get()) != gModuleClsPtrs_.end()) {
-      Py_XINCREF(cls.get());
-    } else {
-      gModuleClsPtrs_.insert((uintptr_t)cls.get());
-    }
+    Py_XINCREF(module.get());
+    Py_XINCREF(moduleDict.get());
+    Py_XINCREF(cls.get());
 
     PyObjectPtr fileListInPy = loadPyFileLists(fileListName);
     PyDict_SetItemString(kwargs.get(), "file_list", fileListInPy.get());
@@ -471,7 +461,6 @@ private:
   std::vector<std::string> fileLists_;
   std::vector<SlotHeader> headers_;
   static PyObjectPtr zeroTuple_;
-  static std::unordered_set<uintptr_t> gModuleClsPtrs_;
 
   class PositionRandom {
   public:
@@ -671,7 +660,6 @@ public:
   }
 };
 
-std::unordered_set<uintptr_t> PyDataProvider2::gModuleClsPtrs_;
 PyObjectPtr PyDataProvider2::zeroTuple_(PyTuple_New(0));
 
 REGISTER_DATA_PROVIDER_EX(py2, PyDataProvider2);
diff --git a/paddle/gserver/evaluators/ChunkEvaluator.cpp b/paddle/gserver/evaluators/ChunkEvaluator.cpp
index 3d8af5bcd4..13f02e51fe 100644
--- a/paddle/gserver/evaluators/ChunkEvaluator.cpp
+++ b/paddle/gserver/evaluators/ChunkEvaluator.cpp
@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include <set>
 #include <vector>
 
 #include "paddle/math/Vector.h"
@@ -72,6 +73,7 @@ class ChunkEvaluator : public Evaluator {
 
   std::vector<Segment> labelSegments_;
   std::vector<Segment> outputSegments_;
+  std::set<int> excludedChunkTypes_;
 
 public:
   virtual void init(const EvaluatorConfig& config) {
@@ -105,6 +107,10 @@ public:
     }
     CHECK(config.has_num_chunk_types()) << "Missing num_chunk_types in config";
     otherChunkType_ = numChunkTypes_ = config.num_chunk_types();
+
+    // the chunks of types in excludedChunkTypes_ will not be counted
+    auto& tmp = config.excluded_chunk_types();
+    excludedChunkTypes_.insert(tmp.begin(), tmp.end());
   }
 
   virtual void start() {
@@ -156,7 +162,8 @@ public:
     getSegments(label, length, labelSegments_);
     size_t i = 0, j = 0;
     while (i < outputSegments_.size() && j < labelSegments_.size()) {
-      if (outputSegments_[i] == labelSegments_[j]) {
+      if (outputSegments_[i] == labelSegments_[j] &&
+          excludedChunkTypes_.count(outputSegments_[i].type) != 1) {
         ++numCorrect_;
       }
       if (outputSegments_[i].end < labelSegments_[j].end) {
@@ -168,8 +175,12 @@ public:
         ++j;
       }
     }
-    numLabelSegments_ += labelSegments_.size();
-    numOutputSegments_ += outputSegments_.size();
+    for (auto& segment : labelSegments_) {
+      if (excludedChunkTypes_.count(segment.type) != 1) ++numLabelSegments_;
+    }
+    for (auto& segment : outputSegments_) {
+      if (excludedChunkTypes_.count(segment.type) != 1) ++numOutputSegments_;
+    }
   }
 
   void getSegments(int* label, int length, std::vector<Segment>& segments) {
diff --git a/paddle/gserver/evaluators/Evaluator.cpp b/paddle/gserver/evaluators/Evaluator.cpp
index 7556d21e01..ae7508e2bb 100644
--- a/paddle/gserver/evaluators/Evaluator.cpp
+++ b/paddle/gserver/evaluators/Evaluator.cpp
@@ -17,7 +17,7 @@ limitations under the License. */
 
 #include "paddle/gserver/gradientmachines/NeuralNetwork.h"
 
-P_DECLARE_int32(trainer_id);
+DECLARE_int32(trainer_id);
 
 namespace paddle {
 
@@ -78,7 +78,7 @@ public:
                                               useGpu(arguments[0].deviceId));
     errorMat->zeroMem();
     if (label != nullptr) {
-      errorMat->classificationError(output, label);
+      errorMat->classificationError(*output, *label);
     } else if (dynamic_cast<CpuSparseMatrix*>(multiBinaryLabel.get()) ||
                dynamic_cast<GpuSparseMatrix*>(multiBinaryLabel.get())) {
       errorMat->classificationErrorMulti(
diff --git a/paddle/gserver/gradientmachines/GradientMachine.h b/paddle/gserver/gradientmachines/GradientMachine.h
index 579eca71d4..1e35c7e2b8 100644
--- a/paddle/gserver/gradientmachines/GradientMachine.h
+++ b/paddle/gserver/gradientmachines/GradientMachine.h
@@ -181,12 +181,12 @@ public:
   /**
    * Create an evaluator which can be used for eval()
    */
-  virtual Evaluator* makeEvaluator() = 0;
+  virtual Evaluator* makeEvaluator() const = 0;
 
   /**
    * evaluate using the given evaluator
    */
-  virtual void eval(Evaluator* evaluator) = 0;
+  virtual void eval(Evaluator* evaluator) const = 0;
 
   std::vector<ParameterPtr>& getParameters() { return parameters_; }
 
@@ -212,11 +212,7 @@ public:
    * @note    This function will only been implemented and used in a
    *          multithreaded environment.
    */
-  virtual void start(const TrainerConfig& config,
-                     DataProviderPtr dataProvider) {
-    (void)config;
-    (void)dataProvider;
-  }
+  virtual void start() {}
 
   /**
    * @brief   check  each work-thread whether is failed/error/finish,
diff --git a/paddle/gserver/gradientmachines/MultiGradientMachine.cpp b/paddle/gserver/gradientmachines/MultiGradientMachine.cpp
index a7324f5545..80f223824d 100644
--- a/paddle/gserver/gradientmachines/MultiGradientMachine.cpp
+++ b/paddle/gserver/gradientmachines/MultiGradientMachine.cpp
@@ -21,11 +21,11 @@ limitations under the License. */
 #include "NeuralNetwork.h"
 #include "ParallelNeuralNetwork.h"
 
-P_DEFINE_bool(allow_only_one_model_on_one_gpu,
-              true,
-              "If true, do not allow multiple models on one GPU device");
+DEFINE_bool(allow_only_one_model_on_one_gpu,
+            true,
+            "If true, do not allow multiple models on one GPU device");
 #ifdef PADDLE_METRIC_LEARNING
-P_DECLARE_bool(external);
+DECLARE_bool(external);
 #endif
 
 namespace paddle {
@@ -327,11 +327,11 @@ void MultiGradientMachine::finish() {
   }
 }
 
-Evaluator* MultiGradientMachine::makeEvaluator() {
+Evaluator* MultiGradientMachine::makeEvaluator() const {
   return threads_[0]->getGradientMachine()->makeEvaluator();
 }
 
-void MultiGradientMachine::eval(Evaluator* evaluator) {
+void MultiGradientMachine::eval(Evaluator* evaluator) const {
   for (auto& thread : threads_) {
     SetDevice device(thread->getDeviceId());
     thread->getGradientMachine()->eval(evaluator);
@@ -441,7 +441,7 @@ TrainerThread::TrainerThread(const ModelConfig& config,
 TrainerThread::~TrainerThread() { stop(); }
 
 void TrainerThread::start() {
-  gradientMachine_->start(*(TrainerConfig*)nullptr, (DataProviderPtr) nullptr);
+  gradientMachine_->start();
 
   computeThread_.reset(new std::thread([this]() { computeThread(); }));
 
diff --git a/paddle/gserver/gradientmachines/MultiGradientMachine.h b/paddle/gserver/gradientmachines/MultiGradientMachine.h
index 5f9855c4be..9be15ef4bc 100644
--- a/paddle/gserver/gradientmachines/MultiGradientMachine.h
+++ b/paddle/gserver/gradientmachines/MultiGradientMachine.h
@@ -193,9 +193,9 @@ public:
 
   virtual void finish();
 
-  virtual Evaluator* makeEvaluator();
+  virtual Evaluator* makeEvaluator() const;
 
-  virtual void eval(Evaluator* evaluator);
+  virtual void eval(Evaluator* evaluator) const;
 
   bool useGpu() const { return useGpu_; }
 
diff --git a/paddle/gserver/gradientmachines/MultiNetwork.cpp b/paddle/gserver/gradientmachines/MultiNetwork.cpp
index 6eb3d8db96..5f52a5f3d4 100644
--- a/paddle/gserver/gradientmachines/MultiNetwork.cpp
+++ b/paddle/gserver/gradientmachines/MultiNetwork.cpp
@@ -109,10 +109,9 @@ void MultiNetwork::onPassEnd() {
   }
 }
 
-void MultiNetwork::start(const TrainerConfig& config,
-                         DataProviderPtr dataProvider) {
+void MultiNetwork::start() {
   for (auto& subNetwork : subNetworks_) {
-    subNetwork->start(config, dataProvider);
+    subNetwork->start();
   }
 }
 
@@ -172,7 +171,7 @@ protected:
   std::vector<std::unique_ptr<Evaluator>> evaluators_;
 };
 
-Evaluator* MultiNetwork::makeEvaluator() {
+Evaluator* MultiNetwork::makeEvaluator() const {
   MultiCombinedEvaluator* multiCombinedEvaluator = new MultiCombinedEvaluator();
   for (size_t i = 0; i < subNetworks_.size(); i++) {
     std::unique_ptr<Evaluator> evaluator(subNetworks_[i]->makeEvaluator());
@@ -181,6 +180,6 @@ Evaluator* MultiNetwork::makeEvaluator() {
   return multiCombinedEvaluator;
 }
 
-void MultiNetwork::eval(Evaluator* evaluator) { evaluator->eval(*this); }
+void MultiNetwork::eval(Evaluator* evaluator) const { evaluator->eval(*this); }
 
 }  // namespace paddle
diff --git a/paddle/gserver/gradientmachines/MultiNetwork.h b/paddle/gserver/gradientmachines/MultiNetwork.h
index 89fbf32b4f..3ac2888c57 100644
--- a/paddle/gserver/gradientmachines/MultiNetwork.h
+++ b/paddle/gserver/gradientmachines/MultiNetwork.h
@@ -46,15 +46,15 @@ public:
 
   virtual void onPassEnd();
 
-  virtual Evaluator* makeEvaluator();
+  virtual Evaluator* makeEvaluator() const;
 
-  virtual void eval(Evaluator* evaluator);
+  virtual void eval(Evaluator* evaluator) const;
 
   const std::vector<std::unique_ptr<NeuralNetwork>>& getSubNetworks() const {
     return subNetworks_;
   }
 
-  virtual void start(const TrainerConfig& config, DataProviderPtr dataProvider);
+  virtual void start();
 
   virtual void finish();
 
diff --git a/paddle/gserver/gradientmachines/NeuralNetwork.cpp b/paddle/gserver/gradientmachines/NeuralNetwork.cpp
index ee36a87b9d..22051e07ee 100644
--- a/paddle/gserver/gradientmachines/NeuralNetwork.cpp
+++ b/paddle/gserver/gradientmachines/NeuralNetwork.cpp
@@ -348,7 +348,7 @@ protected:
   std::vector<std::unique_ptr<Evaluator>> evaluators_;
 };
 
-Evaluator* NeuralNetwork::makeEvaluator() {
+Evaluator* NeuralNetwork::makeEvaluator() const {
   CombinedEvaluator* combinedEvaluator = new CombinedEvaluator();
   auto subModelConfig = std::find_if(config_.sub_models().begin(),
                                      config_.sub_models().end(),
@@ -383,7 +383,7 @@ Evaluator* NeuralNetwork::makeEvaluator() {
   return combinedEvaluator;
 }
 
-void NeuralNetwork::eval(Evaluator* evaluator) { evaluator->eval(*this); }
+void NeuralNetwork::eval(Evaluator* evaluator) const { evaluator->eval(*this); }
 
 void NeuralNetwork::setOutputGrad(const std::vector<Argument>& args) {
   CHECK_GE(outputLayers_.size(), args.size());
diff --git a/paddle/gserver/gradientmachines/NeuralNetwork.h b/paddle/gserver/gradientmachines/NeuralNetwork.h
index 384ca88f47..25af4abcf8 100644
--- a/paddle/gserver/gradientmachines/NeuralNetwork.h
+++ b/paddle/gserver/gradientmachines/NeuralNetwork.h
@@ -96,9 +96,9 @@ public:
 
   virtual void onPassEnd();
 
-  virtual Evaluator* makeEvaluator();
+  virtual Evaluator* makeEvaluator() const;
 
-  virtual void eval(Evaluator* evaluator);
+  virtual void eval(Evaluator* evaluator) const;
   virtual void resetState();
   virtual void setOutputGrad(const std::vector<Argument>& args);
 
diff --git a/paddle/gserver/gradientmachines/ParallelNeuralNetwork.cpp b/paddle/gserver/gradientmachines/ParallelNeuralNetwork.cpp
index 980a5851a2..c6e3a3b321 100644
--- a/paddle/gserver/gradientmachines/ParallelNeuralNetwork.cpp
+++ b/paddle/gserver/gradientmachines/ParallelNeuralNetwork.cpp
@@ -131,11 +131,7 @@ void ParallelNeuralNetwork::forwardBackward(const std::vector<Argument>& inArgs,
   backward(callback);
 }
 
-void ParallelNeuralNetwork::start(const TrainerConfig& config,
-                                  DataProviderPtr dataProvider) {
-  (void)config;
-  (void)dataProvider;
-
+void ParallelNeuralNetwork::start() {
   for (auto& thread : threads_) {
     thread->start();
   }
diff --git a/paddle/gserver/gradientmachines/ParallelNeuralNetwork.h b/paddle/gserver/gradientmachines/ParallelNeuralNetwork.h
index 8f445b1ded..39f5682a58 100644
--- a/paddle/gserver/gradientmachines/ParallelNeuralNetwork.h
+++ b/paddle/gserver/gradientmachines/ParallelNeuralNetwork.h
@@ -56,7 +56,7 @@ public:
                        PassType passType,
                        const UpdateCallback &callback = NULL);
 
-  virtual void start(const TrainerConfig &config, DataProviderPtr dataProvider);
+  virtual void start();
 
   void addComputeThread(int deviceId);
 
diff --git a/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp b/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp
index ee1c92bdf5..a9a9f4f903 100644
--- a/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp
+++ b/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp
@@ -24,7 +24,7 @@ limitations under the License. */
 #include "paddle/utils/Stat.h"
 #include "paddle/utils/Util.h"
 
-P_DEFINE_string(diy_beam_search_prob_so, "", "the diy beam search cost so");
+DEFINE_string(diy_beam_search_prob_so, "", "the diy beam search cost so");
 
 static const char* DIY_CALC_PROB_SYMBOL_NAME = "calc_prob";
 static const char* DIY_START_CALC_PROB_SYMBOL_NAME = "start_calc_prob";
@@ -593,7 +593,7 @@ void RecurrentGradientMachine::forwardBackward(
   LOG(FATAL) << "should not use this function";
 }
 
-void RecurrentGradientMachine::eval(Evaluator* evaluator) {
+void RecurrentGradientMachine::eval(Evaluator* evaluator) const {
   // call printers frame by frame
   for (int i = 0; i < maxSequenceLength_; ++i) {
     LOG(INFO) << "Recurrent Layer Group eval frame " << i << " begin";
diff --git a/paddle/gserver/gradientmachines/RecurrentGradientMachine.h b/paddle/gserver/gradientmachines/RecurrentGradientMachine.h
index db7d8aff6d..910ca4376b 100644
--- a/paddle/gserver/gradientmachines/RecurrentGradientMachine.h
+++ b/paddle/gserver/gradientmachines/RecurrentGradientMachine.h
@@ -63,7 +63,7 @@ public:
                        const UpdateCallback& callback);
 
   virtual void resetState() {}
-  virtual void eval(Evaluator* evaluator);
+  virtual void eval(Evaluator* evaluator) const;
 
   const std::vector<int>& getParameterIds() { return parameterIds_; }
 
diff --git a/paddle/gserver/layers/BatchNormalizationLayer.cpp b/paddle/gserver/layers/BatchNormalizationLayer.cpp
index e6a0624636..412762d384 100644
--- a/paddle/gserver/layers/BatchNormalizationLayer.cpp
+++ b/paddle/gserver/layers/BatchNormalizationLayer.cpp
@@ -59,24 +59,14 @@ void BatchNormalizationLayer::calMeanAndStd(const MatrixPtr& mat) {
 
 void BatchNormalizationLayer::calMovingMeanAndVar() {
   // calculating and saving moving mean and variance
-  MatrixPtr movingMean = movingMean_->getW();
-  MatrixPtr movingVar = movingVar_->getW();
-
-  if (!useGpu_ && FLAGS_trainer_count > 1) {
-    auto mvMean = std::dynamic_pointer_cast<SharedCpuMatrix>(movingMean);
-    auto mvVar = std::dynamic_pointer_cast<SharedCpuMatrix>(movingVar);
-    CHECK(mvMean && mvVar);
-
-    mvMean->add(*savedMean_, movingAvgFraction_, 1.0 - movingAvgFraction_);
-    mvVar->add(*savedInvVar_, movingAvgFraction_, 1.0 - movingAvgFraction_);
-  } else {
-    // movingMean =  movingMean * movingAvgFraction_
-    //            + savedMean_ * (1 - movingAvgFraction_)
-    movingMean->add(*savedMean_, movingAvgFraction_, 1.0 - movingAvgFraction_);
-    // movingVar =  movingVar * movingAvgFraction_
-    //           + savedInvVar_ * (1 - movingAvgFraction_)
-    movingVar->add(*savedInvVar_, movingAvgFraction_, 1.0 - movingAvgFraction_);
-  }
+  auto& movingMean = movingMean_->getW();
+  auto& movingVar = movingVar_->getW();
+  // movingMean =  movingMean * movingAvgFraction_
+  //            + savedMean_ * (1 - movingAvgFraction_)
+  movingMean->add(*savedMean_, movingAvgFraction_, 1.0 - movingAvgFraction_);
+  // movingVar =  movingVar * movingAvgFraction_
+  //           + savedInvVar_ * (1 - movingAvgFraction_)
+  movingVar->add(*savedInvVar_, movingAvgFraction_, 1.0 - movingAvgFraction_);
 }
 
 void BatchNormalizationLayer::setMeanAndStd() {
diff --git a/paddle/gserver/layers/BatchNormalizationLayer.h b/paddle/gserver/layers/BatchNormalizationLayer.h
index 052c207732..195acbbfc5 100644
--- a/paddle/gserver/layers/BatchNormalizationLayer.h
+++ b/paddle/gserver/layers/BatchNormalizationLayer.h
@@ -58,6 +58,8 @@ protected:
   /// to batch, channels* imagePixels.
   void shrinkMat(const MatrixPtr& in, MatrixPtr& out);
 
+  void onPassEnd() { firstTest_ = true; }
+
   MatrixPtr tmpMat_, tmpGrad_;
   MatrixPtr expandedIn_, expandedOut_;
   MatrixPtr expandedInGrad_, expandedOutGrad_, inGrad_;
diff --git a/paddle/gserver/layers/ContextProjection.cpp b/paddle/gserver/layers/ContextProjection.cpp
index 7ac56e3a2a..ebcc87cbf4 100644
--- a/paddle/gserver/layers/ContextProjection.cpp
+++ b/paddle/gserver/layers/ContextProjection.cpp
@@ -38,6 +38,32 @@ ContextProjection::ContextProjection(const ProjectionConfig& config,
     CHECK_EQ(inputDim * totalPad, parameter->getSize());
     weight_.reset(new Weight(totalPad, inputDim, parameter));
   }
+  // init forward_ and backward_ functions
+  init();
+}
+
+bool ContextProjection::init() {
+  // Read the projection settings that both functions are configured with.
+  size_t contextLength = config_.context_length();
+  int contextStart = config_.context_start();
+  bool trainablePadding = config_.trainable_padding();
+  size_t totalPad = trainablePadding ? beginPad_ + endPad_ : 0;
+  createFunction(forward_,
+                 "ContextProjectionForward",
+                 FuncConfig()
+                     .set("context_length", contextLength)
+                     .set("context_start", contextStart)
+                     .set("begin_pad", beginPad_));
+  createFunction(backward_,
+                 "ContextProjectionBackward",
+                 FuncConfig()
+                     .set("context_length", contextLength)
+                     .set("context_start", contextStart)
+                     .set("begin_pad", beginPad_)
+                     .set("is_padding", trainablePadding)
+                     .set("total_pad", totalPad));
+  // The return value is unconditionally true.
+  return true;
+}
 
 void ContextProjection::resetState() {
@@ -78,25 +104,31 @@ LayerStatePtr ContextProjection::getState() {
 }
 
 void ContextProjection::forward() {
-  CHECK(in_->value);
+  CHECK(in_->value && out_->value);
   CHECK(in_->sequenceStartPositions);
 
-  auto startPositions = in_->sequenceStartPositions->getVector(useGpu_);
-
-  int64_t inputDim = in_->value->getWidth();
-  int64_t dim = out_->value->getWidth();
-  CHECK_EQ(dim, inputDim * config_.context_length());
+  size_t input_dim = in_->value->getWidth();
+  size_t dim = out_->value->getWidth();
+  CHECK_EQ(dim, input_dim * config_.context_length());
+  // size_t batch_size = in_->value->getHeight();
+  CHECK_EQ(forward_.size(), (size_t)1) << "Only one forward function here";
 
   REGISTER_TIMER_INFO("ContextProjectionForward", getName().c_str());
-  bool isPadding = config_.trainable_padding();
-  out_->value->contextProjectionForward(
-      in_->value,
-      state_ ? state_ : isPadding ? weight_->getW() : nullptr,
-      *startPositions,
-      config_.context_length(),
-      config_.context_start(),
-      beginPad_,
-      state_ ? true : isPadding);
+  bool is_padding = config_.trainable_padding();
+  /// first use state_, otherwise use weight_(padding false === w nullptr)
+  auto w_ptr =
+      state_ ? state_.get() : is_padding ? weight_->getW().get() : nullptr;
+  auto start_pos = in_->sequenceStartPositions;
+
+  BufferArgs inputs;
+  BufferArgs outputs;
+  inputs.addArg(*in_->value);
+  inputs.addArg(CpuMatrix(w_ptr ? w_ptr->getData() : nullptr,
+                          w_ptr ? w_ptr->getHeight() : 0,
+                          input_dim));
+  inputs.addArg(*in_->sequenceStartPositions->getVector(useGpu_));
+  outputs.addArg(*out_->value, ADD_TO);
+  forward_[0]->calc(inputs, outputs);
 
   if (state_ && config_.context_start() < 0) {
     CHECK_EQ(1, in_->getNumSequences());
@@ -118,41 +150,30 @@ void ContextProjection::forward() {
 }
 
 void ContextProjection::backward(const UpdateCallback& callback) {
-  CHECK(in_->value);
-  int64_t inputDim = in_->value->getWidth();
-  int64_t dim = out_->value->getWidth();
-  CHECK_EQ(dim, inputDim * config_.context_length());
-  auto startPositions = in_->sequenceStartPositions->getVector(useGpu_);
+  CHECK(in_->value && out_->value && out_->grad);
+  size_t input_dim = in_->value->getWidth();
+  size_t dim = out_->value->getWidth();
+  CHECK_EQ(dim, input_dim * config_.context_length());
+  size_t batch_size = in_->value->getHeight();
+  CHECK_EQ(batch_size, out_->value->getHeight());
+  CHECK_EQ(static_cast<int>(backward_.size()), 1)
+      << "Only one backward function here";
 
   REGISTER_TIMER_INFO("ContextProjectionBackward", getName().c_str());
-  bool isPadding = config_.trainable_padding();
-  if (!out_->grad->useGpu()) {
-    out_->grad->contextProjectionBackward(
-        in_->grad,
-        isPadding ? weight_->getWGrad() : nullptr,
-        *startPositions,
-        config_.context_length(),
-        config_.context_start(),
-        beginPad_,
-        isPadding);
-  } else {
-    if (in_->grad) {
-      out_->grad->contextProjectionBackwardData(in_->grad,
-                                                *startPositions,
-                                                config_.context_length(),
-                                                config_.context_start());
-    }
-
-    if (isPadding && weight_->getWGrad()) {
-      out_->grad->contextProjectionBackwardWeight(
-          weight_->getWGrad(),
-          *startPositions,
-          config_.context_length(),
-          config_.context_start(),
-          weight_->getWGrad()->getHeight(),
-          beginPad_);
-    }
-  }
+  bool is_padding = config_.trainable_padding();
+  auto start_pos = in_->sequenceStartPositions;
+  auto w_ptr = is_padding ? weight_->getWGrad() : nullptr;
+
+  BufferArgs inputs;
+  BufferArgs outputs;
+  inputs.addArg(CpuMatrix(
+      in_->grad ? in_->grad->getData() : nullptr, batch_size, input_dim));
+  inputs.addArg(CpuMatrix(w_ptr ? w_ptr->getData() : nullptr,
+                          w_ptr ? w_ptr->getHeight() : 0,
+                          input_dim));
+  inputs.addArg(*in_->sequenceStartPositions->getVector(useGpu_));
+  outputs.addArg(*out_->grad, ADD_TO);
+  backward_[0]->calc(inputs, outputs);
 
   if (config_.trainable_padding()) {
     weight_->getParameterPtr()->incUpdate(callback);
diff --git a/paddle/gserver/layers/ContextProjection.h b/paddle/gserver/layers/ContextProjection.h
index 2df43bd04f..c87d6ed1d6 100644
--- a/paddle/gserver/layers/ContextProjection.h
+++ b/paddle/gserver/layers/ContextProjection.h
@@ -61,6 +61,8 @@ public:
 
   virtual LayerStatePtr getState();
 
+  virtual bool init();
+
 protected:
   std::unique_ptr<Weight> weight_;
   /// number of extra timesteps added at the beginning
diff --git a/paddle/gserver/layers/ConvProjection.cpp b/paddle/gserver/layers/ConvProjection.cpp
index aa634b3287..0281170bc5 100644
--- a/paddle/gserver/layers/ConvProjection.cpp
+++ b/paddle/gserver/layers/ConvProjection.cpp
@@ -130,6 +130,11 @@ void ConvProjection::reshapeTensorDesc(int batchSize) {
 void ConvProjection::reshape(int batchSize) {
   size_t width = calOutputSize();
   CHECK_EQ(width, out_->value->getWidth());
+  CHECK_EQ(static_cast<size_t>(channels_ * imageH_ * imageW_),
+           in_->value->getWidth())
+      << "Wrong input size for convolution"
+      << " channels=" << channels_ << " imageH=" << imageH_
+      << " imageW=" << imageW_ << " inputSize=" << in_->value->getWidth();
 
   isSelectAlgo_ = (batchSize == batchNum_);
   batchNum_ = batchSize;
diff --git a/paddle/gserver/layers/ConvexCombinationLayer.cpp b/paddle/gserver/layers/ConvexCombinationLayer.cpp
index 3f4d77a2fe..ed57f2af3c 100644
--- a/paddle/gserver/layers/ConvexCombinationLayer.cpp
+++ b/paddle/gserver/layers/ConvexCombinationLayer.cpp
@@ -113,7 +113,7 @@ void ConvexCombinationLayer::forward(PassType passType) {
     tmpRow0->setData(inV0->getData() + i * weightDim);
     tmpRow1->setData(outV->getData() + i * dataDim);
 
-    tmpRow1->mul(tmpRow0, tmpMtx0, 1, 0);
+    tmpRow1->mul(*tmpRow0, *tmpMtx0, 1, 0);
   }
 }
 
@@ -136,7 +136,7 @@ void ConvexCombinationLayer::backward(const UpdateCallback& callback) {
       tmpRow1->setData(outG->getData() + i * dataDim);
       tmpMtx0->setData(inV1->getData() + i * weightDim * dataDim);
 
-      tmpRow0->mul(tmpRow1, tmpMtx0->getTranspose(), 1, 1);
+      tmpRow0->mul(*tmpRow1, *(tmpMtx0->getTranspose()), 1, 1);
     }
   }
 
@@ -146,7 +146,7 @@ void ConvexCombinationLayer::backward(const UpdateCallback& callback) {
       tmpRow1->setData(outG->getData() + i * dataDim);
       tmpMtx0->setData(inG1->getData() + i * weightDim * dataDim);
 
-      tmpMtx0->mul(tmpRow0->getTranspose(), tmpRow1, 1, 1);
+      tmpMtx0->mul(*(tmpRow0->getTranspose()), *tmpRow1, 1, 1);
     }
   }
 }
diff --git a/paddle/gserver/layers/DataLayer.cpp b/paddle/gserver/layers/DataLayer.cpp
index 66f0606a38..3551df4e17 100644
--- a/paddle/gserver/layers/DataLayer.cpp
+++ b/paddle/gserver/layers/DataLayer.cpp
@@ -54,7 +54,7 @@ void DataLayer::copyDataToOutput(Argument& output) {
     output.setFrameWidth(config_.width());
   } else {
     output.setFrameHeight(data_.getFrameHeight());
-    output.setFrameHeight(data_.getFrameHeight());
+    output.setFrameWidth(data_.getFrameWidth());
   }
   output.cpuSequenceDims = data_.cpuSequenceDims;
   output.sequenceStartPositions = data_.sequenceStartPositions;
diff --git a/paddle/gserver/layers/ExpandConvBaseLayer.cpp b/paddle/gserver/layers/ExpandConvBaseLayer.cpp
index 25948747fe..9ddccc2027 100644
--- a/paddle/gserver/layers/ExpandConvBaseLayer.cpp
+++ b/paddle/gserver/layers/ExpandConvBaseLayer.cpp
@@ -150,7 +150,7 @@ void ExpandConvBaseLayer::expandFwdOnce(MatrixPtr image,
         Matrix::create(wgtData, subM, subK, false, useGpu_);  // mark transpose
     MatrixPtr B = Matrix::create(expInData, subK, subN, false, useGpu_);
     MatrixPtr C = Matrix::create(outData, subM, subN, false, useGpu_);
-    C->mul(A, B, 1, 1);
+    C->mul(*A, *B, 1, 1);
 
     A->clear();
     B->clear();
@@ -185,7 +185,7 @@ void ExpandConvBaseLayer::bpropActs(MatrixPtr out,
       MatrixPtr C = Matrix::create(expandInData, subK, subN, false, useGpu_);
       MatrixPtr B = Matrix::create(localGradData, subM, subN, false, useGpu_);
       MatrixPtr A = Matrix::create(wgtData, subM, subK, true, useGpu_);
-      C->mul(A, B);  // mul
+      C->mul(*A, *B);  // mul
 
       // clear the temporary matrix
       A->clear();
@@ -252,7 +252,7 @@ void ExpandConvBaseLayer::bpropWeights(MatrixPtr image,
       MatrixPtr A = Matrix::create(expandInData, subK, subN, true, useGpu_);
       MatrixPtr B = Matrix::create(gradData, subM, subN, false, useGpu_);
       MatrixPtr C = Matrix::create(wGradData, subM, subK, false, useGpu_);
-      C->mul(B, A, 1, 1);
+      C->mul(*B, *A, 1, 1);
 
       A->clear();
       B->clear();
diff --git a/paddle/gserver/layers/FullMatrixProjection.cpp b/paddle/gserver/layers/FullMatrixProjection.cpp
index 9e72a33a3c..b8b6f403d6 100644
--- a/paddle/gserver/layers/FullMatrixProjection.cpp
+++ b/paddle/gserver/layers/FullMatrixProjection.cpp
@@ -28,7 +28,7 @@ FullMatrixProjection::FullMatrixProjection(const ProjectionConfig& config,
 
 void FullMatrixProjection::forward() {
   REGISTER_TIMER_INFO("FwMulTimer", getName().c_str());
-  out_->value->mul(in_->value, weight_->getW(), 1, 1);
+  out_->value->mul(*(in_->value), *(weight_->getW()), 1, 1);
 }
 
 void FullMatrixProjection::backward(const UpdateCallback& callback) {
@@ -37,7 +37,8 @@ void FullMatrixProjection::backward(const UpdateCallback& callback) {
   /* Calculate the W-gradient for the current layer */
   if (weight_->getWGrad()) {
     REGISTER_TIMER_INFO("GradMulTimer", getName().c_str());
-    weight_->getWGrad()->mul(in_->value->getTranspose(), out_->grad, 1, 1);
+    weight_->getWGrad()->mul(
+        *(in_->value->getTranspose()), *(out_->grad), 1, 1);
   }
 
   // If callback does not change value, backward propagation error
@@ -47,7 +48,7 @@ void FullMatrixProjection::backward(const UpdateCallback& callback) {
   /* Calculate the input layers error */
   if (in_->grad) {
     REGISTER_TIMER_INFO("BpMulTimer", getName().c_str());
-    in_->grad->mul(out_->grad, weight_->getW()->getTranspose(), 1, 1);
+    in_->grad->mul(*(out_->grad), *(weight_->getW()->getTranspose()), 1, 1);
   }
 
   hl_set_sync_flag(syncFlag);
diff --git a/paddle/gserver/layers/FullyConnectedLayer.cpp b/paddle/gserver/layers/FullyConnectedLayer.cpp
index 89afe33c36..d8a667ff8d 100644
--- a/paddle/gserver/layers/FullyConnectedLayer.cpp
+++ b/paddle/gserver/layers/FullyConnectedLayer.cpp
@@ -84,8 +84,8 @@ void FullyConnectedLayer::forward(PassType passType) {
     auto input = getInput(i);
     CHECK(input.value) << "The input of 'fc' layer must be matrix";
     REGISTER_TIMER_INFO("FwMulTimer", getName().c_str());
-    i == 0 ? outV->mul(input.value, weights_[i]->getW(), 1, 0)
-           : outV->mul(input.value, weights_[i]->getW(), 1, 1);
+    i == 0 ? outV->mul(*input.value, *weights_[i]->getW(), 1, 0)
+           : outV->mul(*input.value, *weights_[i]->getW(), 1, 1);
   }
 
   /* add the bias-vector */
@@ -123,7 +123,7 @@ void FullyConnectedLayer::backward(const UpdateCallback& callback) {
       MatrixPtr oGrad = getOutputGrad();
       {
         REGISTER_TIMER_INFO("GradMulTimer", getName().c_str());
-        weights_[i]->getWGrad()->mul(input_T, oGrad, 1, 1);
+        weights_[i]->getWGrad()->mul(*input_T, *oGrad, 1, 1);
       }
     }
 
@@ -136,7 +136,7 @@ void FullyConnectedLayer::backward(const UpdateCallback& callback) {
     if (NULL != preGrad) {
       MatrixPtr weights_T = weights_[i]->getW()->getTranspose();
       REGISTER_TIMER_INFO("BpMulTimer", getName().c_str());
-      preGrad->mul(getOutputGrad(), weights_T, 1, 1);
+      preGrad->mul(*getOutputGrad(), *weights_T, 1, 1);
     }
 
     hl_set_sync_flag(syncFlag);
diff --git a/paddle/gserver/layers/GruCompute.h b/paddle/gserver/layers/GruCompute.h
index 42c0019319..3340e38e62 100644
--- a/paddle/gserver/layers/GruCompute.h
+++ b/paddle/gserver/layers/GruCompute.h
@@ -16,7 +16,7 @@ limitations under the License. */
 
 #include "ModelConfig.pb.h"
 #include "hl_gpu.h"
-#include "paddle/utils/TypeDefs.h"
+#include "paddle/utils/Common.h"
 
 namespace paddle {
 
diff --git a/paddle/gserver/layers/Layer.cpp b/paddle/gserver/layers/Layer.cpp
index c9e121047b..c47943f81c 100644
--- a/paddle/gserver/layers/Layer.cpp
+++ b/paddle/gserver/layers/Layer.cpp
@@ -33,7 +33,7 @@ limitations under the License. */
 #include "TransLayer.h"
 #include "ValidationLayer.h"
 
-P_DEFINE_bool(log_error_clipping, false, "enable log error clipping or not");
+DEFINE_bool(log_error_clipping, false, "enable log error clipping or not");
 
 namespace paddle {
 
diff --git a/paddle/gserver/layers/Layer.h b/paddle/gserver/layers/Layer.h
index 172e558b82..6dfd48fb96 100644
--- a/paddle/gserver/layers/Layer.h
+++ b/paddle/gserver/layers/Layer.h
@@ -18,6 +18,7 @@ limitations under the License. */
 #include <functional>
 #include <memory>
 #include "ModelConfig.pb.h"
+#include "paddle/function/Function.h"
 #include "paddle/math/CpuSparseMatrix.h"
 #include "paddle/parameter/Parameter.h"
 #include "paddle/utils/ClassRegistrar.h"
@@ -100,6 +101,11 @@ protected:
   /// Mark input grad in(true) or out(false) of backward function.
   std::vector<bool> markInBackward_;
 
+  /// Layer forward function
+  std::vector<std::shared_ptr<FunctionBase>> forward_;
+  /// Layer backward function
+  std::vector<std::shared_ptr<FunctionBase>> backward_;
+
 public:
   /**
     * Wait until all input value ready.
@@ -126,6 +132,26 @@ public:
   virtual void markAllInputGrad();
 
 protected:
+  /**
+   * Create layer function. Function is called in forward or backward.
+   * \param function, Layer::forward_ or Layer::backward_
+   * \param name, function name
+   * \param config, initialization configuration for the function
+   */
+  void createFunction(std::vector<std::shared_ptr<FunctionBase>>& function,
+                      const std::string& name,
+                      const FuncConfig& config) {
+    if (useGpu_) {
+      function.emplace_back(
+          FunctionBase::funcRegistrar_.createByType(name + "-GPU"));
+    } else {
+      function.emplace_back(
+          FunctionBase::funcRegistrar_.createByType(name + "-CPU"));
+    }
+    auto& func = function.back();
+    func->init(config);
+  }
+
   /**
    * Notify specified layer the output grad ready.
    * Called in the backward function.
diff --git a/paddle/gserver/layers/LinearChainCRF.cpp b/paddle/gserver/layers/LinearChainCRF.cpp
index af550c7a01..b7f748f3bb 100644
--- a/paddle/gserver/layers/LinearChainCRF.cpp
+++ b/paddle/gserver/layers/LinearChainCRF.cpp
@@ -59,7 +59,7 @@ real LinearChainCRF::forward(real* x, int* s, int length) {
   matX->rowMax(*maxX_);
   expX_->assign(*matX);
   // subtract max to avoid overflow or underflow
-  expX_->mul(maxX_, ones_, (real)-1, (real)1);
+  expX_->mul(*maxX_, *ones_, (real)-1, (real)1);
   expX_->exp2();
 
   real* a = a_->getData();
diff --git a/paddle/gserver/layers/LstmCompute.h b/paddle/gserver/layers/LstmCompute.h
index 140a4c6ecf..2588fad279 100644
--- a/paddle/gserver/layers/LstmCompute.h
+++ b/paddle/gserver/layers/LstmCompute.h
@@ -16,7 +16,7 @@ limitations under the License. */
 
 #include "ModelConfig.pb.h"
 #include "hl_gpu.h"
-#include "paddle/utils/TypeDefs.h"
+#include "paddle/utils/Common.h"
 
 namespace paddle {
 
diff --git a/paddle/gserver/layers/LstmLayer.cpp b/paddle/gserver/layers/LstmLayer.cpp
index 452091eff4..01cc5fec8b 100644
--- a/paddle/gserver/layers/LstmLayer.cpp
+++ b/paddle/gserver/layers/LstmLayer.cpp
@@ -17,7 +17,7 @@ limitations under the License. */
 #include "paddle/math/Matrix.h"
 #include "paddle/utils/Stat.h"
 
-P_DECLARE_bool(prev_batch_state);
+DECLARE_bool(prev_batch_state);
 
 namespace paddle {
 
@@ -316,7 +316,7 @@ void LstmLayer::forwardSequence(int batchSize,
     }
     if (prevOutput_) {
       frameGate->setData(lstmValue.gateValue);
-      frameGate->mul(prevOutput_, weight_->getW(), 1, 1);
+      frameGate->mul(*prevOutput_, *weight_->getW(), 1, 1);
     }
   }
   AsyncGpuBlock asyncGpuBlock;
@@ -338,7 +338,7 @@ void LstmLayer::forwardSequence(int batchSize,
         frameOutput->setData(lstmValue.outputValue);
         nextFrame(reversed_, getSize());
         frameGate->setData(lstmValue.gateValue);
-        frameGate->mul(frameOutput, weight_->getW(), 1, 1);
+        frameGate->mul(*frameOutput, *weight_->getW(), 1, 1);
       }
     }
     if (n != numSequences - 1) {
@@ -348,7 +348,7 @@ void LstmLayer::forwardSequence(int batchSize,
       if (!reversed_) {
         if (!prevState_) lstmValue.prevStateValue = nullptr;
         if (prevOutput_) {
-          frameGate->mul(frameOutput, weight_->getW(), 1, 1);
+          frameGate->mul(*frameOutput, *weight_->getW(), 1, 1);
         }
       } else {
         lstmValue.prevStateValue = nullptr;
@@ -470,7 +470,7 @@ void LstmLayer::backwardSequence(int batchSize,
           frameGate->setData(lstmGrad.gateGrad);
           nextFrame(reversed_, getSize());
           frameOutput->setData(lstmGrad.outputGrad);
-          frameOutput->mul(frameGate, weightT, 1, 1);
+          frameOutput->mul(*frameGate, *weightT, 1, 1);
         } else {
           nextFrame(reversed_, getSize());
         }
@@ -479,14 +479,14 @@ void LstmLayer::backwardSequence(int batchSize,
       if (weight_->getWGrad()) {
         if (!reversed_) {
           weight_->getWGrad()->mul(
-              output_.value->subMatrix(start, length - 1)->getTranspose(),
-              gate_.grad->subMatrix(start + 1, length - 1),
+              *output_.value->subMatrix(start, length - 1)->getTranspose(),
+              *gate_.grad->subMatrix(start + 1, length - 1),
               1,
               1);
         } else {
           weight_->getWGrad()->mul(
-              output_.value->subMatrix(start + 1, length - 1)->getTranspose(),
-              gate_.grad->subMatrix(start, length - 1),
+              *output_.value->subMatrix(start + 1, length - 1)->getTranspose(),
+              *gate_.grad->subMatrix(start, length - 1),
               1,
               1);
         }
@@ -541,7 +541,7 @@ void LstmLayer::forwardBatch(int batchSize,
 
       if (n != 0) {
         MatrixPtr batch1 = batchValue_->getBatchValue(n - 1, batchSize);
-        gateValue->mul(batch1, weight_->getW(), 1, 1);
+        gateValue->mul(*batch1, *weight_->getW(), 1, 1);
       } else if (prevOutput_) {
         Matrix::resizeOrCreate(prevBatchOutput2_,
                                gateValue->getHeight(),
@@ -549,7 +549,7 @@ void LstmLayer::forwardBatch(int batchSize,
                                false,
                                useGpu_);
         batchValue_->prevOutput2Batch(*prevOutput_, *prevBatchOutput2_);
-        gateValue->mul(prevBatchOutput2_, weight_->getW(), 1, 1);
+        gateValue->mul(*prevBatchOutput2_, *weight_->getW(), 1, 1);
 
         batchValue_->prevOutput2Batch(*prevState_,
                                       *totalState_->subMatrix(0, numSequences));
@@ -672,16 +672,16 @@ void LstmLayer::backwardBatch(int batchSize,
 
       if (n != 0) {
         MatrixPtr tmp = batchGrad_->getBatchValue(n - 1, batchSize);
-        tmp->mul(gateGrad, weightT, 1, 1);
+        tmp->mul(*gateGrad, *weightT, 1, 1);
       }
 
       if (n != 0 && weight_->getWGrad()) {
         /* backward weight */
         MatrixPtr outputValue = batchValue_->getBatchValue(n - 1, batchSize);
-        weight_->getWGrad()->mul(outputValue->getTranspose(), gateGrad, 1, 1);
+        weight_->getWGrad()->mul(*outputValue->getTranspose(), *gateGrad, 1, 1);
       } else if (prevOutput_ && weight_->getWGrad()) {
         weight_->getWGrad()->mul(
-            prevBatchOutput2_->getTranspose(), gateGrad, 1, 1);
+            *prevBatchOutput2_->getTranspose(), *gateGrad, 1, 1);
       }
     }
   }
diff --git a/paddle/gserver/layers/MDLstmLayer.cpp b/paddle/gserver/layers/MDLstmLayer.cpp
index 1243c12889..fb41af5631 100644
--- a/paddle/gserver/layers/MDLstmLayer.cpp
+++ b/paddle/gserver/layers/MDLstmLayer.cpp
@@ -547,7 +547,7 @@ void MDLstmLayer::forwardOneSequence(int start, CoordIterator& coordIter) {
       if (coordIter.getPrePos(delays_, i, prePos)) {
         int preOffset = coordIter.offset(prePos);
         frameGate_[start + offset].value->mul(
-            frameOutput_[start + preOffset].value, weight_->getW(), 1.0, 1.0);
+            *frameOutput_[start + preOffset].value, *weight_->getW(), 1.0, 1.0);
       }
     }
     forwardGate2OutputSequence(start, coordIter);
@@ -747,11 +747,11 @@ void MDLstmLayer::backwardOneSequence(int start, CoordIterator& coordIter) {
       if (coordIter.getPrePos(delays_, i, prePos)) {
         int preOffset = coordIter.offset(prePos);
         frameOutput_[start + preOffset].grad->mul(
-            frameGate_[start + offset].grad, weightT, 1.0, 1.0);
+            *frameGate_[start + offset].grad, *weightT, 1.0, 1.0);
         if (weight_->getWGrad()) {
           weight_->getWGrad()->mul(
-              frameOutput_[start + preOffset].value->getTranspose(),
-              frameGate_[start + offset].grad,
+              *frameOutput_[start + preOffset].value->getTranspose(),
+              *frameGate_[start + offset].grad,
               1.0,
               1.0);
         }
diff --git a/paddle/gserver/layers/MultinomialSampler.h b/paddle/gserver/layers/MultinomialSampler.h
index 677b047029..546ef9c1f2 100644
--- a/paddle/gserver/layers/MultinomialSampler.h
+++ b/paddle/gserver/layers/MultinomialSampler.h
@@ -16,7 +16,7 @@ limitations under the License. */
 
 #include <memory>
 #include <random>
-#include "paddle/utils/TypeDefs.h"
+#include "paddle/utils/Common.h"
 
 namespace paddle {
 
diff --git a/paddle/gserver/layers/NormLayer.h b/paddle/gserver/layers/NormLayer.h
index 86255b231b..011bab8fde 100644
--- a/paddle/gserver/layers/NormLayer.h
+++ b/paddle/gserver/layers/NormLayer.h
@@ -50,7 +50,7 @@ public:
 class ResponseNormLayer : public NormLayer {
 protected:
   size_t channels_, size_, outputX_, imgSize_, outputY_, imgSizeY_;
-  float scale_, pow_;
+  real scale_, pow_;
   MatrixPtr denoms_;
 
 public:
diff --git a/paddle/gserver/layers/NormProjectionLayer.cpp b/paddle/gserver/layers/NormProjectionLayer.cpp
index 934fc31e0a..4331009de7 100644
--- a/paddle/gserver/layers/NormProjectionLayer.cpp
+++ b/paddle/gserver/layers/NormProjectionLayer.cpp
@@ -45,50 +45,57 @@ bool CMRProjectionNormLayer::init(const LayerMap& layerMap,
   /* the size of inputs for norm-layer is 1 */
   CHECK_EQ(config_.inputs_size(), 1);
 
+  createFunction(
+      forward_,
+      "CrossMapNormal",
+      FuncConfig().set("size", size_).set("scale", scale_).set("pow", pow_));
+  createFunction(
+      backward_,
+      "CrossMapNormalGrad",
+      FuncConfig().set("size", size_).set("scale", scale_).set("pow", pow_));
+
   return true;
 }
 
 void CMRProjectionNormLayer::forward(PassType passType) {
   Layer::forward(passType);
-
   /* malloc memory for the output_ if necessary */
   /* note: one sample correspond to one row */
   MatrixPtr input = inputLayers_[0]->getOutputValue();
-  int batchSize = input->getHeight();
+  size_t batchSize = input->getHeight();
   int size = getSize();
   resetOutput(batchSize, size);
 
-  MatrixPtr outV = getOutputValue();
-
   Matrix::resizeOrCreate(denoms_, batchSize, size, /* trans */ false, useGpu_);
 
-  denoms_->zeroMem();
+  shape_ = TensorShape({batchSize, channels_, imgSizeH_, imgSizeW_});
 
-  outV->crossMapNormalFwd(
-      *input, imgSizeH_, imgSizeW_, *denoms_, channels_, size_, scale_, pow_);
+  // prepare forward arguments
+  BufferArgs inputs;
+  BufferArgs outputs;
+  inputs.addArg(*getInputValue(0), shape_);
+  outputs.addArg(*getOutputValue(), shape_, ASSIGN_TO);
+  outputs.addArg(*denoms_, shape_, ASSIGN_TO);
+
+  forward_[0]->calc(inputs, outputs);
 }
 
 void CMRProjectionNormLayer::backward(const UpdateCallback& callback) {
   (void)callback;
 
-  if (NULL == inputLayers_[0]->getOutputGrad()) {
+  if (NULL == getInputGrad(0)) {
     return;
   }
-  /* Do derivation */
-  MatrixPtr preOutGrad = inputLayers_[0]->getOutputGrad();
-  MatrixPtr localGrad = getOutputGrad();
-  MatrixPtr localOutV = getOutputValue();
-  MatrixPtr preOutV = inputLayers_[0]->getOutputValue();
-
-  preOutGrad->crossMapNormalBwd(*localGrad,
-                                *denoms_,
-                                *preOutV,
-                                *localOutV,
-                                channels_,
-                                imgSizeH_,
-                                imgSizeW_,
-                                size_,
-                                scale_,
-                                pow_);
+
+  // prepare backward arguments
+  BufferArgs inputs;
+  BufferArgs outputs;
+  inputs.addArg(*getInputValue(0), shape_);
+  inputs.addArg(*getOutputValue(), shape_);
+  inputs.addArg(*getOutputGrad(), shape_);
+  inputs.addArg(*denoms_, shape_);
+  outputs.addArg(*getInputGrad(0), shape_, ADD_TO);
+
+  backward_[0]->calc(inputs, outputs);
 }
 }  // namespace paddle
diff --git a/paddle/gserver/layers/NormProjectionLayer.h b/paddle/gserver/layers/NormProjectionLayer.h
index 4f7b638334..2c0d8a3a71 100644
--- a/paddle/gserver/layers/NormProjectionLayer.h
+++ b/paddle/gserver/layers/NormProjectionLayer.h
@@ -39,5 +39,8 @@ public:
   bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
   void forward(PassType passType);
   void backward(const UpdateCallback& callback = nullptr);
+
+protected:
+  TensorShape shape_;
 };
 }  // namespace paddle
diff --git a/paddle/gserver/layers/OuterProdLayer.cpp b/paddle/gserver/layers/OuterProdLayer.cpp
index cf9a008318..b606e44365 100644
--- a/paddle/gserver/layers/OuterProdLayer.cpp
+++ b/paddle/gserver/layers/OuterProdLayer.cpp
@@ -96,7 +96,7 @@ void OuterProdLayer::forward(PassType passType) {
       tmpRow0->setData(inV0->getData() + i * dim0);
       tmpRow1->setData(inV1->getData() + i * dim1);
 
-      tmpMtx0->mul(tmpRow0->getTranspose(), tmpRow1);
+      tmpMtx0->mul(*tmpRow0->getTranspose(), *tmpRow1);
     }
   }
 }
@@ -121,7 +121,7 @@ void OuterProdLayer::backward(const UpdateCallback& callback) {
         tmpRow0->setData(inG0->getData() + i * dim0);
         tmpRow1->setData(inV1->getData() + i * dim1);
 
-        tmpRow0->mul(tmpRow1, tmpMtx0->getTranspose(), 1, 1);
+        tmpRow0->mul(*tmpRow1, *tmpMtx0->getTranspose(), 1, 1);
       }
     }
 
@@ -131,7 +131,7 @@ void OuterProdLayer::backward(const UpdateCallback& callback) {
         tmpRow0->setData(inV0->getData() + i * dim0);
         tmpRow1->setData(inG1->getData() + i * dim1);
 
-        tmpRow1->mul(tmpRow0, tmpMtx0, 1, 1);
+        tmpRow1->mul(*tmpRow0, *tmpMtx0, 1, 1);
       }
     }
   }
diff --git a/paddle/gserver/layers/PriorBox.cpp b/paddle/gserver/layers/PriorBox.cpp
new file mode 100644
index 0000000000..36ace7597c
--- /dev/null
+++ b/paddle/gserver/layers/PriorBox.cpp
@@ -0,0 +1,149 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "Layer.h"
+#include "paddle/math/BaseMatrix.h"
+#include "paddle/math/Matrix.h"
+
+namespace paddle {
+/**
+ * @brief A layer for generating priorbox locations and variances.
+ * - Input: Two and only two input layers are accepted. The input layers must
+ *        be a data output layer and a convolution output layer.
+ * - Output: The priorbox locations and variances of the input data.
+ * Reference:
+ *    Wei Liu, Dragomir Anguelov, Dumitru Erhan, Christian Szegedy, Scott Reed,
+ *    Cheng-Yang Fu, Alexander C. Berg. SSD: Single Shot MultiBox Detector
+ */
+
+class PriorBoxLayer : public Layer {
+public:
+  explicit PriorBoxLayer(const LayerConfig& config) : Layer(config) {}
+  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+
+  void forward(PassType passType);
+  void backward(const UpdateCallback& callback) {}
+
+protected:
+  int numPriors_;
+  std::vector<int> minSize_;
+  std::vector<int> maxSize_;
+  std::vector<real> aspectRatio_;
+  std::vector<real> variance_;
+  MatrixPtr buffer_;
+};
+
+bool PriorBoxLayer::init(const LayerMap& layerMap,
+                         const ParameterMap& parameterMap) {
+  Layer::init(layerMap, parameterMap);
+  auto pbConf = config_.inputs(0).priorbox_conf();
+  std::copy(pbConf.min_size().begin(),
+            pbConf.min_size().end(),
+            std::back_inserter(minSize_));
+  std::copy(pbConf.max_size().begin(),
+            pbConf.max_size().end(),
+            std::back_inserter(maxSize_));
+  std::copy(pbConf.aspect_ratio().begin(),
+            pbConf.aspect_ratio().end(),
+            std::back_inserter(aspectRatio_));
+  std::copy(pbConf.variance().begin(),
+            pbConf.variance().end(),
+            std::back_inserter(variance_));
+  // flip
+  int inputRatioLength = aspectRatio_.size();
+  for (int index = 0; index < inputRatioLength; index++)
+    aspectRatio_.push_back(1 / aspectRatio_[index]);
+  aspectRatio_.push_back(1.);
+  numPriors_ = aspectRatio_.size();
+  if (maxSize_.size() > 0) numPriors_++;
+  return true;
+}
+
+void PriorBoxLayer::forward(PassType passType) {
+  Layer::forward(passType);
+  auto input = getInput(0);
+  int layerWidth = input.getFrameWidth();
+  int layerHeight = input.getFrameHeight();
+
+  auto image = getInput(1);
+  int imageWidth = image.getFrameWidth();
+  int imageHeight = image.getFrameHeight();
+
+  real stepW = static_cast<real>(imageWidth) / layerWidth;
+  real stepH = static_cast<real>(imageHeight) / layerHeight;
+  int dim = layerHeight * layerWidth * numPriors_ * 4;
+  reserveOutput(1, dim * 2);
+  // use a cpu buffer to compute
+  Matrix::resizeOrCreate(buffer_, 1, dim * 2, false, false);
+  auto* tmpPtr = buffer_->getData();
+
+  int idx = 0;
+  for (int h = 0; h < layerHeight; ++h) {
+    for (int w = 0; w < layerWidth; ++w) {
+      real centerX = (w + 0.5) * stepW;
+      real centerY = (h + 0.5) * stepH;
+      int minSize = 0;
+      for (size_t s = 0; s < minSize_.size(); s++) {
+        // first prior.
+        minSize = minSize_[s];
+        int boxWidth = minSize;
+        int boxHeight = minSize;
+        // xmin, ymin, xmax, ymax.
+        tmpPtr[idx++] = (centerX - boxWidth / 2.) / imageWidth;
+        tmpPtr[idx++] = (centerY - boxHeight / 2.) / imageHeight;
+        tmpPtr[idx++] = (centerX + boxWidth / 2.) / imageWidth;
+        tmpPtr[idx++] = (centerY + boxHeight / 2.) / imageHeight;
+        // set the variance.
+        for (int t = 0; t < 4; t++) tmpPtr[idx++] = variance_[t];
+
+        if (maxSize_.size() > 0) {
+          CHECK_EQ(minSize_.size(), maxSize_.size());
+          // second prior.
+          for (size_t s = 0; s < maxSize_.size(); s++) {
+            int maxSize = maxSize_[s];
+            boxWidth = boxHeight = sqrt(minSize * maxSize);
+            tmpPtr[idx++] = (centerX - boxWidth / 2.) / imageWidth;
+            tmpPtr[idx++] = (centerY - boxHeight / 2.) / imageHeight;
+            tmpPtr[idx++] = (centerX + boxWidth / 2.) / imageWidth;
+            tmpPtr[idx++] = (centerY + boxHeight / 2.) / imageHeight;
+            // set the variance.
+            for (int t = 0; t < 4; t++) tmpPtr[idx++] = variance_[t];
+          }
+        }
+      }
+      // rest of priors.
+      for (size_t r = 0; r < aspectRatio_.size(); r++) {
+        real ar = aspectRatio_[r];
+        if (fabs(ar - 1.) < 1e-6) continue;
+        real boxWidth = minSize * sqrt(ar);
+        real boxHeight = minSize / sqrt(ar);
+        tmpPtr[idx++] = (centerX - boxWidth / 2.) / imageWidth;
+        tmpPtr[idx++] = (centerY - boxHeight / 2.) / imageHeight;
+        tmpPtr[idx++] = (centerX + boxWidth / 2.) / imageWidth;
+        tmpPtr[idx++] = (centerY + boxHeight / 2.) / imageHeight;
+        // set the variance.
+        for (int t = 0; t < 4; t++) tmpPtr[idx++] = variance_[t];
+      }
+    }
+  }
+  // clip the priors' coordinates such that they are within [0, 1]
+  for (int d = 0; d < dim * 2; ++d)
+    if ((d % 8) < 4)
+      tmpPtr[d] = std::min(std::max(tmpPtr[d], (real)0.), (real)1.);
+  MatrixPtr outV = getOutputValue();
+  outV->copyFrom(buffer_->data_, dim * 2);
+}
+REGISTER_LAYER(priorbox, PriorBoxLayer);
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/Projection.h b/paddle/gserver/layers/Projection.h
index 8cd8042479..778a7fe13d 100644
--- a/paddle/gserver/layers/Projection.h
+++ b/paddle/gserver/layers/Projection.h
@@ -88,11 +88,37 @@ public:
    */
   virtual LayerStatePtr getState() { return nullptr; }
 
+  /**
+   * init forward_ and backward_ functions
+   */
+  virtual bool init() { return true; }
+
   /**
    * Get output size of projection.
    */
   size_t getOutputSize() const { return config_.output_size(); }
 
+protected:
+  /**
+   * Create projection function. Function is called in forward or backward.
+   * \param function, Projection::forward_ or Projection::backward_
+   * \param name, function name
+   * \param config, initialization configuration for the function
+   */
+  void createFunction(std::vector<std::shared_ptr<FunctionBase>>& function,
+                      const std::string& name,
+                      const FuncConfig& config) {
+    if (useGpu_) {
+      function.emplace_back(
+          FunctionBase::funcRegistrar_.createByType(name + "-GPU"));
+    } else {
+      function.emplace_back(
+          FunctionBase::funcRegistrar_.createByType(name + "-CPU"));
+    }
+    auto& func = function.back();
+    func->init(config);
+  }
+
 protected:
   /// Config of projection
   ProjectionConfig config_;
@@ -106,5 +132,9 @@ protected:
   const Argument* out_;
   /// Store `passType` passed to forward()
   PassType passType_;
+  /// Projection forward function
+  std::vector<std::shared_ptr<FunctionBase>> forward_;
+  /// Projection backward function
+  std::vector<std::shared_ptr<FunctionBase>> backward_;
 };
 }  // namespace paddle
diff --git a/paddle/gserver/layers/RecurrentLayer.cpp b/paddle/gserver/layers/RecurrentLayer.cpp
index 9f3bf76a2d..55e0fdfb90 100644
--- a/paddle/gserver/layers/RecurrentLayer.cpp
+++ b/paddle/gserver/layers/RecurrentLayer.cpp
@@ -12,12 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include <gflags/gflags.h>
 #include "Layer.h"
 #include "SequenceToBatch.h"
-#include "paddle/utils/CommandLineParser.h"
 #include "paddle/utils/Stat.h"
 
-P_DEFINE_bool(rnn_use_batch, false, "Using the batch method for calculation.");
+DEFINE_bool(rnn_use_batch, false, "Using the batch method for calculation.");
 
 namespace paddle {
 
@@ -215,12 +215,12 @@ void RecurrentLayer::forwardSequence(int batchSize,
 void RecurrentLayer::forwardOneSequence(int start, int length) {
   if (!reversed_) {
     if (prevOutput_) {
-      frameOutput_[start].value->mul(prevOutput_, weight_->getW(), 1, 1);
+      frameOutput_[start].value->mul(*prevOutput_, *weight_->getW(), 1, 1);
     }
     activation_->forward(frameOutput_[start]);
     for (int i = 1; i < length; ++i) {
       frameOutput_[start + i].value->mul(
-          frameOutput_[start + i - 1].value, weight_->getW(), 1, 1);
+          *frameOutput_[start + i - 1].value, *weight_->getW(), 1, 1);
       activation_->forward(frameOutput_[start + i]);
     }
     if (prevOutput_) {
@@ -230,7 +230,7 @@ void RecurrentLayer::forwardOneSequence(int start, int length) {
     activation_->forward(frameOutput_[start + length - 1]);
     for (int i = length - 2; i >= 0; --i) {
       frameOutput_[start + i].value->mul(
-          frameOutput_[start + i + 1].value, weight_->getW(), 1, 1);
+          *frameOutput_[start + i + 1].value, *weight_->getW(), 1, 1);
       activation_->forward(frameOutput_[start + i]);
     }
   }
@@ -282,13 +282,13 @@ void RecurrentLayer::backwardOneSequence(int start, int length) {
     for (int i = length - 1; i > 0; --i) {
       activation_->backward(frameOutput_[start + i]);
       frameOutput_[start + i - 1].grad->mul(
-          frameOutput_[start + i].grad, weightT, 1, 1);
+          *frameOutput_[start + i].grad, *weightT, 1, 1);
     }
     activation_->backward(frameOutput_[start]);
     if (weight_->getWGrad()) {
       weight_->getWGrad()->mul(
-          output_.value->subMatrix(start, length - 1)->getTranspose(),
-          output_.grad->subMatrix(start + 1, length - 1),
+          *output_.value->subMatrix(start, length - 1)->getTranspose(),
+          *output_.grad->subMatrix(start + 1, length - 1),
           1,
           1);
     }
@@ -296,13 +296,13 @@ void RecurrentLayer::backwardOneSequence(int start, int length) {
     for (int i = 0; i < length - 1; ++i) {
       activation_->backward(frameOutput_[start + i]);
       frameOutput_[start + i + 1].grad->mul(
-          frameOutput_[start + i].grad, weightT, 1, 1);
+          *frameOutput_[start + i].grad, *weightT, 1, 1);
     }
     activation_->backward(frameOutput_[start + length - 1]);
     if (weight_->getWGrad()) {
       weight_->getWGrad()->mul(
-          output_.value->subMatrix(start + 1, length - 1)->getTranspose(),
-          output_.grad->subMatrix(start, length - 1),
+          *output_.value->subMatrix(start + 1, length - 1)->getTranspose(),
+          *output_.grad->subMatrix(start, length - 1),
           1,
           1);
     }
@@ -329,7 +329,7 @@ void RecurrentLayer::forwardBatch(int batchSize,
       if (n != 0) {
         MatrixPtr batch1 =
             batchValue_->getBatchValue(n - 1, batch2->getHeight());
-        batch2->mul(batch1, weight_->getW(), 1, 1);
+        batch2->mul(*batch1, *weight_->getW(), 1, 1);
       }
       Argument arg;
       arg.value = batch2;
@@ -367,14 +367,14 @@ void RecurrentLayer::backwardBatch(int batchSize,
 
       if (n != 0) {
         batch1 = batchGrad_->getBatchValue(n - 1, batch2->getHeight());
-        batch1->mul(batch2, weightT, 1, 1);
+        batch1->mul(*batch2, *weightT, 1, 1);
       }
 
       if (backwardByBatch && weight_->getWGrad()) {
         if (n != 0) {
           /* backward weight */
           batch1 = batchValue_->getBatchValue(n - 1, batch2->getHeight());
-          weight_->getWGrad()->mul(batch1->getTranspose(), batch2, 1, 1);
+          weight_->getWGrad()->mul(*batch1->getTranspose(), *batch2, 1, 1);
         }
       }
     }
@@ -389,14 +389,14 @@ void RecurrentLayer::backwardBatch(int batchSize,
       int len = starts[seq + 1] - starts[seq];
       if (!reversed_) {
         weight_->getWGrad()->mul(
-            output_.value->subMatrix(starts[seq], len - 1)->getTranspose(),
-            output_.grad->subMatrix(starts[seq] + 1, len - 1),
+            *output_.value->subMatrix(starts[seq], len - 1)->getTranspose(),
+            *output_.grad->subMatrix(starts[seq] + 1, len - 1),
             1,
             1);
       } else {
         weight_->getWGrad()->mul(
-            output_.value->subMatrix(starts[seq] + 1, len - 1)->getTranspose(),
-            output_.grad->subMatrix(starts[seq], len - 1),
+            *output_.value->subMatrix(starts[seq] + 1, len - 1)->getTranspose(),
+            *output_.grad->subMatrix(starts[seq], len - 1),
             1,
             1);
       }
diff --git a/paddle/gserver/layers/SelectiveFullyConnectedLayer.cpp b/paddle/gserver/layers/SelectiveFullyConnectedLayer.cpp
index 9200a01eee..5eacff6b71 100644
--- a/paddle/gserver/layers/SelectiveFullyConnectedLayer.cpp
+++ b/paddle/gserver/layers/SelectiveFullyConnectedLayer.cpp
@@ -155,20 +155,20 @@ void SelectiveFullyConnectedLayer::forward(PassType passType) {
       // manully compute the multiplication of
       // the input vector and the selected rows.
       REGISTER_TIMER("selective.plain");
-      interOutput_->mul(input, weight->getTranspose(), 1, scaleT);
+      interOutput_->mul(*input, *weight->getTranspose(), 1, scaleT);
     } else {
       // if the indecies is not sparse enough,
       // use full mul instead
       REGISTER_TIMER("selective.mul");
       if (fullOutput_) {
-        interOutput_->mul(input, weight->getTranspose(), 1, scaleT);
+        interOutput_->mul(*input, *weight->getTranspose(), 1, scaleT);
       } else {
         Matrix::resizeOrCreate(mmat_,
                                hsize,
                                wsize,
                                /*trans=*/false,
                                /*useGpu=*/useGpu_);
-        mmat_->mul(input, weight->getTranspose());
+        mmat_->mul(*input, *weight->getTranspose());
         interOutput_->add3(mmat_);
       }
     }
@@ -242,14 +242,14 @@ void SelectiveFullyConnectedLayer::backward(const UpdateCallback& callback) {
     MatrixPtr preGrad = getInputGrad(i);
     if (preGrad) {
       REGISTER_TIMER_INFO("BpMulTimer", getName().c_str());
-      preGrad->mul(interOutGrad_, weights_[i]->getW(), 1, 1);
+      preGrad->mul(*interOutGrad_, *weights_[i]->getW(), 1, 1);
     }
 
     MatrixPtr wGrad = weights_[i]->getWGrad();
     if (wGrad) {
       REGISTER_TIMER_INFO("GradMulTimer", getName().c_str());
       MatrixPtr input = getInputValue(i);
-      wGrad->mul(interOutGrad_->getTranspose(), input, 1, 1);
+      wGrad->mul(*interOutGrad_->getTranspose(), *input, 1, 1);
     }
 
     {
diff --git a/paddle/gserver/layers/TensorLayer.cpp b/paddle/gserver/layers/TensorLayer.cpp
index 642eb1bdd3..5be88d7c05 100644
--- a/paddle/gserver/layers/TensorLayer.cpp
+++ b/paddle/gserver/layers/TensorLayer.cpp
@@ -77,7 +77,7 @@ void TensorLayer::forward(PassType passType) {
     REGISTER_TIMER_INFO("TensorFwMulTimer", getName().c_str());
     for (size_t i = 0; i < getSize(); ++i) {
       MatrixPtr weights = weights_[i]->getW();
-      tmpMat->mul(input1, weights, 1, 0);
+      tmpMat->mul(*input1, *weights, 1, 0);
       outV->rowDotMul(i, *tmpMat, *input2);
     }
   }
@@ -112,7 +112,7 @@ void TensorLayer::backward(const UpdateCallback& callback) {
       if (weights_[i]->getWGrad()) {
         tmpMat->rowScale(i, *input1, *oGrad);
         MatrixPtr input1_T = tmpMat->getTranspose();
-        weights_[i]->getWGrad()->mul(input1_T, input2, 1, 1);
+        weights_[i]->getWGrad()->mul(*input1_T, *input2, 1, 1);
       }
     }
   }
@@ -130,11 +130,11 @@ void TensorLayer::backward(const UpdateCallback& callback) {
       if (NULL != preGrad1) { /* (grad * e2) * trans(W) */
         tmpMat->rowScale(i, *input2, *oGrad);
         MatrixPtr weights_T = weights->getTranspose();
-        preGrad1->mul(tmpMat, weights_T, 1, 1);
+        preGrad1->mul(*tmpMat, *weights_T, 1, 1);
       }
       if (NULL != preGrad2) { /* (grad * e1) * W */
         tmpMat->rowScale(i, *input1, *oGrad);
-        preGrad2->mul(tmpMat, weights, 1, 1);
+        preGrad2->mul(*tmpMat, *weights, 1, 1);
       }
     }
   }
diff --git a/paddle/gserver/layers/TransposedFullMatrixProjection.cpp b/paddle/gserver/layers/TransposedFullMatrixProjection.cpp
index 3f7ff04882..2a12499e5b 100644
--- a/paddle/gserver/layers/TransposedFullMatrixProjection.cpp
+++ b/paddle/gserver/layers/TransposedFullMatrixProjection.cpp
@@ -46,7 +46,7 @@ TransposedFullMatrixProjection::TransposedFullMatrixProjection(
 
 void TransposedFullMatrixProjection::forward() {
   REGISTER_TIMER_INFO("FwMulTimer", getName().c_str());
-  out_->value->mul(in_->value, weight_->getW()->getTranspose(), 1, 1);
+  out_->value->mul(*(in_->value), *(weight_->getW()->getTranspose()), 1, 1);
 }
 
 void TransposedFullMatrixProjection::backward(const UpdateCallback& callback) {
@@ -55,7 +55,8 @@ void TransposedFullMatrixProjection::backward(const UpdateCallback& callback) {
   /* Calculate the W-gradient for the current layer */
   if (weight_->getWGrad()) {
     REGISTER_TIMER_INFO("GradMulTimer", getName().c_str());
-    weight_->getWGrad()->mul(out_->grad->getTranspose(), in_->value, 1, 1);
+    weight_->getWGrad()->mul(
+        *(out_->grad->getTranspose()), *(in_->value), 1, 1);
   }
 
   // If callback does not change value, backprop error asynchronously so that
@@ -69,7 +70,7 @@ void TransposedFullMatrixProjection::backward(const UpdateCallback& callback) {
   /* Calculate the input layers error */
   if (in_->grad) {
     REGISTER_TIMER_INFO("BpMulTimer", getName().c_str());
-    in_->grad->mul(out_->grad, weight_->getW(), 1, 1);
+    in_->grad->mul(*(out_->grad), *(weight_->getW()), 1, 1);
   }
 
   hl_set_sync_flag(syncFlag);
diff --git a/paddle/gserver/layers/ValidationLayer.h b/paddle/gserver/layers/ValidationLayer.h
index 471055429d..4c1de7b3b7 100644
--- a/paddle/gserver/layers/ValidationLayer.h
+++ b/paddle/gserver/layers/ValidationLayer.h
@@ -18,7 +18,7 @@ limitations under the License. */
 #include "Layer.h"
 #include "paddle/gserver/evaluators/Evaluator.h"
 
-P_DECLARE_int32(trainer_id);
+DECLARE_int32(trainer_id);
 
 namespace paddle {
 
diff --git a/paddle/gserver/tests/CMakeLists.txt b/paddle/gserver/tests/CMakeLists.txt
index 34dc375f21..0caa5e1e11 100644
--- a/paddle/gserver/tests/CMakeLists.txt
+++ b/paddle/gserver/tests/CMakeLists.txt
@@ -2,8 +2,7 @@
 
 ################### test_ProtoDataProvider ############
 add_unittest_without_exec(test_ProtoDataProvider
-    test_ProtoDataProvider.cpp
-    TestUtil.cpp)
+    test_ProtoDataProvider.cpp)
 
 # test_ProtoDataProvider will mkdir as same name,
 # so if WORKING_DIRECTORY is default directory, then
@@ -15,45 +14,46 @@ add_test(NAME test_ProtoDataProvider
 ################# test_LayerGrad #######################
 add_unittest_without_exec(test_LayerGrad
     test_LayerGrad.cpp
-    LayerGradUtil.cpp
-    TestUtil.cpp)
+    LayerGradUtil.cpp)
 add_test(NAME test_LayerGrad
     COMMAND test_LayerGrad)
 
 add_unittest_without_exec(test_ActivationGrad
     test_ActivationGrad.cpp
-    LayerGradUtil.cpp
-    TestUtil.cpp)
+    LayerGradUtil.cpp)
 add_test(NAME test_ActivationGrad
     COMMAND test_ActivationGrad)
 ################# test_ConvTrans #######################
 add_unittest_without_exec(test_ConvTrans
     test_ConvTrans.cpp
-    LayerGradUtil.cpp
-    TestUtil.cpp)
+    LayerGradUtil.cpp)
 
 add_test(NAME test_ConvTrans
     COMMAND test_ConvTrans)
+################# test_PriorBox #######################
+add_unittest_without_exec(test_PriorBox
+    test_PriorBox.cpp
+    LayerGradUtil.cpp)
+
+add_test(NAME test_PriorBox
+    COMMAND test_PriorBox)
 ################# test_ConvUnify #######################
 add_unittest_without_exec(test_ConvUnify
     test_ConvUnify.cpp
-    LayerGradUtil.cpp
-    TestUtil.cpp)
+    LayerGradUtil.cpp)
     
 add_test(NAME test_ConvUnify
     COMMAND test_ConvUnify)
 ################# test_BatchNorm #######################
 add_unittest_without_exec(test_BatchNorm
     test_BatchNorm.cpp
-    LayerGradUtil.cpp
-    TestUtil.cpp)
+    LayerGradUtil.cpp)
 
 add_test(NAME test_BatchNorm
     COMMAND test_BatchNorm)
 ################## test_Evaluator #######################
 add_unittest(test_Evaluator
-    test_Evaluator.cpp
-    TestUtil.cpp)
+    test_Evaluator.cpp)
 
 ################ test_LinearChainCRF ####################
 add_simple_unittest(test_LinearChainCRF)
@@ -64,8 +64,7 @@ add_simple_unittest(test_MultinomialSampler)
 ############## test_PyDataProvider ########################
 if(WITH_PYTHON)
     add_unittest_without_exec(test_PyDataProvider
-        test_PyDataProvider.cpp
-        TestUtil.cpp)
+        test_PyDataProvider.cpp)
 
     add_test(NAME test_PyDataProvider
         COMMAND .set_python_path.sh -d ./gserver/tests:${PROJ_ROOT}/python/ ${CMAKE_CURRENT_BINARY_DIR}/test_PyDataProvider
@@ -73,18 +72,15 @@ if(WITH_PYTHON)
 endif()
 
 ############### test_RecurrentLayer #######################
-add_unittest(test_RecurrentLayer
-    test_RecurrentLayer.cpp
-    TestUtil.cpp)
+add_simple_unittest(test_RecurrentLayer)
 
 ############### test_WarpCTCLayer #######################
 if(NOT WITH_DOUBLE)
     add_unittest_without_exec(test_WarpCTCLayer
-        test_WarpCTCLayer.cpp
-        TestUtil.cpp)
+        test_WarpCTCLayer.cpp)
 
     add_test(NAME test_WarpCTCLayer
-        COMMAND ${CMAKE_CURRENT_BINARY_DIR}/test_WarpCTCLayer --warpctc_dir=${PROJ_ROOT}/warp-ctc/build
+        COMMAND ${CMAKE_CURRENT_BINARY_DIR}/test_WarpCTCLayer --warpctc_dir=${WARPCTC_LIB_DIR}
         WORKING_DIRECTORY ${PROJ_ROOT}/paddle)
 endif()
 
@@ -100,8 +96,7 @@ add_test(NAME test_RecurrentGradientMachine
     WORKING_DIRECTORY ${PROJ_ROOT}/paddle)
 
 add_unittest_without_exec(test_NetworkCompare
-    test_NetworkCompare.cpp
-    TestUtil.cpp)
+    test_NetworkCompare.cpp)
 if(WITH_GPU)
     add_test(NAME test_NetworkCompare
         COMMAND .set_python_path.sh -d ${PROJ_ROOT}/python ${CMAKE_CURRENT_BINARY_DIR}/test_NetworkCompare --use_gpu=true
diff --git a/paddle/gserver/tests/LayerGradUtil.cpp b/paddle/gserver/tests/LayerGradUtil.cpp
index dffc24936f..ae016e74ea 100644
--- a/paddle/gserver/tests/LayerGradUtil.cpp
+++ b/paddle/gserver/tests/LayerGradUtil.cpp
@@ -14,7 +14,7 @@ limitations under the License. */
 
 #include "LayerGradUtil.h"
 
-P_DECLARE_bool(thread_local_rand_use_global_seed);
+DECLARE_bool(thread_local_rand_use_global_seed);
 
 namespace paddle {
 real getCostSum(LayerPtr& testLayer, MatrixPtr weights) {
@@ -303,13 +303,31 @@ void initDataLayer(TestConfig testConf,
   ICpuGpuVectorPtr sequenceStartPositions;
   ICpuGpuVectorPtr subSequenceStartPositions;
   IVectorPtr cpuSequenceDims;
-  for (size_t i = 0; i < testConf.inputDefs.size(); i++) {
+  for (size_t i = 0; i < testConf.inputDefs.size(); ++i) {
+    if (testConf.inputDefs[i].inputType != INPUT_SEQUENCE_LABEL) continue;
+
+    const std::vector<int>& labelSeqStartPositions =
+        testConf.inputDefs[i].labelSeqStartPositions;
+    if (labelSeqStartPositions.size() != 0) {
+      CHECK(!sequenceStartPositions);
+      CHECK_GE(static_cast<int>(labelSeqStartPositions.size()), 2);
+
+      sequenceStartPositions =
+          ICpuGpuVector::create(labelSeqStartPositions.size(), useGpu);
+      sequenceStartPositions->copyFrom(
+          labelSeqStartPositions.data(), labelSeqStartPositions.size(), useGpu);
+    }
+  }
+
+  for (size_t i = 0; i < testConf.inputDefs.size(); ++i) {
     LayerConfig config;
     config.set_name(testConf.inputDefs[i].name);
     config.set_type("data");
     config.set_size(testConf.inputDefs[i].dim);
     LayerPtr layer = LayerPtr(new DataLayer(config));
-    size_t numSequence = batchSize / 10 + 1;
+    size_t numSequence = sequenceStartPositions
+                             ? sequenceStartPositions->getSize() - 1
+                             : batchSize / 10 + 1;
 
     Argument data;
     auto fillData = [&](bool trans, int height, int width) {
@@ -336,9 +354,17 @@ void initDataLayer(TestConfig testConf,
         break;
       case INPUT_LABEL:
       case INPUT_SEQUENCE_LABEL:
-        data.ids = VectorT<int>::create(batchSize, useGpu);
-        // now rand number can be 0 to inputDefs[i].dim
-        data.ids->rand(testConf.inputDefs[i].dim);
+        if (testConf.inputDefs[i].labelInitValue.size() != 0) {
+          const std::vector<int>& labelInitValue =
+              testConf.inputDefs[i].labelInitValue;
+          CHECK_EQ(labelInitValue.size(), batchSize);
+          data.ids = VectorT<int>::create(batchSize, useGpu);
+          data.ids->copyFrom(labelInitValue.data(), batchSize);
+        } else {
+          data.ids = VectorT<int>::create(batchSize, useGpu);
+          // no labelInitValue given: random ids bounded by inputDefs[i].dim
+          data.ids->rand(testConf.inputDefs[i].dim);
+        }
         break;
       case INPUT_SPARSE_NON_VALUE_DATA:
         data.value = makeRandomSparseMatrix(
diff --git a/paddle/gserver/tests/LayerGradUtil.h b/paddle/gserver/tests/LayerGradUtil.h
index 62ac2d160f..9f68eb64d0 100644
--- a/paddle/gserver/tests/LayerGradUtil.h
+++ b/paddle/gserver/tests/LayerGradUtil.h
@@ -17,7 +17,7 @@ limitations under the License. */
 #include "paddle/gserver/layers/DataLayer.h"
 #include "paddle/trainer/Trainer.h"
 
-#include "TestUtil.h"
+#include "paddle/testing/TestUtil.h"
 using namespace std;  // NOLINT
 
 namespace paddle {
@@ -64,6 +64,9 @@ struct InputDef {
   size_t paraSize;
   ParaSparse sparse;
   bool isStatic;
+  std::vector<int> labelInitValue;
+  std::vector<int> labelSeqStartPositions;
+
   InputDef(InputType type, string nameIn, size_t dimIn, size_t sizeIn) {
     inputType = type;
     name = nameIn;
@@ -72,6 +75,23 @@ struct InputDef {
     sparse = {""};
     isStatic = false;
   }
+
+  InputDef(InputType type,
+           string nameIn,
+           size_t dimIn,
+           size_t sizeIn,
+           const std::vector<int>& labelInitValue,
+           const std::vector<int>& labelSeqStartPositions)
+      : labelInitValue(labelInitValue),
+        labelSeqStartPositions(labelSeqStartPositions) {
+    inputType = type;
+    name = nameIn;
+    dim = dimIn;
+    paraSize = sizeIn;
+    sparse = {""};
+    isStatic = false;
+  }
+
   InputDef(InputType type,
            string nameIn,
            size_t dimIn,
diff --git a/paddle/gserver/tests/test_ActivationGrad.cpp b/paddle/gserver/tests/test_ActivationGrad.cpp
index 20a6126d0b..b201ba8a5a 100644
--- a/paddle/gserver/tests/test_ActivationGrad.cpp
+++ b/paddle/gserver/tests/test_ActivationGrad.cpp
@@ -20,13 +20,13 @@ limitations under the License. */
 #include "paddle/trainer/Trainer.h"
 
 #include "LayerGradUtil.h"
-#include "TestUtil.h"
+#include "paddle/testing/TestUtil.h"
 
 using namespace paddle;  // NOLINT
 using namespace std;     // NOLINT
 
-P_DECLARE_bool(use_gpu);
-P_DECLARE_bool(thread_local_rand_use_global_seed);
+DECLARE_bool(use_gpu);
+DECLARE_bool(thread_local_rand_use_global_seed);
 
 void testActivation(const string& act) {
   LOG(INFO) << "test activation: " << act;
diff --git a/paddle/gserver/tests/test_BatchNorm.cpp b/paddle/gserver/tests/test_BatchNorm.cpp
index 3bd4e321b7..d07299bfe3 100644
--- a/paddle/gserver/tests/test_BatchNorm.cpp
+++ b/paddle/gserver/tests/test_BatchNorm.cpp
@@ -22,16 +22,16 @@ limitations under the License. */
 #include "paddle/utils/GlobalConstants.h"
 
 #include "LayerGradUtil.h"
-#include "TestUtil.h"
+#include "paddle/testing/TestUtil.h"
 
 using namespace paddle;  // NOLINT
 using namespace std;     // NOLINT
 
-P_DECLARE_bool(use_gpu);
-P_DECLARE_int32(gpu_id);
-P_DECLARE_double(checkgrad_eps);
-P_DECLARE_bool(thread_local_rand_use_global_seed);
-P_DECLARE_bool(prev_batch_state);
+DECLARE_bool(use_gpu);
+DECLARE_int32(gpu_id);
+DECLARE_double(checkgrad_eps);
+DECLARE_bool(thread_local_rand_use_global_seed);
+DECLARE_bool(prev_batch_state);
 
 // Test that the batchNormLayer can be followed by a ConvLayer
 TEST(Layer, batchNorm) {
@@ -114,8 +114,8 @@ TEST(Layer, batchNorm) {
   bnLayer->forward(PASS_GC);
   convLayer->forward(PASS_GC);
 
-  CHECK_EQ(convLayer->getOutputValue()->getHeight(), 100);
-  CHECK_EQ(convLayer->getOutputValue()->getWidth(), 576);
+  CHECK_EQ(static_cast<int>(convLayer->getOutputValue()->getHeight()), 100);
+  CHECK_EQ(static_cast<int>(convLayer->getOutputValue()->getWidth()), 576);
 }
 
 int main(int argc, char** argv) {
diff --git a/paddle/gserver/tests/test_ConvTrans.cpp b/paddle/gserver/tests/test_ConvTrans.cpp
index 83100e3bec..40bb1e2d73 100644
--- a/paddle/gserver/tests/test_ConvTrans.cpp
+++ b/paddle/gserver/tests/test_ConvTrans.cpp
@@ -23,16 +23,16 @@ limitations under the License. */
 #include "paddle/utils/GlobalConstants.h"
 
 #include "LayerGradUtil.h"
-#include "TestUtil.h"
+#include "paddle/testing/TestUtil.h"
 
 using namespace paddle;  // NOLINT
 using namespace std;     // NOLINT
 
-P_DECLARE_bool(use_gpu);
-P_DECLARE_int32(gpu_id);
-P_DECLARE_double(checkgrad_eps);
-P_DECLARE_bool(thread_local_rand_use_global_seed);
-P_DECLARE_bool(prev_batch_state);
+DECLARE_bool(use_gpu);
+DECLARE_int32(gpu_id);
+DECLARE_double(checkgrad_eps);
+DECLARE_bool(thread_local_rand_use_global_seed);
+DECLARE_bool(prev_batch_state);
 
 // Test that the convTrans forward is the same as conv backward
 TEST(Layer, convTransLayerFwd) {
@@ -206,8 +206,8 @@ TEST(Layer, convTransLayerFwd2) {
                  /* filter_size */ 5,
                  result);
 
-  float resultData[] = {1, 2, 2, 2, 1, 2, 4, 4, 4, 2, 2, 4, 4,
-                        4, 2, 2, 4, 4, 4, 2, 1, 2, 2, 2, 1};
+  real resultData[] = {1, 2, 2, 2, 1, 2, 4, 4, 4, 2, 2, 4, 4,
+                       4, 2, 2, 4, 4, 4, 2, 1, 2, 2, 2, 1};
   result->setData(resultData);
   doOneConvtTest(/* imgSize */ 5,
                  /* output_x */ 2,
@@ -216,8 +216,8 @@ TEST(Layer, convTransLayerFwd2) {
                  /* filter_size */ 4,
                  result);
 
-  float resultData2[] = {1, 2, 2, 2, 1, 2, 4, 4, 4, 2, 2, 4, 4,
-                         4, 2, 2, 4, 4, 4, 2, 1, 2, 2, 2, 1};
+  real resultData2[] = {1, 2, 2, 2, 1, 2, 4, 4, 4, 2, 2, 4, 4,
+                        4, 2, 2, 4, 4, 4, 2, 1, 2, 2, 2, 1};
   result->setData(resultData2);
   doOneConvtTest(/* imgSize */ 5,
                  /* output_x */ 2,
@@ -226,8 +226,8 @@ TEST(Layer, convTransLayerFwd2) {
                  /* filter_size */ 5,
                  result);
 
-  float resultData3[] = {1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 2, 2, 4,
-                         2, 2, 1, 1, 2, 1, 1, 1, 1, 2, 1, 1};
+  real resultData3[] = {1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 2, 2, 4,
+                        2, 2, 1, 1, 2, 1, 1, 1, 1, 2, 1, 1};
   result->setData(resultData3);
   doOneConvtTest(/* imgSize */ 5,
                  /* output_x */ 2,
diff --git a/paddle/gserver/tests/test_ConvUnify.cpp b/paddle/gserver/tests/test_ConvUnify.cpp
index 02763406a3..207fc0566f 100644
--- a/paddle/gserver/tests/test_ConvUnify.cpp
+++ b/paddle/gserver/tests/test_ConvUnify.cpp
@@ -23,16 +23,16 @@ limitations under the License. */
 #include "paddle/utils/GlobalConstants.h"
 
 #include "LayerGradUtil.h"
-#include "TestUtil.h"
+#include "paddle/testing/TestUtil.h"
 
 using namespace paddle;  // NOLINT
 using namespace std;     // NOLINT
 
-P_DECLARE_bool(use_gpu);
-P_DECLARE_int32(gpu_id);
-P_DECLARE_double(checkgrad_eps);
-P_DECLARE_bool(thread_local_rand_use_global_seed);
-P_DECLARE_bool(prev_batch_state);
+DECLARE_bool(use_gpu);
+DECLARE_int32(gpu_id);
+DECLARE_double(checkgrad_eps);
+DECLARE_bool(thread_local_rand_use_global_seed);
+DECLARE_bool(prev_batch_state);
 
 // Do one forward pass of convTrans layer and check to see if its output
 // matches the given result
@@ -106,8 +106,8 @@ TEST(Layer, convParaUnified) {
 #ifndef PADDLE_ONLY_CPU
   MatrixPtr input, resultCpu, resultGpu;
   input = Matrix::create(1, 4 * 4, false, false);
-  float inputData[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16};
-  float param[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 8, 7, 6, 5, 4, 3, 2, 1};
+  real inputData[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16};
+  real param[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 8, 7, 6, 5, 4, 3, 2, 1};
 
   input->setData(inputData);
 
@@ -137,26 +137,9 @@ TEST(Layer, convParaUnified) {
   checkMatrixEqual(resultCpu, resultGpu);
 
   input = Matrix::create(1, 3 * 3 * 2, false, false);
-  float inputData2[] = {1,
-                        2,
-                        3,
-                        4,
-                        5,
-                        6,
-                        7,
-                        8,
-                        9,
-
-                        10,
-                        11,
-                        12,
-                        13,
-                        14,
-                        15,
-                        16,
-                        17,
-                        18};
-  float param2[] = {1, 2, 3, 4, 5, 6, 7, 8, 8, 7, 6, 5, 4, 3, 2, 1};
+  real inputData2[] = {
+      1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18};
+  real param2[] = {1, 2, 3, 4, 5, 6, 7, 8, 8, 7, 6, 5, 4, 3, 2, 1};
 
   input->setData(inputData2);
 
@@ -185,7 +168,7 @@ TEST(Layer, convParaUnified) {
                             true);
   checkMatrixEqual(resultCpu, resultGpu);
 
-  float param3[] = {1, 2, 3, 4, 4, 3, 2, 1};
+  real param3[] = {1, 2, 3, 4, 4, 3, 2, 1};
 
   resultCpu = doOneConvTest(/* imgSize */ 3,
                             /* output_x */ 2,
diff --git a/paddle/gserver/tests/test_Evaluator.cpp b/paddle/gserver/tests/test_Evaluator.cpp
index 7a930aebcf..8165eb8269 100644
--- a/paddle/gserver/tests/test_Evaluator.cpp
+++ b/paddle/gserver/tests/test_Evaluator.cpp
@@ -15,15 +15,15 @@ limitations under the License. */
 #include <gtest/gtest.h>
 #include <vector>
 #include "ModelConfig.pb.h"
-#include "TestUtil.h"
+#include "paddle/testing/TestUtil.h"
 #include "paddle/trainer/Trainer.h"
 
 using namespace paddle;  // NOLINT
 using namespace std;     // NOLINT
 
-P_DECLARE_bool(use_gpu);
-P_DECLARE_int32(gpu_id);
-P_DECLARE_bool(thread_local_rand_use_global_seed);
+DECLARE_bool(use_gpu);
+DECLARE_int32(gpu_id);
+DECLARE_bool(thread_local_rand_use_global_seed);
 
 enum InputType {
   INPUT_DATA,         // dense vector
diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp
index 9f8b197df5..66a70ecd41 100644
--- a/paddle/gserver/tests/test_LayerGrad.cpp
+++ b/paddle/gserver/tests/test_LayerGrad.cpp
@@ -21,16 +21,16 @@ limitations under the License. */
 #include "paddle/trainer/Trainer.h"
 
 #include "LayerGradUtil.h"
-#include "TestUtil.h"
+#include "paddle/testing/TestUtil.h"
 
 using namespace paddle;  // NOLINT
 using namespace std;     // NOLINT
 
-P_DECLARE_bool(use_gpu);
-P_DECLARE_int32(gpu_id);
-P_DECLARE_double(checkgrad_eps);
-P_DECLARE_bool(thread_local_rand_use_global_seed);
-P_DECLARE_bool(prev_batch_state);
+DECLARE_bool(use_gpu);
+DECLARE_int32(gpu_id);
+DECLARE_double(checkgrad_eps);
+DECLARE_bool(thread_local_rand_use_global_seed);
+DECLARE_bool(prev_batch_state);
 
 TEST(Operator, dot_mul) {
   TestConfig config;
@@ -1021,11 +1021,10 @@ void testNormLayer(const string& normType, bool trans, bool useGpu) {
   testLayerGrad(config, "norm", 100, trans, useGpu);
 }
 
-#ifndef PADDLE_ONLY_CPU
 TEST(Layer, NormLayer) {
   testNormLayer("cmrnorm-projection", /* trans= */ false, /* useGpu= */ true);
+  testNormLayer("cmrnorm-projection", /* trans= */ false, /* useGpu= */ false);
 }
-#endif
 
 void setPoolConfig(TestConfig* config,
                    PoolConfig* pool,
diff --git a/paddle/gserver/tests/test_LinearChainCRF.cpp b/paddle/gserver/tests/test_LinearChainCRF.cpp
index 330adee8f7..f046cb0b28 100644
--- a/paddle/gserver/tests/test_LinearChainCRF.cpp
+++ b/paddle/gserver/tests/test_LinearChainCRF.cpp
@@ -65,9 +65,3 @@ TEST(LinearChainCRF, decoding) {
     }
   }
 }
-
-int main(int argc, char** argv) {
-  initMain(argc, argv);
-  testing::InitGoogleTest(&argc, argv);
-  return RUN_ALL_TESTS();
-}
diff --git a/paddle/gserver/tests/test_NetworkCompare.cpp b/paddle/gserver/tests/test_NetworkCompare.cpp
index baa55aa025..4db30f37a5 100644
--- a/paddle/gserver/tests/test_NetworkCompare.cpp
+++ b/paddle/gserver/tests/test_NetworkCompare.cpp
@@ -18,17 +18,17 @@ limitations under the License. */
 #include <algorithm>
 #include <cstdlib>
 
-#include "TestUtil.h"
+#include "paddle/testing/TestUtil.h"
 #include "paddle/trainer/Trainer.h"
 #include "paddle/utils/Stat.h"
 
 using namespace paddle;  // NOLINT
 using namespace std;     // NOLINT
 
-P_DECLARE_int32(gpu_id);
-P_DECLARE_double(checkgrad_eps);
-P_DEFINE_bool(use_label, true, "input label or sequence label");
-P_DEFINE_bool(static_para, false, "static parameter");
+DECLARE_int32(gpu_id);
+DECLARE_double(checkgrad_eps);
+DEFINE_bool(use_label, true, "input label or sequence label");
+DEFINE_bool(static_para, false, "static parameter");
 
 struct DataIn {
   std::vector<Argument> inArgs;
@@ -114,7 +114,7 @@ void calcGradient(DataIn& in, DataOut& out, const std::string& configPath) {
       parameters[i]->getBuf(PARAMETER_VALUE)->copyFrom(*in.paraValues[i]);
     }
   }
-  gradientMachine->start(trainer.getConfig(), nullptr);
+  gradientMachine->start();
   gradientMachine->forward(in.inArgs, &outArgs, PASS_TRAIN);
   for (size_t i = 0; i < in.outGrads.size(); i++) {
     // If the all the layers in the config have no parameters, also
@@ -267,8 +267,8 @@ TEST(Compare, img_conv2) {
 }
 #endif
 
-P_DEFINE_string(config_file_a, "", "config of one network to compare");
-P_DEFINE_string(config_file_b, "", "config of another network to compare");
+DEFINE_string(config_file_a, "", "config of one network to compare");
+DEFINE_string(config_file_b, "", "config of another network to compare");
 TEST(Compare, network) {
   if (FLAGS_config_file_a != "" && FLAGS_config_file_b != "") {
     compareNetwork(FLAGS_config_file_a, FLAGS_config_file_b);
diff --git a/paddle/gserver/tests/test_PriorBox.cpp b/paddle/gserver/tests/test_PriorBox.cpp
new file mode 100644
index 0000000000..ae0e3bc3d2
--- /dev/null
+++ b/paddle/gserver/tests/test_PriorBox.cpp
@@ -0,0 +1,212 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include <string>
+#include <vector>
+
+#include "LayerGradUtil.h"
+#include "paddle/testing/TestUtil.h"
+
+using namespace paddle;  // NOLINT
+using namespace std;     // NOLINT
+
+// Do one forward pass of priorBox layer and check to see if its output
+// matches the given result
+void doOnePriorBoxTest(size_t feature_map_width,
+                       size_t feature_map_height,
+                       size_t image_width,
+                       size_t image_height,
+                       vector<int> min_size,
+                       vector<int> max_size,
+                       vector<real> aspect_ratio,
+                       vector<real> variance,
+                       bool use_gpu,
+                       MatrixPtr& result) {
+  // Setting up the priorbox layer
+  TestConfig configt;
+  configt.layerConfig.set_type("priorbox");
+
+  configt.inputDefs.push_back({INPUT_DATA, "featureMap", 1, 0});
+  LayerInputConfig* input = configt.layerConfig.add_inputs();
+  configt.inputDefs.push_back({INPUT_DATA, "image", 1, 0});
+  configt.layerConfig.add_inputs();
+  PriorBoxConfig* pb = input->mutable_priorbox_conf();
+  for (size_t i = 0; i < min_size.size(); i++) pb->add_min_size(min_size[i]);
+  for (size_t i = 0; i < max_size.size(); i++) pb->add_max_size(max_size[i]);
+  for (size_t i = 0; i < variance.size(); i++) pb->add_variance(variance[i]);
+  for (size_t i = 0; i < aspect_ratio.size(); i++)
+    pb->add_aspect_ratio(aspect_ratio[i]);
+
+  // data layer initialize
+  std::vector<DataLayerPtr> dataLayers;
+  LayerMap layerMap;
+  vector<Argument> datas;
+  initDataLayer(
+      configt, &dataLayers, &datas, &layerMap, "priorbox", 1, false, use_gpu);
+  dataLayers[0]->getOutput().setFrameHeight(feature_map_height);
+  dataLayers[0]->getOutput().setFrameWidth(feature_map_width);
+  dataLayers[1]->getOutput().setFrameHeight(image_height);
+  dataLayers[1]->getOutput().setFrameWidth(image_width);
+
+  // test layer initialize
+  std::vector<ParameterPtr> parameters;
+  LayerPtr priorboxLayer;
+  initTestLayer(configt, &layerMap, &parameters, &priorboxLayer);
+  priorboxLayer->forward(PASS_GC);
+  checkMatrixEqual(priorboxLayer->getOutputValue(), result);
+}
+
+TEST(Layer, priorBoxLayerFwd) {
+  vector<int> minSize;
+  vector<int> maxSize;
+  vector<real> aspectRatio;
+  vector<real> variance;
+  bool useGpu = false;
+
+  minSize.push_back(276);
+  maxSize.push_back(330);
+  variance.push_back(0.1);
+  variance.push_back(0.1);
+  variance.push_back(0.2);
+  variance.push_back(0.2);
+
+  // CPU case 1.
+  MatrixPtr result;
+  real resultData[] = {0.04,
+                       0.04,
+                       0.96,
+                       0.96,
+                       0.1,
+                       0.1,
+                       0.2,
+                       0.2,
+                       0,
+                       0,
+                       1,
+                       1,
+                       0.1,
+                       0.1,
+                       0.2,
+                       0.2};
+  result = Matrix::create(1, 2 * 8, false, useGpu);
+  result->setData(resultData);
+  doOnePriorBoxTest(/* feature_map_width */ 1,
+                    /* feature_map_height */ 1,
+                    /* image_width */ 300,
+                    /* image_height */ 300,
+                    minSize,
+                    maxSize,
+                    aspectRatio,
+                    variance,
+                    useGpu,
+                    result);
+  // CPU case 2.
+  variance[1] = 0.2;
+  variance[3] = 0.1;
+  maxSize.pop_back();
+  real resultData2[] = {0,     0,     0.595, 0.595, 0.1, 0.2, 0.2, 0.1,
+                        0.405, 0,     1,     0.595, 0.1, 0.2, 0.2, 0.1,
+                        0,     0.405, 0.595, 1,     0.1, 0.2, 0.2, 0.1,
+                        0.405, 0.405, 1,     1,     0.1, 0.2, 0.2, 0.1};
+  Matrix::resizeOrCreate(result, 1, 4 * 8, false, useGpu);
+  result->setData(resultData2);
+  doOnePriorBoxTest(/* feature_map_width */ 2,
+                    /* feature_map_height */ 2,
+                    /* image_width */ 400,
+                    /* image_height */ 400,
+                    minSize,
+                    maxSize,
+                    aspectRatio,
+                    variance,
+                    useGpu,
+                    result);
+  // CPU case 3.
+  aspectRatio.push_back(2);
+  real resultData3[] = {0.04,     0.04, 0.96, 0.96,       0.1,        0.2,
+                        0.2,      0.1,  0,    0.17473088, 1,          0.825269,
+                        0.1,      0.2,  0.2,  0.1,        0.17473088, 0,
+                        0.825269, 1,    0.1,  0.2,        0.2,        0.1};
+  Matrix::resizeOrCreate(result, 1, 3 * 8, false, useGpu);
+  result->setData(resultData3);
+  doOnePriorBoxTest(/* feature_map_width */ 1,
+                    /* feature_map_height */ 1,
+                    /* image_width */ 300,
+                    /* image_height */ 300,
+                    minSize,
+                    maxSize,
+                    aspectRatio,
+                    variance,
+                    useGpu,
+                    result);
+
+#ifndef PADDLE_ONLY_CPU
+  // reset the input parameters
+  variance[1] = 0.1;
+  variance[3] = 0.2;
+  maxSize.push_back(330);
+  aspectRatio.pop_back();
+  MatrixPtr resultGpu;
+  useGpu = true;
+  // GPU case 1.
+  resultGpu = Matrix::create(1, 2 * 8, false, useGpu);
+  resultGpu->copyFrom(resultData, 2 * 8);
+  doOnePriorBoxTest(/* feature_map_width */ 1,
+                    /* feature_map_height */ 1,
+                    /* image_width */ 300,
+                    /* image_height */ 300,
+                    minSize,
+                    maxSize,
+                    aspectRatio,
+                    variance,
+                    useGpu,
+                    resultGpu);
+  // GPU case 2.
+  variance[1] = 0.2;
+  variance[3] = 0.1;
+  maxSize.pop_back();
+  Matrix::resizeOrCreate(resultGpu, 1, 4 * 8, false, useGpu);
+  resultGpu->copyFrom(resultData2, 4 * 8);
+  doOnePriorBoxTest(/* feature_map_width */ 2,
+                    /* feature_map_height */ 2,
+                    /* image_width */ 400,
+                    /* image_height */ 400,
+                    minSize,
+                    maxSize,
+                    aspectRatio,
+                    variance,
+                    useGpu,
+                    resultGpu);
+  // GPU case 3.
+  aspectRatio.push_back(2);
+  Matrix::resizeOrCreate(resultGpu, 1, 3 * 8, false, useGpu);
+  resultGpu->copyFrom(resultData3, 3 * 8);
+  doOnePriorBoxTest(/* feature_map_width */ 1,
+                    /* feature_map_height */ 1,
+                    /* image_width */ 300,
+                    /* image_height */ 300,
+                    minSize,
+                    maxSize,
+                    aspectRatio,
+                    variance,
+                    useGpu,
+                    resultGpu);
+#endif
+}
+
+int main(int argc, char** argv) {
+  testing::InitGoogleTest(&argc, argv);
+  initMain(argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/paddle/gserver/tests/test_ProtoDataProvider.cpp b/paddle/gserver/tests/test_ProtoDataProvider.cpp
index d421b6e2f2..e11bf402c2 100644
--- a/paddle/gserver/tests/test_ProtoDataProvider.cpp
+++ b/paddle/gserver/tests/test_ProtoDataProvider.cpp
@@ -20,7 +20,7 @@ limitations under the License. */
 #include "paddle/gserver/dataproviders/ProtoDataProvider.h"
 #include "paddle/utils/Util.h"
 
-#include "TestUtil.h"
+#include "paddle/testing/TestUtil.h"
 
 using namespace std;  // NOLINT
 
@@ -730,9 +730,3 @@ TEST(ProtoSequenceDataProvider, test) {
     }        // end for (int numIdSlots : numSlotsArray)
   }          // end for (int numSparseNonValueVecSlots : numSlotsArray)
 }
-
-int main(int argc, char** argv) {
-  initMain(argc, argv);
-  testing::InitGoogleTest(&argc, argv);
-  return RUN_ALL_TESTS();
-}
diff --git a/paddle/gserver/tests/test_PyDataProvider.cpp b/paddle/gserver/tests/test_PyDataProvider.cpp
index 0f264ecf91..db883543c3 100644
--- a/paddle/gserver/tests/test_PyDataProvider.cpp
+++ b/paddle/gserver/tests/test_PyDataProvider.cpp
@@ -20,7 +20,7 @@ limitations under the License. */
 #include "paddle/gserver/dataproviders/PyDataProvider.h"
 #include "paddle/utils/Util.h"
 
-#include "TestUtil.h"
+#include "paddle/testing/TestUtil.h"
 
 using namespace std;     // NOLINT
 using namespace paddle;  // NOLINT
diff --git a/paddle/gserver/tests/test_PyDataProvider2.cpp b/paddle/gserver/tests/test_PyDataProvider2.cpp
index 7a3b51da8b..7e193eb31a 100644
--- a/paddle/gserver/tests/test_PyDataProvider2.cpp
+++ b/paddle/gserver/tests/test_PyDataProvider2.cpp
@@ -19,7 +19,7 @@ limitations under the License. */
 #include "paddle/utils/PythonUtil.h"
 #include "paddle/utils/Util.h"
 
-P_DEFINE_string(train_list, "unittest.list", "file list for unittest");
+DEFINE_string(train_list, "unittest.list", "file list for unittest");
 
 namespace paddle {
 namespace unittest {
@@ -293,7 +293,7 @@ TEST(PyDataProvider2, can_over_batch_size) {
   while (true) {
     int64_t realBatchSize = provider->getNextBatchInternal(batchSize, &batch);
     if (realBatchSize) {
-      CHECK_LE((size_t)realBatchSize, batchSize);
+      CHECK_LE(static_cast<size_t>(realBatchSize), batchSize);
     } else {
       break;
     }
diff --git a/paddle/gserver/tests/test_PyDataProvider2.py b/paddle/gserver/tests/test_PyDataProvider2.py
index f7b540013e..2e6225519f 100644
--- a/paddle/gserver/tests/test_PyDataProvider2.py
+++ b/paddle/gserver/tests/test_PyDataProvider2.py
@@ -17,7 +17,7 @@ import random
 from paddle.trainer.PyDataProvider2 import *
 
 
-@provider(input_types=[dense_vector(200, seq_type=SequenceType.NO_SEQUENCE)])
+@provider(slots=[dense_vector(200, seq_type=SequenceType.NO_SEQUENCE)])
 def test_dense_no_seq(setting, filename):
     for i in xrange(200):
         yield [(float(j - 100) * float(i + 1)) / 200.0 for j in xrange(200)]
diff --git a/paddle/gserver/tests/test_RecurrentGradientMachine.cpp b/paddle/gserver/tests/test_RecurrentGradientMachine.cpp
index a351667d8b..150850da4d 100644
--- a/paddle/gserver/tests/test_RecurrentGradientMachine.cpp
+++ b/paddle/gserver/tests/test_RecurrentGradientMachine.cpp
@@ -20,7 +20,7 @@ limitations under the License. */
 #include <paddle/utils/Util.h>
 #include <paddle/utils/Version.h>
 
-P_DECLARE_int32(seed);
+DECLARE_int32(seed);
 
 using namespace paddle;  // NOLINT
 using namespace std;     // NOLINT
@@ -28,7 +28,7 @@ class TrainerForTest : public paddle::Trainer {
 public:
   void startTrain() {
     GradientMachine& gm = *this->trainerInternal_.getGradientMachine();
-    gm.start(this->getConfig(), dataProvider_);
+    gm.start();
   }
 
   void finishTrain() {
@@ -155,13 +155,14 @@ TEST(RecurrentGradientMachine, rnn_multi_unequalength_input) {
 }
 
 int main(int argc, char** argv) {
+  testing::InitGoogleTest(&argc, argv);
+
   if (paddle::version::isWithPyDataProvider()) {
     if (!paddle::version::isWithGpu()) {
       FLAGS_use_gpu = false;
     }
     initMain(argc, argv);
     initPython(argc, argv);
-    testing::InitGoogleTest(&argc, argv);
     return RUN_ALL_TESTS();
   } else {
     return 0;
diff --git a/paddle/gserver/tests/test_RecurrentLayer.cpp b/paddle/gserver/tests/test_RecurrentLayer.cpp
index cd96ca7c84..16ab0e6aec 100644
--- a/paddle/gserver/tests/test_RecurrentLayer.cpp
+++ b/paddle/gserver/tests/test_RecurrentLayer.cpp
@@ -19,13 +19,13 @@ limitations under the License. */
 #include "paddle/gserver/layers/DataLayer.h"
 #include "paddle/gserver/layers/Layer.h"
 
-#include "TestUtil.h"
+#include "paddle/testing/TestUtil.h"
 
 using namespace paddle;  // NOLINT
 using namespace std;     // NOLINT
-P_DECLARE_bool(use_gpu);
-P_DECLARE_bool(rnn_use_batch);
-P_DECLARE_int32(fixed_seq_length);
+DECLARE_bool(use_gpu);
+DECLARE_bool(rnn_use_batch);
+DECLARE_int32(fixed_seq_length);
 
 void checkError(const Matrix& matrix1, const Matrix& matrix2) {
   CHECK(matrix1.getHeight() == matrix2.getHeight());
diff --git a/paddle/gserver/tests/test_SelectiveFCLayer.cpp b/paddle/gserver/tests/test_SelectiveFCLayer.cpp
index 4f3a95a535..ab23d00a2c 100644
--- a/paddle/gserver/tests/test_SelectiveFCLayer.cpp
+++ b/paddle/gserver/tests/test_SelectiveFCLayer.cpp
@@ -29,11 +29,11 @@ limitations under the License. */
 using namespace paddle;  // NOLINT
 using namespace std;     // NOLINT
 
-P_DECLARE_bool(use_gpu);
-P_DECLARE_int32(num_passes);
-P_DECLARE_string(config);
-P_DECLARE_string(init_model_path);
-P_DECLARE_string(config_args);
+DECLARE_bool(use_gpu);
+DECLARE_int32(num_passes);
+DECLARE_string(config);
+DECLARE_string(init_model_path);
+DECLARE_string(config_args);
 
 size_t fcLayerWidth = 1024;
 
diff --git a/paddle/gserver/tests/test_WarpCTCLayer.cpp b/paddle/gserver/tests/test_WarpCTCLayer.cpp
index 700425412c..23ae95852e 100644
--- a/paddle/gserver/tests/test_WarpCTCLayer.cpp
+++ b/paddle/gserver/tests/test_WarpCTCLayer.cpp
@@ -20,12 +20,12 @@ limitations under the License. */
 #include "paddle/gserver/layers/Layer.h"
 #include "paddle/gserver/layers/WarpCTCLayer.h"
 
-#include "TestUtil.h"
+#include "paddle/testing/TestUtil.h"
 
 using namespace paddle;  // NOLINT
 using namespace std;     // NOLINT
 
-P_DECLARE_bool(use_gpu);
+DECLARE_bool(use_gpu);
 
 const real* getData(const Matrix& matrix) {
   if (matrix.useGpu()) {
@@ -242,9 +242,3 @@ TEST(Layer, WarpCTCLayer) {
     }
   }
 }
-
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  initMain(argc, argv);
-  return RUN_ALL_TESTS();
-}
diff --git a/paddle/math/BaseMatrix.h b/paddle/math/BaseMatrix.h
index 2933c20fba..8691c87ac3 100644
--- a/paddle/math/BaseMatrix.h
+++ b/paddle/math/BaseMatrix.h
@@ -16,7 +16,7 @@ limitations under the License. */
 #include <stdint.h>
 #include <cstddef>
 #include "TensorExpression.h"
-#include "paddle/utils/TypeDefs.h"
+#include "paddle/utils/Common.h"
 
 namespace paddle {
 
diff --git a/paddle/math/CpuSparseMatrix.cpp b/paddle/math/CpuSparseMatrix.cpp
index b5d5b6ef61..82a482f701 100644
--- a/paddle/math/CpuSparseMatrix.cpp
+++ b/paddle/math/CpuSparseMatrix.cpp
@@ -163,15 +163,16 @@ MatrixPtr CpuSparseMatrix::getTranspose() {
 
 SparseValueType CpuSparseMatrix::getValueType() { return valueType_; }
 
-void CpuSparseMatrix::mul(MatrixPtr a, MatrixPtr b, real scaleAB, real scaleT) {
+void CpuSparseMatrix::mul(const Matrix& a,
+                          const Matrix& b,
+                          real scaleAB,
+                          real scaleT) {
   CHECK(!isTransposed()) << "Not supported";
+  const auto a_ptr = dynamic_cast<const CpuMatrix*>(&a);
+  const auto b_ptr = dynamic_cast<const CpuMatrix*>(&b);
 
-  if (dynamic_cast<CpuMatrix*>(a.get()) && dynamic_cast<CpuMatrix*>(b.get())) {
-    CpuMatrix::mul(dynamic_cast<CpuMatrix*>(a.get()),
-                   dynamic_cast<CpuMatrix*>(b.get()),
-                   this,
-                   scaleAB,
-                   scaleT);
+  if (a_ptr && b_ptr) {
+    CpuMatrix::mul((CpuMatrix*)a_ptr, (CpuMatrix*)b_ptr, this, scaleAB, scaleT);
   } else {
     LOG(FATAL) << "not supported";
   }
diff --git a/paddle/math/CpuSparseMatrix.h b/paddle/math/CpuSparseMatrix.h
index 9676f8864f..d3e8871cb5 100644
--- a/paddle/math/CpuSparseMatrix.h
+++ b/paddle/math/CpuSparseMatrix.h
@@ -203,7 +203,7 @@ public:
   /// mem MUST be alloced outside (memAlloc=false)
   void transpose(MatrixPtr matTrans, bool memAlloc);
 
-  void mul(MatrixPtr A, MatrixPtr B, real alpha, real beta);
+  void mul(const Matrix& A, const Matrix& B, real alpha, real beta);
 
   /**
    * @brief sparseMatrix += denseMatrix
diff --git a/paddle/math/Matrix.cpp b/paddle/math/Matrix.cpp
index c69e074a76..3ae237bc7d 100644
--- a/paddle/math/Matrix.cpp
+++ b/paddle/math/Matrix.cpp
@@ -582,18 +582,16 @@ void GpuMatrix::mul(const GpuMatrix& a,
 }
 
 /* this = a*b */
-void GpuMatrix::mul(const MatrixPtr a, const MatrixPtr b) {
-  mul(a, b, 1.0, 0.0);
-}
+void GpuMatrix::mul(const Matrix& a, const Matrix& b) { mul(a, b, 1.0, 0.0); }
 
-void GpuMatrix::mul(const MatrixPtr a,
-                    const MatrixPtr b,
+void GpuMatrix::mul(const Matrix& a,
+                    const Matrix& b,
                     real scaleAB,
                     real scaleT) {
-  GpuMatrixPtr a_ptr = std::dynamic_pointer_cast<GpuMatrix>(a);
-  GpuMatrixPtr b_ptr = std::dynamic_pointer_cast<GpuMatrix>(b);
-  GpuSparseMatrixPtr a_ptr_s = std::dynamic_pointer_cast<GpuSparseMatrix>(a);
-  GpuSparseMatrixPtr b_ptr_s = std::dynamic_pointer_cast<GpuSparseMatrix>(b);
+  const auto a_ptr = dynamic_cast<const GpuMatrix*>(&a);
+  const auto b_ptr = dynamic_cast<const GpuMatrix*>(&b);
+  const auto a_ptr_s = dynamic_cast<const GpuSparseMatrix*>(&a);
+  const auto b_ptr_s = dynamic_cast<const GpuSparseMatrix*>(&b);
 
   if (a_ptr && b_ptr) {
     mul(*a_ptr, *b_ptr, scaleAB, scaleT);
@@ -766,20 +764,19 @@ void GpuMatrix::maxoutBackward(Matrix& a,
 }
 
 /*calulate the error of classification */
-void GpuMatrix::classificationError(MatrixPtr output, IVectorPtr label) {
-  GpuMatrixPtr output_ptr = std::dynamic_pointer_cast<GpuMatrix>(output);
-  GpuIVectorPtr label_ptr = std::dynamic_pointer_cast<GpuIVector>(label);
-
+void GpuMatrix::classificationError(Matrix& output, IVector& label) {
+  auto output_ptr = dynamic_cast<GpuMatrix*>(&output);  // null unless GpuMatrix
+  auto label_ptr = dynamic_cast<GpuIVector*>(&label);  // null unless GpuIVector
   CHECK(output_ptr && label_ptr) << "Invalid argument pointer";
 
   CHECK(height_ == output_ptr->height_ && width_ == 1)
       << "Matrix dimensions are not equal";
-  real* output_d = output_ptr->data_;
-  real* recResult_d = data_;
-  int* label_d = label_ptr->getData();
 
-  hl_matrix_classification_error(
-      output_d, label_d, recResult_d, height_, output_ptr->width_);
+  hl_matrix_classification_error(output_ptr->data_,  // no const to cast away
+                                 label_ptr->getData(),
+                                 data_,  // result buffer of this matrix
+                                 height_,
+                                 output_ptr->width_);
 }
 
 /* copy -log(output[i * width + label]) to this->data[i] */
@@ -1265,69 +1262,6 @@ void GpuMatrix::avgPoolBackward(Matrix& outGrad,
                       outGrad.getStride());
 }
 
-void GpuMatrix::crossMapNormalFwd(Matrix& input,
-                                  size_t imgSizeH,
-                                  size_t imgSizeW,
-                                  Matrix& denoms,
-                                  size_t channels,
-                                  size_t sizeX,
-                                  float scale,
-                                  float pow) {
-  size_t num = input.getHeight();
-  size_t height = imgSizeH;
-  size_t width = imgSizeW;
-
-  CHECK(height * width * channels == input.getWidth());
-  CHECK(denoms.getHeight() == input.getHeight() &&
-        denoms.getWidth() == input.getWidth() && input.getHeight() == height_ &&
-        input.getWidth() == width_);
-  hl_CMRNorm_forward(num,
-                     input.getData(),
-                     denoms.getData(),
-                     data_,
-                     channels,
-                     height,
-                     width,
-                     sizeX,
-                     scale,
-                     -pow);
-}
-
-void GpuMatrix::crossMapNormalBwd(Matrix& localGrad,
-                                  Matrix& denoms,
-                                  Matrix& preOutV,
-                                  Matrix& localOutV,
-                                  size_t channels,
-                                  size_t imgSizeH,
-                                  size_t imgSizeW,
-                                  size_t sizeX,
-                                  float scale,
-                                  float pow) {
-  size_t num = preOutV.getHeight();
-  size_t height = imgSizeH;
-  size_t width = imgSizeW;
-
-  CHECK(width * height * channels == preOutV.getWidth());
-  CHECK(denoms.getHeight() == preOutV.getHeight() &&
-        denoms.getWidth() == preOutV.getWidth() &&
-        preOutV.getHeight() == height_ && preOutV.getWidth() == width_);
-  CHECK(denoms.getHeight() == localGrad.getHeight() &&
-        denoms.getWidth() == localGrad.getWidth());
-
-  hl_CMRNorm_backward(num,
-                      preOutV.getData(),
-                      denoms.getData(),
-                      localOutV.getData(),
-                      localGrad.getData(),
-                      data_,
-                      channels,
-                      height,
-                      width,
-                      sizeX,
-                      -pow,
-                      2.0f * pow * scale);
-}
-
 void GpuMatrix::maxSequenceForward(Matrix& input,
                                    const IVector& sequence,
                                    IVector& index) {
@@ -1370,92 +1304,6 @@ void GpuMatrix::maxSequenceBackward(Matrix& outputGrad,
   hl_max_sequence_backward(outGrad, maxIndex, inputGrad, numSequences, dim);
 }
 
-void GpuMatrix::contextProjectionForward(MatrixPtr input,
-                                         MatrixPtr weight,
-                                         const IVector& sequence,
-                                         int contextLength,
-                                         int contextStart,
-                                         size_t beginPad,
-                                         bool isPadding) {
-  CHECK(dynamic_cast<GpuMatrix*>(input.get()));
-  CHECK(dynamic_cast<const GpuIVector*>(&sequence));
-  if (weight) CHECK(dynamic_cast<GpuMatrix*>(weight.get()));
-
-  size_t numSequences = sequence.getSize() - 1;
-  int64_t inputDim = input->getWidth();
-  int64_t dim = getWidth();
-  CHECK_EQ(dim, inputDim * contextLength);
-
-  real* outData = getData();
-  real* inputData = input->getData();
-  const int* starts = sequence.getData();
-
-  hl_context_projection_forward(inputData,
-                                starts,
-                                isPadding ? weight->getData() : NULL,
-                                outData,
-                                numSequences,
-                                inputDim,
-                                contextLength,
-                                contextStart,
-                                beginPad,
-                                isPadding);
-}
-
-void GpuMatrix::contextProjectionBackwardData(MatrixPtr inputGrad,
-                                              const IVector& sequence,
-                                              int contextLength,
-                                              int contextStart) {
-  CHECK(dynamic_cast<GpuMatrix*>(inputGrad.get()));
-  CHECK(dynamic_cast<const GpuIVector*>(&sequence));
-
-  size_t numSequences = sequence.getSize() - 1;
-  int64_t inputDim = inputGrad->getWidth();
-  int64_t dim = getWidth();
-  CHECK_EQ(dim, inputDim * contextLength);
-
-  real* outGrad = getData();
-  real* inGrad = inputGrad->getData();
-  const int* starts = sequence.getData();
-
-  hl_context_projection_backward_data(outGrad,
-                                      starts,
-                                      inGrad,
-                                      numSequences,
-                                      inputDim,
-                                      contextLength,
-                                      contextStart);
-}
-
-void GpuMatrix::contextProjectionBackwardWeight(MatrixPtr weightGrad,
-                                                const IVector& sequence,
-                                                int contextLength,
-                                                int contextStart,
-                                                int totalPad,
-                                                size_t beginPad) {
-  CHECK(dynamic_cast<GpuMatrix*>(weightGrad.get()));
-  CHECK(dynamic_cast<const GpuIVector*>(&sequence));
-
-  size_t numSequences = sequence.getSize() - 1;
-  int64_t weightDim = weightGrad->getWidth();
-  int64_t dim = getWidth();
-  CHECK_EQ(dim, weightDim * contextLength);
-
-  real* outGrad = getData();
-  real* wtGrad = weightGrad->getData();
-  const int* starts = sequence.getData();
-
-  hl_context_projection_backward_weight(outGrad,
-                                        starts,
-                                        wtGrad,
-                                        numSequences,
-                                        weightDim,
-                                        totalPad,
-                                        contextLength,
-                                        contextStart,
-                                        beginPad);
-}
-
 void GpuMatrix::paramReluForward(Matrix& data, Matrix& W) {
   CHECK(data.useGpu_ == true && W.useGpu_ == true)
       << "Matrix type are not equal";
@@ -1463,7 +1311,9 @@ void GpuMatrix::paramReluForward(Matrix& data, Matrix& W) {
   real* w = W.getData();
   size_t numElements = data.getWidth();
   size_t numSamples = data.getHeight();
-  size_t partial_sum = numElements / (W.getHeight() * W.getWidth());
+  size_t paraSize = W.getHeight() * W.getWidth();
+  CHECK(!(numElements % paraSize));  // this check from ParameterReluLayer::init
+  size_t partial_sum = numElements / paraSize;
   real* output = getData();
   hl_param_relu_forward(output, input, w, numElements, numSamples, partial_sum);
 }
@@ -1476,7 +1326,9 @@ void GpuMatrix::paramReluBackwardW(Matrix& oGrad, Matrix& data) {
   real* wgrad = data_;
   size_t numElements = data.getWidth();
   size_t numSamples = data.getHeight();
-  size_t partial_sum = numElements / (this->getHeight() * this->getWidth());
+  size_t paraSize = this->getHeight() * this->getWidth();
+  CHECK(!(numElements % paraSize));  // this check from ParameterReluLayer::init
+  size_t partial_sum = numElements / paraSize;
   hl_param_relu_backward_w(
       wgrad, ograd, input, numElements, numSamples, partial_sum);
 }
@@ -1488,7 +1340,9 @@ void GpuMatrix::paramReluBackwardDiff(Matrix& oGrad, Matrix& data, Matrix& W) {
   real* w = W.getData();
   size_t numElements = data.getWidth();
   size_t numSamples = data.getHeight();
-  size_t partial_sum = numElements / (W.getHeight() * W.getWidth());
+  size_t paraSize = W.getHeight() * W.getWidth();
+  CHECK(!(numElements % paraSize));  // this check from ParameterReluLayer::init
+  size_t partial_sum = numElements / paraSize;
   hl_param_relu_backward_diff(
       ograd, input, w, diff, numElements, numSamples, partial_sum);
 }
@@ -2219,84 +2073,6 @@ void CpuMatrix::avgPoolBackward(Matrix& input,
   }
 }
 
-void CpuMatrix::crossMapNormalFwd(Matrix& input,
-                                  size_t imgSizeH,
-                                  size_t imgSizeW,
-                                  Matrix& denoms,
-                                  size_t channels,
-                                  size_t sizeX,
-                                  float scale,
-                                  float pow) {
-  size_t num = input.getHeight();
-  size_t height = imgSizeH;
-  size_t width = imgSizeW;
-  size_t numCols = input.getWidth();
-  CHECK(height * width * channels == input.getWidth());
-  CHECK(denoms.getHeight() == input.getHeight() &&
-        denoms.getWidth() == input.getWidth() && input.getHeight() == height_ &&
-        input.getWidth() == width_);
-  real* imgData = input.getData();
-  real* diffData = input.getData();
-  real* targetData = getData();
-  size_t halfSize = sizeX / 2;
-  size_t imgPixels = height * width;
-
-  // use integral vector to implement the sum in local window
-  real* integralData =
-      (real*)malloc((channels + sizeX + 1) * sizeof(real));  // NOLINT // TODO:
-  for (size_t i = 0; i <= halfSize; i++) {
-    integralData[i] = 0;
-  }
-  for (size_t i = 0; i < num; i++) {
-    real* targetPtr = targetData + i * numCols;
-    real* imgPtr = imgData + i * numCols;
-    real* diffPtr = diffData + i * numCols;
-    for (size_t m = 0; m < height; m++) {
-      for (size_t n = 0; n < width; n++) {
-        for (size_t c = 0; c < channels; c++) {
-          integralData[c + halfSize + 1] =
-              integralData[c + halfSize] + _square(*(diffPtr + c * imgPixels));
-        }
-        for (size_t k = channels + halfSize + 1; k <= channels + sizeX; k++) {
-          integralData[k] = integralData[channels + halfSize];
-        }
-        for (size_t k = 0; k < channels; k += 1) {
-          real a = integralData[k + sizeX] - integralData[k];
-          a = scale * a + 1;
-          targetPtr[k * imgPixels] = imgPtr[k * imgPixels] * _pow(a, -pow);
-        }
-        diffPtr++;
-        targetPtr++;
-        imgPtr++;
-      }
-    }
-  }
-  free(integralData);
-  integralData = NULL;
-}
-
-void CpuMatrix::crossMapNormalBwd(Matrix& localGrad,
-                                  Matrix& denoms,
-                                  Matrix& preOutV,
-                                  Matrix& localOutV,
-                                  size_t channels,
-                                  size_t imgSizeH,
-                                  size_t imgSizeW,
-                                  size_t size,
-                                  float scale,
-                                  float pow) {
-  LOG(FATAL) << "Not implemented";
-
-  CHECK(imgSizeH * imgSizeW * channels == preOutV.getWidth());
-  CHECK(denoms.getHeight() == preOutV.getHeight() &&
-        denoms.getWidth() == preOutV.getWidth() &&
-        preOutV.getHeight() == height_ && preOutV.getWidth() == width_);
-  CHECK(denoms.getHeight() == localGrad.getHeight() &&
-        denoms.getWidth() == localGrad.getWidth());
-
-  // NOLINT // TODO:
-}
-
 /**
  * Input: one or more sequences. Each sequence contains some instances.
  * Output: output size is the number of input sequences (NOT input instances).
@@ -2371,120 +2147,6 @@ void CpuMatrix::maxSequenceBackward(Matrix& outputGrad,
   }
 }
 
-void CpuMatrix::contextProjectionForward(MatrixPtr input,
-                                         MatrixPtr weight,
-                                         const IVector& sequence,
-                                         int contextLength,
-                                         int contextStart,
-                                         size_t beginPad,
-                                         bool isPadding) {
-  CHECK(dynamic_cast<CpuMatrix*>(input.get()));
-  CHECK(dynamic_cast<const CpuIVector*>(&sequence));
-  if (weight) CHECK(dynamic_cast<CpuMatrix*>(weight.get()));
-
-  size_t numSequences = sequence.getSize() - 1;
-  int64_t inputDim = input->getWidth();
-  int64_t dim = getWidth();
-  CHECK_EQ(dim, inputDim * contextLength);
-  const int* starts = sequence.getData();
-
-  for (size_t i = 0; i < numSequences; ++i) {
-    for (int j = 0; j < contextLength; ++j) {
-      int begin = starts[i] + contextStart + j;
-      int end = starts[i + 1] + contextStart + j;
-      int dstBegin = starts[i];
-      int dstEnd = starts[i + 1];
-      if (begin < starts[i]) {
-        int64_t padSize =
-            std::min(starts[i] - begin, starts[i + 1] - starts[i]);
-        MatrixPtr mat = this->subMatrix(starts[i], padSize);
-        if (isPadding) {
-          MatrixPtr sub = weight->subMatrix(j, padSize);
-          mat->addAtOffset(*sub, j * inputDim);
-        }
-        dstBegin = starts[i] + padSize;
-        begin = starts[i];
-      }
-      if (end > starts[i + 1]) {
-        int64_t padSize =
-            std::min(end - starts[i + 1], starts[i + 1] - starts[i]);
-        MatrixPtr mat = this->subMatrix(starts[i + 1] - padSize, padSize);
-        if (isPadding) {
-          MatrixPtr sub =
-              weight->subMatrix(beginPad + contextStart + j - padSize, padSize);
-          mat->addAtOffset(*sub, j * inputDim);
-        }
-        dstEnd = starts[i + 1] - padSize;
-        end = starts[i + 1];
-      }
-      if (end <= begin) continue;
-      MatrixPtr src = input->subMatrix(begin, end - begin);
-      MatrixPtr dst = this->subMatrix(dstBegin, dstEnd - dstBegin);
-      dst->addAtOffset(*src, j * inputDim);
-    }
-  }
-}
-
-void CpuMatrix::contextProjectionBackward(MatrixPtr inputGrad,
-                                          MatrixPtr weightGrad,
-                                          const IVector& sequence,
-                                          int contextLength,
-                                          int contextStart,
-                                          size_t beginPad,
-                                          bool isPadding) {
-  if (inputGrad) CHECK(dynamic_cast<CpuMatrix*>(inputGrad.get()));
-  if (weightGrad) CHECK(dynamic_cast<CpuMatrix*>(weightGrad.get()));
-  CHECK(dynamic_cast<const CpuIVector*>(&sequence));
-
-  int64_t inputDim = 0;
-  int64_t dim = getWidth();
-  size_t numSequences = sequence.getSize() - 1;
-  const int* starts = sequence.getData();
-  if (inputGrad) {
-    inputDim = inputGrad->getWidth();
-  } else {
-    inputDim = weightGrad->getWidth();
-  }
-  CHECK_EQ(dim, inputDim * contextLength);
-
-  for (size_t i = 0; i < numSequences; ++i) {
-    for (int j = 0; j < contextLength; ++j) {
-      int begin = starts[i] + contextStart + j;
-      int end = starts[i + 1] + contextStart + j;
-      int dstBegin = starts[i];
-      int dstEnd = starts[i + 1];
-      if (begin < starts[i]) {
-        int64_t padSize =
-            std::min(starts[i] - begin, starts[i + 1] - starts[i]);
-        if (isPadding && weightGrad) {
-          MatrixPtr mat = this->subMatrix(starts[i], padSize);
-          MatrixPtr sub = weightGrad->subMatrix(j, padSize);
-          sub->addAtOffset(*mat, j * inputDim);
-        }
-        dstBegin = starts[i] + padSize;
-        begin = starts[i];
-      }
-      if (end > starts[i + 1]) {
-        int64_t padSize =
-            std::min(end - starts[i + 1], starts[i + 1] - starts[i]);
-        if (isPadding && weightGrad) {
-          MatrixPtr mat = this->subMatrix(starts[i + 1] - padSize, padSize);
-          MatrixPtr sub = weightGrad->subMatrix(
-              beginPad + contextStart + j - padSize, padSize);
-          sub->addAtOffset(*mat, j * inputDim);
-        }
-        dstEnd = starts[i + 1] - padSize;
-        end = starts[i + 1];
-      }
-      if (end <= begin) continue;
-      if (!inputGrad) continue;
-      MatrixPtr src = inputGrad->subMatrix(begin, end - begin);
-      MatrixPtr dst = this->subMatrix(dstBegin, dstEnd - dstBegin);
-      src->addAtOffset(*dst, j * inputDim);
-    }
-  }
-}
-
 inline void vecAddTo(real* a, const real* b, size_t len) {
   for (unsigned int i = 0; i < len; ++i) {
     a[i] += b[i];
@@ -2630,29 +2292,22 @@ void CpuMatrix::sequenceAvgForward(Matrix& a,
 }
 
 /* this = scaleAB*(a*b) + scaleT*this*/
-void CpuMatrix::mul(const MatrixPtr a,
-                    const MatrixPtr b,
+void CpuMatrix::mul(const Matrix& a,
+                    const Matrix& b,
                     real scaleAB,
                     real scaleT) {
   CHECK(!isTransposed()) << "Not supported";
+  const auto a_ptr = dynamic_cast<const CpuMatrix*>(&a);
+  const auto b_ptr = dynamic_cast<const CpuMatrix*>(&b);
+  const auto a_ptr_s = dynamic_cast<const CpuSparseMatrix*>(&a);
+  const auto b_ptr_s = dynamic_cast<const CpuSparseMatrix*>(&b);
 
-  if (dynamic_cast<CpuMatrix*>(a.get()) && dynamic_cast<CpuMatrix*>(b.get())) {
-    mul(dynamic_cast<CpuMatrix*>(a.get()),
-        dynamic_cast<CpuMatrix*>(b.get()),
-        scaleAB,
-        scaleT);
-  } else if (dynamic_cast<CpuSparseMatrix*>(a.get()) &&
-             dynamic_cast<CpuMatrix*>(b.get())) {
-    mul(dynamic_cast<CpuSparseMatrix*>(a.get()),
-        dynamic_cast<CpuMatrix*>(b.get()),
-        scaleAB,
-        scaleT);
-  } else if (dynamic_cast<CpuMatrix*>(a.get()) &&
-             dynamic_cast<CpuSparseMatrix*>(b.get())) {
-    mul(dynamic_cast<CpuMatrix*>(a.get()),
-        dynamic_cast<CpuSparseMatrix*>(b.get()),
-        scaleAB,
-        scaleT);
+  if (a_ptr && b_ptr) {
+    mul((CpuMatrix*)a_ptr, (CpuMatrix*)b_ptr, scaleAB, scaleT);
+  } else if (a_ptr_s && b_ptr) {
+    mul((CpuSparseMatrix*)a_ptr_s, (CpuMatrix*)b_ptr, scaleAB, scaleT);
+  } else if (a_ptr && b_ptr_s) {
+    mul((CpuMatrix*)a_ptr, (CpuSparseMatrix*)b_ptr_s, scaleAB, scaleT);
   } else {
     LOG(FATAL) << "Not supported";
   }
@@ -3321,7 +2976,7 @@ void CpuMatrix::addColumnVector(const Matrix& b) {
 }
 
 /* this = a*b */
-void CpuMatrix::mul(const MatrixPtr a, const MatrixPtr b) {
+void CpuMatrix::mul(const Matrix& a, const Matrix& b) {
   return mul(a, b, 1.0, 0.0);
 }
 
@@ -3544,21 +3199,20 @@ void CpuMatrix::rowNormalizeL1(Matrix& out) {
 }
 
 /* calulate classification error */
-void CpuMatrix::classificationError(MatrixPtr output, IVectorPtr label) {
-  CHECK(dynamic_cast<CpuMatrix*>(output.get()));
-  CHECK(dynamic_cast<CpuIVector*>(label.get()));
+void CpuMatrix::classificationError(Matrix& output, IVector& label) {
+  CHECK(dynamic_cast<const CpuMatrix*>(&output));
+  CHECK(dynamic_cast<const CpuIVector*>(&label));
 
-  size_t numSamples = getHeight();
-  size_t dim = output->getWidth();
-  CHECK_EQ(label->getSize(), numSamples);
-  CHECK_EQ(output->getHeight(), numSamples);
   CHECK_EQ(getWidth(), (size_t)1);
+  size_t numSamples = getHeight();
+  CHECK_EQ(label.getSize(), numSamples);
+  CHECK_EQ(output.getHeight(), numSamples);
 
-  real* out = output->getData();
-  real* result = getData();
-  int* lbl = label->getData();
-  real maxData;
-  int maxIndex;
+  size_t dim = output.getWidth();
+  real* out = output.getData();
+  int* lbl = label.getData();
+  real maxData = 0.0;
+  int maxIndex = -1;
   for (size_t i = 0; i < numSamples; ++i) {
     CHECK_GE(lbl[i], 0);
     CHECK_LT((size_t)lbl[i], dim);
@@ -3570,7 +3224,7 @@ void CpuMatrix::classificationError(MatrixPtr output, IVectorPtr label) {
         maxData = out[i * dim + j];
       }
     }
-    result[i] = (maxIndex != lbl[i]);
+    getData()[i] = (maxIndex != lbl[i]);
   }
 }
 
@@ -4116,7 +3770,9 @@ void CpuMatrix::paramReluForward(Matrix& data, Matrix& W) {
   real* w = W.getData();
   size_t numElements = data.getWidth();
   size_t numSamples = data.getHeight();
-  size_t partial_sum = numElements / (W.getHeight() * W.getWidth());
+  size_t paraSize = W.getHeight() * W.getWidth();
+  CHECK(!(numElements % paraSize));  // this check from ParameterReluLayer::init
+  size_t partial_sum = numElements / paraSize;
   for (size_t n = 0, k = 0; n < numSamples; ++n) {
     for (size_t i = 0; i < numElements; ++i, ++k) {
       data_[k] = input[k] > 0 ? input[k] : input[k] * w[i / partial_sum];
@@ -4130,7 +3786,9 @@ void CpuMatrix::paramReluBackwardW(Matrix& oGrad, Matrix& data) {
   real* wgrad = data_;
   size_t numElements = data.getWidth();
   size_t numSamples = data.getHeight();
-  size_t partial_sum = numElements / (this->getHeight() * this->getWidth());
+  size_t paraSize = this->getHeight() * this->getWidth();
+  CHECK(!(numElements % paraSize));  // this check from ParameterReluLayer::init
+  size_t partial_sum = numElements / paraSize;
   for (size_t n = 0, k = 0; n < numSamples; ++n) {
     for (size_t i = 0; i < numElements; ++i, ++k) {
       wgrad[i / partial_sum] += ograd[k] * (input[k] > 0 ? 0 : input[k]);
@@ -4145,7 +3803,9 @@ void CpuMatrix::paramReluBackwardDiff(Matrix& oGrad, Matrix& data, Matrix& W) {
   real* w = W.getData();
   size_t numElements = data.getWidth();
   size_t numSamples = data.getHeight();
-  size_t partial_sum = numElements / (W.getHeight() * W.getWidth());
+  size_t paraSize = W.getHeight() * W.getWidth();
+  CHECK(!(numElements % paraSize));  // same check as ParameterReluLayer::init
+  size_t partial_sum = numElements / paraSize;
   for (size_t n = 0, k = 0; n < numSamples; ++n) {
     for (size_t i = 0; i < numElements; ++i, ++k) {
       diff[k] += ograd[k] * (input[k] > 0 ? 1 : w[i / partial_sum]);
diff --git a/paddle/math/Matrix.h b/paddle/math/Matrix.h
index 4342ca52a3..dd24f8821d 100644
--- a/paddle/math/Matrix.h
+++ b/paddle/math/Matrix.h
@@ -26,8 +26,8 @@ limitations under the License. */
 #include "BaseMatrix.h"
 #include "MemoryHandle.h"
 #include "Vector.h"
+#include "paddle/utils/Common.h"
 #include "paddle/utils/ThreadLocal.h"
-#include "paddle/utils/TypeDefs.h"
 
 namespace paddle {
 
@@ -408,7 +408,7 @@ public:
     LOG(FATAL) << "Not implemented";
   }
 
-  virtual void addBias(Matrix& b, real scale, bool sharedBias) {
+  void addBias(Matrix& b, real scale, bool sharedBias) {
     if (!sharedBias) {
       addBias(b, scale);
     } else {
@@ -425,7 +425,7 @@ public:
     LOG(FATAL) << "Not implemented";
   }
 
-  virtual void collectBias(Matrix& a, real scale, bool sharedBias) {
+  void collectBias(Matrix& a, real scale, bool sharedBias) {
     if (!sharedBias) {
       collectBias(a, scale);
     } else {
@@ -444,8 +444,8 @@ public:
    * this = scaleAB*(a*b) + scaleT*this
    * @endcode
    */
-  virtual void mul(const MatrixPtr a,
-                   const MatrixPtr b,
+  virtual void mul(const Matrix& a,
+                   const Matrix& b,
                    real scaleAB,
                    real scaleT) {
     LOG(FATAL) << "Not implemented";
@@ -643,7 +643,7 @@ public:
    *  this = a*b
    * @endcode
    */
-  virtual void mul(const MatrixPtr a, const MatrixPtr b) {
+  virtual void mul(const Matrix& a, const Matrix& b) {  // stub: logs FATAL
     LOG(FATAL) << "Not implemented";
   }
 
@@ -835,7 +835,7 @@ public:
    *
    * output[i] = 0 if row i is correct.
    */
-  virtual void classificationError(MatrixPtr output, IVectorPtr label) {
+  virtual void classificationError(Matrix& output, IVector& label) {  // stub
     LOG(FATAL) << "Not implemented";
   }
 
@@ -952,31 +952,6 @@ public:
     LOG(FATAL) << "Not implemeted";
   }
 
-  /// normalize-operation.
-  virtual void crossMapNormalFwd(Matrix& input,
-                                 size_t imgSizeH,
-                                 size_t imgSizeW,
-                                 Matrix& denoms,
-                                 size_t channels,
-                                 size_t sizeX,
-                                 float scale,
-                                 float pow) {
-    LOG(FATAL) << "Not implemeted";
-  }
-
-  virtual void crossMapNormalBwd(Matrix& localGrad,
-                                 Matrix& denoms,
-                                 Matrix& preOutV,
-                                 Matrix& localOutV,
-                                 size_t channels,
-                                 size_t imgSizeH,
-                                 size_t imgSizeW,
-                                 size_t size,
-                                 float scale,
-                                 float pow) {
-    LOG(FATAL) << "Not implemeted";
-  }
-
   /**
    * Input: one or more sequences. Each sequence contains some instances.
    *
@@ -997,42 +972,6 @@ public:
     LOG(FATAL) << "Not implemeted";
   }
 
-  virtual void contextProjectionForward(MatrixPtr input,
-                                        MatrixPtr weight,
-                                        const IVector& sequence,
-                                        int contextLength,
-                                        int contextStart,
-                                        size_t beginPad,
-                                        bool isPadding) {
-    LOG(FATAL) << "Not implemeted";
-  }
-
-  virtual void contextProjectionBackward(MatrixPtr inputGrad,
-                                         MatrixPtr weightGrad,
-                                         const IVector& sequence,
-                                         int contextLength,
-                                         int contextStart,
-                                         size_t beginPad,
-                                         bool isPadding) {
-    LOG(FATAL) << "Not implemeted";
-  }
-
-  virtual void contextProjectionBackwardData(MatrixPtr inputGrad,
-                                             const IVector& sequence,
-                                             int contextLength,
-                                             int contextStart) {
-    LOG(FATAL) << "Not implemeted";
-  }
-
-  virtual void contextProjectionBackwardWeight(MatrixPtr weightGrad,
-                                               const IVector& sequence,
-                                               int contextLength,
-                                               int contextStart,
-                                               int totalPad,
-                                               size_t beginPad) {
-    LOG(FATAL) << "Not implemeted";
-  }
-
   /**
    * @code
    * this.row[i] += table.row[ids[i]]
@@ -1152,6 +1091,10 @@ public:
       TensorCpuApply<real>(*this, expr);
     }
   }
+
+  bool isEmpty() const { return data_ == nullptr; }  // true iff data_ is null
+
+  explicit operator bool() const { return !isEmpty(); }  // true when non-empty
 };
 
 inline std::ostream& operator<<(std::ostream& os, const Matrix& mat) {
@@ -1272,14 +1215,14 @@ public:
    * this = scaleAB*(a*b) + scaleT*this
    * @endcode
    */
-  void mul(const MatrixPtr a, const MatrixPtr b, real scaleAB, real scaleT);
+  void mul(const Matrix& a, const Matrix& b, real scaleAB, real scaleT);
 
   /**
    * @code
    * this = a*b
    * @endcode
    */
-  void mul(const MatrixPtr a, const MatrixPtr b);
+  void mul(const Matrix& a, const Matrix& b);
 
   void mul(const GpuMatrix& a, const GpuMatrix& b, real scaleAB, real scaleT);
 
@@ -1373,7 +1316,7 @@ public:
   void check(std::ostream& os, Matrix& refMat, bool printDiff = true);
   void randomizeUniform();
 
-  void classificationError(MatrixPtr output, IVectorPtr label);
+  void classificationError(Matrix& output, IVector& label);
 
   void convExpand(Matrix& feature,
                   int feaImgHeight,
@@ -1459,26 +1402,6 @@ public:
                        size_t paddingH,
                        size_t paddingW);
 
-  void crossMapNormalFwd(Matrix& input,
-                         size_t imgSizeH,
-                         size_t imgSizeW,
-                         Matrix& denoms,
-                         size_t channels,
-                         size_t sizeX,
-                         float scale,
-                         float pow);
-
-  void crossMapNormalBwd(Matrix& localGrad,
-                         Matrix& denoms,
-                         Matrix& preOutV,
-                         Matrix& localOutV,
-                         size_t channels,
-                         size_t imgSizeH,
-                         size_t imgSizeW,
-                         size_t sizeX,
-                         float scale,
-                         float pow);
-
   void maxSequenceForward(Matrix& input,
                           const IVector& sequence,
                           IVector& index);
@@ -1487,26 +1410,6 @@ public:
                            const IVector& sequence,
                            IVector& index);
 
-  void contextProjectionForward(MatrixPtr input,
-                                MatrixPtr weight,
-                                const IVector& sequence,
-                                int contextLength,
-                                int contextStart,
-                                size_t beginPad,
-                                bool isPadding);
-
-  void contextProjectionBackwardData(MatrixPtr inputGrad,
-                                     const IVector& sequence,
-                                     int contextLength,
-                                     int contextStart);
-
-  void contextProjectionBackwardWeight(MatrixPtr weightGrad,
-                                       const IVector& sequence,
-                                       int contextLength,
-                                       int contextStart,
-                                       int totalPad,
-                                       size_t beginPad);
-
   void bilinearForward(const Matrix& in,
                        const size_t inImgH,
                        const size_t inImgW,
@@ -1685,26 +1588,6 @@ public:
                        size_t paddingH,
                        size_t paddingW);
 
-  void crossMapNormalFwd(Matrix& input,
-                         size_t imgSizeH,
-                         size_t imgSizeW,
-                         Matrix& denoms,
-                         size_t channels,
-                         size_t sizeX,
-                         float scale,
-                         float pow);
-
-  void crossMapNormalBwd(Matrix& localGrad,
-                         Matrix& denoms,
-                         Matrix& preOutV,
-                         Matrix& localOutV,
-                         size_t channels,
-                         size_t imgSizeH,
-                         size_t imgSizeW,
-                         size_t sizeX,
-                         float scale,
-                         float pow);
-
   void maxSequenceForward(Matrix& input,
                           const IVector& sequence,
                           IVector& index);
@@ -1713,22 +1596,6 @@ public:
                            const IVector& sequence,
                            IVector& index);
 
-  void contextProjectionForward(MatrixPtr input,
-                                MatrixPtr weight,
-                                const IVector& sequence,
-                                int contextLength,
-                                int contextStart,
-                                size_t beginPad,
-                                bool isPadding);
-
-  void contextProjectionBackward(MatrixPtr inputGrad,
-                                 MatrixPtr weightGrad,
-                                 const IVector& sequence,
-                                 int contextLength,
-                                 int contextStart,
-                                 size_t beginPad,
-                                 bool isPadding);
-
   real* getRow(size_t row) { return BaseMatrix::rowBuf(row); }
   virtual real* getRowBuf(size_t row) { return getRow(row); }
 
@@ -1784,7 +1651,7 @@ public:
 
   void addColumnVector(const Matrix& b);
 
-  void mul(const MatrixPtr a, const MatrixPtr b, real scaleAB, real scaleT);
+  void mul(const Matrix& a, const Matrix& b, real scaleAB, real scaleT);
   void mul(CpuMatrix* a, CpuMatrix* b, real scaleAB, real scaleT);
 
   void mul(CpuMatrix* a, CpuSparseMatrix* b, real scaleAB, real scaleT);
@@ -1807,7 +1674,7 @@ public:
 
   virtual void mul(CpuSparseMatrix* a, CpuMatrix* b, real scaleAB, real scaleT);
 
-  void mul(const MatrixPtr a, const MatrixPtr b);
+  void mul(const Matrix& a, const Matrix& b);
 
   void rightMul(Matrix& b, real scaleAB, real scaleT);
   void rightMul(Matrix& b);
@@ -1881,7 +1748,7 @@ public:
 
   void randomizeUniform();
 
-  void classificationError(MatrixPtr output, IVectorPtr label);
+  void classificationError(Matrix& output, IVector& label);
 
   void addByBitCode(size_t numClasses, const IVector& codes, const Matrix& vec);
 
@@ -1973,8 +1840,8 @@ public:
 
 public:
   virtual void mul(CpuSparseMatrix* a, CpuMatrix* b, real scaleAB, real scaleT);
-  void add(Matrix& b, real p1, real p2);
-  void add(real p1, real p2);
+  virtual void add(Matrix& b, real p1, real p2);
+  virtual void add(real p1, real p2);
 
 private:
   using Matrix::mul;
diff --git a/paddle/math/SparseMatrix.cpp b/paddle/math/SparseMatrix.cpp
index 9154503c21..720a035ecb 100644
--- a/paddle/math/SparseMatrix.cpp
+++ b/paddle/math/SparseMatrix.cpp
@@ -571,49 +571,48 @@ void GpuSparseMatrix::transpose(MatrixPtr matTrans, bool memAlloc) {
   hl_stream_synchronize(stream);
 }
 
-void GpuSparseMatrix::mul(const GpuMatrixPtr a,
-                          const GpuMatrixPtr b,
+void GpuSparseMatrix::mul(const GpuMatrix& a,
+                          const GpuMatrix& b,
                           real scaleAB,
                           real scaleT) {
-  CHECK(a->useGpu_ && b->useGpu_) << "type not match";
+  CHECK(a.useGpu_ && b.useGpu_) << "type not match";  // both inputs on GPU
   CHECK(!trans_) << "trans not supported";
-  real* A_d = a->getData();
-  real* B_d = b->getData();
+  real* A_d = (real*)a.getData();  // NOTE(review): drops const? confirm
+  real* B_d = (real*)b.getData();  // NOTE(review): drops const? confirm
   hl_sparse_matrix_s C_d = sMatrix_.get();
-  hl_trans_op_t a_trans = a->trans_ ? HPPL_OP_T : HPPL_OP_N;
-  hl_trans_op_t b_trans = b->trans_ ? HPPL_OP_T : HPPL_OP_N;
-
-  if (!a->trans_ && !b->trans_) {
-    CHECK(height_ == a->getHeight());
-    CHECK(width_ == b->getWidth());
-    CHECK(a->getWidth() == b->getHeight());
-  } else if (a->trans_ && !b->trans_) {
-    CHECK(height_ == a->getWidth());
-    CHECK(width_ == b->getWidth());
-    CHECK(a->getHeight() == b->getHeight());
-  } else if (!a->trans_ && b->trans_) {
-    CHECK(height_ == a->getHeight());
-    CHECK(width_ == b->getHeight());
-    CHECK(a->getWidth() == b->getWidth());
+  hl_trans_op_t a_trans = a.trans_ ? HPPL_OP_T : HPPL_OP_N;
+  hl_trans_op_t b_trans = b.trans_ ? HPPL_OP_T : HPPL_OP_N;
+
+  if (!a.trans_ && !b.trans_) {  // shape checks for this = a * b
+    CHECK(height_ == a.getHeight());
+    CHECK(width_ == b.getWidth());
+    CHECK(a.getWidth() == b.getHeight());
+  } else if (a.trans_ && !b.trans_) {  // shape checks for this = a^T * b
+    CHECK(height_ == a.getWidth());
+    CHECK(width_ == b.getWidth());
+    CHECK(a.getHeight() == b.getHeight());
+  } else if (!a.trans_ && b.trans_) {  // shape checks for this = a * b^T
+    CHECK(height_ == a.getHeight());
+    CHECK(width_ == b.getHeight());
+    CHECK(a.getWidth() == b.getWidth());
   } else {
     LOG(INFO) << "Not support";
   }
   int dimM = height_;
   int dimN = width_;
-  int dimK = !b->trans_ ? b->getHeight() : b->getWidth();
+  int dimK = !b.trans_ ? b.getHeight() : b.getWidth();  // inner dim of a*b
   hl_sparse_matrix_mul(
       A_d, a_trans, B_d, b_trans, C_d, dimM, dimN, dimK, scaleAB, scaleT);
 }
 
-void GpuSparseMatrix::mul(const MatrixPtr a,
-                          const MatrixPtr b,
+void GpuSparseMatrix::mul(const Matrix& a,
+                          const Matrix& b,
                           real scaleAB,
                           real scaleT) {
-  if (std::dynamic_pointer_cast<GpuMatrix>(a) &&
-      std::dynamic_pointer_cast<GpuMatrix>(b)) {
-    GpuMatrixPtr a_ptr = std::dynamic_pointer_cast<GpuMatrix>(a);
-    GpuMatrixPtr b_ptr = std::dynamic_pointer_cast<GpuMatrix>(b);
-    mul(a_ptr, b_ptr, scaleAB, scaleT);
+  const auto a_ptr = dynamic_cast<const GpuMatrix*>(&a);
+  const auto b_ptr = dynamic_cast<const GpuMatrix*>(&b);
+  if (a_ptr && b_ptr) {
+    mul(*a_ptr, *b_ptr, scaleAB, scaleT);
   } else {
     LOG(FATAL) << "not supported";
   }
diff --git a/paddle/math/SparseMatrix.h b/paddle/math/SparseMatrix.h
index bd96a3301d..1d3801548e 100644
--- a/paddle/math/SparseMatrix.h
+++ b/paddle/math/SparseMatrix.h
@@ -104,10 +104,7 @@ public:
                  size_t newNnz,
                  SparseValueType valueType);
 
-  void mul(const GpuMatrixPtr a,
-           const GpuMatrixPtr b,
-           real scaleAB,
-           real scaleT);
+  void mul(const GpuMatrix& a, const GpuMatrix& b, real scaleAB, real scaleT);
   /// B = A , B.trans = !A.trans
   MatrixPtr getTranspose();
 
@@ -218,7 +215,7 @@ protected:
   void copyRow(int offsets, size_t colNum, const sparse_float_value_t* row);
 
 public:
-  void mul(const MatrixPtr a, const MatrixPtr b, real scaleAB, real scaleT);
+  void mul(const Matrix& a, const Matrix& b, real scaleAB, real scaleT);
 
   void copyFrom(CpuSparseMatrix& src, hl_stream_t stream);
   void copyFrom(GpuSparseMatrix& src, hl_stream_t stream);
diff --git a/paddle/math/SparseRowMatrix.cpp b/paddle/math/SparseRowMatrix.cpp
index 3091743123..b61c6b2d49 100644
--- a/paddle/math/SparseRowMatrix.cpp
+++ b/paddle/math/SparseRowMatrix.cpp
@@ -24,9 +24,9 @@ limitations under the License. */
 #include "paddle/utils/Thread.h"
 #include "paddle/utils/Util.h"
 
-P_DEFINE_bool(allow_inefficient_sparse_update,
-              false,
-              "Whether to allow inefficient sparse update");
+DEFINE_bool(allow_inefficient_sparse_update,
+            false,
+            "Whether to allow inefficient sparse update");
 
 namespace paddle {
 
diff --git a/paddle/math/SparseRowMatrix.h b/paddle/math/SparseRowMatrix.h
index dd4d85611d..d7dfb2fe57 100644
--- a/paddle/math/SparseRowMatrix.h
+++ b/paddle/math/SparseRowMatrix.h
@@ -14,14 +14,14 @@ limitations under the License. */
 
 #pragma once
 
+#include <gflags/gflags.h>
 #include <string.h>
 #include <algorithm>
 #include "Matrix.h"
 #include "RowBuffer.h"
 #include "paddle/utils/CommandLineParser.h"
 #include "paddle/utils/Util.h"
-
-P_DECLARE_bool(allow_inefficient_sparse_update);
+DECLARE_bool(allow_inefficient_sparse_update);
 
 namespace paddle {
 
diff --git a/paddle/math/Storage.cpp b/paddle/math/Storage.cpp
index f9a2c12cd5..56e5442394 100644
--- a/paddle/math/Storage.cpp
+++ b/paddle/math/Storage.cpp
@@ -16,9 +16,9 @@ limitations under the License. */
 #include "Allocator.h"
 #include "paddle/utils/Util.h"
 
-P_DEFINE_int32(pool_limit_size,
-               536870912,
-               "maximum memory size managed by a memory pool, default is 512M");
+DEFINE_int32(pool_limit_size,
+             536870912,
+             "maximum memory size managed by a memory pool, default is 512M");
 
 namespace paddle {
 
diff --git a/paddle/math/TensorExpression.h b/paddle/math/TensorExpression.h
index 9bd789e8c5..6fd60e7f3c 100644
--- a/paddle/math/TensorExpression.h
+++ b/paddle/math/TensorExpression.h
@@ -16,8 +16,8 @@ limitations under the License. */
 #include <stdint.h>
 #include <cstddef>
 #include "hl_tensor_ops.h"
+#include "paddle/utils/Common.h"
 #include "paddle/utils/Logging.h"
-#include "paddle/utils/TypeDefs.h"
 
 namespace paddle {
 
diff --git a/paddle/math/Vector.h b/paddle/math/Vector.h
index 8a24103bd4..9af6e30c9e 100644
--- a/paddle/math/Vector.h
+++ b/paddle/math/Vector.h
@@ -21,8 +21,8 @@ limitations under the License. */
 
 #include "BaseMatrix.h"
 #include "MemoryHandle.h"
+#include "paddle/utils/Common.h"
 #include "paddle/utils/Thread.h"
-#include "paddle/utils/TypeDefs.h"
 
 namespace paddle {
 
diff --git a/paddle/math/tests/CMakeLists.txt b/paddle/math/tests/CMakeLists.txt
index 9403bb073a..ceb96b2e25 100644
--- a/paddle/math/tests/CMakeLists.txt
+++ b/paddle/math/tests/CMakeLists.txt
@@ -8,8 +8,7 @@ add_simple_unittest(test_RowBuffer)
 
 # TODO(yuyang18): Refactor TestUtil.cpp. Remove this cross module reference.
 add_unittest(test_matrixCompare
-    test_matrixCompare.cpp
-    ../../gserver/tests/TestUtil.cpp)
+    test_matrixCompare.cpp)
 
 add_simple_unittest(test_sparseMatrixCompare)
 add_simple_unittest(test_perturbation)
@@ -17,12 +16,10 @@ add_simple_unittest(test_CpuGpuVector)
 add_simple_unittest(test_Allocator)
 
 if(WITH_GPU)
-    if(COMPILER_SUPPORT_CXX11)
-    	CUDA_ADD_EXECUTABLE(test_Tensor test_Tensor.cu)
-		link_paddle_test(test_Tensor)
-        CUDA_ADD_EXECUTABLE(test_lazyAssign test_lazyAssign.cu)
-        link_paddle_test(test_lazyAssign)
-    endif()
+    CUDA_ADD_EXECUTABLE(test_Tensor test_Tensor.cu)
+    link_paddle_test(test_Tensor)
+    CUDA_ADD_EXECUTABLE(test_lazyAssign test_lazyAssign.cu)
+    link_paddle_test(test_lazyAssign)
 else()
     compile_cu_as_cpp(test_Tensor.cu)
     add_unittest(test_Tensor test_Tensor.cu)
diff --git a/paddle/math/tests/test_Allocator.cpp b/paddle/math/tests/test_Allocator.cpp
index 33e0952efe..1ca70ea84c 100644
--- a/paddle/math/tests/test_Allocator.cpp
+++ b/paddle/math/tests/test_Allocator.cpp
@@ -120,9 +120,3 @@ TEST(MemoryHandle, Gpu) {
   }
 }
 #endif
-
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  initMain(argc, argv);
-  return RUN_ALL_TESTS();
-}
diff --git a/paddle/math/tests/test_BaseMatrix.cpp b/paddle/math/tests/test_BaseMatrix.cpp
index cc7c1e7eb2..21918b86e1 100644
--- a/paddle/math/tests/test_BaseMatrix.cpp
+++ b/paddle/math/tests/test_BaseMatrix.cpp
@@ -242,10 +242,4 @@ TEST(BaseMatrix, Other) {
   }
 }
 
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  paddle::initMain(argc, argv);
-  return RUN_ALL_TESTS();
-}
-
 #endif
diff --git a/paddle/math/tests/test_CpuGpuVector.cpp b/paddle/math/tests/test_CpuGpuVector.cpp
index 624fa20ca5..58bc43a38b 100644
--- a/paddle/math/tests/test_CpuGpuVector.cpp
+++ b/paddle/math/tests/test_CpuGpuVector.cpp
@@ -77,11 +77,4 @@ TEST(CpuGpuVector, subCreate) {
   checkDataEqual(v1Check->getData() + offset, v2Check->getData(), size2);
 }
 
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  initMain(argc, argv);
-  int ret = RUN_ALL_TESTS();
-  return ret;
-}
-
 #endif
diff --git a/paddle/math/tests/test_ExecViaCpu.cpp b/paddle/math/tests/test_ExecViaCpu.cpp
index 27216ddb58..04c856453d 100644
--- a/paddle/math/tests/test_ExecViaCpu.cpp
+++ b/paddle/math/tests/test_ExecViaCpu.cpp
@@ -114,9 +114,3 @@ TEST(ExecViaCpu, test1) {
   testWrapper(functor);
 }
 #endif
-
-int main(int argc, char** argv) {
-  paddle::initMain(argc, argv);
-  testing::InitGoogleTest(&argc, argv);
-  return RUN_ALL_TESTS();
-}
diff --git a/paddle/math/tests/test_FPException.cpp b/paddle/math/tests/test_FPException.cpp
index 6aa5891bce..3836f7fc0f 100644
--- a/paddle/math/tests/test_FPException.cpp
+++ b/paddle/math/tests/test_FPException.cpp
@@ -28,10 +28,10 @@ limitations under the License. */
  * so we can add some tricks to prevent exp calculate an excessive value.
  *
  */
-#include <fenv.h>
+
 #include <gtest/gtest.h>
 #include "paddle/math/Matrix.h"
-#include "paddle/utils/Excepts.h"
+#include "paddle/utils/Common.h"
 
 using namespace paddle;  // NOLINT
 
diff --git a/paddle/math/tests/test_GpuProfiler.cpp b/paddle/math/tests/test_GpuProfiler.cpp
index d490078d90..e6b5dba446 100644
--- a/paddle/math/tests/test_GpuProfiler.cpp
+++ b/paddle/math/tests/test_GpuProfiler.cpp
@@ -15,9 +15,9 @@ limitations under the License. */
 #ifndef PADDLE_ONLY_CPU
 
 #include <gtest/gtest.h>
-#include "paddle/gserver/tests/TestUtil.h"
 #include "paddle/math/Matrix.h"
 #include "paddle/math/SparseMatrix.h"
+#include "paddle/testing/TestUtil.h"
 #include "paddle/utils/Stat.h"
 #include "paddle/utils/Util.h"
 
diff --git a/paddle/math/tests/test_Matrix.cpp b/paddle/math/tests/test_Matrix.cpp
index adb5fbd9fa..a4084bdf7c 100644
--- a/paddle/math/tests/test_Matrix.cpp
+++ b/paddle/math/tests/test_Matrix.cpp
@@ -224,10 +224,11 @@ void testParamReluBackwardW(int height, int width, int w_height, int w_width) {
 }
 
 TEST(Matrix, paramRelu) {
-  for (auto height : {10, 100}) {
-    for (auto width : {10, 100}) {
+  for (auto height : {10, 40, 100}) {
+    for (auto width : {10, 40, 100}) {
       for (auto w_height : {1, 2}) {
         for (auto w_width : {1, 2}) {
+          if (width % (w_height * w_width)) continue;
           testParamReluForward(height, width, w_height, w_width);
           testParamReluBackwardW(height, width, w_height, w_width);
         }
@@ -291,10 +292,4 @@ TEST(Matrix, multiBinaryCrossEntropy) {
   }
 }
 
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  paddle::initMain(argc, argv);
-  return RUN_ALL_TESTS();
-}
-
 #endif
diff --git a/paddle/math/tests/test_SIMDFunctions.cpp b/paddle/math/tests/test_SIMDFunctions.cpp
index f62843310d..e8f9b26ff2 100644
--- a/paddle/math/tests/test_SIMDFunctions.cpp
+++ b/paddle/math/tests/test_SIMDFunctions.cpp
@@ -169,9 +169,3 @@ TEST(SIMDFunction, decayL1_WithoutLR) {
     ASSERT_NEAR(dest[i], simd_dest[i], EPSILON);
   }
 }
-
-int main(int argc, char** argv) {
-  paddle::initMain(argc, argv);
-  testing::InitGoogleTest(&argc, argv);
-  return RUN_ALL_TESTS();
-}
diff --git a/paddle/math/tests/test_SparseMatrix.cpp b/paddle/math/tests/test_SparseMatrix.cpp
index 88b75b6d83..9d3fbaef43 100644
--- a/paddle/math/tests/test_SparseMatrix.cpp
+++ b/paddle/math/tests/test_SparseMatrix.cpp
@@ -33,8 +33,8 @@ TEST(Matrix, CopyCpuMatrixToSparseMatrix) {
       ret2(new CpuMatrix(HEIGHT, WIDTH_TEST));
   ret1->zeroMem();
   ret2->zeroMem();
-  ret1->mul(testMatrix, mulCpuMatrix, 1.0, 1.0);
-  ret2->mul(testCpuMatrix, mulCpuMatrix, 1.0, 1.0);
+  ret1->mul(*testMatrix, *mulCpuMatrix, 1.0, 1.0);
+  ret2->mul(*testCpuMatrix, *mulCpuMatrix, 1.0, 1.0);
   checkMatrixEqual(ret1, ret2);
 }
 
@@ -147,9 +147,9 @@ void test_sparse_matrix_mul(MatrixPara paraA,
   hl_stream_synchronize(stream);
 
   /*matrix mul*/
-  cpuMatrixC->mul(cpuMatrixA, cpuMatrixB, 1.0, 1.0);
-  gpuMatrixC->mul(gpuMatrixA, gpuMatrixB, 1.0, 1.0);
-  cpuDenseC->mul(cpuDenseA, cpuDenseB, 1.0, 1.0);
+  cpuMatrixC->mul(*cpuMatrixA, *cpuMatrixB, 1.0, 1.0);
+  gpuMatrixC->mul(*gpuMatrixA, *gpuMatrixB, 1.0, 1.0);
+  cpuDenseC->mul(*cpuDenseA, *cpuDenseB, 1.0, 1.0);
 
   gpuMatrixC_d2h->copyFrom(*gpuMatrixC, stream);
   hl_stream_synchronize(stream);
@@ -224,8 +224,8 @@ TEST(Matrix, CopySparseMatrixToGpuSparseMatrix) {
   MatrixPtr ret2(new GpuMatrix(HEIGHT, WIDTH_TEST));
   ret1->zeroMem();
   ret2->zeroMem();
-  ret1->mul(testMatrix, mulCpuMatrix, 1.0, 1.0);
-  ret2->mul(testGpuMatrix, mulGpuMatrix, 1.0, 1.0);
+  ret1->mul(*testMatrix, *mulCpuMatrix, 1.0, 1.0);
+  ret2->mul(*testGpuMatrix, *mulGpuMatrix, 1.0, 1.0);
   checkMatrixEqual(ret1, ret2);
 }
 
@@ -561,9 +561,3 @@ TEST(Matrix, SparseMatrixCSCFormatTrimFrom) {
   checkSMatrixEqual2(matA, matD);
 #endif
 }
-
-int main(int argc, char** argv) {
-  paddle::initMain(argc, argv);
-  testing::InitGoogleTest(&argc, argv);
-  return RUN_ALL_TESTS();
-}
diff --git a/paddle/math/tests/test_Tensor.cu b/paddle/math/tests/test_Tensor.cu
index 1859b9fc13..40e38434fa 100644
--- a/paddle/math/tests/test_Tensor.cu
+++ b/paddle/math/tests/test_Tensor.cu
@@ -1163,11 +1163,3 @@ TEST(Quaternary, CompareOp) {
   TestQuaternaryMatrix<GpuMatrix> testGpu(testQuaternaryCompareOp<GpuMatrix>);
 #endif
 }
-
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  hl_start();
-  hl_init(0);
-  return RUN_ALL_TESTS();
-}
-
diff --git a/paddle/math/tests/test_TrainingAlgorithm.cpp b/paddle/math/tests/test_TrainingAlgorithm.cpp
index 1bf6a0cc43..4a88844b43 100644
--- a/paddle/math/tests/test_TrainingAlgorithm.cpp
+++ b/paddle/math/tests/test_TrainingAlgorithm.cpp
@@ -22,9 +22,9 @@ limitations under the License. */
 using namespace paddle;  // NOLINT
 
 #ifndef PADDLE_TYPE_DOUBLE
-P_DEFINE_double(max_diff, 1e-5, "max diff allowed");
+DEFINE_double(max_diff, 1e-5, "max diff allowed");
 #else
-P_DEFINE_double(max_diff, 1e-13, "max diff allowed");
+DEFINE_double(max_diff, 1e-13, "max diff allowed");
 #endif
 
 class SetMaxDiff {
@@ -459,11 +459,3 @@ void testSparseMomentum(size_t size, bool useGpu) {
 }
 
 TEST(Training, SparseMomentum) { testCase(testSparseMomentum); }
-
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  initMain(argc, argv);
-  hl_start();
-  hl_init(FLAGS_gpu_id);
-  return RUN_ALL_TESTS();
-}
diff --git a/paddle/math/tests/test_batchTranspose.cpp b/paddle/math/tests/test_batchTranspose.cpp
index 9925e24dc1..4eb9837909 100644
--- a/paddle/math/tests/test_batchTranspose.cpp
+++ b/paddle/math/tests/test_batchTranspose.cpp
@@ -53,9 +53,3 @@ TEST(MatrixBatchTransTest, test_batch_matrix_transpose) {
   checkMatrixEqual(cBatchTransMat, cMat_d2h);
 }
 #endif
-
-int main(int argc, char** argv) {
-  paddle::initMain(argc, argv);
-  testing::InitGoogleTest(&argc, argv);
-  return RUN_ALL_TESTS();
-}
diff --git a/paddle/math/tests/test_lazyAssign.cu b/paddle/math/tests/test_lazyAssign.cu
index 16541edb54..786d863a53 100644
--- a/paddle/math/tests/test_lazyAssign.cu
+++ b/paddle/math/tests/test_lazyAssign.cu
@@ -139,11 +139,3 @@ TEST(sgdUpdate, GPU) {
   testMatrixCase(testSgdUpdate<GpuMatrix>);
 }
 #endif
-
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  hl_start();
-  hl_init(0);
-  return RUN_ALL_TESTS();
-}
-
diff --git a/paddle/math/tests/test_matrixCompare.cpp b/paddle/math/tests/test_matrixCompare.cpp
index 62de5b25e4..f0c49791d7 100644
--- a/paddle/math/tests/test_matrixCompare.cpp
+++ b/paddle/math/tests/test_matrixCompare.cpp
@@ -18,9 +18,9 @@ limitations under the License. */
 
 #include <gtest/gtest.h>
 #include "TensorCheck.h"
-#include "paddle/gserver/tests/TestUtil.h"
 #include "paddle/math/Matrix.h"
 #include "paddle/math/SparseMatrix.h"
+#include "paddle/testing/TestUtil.h"
 #include "paddle/utils/Stat.h"
 #include "paddle/utils/Util.h"
 
@@ -29,148 +29,6 @@ using namespace std;     // NOLINT
 using autotest::TensorCheckEqual;
 using autotest::TensorCheckErr;
 
-void testMatrixProjectionForward(int contextStart,
-                                 int contextLength,
-                                 bool padding,
-                                 int batchSize,
-                                 int inputDim) {
-  MatrixPtr cpuInput = std::make_shared<CpuMatrix>(batchSize, inputDim);
-  MatrixPtr gpuInput = std::make_shared<GpuMatrix>(batchSize, inputDim);
-  cpuInput->randomizeUniform();
-  gpuInput->copyFrom(*cpuInput);
-
-  int pad = std::max(0, -contextStart) +
-            std::max(0, contextStart + contextLength - 1);
-  if (pad == 0) padding = false;
-  MatrixPtr cpuWeight = nullptr;
-  MatrixPtr gpuWeight = nullptr;
-  if (padding) {
-    cpuWeight = std::make_shared<CpuMatrix>(pad, inputDim);
-    gpuWeight = std::make_shared<GpuMatrix>(pad, inputDim);
-    cpuWeight->randomizeUniform();
-    gpuWeight->copyFrom(*cpuWeight);
-  }
-
-  IVectorPtr cpuSequence;
-  generateSequenceStartPositions(batchSize, cpuSequence);
-  IVectorPtr gpuSequence = IVector::create(cpuSequence->getSize(), true);
-  gpuSequence->copyFrom(*cpuSequence);
-
-  MatrixPtr cpuOutput =
-      std::make_shared<CpuMatrix>(batchSize, inputDim * contextLength);
-  MatrixPtr gpuOutput =
-      std::make_shared<GpuMatrix>(batchSize, inputDim * contextLength);
-  cpuOutput->randomizeUniform();
-  gpuOutput->copyFrom(*cpuOutput);
-
-  // calculate
-  int beginPad = std::max(0, -contextStart);
-  cpuOutput->contextProjectionForward(cpuInput,
-                                      cpuWeight,
-                                      *cpuSequence,
-                                      contextLength,
-                                      contextStart,
-                                      beginPad,
-                                      padding);
-
-  gpuOutput->contextProjectionForward(gpuInput,
-                                      gpuWeight,
-                                      *gpuSequence,
-                                      contextLength,
-                                      contextStart,
-                                      beginPad,
-                                      padding);
-
-  TensorCheckEqual(*cpuOutput, *gpuOutput);
-}
-
-void testMatrixProjectionBackward(int contextStart,
-                                  int contextLength,
-                                  bool padding,
-                                  int batchSize,
-                                  int inputDim) {
-  MatrixPtr cpuOutputGrad =
-      std::make_shared<CpuMatrix>(batchSize, inputDim * contextLength);
-  MatrixPtr gpuOutputGrad =
-      std::make_shared<GpuMatrix>(batchSize, inputDim * contextLength);
-  cpuOutputGrad->randomizeUniform();
-  gpuOutputGrad->copyFrom(*cpuOutputGrad);
-
-  IVectorPtr cpuSequence;
-  generateSequenceStartPositions(batchSize, cpuSequence);
-  IVectorPtr gpuSequence = IVector::create(cpuSequence->getSize(), true);
-  gpuSequence->copyFrom(*cpuSequence);
-
-  MatrixPtr cpuInputGrad = std::make_shared<CpuMatrix>(batchSize, inputDim);
-  MatrixPtr gpuInputGrad = std::make_shared<GpuMatrix>(batchSize, inputDim);
-  cpuInputGrad->randomizeUniform();
-  gpuInputGrad->copyFrom(*cpuInputGrad);
-
-  int pad = std::max(0, -contextStart) +
-            std::max(0, contextStart + contextLength - 1);
-  if (pad == 0) padding = false;
-  MatrixPtr cpuWeightGrad = nullptr;
-  MatrixPtr gpuWeightGrad = nullptr;
-  if (padding) {
-    cpuWeightGrad = std::make_shared<CpuMatrix>(pad, inputDim);
-    gpuWeightGrad = std::make_shared<GpuMatrix>(pad, inputDim);
-    cpuWeightGrad->randomizeUniform();
-    gpuWeightGrad->copyFrom(*cpuWeightGrad);
-  }
-
-  // calculate
-  int beginPad = std::max(0, -contextStart);
-  cpuOutputGrad->contextProjectionBackward(cpuInputGrad,
-                                           cpuWeightGrad,
-                                           *cpuSequence,
-                                           contextLength,
-                                           contextStart,
-                                           beginPad,
-                                           padding);
-  gpuOutputGrad->contextProjectionBackwardData(
-      gpuInputGrad, *gpuSequence, contextLength, contextStart);
-  if (padding) {
-    gpuOutputGrad->contextProjectionBackwardWeight(gpuWeightGrad,
-                                                   *gpuSequence,
-                                                   contextLength,
-                                                   contextStart,
-                                                   pad,
-                                                   beginPad);
-  }
-
-  TensorCheckErr(*cpuInputGrad, *gpuInputGrad);
-  if (padding) {
-    TensorCheckErr(*cpuWeightGrad, *gpuWeightGrad);
-  }
-}
-
-TEST(Matrix, projection) {
-  for (auto contextStart : {-5, -3, -1, 0, 3}) {
-    for (auto contextLength : {1, 2, 5, 7}) {
-      for (auto trainablePadding : {false, true}) {
-        for (auto batchSize : {1, 2, 5, 20, 100}) {
-          for (auto inputDim : {15, 32, 63, 128, 200}) {
-            VLOG(3) << " contextStart=" << contextStart
-                    << " contextLength=" << contextLength
-                    << " trainablePadding=" << trainablePadding
-                    << " batchSize=" << batchSize << " inputDim=" << inputDim;
-            testMatrixProjectionForward(contextStart,
-                                        contextLength,
-                                        trainablePadding,
-                                        batchSize,
-                                        inputDim);
-            testMatrixProjectionBackward(contextStart,
-                                         contextLength,
-                                         trainablePadding,
-                                         batchSize,
-                                         inputDim);
-          }
-        }
-      }
-    }
-  }
-}
-
 void testMatrixMaxSequence(int batchSize, int inputDim) {
   // forward
   MatrixPtr cpuInput = std::make_shared<CpuMatrix>(batchSize, inputDim);
@@ -318,7 +176,7 @@ void testMatrixInverse(int height) {
   cpu->randomizeUniform();
   MatrixPtr cpuT = cpu->getTranspose();
   MatrixPtr outputCheck = std::make_shared<CpuMatrix>(height, height);
-  outputCheck->mul(cpu, cpuT);
+  outputCheck->mul(*cpu, *cpuT);
   cpu->setDiag(1.0);
   cpu->add(*outputCheck);
 
@@ -328,7 +186,7 @@ void testMatrixInverse(int height) {
 
   TensorCheckErr(*cpuI, *gpuI);
 
-  outputCheck->mul(cpu, cpuI);
+  outputCheck->mul(*cpu, *cpuI);
   cpu->setDiag(1.0);
   TensorCheckErr(*cpu, *outputCheck);
 }
@@ -509,8 +367,8 @@ void testMatrixMul(bool transa, bool transb, int dimM, int dimN, int dimK) {
   gpuB->copyFrom(*cpuB);
   gpuC->copyFrom(*cpuC);
 
-  cpuC->mul(cpuA, cpuB, alpha, beta);
-  gpuC->mul(gpuA, gpuB, alpha, beta);
+  cpuC->mul(*cpuA, *cpuB, alpha, beta);
+  gpuC->mul(*gpuA, *gpuB, alpha, beta);
 
   TensorCheckErr(*cpuC, *gpuC);
 }
@@ -581,8 +439,8 @@ void testSubMatrixMul(bool transa, bool transb, int dimM, int dimN, int dimK) {
   MatrixPtr subCpuC = cpuC->subMatrix(startM, endM, startN, endN);
   MatrixPtr subGpuC = gpuC->subMatrix(startM, endM, startN, endN);
 
-  subCpuC->mul(subCpuA, subCpuB, alpha, beta);
-  subGpuC->mul(subGpuA, subGpuB, alpha, beta);
+  subCpuC->mul(*subCpuA, *subCpuB, alpha, beta);
+  subGpuC->mul(*subGpuA, *subGpuB, alpha, beta);
 
   TensorCheckErr(*cpuC, *gpuC);
 }
@@ -915,10 +773,11 @@ void testParamReluBackwardDiff(int height,
 }
 
 TEST(Matrix, paramReluBackwardDiff) {
-  for (auto height : {10, 100}) {
-    for (auto width : {10, 100}) {
+  for (auto height : {10, 40, 100}) {
+    for (auto width : {10, 40, 100}) {
       for (auto w_height : {1, 2}) {
         for (auto w_width : {1, 2}) {
+          if (width % (w_height * w_width)) continue;
           testParamReluBackwardDiff(height, width, w_height, w_width);
         }
       }
@@ -939,8 +798,8 @@ void testClassificationError(int numSamples, int dim) {
   gpuOutput->copyFrom(*cpuOutput);
   gpuLabel->copyFrom(*cpuLabel);
 
-  cpuError->classificationError(cpuOutput, cpuLabel);
-  gpuError->classificationError(gpuOutput, gpuLabel);
+  cpuError->classificationError(*cpuOutput, *cpuLabel);
+  gpuError->classificationError(*gpuOutput, *gpuLabel);
 
   TensorCheckEqual(*cpuError, *gpuError);
 }
@@ -1262,10 +1121,4 @@ TEST(Matrix, MaxOutFwdBwd) {
   }
 }
 
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  initMain(argc, argv);
-  return RUN_ALL_TESTS();
-}
-
 #endif
diff --git a/paddle/math/tests/test_sparseMatrixCompare.cpp b/paddle/math/tests/test_sparseMatrixCompare.cpp
index 6f6de238ba..a9185a4b24 100644
--- a/paddle/math/tests/test_sparseMatrixCompare.cpp
+++ b/paddle/math/tests/test_sparseMatrixCompare.cpp
@@ -102,8 +102,8 @@ void testSpMatrixMul(int M, int N, int K, real rate) {
   gpuC->copyFrom(*cpuC, stream);
   hl_stream_synchronize(stream);
 
-  cpuC->mul(cpuA, cpuB->getTranspose(), 1, 1);
-  gpuC->mul(gpuA, gpuB->getTranspose(), 1, 1);
+  cpuC->mul(*cpuA, *cpuB->getTranspose(), 1, 1);
+  gpuC->mul(*gpuA, *gpuB->getTranspose(), 1, 1);
 
   MatrixPtr outputCheck(new CpuSparseMatrix(M, N, nnz));
   outputCheck->copyFrom(*gpuC, stream);
@@ -171,11 +171,4 @@ TEST(SMatrix, sMatrixCollectBias) {
   }
 }
 
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  initMain(argc, argv);
-  int ret = RUN_ALL_TESTS();
-  return ret;
-}
-
 #endif
diff --git a/paddle/parameter/Argument.cpp b/paddle/parameter/Argument.cpp
index e91daa3717..65d01a1571 100644
--- a/paddle/parameter/Argument.cpp
+++ b/paddle/parameter/Argument.cpp
@@ -245,6 +245,8 @@ int32_t Argument::resizeAndCopyFrom(const Argument& src,
                                     bool useGpu,
                                     hl_stream_t stream) {
   dataId = src.dataId;
+  frameWidth = src.frameWidth;
+  frameHeight = src.frameHeight;
 
   if (!src.sequenceStartPositions) {
     // non-sequence input, copy samples directly
diff --git a/paddle/parameter/FirstOrderOptimizer.cpp b/paddle/parameter/FirstOrderOptimizer.cpp
index 630f15c8cf..dbb738e98b 100644
--- a/paddle/parameter/FirstOrderOptimizer.cpp
+++ b/paddle/parameter/FirstOrderOptimizer.cpp
@@ -19,7 +19,7 @@ limitations under the License. */
 
 #include <cmath>
 
-P_DEFINE_bool(log_clipping, false, "enable log clipping or not");
+DEFINE_bool(log_clipping, false, "enable log clipping or not");
 
 namespace paddle {
 
diff --git a/paddle/parameter/ParallelParameter.h b/paddle/parameter/ParallelParameter.h
index 417e386dc7..2e7c18b808 100644
--- a/paddle/parameter/ParallelParameter.h
+++ b/paddle/parameter/ParallelParameter.h
@@ -26,9 +26,9 @@ limitations under the License. */
 #include "paddle/math/Vector.h"
 #include "paddle/parameter/Parameter.h"
 #include "paddle/parameter/ParameterUpdateFunctions.h"
+#include "paddle/utils/Common.h"
 #include "paddle/utils/Flags.h"
 #include "paddle/utils/Locks.h"
-#include "paddle/utils/TypeDefs.h"
 
 #include "ParameterConfig.pb.h"
 
diff --git a/paddle/parameter/Parameter.cpp b/paddle/parameter/Parameter.cpp
index 986ae1539b..29d6e20dc1 100644
--- a/paddle/parameter/Parameter.cpp
+++ b/paddle/parameter/Parameter.cpp
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "Parameter.h"
+#include <gflags/gflags.h>
 #include <fstream>
 #include "AverageOptimizer.h"
 #include "FirstOrderOptimizer.h"
@@ -23,14 +24,13 @@ limitations under the License. */
 #include "paddle/math/CpuSparseMatrix.h"
 #include "paddle/math/MathUtils.h"
 #include "paddle/math/SparseRowMatrix.h"
-#include "paddle/utils/CommandLineParser.h"
 #include "paddle/utils/Logging.h"
 
-P_DEFINE_int32(enable_grad_share,
-               (100 * 1024 * 1024),
-               "threshold for enable gradient parameter share for batch "
-               "multi-cpu training");
-P_DEFINE_int32(
+DEFINE_int32(enable_grad_share,
+             (100 * 1024 * 1024),
+             "threshold for enable gradient parameter share for batch "
+             "multi-cpu training");
+DEFINE_int32(
     grad_share_block_num,
     64,
     "block number of gradient parameter share for batch multi-cpu training");
diff --git a/paddle/parameter/Parameter.h b/paddle/parameter/Parameter.h
index 532c6770e5..72c8336799 100644
--- a/paddle/parameter/Parameter.h
+++ b/paddle/parameter/Parameter.h
@@ -26,10 +26,10 @@ limitations under the License. */
 #include "ParameterUpdaterHook.h"
 #include "paddle/math/Matrix.h"
 #include "paddle/math/Vector.h"
+#include "paddle/utils/Common.h"
 #include "paddle/utils/GlobalConstants.h"
 #include "paddle/utils/Locks.h"
 #include "paddle/utils/ThreadLocal.h"
-#include "paddle/utils/TypeDefs.h"
 #include "paddle/utils/Util.h"
 
 namespace paddle {
diff --git a/paddle/parameter/ParameterUpdateFunctions.h b/paddle/parameter/ParameterUpdateFunctions.h
index 2d277e47e7..0fca280149 100644
--- a/paddle/parameter/ParameterUpdateFunctions.h
+++ b/paddle/parameter/ParameterUpdateFunctions.h
@@ -15,7 +15,7 @@ limitations under the License. */
 #pragma once
 
 #include "paddle/math/Vector.h"
-#include "paddle/utils/TypeDefs.h"
+#include "paddle/utils/Common.h"
 
 namespace paddle {
 
diff --git a/paddle/parameter/ParameterUpdaterBase.cpp b/paddle/parameter/ParameterUpdaterBase.cpp
index 49e2ae2b39..458cae886a 100644
--- a/paddle/parameter/ParameterUpdaterBase.cpp
+++ b/paddle/parameter/ParameterUpdaterBase.cpp
@@ -19,7 +19,7 @@ limitations under the License. */
 
 namespace paddle {
 
-void ParameterUpdater::init(std::vector<ParameterPtr>& parameters) {
+void ParameterUpdater::init(const std::vector<ParameterPtr>& parameters) {
   parameters_ = parameters;
   for (ParameterType type : getParameterTypes()) {
     for (auto& para : parameters) {
diff --git a/paddle/parameter/ParameterUpdaterBase.h b/paddle/parameter/ParameterUpdaterBase.h
index 5401046f67..b230e170c1 100644
--- a/paddle/parameter/ParameterUpdaterBase.h
+++ b/paddle/parameter/ParameterUpdaterBase.h
@@ -32,13 +32,13 @@ public:
     parameterTypes_.push_back(type);
   }
 
-  virtual void init(std::vector<ParameterPtr>& parameters);
+  virtual void init(const std::vector<ParameterPtr>& parameters);
 
   // called by Trainer when starting a new pass
   virtual void startPass() {}
 
   // called by Trainer then finishing a pass, ruturn true if pass accepted
-  virtual bool finishPass(real cost = 0) { return true; }
+  virtual bool finishPass() { return true; }
 
   // called by Trainer before backward() of a batch
   // Return the type of pass it needs. This pass type will be passed
@@ -105,16 +105,16 @@ public:
   ParameterUpdaterComposite() {}
   virtual ~ParameterUpdaterComposite() {}
 
-  virtual void init(std::vector<ParameterPtr>& parameters) = 0;
+  virtual void init(const std::vector<ParameterPtr>& parameters) = 0;
 
   virtual void startPass() {
     syncThreadPool_->execPlusOwner(
         [&](int tid, size_t numThreads) { updaters_[tid]->startPass(); });
   }
 
-  virtual bool finishPass(real cost = 0) {
+  virtual bool finishPass() {
     syncThreadPool_->execPlusOwner(
-        [&](int tid, size_t numThreads) { updaters_[tid]->finishPass(cost); });
+        [&](int tid, size_t numThreads) { updaters_[tid]->finishPass(); });
     return true;
   }
 
diff --git a/paddle/parameter/tests/test_common.cpp b/paddle/parameter/tests/test_common.cpp
index aa57a63469..8bab5a6289 100644
--- a/paddle/parameter/tests/test_common.cpp
+++ b/paddle/parameter/tests/test_common.cpp
@@ -23,15 +23,6 @@ limitations under the License. */
 
 using namespace paddle;  // NOLINT
 
-int main(int argc, char** argv) {
-  paddle::initMain(argc, argv);
-  testing::InitGoogleTest(&argc, argv);
-
-  int ret = RUN_ALL_TESTS();
-
-  return ret;
-}
-
 class CommonTest : public ::testing::Test {
 protected:
   CommonTest() : testStat_("test") {}
diff --git a/paddle/pserver/BaseClient.cpp b/paddle/pserver/BaseClient.cpp
index a43def98c5..0e031a7e20 100644
--- a/paddle/pserver/BaseClient.cpp
+++ b/paddle/pserver/BaseClient.cpp
@@ -13,12 +13,12 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "BaseClient.h"
+#include <gflags/gflags.h>
 #include <string.h>
 #include <vector>
-#include "paddle/utils/CommandLineParser.h"
 #include "paddle/utils/Stat.h"
 
-P_DECLARE_string(pservers);
+DECLARE_string(pservers);
 
 namespace paddle {
 
diff --git a/paddle/pserver/BaseClient.h b/paddle/pserver/BaseClient.h
index 262afafbe2..11d7a147bf 100644
--- a/paddle/pserver/BaseClient.h
+++ b/paddle/pserver/BaseClient.h
@@ -17,8 +17,8 @@ limitations under the License. */
 #include "ParameterService.pb.h"
 #include "paddle/math/Matrix.h"
 #include "paddle/pserver/ProtoServer.h"
+#include "paddle/utils/Common.h"
 #include "paddle/utils/Queue.h"
-#include "paddle/utils/TypeDefs.h"
 
 namespace paddle {
 
diff --git a/paddle/pserver/CMakeLists.txt b/paddle/pserver/CMakeLists.txt
index 1c1e1964b8..b7f85ea1a6 100644
--- a/paddle/pserver/CMakeLists.txt
+++ b/paddle/pserver/CMakeLists.txt
@@ -24,13 +24,15 @@ set(PSERVER_SOURCES
     BaseClient.cpp
     ParameterClient2.cpp
     ParameterServer2.cpp
-    SparseParameterDistribution.cpp)
+    SparseParameterDistribution.cpp
+    ParameterServerController.cpp)
 
 set(PSERVER_HEADERS
     BaseClient.h
     ParameterClient2.h
     ParameterServer2.h
-    SparseParameterDistribution.h)
+    SparseParameterDistribution.h
+    ParameterServerController.h)
 
 add_library(paddle_pserver STATIC
     ${PSERVER_SOURCES})
diff --git a/paddle/pserver/LightNetwork.cpp b/paddle/pserver/LightNetwork.cpp
index 329dfb0fb3..8c8ba0a2e5 100644
--- a/paddle/pserver/LightNetwork.cpp
+++ b/paddle/pserver/LightNetwork.cpp
@@ -18,6 +18,7 @@ limitations under the License. */
 #include <netinet/tcp.h>
 #include <sys/socket.h>
 #include <sys/types.h>
+#include <chrono>
 
 #include <arpa/inet.h>
 #include <net/if.h>
@@ -31,23 +32,23 @@ limitations under the License. */
 #include "paddle/utils/Util.h"
 
 /// quick ack can reduce the latency of small message
-P_DEFINE_bool(small_messages,
-              false,
-              "if message size is small, recommend set it True to enable quick "
-              "ack and no delay");
+DEFINE_bool(small_messages,
+            false,
+            "if message size is small, recommend set it True to enable quick "
+            "ack and no delay");
 
 /// reasonable sock_send_buf_size can control the traffic injected into switch
 /// network. Injecting too many data into traffic could cause packets loss which
 /// cause long latency and degrade the efficiency of communication.
-P_DEFINE_int32(sock_send_buf_size,
-               1024 * 1024 * 40,
-               "restrict sock send buff size, can reduce network congestion if "
-               "set carefully");
+DEFINE_int32(sock_send_buf_size,
+             1024 * 1024 * 40,
+             "restrict sock send buff size, can reduce network congestion if "
+             "set carefully");
 
 /// reasonable size can hold bursted packets and reduce packets loss
-P_DEFINE_int32(sock_recv_buf_size,
-               1024 * 1024 * 40,
-               "restrict sock recv buff size");
+DEFINE_int32(sock_recv_buf_size,
+             1024 * 1024 * 40,
+             "restrict sock recv buff size");
 
 namespace paddle {
 
@@ -382,8 +383,24 @@ void SocketClient::TcpClient(const std::string &serverAddr, int serverPort) {
   setOption(sockfd);
 
   /// Now connect to the server
-  PCHECK(connect(sockfd, (sockaddr *)&serv_addr, sizeof(serv_addr)) >= 0)
-      << "ERROR connecting to " << serverAddr;
+  int retry_second = 0;
+  int error = 0;
+  do {
+    // connect() returns -1 on failure and reports the cause through errno, so
+    // the return value itself can never equal ECONNREFUSED.
+    // NOTE(review): POSIX leaves the socket state unspecified after a failed
+    // connect(); confirm that retrying on the same fd is fine on all targets.
+    error = connect(sockfd, (sockaddr *)&serv_addr, sizeof(serv_addr));
+    if (error < 0 && errno == ECONNREFUSED) {
+      LOG(WARNING) << "connection refused by pserver, try again!";
+      if (retry_second++ >= 7) {
+        LOG(FATAL) << "connection refused by pserver, maybe pserver failed!";
+      }
+      std::this_thread::sleep_for(std::chrono::seconds(1));
+    } else {
+      PCHECK(error >= 0) << "ERROR connecting to " << serverAddr;
+    }
+  } while (error < 0);
 
   channel_.reset(new SocketChannel(sockfd, serverAddr));
   tcpRdma_ = F_TCP;
diff --git a/paddle/pserver/ParameterClient2.cpp b/paddle/pserver/ParameterClient2.cpp
index 86fd1c5276..a97859f83f 100644
--- a/paddle/pserver/ParameterClient2.cpp
+++ b/paddle/pserver/ParameterClient2.cpp
@@ -20,8 +20,8 @@ limitations under the License. */
 #include "paddle/utils/Stat.h"
 #include "paddle/utils/StringUtil.h"
 
-P_DEFINE_string(pservers, "127.0.0.1", "Comma separated addresses of pservers");
-P_DEFINE_int32(parallel_thread_num, 1, "Thread number for parameter send");
+DEFINE_string(pservers, "127.0.0.1", "Comma separated addresses of pservers");
+DEFINE_int32(parallel_thread_num, 1, "Thread number for parameter send");
 
 namespace paddle {
 
diff --git a/paddle/pserver/ParameterClient2.h b/paddle/pserver/ParameterClient2.h
index 5255394949..89b3ddd502 100644
--- a/paddle/pserver/ParameterClient2.h
+++ b/paddle/pserver/ParameterClient2.h
@@ -23,10 +23,10 @@ limitations under the License. */
 #include "paddle/math/Vector.h"
 #include "paddle/parameter/Parameter.h"
 #include "paddle/pserver/BaseClient.h"
+#include "paddle/utils/Common.h"
 #include "paddle/utils/Flags.h"
 #include "paddle/utils/Locks.h"
 #include "paddle/utils/Queue.h"
-#include "paddle/utils/TypeDefs.h"
 #include "paddle/utils/Util.h"
 
 #include "ParameterService.pb.h"
@@ -34,7 +34,7 @@ limitations under the License. */
 #include "ProtoServer.h"
 #include "SparseParameterDistribution.h"
 
-P_DECLARE_int32(parallel_thread_num);
+DECLARE_int32(parallel_thread_num);
 
 namespace paddle {
 
diff --git a/paddle/pserver/ParameterServer2.cpp b/paddle/pserver/ParameterServer2.cpp
index 2cb4c93535..856fa0ad1a 100644
--- a/paddle/pserver/ParameterServer2.cpp
+++ b/paddle/pserver/ParameterServer2.cpp
@@ -30,11 +30,11 @@ limitations under the License. */
 #include "paddle/utils/GlobalConstants.h"
 #include "paddle/utils/Stat.h"
 
-P_DEFINE_int32(pserver_num_threads, 1, "number of threads for sync op exec");
-P_DEFINE_double(async_lagged_ratio_min,
-                1.0,
-                "control config_.async_lagged_grad_discard_ratio() min value");
-P_DEFINE_double(
+DEFINE_int32(pserver_num_threads, 1, "number of threads for sync op exec");
+DEFINE_double(async_lagged_ratio_min,
+              1.0,
+              "control config_.async_lagged_grad_discard_ratio() min value");
+DEFINE_double(
     async_lagged_ratio_default,
     1.5,
     "if async_lagged_grad_discard_ratio is not set in trainer_config.conf"
diff --git a/paddle/pserver/ParameterServer2.h b/paddle/pserver/ParameterServer2.h
index 61c139981e..0f5a589590 100644
--- a/paddle/pserver/ParameterServer2.h
+++ b/paddle/pserver/ParameterServer2.h
@@ -29,16 +29,16 @@ limitations under the License. */
 #include "paddle/math/Vector.h"
 #include "paddle/parameter/Parameter.h"
 #include "paddle/parameter/ParameterOptimizer.h"
+#include "paddle/utils/Common.h"
 #include "paddle/utils/Locks.h"
 #include "paddle/utils/Stat.h"
 #include "paddle/utils/ThreadLocal.h"
-#include "paddle/utils/TypeDefs.h"
 
 #include "ParameterService.pb.h"
 
 #include "ProtoServer.h"
 
-P_DECLARE_int32(port);
+DECLARE_int32(port);
 
 namespace paddle {
 
diff --git a/paddle/pserver/ParameterServer2Main.cpp b/paddle/pserver/ParameterServer2Main.cpp
index ffc521f2c1..845a2c27e2 100644
--- a/paddle/pserver/ParameterServer2Main.cpp
+++ b/paddle/pserver/ParameterServer2Main.cpp
@@ -13,66 +13,17 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <fstream>
-#include "paddle/utils/StringUtil.h"
-#include "paddle/utils/Util.h"
-
-#include "ParameterServer2.h"
-#include "RDMANetwork.h"
-#include "paddle/utils/Flags.h"
+#include "ParameterServerController.h"
 
 using namespace paddle;  // NOLINT
 
 int main(int argc, char** argv) {
   initMain(argc, argv);
 
-  std::vector<std::string> devices;
-  std::vector<std::shared_ptr<ParameterServer2>> pservers;
-
-  // round robin to loadbalance RDMA server ENGINE
-  int rdmaCpu = 0;
-  int onlineCpus = rdma::numCpus();
-  int numPorts = FLAGS_ports_num + FLAGS_ports_num_for_sparse;
-  if (FLAGS_nics.empty()) {
-    pservers.resize(numPorts);
-    for (int i = 0; i < numPorts; ++i) {
-      if (FLAGS_rdma_tcp == "rdma") {
-        pservers[i].reset(
-            new ParameterServer2(std::string(), FLAGS_port + i, rdmaCpu++));
-        rdmaCpu = rdmaCpu % onlineCpus;
-      } else {
-        pservers[i].reset(new ParameterServer2(std::string(), FLAGS_port + i));
-      }
-      CHECK(pservers[i]->init()) << "Fail to initialize parameter server"
-                                 << FLAGS_port + i;
-      LOG(INFO) << "pserver started : " << FLAGS_port + i;
-      pservers[i]->start();
-    }
-  } else {
-    str::split(FLAGS_nics, ',', &devices);
-    pservers.resize(devices.size() * numPorts);
-    for (int i = 0; i < numPorts; ++i) {
-      for (size_t j = 0; j < devices.size(); ++j) {
-        if (FLAGS_rdma_tcp == "rdma") {
-          pservers[i * devices.size() + j].reset(new ParameterServer2(
-              getIpAddr(devices[j]), FLAGS_port + i, rdmaCpu++));
-          rdmaCpu = rdmaCpu % onlineCpus;
-        } else {
-          pservers[i * devices.size() + j].reset(
-              new ParameterServer2(getIpAddr(devices[j]), FLAGS_port + i));
-        }
-        CHECK(pservers[i * devices.size() + j]->init())
-            << "Fail to initialize parameter server" << devices[j]
-            << FLAGS_port + i;
-        LOG(INFO) << "pserver started : " << devices[j] << ":"
-                  << FLAGS_port + i;
-        pservers[i * devices.size() + j]->start();
-      }
-    }
-  }
-
-  for (auto& pserver : pservers) {
-    pserver->join();
-  }
+  std::unique_ptr<ParameterServerController> parameterServerPtr(
+      paddle::ParameterServerController::createFromGflags());
+  parameterServerPtr->start();
+  parameterServerPtr->wait();
 
   return 0;
 }
diff --git a/paddle/pserver/ParameterServerController.cpp b/paddle/pserver/ParameterServerController.cpp
new file mode 100644
index 0000000000..1d11a2e1ac
--- /dev/null
+++ b/paddle/pserver/ParameterServerController.cpp
@@ -0,0 +1,121 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "ParameterServerController.h"
+
+namespace paddle {
+
+/**
+ * Creates and init()s the ParameterServer2 instances described by config; none
+ * of them is started until start() is called.
+ *
+ * numPorts = ports_num + ports_num_for_sparse consecutive ports are used,
+ * beginning at config.port(). With no NICs configured, one server per port is
+ * created with an empty address; otherwise one server per port and device,
+ * constructed with getIpAddr(device). A failed init() aborts through CHECK.
+ */
+ParameterServerController::ParameterServerController(
+    const ParameterServerConfig& config) {
+  // round robin to load balance RDMA server ENGINE
+  std::vector<std::string> devices;
+  int rdmaCpu = 0;
+  int onlineCpus = rdma::numCpus();
+  int numPorts = config.ports_num() + config.ports_num_for_sparse();
+
+  if (config.nics().empty()) {
+    parameterServers_.resize(numPorts);
+    for (int i = 0; i < numPorts; ++i) {
+      if (config.rdma_tcp() == "rdma") {
+        parameterServers_[i].reset(
+            new ParameterServer2(std::string(), config.port() + i, rdmaCpu++));
+        rdmaCpu = rdmaCpu % onlineCpus;
+      } else {
+        parameterServers_[i].reset(
+            new ParameterServer2(std::string(), config.port() + i));
+      }
+      CHECK(parameterServers_[i]->init()) << "Fail to initialize parameter "
+                                             "server on port "
+                                          << config.port() + i;
+    }
+  } else {
+    str::split(config.nics(), ',', &devices);
+    parameterServers_.resize(devices.size() * numPorts);
+    for (int i = 0; i < numPorts; ++i) {
+      for (size_t j = 0; j < devices.size(); ++j) {
+        if (config.rdma_tcp() == "rdma") {
+          parameterServers_[i * devices.size() + j].reset(new ParameterServer2(
+              getIpAddr(devices[j]), config.port() + i, rdmaCpu++));
+          rdmaCpu = rdmaCpu % onlineCpus;
+        } else {
+          parameterServers_[i * devices.size() + j].reset(
+              new ParameterServer2(getIpAddr(devices[j]), config.port() + i));
+        }
+        CHECK(parameterServers_[i * devices.size() + j]->init())
+            << "Fail to initialize parameter server with device " << devices[j]
+            << ":" << config.port() + i;
+      }
+    }
+  }
+}
+
+// Joins every server before the owned instances are released.
+// NOTE(review): wait() may already have been called explicitly by the owner
+// (ParameterServer2Main does so) - confirm ParameterServer2::join() tolerates
+// being invoked twice, and being invoked when start() was never called.
+ParameterServerController::~ParameterServerController() { this->wait(); }
+
+// Builds a ParameterServerConfig from the nics/rdma_tcp/port/ports_num/
+// ports_num_for_sparse gflags and forwards it to create(). The caller owns
+// the returned pointer.
+ParameterServerController* ParameterServerController::createFromGflags() {
+  ParameterServerConfig config;
+
+  config.set_nics(FLAGS_nics);
+  config.set_rdma_tcp(FLAGS_rdma_tcp);
+  config.set_port(FLAGS_port);
+  config.set_ports_num(FLAGS_ports_num);
+  config.set_ports_num_for_sparse(FLAGS_ports_num_for_sparse);
+
+  return create(config);
+}
+
+// Heap-allocates a controller for config; the caller owns the result.
+ParameterServerController* ParameterServerController::create(
+    const ParameterServerConfig& config) {
+  return new ParameterServerController(config);
+}
+
+// Calls start() on every server in creation order, logging each index.
+void ParameterServerController::start() {
+  LOG(INFO) << "number of parameterServer instances: "
+            << parameterServers_.size();
+  int i = 0;
+  for (const auto& parameterServer : parameterServers_) {
+    LOG(INFO) << "Starting parameterServer[" << i << "]";
+    parameterServer->start();
+    i++;
+  }
+}
+
+// Calls join() on every server in creation order, logging each index.
+void ParameterServerController::wait() {
+  int i = 0;
+  for (const auto& parameterServer : parameterServers_) {
+    LOG(INFO) << "Waiting parameterServer[" << i << "]";
+    parameterServer->join();
+    i++;
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/pserver/ParameterServerController.h b/paddle/pserver/ParameterServerController.h
new file mode 100644
index 0000000000..fe9bb0b4d0
--- /dev/null
+++ b/paddle/pserver/ParameterServerController.h
@@ -0,0 +1,74 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "ParameterServer2.h"
+#include "ParameterServerConfig.pb.h"
+#include "RDMANetwork.h"
+#include "paddle/utils/StringUtil.h"
+
+namespace paddle {
+
+/**
+ * @brief ParameterServerController creates, initializes and manages multiple
+ * ParameterServer2 instances. The number of instances is determined by the
+ * number of ports (ports_num + ports_num_for_sparse) and by the network
+ * devices (nics) configured through gflags or a ParameterServerConfig proto.
+ */
+class ParameterServerController final {
+public:
+  DISABLE_COPY(ParameterServerController);
+
+  /**
+   * @brief Ctor; creates and init()s every server described by config.
+   */
+  explicit ParameterServerController(const ParameterServerConfig& config);
+
+  /**
+   * @brief Dtor; calls wait() before the owned servers are released.
+   */
+  ~ParameterServerController();
+
+  /**
+   * @brief Create a ParameterServerController configured from gflags; kept for
+   * compatibility with the old gflags-based usage. The caller owns the result.
+   */
+  static ParameterServerController* createFromGflags();
+
+  /**
+   * @brief Create a ParameterServerController from a ParameterServerConfig,
+   * without reading gflags. All ParameterServer2 instances are initialized
+   * according to the config, but not started. The returned pointer is
+   * heap-allocated and owned by the caller.
+   */
+  static ParameterServerController* create(const ParameterServerConfig& config);
+
+  /**
+   * @brief Start all ParameterServer2 instances held by this
+   * ParameterServerController.
+   */
+  void start();
+
+  /**
+   * @brief Join (wait for) every ParameterServer2 instance held by this
+   * ParameterServerController, in creation order.
+   */
+  void wait();
+
+private:
+  std::vector<std::unique_ptr<ParameterServer2>> parameterServers_;
+};
+
+}  // namespace paddle
diff --git a/paddle/pserver/SparseParameterDistribution.cpp b/paddle/pserver/SparseParameterDistribution.cpp
index 0068f85b52..6dd725db30 100644
--- a/paddle/pserver/SparseParameterDistribution.cpp
+++ b/paddle/pserver/SparseParameterDistribution.cpp
@@ -20,26 +20,26 @@ limitations under the License. */
 
 #include "SparseParameterDistribution.h"
 
-P_DEFINE_bool(check_sparse_distribution_in_pserver,
-              false,
-              "check whether sparse parameter exhibts balanced distribution at "
-              "all pservers");
-P_DEFINE_bool(show_check_sparse_distribution_log,
-              false,
-              "show logs details for sparse parameter distribution in pserver");
-P_DEFINE_int32(check_sparse_distribution_batches,
-               100,
-               "run sparse parameter distribution check for N batches");
-P_DEFINE_double(
+DEFINE_bool(check_sparse_distribution_in_pserver,
+            false,
+            "check whether sparse parameter exhibts balanced distribution at "
+            "all pservers");
+DEFINE_bool(show_check_sparse_distribution_log,
+            false,
+            "show logs details for sparse parameter distribution in pserver");
+DEFINE_int32(check_sparse_distribution_batches,
+             100,
+             "run sparse parameter distribution check for N batches");
+DEFINE_double(
     check_sparse_distribution_ratio,
     0.6,
     "if parameters dispatched to different pservers exhibit unbalanced "
     " distribution for check_sparse_distribution_ratio * "
     " check_sparse_distribution_batches times, crash program");
-P_DEFINE_double(check_sparse_distribution_unbalance_degree,
-                2.0,
-                "the ratio of maximum data size and minimun data size for "
-                "different pserver");
+DEFINE_double(check_sparse_distribution_unbalance_degree,
+              2.0,
+              "the ratio of maximum data size and minimun data size for "
+              "different pserver");
 
 namespace paddle {
 
diff --git a/paddle/pserver/test/SocketTest.cpp b/paddle/pserver/test/SocketTest.cpp
index 6e63c4f678..066a6c0293 100644
--- a/paddle/pserver/test/SocketTest.cpp
+++ b/paddle/pserver/test/SocketTest.cpp
@@ -195,9 +195,9 @@ SocketClient::SocketClient(const std::string& serverAddr, int serverPort) {
   channel_.reset(new SocketChannel(sockfd));
 }
 
-P_DEFINE_string(server_addr, "127.0.0.1", "Server address");
-P_DEFINE_int64(dim, 10000000, "Data size");
-P_DEFINE_int32(loop_time, 100000, "test loop time");
+DEFINE_string(server_addr, "127.0.0.1", "Server address");
+DEFINE_int64(dim, 10000000, "Data size");
+DEFINE_int32(loop_time, 100000, "test loop time");
 
 using namespace paddle;  // NOLINT
 
diff --git a/paddle/pserver/test/test_ParameterServer2.cpp b/paddle/pserver/test/test_ParameterServer2.cpp
index 4257a2308d..8e7231a9e1 100644
--- a/paddle/pserver/test/test_ParameterServer2.cpp
+++ b/paddle/pserver/test/test_ParameterServer2.cpp
@@ -21,9 +21,9 @@ limitations under the License. */
 using namespace paddle;  // NOLINT
 using namespace std;     // NOLINT
 
-P_DECLARE_int32(num_gradient_servers);
-P_DEFINE_string(server_addr, "127.0.0.1", "assign server address");
-P_DEFINE_int32(server_cpu, 0, "assign server cpu");
+DECLARE_int32(num_gradient_servers);
+DEFINE_string(server_addr, "127.0.0.1", "assign server address");
+DEFINE_int32(server_cpu, 0, "assign server cpu");
 
 class ParameterServer2Tester : public ParameterServer2 {
 public:
diff --git a/paddle/pserver/test/test_ProtoServer.cpp b/paddle/pserver/test/test_ProtoServer.cpp
index 3880dde5e3..9f86ee80f4 100644
--- a/paddle/pserver/test/test_ProtoServer.cpp
+++ b/paddle/pserver/test/test_ProtoServer.cpp
@@ -21,10 +21,10 @@ limitations under the License. */
 #include "paddle/pserver/ProtoServer.h"
 #include "paddle/utils/Stat.h"
 
-P_DEFINE_string(server_addr, "127.0.0.1", "Server address");
-P_DEFINE_int64(dim, 50000000, "Data size");
-P_DEFINE_bool(test_proto_server, true, "whether to test ProtoServer");
-P_DEFINE_bool(benchmark, false, "Do benchmark. Skip some tests");
+DEFINE_string(server_addr, "127.0.0.1", "Server address");
+DEFINE_int64(dim, 50000000, "Data size");
+DEFINE_bool(test_proto_server, true, "whether to test ProtoServer");
+DEFINE_bool(benchmark, false, "Do benchmark. Skip some tests");
 
 using namespace paddle;  // NOLINT
 
diff --git a/paddle/py_paddle/dataprovider_converter.py b/paddle/py_paddle/dataprovider_converter.py
index edcefba6a8..981d10afda 100644
--- a/paddle/py_paddle/dataprovider_converter.py
+++ b/paddle/py_paddle/dataprovider_converter.py
@@ -15,6 +15,7 @@
 import paddle.trainer.PyDataProvider2 as dp2
 import collections
 import swig_paddle
+import numpy
 
 __all__ = ['DataProviderConverter']
 
@@ -35,18 +36,18 @@ class IScanner(object):
 class DenseScanner(IScanner):
     def __init__(self, input_type, pos):
         IScanner.__init__(self, input_type, pos)
-        self.__mat__ = []
-        self.__height__ = 0
+        self.__mat__ = None
 
     def scan(self, dat):
-        self.__mat__.extend(dat)
-        self.__height__ += 1
+        if self.__mat__ is None:
+            self.__mat__ = numpy.array([dat], dtype='float32')
+        else:
+            self.__mat__ = numpy.append(self.__mat__, [dat], axis=0)
 
     def finish_scan(self, argument):
         assert isinstance(argument, swig_paddle.Arguments)
         assert isinstance(self.input_type, dp2.InputType)
-        m = swig_paddle.Matrix.createDense(self.__mat__, self.__height__,
-                                           self.input_type.dim, False)
+        m = swig_paddle.Matrix.createDenseFromNumpy(self.__mat__, True, False)
         argument.setSlotValue(self.pos, m)
 
 
diff --git a/paddle/scripts/CMakeLists.txt b/paddle/scripts/CMakeLists.txt
index 1bae396a18..66a46e1883 100644
--- a/paddle/scripts/CMakeLists.txt
+++ b/paddle/scripts/CMakeLists.txt
@@ -2,8 +2,16 @@ configure_file(submit_local.sh.in
     submit_local.sh
     @ONLY)
 
-
 install(FILES ${CMAKE_CURRENT_BINARY_DIR}/submit_local.sh DESTINATION bin
         PERMISSIONS OWNER_EXECUTE OWNER_WRITE OWNER_READ
             GROUP_EXECUTE GROUP_READ WORLD_EXECUTE WORLD_READ
         RENAME paddle)
+
+# Stage usage.sh in the build tree, then install it as opt/paddle/bin/paddle_usage.
+configure_file("${CMAKE_CURRENT_SOURCE_DIR}/tools/usage_stat/usage.sh"
+    "${CMAKE_CURRENT_BINARY_DIR}/usage.sh" @ONLY)
+install(FILES "${CMAKE_CURRENT_BINARY_DIR}/usage.sh"
+        DESTINATION opt/paddle/bin
+        RENAME paddle_usage
+        PERMISSIONS OWNER_EXECUTE OWNER_WRITE OWNER_READ
+            GROUP_EXECUTE GROUP_READ WORLD_EXECUTE WORLD_READ)
diff --git a/paddle/scripts/docker/Dockerfile b/paddle/scripts/docker/Dockerfile
index 207f97c4a6..1522be023f 100644
--- a/paddle/scripts/docker/Dockerfile
+++ b/paddle/scripts/docker/Dockerfile
@@ -2,6 +2,8 @@ FROM ubuntu:14.04
 MAINTAINER PaddlePaddle Authors <paddle-dev@baidu.com>
 
 ARG DEBIAN_FRONTEND=noninteractive
+ARG UBUNTU_MIRROR
+RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ubuntu.com#${UBUNTU_MIRROR}#g' /etc/apt/sources.list; fi'
 RUN apt-get update \
     && apt-get install -y cmake libprotobuf-dev protobuf-compiler git \
     libgoogle-glog-dev libgflags-dev libgtest-dev \
@@ -13,19 +15,7 @@ RUN apt-get update \
     && apt-get clean -y
 RUN cd /usr/src/gtest && cmake . && make && cp *.a /usr/lib
 RUN pip install -U BeautifulSoup docopt PyYAML pillow \
-    sphinx sphinx_rtd_theme recommonmark
-
-# cmake tends to hide and blur the dependencies between code modules, as
-# noted here https://github.com/PaddlePaddle/Paddle/issues/763. We are
-# thinking about using Bazel to fix this problem, e.g.,
-# https://github.com/PaddlePaddle/Paddle/issues/681#issuecomment-263996102. To
-# start the trail of fixing, we add Bazel to our Dockerfiles.
-RUN apt-get update && apt-get install -y curl software-properties-common \
-    && add-apt-repository ppa:webupd8team/java \
-    && echo "oracle-java8-installer shared/accepted-oracle-license-v1-1 select true" | debconf-set-selections \
-    && echo "deb [arch=amd64] http://storage.googleapis.com/bazel-apt stable jdk1.8" | tee /etc/apt/sources.list.d/bazel.list \
-    && curl https://bazel.build/bazel-release.pub.gpg | apt-key add - \
-    && apt-get update && apt-get install -y oracle-java8-installer bazel
+    sphinx sphinx_rtd_theme recommonmark jupyter
 
 ARG WITH_AVX
 ARG WITH_DOC
@@ -53,4 +43,13 @@ RUN echo 'root:root' | chpasswd
 RUN sed -ri 's/^PermitRootLogin\s+.*/PermitRootLogin yes/' /etc/ssh/sshd_config
 RUN sed -ri 's/UsePAM yes/#UsePAM yes/g' /etc/ssh/sshd_config
 EXPOSE 22
-CMD ["/usr/sbin/sshd", "-D"]
+
+# Jupyter Notebook directory.
+RUN mkdir /notes/
+WORKDIR "/notes"
+EXPOSE 8888
+
+RUN mkdir -p /opt/bin
+COPY ./paddle/scripts/docker/entrypoint /opt/bin/
+
+CMD ["/opt/bin/entrypoint"]
diff --git a/paddle/scripts/docker/Dockerfile.gpu b/paddle/scripts/docker/Dockerfile.gpu
index 33f6adfea2..09f07043e2 100644
--- a/paddle/scripts/docker/Dockerfile.gpu
+++ b/paddle/scripts/docker/Dockerfile.gpu
@@ -2,6 +2,8 @@ FROM nvidia/cuda:7.5-cudnn5-devel-ubuntu14.04
 MAINTAINER PaddlePaddle Authors <paddle-dev@baidu.com>
 
 ARG DEBIAN_FRONTEND=noninteractive
+ARG UBUNTU_MIRROR
+RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ubuntu.com#${UBUNTU_MIRROR}#g' /etc/apt/sources.list; fi'
 RUN apt-get update \
     && apt-get install -y cmake libprotobuf-dev protobuf-compiler git \
     libgoogle-glog-dev libgflags-dev libgtest-dev \
@@ -13,19 +15,7 @@ RUN apt-get update \
     && apt-get clean -y
 RUN cd /usr/src/gtest && cmake . && make && cp *.a /usr/lib
 RUN pip install -U BeautifulSoup docopt PyYAML pillow \
-    sphinx sphinx_rtd_theme recommonmark
-
-# cmake tends to hide and blur the dependencies between code modules, as
-# noted here https://github.com/PaddlePaddle/Paddle/issues/763. We are
-# thinking about using Bazel to fix this problem, e.g.,
-# https://github.com/PaddlePaddle/Paddle/issues/681#issuecomment-263996102. To
-# start the trail of fixing, we add Bazel to our Dockerfiles.
-RUN apt-get update && apt-get install -y curl software-properties-common \
-    && add-apt-repository ppa:webupd8team/java \
-    && echo "oracle-java8-installer shared/accepted-oracle-license-v1-1 select true" | debconf-set-selections \
-    && echo "deb [arch=amd64] http://storage.googleapis.com/bazel-apt stable jdk1.8" | tee /etc/apt/sources.list.d/bazel.list \
-    && curl https://bazel.build/bazel-release.pub.gpg | apt-key add - \
-    && apt-get update && apt-get install -y oracle-java8-installer bazel
+    sphinx sphinx_rtd_theme recommonmark jupyter
 
 ARG WITH_AVX
 ARG WITH_DOC
@@ -53,4 +43,13 @@ RUN echo 'root:root' | chpasswd
 RUN sed -ri 's/^PermitRootLogin\s+.*/PermitRootLogin yes/' /etc/ssh/sshd_config
 RUN sed -ri 's/UsePAM yes/#UsePAM yes/g' /etc/ssh/sshd_config
 EXPOSE 22
-CMD ["/usr/sbin/sshd", "-D"]
+
+# Jupyter Notebook directory.
+RUN mkdir /notes/
+WORKDIR "/notes"
+EXPOSE 8888
+
+RUN mkdir -p /opt/bin
+COPY ./paddle/scripts/docker/entrypoint /opt/bin/
+
+CMD ["/opt/bin/entrypoint"]
diff --git a/paddle/scripts/docker/build.sh b/paddle/scripts/docker/build.sh
index ca3f1c3f18..7edba3dd09 100755
--- a/paddle/scripts/docker/build.sh
+++ b/paddle/scripts/docker/build.sh
@@ -43,5 +43,7 @@ cp -rv /woboq/data $WOBOQ_OUT/../data
     -o $WOBOQ_OUT \
     -p paddle:/paddle
 /woboq/indexgenerator/codebrowser_indexgenerator $WOBOQ_OUT
-
+cd /woboq
+make clean
+rm -rf /paddle/build
 trap : 0
diff --git a/paddle/scripts/docker/entrypoint b/paddle/scripts/docker/entrypoint
new file mode 100755
index 0000000000..87083467f5
--- /dev/null
+++ b/paddle/scripts/docker/entrypoint
@@ -0,0 +1,8 @@
+#!/bin/bash
+# Start sshd and Jupyter in the background and stream their shared log file.
+LOG=/var/log/all
+touch "$LOG"
+# Redirect stderr as well as stdout so error output also lands in the log.
+/usr/sbin/sshd -D >> "$LOG" 2>&1 &
+jupyter notebook --ip=0.0.0.0 /notes/ >> "$LOG" 2>&1 &
+tail -f "$LOG"
diff --git a/paddle/scripts/submit_local.sh.in b/paddle/scripts/submit_local.sh.in
index ace2c0dee9..f29d32f0d9 100644
--- a/paddle/scripts/submit_local.sh.in
+++ b/paddle/scripts/submit_local.sh.in
@@ -21,8 +21,6 @@ function version(){
         echo "    with_double: @WITH_DOUBLE@"
         echo "    with_python: @WITH_PYTHON@"
         echo "    with_rdma: @WITH_RDMA@"
-        echo "    with_glog: @WITH_GLOG@"
-        echo "    with_gflags: @WITH_GFLAGS@"
         echo "    with_metric_learning: @WITH_METRIC@"
         echo "    with_timer: @WITH_TIMER@"
         echo "    with_predict_sdk: @WITH_PREDICT_SDK@"
@@ -124,6 +122,9 @@ case "$1" in
     "make_diagram")
         python -m paddle.utils.make_model_diagram ${@:2}
         ;;
+    "usage")
+        $MYDIR/../opt/paddle/bin/paddle_usage ${@:2}
+        ;;
     "version")
         version
         ;;
diff --git a/paddle/scripts/tools/usage_stat/usage.sh b/paddle/scripts/tools/usage_stat/usage.sh
new file mode 100755
index 0000000000..7dbd1f5884
--- /dev/null
+++ b/paddle/scripts/tools/usage_stat/usage.sh
@@ -0,0 +1,168 @@
+#!/bin/bash
+
+ARGPARSE=`getopt -o u:vin:l:e: --long git-user:,help,dry-run,task-name:,log-file:,exit-code:  -- "$@"`
+KEEP_ANONYMOUS="A_USER_DOES_NOT_TELL_US"
+# paddle config home dir, same as paddle
+PADDLE_CONF_HOME="$HOME/.config/paddle"
+# api url, mirror url(s) will be append later
+PD_URLS="http://api.paddlepaddle.org/version"
+
+usage() {  # print the command-line help, one line per accepted option
+    echo "Usage: `basename "$0"` [options]"
+    echo "Options:"
+    echo "  -e, --exit-code=EXIT_CODE         The train/predict process's exit code"
+    echo "  -l, --log-file=LOG_FILE_PATH      Read which log file to get the duration of process"
+    echo "  -n, --task-name=TASK_NAME         The name of demo or example"
+    echo "  -u, --git-user=GITHUB_USER        provide contact info, like username or email"
+    echo "  -v, -i                            Verbose output and interact with user when necessary"
+    echo "      --dry-run                     Print the curl command instead of uploading"
+    echo "      --help                        display this help message"
+}
+
+eval set -- "${ARGPARSE}"
+while true; do
+    case "$1" in
+        -l|--log-file)
+            log_file=$2
+            shift 2
+            ;;
+        -e|--exit-code)
+            exit_code=$2
+            shift 2
+            ;;
+        -u|--git-user)
+            github_user=$2
+            shift 2
+            ;;
+        -n|--task-name)
+            task=$2
+            shift 2
+            ;;
+        -v|-i)
+            v=1
+            shift
+            ;;
+        --dry-run)
+            dry_run=1
+            shift
+            ;;
+        --)
+            shift
+            break
+            ;;
+        --help)
+            usage
+            exit 0
+            ;;
+        *)
+            echo "Invalid option $1"
+            usage
+            exit 1
+            ;;
+    esac
+done
+
+# parse the log_file to get the time costs
+if [ -s "${log_file}" ]; then
+    duration=`awk 'BEGIN{day=0;last_sec=0;min_sec=0;max_sec=0;}
+    {if(index($2,":")==3){
+        t=substr($2,1,8);
+        sec=day*86400+substr(t,1,2)*3600+substr(t,4,2)*60+substr(t,7,2);
+        if(sec<last_sec-600){day+=1;sec+=86400;}
+        last_sec=sec;
+        if(min_sec==0 || min_sec>sec){min_sec=sec;}
+        if(max_sec==0 || max_sec<sec){max_sec=sec;}
+    }}
+    END{print max_sec-min_sec}' ${log_file}`
+else
+    duration=-1
+fi
+if [ "${v}" = "1" ]; then echo "duration: ${duration}"; fi
+
+# try find the user/email if not given
+if [ -z "${github_user}" ]; then
+    # search for cached username
+    if [ -s "${PADDLE_CONF_HOME}/github_user" ]; then
+        if [ "${v}" = "1" ]; then echo "read github_user from cache..."; fi
+        github_user=`cat ${PADDLE_CONF_HOME}/github_user`
+    else
+        # search the github-user from git config
+        if [ "${v}" = "1" ]; then echo "read github_user from git..."; fi
+        git_username=`git config --get user.name 2>/dev/null`
+        git_url=`git config --get remote.origin.url 2>/dev/null`
+        if [ "`echo ${git_url} | cut -b 1-19`" = "https://github.com/" ]; then
+            # under a git url, like https://github.com/user_xxx/proj_yyy.git
+            if [ "${v}" = "1" ]; then echo " from github url..."; fi
+            github_user=`echo ${git_url} | cut -d "/" -f 4`
+            if [ "${github_user}" = "PaddlePaddle" ]; then
+                github_user=
+            fi
+        fi
+        if [ -n "${git_username}" -a -z "${github_user}" ]; then
+            if [ "${v}" = "1" ]; then echo " from global git username..."; fi
+            github_user=${git_username}
+        fi
+    fi
+fi
+# allow user to set the user name, if it's not found
+if [ -z "${github_user}" -a "${v}" = "1" ]; then
+    read -p "Please input your github username or email, or just return to keep this feedback anonymous:"
+    github_user=${REPLY}
+    if [ -z "${github_user}" ]; then
+        # empty input, consider as one anonymous user
+        github_user="${KEEP_ANONYMOUS}"
+    fi
+fi
+if [ -n "${github_user}" -a -z "${dry_run}" ]; then
+    # valid user and not in dry-run mode, then save to cache
+    mkdir -p ${PADDLE_CONF_HOME}
+    echo "${github_user}" >${PADDLE_CONF_HOME}/github_user
+fi
+if [ "${v}" = "1" ]; then echo "username: ${github_user}"; fi
+if [ "${github_user}" = "${KEEP_ANONYMOUS}" ]; then
+    # anonymous user should keep the var empty.
+    github_user=
+fi
+
+# read local paddle version
+paddle_version=`paddle version | grep PaddlePaddle | head -n1 | cut -d " " -f 2 | cut -d "," -f 1`
+if [ "${v}" = "1" ]; then echo "version:${paddle_version}"; fi
+
+# read local system time
+system_time=`date "+%Y%m%d%H%M%S"`
+if [ "${v}" = "1" ]; then echo "system time:${system_time}"; fi
+
+# make empty job_name as default value.
+if [ -z "${task}" ]; then
+    task="(unknown_task)"
+fi
+if [ "${v}" = "1" ]; then echo "task: ${task}"; fi
+
+# build the curl command as an array so that values with spaces stay intact
+params="content={\"data_type\":\"usage\",\
+\"system_time\":${system_time},\"paddle_version\":\"${paddle_version}\",\
+\"github_user\":\"${github_user}\",\"job_name\":\"${task}\",\
+\"duration\":${duration},\"exit_code\":\"${exit_code}\"\
+}&type=1"
+cookie="${PADDLE_CONF_HOME}/paddle.cookie"
+curl_cmd_prefix=(curl -m 5 -X POST -d "${params}" -b "${cookie}" -c "${cookie}")
+
+if [ "${dry_run}" = "1" ]; then
+    first_url=`echo ${PD_URLS} | cut -d " " -f 1`
+    echo "(dry-run mode)curl command: ${curl_cmd_prefix[*]} ${first_url}"
+    exit 0
+else
+    for u in ${PD_URLS}; do
+        curl_cmd=("${curl_cmd_prefix[@]}" "${u}")
+        if [ "${v}" = "1" ]; then echo "run: ${curl_cmd[*]}"; fi
+        # run the array itself: re-splitting a flat string breaks such values
+        if "${curl_cmd[@]}" >/dev/null 2>&1; then
+            if [ "${v}" = "1" ]; then echo "upload OK!"; fi
+            exit 0
+        else
+            if [ "${v}" = "1" ]; then echo "upload failed...try next"; fi
+        fi
+    done
+    if [ "${v}" = "1" ]; then echo "all urls tried but all failed...exit"; fi
+    exit 1
+fi
diff --git a/paddle/scripts/travis/before_install.linux.sh b/paddle/scripts/travis/before_install.linux.sh
deleted file mode 100755
index ec2ac1f224..0000000000
--- a/paddle/scripts/travis/before_install.linux.sh
+++ /dev/null
@@ -1,7 +0,0 @@
-#!/bin/bash
-set -e
-pushd /usr/src/gtest
-cmake .
-make
-sudo cp *.a /usr/lib
-popd
diff --git a/paddle/scripts/travis/before_install.osx.sh b/paddle/scripts/travis/before_install.osx.sh
index f438e69b82..7036f971fd 100755
--- a/paddle/scripts/travis/before_install.osx.sh
+++ b/paddle/scripts/travis/before_install.osx.sh
@@ -2,12 +2,5 @@
 brew update
 brew tap homebrew/science
 brew install python
-sudo pip install --upgrade protobuf==2.6.0
-brew install homebrew/versions/protobuf260 --without-python
-brew install cmake python glog gflags openblas wget md5sha1sum
-
-wget https://github.com/google/googletest/archive/release-1.8.0.tar.gz -O gtest.tar.gz
-tar xf gtest.tar.gz
-cd googletest-release-1.8.0/
-cmake .
-make install
+sudo pip install --upgrade protobuf
+brew install swig openblas md5sha1sum protobuf
diff --git a/paddle/scripts/travis/build_and_test.sh b/paddle/scripts/travis/build_and_test.sh
index 9caeb21beb..fd3aeb02b2 100755
--- a/paddle/scripts/travis/build_and_test.sh
+++ b/paddle/scripts/travis/build_and_test.sh
@@ -1,27 +1,19 @@
 #!/bin/bash
-./build_submodules.sh
 source ./common.sh
-CMAKE_EXTRA=""
-if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then
-  CMAKE_EXTRA="-DPYTHON_LIBRARY=/usr/local/Cellar/python/2.7.12_1/Frameworks/Python.framework/Versions/2.7/lib/python2.7/config/libpython2.7.dylib"
-else
-  CMAKE_EXTRA="-DWITH_SWIG_PY=ON"
-fi
-
-
-cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_DOC=OFF -DWITH_TESTING=ON -DON_TRAVIS=ON -DON_COVERALLS=ON ${CMAKE_EXTRA}
 
 NPROC=1
 if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then
+  export PYTHONPATH=/opt/python/2.7.12/lib/python2.7/site-packages
+  export PYTHONHOME=/opt/python/2.7.12
+  export PATH=/opt/python/2.7.12/bin:${PATH}
+  cmake .. -DON_TRAVIS=ON -DON_COVERALLS=ON -DCOVERALLS_UPLOAD=ON ${EXTRA_CMAKE_OPTS}
-  NRPOC=`nproc`
+  NPROC=`nproc`
   make -j $NPROC
   make coveralls
+  sudo make install
 elif [[ "$TRAVIS_OS_NAME" == "osx" ]]; then
+  export PYTHONPATH=/usr/local/lib/python2.7/site-packages
+  cmake .. -DON_TRAVIS=ON -DON_COVERALLS=ON -DCOVERALLS_UPLOAD=ON ${EXTRA_CMAKE_OPTS}
   NPROC=`sysctl -n hw.ncpu`
   make -j $NPROC
-  env CTEST_OUTPUT_ON_FAILURE=1 make test ARGS="-j $NPROC"
 fi
-
-
-sudo make install
-sudo paddle version
diff --git a/paddle/scripts/travis/build_submodules.sh b/paddle/scripts/travis/build_submodules.sh
deleted file mode 100755
index d458bf92bf..0000000000
--- a/paddle/scripts/travis/build_submodules.sh
+++ /dev/null
@@ -1,20 +0,0 @@
-#!/bin/bash
-set -e
-WORK_DIR=$PWD
-PROJ_ROOT=$(git rev-parse --show-cdup)
-SUBMODULES=$(grep path ${PROJ_ROOT}.gitmodules | sed 's/^.*path = //')
-
-for module in $SUBMODULES
-do
-  case $module in
-    "warp-ctc")
-      if [ -d ${PROJ_ROOT}warp-ctc/build ]; then
-        rm -rf ${PROJ_ROOT}warp-ctc/build
-      fi
-      mkdir ${PROJ_ROOT}warp-ctc/build
-      cd ${PROJ_ROOT}warp-ctc/build
-      cmake ..; make
-    ;;
-  esac
-done
-cd $WORK_DIR
diff --git a/paddle/scripts/travis/common.sh b/paddle/scripts/travis/common.sh
index 9b6e420ca7..f05c7530a3 100755
--- a/paddle/scripts/travis/common.sh
+++ b/paddle/scripts/travis/common.sh
@@ -2,3 +2,5 @@
 set -e
 mkdir -p ../../../build
 cd ../../../build
+mkdir -p $HOME/third_party
+EXTRA_CMAKE_OPTS="-DTHIRD_PARTY_PATH=${HOME}/third_party"
diff --git a/paddle/scripts/travis/docs.sh b/paddle/scripts/travis/docs.sh
index 0bbb76a8a3..bdafb145bc 100755
--- a/paddle/scripts/travis/docs.sh
+++ b/paddle/scripts/travis/docs.sh
@@ -4,9 +4,13 @@
 source ./common.sh
 
 # Compile Documentation only.
-cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_DOC=ON
+cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_DOC=ON ${EXTRA_CMAKE_OPTS}
 make paddle_docs paddle_docs_cn
 
+# check websites for broken links
+linkchecker doc/en/html/index.html
+linkchecker doc/cn/html/index.html
+
 # Parse Github URL
 REPO=`git config remote.origin.url`
 SSH_REPO=${REPO/https:\/\/github.com\//git@github.com:}
@@ -35,8 +39,8 @@ git checkout $TARGET_BRANCH || git checkout --orphan $TARGET_BRANCH
 
 # remove old docs. mv new docs.
 rm -rf doc doc_cn
-mv ../doc_cn/html doc_cn
-mv ../doc/html doc
+mv ../doc/cn/html doc_cn
+mv ../doc/en/html doc
 
 # Check is there anything changed.
 set +e
diff --git a/paddle/scripts/travis/precommit.sh b/paddle/scripts/travis/precommit.sh
index 5ad84f1821..7a59b1131d 100755
--- a/paddle/scripts/travis/precommit.sh
+++ b/paddle/scripts/travis/precommit.sh
@@ -12,6 +12,12 @@ cd ..
 export PATH=/usr/bin:$PATH
 pre-commit install
 clang-format --version
-pre-commit run -a
+
+# On hook failure, print whatever the hooks rewrote and then fail the build
+# unconditionally: a hook can fail without modifying any file.
+if ! pre-commit run -a ; then
+  git diff
+  exit 1
+fi
 
 trap : 0
diff --git a/paddle/setup.py.in b/paddle/setup.py.in
index b4c38a41b8..c79666bc81 100644
--- a/paddle/setup.py.in
+++ b/paddle/setup.py.in
@@ -14,7 +14,9 @@
 
 # This file is used to build paddle python binding package.
 # It will be invoked by Makefile that generated by COMAKE
+
 from setuptools import setup, Extension
+
 import numpy as np
 import api.paddle_ld_flags
 import platform
@@ -30,8 +32,10 @@ is_lin = (system == 'linux')
 # The extra links will passed from COMAKE
 #   because generate paddle LDFLAGS is too complicated to do in setup.py
 #   it just read COMAKE generated LDFLAGS.
+extra_comps = []
 extra_links = []
 obj = api.paddle_ld_flags.PaddleLDFlag()
+extra_comps = obj.c_flag()
 ldflags = obj.ldflag_str()
 if ldflags is not None:
   extra_links.extend(ldflags.split(" "))
@@ -51,26 +55,21 @@ elif is_osx == True:
 
 include_dirs = [np.get_include(), "../"]    # include numpy and paddle.
 
-extra_c = obj.c_flag()
-
-attr=dict()
-if extra_c is not None:
-  attr["extra_compile_args"] = extra_c
-
 setup(name="py_paddle",
   version="@PADDLE_VERSION@",
   ext_modules=[
     Extension('py_paddle._swig_paddle',      # Build SWIG Extension.
        ['Paddle_wrap.cxx'],
+       language = "c++",
        include_dirs = include_dirs,
        extra_link_args = extra_links,
-       **attr
+       extra_compile_args = extra_comps
     )
   ],
   packages=['py_paddle'],
   include_dirs = include_dirs,
   install_requires = [
     'numpy>=1.8.0',      # The numpy is required.
-    'protobuf>=2.4.1' # The paddle protobuf version
+    'protobuf>=3.0.0'    # The paddle protobuf version
   ],
 )
diff --git a/paddle/testing/CMakeLists.txt b/paddle/testing/CMakeLists.txt
new file mode 100644
index 0000000000..c47add04b0
--- /dev/null
+++ b/paddle/testing/CMakeLists.txt
@@ -0,0 +1,8 @@
+# Static helper libraries (paddle_test_main, paddle_test_util) for test cases;
+# both are ordered after gen_proto_cpp and only defined under WITH_TESTING.
+if(WITH_TESTING)
+  add_library(paddle_test_main STATIC TestMain.cpp)
+  add_dependencies(paddle_test_main gen_proto_cpp)
+  add_library(paddle_test_util STATIC TestUtil.cpp)
+  add_dependencies(paddle_test_util gen_proto_cpp)
+endif()
diff --git a/paddle/utils/CompilerMacros.h b/paddle/testing/TestMain.cpp
similarity index 75%
rename from paddle/utils/CompilerMacros.h
rename to paddle/testing/TestMain.cpp
index e50093f7fc..3e14532d18 100644
--- a/paddle/utils/CompilerMacros.h
+++ b/paddle/testing/TestMain.cpp
@@ -12,6 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#pragma once
+#include <gtest/gtest.h>
+#include "paddle/utils/Util.h"
 
-#define ATTR_NORETURN __attribute__((noreturn))
+int main(int argc, char** argv) {
+  testing::InitGoogleTest(&argc, argv);
+  paddle::initMain(argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/paddle/gserver/tests/TestUtil.cpp b/paddle/testing/TestUtil.cpp
similarity index 98%
rename from paddle/gserver/tests/TestUtil.cpp
rename to paddle/testing/TestUtil.cpp
index e656da5b8f..c691fe2625 100644
--- a/paddle/gserver/tests/TestUtil.cpp
+++ b/paddle/testing/TestUtil.cpp
@@ -13,11 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "TestUtil.h"
-
+#include <gflags/gflags.h>
 #include "paddle/math/SparseMatrix.h"
-#include "paddle/utils/CommandLineParser.h"
 
-P_DEFINE_int32(fixed_seq_length, 0, "Produce some sequence of fixed length");
+DEFINE_int32(fixed_seq_length, 0, "Produce some sequence of fixed length");
 
 namespace paddle {
 
diff --git a/paddle/gserver/tests/TestUtil.h b/paddle/testing/TestUtil.h
similarity index 100%
rename from paddle/gserver/tests/TestUtil.h
rename to paddle/testing/TestUtil.h
diff --git a/paddle/trainer/MergeModel.cpp b/paddle/trainer/MergeModel.cpp
index 1cf29a39b9..91d89b61a3 100644
--- a/paddle/trainer/MergeModel.cpp
+++ b/paddle/trainer/MergeModel.cpp
@@ -19,8 +19,8 @@ limitations under the License. */
 #include "paddle/pserver/ParameterServer2.h"
 #include "paddle/utils/PythonUtil.h"
 
-P_DEFINE_string(model_dir, "", "Directory for separated model files");
-P_DEFINE_string(model_file, "", "File for merged model file");
+DEFINE_string(model_dir, "", "Directory for separated model files");
+DEFINE_string(model_file, "", "File for merged model file");
 
 using namespace paddle;  // NOLINT
 using namespace std;     // NOLINT
diff --git a/paddle/trainer/ParameterUpdater.cpp b/paddle/trainer/ParameterUpdater.cpp
index 8b5b95da5b..4e9e890c85 100644
--- a/paddle/trainer/ParameterUpdater.cpp
+++ b/paddle/trainer/ParameterUpdater.cpp
@@ -34,7 +34,8 @@ SgdUpdaterWithCpuAverager::SgdUpdaterWithCpuAverager(
   updateWorker_.addJob([]() { hl_set_device(FLAGS_gpu_id); });
 }
 
-void SgdUpdaterWithCpuAverager::init(std::vector<ParameterPtr>& parameters) {
+void SgdUpdaterWithCpuAverager::init(
+    const std::vector<ParameterPtr>& parameters) {
   SgdLocalUpdater::init(parameters);
   averager_->init(parameters_.size(), nullptr);
   copyEvents_.resize(parameters_.size());
diff --git a/paddle/trainer/ParameterUpdater.h b/paddle/trainer/ParameterUpdater.h
index e52b5cd318..c3207e63ce 100644
--- a/paddle/trainer/ParameterUpdater.h
+++ b/paddle/trainer/ParameterUpdater.h
@@ -64,7 +64,7 @@ public:
    *           be initialized.
    * @param parameters The parameter need to be initialized.
    */
-  virtual void init(std::vector<ParameterPtr>& parameters) {
+  virtual void init(const std::vector<ParameterPtr>& parameters) {
     ParameterUpdater::init(parameters);
     optimizer_->init(parameters_.size(), nullptr);
     // check no L1 decay in parameter configs
@@ -102,9 +102,9 @@ public:
    * @param cost sum cost during one pass.
    * @return true if accept (used for owlqn).
    */
-  virtual bool finishPass(real cost) {
+  virtual bool finishPass() {
     optimizer_->finishPass();
-    return ParameterUpdater::finishPass(cost);
+    return ParameterUpdater::finishPass();
   }
 
   /**
@@ -208,7 +208,7 @@ public:
    * @brief init. Initialize cpu parameters, model average optimizer.
    * @param parameters
    */
-  virtual void init(std::vector<ParameterPtr>& parameters);
+  virtual void init(const std::vector<ParameterPtr>& parameters);
 
   virtual PassType startBatch(int64_t batchSize) {
     averager_->startBatch(-1UL);
@@ -220,9 +220,9 @@ public:
     averager_->startPass();
     SgdLocalUpdater::startPass();
   }
-  virtual bool finishPass(real cost) {
+  virtual bool finishPass() {
     averager_->finishPass();
-    return SgdLocalUpdater::finishPass(cost);
+    return SgdLocalUpdater::finishPass();
   }
 
   /// apply the averaged parameter to PARAMETER_VALUE
diff --git a/paddle/trainer/RemoteParameterUpdater.cpp b/paddle/trainer/RemoteParameterUpdater.cpp
index b7f7b93b8d..6939738203 100644
--- a/paddle/trainer/RemoteParameterUpdater.cpp
+++ b/paddle/trainer/RemoteParameterUpdater.cpp
@@ -17,8 +17,8 @@ limitations under the License. */
 #include "paddle/utils/GlobalConstants.h"
 #include "paddle/utils/Stat.h"
 
-P_DECLARE_int32(trainer_id);
-P_DECLARE_string(save_dir);
+DECLARE_int32(trainer_id);
+DECLARE_string(save_dir);
 
 namespace paddle {
 
@@ -44,7 +44,7 @@ RemoteParameterUpdater::RemoteParameterUpdater(
   addParameterType(PARAMETER_MOMENTUM);
 }
 
-void RemoteParameterUpdater::init(std::vector<ParameterPtr>& parameters) {
+void RemoteParameterUpdater::init(const std::vector<ParameterPtr>& parameters) {
   ParameterUpdater::init(parameters);
 
   if (localUpdater_) {
@@ -309,7 +309,7 @@ void RemoteParameterUpdater::startPass() {
   }
 }
 
-bool RemoteParameterUpdater::finishPass(real cost) {
+bool RemoteParameterUpdater::finishPass() {
   if (localUpdater_) {
     localUpdater_->finishPass();
   }
@@ -595,7 +595,8 @@ SparseRemoteParameterUpdater::SparseRemoteParameterUpdater(
       testing_(testing),
       useApplyInPserver_(false) {}
 
-void SparseRemoteParameterUpdater::init(std::vector<ParameterPtr>& parameters) {
+void SparseRemoteParameterUpdater::init(
+    const std::vector<ParameterPtr>& parameters) {
   ParameterUpdater::init(parameters);
 
   parameterClient_.reset(new ParameterClient2(
@@ -711,7 +712,7 @@ void SparseRemoteParameterUpdater::startPass() {
   }
 }
 
-bool SparseRemoteParameterUpdater::finishPass(real cost) {
+bool SparseRemoteParameterUpdater::finishPass() {
   if (config_.algorithm() == TrainAlgorithm::SGD) {
     parameterClient_->waitPassFinish();
   } else {
@@ -809,7 +810,7 @@ void SparseRemoteParameterUpdater::saveParametersRemote(
 }
 
 void SparseRemoteParameterUpdaterComposite::init(
-    std::vector<ParameterPtr>& parameters) {
+    const std::vector<ParameterPtr>& parameters) {
   parameters_ = parameters;
 
   std::vector<ParameterPtr> parametersArray[NUMBER_UPDATERS];
diff --git a/paddle/trainer/RemoteParameterUpdater.h b/paddle/trainer/RemoteParameterUpdater.h
index 66055c778e..5e82c94475 100644
--- a/paddle/trainer/RemoteParameterUpdater.h
+++ b/paddle/trainer/RemoteParameterUpdater.h
@@ -56,7 +56,7 @@ class RemoteParameterUpdater : public ParameterUpdater {
 public:
   RemoteParameterUpdater(
       const OptimizationConfig& config,
-      int expectedPpassCount,
+      int expectedPassCount,
       std::unique_ptr<ParameterUpdater>&& localUpdater = nullptr);
   ~RemoteParameterUpdater() {
     if (controllerThread_) {
@@ -67,7 +67,7 @@ public:
   /**
    * initialize the internal parameter client and itself.
    */
-  virtual void init(std::vector<ParameterPtr>& parameters);
+  virtual void init(const std::vector<ParameterPtr>& parameters);
   /**
    * @brief start batch
    *
@@ -90,7 +90,7 @@ public:
    */
   virtual void finishBatch(real cost);
   virtual void startPass();
-  virtual bool finishPass(real cost);
+  virtual bool finishPass();
 
 #ifndef PADDLE_DISABLE_TIMER
   virtual void setForwardbackwardTime(uint64_t delta) {
@@ -146,7 +146,7 @@ protected:
   BatchStatus batchStatus_;
   /// controller thread for sync-sgd
   std::unique_ptr<std::thread> controllerThread_;
-  /// passed alread finished
+  /// number of passes already finished
   int64_t passCount_;
   /// expected passes to finished
   int64_t expectedPassCount_;
@@ -274,14 +274,14 @@ public:
   }
 
   /// initialization
-  virtual void init(std::vector<ParameterPtr>& parameters);
+  virtual void init(const std::vector<ParameterPtr>& parameters);
 
   /// stateful batch control
   virtual PassType startBatch(int64_t batchSize);
   /// send all sparse related parameters to all pservers
   virtual void finishBatch(real cost);
   virtual void startPass();
-  virtual bool finishPass(real cost);
+  virtual bool finishPass();
 
   virtual void apply();
   virtual void restore();
@@ -360,7 +360,7 @@ public:
   }
 
   /// initialization of dense and sparse updaters
-  virtual void init(std::vector<ParameterPtr>& parameters);
+  virtual void init(const std::vector<ParameterPtr>& parameters);
 };
 
 class ParameterUpdaterCreators {
diff --git a/paddle/trainer/Tester.cpp b/paddle/trainer/Tester.cpp
index 97d1b53934..13aa28ae5d 100644
--- a/paddle/trainer/Tester.cpp
+++ b/paddle/trainer/Tester.cpp
@@ -46,6 +46,12 @@ Tester::Tester(const std::shared_ptr<TrainerConfigHelper>& config,
       gradientMachine_(gradientMachine),
       parameterUpdater_(parameterUpdater),
       testDataProvider_(testDataProvider) {
+  if (config_->getOptConfig().use_sparse_remote_updater()) {
+    LOG(FATAL) << "It's prohibited to set sparse_remote_update "
+               << "when doing train and test jobs in the same "
+               << "process. You could run paddle --job=test in "
+               << "a separate process.";
+  }
   testEvaluator_.reset(gradientMachine_->makeEvaluator());
   if (intconfig_->distributeTest) {
     testParameterClient_.reset(new ParameterClient2(true));
@@ -251,7 +257,7 @@ void Tester::test() {
   CHECK(testDataProvider_) << "TestData is not specified";
   testDataProvider_->setSkipShuffle();
   testDataProvider_->reset();
-  gradientMachine_->start(*config_, testDataProvider_);
+  gradientMachine_->start();
 
   // For evaluation
   std::vector<std::string> modelList;
diff --git a/paddle/trainer/ThreadParameterUpdater.cpp b/paddle/trainer/ThreadParameterUpdater.cpp
index bee7f061fe..870d4a4b02 100644
--- a/paddle/trainer/ThreadParameterUpdater.cpp
+++ b/paddle/trainer/ThreadParameterUpdater.cpp
@@ -19,7 +19,7 @@ limitations under the License. */
 #include "paddle/math/SparseRowMatrix.h"
 #include "paddle/utils/Thread.h"
 
-P_DECLARE_int32(trainer_count);
+DECLARE_int32(trainer_count);
 
 namespace paddle {
 
@@ -32,7 +32,7 @@ SgdThreadUpdater::SgdThreadUpdater(const OptimizationConfig& optConfig)
   }
 }
 
-void SgdThreadUpdater::init(std::vector<ParameterPtr>& parameters) {
+void SgdThreadUpdater::init(const std::vector<ParameterPtr>& parameters) {
   ParameterUpdater::init(parameters);
 
   // calc max parameter id
@@ -55,6 +55,9 @@ void SgdThreadUpdater::init(std::vector<ParameterPtr>& parameters) {
       // not create parameter buf for PARAMETER_GRADIENT for sparse update in
       // Parameter::enableType(). But gradient parameter buf is still used
       // in SgdThreadUpdater. We need to explicitly create it.
+      //
+      // The AverageOptimizer::restore/apply method will use PARAMETER_GRADIENT
+      // as a temp buffer.
       para->enableBufType(PARAMETER_GRADIENT);
     }
   }
@@ -67,7 +70,7 @@ void SgdThreadUpdater::startPass() {
   }
 }
 
-bool SgdThreadUpdater::finishPass(real cost) {
+bool SgdThreadUpdater::finishPass() {
   catchUpWith();
 
   for (auto& para : parameters_) {
diff --git a/paddle/trainer/ThreadParameterUpdater.h b/paddle/trainer/ThreadParameterUpdater.h
index d01ac689f9..bc08a9e9f0 100644
--- a/paddle/trainer/ThreadParameterUpdater.h
+++ b/paddle/trainer/ThreadParameterUpdater.h
@@ -33,8 +33,8 @@ namespace paddle {
    because at the current moment, the merging on CPU is happening on the
    main thread, and the its parameter size can be much larger than the one GPU.
    Thus, for GPU, the parameter updates happens in updateImpl() function, which
-   is called by gradient machines as a callback function as a callback function
-   supplied to backward() and forwardBackward().
+   is called by gradient machines as a callback function supplied to backward()
+   and forwardBackward().
    For CPU, the parameter updates happens in separate threads maintained by this
    class.
  */
@@ -47,9 +47,9 @@ public:
   virtual void startPass();
 
   // Use the finishPass() function of the base optimizer.
-  virtual bool finishPass(real cost);
+  virtual bool finishPass();
 
-  virtual void init(std::vector<ParameterPtr>& parameters);
+  virtual void init(const std::vector<ParameterPtr>& parameters);
   virtual PassType startBatch(int64_t batchSize);
   // Call finishBatch for each optimizer.
   virtual void finishBatch(real cost);
diff --git a/paddle/trainer/Trainer.cpp b/paddle/trainer/Trainer.cpp
index 85610ec04e..8465addaf9 100644
--- a/paddle/trainer/Trainer.cpp
+++ b/paddle/trainer/Trainer.cpp
@@ -14,7 +14,6 @@ limitations under the License. */
 
 #include "Trainer.h"
 
-#include <fenv.h>
 #include <stdio.h>
 
 #include <iomanip>
@@ -24,7 +23,7 @@ limitations under the License. */
 
 #include <google/protobuf/text_format.h>
 
-#include "paddle/utils/Excepts.h"
+#include "paddle/utils/Common.h"
 #include "paddle/utils/GlobalConstants.h"
 #include "paddle/utils/PythonUtil.h"
 #include "paddle/utils/Stat.h"
@@ -38,60 +37,56 @@ limitations under the License. */
 #include "paddle/gserver/gradientmachines/NeuralNetwork.h"
 #include "paddle/gserver/layers/ValidationLayer.h"
 
-P_DEFINE_string(config, "", "Trainer config file");
-
-P_DEFINE_int32(test_period,
-               0,
-               "if equal 0, do test on all test data at the end of "
-               "each pass. While if equal non-zero, do test on all test "
-               "data every test_period batches");
-P_DEFINE_bool(test_all_data_in_one_period,
-              false,
-              "This option was deprecated, since we will always do "
-              "test on all test set ");
-
-P_DEFINE_bool(local, true, "Train in local mode or not");
-
-P_DEFINE_int32(average_test_period,
-               0,
-               "Do test on average parameter every so"
-               " many batches. MUST be devided by FLAGS_log_period."
-               " Default 0 means do not test average parameter");
-
-P_DEFINE_int32(saving_period, 1, "Save parameteres every so many passes");
-P_DEFINE_int64(saving_period_by_batches,
-               0,
-               "Save parameters every so many batches in one pass");
-P_DEFINE_string(save_dir, "", "Directory for saving model parameter");
-P_DEFINE_int32(start_pass,
-               0,
-               "Start training from this pass. "
-               "Will load parameter from the previous pass");
-P_DEFINE_int32(test_pass,
-               -1,
-               "Will load parameter start from this pass to test");
-P_DEFINE_int32(test_wait, 0, "Waiting for pass parameter if not exist");
-P_DEFINE_bool(with_cost, true, "enable cost layer or not");
-P_DEFINE_bool(distribute_test, false, "test in distribute mode");
-
-P_DEFINE_int32(num_passes, 100, "train for so many passes");
-
-P_DEFINE_string(config_args,
-                "",
-                "arguments passed to config file."
-                "Format: key1=value1,key2=value2");
-
-P_DEFINE_bool(save_only_one,
-              false,
-              "Save only parameters in last pass, remove previous.");
-
-P_DEFINE_string(feat_file, "", "File name of extracted feature.");
-P_DEFINE_string(predict_output_dir,
-                "",
-                "Directory that saves the predicted results of output layers");
-P_DEFINE_string(model_list,
-                "",
-                "File that saves the model list when evaluation");
+DEFINE_string(config, "", "Trainer config file");
+
+DEFINE_int32(test_period,
+             0,
+             "if equal 0, do test on all test data at the end of "
+             "each pass. While if equal non-zero, do test on all test "
+             "data every test_period batches");
+DEFINE_bool(test_all_data_in_one_period,
+            false,
+            "This option was deprecated, since we will always do "
+            "test on all test set ");
+
+DEFINE_bool(local, true, "Train in local mode or not");
+
+DEFINE_int32(average_test_period,
+             0,
+             "Do test on average parameter every so"
+             " many batches. MUST be devided by FLAGS_log_period."
+             " Default 0 means do not test average parameter");
+
+DEFINE_int32(saving_period, 1, "Save parameteres every so many passes");
+DEFINE_int64(saving_period_by_batches,
+             0,
+             "Save parameters every so many batches in one pass");
+DEFINE_string(save_dir, "", "Directory for saving model parameter");
+DEFINE_int32(start_pass,
+             0,
+             "Start training from this pass. "
+             "Will load parameter from the previous pass");
+DEFINE_int32(test_pass, -1, "Will load parameter start from this pass to test");
+DEFINE_int32(test_wait, 0, "Waiting for pass parameter if not exist");
+DEFINE_bool(with_cost, true, "enable cost layer or not");
+DEFINE_bool(distribute_test, false, "test in distribute mode");
+
+DEFINE_int32(num_passes, 100, "train for so many passes");
+
+DEFINE_string(config_args,
+              "",
+              "arguments passed to config file."
+              "Format: key1=value1,key2=value2");
+
+DEFINE_bool(save_only_one,
+            false,
+            "Save only parameters in last pass, remove previous.");
+
+DEFINE_string(feat_file, "", "File name of extracted feature.");
+DEFINE_string(predict_output_dir,
+              "",
+              "Directory that saves the predicted results of output layers");
+DEFINE_string(model_list, "", "File that saves the model list when evaluation");
 
 namespace paddle {
 
@@ -312,7 +307,7 @@ static double genPerturbation(real* d, real* grad, size_t dim) {
 }
 
 real Trainer::checkGradient() {
-  trainerInternal_.getGradientMachine()->start(*config_, dataProvider_);
+  trainerInternal_.getGradientMachine()->start();
   std::vector<ParameterPtr>& parameters =
       trainerInternal_.getGradientMachine()->getNonStaticParameters();
   DataBatch dataBatch;
@@ -394,7 +389,7 @@ void Trainer::startTrain() {
     dataProvider_->reset();
   }
 
-  trainerInternal_.getGradientMachine()->start(*config_, dataProvider_);
+  trainerInternal_.getGradientMachine()->start();
 }
 
 void Trainer::finishTrain() { trainerInternal_.getGradientMachine()->finish(); }
@@ -541,7 +536,7 @@ void Trainer::trainOnePassBatch(int passId) {
 
   trainerInternal_.getGradientMachine()->onPassEnd();
 
-  bool accepted = trainerInternal_.getParameterUpdater()->finishPass(cost);
+  bool accepted = trainerInternal_.getParameterUpdater()->finishPass();
 
   globalStat.setThreadInfo(true);
   globalStat.printAllStatus();
diff --git a/paddle/trainer/Trainer.h b/paddle/trainer/Trainer.h
index cabbb4acd1..7cbf18ace7 100644
--- a/paddle/trainer/Trainer.h
+++ b/paddle/trainer/Trainer.h
@@ -34,7 +34,7 @@ limitations under the License. */
 #include "paddle/internals/metric_learning/MetricTrainer.h"
 #endif
 
-P_DECLARE_int32(num_passes);
+DECLARE_int32(num_passes);
 
 namespace paddle {
 
diff --git a/paddle/trainer/TrainerBenchmark.cpp b/paddle/trainer/TrainerBenchmark.cpp
index 5c3177c808..173653c816 100644
--- a/paddle/trainer/TrainerBenchmark.cpp
+++ b/paddle/trainer/TrainerBenchmark.cpp
@@ -18,9 +18,9 @@ limitations under the License. */
 #include "paddle/utils/Stat.h"
 #include "paddle/utils/Util.h"
 
-P_DECLARE_int32(test_period);
+DECLARE_int32(test_period);
 
-P_DEFINE_bool(feed_data, false, "Wether to read data from DataProvider.");
+DEFINE_bool(feed_data, false, "Wether to read data from DataProvider.");
 
 namespace paddle {
 
diff --git a/paddle/trainer/TrainerConfigHelper.cpp b/paddle/trainer/TrainerConfigHelper.cpp
index 2017a08d20..60ac8459a1 100644
--- a/paddle/trainer/TrainerConfigHelper.cpp
+++ b/paddle/trainer/TrainerConfigHelper.cpp
@@ -18,16 +18,16 @@ limitations under the License. */
 #include "paddle/utils/Flags.h"
 #include "paddle/utils/PythonUtil.h"
 
-P_DECLARE_string(config);
-P_DECLARE_string(init_model_path);
-P_DECLARE_int32(start_pass);
-P_DECLARE_string(save_dir);
-P_DECLARE_int32(trainer_id);
-P_DECLARE_bool(local);
-P_DECLARE_bool(with_cost);
-P_DECLARE_bool(with_gpu);
-P_DECLARE_bool(parallel_nn);
-P_DECLARE_string(config_args);
+DECLARE_string(config);
+DECLARE_string(init_model_path);
+DECLARE_int32(start_pass);
+DECLARE_string(save_dir);
+DECLARE_int32(trainer_id);
+DECLARE_bool(local);
+DECLARE_bool(with_cost);
+DECLARE_bool(with_gpu);
+DECLARE_bool(parallel_nn);
+DECLARE_string(config_args);
 
 const char *kConfigParserModuleName = "paddle.trainer.config_parser";
 const char *kConfigParserFuncName = "parse_config_and_serialize";
diff --git a/paddle/trainer/TrainerInternalConfig.cpp b/paddle/trainer/TrainerInternalConfig.cpp
index a017cdec9d..039fcdb524 100644
--- a/paddle/trainer/TrainerInternalConfig.cpp
+++ b/paddle/trainer/TrainerInternalConfig.cpp
@@ -14,17 +14,17 @@ limitations under the License. */
 
 #include "TrainerInternalConfig.h"
 
-P_DEFINE_int32(show_parameter_stats_period,
-               0,
-               "Whether to show parameter stats during training");
+DEFINE_int32(show_parameter_stats_period,
+             0,
+             "Whether to show parameter stats during training");
 
-P_DEFINE_int32(dot_period, 1, "Print '.' every so many batches");
+DEFINE_int32(dot_period, 1, "Print '.' every so many batches");
 
-P_DEFINE_bool(use_old_updater, false, "Use the old RemoteParameterUpdater");
+DEFINE_bool(use_old_updater, false, "Use the old RemoteParameterUpdater");
 
-P_DECLARE_int32(num_passes);
+DECLARE_int32(num_passes);
 
-P_DECLARE_bool(local);
+DECLARE_bool(local);
 
 namespace paddle {
 
diff --git a/paddle/trainer/TrainerMain.cpp b/paddle/trainer/TrainerMain.cpp
index 0a4d56b892..c5c1d484e5 100644
--- a/paddle/trainer/TrainerMain.cpp
+++ b/paddle/trainer/TrainerMain.cpp
@@ -13,81 +13,34 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <fenv.h>
-#include "paddle/pserver/ParameterServer2.h"
-#include "paddle/utils/Excepts.h"
+#include "paddle/pserver/ParameterServerController.h"
 #include "paddle/utils/PythonUtil.h"
-#include "paddle/utils/StringUtil.h"
 
 #include "ParamUtil.h"
 #include "Trainer.h"
-#include "paddle/pserver/RDMANetwork.h"
 
-P_DEFINE_bool(start_pserver, false, "Whether to start pserver");
-P_DECLARE_int32(gpu_id);
-P_DEFINE_string(job, "train", "one of (train, test, checkgrad)");
-P_DECLARE_int32(start_pass);
-P_DECLARE_string(config);
-P_DECLARE_string(init_model_path);
-P_DECLARE_string(rdma_tcp);
+DEFINE_bool(start_pserver, false, "Whether to start pserver");
+DECLARE_int32(gpu_id);
+DEFINE_string(job, "train", "one of (train, test, checkgrad)");
+DECLARE_int32(start_pass);
+DECLARE_string(config);
+DECLARE_string(init_model_path);
+DECLARE_string(rdma_tcp);
 
 using namespace paddle;  // NOLINT
 
 int main(int argc, char** argv) {
-// write logs instantly (never buffer log messages)
-#ifdef PADDLE_USE_GLOG
+  // write logs instantly (never buffer log messages)
   FLAGS_logbuflevel = -1;
-#endif
+
   initMain(argc, argv);
   initPython(argc, argv);
 
-  std::vector<std::unique_ptr<ParameterServer2>> pservers;
-  std::vector<std::string> devices;
-
+  std::unique_ptr<ParameterServerController> parameterServerPtr(nullptr);
   if (FLAGS_start_pserver) {
-    // round robin to loadbalance RDMA server ENGINE
-    int rdmaCpu = 0;
-    int onlineCpus = rdma::numCpus();
-    int numPorts = FLAGS_ports_num + FLAGS_ports_num_for_sparse;
-    if (FLAGS_nics.empty()) {
-      pservers.resize(numPorts);
-      for (int i = 0; i < numPorts; ++i) {
-        if (FLAGS_rdma_tcp == "rdma") {
-          pservers[i].reset(
-              new ParameterServer2(std::string(), FLAGS_port + i, rdmaCpu++));
-          rdmaCpu = rdmaCpu % onlineCpus;
-        } else {
-          pservers[i].reset(
-              new ParameterServer2(std::string(), FLAGS_port + i));
-        }
-
-        CHECK(pservers[i]->init()) << "Fail to initialize parameter server"
-                                   << FLAGS_port + i;
-        LOG(INFO) << "pserver started : " << FLAGS_port + i;
-        pservers[i]->start();
-      }
-    } else {
-      str::split(FLAGS_nics, ',', &devices);
-      pservers.resize(devices.size() * numPorts);
-      for (int i = 0; i < numPorts; ++i) {
-        for (size_t j = 0; j < devices.size(); ++j) {
-          if (FLAGS_rdma_tcp == "rdma") {
-            pservers[i * devices.size() + j].reset(new ParameterServer2(
-                getIpAddr(devices[j]), FLAGS_port + i, rdmaCpu++));
-            rdmaCpu = rdmaCpu % onlineCpus;
-          } else {
-            pservers[i * devices.size() + j].reset(
-                new ParameterServer2(getIpAddr(devices[j]), FLAGS_port + i));
-          }
-
-          CHECK(pservers[i * devices.size() + j]->init())
-              << "Fail to initialize parameter server" << devices[j]
-              << FLAGS_port + i;
-          LOG(INFO) << "pserver started : " << devices[j] << ":"
-                    << FLAGS_port + i;
-          pservers[i * devices.size() + j]->start();
-        }
-      }
-    }
+    parameterServerPtr.reset(
+        paddle::ParameterServerController::createFromGflags());
+    parameterServerPtr->start();
   }
   Trainer trainer;
   auto config = TrainerConfigHelper::createFromFlags();
diff --git a/paddle/trainer/tests/CMakeLists.txt b/paddle/trainer/tests/CMakeLists.txt
index 60c129f4e2..22e07bd0e9 100644
--- a/paddle/trainer/tests/CMakeLists.txt
+++ b/paddle/trainer/tests/CMakeLists.txt
@@ -17,9 +17,10 @@ add_test(NAME test_Compare
 ################# test_Trainer ###########################
 add_unittest_without_exec(test_Trainer
     test_Trainer.cpp)
-set(diy_dll_dir ${CMAKE_CURRENT_BINARY_DIR}/../../gserver/tests)
 add_test(NAME test_Trainer
   COMMAND ${PROJ_ROOT}/paddle/.set_python_path.sh -d ${PROJ_ROOT}/python/
+        ${PYTHON_EXECUTABLE} ${PROJ_ROOT}/paddle/trainer/tests/gen_proto_data.py &&
+        ${PROJ_ROOT}/paddle/.set_python_path.sh -d ${PROJ_ROOT}/python/
         ${CMAKE_CURRENT_BINARY_DIR}/test_Trainer
     WORKING_DIRECTORY ${PROJ_ROOT}/paddle/)
 
@@ -27,7 +28,8 @@ add_test(NAME test_Trainer
 add_unittest_without_exec(test_TrainerOnePass
     test_TrainerOnePass.cpp)
 add_test(NAME test_TrainerOnePass
-  COMMAND  ${PROJ_ROOT}/paddle/.set_python_path.sh -d ${PROJ_ROOT}/python/
+  COMMAND  ${PROJ_ROOT}/paddle/.set_python_path.sh -d
+        ${PROJ_ROOT}/python/:${PROJ_ROOT}/paddle/trainer/tests
         ${PROJ_ROOT}/paddle/.set_port.sh -p port ${CMAKE_CURRENT_BINARY_DIR}/test_TrainerOnePass
     WORKING_DIRECTORY ${PROJ_ROOT}/paddle/)
 
@@ -81,5 +83,5 @@ add_test(NAME test_PyDataProviderWrapper
 #################### test_config_parser #########################
 add_test(NAME test_config_parser
   COMMAND ${PROJ_ROOT}/paddle/.set_python_path.sh -d ${PROJ_ROOT}/python/
-        python ${PROJ_ROOT}/paddle/trainer/tests/config_parser_test.py
+        ${PYTHON_EXECUTABLE} ${PROJ_ROOT}/paddle/trainer/tests/config_parser_test.py
     WORKING_DIRECTORY ${PROJ_ROOT}/paddle/)
diff --git a/paddle/trainer/tests/fake_file_list.list b/paddle/trainer/tests/fake_file_list.list
new file mode 100644
index 0000000000..f27ceed277
--- /dev/null
+++ b/paddle/trainer/tests/fake_file_list.list
@@ -0,0 +1 @@
+do_not_matter.txt
diff --git a/paddle/trainer/tests/simple_sparse_neural_network.py b/paddle/trainer/tests/simple_sparse_neural_network.py
new file mode 100644
index 0000000000..9604e1b9b4
--- /dev/null
+++ b/paddle/trainer/tests/simple_sparse_neural_network.py
@@ -0,0 +1,23 @@
+from paddle.trainer_config_helpers import *
+
+settings(batch_size=128, learning_method=AdaGradOptimizer(), learning_rate=1e-4)
+
+file_list = 'trainer/tests/fake_file_list.list'
+
+define_py_data_sources2(
+    train_list=file_list,
+    test_list=file_list,
+    module="simple_sparse_neural_network_dp",
+    obj="process")
+
+embedding = embedding_layer(
+    input=data_layer(
+        name="word_ids", size=65536),
+    size=128,
+    param_attr=ParamAttr(sparse_update=True))
+prediction = fc_layer(input=embedding, size=10, act=SoftmaxActivation())
+
+outputs(
+    classification_cost(
+        input=prediction, label=data_layer(
+            name='label', size=10)))
diff --git a/paddle/trainer/tests/simple_sparse_neural_network_dp.py b/paddle/trainer/tests/simple_sparse_neural_network_dp.py
new file mode 100644
index 0000000000..8bfd1f37e7
--- /dev/null
+++ b/paddle/trainer/tests/simple_sparse_neural_network_dp.py
@@ -0,0 +1,21 @@
+from paddle.trainer.PyDataProvider2 import provider, integer_sequence, integer_value
+import random
+
+
+def init_hook(settings, is_train, **kwargs):
+    settings.is_train = is_train
+
+
+@provider(
+    input_types={'word_ids': integer_value(65536),
+                 'label': integer_value(10)},
+    min_pool_size=0,
+    init_hook=init_hook)
+def process(settings, filename):
+    if settings.is_train:
+        data_size = 2**20
+    else:
+        data_size = 2**10
+
+    for _ in xrange(data_size):
+        yield random.randint(0, 65535), random.randint(0, 9)
diff --git a/paddle/trainer/tests/test_Compare.cpp b/paddle/trainer/tests/test_Compare.cpp
index 63fa48540c..e855a8fe2e 100644
--- a/paddle/trainer/tests/test_Compare.cpp
+++ b/paddle/trainer/tests/test_Compare.cpp
@@ -24,10 +24,10 @@ using namespace std;     // NOLINT
 
 static const string& configFile = "trainer/tests/sample_trainer_config.conf";
 
-P_DECLARE_int32(gpu_id);
-P_DECLARE_bool(use_gpu);
-P_DECLARE_string(config);
-P_DECLARE_string(config_args);
+DECLARE_int32(gpu_id);
+DECLARE_bool(use_gpu);
+DECLARE_string(config);
+DECLARE_string(config_args);
 
 struct comData {
   vector<Argument> outArgs;
@@ -50,7 +50,7 @@ void calcGradient(bool useGpu, comData& Data) {
   trainer.getDataProvider()->getNextBatch(batchSize, &dataBatch);
   CHECK(dataBatch.getSize()) << "No data from data provider";
   vector<Argument>& inArgs = dataBatch.getStreams();
-  trainer.getGradientMachine()->start(trainer.getConfig(), nullptr);
+  trainer.getGradientMachine()->start();
   for (int i = 0; i < 2; ++i) {
     trainer.getGradientMachine()->forwardBackward(
         inArgs, &Data.outArgs, PASS_TRAIN);
diff --git a/paddle/trainer/tests/test_CompareSparse.cpp b/paddle/trainer/tests/test_CompareSparse.cpp
index 3fea3a3c24..a7000eb77e 100644
--- a/paddle/trainer/tests/test_CompareSparse.cpp
+++ b/paddle/trainer/tests/test_CompareSparse.cpp
@@ -25,22 +25,22 @@ using namespace std;     // NOLINT
 static const string& configFile1 =
     "trainer/tests/sample_trainer_config_qb_rnn.conf";
 
-P_DECLARE_bool(use_gpu);
-P_DECLARE_string(config);
-P_DECLARE_int32(gpu_id);
-P_DECLARE_int32(seed);
-P_DECLARE_int32(num_passes);
-P_DECLARE_int32(saving_period);
-
-P_DECLARE_int32(num_gradient_servers);
-P_DECLARE_int32(port);
-P_DECLARE_bool(local);
-P_DECLARE_bool(use_old_updater);
-P_DECLARE_bool(parallel_nn);
-P_DECLARE_string(config_args);
-P_DEFINE_double(max_diff_ratio,
-                0.0f,
-                "max diff ratio allowed for parameters value");
+DECLARE_bool(use_gpu);
+DECLARE_string(config);
+DECLARE_int32(gpu_id);
+DECLARE_int32(seed);
+DECLARE_int32(num_passes);
+DECLARE_int32(saving_period);
+
+DECLARE_int32(num_gradient_servers);
+DECLARE_int32(port);
+DECLARE_bool(local);
+DECLARE_bool(use_old_updater);
+DECLARE_bool(parallel_nn);
+DECLARE_string(config_args);
+DEFINE_double(max_diff_ratio,
+              0.0f,
+              "max diff ratio allowed for parameters value");
 
 int gNumDevices = 0;
 
diff --git a/paddle/trainer/tests/test_CompareTwoNets.cpp b/paddle/trainer/tests/test_CompareTwoNets.cpp
index 8a4556721d..94f65e545d 100644
--- a/paddle/trainer/tests/test_CompareTwoNets.cpp
+++ b/paddle/trainer/tests/test_CompareTwoNets.cpp
@@ -22,25 +22,25 @@ limitations under the License. */
 using namespace paddle;  // NOLINT
 using namespace std;     // NOLINT
 
-P_DECLARE_int32(gpu_id);
+DECLARE_int32(gpu_id);
 
-P_DECLARE_bool(local);
-P_DECLARE_bool(use_gpu);
+DECLARE_bool(local);
+DECLARE_bool(use_gpu);
 
-P_DECLARE_string(config);
-P_DECLARE_string(nics);
+DECLARE_string(config);
+DECLARE_string(nics);
 
-P_DEFINE_string(config_file_a, "", "config of one network to compare");
-P_DEFINE_string(config_file_b, "", "config of another network to compare");
-P_DEFINE_bool(need_high_accuracy,
-              false,
-              "whether need to run in double accuracy");
-P_DEFINE_double(
+DEFINE_string(config_file_a, "", "config of one network to compare");
+DEFINE_string(config_file_b, "", "config of another network to compare");
+DEFINE_bool(need_high_accuracy,
+            false,
+            "whether need to run in double accuracy");
+DEFINE_double(
     max_diff_ratio,
     0.0f,
     "max diff ratio allowed for outputs and parameters (value/gradient)");
-P_DECLARE_bool(thread_local_rand_use_global_seed);
-P_DECLARE_int32(seed);
+DECLARE_bool(thread_local_rand_use_global_seed);
+DECLARE_int32(seed);
 
 struct ComData {
   vector<Argument> outArgs;
@@ -72,7 +72,7 @@ void calcGradient(ComData& data, const string configFile) {
   CHECK(dataBatch.getSize()) << "No data from data provider";
   vector<Argument>& inArgs = dataBatch.getStreams();
 
-  trainer.getGradientMachine()->start(trainer.getConfig(), nullptr);
+  trainer.getGradientMachine()->start();
   trainer.getGradientMachine()->forwardBackward(
       inArgs, &data.outArgs, PASS_TRAIN);
 
diff --git a/paddle/trainer/tests/test_CompareTwoOpts.cpp b/paddle/trainer/tests/test_CompareTwoOpts.cpp
index 673ef289d8..383505f813 100644
--- a/paddle/trainer/tests/test_CompareTwoOpts.cpp
+++ b/paddle/trainer/tests/test_CompareTwoOpts.cpp
@@ -22,20 +22,20 @@ limitations under the License. */
 using namespace paddle;  // NOLINT
 using namespace std;     // NOLINT
 
-P_DECLARE_int32(gpu_id);
+DECLARE_int32(gpu_id);
 
-P_DECLARE_bool(local);
-P_DECLARE_bool(use_gpu);
+DECLARE_bool(local);
+DECLARE_bool(use_gpu);
 
-P_DECLARE_string(config);
-P_DECLARE_string(nics);
+DECLARE_string(config);
+DECLARE_string(nics);
 
-P_DEFINE_string(config_file_a, "", "config of one network to compare");
-P_DEFINE_string(config_file_b, "", "config of another network to compare");
-P_DEFINE_bool(need_high_accuracy,
-              true,
-              "whether need to run in double accuracy (recommended)");
-P_DEFINE_double(
+DEFINE_string(config_file_a, "", "config of one network to compare");
+DEFINE_string(config_file_b, "", "config of another network to compare");
+DEFINE_bool(need_high_accuracy,
+            true,
+            "whether need to run in double accuracy (recommended)");
+DEFINE_double(
     max_diff_ratio,
     0.0f,
     "max diff ratio allowed for outputs and parameters (value/gradient)");
diff --git a/paddle/trainer/tests/test_Prediction.cpp b/paddle/trainer/tests/test_Prediction.cpp
index 322121a579..0c79404eee 100644
--- a/paddle/trainer/tests/test_Prediction.cpp
+++ b/paddle/trainer/tests/test_Prediction.cpp
@@ -18,11 +18,11 @@ limitations under the License. */
 
 #include <gtest/gtest.h>
 
-P_DECLARE_string(config);
-P_DECLARE_string(config_args);
-P_DEFINE_string(merger,
-                "./paddle_merge_model",
-                "path to paddle_merge_model binary");
+DECLARE_string(config);
+DECLARE_string(config_args);
+DEFINE_string(merger,
+              "./paddle_merge_model",
+              "path to paddle_merge_model binary");
 
 using namespace paddle;  // NOLINT
 using namespace std;     // NOLINT
diff --git a/paddle/trainer/tests/test_Trainer.cpp b/paddle/trainer/tests/test_Trainer.cpp
index 0fede59f8d..264bc46ebc 100644
--- a/paddle/trainer/tests/test_Trainer.cpp
+++ b/paddle/trainer/tests/test_Trainer.cpp
@@ -28,10 +28,10 @@ static const string& configFile3 = "trainer/tests/chunking.conf";
 static const string& configFile4 =
     "trainer/tests/sample_trainer_config_parallel.conf";
 
-P_DECLARE_bool(use_gpu);
-P_DECLARE_string(config);
-P_DECLARE_int32(gpu_id);
-P_DECLARE_bool(allow_only_one_model_on_one_gpu);
+DECLARE_bool(use_gpu);
+DECLARE_string(config);
+DECLARE_int32(gpu_id);
+DECLARE_bool(allow_only_one_model_on_one_gpu);
 
 void checkGradientTest(const string& configFile,
                        bool useGpu,
@@ -96,11 +96,6 @@ TEST(checkGradient, multi) {
 TEST(checkGradient, hsigmoid) { checkGradientTest(configFile2, false, false); }
 
 TEST(checkGradient, chunk) {
-#if defined(__APPLE__) || defined(__OSX__)
-  EXPECT_EQ(0, system("python trainer/tests/gen_proto_data.py"));
-#else
-  EXPECT_EQ(0, system("python2 trainer/tests/gen_proto_data.py"));
-#endif
   checkGradientTest(configFile3, false, false);
 #ifndef PADDLE_ONLY_CPU
   checkGradientTest(configFile3, true, true);
diff --git a/paddle/trainer/tests/test_TrainerOnePass.cpp b/paddle/trainer/tests/test_TrainerOnePass.cpp
index 0b587ecce1..4d0174f784 100644
--- a/paddle/trainer/tests/test_TrainerOnePass.cpp
+++ b/paddle/trainer/tests/test_TrainerOnePass.cpp
@@ -27,12 +27,15 @@ static const string& configFile1 = "trainer/tests/sample_trainer_config.conf";
 static const string& configFile2 =
     "trainer/tests/sample_trainer_config_parallel.conf";
 
-P_DECLARE_bool(use_gpu);
-P_DECLARE_string(config);
-P_DECLARE_int32(gpu_id);
-P_DECLARE_int32(seed);
-P_DECLARE_int32(num_passes);
-P_DECLARE_int32(saving_period);
+static const string& configFileSimpleSparse =
+    "trainer/tests/simple_sparse_neural_network.py";
+
+DECLARE_bool(use_gpu);
+DECLARE_string(config);
+DECLARE_int32(gpu_id);
+DECLARE_int32(seed);
+DECLARE_int32(num_passes);
+DECLARE_int32(saving_period);
 
 class TrainerForTest : public paddle::Trainer {
 public:
@@ -122,10 +125,10 @@ TEST(average_window_cpu, gpu4) {
 #endif
 
 // 3. test trainer + pserver.
-P_DECLARE_int32(num_gradient_servers);
-P_DECLARE_int32(port);
-P_DECLARE_bool(local);
-P_DECLARE_bool(use_old_updater);
+DECLARE_int32(num_gradient_servers);
+DECLARE_int32(port);
+DECLARE_bool(local);
+DECLARE_bool(use_old_updater);
 
 double checkRemoteParameterUpdater(TrainerForTest& trainer) {
   auto gradientMachine = trainer.getGradientMachine();
@@ -298,11 +301,15 @@ TEST(checkRemoteUpdater, cpuDeltaTrainerOldUpdater) {
   checkRemoteParameterUpdaterTest(configFile1, false, false, 1, true, 10);
 }
 
+TEST(SgdThreadUpdater, simpleSparseNN) {
+  trainerOnePassTest(configFileSimpleSparse, false, false, 1, 0.5, true);
+}
+
 int main(int argc, char** argv) {
+  testing::InitGoogleTest(&argc, argv);
   initMain(argc, argv);
   initPython(argc, argv);
   gNumDevices = hl_get_device_count();
-  testing::InitGoogleTest(&argc, argv);
 
   FLAGS_num_passes = 1;          // train one pass
   FLAGS_saving_period = 100000;  // do not save parameteres
diff --git a/paddle/trainer/tests/test_recurrent_machine_generation.cpp b/paddle/trainer/tests/test_recurrent_machine_generation.cpp
index 7d8dfd788f..03446b3b2f 100644
--- a/paddle/trainer/tests/test_recurrent_machine_generation.cpp
+++ b/paddle/trainer/tests/test_recurrent_machine_generation.cpp
@@ -30,7 +30,7 @@ static string modelDir = "trainer/tests/rnn_gen_test_model_dir/t1";  // NOLINT
 static string expectFile =                                           // NOLINT
     "trainer/tests/rnn_gen_test_model_dir/r1.test";                  // NOLINT
 
-P_DECLARE_string(config_args);
+DECLARE_string(config_args);
 
 vector<float> readRetFile(const string& fname) {
   ifstream inFile(fname);
diff --git a/paddle/utils/.gitignore b/paddle/utils/.gitignore
index f2cfd74094..956b606a18 100644
--- a/paddle/utils/.gitignore
+++ b/paddle/utils/.gitignore
@@ -1 +1,2 @@
 enable_virtualenv.c
+PythonUtil.cpp
diff --git a/paddle/utils/BarrierStat.cpp b/paddle/utils/BarrierStat.cpp
index 9dde155aca..a6dbdcae3f 100644
--- a/paddle/utils/BarrierStat.cpp
+++ b/paddle/utils/BarrierStat.cpp
@@ -20,15 +20,15 @@ limitations under the License. */
 #include "paddle/utils/Flags.h"
 #include "paddle/utils/Stat.h"
 
-P_DEFINE_bool(log_barrier_abstract,
-              true,
-              "if true, show abstract of barrier performance");
-P_DEFINE_int32(log_barrier_lowest_nodes,
-               5,
-               "how many lowest node will be logged");
-P_DEFINE_bool(log_barrier_show_log,
-              false,  // for performance tuning insight
-              "if true, always show barrier abstract even with little gap");
+DEFINE_bool(log_barrier_abstract,
+            true,
+            "if true, show abstract of barrier performance");
+DEFINE_int32(log_barrier_lowest_nodes,
+             5,
+             "how many lowest node will be logged");
+DEFINE_bool(log_barrier_show_log,
+            false,  // for performance tuning insight
+            "if true, always show barrier abstract even with little gap");
 
 namespace paddle {
 
diff --git a/paddle/utils/CMakeLists.txt b/paddle/utils/CMakeLists.txt
index 45240b5002..10d906ee16 100644
--- a/paddle/utils/CMakeLists.txt
+++ b/paddle/utils/CMakeLists.txt
@@ -1,5 +1,7 @@
 # The utilities for paddle
 
+configure_file(PythonUtil.cpp.in ${PROJ_ROOT}/paddle/utils/PythonUtil.cpp)
+
 file(GLOB UTIL_HEADERS . *.h)
 file(GLOB UTIL_SOURCES . *.cpp)
 create_resources(enable_virtualenv.py enable_virtualenv.c)
diff --git a/paddle/utils/CommandLineParser.cpp b/paddle/utils/CommandLineParser.cpp
deleted file mode 100644
index 51558b45a1..0000000000
--- a/paddle/utils/CommandLineParser.cpp
+++ /dev/null
@@ -1,246 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "CommandLineParser.h"
-#ifndef PADDLE_USE_GFLAGS
-#include <stdlib.h>
-#include <algorithm>
-#include <iomanip>
-#include <iostream>
-#include <string>
-#include <tuple>
-#include <utility>
-#include <vector>
-#include "paddle/utils/StringUtil.h"
-
-namespace paddle {
-
-static constexpr int kStatusOK = 0;
-static constexpr int kStatusInvalid = 1;
-static constexpr int kStatusNotFound = 2;
-
-/**
- * \brief: Convert a string to any type value.
- *
- * \note: It will specialize by type T that is supported.
- */
-template <typename T>
-bool StringToValue(const std::string& content, T* value) {
-  bool ok;
-  *value = str::toWithStatus<T>(content, &ok);
-  return ok;
-}
-
-template <>
-bool StringToValue<bool>(const std::string& content, bool* value) {
-  std::string tmp = content;
-
-  std::transform(tmp.begin(), tmp.end(), tmp.begin(), [](char in) -> char {
-    if (in <= 'Z' && in >= 'A') {
-      return in - ('Z' - 'z');
-    } else {
-      return in;
-    }
-  });  // tolower.
-
-  if (tmp == "true" || tmp == "1") {
-    *value = true;
-    return true;
-  } else if (tmp == "false" || tmp == "0") {
-    *value = false;
-    return true;
-  } else {
-    return false;
-  }
-}
-
-template <>
-bool StringToValue<std::string>(const std::string& content,
-                                std::string* value) {
-  *value = content;
-  return true;
-}
-
-/**
- * \brief Parse argument "--blah=blah".
- *
- * \param argument: The command line argument string, such as "--blah=blah"
- * \param [out] extraInfo: The details error message for parse argument.
- * \return: kStatusOK, kStatusInvalid, kStatusNotFound
- */
-template <typename T>
-int ParseArgument(const std::string& argument, std::string* extraInfo) {
-  for (auto& command :
-       flags_internal::CommandLineFlagRegistry<T>::Instance()->commands) {
-    std::string& name = command.name;
-    T* value = command.value;
-
-    std::string prefix = "--";
-    prefix += name;
-    prefix += "=";
-    std::string content;
-    if (str::startsWith(argument, prefix)) {
-      content = argument.substr(prefix.size(), argument.size() - prefix.size());
-    } else {
-      prefix = "-";
-      prefix += name;
-      prefix += "=";
-      if (str::startsWith(argument, prefix)) {
-        content =
-            argument.substr(prefix.size(), argument.size() - prefix.size());
-      }
-    }
-
-    if (!content.empty()) {
-      if (StringToValue(content, value)) {
-        return kStatusOK;
-      } else {
-        *extraInfo = name;
-        return kStatusInvalid;
-      }
-    }
-  }
-  return kStatusNotFound;
-}
-
-/**
- * @brief ParseBoolArgumentExtra
- * parse '--flag_name', '-flag_name' as true; '--noflag_name', '-noflag_name' as
- * false
- */
-static int ParseBoolArgumentExtra(const std::string& argument,
-                                  std::string* extraInfo) {
-  (void)(extraInfo);  // unused extraInfo, just make api same.
-
-  //! @warning: The order and content of prefixes is DESIGNED for parsing
-  //! command line. The length of prefixes are 1, 2, 3, 4. The parse logic takes
-  //! use of this fact. DO NOT CHANGE IT without reading how to parse command
-  //! below.
-  static const std::vector<std::pair<const char*, bool>> prefixes = {
-      {"-", true}, {"--", true}, {"-no", false}, {"--no", false}};
-
-  for (flags_internal::CommandLineFlagRegistry<bool>::Command& command :
-       flags_internal::CommandLineFlagRegistry<bool>::Instance()->commands) {
-    if (argument.size() > command.name.size()) {
-      //! Use the length of prefix is 1, 2, 3, 4.
-      size_t diff = argument.size() - command.name.size() - 1UL;
-      if (diff < prefixes.size()) {
-        const std::string& prefix = std::get<0>(prefixes[diff]);
-        if (argument == prefix + command.name) {
-          *command.value = std::get<1>(prefixes[diff]);
-          return kStatusOK;
-        }
-      }
-    }
-  }
-  return kStatusNotFound;
-}
-
-/**
- * \brief: Print command line arguments' usage with type T.
- */
-template <typename T>
-static void PrintTypeUsage() {
-  for (auto& command :
-       flags_internal::CommandLineFlagRegistry<T>::Instance()->commands) {
-    std::string& name = command.name;
-    name = "--" + name;  // Program will exit, so modify name is safe.
-    std::string& desc = command.text;
-    T& defaultValue = command.defaultValue;
-    std::cerr << std::setw(20) << name << ": " << desc
-              << "[default:" << defaultValue << "]." << std::endl;
-  }
-}
-
-template <typename... TS>
-static void PrintTypeUsages() {
-  int unused[] = {0, (PrintTypeUsage<TS>(), 0)...};
-  (void)(unused);
-}
-/**
- * \brief: Print all usage, and exit(1)
- */
-static void PrintUsageAndExit(const char* argv0) {
-  std::cerr << "Program " << argv0 << " Flags: " << std::endl;
-  PrintTypeUsages<bool, int32_t, std::string, double, int64_t, uint64_t>();
-  exit(1);
-}
-
-/**
- * \brief: Print the error flags, usage, and exit.
- */
-static void PrintParseError(const std::string& name,
-                            const char* actualInput,
-                            const char* arg0) {
-  std::cerr << "Parse command flag " << name << " error! User input is "
-            << actualInput << std::endl;
-  PrintUsageAndExit(arg0);
-}
-
-void ParseCommandLineFlags(int* argc, char** argv, bool withHelp) {
-  int unused_argc = 1;
-  std::string extra;
-  for (int i = 1; i < *argc; ++i) {
-    std::string arg = argv[i];
-    int s = kStatusInvalid;
-#define ParseArgumentWithType(type)           \
-  s = ParseArgument<type>(arg, &extra);       \
-  if (s == kStatusOK) {                       \
-    continue;                                 \
-  } else if (s == kStatusInvalid) {           \
-    PrintParseError(extra, argv[i], argv[0]); \
-  }
-
-    ParseArgumentWithType(bool);  // NOLINT
-    ParseArgumentWithType(int32_t);
-    ParseArgumentWithType(double);  // NOLINT
-    ParseArgumentWithType(int64_t);
-    ParseArgumentWithType(uint64_t);
-    ParseArgumentWithType(std::string);
-
-#undef ParseArgumentWithType
-    s = ParseBoolArgumentExtra(arg, &extra);
-    if (s == kStatusOK) {
-      continue;
-    }
-
-    if (withHelp && (arg == "--help" || arg == "-h")) {
-      PrintUsageAndExit(argv[0]);
-    }
-
-    // NOT Found for all flags.
-    std::swap(argv[unused_argc++], argv[i]);
-  }
-  *argc = unused_argc;
-}
-
-}  // namespace paddle
-#else
-namespace paddle {
-#ifndef GFLAGS_NS
-#define GFLAGS_NS google
-#endif
-
-namespace gflags_ns = GFLAGS_NS;
-
-void ParseCommandLineFlags(int* argc, char** argv, bool withHelp) {
-  if (withHelp) {
-    gflags_ns::ParseCommandLineFlags(argc, &argv, true);
-  } else {
-    gflags_ns::ParseCommandLineNonHelpFlags(argc, &argv, true);
-  }
-}
-
-}  // namespace paddle
-#endif
diff --git a/paddle/utils/CommandLineParser.h b/paddle/utils/CommandLineParser.h
deleted file mode 100644
index b4449c6f09..0000000000
--- a/paddle/utils/CommandLineParser.h
+++ /dev/null
@@ -1,179 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#ifndef PADDLE_USE_GFLAGS
-#include <stdint.h>
-#include <string>
-#include <vector>
-#include "DisableCopy.h"
-
-namespace paddle {
-
-namespace flags_internal {
-
-/**
- * Command line flag registry for special type T. It will store all command
- * arguments settings. such as name, default value.
- */
-template <typename T>
-struct CommandLineFlagRegistry {
-  /**
-   * The factory method of CommandLineFlagRegistry
-   *
-   * \return: The singleton instance of CommandLineFlagRegistry.
-   */
-  static CommandLineFlagRegistry* Instance() {
-    static CommandLineFlagRegistry instance_;
-    return &instance_;
-  }
-
-  struct Command {
-    /// name of argument.
-    std::string name;
-    /// address of actual variable. such as FLAGS_xxx.
-    T* value;
-    /// usage text.
-    std::string text;
-    /// default value of this command.
-    T defaultValue;
-  };
-
-  /// the command line arguments of type T.
-  std::vector<Command> commands;
-
-  DISABLE_COPY(CommandLineFlagRegistry);
-
-private:
-  inline CommandLineFlagRegistry() {}
-};
-
-/**
- *Helper class to register command line flag.
- */
-template <typename T>
-struct CommandLineFlagRegister {
-  /**
-   * \brief: Register a command line argument
-   *
-   * \param [in] name: The command line name.
-   * \param [inout] val: The command line argument instance, FLAGS_xxx.
-   * \param [in] desc: The command line helper message.
-   */
-  CommandLineFlagRegister(const std::string& name,
-                          T* val,
-                          const std::string desc) {
-    CommandLineFlagRegistry<T>::Instance()->commands.push_back(
-        {name, val, desc, *val});
-  }
-};
-
-/**
- * \brief: Define a command line arguments.
- *
- * \param type: The variable type, such as int, double, etc.
- * \param name: The variable name. The command line argument is '--name', the
- *variable
- *is 'FLAGS_name'
- * \param default_value: The default value of command line argument.
- * \param text: The description in command line argument.
- */
-#define PADDLE_DEFINE_variable(type, name, default_value, text) \
-  type FLAGS_##name = default_value;                            \
-  namespace paddle_flags_internal {                             \
-  paddle::flags_internal::CommandLineFlagRegister<type>         \
-      flags_internal_var_##name(#name, &FLAGS_##name, text);    \
-  }  // namespace paddle_flags_internal
-
-/**
- * Declare a variable to use.
- */
-#define PADDLE_DECLARE_variable(type, name) extern type FLAGS_##name;
-
-// DEFINE macro for each types.
-#define P_DEFINE_int32(name, default_value, text) \
-  PADDLE_DEFINE_variable(int32_t, name, default_value, text)
-
-#define P_DEFINE_bool(name, default_value, text) \
-  PADDLE_DEFINE_variable(bool, name, default_value, text)
-
-#define P_DEFINE_string(name, default_value, text) \
-  PADDLE_DEFINE_variable(std::string, name, default_value, text)
-
-#define P_DEFINE_double(name, default_value, text) \
-  PADDLE_DEFINE_variable(double, name, default_value, text)
-
-#define P_DEFINE_int64(name, default_value, text) \
-  PADDLE_DEFINE_variable(int64_t, name, default_value, text)
-
-#define P_DEFINE_uint64(name, default_value, text) \
-  PADDLE_DEFINE_variable(uint64_t, name, default_value, text)
-
-// Declare macro for each types.
-#define P_DECLARE_int32(name) PADDLE_DECLARE_variable(int32_t, name)
-#define P_DECLARE_bool(name) PADDLE_DECLARE_variable(bool, name)
-#define P_DECLARE_string(name) PADDLE_DECLARE_variable(std::string, name)
-#define P_DECLARE_double(name) PADDLE_DECLARE_variable(double, name)
-#define P_DECLARE_int64(name) PADDLE_DECLARE_variable(int64_t, name)
-#define P_DECLARE_uint64(name) PADDLE_DECLARE_variable(uint64_t, name)
-}  // namespace flags_internal
-
-/**
- * \brief Parse command line flags. If parse error, just failed and exit 1.
- *
- * \param [inout] argc: The command argument count. This method will modify
- *argc, and left unused arguments.
- * \param [inout] argv: The command argument values. This method will modify
- *argv, and left unused arguments.
- * \param [in] withHelp: True will parse '-h' and '--help' to print usage.
- *
- * \note: The Command line flags format basically as follow:
- *
- *  * If the type of flag is not bool, then the follow format of command line
- *    will be parsed:
- *    * --flag_name=value
- *    * -flag_name=value
- *
- *  * If the flag is bool, then:
- *    * --flag_name=value, -flag_name=value will be parsed.
- *       * if value.tolower() == "true"| "1" will be treated as true.
- *       * else if value.tolower() == "false" | "0" will be treated as false.
- *    * --flag_name will be parsed as true.
- *    * --noflag_name will be parsed as false.
- */
-void ParseCommandLineFlags(int* argc, char** argv, bool withHelp = true);
-
-}  // namespace paddle
-
-#else  // if use gflags.
-#include <gflags/gflags.h>
-
-#define P_DEFINE_int32 DEFINE_int32
-#define P_DEFINE_bool DEFINE_bool
-#define P_DEFINE_string DEFINE_string
-#define P_DEFINE_double DEFINE_double
-#define P_DEFINE_int64 DEFINE_int64
-#define P_DEFINE_uint64 DEFINE_uint64
-#define P_DECLARE_int32 DECLARE_int32
-#define P_DECLARE_bool DECLARE_bool
-#define P_DECLARE_string DECLARE_string
-#define P_DECLARE_double DECLARE_double
-#define P_DECLARE_int64 DECLARE_int64
-#define P_DECLARE_uint64 DECLARE_uint64
-namespace paddle {
-void ParseCommandLineFlags(int* argc, char** argv, bool withHelp = true);
-
-}  // namespace paddle
-
-#endif
diff --git a/paddle/utils/TypeDefs.h b/paddle/utils/Common.h
similarity index 69%
rename from paddle/utils/TypeDefs.h
rename to paddle/utils/Common.h
index c50a05e82d..1f1d0255a5 100644
--- a/paddle/utils/TypeDefs.h
+++ b/paddle/utils/Common.h
@@ -14,13 +14,22 @@ limitations under the License. */
 
 #pragma once
 
+#include "Excepts.h"
+
+/**
+ * Disable copy macro: deletes the move ctor, copy ctor and copy assignment.
+ */
+#define DISABLE_COPY(class_name)                \
+  class_name(class_name &&) = delete;           \
+  class_name(const class_name &other) = delete; \
+  class_name &operator=(const class_name &other) = delete
+
 namespace paddle {
+
 #ifdef PADDLE_TYPE_DOUBLE
-typedef double real;
+using real = double;
 #else
-typedef float real;
+using real = float;
 #endif
 
 }  // namespace paddle
-
-using paddle::real;
diff --git a/paddle/utils/CpuId.h b/paddle/utils/CpuId.h
index 7a354da758..0f3985cc7b 100644
--- a/paddle/utils/CpuId.h
+++ b/paddle/utils/CpuId.h
@@ -11,7 +11,7 @@ limitations under the License. */
 
 #pragma once
 
-#include "DisableCopy.h"
+#include "Common.h"
 
 namespace paddle {
 
diff --git a/paddle/utils/CustomStackTrace.cpp b/paddle/utils/CustomStackTrace.cpp
index 083f5c509a..9723d7df97 100644
--- a/paddle/utils/CustomStackTrace.cpp
+++ b/paddle/utils/CustomStackTrace.cpp
@@ -13,10 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "CustomStackTrace.h"
+#include <gflags/gflags.h>
 #include <iostream>
-#include "CommandLineParser.h"
 
-P_DEFINE_bool(
+DEFINE_bool(
     layer_stack_error_only_current_thread,
     true,
     "Dump current thread or whole process layer stack when signal error "
diff --git a/paddle/utils/Excepts.h b/paddle/utils/Excepts.h
index dc3369b7e8..5c2c504f53 100644
--- a/paddle/utils/Excepts.h
+++ b/paddle/utils/Excepts.h
@@ -15,6 +15,8 @@ limitations under the License. */
 #ifndef EXCEPTS_H_
 #define EXCEPTS_H_
 
+#include <fenv.h>
+
 #if defined(__APPLE__) || defined(__OSX__)
 
 int fegetexcept(void);
diff --git a/paddle/utils/Flags.cpp b/paddle/utils/Flags.cpp
index 1c9e602f45..59d6cbdc51 100644
--- a/paddle/utils/Flags.cpp
+++ b/paddle/utils/Flags.cpp
@@ -15,65 +15,61 @@ limitations under the License. */
 #include "Flags.h"
 
 #ifdef PADDLE_ONLY_CPU
-P_DEFINE_bool(use_gpu, false, "Only support CPU training");
+DEFINE_bool(use_gpu, false, "Only support CPU training");
 #else
-P_DEFINE_bool(use_gpu, true, "Whether to use GPU for training");
+DEFINE_bool(use_gpu, true, "Whether to use GPU for training");
 #endif
 
-P_DEFINE_bool(
-    parallel_nn,
-    false,
-    "Whether to use multi-threads to calculate one neural network."
-    "If it was set false, use gpu_id specify which gpu core to use"
-    "(the device property in the trainer config file will be ingored)."
-    "If it was set true, the gpu core is specified by the trainer"
-    "  config file(gpu_id will be ignored).");
-P_DEFINE_int32(trainer_count, 1, "Defined how many trainers to train");
-P_DEFINE_int32(gpu_id, 0, "Which gpu core to use");
-P_DEFINE_int32(port, 20134, "Listening port for pserver");
-P_DEFINE_int32(data_server_port, 21134, "Listening port for dserver");
-P_DEFINE_int32(ports_num,
-               1,
-               "The ports number for parameter send,"
-               " increment based on default port number");
-P_DEFINE_int32(ports_num_for_sparse,
-               0,
-               "The ports number for parameter send,"
-               " increment based on default (port + ports_num)");
-P_DEFINE_string(nics, "xgbe0,xgbe1", "network device name for pservers");
-P_DEFINE_string(rdma_tcp, "tcp", "use rdma or tcp rdma transport protocol");
-P_DEFINE_int32(
-    trainer_id,
-    0,
-    "For distributed training, each trainer must be given an unique id"
-    " ranging from 0 to num_trainers-1. Trainer 0 is the master"
-    " trainer");
-P_DEFINE_int32(num_gradient_servers, 1, "number of gradient servers");
-P_DEFINE_string(comment, "", "A string for commenting this training task");
-P_DEFINE_string(load_missing_parameter_strategy,
-                "fail",
-                "which operation to take on load model fails. support "
-                "fail/rand/zero only.");
-P_DEFINE_int32(log_period, 100, "Log progress every so many batches");
-P_DEFINE_int32(log_period_server,
-               500,
-               "Log progress every so many batches at pserver end");
-P_DEFINE_double(checkgrad_eps, 1e-5, "parameter change size for checkgrad");
-P_DEFINE_int32(enable_parallel_vector,
-               0,
-               "threshold for enable parallel vector");
-P_DEFINE_bool(loadsave_parameters_in_pserver,
-              false,
-              "load and save parameters in pserver. "
-              "only work while parameter set sparse_remote_update.");
-P_DEFINE_int32(beam_size,
-               1,
-               "Beam size used in generating most probable output sequences.");
+DEFINE_bool(parallel_nn,
+            false,
+            "Whether to use multi-threads to calculate one neural network. "
+            "If it was set false, use gpu_id specify which gpu core to use "
+            "(the device property in the trainer config file will be ignored)."
+            " If it was set true, the gpu core is specified by the trainer"
+            " config file (gpu_id will be ignored).");
+DEFINE_int32(trainer_count, 1, "Defined how many trainers to train");
+DEFINE_int32(gpu_id, 0, "Which gpu core to use");
+DEFINE_int32(port, 20134, "Listening port for pserver");
+DEFINE_int32(data_server_port, 21134, "Listening port for dserver");
+DEFINE_int32(ports_num,
+             1,
+             "The ports number for parameter send,"
+             " increment based on default port number");
+DEFINE_int32(ports_num_for_sparse,
+             0,
+             "The ports number for parameter send,"
+             " increment based on default (port + ports_num)");
+DEFINE_string(nics, "xgbe0,xgbe1", "network device name for pservers");
+DEFINE_string(rdma_tcp, "tcp", "use rdma or tcp rdma transport protocol");
+DEFINE_int32(trainer_id,
+             0,
+             "For distributed training, each trainer must be given an unique id"
+             " ranging from 0 to num_trainers-1. Trainer 0 is the master"
+             " trainer");
+DEFINE_int32(num_gradient_servers, 1, "number of gradient servers");
+DEFINE_string(comment, "", "A string for commenting this training task");
+DEFINE_string(load_missing_parameter_strategy,
+              "fail",
+              "which operation to take on load model fails. support "
+              "fail/rand/zero only.");
+DEFINE_int32(log_period, 100, "Log progress every so many batches");
+DEFINE_int32(log_period_server,
+             500,
+             "Log progress every so many batches at pserver end");
+DEFINE_double(checkgrad_eps, 1e-5, "parameter change size for checkgrad");
+DEFINE_int32(enable_parallel_vector, 0, "threshold for enable parallel vector");
+DEFINE_bool(loadsave_parameters_in_pserver,
+            false,
+            "load and save parameters in pserver. "
+            "only work while parameter set sparse_remote_update.");
+DEFINE_int32(beam_size,
+             1,
+             "Beam size used in generating most probable output sequences.");
 
-P_DEFINE_bool(show_layer_stat, false, "show the statistics of each layer");
-P_DEFINE_string(predict_file, "", "File name for saving predict result");
-P_DEFINE_bool(prev_batch_state, false, "batch is continue with next batch");
-P_DEFINE_string(init_model_path,
-                "",
-                "Path of the initial model parameters."
-                "If it was set, start_pass will be ignored.");
+DEFINE_bool(show_layer_stat, false, "show the statistics of each layer");
+DEFINE_string(predict_file, "", "File name for saving predict result");
+DEFINE_bool(prev_batch_state, false, "batch is continue with next batch");
+DEFINE_string(init_model_path,
+              "",
+              "Path of the initial model parameters. "
+              "If it was set, start_pass will be ignored.");
diff --git a/paddle/utils/Flags.h b/paddle/utils/Flags.h
index 922533d63e..3e72f8356d 100644
--- a/paddle/utils/Flags.h
+++ b/paddle/utils/Flags.h
@@ -14,30 +14,30 @@ limitations under the License. */
 
 #pragma once
 
-#include "CommandLineParser.h"
+#include <gflags/gflags.h>
 
-P_DECLARE_bool(parallel_nn);
-P_DECLARE_int32(async_count);
-P_DECLARE_int32(port);
-P_DECLARE_int32(data_server_port);
-P_DECLARE_bool(use_gpu);
-P_DECLARE_int32(gpu_id);
-P_DECLARE_int32(trainer_count);
-P_DECLARE_int32(ports_num);
-P_DECLARE_int32(ports_num_for_sparse);
-P_DECLARE_string(nics);
-P_DECLARE_string(rdma_tcp);
-P_DECLARE_int32(trainer_id);
-P_DECLARE_int32(num_gradient_servers);
-P_DECLARE_string(comment);
-P_DECLARE_string(load_missing_parameter_strategy);
-P_DECLARE_int32(log_period);
-P_DECLARE_int32(log_period_server);
-P_DECLARE_double(checkgrad_eps);
-P_DECLARE_int32(enable_parallel_vector);
-P_DECLARE_bool(loadsave_parameters_in_pserver);
-P_DECLARE_int32(beam_size);
-P_DECLARE_bool(show_layer_stat);
-P_DECLARE_string(predict_file);
-P_DECLARE_bool(prev_batch_state);
-P_DECLARE_string(init_model_path);
+DECLARE_bool(parallel_nn);
+DECLARE_int32(async_count);
+DECLARE_int32(port);
+DECLARE_int32(data_server_port);
+DECLARE_bool(use_gpu);
+DECLARE_int32(gpu_id);
+DECLARE_int32(trainer_count);
+DECLARE_int32(ports_num);
+DECLARE_int32(ports_num_for_sparse);
+DECLARE_string(nics);
+DECLARE_string(rdma_tcp);
+DECLARE_int32(trainer_id);
+DECLARE_int32(num_gradient_servers);
+DECLARE_string(comment);
+DECLARE_string(load_missing_parameter_strategy);
+DECLARE_int32(log_period);
+DECLARE_int32(log_period_server);
+DECLARE_double(checkgrad_eps);
+DECLARE_int32(enable_parallel_vector);
+DECLARE_bool(loadsave_parameters_in_pserver);
+DECLARE_int32(beam_size);
+DECLARE_bool(show_layer_stat);
+DECLARE_string(predict_file);
+DECLARE_bool(prev_batch_state);
+DECLARE_string(init_model_path);
diff --git a/paddle/utils/Locks.h b/paddle/utils/Locks.h
index 0f922f3548..e87abb9139 100644
--- a/paddle/utils/Locks.h
+++ b/paddle/utils/Locks.h
@@ -19,7 +19,7 @@ limitations under the License. */
 #include <condition_variable>
 #include <mutex>
 
-#include "DisableCopy.h"
+#include "Common.h"
 
 namespace paddle {
 
diff --git a/paddle/utils/Logging.cpp b/paddle/utils/Logging.cpp
index 20f32466a5..5a1c6ecb22 100644
--- a/paddle/utils/Logging.cpp
+++ b/paddle/utils/Logging.cpp
@@ -18,175 +18,9 @@ limitations under the License. */
  */
 
 #include "Logging.h"
-#ifndef PADDLE_USE_GLOG
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <mutex>
-#include <thread>
-#include <vector>
-
-#include <fcntl.h>
-#include <sys/stat.h>
-#include <sys/types.h>
-#include <unistd.h>
 
 namespace paddle {
 
-namespace internal {
-
-std::string join(const std::string& part1, const std::string& part2) {
-  const char sep = '/';
-  if (!part2.empty() && part2.front() == sep) {
-    return part2;
-  }
-  std::string ret;
-  ret.reserve(part1.size() + part2.size() + 1);
-  ret = part1;
-  if (!ret.empty() && ret.back() != sep) {
-    ret += sep;
-  }
-  ret += part2;
-  return ret;
-}
-
-static inline bool env2bool(const char* envName, bool defaultValue = false) {
-  char* envValue = getenv(envName);
-  if (envValue == nullptr) {
-    return defaultValue;
-  } else {
-    return memchr("tTyY1\0", envValue[0], 6) != nullptr;
-  }
-}
-
-static inline int env2int(const char* envName, int defaultValue = 0) {
-  char* envValue = getenv(envName);
-  if (envValue == nullptr) {
-    return defaultValue;
-  } else {
-    int retValue = defaultValue;
-    try {
-      retValue = std::stoi(envValue);
-    } catch (...) {
-      // pass
-    }
-    return retValue;
-  }
-}
-
-static inline int env2index(const char* envName,
-                            const std::vector<std::string>& options,
-                            int defaultValue) {
-  char* envValue = getenv(envName);
-  if (envValue == nullptr) {
-    return defaultValue;
-  } else {
-    for (size_t i = 0; i < options.size(); ++i) {
-      if (options[i] == envValue) {
-        return static_cast<int>(i);
-      }
-    }
-    return defaultValue;
-  }
-}
-
-static bool gLogToStderr = env2bool("PLOG_LOGTOSTDERR", true);
-static const std::vector<std::string> gLevelName = {
-    "INFO", "WARNING", "ERROR", "FATAL"};
-static int gMinLogLevel =
-    env2int("PLOG_MINLOGLEVEL", env2index("PLOG_MINLOGLEVEL", gLevelName, 0));
-
-static std::vector<std::vector<int>> gLogFds;
-static std::vector<int> gLogFileFds;
-static bool gLogInited = false;
-static void freeLogFileFds() {
-  for (auto fd : gLogFileFds) {
-    close(fd);
-  }
-}
-
-static void initializeLogFds(char* argv0) {
-  gLogFds.resize(NUM_SEVERITIES);
-
-  for (int i = gMinLogLevel; i < NUM_SEVERITIES && gLogToStderr;
-       ++i) {  // Add stderr
-    std::vector<int>& fds = gLogFds[i];
-    fds.push_back(STDERR_FILENO);
-  }
-
-  char* logDir = getenv("PLOG_LOGDIR");
-
-  for (int i = gMinLogLevel; i < NUM_SEVERITIES && logDir != nullptr; ++i) {
-    std::string filename =
-        join(logDir, std::string(argv0) + "." + gLevelName[i]);
-    int fd = open(filename.c_str(), O_CREAT | O_WRONLY, 0644);
-    if (fd == -1) {
-      fprintf(stderr, "Open log file error!");
-      exit(1);
-    }
-    gLogFileFds.push_back(fd);
-
-    std::vector<int>& curFds = gLogFds[i];
-    curFds.insert(curFds.end(), gLogFileFds.begin(), gLogFileFds.end());
-  }
-
-  atexit(freeLogFileFds);
-  gLogInited = true;
-}
-
-static void (*gFailureFunctionPtr)() ATTR_NORETURN = abort;
-
-LogMessage::LogMessage(const char* fname, int line, int severity)
-    : fname_(fname), line_(line), severity_(severity) {}
-
-LogMessage::~LogMessage() { this->generateLogMessage(); }
-
-void LogMessage::generateLogMessage() {
-  if (!gLogInited) {
-    fprintf(stderr,
-            "%c %s:%d] %s\n",
-            "IWEF"[severity_],
-            fname_,
-            line_,
-            str().c_str());
-  } else {
-    for (auto& fd : gLogFds[this->severity_]) {
-      dprintf(fd,
-              "%c %s:%d] %s\n",
-              "IWEF"[severity_],
-              fname_,
-              line_,
-              str().c_str());
-    }
-  }
-}
-
-LogMessageFatal::LogMessageFatal(const char* file, int line)
-    : LogMessage(file, line, FATAL) {}
-
-LogMessageFatal::~LogMessageFatal() {
-  generateLogMessage();
-  gFailureFunctionPtr();
-}
-}  // namespace internal
-
-void initializeLogging(int argc, char** argv) {
-  internal::initializeLogFds(argv[0]);
-}
-
-namespace logging {
-void setMinLogLevel(int level) { paddle::internal::gMinLogLevel = level; }
-
-void installFailureFunction(void (*callback)() ATTR_NORETURN) {
-  paddle::internal::gFailureFunctionPtr = callback;
-}
-
-}  // namespace logging
-
-}  // namespace paddle
-
-#else
-namespace paddle {
 void initializeLogging(int argc, char** argv) {
   (void)(argc);
   if (!getenv("GLOG_logtostderr")) {
@@ -197,13 +31,16 @@ void initializeLogging(int argc, char** argv) {
 }
 
 namespace logging {
+
 void setMinLogLevel(int level) { FLAGS_minloglevel = level; }
+
 void installFailureFunction(void (*callback)()) {
   google::InstallFailureFunction(callback);
 }
+
 void installFailureWriter(void (*callback)(const char*, int)) {
   google::InstallFailureWriter(callback);
 }
+
 }  // namespace logging
 }  // namespace paddle
-#endif
diff --git a/paddle/utils/Logging.h b/paddle/utils/Logging.h
index 4379289f6d..d9e551f089 100644
--- a/paddle/utils/Logging.h
+++ b/paddle/utils/Logging.h
@@ -22,175 +22,21 @@ limitations under the License. */
 #include <sstream>
 #include <string>
 
-#ifndef PADDLE_USE_GLOG
-#include "CompilerMacros.h"
-
-//! TODO(yuyang18): Move this utility macro into some global header.
-#define PP_CAT(a, b) PP_CAT_I(a, b)
-#define PP_CAT_I(a, b) PP_CAT_II(~, a##b)
-#define PP_CAT_II(p, res) res
-
-/**
- * Generate Unique Variable Name, Usefully in macro.
- * @SEE
- * http://stackoverflow.com/questions/1082192/how-to-generate-random-variable-names-in-c-using-macros
- */
-#define UNIQUE_NAME(base) PP_CAT(base, __LINE__)
-
+#include <glog/logging.h>
 namespace paddle {
 
-//! Log levels.
-const int INFO = 0;
-const int WARNING = 1;
-const int ERROR = 2;
-const int FATAL = 3;
-const int NUM_SEVERITIES = 4;
-
-namespace internal {
-
-class LogMessage : public std::basic_ostringstream<char> {
-public:
-  LogMessage(const char* fname, int line, int severity);
-  ~LogMessage();
-
-protected:
-  /**
-   * @brief Print log message to stderr, files, etc.
-   */
-  void generateLogMessage();
-
-private:
-  const char* fname_;
-  int line_;
-  int severity_;
-};
-
-// LogMessageFatal ensures the process will exit in failure after
-// logging this message.
-class LogMessageFatal : public LogMessage {
-public:
-  LogMessageFatal(const char* file, int line) __attribute__((cold));
-  ~LogMessageFatal() __attribute__((noreturn));
-};
-
-#define _P_LOG_INFO \
-  ::paddle::internal::LogMessage(__FILE__, __LINE__, paddle::INFO)
-#define _P_LOG_WARNING \
-  ::paddle::internal::LogMessage(__FILE__, __LINE__, paddle::WARNING)
-#define _P_LOG_ERROR \
-  ::paddle::internal::LogMessage(__FILE__, __LINE__, paddle::ERROR)
-#define _P_LOG_FATAL ::paddle::internal::LogMessageFatal(__FILE__, __LINE__)
-
-#define P_LOG(severity) _P_LOG_##severity
-
-#define P_LOG_FIRST_N(severity, n)                                       \
-  static int UNIQUE_NAME(LOG_OCCURRENCES) = 0;                           \
-  if (UNIQUE_NAME(LOG_OCCURRENCES) <= n) ++UNIQUE_NAME(LOG_OCCURRENCES); \
-  if (UNIQUE_NAME(LOG_OCCURRENCES) <= n) P_LOG(severity)
-
-#define P_LOG_IF_EVERY_N(severity, condition, n)                              \
-  static int UNIQUE_NAME(LOG_OCCURRENCES) = 0;                                \
-  if (condition && ((UNIQUE_NAME(LOG_OCCURRENCES) =                           \
-                         (UNIQUE_NAME(LOG_OCCURRENCES) + 1) % n) == (1 % n))) \
-  P_LOG(severity)
-
-#define P_LOG_EVERY_N(severity, n) P_LOG_IF_EVERY_N(severity, true, n)
-
-// TODO(jeff): Define a proper implementation of VLOG_IS_ON
-#define P_VLOG_IS_ON(lvl) ((lvl) <= 0)
-
-#define P_LOG_IF(severity, condition) \
-  if (condition) P_LOG(severity)
-
-#define P_VLOG(lvl) P_LOG_IF(INFO, P_VLOG_IS_ON(lvl))
-
-#define P_VLOG_IF(lvl, cond) P_LOG_IF(INFO, P_VLOG_IS_ON(lvl) && cond)
-
-#define P_VLOG_EVERY_N(lvl, n) P_LOG_IF_EVERY_N(INFO, P_VLOG_IS_ON(lvl), n)
-
-#define PREDICT_FALSE(x) (__builtin_expect(x, 0))
-#define PREDICT_TRUE(x) (__builtin_expect(!!(x), 1))
-
-// CHECK dies with a fatal error if condition is not true.  It is *not*
-// controlled by NDEBUG, so the check will be executed regardless of
-// compilation mode.  Therefore, it is safe to do things like:
-//    CHECK(fp->Write(x) == 4)
-#define P_CHECK(condition)         \
-  if (PREDICT_FALSE(!(condition))) \
-  P_LOG(FATAL) << "Check failed: " #condition " "
-
-#define P_CHECK_EQ(val1, val2) P_CHECK((val1) == (val2))
-#define P_CHECK_NE(val1, val2) P_CHECK((val1) != (val2))
-#define P_CHECK_LE(val1, val2) P_CHECK((val1) <= (val2))
-#define P_CHECK_LT(val1, val2) P_CHECK((val1) < (val2))
-#define P_CHECK_GE(val1, val2) P_CHECK((val1) >= (val2))
-#define P_CHECK_GT(val1, val2) P_CHECK((val1) > (val2))
-#define P_CHECK_NOTNULL(val) P_CHECK((val) != NULL)
-
-//! GLOG compatible APIs
-//! NOTE: only implement Paddle actually used APIs.
-#define LOG(x) P_LOG(x)
-#define VLOG(x) P_VLOG(x)
-#define DLOG(x) P_VLOG(5)
-#define CHECK(x) P_CHECK(x)
-#define PCHECK(x) P_CHECK(x)
-#define CHECK_EQ(val1, val2) P_CHECK((val1) == (val2))
-#define CHECK_NE(val1, val2) P_CHECK((val1) != (val2))
-#define CHECK_LE(val1, val2) P_CHECK((val1) <= (val2))
-#define CHECK_LT(val1, val2) P_CHECK((val1) < (val2))
-#define CHECK_GE(val1, val2) P_CHECK((val1) >= (val2))
-#define CHECK_GT(val1, val2) P_CHECK((val1) > (val2))
-#define CHECK_NOTNULL(val) P_CHECK((val) != NULL)
-#define VLOG_IS_ON(x) P_VLOG_IS_ON(x)
-#define LOG_FIRST_N(severity, n) P_LOG_FIRST_N(severity, n)
-#define LOG_IF(severity, condition) P_LOG_IF(severity, condition)
-#define VLOG_EVERY_N(lvl, n) P_VLOG_EVERY_N(lvl, n)
-#define VLOG_IF(lvl, cond) P_VLOG_IF(lvl, cond)
-#define LOG_EVERY_N(severity, n) P_LOG_EVERY_N(severity, n)
-}  //  namespace internal
-
-/**
- * @brief initialize logging
- * @note: Current implement of logging is lack of:
- *          PrintCallStack when fatal.
- *          VLOG_IS_ON
- *        But it is portable to multi-platform, and simple enough to modify.
- */
 void initializeLogging(int argc, char** argv);
-namespace logging {
-/**
- * @brief Set Min Log Level. if Log.level < minLogLevel, then will not print log
- *        to stream
- * @param level. Any integer is OK, but only 0 <= x <= NUM_SEVERITIES is useful.
- */
-void setMinLogLevel(int level);
-
-/**
- * @brief Install Log(Fatal) failure function. Default is abort();
- * @param callback: The failure function.
- */
-void installFailureFunction(void (*callback)() ATTR_NORETURN);
 
-/**
- * @brief installFailureWriter
- * @note: not implemented currently.
- */
-inline void installFailureWriter(void (*callback)(const char*, int)) {
-  (void)(callback);  // unused callback.
-}
-}  //  namespace logging
-}  //  namespace paddle
-#else
-#include <glog/logging.h>
-namespace paddle {
-void initializeLogging(int argc, char** argv);
 namespace logging {
+
 void setMinLogLevel(int level);
+
 void installFailureFunction(void (*callback)());
+
 void installFailureWriter(void (*callback)(const char*, int));
-}  //  namespace logging
-}
-#endif  // PADDLE_USE_GLOG
+
+}  // namespace logging
+}  // namespace paddle
 
 #ifndef NDEBUG
 #define DEBUG_LEVEL 5
diff --git a/paddle/utils/PythonUtil.cpp b/paddle/utils/PythonUtil.cpp.in
similarity index 97%
rename from paddle/utils/PythonUtil.cpp
rename to paddle/utils/PythonUtil.cpp.in
index 2ee4e4fb7e..66b5795e29 100644
--- a/paddle/utils/PythonUtil.cpp
+++ b/paddle/utils/PythonUtil.cpp.in
@@ -20,8 +20,8 @@ namespace paddle {
 
 #ifdef PADDLE_NO_PYTHON
 
-P_DEFINE_string(python_path, "", "python path");
-P_DEFINE_string(python_bin, "python2.7", "python bin");
+DEFINE_string(python_path, "", "python path");
+DEFINE_string(python_bin, "python2.7", "python bin");
 
 constexpr int kExecuteCMDBufLength = 204800;
 
@@ -195,6 +195,10 @@ extern const char enable_virtualenv_py[];
 }
 void initPython(int argc, char** argv) {
 #ifndef PADDLE_NO_PYTHON
+  // Py_SetPythonHome() keeps this pointer, so the buffer must be static.
+  static char pyHome[] = "@PYTHON_INSTALL_DIR@";  // NOLINT
+  // Filled in at configure time; when empty, keep Python's default home.
+  if (pyHome[0] != '\0') Py_SetPythonHome(pyHome);
   Py_SetProgramName(argv[0]);
   Py_Initialize();
   PySys_SetArgv(argc, argv);
diff --git a/paddle/utils/Stat.cpp b/paddle/utils/Stat.cpp
index 44acee2495..c7194d3bf1 100644
--- a/paddle/utils/Stat.cpp
+++ b/paddle/utils/Stat.cpp
@@ -137,6 +137,9 @@ void StatSet::printSegTimerStatus() {
 
 void StatSet::printBarrierTimerStatus() {
   ReadLockGuard guard(lock_);
+  if (barrierStatSet_.empty()) {
+    return;
+  }
   // control barrierAbstact in runtime, so enable compliation
   LOG(INFO) << std::setiosflags(std::ios::left) << std::setfill(' ')
             << "======= BarrierStatSet status ======" << std::endl;
diff --git a/paddle/utils/Stat.h b/paddle/utils/Stat.h
index 9be79e8859..d9cc6e413a 100644
--- a/paddle/utils/Stat.h
+++ b/paddle/utils/Stat.h
@@ -258,28 +258,41 @@ inline StatSet& registerTimerArg2(uint64_t threshold = -1,
 // The default arguments are shown in the following line:
 // REGISTER_TIMER(statName, threshold = -1, statSet = globalStat)
 // TODO(yuyang18,wangyanfei01): if UNIQUE_NAME is needed
-#define REGISTER_TIMER(statName, ...)                                       \
-  static StatPtr __stat = registerTimerArg2(__VA_ARGS__).getStat(statName); \
-  TimerOnce __timerOnce(__stat.get(), "", registerTimerArg1(__VA_ARGS__));
+#define REGISTER_TIMER(statName, ...)                             \
+  static ::paddle::StatPtr __stat =                               \
+      ::paddle::registerTimerArg2(__VA_ARGS__).getStat(statName); \
+  ::paddle::TimerOnce __timerOnce(                                \
+      __stat.get(), "", ::paddle::registerTimerArg1(__VA_ARGS__));
 
 #define REGISTER_TIMER_SET(statName, start, ...)                            \
-  static StatPtr __stat = registerTimerArg2(__VA_ARGS__).getStat(statName); \
-  TimerOnce __timerOnce(                                                    \
-      __stat.get(), "", registerTimerArg1(__VA_ARGS__), false, start);
+  static ::paddle::StatPtr __stat =                                         \
+      ::paddle::registerTimerArg2(__VA_ARGS__).getStat(statName);           \
+  ::paddle::TimerOnce __timerOnce(__stat.get(),                             \
+                                  "",                                       \
+                                  ::paddle::registerTimerArg1(__VA_ARGS__), \
+                                  false,                                    \
+                                  start);
 
 // dynmaic timer, support to discriminate runtime entity, used in pserver
-#define REGISTER_TIMER_DYNAMIC(statName, ...)                        \
-  StatPtr __stat = registerTimerArg2(__VA_ARGS__).getStat(statName); \
-  TimerOnce __timerOnce(__stat.get(), "", registerTimerArg1(__VA_ARGS__));
-
-#define REGISTER_TIMER_DYNAMIC_SET(statName, start, ...)             \
-  StatPtr __stat = registerTimerArg2(__VA_ARGS__).getStat(statName); \
-  TimerOnce __timerOnce(                                             \
-      __stat.get(), "", registerTimerArg1(__VA_ARGS__), false, start);
-
-#define REGISTER_TIMER_INFO(statName, info)             \
-  static StatPtr __stat = globalStat.getStat(statName); \
-  TimerOnce __timerOnce(__stat.get(), info, 10 * 1000000LU /*threshold*/);
+#define REGISTER_TIMER_DYNAMIC(statName, ...)                     \
+  ::paddle::StatPtr __stat =                                      \
+      ::paddle::registerTimerArg2(__VA_ARGS__).getStat(statName); \
+  ::paddle::TimerOnce __timerOnce(                                \
+      __stat.get(), "", ::paddle::registerTimerArg1(__VA_ARGS__));
+
+#define REGISTER_TIMER_DYNAMIC_SET(statName, start, ...)                    \
+  ::paddle::StatPtr __stat =                                                \
+      ::paddle::registerTimerArg2(__VA_ARGS__).getStat(statName);           \
+  ::paddle::TimerOnce __timerOnce(__stat.get(),                             \
+                                  "",                                       \
+                                  ::paddle::registerTimerArg1(__VA_ARGS__), \
+                                  false,                                    \
+                                  start);
+
+#define REGISTER_TIMER_INFO(statName, info)                                 \
+  static ::paddle::StatPtr __stat = ::paddle::globalStat.getStat(statName); \
+  ::paddle::TimerOnce __timerOnce(                                          \
+      __stat.get(), info, 10 * 1000000LU /*threshold*/);
 
 #endif  // DISABLE_TIMER
 
diff --git a/paddle/utils/ThreadLocal.cpp b/paddle/utils/ThreadLocal.cpp
index 8a2878fc4b..58fe51bd40 100644
--- a/paddle/utils/ThreadLocal.cpp
+++ b/paddle/utils/ThreadLocal.cpp
@@ -13,12 +13,14 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "ThreadLocal.h"
-#include "CommandLineParser.h"
+
+#include <gflags/gflags.h>
+
 #include "Util.h"
 
-P_DEFINE_bool(thread_local_rand_use_global_seed,
-              false,
-              "Whether to use global seed in thread local rand.");
+DEFINE_bool(thread_local_rand_use_global_seed,
+            false,
+            "Whether to use global seed in thread local rand.");
 
 namespace paddle {
 
@@ -35,7 +37,7 @@ unsigned int* ThreadLocalRand::getSeed() {
       p = new unsigned int(defaultSeed_ - 1);
     } else {
       p = new unsigned int(defaultSeed_ + getTID());
-      LOG(INFO) << "thread use undeterministic rand seed:" << *p;
+      VLOG(3) << "thread use undeterministic rand seed:" << *p;
     }
     seed_.set(p);
   }
diff --git a/paddle/utils/Util.cpp b/paddle/utils/Util.cpp
index 26ff385c84..411a64aa8d 100644
--- a/paddle/utils/Util.cpp
+++ b/paddle/utils/Util.cpp
@@ -24,16 +24,16 @@ limitations under the License. */
 #include <fstream>
 #include <mutex>
 
-#include "paddle/utils/Logging.h"
+#include <gflags/gflags.h>
 
-#include "CommandLineParser.h"
 #include "CustomStackTrace.h"
+#include "Logging.h"
 #include "StringUtil.h"
 #include "Thread.h"
 #include "ThreadLocal.h"
 #include "Version.h"
 
-P_DEFINE_int32(seed, 1, "random number seed. 0 for srand(time)");
+DEFINE_int32(seed, 1, "random number seed. 0 for srand(time)");
 
 #ifdef WITH_GOOGLE_PERFTOOLS
 /*
@@ -52,10 +52,8 @@ P_DEFINE_int32(seed, 1, "random number seed. 0 for srand(time)");
 
 #include <gperftools/profiler.h>
 
-P_DEFINE_int32(profile_signal, 12, "signal for switch google profiler");
-P_DEFINE_string(profile_data_file,
-                "gperf.prof",
-                "file for storing profile data");
+DEFINE_int32(profile_signal, 12, "signal for switch google profiler");
+DEFINE_string(profile_data_file, "gperf.prof", "file for storing profile data");
 
 static void profilerSwitch(int signalNumber) {
   bool static started = false;
@@ -127,7 +125,7 @@ void registerInitFunction(std::function<void()> func, int priority) {
 
 void runInitFunctions() {
   std::call_once(g_onceFlag, []() {
-    LOG(INFO) << "Calling runInitFunctions";
+    VLOG(3) << "Calling runInitFunctions";
     if (g_initFuncs) {
       std::sort(g_initFuncs->begin(),
                 g_initFuncs->end(),
@@ -141,7 +139,7 @@ void runInitFunctions() {
       g_initFuncs = nullptr;
     }
     g_initialized = true;
-    LOG(INFO) << "Call runInitFunctions done.";
+    VLOG(3) << "Call runInitFunctions done.";
   });
 }
 
@@ -154,7 +152,12 @@ void initMain(int argc, char** argv) {
     line += ' ';
   }
   LOG(INFO) << "commandline: " << line;
-  ParseCommandLineFlags(&argc, argv, true);
+
+#ifndef GFLAGS_GFLAGS_H_
+  namespace gflags = google;
+#endif
+
+  gflags::ParseCommandLineFlags(&argc, &argv, true);
   CHECK_EQ(argc, 1) << "Unknown commandline argument: " << argv[1];
 
   installProfilerSwitch();
@@ -228,7 +231,7 @@ std::string join(const std::string& part1, const std::string& part2) {
 }  // namespace path
 
 void copyFileToPath(const std::string& file, const std::string& dir) {
-  LOG(INFO) << "copy " << file << " to " << dir;
+  VLOG(3) << "copy " << file << " to " << dir;
   std::string fileName = path::basename(file);
   std::string dst = path::join(dir, fileName);
   std::ifstream source(file, std::ios_base::binary);
diff --git a/paddle/utils/Util.h b/paddle/utils/Util.h
index 24ddde28e7..613844669d 100644
--- a/paddle/utils/Util.h
+++ b/paddle/utils/Util.h
@@ -26,13 +26,11 @@ limitations under the License. */
 #include <unordered_map>
 #include <vector>
 
-#include "CommandLineParser.h"
-#include "DisableCopy.h"
+#include "Common.h"
 #include "Logging.h"
 #include "TrainerConfig.pb.h"
 
 #include "Flags.h"
-#include "TypeDefs.h"
 #include "hl_gpu.h"
 
 /**
diff --git a/paddle/utils/Version.cpp b/paddle/utils/Version.cpp
index a9e351b69f..731c308421 100644
--- a/paddle/utils/Version.cpp
+++ b/paddle/utils/Version.cpp
@@ -18,13 +18,8 @@ limitations under the License. */
 #include <numeric>
 #include "Flags.h"
 #include "Util.h"
-//! TODO(yuyang18) in gflags, version has another define. Use another flag
-//! instead.
-#ifndef PADDLE_USE_GFLAGS
-P_DEFINE_bool(version, false, "print version");
-#else
-P_DECLARE_bool(version);
-#endif
+
+DECLARE_bool(version);
 
 namespace paddle {
 namespace version {
diff --git a/paddle/utils/Version.h b/paddle/utils/Version.h
index d1a07d9485..f53d6420bb 100644
--- a/paddle/utils/Version.h
+++ b/paddle/utils/Version.h
@@ -15,7 +15,7 @@ limitations under the License. */
 #pragma once
 #include <stddef.h>
 #include <iostream>
-#include "TypeDefs.h"
+#include "Common.h"
 
 namespace paddle {
 
diff --git a/paddle/utils/Excepts.cpp b/paddle/utils/arch/osx/Excepts.cpp
similarity index 97%
rename from paddle/utils/Excepts.cpp
rename to paddle/utils/arch/osx/Excepts.cpp
index 4ddce35ed3..c8e904d8f9 100644
--- a/paddle/utils/Excepts.cpp
+++ b/paddle/utils/arch/osx/Excepts.cpp
@@ -12,12 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "Excepts.h"
+#include "paddle/utils/Excepts.h"
 
 #if defined(__APPLE__) || defined(__OSX__)
 
-#include <fenv.h>
-
 int fegetexcept(void) {
   static fenv_t fenv;
   return fegetenv(&fenv) ? -1 : (fenv.__control & FE_ALL_EXCEPT);
diff --git a/paddle/utils/tests/CMakeLists.txt b/paddle/utils/tests/CMakeLists.txt
index 298ede5cd6..26fafbd1ab 100644
--- a/paddle/utils/tests/CMakeLists.txt
+++ b/paddle/utils/tests/CMakeLists.txt
@@ -1,5 +1,3 @@
-add_simple_unittest(test_CommandLineParser)
-add_simple_unittest(test_Logging)
 add_simple_unittest(test_Thread)
 add_simple_unittest(test_StringUtils)
 add_simple_unittest(test_CustomStackTrace)
diff --git a/paddle/utils/tests/test_CommandLineParser.cpp b/paddle/utils/tests/test_CommandLineParser.cpp
deleted file mode 100644
index ed2b3068d5..0000000000
--- a/paddle/utils/tests/test_CommandLineParser.cpp
+++ /dev/null
@@ -1,114 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifndef PADDLE_USE_GFLAGS
-//! Test Command Line Parser for paddle internal implement.
-
-#include <gtest/gtest.h>
-#include <paddle/utils/CommandLineParser.h>
-
-P_DEFINE_int32(i1, 1, "test int flag 1");
-P_DEFINE_int32(i2, 2, "test int flag 2");
-
-P_DEFINE_string(str1, "1", "test str flag 1");
-P_DEFINE_string(str2, "2", "test str flag 2");
-
-P_DEFINE_bool(b1, true, "test bool flag 1");
-P_DEFINE_bool(b2, false, "test bool flag 2");
-
-P_DEFINE_double(d1, 0.1, "test double flag 1");
-P_DEFINE_double(d2, -42.3, "test double flag 2");
-
-P_DEFINE_int64(l1, 1, "test int64 flag 1");
-P_DEFINE_int64(l2, 2, "test int64 flag 2");
-
-P_DEFINE_uint64(ul1, 32, "test uint64 flag 1");
-P_DEFINE_uint64(ul2, 33, "test uint64 flag 2");
-
-constexpr double EPSILON = 1e-5;
-
-#define cc(x) const_cast<char*>((x))
-
-TEST(CommandLineParser, defaultValue) {
-  char* argv[] = {cc("test_program"), cc("--unused_flag=134")};
-  int argc = sizeof(argv) / sizeof(char*);
-
-  paddle::ParseCommandLineFlags(&argc, argv);
-
-  // Check Default Value
-  ASSERT_EQ(argc, 2);
-  ASSERT_EQ(FLAGS_i1, 1);
-  ASSERT_EQ(FLAGS_i2, 2);
-  ASSERT_EQ(FLAGS_str1, "1");
-  ASSERT_EQ(FLAGS_str2, "2");
-  ASSERT_EQ(FLAGS_b1, true);
-  ASSERT_EQ(FLAGS_b2, false);
-  ASSERT_NEAR(FLAGS_d1, 0.1, EPSILON);
-  ASSERT_NEAR(FLAGS_d2, -42.3, EPSILON);
-  ASSERT_EQ(FLAGS_i1, 1);
-  ASSERT_EQ(FLAGS_i2, 2);
-  ASSERT_EQ(FLAGS_ul1, 32UL);
-  ASSERT_EQ(FLAGS_ul2, 33UL);
-}
-
-TEST(CommandLineParser, normal) {
-  char* argv[] = {cc("test_program"),
-                  cc("--i2=32"),
-                  cc("--str1=abc"),
-                  cc("--b2=1"),
-                  cc("-b1=False"),
-                  cc("--d2=.34"),
-                  cc("--d1=0"),
-                  cc("--l1=-12345678901234"),
-                  cc("-ul2=3212")};
-  int argc = sizeof(argv) / sizeof(char*);
-  paddle::ParseCommandLineFlags(&argc, argv);
-  ASSERT_EQ(argc, 1);
-  ASSERT_EQ(FLAGS_i2, 32);
-  ASSERT_EQ(FLAGS_str1, "abc");
-  ASSERT_EQ(FLAGS_b2, true);
-  ASSERT_EQ(FLAGS_b1, false);
-  ASSERT_NEAR(FLAGS_d2, 0.34, EPSILON);
-  ASSERT_NEAR(FLAGS_d1, 0.0, EPSILON);
-  ASSERT_EQ(FLAGS_l1, -12345678901234);
-  ASSERT_EQ(FLAGS_ul2, 3212UL);
-}
-
-TEST(CommandLineParser, printHelp) {
-  char* argv[] = {cc("test_program"), cc("--help")};
-  int argc = sizeof(argv) / sizeof(char*);
-
-  // Will Print Usage
-  ASSERT_DEATH(paddle::ParseCommandLineFlags(&argc, argv), ".*test_program.*");
-}
-
-TEST(CommandLineParser, parseError) {
-  char* argv[] = {cc("test_program"), cc("--i1=abc")};
-
-  int argc = sizeof(argv) / sizeof(char*);
-  ASSERT_DEATH(
-      paddle::ParseCommandLineFlags(&argc, argv),
-      "Parse command flag i1 error! User input is --i1=abc.*test_program.*");
-}
-
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  return RUN_ALL_TESTS();
-}
-
-#else
-
-int main(int argc, char** argv) { return 0; }
-
-#endif
diff --git a/paddle/utils/tests/test_CustomStackTrace.cpp b/paddle/utils/tests/test_CustomStackTrace.cpp
index 292ed4619d..378788bcec 100644
--- a/paddle/utils/tests/test_CustomStackTrace.cpp
+++ b/paddle/utils/tests/test_CustomStackTrace.cpp
@@ -12,15 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include <gtest/gtest.h>
 #include <chrono>
 
-#include "paddle/utils/CommandLineParser.h"
+#include <gflags/gflags.h>
+#include <gtest/gtest.h>
+
 #include "paddle/utils/CustomStackTrace.h"
 #include "paddle/utils/Locks.h"
 #include "paddle/utils/Util.h"
 
-P_DEFINE_int32(test_thread_num, 10, "testing thread number");
+DEFINE_int32(test_thread_num, 10, "testing thread number");
 
 void testNormalImpl(
     const std::function<void(paddle::CustomStackTrace<std::string>&,
@@ -95,9 +96,3 @@ TEST(CustomStackTrace, normalTest) {
     }
   });
 }
-
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  paddle::initMain(argc, argv);
-  return RUN_ALL_TESTS();
-}
diff --git a/paddle/utils/tests/test_Logging.cpp b/paddle/utils/tests/test_Logging.cpp
deleted file mode 100644
index fbfffcc65a..0000000000
--- a/paddle/utils/tests/test_Logging.cpp
+++ /dev/null
@@ -1,162 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-/*
- * Basically from tensorflow/core/platform/default/logging.cc
- * Used in embedded system where there is no glogs.
- */
-
-#include <dirent.h>
-#include <gtest/gtest.h>
-#include <stdlib.h>
-#include <fstream>
-#include "paddle/utils/Logging.h"
-#include "paddle/utils/Util.h"
-#ifndef PADDLE_USE_GLOG
-TEST(Logging, BasicalLog) {
-  auto pinfo = [] {
-    P_LOG(INFO) << "INFO";
-    exit(1);
-  };
-  ASSERT_DEATH(pinfo(), "I .*test_Logging.cpp:[0-9]+] INFO");
-
-  auto pwarn = [] {
-    P_LOG(WARNING) << "WARN";
-    exit(1);
-  };
-  ASSERT_DEATH(pwarn(), "W .*test_Logging.cpp:[0-9]+] WARN");
-
-  auto perr = [] {
-    P_LOG(ERROR) << "ERROR";
-    exit(1);
-  };
-  ASSERT_DEATH(perr(), "E .*test_Logging.cpp:[0-9]+] ERROR");
-
-  auto pfatal = [] { P_LOG(FATAL) << "FATAL"; };
-  ASSERT_DEATH(pfatal(), "F .*test_Logging.cpp:[0-9]+] FATAL");
-}
-
-TEST(Logging, Check) {
-  int a = 1;
-  int b = 2;
-  P_CHECK(a != b);
-
-  auto pcheckDown = [&] { P_CHECK(a == b); };
-  ASSERT_DEATH(pcheckDown(),
-               "F .*test_Logging.cpp:[0-9]+] Check failed: a == b ");
-
-  P_CHECK_LE(a, b);
-  P_CHECK_LT(a, b);
-  double t = 1.2;
-  P_CHECK_LE(a, t);
-  double* ptr = nullptr;
-
-  auto pcheckDown2 = [&] { P_CHECK_NOTNULL(ptr); };
-  ASSERT_DEATH(pcheckDown2(), "F");
-}
-
-#define cc(x) const_cast<char*>(x)
-
-TEST(Logging, LogToStderr) {
-  auto logToStderrCallback = [] {
-    setenv("PLOG_LOGTOSTDERR", "0", true);
-    char* argv[] = {cc("test")};
-    paddle::initializeLogging(1, argv);
-    P_LOG(INFO) << "This output will not print to std error";
-    exit(1);
-  };
-
-  ASSERT_DEATH(logToStderrCallback(), "");
-}
-
-constexpr char kLogDirName[] = "./test_log_dir";
-const std::vector<std::string> kLevels = {"INFO", "WARNING", "ERROR", "FATAL"};
-
-TEST(Logging, LogToDir) {
-  ASSERT_EQ(0, mkdir(kLogDirName, 0777));
-  auto logToDirCallback = [] {
-    setenv("PLOG_LOGTOSTDERR", "0", true);
-    setenv("PLOG_LOGDIR", kLogDirName, true);
-    char* argv[] = {cc("test")};
-    paddle::initializeLogging(1, argv);
-
-    P_LOG(INFO) << "INFO";
-    P_LOG(WARNING) << "WARNING";
-    P_LOG(ERROR) << "ERROR";
-    P_LOG(FATAL) << "FATAL";
-  };
-  ASSERT_DEATH(logToDirCallback(), "");
-
-  // There 4 file in logdir
-  auto dir = opendir(kLogDirName);
-  size_t fileCount = 0;
-  std::vector<std::string> filenames;
-  for (auto dirContent = readdir(dir); dirContent != nullptr;
-       dirContent = readdir(dir)) {
-    std::string filename(dirContent->d_name);
-    if (filename == "." || filename == "..") {
-      continue;
-    } else {
-      ++fileCount;
-      for (size_t i = 0; i < kLevels.size(); ++i) {
-        const std::string& curLevel = kLevels[i];
-        if (filename.size() > curLevel.length()) {
-          size_t diff = filename.size() - curLevel.length();
-          size_t j = 0;
-          for (; j < curLevel.length(); ++j) {
-            if (filename[j + diff] != curLevel[j]) {
-              // File Suffix Not Same, then break.
-              break;
-            }
-          }
-          if (j == curLevel.length()) {  // Same suffix.
-            std::ifstream fin;
-            auto fn = paddle::path::join(kLogDirName, filename);
-            fin.open(fn);
-            filenames.push_back(fn);
-            ASSERT_TRUE(fin.is_open());
-            size_t lineCounter = 0;
-            for (std::string line; std::getline(fin, line); ++lineCounter) {
-              // Do Nothing, Just calc lineCounter.
-            }
-
-            // For example.
-            // The info channel will have all log which level >= INFO
-            // So the info file's lineCounter should == 4.
-            ASSERT_EQ(kLevels.size() - i, lineCounter);
-            fin.close();
-          }
-        }
-      }
-    }
-  }
-  closedir(dir);
-  ASSERT_EQ(4UL, fileCount);  // 4 levels.
-  // Clean Unittest.
-  for (std::string& fn : filenames) {
-    ASSERT_EQ(remove(fn.c_str()), 0);
-  }
-  ASSERT_EQ(rmdir(kLogDirName), 0);
-}
-
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  return RUN_ALL_TESTS();
-}
-
-#else
-
-int main(int, char**) { return 0; }
-
-#endif
diff --git a/paddle/utils/tests/test_SIMDFlags.cpp b/paddle/utils/tests/test_SIMDFlags.cpp
index 42edede209..8200a24ce7 100644
--- a/paddle/utils/tests/test_SIMDFlags.cpp
+++ b/paddle/utils/tests/test_SIMDFlags.cpp
@@ -44,8 +44,3 @@ TEST(SIMDFlags, normalPrint) {
   LOG(INFO) << "Has AVX2:    " << std::boolalpha << HAS_AVX2;
   LOG(INFO) << "Has AVX512:  " << std::boolalpha << HAS_AVX512;
 }
-
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  return RUN_ALL_TESTS();
-}
diff --git a/paddle/utils/tests/test_SpinLock.cpp b/paddle/utils/tests/test_SpinLock.cpp
index 22f8584ef5..cc34eb1f86 100644
--- a/paddle/utils/tests/test_SpinLock.cpp
+++ b/paddle/utils/tests/test_SpinLock.cpp
@@ -12,14 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include <gtest/gtest.h>
 #include <vector>
-#include "paddle/utils/CommandLineParser.h"
+
+#include <gflags/gflags.h>
+#include <gtest/gtest.h>
+
 #include "paddle/utils/Locks.h"
 #include "paddle/utils/Logging.h"
 #include "paddle/utils/Util.h"
 
-P_DEFINE_int32(test_thread_num, 100, "testing thread number");
+DEFINE_int32(test_thread_num, 100, "testing thread number");
 
 void testNormalImpl(
     size_t thread_num,
@@ -51,9 +53,3 @@ TEST(ThreadSpinLock, normalTest) {
         });
   }
 }
-
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  paddle::initMain(argc, argv);
-  return RUN_ALL_TESTS();
-}
diff --git a/paddle/utils/tests/test_Thread.cpp b/paddle/utils/tests/test_Thread.cpp
index 2f5c5bbce0..6e2580c491 100644
--- a/paddle/utils/tests/test_Thread.cpp
+++ b/paddle/utils/tests/test_Thread.cpp
@@ -79,8 +79,3 @@ TEST(AsyncThreadPool, addBatchJobWithResults) {
     ASSERT_EQ(res[i], i);
   }
 }
-
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  return RUN_ALL_TESTS();
-}
diff --git a/paddle/utils/tests/test_ThreadBarrier.cpp b/paddle/utils/tests/test_ThreadBarrier.cpp
index 4a8af5b97e..554b1c1d4a 100644
--- a/paddle/utils/tests/test_ThreadBarrier.cpp
+++ b/paddle/utils/tests/test_ThreadBarrier.cpp
@@ -12,15 +12,17 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include <gtest/gtest.h>
 #include <set>
 #include <vector>
-#include "paddle/utils/CommandLineParser.h"
+
+#include <gflags/gflags.h>
+#include <gtest/gtest.h>
+
 #include "paddle/utils/Locks.h"
 #include "paddle/utils/Logging.h"
 #include "paddle/utils/Util.h"
 
-P_DEFINE_int32(test_thread_num, 100, "testing thread number");
+DEFINE_int32(test_thread_num, 100, "testing thread number");
 
 void testNormalImpl(
     size_t thread_num,
@@ -62,9 +64,3 @@ TEST(ThreadBarrier, normalTest) {
                    });
   }
 }
-
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  paddle::initMain(argc, argv);
-  return RUN_ALL_TESTS();
-}
diff --git a/proto/CMakeLists.txt b/proto/CMakeLists.txt
index 2c40070eca..62d5b9e38b 100644
--- a/proto/CMakeLists.txt
+++ b/proto/CMakeLists.txt
@@ -4,7 +4,8 @@ set(proto_filenames
     ModelConfig.proto
     ParameterConfig.proto
     ParameterService.proto
-    TrainerConfig.proto)
+    TrainerConfig.proto
+    ParameterServerConfig.proto)
 
 set(PROTO_GEN)
 set(PROTO_GEN_PY)
@@ -18,10 +19,10 @@ foreach(filename ${proto_filenames})
         ${PROTO_GEN}
         ${CUR_PROTO_GEN})
     add_custom_command(OUTPUT ${CUR_PROTO_GEN}
-        COMMAND ${PROTOBUF_PROTOC_EXECUTABLE} 
+        COMMAND env ${py_env} ${PROTOBUF_PROTOC_EXECUTABLE} 
                   --cpp_out ${CMAKE_CURRENT_BINARY_DIR}
-		  --proto_path ${PROJ_ROOT}/proto ${PROJ_ROOT}/proto/${filename}
-        DEPENDS ${filename})
+          --proto_path ${PROJ_ROOT}/proto ${PROJ_ROOT}/proto/${filename}
+        DEPENDS ${filename} ${external_project_dependencies})
 
     set(CUR_PROTO_GEN_PY
         ${PROJ_ROOT}/paddle/python/paddle/proto/${base_filename}_pb2.py)
@@ -29,9 +30,9 @@ foreach(filename ${proto_filenames})
         ${CUR_PROTO_GEN_PY}
         ${PROTO_GEN_PY})
     add_custom_command(OUTPUT ${CUR_PROTO_GEN_PY}
-        COMMAND ${PROTOBUF_PROTOC_EXECUTABLE} --python_out ${PROJ_ROOT}/python/paddle/proto
-	--proto_path ${PROJ_ROOT}/proto ${PROJ_ROOT}/proto/${filename}
-        DEPENDS ${filename})
+        COMMAND env ${py_env} ${PROTOBUF_PROTOC_EXECUTABLE} --python_out ${PROJ_ROOT}/python/paddle/proto
+    --proto_path ${PROJ_ROOT}/proto ${PROJ_ROOT}/proto/${filename}
+        DEPENDS ${filename} ${external_project_dependencies})
 endforeach()
 
 include_directories(${CMAKE_CURRENT_BINARY_DIR}/proto)
diff --git a/proto/ModelConfig.proto b/proto/ModelConfig.proto
index b34e1ebded..3a9d339976 100644
--- a/proto/ModelConfig.proto
+++ b/proto/ModelConfig.proto
@@ -245,7 +245,14 @@ message ImageConfig {
 
   // The size of input feature map.
   required uint32 img_size = 8;
-  required uint32 img_size_y = 9;
+  optional uint32 img_size_y = 9;
+}
+
+message PriorBoxConfig {
+  repeated uint32 min_size = 1;
+  repeated uint32 max_size = 2;
+  repeated float aspect_ratio = 3;
+  repeated float variance = 4;
 }
 
 message LayerInputConfig {
@@ -263,6 +270,7 @@ message LayerInputConfig {
   optional BilinearInterpConfig bilinear_interp_conf = 10;
   optional MaxOutConfig maxout_conf = 11;
   optional SppConfig spp_conf = 12;
+  optional PriorBoxConfig priorbox_conf = 13;
 }
 
 message LayerConfig {
@@ -433,8 +441,10 @@ message EvaluatorConfig {
   repeated string input_layers = 3;
 
   // Used by ChunkEvaluator
-  optional string chunk_scheme = 4; // one of "IOB", "IOE", "IOBES"
-  optional int32 num_chunk_types = 5; // number of chunk types other than "other"
+  // one of "IOB", "IOE", "IOBES"
+  optional string chunk_scheme = 4;
+  // number of chunk types other than "other"
+  optional int32 num_chunk_types = 5;
 
   // Used by PrecisionRecallEvaluator and ClassificationErrorEvaluator
   // For multi binary labels: true if output > classification_threshold
@@ -453,6 +463,10 @@ message EvaluatorConfig {
 
   // whether to delimit the sequence in the seq_text_printer
   optional bool delimited = 11 [default = true];
+
+  // Used by ChunkEvaluator
+  // chunks of these types are not counted
+  repeated int32 excluded_chunk_types = 12;
 }
 
 message LinkConfig {
diff --git a/proto/ParameterServerConfig.proto b/proto/ParameterServerConfig.proto
new file mode 100644
index 0000000000..3068bba8b1
--- /dev/null
+++ b/proto/ParameterServerConfig.proto
@@ -0,0 +1,50 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+syntax = "proto2";
+
+package paddle;
+
+
+/**
+ * Configuration structure for ParameterClient2.
+ */
+message ParameterClientConfig {
+  required int32 trainer_id = 1;
+}
+
+/**
+ * Configuration structure for ParameterServer2.
+ */
+message ParameterServerConfig {
+  // The ports number for parameter send,
+  // increment based on default port number
+  required int32 ports_num = 1 [default = 1];
+  // The ports number for sparse parameter send,
+  // increment based on default (port + ports_num)
+  required int32 ports_num_for_sparse = 2 [default = 0];
+  // network device name for pservers
+  required string nics = 3 [default = "xgbe0,xgbe1"];
+  required string rdma_tcp = 4 [default = "tcp"];
+  // Listening port for pserver
+  required int32 port = 5 [default = 20134];
+  // number of gradient servers
+  required int32 num_gradient_servers = 6 [default = 1];
+  // number of threads for sync op exec
+  required int32 pserver_num_threads = 7 [default = 1];
+  // control config_.async_lagged_grad_discard_ratio() min value
+  required double async_lagged_ratio_min = 8 [default = 1.0];
+  // if async_lagged_grad_discard_ratio is not set in trainer_config.conf
+  // use it as default value
+  required double async_lagged_ratio_default = 9 [default = 1.5];
+}
\ No newline at end of file
diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt
index dce0b90952..1cda4762eb 100644
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -10,26 +10,17 @@ set(PY_FILES paddle/__init__.py
              ${HELPERS_PY_FILES}
              ${UTILS_PY_FILES})
 
-set(PADDLE_INTERNAL_PACKAGE "")
-if (PADDLE_WITH_INTERNAL)
-    set(PADDLE_INTERNAL_PACKAGE "paddle.internals")
-endif()
-
 configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in
     ${CMAKE_CURRENT_BINARY_DIR}/setup.py)
 
 add_custom_command(OUTPUT ${OUTPUT_DIR}/.timestamp
-    COMMAND ${PYTHON_EXECUTABLE} setup.py bdist_wheel
+    COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel
     COMMAND ${CMAKE_COMMAND} -E touch ${OUTPUT_DIR}/.timestamp
-    DEPENDS gen_proto_py ${PY_FILES})
+    DEPENDS gen_proto_py ${PY_FILES} ${external_project_dependencies})
 
 add_custom_target(paddle_python ALL DEPENDS
     ${OUTPUT_DIR}/.timestamp)
 
-find_python_module(pip REQUIRED)
-find_python_module(wheel REQUIRED)
-find_python_module(google.protobuf REQUIRED)
-
 add_subdirectory(paddle/trainer_config_helpers/tests)
 
 install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/dist/
diff --git a/python/paddle/trainer/PyDataProvider2.py b/python/paddle/trainer/PyDataProvider2.py
index 6618153df3..bd24c68b6f 100644
--- a/python/paddle/trainer/PyDataProvider2.py
+++ b/python/paddle/trainer/PyDataProvider2.py
@@ -107,8 +107,7 @@ def integer_value_sub_sequence(dim):
     return integer_value(dim, seq_type=SequenceType.SUB_SEQUENCE)
 
 
-def integer_sequence(dim):
-    return index_slot(dim, seq_type=SequenceType.SEQUENCE)
+integer_sequence = integer_value_sequence
 
 
 class SingleSlotWrapper(object):
@@ -233,7 +232,7 @@ def provider(input_types=None,
              check=False,
              check_fail_continue=False,
              init_hook=None,
-             **kwargs):
+             **outter_kwargs):
     """
     Provider decorator. Use it to make a function into PyDataProvider2 object.
     In this function, user only need to get each sample for some train/test
@@ -279,7 +278,7 @@ def provider(input_types=None,
                                 custom calculate one sample's batch_size.
 
                                 It is very danger to set it to false and use
-                                calc_batch_size together. Default is false.
+                                calc_batch_size together. Default is true.
     :type can_over_batch_size: bool
 
     :param calc_batch_size: a method to calculate each sample's batch size.
@@ -319,11 +318,6 @@ def provider(input_types=None,
                 self.logger = logging.getLogger("")
                 self.logger.setLevel(logging.INFO)
                 self.input_types = None
-                if 'slots' in kwargs:
-                    self.logger.warning('setting slots value is deprecated, '
-                                        'please use input_types instead.')
-                    self.slots = kwargs['slots']
-                self.slots = input_types
                 self.should_shuffle = should_shuffle
 
                 true_table = [1, 't', 'true', 'on']
@@ -359,9 +353,19 @@ def provider(input_types=None,
                 self.check = check
                 if init_hook is not None:
                     init_hook(self, file_list=file_list, **kwargs)
+
+                if 'slots' in outter_kwargs:
+                    self.logger.warning('setting slots value is deprecated, '
+                                        'please use input_types instead.')
+                    self.slots = outter_kwargs['slots']
+                if input_types is not None:
+                    self.slots = input_types
+
                 if self.input_types is not None:
                     self.slots = self.input_types
-                assert self.slots is not None
+
+                assert self.slots is not None, \
+                    "Data Provider's input_types must be set"
                 assert self.generator is not None
 
                 use_dynamic_order = False
diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py
index 5b7f4d85e2..674b5ac58b 100644
--- a/python/paddle/trainer/config_parser.py
+++ b/python/paddle/trainer/config_parser.py
@@ -498,9 +498,16 @@ class Input(Cfg):
             is_static=None,
             is_shared=None,
             update_hooks=None,
-            input_layer_argument=None, ):
+            input_layer_argument=None,
+            make_layer_name_in_submodel=True, ):
+        """
+        @param make_layer_name_in_submodel True by default, you might need to
+        set it carefully when adding Input in config_parser.py.
+        """
         self.add_keys(locals())
-        self.input_layer_name = MakeLayerNameInSubmodel(input_layer_name)
+        self.input_layer_name = MakeLayerNameInSubmodel(
+            input_layer_name
+        ) if make_layer_name_in_submodel else input_layer_name
 
 
 # Define a projection for iexed layer
@@ -1240,7 +1247,8 @@ def Evaluator(
         dict_file=None,
         result_file=None,
         num_results=None,
-        delimited=None, ):
+        delimited=None,
+        excluded_chunk_types=None, ):
     evaluator = g_config.model_config.evaluators.add()
     evaluator.type = type
     evaluator.name = MakeLayerNameInSubmodel(name)
@@ -1269,6 +1277,9 @@ def Evaluator(
     if delimited is not None:
         evaluator.delimited = delimited
 
+    if excluded_chunk_types:
+        evaluator.excluded_chunk_types.extend(excluded_chunk_types)
+
 
 class LayerBase(object):
     def __init__(
@@ -1578,6 +1589,27 @@ class PrintLayer(LayerBase):
         super(PrintLayer, self).__init__(name, 'print', 0, inputs)
 
 
+@config_layer('priorbox')
+class PriorBoxLayer(LayerBase):
+    def __init__(self, name, inputs, size, min_size, max_size, aspect_ratio,
+                 variance):
+        super(PriorBoxLayer, self).__init__(name, 'priorbox', 0, inputs)
+        config_assert(len(inputs) == 2, 'PriorBoxLayer must have 2 inputs')
+        input_layer = self.get_input_layer(1)
+        config_assert(
+            input_layer.type == 'data',
+            'Expecting the second input layer of an priorbox layer to be '
+            'a data layer')
+        config_assert(input_layer.width > 0, 'The data layer must set width')
+        config_assert(input_layer.height > 0, 'The data layer must set height')
+        config_assert(len(variance) == 4, 'The variance must have 4 inputs')
+        self.config.inputs[0].priorbox_conf.min_size.extend(min_size)
+        self.config.inputs[0].priorbox_conf.max_size.extend(max_size)
+        self.config.inputs[0].priorbox_conf.aspect_ratio.extend(aspect_ratio)
+        self.config.inputs[0].priorbox_conf.variance.extend(variance)
+        self.config.size = size
+
+
 @config_layer('data')
 class DataLayer(LayerBase):
     def __init__(self, name, size, height=None, width=None, device=None):
@@ -1844,7 +1876,8 @@ class BatchNormLayer(LayerBase):
                     initial_std=0.0,
                     initial_mean=0.0,
                     is_static=True,
-                    is_shared=is_shared, ))
+                    is_shared=is_shared,
+                    make_layer_name_in_submodel=False, ))
 
         parallel_nn = bool(int(g_command_config_args.get("parallel_nn", 0)))
         cudnn_version = int(g_command_config_args.get("cudnn_version", 0))
@@ -1871,8 +1904,14 @@ class BatchNormLayer(LayerBase):
         input_layer = self.get_input_layer(0)
         image_conf = self.config.inputs[0].image_conf
         parse_image(self.inputs[0].image, input_layer.name, image_conf)
-        self.set_cnn_layer(name, image_conf.img_size_y, image_conf.img_size,
-                           image_conf.channels, False)
+
+        # Only pass the width and height of input to batch_norm layer
+        # when either of them is non-zero.
+        if input_layer.width != 0 or input_layer.height != 0:
+            self.set_cnn_layer(name, image_conf.img_size_y, image_conf.img_size,
+                               image_conf.channels, False)
+        else:
+            self.set_layer_size(input_layer.size)
 
         psize = self.calc_parameter_size(image_conf)
         dims = [1, psize]
@@ -3377,8 +3416,35 @@ def register_parse_config_hook(f):
     _parse_config_hooks.add(f)
 
 
-def parse_config(config_file, config_arg_str):
+def update_g_config():
+    '''
+    Copy every non-None entry of `settings` into g_config.opt_config and of
+    `trainer_settings` into g_config, check that the declared input/output
+    layer names exist in g_layer_map (inputs must be "data"/"data_trim"
+    layers), and return g_config. Call after the config has been executed.
+    '''
+    for k, v in settings.iteritems():
+        if v is not None:
+            g_config.opt_config.__setattr__(k, v)
+    for k, v in trainer_settings.iteritems():
+        if v is not None:
+            g_config.__setattr__(k, v)
+
+    for name in g_config.model_config.input_layer_names:
+        assert name in g_layer_map, \
+            'input name "%s" does not correspond to a layer name' % name
+        assert g_layer_map[name].type in ("data", "data_trim"), \
+            'The type of input layer "%s" is not "data"' % name
+    for name in g_config.model_config.output_layer_names:
+        assert name in g_layer_map, \
+            'output name "%s" does not correspond to a layer name' % name
+    return g_config
+
+
+def parse_config(trainer_config, config_arg_str):
     '''
+    @param trainer_config: either the file name of a config file or a
+    callable (function) that contains the config logic
     @param config_arg_str: a string of the form var1=val1,var2=val2. It will be
     passed to config script as a dictionary CONFIG_ARGS
     '''
@@ -3412,45 +3478,20 @@ def parse_config(config_file, config_arg_str):
     g_root_submodel.is_recurrent_layer_group = False
     g_current_submodel = g_root_submodel
 
-    # for paddle on spark, need support non-file config.
-    # you can use parse_config like below:
-    #
-    # from paddle.trainer.config_parser import parse_config
-    # def configs():
-    #    #your paddle config code, which is same as config file.
-    #
-    # config = parse_config(configs, "is_predict=1")
-    # # then you get config proto object.
-    if hasattr(config_file, '__call__'):
-        config_file.func_globals.update(
+    if hasattr(trainer_config, '__call__'):
+        trainer_config.func_globals.update(
             make_config_environment("", config_args))
-        config_file()
+        trainer_config()
     else:
-        execfile(config_file, make_config_environment(config_file, config_args))
-    for k, v in settings.iteritems():
-        if v is None:
-            continue
-        g_config.opt_config.__setattr__(k, v)
+        execfile(trainer_config,
+                 make_config_environment(trainer_config, config_args))
 
-    for k, v in trainer_settings.iteritems():
-        if v is None:
-            continue
-        g_config.__setattr__(k, v)
-
-    for name in g_config.model_config.input_layer_names:
-        assert name in g_layer_map, \
-            'input name "%s" does not correspond to a layer name' % name
-        assert (g_layer_map[name].type == "data" or g_layer_map[name].type == "data_trim"), \
-            'The type of input layer "%s" is not "data"' % name
-    for name in g_config.model_config.output_layer_names:
-        assert name in g_layer_map, \
-            'input name "%s" does not correspond to a layer name' % name
-    return g_config
+    return update_g_config()
 
 
-def parse_config_and_serialize(config_file, config_arg_str):
+def parse_config_and_serialize(trainer_config, config_arg_str):
     try:
-        config = parse_config(config_file, config_arg_str)
+        config = parse_config(trainer_config, config_arg_str)
         #logger.info(config)
         return config.SerializeToString()
     except:
diff --git a/python/paddle/trainer_config_helpers/__init__.py b/python/paddle/trainer_config_helpers/__init__.py
index 3ac1454934..13155ebddb 100644
--- a/python/paddle/trainer_config_helpers/__init__.py
+++ b/python/paddle/trainer_config_helpers/__init__.py
@@ -20,6 +20,6 @@ from layers import *
 from networks import *
 from optimizers import *
 from attrs import *
-
+from config_parser_utils import *
 # This will enable operator overload for LayerOutput
-import math
+import layer_math
diff --git a/python/paddle/trainer_config_helpers/attrs.py b/python/paddle/trainer_config_helpers/attrs.py
index 59bb18bfca..bf02088346 100644
--- a/python/paddle/trainer_config_helpers/attrs.py
+++ b/python/paddle/trainer_config_helpers/attrs.py
@@ -19,34 +19,34 @@ __all__ = [
 
 
 def convert_and_compare(x, Type):
-    """                                                                                                                                                                                                
-    Convert x to be the same type as Type and then convert back to                                                                                                                                      
-    check whether there is a loss of information                                                                                                                                                        
-    :param x: object to be checked                                                                                                                                                                      
-    :param Type: target type to check x over                                                                                                                                                           
-    
+    """
+    Convert x to be the same type as Type and then convert back to
+    check whether there is a loss of information
+    :param x: object to be checked
+    :param Type: target type to check x over
+
     """
     return type(x)(Type(x)) == x
 
 
 def is_compatible_with(x, Type):
-    """                                                                                                                                                                                                
-    Check if x has a type compatible with Type                                                                                                                                                         
-    :param x: object to be checked                                                                                                                                                                     
-    :param Type: target type to check x over                                                                                                                                                           
-    
+    """
+    Check if x has a type compatible with Type
+    :param x: object to be checked
+    :param Type: target type to check x over
+
     """
     if type(x) == Type:
         return True
     try:
         if float == Type or int == Type:
-            # avoid those types that can be converted to float/int but not very                                                                                                                            
-            # meaningful and  could potentially lead to error                                                                                                                                              
-            # i.e., str and bool typed value should not be used for initializing float/int variable                                                                                                        
+            # avoid those types that can be converted to float/int but not very
+            # meaningful and  could potentially lead to error
+            # i.e., str and bool typed value should not be used for initializing float/int variable
             if not isinstance(x, str) and not isinstance(x, bool):
                 return convert_and_compare(x, Type)
         elif bool == Type:
-            # should not use string type to initialize bool variable                                                                                                                                   
+            # should not use string type to initialize bool variable
             if not isinstance(x, str):
                 return convert_and_compare(x, Type)
         else:
@@ -88,6 +88,10 @@ class ParameterAttribute(object):
     :type learning_rate: float or None
     :param momentum: The parameter momentum. None means use global value.
     :type momentum: float or None
+    :param gradient_clipping_threshold: gradient clipping threshold. If a
+                                        gradient value is larger than this
+                                        threshold, it will be clipped.
+    :type gradient_clipping_threshold: float
     :param sparse_update: Enable sparse update for this parameter. It will
                           enable both local and remote sparse update.
     :type sparse_update: bool
@@ -104,6 +108,7 @@ class ParameterAttribute(object):
                  l2_rate=None,
                  learning_rate=None,
                  momentum=None,
+                 gradient_clipping_threshold=None,
                  sparse_update=False):
         # initialize strategy.
         if is_static:
@@ -152,6 +157,11 @@ class ParameterAttribute(object):
             self.attr['sparse_update'] = True
             self.attr['sparse_remote_update'] = True
 
+        if gradient_clipping_threshold is not None and \
+                is_compatible_with(gradient_clipping_threshold, float):
+            self.attr['gradient_clipping_threshold'] = \
+                gradient_clipping_threshold
+
     def set_default_parameter_name(self, name):
         """
         Set default parameter name. If parameter not set, then will use default
diff --git a/python/paddle/trainer_config_helpers/config_parser.py b/python/paddle/trainer_config_helpers/config_parser.py
new file mode 100644
index 0000000000..4b91b8d282
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/config_parser.py
@@ -0,0 +1,38 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.trainer.config_parser as config_parser
+'''
+This file is a wrapper of formal config_parser. The main idea of this file is to
+separate different config logic into different functions, such as network
+configuration and optimizer configuration.
+'''
+
+__all__ = [
+    "parse_trainer_config", "parse_network_config", "parse_optimizer_config"
+]
+
+
+def parse_trainer_config(trainer_conf, config_arg_str):
+    return config_parser.parse_config(trainer_conf, config_arg_str)
+
+
+def parse_network_config(network_conf):
+    config = config_parser.parse_config(network_conf, '')
+    return config.model_config
+
+
+def parse_optimizer_config(optimizer_conf):
+    config = config_parser.parse_config(optimizer_conf, '')
+    return config.opt_config
diff --git a/python/paddle/trainer_config_helpers/config_parser_utils.py b/python/paddle/trainer_config_helpers/config_parser_utils.py
new file mode 100644
index 0000000000..681b177a55
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/config_parser_utils.py
@@ -0,0 +1,38 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.trainer.config_parser as config_parser
+'''
+This file is a wrapper of formal config_parser. The main idea of this file is to
+separate different config logic into different functions, such as network
+configuration and optimizer configuration.
+'''
+
+__all__ = [
+    "parse_trainer_config", "parse_network_config", "parse_optimizer_config"
+]
+
+
+def parse_trainer_config(trainer_conf, config_arg_str):
+    return config_parser.parse_config(trainer_conf, config_arg_str)
+
+
+def parse_network_config(network_conf, config_arg_str=''):
+    config = config_parser.parse_config(network_conf, config_arg_str)
+    return config.model_config
+
+
+def parse_optimizer_config(optimizer_conf, config_arg_str=''):
+    config = config_parser.parse_config(optimizer_conf, config_arg_str)
+    return config.opt_config
diff --git a/python/paddle/trainer_config_helpers/data_sources.py b/python/paddle/trainer_config_helpers/data_sources.py
index c62553f54c..622b4fc25c 100644
--- a/python/paddle/trainer_config_helpers/data_sources.py
+++ b/python/paddle/trainer_config_helpers/data_sources.py
@@ -69,7 +69,7 @@ def define_py_data_source(file_list,
     """
     if isinstance(file_list, list):
         file_list_name = 'train.list'
-        if isinstance(cls, TestData):
+        if cls == TestData:
             file_list_name = 'test.list'
         with open(file_list_name, 'w') as f:
             f.writelines(file_list)
@@ -78,21 +78,6 @@ def define_py_data_source(file_list,
     if not isinstance(args, basestring) and args is not None:
         args = pickle.dumps(args, 0)
 
-    if data_cls is None:
-
-        def py_data2(files, load_data_module, load_data_object, load_data_args,
-                     **kwargs):
-            data = DataBase()
-            data.type = 'py2'
-            data.files = files
-            data.load_data_module = load_data_module
-            data.load_data_object = load_data_object
-            data.load_data_args = load_data_args
-            data.async_load_data = True
-            return data
-
-        data_cls = py_data2
-
     cls(
         data_cls(
             files=file_list,
@@ -186,7 +171,7 @@ def define_py_data_sources2(train_list, test_list, module, obj, args=None):
                                 obj="process", 
                                 args={"dictionary": dict_name})
 
-    The related data provider can refer to :ref:`api_pydataprovider2_en_sequential_model` .
+    The related data provider can refer to :ref:`api_pydataprovider2_sequential_model` .
 
     :param train_list: Train list name.
     :type train_list: basestring
@@ -207,10 +192,22 @@ def define_py_data_sources2(train_list, test_list, module, obj, args=None):
     :return: None
     :rtype: None
     """
+
+    def py_data2(files, load_data_module, load_data_object, load_data_args,
+                 **kwargs):
+        data = DataBase()
+        data.type = 'py2'
+        data.files = files
+        data.load_data_module = load_data_module
+        data.load_data_object = load_data_object
+        data.load_data_args = load_data_args
+        data.async_load_data = True
+        return data
+
     define_py_data_sources(
         train_list=train_list,
         test_list=test_list,
         module=module,
         obj=obj,
         args=args,
-        data_cls=None)
+        data_cls=py_data2)
diff --git a/python/paddle/trainer_config_helpers/evaluators.py b/python/paddle/trainer_config_helpers/evaluators.py
index 0ee116d8c4..bd247ea9af 100644
--- a/python/paddle/trainer_config_helpers/evaluators.py
+++ b/python/paddle/trainer_config_helpers/evaluators.py
@@ -57,19 +57,21 @@ def evaluator(*attrs):
     return impl
 
 
-def evaluator_base(input,
-                   type,
-                   label=None,
-                   weight=None,
-                   name=None,
-                   chunk_scheme=None,
-                   num_chunk_types=None,
-                   classification_threshold=None,
-                   positive_label=None,
-                   dict_file=None,
-                   result_file=None,
-                   num_results=None,
-                   delimited=None):
+def evaluator_base(
+        input,
+        type,
+        label=None,
+        weight=None,
+        name=None,
+        chunk_scheme=None,
+        num_chunk_types=None,
+        classification_threshold=None,
+        positive_label=None,
+        dict_file=None,
+        result_file=None,
+        num_results=None,
+        delimited=None,
+        excluded_chunk_types=None, ):
     """
     Evaluator will evaluate the network status while training/testing.
 
@@ -127,7 +129,8 @@ def evaluator_base(input,
         positive_label=positive_label,
         dict_file=dict_file,
         result_file=result_file,
-        delimited=delimited)
+        delimited=delimited,
+        excluded_chunk_types=excluded_chunk_types, )
 
 
 @evaluator(EvaluatorAttribute.FOR_CLASSIFICATION)
@@ -327,9 +330,11 @@ def ctc_error_evaluator(
 @wrap_name_default()
 def chunk_evaluator(
         input,
+        label,
+        chunk_scheme,
+        num_chunk_types,
         name=None,
-        chunk_scheme=None,
-        num_chunk_types=None, ):
+        excluded_chunk_types=None, ):
     """
     Chunk evaluator is used to evaluate segment labelling accuracy for a
     sequence. It calculates the chunk detection F1 score.
@@ -363,24 +368,29 @@ def chunk_evaluator(
 
     .. code-block:: python
 
-       eval = chunk_evaluator(input)
+       eval = chunk_evaluator(input, label, chunk_scheme, num_chunk_types)
 
     :param input: The input layers.
     :type input: LayerOutput
-    :param name: The Evaluator name, it is not necessary.
-    :type name: basename|None
+    :param label: An input layer containing the ground truth label.
+    :type label: LayerOutput
     :param chunk_scheme: The labelling schemes support 4 types. It is one of
-                         "IOB", "IOE", "IOBES", "plain".This Evaluator must
-                         contain this chunk_scheme.
+                         "IOB", "IOE", "IOBES", "plain". It is required.
     :type chunk_scheme: basestring
     :param num_chunk_types: number of chunk types other than "other"
+    :param name: The Evaluator name, it is optional.
+    :type name: basestring|None
+    :param excluded_chunk_types: chunks of these types are not considered
+    :type excluded_chunk_types: list of integer|None
     """
     evaluator_base(
         name=name,
         type="chunk",
         input=input,
+        label=label,
         chunk_scheme=chunk_scheme,
-        num_chunk_types=num_chunk_types)
+        num_chunk_types=num_chunk_types,
+        excluded_chunk_types=excluded_chunk_types, )
 
 
 @evaluator(EvaluatorAttribute.FOR_UTILS)
diff --git a/python/paddle/trainer_config_helpers/math.py b/python/paddle/trainer_config_helpers/layer_math.py
similarity index 100%
rename from python/paddle/trainer_config_helpers/math.py
rename to python/paddle/trainer_config_helpers/layer_math.py
diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py
index 8dd6b7b7d2..9b6e5774bc 100644
--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -106,6 +106,7 @@ __all__ = [
     'maxout_layer',
     'out_prod_layer',
     'print_layer',
+    'priorbox_layer',
     'spp_layer',
 ]
 
@@ -171,6 +172,7 @@ class LayerType(object):
     SPP_LAYER = "spp"
 
     PRINT_LAYER = "print"
+    PRIORBOX_LAYER = "priorbox"
 
     CTC_LAYER = "ctc"
     WARP_CTC_LAYER = "warp_ctc"
@@ -934,6 +936,52 @@ def print_layer(input, name=None):
     # this layer don't return anything, can not be input of other layer.
 
 
+@wrap_name_default("priorbox")
+def priorbox_layer(input,
+                   image,
+                   aspect_ratio,
+                   variance,
+                   min_size,
+                   max_size=[],
+                   name=None):
+    """
+    Compute the priorbox and set the variance. This layer is required by SSD.
+
+    :param name: The Layer Name.
+    :type name: basestring
+    :param input: The input layer.
+    :type input: LayerOutput
+    :param image: The network input image.
+    :type image: LayerOutput
+    :param aspect_ratio: The aspect ratio.
+    :type aspect_ratio: list
+    :param variance: The bounding box variance.
+    :param min_size: The min size of the priorbox width/height.
+    :type min_size: list
+    :param max_size: The max size of the priorbox width/height. May be empty.
+    :type max_size: list
+    :return: LayerOutput
+    """
+    # plus one for ratio 1.
+    num_filters = (len(aspect_ratio) * 2 + 1 + len(max_size)) * 4
+    size = (input.size / input.num_filters) * num_filters * 2
+    Layer(
+        name=name,
+        type=LayerType.PRIORBOX_LAYER,
+        inputs=[input.name, image.name],
+        size=size,
+        min_size=min_size,
+        max_size=max_size,
+        aspect_ratio=aspect_ratio,
+        variance=variance)
+    return LayerOutput(
+        name,
+        LayerType.PRIORBOX_LAYER,
+        parents=[input, image],
+        num_filters=num_filters,
+        size=size)
+
+
 @wrap_name_default("seq_pooling")
 @wrap_bias_attr_default(has_bias=False)
 @wrap_param_default(['pooling_type'], default_factory=lambda _: MaxPooling())
@@ -970,7 +1018,7 @@ def pooling_layer(input,
     :param layer_attr: The Extra Attributes for layer, such as dropout.
     :type layer_attr: ExtraLayerAttribute|None
     :return: LayerOutput object.
-    :rtype: LayerType
+    :rtype: LayerOutput
     """
     extra_dict = dict()
     # noinspection PyUnresolvedReferences
@@ -1776,15 +1824,15 @@ def img_conv_layer(input,
                    trans=False,
                    layer_type=None):
     """
-    Convolution layer for image. Paddle only support square input currently and
-    thus input image's width equals height.
+    Convolution layer for image. Paddle currently supports both square and
+    non-square input.
 
     The details of convolution layer, please refer UFLDL's `convolution
     <http://ufldl.stanford.edu/tutorial/supervised/
     FeatureExtractionUsingConvolution/>`_ .
 
-    Convolution Transpose (deconv) layer for image. Paddle only support square
-    input currently and thus input image's width equals height.
+    Convolution Transpose (deconv) layer for image. Paddle currently supports
+    both square and non-square input.
 
     The details of convolution transpose layer,
     please refer to the following explanation and references therein
diff --git a/python/paddle/trainer_config_helpers/optimizers.py b/python/paddle/trainer_config_helpers/optimizers.py
index d95b2cfe46..a53ebe160b 100644
--- a/python/paddle/trainer_config_helpers/optimizers.py
+++ b/python/paddle/trainer_config_helpers/optimizers.py
@@ -361,9 +361,6 @@ def settings(batch_size,
              learning_rate_decay_b=0.,
              learning_rate_schedule='poly',
              learning_rate_args='',
-             average_window=0,
-             do_average_in_cpu=False,
-             max_average_window=None,
              learning_method=None,
              regularization=None,
              is_async=False,
@@ -411,8 +408,7 @@ def settings(batch_size,
 
     args = [
         'batch_size', 'learning_rate', 'learning_rate_decay_a',
-        'learning_rate_decay_b', 'learning_rate_schedule', 'learning_rate_args',
-        'average_window', 'do_average_in_cpu', 'max_average_window'
+        'learning_rate_decay_b', 'learning_rate_schedule', 'learning_rate_args'
     ]
     kwargs = dict()
     kwargs['algorithm'] = algorithm
diff --git a/python/paddle/trainer_config_helpers/tests/CMakeLists.txt b/python/paddle/trainer_config_helpers/tests/CMakeLists.txt
index d1a9843d32..403aafabe9 100644
--- a/python/paddle/trainer_config_helpers/tests/CMakeLists.txt
+++ b/python/paddle/trainer_config_helpers/tests/CMakeLists.txt
@@ -1,12 +1,12 @@
 #################### test_config_parser #########################
 add_test(NAME layers_test
   COMMAND ${PROJ_ROOT}/paddle/.set_python_path.sh -d ${PROJ_ROOT}/python/
-        python ${PROJ_ROOT}/python/paddle/trainer_config_helpers/tests/layers_test.py
+        ${PYTHON_EXECUTABLE} ${PROJ_ROOT}/python/paddle/trainer_config_helpers/tests/layers_test.py
     WORKING_DIRECTORY ${PROJ_ROOT}/python/paddle)
 
 add_test(NAME test_reset_hook
   COMMAND ${PROJ_ROOT}/paddle/.set_python_path.sh -d ${PROJ_ROOT}/python/
-        python ${PROJ_ROOT}/python/paddle/trainer_config_helpers/tests/test_reset_hook.py
+        ${PYTHON_EXECUTABLE} ${PROJ_ROOT}/python/paddle/trainer_config_helpers/tests/test_reset_hook.py
     WORKING_DIRECTORY ${PROJ_ROOT}/python/paddle)
 
 if (PROTOBUF_3)
@@ -14,12 +14,12 @@ if (PROTOBUF_3)
     ProtobufEqualMain.cpp)
   add_test(NAME test_layerHelpers
     COMMAND
-    ${PROJ_ROOT}/python/paddle/trainer_config_helpers/tests/configs/run_tests.sh
+    ${PROJ_ROOT}/python/paddle/trainer_config_helpers/tests/configs/run_tests.sh ${PYTHON_EXECUTABLE}
     ${CMAKE_CURRENT_BINARY_DIR}/protobuf_equal
   )
 else()
   add_test(NAME test_layerHelpers
     COMMAND
-    ${PROJ_ROOT}/python/paddle/trainer_config_helpers/tests/configs/run_tests.sh
+    ${PROJ_ROOT}/python/paddle/trainer_config_helpers/tests/configs/run_tests.sh ${PYTHON_EXECUTABLE}
   )
 endif()
diff --git a/python/paddle/trainer_config_helpers/tests/configs/generate_protostr.sh b/python/paddle/trainer_config_helpers/tests/configs/generate_protostr.sh
index a54af94ce3..ee5961af75 100755
--- a/python/paddle/trainer_config_helpers/tests/configs/generate_protostr.sh
+++ b/python/paddle/trainer_config_helpers/tests/configs/generate_protostr.sh
@@ -10,13 +10,13 @@ protostr=$PWD/protostr
 for conf in ${configs[*]}
 do
     echo "Generating " $conf
-    python -m paddle.utils.dump_config $conf.py > $protostr/$conf.protostr.unittest
-    cat ${conf}.py |python test_config_parser_for_non_file_config.py > $protostr/$conf.protostr.non_file_config.unittest
+    $1 -m paddle.utils.dump_config $conf.py > $protostr/$conf.protostr.unittest
+    cat ${conf}.py |$1 test_config_parser_for_non_file_config.py > $protostr/$conf.protostr.non_file_config.unittest
 done
 
 for conf in ${whole_configs[*]}
 do
     echo "Generating " $conf
-    python -m paddle.utils.dump_config $conf.py "" --whole > $protostr/$conf.protostr.unittest
-    cat ${conf}.py |python test_config_parser_for_non_file_config.py --whole > $protostr/$conf.protostr.non_file_config.unittest
+    $1 -m paddle.utils.dump_config $conf.py "" --whole > $protostr/$conf.protostr.unittest
+    cat ${conf}.py |$1 test_config_parser_for_non_file_config.py --whole > $protostr/$conf.protostr.non_file_config.unittest
 done
diff --git a/python/paddle/trainer_config_helpers/tests/configs/math_ops.py b/python/paddle/trainer_config_helpers/tests/configs/math_ops.py
index c4c6d4020f..3331c10d64 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/math_ops.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/math_ops.py
@@ -1,15 +1,14 @@
 from paddle.trainer_config_helpers import *
-from paddle.trainer_config_helpers import math
 
 settings(batch_size=1000, learning_rate=1e-5)
 
 x = data_layer(name='data', size=100)
-x = math.exp(x)
-x = math.log(x)
-x = math.abs(x)
-x = math.sigmoid(x)
-x = math.square(x)
-x = math.square(x)
+x = layer_math.exp(x)
+x = layer_math.log(x)
+x = layer_math.abs(x)
+x = layer_math.sigmoid(x)
+x = layer_math.square(x)
+x = layer_math.square(x)
 y = 1 + x
 y = y + 1
 y = x + y
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/img_trans_layers.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/img_trans_layers.protostr
index cd310bd13b..6934fd0da6 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/img_trans_layers.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/img_trans_layers.protostr
@@ -58,8 +58,6 @@ layers {
   }
   bias_parameter_name: "___batch_norm_0__.wbias"
   moving_average_fraction: 0.9
-  height: 256
-  width: 256
 }
 layers {
   name: "__crmnorm_0__"
diff --git a/python/paddle/trainer_config_helpers/tests/configs/run_tests.sh b/python/paddle/trainer_config_helpers/tests/configs/run_tests.sh
index e984ee7062..a37eb6439e 100755
--- a/python/paddle/trainer_config_helpers/tests/configs/run_tests.sh
+++ b/python/paddle/trainer_config_helpers/tests/configs/run_tests.sh
@@ -7,7 +7,7 @@ protostr=`dirname $0`/protostr
 
 files=`ls $protostr | grep -v "unittest"`
 
-./generate_protostr.sh
+./generate_protostr.sh $1
 
 . ./file_list.sh
 
diff --git a/python/setup.py.in b/python/setup.py.in
index d2fb95f27f..b66a42e87c 100644
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -1,16 +1,11 @@
 from setuptools import setup
 
-INTERNAL_PACKAGE='${PADDLE_INTERNAL_PACKAGE}'
-
 packages=['paddle',
           'paddle.proto',
           'paddle.trainer',
           'paddle.trainer_config_helpers',
           'paddle.utils']
 
-if len(INTERNAL_PACKAGE) != 0:
-    packages.append(INTERNAL_PACKAGE)
-
 setup(name='paddle',
       version='${PADDLE_VERSION}',
       description='Parallel Distributed Deep Learning',
diff --git a/third_party/gtest.BUILD b/third_party/gtest.BUILD
deleted file mode 100644
index 71c74af513..0000000000
--- a/third_party/gtest.BUILD
+++ /dev/null
@@ -1,8 +0,0 @@
-cc_library(
-    name="main",
-    srcs=glob(
-        ["src/*.cc"], exclude=["src/gtest-all.cc"]),
-    hdrs=glob(["include/**/*.h", "src/*.h"]),
-    copts=["-Iexternal/gtest/include"],
-    linkopts=["-pthread"],
-    visibility=["//visibility:public"], )
diff --git a/third_party/protobuf_test/BUILD b/third_party/protobuf_test/BUILD
deleted file mode 100644
index 95a687a356..0000000000
--- a/third_party/protobuf_test/BUILD
+++ /dev/null
@@ -1,24 +0,0 @@
-licenses(["notice"])  # Apache 2.0
-
-load("@protobuf//:protobuf.bzl", "cc_proto_library")
-
-cc_proto_library(
-    name="example_proto",
-    srcs=["example.proto"],
-    protoc="@protobuf//:protoc",
-    default_runtime="@protobuf//:protobuf", )
-
-cc_library(
-    name="example_lib",
-    srcs=["example_lib.cc"],
-    hdrs=["example_lib.h"],
-    deps=[":example_proto"], )
-
-cc_test(
-    name="example_lib_test",
-    srcs=["example_lib_test.cc"],
-    copts=["-Iexternal/gtest/include"],
-    deps=[
-        "@gtest//:main",
-        ":example_lib",
-    ], )
diff --git a/third_party/protobuf_test/README.md b/third_party/protobuf_test/README.md
deleted file mode 100644
index e8bdeee6fe..0000000000
--- a/third_party/protobuf_test/README.md
+++ /dev/null
@@ -1 +0,0 @@
-This package tests that Bazel can build protobuf related rules.
diff --git a/third_party/protobuf_test/example.proto b/third_party/protobuf_test/example.proto
deleted file mode 100644
index 6a7eada9c1..0000000000
--- a/third_party/protobuf_test/example.proto
+++ /dev/null
@@ -1,7 +0,0 @@
-syntax = "proto3";
-
-package third_party.protobuf_test;
-
-message Greeting {
-  string name = 1;
-}
diff --git a/third_party/protobuf_test/example_lib.cc b/third_party/protobuf_test/example_lib.cc
deleted file mode 100644
index ced377bc0a..0000000000
--- a/third_party/protobuf_test/example_lib.cc
+++ /dev/null
@@ -1,9 +0,0 @@
-#include "third_party/protobuf_test/example_lib.h"
-
-namespace third_party {
-namespace protobuf_test {
-
-std::string get_greet(const Greeting& who) { return "Hello " + who.name(); }
-
-}  // namespace protobuf_test
-}  // namespace thrid_party
diff --git a/third_party/protobuf_test/example_lib.h b/third_party/protobuf_test/example_lib.h
deleted file mode 100644
index 516326e812..0000000000
--- a/third_party/protobuf_test/example_lib.h
+++ /dev/null
@@ -1,13 +0,0 @@
-#pragma once
-
-#include "third_party/protobuf_test/example.pb.h"
-
-#include <string>
-
-namespace third_party {
-namespace protobuf_test {
-
-std::string get_greet(const Greeting &who);
-
-}  // namespace protobuf_test
-}  // namespace third_party
diff --git a/third_party/protobuf_test/example_lib_test.cc b/third_party/protobuf_test/example_lib_test.cc
deleted file mode 100644
index 6229f56e60..0000000000
--- a/third_party/protobuf_test/example_lib_test.cc
+++ /dev/null
@@ -1,15 +0,0 @@
-#include "third_party/protobuf_test/example_lib.h"
-
-#include "gtest/gtest.h"
-
-namespace third_party {
-namespace protobuf_test {
-
-TEST(ProtobufTest, GetGreet) {
-  Greeting g;
-  g.set_name("Paddle");
-  EXPECT_EQ("Hello Paddle", get_greet(g));
-}
-
-}  // namespace protobuf_test
-}  // namespace third_party
diff --git a/warp-ctc b/warp-ctc
deleted file mode 160000
index bd535c8d44..0000000000
--- a/warp-ctc
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit bd535c8d44e03c8ebd2d768e06c8c05fdccd11d2