diff --git a/.gitignore b/.gitignore
index 1c9730a5ad..6aae076a49 100644
--- a/.gitignore
+++ b/.gitignore
@@ -9,6 +9,8 @@ build/
 .pydevproject
 Makefile
 .test_env/
+third_party/
 
 *~
 bazel-*
+third_party/
diff --git a/.gitmodules b/.gitmodules
deleted file mode 100644
index f635e65784..0000000000
--- a/.gitmodules
+++ /dev/null
@@ -1,3 +0,0 @@
-[submodule "warp-ctc"]
-	path = warp-ctc
-	url = https://github.com/baidu-research/warp-ctc.git
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index b9902a863d..a6e45028eb 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -2,7 +2,7 @@
     sha: c25201a00e6b0514370501050cf2a8538ac12270
     hooks:
     -   id: remove-crlf
-        files: (?!.*warp-ctc)^.*$
+        files: (?!.*third_party)^.*$
 -   repo: https://github.com/reyoung/mirrors-yapf.git
     sha: v0.13.2
     hooks:
@@ -15,7 +15,7 @@
     -   id: check-merge-conflict
     -   id: check-symlinks
     -   id: detect-private-key
-        files: (?!.*warp-ctc)^.*$
+        files: (?!.*third_party)^.*$
     -   id: end-of-file-fixer
 -   repo: https://github.com/PaddlePaddle/clang-format-pre-commit-hook.git
     sha: 28c0ea8a67a3e2dbbf4822ef44e85b63a0080a29
diff --git a/.travis.yml b/.travis.yml
index 047ca6ffe7..eecf5e81f0 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -21,23 +21,17 @@ addons:
     packages:
       - gcc-4.8
       - g++-4.8
-      - wget
       - git
       - build-essential
       - libatlas-base-dev
       - python
       - python-pip
       - python2.7-dev
-      - m4
       - python-numpy
       - python-wheel
-      - libgoogle-glog-dev
-      - libgflags-dev
-      - libgtest-dev
       - curl
-      - lcov
-      - graphviz
       - swig
+      - graphviz
       - clang-format-3.8
       - automake
       - libtool
@@ -53,10 +47,9 @@ before_install:
         fi
       fi
     fi
-  - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then sudo paddle/scripts/travis/before_install.linux.sh; fi
   - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then paddle/scripts/travis/before_install.osx.sh; fi
   - if [[ "$JOB" == "PRE_COMMIT" ]]; then sudo ln -s /usr/bin/clang-format-3.8 /usr/bin/clang-format; fi
-  - pip install wheel protobuf sphinx recommonmark virtualenv numpy sphinx_rtd_theme pre-commit requests==2.9.2 LinkChecker
+  - pip install numpy wheel protobuf sphinx recommonmark sphinx_rtd_theme virtualenv pre-commit requests==2.9.2 LinkChecker
 script:
   - paddle/scripts/travis/main.sh
 notifications:
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 65fbbb481c..abe7b5228c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,166 +1,89 @@
-cmake_minimum_required(VERSION 2.8)
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+# http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License
+
+cmake_minimum_required(VERSION 3.0)
 
 project(paddle CXX C)
 
 set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake")
 set(PROJ_ROOT ${CMAKE_SOURCE_DIR})
-include(package)
-find_package(SWIG 2.0)
-find_package(CUDA QUIET)
-find_package(Protobuf REQUIRED)
-
-# Check protobuf library version.
-execute_process(COMMAND ${PROTOBUF_PROTOC_EXECUTABLE} --version
-    OUTPUT_VARIABLE PROTOBUF_VERSION)
-string(REPLACE "libprotoc " "" PROTOBUF_VERSION ${PROTOBUF_VERSION})
-
-set(PROTOBUF_3 OFF)
-if (${PROTOBUF_VERSION} VERSION_GREATER "3.0.0" OR ${PROTOBUF_VERSION} VERSION_EQUAL "3.0.0")
-    set(PROTOBUF_3 ON)
-endif()
 
-find_package(PythonLibs 2.7 REQUIRED)
-find_package(PythonInterp 2.7 REQUIRED)
-find_package(ZLIB REQUIRED)
-find_package(NumPy REQUIRED)
-find_package(Threads REQUIRED)
-find_package(AVX QUIET)
-find_package(Glog REQUIRED)
-find_package(Gflags REQUIRED)
-find_package(GTest)
 find_package(Sphinx)
-find_package(Doxygen)
-include(cblas)
-find_program(M4_EXECUTABLE m4)
-###################### Configurations ###########################
-option(WITH_DSO "Compile PaddlePaddle with dynamic linked libraries" ON)
-option(WITH_GPU "Compile PaddlePaddle with gpu" ${CUDA_FOUND})
-option(WITH_DOUBLE "Compile PaddlePaddle with double precision, otherwise use single precision" OFF)
-option(WITH_AVX "Compile PaddlePaddle with avx intrinsics" ${AVX_FOUND})
-option(WITH_PYTHON "Compile PaddlePaddle with python interpreter" ON)
-option(WITH_STYLE_CHECK "Style Check for PaddlePaddle" ${PYTHONINTERP_FOUND})
-option(WITH_RDMA "Compile PaddlePaddle with rdma support" OFF)
-option(WITH_TIMER "Compile PaddlePaddle use timer" OFF)
-option(WITH_PROFILER "Compile PaddlePaddle use gpu profiler" OFF)
-option(WITH_TESTING "Compile and run unittest for PaddlePaddle" ${GTEST_FOUND})
-option(WITH_DOC "Compile PaddlePaddle with documentation" OFF)
-option(WITH_SWIG_PY "Compile PaddlePaddle with py PaddlePaddle prediction api" ${SWIG_FOUND})
-option(ON_TRAVIS "Running test on travis-ci or not." OFF)
-option(ON_COVERALLS "Generating code coverage data on coveralls or not." OFF)
-option(COVERALLS_UPLOAD "Uploading the generated coveralls json." ON)
-
-
-include(cpplint)
-include(ccache)
-if(WITH_RDMA)
-  include(rdma)
-endif()
-include(util)
-include(flags)
-include(cudnn)
-include(FindPythonModule)
-include(check_packages)
-include(swig)
-include(coveralls)
-
-# Set PaddlePaddle version to Git tag name or Git commit ID.
+find_package(CUDA QUIET)
 find_package(Git REQUIRED)
-# version.cmake will get the current PADDLE_VERSION
-include(version)
-add_definitions(-DPADDLE_VERSION=${PADDLE_VERSION})
-
-if(NOT WITH_GPU)
-    add_definitions(-DPADDLE_ONLY_CPU)
-    add_definitions(-DHPPL_STUB_FUNC)
-
-    list(APPEND CMAKE_CXX_SOURCE_FILE_EXTENSIONS cu)
-else()
-    if(${CUDA_VERSION_MAJOR} VERSION_LESS 7)
-        message(FATAL_ERROR "Paddle need CUDA >= 7.0 to compile")
-    endif()
-
-    if(NOT CUDNN_FOUND)
-        message(FATAL_ERROR "Paddle need cudnn to compile")
-    endif()
-
-    if(WITH_AVX)
-        set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler ${AVX_FLAG}")
-    else(WITH_AVX)
-        set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler ${SSE3_FLAG}")
-    endif(WITH_AVX)
-
-    # Include cuda and cudnn
-    include_directories(${CUDNN_INCLUDE_DIR})
-    include_directories(${CUDA_TOOLKIT_INCLUDE})
-endif(NOT WITH_GPU)
-
-if(WITH_DSO)
-    add_definitions(-DPADDLE_USE_DSO)
-endif(WITH_DSO)
-
-if(WITH_DOUBLE)
-    add_definitions(-DPADDLE_TYPE_DOUBLE)
-    set(ACCURACY double)
-else(WITH_DOUBLE)
-    set(ACCURACY float)
-endif(WITH_DOUBLE)
-
-if(NOT WITH_TIMER)
-    add_definitions(-DPADDLE_DISABLE_TIMER)
-endif(NOT WITH_TIMER)
-
-if(NOT WITH_PROFILER)
-    add_definitions(-DPADDLE_DISABLE_PROFILER)
-endif(NOT WITH_PROFILER)
-
-if(WITH_AVX)
-    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${AVX_FLAG}")
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${AVX_FLAG}")
-else(WITH_AVX)
-    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${SSE3_FLAG}")
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SSE3_FLAG}")
-endif(WITH_AVX)
-
-if(WITH_PYTHON)
-    include_directories(${PYTHON_INCLUDE_DIR})
-    include_directories(${PYTHON_NUMPY_INCLUDE_DIR})
-else(WITH_PYTHON)
-    add_definitions(-DPADDLE_NO_PYTHON)
-endif(WITH_PYTHON)
-
-if(WITH_RDMA)
-  include_directories("${RDMA_INC_DIR}")
-else(WITH_RDMA)
-  add_definitions(-DPADDLE_DISABLE_RDMA)
-endif(WITH_RDMA)
-
-# glog
-include_directories(${LIBGLOG_INCLUDE_DIR})
-
-#gflags
-add_definitions(-DGFLAGS_NS=${GFLAGS_NAMESPACE})
-include_directories(${GFLAGS_INCLUDE_DIRS})
+find_package(Threads REQUIRED)
 
-if(WITH_TESTING)
-    enable_testing()
-    include_directories(${GTEST_INCLUDE_DIRS})
-endif()
+include(system)
+include(simd)
+
+################################ Configurations #######################################
+option(WITH_GPU         "Compile PaddlePaddle with NVIDIA GPU"          ${CUDA_FOUND})
+option(WITH_AVX         "Compile PaddlePaddle with AVX intrinsics"      ${AVX_FOUND})
+option(WITH_DSO         "Compile PaddlePaddle with dynamic linked CUDA" OFF)
+option(WITH_TESTING     "Compile PaddlePaddle with unit testing"        ON)
+option(WITH_SWIG_PY     "Compile PaddlePaddle with inference api"       ON)
+option(WITH_STYLE_CHECK "Compile PaddlePaddle with style check"         ON)
+option(WITH_PYTHON      "Compile PaddlePaddle with python interpreter"  ON)
+option(WITH_DOUBLE      "Compile PaddlePaddle with double precision"    OFF)
+option(WITH_RDMA        "Compile PaddlePaddle with RDMA support"        OFF)
+option(WITH_TIMER       "Compile PaddlePaddle with stats timer"         OFF)
+option(WITH_PROFILER    "Compile PaddlePaddle with GPU profiler"        OFF)
+option(WITH_DOC         "Compile PaddlePaddle with documentation"       OFF)
+option(ON_COVERALLS     "Compile PaddlePaddle with code coverage"       OFF)
+option(COVERALLS_UPLOAD "Package code coverage data to coveralls"       OFF)
+option(ON_TRAVIS        "Exclude special unit test on Travis CI"        OFF)
+########################################################################################
+
+include(external/zlib)      # download, build, install zlib
+include(external/gflags)    # download, build, install gflags
+include(external/glog)      # download, build, install glog
+include(external/gtest)     # download, build, install gtest
+include(external/protobuf)  # download, build, install protobuf
+include(external/python)    # download, build, install python
+include(external/openblas)  # download, build, install openblas
+include(external/swig)      # download, build, install swig
+include(external/warpctc)   # download, build, install warpctc
+
+include(package)            # set paddle packages
+include(cpplint)            # set paddle c++ style
+include(ccache)             # set ccache for compilation
+include(util)               # set unittest and link libs
+include(rdma)               # set rdma libraries
+include(flags)              # set paddle compile flags
+include(cudnn)              # set cudnn libraries
+include(version)            # set PADDLE_VERSION
+include(coveralls)          # set code coverage
+
+include(configure)          # add paddle env configuration
 
-include_directories("${CBLAS_INC_DIR}")
 include_directories("${PROJ_ROOT}")
 include_directories("${PROJ_ROOT}/paddle/cuda/include")
-include_directories(${PROTOBUF_INCLUDE_DIRS})
 include_directories("${CMAKE_CURRENT_BINARY_DIR}/proto")
-if(EXISTS "${PROJ_ROOT}/paddle/internals/CMakeLists.txt")
-    set(PADDLE_WITH_INTERNAL ON)
-    include(paddle/internals/CMakeLists.txt)
-else()
-    set(PADDLE_WITH_INTERNAL OFF)
-    set(INTERNAL_PROTO_PATH "")
-endif()
+
+set(EXTERNAL_LIBS
+    # have not include gtest here.
+    ${GFLAGS_LIBRARIES}
+    ${GLOG_LIBRARIES}
+    ${CBLAS_LIBRARIES}
+    ${PROTOBUF_LIBRARY}
+    ${ZLIB_LIBRARIES}
+)
+
 add_subdirectory(proto)
 add_subdirectory(paddle)
 add_subdirectory(python)
+
 if(WITH_DOC)
     add_subdirectory(doc)
 endif()
diff --git a/cmake/cblas.cmake b/cmake/cblas.cmake
index 685334c658..4e1ae7dc81 100644
--- a/cmake/cblas.cmake
+++ b/cmake/cblas.cmake
@@ -13,6 +13,7 @@
 # system paths.
 #
 
+set(CBLAS_FOUND OFF)
 
 ## Find MKL First.
 set(MKL_ROOT $ENV{MKL_ROOT} CACHE PATH "Folder contains MKL")
@@ -35,11 +36,12 @@ find_library(MKL_INTEL_LP64 NAMES mkl_intel_lp64 PATHS
 if(MKL_INCLUDE_DIR AND MKL_CORE_LIB AND MKL_SEQUENTIAL_LIB AND MKL_INTEL_LP64)
   set(CBLAS_PROVIDER MKL)
   set(CBLAS_INC_DIR ${MKL_INCLUDE_DIR})
-  set(CBLAS_LIBS ${MKL_INTEL_LP64}
+  set(CBLAS_LIBRARIES ${MKL_INTEL_LP64}
           ${MKL_SEQUENTIAL_LIB}
           ${MKL_CORE_LIB})
   add_definitions(-DPADDLE_USE_MKL)
-  message(STATUS "Found MKL (include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBS})")
+  message(STATUS "Found MKL (include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})")
+  set(CBLAS_FOUND ON)
   return() # return file.
 endif()
 
@@ -68,9 +70,10 @@ find_library(ATLAS_LIB NAMES lapack_atlas liblapack_atlas.so.3
 if(ATLAS_INC_DIR AND ATLAS_CBLAS_LIB AND ATLAS_LIB)
   set(CBLAS_PROVIDER ATLAS)
   set(CBLAS_INC_DIR ${ATLAS_INC_DIR} ${ATLAS_CLAPACK_INC_DIR})
-  set(CBLAS_LIBS ${ATLAS_LIB} ${ATLAS_CBLAS_LIB})
+  set(CBLAS_LIBRARIES ${ATLAS_LIB} ${ATLAS_CBLAS_LIB})
   add_definitions(-DPADDLE_USE_ATLAS)  
-  message(STATUS "Found Atlas (include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBS})")
+  message(STATUS "Found Atlas (include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})")
+  set(CBLAS_FOUND ON)
   return()
 endif()
 
@@ -98,8 +101,9 @@ find_library(OPENBLAS_LIB NAMES openblas
 if(OPENBLAS_INC_DIR AND OPENBLAS_LIB)
   set(CBLAS_PROVIDER OPENBLAS)
   set(CBLAS_INC_DIR ${OPENBLAS_INC_DIR})
-  set(CBLAS_LIBS ${OPENBLAS_LIB})
-  message(STATUS "Found OpenBlas (include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBS})")
+  set(CBLAS_LIBRARIES ${OPENBLAS_LIB})
+  message(STATUS "Found OpenBlas (include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})")
+  set(CBLAS_FOUND ON)
   return()
 endif()
 
@@ -130,9 +134,7 @@ find_library(REFERENCE_CBLAS_LIBRARY NAMES cblas PATHS
 if (REFERENCE_CBLAS_INCLUDE_DIR AND REFERENCE_CBLAS_LIBRARY)
   set(CBLAS_PROVIDER REFERENCE)
   set(CBLAS_INC_DIR ${REFERENCE_CBLAS_INCLUDE_DIR})
-  set(CBLAS_LIBS ${REFERENCE_CBLAS_LIBRARY})
-  return()
+  set(CBLAS_LIBRARIES ${REFERENCE_CBLAS_LIBRARY})
+  message(STATUS "Found reference-cblas (include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBS})")
+  set(CBLAS_FOUND ON)
 endif()
-
-message(FATAL_ERROR "CBlas must be set. Paddle support MKL, ATLAS, OpenBlas, reference-cblas."
-  " Try set MKL_ROOT, ATLAS_ROOT, OPENBLAS_ROOT or REFERENCE_CBLAS_ROOT.")
diff --git a/cmake/check_packages.cmake b/cmake/check_packages.cmake
deleted file mode 100644
index afb84c6ff5..0000000000
--- a/cmake/check_packages.cmake
+++ /dev/null
@@ -1,39 +0,0 @@
-# Check package for each cmake option
-
-if(WITH_GPU)
-  find_package(CUDA REQUIRED)  # CUDA is required when use gpu
-endif()
-
-if(WITH_PYTHON)
-  find_package(PythonLibs 2.6 REQUIRED)
-  find_package(PythonInterp REQUIRED)
-  find_package(NumPy REQUIRED)
-endif()
-
-if(WITH_STYLE_CHECK)
-  find_package(PythonInterp REQUIRED)
-endif()
-
-find_package(Glog REQUIRED)
-
-find_package(Gflags REQUIRED)
-
-if(WITH_TESTING)
-  find_package(GTest REQUIRED)
-endif()
-
-if(WITH_DOC)
-  find_package(Sphinx REQUIRED)
-  find_python_module(recommonmark REQUIRED)
-endif()
-
-if(WITH_SWIG_PY)
-  if(NOT SWIG_FOUND)
-    message(FATAL_ERROR "SWIG is not found. Please install swig or disable WITH_SWIG_PY")
-  endif()
-  find_python_module(wheel REQUIRED)  # package wheel
-endif()
-
-if(NOT M4_EXECUTABLE)
-  message(FATAL_ERROR "Paddle need m4 to generate proto file.")
-endif()
diff --git a/cmake/configure.cmake b/cmake/configure.cmake
new file mode 100644
index 0000000000..0bb016201d
--- /dev/null
+++ b/cmake/configure.cmake
@@ -0,0 +1,68 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+# http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if(NOT WITH_PYTHON)
+    add_definitions(-DPADDLE_NO_PYTHON)
+endif(NOT WITH_PYTHON)
+
+if(WITH_DSO)
+    add_definitions(-DPADDLE_USE_DSO)
+endif(WITH_DSO)
+
+if(WITH_DOUBLE)
+    add_definitions(-DPADDLE_TYPE_DOUBLE)
+endif(WITH_DOUBLE)
+
+if(NOT WITH_TIMER)
+    add_definitions(-DPADDLE_DISABLE_TIMER)
+endif(NOT WITH_TIMER)
+
+if(NOT WITH_PROFILER)
+    add_definitions(-DPADDLE_DISABLE_PROFILER)
+endif(NOT WITH_PROFILER)
+
+if(NOT WITH_GPU)
+    add_definitions(-DPADDLE_ONLY_CPU)
+    add_definitions(-DHPPL_STUB_FUNC)
+
+    list(APPEND CMAKE_CXX_SOURCE_FILE_EXTENSIONS cu)
+else()
+    FIND_PACKAGE(CUDA REQUIRED)
+
+    if(${CUDA_VERSION_MAJOR} VERSION_LESS 7)
+        message(FATAL_ERROR "Paddle need CUDA >= 7.0 to compile")
+    endif()
+
+    if(NOT CUDNN_FOUND)
+        message(FATAL_ERROR "Paddle need cudnn to compile")
+    endif()
+
+    if(WITH_AVX)
+        set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler ${AVX_FLAG}")
+    else(WITH_AVX)
+        set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler ${SSE3_FLAG}")
+    endif(WITH_AVX)
+
+    # Include cuda and cudnn
+    include_directories(${CUDNN_INCLUDE_DIR})
+    include_directories(${CUDA_TOOLKIT_INCLUDE})
+endif(NOT WITH_GPU)
+
+if(WITH_AVX)
+    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${AVX_FLAG}")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${AVX_FLAG}")
+else(WITH_AVX)
+    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${SSE3_FLAG}")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SSE3_FLAG}")
+endif(WITH_AVX)
diff --git a/cmake/cpplint.cmake b/cmake/cpplint.cmake
index 241af9a083..38c636b30e 100644
--- a/cmake/cpplint.cmake
+++ b/cmake/cpplint.cmake
@@ -53,7 +53,7 @@ macro(add_style_check_target TARGET_NAME)
             if(LINT MATCHES ON)
                 add_custom_command(TARGET ${TARGET_NAME}
                     PRE_BUILD
-                    COMMAND "${PYTHON_EXECUTABLE}" "${PROJ_ROOT}/paddle/scripts/cpplint.py"
+                    COMMAND env ${py_env} "${PYTHON_EXECUTABLE}" "${PROJ_ROOT}/paddle/scripts/cpplint.py"
                                 "--filter=${STYLE_FILTER}" ${filename}
                     WORKING_DIRECTORY ${CMAKE_CURRENT_LIST_DIR})
             endif()
diff --git a/cmake/external/gflags.cmake b/cmake/external/gflags.cmake
new file mode 100644
index 0000000000..d38b7d1ba2
--- /dev/null
+++ b/cmake/external/gflags.cmake
@@ -0,0 +1,39 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+# http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+INCLUDE(ExternalProject)
+
+SET(GFLAGS_SOURCES_DIR ${CMAKE_CURRENT_SOURCE_DIR}/third_party/gflags)
+SET(GFLAGS_INSTALL_DIR ${CMAKE_CURRENT_SOURCE_DIR}/third_party/install/gflags)
+SET(GFLAGS_INCLUDE_DIR "${GFLAGS_INSTALL_DIR}/include" CACHE PATH "gflags include directory." FORCE)
+IF(WIN32)
+    set(GFLAGS_LIBRARIES "${GFLAGS_INSTALL_DIR}/lib/gflags.lib" CACHE FILEPATH "GFLAGS_LIBRARIES" FORCE)
+ELSE(WIN32)
+    set(GFLAGS_LIBRARIES "${GFLAGS_INSTALL_DIR}/lib/libgflags.a" CACHE FILEPATH "GFLAGS_LIBRARIES" FORCE)
+ENDIF(WIN32)
+
+INCLUDE_DIRECTORIES(${GFLAGS_INCLUDE_DIR})
+
+ExternalProject_Add(
+    gflags
+    ${EXTERNAL_PROJECT_LOG_ARGS}
+    GIT_REPOSITORY  "https://github.com/gflags/gflags.git"
+    PREFIX          ${GFLAGS_SOURCES_DIR}
+    UPDATE_COMMAND  ""
+    CMAKE_ARGS      -DCMAKE_INSTALL_PREFIX=${GFLAGS_INSTALL_DIR}
+    CMAKE_ARGS      -DCMAKE_POSITION_INDEPENDENT_CODE=ON
+    CMAKE_ARGS      -DBUILD_TESTING=OFF
+)
+
+LIST(APPEND external_project_dependencies gflags)
diff --git a/cmake/external/glog.cmake b/cmake/external/glog.cmake
new file mode 100644
index 0000000000..bec69f3ddf
--- /dev/null
+++ b/cmake/external/glog.cmake
@@ -0,0 +1,41 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+# http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+INCLUDE(ExternalProject)
+
+SET(GLOG_SOURCES_DIR ${CMAKE_CURRENT_SOURCE_DIR}/third_party/glog)
+SET(GLOG_INSTALL_DIR ${CMAKE_CURRENT_SOURCE_DIR}/third_party/install/glog)
+SET(GLOG_INCLUDE_DIR "${GLOG_INSTALL_DIR}/include" CACHE PATH "glog include directory." FORCE)
+
+IF(WIN32)
+    SET(GLOG_LIBRARIES "${GLOG_INSTALL_DIR}/lib/libglog.lib" CACHE FILEPATH "glog library." FORCE)
+ELSE(WIN32)
+    SET(GLOG_LIBRARIES "${GLOG_INSTALL_DIR}/lib/libglog.a" CACHE FILEPATH "glog library." FORCE)
+ENDIF(WIN32)
+
+INCLUDE_DIRECTORIES(${GLOG_INCLUDE_DIR})
+
+ExternalProject_Add(
+    glog
+    ${EXTERNAL_PROJECT_LOG_ARGS}
+    GIT_REPOSITORY  "https://github.com/google/glog.git"
+    PREFIX          ${GLOG_SOURCES_DIR}
+    UPDATE_COMMAND  ""
+    CMAKE_ARGS      -DCMAKE_INSTALL_PREFIX=${GLOG_INSTALL_DIR}
+    CMAKE_ARGS      -DCMAKE_POSITION_INDEPENDENT_CODE=ON
+    CMAKE_ARGS      -DWITH_GFLAGS=OFF
+    CMAKE_ARGS      -DBUILD_TESTING=OFF
+)
+
+LIST(APPEND external_project_dependencies glog)
diff --git a/cmake/external/gtest.cmake b/cmake/external/gtest.cmake
new file mode 100644
index 0000000000..2fcb7893fa
--- /dev/null
+++ b/cmake/external/gtest.cmake
@@ -0,0 +1,51 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+# http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+IF(WITH_TESTING)
+    ENABLE_TESTING()
+    INCLUDE(ExternalProject)
+
+    SET(GTEST_SOURCES_DIR ${CMAKE_CURRENT_SOURCE_DIR}/third_party/gtest)
+    SET(GTEST_INSTALL_DIR ${CMAKE_CURRENT_SOURCE_DIR}/third_party/install/gtest)
+    SET(GTEST_INCLUDE_DIR "${GTEST_INSTALL_DIR}/include" CACHE PATH "gtest include directory." FORCE)
+
+    INCLUDE_DIRECTORIES(${GTEST_INCLUDE_DIR})
+
+    IF(WIN32)
+        set(GTEST_LIBRARIES
+            "${GTEST_INSTALL_DIR}/lib/gtest.lib" CACHE FILEPATH "gtest libraries." FORCE)
+        set(GTEST_MAIN_LIBRARIES
+            "${GTEST_INSTALL_DIR}/lib/gtest_main.lib" CACHE FILEPATH "gtest main libraries." FORCE)
+    ELSE(WIN32)
+        set(GTEST_LIBRARIES
+            "${GTEST_INSTALL_DIR}/lib/libgtest.a" CACHE FILEPATH "gtest libraries." FORCE)
+        set(GTEST_MAIN_LIBRARIES
+            "${GTEST_INSTALL_DIR}/lib/libgtest_main.a" CACHE FILEPATH "gtest main libraries." FORCE)
+    ENDIF(WIN32)
+
+    ExternalProject_Add(
+        gtest
+        ${EXTERNAL_PROJECT_LOG_ARGS}
+        GIT_REPOSITORY  "https://github.com/google/googletest.git"
+        GIT_TAG         "release-1.8.0"
+        PREFIX          ${GTEST_SOURCES_DIR}
+        UPDATE_COMMAND  ""
+        CMAKE_ARGS      -DCMAKE_INSTALL_PREFIX:PATH=${GTEST_INSTALL_DIR}
+        CMAKE_ARGS      -DCMAKE_POSITION_INDEPENDENT_CODE=ON
+        CMAKE_ARGS      -DBUILD_GMOCK=ON
+        CMAKE_ARGS      -Dgtest_disable_pthreads=ON
+        CMAKE_ARGS      -Dgtest_force_shared_crt=ON
+    )
+    LIST(APPEND external_project_dependencies gtest)
+ENDIF(WITH_TESTING)
diff --git a/cmake/external/openblas.cmake b/cmake/external/openblas.cmake
new file mode 100644
index 0000000000..66a72cd243
--- /dev/null
+++ b/cmake/external/openblas.cmake
@@ -0,0 +1,47 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+# http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+INCLUDE(cblas)
+
+IF(NOT ${CBLAS_FOUND})
+    MESSAGE(FATAL_ERROR "Please install OpenBlas, MKL or ATLAS.")
+    INCLUDE(ExternalProject)
+
+    SET(CBLAS_SOURCES_DIR ${CMAKE_CURRENT_SOURCE_DIR}/third_party/openblas)
+    SET(CBLAS_INSTALL_DIR ${CMAKE_CURRENT_SOURCE_DIR}/third_party/install/openblas)
+    SET(CBLAS_INC_DIR "${CBLAS_INSTALL_DIR}/include" CACHE PATH "openblas include directory." FORCE)
+
+    IF(WIN32)
+        SET(CBLAS_LIBRARIES "${CBLAS_INSTALL_DIR}/lib/openblas.lib" CACHE FILEPATH "openblas library." FORCE)
+    ELSE(WIN32)
+        SET(CBLAS_LIBRARIES "${CBLAS_INSTALL_DIR}/lib/libopenblas.a" CACHE FILEPATH "openblas library" FORCE)
+    ENDIF(WIN32)
+
+    ExternalProject_Add(
+        openblas
+        ${EXTERNAL_PROJECT_LOG_ARGS}
+        URL                 "https://github.com/xianyi/OpenBLAS/archive/v0.2.19.tar.gz"
+        PREFIX              ${CBLAS_SOURCES_DIR}
+        INSTALL_DIR         ${CBLAS_INSTALL_DIR}
+        BUILD_IN_SOURCE     1
+        CONFIGURE_COMMAND   ""
+        BUILD_COMMAND       make CC=${CMAKE_C_COMPILER} FC=${CMAKE_Fortran_COMPILER}
+        INSTALL_COMMAND     make install PREFIX=<INSTALL_DIR>
+        UPDATE_COMMAND      ""
+    )
+
+    LIST(APPEND external_project_dependencies openblas)
+ENDIF()
+
+INCLUDE_DIRECTORIES(${CBLAS_INC_DIR})
diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake
new file mode 100644
index 0000000000..2f2769b4c6
--- /dev/null
+++ b/cmake/external/protobuf.cmake
@@ -0,0 +1,62 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+# http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+INCLUDE(ExternalProject)
+
+SET(PROTOBUF_SOURCES_DIR ${CMAKE_CURRENT_SOURCE_DIR}/third_party/protobuf)
+SET(PROTOBUF_INSTALL_DIR ${CMAKE_CURRENT_SOURCE_DIR}/third_party/install/protobuf)
+SET(PROTOBUF_INCLUDE_DIR "${PROTOBUF_INSTALL_DIR}/include" CACHE PATH "protobuf include directory." FORCE)
+
+INCLUDE_DIRECTORIES(${PROTOBUF_INCLUDE_DIR})
+
+IF(WIN32)
+  SET(PROTOBUF_LITE_LIBRARY
+        "${PROTOBUF_INSTALL_DIR}/lib/libprotobuf-lite.lib" CACHE FILEPATH "protobuf lite library." FORCE)
+  SET(PROTOBUF_LIBRARY
+        "${PROTOBUF_INSTALL_DIR}/lib/libprotobuf.lib" CACHE FILEPATH "protobuf library." FORCE)
+  SET(PROTOBUF_PROTOC_LIBRARY
+        "${PROTOBUF_INSTALL_DIR}/lib/libprotoc.lib" CACHE FILEPATH "protoc library." FORCE)
+  SET(PROTOBUF_PROTOC_EXECUTABLE "${PROTOBUF_INSTALL_DIR}/bin/protoc.exe" CACHE FILEPATH "protobuf executable." FORCE)
+ELSE(WIN32)
+  IF(${HOST_SYSTEM} STREQUAL "centos")
+    SET(LIB "lib64")
+  ELSE()
+    SET(LIB "lib")
+  ENDIF()
+  SET(PROTOBUF_LITE_LIBRARY
+        "${PROTOBUF_INSTALL_DIR}/${LIB}/libprotobuf-lite.a" CACHE FILEPATH "protobuf lite library." FORCE)
+  SET(PROTOBUF_LIBRARY
+        "${PROTOBUF_INSTALL_DIR}/${LIB}/libprotobuf.a" CACHE FILEPATH "protobuf library." FORCE)
+  SET(PROTOBUF_PROTOC_LIBRARY
+        "${PROTOBUF_INSTALL_DIR}/${LIB}/libprotoc.a" CACHE FILEPATH "protoc library." FORCE)
+  SET(PROTOBUF_PROTOC_EXECUTABLE "${PROTOBUF_INSTALL_DIR}/bin/protoc" CACHE FILEPATH "protobuf executable." FORCE)
+ENDIF(WIN32)
+
+ExternalProject_Add(
+  protobuf
+  ${EXTERNAL_PROJECT_LOG_ARGS}
+  PREFIX          ${PROTOBUF_SOURCES_DIR}
+  UPDATE_COMMAND  ""
+  DEPENDS         zlib
+  GIT_REPOSITORY  "https://github.com/google/protobuf.git"
+  GIT_TAG         "9f75c5aa851cd877fb0d93ccc31b8567a6706546"
+  CONFIGURE_COMMAND
+    ${CMAKE_COMMAND} ${PROTOBUF_SOURCES_DIR}/src/protobuf/cmake
+    -Dprotobuf_BUILD_TESTS=OFF
+    -DCMAKE_POSITION_INDEPENDENT_CODE=ON
+    -DCMAKE_BUILD_TYPE=Release
+    -DCMAKE_INSTALL_PREFIX=${PROTOBUF_INSTALL_DIR}
+)
+
+LIST(APPEND external_project_dependencies protobuf)
diff --git a/cmake/external/python.cmake b/cmake/external/python.cmake
new file mode 100644
index 0000000000..cbb6940221
--- /dev/null
+++ b/cmake/external/python.cmake
@@ -0,0 +1,222 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+# http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+INCLUDE(ExternalProject)
+INCLUDE(python_module)
+
+FIND_PACKAGE(PythonInterp 2.7)
+FIND_PACKAGE(PythonLibs 2.7)
+
+SET(py_env "")
+
+IF(PYTHONLIBS_FOUND AND PYTHONINTERP_FOUND)
+    find_python_module(pip REQUIRED)
+    find_python_module(numpy REQUIRED)
+    find_python_module(wheel REQUIRED)
+    find_python_module(google.protobuf REQUIRED)
+    FIND_PACKAGE(NumPy REQUIRED)
+ELSE(PYTHONLIBS_FOUND AND PYTHONINTERP_FOUND)
+    ##################################### PYTHON ########################################
+    SET(PYTHON_SOURCES_DIR ${CMAKE_CURRENT_SOURCE_DIR}/third_party/python)
+    SET(PYTHON_INSTALL_DIR ${CMAKE_CURRENT_SOURCE_DIR}/third_party/install/python)
+    SET(_python_DIR ${PYTHON_INSTALL_DIR})
+
+    IF(UNIX)
+        SET(PYTHON_FOUND ON)
+        SET(PYTHON_INCLUDE_DIR "${PYTHON_INSTALL_DIR}/include/python2.7" CACHE PATH "Python include dir" FORCE)
+        SET(PYTHON_LIBRARIES "${PYTHON_INSTALL_DIR}/lib/libpython2.7.a" CACHE FILEPATH "Python library" FORCE)
+        SET(PYTHON_EXECUTABLE ${PYTHON_INSTALL_DIR}/bin/python CACHE FILEPATH "Python executable" FORCE)
+        SET(PY_SITE_PACKAGES_PATH "${PYTHON_INSTALL_DIR}/lib/python2.7/site-packages" CACHE PATH "Python site-packages path" FORCE)
+    ELSEIF(WIN32)
+        SET(PYTHON_FOUND ON)
+        SET(PYTHON_INCLUDE_DIR "${PYTHON_INSTALL_DIR}/include" CACHE PATH "Python include dir" FORCE)
+        SET(PYTHON_LIBRARIES "${PYTHON_INSTALL_DIR}/libs/python27.lib" CACHE FILEPATH "Python library" FORCE)
+        SET(PYTHON_EXECUTABLE "${PYTHON_INSTALL_DIR}/bin/python.exe" CACHE FILEPATH "Python executable" FORCE)
+        SET(PY_SITE_PACKAGES_PATH "${PYTHON_INSTALL_DIR}/Lib/site-packages" CACHE PATH "Python site-packages path" FORCE)
+    ELSE()
+        MESSAGE(FATAL_ERROR "Unknown system !")
+    ENDIF()
+
+    IF(APPLE)
+        LIST(APPEND EXTERNAL_PROJECT_OPTIONAL_CMAKE_ARGS
+            -DCMAKE_BUILD_WITH_INSTALL_RPATH:BOOL=ON
+            )
+    ENDIF()
+
+    SET(EXTERNAL_PROJECT_OPTIONAL_CMAKE_CACHE_ARGS)
+
+    # Force Python build to "Release".
+    IF(CMAKE_CONFIGURATION_TYPES)
+        SET(SAVED_CMAKE_CFG_INTDIR ${CMAKE_CFG_INTDIR})
+        SET(CMAKE_CFG_INTDIR "Release")
+    ELSE()
+        LIST(APPEND EXTERNAL_PROJECT_OPTIONAL_CMAKE_CACHE_ARGS
+            -DCMAKE_BUILD_TYPE:STRING=Release
+            )
+    ENDIF()
+
+    ExternalProject_Add(python
+        ${EXTERNAL_PROJECT_LOG_ARGS}
+        GIT_REPOSITORY    "https://github.com/python-cmake-buildsystem/python-cmake-buildsystem.git"
+        PREFIX            ${PYTHON_SOURCES_DIR}
+        UPDATE_COMMAND    ""
+        CMAKE_ARGS        -DPYTHON_VERSION=2.7.12
+        CMAKE_ARGS        -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
+        CMAKE_ARGS        -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
+        CMAKE_CACHE_ARGS
+            -DCMAKE_INSTALL_PREFIX:PATH=${PYTHON_INSTALL_DIR}
+            -DBUILD_LIBPYTHON_SHARED:BOOL=OFF
+            -DUSE_SYSTEM_LIBRARIES:BOOL=OFF
+            -DZLIB_ROOT:FILEPATH=${ZLIB_ROOT}
+            -DZLIB_INCLUDE_DIR:PATH=${ZLIB_INCLUDE_DIR}
+            -DZLIB_LIBRARY:FILEPATH=${ZLIB_LIBRARIES}
+            -DDOWNLOAD_SOURCES:BOOL=ON
+            -DINSTALL_WINDOWS_TRADITIONAL:BOOL=OFF
+            ${EXTERNAL_PROJECT_OPTIONAL_CMAKE_CACHE_ARGS}
+            ${EXTERNAL_PROJECT_OPTIONAL_CMAKE_ARGS}
+        DEPENDS zlib
+    )
+
+    SET(py_env
+        PATH=${PYTHON_INSTALL_DIR}/bin
+        PYTHONHOME=${PYTHON_INSTALL_DIR}
+        PYTHONPATH=${PYTHON_INSTALL_DIR}/lib:${PYTHON_INSTALL_DIR}/lib/python2.7:${PY_SITE_PACKAGES_PATH})
+    ####################################################################################
+
+    ##################################### SETUPTOOLS ###################################
+    SET(SETUPTOOLS_SOURCES_DIR ${PYTHON_SOURCES_DIR}/setuptools)
+    ExternalProject_Add(setuptools
+        ${EXTERNAL_PROJECT_LOG_ARGS}
+        PREFIX              ${SETUPTOOLS_SOURCES_DIR}
+        URL                 "https://pypi.python.org/packages/source/s/setuptools/setuptools-18.3.2.tar.gz"
+        BUILD_IN_SOURCE     1
+        PATCH_COMMAND       ""
+        UPDATE_COMMAND      ""
+        CONFIGURE_COMMAND   ""
+        INSTALL_COMMAND     ""
+        BUILD_COMMAND       env ${py_env} ${PYTHON_EXECUTABLE} setup.py install
+        DEPENDS             python zlib
+    )
+    #####################################################################################
+
+    ##################################### SIX ###########################################
+    SET(SIX_SOURCES_DIR ${PYTHON_SOURCES_DIR}/six)
+    ExternalProject_Add(six
+        ${EXTERNAL_PROJECT_LOG_ARGS}
+        PREFIX              ${SIX_SOURCES_DIR}
+        URL                 https://pypi.python.org/packages/source/s/six/six-1.10.0.tar.gz
+        BUILD_IN_SOURCE     1
+        PATCH_COMMAND       ""
+        UPDATE_COMMAND      ""
+        CONFIGURE_COMMAND   ""
+        INSTALL_COMMAND     ""
+        BUILD_COMMAND       env ${py_env} ${PYTHON_EXECUTABLE} setup.py install
+        DEPENDS             python setuptools
+    )
+    #####################################################################################
+
+    ##################################### CYTHON ########################################
+    SET(CYTHON_SOURCES_DIR ${PYTHON_SOURCES_DIR}/cython)
+    ExternalProject_Add(cython
+        ${EXTERNAL_PROJECT_LOG_ARGS}
+        PREFIX                ${CYTHON_SOURCES_DIR}
+        URL                   https://github.com/cython/cython/archive/0.25.2.tar.gz
+        GIT_TAG               0.25.2
+        BUILD_IN_SOURCE       1
+        CONFIGURE_COMMAND     ""
+        PATCH_COMMAND         ""
+        UPDATE_COMMAND        ""
+        INSTALL_COMMAND       ""
+        BUILD_COMMAND         env ${py_env} ${PYTHON_EXECUTABLE} setup.py install
+        DEPENDS               python
+    )
+    ####################################################################################
+
+    ##################################### NUMPY ########################################
+    SET(NUMPY_SOURCES_DIR ${PYTHON_SOURCES_DIR}/numpy)
+    SET(NUMPY_TAG_VERSION "v1.11.3")
+    SET(NUMPY_VERSION "1.11.3")
+
+    SET(EGG_NAME "")
+    SET(PYTHON_NUMPY_INCLUDE_DIR "")
+    IF(WIN32)
+        SET(EGG_NAME "numpy-${NUMPY_VERSION}-py2.7-${HOST_SYSTEM}.egg")
+    ELSE(WIN32)
+        IF(APPLE)
+            SET(EGG_NAME "numpy-${NUMPY_VERSION}-py2.7-${HOST_SYSTEM}-${MACOS_VERSION}")
+        ELSE(APPLE)
+            SET(EGG_NAME "numpy-${NUMPY_VERSION}-py2.7-linux")
+            SET(EGG_NAME "numpy-${NUMPY_VERSION}-py2.7-linux")
+        ENDIF(APPLE)
+
+        FOREACH(suffix x86_64 intel fat64 fat32 universal)
+            LIST(APPEND PYTHON_NUMPY_INCLUDE_DIR ${PY_SITE_PACKAGES_PATH}/${EGG_NAME}-${suffix}.egg/numpy/core/include)
+        ENDFOREACH()
+    ENDIF(WIN32)
+
+    ExternalProject_Add(numpy
+        ${EXTERNAL_PROJECT_LOG_ARGS}
+        GIT_REPOSITORY      https://github.com/numpy/numpy.git
+        GIT_TAG             ${NUMPY_TAG_VERSION}
+        CONFIGURE_COMMAND   ""
+        UPDATE_COMMAND      ""
+        PREFIX              ${NUMPY_SOURCES_DIR}
+        BUILD_COMMAND       env ${py_env} ${PYTHON_EXECUTABLE} setup.py build
+        INSTALL_COMMAND     env ${py_env} ${PYTHON_EXECUTABLE} setup.py install
+        BUILD_IN_SOURCE     1
+        DEPENDS             python setuptools cython
+    )
+    ####################################################################################
+
+    ##################################### WHEEL ########################################
+    SET(WHEEL_SOURCES_DIR ${PYTHON_SOURCES_DIR}/wheel)
+    ExternalProject_Add(wheel
+        ${EXTERNAL_PROJECT_LOG_ARGS}
+        URL                 https://pypi.python.org/packages/source/w/wheel/wheel-0.29.0.tar.gz
+        PREFIX              ${WHEEL_SOURCES_DIR}
+        CONFIGURE_COMMAND   ""
+        UPDATE_COMMAND      ""
+        BUILD_COMMAND       ""
+        INSTALL_COMMAND     env ${py_env} ${PYTHON_EXECUTABLE} setup.py install
+        BUILD_IN_SOURCE     1
+        DEPENDS             python setuptools
+    )
+    ####################################################################################
+
+    ################################### PROTOBUF #######################################
+    SET(PY_PROTOBUF_SOURCES_DIR ${PYTHON_SOURCES_DIR}/protobuf)
+    ExternalProject_Add(python-protobuf
+        ${EXTERNAL_PROJECT_LOG_ARGS}
+        URL                   https://pypi.python.org/packages/e0/b0/0a1b364fe8a7d177b4b7d4dca5b798500dc57a7273b93cca73931b305a6a/protobuf-3.1.0.post1.tar.gz
+        URL_MD5               38b5fb160c768d2f8444d0c6d637ff91
+        PREFIX                ${PY_PROTOBUF_SOURCES_DIR}
+        BUILD_IN_SOURCE       1
+        PATCH_COMMAND         ""
+        CONFIGURE_COMMAND     ""
+        BUILD_COMMAND         env ${py_env} ${PYTHON_EXECUTABLE} setup.py build
+        INSTALL_COMMAND       env ${py_env} ${PYTHON_EXECUTABLE} setup.py install
+        DEPENDS               python setuptools six
+    )
+    ####################################################################################
+
+    LIST(APPEND external_project_dependencies python setuptools six cython wheel python-protobuf numpy)
+
+ENDIF(PYTHONLIBS_FOUND AND PYTHONINTERP_FOUND)
+
+INCLUDE_DIRECTORIES(${PYTHON_INCLUDE_DIR})
+INCLUDE_DIRECTORIES(${PYTHON_NUMPY_INCLUDE_DIR})
+
+MESSAGE("[Paddle] Python Executable: ${PYTHON_EXECUTABLE}")
+MESSAGE("[Paddle] Python Include: ${PYTHON_INCLUDE_DIRS}")
+MESSAGE("[Paddle] Python Libraries: ${PYTHON_LIBRARIES}")
diff --git a/cmake/external/swig.cmake b/cmake/external/swig.cmake
new file mode 100644
index 0000000000..40088c65ef
--- /dev/null
+++ b/cmake/external/swig.cmake
@@ -0,0 +1,74 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+# http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+FIND_PACKAGE(SWIG)
+
+IF(NOT SWIG_FOUND)
+    # build swig as an external project
+    INCLUDE(ExternalProject)
+
+    SET(SWIG_SOURCES_DIR ${CMAKE_CURRENT_SOURCE_DIR}/third_party/swig)
+    SET(SWIG_INSTALL_DIR ${CMAKE_CURRENT_SOURCE_DIR}/third_party/install/swig)
+    SET(SWIG_TARGET_VERSION "3.0.2")
+    SET(SWIG_DOWNLOAD_SRC_MD5 "62f9b0d010cef36a13a010dc530d0d41")
+    SET(SWIG_DOWNLOAD_WIN_MD5 "3f18de4fc09ab9abb0d3be37c11fbc8f")
+
+    IF(WIN32)
+        # swig.exe available as pre-built binary on Windows:
+        ExternalProject_Add(swig
+            URL                 http://prdownloads.sourceforge.net/swig/swigwin-${SWIG_TARGET_VERSION}.zip
+            URL_MD5             ${SWIG_DOWNLOAD_WIN_MD5}
+            SOURCE_DIR          ${SWIG_SOURCES_DIR}
+            CONFIGURE_COMMAND   ""
+            BUILD_COMMAND       ""
+            INSTALL_COMMAND     ""
+            UPDATE_COMMAND      ""
+        )
+        SET(SWIG_DIR ${SWIG_SOURCES_DIR} CACHE FILEPATH "SWIG Directory" FORCE)
+        SET(SWIG_EXECUTABLE ${SWIG_SOURCES_DIR}/swig.exe  CACHE FILEPATH "SWIG Executable" FORCE)
+    ELSE(WIN32)
+        # From PCRE configure
+        ExternalProject_Add(pcre
+            ${EXTERNAL_PROJECT_LOG_ARGS}
+            GIT_REPOSITORY https://github.com/svn2github/pcre.git
+            PREFIX ${SWIG_SOURCES_DIR}/pcre
+            CMAKE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${SWIG_INSTALL_DIR}/pcre
+        )
+
+        # swig uses bison find it by cmake and pass it down
+        FIND_PACKAGE(BISON)
+
+        # From SWIG configure
+        ExternalProject_Add(swig
+            GIT_REPOSITORY      https://github.com/swig/swig.git
+            GIT_TAG             rel-3.0.10
+            PREFIX              ${SWIG_SOURCES_DIR}
+            CONFIGURE_COMMAND   cd ${SWIG_SOURCES_DIR}/src/swig && ./autogen.sh
+            CONFIGURE_COMMAND   cd ${SWIG_SOURCES_DIR}/src/swig &&
+            env "PCRE_LIBS=${SWIG_INSTALL_DIR}/pcre/lib/libpcre.a ${SWIG_INSTALL_DIR}/pcre/lib/libpcrecpp.a ${SWIG_INSTALL_DIR}/pcre/lib/libpcreposix.a"
+            ./configure
+                --prefix=${SWIG_INSTALL_DIR}
+                --with-pcre-prefix=${SWIG_INSTALL_DIR}/pcre
+            BUILD_COMMAND   cd ${SWIG_SOURCES_DIR}/src/swig && make
+            INSTALL_COMMAND cd ${SWIG_SOURCES_DIR}/src/swig && make install
+            UPDATE_COMMAND  ""
+            DEPENDS pcre
+        )
+
+        SET(SWIG_DIR ${SWIG_INSTALL_DIR}/share/swig/${SWIG_TARGET_VERSION})
+        SET(SWIG_EXECUTABLE ${SWIG_INSTALL_DIR}/bin/swig)
+    ENDIF(WIN32)
+
+    LIST(APPEND external_project_dependencies swig)
+ENDIF(NOT SWIG_FOUND)
diff --git a/cmake/external/warpctc.cmake b/cmake/external/warpctc.cmake
new file mode 100644
index 0000000000..7386d935b8
--- /dev/null
+++ b/cmake/external/warpctc.cmake
@@ -0,0 +1,60 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+# http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+INCLUDE(ExternalProject)
+
+SET(WARPCTC_SOURCES_DIR ${CMAKE_CURRENT_SOURCE_DIR}/third_party/warpctc)
+SET(WARPCTC_INSTALL_DIR ${CMAKE_CURRENT_SOURCE_DIR}/third_party/install/warpctc)
+SET(WARPCTC_INCLUDE_DIR "${WARPCTC_INSTALL_DIR}/include" CACHE PATH "Warp-ctc Directory" FORCE)
+
+INCLUDE_DIRECTORIES(${WARPCTC_INCLUDE_DIR})
+
+SET(WARPCTC_LIB_DIR "${WARPCTC_INSTALL_DIR}/lib" CACHE PATH "Warp-ctc Library Directory" FORCE)
+
+IF(WIN32)
+    SET(WARPCTC_LIBRARIES
+        "${WARPCTC_INSTALL_DIR}/lib/warpctc.dll" CACHE FILEPATH "Warp-ctc Library" FORCE)
+ELSE(WIN32)
+    IF(APPLE)
+        SET(_warpctc_SHARED_SUFFIX dylib)
+    ELSE(APPLE)
+        SET(_warpctc_SHARED_SUFFIX so)
+    ENDIF(APPLE)
+
+    SET(WARPCTC_LIBRARIES
+        "${WARPCTC_INSTALL_DIR}/lib/libwarpctc.${_warpctc_SHARED_SUFFIX}" CACHE FILEPATH "Warp-ctc Library" FORCE)
+ENDIF(WIN32)
+
+IF(CMAKE_CXX_COMPILER_ID STREQUAL "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" )
+    SET(USE_OMP OFF)
+ELSE()
+    SET(USE_OMP ON)
+ENDIF()
+
+ExternalProject_Add(
+    warpctc
+    ${EXTERNAL_PROJECT_LOG_ARGS}
+    GIT_REPOSITORY  "https://github.com/gangliao/warp-ctc.git"
+    PREFIX          ${WARPCTC_SOURCES_DIR}
+    UPDATE_COMMAND  ""
+    CMAKE_ARGS      -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
+    CMAKE_ARGS      -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
+    CMAKE_ARGS      -DCMAKE_INSTALL_PREFIX=${WARPCTC_INSTALL_DIR}
+    CMAKE_ARGS      -DWITH_GPU=${WITH_GPU}
+    CMAKE_ARGS      -DWITH_OMP=${USE_OMP}
+    CMAKE_ARGS      -DWITH_TORCH=OFF
+    CMAKE_ARGS      -DBUILD_SHARED=ON
+)
+
+LIST(APPEND external_project_dependencies warpctc)
diff --git a/cmake/external/zlib.cmake b/cmake/external/zlib.cmake
new file mode 100644
index 0000000000..916f6816aa
--- /dev/null
+++ b/cmake/external/zlib.cmake
@@ -0,0 +1,43 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+# http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+INCLUDE(ExternalProject)
+
+SET(ZLIB_SOURCES_DIR ${CMAKE_CURRENT_SOURCE_DIR}/third_party/zlib)
+SET(ZLIB_INSTALL_DIR ${CMAKE_CURRENT_SOURCE_DIR}/third_party/install/zlib)
+SET(ZLIB_ROOT ${ZLIB_INSTALL_DIR} CACHE FILEPATH "zlib root directory." FORCE)
+SET(ZLIB_INCLUDE_DIR "${ZLIB_INSTALL_DIR}/include" CACHE PATH "zlib include directory." FORCE)
+
+IF(WIN32)
+  SET(ZLIB_LIBRARIES "${ZLIB_INSTALL_DIR}/lib/zlibstatic.lib" CACHE FILEPATH "zlib library." FORCE)
+ELSE(WIN32)
+  set(ZLIB_LIBRARIES "${ZLIB_INSTALL_DIR}/lib/libz.a" CACHE FILEPATH "zlib library." FORCE)
+ENDIF(WIN32)
+
+INCLUDE_DIRECTORIES(${ZLIB_INCLUDE_DIR})
+
+ExternalProject_Add(
+    zlib
+    ${EXTERNAL_PROJECT_LOG_ARGS}
+    GIT_REPOSITORY  "https://github.com/madler/zlib.git"
+    GIT_TAG         "v1.2.8"
+    PREFIX          ${ZLIB_SOURCES_DIR}
+    UPDATE_COMMAND  ""
+    CMAKE_ARGS      -DCMAKE_INSTALL_PREFIX=${ZLIB_INSTALL_DIR}
+    CMAKE_ARGS      -DBUILD_SHARED_LIBS=OFF
+    CMAKE_ARGS      -DCMAKE_POSITION_INDEPENDENT_CODE=ON
+    CMAKE_ARGS      -DCMAKE_MACOSX_RPATH=ON
+)
+
+LIST(APPEND external_project_dependencies zlib)
diff --git a/cmake/FindPythonModule.cmake b/cmake/python_module.cmake
similarity index 100%
rename from cmake/FindPythonModule.cmake
rename to cmake/python_module.cmake
diff --git a/cmake/rdma.cmake b/cmake/rdma.cmake
index e9a4da79aa..9ff1a77cac 100644
--- a/cmake/rdma.cmake
+++ b/cmake/rdma.cmake
@@ -5,72 +5,76 @@
 # svn co https://svn.baidu.com/sys/ip/trunk/rdma/thirdparty rdma/
 # we use static output in svn repositories to avoid implict bugs from not standard runtime env.
 
-set(RDMA_ROOT $ENV{RDMA_ROOT} CACHE PATH "Folder contains RDMA sock library and thirdparty library")
+if(WITH_RDMA)
+  set(RDMA_ROOT $ENV{RDMA_ROOT} CACHE PATH "Folder contains RDMA sock library and thirdparty library")
 
-function(generate_rdma_links)
-  #redirect to current DIR to isolate the pollution from system runtime environment
-  #it can benifits unified control for different gcc environment. 
-  #e.g, by default gcc48 did not refer /usr/lib64 which could contain low version
-  #runtime libraries that will crash process while loading it. That redirect trick
-  #can fix it.
-  execute_process(
-    COMMAND mkdir -p librdma
-    COMMAND ln -s -f /usr/lib64/libibverbs.so.1.0.0 librdma/libibverbs.so.1
-    COMMAND ln -s -f /usr/lib64/libibverbs.so.1.0.0 librdma/libibverbs.so
-    COMMAND ln -s -f /usr/lib64/librdmacm.so.1.0.0 librdma/librdmacm.so.1
-    COMMAND ln -s -f /usr/lib64/librdmacm.so.1.0.0 librdma/librdmacm.so 
-    WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
-  )
-endfunction(generate_rdma_links)
-
-
-#check and set headers
-find_path(RDMA_INC_SXISOCK sxi_sock.h PATHS ${RDMA_ROOT}/sockrdmav1/output/include)
-find_path(RDMA_INC_XIO libxio.h PATHS ${RDMA_ROOT}/thirdparty/output/accelio)
-find_path(RDMA_INC_EVENT event2 PATHS ${RDMA_ROOT}/thirdparty/output/libevent)
-find_path(RDMA_INC_NUMA numa.h PATHS ${RDMA_ROOT}/thirdparty/output/libnuma)
-
-#check and set libs
-find_library(RDMA_LIB_SXISOCK NAMES sxisock PATHS ${RDMA_ROOT}/sockrdmav1/output)
-find_library(RDMA_LIB_XIO NAMES xio PATHS ${RDMA_ROOT}/thirdparty/output/accelio)
-find_library(RDMA_LIB_EVENT NAMES event PATHS ${RDMA_ROOT}/thirdparty/output/libevent)
-find_library(RDMA_LIB_EVENT_CORE NAMES event_core PATHS ${RDMA_ROOT}/thirdparty/output/libevent)
-find_library(RDMA_LIB_EVENT_EXTRA NAMES event_extra PATHS ${RDMA_ROOT}/thirdparty/output/libevent)
-find_library(RDMA_LIB_EVENT_PTHREADS NAMES event_pthreads PATHS ${RDMA_ROOT}/thirdparty/output/libevent)
-find_library(RDMA_LIB_NUMA NAMES numa PATHS ${RDMA_ROOT}/thirdparty/output/libnuma)
-
-if(
-    RDMA_INC_SXISOCK AND
-    RDMA_INC_XIO AND
-    RDMA_INC_EVENT AND
-    RDMA_INC_NUMA AND
-    RDMA_LIB_SXISOCK AND 
-    RDMA_LIB_XIO AND
-    RDMA_LIB_EVENT AND
-    RDMA_LIB_EVENT_CORE AND
-    RDMA_LIB_EVENT_EXTRA AND
-    RDMA_LIB_EVENT_PTHREADS AND
-    RDMA_LIB_NUMA
+  function(generate_rdma_links)
+    #redirect to current DIR to isolate the pollution from system runtime environment
+    #it can benifits unified control for different gcc environment. 
+    #e.g, by default gcc48 did not refer /usr/lib64 which could contain low version
+    #runtime libraries that will crash process while loading it. That redirect trick
+    #can fix it.
+    execute_process(
+      COMMAND mkdir -p librdma
+      COMMAND ln -s -f /usr/lib64/libibverbs.so.1.0.0 librdma/libibverbs.so.1
+      COMMAND ln -s -f /usr/lib64/libibverbs.so.1.0.0 librdma/libibverbs.so
+      COMMAND ln -s -f /usr/lib64/librdmacm.so.1.0.0 librdma/librdmacm.so.1
+      COMMAND ln -s -f /usr/lib64/librdmacm.so.1.0.0 librdma/librdmacm.so 
+      WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
     )
+  endfunction(generate_rdma_links)
 
-  set(RDMA_INC_DIR 
-    ${RDMA_INC_SXISOCK} 
-    ${RDMA_INC_XIO}
-    ${RDMA_INC_EVENT}
-    ${RDMA_INC_NUMA})
-  set(RDMA_LIBS  
-    ${RDMA_LIB_SXISOCK} 
-    ${RDMA_LIB_XIO} 
-    ${RDMA_LIB_EVENT} 
-    ${RDMA_LIB_EVENT_CORE} 
-    ${RDMA_LIB_EVENT_EXTRA} 
-    ${RDMA_LIB_EVENT_PTHREADS} 
-    ${RDMA_LIB_NUMA} 
-    )
-  set(RDMA_LD_FLAGS "-L./librdma -libverbs -lrdmacm -Xlinker -rpath ./librdma")
-  return()
-endif()
+  #check and set headers
+  find_path(RDMA_INC_SXISOCK sxi_sock.h PATHS ${RDMA_ROOT}/sockrdmav1/output/include)
+  find_path(RDMA_INC_XIO libxio.h PATHS ${RDMA_ROOT}/thirdparty/output/accelio)
+  find_path(RDMA_INC_EVENT event2 PATHS ${RDMA_ROOT}/thirdparty/output/libevent)
+  find_path(RDMA_INC_NUMA numa.h PATHS ${RDMA_ROOT}/thirdparty/output/libnuma)
+
+  #check and set libs
+  find_library(RDMA_LIB_SXISOCK NAMES sxisock PATHS ${RDMA_ROOT}/sockrdmav1/output)
+  find_library(RDMA_LIB_XIO NAMES xio PATHS ${RDMA_ROOT}/thirdparty/output/accelio)
+  find_library(RDMA_LIB_EVENT NAMES event PATHS ${RDMA_ROOT}/thirdparty/output/libevent)
+  find_library(RDMA_LIB_EVENT_CORE NAMES event_core PATHS ${RDMA_ROOT}/thirdparty/output/libevent)
+  find_library(RDMA_LIB_EVENT_EXTRA NAMES event_extra PATHS ${RDMA_ROOT}/thirdparty/output/libevent)
+  find_library(RDMA_LIB_EVENT_PTHREADS NAMES event_pthreads PATHS ${RDMA_ROOT}/thirdparty/output/libevent)
+  find_library(RDMA_LIB_NUMA NAMES numa PATHS ${RDMA_ROOT}/thirdparty/output/libnuma)
 
-#if this module is not called, RDMA_INC_DIR RDMA_LIBS will be null, so top module always refer this variable
+  if(
+      RDMA_INC_SXISOCK AND
+      RDMA_INC_XIO AND
+      RDMA_INC_EVENT AND
+      RDMA_INC_NUMA AND
+      RDMA_LIB_SXISOCK AND 
+      RDMA_LIB_XIO AND
+      RDMA_LIB_EVENT AND
+      RDMA_LIB_EVENT_CORE AND
+      RDMA_LIB_EVENT_EXTRA AND
+      RDMA_LIB_EVENT_PTHREADS AND
+      RDMA_LIB_NUMA
+      )
 
-message(FATAL_ERROR, "RDMA libraries are not found, try to set RDMA_ROOT or check all related libraries.")
+    set(RDMA_INC_DIR 
+      ${RDMA_INC_SXISOCK} 
+      ${RDMA_INC_XIO}
+      ${RDMA_INC_EVENT}
+      ${RDMA_INC_NUMA})
+    set(RDMA_LIBS  
+      ${RDMA_LIB_SXISOCK} 
+      ${RDMA_LIB_XIO} 
+      ${RDMA_LIB_EVENT} 
+      ${RDMA_LIB_EVENT_CORE} 
+      ${RDMA_LIB_EVENT_EXTRA} 
+      ${RDMA_LIB_EVENT_PTHREADS} 
+      ${RDMA_LIB_NUMA} 
+      )
+    set(RDMA_LD_FLAGS "-L./librdma -libverbs -lrdmacm -Xlinker -rpath ./librdma")
+    include_directories("${RDMA_INC_DIR}")
+  else()
+    #if this module is not called, RDMA_INC_DIR RDMA_LIBS will be null, so top module always refer this variable
+    message(FATAL_ERROR, "RDMA libraries are not found, try to set RDMA_ROOT or check all related libraries.")
+  endif()
+else(WITH_RDMA)
+  set(RDMA_LIBS "")
+  set(RDMA_LD_FLAGS "")
+  add_definitions(-DPADDLE_DISABLE_RDMA)
+endif(WITH_RDMA)
diff --git a/cmake/FindAVX.cmake b/cmake/simd.cmake
similarity index 100%
rename from cmake/FindAVX.cmake
rename to cmake/simd.cmake
diff --git a/cmake/swig.cmake b/cmake/swig.cmake
deleted file mode 100644
index 97e87aa947..0000000000
--- a/cmake/swig.cmake
+++ /dev/null
@@ -1,15 +0,0 @@
-function(generate_python_api target_name)
-    add_custom_command(OUTPUT ${PROJ_ROOT}/paddle/py_paddle/swig_paddle.py
-                              ${PROJ_ROOT}/paddle/Paddle_wrap.cxx
-                              ${PROJ_ROOT}/paddle/Paddle_wrap.h
-        COMMAND swig -python -c++ -outcurrentdir -I../ api/Paddle.swig
-                && mv ${PROJ_ROOT}/paddle/swig_paddle.py ${PROJ_ROOT}/paddle/py_paddle/swig_paddle.py
-        DEPENDS ${PROJ_ROOT}/paddle/api/Paddle.swig
-                ${PROJ_ROOT}/paddle/api/PaddleAPI.h
-        WORKING_DIRECTORY ${PROJ_ROOT}/paddle
-        COMMENT "Generate Python API from swig")
-    add_custom_target(${target_name} ALL DEPENDS
-                ${PROJ_ROOT}/paddle/Paddle_wrap.cxx
-                ${PROJ_ROOT}/paddle/Paddle_wrap.h
-                ${PROJ_ROOT}/paddle/py_paddle/swig_paddle.py)
-endfunction(generate_python_api)
diff --git a/cmake/system.cmake b/cmake/system.cmake
new file mode 100644
index 0000000000..788db404eb
--- /dev/null
+++ b/cmake/system.cmake
@@ -0,0 +1,53 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+# http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+IF(WIN32)
+    SET(HOST_SYSTEM "win32")
+ELSE(WIN32)
+    IF(APPLE)
+        EXEC_PROGRAM (sw_vers ARGS -productVersion OUTPUT_VARIABLE MACOSX_VERSION)
+        STRING(REGEX MATCH "[0-9]+.[0-9]+" VERSION "${MACOSX_VERSION}")
+        SET(MACOS_VERSION ${VERSION})
+        SET(HOST_SYSTEM "macosx")
+    ELSE(APPLE)
+        IF(EXISTS "/etc/issue")
+            FILE(READ "/etc/issue" LINUX_ISSUE)
+            IF(LINUX_ISSUE MATCHES "CentOS")
+                SET(HOST_SYSTEM "centos")
+            ELSEIF(LINUX_ISSUE MATCHES "Debian")
+                SET(HOST_SYSTEM "debian")
+            ELSEIF(LINUX_ISSUE MATCHES "Ubuntu")
+                SET(HOST_SYSTEM "ubuntu")
+            ENDIF()
+        ENDIF(EXISTS "/etc/issue")
+    ENDIF(APPLE)
+ENDIF(WIN32)
+
+# query number of logical cores
+CMAKE_HOST_SYSTEM_INFORMATION(RESULT CPU_CORES QUERY NUMBER_OF_LOGICAL_CORES)
+
+MARK_AS_ADVANCED(HOST_SYSTEM CPU_CORES)
+
+MESSAGE(STATUS "Found Paddle host system: ${HOST_SYSTEM}")
+MESSAGE(STATUS "Found Paddle host system's CPU: ${CPU_CORES} cores")
+
+# external dependencies log output
+SET(EXTERNAL_PROJECT_LOG_ARGS
+    LOG_DOWNLOAD    0     # Wrap download in script to log output
+    LOG_UPDATE      1     # Wrap update in script to log output
+    LOG_CONFIGURE   1     # Wrap configure in script to log output
+    LOG_BUILD       1     # Wrap build in script to log output
+    LOG_TEST        1     # Wrap test in script to log output
+    LOG_INSTALL     1     # Wrap install in script to log output
+)
diff --git a/cmake/util.cmake b/cmake/util.cmake
index 8a71b23c62..7da52bb758 100644
--- a/cmake/util.cmake
+++ b/cmake/util.cmake
@@ -24,7 +24,7 @@ function(target_circle_link_libraries TARGET_NAME)
                 list(APPEND libsInArgn ${arg})
             endif()
         endforeach()
-        if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
+        if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang" OR "${CMAKE_CXX_COMPILER_ID}" STREQUAL "AppleClang")
             list(APPEND LIBS "-undefined dynamic_lookup")
         endif()
         list(REVERSE libsInArgn)
@@ -81,18 +81,6 @@ function(link_paddle_exe TARGET_NAME)
         set(METRIC_LIBS "")
     endif()
 
-    if(PADDLE_WITH_INTERNAL)
-        set(INTERAL_LIBS paddle_internal_gserver paddle_internal_parameter)
-        target_circle_link_libraries(${TARGET_NAME}
-            ARCHIVE_START
-            paddle_internal_gserver
-            paddle_internal_owlqn
-            ARCHIVE_END
-            paddle_internal_parameter)
-    else()
-        set(INTERAL_LIBS "")
-    endif()
-
     target_circle_link_libraries(${TARGET_NAME}
         ARCHIVE_START
         paddle_gserver
@@ -107,26 +95,16 @@ function(link_paddle_exe TARGET_NAME)
         paddle_parameter
         paddle_proto
         paddle_cuda
-        paddle_test_main
         ${METRIC_LIBS}
-        ${PROTOBUF_LIBRARY}
-        ${LIBGLOG_LIBRARY}
-        ${GFLAGS_LIBRARIES}
+        ${EXTERNAL_LIBS}
         ${CMAKE_THREAD_LIBS_INIT}
-        ${CBLAS_LIBS}
-        ${ZLIB_LIBRARIES}
-        ${INTERAL_LIBS}
-        ${CMAKE_DL_LIBS})
-
-    if(WITH_RDMA)
-        target_link_libraries(${TARGET_NAME}
-            ${RDMA_LD_FLAGS}
-            ${RDMA_LIBS})
-    endif()
+        ${CMAKE_DL_LIBS}
+        ${RDMA_LD_FLAGS}
+        ${RDMA_LIBS})
 
     if(WITH_PYTHON)
         target_link_libraries(${TARGET_NAME}
-            ${PYTHON_LIBRARIES})
+            ${PYTHON_LIBRARIES} util)
     endif()
 
     if(WITH_GPU)
@@ -143,10 +121,7 @@ function(link_paddle_exe TARGET_NAME)
         endif()
     endif()
 
-    if(NOT WITH_DSO)
-        target_link_libraries(${TARGET_NAME}
-            ${WARPCTC_LIBRARY})
-    endif()
+    add_dependencies(${TARGET_NAME} ${external_project_dependencies})
 endfunction()
 
 # link_paddle_test
@@ -155,8 +130,10 @@ endfunction()
 # Rest Arguemnts: not used.
 function(link_paddle_test TARGET_NAME)
     link_paddle_exe(${TARGET_NAME})
-    target_link_libraries(${TARGET_NAME} ${GTEST_MAIN_LIBRARIES}
-        ${GTEST_LIBRARIES})
+    target_link_libraries(${TARGET_NAME}
+                          paddle_test_main
+                          paddle_test_util
+                          ${GTEST_LIBRARIES})
 endfunction()
 
 # add_unittest_without_exec
diff --git a/cmake/version.cmake b/cmake/version.cmake
index a0518e07e8..ac1583a24c 100644
--- a/cmake/version.cmake
+++ b/cmake/version.cmake
@@ -21,4 +21,5 @@ while ("${PADDLE_VERSION}" STREQUAL "")
   endif()
 endwhile()
 
+add_definitions(-DPADDLE_VERSION=${PADDLE_VERSION})
 message(STATUS "Paddle version is ${PADDLE_VERSION}")
diff --git a/demo/mnist/.gitignore b/demo/mnist/.gitignore
index 810910fd5c..8bd9837523 100644
--- a/demo/mnist/.gitignore
+++ b/demo/mnist/.gitignore
@@ -4,3 +4,4 @@ mnist_vgg_model
 plot.png
 train.log
 *pyc
+.ipynb_checkpoints
diff --git a/demo/mnist/api_train.py b/demo/mnist/api_train.py
new file mode 100644
index 0000000000..f301da382f
--- /dev/null
+++ b/demo/mnist/api_train.py
@@ -0,0 +1,205 @@
+"""
+A very basic example for how to use current Raw SWIG API to train mnist network.
+
+Current implementation uses Raw SWIG, which means the API call is directly \
+passed to C++ side of Paddle.
+
+The user api could be simpler and carefully designed.
+"""
+import py_paddle.swig_paddle as api
+from py_paddle import DataProviderConverter
+import paddle.trainer.PyDataProvider2 as dp
+import numpy as np
+import random
+from mnist_util import read_from_mnist
+from paddle.trainer_config_helpers import *
+
+
+def optimizer_config():
+    settings(
+        learning_rate=1e-4,
+        learning_method=AdamOptimizer(),
+        batch_size=1000,
+        model_average=ModelAverage(average_window=0.5),
+        regularization=L2Regularization(rate=0.5))
+
+
+def network_config():
+    imgs = data_layer(name='pixel', size=784)
+    hidden1 = fc_layer(input=imgs, size=200)
+    hidden2 = fc_layer(input=hidden1, size=200)
+    inference = fc_layer(input=hidden2, size=10, act=SoftmaxActivation())
+    cost = classification_cost(
+        input=inference, label=data_layer(
+            name='label', size=10))
+    outputs(cost)
+
+
+def init_parameter(network):
+    assert isinstance(network, api.GradientMachine)
+    for each_param in network.getParameters():
+        assert isinstance(each_param, api.Parameter)
+        array_size = len(each_param)
+        array = np.random.uniform(-1.0, 1.0, array_size).astype('float32')
+        each_param.getBuf(api.PARAMETER_VALUE).copyFromNumpyArray(array)
+
+
+def generator_to_batch(generator, batch_size):
+    ret_val = list()
+    for each_item in generator:
+        ret_val.append(each_item)
+        if len(ret_val) == batch_size:
+            yield ret_val
+            ret_val = list()
+    if len(ret_val) != 0:
+        yield ret_val
+
+
+class BatchPool(object):
+    def __init__(self, generator, batch_size):
+        self.data = list(generator)
+        self.batch_size = batch_size
+
+    def __call__(self):
+        random.shuffle(self.data)
+        for offset in xrange(0, len(self.data), self.batch_size):
+            limit = min(offset + self.batch_size, len(self.data))
+            yield self.data[offset:limit]
+
+
+def input_order_converter(generator):
+    for each_item in generator:
+        yield each_item['pixel'], each_item['label']
+
+
+def main():
+    api.initPaddle("-use_gpu=false", "-trainer_count=4")  # use 4 cpu cores
+
+    # get enable_types for each optimizer.
+    # enable_types = [value, gradient, momentum, etc]
+    # For each optimizer(SGD, Adam), GradientMachine should enable different
+    # buffers.
+    opt_config_proto = parse_optimizer_config(optimizer_config)
+    opt_config = api.OptimizationConfig.createFromProto(opt_config_proto)
+    _temp_optimizer_ = api.ParameterOptimizer.create(opt_config)
+    enable_types = _temp_optimizer_.getParameterTypes()
+
+    # Create Simple Gradient Machine.
+    model_config = parse_network_config(network_config)
+    m = api.GradientMachine.createFromConfigProto(
+        model_config, api.CREATE_MODE_NORMAL, enable_types)
+
+    # This type check is not useful. Only enable type hint in IDE.
+    # Such as PyCharm
+    assert isinstance(m, api.GradientMachine)
+
+    # Initialize Parameter by numpy.
+    init_parameter(network=m)
+
+    # Create Local Updater. Local means not run in cluster.
+    # For a cluster training, here we can change to createRemoteUpdater
+    # in future.
+    updater = api.ParameterUpdater.createLocalUpdater(opt_config)
+    assert isinstance(updater, api.ParameterUpdater)
+
+    # Initialize ParameterUpdater.
+    updater.init(m)
+
+    # DataProvider Converter is a utility convert Python Object to Paddle C++
+    # Input. The input format is as same as Paddle's DataProvider.
+    converter = DataProviderConverter(
+        input_types=[dp.dense_vector(784), dp.integer_value(10)])
+
+    train_file = './data/raw_data/train'
+    test_file = './data/raw_data/t10k'
+
+    # start gradient machine.
+    # the gradient machine must be started before invoke forward/backward.
+    # not just for training, but also for inference.
+    m.start()
+
+    # evaluator can print error rate, etc. It is a C++ class.
+    batch_evaluator = m.makeEvaluator()
+    test_evaluator = m.makeEvaluator()
+
+    # Get Train Data.
+    # TrainData will stored in a data pool. Currently implementation is not care
+    # about memory, speed. Just a very naive implementation.
+    train_data_generator = input_order_converter(read_from_mnist(train_file))
+    train_data = BatchPool(train_data_generator, 512)
+
+    # outArgs is Neural Network forward result. Here is not useful, just passed
+    # to gradient_machine.forward
+    outArgs = api.Arguments.createArguments(0)
+
+    for pass_id in xrange(2):  # we train 2 passes.
+        updater.startPass()
+
+        for batch_id, data_batch in enumerate(train_data()):
+            # data_batch is input images.
+            # here, for online learning, we could get data_batch from network.
+
+            # Start update one batch.
+            pass_type = updater.startBatch(len(data_batch))
+
+            # Start BatchEvaluator.
+            # batch_evaluator can be used between start/finish.
+            batch_evaluator.start()
+
+            # forwardBackward is a shortcut for forward and backward.
+            # It is sometimes faster than invoke forward/backward separately,
+            # because in GradientMachine, it may be async.
+            m.forwardBackward(converter(data_batch), outArgs, pass_type)
+
+            for each_param in m.getParameters():
+                updater.update(each_param)
+
+            # Get cost. We use numpy to calculate total cost for this batch.
+            cost_vec = outArgs.getSlotValue(0)
+            cost_vec = cost_vec.copyToNumpyMat()
+            cost = cost_vec.sum() / len(data_batch)
+
+            # Make evaluator works.
+            m.eval(batch_evaluator)
+
+            # Print logs.
+            print 'Pass id', pass_id, 'Batch id', batch_id, 'with cost=', \
+                cost, batch_evaluator
+
+            batch_evaluator.finish()
+            # Finish batch.
+            #  * will clear gradient.
+            #  * ensure all values should be updated.
+            updater.finishBatch(cost)
+
+        # testing stage. use test data set to test current network.
+        updater.apply()
+        test_evaluator.start()
+        test_data_generator = input_order_converter(read_from_mnist(test_file))
+        for data_batch in generator_to_batch(test_data_generator, 512):
+            # in testing stage, only forward is needed.
+            m.forward(converter(data_batch), outArgs, api.PASS_TEST)
+            m.eval(test_evaluator)
+
+        # print error rate for test data set
+        print 'Pass', pass_id, ' test evaluator: ', test_evaluator
+        test_evaluator.finish()
+        updater.restore()
+
+        updater.catchUpWith()
+        params = m.getParameters()
+        for each_param in params:
+            assert isinstance(each_param, api.Parameter)
+            value = each_param.getBuf(api.PARAMETER_VALUE)
+            value = value.copyToNumpyArray()
+
+            # Here, we could save parameter to every where you want
+            print each_param.getName(), value
+
+        updater.finishPass()
+
+    m.finish()
+
+
+if __name__ == '__main__':
+    main()
diff --git a/demo/mnist/mnist_provider.py b/demo/mnist/mnist_provider.py
index 4635833d36..888cfef1e7 100644
--- a/demo/mnist/mnist_provider.py
+++ b/demo/mnist/mnist_provider.py
@@ -1,5 +1,5 @@
 from paddle.trainer.PyDataProvider2 import *
-import numpy
+from mnist_util import read_from_mnist
 
 
 # Define a py data provider
@@ -8,27 +8,5 @@ import numpy
                  'label': integer_value(10)},
     cache=CacheType.CACHE_PASS_IN_MEM)
 def process(settings, filename):  # settings is not used currently.
-    imgf = filename + "-images-idx3-ubyte"
-    labelf = filename + "-labels-idx1-ubyte"
-    f = open(imgf, "rb")
-    l = open(labelf, "rb")
-
-    f.read(16)
-    l.read(8)
-
-    # Define number of samples for train/test
-    if "train" in filename:
-        n = 60000
-    else:
-        n = 10000
-
-    images = numpy.fromfile(
-        f, 'ubyte', count=n * 28 * 28).reshape((n, 28 * 28)).astype('float32')
-    images = images / 255.0 * 2.0 - 1.0
-    labels = numpy.fromfile(l, 'ubyte', count=n).astype("int")
-
-    for i in xrange(n):
-        yield {"pixel": images[i, :], 'label': labels[i]}
-
-    f.close()
-    l.close()
+    for each in read_from_mnist(filename):
+        yield each
diff --git a/demo/mnist/mnist_util.py b/demo/mnist/mnist_util.py
new file mode 100644
index 0000000000..3fd88ae7ed
--- /dev/null
+++ b/demo/mnist/mnist_util.py
@@ -0,0 +1,30 @@
+import numpy
+
+__all__ = ['read_from_mnist']
+
+
+def read_from_mnist(filename):
+    imgf = filename + "-images-idx3-ubyte"
+    labelf = filename + "-labels-idx1-ubyte"
+    f = open(imgf, "rb")
+    l = open(labelf, "rb")
+
+    f.read(16)
+    l.read(8)
+
+    # Define number of samples for train/test
+    if "train" in filename:
+        n = 60000
+    else:
+        n = 10000
+
+    images = numpy.fromfile(
+        f, 'ubyte', count=n * 28 * 28).reshape((n, 28 * 28)).astype('float32')
+    images = images / 255.0 * 2.0 - 1.0
+    labels = numpy.fromfile(l, 'ubyte', count=n).astype("int")
+
+    for i in xrange(n):
+        yield {"pixel": images[i, :], 'label': labels[i]}
+
+    f.close()
+    l.close()
diff --git a/demo/model_zoo/embedding/pre_DictAndModel.sh b/demo/model_zoo/embedding/pre_DictAndModel.sh
index f97ef26107..f61c65a935 100755
--- a/demo/model_zoo/embedding/pre_DictAndModel.sh
+++ b/demo/model_zoo/embedding/pre_DictAndModel.sh
@@ -14,9 +14,19 @@
 # limitations under the License.
 set -e
 set -x
+BASE_URL='http://paddlepaddle.cdn.bcebos.com/model_zoo/embedding'
 
-# download the dictionary and pretrained model 
-for file in baidu.dict model_32.emb model_64.emb model_128.emb model_256.emb
-do 
-  wget http://paddlepaddle.bj.bcebos.com/model_zoo/embedding/$file
+DOWNLOAD_ITEMS=(baidu.dict model_32.emb model_64.emb model_128.emb model_256.emb)
+ITEM_MD5=(fa03a12321eaab6c30a8fcc9442eaea3
+          f88c8325ee6da6187f1080e8fe66c1cd
+          927cf70f27f860aff1a5703ebf7f1584
+	  a52e43655cd25d279777ed509a1ae27b
+	  b92c67fe9ff70fea53596080e351ac80)
+
+for ((i=0; i<${#ITEM_MD5[@]}; i++))
+do
+  FILENAME=${DOWNLOAD_ITEMS[${i}]}
+  REAL_MD5=`wget ${BASE_URL}/${FILENAME} -O - | tee ${FILENAME} | md5sum | cut -d ' ' -f 1`
+  EXPECTED_MD5=${ITEM_MD5[${i}]}
+  [ "${EXPECTED_MD5}" = "${REAL_MD5}" ]
 done
diff --git a/demo/quick_start/api_predict.sh b/demo/quick_start/api_predict.sh
index c90d3b7054..4d9aa9e885 100755
--- a/demo/quick_start/api_predict.sh
+++ b/demo/quick_start/api_predict.sh
@@ -17,7 +17,7 @@ set -e
 #Note the default model is pass-00002, you shold make sure the model path
 #exists or change the mode path.
 #only test on trainer_config.lr.py
-model=output/pass-00001/
+model=output/model/pass-00001/
 config=trainer_config.lr.py
 label=data/labels.list
 dict=data/dict.txt
diff --git a/demo/quick_start/cluster/cluster_train.sh b/demo/quick_start/cluster/cluster_train.sh
new file mode 100755
index 0000000000..aac9b89b14
--- /dev/null
+++ b/demo/quick_start/cluster/cluster_train.sh
@@ -0,0 +1,44 @@
+#!/bin/bash
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+set -e
+
+# Should run pserver.sh before run this script.
+bin_dir=$(cd `dirname $0`; pwd)
+home_dir=$(cd "${bin_dir}/.."; pwd)
+source "$bin_dir/env.sh"
+
+model_dir="$bin_dir/output"
+log_file="$bin_dir/train.log"
+
+pushd "$home_dir"
+cfg=trainer_config.lr.py
+paddle train \
+  --config=$cfg \
+  --save_dir=${model_dir} \
+  --trainer_count=4 \
+  --local=0 \
+  --log_period=100 \
+  --num_passes=15 \
+  --use_gpu=false \
+  --show_parameter_stats_period=100 \
+  --test_all_data_in_one_period=1 \
+  --num_gradient_servers=1 \
+  --nics=`get_nics` \
+  --port=7164 \
+  --ports_num=1 \
+  --pservers="127.0.0.1" \
+  --comment="paddle_trainer" \
+  2>&1 | tee "$log_file"
+popd
diff --git a/demo/quick_start/cluster/env.sh b/demo/quick_start/cluster/env.sh
new file mode 100644
index 0000000000..a404993835
--- /dev/null
+++ b/demo/quick_start/cluster/env.sh
@@ -0,0 +1,28 @@
+#!/bin/bash
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+set -e
+
+function get_nics() {
+  machine=`uname -s`
+  local nics=""
+  if [ "$machine" == "Linux" ]; then
+    nics="lo"
+  elif [ "$machine" == "Darwin" ]; then
+    nics="lo0"
+  else
+    nics="unsupport"
+  fi
+  echo $nics
+}
diff --git a/demo/quick_start/cluster/pserver.sh b/demo/quick_start/cluster/pserver.sh
new file mode 100755
index 0000000000..b187c1d9b9
--- /dev/null
+++ b/demo/quick_start/cluster/pserver.sh
@@ -0,0 +1,26 @@
+#!/bin/bash
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+set -e
+bin_dir=$(cd `dirname $0`; pwd)
+source "$bin_dir/env.sh"
+
+paddle pserver \
+  --nics=`get_nics` \
+  --port=7164 \
+  --ports_num=1 \
+  --ports_num_for_sparse=1 \
+  --num_gradient_servers=1 \
+  --comment="paddle_pserver" \
+  2>&1 | tee 'pserver.log'
diff --git a/demo/recommendation/evaluate.py b/demo/recommendation/evaluate.py
new file mode 100755
index 0000000000..3afa7a1e9d
--- /dev/null
+++ b/demo/recommendation/evaluate.py
@@ -0,0 +1,37 @@
+#!/usr/bin/python
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import sys
+import re
+import math
+
+
+def get_best_pass(log_filename):
+    with open(log_filename, 'r') as f:
+        text = f.read()
+        pattern = re.compile('Test.*? cost=([0-9]+\.[0-9]+).*?pass-([0-9]+)',
+                             re.S)
+        results = re.findall(pattern, text)
+        sorted_results = sorted(results, key=lambda result: float(result[0]))
+        return sorted_results[0]
+
+
+log_filename = sys.argv[1]
+log = get_best_pass(log_filename)
+predict_error = math.sqrt(float(log[0])) / 2
+print 'Best pass is %s, error is %s, which means predict get error as %f' % (
+    log[1], log[0], predict_error)
+
+evaluate_pass = "output/pass-%s" % log[1]
+print "evaluating from pass %s" % evaluate_pass
diff --git a/demo/traffic_prediction/README b/demo/traffic_prediction/README
new file mode 100644
index 0000000000..4c95188583
--- /dev/null
+++ b/demo/traffic_prediction/README
@@ -0,0 +1,7 @@
+run by:
+cd ./data
+sh get_data.sh
+cd ..
+sh train.sh
+sh predict.sh
+
diff --git a/demo/traffic_prediction/data/get_data.sh b/demo/traffic_prediction/data/get_data.sh
new file mode 100755
index 0000000000..f2fa548d47
--- /dev/null
+++ b/demo/traffic_prediction/data/get_data.sh
@@ -0,0 +1,34 @@
+#!/bin/bash
+# Copyright (c) 2016 PaddlePaddle Authors, Inc. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -e
+set -x
+
+DIR="$( cd "$(dirname "$0")" ; pwd -P )"
+cd $DIR
+
+#download the dataset
+echo "Downloading traffic data..."
+wget http://paddlepaddle.cdn.bcebos.com/demo/traffic/traffic_data.tar.gz
+
+#extract package
+echo "Unzipping..."
+tar -zxvf traffic_data.tar.gz
+
+echo "data/speeds.csv" > train.list
+echo "data/speeds.csv" > test.list
+echo "data/speeds.csv" > pred.list
+
+echo "Done."
diff --git a/demo/traffic_prediction/dataprovider.py b/demo/traffic_prediction/dataprovider.py
new file mode 100644
index 0000000000..c7883b6950
--- /dev/null
+++ b/demo/traffic_prediction/dataprovider.py
@@ -0,0 +1,82 @@
+# Copyright (c) 2016 PaddlePaddle Authors, Inc. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle.trainer.PyDataProvider2 import *
+import sys
+import numpy as np
+TERM_NUM = 24
+FORECASTING_NUM = 24
+LABEL_VALUE_NUM = 4
+
+
+def initHook(settings, file_list, **kwargs):
+    """
+    Init hook is invoked before process data. It will set obj.slots and store data meta.
+
+    :param settings: global object. It will passed to process routine.
+    :type obj: object
+    :param file_list: the meta file object, which passed from trainer_config.py,but unused in this function.
+    :param kwargs: unused other arguments.
+    """
+    del kwargs  #unused 
+
+    settings.pool_size = sys.maxint
+    #Use a time seires of the past as feature.
+    #Dense_vector's expression form is [float,float,...,float]
+    settings.input_types = [dense_vector(TERM_NUM)]
+    #There are next FORECASTING_NUM fragments you need predict.
+    #Every predicted condition at time point has four states.
+    for i in range(FORECASTING_NUM):
+        settings.input_types.append(integer_value(LABEL_VALUE_NUM))
+
+
+@provider(
+    init_hook=initHook, cache=CacheType.CACHE_PASS_IN_MEM, should_shuffle=True)
+def process(settings, file_name):
+    with open(file_name) as f:
+        #abandon fields name
+        f.next()
+        for row_num, line in enumerate(f):
+            speeds = map(int, line.rstrip('\r\n').split(",")[1:])
+            # Get the max index.
+            end_time = len(speeds)
+            # Scanning and generating samples
+            for i in range(TERM_NUM, end_time - FORECASTING_NUM):
+                # For dense slot
+                pre_spd = map(float, speeds[i - TERM_NUM:i])
+
+                # Integer value need predicting, values start from 0, so every one minus 1.
+                fol_spd = [j - 1 for j in speeds[i:i + FORECASTING_NUM]]
+
+                # Predicting label is missing, abandon the sample.
+                if -1 in fol_spd:
+                    continue
+                yield [pre_spd] + fol_spd
+
+
+def predict_initHook(settings, file_list, **kwargs):
+    settings.pool_size = sys.maxint
+    settings.input_types = [dense_vector(TERM_NUM)]
+
+
+@provider(init_hook=predict_initHook, should_shuffle=False)
+def process_predict(settings, file_name):
+    with open(file_name) as f:
+        #abandon fields name
+        f.next()
+        for row_num, line in enumerate(f):
+            speeds = map(int, line.rstrip('\r\n').split(","))
+            end_time = len(speeds)
+            pre_spd = map(float, speeds[end_time - TERM_NUM:end_time])
+            yield pre_spd
diff --git a/demo/traffic_prediction/gen_result.py b/demo/traffic_prediction/gen_result.py
new file mode 100644
index 0000000000..3da70b3031
--- /dev/null
+++ b/demo/traffic_prediction/gen_result.py
@@ -0,0 +1,61 @@
+# Copyright (c) 2016 PaddlePaddle Authors, Inc. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+res = []
+with open('./rank-00000') as f:
+    for line in f:
+        pred = map(int, line.strip('\r\n;').split(";"))
+        #raw prediction range from 0 to 3
+        res.append([i + 1 for i in pred])
+
+file_name = open('./data/pred.list').read().strip('\r\n')
+
+FORECASTING_NUM = 24
+header = [
+    'id',
+    '201604200805',
+    '201604200810',
+    '201604200815',
+    '201604200820',
+    '201604200825',
+    '201604200830',
+    '201604200835',
+    '201604200840',
+    '201604200845',
+    '201604200850',
+    '201604200855',
+    '201604200900',
+    '201604200905',
+    '201604200910',
+    '201604200915',
+    '201604200920',
+    '201604200925',
+    '201604200930',
+    '201604200935',
+    '201604200940',
+    '201604200945',
+    '201604200950',
+    '201604200955',
+    '201604201000',
+]
+###################
+## To CSV format ##
+###################
+with open(file_name) as f:
+    f.next()
+    print ','.join(header)
+    for row_num, line in enumerate(f):
+        fields = line.rstrip('\r\n').split(',')
+        linkid = fields[0]
+        print linkid + ',' + ','.join(map(str, res[row_num]))
diff --git a/demo/traffic_prediction/predict.sh b/demo/traffic_prediction/predict.sh
new file mode 100755
index 0000000000..cec35dce11
--- /dev/null
+++ b/demo/traffic_prediction/predict.sh
@@ -0,0 +1,30 @@
+#!/bin/bash
+# Copyright (c) 2016 PaddlePaddle Authors, Inc. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+set -e
+
+cfg=trainer_config.py
+# pass choice 
+model="output/pass-00000"
+paddle train \
+    --config=$cfg \
+    --use_gpu=false \
+    --job=test \
+    --init_model_path=$model \
+    --config_args=is_predict=1 \
+    --predict_output_dir=. 
+
+python gen_result.py > result.txt
+
+rm -rf rank-00000
diff --git a/demo/traffic_prediction/train.sh b/demo/traffic_prediction/train.sh
new file mode 100755
index 0000000000..48dfc5604f
--- /dev/null
+++ b/demo/traffic_prediction/train.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+# Copyright (c) 2016 PaddlePaddle Authors, Inc. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+set -e
+
+cfg=trainer_config.py
+paddle train \
+  --config=$cfg \
+  --save_dir=./output \
+  --trainer_count=4 \
+  --log_period=1000 \
+  --dot_period=10 \
+  --num_passes=10 \
+  --use_gpu=false \
+  --show_parameter_stats_period=3000 \
+  2>&1 | tee 'train.log'
diff --git a/demo/traffic_prediction/trainer_config.py b/demo/traffic_prediction/trainer_config.py
new file mode 100755
index 0000000000..52d678624a
--- /dev/null
+++ b/demo/traffic_prediction/trainer_config.py
@@ -0,0 +1,52 @@
+# Copyright (c) 2016 PaddlePaddle Authors, Inc. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from paddle.trainer_config_helpers import *
+
+################################### DATA Configuration #############################################
+is_predict = get_config_arg('is_predict', bool, False)
+trn = './data/train.list' if not is_predict else None
+tst = './data/test.list' if not is_predict else './data/pred.list'
+process = 'process' if not is_predict else 'process_predict'
+define_py_data_sources2(
+    train_list=trn, test_list=tst, module="dataprovider", obj=process)
+################################### Parameter Configuaration #######################################
+TERM_NUM = 24
+FORECASTING_NUM = 24
+emb_size = 16
+batch_size = 128 if not is_predict else 1
+settings(
+    batch_size=batch_size,
+    learning_rate=1e-3,
+    learning_method=RMSPropOptimizer())
+################################### Algorithm Configuration ########################################
+
+output_label = []
+
+link_encode = data_layer(name='link_encode', size=TERM_NUM)
+for i in xrange(FORECASTING_NUM):
+    # Each task share same weight.
+    link_param = ParamAttr(
+        name='_link_vec.w', initial_max=1.0, initial_min=-1.0)
+    link_vec = fc_layer(input=link_encode, size=emb_size, param_attr=link_param)
+    score = fc_layer(input=link_vec, size=4, act=SoftmaxActivation())
+    if is_predict:
+        maxid = maxid_layer(score)
+        output_label.append(maxid)
+    else:
+        # Multi-task training.
+        label = data_layer(name='label_%dmin' % ((i + 1) * 5), size=4)
+        cls = classification_cost(
+            input=score, name="cost_%dmin" % ((i + 1) * 5), label=label)
+        output_label.append(cls)
+outputs(output_label)
diff --git a/doc/faq/index_cn.rst b/doc/faq/index_cn.rst
index ea0ef25f00..7d425a05d4 100644
--- a/doc/faq/index_cn.rst
+++ b/doc/faq/index_cn.rst
@@ -72,7 +72,7 @@ PaddlePaddle支持非常多的优化算法(Optimizer)，不同的优化算法需
 减少数据载入的耗时
 ++++++++++++++++++
 
-使用 :code:`pydataprovider`时，可以减少缓存池的大小，同时设置内存缓存功能，即可以极大的加速数据载入流程。
+使用\ :code:`pydataprovider`\ 时，可以减少缓存池的大小，同时设置内存缓存功能，即可以极大的加速数据载入流程。
 :code:`DataProvider` 缓存池的减小，和之前减小通过减小缓存池来减小内存占用的原理一致。
 
 ..  literalinclude:: src/reduce_min_pool_size.py
diff --git a/doc/howto/deep_model/rnn/index_cn.rst b/doc/howto/deep_model/rnn/index_cn.rst
index 9e805ca851..9ecab5594c 100644
--- a/doc/howto/deep_model/rnn/index_cn.rst
+++ b/doc/howto/deep_model/rnn/index_cn.rst
@@ -4,6 +4,7 @@ RNN相关模型
 ..  toctree::
   :maxdepth: 1
 
+  rnn_config_cn.rst
   recurrent_group_cn.md
   hierarchical_layer_cn.rst
   hrnn_rnn_api_compare_cn.rst
diff --git a/doc/howto/deep_model/rnn/rnn_cn.md b/doc/howto/deep_model/rnn/rnn_cn.md
deleted file mode 100644
index 5ec05b2cab..0000000000
--- a/doc/howto/deep_model/rnn/rnn_cn.md
+++ /dev/null
@@ -1,226 +0,0 @@
-RNN 配置
-=================
-
-本教程将指导你如何在 PaddlePaddle 中配置循环神经网络（RNN）。PaddlePaddle 高度支持灵活和高效的循环神经网络配置。 在本教程中，您将了解如何：
-
--   准备用来学习循环神经网络的序列数据。
--   配置循环神经网络架构。
--   使用学习完成的循环神经网络模型生成序列。
-
-我们将使用 vanilla 循环神经网络和 sequence to sequence 模型来指导你完成这些步骤。sequence to sequence 模型的代码可以在`demo / seqToseq`找到。
-
-准备序列数据
----------------------
-
-PaddlePaddle 不需要对序列数据进行任何预处理，例如填充。唯一需要做的是将相应类型设置为输入。例如，以下代码段定义了三个输入。 它们都是序列，它们的大小是`src_dict`，`trg_dict`和`trg_dict`：
-
-``` sourceCode
-settings.input_types = [
-  integer_value_sequence(len(settings.src_dict)),
-  integer_value_sequence(len(settings.trg_dict)),
-  integer_value_sequence(len(settings.trg_dict))]
-```
-
-在`process`函数中，每个`yield`函数将返回三个整数列表。每个整数列表被视为一个整数序列：
-
-``` sourceCode
-yield src_ids, trg_ids, trg_ids_next
-```
-
-有关如何编写数据提供程序的更多细节描述，请参考 [PyDataProvider2](../../ui/data_provider/index.html)。完整的数据提供文件在 `demo/seqToseq/dataprovider.py`。
-
-配置循环神经网络架构
------------------------------------------------
-
-### 简单门控循环神经网络(Gated Recurrent Neural Network)
-
-循环神经网络在每个时间步骤顺序地处理序列。下面列出了 LSTM 的架构的示例。
-
-![image](../../../tutorials/sentiment_analysis/bi_lstm.jpg)
-
-一般来说，循环网络从 *t* = 1 到 *t* = *T* 或者反向地从 *t* = *T* 到 *t* = 1 执行以下操作。
-
-*x*<sub>*t* + 1</sub> = *f*<sub>*x*</sub>(*x*<sub>*t*</sub>),*y*<sub>*t*</sub> = *f*<sub>*y*</sub>(*x*<sub>*t*</sub>)
-
-其中 *f*<sub>*x*</sub>(.) 称为**单步函数**（即单时间步执行的函数，step function），而 *f*<sub>*y*</sub>(.) 称为**输出函数**。在 vanilla 循环神经网络中，单步函数和输出函数都非常简单。然而，PaddlePaddle 可以通过修改这两个函数来实现复杂的网络配置。我们将使用 sequence to sequence 模型演示如何配置复杂的循环神经网络模型。在本节中，我们将使用简单的 vanilla 循环神经网络作为使用`recurrent_group`配置简单循环神经网络的例子。 注意，如果你只需要使用简单的RNN，GRU或LSTM，那么推荐使用`grumemory`和`lstmemory`，因为它们的计算效率比`recurrent_group`更高。
-
-对于 vanilla RNN，在每个时间步长，**单步函数**为：
-
-*x*<sub>*t* + 1</sub> = *W*<sub>*x*</sub>*x*<sub>*t*</sub> + *W*<sub>*i*</sub>*I*<sub>*t*</sub> + *b*
-
-其中 *x*<sub>*t*</sub> 是RNN状态，并且 *I*<sub>*t*</sub> 是输入，*W*<sub>*x*</sub> 和 *W*<sub>*i*</sub> 分别是RNN状态和输入的变换矩阵。*b* 是偏差。它的**输出函数**只需要*x*<sub>*t*</sub>作为输出。
-
-`recurrent_group`是构建循环神经网络的最重要的工具。 它定义了**单步函数**，**输出函数**和循环神经网络的输入。注意，这个函数的`step`参数需要实现`step function`（单步函数）和`output function`（输出函数）：
-
-
-``` sourceCode
-def simple_rnn(input,
-               size=None,
-               name=None,
-               reverse=False,
-               rnn_bias_attr=None,
-               act=None,
-               rnn_layer_attr=None):
-    def __rnn_step__(ipt):
-       out_mem = memory(name=name, size=size)
-       rnn_out = mixed_layer(input = [full_matrix_projection(ipt),
-                                      full_matrix_projection(out_mem)],
-                             name = name,
-                             bias_attr = rnn_bias_attr,
-                             act = act,
-                             layer_attr = rnn_layer_attr,
-                             size = size)
-       return rnn_out
-    return recurrent_group(name='%s_recurrent_group' % name,
-                           step=__rnn_step__,
-                           reverse=reverse,
-                           input=input)
-```
-
-PaddlePaddle 使用“Memory”（记忆模块）实现单步函数。**Memory**是在PaddlePaddle中构造循环神经网络时最重要的概念。 Memory是在单步函数中循环使用的状态，例如*x*<sub>*t* + 1</sub> = *f*<sub>*x*</sub>(*x*<sub>*t*</sub>)。 一个Memory包含**输出**和**输入**。当前时间步处的Memory的输出作为下一时间步Memory的输入。Memory也可以具有**boot layer(引导层)**，其输出被用作Memory的初始值。 在我们的例子中，门控循环单元的输出被用作输出Memory。请注意，`rnn_out`层的名称与`out_mem`的名称相同。这意味着`rnn_out` (*x*<sub>*t* + 1</sub>)的输出被用作`out_mem`Memory的**输出**。
-
-Memory也可以是序列。在这种情况下，在每个时间步中，我们有一个序列作为循环神经网络的状态。这在构造非常复杂的循环神经网络时是有用的。 其他高级功能包括定义多个Memory，以及使用子序列来定义分级循环神经网络架构。
-
-我们在函数的结尾返回`rnn_out`。 这意味着 `rnn_out` 层的输出被用作门控循环神经网络的**输出**函数。
-
-### Sequence to Sequence Model with Attention
-
-我们将使用 sequence to sequence model with attention 作为例子演示如何配置复杂的循环神经网络模型。该模型的说明如下图所示。
-
-![image](../../../tutorials/text_generation/encoder-decoder-attention-model.png)
-
-在这个模型中，源序列 *S* = {*s*<sub>1</sub>, …, *s*<sub>*T*</sub>} 用双向门控循环神经网络编码。双向门控循环神经网络的隐藏状态 *H*<sub>*S*</sub> = {*H*<sub>1</sub>, …, *H*<sub>*T*</sub>} 被称为 *编码向量*。解码器是门控循环神经网络。当解读每一个*y*<sub>*t*</sub>时, 这个门控循环神经网络生成一系列权重 *W*<sub>*S*</sub><sup>*t*</sup> = {*W*<sub>1</sub><sup>*t*</sup>, …, *W*<sub>*T*</sub><sup>*t*</sup>}, 用于计算编码向量的加权和。加权和用来生成*y*<sub>*t*</sub>。
-
-模型的编码器部分如下所示。它叫做`grumemory`来表示门控循环神经网络。如果网络架构简单，那么推荐使用循环神经网络的方法，因为它比 `recurrent_group` 更快。我们已经实现了大多数常用的循环神经网络架构，可以参考 [Layers](../../ui/api/trainer_config_helpers/layers_index.html) 了解更多细节。
-
-我们还将编码向量投射到 `decoder_size` 维空间。这通过获得反向循环网络的第一个实例，并将其投射到 `decoder_size` 维空间完成：
-
-``` sourceCode
-# 定义源语句的数据层
-src_word_id = data_layer(name='source_language_word', size=source_dict_dim)
-# 计算每个词的词向量
-src_embedding = embedding_layer(
-    input=src_word_id,
-    size=word_vector_dim,
-    param_attr=ParamAttr(name='_source_language_embedding'))
-# 应用前向循环神经网络
-src_forward = grumemory(input=src_embedding, size=encoder_size)
-# 应用反向递归神经网络（reverse=True表示反向循环神经网络）
-src_backward = grumemory(input=src_embedding,
-                          size=encoder_size,
-                          reverse=True)
-# 将循环神经网络的前向和反向部分混合在一起
-encoded_vector = concat_layer(input=[src_forward, src_backward])
-
-# 投射编码向量到 decoder_size
-encoder_proj = mixed_layer(input = [full_matrix_projection(encoded_vector)],
-                           size = decoder_size)
-
-# 计算反向RNN的第一个实例
-backward_first = first_seq(input=src_backward)
-
-# 投射反向RNN的第一个实例到 decoder size
-decoder_boot = mixed_layer(input=[full_matrix_projection(backward_first)], size=decoder_size, act=TanhActivation())
-```
-
-解码器使用 `recurrent_group` 来定义循环神经网络。单步函数和输出函数在 `gru_decoder_with_attention` 中定义：
-
-``` sourceCode
-group_inputs=[StaticInput(input=encoded_vector,is_seq=True),
-              StaticInput(input=encoded_proj,is_seq=True)]
-trg_embedding = embedding_layer(
-    input=data_layer(name='target_language_word',
-                     size=target_dict_dim),
-    size=word_vector_dim,
-    param_attr=ParamAttr(name='_target_language_embedding'))
-group_inputs.append(trg_embedding)
-
-# 对于配备有注意力机制的解码器，在训练中，
-# 目标向量（groudtruth）是数据输入，
-# 而源序列的编码向量可以被无边界的memory访问
-# StaticInput 意味着不同时间步的输入都是相同的值，
-# 否则它以一个序列输入，不同时间步的输入是不同的。
-# 所有输入序列应该有相同的长度。
-decoder = recurrent_group(name=decoder_group_name,
-                          step=gru_decoder_with_attention,
-                          input=group_inputs)
-```
-
-单步函数的实现如下所示。首先，它定义解码网络的**Memory**。然后定义 attention，门控循环单元单步函数和输出函数：
-
-``` sourceCode
-def gru_decoder_with_attention(enc_vec, enc_proj, current_word):
-    # 定义解码器的Memory
-    # Memory的输出定义在 gru_step 内
-    # 注意 gru_step 应该与它的Memory名字相同
-    decoder_mem = memory(name='gru_decoder',
-                         size=decoder_size,
-                         boot_layer=decoder_boot)
-    # 计算 attention 加权编码向量
-    context = simple_attention(encoded_sequence=enc_vec,
-                               encoded_proj=enc_proj,
-                               decoder_state=decoder_mem)
-    # 混合当前词向量和attention加权编码向量
-    decoder_inputs = mixed_layer(inputs = [full_matrix_projection(context),
-                                           full_matrix_projection(current_word)],
-                                 size = decoder_size * 3)
-    # 定义门控循环单元循环神经网络单步函数
-    gru_step = gru_step_layer(name='gru_decoder',
-                              input=decoder_inputs,
-                              output_mem=decoder_mem,
-                              size=decoder_size)
-    # 定义输出函数
-    out = mixed_layer(input=[full_matrix_projection(input=gru_step)],
-                      size=target_dict_dim,
-                      bias_attr=True,
-                      act=SoftmaxActivation())
-    return out
-```
-
-生成序列
------------------
-
-训练模型后，我们可以使用它来生成序列。通常的做法是使用**beam search** 生成序列。以下代码片段定义 beam search 算法。注意，`beam_search` 函数假设 `step` 的输出函数返回的是下一个时刻输出词的 softmax 归一化概率向量。我们对模型进行了以下更改。
-
--   使用 `GeneratedInput` 来表示 trg\_embedding。 `GeneratedInput` 将上一时间步所生成的词的向量来作为当前时间步的输入。
--   使用 `beam_search` 函数。这个函数需要设置：
-    -   `bos_id`: 开始标记。每个句子都以开始标记开头。
-    -   `eos_id`: 结束标记。每个句子都以结束标记结尾。
-    -   `beam_size`: beam search 算法中的beam大小。
-    -   `max_length`: 生成序列的最大长度。
--   使用 `seqtext_printer_evaluator` 根据索引矩阵和字典打印文本。这个函数需要设置：
-    -   `id_input`: 数据的整数ID，用于标识生成的文件中的相应输出。
-    -   `dict_file`: 用于将词ID转换为词的字典文件。
-    -   `result_file`: 生成结果文件的路径。
-
-代码如下：
-
-``` sourceCode
-group_inputs=[StaticInput(input=encoded_vector,is_seq=True),
-              StaticInput(input=encoded_proj,is_seq=True)]
-# 在生成时，解码器基于编码源序列和最后生成的目标词预测下一目标词。
-# 编码源序列（编码器输出）必须由只读Memory的 StaticInput 指定。
-# 这里， GeneratedInputs 自动获取上一个生成的词，并在最开始初始化为起始词，如 <s>。
-trg_embedding = GeneratedInput(
-    size=target_dict_dim,
-    embedding_name='_target_language_embedding',
-    embedding_size=word_vector_dim)
-group_inputs.append(trg_embedding)
-beam_gen = beam_search(name=decoder_group_name,
-                       step=gru_decoder_with_attention,
-                       input=group_inputs,
-                       bos_id=0, # Beginnning token.
-                       eos_id=1, # End of sentence token.
-                       beam_size=beam_size,
-                       max_length=max_length)
-
-seqtext_printer_evaluator(input=beam_gen,
-                          id_input=data_layer(name="sent_id", size=1),
-                          dict_file=trg_dict_path,
-                          result_file=gen_trans_file)
-outputs(beam_gen)
-```
-
-注意，这种生成技术只用于类似解码器的生成过程。如果你正在处理序列标记任务，请参阅 [Semantic Role Labeling Demo](../../demo/semantic_role_labeling/index.html) 了解更多详细信息。
-
-完整的配置文件在`demo/seqToseq/seqToseq_net.py`。
diff --git a/doc/howto/deep_model/rnn_config_cn.rst b/doc/howto/deep_model/rnn/rnn_config_cn.rst
similarity index 86%
rename from doc/howto/deep_model/rnn_config_cn.rst
rename to doc/howto/deep_model/rnn/rnn_config_cn.rst
index e6d8c1133a..ac2bd0775f 100644
--- a/doc/howto/deep_model/rnn_config_cn.rst
+++ b/doc/howto/deep_model/rnn/rnn_config_cn.rst
@@ -1,4 +1,4 @@
-RNN 配置
+RNN配置
 ========
 
 本教程将指导你如何在 PaddlePaddle
@@ -20,7 +20,7 @@ PaddlePaddle
 不需要对序列数据进行任何预处理，例如填充。唯一需要做的是将相应类型设置为输入。例如，以下代码段定义了三个输入。
 它们都是序列，它们的大小是\ ``src_dict``\ ，\ ``trg_dict``\ 和\ ``trg_dict``\ ：
 
-.. code:: sourcecode
+.. code:: python
 
     settings.input_types = [
       integer_value_sequence(len(settings.src_dict)),
@@ -29,12 +29,11 @@ PaddlePaddle
 
 在\ ``process``\ 函数中，每个\ ``yield``\ 函数将返回三个整数列表。每个整数列表被视为一个整数序列：
 
-.. code:: sourcecode
+.. code:: python
 
     yield src_ids, trg_ids, trg_ids_next
 
-有关如何编写数据提供程序的更多细节描述，请参考
-`PyDataProvider2 <../../ui/data_provider/index.html>`__\ 。完整的数据提供文件在
+有关如何编写数据提供程序的更多细节描述，请参考 :ref:`api_pydataprovider2` 。完整的数据提供文件在
 ``demo/seqToseq/dataprovider.py``\ 。
 
 配置循环神经网络架构
@@ -45,18 +44,17 @@ PaddlePaddle
 
 循环神经网络在每个时间步骤顺序地处理序列。下面列出了 LSTM 的架构的示例。
 
-.. figure:: ../../../tutorials/sentiment_analysis/bi_lstm.jpg
-   :alt: image
+.. image:: ../../../tutorials/sentiment_analysis/bi_lstm.jpg
+      :align: center
 
-   image
+一般来说，循环网络从 :math:`t=1` 到 :math:`t=T` 或者反向地从 :math:`t=T` 到 :math:`t=1` 执行以下操作。
 
-一般来说，循环网络从 *t* = 1 到 *t* = *T* 或者反向地从 *t* = *T* 到 *t*
-= 1 执行以下操作。
+.. math::
 
-*x*\ \ *t* + 1 = *f*\ \ *x*\ (*x*\ \ *t*\ ),\ *y*\ \ *t*\  = *f*\ \ *y*\ (*x*\ \ *t*\ )
+    x_{t+1} = f_x(x_t), y_t = f_y(x_t)
 
-其中 *f*\ \ *x*\ (.) 称为\ **单步函数**\ （即单时间步执行的函数，step
-function），而 *f*\ \ *y*\ (.) 称为\ **输出函数**\ 。在 vanilla
+其中 :math:`f_x(.)` 称为\ **单步函数**\ （即单时间步执行的函数，step
+function），而 :math:`f_y(.)` 称为\ **输出函数**\ 。在 vanilla
 循环神经网络中，单步函数和输出函数都非常简单。然而，PaddlePaddle
 可以通过修改这两个函数来实现复杂的网络配置。我们将使用 sequence to
 sequence
@@ -67,16 +65,17 @@ vanilla
 
 对于 vanilla RNN，在每个时间步长，\ **单步函数**\ 为：
 
-*x*\ \ *t* + 1 = *W*\ \ *x*\ \ *x*\ \ *t*\  + *W*\ \ *i*\ \ *I*\ \ *t*\  + *b*
+.. math::
 
-其中 *x*\ \ *t*\  是RNN状态，并且 *I*\ \ *t*\  是输入，\ *W*\ \ *x*\  和
-*W*\ \ *i*\  分别是RNN状态和输入的变换矩阵。\ *b*
-是偏差。它的\ **输出函数**\ 只需要\ *x*\ \ *t*\ 作为输出。
+    x_{t+1} = W_x x_t + W_i I_t + b
+
+其中 :math:`x_t` 是RNN状态，并且 :math:`I_t` 是输入，:math:`W_x` 和
+:math:`W_i` 分别是RNN状态和输入的变换矩阵。:math:`b` 是偏差。它的\ **输出函数**\ 只需要 :math:`x_t` 作为输出。
 
 ``recurrent_group``\ 是构建循环神经网络的最重要的工具。
 它定义了\ **单步函数**\ ，\ **输出函数**\ 和循环神经网络的输入。注意，这个函数的\ ``step``\ 参数需要实现\ ``step function``\ （单步函数）和\ ``output function``\ （输出函数）：
 
-.. code:: sourcecode
+.. code:: python
 
     def simple_rnn(input,
                    size=None,
@@ -102,7 +101,7 @@ vanilla
 
 PaddlePaddle
 使用“Memory”（记忆模块）实现单步函数。\ **Memory**\ 是在PaddlePaddle中构造循环神经网络时最重要的概念。
-Memory是在单步函数中循环使用的状态，例如\ *x*\ \ *t* + 1 = *f*\ \ *x*\ (*x*\ \ *t*\ )。
+Memory是在单步函数中循环使用的状态，例如 :math:`x_{t+1} = f_x(x_t)` 。
 一个Memory包含\ **输出**\ 和\ **输入**\ 。当前时间步处的Memory的输出作为下一时间步Memory的输入。Memory也可以具有\ **boot
 layer(引导层)**\ ，其输出被用作Memory的初始值。
 在我们的例子中，门控循环单元的输出被用作输出Memory。请注意，\ ``rnn_out``\ 层的名称与\ ``out_mem``\ 的名称相同。这意味着\ ``rnn_out``
@@ -120,30 +119,25 @@ Sequence to Sequence Model with Attention
 我们将使用 sequence to sequence model with attention
 作为例子演示如何配置复杂的循环神经网络模型。该模型的说明如下图所示。
 
-.. figure:: ../../../tutorials/text_generation/encoder-decoder-attention-model.png
-   :alt: image
-
-   image
+.. image:: ../../../tutorials/text_generation/encoder-decoder-attention-model.png
+      :align: center
 
-在这个模型中，源序列 *S* = {*s*\ 1, …, \ *s*\ \ *T*\ }
+在这个模型中，源序列 :math:`S = \{s_1, \dots, s_T\}` 
 用双向门控循环神经网络编码。双向门控循环神经网络的隐藏状态
-*H*\ \ *S*\  = {*H*\ 1, …, \ *H*\ \ *T*\ } 被称为
-*编码向量*\ 。解码器是门控循环神经网络。当解读每一个\ *y*\ \ *t*\ 时,
-这个门控循环神经网络生成一系列权重
-*W*\ \ *S*\ \ *t*\  = {*W*\ 1\ *t*\ , …, \ *W*\ \ *T*\ \ *t*\ },
-用于计算编码向量的加权和。加权和用来生成\ *y*\ \ *t*\ 。
+:math:`H_S = \{H_1, \dots, H_T\}` 被称为
+*编码向量*\ 。解码器是门控循环神经网络。当解读每一个 :math:`y_t` 时,
+这个门控循环神经网络生成一系列权重  :math:`W_S^t = \{W_1^t, \dots, W_T^t\}` ,
+用于计算编码向量的加权和。加权和用来生成 :math:`y_t` 。
 
 模型的编码器部分如下所示。它叫做\ ``grumemory``\ 来表示门控循环神经网络。如果网络架构简单，那么推荐使用循环神经网络的方法，因为它比
 ``recurrent_group``
-更快。我们已经实现了大多数常用的循环神经网络架构，可以参考
-`Layers <../../ui/api/trainer_config_helpers/layers_index.html>`__
-了解更多细节。
+更快。我们已经实现了大多数常用的循环神经网络架构，可以参考 :ref:`api_trainer_config_helpers_layers` 了解更多细节。
 
 我们还将编码向量投射到 ``decoder_size``
 维空间。这通过获得反向循环网络的第一个实例，并将其投射到
 ``decoder_size`` 维空间完成：
 
-.. code:: sourcecode
+.. code:: python
 
     # 定义源语句的数据层
     src_word_id = data_layer(name='source_language_word', size=source_dict_dim)
@@ -174,7 +168,7 @@ Sequence to Sequence Model with Attention
 解码器使用 ``recurrent_group`` 来定义循环神经网络。单步函数和输出函数在
 ``gru_decoder_with_attention`` 中定义：
 
-.. code:: sourcecode
+.. code:: python
 
     group_inputs=[StaticInput(input=encoded_vector,is_seq=True),
                   StaticInput(input=encoded_proj,is_seq=True)]
@@ -198,7 +192,7 @@ Sequence to Sequence Model with Attention
 单步函数的实现如下所示。首先，它定义解码网络的\ **Memory**\ 。然后定义
 attention，门控循环单元单步函数和输出函数：
 
-.. code:: sourcecode
+.. code:: python
 
     def gru_decoder_with_attention(enc_vec, enc_proj, current_word):
         # 定义解码器的Memory
@@ -253,7 +247,7 @@ attention，门控循环单元单步函数和输出函数：
 
 代码如下：
 
-.. code:: sourcecode
+.. code:: python
 
     group_inputs=[StaticInput(input=encoded_vector,is_seq=True),
                   StaticInput(input=encoded_proj,is_seq=True)]
@@ -279,9 +273,6 @@ attention，门控循环单元单步函数和输出函数：
                               result_file=gen_trans_file)
     outputs(beam_gen)
 
-注意，这种生成技术只用于类似解码器的生成过程。如果你正在处理序列标记任务，请参阅
-`Semantic Role Labeling
-Demo <../../demo/semantic_role_labeling/index.html>`__
-了解更多详细信息。
+注意，这种生成技术只用于类似解码器的生成过程。如果你正在处理序列标记任务，请参阅 :ref:`semantic_role_labeling` 了解更多详细信息。
 
 完整的配置文件在\ ``demo/seqToseq/seqToseq_net.py``\ 。
diff --git a/doc/howto/dev/new_layer_cn.rst b/doc/howto/dev/new_layer_cn.rst
new file mode 100644
index 0000000000..9489a921c7
--- /dev/null
+++ b/doc/howto/dev/new_layer_cn.rst
@@ -0,0 +1,389 @@
+================
+实现新的网络层
+================
+
+这份教程展示了如何在PaddlePaddle中实现一个自定义的网络层。在这里我们使用全连接层作为例子来展示实现新网络层所需要的四个步骤。
+
+1. 推导该层前向和后向传递的方程。
+2. 实现该层的C++类。
+3. 增加梯度检测的单元测试，以保证梯度的正确计算。
+4. 封装该层的Python接口。
+
+推导方程
+================
+
+首先我们需要推导该网络层的*前向传播*和*后向传播*的方程。前向传播给定输入，计算输出。后向传播给定输出的梯度，计算输入和参数的梯度。
+
+下图是一个全连接层的示意图。在全连接层中，每个输出节点都连接到所有的输入节点上。
+
+..  image:: FullyConnected.jpg
+    :align: center
+    :scale: 60 %
+
+一个网络层的前向传播部分把输入转化为相应的输出。
+全连接层以一个维度为 :math:`D_i` 的稠密向量作为输入，使用一个尺度为 :math:`D_i \times D_o` 的变换矩阵 :math:`W` 把 :math:`x` 映射到一个维度为 :math:`D_o` 的向量，并在乘积结果上再加上维度为 :math:`D_o` 的偏置向量 :math:`b` 。
+
+.. math::
+
+   y = f(W^T x + b)
+
+其中 :math:`f(.)` 是一个非线性的*激活方程*，例如sigmoid， tanh，以及Relu。
+
+变换矩阵 :math:`W` 和偏置向量 :math:`b`  是该网络层的*参数*。一个网络层的参数是在*反向传播*时被训练的。反向传播根据输出的梯度，分别计算每个参数的梯度，以及输入的梯度。优化器则用链式法则来对每个参数计算损失函数的梯度。
+
+假设损失函数是 :math:`c(y)` ，那么
+
+.. math::
+
+   \frac{\partial c(y)}{\partial x} = \frac{\partial c(y)}{\partial y} \frac{\partial y}{\partial x}
+
+假设 :math:`z = f(W^T x + b)` ，那么
+
+.. math::
+
+   \frac{\partial y}{\partial z} = \frac{\partial f(z)}{\partial z}
+
+PaddlePaddle的base layer类可以自动计算上面的导数。
+
+因此，对全连接层来说，我们需要计算：
+
+.. math::
+
+   \frac{\partial z}{\partial x} = W, \frac{\partial z_j}{\partial W_{ij}} = x_i, \frac{\partial z}{\partial b} = \mathbf 1
+
+其中 :math:`\mathbf 1` 是一个全1的向量， :math:`W_{ij}` 是矩阵 :math:`W` 第i行第j列的数值， :math:`z_j` 是向量 :math:`z` 的第j个值， :math:`x_i` 是向量 :math:`x` 的第i个值。
+
+最后我们使用链式法则计算 :math:`\frac{\partial z}{\partial x}` 以及 :math:`\frac{\partial z}{\partial W}` 。计算的细节将在下面的小节给出。
+
+实现C++类
+===================
+
+一个网络层的C++类需要实现初始化，前向和后向。全连接层的实现位于:code:`paddle/gserver/layers/FullyConnectedLayer.h`及:code:`paddle/gserver/layers/FullyConnectedLayer.cpp`。这里我们展示一份简化过的代码。
+
+这个类需要继承 :code:`paddle::Layer` 这个基类，并且需要重写基类中的以下几个虚函数：
+
+- 类的构造函数和析构函数。
+- :code:`init` 函数。用于初始化参数和设置。
+- :code:`forward` 。实现网络层的前向传播。
+- :code:`backward` 。实现网络层的后向传播。
+- :code:`prefetch` 。用来从参数服务器预取参数矩阵相应的行。如果网络层不需要远程稀疏更新，则不需要重写该函数。（大多数网络层不需要支持远程稀疏更新）
+
+
+头文件如下：
+
+.. code-block:: c++
+
+    namespace paddle {
+    /**
+     * 全连接层的每个输出都连接到上一层的所有的神经元上。
+     * 它的输入与经过学习的参数做内积并加上偏置（可选）。
+     *
+     * 配置文件接口是fc_layer。
+     */
+
+    class FullyConnectedLayer : public Layer {
+    protected:
+      WeightList weights_;
+      std::unique_ptr<Weight> biases_;
+
+    public:
+      explicit FullyConnectedLayer(const LayerConfig& config)
+          : Layer(config) {}
+      ~FullyConnectedLayer() {}
+
+      bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+
+      Weight& getWeight(int idx) { return *weights_[idx]; }
+
+      void prefetch();
+      void forward(PassType passType);
+      void backward(const UpdateCallback& callback = nullptr);
+    };
+    }  // namespace paddle
+
+头文件中把参数定义为类的成员变量。我们使用 :code:`Weight` 类作为参数的抽象，它支持多线程更新。该类的实现细节在“实现细节”中详细介绍。
+
+- :code:`weights_` 是存有一系列变换矩阵的权重。在当前的实现方式下，网络层可以有多个输入。因此，它可能有不止一个权重。每个权重对应一个输入。
+- :code:`biases_` 是存有偏置向量的权重。
+
+全连接层没有网络层配置的超参数。如果一个网络层需要配置的话，通常的做法是将配置存于 :code:`LayerConfig& config` 中，并在类构建函数中把它放入一个类成员变量里。
+
+下面的代码片段实现了 :code:`init` 函数。
+
+- 首先，所有的 :code:`init` 函数必须先调用基类中的函数 :code:`Layer::init(layerMap, parameterMap);` 。该语句会为每个层初始化其所需要的变量和连接。
+- 之后初始化所有的权重矩阵 :math:`W` 。当前的实现方式下，网络层可以有多个输入。因此，它可能有不止一个权重。
+- 最后，初始化偏置向量。
+
+
+.. code-block:: c++
+
+    bool FullyConnectedLayer::init(const LayerMap& layerMap,
+                                   const ParameterMap& parameterMap) {
+      /* 初始化父类 */
+      Layer::init(layerMap, parameterMap);
+
+      /* 初始化权重表 */
+      CHECK(inputLayers_.size() == parameters_.size());
+      for (size_t i = 0; i < inputLayers_.size(); i++) {
+        // 获得参数尺寸
+        size_t height = inputLayers_[i]->getSize();
+        size_t width = getSize();
+
+        // 新建一个权重
+        if (parameters_[i]->isSparse()) {
+          CHECK_LE(parameters_[i]->getSize(), width * height);
+        } else {
+          CHECK_EQ(parameters_[i]->getSize(), width * height);
+        }
+        Weight* w = new Weight(height, width, parameters_[i]);
+
+        // 将新建的权重加入权重表
+        weights_.emplace_back(w);
+      }
+
+      /* 初始化biases_ */
+      if (biasParameter_.get() != NULL) {
+        biases_ = std::unique_ptr<Weight>(new Weight(1, getSize(), biasParameter_));
+      }
+
+      return true;
+    }
+
+实现前向传播的部分有下面几个步骤。
+
+- 每个层在其 :code:`forward` 函数的开头必须调用 :code:`Layer::forward(passType);` 。
+- 之后使用 :code:`reserveOutput(batchSize, size);` 为输出分配内存。由于我们支持训练数据有不同的批次大小，所以这一步是必要的。 :code:`reserveOutput`  会相应地改变输出的尺寸。为了保证效率，如果需要扩大矩阵，我们会重新分配内存；如果需要缩减矩阵，我们会继续使用现有的内存块。
+- 之后使用矩阵运算函数来计算 :math:`\sum_i W_i x + b`。:code:`getInput(i).value` 返回第i个输入矩阵。每个输入都是一个 :math:`batchSize \times dim` 的矩阵，每行表示一个批次中的单个输入。对于我们支持的全部矩阵操作，请参考 :code:`paddle/math/Matrix.h`和:code:`paddle/math/BaseMatrix.h` 。
+- 最终，使用 :code:`forwardActivation();` 进行激活操作。这会自动进行网络配置中声明的激活操作。
+
+
+.. code-block:: c++
+
+    void FullyConnectedLayer::forward(PassType passType) {
+      Layer::forward(passType);
+
+      /* 若有必要，为output_申请内存 */
+      int batchSize = getInput(0).getBatchSize();
+      int size = getSize();
+
+      {
+        // 设置输出的尺寸
+        reserveOutput(batchSize, size);
+      }
+
+      MatrixPtr outV = getOutputValue();
+
+      // 对每个输入乘上变换矩阵
+      for (size_t i = 0; i != inputLayers_.size(); ++i) {
+        auto input = getInput(i);
+        CHECK(input.value) << "The input of 'fc' layer must be matrix";
+        i == 0 ? outV->mul(input.value, weights_[i]->getW(), 1, 0)
+               : outV->mul(input.value, weights_[i]->getW(), 1, 1);
+      }
+
+      /* 加上偏置向量 */
+      if (biases_.get() != NULL) {
+        outV->addBias(*(biases_->getW()), 1);
+      }
+
+      /* 激活 */ {
+        forwardActivation();
+      }
+    }
+
+实现后向传播的部分有下面几个步骤。
+
+- :code:`backwardActivation()` 计算激活函数的梯度。通过 :code:`getOutputGrad()` 来获得输出的梯度，调用该函数后，梯度会就地（不使用额外空间）乘上输出的梯度。
+- 计算偏置的梯度。注意，我们使用 :code:`biases_->getWGrad()` 来得到某个特定参数的梯度矩阵。在一个参数的梯度被更新后，**必须**要调用 :code:`getParameterPtr()->incUpdate(callback);` 。这用于在多线程和多机上更新参数。
+- 最后，计算转换矩阵和输入的梯度，并对相应的参数调用 :code:`incUpdate` 。PaddlePaddle可以通过该机制判断是否已经收集齐所有的梯度，从而可以做一些与计算重叠的工作（例如，网络通信）。
+
+
+.. code-block:: c++
+
+    void FullyConnectedLayer::backward(const UpdateCallback& callback) {
+      /* 对激活求导 */ {
+        backwardActivation();
+      }
+
+      if (biases_ && biases_->getWGrad()) {
+        biases_->getWGrad()->collectBias(*getOutputGrad(), 1);
+
+        biases_->getParameterPtr()->incUpdate(callback);
+      }
+
+      bool syncFlag = hl_get_sync_flag();
+
+      for (size_t i = 0; i != inputLayers_.size(); ++i) {
+        /* 计算当前层权重的梯度 */
+        if (weights_[i]->getWGrad()) {
+          MatrixPtr input_T = getInputValue(i)->getTranspose();
+          MatrixPtr oGrad = getOutputGrad();
+          {
+            weights_[i]->getWGrad()->mul(input_T, oGrad, 1, 1);
+          }
+        }
+
+
+        /* 计算输入层的偏差 */
+        MatrixPtr preGrad = getInputGrad(i);
+        if (NULL != preGrad) {
+          MatrixPtr weights_T = weights_[i]->getW()->getTranspose();
+          preGrad->mul(getOutputGrad(), weights_T, 1, 1);
+        }
+
+        {
+          weights_[i]->getParameterPtr()->incUpdate(callback);
+        }
+      }
+    }
+
+ :code:`prefetch` 函数指出了在训练时需要从参数服务器取出的行。仅在远程稀疏训练时有效。使用远程稀疏方式训练时，完整的参数矩阵被分布在不同的参数服务器上。当网络层用一个批次做训练时，该批次的输入中仅有一个子集是非零的。因此，该层仅需要这些非零样本位置所对应的变换矩阵的那些行。 :code:`prefetch` 表明了这些行的标号。
+
+大多数层不需要远程稀疏训练函数。这种情况下不需要重写该函数。
+
+.. code-block:: c++
+
+    void FullyConnectedLayer::prefetch() {
+      for (size_t i = 0; i != inputLayers_.size(); ++i) {
+        auto* sparseParam =
+            dynamic_cast<SparsePrefetchRowCpuMatrix*>(weights_[i]->getW().get());
+        if (sparseParam) {
+          MatrixPtr input = getInputValue(i);
+          sparseParam->addRows(input);
+        }
+      }
+    }
+
+最后，使用 :code:`REGISTER_LAYER(fc, FullyConnectedLayer);` 来注册该层。 :code:`fc` 是该层的标识符， :code:`FullyConnectedLayer` 是该层的类名。
+
+.. code-block:: c++
+
+    namespace paddle {
+    REGISTER_LAYER(fc, FullyConnectedLayer);
+    }
+
+若 :code:`cpp` 被放在 :code:`paddle/gserver/layers` 目录下，其会自动被加入编译列表。
+
+
+写梯度检查单元测试
+===============================
+
+写梯度检查单元测试是一个验证新实现的层是否正确的相对简单的办法。梯度检查单元测试通过有限差分法来验证一个层的梯度。首先对输入做一个小的扰动 :math:`\Delta x` ，然后观察到输出的变化为 :math:`\Delta y` ，那么，梯度就可以通过这个方程计算得到 :math:`\frac{\Delta y}{\Delta x }` 。之后，再用这个梯度去和 :code:`backward` 函数得到的梯度去对比，以保证梯度计算的正确性。需要注意的是梯度检查仅仅验证了梯度的计算，并不保证 :code:`forward` 和 :code:`backward` 函数的实现是正确的。你需要一些更复杂的单元测试来保证你实现的网络层是正确的。
+
+所有网络层的梯度检查单测都位于 :code:`paddle/gserver/tests/test_LayerGrad.cpp` 。我们建议你在写新网络层时把测试代码放入新的文件中。下面列出了全连接层的梯度检查单元测试。它包含以下几步：
+
++ 生成网络层配置。网络层配置包含以下几项：
+   - 偏置参数的大小。（例子中是4096）
+   - 层的类型。（例子中是fc）
+   - 层的大小。（例子中是4096）
+   - 激活的类型。（例子中是softmax）
+   - dropout的比例。（例子中是0.1）
++ 配置网络层的输入。在这个例子里，我们仅有一个输入。
+   - 输入的类型（ :code:`INPUT_DATA` ），可以是以下几种：
+       - :code:`INPUT_DATA` ：稠密向量。
+       - :code:`INPUT_LABEL` ：整数。
+       - :code:`INPUT_DATA_TARGET` ：稠密向量，但不用于计算梯度。
+       - :code:`INPUT_SEQUENCE_DATA` ：含有序列信息的稠密向量。
+       - :code:`INPUT_HASSUB_SEQUENCE_DATA` ：含有序列信息和子序列信息的稠密向量。
+       - :code:`INPUT_SEQUENCE_LABEL` ：含有序列信息的整数。
+       - :code:`INPUT_SPARSE_NON_VALUE_DATA` ：0-1稀疏数据。
+       - :code:`INPUT_SPARSE_FLOAT_VALUE_DATA` ：浮点稀疏数据。
+   - 输入的名字。（例子中是 :code:`layer_0` ）
+   - 输入的大小。（例子中是8192）
+   - 非零数字的个数，仅对稀疏数据有效。
+   - 稀疏数据的格式，仅对稀疏数据有效。
++ 对每个输入，都需要调用一次 :code:`config.layerConfig.add_inputs();` 。
++ 调用 :code:`testLayerGrad` 来做梯度检查。它包含以下参数。
+   - 层和输入的配置。（例子中是 :code:`config` ）
+   - 网络层的类型。（例子中是 :code:`fc` ）
+   - 梯度检查的输入数据的批次大小。（例子中是100）
+   - 输入是否是转置的。大多数层需要设置为 :code:`false` 。（例子中是 :code:`false` ）
+   - 是否使用权重。有些层或者激活需要做归一化以保证它们的输出的和是一个常数。例如，softmax激活的输出的和总是1。在这种情况下，我们不能通过常规的梯度检查的方式来计算梯度。因此我们采用输出的加权和（非常数）来计算梯度。（例子中是 :code:`true` ，因为全连接层的激活可以是softmax）
+
+.. code-block:: c++
+
+    void testFcLayer(string format, size_t nnz) {
+      // Create layer configuration.
+      TestConfig config;
+      config.biasSize = 4096;
+      config.layerConfig.set_type("fc");
+      config.layerConfig.set_size(4096);
+      config.layerConfig.set_active_type("softmax");
+      config.layerConfig.set_drop_rate(0.1);
+      // Setup inputs.
+      config.inputDefs.push_back(
+          {INPUT_DATA, "layer_0", 8192, nnz, ParaSparse(format)});
+        config.layerConfig.add_inputs();
+      LOG(INFO) << config.inputDefs[0].sparse.sparse << " "
+                << config.inputDefs[0].sparse.format;
+      for (auto useGpu : {false, true}) {
+        testLayerGrad(config, "fc", 100, /* trans */ false, useGpu,
+                      /* weight */ true);
+      }
+    }
+
+如果你要为了测试而增加新的文件，例如 :code:`paddle/gserver/tests/testFCGrad.cpp` ，你需要把该文件加入 :code:`paddle/gserver/tests/CMakeLists.txt` 中。下面给出了一个例子。当你执行命令 :code:`make tests` 时，所有的单测都会被执行一次。注意，有些层可能需要高精度来保证梯度检查单测正确执行。你需要在配置cmake时将 :code:`WITH_DOUBLE` 设置为 `ON` 。
+
+.. code-block:: bash
+
+    add_unittest_without_exec(test_FCGrad
+        test_FCGrad.cpp
+        LayerGradUtil.cpp
+        TestUtil.cpp)
+
+    add_test(NAME test_FCGrad
+        COMMAND test_FCGrad)
+
+
+实现python封装
+========================
+
+python封装的实现使得我们可以在配置文件中使用新实现的网络层。所有的python封装都在 :code:`python/paddle/trainer/config_parser.py` 中。全连接层python封装的例子中包含下面几步：
+
+- 所有的Python封装都使用 :code:`@config_layer('fc')` 这样的装饰器。网络层的标识符为 :code:`fc` 。
+- 实现构造函数 :code:`__init__` 。
+	- 它首先调用基构造函数 :code:`super(FCLayer, self).__init__(name, 'fc', size, inputs=inputs, **xargs)` 。 :code:`FCLayer` 是Python封装的类名。 :code:`fc` 是网络层的标识符。为了封装能够正确工作，这些名字必须要写对。
+	- 之后，计算变换矩阵的大小和格式（是否稀疏）。
+
+.. code-block:: python
+
+    @config_layer('fc')
+    class FCLayer(LayerBase):
+        def __init__(
+                self,
+                name,
+                size,
+                inputs,
+                bias=True,
+                **xargs):
+            super(FCLayer, self).__init__(name, 'fc', size, inputs=inputs, **xargs)
+            for input_index in xrange(len(self.inputs)):
+                input_layer = self.get_input_layer(input_index)
+                psize = self.config.size * input_layer.size
+                dims = [input_layer.size, self.config.size]
+                format = self.inputs[input_index].format
+                sparse = format == "csr" or format == "csc"
+                if sparse:
+                    psize = self.inputs[input_index].nnz
+                self.create_input_parameter(input_index, psize, dims, sparse, format)
+            self.create_bias_parameter(bias, self.config.size)
+
+在网络配置中，网络层的细节可以通过下面这些代码片段来指定。这个类的参数包括：
+
+- :code:`name` 是网络层实例的名字标识符。
+- :code:`type` 是网络层的类型，通过网络层的标识符来指定。
+- :code:`size` 是网络层输出的大小。
+- :code:`bias` 表明这个层的一个实例是否需要偏置。
+- :code:`inputs` 说明这个层的输入，输入是由一个list中的网络层实例的名字组成的。
+
+.. code-block:: python
+
+    Layer(
+        name = "fc1",
+        type = "fc",
+        size = 64,
+        bias = True,
+        inputs = [Input("pool3")]
+    )
+
+我们建议你为你的Python封装实现一个“助手”，使得搭模型时更方便。具体可以参考 :code:`python/paddle/trainer_config_helpers/layers.py` 。
diff --git a/doc/howto/dev/new_layer_en.rst b/doc/howto/dev/new_layer_en.rst
index 0513f068f3..46481f5ead 100644
--- a/doc/howto/dev/new_layer_en.rst
+++ b/doc/howto/dev/new_layer_en.rst
@@ -209,7 +209,6 @@ The implementation of the backward part has the following steps.
       if (biases_ && biases_->getWGrad()) {
         biases_->getWGrad()->collectBias(*getOutputGrad(), 1);
 
-        /* Increasing the number of gradient */
         biases_->getParameterPtr()->incUpdate(callback);
       }
 
@@ -297,7 +296,7 @@ All the gradient check unit tests are located in :code:`paddle/gserver/tests/tes
 + each inputs needs to call :code:`config.layerConfig.add_inputs();` once.
 + call :code:`testLayerGrad` to perform gradient checks. It has the following arguments.
    - layer and input configurations. (:code:`config` in our example)
-   - type of the input. (:code:`fc` in our example)
+   - type of the layer. (:code:`fc` in our example)
    - batch size of the gradient check. (100 in our example)
    - whether the input is transpose. Most layers need to set it to :code:`false`. (:code:`false` in our example)
    - whether to use weights. Some layers or activations perform normalization so that the sum of their output is a constant. For example, the sum of output of a softmax activation is one. In this case, we cannot correctly compute the gradients using regular gradient check techniques. A weighted sum of the output, which is not a constant, is utilized to compute the gradients. (:code:`true` in our example, because the activation of a fully connected layer can be softmax)
@@ -310,7 +309,7 @@ All the gradient check unit tests are located in :code:`paddle/gserver/tests/tes
       config.biasSize = 4096;
       config.layerConfig.set_type("fc");
       config.layerConfig.set_size(4096);
-      config.layerConfig.set_active_type("sigmoid");
+      config.layerConfig.set_active_type("softmax");
       config.layerConfig.set_drop_rate(0.1);
       // Setup inputs.
       config.inputDefs.push_back(
diff --git a/doc/howto/index_cn.rst b/doc/howto/index_cn.rst
index 6a14ce8ae7..bd3d0ec292 100644
--- a/doc/howto/index_cn.rst
+++ b/doc/howto/index_cn.rst
@@ -7,10 +7,11 @@
 ..  toctree::
   :maxdepth: 1
 
+  usage/cmd_parameter/index_cn.rst
   usage/concepts/use_concepts_cn.rst
   usage/cluster/cluster_train_cn.md
-  usage/cluster/k8s/k8s_cn.md
-  usage/cluster/k8s/k8s_distributed_cn.md
+  usage/k8s/k8s_cn.md
+  usage/k8s/k8s_distributed_cn.md
 
 开发标准
 --------
diff --git a/doc/howto/index_en.rst b/doc/howto/index_en.rst
index 983dc743eb..1fbfcd260b 100644
--- a/doc/howto/index_en.rst
+++ b/doc/howto/index_en.rst
@@ -7,8 +7,10 @@ Usage
 ..  toctree::
   :maxdepth: 1
 
-  usage/cmd_parameter/index_en.md
+  usage/cmd_parameter/index_en.rst
   usage/cluster/cluster_train_en.md
+  usage/k8s/k8s_en.md
+  usage/k8s/k8s_aws_en.md
 
 Development
 ------------
diff --git a/doc/howto/usage/cmd_parameter/arguments_cn.md b/doc/howto/usage/cmd_parameter/arguments_cn.md
new file mode 100644
index 0000000000..833e21dd19
--- /dev/null
+++ b/doc/howto/usage/cmd_parameter/arguments_cn.md
@@ -0,0 +1,409 @@
+# 参数概述
+
+虽然Paddle看起来包含了众多参数，但是大部分参数是为开发者提供的，或者已经在集群提交环境中自动设置，因此用户并不需要关心它们。在此，根据这些参数的使用场合，我们将它们划分为不同的类别。例如，`通用`类别中的参数可用于所有场合。某些参数只可用于特定的层中，而有些参数需要在集群多机训练中使用等。
+
+<html>
+<table border="2" frame="border">
+<thead>
+<tr>
+<th scope="col" class="left"></th>
+<th scope="col" class="left">参数</th>
+<th scope="col" class="left">本地训练</th>
+<th scope="col" class="left">集群训练</th>
+<th scope="col" class="left">本地测试</th>
+<th scope="col" class="left">集群测试</th>
+</tr>
+</thead>
+
+<tbody>
+<tr>
+<td class="left" rowspan="9">通用</td>
+<td class="left">job</td>
+<td class="left">√</td><td class="left">√</td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">use_gpu</td>
+<td class="left">√</td><td class="left">√</td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">local</td>
+<td class="left">√</td><td class="left">√</td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">config</td>
+<td class="left">√</td><td class="left">√</td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">config_args</td>
+<td class="left">√</td><td class="left">√</td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">num_passes</td>
+<td class="left">√</td><td class="left">√</td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">trainer_count</td>
+<td class="left">√</td><td class="left">√</td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">version</td>
+<td class="left">√</td><td class="left">√</td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">show_layer_stat</td>
+<td class="left">√</td><td class="left">√</td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left" rowspan="15">训练</td><td class="left">dot_period</td>
+<td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">test_period</td>
+<td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">saving_period</td>
+<td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">show_parameter_stats_period</td>
+<td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">init_model_path</td>
+<td class="left">√</td><td class="left">√</td><td class="left">√</td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">load_missing_parameter_strategy</td>
+<td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">saving_period_by_batches</td>
+<td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">use_old_updater</td>
+<td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">enable_grad_share</td>
+<td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">grad_share_block_num</td>
+<td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">log_error_clipping</td>
+<td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">log_clipping</td>
+<td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">save_only_one</td>
+<td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">allow_inefficient_sparse_update</td>
+<td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">start_pass</td>
+<td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">训练/测试</td><td class="left">save_dir</td>
+<td class="left">√</td><td class="left">√</td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left" rowspan = "2">训练过程中测试</td><td class="left">test_period</td>
+<td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">average_test_period</td>
+<td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left" rowspan = "5">测试</td><td class="left">model_list</td>
+<td class="left"></td><td class="left"></td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">test_wait</td>
+<td class="left"></td><td class="left"></td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">test_pass</td>
+<td class="left"></td><td class="left"></td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">predict_output_dir</td>
+<td class="left"></td><td class="left"></td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">distribute_test</td>
+<td class="left"></td><td class="left"></td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">Auc/正负对验证(PnpairValidation)</td><td class="left">predict_file</td>
+<td class="left"></td><td class="left"></td><td class="left"></td>√<td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left" rowspan = "6">GPU</td><td class="left">gpu_id</td>
+<td class="left">√</td><td class="left">√</td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">parallel_nn</td>
+<td class="left">√</td><td class="left">√</td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">allow_only_one_model_on_one_gpu</td>
+<td class="left">√</td><td class="left">√</td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">cudnn_dir</td>
+<td class="left">√</td><td class="left">√</td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">cuda_dir</td>
+<td class="left">√</td><td class="left">√</td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">cudnn_conv_workspace_limit_in_mb</td>
+<td class="left">√</td><td class="left">√</td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left" rowspan = "4">递归神经网络(RNN)</td>
+<td class="left">beam_size</td>
+<td class="left"></td><td class="left"></td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">rnn_use_batch</td>
+<td class="left">√</td><td class="left">√</td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">prev_batch_state</td>
+<td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">diy_beam_search_prob_so</td>
+<td class="left"></td><td class="left"></td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left" rowspan = "2">度量学习(metric learning)</td><td class="left">external</td>
+<td class="left">√</td><td class="left">√</td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">data_server_port</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left" rowspan = "16">参数服务器(PServer)</td><td class="left">start_pserver</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">pservers</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">port</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">port_num</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">ports_num_for_sparse</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">nics</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">rdma_tcp</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">small_messages</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">loadsave_parameters_in_pserver</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">log_period_server</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">pserver_num_threads</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">sock_send_buf_size</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">sock_recv_buf_size</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">num_gradient_servers</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">parameter_block_size</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">parameter_block_size_for_sparse</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left" rowspan = "3">异步随机梯度下降(Async SGD)</td><td class="left">async_count</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">async_lagged_ratio_min</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">async_lagged_ratio_default</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left" rowspan = "8">性能调优(Performance Tuning)</td><td class="left">log_barrier_abstract</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">log_barrier_lowest_nodes</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">log_barrier_show_log</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">check_sparse_distribution_batches</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">check_sparse_distribution_ratio</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">check_sparse_distribution_unbalance_degree</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">check_sparse_distribution_in_pserver</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">show_check_sparse_distribution_log</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">数据提供器(Data Provider)</td><td class="left">memory_threshold_on_load_data</td>
+<td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left" rowspan = "2">随机数</td><td class="left">seed</td>
+<td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">thread_local_rand_use_global_seed</td>
+<td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">单元测试</td><td class="left">checkgrad_eps</td>
+<td class="left"></td><td class="left"></td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">矩阵/向量</td><td class="left">enable_parallel_vector</td>
+<td class="left">√</td><td class="left">√</td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+</tbody>
+
+</table>
+</html>
diff --git a/doc/howto/usage/cmd_parameter/detail_introduction_cn.md b/doc/howto/usage/cmd_parameter/detail_introduction_cn.md
new file mode 100644
index 0000000000..dbf7c6f00b
--- /dev/null
+++ b/doc/howto/usage/cmd_parameter/detail_introduction_cn.md
@@ -0,0 +1,336 @@
+# 细节描述
+
+## 通用
+
+* `--job`
+  - 工作模式，包括: **train, test, checkgrad**，其中checkgrad主要为开发者使用，使用者不需要关心。
+  - 类型: string (默认: train)
+
+* `--config`
+  - 用于指定网络配置文件。
+  - 类型: string (默认: null).
+
+* `--use_gpu`
+  - 训练过程是否使用GPU，设置为true使用GPU模式，否则使用CPU模式。
+  - 类型: bool (默认: 1).
+
+* `--local`
+  - 训练过程是否为本地模式，设置为true使用本地训练或者使用集群上的一个节点，否则使用多机训练。
+  - 类型: bool (默认: 1).
+
+* `--trainer_count`
+  - 指定一台机器上使用的线程数。例如，trainer_count = 4, 意思是在GPU模式下使用4个GPU，或者在CPU模式下使用4个线程。每个线程（或GPU）分配到当前数据块样本数的四分之一。也就是说，如果在训练配置中设置batch_size为512，每个线程分配到128个样本用于训练。
+  - 类型: int32 (默认: 1).
+
+* `--num_passes`
+  - 当模式为`--job=train`时, 该参数的意思是训练num_passes轮。每轮会将数据集中的所有训练样本使用一次。当模式为`--job=test`时，意思是使用第test_pass个模型到第 num_passes-1 个模型测试数据。
+  - 类型: int32 (默认: 100).
+
+* `--config_args`
+  - 传递给配置文件的参数。格式: key1=value1,key2=value2.
+  - 类型: string (默认: null).
+
+* `--version`
+  - 是否打印版本信息。
+  - 类型: bool (默认: 0).
+
+* `--show_layer_stat`
+  - 是否显示**每个批次数据**中每层的数值统计.
+  - 类型: bool (默认: 0).
+
+## 训练
+
+* `--log_period`
+  - 每log_period个批次打印日志进度.
+  - 类型: int32 (默认: 100).
+
+* `--dot_period`
+  - 每dot_period个批次输出符号'.'.
+  - 类型: int32 (默认: 1).
+
+* `--saving_period`
+  - 每saving_period轮保存训练参数.
+  - 类型: int32 (默认: 1).
+
+* `--save_dir`
+  - 保存模型参数的目录，需要明确指定，但不需要提前创建。
+  - 类型: string (默认: null).
+
+* `--start_pass`
+  - 从start_pass轮开始训练，会加载上一轮的参数。
+  - 类型: int32 (默认: 0).
+
+* `--show_parameter_stats_period`
+  - 在训练过程中每show_parameter_stats_period个批次输出参数统计。默认不显示。
+  - 类型: int32 (默认: 0).
+
+* `--save_only_one`
+  - 只保存最后一轮的参数，而之前的参数将会被删除。
+  - 类型: bool (默认: 0).
+
+* `--load_missing_parameter_strategy`
+  - 当模型参数不存在时，指定加载的方式。目前支持fail/rand/zero三种操作.
+    - `fail`: 程序直接退出.
+    - `rand`: 根据网络配置中的**initial\_strategy**采用均匀分布或者高斯分布初始化。均匀分布的范围是: **[mean - std, mean + std]**, 其中mean和std是训练配置中的参数.
+    - `zero`: 所有参数置为零.
+  - 类型: string (默认: fail).
+
+* `--init_model_path`
+   - 初始化模型的路径。如果设置该参数，start\_pass将不起作用。同样也可以在测试模式中指定模型路径。
+   - 类型: string (默认: null).
+
+* `--saving_period_by_batches`
+   - 在一轮中每saving_period_by_batches个批次保存一次参数。
+   - 类型: int32 (默认: 0).
+
+* `--log_error_clipping`
+  - 当在网络层配置中设置**error_clipping_threshold**时，该参数指示是否打印错误截断日志。如果为true，**每批次**的反向传播将会打印日志信息。该截断会影响**输出的梯度**.
+  - 类型: bool (默认: 0).
+
+* `--log_clipping`
+  - 当在训练配置中设置**gradient_clipping_threshold**时，该参数指示是否打印日志截断信息。该截断会影响**权重更新的梯度**.
+  - 类型: bool (默认: 0).
+
+* `--use_old_updater`
+  - 是否使用旧的RemoteParameterUpdater。 默认使用ConcurrentRemoteParameterUpdater，主要为开发者使用，使用者通常无需关心.
+  - 类型: bool (默认: 0).
+
+* `--enable_grad_share`
+  - 启用梯度参数的阈值，在多CPU训练时共享该参数.
+  - 类型: int32 (默认: 100 \* 1024 \* 1024).
+
+* `--grad_share_block_num`
+  - 梯度参数的分块数目，在多CPU训练时共享该参数.
+  - 类型: int32 (默认: 64).
+
+## 测试
+
+* `--test_pass`
+  - 加载test_pass轮的模型用于测试.
+  - 类型: int32 (默认: -1).
+
+* `--test_period`
+   - 如果为0，每轮结束时对所有测试数据进行测试；如果不为0，每test_period个批次对所有测试数据进行测试.
+  - 类型: int32 (默认: 0).
+
+* `--test_wait`
+  - 指示当指定轮的测试模型不存在时，是否需要等待该轮模型参数。如果在训练期间同时发起另外一个进程进行测试，可以使用该参数.
+  - 类型: bool (默认: 0).
+
+* `--model_list`
+  - 测试时指定的存储模型列表的文件.
+  - 类型: string (默认: "", null).
+
+* `--predict_output_dir`
+  - 保存网络层输出结果的目录。该参数在网络配置的Outputs()中指定，默认为null，意思是不保存结果。在测试阶段，如果你想要保存某些层的特征图，请指定该目录。需要注意的是，网络层的输出是经过激活函数之后的值.
+  - 类型: string (默认: "", null).
+
+* `--average_test_period`
+  - 使用`average_test_period`个批次的参数平均值进行测试。该参数必须能被FLAGS_log_period整除，默认为0，意思是不使用平均参数执行测试.
+  - 类型: int32 (默认: 0).
+
+* `--distribute_test`
+  - 在分布式环境中测试，将多台机器的测试结果合并.
+  - 类型: bool (默认: 0).
+
+* `--predict_file`
+  - 保存预测结果的文件名。该参数默认为null，意思是不保存结果。目前该参数仅用于AucValidationLayer和PnpairValidationLayer层，每轮都会保存预测结果.
+  - 类型: string (默认: "", null).
+
+## GPU
+
+* `--gpu_id`
+  - 指示使用哪个GPU核.
+  - 类型: int32 (默认: 0).
+
+* `--allow_only_one_model_on_one_gpu`
+  - 如果为true，一个GPU设备上不允许配置多个模型.
+  - 类型: bool (默认: 1).
+
+* `--parallel_nn`
+  - 指示是否使用多线程来计算一个神经网络。如果为false，设置gpu_id指定使用哪个GPU核（训练配置中的设备属性将会无效）。如果为true，GPU核在训练配置中指定（gpu_id无效）.
+  - 类型: bool (默认: 0).
+
+* `--cudnn_dir`
+  - 选择路径来动态加载NVIDIA CuDNN库，例如，/usr/local/cuda/lib64. [默认]: LD_LIBRARY_PATH
+  - 类型: string (默认: "", null)
+
+* `--cuda_dir`
+  - 选择路径来动态加载NVIDIA CUDA库，例如，/usr/local/cuda/lib64. [默认]: LD_LIBRARY_PATH
+  - 类型: string (默认: "", null)
+
+* `--cudnn_conv_workspace_limit_in_mb`
+  - 指定cuDNN的最大工作空间容限，单位是MB，默认为4096MB=4GB. 
+  - 类型: int32 (默认: 4096MB=4GB)
+
+## 自然语言处理(NLP): RNN/LSTM/GRU
+* `--rnn_use_batch`
+  - 指示在简单的RecurrentLayer层的计算中是否使用批处理方法.
+  - 类型: bool (默认: 0).
+
+* `--prev_batch_state`
+  - 标识是否为连续的batch计算.
+  - 类型: bool (默认: 0).
+
+* `--beam_size`
+  - 集束搜索使用广度优先搜索的方式构建查找树。在树的每一层上，都会产生当前层状态的所有继承结果，按启发式损失的大小递增排序。然而，每层上只能保存固定数目个最好的状态，该数目是提前定义好的，称之为集束大小.
+  - 类型: int32 (默认: 1).
+
+* `--diy_beam_search_prob_so`
+  - 用户可以自定义beam search的方法，编译成动态库，供PaddlePaddle加载。 该参数用于指定动态库路径.
+  - 类型: string (默认: "", null).
+
+## 度量学习(Metric Learning)
+* `--external`
+   - 指示是否使用外部机器进行度量学习.
+   - 类型: bool (默认: 0).
+
+* `--data_server_port`
+  - 数据服务器(data server)的监听端口，主要用在度量学习中.
+  - 类型: int32 (默认: 21134).
+
+## 数据支持(DataProvider)
+
+* `--memory_threshold_on_load_data`
+  - 内存容限阈值，当超过该阈值时，停止加载数据.
+  - 类型: double (默认: 1.0).
+
+## 单元测试
+
+* `--checkgrad_eps`
+  - 使用checkgrad模式时的参数变化大小.
+  - 类型: double (默认: 1e-05).
+
+## 参数服务器和分布式通信
+
+* `--start_pserver`
+  - 指示是否开启参数服务器(parameter server).
+  - 类型: bool (默认: 0).
+
+* `--pservers`
+  - 参数服务器的IP地址，以逗号间隔.
+  - 类型: string (默认: "127.0.0.1").
+
+* `--port`
+  - 参数服务器的监听端口.
+  - 类型: int32 (默认: 20134).
+
+* `--ports_num`
+  - 发送参数的端口号，根据默认端口号递增.
+  - 类型: int32 (默认: 1).
+
+* `--trainer_id`
+  - 在分布式训练中，每个训练节点必须指定一个唯一的id号，从0到num_trainers-1。0号训练节点是主训练节点。使用者无需关心这个参数.
+  - 类型: int32 (默认: 0).
+
+* `--num_gradient_servers`
+  - 梯度服务器的数量，该参数在集群提交环境中自动设置.
+  - 类型: int32 (默认: 1).
+
+* `--small_messages`
+  - 如果消息数据太小，建议将该参数设为true，启动快速应答，无延迟.
+  - 类型: bool (默认: 0).
+
+* `--sock_send_buf_size`
+  - 限制套接字发送缓冲区的大小。如果仔细设置的话，可以有效减小网络的阻塞.
+  - 类型: int32 (默认: 1024 \* 1024 \* 40).
+
+* `--sock_recv_buf_size`
+  - 限制套接字接收缓冲区的大小.
+  - 类型: int32 (默认: 1024 \* 1024 \* 40).
+
+* `--parameter_block_size`
+  - 参数服务器的参数分块大小。如果未设置，将会自动计算出一个合适的值.
+  - 类型: int32 (默认: 0).
+
+* `--parameter_block_size_for_sparse`
+  - 参数服务器稀疏更新的参数分块大小。如果未设置，将会自动计算出一个合适的值.
+  - 类型: int32 (默认: 0).
+
+* `--log_period_server`
+  - 在参数服务器终端每log_period_server个批次打印日志进度.
+  - 类型: int32 (默认: 500).
+
+* `--loadsave_parameters_in_pserver`
+  - 在参数服务器上加载和保存参数，只有当设置了sparse_remote_update参数时才有效.
+  - 类型: bool (默认: 0).
+
+* `--pserver_num_threads`
+  - 同步执行操作的线程数.
+  - 类型: bool (默认: 1).
+
+* `--ports_num_for_sparse`
+  - 发送参数的端口号，根据默认值递增(port + ports_num)，用于稀疏训练中.
+  - 类型: int32 (默认: 0).
+
+* `--nics`
+  - 参数服务器的网络设备名称，已经在集群提交环境中完成设置.
+  - 类型: string (默认: "xgbe0,xgbe1").
+
+* `--rdma_tcp`
+  - 使用rdma还是tcp传输协议，该参数已经在集群提交环境中完成设置.
+  - 类型: string (默认: "tcp").
+
+## 异步随机梯度下降(Async SGD)
+* `--async_count`
+  - 定义异步训练的长度，如果为0，则使用同步训练.
+  - 类型: int32 (默认: 0).
+
+* `--async_lagged_ratio_min`
+  - 控制`config_.async_lagged_grad_discard_ratio()`的最小值.
+  - 类型: double (默认: 1.0).
+
+* `--async_lagged_ratio_default`
+  - 如果在网络配置中未设置async_lagged_grad_discard_ratio，则使用该参数作为默认值.
+  - 类型: double (默认: 1.5).
+
+## 性能调优(Performance Tuning)
+
+* `--log_barrier_abstract`
+  - 如果为true，则显示阻隔性能的摘要信息.
+  - 类型: bool (默认: 1).
+
+* `--log_barrier_show_log`
+  - 如果为true，则总会显示阻隔摘要信息，即使间隔很小.
+  - 类型: bool (默认: 0).
+
+* `--log_barrier_lowest_nodes`
+  - 最少显示多少个节点.
+  - 类型: int32 (默认: 5).
+
+* `--check_sparse_distribution_in_pserver`
+  - 指示是否检查所有参数服务器上的稀疏参数的分布是均匀的.
+  - 类型: bool (默认: 0).
+
+* `--show_check_sparse_distribution_log`
+  - 指示是否显示参数服务器上的稀疏参数分布的日志细节.
+  - 类型: bool (默认: 0).
+
+* `--allow_inefficient_sparse_update`
+  - 指示是否允许低效率的稀疏更新.
+  - 类型: bool (默认: 0).
+
+* `--check_sparse_distribution_batches`
+  - 每运行多少个批次执行一次稀疏参数分布的检查.
+  - 类型: int32 (默认: 100).
+
+* `--check_sparse_distribution_ratio`
+  - 如果检查到分配在不同参数服务器上的参数的分布不均匀次数大于check_sparse_distribution_ratio *  check_sparse_distribution_batches次，程序停止.
+  - 类型: double (默认: 0.6).
+
+* `--check_sparse_distribution_unbalance_degree`
+  - 不同参数服务器上数据大小的最大值与最小值的比率.
+  - 类型: double (默认: 2).
+
+## 矩阵/向量/随机数
+* `--enable_parallel_vector`
+  - 启动并行向量的阈值.
+  - 类型: int32 (默认: 0).
+
+* `--seed`
+  - 随机数的种子。srand(time)的为0.
+  - 类型: int32 (默认: 1)
+
+* `--thread_local_rand_use_global_seed`
+  - 是否将全局种子应用于本地线程的随机数.
+  - 类型: bool (默认: 0).
diff --git a/doc/howto/usage/cmd_parameter/detail_introduction_en.md b/doc/howto/usage/cmd_parameter/detail_introduction_en.md
index 27b2faf1d8..aa69a3bd54 100644
--- a/doc/howto/usage/cmd_parameter/detail_introduction_en.md
+++ b/doc/howto/usage/cmd_parameter/detail_introduction_en.md
@@ -73,7 +73,7 @@
   - type: bool (default: 0).
 
 * `--load_missing_parameter_strategy`
-  - Specify the loading operation when model file is missing. Now support fail/rand/zere three operations.
+  - Specify the loading operation when model file is missing. Now support fail/rand/zero three operations.
     - `fail`: program will exit.
     - `rand`: uniform or normal distribution according to **initial\_strategy** in network config. Uniform range is: **[mean - std, mean + std]**, where mean and std are configures in trainer config.
     - `zero`: all parameters are zero.
@@ -118,11 +118,11 @@
   - type: int32 (default: 0).
 
 * `--test_wait`
-  - Whether to wait for parameter per pass if not exist. If set test_data_path in submitting environment of cluster, it will launch one process to perfom testing, so we need to set test_wait=1. Note that in the cluster submitting environment, this argument has been set True by default.
+  - Whether to wait for parameter per pass if not exist. It can be used when user launch another process to perfom testing during the training process.
   - type: bool (default: 0).
 
 * `--model_list`
-  - File that saves the model list when testing. It was set automatically when using cluster submitting environment after setting model_path.
+  - File that saves the model list when testing. 
   - type: string (default: "", null).
 
 * `--predict_output_dir`
@@ -212,7 +212,7 @@
   - type: bool (default: 0).
 
 * `--pservers`
-  - Comma separated IP addresses of pservers. It is set automatically in cluster submitting environment.
+  - Comma separated IP addresses of pservers.
   - type: string (default: "127.0.0.1").
 
 * `--port`
diff --git a/doc/howto/usage/cmd_parameter/index_cn.rst b/doc/howto/usage/cmd_parameter/index_cn.rst
new file mode 100644
index 0000000000..4c87298211
--- /dev/null
+++ b/doc/howto/usage/cmd_parameter/index_cn.rst
@@ -0,0 +1,11 @@
+..  _cmd_line_index:
+
+设置命令行参数
+===============
+
+..  toctree::
+  :maxdepth: 1
+
+  use_case_cn.md
+  arguments_cn.md
+  detail_introduction_cn.md
diff --git a/doc/howto/usage/cmd_parameter/index_en.md b/doc/howto/usage/cmd_parameter/index_en.md
deleted file mode 100644
index 2a96e7e976..0000000000
--- a/doc/howto/usage/cmd_parameter/index_en.md
+++ /dev/null
@@ -1,8 +0,0 @@
-```eval_rst
-..  _cmd_line_index:
-```
-# Set Command-line Parameters
-
-* [Use Case](use_case_en.md)
-* [Arguments](arguments_en.md)
-* [Detailed Descriptions](detail_introduction_en.md)
diff --git a/doc/howto/usage/cmd_parameter/index_en.rst b/doc/howto/usage/cmd_parameter/index_en.rst
new file mode 100644
index 0000000000..0e3c72d27a
--- /dev/null
+++ b/doc/howto/usage/cmd_parameter/index_en.rst
@@ -0,0 +1,11 @@
+..  _cmd_line_index:
+
+Set Command-line Parameters
+===========================
+
+..  toctree::
+  :maxdepth: 1
+
+  use_case_en.md
+  arguments_en.md
+  detail_introduction_en.md
diff --git a/doc/howto/usage/cmd_parameter/use_case_cn.md b/doc/howto/usage/cmd_parameter/use_case_cn.md
new file mode 100644
index 0000000000..db8c39d950
--- /dev/null
+++ b/doc/howto/usage/cmd_parameter/use_case_cn.md
@@ -0,0 +1,182 @@
+# 使用案例
+
+## 本地训练
+
+本地训练的实验，诸如图像分类，自然语言处理等，通常都会使用下面这些命令行参数。
+
+```
+paddle train \
+  --use_gpu=1/0 \                        #1:GPU,0:CPU(默认为1)
+  --config=network_config \
+  --save_dir=output \
+  --trainer_count=COUNT \                #(默认为1)
+  --test_period=M \                      #(默认为0) 
+  --num_passes=N \                       #(默认为100)
+  --log_period=K \                       #(默认为100)
+  --dot_period=1000 \                    #(默认为1)
+  #[--show_parameter_stats_period=100] \ #(默认为0)
+  #[--saving_period_by_batches=200] \    #(默认为0)
+```
+根据你的任务，可以选择是否使用参数`show_parameter_stats_period`和`saving_period_by_batches`。
+
+### 1) 将命令参数传给网络配置
+
+`config_args`是一个很有用的参数，用于将参数传递给网络配置。
+
+```
+--config_args=generating=1,beam_size=5,layer_num=10 \
+```
+`get_config_arg`可用于在网络配置中解析这些参数，如下所示：
+
+```
+generating = get_config_arg('generating', bool, False)
+beam_size = get_config_arg('beam_size', int, 3)
+layer_num = get_config_arg('layer_num', int, 8)
+```
+
+`get_config_arg`:
+
+```
+get_config_arg(name, type, default_value)
+```
+- name: `--config_args`中指定的名字
+- type: 值类型，包括bool, int, str, float等
+- default_value: 默认值
+
+### 2) 使用模型初始化网络
+
+增加如下参数：
+
+```
+--init_model_path=model_path
+--load_missing_parameter_strategy=rand
+```
+
+## 本地测试
+
+方法一：
+
+```
+paddle train --job=test \
+             --use_gpu=1/0 \ 
+             --config=network_config \
+             --trainer_count=COUNT \ 
+             --init_model_path=model_path \
+```
+- 使用init\_model\_path指定测试的模型
+- 只能测试单个模型
+
+方法二：
+
+```
+paddle train --job=test \
+             --use_gpu=1/0 \ 
+             --config=network_config \
+             --trainer_count=COUNT \ 
+             --model_list=model.list \
+```
+- 使用model_list指定测试的模型列表
+- 可以测试多个模型，文件model.list如下所示：
+
+```
+./alexnet_pass1
+./alexnet_pass2
+```
+
+方法三：
+
+```
+paddle train --job=test \
+             --use_gpu=1/0 \
+             --config=network_config \
+             --trainer_count=COUNT \
+             --save_dir=model \
+             --test_pass=M \
+             --num_passes=N \
+```
+这种方式必须使用Paddle存储的模型路径格式，如：`model/pass-%5d`。测试的模型包括从第M轮到第N-1轮存储的所有模型。例如，M=12，N=14这种写法将会测试模型`model/pass-00012`和`model/pass-00013`。
+
+## 稀疏训练
+
+当输入是维度很高的稀疏数据时，通常使用稀疏训练来加速计算过程。例如，输入数据的字典维数是1百万，但是每个样本仅包含几个词。在Paddle中，稀疏矩阵的乘积应用于前向传播过程，而稀疏更新在反向传播之后的权重更新时进行。
+
+### 1) 本地训练
+
+用户需要在网络配置中指定**sparse\_update=True**。请参照网络配置的文档了解更详细的信息。
+
+### 2) 集群训练
+
+在集群上训练一个稀疏模型需要加上下面的参数。同时用户需要在网络配置中指定**sparse\_remote\_update=True**。请参照网络配置的文档了解更详细的信息。
+
+```
+--ports_num_for_sparse=1    #(默认为0)
+```
+
+## parallel_nn
+用户可以设置`parallel_nn`来混合使用GPU和CPU计算网络层的参数。也就是说，你可以将网络配置成某些层使用GPU计算，而其他层使用CPU计算。另一种方式是将网络层划分到不同的GPU上去计算，这样可以减小GPU内存，或者采用并行计算来加速某些层的更新。
+
+如果你想使用这些特性，你需要在网络配置中指定设备的ID号(表示为deviceId)，并且加上下面的命令行参数:
+
+```
+--parallel_nn=true
+```
+### 案例一：GPU和CPU混合使用
+请看下面的例子：
+
+```
+#command line:
+paddle train --use_gpu=true --parallel_nn=true trainer_count=COUNT
+
+default_device(0)
+
+fc1=fc_layer(...)
+fc2=fc_layer(...)
+fc3=fc_layer(...,layer_attr=ExtraAttr(device=-1))
+
+```
+- default_device(0): 设置默认设备号为0。这意味着除了指定device=-1的层之外，其他所有层都会使用GPU计算，每层使用的GPU号依赖于参数trainer\_count和gpu\_id(默认为0)。在此，fc1和fc2层在GPU上计算。
+
+- device=-1: fc3层使用CPU计算。
+
+- trainer_count:
+  - trainer_count=1: 如果未设置gpu\_id，那么fc1和fc2层将会使用第1个GPU来计算。否则使用gpu\_id指定的GPU。
+
+  - trainer_count>1: 在trainer\_count个GPU上使用数据并行来计算某一层。例如，trainer\_count=2意味着0号和1号GPU将会使用数据并行来计算fc1和fc2层。
+
+### 案例二：在不同设备上指定层
+
+```
+#command line:
+paddle train --use_gpu=true --parallel_nn=true --trainer_count=COUNT
+
+#network:
+fc2=fc_layer(input=l1, layer_attr=ExtraAttr(device=0), ...)
+fc3=fc_layer(input=l1, layer_attr=ExtraAttr(device=1), ...)
+fc4=fc_layer(input=fc2, layer_attr=ExtraAttr(device=-1), ...)
+```
+在本例中，我们假设一台机器上有4个GPU。
+
+- trainer_count=1:
+  - 使用0号GPU计算fc2层。
+  - 使用1号GPU计算fc3层。
+  - 使用CPU计算fc4层。
+
+- trainer_count=2:
+  - 使用0号和1号GPU计算fc2层。
+  - 使用2号和3号GPU计算fc3层。
+  - 使用CPU两线程计算fc4层。
+
+- trainer_count=4:
+  - 运行失败（注意到我们已经假设机器上有4个GPU），因为参数`allow_only_one_model_on_one_gpu`默认设置为真。
+
+**当`device!=-1`时设备ID号的分配：**
+
+```
+(deviceId + gpu_id + threadId * numLogicalDevices_) % numDevices_
+
+deviceId:             在层中指定
+gpu_id:               默认为0
+threadId:             线程ID号，范围: 0,1,..., trainer_count-1
+numDevices_:          机器的设备(GPU)数目
+numLogicalDevices_:   min(max(deviceId + 1), numDevices_)
+```
diff --git a/doc/howto/usage/cmd_parameter/use_case_en.md b/doc/howto/usage/cmd_parameter/use_case_en.md
index 4d7bb33f36..e287f0c4b9 100644
--- a/doc/howto/usage/cmd_parameter/use_case_en.md
+++ b/doc/howto/usage/cmd_parameter/use_case_en.md
@@ -134,14 +134,14 @@ fc2=fc_layer(...)
 fc3=fc_layer(...,layer_attr=ExtraAttr(device=-1))
 
 ```
-- default_device(0): set default device ID to 0. This means that except the layers with device=-1, all layers will use a GPU, and the specific GPU used for each layer depends on trainer\_count and gpu\_id (0 by default). Here, layer l1 and l2 are computed on the GPU.
+- default_device(0): set default device ID to 0. This means that except the layers with device=-1, all layers will use a GPU, and the specific GPU used for each layer depends on trainer\_count and gpu\_id (0 by default). Here, layer fc1 and fc2 are computed on the GPU.
 
-- device=-1: use the CPU for layer l3.
+- device=-1: use the CPU for layer fc3.
 
 - trainer_count:
-  - trainer_count=1: if gpu\_id is not set, then use the first GPU to compute layers l1 and l2. Otherwise use the GPU with gpu\_id.
+  - trainer_count=1: if gpu\_id is not set, then use the first GPU to compute layers fc1 and fc2. Otherwise use the GPU with gpu\_id.
 
-  - trainer_count>1: use trainer\_count GPUs to compute one layer using data parallelism. For example, trainer\_count=2 means that GPUs 0 and 1 will use data parallelism to compute layer l1 and l2.
+  - trainer_count>1: use trainer\_count GPUs to compute one layer using data parallelism. For example, trainer\_count=2 means that GPUs 0 and 1 will use data parallelism to compute layer fc1 and fc2.
 
 ### Case 2: Specify Layers in Different Devices
 
@@ -157,14 +157,14 @@ fc4=fc_layer(input=fc2, layer_attr=ExtraAttr(device=-1), ...)
 In this case, we assume that there are 4 GPUs in one machine.
 
 - trainer_count=1:
-  - Use GPU 0 to compute layer l2.
-  - Use GPU 1 to compute layer l3.
-  - Use CPU to compute layer l4.
+  - Use GPU 0 to compute layer fc2.
+  - Use GPU 1 to compute layer fc3.
+  - Use CPU to compute layer fc4.
 
 - trainer_count=2:
-  - Use GPU 0 and 1 to compute layer l2.
-  - Use GPU 2 and 3 to compute layer l3.
-  - Use CPU to compute l4 in two threads.
+  - Use GPU 0 and 1 to compute layer fc2.
+  - Use GPU 2 and 3 to compute layer fc3.
+  - Use CPU to compute fc4 in two threads.
 
 - trainer_count=4:
   - It will fail (note, we have assumed that there are 4 GPUs in machine), because argument `allow_only_one_model_on_one_gpu` is true by default.
diff --git a/doc/howto/usage/k8s/k8s_aws_en.md b/doc/howto/usage/k8s/k8s_aws_en.md
new file mode 100644
index 0000000000..b04bfba590
--- /dev/null
+++ b/doc/howto/usage/k8s/k8s_aws_en.md
@@ -0,0 +1,666 @@
+# Kubernetes on AWS
+
+## Create AWS Account and IAM Account
+
+To use AWS, we need to sign up an AWS account on Amazon's Web site.
+An AWS account allows us to login to the AWS Console Web interface to
+create IAM users and user groups. Usually, we create a user group with
+privileges required to run PaddlePaddle, and we create users for
+those who are going to run PaddlePaddle and add these users into the
+group. IAM users can identify themselves using password and tokens,
+where passwords allows users to log in to the AWS Console, and tokens
+make it easy for users to submit and inspect jobs from the command
+line.
+
+To sign up an AWS account, please
+follow
+[this guide](http://docs.aws.amazon.com/lambda/latest/dg/setting-up.html).
+To create users and user groups under an AWS account, please
+follow
+[this guide](http://docs.aws.amazon.com/IAM/latest/UserGuide/id_users_create.html).
+
+Please be aware that this tutorial needs the following privileges in
+the user group:
+
+- AmazonEC2FullAccess
+- AmazonS3FullAccess
+- AmazonRoute53FullAccess
+- AmazonRoute53DomainsFullAccess
+- AmazonElasticFileSystemFullAccess
+- AmazonVPCFullAccess
+- IAMUserSSHKeys
+- IAMFullAccess
+- NetworkAdministrator
+
+
+By the time we write this tutorial, we noticed that Chinese AWS users
+might suffer from authentication problems when running this tutorial.
+Our solution is that we create a VM instance with the default Amazon
+AMI and in the same zone as our cluster runs, so we can SSH to this VM
+instance as a tunneling server and control our cluster and jobs from
+it.
+
+
+## PaddlePaddle on AWS
+
+Here we will show you step by step on how to run PaddlePaddle training on AWS cluster.
+
+
+###Download kube-aws and kubectl
+
+####kube-aws
+
+Import the CoreOS Application Signing Public Key:
+
+```
+gpg2 --keyserver pgp.mit.edu --recv-key FC8A365E
+```
+
+Validate the key fingerprint:
+
+```
+gpg2 --fingerprint FC8A365E
+```
+The correct key fingerprint is `18AD 5014 C99E F7E3 BA5F 6CE9 50BD D3E0 FC8A 365E`
+
+Go to the [releases](https://github.com/coreos/kube-aws/releases) and download the latest release tarball and detached signature (.sig) for your architecture.
+
+Validate the tarball's GPG signature:
+
+```
+PLATFORM=linux-amd64
+ # Or
+PLATFORM=darwin-amd64
+
+gpg2 --verify kube-aws-${PLATFORM}.tar.gz.sig kube-aws-${PLATFORM}.tar.gz
+```
+
+Extract the binary:
+
+```
+tar zxvf kube-aws-${PLATFORM}.tar.gz
+```
+
+Add kube-aws to your path:
+
+```
+mv ${PLATFORM}/kube-aws /usr/local/bin
+```
+
+
+####kubectl
+
+Go to the [releases](https://github.com/kubernetes/kubernetes/releases) and download the latest release tarball.
+
+Extract the tarball and then concate the kubernetes binaries directory into PATH:
+
+```
+export PATH=<path/to/kubernetes-directory>/platforms/linux/amd64:$PATH
+
+```
+
+User credentials and security tokens will be generated later in user directory, not in `~/.kube/config`, they will be necessary to use the CLI or the HTTP Basic Auth.
+
+
+###Configure AWS Credentials
+
+First check out [this](http://docs.aws.amazon.com/cli/latest/userguide/installing.html) for installing the AWS command line interface, if you use ec2 instance with default amazon AMI, the cli tool has already been installed on your machine.
+
+
+And then configure your AWS account information:
+
+```
+aws configure
+
+```
+
+
+Fill in the required fields (You can get your AWS aceess key id and AWS secrete access key by following [this](http://docs.aws.amazon.com/cli/latest/userguide/cli-chap-getting-started.html) instruction):
+
+
+```
+AWS Access Key ID: YOUR_ACCESS_KEY_ID
+AWS Secrete Access Key: YOUR_SECRETE_ACCESS_KEY
+Default region name: us-west-2
+Default output format: json
+
+```
+
+Test that your credentials work by describing any instances you may already have running on your account:
+
+```
+aws ec2 describe-instances
+```
+
+###Define Cluster Parameters
+
+####EC2 key pair
+
+The keypair that will authenticate SSH access to your EC2 instances. The public half of this key pair will be configured on each CoreOS node.
+
+After creating a key pair, you will use the name you gave the keys to configure the cluster. Key pairs are only available to EC2 instances in the same region. More info in the [EC2 Keypair docs](http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ec2-key-pairs.html).
+
+####KMS key
+
+Amazon KMS keys are used to encrypt and decrypt cluster TLS assets. If you already have a KMS Key that you would like to use, you can skip creating a new key and provide the Arn string for your existing key.
+
+You can create a KMS key in the AWS console, or with the aws command line tool:
+
+```
+$ aws kms --region=us-west-2 create-key --description="kube-aws assets"
+{
+    "KeyMetadata": {
+        "CreationDate": 1458235139.724,
+        "KeyState": "Enabled",
+        "Arn": "arn:aws:kms:us-west-2:xxxxxxxxx:key/xxxxxxxxxxxxxxxxxxx",
+        "AWSAccountId": "xxxxxxxxxxxxx",
+        "Enabled": true,
+        "KeyUsage": "ENCRYPT_DECRYPT",
+        "KeyId": "xxxxxxxxx",
+        "Description": "kube-aws assets"
+    }
+}
+```
+
+You will use the `KeyMetadata.Arn` string to identify your KMS key in the init step.
+
+And then you need to add several inline policies in your user permission.
+
+kms inline policy:
+
+```
+{
+    "Version": "2012-10-17",
+    "Statement": [
+        {
+            "Sid": "Stmt1482205552000",
+            "Effect": "Allow",
+            "Action": [
+                "kms:Decrypt",
+                "kms:Encrypt"
+            ],
+            "Resource": [
+                "arn:aws:kms:*:xxxxxxxxx:key/*"
+            ]
+        }
+    ]
+}
+```
+cloudformation inline policy:
+
+```
+"Version": "2012-10-17",
+    "Statement": [
+        {
+            "Sid": "Stmt1482205746000",
+            "Effect": "Allow",
+            "Action": [
+                "cloudformation:CreateStack",
+                "cloudformation:UpdateStack",
+                "cloudformation:DeleteStack",
+                "cloudformation:DescribeStacks",
+                "cloudformation:DescribeStackResource",
+                "cloudformation:GetTemplate"
+            ],
+            "Resource": [
+                "arn:aws:cloudformation:us-west-2:xxxxxxxxx:stack/YOUR_CLUSTER_NAME/*"
+            ]
+        }
+    ]
+}
+```
+
+
+####External DNS name
+
+When the cluster is created, the controller will expose the TLS-secured API on a public IP address. You will need to create an A record for the external DNS hostname you want to point to this IP address. You can find the API external IP address after the cluster is created by invoking kube-aws status.
+
+####S3 bucket
+
+You need to create an S3 bucket before startup the Kubernetes cluster.
+
+####Initialize an asset directory
+
+Create a directory on your local machine to hold the generated assets:
+
+```
+$ mkdir my-cluster
+$ cd my-cluster
+```
+
+Initialize the cluster CloudFormation stack with the KMS Arn, key pair name, and DNS name from the previous step:
+
+```
+$ kube-aws init \
+--cluster-name=my-cluster-name \
+--external-dns-name=my-cluster-endpoint \
+--region=us-west-1 \
+--availability-zone=us-west-1c \
+--key-name=key-pair-name \
+--kms-key-arn="arn:aws:kms:us-west-2:xxxxxxxxxx:key/xxxxxxxxxxxxxxxxxxx"
+```
+
+There will now be a cluster.yaml file in the asset directory. This is the main configuration file for your cluster.
+
+####Render contents of the asset directory
+
+In the simplest case, you can have kube-aws generate both your TLS identities and certificate authority for you.
+
+```
+$ kube-aws render credentials --generate-ca
+```
+
+The next command generates the default set of cluster assets in your asset directory.
+
+```
+sh $ kube-aws render stack
+```
+
+Here's what the directory structure looks like:
+
+```
+$ tree
+.
+├── cluster.yaml
+├── credentials
+│   ├── admin-key.pem
+│   ├── admin.pem
+│   ├── apiserver-key.pem
+│   ├── apiserver.pem
+│   ├── ca-key.pem
+│   ├── ca.pem
+│   ├── worker-key.pem
+│   └── worker.pem
+│   ├── etcd-key.pem
+│   └── etcd.pem
+│   ├── etcd-client-key.pem
+│   └── etcd-client.pem
+├── kubeconfig
+├── stack-template.json
+└── userdata
+    ├── cloud-config-controller
+    └── cloud-config-worker
+```
+
+These assets (templates and credentials) are used to create, update and interact with your Kubernetes cluster.
+
+
+###Kubernetes Cluster Start Up
+
+####Create the instances defined in the CloudFormation template
+
+Now for the exciting part, creating your cluster:
+
+```
+$ kube-aws up --s3-uri s3://<your-bucket-name>/<prefix>
+```
+
+####Configure DNS
+
+You can invoke `kube-aws status` to get the cluster API endpoint after cluster creation, if necessary. This command can take a while. And then dig the load balancer hostname to get the ip address, use this ip to setup an A record for your external dns name.
+
+####Access the cluster
+
+Once the API server is running, you should see:
+
+```
+$ kubectl --kubeconfig=kubeconfig get nodes
+NAME                                       STATUS                     AGE
+ip-10-0-0-xxx.us-west-1.compute.internal   Ready                      5m
+ip-10-0-0-xxx.us-west-1.compute.internal   Ready                      5m
+ip-10-0-0-xx.us-west-1.compute.internal    Ready,SchedulingDisabled   5m
+```
+
+
+###Setup PaddlePaddle Environment on AWS
+
+Now, we've created a cluster with following network capability:
+
+1. All Kubernetes nodes can communicate with each other.
+
+1. All Docker containers on Kubernetes nodes can communicate with each other.
+
+1. All Kubernetes nodes can communicate with all Docker containers on Kubernetes nodes.
+
+1. All other traffic loads from outside of Kubernetes nodes cannot reach to the Docker containers on Kubernetes nodes except for creating the services for containers.
+
+
+For sharing the training data across all the Kubernetes nodes, we use EFS (Elastic File System) in AWS. Ceph might be a better solution, but it requires high version of Linux kernel that might not be stable enough at this moment. We haven't automated the EFS setup at this moment, so please do the following steps:
+
+
+1. Make sure you added AmazonElasticFileSystemFullAccess policy in your group.
+
+1. Create the Elastic File System in AWS console, and attach the new VPC with it.
+<center>![](src/create_efs.png)</center>
+
+
+1. Modify the Kubernetes security group under ec2/Security Groups, add additional inbound policy "All TCP TCP 0 - 65535 0.0.0.0/0" for Kubernetes default VPC security group. 
+<center>![](src/add_security_group.png)</center>
+
+
+1. Follow the EC2 mount instruction to mount the disk onto all the Kubernetes nodes, we recommend to mount EFS disk onto ~/efs.
+<center>![](src/efs_mount.png)</center>
+
+
+Before starting the training, you should place your user config and divided training data onto EFS. When the training start, each task will copy related files from EFS into container, and it will also write the training results back onto EFS, we will show you how to place the data later in this article.
+
+
+
+###Core Concept of PaddlePaddle Training on AWS
+
+Now we've already setup a 3 nodes distributed Kubernetes cluster, and on each node we've attached the EFS volume, in this training demo, we will create three Kubernetes pod and scheduling them on 3 node. Each pod contains a PaddlePaddle container. When container gets created, it will start pserver and trainer process, load the training data from EFS volume and start the distributed training task.
+
+####Use Kubernetes Job
+
+We use Kubernetes job to represent one time of distributed training. After the job get finished, Kubernetes will destroy job container and release all related resources.
+
+We can write a yaml file to describe the Kubernetes job. The file contains lots of configuration information, for example PaddlePaddle's node number, `paddle pserver` open port number, the network card info etc., these information are passed into container for processes to use as environment variables.
+
+In one time of distributed training, user will confirm the PaddlePaddle node number first. And then upload the pre-divided training data and configuration file onth EFS volume. And then create the Kubernetes job yaml file; submit to the Kubernetes cluster to start the training job.
+
+####Create PaddlePaddle Node
+
+After Kubernetes master gets the request, it will parse the yaml file and create several pods (defined by PaddlePaddle's node number), Kubernetes will allocate these pods onto cluster's node. A pod represents a PaddlePaddle node, when pod is successfully allocated onto one physical/virtual machine, Kubernetes will startup the container in the pod, and this container will use the environment variables in yaml file and start up `paddle pserver` and `paddle trainer` processes.
+
+
+####Start up Training
+
+After container gets started, it starts up the distributed training by using scripts. We know `paddle train` process need to know other node's ip address and it's own trainer_id, since PaddlePaddle currently don't have the ability to do the service discovery, so in the start up script, each node will use job pod's name to query all to pod info from Kubernetes apiserver (apiserver's endpoint is an environment variable in container by default).
+
+With pod information, we can assign each pod a unique trainer_id. Here we sort all the pods by pod's ip, and assign the index to each PaddlePaddle node as it's trainer_id. The workflow of starting up the script is as follows:
+
+1. Query the api server to get pod information, and assign the trainer_id by sorting the ip.
+1. Copy the training data from EFS sharing volume into container.
+1. Parse the `paddle pserver` and 'paddle trainer' startup parameters from environment variables, and then start up the processes.
+1. PaddlePaddle will automatically write the result onto the PaddlePaddle node with trainer_id:0, we set the output path to be the EFS volume to save the result data.
+
+
+###Start PaddlePaddle Training Demo on AWS
+
+Now we'll start a PaddlePaddle training demo on AWS, steps are as follows:
+
+1. Build PaddlePaddle Docker image.
+1. Divide the training data file and upload it onto the EFS sharing volume.
+1. Create the training job yaml file, and start up the job.
+1. Check the result after training.
+
+####Build PaddlePaddle Docker Image
+
+PaddlePaddle docker image need to provide the runtime environment for `paddle pserver` and `paddle train`, so the container use this image should have two main function:
+
+1. Copy the training data into container.
+1. Generate the startup parameter for `paddle pserver` and `paddle train` process, and startup the training.
+
+
+Since official `paddledev/paddle:cpu-latest` have already included the PaddlePaddle binary, but lack of the above functionalities, so we will create the startup script based on this image, to achieve the work above. the detailed Dockerfile is as follows:
+
+```
+FROM paddledev/paddle:cpu-latest
+
+MAINTAINER zjsxzong89@gmail.com
+
+COPY start.sh /root/
+COPY start_paddle.py /root/
+CMD ["bash"," -c","/root/start.sh"]
+```
+
+At this point, we will copy our `start.sh` and `start_paddle.py` file into container, and then exec `start_paddle.py` script to start up the training, all the steps like assigning trainer_id, getting other nodes' ip are implemented in `start_paddle.py`.
+
+`start_paddle.py` will start parsing the parameters.
+
+```
+parser = argparse.ArgumentParser(prog="start_paddle.py",
+                                     description='simple tool for k8s')
+    args, train_args_list = parser.parse_known_args()
+    train_args = refine_unknown_args(train_args_list)
+    train_args_dict = dict(zip(train_args[:-1:2], train_args[1::2]))
+    podlist = getPodList()
+```
+
+And then using function `getPodList()` to query all the pod information from the job name through Kubernetes api server. When all the pods are in the running status, using `getIdMap(podlist)` to get the trainer_id.
+
+```
+    podlist = getPodList()
+    # need to wait until all pods are running
+    while not isPodAllRunning(podlist):
+        time.sleep(10)
+        podlist = getPodList()
+    idMap = getIdMap(podlist)
+```
+
+In function `getIdMap(podlist)`, we use podlist to get the ip address for each pod and sort them, use the index as the trainer_id.
+
+```
+def getIdMap(podlist):
+    '''
+    generate tainer_id by ip
+    '''
+    ips = []
+    for pod in podlist["items"]:
+        ips.append(pod["status"]["podIP"])
+    ips.sort()
+    idMap = {}
+    for i in range(len(ips)):
+        idMap[ips[i]] = i
+    return idMap
+```
+
+After getting `idMap`, we use function `startPaddle(idMap, train_args_dict)` to generate `paddle pserver` and `paddle train` start up parameters and then start up the processes.
+
+In function `startPaddle`, the most important work is to generate `paddle pserver` and `paddle train` start up parameters. For example, `paddle train` parameter parsing, we will get parameters like `PADDLE_NIC`, `PADDLE_PORT`, `PADDLE_PORTS_NUM`, and get the `trainer_id` from `idMap`.
+
+```
+    program = 'paddle train'
+    args = " --nics=" + PADDLE_NIC
+    args += " --port=" + str(PADDLE_PORT)
+    args += " --ports_num=" + str(PADDLE_PORTS_NUM)
+    args += " --comment=" + "paddle_process_by_paddle"
+    ip_string = ""
+    for ip in idMap.keys():
+        ip_string += (ip + ",")
+    ip_string = ip_string.rstrip(",")
+    args += " --pservers=" + ip_string
+    args_ext = ""
+    for key, value in train_args_dict.items():
+        args_ext += (' --' + key + '=' + value)
+    localIP = socket.gethostbyname(socket.gethostname())
+    trainerId = idMap[localIP]
+    args += " " + args_ext + " --trainer_id=" + \
+        str(trainerId) + " --save_dir=" + JOB_PATH_OUTPUT
+```
+
+Use `docker build` to build toe Docker Image:
+
+```
+docker build -t your_repo/paddle:mypaddle .
+```
+
+And then push the built image onto docker registry.
+
+```
+docker push  your_repo/paddle:mypaddle
+```
+
+####Upload Training Data File
+
+Here we will use PaddlePaddle's official recommendation demo as the content for this training, we put the training data file into a directory named by job name, which located in EFS sharing volume, the tree structure for the directory looks like:
+
+```
+efs
+└── paddle-cluster-job
+    ├── data
+    │   ├── 0
+    │   │
+    │   ├── 1
+    │   │
+    │   └── 2
+    ├── output
+    └── recommendation
+```
+
+The `paddle-cluster-job` directory is the job name for this training, this training includes 3 PaddlePaddle node, we store the pre-divided data under `paddle-cluster-job/data` directory, directory 0, 1, 2 each represent 3 nodes' trainer_id. the training data in in recommendation directory, the training results and logs will be in the output directory.
+
+
+####Create Kubernetes Job
+
+Kubernetes use yaml file to describe job details, and then use command line tool to create the job in Kubernetes cluster.
+
+In yaml file, we describe the Docker image we use for this training, the node number we need to startup, the volume mounting information and all the necessary parameters we need for `paddle pserver` and `paddle train` processes.
+
+The yaml file content is as follows:
+
+```
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: paddle-cluster-job
+spec:
+  parallelism: 3
+  completions: 3
+  template:
+    metadata:
+      name: paddle-cluster-job
+    spec:
+      volumes:
+      - name: jobpath
+        hostPath:
+          path: /home/admin/efs
+      containers:
+      - name: trainer
+        image: drinkcode/paddle:k8s-job
+        command: ["bin/bash",  "-c", "/root/start.sh"]
+        env:
+        - name: JOB_NAME
+          value: paddle-cluster-job
+        - name: JOB_PATH
+          value: /home/jobpath
+        - name: JOB_NAMESPACE
+          value: default
+        - name: TRAIN_CONFIG_DIR
+          value: recommendation
+        - name: CONF_PADDLE_NIC
+          value: eth0
+        - name: CONF_PADDLE_PORT
+          value: "7164"
+        - name: CONF_PADDLE_PORTS_NUM
+          value: "2"
+        - name: CONF_PADDLE_PORTS_NUM_SPARSE
+          value: "2"
+        - name: CONF_PADDLE_GRADIENT_NUM
+          value: "3"
+        volumeMounts:
+        - name: jobpath
+          mountPath: /home/jobpath
+        ports:
+        - name: jobport
+          hostPort: 30001
+          containerPort: 30001
+      restartPolicy: Never
+
+```
+
+In yaml file, the metadata's name is the job's name. `parallelism, completions` means this job will simultaneously start up 3 PaddlePaddle nodes, and this job will be finished when there are 3 finished pods. For the data store volume, we declare the path jobpath, it mount the /home/admin/efs on host machine into the container with path /home/jobpath. So in container, the /home/jobpath actually stores the data onto EFS sharing volume.
+
+`env` field represents container's environment variables, we pass the PaddlePaddle parameters into containers by using the `env` field.
+
+`JOB_PATH` represents the sharing volume path, `JOB_NAME` represents job name, `TRAIN_CONFIG_DIR` represents the training data file directory, we can these three parameters to get the file path for this training.
+
+`CONF_PADDLE_NIC` represents `paddle pserver` process's `--nics` parameters, the NIC name.
+
+`CONF_PADDLE_PORT` represents `paddle pserver` process's `--port` parameters, `CONF_PADDLE_PORTS_NUM` represents `--port_num` parameter.
+
+`CONF_PADDLE_PORTS_NUM_SPARSE` represents the sparse updated port number, `--ports_num_for_sparse` parameter.
+
+`CONF_PADDLE_GRADIENT_NUM` represents the training node number, `--num_gradient_servers` parameter.
+
+After we create the yaml file, we can use Kubernetes command line tool to create the job onto the cluster.
+
+```
+kubectl create -f job.yaml
+```
+
+After we execute the above command, Kubernetes will create 3 pods and then pull the PaddlePaddle image, then start up the containers for training.
+
+
+
+####Check Training Results
+
+During the training, we can see the logs and models on EFS sharing volume, the output directory contains the training results. (Caution: node_0, node_1, node_2 directories represents PaddlePaddle node and train_id, not the Kubernetes node)
+
+```
+[root@paddle-kubernetes-node0 output]# tree -d
+.
+├── node_0
+│   ├── server.log
+│   └── train.log
+├── node_1
+│   ├── server.log
+│   └── train.log
+├── node_2
+......
+├── pass-00002
+│   ├── done
+│   ├── ___embedding_0__.w0
+│   ├── ___embedding_1__.w0
+......
+```
+
+We can always check the container training status through logs, for example:
+
+```
+[root@paddle-kubernetes-node0 node_0]# cat train.log
+I1116 09:10:17.123121    50 Util.cpp:155] commandline:
+ /usr/local/bin/../opt/paddle/bin/paddle_trainer
+    --nics=eth0 --port=7164
+    --ports_num=2 --comment=paddle_process_by_paddle
+    --pservers=192.168.129.66,192.168.223.143,192.168.129.71
+    --ports_num_for_sparse=2 --config=./trainer_config.py
+    --trainer_count=4 --num_passes=10 --use_gpu=0 
+    --log_period=50 --dot_period=10 --saving_period=1 
+    --local=0 --trainer_id=0
+    --save_dir=/home/jobpath/paddle-cluster-job/output
+I1116 09:10:17.123440    50 Util.cpp:130] Calling runInitFunctions
+I1116 09:10:17.123764    50 Util.cpp:143] Call runInitFunctions done.
+[WARNING 2016-11-16 09:10:17,227 default_decorators.py:40] please use keyword arguments in paddle config.
+[INFO 2016-11-16 09:10:17,239 networks.py:1282] The input order is [movie_id, title, genres, user_id, gender, age, occupation, rating]
+[INFO 2016-11-16 09:10:17,239 networks.py:1289] The output order is [__regression_cost_0__]
+I1116 09:10:17.392917    50 Trainer.cpp:170] trainer mode: Normal
+I1116 09:10:17.613910    50 PyDataProvider2.cpp:257] loading dataprovider dataprovider::process
+I1116 09:10:17.680917    50 PyDataProvider2.cpp:257] loading dataprovider dataprovider::process
+I1116 09:10:17.681543    50 GradientMachine.cpp:134] Initing parameters..
+I1116 09:10:18.012390    50 GradientMachine.cpp:141] Init parameters done.
+I1116 09:10:18.018641    50 ParameterClient2.cpp:122] pserver 0 192.168.129.66:7164
+I1116 09:10:18.018950    50 ParameterClient2.cpp:122] pserver 1 192.168.129.66:7165
+I1116 09:10:18.019069    50 ParameterClient2.cpp:122] pserver 2 192.168.223.143:7164
+I1116 09:10:18.019492    50 ParameterClient2.cpp:122] pserver 3 192.168.223.143:7165
+I1116 09:10:18.019716    50 ParameterClient2.cpp:122] pserver 4 192.168.129.71:7164
+I1116 09:10:18.019836    50 ParameterClient2.cpp:122] pserver 5 192.168.129.71:7165
+```
+
+It'll take around 8 hours to finish this PaddlePaddle recommendation training demo on three 2 core 8 GB EC2 machine (m3.large).
+
+
+###Kubernetes Cluster Tear Down
+
+
+If you want to tear down the whole Kubernetes cluster, make sure to *delete* the EFS volume first (otherwise, you will get stucked on following steps), and then use the following command:
+
+```
+kube-aws destroy
+```
+It's an async call, it might take 5 min to tear down the whole cluster.
+
+If you created any Kubernetes Services of type LoadBalancer, you must delete these first, as the CloudFormation cannot be fully destroyed if any externally-managed resources still exist.
+
+
+
+## For Experts with Kubernetes and AWS
+
+Sometimes we might need to create or manage the cluster on AWS manually with limited privileges, so here we will explain more on what’s going on with the Kubernetes setup script.
+
+### Some Presumptions
+
+* Instances run on CoreOS, the official IAM.
+* Kubernetes node use instance storage, no EBS get mounted. Etcd is running on additional node.
+* For networking, we use Flannel network at this moment, we will use Calico solution later on.
+* When you create a service with Type=LoadBalancer, Kubernetes will create and ELB, and create a security group for the ELB.
diff --git a/doc/howto/usage/cluster/k8s/k8s_cn.md b/doc/howto/usage/k8s/k8s_cn.md
similarity index 99%
rename from doc/howto/usage/cluster/k8s/k8s_cn.md
rename to doc/howto/usage/k8s/k8s_cn.md
index 2575701053..ab07cb9cd5 100644
--- a/doc/howto/usage/cluster/k8s/k8s_cn.md
+++ b/doc/howto/usage/k8s/k8s_cn.md
@@ -1,4 +1,4 @@
-# Kubernetes 单机训练
+# Kubernetes单机训练
 
 在这篇文档里，我们介绍如何在 Kubernetes 集群上启动一个单机使用CPU的Paddle训练作业。在下一篇中，我们将介绍如何启动分布式训练作业。
 
diff --git a/doc/howto/usage/cluster/k8s/k8s_distributed_cn.md b/doc/howto/usage/k8s/k8s_distributed_cn.md
similarity index 99%
rename from doc/howto/usage/cluster/k8s/k8s_distributed_cn.md
rename to doc/howto/usage/k8s/k8s_distributed_cn.md
index 53d0b4676c..b63b8437a0 100644
--- a/doc/howto/usage/cluster/k8s/k8s_distributed_cn.md
+++ b/doc/howto/usage/k8s/k8s_distributed_cn.md
@@ -1,4 +1,4 @@
-# Kubernetes 分布式训练
+# Kubernetes分布式训练
 
 前一篇文章介绍了如何在Kubernetes集群上启动一个单机PaddlePaddle训练作业 (Job)。在这篇文章里，我们介绍如何在Kubernetes集群上进行分布式PaddlePaddle训练作业。关于PaddlePaddle的分布式训练，文章 [Cluster Training](https://github.com/baidu/Paddle/blob/develop/doc/cluster/opensource/cluster_train.md)介绍了一种通过SSH远程分发任务，进行分布式训练的方法，与此不同的是，本文将介绍在Kubernetes容器管理平台上快速构建PaddlePaddle容器集群，进行分布式训练的方案。
 
@@ -22,7 +22,7 @@
 
 首先，我们需要拥有一个Kubernetes集群，在这个集群中所有node与pod都可以互相通信。关于Kubernetes集群搭建，可以参考[官方文档](http://kubernetes.io/docs/getting-started-guides/kubeadm/)，在以后的文章中我们也会介绍AWS上搭建的方案。本文假设大家能找到几台物理机，并且可以按照官方文档在上面部署Kubernetes。在本文的环境中，Kubernetes集群中所有node都挂载了一个[MFS](http://moosefs.org/)（Moose filesystem，一种分布式文件系统）共享目录，我们通过这个目录来存放训练文件与最终输出的模型。关于MFS的安装部署，可以参考[MooseFS documentation](https://moosefs.com/documentation.html)。在训练之前，用户将配置与训练数据切分好放在MFS目录中，训练时，程序从此目录拷贝文件到容器内进行训练，将结果保存到此目录里。整体的结构图如下：
 
-![paddle on kubernetes结构图](k8s-paddle-arch.png)
+![paddle on kubernetes结构图](src/k8s-paddle-arch.png)
 
 上图描述了一个3节点的分布式训练场景，Kubernetes集群的每个node上都挂载了一个MFS目录，这个目录可以通过volume的形式挂载到容器中。Kubernetes为这次训练创建了3个pod并且调度到了3个node上运行，每个pod包含一个PaddlePaddle容器。在容器创建后，会启动pserver与trainer进程，读取volume中的数据进行这次分布式训练。
 
diff --git a/doc/howto/usage/k8s/k8s_en.md b/doc/howto/usage/k8s/k8s_en.md
new file mode 100644
index 0000000000..0c3ab05b70
--- /dev/null
+++ b/doc/howto/usage/k8s/k8s_en.md
@@ -0,0 +1,201 @@
+# Paddle On Kubernetes
+
+>In this article, we will introduce how to run Paddle training job on single CPU machine using Kubernetes. In next article, we will introduce how to run Paddle training job on distributed cluster.
+
+## Build Docker Image
+
+In distributed Kubernetes cluster, we will use Ceph or other shared storage system for storing training related data so that all processes in Paddle training can retrieve data from Ceph. In this example, we will only demo training job on single machine. In order to simplify the requirement of the environment, we will directly put training data into Paddle's Docker Image, so we need to create a Paddle Docker image that already includes the training data.
+
+Paddle's [Quick Start Tutorial](http://www.paddlepaddle.org/doc/demo/quick_start/index_en.html) introduces how to download and train data by using script from Paddle's source code.
+And `paddledev/paddle:cpu-demo-latest` image has the Paddle source code and demo. (Caution: Default Paddle image `paddledev/paddle:cpu-latest` doesn't include the source code, Paddle's different versions of image can be referred here: [Docker installation guide](http://www.paddlepaddle.org/doc/build/docker_install.html)), so we run this container and download the training data, and then commit the whole container to be a new Docker image.
+  
+### Run Docker Container
+
+```
+$ docker run --name quick_start_data -it paddledev/paddle:cpu-demo-latest
+```
+
+### Download Training Data
+
+Getting into `/root/paddle/demo/quick_start/data` Directory，using `get_data.sh` to download training data.
+Then getting into `/root/paddle/demo/quick_start` Directory, using `preprocess.sh` to pre-process training data.
+
+```
+$ root@fbd1f2bb71f4:~/paddle/demo/quick_start/data# ./get_data.sh
+
+Downloading Amazon Electronics reviews data...
+--2016-10-31 01:33:43--  http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Electronics_5.json.gz
+Resolving snap.stanford.edu (snap.stanford.edu)... 171.64.75.80
+Connecting to snap.stanford.edu (snap.stanford.edu)|171.64.75.80|:80... connected.
+HTTP request sent, awaiting response... 200 OK
+Length: 495854086 (473M) [application/x-gzip]
+Saving to: 'reviews_Electronics_5.json.gz'
+
+ 10% [=======>                                         ] 874,279     64.7KB/s  eta 2h 13m
+
+```
+
+### Modify Startup Script
+
+After downloading the data，modify `/root/paddle/demo/quick_start/train.sh` file contents are as follows (one more cd cmd):
+```
+set -e
+cd /root/paddle/demo/quick_start
+cfg=trainer_config.lr.py
+#cfg=trainer_config.emb.py
+#cfg=trainer_config.cnn.py
+#cfg=trainer_config.lstm.py
+#cfg=trainer_config.bidi-lstm.py
+#cfg=trainer_config.db-lstm.py
+paddle train \
+  --config=$cfg \
+  --save_dir=./output \
+  --trainer_count=4 \
+  --log_period=20 \
+  --num_passes=15 \
+  --use_gpu=false \
+  --show_parameter_stats_period=100 \
+  --test_all_data_in_one_period=1 \
+  2>&1 | tee 'train.log'
+```
+
+### Commit Docker Image
+
+```
+$ docker commit quick_start_data mypaddle/paddle:quickstart
+```
+
+## Use Kubernetes For Training
+
+>We will use Kubernetes job for training process, following steps shows how to do the training with Kubernetes.
+
+### Create Yaml Files
+
+The output result in container will be demolished when job finished (container stopped running), so we need to mount the volume out to the local disk when creating the container to store the training result. Using our previously created image, we can create a [Kubernetes Job](http://kubernetes.io/docs/user-guide/jobs/#what-is-a-job), the yaml contents are as follows:
+
+```
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: quickstart
+spec:
+  parallelism: 1
+  completions: 1
+  template:
+    metadata:
+      name: quickstart
+    spec:
+      volumes:
+      - name: output
+        hostPath: 
+          path: /home/work/paddle_output     
+      containers:
+      - name: pi
+        image: mypaddle/paddle:quickstart
+        command: ["bin/bash",  "-c", "/root/paddle/demo/quick_start/train.sh"]
+        volumeMounts:
+        - name: output
+          mountPath: /root/paddle/demo/quick_start/output
+      restartPolicy: Never
+```
+
+### Start Paddle Job
+
+Using the above yaml file to start the Kubernetes job.
+
+```
+$ kubectl  create -f paddle.yaml
+```
+
+Get the detailed status of the job:
+
+```
+$ kubectl  get job
+NAME         DESIRED   SUCCESSFUL   AGE
+quickstart   1         0            58s
+
+$ kubectl  describe job quickstart
+Name:		quickstart
+Namespace:	default
+Image(s):	registry.baidu.com/public/paddle:cpu-demo-latest
+Selector:	controller-uid=f120da72-9f18-11e6-b363-448a5b355b84
+Parallelism:	1
+Completions:	1
+Start Time:	Mon, 31 Oct 2016 11:20:16 +0800
+Labels:		controller-uid=f120da72-9f18-11e6-b363-448a5b355b84,job-name=quickstart
+Pods Statuses:	0 Running / 1 Succeeded / 0 Failed
+Volumes:
+  output:
+    Type:	HostPath (bare host directory volume)
+    Path:	/home/work/paddle_output
+Events:
+  FirstSeen	LastSeen	Count	From			SubobjectPath	Type		Reason			Message
+  ---------	--------	-----	----			-------------	--------	------			-------
+  1m		1m		1	{job-controller }			Normal		SuccessfulCreate	Created pod: quickstart-fa0wx
+```
+
+### Get Training Result
+
+We can use kubectl command to take a look at the status of related pod.
+
+```
+$ kubectl  describe pod quickstart-fa0wx
+Name:		quickstart-fa0wx
+Namespace:	default
+Node:		paddle-demo-let02/10.206.202.44
+Start Time:	Mon, 31 Oct 2016 11:20:17 +0800
+Labels:		controller-uid=f120da72-9f18-11e6-b363-448a5b355b84,job-name=quickstart
+Status:		Succeeded
+IP:		10.0.0.9
+Controllers:	Job/quickstart
+Containers:
+  quickstart:
+    Container ID:	docker://b8561f5c79193550d64fa47418a9e67ebdd71546186e840f88de5026b8097465
+    Image:		registry.baidu.com/public/paddle:cpu-demo-latest
+    Image ID:		docker://18e457ce3d362ff5f3febf8e7f85ffec852f70f3b629add10aed84f930a68750
+    Port:
+    Command:
+      bin/bash
+      -c
+      /root/paddle/demo/quick_start/train.sh
+    QoS Tier:
+      cpu:		BestEffort
+      memory:		BestEffort
+    State:		Terminated
+      Reason:		Completed
+      Exit Code:	0
+      Started:		Mon, 31 Oct 2016 11:20:20 +0800
+      Finished:		Mon, 31 Oct 2016 11:21:46 +0800
+    Ready:		False
+    Restart Count:	0
+    Environment Variables:
+Conditions:
+  Type		Status
+  Ready 	False
+Volumes:
+  output:
+    Type:	HostPath (bare host directory volume)
+    Path:	/home/work/paddle_output
+```
+
+We can also ssh to Kubernetes node to take a look at the training result.
+
+```
+[root@paddle-demo-let02 paddle_output]# ll
+total 60
+drwxr-xr-x 2 root root 4096 Oct 31 11:20 pass-00000
+drwxr-xr-x 2 root root 4096 Oct 31 11:20 pass-00001
+drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00002
+drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00003
+drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00004
+drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00005
+drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00006
+drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00007
+drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00008
+drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00009
+drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00010
+drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00011
+drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00012
+drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00013
+drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00014
+```
diff --git a/doc/howto/usage/cluster/k8s/Dockerfile b/doc/howto/usage/k8s/src/Dockerfile
similarity index 100%
rename from doc/howto/usage/cluster/k8s/Dockerfile
rename to doc/howto/usage/k8s/src/Dockerfile
diff --git a/doc/howto/usage/k8s/src/add_security_group.png b/doc/howto/usage/k8s/src/add_security_group.png
new file mode 100644
index 0000000000..50eed4c657
Binary files /dev/null and b/doc/howto/usage/k8s/src/add_security_group.png differ
diff --git a/doc/howto/usage/k8s/src/create_efs.png b/doc/howto/usage/k8s/src/create_efs.png
new file mode 100644
index 0000000000..f4d448d151
Binary files /dev/null and b/doc/howto/usage/k8s/src/create_efs.png differ
diff --git a/doc/howto/usage/k8s/src/efs_mount.png b/doc/howto/usage/k8s/src/efs_mount.png
new file mode 100644
index 0000000000..0f9e3cab98
Binary files /dev/null and b/doc/howto/usage/k8s/src/efs_mount.png differ
diff --git a/doc/howto/usage/cluster/k8s/job.yaml b/doc/howto/usage/k8s/src/job.yaml
similarity index 100%
rename from doc/howto/usage/cluster/k8s/job.yaml
rename to doc/howto/usage/k8s/src/job.yaml
diff --git a/doc/howto/usage/cluster/k8s/k8s-paddle-arch.png b/doc/howto/usage/k8s/src/k8s-paddle-arch.png
similarity index 100%
rename from doc/howto/usage/cluster/k8s/k8s-paddle-arch.png
rename to doc/howto/usage/k8s/src/k8s-paddle-arch.png
diff --git a/doc/howto/usage/k8s/src/managed_policy.png b/doc/howto/usage/k8s/src/managed_policy.png
new file mode 100644
index 0000000000..c7ecda555b
Binary files /dev/null and b/doc/howto/usage/k8s/src/managed_policy.png differ
diff --git a/doc/howto/usage/cluster/k8s/start.sh b/doc/howto/usage/k8s/src/start.sh
similarity index 100%
rename from doc/howto/usage/cluster/k8s/start.sh
rename to doc/howto/usage/k8s/src/start.sh
diff --git a/doc/howto/usage/cluster/k8s/start_paddle.py b/doc/howto/usage/k8s/src/start_paddle.py
similarity index 100%
rename from doc/howto/usage/cluster/k8s/start_paddle.py
rename to doc/howto/usage/k8s/src/start_paddle.py
diff --git a/doc/tutorials/embedding_model/index_cn.md b/doc/tutorials/embedding_model/index_cn.md
new file mode 100644
index 0000000000..fe800308d8
--- /dev/null
+++ b/doc/tutorials/embedding_model/index_cn.md
@@ -0,0 +1,138 @@
+# 中文词向量模型的使用 #
+----------
+本文档介绍如何在PaddlePaddle平台上,使用预训练的标准格式词向量模型。
+
+在此感谢 @lipeng 提出的代码需求，并给出的相关模型格式的定义。
+
+## 介绍 ###
+### 中文字典 ###
+我们的字典使用内部的分词工具对百度知道和百度百科的语料进行分词后产生。分词风格如下： "《红楼梦》"将被分为 "《"，"红楼梦"，"》"，和 "《红楼梦》"。字典采用UTF8编码，输出有2列：词本身和词频。字典共包含 3206325个词和3个特殊标记：
+  - `<s>`: 分词序列的开始
+  - `<e>`: 分词序列的结束
+  - `<unk>`: 未知词
+
+### 中文词向量的预训练模型 ###
+遵循文章 [A Neural Probabilistic Language Model](http://www.jmlr.org/papers/volume3/bengio03a/bengio03a.pdf)中介绍的方法，模型采用 n-gram 语言模型，结构如下图：6元上下文作为输入层->全连接层->softmax层 。对应于字典，我们预训练得到4种不同维度的词向量，分别为：32维、64维、128维和256维。
+<center>![](./neural-n-gram-model.png)</center>
+<center>Figure 1. neural-n-gram-model</center>
+
+### 下载和数据抽取 ###
+运行以下的命令下载和获取我们的字典和预训练模型：
+
+    cd $PADDLE_ROOT/demo/model_zoo/embedding
+    ./pre_DictAndModel.sh
+
+## 中文短语改写的例子 ##
+以下示范如何使用预训练的中文字典和词向量进行短语改写。
+
+### 数据的准备和预处理 ###
+首先，运行以下的命令下载数据集。该数据集（utf8编码）包含20个训练样例，5个测试样例和2个生成式样例。
+
+    cd $PADDLE_ROOT/demo/seqToseq/data
+    ./paraphrase_data.sh
+
+第二步，将数据处理成规范格式，在训练数集上训练生成词向量字典（数据将保存在 `$PADDLE_SOURCE_ROOT/demo/seqToseq/data/pre-paraphrase`）:
+
+    cd $PADDLE_ROOT/demo/seqToseq/
+    python preprocess.py -i data/paraphrase [--mergeDict]
+
+- 其中，如果使用`--mergeDict`选项，源语言短语和目标语言短语的字典将被合并（源语言和目标语言共享相同的编码字典）。本实例中，源语言和目标语言都是相同的语言，因此可以使用该选项。
+
+
+### 使用用户指定的词向量字典 ###
+使用如下命令，从预训练模型中，根据用户指定的字典，抽取对应的词向量构成新的词表:
+    cd $PADDLE_ROOT/demo/model_zoo/embedding
+    python extract_para.py --preModel PREMODEL --preDict PREDICT --usrModel USRMODEL--usrDict USRDICT -d DIM
+
+- `--preModel PREMODEL`: 预训练词向量字典模型的路径
+- `--preDict PREDICT`:  预训练模型使用的字典的路径
+- `--usrModel USRMODEL`: 抽取出的新词表的保存路径
+- `--usrDict USRDICT`: 用户指定新的字典的路径，用于构成新的词表
+- `-d DIM`: 参数（词向量）的维度
+
+此处，你也可以简单的运行以下的命令：
+
+    cd $PADDLE_ROOT/demo/seqToseq/data/
+    ./paraphrase_model.sh
+
+运行成功以后，你将会看到以下的模型结构：
+
+    paraphrase_model
+    |--- _source_language_embedding
+    |--- _target_language_embedding
+
+### 在PaddlePaddle平台训练模型 ###
+首先，配置模型文件，配置如下（可以参考保存在 `demo/seqToseq/paraphrase/train.conf`的配置）:
+
+    from seqToseq_net import *
+    is_generating = False
+
+    ################## Data Definition #####################
+    train_conf = seq_to_seq_data(data_dir = "./data/pre-paraphrase",
+                                 job_mode = job_mode)
+
+    ############## Algorithm Configuration ##################
+    settings(
+          learning_method = AdamOptimizer(),
+          batch_size = 50,
+          learning_rate = 5e-4)
+
+    ################# Network configure #####################
+    gru_encoder_decoder(train_conf, is_generating, word_vector_dim = 32)
+
+这个配置与`demo/seqToseq/translation/train.conf` 基本相同
+
+然后，使用以下命令进行模型训练:
+
+    cd $PADDLE_SOURCE_ROOT/demo/seqToseq/paraphrase
+    ./train.sh
+
+其中，`train.sh` 与`demo/seqToseq/translation/train.sh` 基本相同，只有2个配置不一样:
+
+- `--init_model_path`: 初始化模型的路径配置为`data/paraphrase_modeldata/paraphrase_model`
+- `--load_missing_parameter_strategy`：如果参数模型文件缺失，除词向量模型外的参数将使用正态分布随机初始化
+
+如果用户想要了解详细的数据集的格式、模型的结构和训练过程，请查看 [Text generation Tutorial](../text_generation/index_cn.md).
+
+## 可选功能 ##
+###  观测词向量
+PaddlePaddle 平台为想观测词向量的用户提供了将二进制词向量模型转换为文本模型的功能:
+
+    cd $PADDLE_ROOT/demo/model_zoo/embedding
+    python paraconvert.py --b2t -i INPUT -o OUTPUT -d DIM
+
+- `-i INPUT`: 输入的（二进制）词向量模型名称
+- `-o OUTPUT`: 输出的文本模型名称
+- `-d DIM`: （词向量）参数维度
+
+运行完以上命令，用户可以在输出的文本模型中看到:
+
+    0,4,32156096
+    -0.7845433,1.1937413,-0.1704215,0.4154715,0.9566584,-0.5558153,-0.2503305, ......
+    0.0000909,0.0009465,-0.0008813,-0.0008428,0.0007879,0.0000183,0.0001984, ......
+    ......
+
+- 其中，第一行是`PaddlePaddle` 输出文件的格式说明，包含3个属性：:
+  - `PaddlePaddle`的版本号，本例中为0
+  - 浮点数占用的字节数，本例中为4
+  - 总计的参数个数，本例中为32,156,096
+- 其余行是（词向量）参数行（假设词向量维度为32）
+  - 每行打印32个参数以','分隔
+  - 共有32,156,096/32 = 1,004,877行，也就是说，模型共包含1,004,877个被向量化的词
+
+### 词向量模型的修正
+`PaddlePaddle` 为想修正词向量模型的用户提供了将文本词向量模型转换为二进制模型的命令:
+
+    cd $PADDLE_ROOT/demo/model_zoo/embedding
+    python paraconvert.py --t2b -i INPUT -o OUTPUT
+
+- `-i INPUT`: 输入的文本词向量模型名称
+- `-o OUTPUT`: 输出的二进制词向量模型名称
+
+请注意，输入的文本格式如下:
+
+    -0.7845433,1.1937413,-0.1704215,0.4154715,0.9566584,-0.5558153,-0.2503305, ......
+    0.0000909,0.0009465,-0.0008813,-0.0008428,0.0007879,0.0000183,0.0001984, ......
+    ......
+- 输入文本中没有头部（格式说明）行
+- （输入文本）每行存储一个词，以逗号','分隔
diff --git a/doc/tutorials/gan/gan.png b/doc/tutorials/gan/gan.png
index 001ed6cc19..0eafd7cb49 100644
Binary files a/doc/tutorials/gan/gan.png and b/doc/tutorials/gan/gan.png differ
diff --git a/doc/tutorials/gan/index_en.md b/doc/tutorials/gan/index_en.md
index 99c8d73011..ac9ed37b22 100644
--- a/doc/tutorials/gan/index_en.md
+++ b/doc/tutorials/gan/index_en.md
@@ -4,9 +4,7 @@ This demo implements GAN training described in the original [GAN paper](https://
 
 The high-level structure of GAN is shown in Figure. 1 below. It is composed of two major parts: a generator and a discriminator, both of which are based on neural networks. The generator takes in some kind of noise with a known distribution and transforms it into an image. The discriminator takes in an image and determines whether it is artificially generated by the generator or a real image. So the generator and the discriminator are in a competitive game in which generator is trying to generate image to look as real as possible to fool the discriminator, while the discriminator is trying to distinguish between real and fake images. 
 
-<p align="center">
-    <img src="./gan.png" width="500" height="300"> 
-</p>
+<center>![](./gan.png)</center>
 <p align="center">
     Figure 1. GAN-Model-Structure
     <a href="https://ishmaelbelghazi.github.io/ALI/">figure credit</a>
@@ -111,9 +109,7 @@ $python gan_trainer.py -d uniform --useGpu 1
 ```
 The generated samples can be found in ./uniform_samples/ and one example is shown below as Figure 2. One can see that it roughly recovers the 2D uniform distribution. 
 
-<p align="center">
-    <img src="./uniform_sample.png" width="300" height="300"> 
-</p>
+<center>![](./uniform_sample.png)</center>
 <p align="center">
     Figure 2. Uniform Sample
 </p>
@@ -135,9 +131,7 @@ To train the GAN model on mnist data, one can use the following command:
 $python gan_trainer.py -d mnist --useGpu 1
 ```
 The generated sample images can be found at ./mnist_samples/ and one example is shown below as Figure 3. 
-<p align="center">
-    <img src="./mnist_sample.png" width="300" height="300"> 
-</p>
+<center>![](./mnist_sample.png)</center>
 <p align="center">
     Figure 3. MNIST Sample
 </p>
diff --git a/doc/tutorials/gan/uniform_sample.png b/doc/tutorials/gan/uniform_sample.png
index 4a96c45cae..e716c48e78 100644
Binary files a/doc/tutorials/gan/uniform_sample.png and b/doc/tutorials/gan/uniform_sample.png differ
diff --git a/doc/tutorials/index_cn.md b/doc/tutorials/index_cn.md
index 97014d5376..6a27004d58 100644
--- a/doc/tutorials/index_cn.md
+++ b/doc/tutorials/index_cn.md
@@ -2,6 +2,7 @@
 
 * [快速入门](quick_start/index_cn.rst)
 * [个性化推荐](rec/ml_regression_cn.rst)
+* [图像分类](image_classification/index_cn.md)
 * [情感分析](sentiment_analysis/index_cn.md)
 * [语义角色标注](semantic_role_labeling/index_cn.md)
 * [机器翻译](text_generation/index_cn.md)
@@ -9,3 +10,4 @@
 ## 常用模型
 
 * [ResNet模型](imagenet_model/resnet_model_cn.md)
+* [词向量模型](embedding_model/index_cn.md)
diff --git a/doc/tutorials/index_en.md b/doc/tutorials/index_en.md
index cce9d3a176..77331a703b 100644
--- a/doc/tutorials/index_en.md
+++ b/doc/tutorials/index_en.md
@@ -7,6 +7,7 @@ There are several examples and demos here.
 * [Sentiment Analysis](sentiment_analysis/index_en.md)
 * [Semantic Role Labeling](semantic_role_labeling/index_en.md)
 * [Text Generation](text_generation/index_en.md)
+* [Image Auto-Generation](gan/index_en.md)
 
 ## Model Zoo
 * [ImageNet: ResNet](imagenet_model/resnet_model_en.md)
diff --git a/paddle/CMakeLists.txt b/paddle/CMakeLists.txt
index 2daea052b0..503024cff3 100644
--- a/paddle/CMakeLists.txt
+++ b/paddle/CMakeLists.txt
@@ -1,6 +1,7 @@
 add_subdirectory(cuda)
 add_subdirectory(function)
 add_subdirectory(utils)
+add_subdirectory(testing)
 add_subdirectory(math)
 add_subdirectory(parameter)
 add_subdirectory(gserver)
diff --git a/paddle/api/CMakeLists.txt b/paddle/api/CMakeLists.txt
index ed69bd764f..6e8fcd114d 100644
--- a/paddle/api/CMakeLists.txt
+++ b/paddle/api/CMakeLists.txt
@@ -1,10 +1,30 @@
+FUNCTION(generate_python_api target_name)
+    ADD_CUSTOM_COMMAND(OUTPUT ${PROJ_ROOT}/paddle/py_paddle/swig_paddle.py
+                              ${PROJ_ROOT}/paddle/Paddle_wrap.cxx
+                              ${PROJ_ROOT}/paddle/Paddle_wrap.h
+        COMMAND ${SWIG_EXECUTABLE} -python -c++ -outcurrentdir -I../ api/Paddle.swig
+                && mv ${PROJ_ROOT}/paddle/swig_paddle.py ${PROJ_ROOT}/paddle/py_paddle/swig_paddle.py
+        DEPENDS ${PROJ_ROOT}/paddle/api/Paddle.swig
+                ${PROJ_ROOT}/paddle/api/PaddleAPI.h
+                ${external_project_dependencies}
+        WORKING_DIRECTORY ${PROJ_ROOT}/paddle
+        COMMENT "Generate Python API from swig")
+    ADD_CUSTOM_TARGET(${target_name} ALL DEPENDS
+                ${PROJ_ROOT}/paddle/Paddle_wrap.cxx
+                ${PROJ_ROOT}/paddle/Paddle_wrap.h
+                ${PROJ_ROOT}/paddle/py_paddle/swig_paddle.py
+                ${external_project_dependencies})
+ENDFUNCTION(generate_python_api)
+
 set(API_SOURCES
     Arguments.cpp
     ConfigParser.cpp
+    Evaluator.cpp
     GradientMachine.cpp
     Matrix.cpp
     Parameter.cpp
     ParameterOptimizer.cpp
+    ParameterUpdater.cpp
     SequenceGenerator.cpp
     Trainer.cpp
     Util.cpp
@@ -40,7 +60,7 @@ file(GLOB PY_PADDLE_PYTHON_FILES ${PROJ_ROOT}/paddle/py_paddle/*.py)
 
 # TODO(yuyang18) : make wheel name calculated by cmake
 add_custom_command(OUTPUT ${PROJ_ROOT}/paddle/dist/.timestamp
-    COMMAND ${PYTHON_EXECUTABLE} setup.py  bdist_wheel
+    COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel
     COMMAND ${CMAKE_COMMAND} -E touch dist/.timestamp
     COMMAND rm -rf py_paddle.egg-info build
     WORKING_DIRECTORY ${PROJ_ROOT}/paddle
@@ -63,7 +83,30 @@ install(DIRECTORY ${PROJ_ROOT}/paddle/dist/
 
 add_custom_target(python_api_wheel ALL DEPENDS
   ${PROJ_ROOT}/paddle/dist/.timestamp)
+add_dependencies(python_api_wheel python_swig_sources
+  paddle_parameter
+  paddle_math
+  paddle_utils
+  paddle_gserver
+  paddle_pserver
+  paddle_trainer
+  paddle_api
+  paddle_cuda)
 
 if(WITH_TESTING)
+    IF(NOT PY_PIP_FOUND)
+        SET(PIP_SOURCES_DIR ${PYTHON_SOURCES_DIR}/pip)
+        ExternalProject_Add(pip
+            ${EXTERNAL_PROJECT_LOG_ARGS}
+            GIT_REPOSITORY      https://github.com/pypa/pip.git
+            GIT_TAG             9.0.1
+            PREFIX              ${PIP_SOURCES_DIR}
+            CONFIGURE_COMMAND   ""
+            BUILD_COMMAND       ""
+            INSTALL_COMMAND     env ${py_env} ${PYTHON_EXECUTABLE} setup.py install
+            BUILD_IN_SOURCE     1
+            DEPENDS python setuptools python_api_wheel
+        )
+    ENDIF()
     add_subdirectory(test)
 endif()
diff --git a/paddle/utils/DisableCopy.h b/paddle/api/Evaluator.cpp
similarity index 60%
rename from paddle/utils/DisableCopy.h
rename to paddle/api/Evaluator.cpp
index 41de98bbde..c30e098763 100644
--- a/paddle/utils/DisableCopy.h
+++ b/paddle/api/Evaluator.cpp
@@ -11,13 +11,19 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
+#include <sstream>
+#include "PaddleAPI.h"
+#include "PaddleAPIPrivate.h"
 
-#pragma once
+Evaluator::Evaluator() : m(new EvaluatorPrivate()) {}
+Evaluator::~Evaluator() { delete m; }
 
-/**
- * Disable copy macro.
- */
-#define DISABLE_COPY(CLASS_NAME)                \
-  CLASS_NAME(CLASS_NAME &&) = delete;           \
-  CLASS_NAME(const CLASS_NAME &other) = delete; \
-  CLASS_NAME &operator=(const CLASS_NAME &other) = delete
+void Evaluator::start() { m->rawPtr->start(); }
+
+void Evaluator::finish() { m->rawPtr->finish(); }
+
+std::string Evaluator::toString() {
+  std::ostringstream sout;
+  m->rawPtr->printStats(sout);
+  return sout.str();
+}
diff --git a/paddle/api/GradientMachine.cpp b/paddle/api/GradientMachine.cpp
index ced2293376..66115f8293 100644
--- a/paddle/api/GradientMachine.cpp
+++ b/paddle/api/GradientMachine.cpp
@@ -64,6 +64,10 @@ GradientMachine* GradientMachine::createByModelConfig(
   return GradientMachine::createFromPaddleModelPtr(confPtr, mode, types);
 }
 
+void GradientMachine::start() { m->machine->start(); }
+
+void GradientMachine::finish() { m->machine->finish(); }
+
 void GradientMachine::onPassEnd() { m->machine->onPassEnd(); }
 
 void GradientMachine::prefetch(const Arguments& inArgs) {
@@ -166,3 +170,13 @@ SequenceGenerator* GradientMachine::asSequenceGenerator(
   r->setBeamSize(beam_size);
   return r;
 }
+
+Evaluator* GradientMachine::makeEvaluator() {
+  auto ev = new Evaluator();
+  ev->m->rawPtr = m->machine->makeEvaluator();
+  return ev;
+}
+
+void GradientMachine::eval(Evaluator* evaluator) {
+  m->machine->eval(evaluator->m->rawPtr);
+}
diff --git a/paddle/api/Paddle.swig b/paddle/api/Paddle.swig
index 9194a6371b..068ba286c0 100644
--- a/paddle/api/Paddle.swig
+++ b/paddle/api/Paddle.swig
@@ -96,7 +96,9 @@ namespace std {
 %rename(__getitem__) Vector::get;
 %rename(__setitem__) Vector::set;
 %rename(__len__) Vector::getSize;
+%rename(__len__) Parameter::getSize;
 %rename(__call__) ParameterTraverseCallback::apply;
+%rename(__repr__) Evaluator::toString;
 
 %apply (float* INPLACE_ARRAY2, int DIM1, int DIM2) { 
   (float* data, int dim1, int dim2) 
@@ -167,6 +169,7 @@ namespace std {
 %newobject GradientMachine::asSequenceGenerator;
 %newobject GradientMachine::getParameter;
 %newobject GradientMachine::getLayerOutput;
+%newobject GradientMachine::makeEvaluator;
 %newobject TrainerConfig::createFromTrainerConfigFile;
 %newobject TrainerConfig::getModelConfig;
 %newobject TrainerConfig::getOptimizationConfig;
@@ -174,6 +177,8 @@ namespace std {
 %newobject Parameter::getConfig;
 %newobject ParameterOptimizer::create;
 %newobject ParameterOptimizer::needSpecialTraversal;
+%newobject ParameterUpdater::createLocalUpdater;
+%newobject ParameterUpdater::createRemoteUpdater;
 
 %feature("director") UpdateCallback;
 %feature("autodoc", 1); // To generate method stub, for code hint in ide
@@ -193,4 +198,4 @@ namespace std {
 %ignore OptimizationConfigPrivate;
 %ignore ParameterTraverseCallbackPrivate;
 %include "utils/GlobalConstants.h"
-%include "api/PaddleAPI.h"
\ No newline at end of file
+%include "api/PaddleAPI.h"
diff --git a/paddle/api/PaddleAPI.h b/paddle/api/PaddleAPI.h
index bc1b22e187..f5af8b0035 100644
--- a/paddle/api/PaddleAPI.h
+++ b/paddle/api/PaddleAPI.h
@@ -19,16 +19,12 @@ limitations under the License. */
 #include <stdexcept>
 #include <string>
 #include <vector>
+#include "paddle/utils/Common.h"
 #include "paddle/utils/GlobalConstants.h"
-#include "paddle/utils/TypeDefs.h"
 
 /// Import PaddlePaddle's enumeration into global namespace.
 using namespace paddle::enumeration_wrapper;  // NOLINT
 
-#define DISABLE_COPY_AND_ASSIGN(classname) \
-  classname(const classname& other);       \
-  classname& operator=(const classname& other)
-
 /**
  * @brief Initialize paddle.
  *
@@ -102,7 +98,7 @@ const size_t NO_SPARSE_ID = -1UL;
 struct MatrixPrivate;
 class Matrix {
   Matrix();  // User Cannot Create Matrix.
-  DISABLE_COPY_AND_ASSIGN(Matrix);
+  DISABLE_COPY(Matrix);
   static Matrix* createByPaddleMatrixPtr(void* sharedPtr);
 
 public:
@@ -242,7 +238,7 @@ private:
 
 struct VectorPrivate;
 class Vector {
-  DISABLE_COPY_AND_ASSIGN(Vector);
+  DISABLE_COPY(Vector);
   Vector();
   static Vector* createByPaddleVectorPtr(void* ptr);
 
@@ -322,7 +318,7 @@ private:
 struct IVectorPrivate;
 class IVector {
   IVector();
-  DISABLE_COPY_AND_ASSIGN(IVector);
+  DISABLE_COPY(IVector);
   static IVector* createByPaddleVectorPtr(void* ptr);
 
 public:
@@ -402,7 +398,7 @@ struct ArgumentsPrivate;
 class Arguments {
 private:
   Arguments();  // Internal Create.
-  DISABLE_COPY_AND_ASSIGN(Arguments);
+  DISABLE_COPY(Arguments);
 
 public:
   /**
@@ -474,7 +470,7 @@ enum GradientMatchineCreateMode {
 
 struct ParameterConfigPrivate;
 class ParameterConfig {
-  DISABLE_COPY_AND_ASSIGN(ParameterConfig);
+  DISABLE_COPY(ParameterConfig);
   ParameterConfig();
 
   /**
@@ -504,7 +500,7 @@ private:
 
 struct OptimizationConfigPrivate;
 class OptimizationConfig {
-  DISABLE_COPY_AND_ASSIGN(OptimizationConfig);
+  DISABLE_COPY(OptimizationConfig);
   OptimizationConfig();
 
 public:
@@ -521,6 +517,7 @@ private:
 
   friend class TrainerConfig;
   friend class ParameterOptimizer;
+  friend class ParameterUpdater;
   friend class Trainer;
 };
 
@@ -528,7 +525,7 @@ struct ParameterPrivate;
 class Parameter {
 private:
   Parameter();
-  DISABLE_COPY_AND_ASSIGN(Parameter);
+  DISABLE_COPY(Parameter);
 
 public:
   virtual ~Parameter();
@@ -555,6 +552,8 @@ public:
 
   bool load(const std::string& filename) const;
 
+  size_t getSize() const;
+
 private:
   static Parameter* createFromRawPtr(void* ptr);
   static Parameter* createFromSharedPtr(void* ptr);
@@ -563,6 +562,7 @@ private:
   ParameterPrivate* m;
   friend class UpdateCallbackWrapper;
   friend class GradientMachine;
+  friend class ParameterUpdater;
 };
 
 struct ModelConfigPrivate;
@@ -574,7 +574,7 @@ struct ModelConfigPrivate;
 class ModelConfig {
 private:
   ModelConfig();
-  DISABLE_COPY_AND_ASSIGN(ModelConfig);
+  DISABLE_COPY(ModelConfig);
 
 public:
   virtual ~ModelConfig();
@@ -595,7 +595,7 @@ struct TrainerConfigPrivate;
 class TrainerConfig {
 private:
   TrainerConfig();
-  DISABLE_COPY_AND_ASSIGN(TrainerConfig);
+  DISABLE_COPY(TrainerConfig);
 
 public:
   virtual ~TrainerConfig();
@@ -635,7 +635,7 @@ public:
 
 struct ParameterTraverseCallbackPrivate;
 class ParameterTraverseCallback {
-  DISABLE_COPY_AND_ASSIGN(ParameterTraverseCallback);
+  DISABLE_COPY(ParameterTraverseCallback);
   ParameterTraverseCallback();
 
 public:
@@ -657,7 +657,7 @@ private:
  */
 struct ParameterOptimizerPrivate;
 class ParameterOptimizer {
-  DISABLE_COPY_AND_ASSIGN(ParameterOptimizer);
+  DISABLE_COPY(ParameterOptimizer);
   ParameterOptimizer();
 
 public:
@@ -689,12 +689,12 @@ private:
 };
 
 class SequenceGenerator;
-
+class Evaluator;
 struct GradientMachinePrivate;
 class GradientMachine {
 private:
   GradientMachine();
-  DISABLE_COPY_AND_ASSIGN(GradientMachine);
+  DISABLE_COPY(GradientMachine);
 
 public:
   virtual ~GradientMachine();
@@ -720,6 +720,13 @@ public:
       GradientMatchineCreateMode mode = CREATE_MODE_NORMAL,
       const std::vector<int>& parameterTypes = defaultParamTypes);
 
+  /**
+   * @brief finish
+   */
+  void finish();
+
+  void start();
+
   /**
    * Prefetch row ids of sparse parameter.
    */
@@ -777,6 +784,10 @@ public:
       size_t max_length = 100UL,
       size_t beam_size = -1UL);
 
+  Evaluator* makeEvaluator();
+
+  void eval(Evaluator* evaluator);
+
 private:
   GradientMachinePrivate* m;
 
@@ -788,6 +799,111 @@ private:
   // Not to use c++ 11 init-list, so we use static var as function default arg.
   static std::vector<int> defaultParamTypes;
   friend class Trainer;
+  friend class ParameterUpdater;
+};
+
+struct ParameterUpdaterPrivate;
+class ParameterUpdater {
+private:
+  ParameterUpdater();
+
+public:
+  static ParameterUpdater* createLocalUpdater(OptimizationConfig* config);
+  static ParameterUpdater* createRemoteUpdater(OptimizationConfig* config,
+                                               int passCount);
+  ~ParameterUpdater();
+
+  /**
+   * @brief initialize Parameter Updater by GradientMachine.
+   * @param gm
+   */
+  void init(const GradientMachine& gm);
+
+  /**
+   * @brief begin of a training/testing of one pass.
+   */
+  void startPass();
+
+  /**
+   * @brief end of a traning/testing of one pass.
+   */
+  void finishPass();
+
+  /**
+   * @brief begin of a training/testing of one batch.
+   * @param data batch's size
+   * @return PassType, mostly will be training.
+   */
+  PassType startBatch(size_t batchSize);
+
+  /**
+   * @brief end of a traning/testing of one batch
+   * @param cost current batch cost.
+   */
+  void finishBatch(float cost);
+
+  /**
+   * @brief update a parameter (by local optimizer or by cluster pserver)
+   * @param param
+   */
+  void update(Parameter* param);
+
+  /**
+   * @brief restore the average parameter.
+   * @note It is only used in AverageOptimizer. Restore will get the current
+   * PARAMETER_VALUE back.
+   */
+  void restore();
+
+  /**
+   * @brief apply. Store the average parameter.
+   * @note It is only used in AverageOptimizer. Apply will store the current
+   * PARAMETER_VALUE to buffer, calcaualte current Average Parameter, and save
+   * it to PARAMETER_VALUE.
+   */
+  void apply();
+
+  /**
+   * @brief catchUpWith The Regularization will be delayed in many situations(
+   * pserver, local sparse). Catch Up means catch the regularization up, apply
+   * regularization to all params.
+   */
+  void catchUpWith();
+
+private:
+  ParameterUpdaterPrivate* m;
+};
+
+struct EvaluatorPrivate;
+class Evaluator {
+private:
+  Evaluator();
+  DISABLE_COPY(Evaluator);
+
+public:
+  ~Evaluator();
+
+  /**
+   * @brief begin an evaluate stage.
+   */
+  void start();
+
+  /**
+   * @brief end an evaluate stage.
+   */
+  void finish();
+
+  /**
+   * @brief toString will get a evaluate result.
+   *
+   * __repr__ method in python
+   */
+  std::string toString();
+
+private:
+  EvaluatorPrivate* m;
+
+  friend class GradientMachine;
 };
 
 struct TrainerPrivate;
@@ -796,7 +912,7 @@ private:
   TrainerPrivate* m;
   Trainer();
   Trainer(TrainerConfig* optConfig, GradientMachine* gm);
-  DISABLE_COPY_AND_ASSIGN(Trainer);
+  DISABLE_COPY(Trainer);
 
 public:
   virtual ~Trainer();
@@ -862,7 +978,7 @@ public:
 
 struct SequenceGeneratorPrivate;
 class SequenceGenerator {
-  DISABLE_COPY_AND_ASSIGN(SequenceGenerator);
+  DISABLE_COPY(SequenceGenerator);
   SequenceGenerator();
 
 public:
diff --git a/paddle/api/PaddleAPIPrivate.h b/paddle/api/PaddleAPIPrivate.h
index d2b56fc41c..f41352bfec 100644
--- a/paddle/api/PaddleAPIPrivate.h
+++ b/paddle/api/PaddleAPIPrivate.h
@@ -11,12 +11,14 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-
+#pragma once
+#include <memory>
+#include "PaddleAPI.h"
+#include "paddle/gserver/evaluators/Evaluator.h"
 #include "paddle/gserver/gradientmachines/GradientMachine.h"
+#include "paddle/parameter/ParameterUpdaterBase.h"
 #include "paddle/trainer/TrainerConfigHelper.h"
 
-#pragma once
-
 struct GradientMachinePrivate {
   std::shared_ptr<paddle::GradientMachine> machine;
 
@@ -65,3 +67,31 @@ struct ArgumentsPrivate {
     return *(std::shared_ptr<T>*)(rawPtr);
   }
 };
+
+struct ParameterUpdaterPrivate {
+  std::unique_ptr<paddle::ParameterUpdater> updater;
+};
+
+struct ParameterPrivate {
+  std::shared_ptr<paddle::Parameter> sharedPtr;
+  paddle::Parameter* rawPtr;  // rawPtr only used in ParameterUpdater,
+                              // in other situation sharedPtr should
+                              // contains value.
+
+  ParameterPrivate() : sharedPtr(nullptr), rawPtr(nullptr) {}
+
+  paddle::Parameter* getPtr() {
+    if (sharedPtr) {
+      return sharedPtr.get();
+    } else {
+      return rawPtr;
+    }
+  }
+};
+
+struct EvaluatorPrivate {
+  paddle::Evaluator* rawPtr;
+
+  EvaluatorPrivate() : rawPtr(nullptr) {}
+  ~EvaluatorPrivate() { delete rawPtr; }
+};
diff --git a/paddle/api/Parameter.cpp b/paddle/api/Parameter.cpp
index 9cfa2e35f5..19f7a898d6 100644
--- a/paddle/api/Parameter.cpp
+++ b/paddle/api/Parameter.cpp
@@ -14,21 +14,7 @@ limitations under the License. */
 
 #include "paddle/parameter/Parameter.h"
 #include "PaddleAPI.h"
-
-struct ParameterPrivate {
-  std::shared_ptr<paddle::Parameter> sharedPtr;
-  paddle::Parameter* rawPtr;
-
-  ParameterPrivate() : sharedPtr(nullptr), rawPtr(nullptr) {}
-
-  paddle::Parameter* getPtr() {
-    if (sharedPtr) {
-      return sharedPtr.get();
-    } else {
-      return rawPtr;
-    }
-  }
-};
+#include "PaddleAPIPrivate.h"
 
 Parameter::Parameter() : m(new ParameterPrivate()) {}
 
@@ -78,3 +64,5 @@ bool Parameter::save(const std::string& filename) const {
 bool Parameter::load(const std::string& filename) const {
   return m->getPtr()->load(filename);
 }
+
+size_t Parameter::getSize() const { return m->getPtr()->getSize(); }
diff --git a/paddle/api/ParameterUpdater.cpp b/paddle/api/ParameterUpdater.cpp
new file mode 100644
index 0000000000..75b0ae7cb6
--- /dev/null
+++ b/paddle/api/ParameterUpdater.cpp
@@ -0,0 +1,66 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "PaddleAPI.h"
+
+#include "PaddleAPIPrivate.h"
+#include "paddle/trainer/RemoteParameterUpdater.h"
+#include "paddle/trainer/ThreadParameterUpdater.h"
+
+ParameterUpdater::ParameterUpdater() : m(new ParameterUpdaterPrivate()) {}
+
+ParameterUpdater *ParameterUpdater::createLocalUpdater(
+    OptimizationConfig *config) {
+  auto updater = new ParameterUpdater();
+  updater->m->updater.reset(
+      new paddle::SgdThreadUpdater(config->m->getConfig()));
+  return updater;
+}
+
+ParameterUpdater *ParameterUpdater::createRemoteUpdater(
+    OptimizationConfig *config, int passCount) {
+  auto updater = new ParameterUpdater();
+  updater->m->updater.reset(new paddle::RemoteParameterUpdater(
+      config->m->getConfig(), passCount, nullptr));
+  return updater;
+}
+
+ParameterUpdater::~ParameterUpdater() { delete m; }
+
+void ParameterUpdater::init(const GradientMachine &gm) {
+  m->updater->init(gm.m->machine->getNonStaticParameters());
+}
+
+void ParameterUpdater::startPass() { m->updater->startPass(); }
+
+void ParameterUpdater::finishPass() { m->updater->finishPass(); }
+
+PassType ParameterUpdater::startBatch(size_t batchSize) {
+  return m->updater->startBatch((int64_t)batchSize);
+}
+
+void ParameterUpdater::finishBatch(float cost) {
+  m->updater->finishBatch(cost);
+}
+
+void ParameterUpdater::update(Parameter *param) {
+  auto paddleParam = param->m->getPtr();
+  m->updater->update(paddleParam);
+}
+
+void ParameterUpdater::restore() { m->updater->restore(); }
+
+void ParameterUpdater::apply() { m->updater->apply(); }
+
+void ParameterUpdater::catchUpWith() { m->updater->catchUpWith(); }
diff --git a/paddle/api/Util.cpp b/paddle/api/Util.cpp
index c3f739568f..54d67aa62f 100644
--- a/paddle/api/Util.cpp
+++ b/paddle/api/Util.cpp
@@ -15,12 +15,11 @@ limitations under the License. */
 #include "PaddleAPI.h"
 
 #include "paddle/parameter/Parameter.h"
-#include "paddle/utils/Excepts.h"
+#include "paddle/utils/Common.h"
 #include "paddle/utils/Flags.h"
 #include "paddle/utils/PythonUtil.h"
 #include "paddle/utils/Util.h"
 
-#include <fenv.h>
 #include <algorithm>
 #include <iostream>
 #include <iterator>
diff --git a/paddle/api/Vector.cpp b/paddle/api/Vector.cpp
index 874f2fd044..db8f005929 100644
--- a/paddle/api/Vector.cpp
+++ b/paddle/api/Vector.cpp
@@ -253,7 +253,7 @@ void Vector::copyToNumpyArray(float** view_m_data, int* dim1) {
   *view_m_data = new float[*dim1];
   if (auto cpuVec = dynamic_cast<paddle::CpuVector*>(m->vec.get())) {
     std::memcpy(*view_m_data, cpuVec->getData(), sizeof(float) * (*dim1));
-  } else if (auto gpuVec = dynamic_cast<paddle::CpuVector*>(m->vec.get())) {
+  } else if (auto gpuVec = dynamic_cast<paddle::GpuVector*>(m->vec.get())) {
     hl_memcpy_device2host(
         *view_m_data, gpuVec->getData(), sizeof(float) * (*dim1));
   } else {
diff --git a/paddle/api/paddle_api_config.py.in b/paddle/api/paddle_api_config.py.in
index 23542b952b..e11ee92036 100644
--- a/paddle/api/paddle_api_config.py.in
+++ b/paddle/api/paddle_api_config.py.in
@@ -1,17 +1,17 @@
 PADDLE_BUILD_DIR="@CMAKE_CURRENT_BINARY_DIR@/../"
 WITH_GPU="@WITH_GPU@"
-PROTOBUF_LIB="@PROTOBUF_LIBRARY@"
-ZLIB_LIB="@ZLIB_LIBRARIES@"
+PROTOBUF_LIBRARY="@PROTOBUF_LIBRARY@"
+ZLIB_LIBRARIES="@ZLIB_LIBRARIES@"
 CMAKE_THREAD_LIB="@CMAKE_THREAD_LIBS_INIT@"
 CMAKE_DL_LIBS="@CMAKE_DL_LIBS@"
 
 
 WITH_PYTHON="@WITH_PYTHON@"
 PYTHON_LIBRARIES="@PYTHON_LIBRARIES@"
-LIBGLOG_LIBRARY="@LIBGLOG_LIBRARY@"
+GLOG_LIBRARIES="@GLOG_LIBRARIES@"
 GFLAGS_LIBRARIES="@GFLAGS_LIBRARIES@"
 GFLAGS_LOCATION="@GFLAGS_LOCATION@"
-CBLAS_LIBRARIES="@CBLAS_LIBS@"
+CBLAS_LIBRARIES="@CBLAS_LIBRARIES@"
 
-CUDA_LIBRARIES="@CUDA_LIBRARIES@"
+CUDA_LIBRARIES="@CUDA_cudart_shared_LIBRARY@"
 WITH_COVERALLS="@ON_COVERALLS@"
diff --git a/paddle/api/paddle_ld_flags.py b/paddle/api/paddle_ld_flags.py
index 7c8206e3fe..ad5dce209b 100644
--- a/paddle/api/paddle_ld_flags.py
+++ b/paddle/api/paddle_ld_flags.py
@@ -40,14 +40,14 @@ try:
             self.paddle_build_dir = PADDLE_BUILD_DIR
             self.paddle_build_dir = os.path.abspath(self.paddle_build_dir)
             self.with_gpu = PaddleLDFlag.cmake_bool(WITH_GPU)
-            self.protolib = PROTOBUF_LIB
-            self.zlib = ZLIB_LIB
+            self.protolib = PROTOBUF_LIBRARY
+            self.zlib = ZLIB_LIBRARIES
             self.thread = CMAKE_THREAD_LIB
             self.dl_libs = CMAKE_DL_LIBS
             self.with_python = PaddleLDFlag.cmake_bool(WITH_PYTHON)
             self.python_libs = PYTHON_LIBRARIES
 
-            self.glog_libs = LIBGLOG_LIBRARY
+            self.glog_libs = GLOG_LIBRARIES
 
             self.with_coverage = PaddleLDFlag.cmake_bool(WITH_COVERALLS)
             self.gflags_libs = GFLAGS_LIBRARIES
@@ -141,9 +141,12 @@ try:
 
         def c_flag(self):
             if self.with_coverage:
-                return ["-fprofile-arcs", "-ftest-coverage", "-O0", "-g"]
+                return [
+                    "-fprofile-arcs", "-ftest-coverage", "-O0", "-g",
+                    "-std=c++11"
+                ]
             else:
-                return None
+                return ["-std=c++11"]
 except ImportError:
 
     class PaddleLDFlag(object):
diff --git a/paddle/api/test/CMakeLists.txt b/paddle/api/test/CMakeLists.txt
index 08a0fe96a0..a2fa623c80 100644
--- a/paddle/api/test/CMakeLists.txt
+++ b/paddle/api/test/CMakeLists.txt
@@ -1,2 +1,2 @@
 add_test(NAME test_swig_api
-    COMMAND bash ${PROJ_ROOT}/paddle/api/test/run_tests.sh)
+    COMMAND bash ${PROJ_ROOT}/paddle/api/test/run_tests.sh ${PYTHON_EXECUTABLE})
diff --git a/paddle/api/test/run_tests.sh b/paddle/api/test/run_tests.sh
index 2f12ba0264..bcf06afa86 100755
--- a/paddle/api/test/run_tests.sh
+++ b/paddle/api/test/run_tests.sh
@@ -20,11 +20,7 @@ popd > /dev/null
 
 cd $SCRIPTPATH
 
-rm -rf .test_env
-virtualenv .test_env
-source .test_env/bin/activate
-
-pip --timeout 600  install ../../dist/*.whl
+$1 -m pip install ../../dist/*.whl
 
 test_list="testArguments.py testGradientMachine.py testMatrix.py  testVector.py testTrain.py testTrainer.py"
 
@@ -33,7 +29,7 @@ export PYTHONPATH=$PWD/../../../python/
 for fn in $test_list
 do
   echo "test $fn"
-  python $fn
+  $1 $fn
   if [ $? -ne 0 ]; then
     exit 1
   fi
diff --git a/paddle/cuda/CMakeLists.txt b/paddle/cuda/CMakeLists.txt
index aa1ff4a771..57fb89608f 100755
--- a/paddle/cuda/CMakeLists.txt
+++ b/paddle/cuda/CMakeLists.txt
@@ -88,6 +88,8 @@ else()
                 ${CUDA_CXX_SOURCES})
 endif()
 
+add_dependencies(paddle_cuda ${external_project_dependencies})
+
 add_style_check_target(paddle_cuda
                        ${CUDA_SOURCES}
                        ${CUDA_HEADERS}
diff --git a/paddle/cuda/include/hl_base.h b/paddle/cuda/include/hl_base.h
index 84c5f2d5c9..5b9884b786 100644
--- a/paddle/cuda/include/hl_base.h
+++ b/paddle/cuda/include/hl_base.h
@@ -16,7 +16,31 @@ limitations under the License. */
 #define HL_BASE_H_
 
 #include <cstddef>
-#include "paddle/utils/TypeDefs.h"
+
+#ifdef PADDLE_TYPE_DOUBLE
+#define HL_FLOAT_MAX 3.40282347e+38F
+#define HL_FLOAT_MIN 1.17549435e-38F
+using real = double;
+#else
+#define HL_FLOAT_MAX 1.7976931348623157e+308
+#define HL_FLOAT_MIN 2.2250738585072014e-308
+using real = float;
+#endif
+
+/**
+ * The maximum input value for exp, used to avoid overflow problem.
+ * currently only used for tanh function.
+ */
+#define EXP_MAX_INPUT 40.0
+
+/**
+ * @brief DIVUP(x, y) is similar to ceil(x / y).
+ * @note  For CUDA, DIVUP will be used to specify
+ *        the size of blockDim.
+ */
+#ifndef DIVUP
+#define DIVUP(x, y) (((x) + (y)-1) / (y))
+#endif
 
 /**
  * HPPL is an internal high performance parallel computing library
@@ -181,46 +205,6 @@ typedef struct {
   size_t nnz;
 } _hl_sparse_matrix_s, *hl_sparse_matrix_s;
 
-#ifndef PADDLE_TYPE_DOUBLE
-/**
- * HPPL data type: real (float or double)
- *
- * if real == float
- *
- * HL_FLOAT_MAX: 3.40282347e+38F
- *
- * HL_FLOAT_MIN: 1.17549435e-38F
- */
-#define HL_FLOAT_MAX 3.40282347e+38F
-/**
- * if real == double
- *
- * HL_FLOAT_MAX: 1.7976931348623157e+308
- *
- * HL_FLOAT_MIN: 2.2250738585072014e-308
- */
-#define HL_FLOAT_MIN 1.17549435e-38F
-#else
-#define HL_FLOAT_MAX 1.7976931348623157e+308
-#define HL_FLOAT_MIN 2.2250738585072014e-308
-#endif
-
-/**
- * The maximum input value for exp, used to avoid overflow problem.
- *
- * Currently only used for tanh function.
- */
-#define EXP_MAX_INPUT 40.0
-
-/**
- * @brief DIVUP(x, y) is similar to ceil(x / y).
- * @note  For CUDA, DIVUP will be used to specify
- *        the size of blockDim.
- */
-#ifndef DIVUP
-#define DIVUP(x, y) (((x) + (y)-1) / (y))
-#endif
-
 #ifdef __NVCC__
 
 #include "cuda_runtime.h"
diff --git a/paddle/cuda/include/hl_sequence.h b/paddle/cuda/include/hl_sequence.h
index 9bcd25b062..9f9d8f972e 100644
--- a/paddle/cuda/include/hl_sequence.h
+++ b/paddle/cuda/include/hl_sequence.h
@@ -48,78 +48,6 @@ extern void hl_max_sequence_forward(real* input,
 extern void hl_max_sequence_backward(
     real* outputGrad, int* index, real* inputGrad, int numSequences, int dim);
 
-/**
- * @brief   Context projection forward.
- *
- * @param[in]   input           input sequence.
- * @param[in]   sequence        sequence index.
- * @param[in]   weightData      padding data.
- * @param[out]  output          output sequence.
- * @param[in]   numSequences    number of sequences.
- * @param[in]   inputDim        input sequence dimension.
- * @param[in]   contextLength   context length.
- * @param[in]   contextStart    context start.
- * @param[in]   beginPad        number of extra timesteps added at the
- * beginning.
- * @param[in]   isPadding       trainable padding.
- *
- */
-extern void hl_context_projection_forward(real* input,
-                                          const int* sequence,
-                                          real* weightData,
-                                          real* output,
-                                          int numSequences,
-                                          int inputDim,
-                                          int contextLength,
-                                          int contextStart,
-                                          int beginPad,
-                                          bool isPadding);
-
-/**
- * @brief   Context projection backward data.
- *
- * @param[in]   outputGrad      output gradient.
- * @param[in]   sequence        sequence index.
- * @param[out]  inputGrad       input gradient.
- * @param[in]   numSequences    number of sequences.
- * @param[in]   inputDim        input sequence dimension.
- * @param[in]   contextLength   context length.
- * @param[in]   contextStart    context start.
- *
- */
-extern void hl_context_projection_backward_data(real* outputGrad,
-                                                const int* sequence,
-                                                real* inputGrad,
-                                                int numSequences,
-                                                int inputDim,
-                                                int contextLength,
-                                                int contextStart);
-
-/**
- * @brief   Context projection backward weight.
- *
- * @param[in]   outputGrad      output gradient.
- * @param[in]   sequence        sequence index.
- * @param[out]  weightGrad      weight gradient.
- * @param[in]   numSequences    number of sequences.
- * @param[in]   weightDim       input sequence dimension.
- * @param[in]   totalPad        number of extra timesteps.
- * @param[in]   contextLength   context length.
- * @param[in]   contextStart    context start.
- * @param[in]   beginPad        number of extra timesteps added at the
- * beginning.
- *
- */
-extern void hl_context_projection_backward_weight(real* outputGrad,
-                                                  const int* sequence,
-                                                  real* weightGrad,
-                                                  int numSequences,
-                                                  int weightDim,
-                                                  int totalPad,
-                                                  int contextLength,
-                                                  int contextStart,
-                                                  int beginPad);
-
 /**
  * @brief   Memory copy from sequence to batch.
  *
diff --git a/paddle/cuda/include/hl_warpctc_wrap.h b/paddle/cuda/include/hl_warpctc_wrap.h
index 79bf6c3db7..7885ae5701 100644
--- a/paddle/cuda/include/hl_warpctc_wrap.h
+++ b/paddle/cuda/include/hl_warpctc_wrap.h
@@ -15,8 +15,8 @@ limitations under the License. */
 #ifndef HL_WARPCTC_WRAP_H_
 #define HL_WARPCTC_WRAP_H_
 
+#include "ctc.h"
 #include "hl_base.h"
-#include "warp-ctc/include/ctc.h"
 
 typedef ctcStatus_t hl_warpctc_status_t;
 typedef ctcOptions hl_warpctc_options_t;
diff --git a/paddle/cuda/include/stub/hl_sequence_stub.h b/paddle/cuda/include/stub/hl_sequence_stub.h
index d6b07556f8..05e51bce9e 100644
--- a/paddle/cuda/include/stub/hl_sequence_stub.h
+++ b/paddle/cuda/include/stub/hl_sequence_stub.h
@@ -27,35 +27,6 @@ inline void hl_max_sequence_forward(real* input,
 inline void hl_max_sequence_backward(
     real* outputGrad, int* index, real* inputGrad, int numSequences, int dim) {}
 
-inline void hl_context_projection_forward(real* input,
-                                          const int* sequence,
-                                          real* weightData,
-                                          real* output,
-                                          int numSequences,
-                                          int inputDim,
-                                          int contextLength,
-                                          int contextStart,
-                                          int beginPad,
-                                          bool isPadding) {}
-
-inline void hl_context_projection_backward_data(real* outputGrad,
-                                                const int* sequence,
-                                                real* inputGrad,
-                                                int numSequences,
-                                                int inputDim,
-                                                int contextLength,
-                                                int contextStart) {}
-
-inline void hl_context_projection_backward_weight(real* outputGrad,
-                                                  const int* sequence,
-                                                  real* weightGrad,
-                                                  int numSequences,
-                                                  int weightDim,
-                                                  int totalPad,
-                                                  int contextLength,
-                                                  int contextStart,
-                                                  int beginPad) {}
-
 inline void hl_sequence2batch_copy(real* batch,
                                    real* sequence,
                                    const int* batchIndex,
diff --git a/paddle/cuda/src/hl_cuda_sequence.cu b/paddle/cuda/src/hl_cuda_sequence.cu
index 4e33ac443c..ba823de272 100644
--- a/paddle/cuda/src/hl_cuda_sequence.cu
+++ b/paddle/cuda/src/hl_cuda_sequence.cu
@@ -90,258 +90,6 @@ void hl_max_sequence_backward(real* outputGrad,
   CHECK_SYNC("hl_max_sequence_backward failed");
 }
 
-template <bool padding>
-__global__ void KeContextProjectionForward(real* input,
-                                           const int* sequence,
-                                           real* weightData,
-                                           real* output,
-                                           int inputDim,
-                                           int contextLength,
-                                           int contextStart,
-                                           int beginPad) {
-  int idx = threadIdx.x;
-  int blockSize = blockDim.x;
-  int sequenceId = blockIdx.x;
-  int seqStart = sequence[sequenceId];
-  int seqEnd = sequence[sequenceId+1];
-  real value = 0;
-
-  int instances = seqEnd - seqStart + contextLength - 1;
-  output += seqStart * inputDim * contextLength;
-  input += seqStart * inputDim;
-  for (int k = 0; k <= inputDim / blockSize; k++) {
-    if (idx < inputDim) {
-      for (int i = 0; i < instances; i++) {
-        // i + contextStart;
-        if ((i + contextStart) < 0) {
-          if (padding) {
-            value = weightData[i * inputDim + idx];
-          } else {
-            continue;
-          }
-        } else if ((i + contextStart) >= (seqEnd - seqStart)) {
-          if (padding) {
-            value =
-              weightData[(beginPad + i + contextStart - (seqEnd - seqStart)) *
-                         inputDim + idx];
-          } else {
-            continue;
-          }
-        } else {
-          value = input[(i + contextStart) * inputDim + idx];
-        }
-
-        int outx = (i - contextLength) < 0 ? i : (contextLength - 1);
-        int outy = (i - contextLength) < 0 ? 0 : (i - (contextLength - 1));
-        real* output_r =
-          output + outy * inputDim * contextLength + outx * inputDim;
-        for (int j = outy; j < seqEnd - seqStart; j++) {
-          output_r[idx] += value;
-          if (j - outy == outx) break;
-          output_r += (contextLength - 1) * inputDim;
-        }
-      }
-    }
-    idx += blockSize;
-  }
-}
-
-void hl_context_projection_forward(real* input,
-                                   const int* sequence,
-                                   real* weightData,
-                                   real* output,
-                                   int numSequences,
-                                   int inputDim,
-                                   int contextLength,
-                                   int contextStart,
-                                   int beginPad,
-                                   bool isPadding) {
-  CHECK_NOTNULL(input);
-  CHECK_NOTNULL(sequence);
-  CHECK_NOTNULL(output);
-  CHECK(!isPadding || weightData);
-
-  int blockSize = 128;
-  int blocksX = numSequences;
-  int blocksY = 1;
-  dim3 threads(blockSize, 1);
-  dim3 grid(blocksX, blocksY);
-
-  if (isPadding) {
-    KeContextProjectionForward<true><<< grid, threads, 0, STREAM_DEFAULT >>>
-      (input, sequence, weightData, output, inputDim,
-       contextLength, contextStart, beginPad);
-  } else  {
-    KeContextProjectionForward<false><<< grid, threads, 0, STREAM_DEFAULT >>>
-      (input, sequence, weightData, output, inputDim,
-       contextLength, contextStart, beginPad);
-  }
-  CHECK_SYNC("hl_context_projection_forward failed");
-}
-
-__global__ void KeContextProjectionBackwardData(real* outputGrad,
-                                                const int* sequence,
-                                                real* inputGrad,
-                                                int inputDim,
-                                                int contextLength,
-                                                int contextStart) {
-  int idx = threadIdx.x;
-  int blockSize = blockDim.x;
-  int sequenceId = blockIdx.x;
-  int seqStart = sequence[sequenceId];
-  int seqEnd = sequence[sequenceId+1];
-  real value = 0;
-
-  int instances = seqEnd - seqStart + contextLength - 1;
-  outputGrad += seqStart * inputDim * contextLength;
-  inputGrad += seqStart * inputDim;
-  for (int k = 0; k <= inputDim / blockSize; k++) {
-    if (idx < inputDim) {
-      for (int i = 0; i < instances; i++) {
-        if ((i + contextStart) < 0) {
-          continue;
-        } else if ((i + contextStart) >= (seqEnd - seqStart)) {
-          continue;
-        } else {
-          // value = 0;
-          value = inputGrad[(i + contextStart) * inputDim + idx];
-        }
-
-        int outx = (i - contextLength) < 0 ? i : (contextLength - 1);
-        int outy = (i - contextLength) < 0 ? 0 : (i - (contextLength - 1));
-        real* output_r =
-          outputGrad + outy * inputDim * contextLength + outx * inputDim;
-        for (int j = outy; j < seqEnd - seqStart; j++) {
-          value += output_r[idx];
-          if (j - outy == outx) break;
-          output_r += (contextLength - 1) * inputDim;
-        }
-        inputGrad[(i + contextStart) * inputDim + idx] = value;
-      }
-    }
-    idx += blockSize;
-  }
-}
-
-void hl_context_projection_backward_data(real* outputGrad,
-                                         const int* sequence,
-                                         real* inputGrad,
-                                         int numSequences,
-                                         int inputDim,
-                                         int contextLength,
-                                         int contextStart) {
-  CHECK_NOTNULL(outputGrad);
-  CHECK_NOTNULL(sequence);
-  CHECK_NOTNULL(inputGrad);
-
-  int blockSize = 128;
-  int blocksX = numSequences;
-  int blocksY = 1;
-  dim3 threads(blockSize, 1);
-  dim3 grid(blocksX, blocksY);
-  KeContextProjectionBackwardData<<< grid, threads, 0, STREAM_DEFAULT >>>
-    (outputGrad, sequence, inputGrad, inputDim, contextLength, contextStart);
-  CHECK_SYNC("hl_context_projection_backward_data failed");
-}
-
-template<int THREADS_X, int THREADS_Y>
-__global__ void KeContextProjectionBackwardWeight(real* outputGrad,
-                                                  const int* sequence,
-                                                  real* weightGrad,
-                                                  int numSequences,
-                                                  int weightDim,
-                                                  int contextLength,
-                                                  int contextStart,
-                                                  int beginPad) {
-  __shared__ real sum_s[THREADS_Y][THREADS_X];
-  int padOfBlock = (weightDim + THREADS_X - 1) / THREADS_X;
-  const int idx = threadIdx.x;
-  const int idy = threadIdx.y;
-  int padId = blockIdx.x / padOfBlock;
-  int weightIdx = idx + THREADS_X * (blockIdx.x % padOfBlock);
-  int instanceId;
-  real value = 0;
-  real* output_r;
-
-  sum_s[idy][idx] = 0.0f;
-  if (weightIdx < weightDim) {
-    for (int seqId = idy; seqId < numSequences; seqId += THREADS_Y) {
-      int seqStart = sequence[seqId];
-      int seqEnd = sequence[seqId+1];
-      output_r = outputGrad + seqStart * weightDim * contextLength;
-
-      if (contextStart < 0) {
-        if (padId + contextStart < 0) {
-          instanceId = padId;
-        } else {
-          // beginPad > 0;
-          instanceId = (padId - beginPad) + (seqEnd - seqStart) - contextStart;
-        }
-      } else {
-        if (padId + (seqEnd - seqStart) < contextStart) {
-          continue;
-        } else {
-          // beginPad == 0;
-          instanceId = padId + (seqEnd - seqStart) - contextStart;
-        }
-      }
-
-      int outx = (instanceId - contextLength) < 0 ?
-                 instanceId : (contextLength - 1);
-      int outy = (instanceId - contextLength) < 0 ?
-                 0 : (instanceId - (contextLength - 1));
-      output_r += outy * weightDim * contextLength + outx * weightDim;
-      for (int j = outy; j < seqEnd - seqStart; j++) {
-        value += output_r[weightIdx];
-        if (j - outy == outx) break;
-        output_r += (contextLength - 1) * weightDim;
-      }
-    }
-    sum_s[idy][idx] = value;
-  }
-  __syncthreads();
-
-  for (int stride = THREADS_Y/2; stride > 0; stride = stride/2) {
-    if (idy < stride) {
-      sum_s[idy][idx] += sum_s[idy + stride][idx];
-    }
-    __syncthreads();
-  }
-  __syncthreads();
-
-  if (weightIdx < weightDim) {
-    if (idy == 0) {
-      weightGrad[padId * weightDim + weightIdx] += sum_s[0][idx];
-    }
-  }
-}
-
-void hl_context_projection_backward_weight(real* outputGrad,
-                                           const int* sequence,
-                                           real* weightGrad,
-                                           int numSequences,
-                                           int weightDim,
-                                           int totalPad,
-                                           int contextLength,
-                                           int contextStart,
-                                           int beginPad) {
-  CHECK_NOTNULL(outputGrad);
-  CHECK_NOTNULL(sequence);
-  CHECK_NOTNULL(weightGrad);
-
-  int threadsX = 32;
-  int threadsY = 32;
-  int blocksX = totalPad * ((weightDim + threadsX - 1) / threadsX);
-  dim3 threads(threadsX, threadsY);
-  dim3 grid(blocksX, 1);
-
-  KeContextProjectionBackwardWeight<32, 32>
-    <<< grid, threads, 0, STREAM_DEFAULT >>>
-    (outputGrad, sequence, weightGrad, numSequences, weightDim,
-     contextLength, contextStart, beginPad);
-  CHECK_SYNC("hl_context_projection_backward_weight failed");
-}
-
 template<int blockDimX, int blockDimY, int gridDimX, bool AddRow>
 __global__ void KeMatrixAddRows(real* output,
                                 real* table,
diff --git a/paddle/cuda/src/hl_warpctc_wrap.cc b/paddle/cuda/src/hl_warpctc_wrap.cc
index 9ae8bc0f22..55b940ca67 100644
--- a/paddle/cuda/src/hl_warpctc_wrap.cc
+++ b/paddle/cuda/src/hl_warpctc_wrap.cc
@@ -29,7 +29,6 @@ void* warpctc_dso_handle = nullptr;
  * false, you need to add the path of libwarp-ctc.so to
  * the linked-libs of paddle or to LD_PRELOAD.
  */
-#ifdef PADDLE_USE_DSO
 #define DYNAMIC_LOAD_WARPCTC_WRAP(__name)                              \
   struct DynLoad__##__name {                                           \
     template <typename... Args>                                        \
@@ -41,15 +40,6 @@ void* warpctc_dso_handle = nullptr;
       return reinterpret_cast<warpctcFunc>(p_##_name)(args...);        \
     }                                                                  \
   } __name;  // struct DynLoad__##__name
-#else
-#define DYNAMIC_LOAD_WARPCTC_WRAP(__name)                        \
-  struct DynLoad__##__name {                                     \
-    template <typename... Args>                                  \
-    auto operator()(Args... args) -> decltype(__name(args...)) { \
-      return __name(args...);                                    \
-    }                                                            \
-  } __name;  // struct DynLoad__##__name
-#endif
 
 // include all needed warp-ctc functions
 DYNAMIC_LOAD_WARPCTC_WRAP(get_warpctc_version)
diff --git a/paddle/function/CMakeLists.txt b/paddle/function/CMakeLists.txt
index 0697842bbe..de85eeca82 100644
--- a/paddle/function/CMakeLists.txt
+++ b/paddle/function/CMakeLists.txt
@@ -1,23 +1,26 @@
-file(GLOB h_files . *_op.h)
-file(GLOB cpp_files . *_op.cpp)
+file(GLOB h_files . *Op.h)
+file(GLOB cpp_files . *Op.cpp)
 
 list(APPEND h_files Function.h)
 list(APPEND cpp_files Function.cpp)
 
 if(WITH_GPU)
-    file(GLOB cu_files . *_op_gpu.cu)
+    file(GLOB cu_files . *OpGpu.cu)
     cuda_compile(cu_objs ${cu_files})
 endif()
 
 add_library(paddle_function STATIC ${cpp_files} ${cu_objs})
+add_dependencies(paddle_function ${external_project_dependencies})
 
-add_library(paddle_test_main STATIC TestMain.cpp)
 
 if(WITH_GPU)
+if(WITH_TESTING)
     # TODO:
-    # file(GLOB test_files . *_op_test.cpp)
+    # file(GLOB test_files . *OpTest.cpp)
     # add_executable(${test_bin} EXCLUDE_FROM_ALL ${test_files})
-    add_simple_unittest(cross_map_normal_op_test)
+    add_simple_unittest(CrossMapNormalOpTest)
+    add_simple_unittest(ContextProjectionOpTest)
+endif()
 endif()
 
 add_style_check_target(paddle_function ${h_files})
diff --git a/paddle/function/ContextProjectionOp.cpp b/paddle/function/ContextProjectionOp.cpp
new file mode 100644
index 0000000000..07907fc1ba
--- /dev/null
+++ b/paddle/function/ContextProjectionOp.cpp
@@ -0,0 +1,373 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "ContextProjectionOp.h"
+#include "paddle/math/Matrix.h"
+#include "paddle/math/Vector.h"
+
+namespace paddle {
+
+template <>
+void ContextProjectionForward<DEVICE_TYPE_CPU>(CpuMatrix* out_mat,
+                                               const CpuMatrix* input_mat,
+                                               const CpuMatrix* weight_mat,
+                                               const CpuIVector& seq_vec,
+                                               size_t context_length,
+                                               int context_start,
+                                               size_t begin_pad) {
+  const int* starts = seq_vec.getData();
+  const size_t num_sequences = seq_vec.getSize() - 1;
+  auto w_mat = const_cast<CpuMatrix*>(weight_mat);
+  auto in_mat = const_cast<CpuMatrix*>(input_mat);
+  for (size_t i = 0; i < num_sequences; ++i) {
+    for (size_t j = 0; j < context_length; ++j) {
+      int begin = starts[i] + context_start + j;
+      int end = starts[i + 1] + context_start + j;
+      int dst_begin = starts[i];
+      int dst_end = starts[i + 1];
+      if (begin < starts[i]) {
+        int64_t pad_size =
+            std::min(starts[i] - begin, starts[i + 1] - starts[i]);
+        MatrixPtr mat = out_mat->subMatrix(starts[i], pad_size);
+        if (w_mat) {
+          MatrixPtr sub = w_mat->subMatrix(j, pad_size);
+          mat->addAtOffset(*sub, j * in_mat->getWidth());
+        }
+        dst_begin = starts[i] + pad_size;
+        begin = starts[i];
+      }
+      if (end > starts[i + 1]) {
+        int64_t pad_size =
+            std::min(end - starts[i + 1], starts[i + 1] - starts[i]);
+        MatrixPtr mat = out_mat->subMatrix(starts[i + 1] - pad_size, pad_size);
+        if (w_mat) {
+          MatrixPtr sub = w_mat->subMatrix(
+              begin_pad + context_start + j - pad_size, pad_size);
+          mat->addAtOffset(*sub, j * in_mat->getWidth());
+        }
+        dst_end = starts[i + 1] - pad_size;
+        end = starts[i + 1];
+      }
+      if (end <= begin) continue;
+      MatrixPtr src = in_mat->subMatrix(begin, end - begin);
+      MatrixPtr dst = out_mat->subMatrix(dst_begin, dst_end - dst_begin);
+      dst->addAtOffset(*src, j * in_mat->getWidth());
+    }
+  }
+}
+
+/**
+ * \param inputs[0] input value.
+ * \param inputs[1] input weight.
+ * \param inputs[2] input sequence.
+ * \param outputs[0] output value.
+ */
+template <DeviceType Device>
+class ContextProjectionForwardFunc : public FunctionBase {
+public:
+  void init(const FuncConfig& config) override {
+    context_length_ = config.get<size_t>("context_length");
+    context_start_ = config.get<int>("context_start");
+    begin_pad_ = config.get<size_t>("begin_pad");
+  }
+
+  void calc(const Arguments& inputs,
+            const Arguments& outputs,
+            const Arguments& inouts) override {
+    CHECK_EQ(3, static_cast<int>(inputs.size()));
+    CHECK_EQ(1, static_cast<int>(outputs.size()));
+    CHECK_EQ(0, static_cast<int>(inouts.size()));
+
+    CHECK(outputs[0].getData() && inputs[0].getData() && inputs[2].getData());
+    CHECK_EQ(static_cast<int>(outputs[0].dims_.size()), 2);
+    CHECK_EQ(static_cast<int>(inputs[0].dims_.size()), 2);
+    CHECK_EQ(static_cast<int>(inputs[1].dims_.size()), 2);
+    CHECK_EQ(static_cast<int>(inputs[2].dims_.size()), 1);
+    /// dim of output = dim of input * context_length
+    CHECK_EQ(outputs[0].dims_[1], inputs[0].dims_[1] * context_length_);
+    /// dim of input == dim of weight
+    CHECK_EQ(inputs[0].dims_[1], inputs[1].dims_[1]);
+    /// input and output has the same batch_size
+    CHECK_EQ(inputs[0].dims_[0], outputs[0].dims_[0]);
+
+    auto out_mat = std::make_shared<typename MatrixT<Device>::type>(
+        outputs[0].getData(), outputs[0].dims_[0], outputs[0].dims_[1]);
+    const auto in_mat = std::make_shared<typename MatrixT<Device>::type>(
+        inputs[0].getData(), inputs[0].dims_[0], inputs[0].dims_[1]);
+    const auto w_mat =
+        !inputs[1].getData()
+            ? nullptr
+            : std::make_shared<typename MatrixT<Device>::type>(
+                  inputs[1].getData(), inputs[1].dims_[0], inputs[1].dims_[1]);
+    typename SequenceT<Device>::type seq_vec(
+        inputs[2].dims_[0], reinterpret_cast<int*>(inputs[2].getData()));
+
+    ContextProjectionForward<Device>(out_mat.get(),
+                                     in_mat.get(),
+                                     w_mat.get(),
+                                     seq_vec,
+                                     context_length_,
+                                     context_start_,
+                                     begin_pad_);
+  }
+
+private:
+  size_t context_length_;
+  int context_start_;
+  size_t begin_pad_;
+};
+
+template <>
+void ContextProjectionBackward<DEVICE_TYPE_CPU>(CpuMatrix* out_grad_mat,
+                                                CpuMatrix* in_grad_mat,
+                                                CpuMatrix* w_grad_mat,
+                                                const CpuIVector& seq_vec,
+                                                size_t context_length,
+                                                int context_start,
+                                                size_t begin_pad,
+                                                bool is_padding,
+                                                size_t total_pad) {
+  CHECK(out_grad_mat);
+  size_t input_dim = in_grad_mat ? in_grad_mat->getWidth()
+                                 : w_grad_mat ? w_grad_mat->getWidth() : 0;
+  const int* starts = seq_vec.getData();
+  size_t num_sequences = seq_vec.getSize() - 1;
+  for (size_t i = 0; i < num_sequences; ++i) {
+    for (size_t j = 0; j < context_length; ++j) {
+      int begin = starts[i] + context_start + j;
+      int end = starts[i + 1] + context_start + j;
+      int dst_begin = starts[i];
+      int dst_end = starts[i + 1];
+      if (begin < starts[i]) {
+        int64_t pad_size =
+            std::min(starts[i] - begin, starts[i + 1] - starts[i]);
+        if (is_padding && w_grad_mat) {
+          MatrixPtr mat = out_grad_mat->subMatrix(starts[i], pad_size);
+          MatrixPtr sub = w_grad_mat->subMatrix(j, pad_size);
+          sub->addAtOffset(*mat, j * input_dim);
+        }
+        dst_begin = starts[i] + pad_size;
+        begin = starts[i];
+      }
+      if (end > starts[i + 1]) {
+        int64_t pad_size =
+            std::min(end - starts[i + 1], starts[i + 1] - starts[i]);
+        if (is_padding && w_grad_mat) {
+          MatrixPtr mat =
+              out_grad_mat->subMatrix(starts[i + 1] - pad_size, pad_size);
+          MatrixPtr sub = w_grad_mat->subMatrix(
+              begin_pad + context_start + j - pad_size, pad_size);
+          sub->addAtOffset(*mat, j * input_dim);
+        }
+        dst_end = starts[i + 1] - pad_size;
+        end = starts[i + 1];
+      }
+      if (end <= begin) continue;
+      if (!in_grad_mat) continue;
+      MatrixPtr src = in_grad_mat->subMatrix(begin, end - begin);
+      MatrixPtr dst = out_grad_mat->subMatrix(dst_begin, dst_end - dst_begin);
+      src->addAtOffset(*dst, j * input_dim);
+    }
+  }
+}
+
+/**
+ * \param inputs[0] input grad.
+ * \param inputs[1] weight grad.
+ * \param inputs[2] input sequence.
+ * \param outputs[0] output value.
+ */
+template <DeviceType Device>
+class ContextProjectionBackwardFunc : public FunctionBase {
+public:
+  void init(const FuncConfig& config) override {
+    context_length_ = config.get<size_t>("context_length");
+    context_start_ = config.get<int>("context_start");
+    begin_pad_ = config.get<size_t>("begin_pad");
+    is_padding_ = config.get<bool>("is_padding");
+    total_pad_ = config.get<size_t>("total_pad");
+  }
+
+  void calc(const Arguments& inputs,
+            const Arguments& outputs,
+            const Arguments& inouts) override {
+    CHECK_EQ(3, static_cast<int>(inputs.size()));
+    CHECK_EQ(1, static_cast<int>(outputs.size()));
+    CHECK_EQ(0, static_cast<int>(inouts.size()));
+
+    CHECK(outputs[0].getData() && inputs[2].getData());
+    CHECK_EQ(static_cast<int>(outputs[0].dims_.size()), 2);
+    CHECK_EQ(static_cast<int>(inputs[0].dims_.size()), 2);
+    CHECK_EQ(static_cast<int>(inputs[1].dims_.size()), 2);
+    CHECK_EQ(static_cast<int>(inputs[2].dims_.size()), 1);
+
+    /// dim of input == dim of weight
+    CHECK_EQ(inputs[0].dims_[1], inputs[1].dims_[1]);
+    /// input and output has the same batch_size
+    CHECK_EQ(inputs[0].dims_[0], outputs[0].dims_[0]);
+    /// dim of output = dim of input * context_length
+    CHECK_EQ(outputs[0].dims_[1], inputs[0].dims_[1] * context_length_);
+
+    auto out_grad_mat = std::make_shared<typename MatrixT<Device>::type>(
+        outputs[0].getData(), outputs[0].dims_[0], outputs[0].dims_[1]);
+    auto in_grad_mat =
+        !inputs[0].getData()
+            ? nullptr
+            : std::make_shared<typename MatrixT<Device>::type>(
+                  inputs[0].getData(), inputs[0].dims_[0], inputs[0].dims_[1]);
+    auto w_grad_mat =
+        !inputs[1].getData()
+            ? nullptr
+            : std::make_shared<typename MatrixT<Device>::type>(
+                  inputs[1].getData(), inputs[1].dims_[0], inputs[1].dims_[1]);
+    typename SequenceT<Device>::type seq_vec(
+        inputs[2].dims_[0], reinterpret_cast<int*>(inputs[2].getData()));
+
+    ContextProjectionBackward<Device>(out_grad_mat.get(),
+                                      in_grad_mat ? in_grad_mat.get() : nullptr,
+                                      w_grad_mat ? w_grad_mat.get() : nullptr,
+                                      seq_vec,
+                                      context_length_,
+                                      context_start_,
+                                      begin_pad_,
+                                      is_padding_,
+                                      total_pad_);
+  }
+
+private:
+  size_t context_length_;
+  int context_start_;
+  size_t begin_pad_;
+  bool is_padding_;
+  size_t total_pad_;
+};
+
+/**
+ * \param inputs[0] input grad.
+ * \param inputs[1] input sequence.
+ * \param outputs[0] output grad.
+ */
+template <DeviceType Device>
+class ContextProjectionBackwardDataFunc : public FunctionBase {
+public:
+  void init(const FuncConfig& config) override {
+    context_length_ = config.get<size_t>("context_length");
+    context_start_ = config.get<int>("context_start");
+  }
+
+  void calc(const Arguments& inputs,
+            const Arguments& outputs,
+            const Arguments& inouts) override {
+    CHECK_EQ(2, static_cast<int>(inputs.size()));
+    CHECK_EQ(1, static_cast<int>(outputs.size()));
+    CHECK_EQ(0, static_cast<int>(inouts.size()));
+    CHECK(inputs[0].getData() && outputs[0].getData() && inputs[1].getData());
+    CHECK_EQ(static_cast<int>(outputs[0].dims_.size()), 2);
+    CHECK_EQ(static_cast<int>(inputs[0].dims_.size()), 2);
+    CHECK_EQ(static_cast<int>(inputs[1].dims_.size()), 1);
+    CHECK_EQ(outputs[0].dims_[1], inputs[0].dims_[1] * context_length_);
+    /// input and output has the same batch_size
+    CHECK_EQ(inputs[0].dims_[0], outputs[0].dims_[0]);
+
+    auto out_grad_mat = std::make_shared<typename MatrixT<Device>::type>(
+        outputs[0].getData(), outputs[0].dims_[0], outputs[0].dims_[1]);
+    const auto in_grad_mat = std::make_shared<typename MatrixT<Device>::type>(
+        inputs[0].getData(), inputs[0].dims_[0], inputs[0].dims_[1]);
+    typename SequenceT<Device>::type seq_vec(
+        inputs[1].dims_[0], reinterpret_cast<int*>(inputs[1].getData()));
+
+    ContextProjectionBackwardData<Device>(out_grad_mat.get(),
+                                          in_grad_mat.get(),
+                                          seq_vec,
+                                          context_length_,
+                                          context_start_);
+  }
+
+private:
+  size_t context_length_;
+  int context_start_;
+};
+
+/**
+ * \param inputs[0] weight grad.
+ * \param inputs[1] input sequence.
+ * \param outputs[0] output grad.
+ */
+template <DeviceType Device>
+class ContextProjectionBackwardWeightFunc : public FunctionBase {
+public:
+  void init(const FuncConfig& config) override {
+    context_length_ = config.get<size_t>("context_length");
+    context_start_ = config.get<int>("context_start");
+    begin_pad_ = config.get<size_t>("begin_pad");
+    total_pad_ = config.get<size_t>("total_pad");
+  }
+
+  void calc(const Arguments& inputs,
+            const Arguments& outputs,
+            const Arguments& inouts) override {
+    CHECK_EQ(2, static_cast<int>(inputs.size()));
+    CHECK_EQ(1, static_cast<int>(outputs.size()));
+    CHECK_EQ(0, static_cast<int>(inouts.size()));
+
+    CHECK(inputs[0].getData() && outputs[0].getData() && inputs[1].getData());
+    CHECK_EQ(static_cast<int>(outputs[0].dims_.size()), 2);
+    CHECK_EQ(static_cast<int>(inputs[0].dims_.size()), 2);
+    CHECK_EQ(static_cast<int>(inputs[1].dims_.size()), 1);
+    CHECK_EQ(outputs[0].dims_[1], inputs[0].dims_[1] * context_length_);
+
+    auto out_grad_mat = std::make_shared<typename MatrixT<Device>::type>(
+        outputs[0].getData(), outputs[0].dims_[0], outputs[0].dims_[1]);
+    auto w_grad_mat = std::make_shared<typename MatrixT<Device>::type>(
+        inputs[0].getData(), inputs[0].dims_[0], inputs[0].dims_[1]);
+    typename SequenceT<Device>::type seq_vec(
+        inputs[1].dims_[0], reinterpret_cast<int*>(inputs[1].getData()));
+
+    ContextProjectionBackwardWeight<Device>(out_grad_mat.get(),
+                                            w_grad_mat.get(),
+                                            seq_vec,
+                                            context_length_,
+                                            context_start_,
+                                            total_pad_,
+                                            begin_pad_);
+  }
+
+private:
+  size_t context_length_;
+  int context_start_;
+  size_t begin_pad_;
+  size_t total_pad_;
+};
+
+REGISTER_TYPED_FUNC(ContextProjectionForward,
+                    CPU,
+                    ContextProjectionForwardFunc);
+REGISTER_TYPED_FUNC(ContextProjectionBackward,
+                    CPU,
+                    ContextProjectionBackwardFunc);
+#ifndef PADDLE_ONLY_CPU
+REGISTER_TYPED_FUNC(ContextProjectionForward,
+                    GPU,
+                    ContextProjectionForwardFunc);
+REGISTER_TYPED_FUNC(ContextProjectionBackward,
+                    GPU,
+                    ContextProjectionBackwardFunc);
+REGISTER_TYPED_FUNC(ContextProjectionBackwardData,
+                    GPU,
+                    ContextProjectionBackwardDataFunc);
+REGISTER_TYPED_FUNC(ContextProjectionBackwardWeight,
+                    GPU,
+                    ContextProjectionBackwardWeightFunc);
+#endif
+}  // namespace paddle
diff --git a/paddle/function/ContextProjectionOp.h b/paddle/function/ContextProjectionOp.h
new file mode 100644
index 0000000000..93eb050fde
--- /dev/null
+++ b/paddle/function/ContextProjectionOp.h
@@ -0,0 +1,85 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "Function.h"
+
+namespace paddle {
+
+/**
+ * \brief   Context Projection Forward.
+ *
+ * \param[out]  outputs           output data.
+ * \param[in]   input             input data.
+ * \param[in]   weight            input weight.
+ * \param[in]   sequence          input data.
+ * \param[in]   context_length    consecutive rows for concatenation.
+ * \param[in]   context_start     context start position.
+ * \param[in]   begin_pad         begining pad position.
+ * \param[in]   is_padding        whether padding 0 or not.
+ *
+ */
+template <DeviceType Device>
+void ContextProjectionForward(typename MatrixT<Device>::type* output,
+                              const typename MatrixT<Device>::type* input,
+                              const typename MatrixT<Device>::type* weight,
+                              const typename SequenceT<Device>::type& sequence,
+                              size_t context_length,
+                              int context_start,
+                              size_t begin_pad);
+
+/**
+ * \brief   Context Projection Backward.
+ *
+ * \param[out]  outputs           output gradient.
+ * \param[in]   input             input gradient.
+ * \param[in]   weight            input weight gradient.
+ * \param[in]   sequence          input data.
+ * \param[in]   context_length    consecutive rows for concatenation.
+ * \param[in]   context_start     context start position.
+ * \param[in]   begin_pad         begining pad position.
+ * \param[in]   is_padding        whether padding 0 or not.
+ *
+ */
+template <DeviceType Device>
+void ContextProjectionBackward(typename MatrixT<Device>::type* out_grad,
+                               typename MatrixT<Device>::type* in_grad,
+                               typename MatrixT<Device>::type* w_grad,
+                               const typename SequenceT<Device>::type& seq_vec,
+                               size_t context_length,
+                               int context_start,
+                               size_t begin_pad,
+                               bool is_padding,
+                               size_t total_pad);
+
+template <DeviceType Device>
+void ContextProjectionBackwardData(
+    typename MatrixT<Device>::type* out_grad,
+    typename MatrixT<Device>::type* in_grad,
+    const typename SequenceT<Device>::type& sequence,
+    size_t context_length,
+    int context_start);
+
+template <DeviceType Device>
+void ContextProjectionBackwardWeight(
+    typename MatrixT<Device>::type* out_grad,
+    typename MatrixT<Device>::type* w_grad,
+    const typename SequenceT<Device>::type& seq_vec,
+    size_t context_length,
+    int context_start,
+    size_t total_pad,
+    size_t begin_pad);
+
+}  // namespace paddle
diff --git a/paddle/function/ContextProjectionOpGpu.cu b/paddle/function/ContextProjectionOpGpu.cu
new file mode 100644
index 0000000000..1ec7058f96
--- /dev/null
+++ b/paddle/function/ContextProjectionOpGpu.cu
@@ -0,0 +1,401 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "hl_base.h"
+#include "ContextProjectionOp.h"
+
+namespace paddle {
+
+template <bool padding>
+__global__ void KeContextProjectionForward(const real* input,
+                                           const int* sequence,
+                                           const real* weight,
+                                           real* output,
+                                           int input_dim,
+                                           int context_length,
+                                           int context_start,
+                                           int begin_pad) {
+  int idx = threadIdx.x;
+  int block_size = blockDim.x;
+  int sequenceId = blockIdx.x;
+  int seq_start = sequence[sequenceId];
+  int seq_end = sequence[sequenceId+1];
+  real value = 0;
+
+  int instances = seq_end - seq_start + context_length - 1;
+  output += seq_start * input_dim * context_length;
+  input += seq_start * input_dim;
+  for (int k = 0; k <= input_dim / block_size; k++) {
+    if (idx < input_dim) {
+      for (int i = 0; i < instances; i++) {
+        // i + context_start;
+        if ((i + context_start) < 0) {
+          if (padding) {
+            value = weight[i * input_dim + idx];
+          } else {
+            continue;
+          }
+        } else if ((i + context_start) >= (seq_end - seq_start)) {
+          if (padding) {
+            value =
+              weight[(begin_pad + i + context_start - (seq_end - seq_start)) *
+                         input_dim + idx];
+          } else {
+            continue;
+          }
+        } else {
+          value = input[(i + context_start) * input_dim + idx];
+        }
+
+        int outx = (i - context_length) < 0 ? i : (context_length - 1);
+        int outy = (i - context_length) < 0 ? 0 : (i - (context_length - 1));
+        real* output_r =
+          output + outy * input_dim * context_length + outx * input_dim;
+        for (int j = outy; j < seq_end - seq_start; j++) {
+          output_r[idx] += value;
+          if (j - outy == outx) break;
+          output_r += (context_length - 1) * input_dim;
+        }
+      }
+    }
+    idx += block_size;
+  }
+}
+
+/**
+ * @brief   Context projection forward.
+ *
+ * @param[in]   input           input sequence.
+ * @param[in]   sequence        sequence index.
+ * @param[in]   weight          padding data.
+ * @param[out]  output          output sequence.
+ * @param[in]   num_sequences    number of sequences.
+ * @param[in]   input_dim        input sequence dimension.
+ * @param[in]   context_length   context length.
+ * @param[in]   context_start    context start.
+ * @param[in]   begin_pad        number of extra timesteps added at the
+ * beginning.
+ *
+ */
+void hl_context_projection_forward(const real* input,
+                                   const int* sequence,
+                                   const real* weight,
+                                   real* output,
+                                   size_t num_sequences,
+                                   size_t input_dim,
+                                   size_t context_length,
+                                   int context_start,
+                                   size_t begin_pad) {
+  CHECK_NOTNULL(input);
+  CHECK_NOTNULL(sequence);
+  CHECK_NOTNULL(output);
+
+  int block_size = 128;
+  int blocks_x = num_sequences;
+  int blocks_y = 1;
+  dim3 threads(block_size, 1);
+  dim3 grid(blocks_x, blocks_y);
+
+  if (weight) {
+    KeContextProjectionForward<true><<< grid, threads, 0, STREAM_DEFAULT >>>
+      (input, sequence, weight, output, input_dim,
+       context_length, context_start, begin_pad);
+  } else  {
+    KeContextProjectionForward<false><<< grid, threads, 0, STREAM_DEFAULT >>>
+      (input, sequence, weight, output, input_dim,
+       context_length, context_start, begin_pad);
+  }
+  CHECK_SYNC("hl_context_projection_forward failed");
+}
+
+template <>
+void ContextProjectionForward<DEVICE_TYPE_GPU>(GpuMatrix* output,
+                                               const GpuMatrix* input,
+                                               const GpuMatrix* weight,
+                                               const GpuIVector& sequence,
+                                               size_t context_length,
+                                               int context_start,
+                                               size_t begin_pad) {
+  CHECK(input && output);
+  hl_context_projection_forward(input->getData(),
+                                sequence.getData(),
+                                weight ? weight->getData() : nullptr,
+                                output->getData(),
+                                sequence.getSize() - 1,
+                                input->getWidth(),
+                                context_length,
+                                context_start,
+                                begin_pad);
+}
+
+__global__ void KeContextProjectionBackwardData(real* out_grad,
+                                                const int* sequence,
+                                                real* in_grad,
+                                                int input_dim,
+                                                int context_length,
+                                                int context_start) {
+  int idx = threadIdx.x;
+  int block_size = blockDim.x;
+  int sequenceId = blockIdx.x;
+  int seq_start = sequence[sequenceId];
+  int seq_end = sequence[sequenceId+1];
+  real value = 0;
+
+  int instances = seq_end - seq_start + context_length - 1;
+  out_grad += seq_start * input_dim * context_length;
+  in_grad += seq_start * input_dim;
+  for (int k = 0; k <= input_dim / block_size; k++) {
+    if (idx < input_dim) {
+      for (int i = 0; i < instances; i++) {
+        if ((i + context_start) < 0) {
+          continue;
+        } else if ((i + context_start) >= (seq_end - seq_start)) {
+          continue;
+        } else {
+          // value = 0;
+          value = in_grad[(i + context_start) * input_dim + idx];
+        }
+
+        int outx = (i - context_length) < 0 ? i : (context_length - 1);
+        int outy = (i - context_length) < 0 ? 0 : (i - (context_length - 1));
+        real* output_r =
+          out_grad + outy * input_dim * context_length + outx * input_dim;
+        for (int j = outy; j < seq_end - seq_start; j++) {
+          value += output_r[idx];
+          if (j - outy == outx) break;
+          output_r += (context_length - 1) * input_dim;
+        }
+        in_grad[(i + context_start) * input_dim + idx] = value;
+      }
+    }
+    idx += block_size;
+  }
+}
+
+/**
+ * @brief   Context projection backward data.
+ *
+ * @param[in]   out_grad         output gradient.
+ * @param[in]   sequence         sequence index.
+ * @param[out]  input_grad       input gradient.
+ * @param[in]   num_sequences    number of sequences.
+ * @param[in]   input_dim        input sequence dimension.
+ * @param[in]   context_length   context length.
+ * @param[in]   context_start    context start.
+ *
+ */
+void hl_context_projection_backward_data(real* out_grad,
+                                         const int* sequence,
+                                         real* input_grad,
+                                         size_t num_sequences,
+                                         size_t input_dim,
+                                         size_t context_length,
+                                         int context_start) {
+  CHECK_NOTNULL(out_grad);
+  CHECK_NOTNULL(sequence);
+  CHECK_NOTNULL(input_grad);
+
+  int block_size = 128;
+  int blocks_x = num_sequences;
+  int blocks_y = 1;
+  dim3 threads(block_size, 1);
+  dim3 grid(blocks_x, blocks_y);
+  KeContextProjectionBackwardData<<< grid, threads, 0, STREAM_DEFAULT >>>
+    (out_grad, sequence, input_grad, input_dim, context_length, context_start);
+  CHECK_SYNC("hl_context_projection_backward_data failed");
+}
+
+template <>
+void ContextProjectionBackwardData<DEVICE_TYPE_GPU>(GpuMatrix* out_grad,
+                                                    GpuMatrix* in_grad,
+                                                    const GpuIVector& sequence,
+                                                    size_t context_length,
+                                                    int context_start) {
+  CHECK(in_grad && out_grad);
+  hl_context_projection_backward_data(out_grad->getData(),
+                                      sequence.getData(),
+                                      in_grad->getData(),
+                                      sequence.getSize() - 1,
+                                      in_grad->getWidth(),
+                                      context_length,
+                                      context_start);
+}
+
+template<int THREADS_X, int THREADS_Y>
+__global__ void KeContextProjectionBackwardWeight(real* out_grad,
+                                                  const int* sequence,
+                                                  real* w_grad,
+                                                  int num_sequences,
+                                                  int w_dim,
+                                                  int context_length,
+                                                  int context_start,
+                                                  int begin_pad) {
+  __shared__ real sum_s[THREADS_Y][THREADS_X];
+  int pad_of_block = (w_dim + THREADS_X - 1) / THREADS_X;
+  const int idx = threadIdx.x;
+  const int idy = threadIdx.y;
+  int padId = blockIdx.x / pad_of_block;
+  int weight_idx = idx + THREADS_X * (blockIdx.x % pad_of_block);
+  int instanceId;
+  real value = 0;
+  real* output_r;
+
+  sum_s[idy][idx] = 0.0f;
+  if (weight_idx < w_dim) {
+    for (int seqId = idy; seqId < num_sequences; seqId += THREADS_Y) {
+      int seq_start = sequence[seqId];
+      int seq_end = sequence[seqId+1];
+      output_r = out_grad + seq_start * w_dim * context_length;
+
+      if (context_start < 0) {
+        if (padId + context_start < 0) {
+          instanceId = padId;
+        } else {
+          // begin_pad > 0;
+          instanceId = (padId - begin_pad) +
+            (seq_end - seq_start) - context_start;
+        }
+      } else {
+        if (padId + (seq_end - seq_start) < context_start) {
+          continue;
+        } else {
+          // begin_pad == 0;
+          instanceId = padId + (seq_end - seq_start) - context_start;
+        }
+      }
+
+      int outx = (instanceId - context_length) < 0 ?
+                 instanceId : (context_length - 1);
+      int outy = (instanceId - context_length) < 0 ?
+                 0 : (instanceId - (context_length - 1));
+      output_r += outy * w_dim * context_length + outx * w_dim;
+      for (int j = outy; j < seq_end - seq_start; j++) {
+        value += output_r[weight_idx];
+        if (j - outy == outx) break;
+        output_r += (context_length - 1) * w_dim;
+      }
+    }
+    sum_s[idy][idx] = value;
+  }
+  __syncthreads();
+
+  for (int stride = THREADS_Y/2; stride > 0; stride = stride/2) {
+    if (idy < stride) {
+      sum_s[idy][idx] += sum_s[idy + stride][idx];
+    }
+    __syncthreads();
+  }
+  __syncthreads();
+
+  if (weight_idx < w_dim) {
+    if (idy == 0) {
+      w_grad[padId * w_dim + weight_idx] += sum_s[0][idx];
+    }
+  }
+}
+
+/**
+ * @brief   Context projection backward weight.
+ *
+ * @param[in]   out_grad         output gradient.
+ * @param[in]   sequence         sequence index.
+ * @param[out]  w_grad           weight gradient.
+ * @param[in]   num_sequences    number of sequences.
+ * @param[in]   w_dim            input sequence dimension.
+ * @param[in]   total_pad        number of extra timesteps.
+ * @param[in]   context_length   context length.
+ * @param[in]   context_start    context start.
+ * @param[in]   begin_pad        number of extra timesteps added at the
+ * beginning.
+ *
+ */
+void hl_context_projection_backward_weight(real* out_grad,
+                                           const int* sequence,
+                                           real* w_grad,
+                                           size_t num_sequences,
+                                           size_t w_dim,
+                                           size_t total_pad,
+                                           size_t context_length,
+                                           int context_start,
+                                           size_t begin_pad) {
+  CHECK_NOTNULL(out_grad);
+  CHECK_NOTNULL(sequence);
+  CHECK_NOTNULL(w_grad);
+
+  int threads_x = 32;
+  int threads_y = 32;
+  int blocks_x = total_pad * ((w_dim + threads_x - 1) / threads_x);
+  dim3 threads(threads_x, threads_y);
+  dim3 grid(blocks_x, 1);
+
+  KeContextProjectionBackwardWeight<32, 32>
+    <<< grid, threads, 0, STREAM_DEFAULT >>>
+    (out_grad, sequence, w_grad, num_sequences, w_dim,
+     context_length, context_start, begin_pad);
+  CHECK_SYNC("hl_context_projection_backward_weight failed");
+}
+
+template <>
+void ContextProjectionBackwardWeight<DEVICE_TYPE_GPU>(
+        GpuMatrix* out_grad,
+        GpuMatrix* w_grad,
+        const GpuIVector& seq_vec,
+        size_t context_length,
+        int context_start,
+        size_t total_pad,
+        size_t begin_pad) {
+  CHECK(out_grad && w_grad);
+  hl_context_projection_backward_weight(out_grad->getData(),
+                                        seq_vec.getData(),
+                                        w_grad->getData(),
+                                        seq_vec.getSize() - 1,
+                                        w_grad->getWidth(),
+                                        total_pad,
+                                        context_length,
+                                        context_start,
+                                        begin_pad);
+}
+
+template <>
+void ContextProjectionBackward<DEVICE_TYPE_GPU>(GpuMatrix* out_grad,
+                                                GpuMatrix* in_grad,
+                                                GpuMatrix* w_grad,
+                                                const GpuIVector& sequence,
+                                                size_t context_length,
+                                                int context_start,
+                                                size_t begin_pad,
+                                                bool is_padding,
+                                                size_t total_pad) {
+    CHECK(out_grad);
+    if (in_grad) {
+        ContextProjectionBackwardData<DEVICE_TYPE_GPU>(
+                out_grad,
+                in_grad,
+                sequence,
+                context_length,
+                context_start);
+    }
+    if (is_padding && w_grad) {
+        ContextProjectionBackwardWeight<DEVICE_TYPE_GPU>(
+                out_grad,
+                w_grad,
+                sequence,
+                context_length,
+                context_start,
+                total_pad,
+                begin_pad);
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/function/ContextProjectionOpTest.cpp b/paddle/function/ContextProjectionOpTest.cpp
new file mode 100644
index 0000000000..6223d2fd23
--- /dev/null
+++ b/paddle/function/ContextProjectionOpTest.cpp
@@ -0,0 +1,172 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include "FunctionTest.h"
+#include "paddle/math/Matrix.h"
+#include "paddle/testing/TestUtil.h"
+
+using namespace paddle;  // NOLINT
+
+void testMatrixProjectionForward(int context_start,
+                                 size_t context_length,
+                                 bool is_padding,
+                                 size_t batch_size,
+                                 size_t input_dim) {
+  size_t pad = std::max(0, -context_start) +
+               std::max(0, (int)(context_start + context_length - 1));
+  if (pad == 0) is_padding = false;
+
+  FunctionCompare compare("ContextProjectionForward",
+                          FuncConfig()
+                              .set("context_length", context_length)
+                              .set("context_start", context_start)
+                              .set("begin_pad", std::max(0, -context_start)));
+
+  CpuMatrix cpu_in(batch_size, input_dim);
+  cpu_in.randomizeUniform();
+  GpuMatrix gpu_in(batch_size, input_dim);
+  gpu_in.copyFrom(cpu_in);
+  auto cpu_weight =
+      is_padding ? std::make_shared<CpuMatrix>(pad, input_dim) : nullptr;
+  auto gpu_weight =
+      is_padding ? std::make_shared<GpuMatrix>(pad, input_dim) : nullptr;
+  if (is_padding) {
+    cpu_weight->randomizeUniform();
+    gpu_weight->copyFrom(*cpu_weight);
+  }
+  IVectorPtr cpu_seq;
+  generateSequenceStartPositions(batch_size, cpu_seq);
+  IVectorPtr gpu_seq = IVector::create(cpu_seq->getSize(), true);
+  gpu_seq->copyFrom(*cpu_seq);
+
+  CpuMatrix cpu_out(batch_size, input_dim * context_length);
+  GpuMatrix gpu_out(batch_size, input_dim * context_length);
+  cpu_out.randomizeUniform();
+  gpu_out.copyFrom(cpu_out);
+
+  compare.getCpuFunction()->calc(
+      {Tensor(cpu_in.getData(), Dims{batch_size, input_dim}),
+       Tensor(cpu_weight ? cpu_weight->getData() : nullptr,
+              Dims{pad, input_dim}),
+       Tensor(reinterpret_cast<real*>(cpu_seq->getData()),
+              Dims{cpu_seq->getSize()})},
+      {Tensor(cpu_out.getData(), Dims{batch_size, input_dim * context_length})},
+      {});
+  compare.getGpuFunction()->calc(
+      {Tensor(gpu_in.getData(), Dims{batch_size, input_dim}),
+       Tensor(gpu_weight ? gpu_weight->getData() : nullptr,
+              Dims{pad, input_dim}),
+       Tensor(reinterpret_cast<real*>(gpu_seq->getData()),
+              Dims{gpu_seq->getSize()})},
+      {Tensor(gpu_out.getData(), Dims{batch_size, input_dim * context_length})},
+      {});
+
+  autotest::TensorCheckEqual(cpu_out, gpu_out);
+}
+
+void testMatrixProjectionBackward(int context_start,
+                                  int context_length,
+                                  bool is_padding,
+                                  size_t batch_size,
+                                  size_t input_dim) {
+  size_t pad = std::max(0, -context_start) +
+               std::max(0, (int)(context_start + context_length - 1));
+  if (pad == 0) is_padding = false;
+
+  FunctionCompare compare("ContextProjectionBackward",
+                          FuncConfig()
+                              .set("context_length", context_length)
+                              .set("context_start", context_start)
+                              .set("begin_pad", std::max(0, -context_start))
+                              .set("is_padding", is_padding)
+                              .set("total_pad", pad));
+
+  CpuMatrix cpu_in_grad(batch_size, input_dim);
+  cpu_in_grad.randomizeUniform();
+  GpuMatrix gpu_in_grad(batch_size, input_dim);
+  gpu_in_grad.copyFrom(cpu_in_grad);
+
+  CpuMatrix cpu_out_grad(batch_size, input_dim * context_length);
+  cpu_out_grad.randomizeUniform();
+  GpuMatrix gpu_out_grad(batch_size, input_dim * context_length);
+  gpu_out_grad.copyFrom(cpu_out_grad);
+
+  IVectorPtr cpu_seq;
+  generateSequenceStartPositions(batch_size, cpu_seq);
+  IVectorPtr gpu_seq = IVector::create(cpu_seq->getSize(), true);
+  gpu_seq->copyFrom(*cpu_seq);
+
+  auto cpu_w_grad =
+      is_padding ? std::make_shared<CpuMatrix>(pad, input_dim) : nullptr;
+  auto gpu_w_grad =
+      is_padding ? std::make_shared<GpuMatrix>(pad, input_dim) : nullptr;
+  if (is_padding) {
+    cpu_w_grad->randomizeUniform();
+    gpu_w_grad->copyFrom(*cpu_w_grad);
+  }
+
+  compare.getCpuFunction()->calc(
+      {Tensor(cpu_in_grad.getData(), Dims{batch_size, input_dim}),
+       Tensor(cpu_w_grad ? cpu_w_grad->getData() : nullptr,
+              Dims{pad, input_dim}),
+       Tensor(reinterpret_cast<real*>(cpu_seq->getData()),
+              Dims{cpu_seq->getSize()})},
+      {Tensor(cpu_out_grad.getData(),
+              Dims{batch_size, input_dim * context_length})},
+      {});
+
+  compare.getGpuFunction()->calc(
+      {Tensor(gpu_in_grad.getData(), Dims{batch_size, input_dim}),
+       Tensor(gpu_w_grad ? gpu_w_grad->getData() : nullptr,
+              Dims{pad, input_dim}),
+       Tensor(reinterpret_cast<real*>(gpu_seq->getData()),
+              Dims{gpu_seq->getSize()})},
+      {Tensor(gpu_out_grad.getData(),
+              Dims{batch_size, input_dim * context_length})},
+      {});
+
+  autotest::TensorCheckErr(cpu_in_grad, gpu_in_grad);
+  if (is_padding) {
+    autotest::TensorCheckErr(*cpu_w_grad, *gpu_w_grad);
+  }
+}
+
+TEST(ContextProjection, projection) {
+  for (auto context_start : {-5, -3, -1, 0, 3}) {
+    for (auto context_length : {1, 2, 5, 7}) {
+      for (auto trainable_padding : {false, true}) {
+        for (auto batch_size : {1, 2, 5, 20, 100}) {
+          for (auto input_dim : {15, 32, 63, 128, 200}) {
+            VLOG(3) << " context_start=" << context_start
+                    << " context_length=" << context_length
+                    << " trainable_padding=" << trainable_padding
+                    << " batch_size=" << batch_size
+                    << " input_dim=" << input_dim;
+            testMatrixProjectionForward(context_start,
+                                        context_length,
+                                        trainable_padding,
+                                        batch_size,
+                                        input_dim);
+            testMatrixProjectionBackward(context_start,
+                                         context_length,
+                                         trainable_padding,
+                                         batch_size,
+                                         input_dim);
+          }
+        }
+      }
+    }
+  }
+}
diff --git a/paddle/function/cross_map_normal_op.cpp b/paddle/function/CrossMapNormalOp.cpp
similarity index 94%
rename from paddle/function/cross_map_normal_op.cpp
rename to paddle/function/CrossMapNormalOp.cpp
index a9c7693830..96a7a30eeb 100644
--- a/paddle/function/cross_map_normal_op.cpp
+++ b/paddle/function/CrossMapNormalOp.cpp
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "cross_map_normal_op.h"
+#include "CrossMapNormalOp.h"
 #include "paddle/math/Vector.h"
 
 namespace paddle {
@@ -128,11 +128,11 @@ public:
   void calc(const Arguments& inputs,
             const Arguments& outputs,
             const Arguments& inouts) override {
-    CHECK_EQ(1, inputs.size());
-    CHECK_EQ(2, outputs.size());
-    CHECK_EQ(0, inouts.size());
+    CHECK_EQ(1, static_cast<int>(inputs.size()));
+    CHECK_EQ(2, static_cast<int>(outputs.size()));
+    CHECK_EQ(0, static_cast<int>(inouts.size()));
 
-    CHECK_EQ(inputs[0].dims_.size(), 4);
+    CHECK_EQ(static_cast<int>(inputs[0].dims_.size()), 4);
     for (size_t i = 0; i < inputs[0].dims_.size(); i++) {
       CHECK_EQ(inputs[0].dims_[i], outputs[0].dims_[i]);
       CHECK_EQ(inputs[0].dims_[i], outputs[1].dims_[i]);
@@ -180,11 +180,11 @@ public:
   void calc(const Arguments& inputs,
             const Arguments& outputs,
             const Arguments& inouts) override {
-    CHECK_EQ(4, inputs.size());
-    CHECK_EQ(1, outputs.size());
-    CHECK_EQ(0, inouts.size());
+    CHECK_EQ(4, static_cast<int>(inputs.size()));
+    CHECK_EQ(1, static_cast<int>(outputs.size()));
+    CHECK_EQ(0, static_cast<int>(inouts.size()));
 
-    CHECK_EQ(inputs[0].dims_.size(), 4);
+    CHECK_EQ(static_cast<int>(inputs[0].dims_.size()), 4);
     for (size_t i = 0; i < inputs[0].dims_.size(); i++) {
       CHECK_EQ(inputs[0].dims_[i], inputs[1].dims_[i]);
       CHECK_EQ(inputs[0].dims_[i], inputs[2].dims_[i]);
diff --git a/paddle/function/cross_map_normal_op.h b/paddle/function/CrossMapNormalOp.h
similarity index 100%
rename from paddle/function/cross_map_normal_op.h
rename to paddle/function/CrossMapNormalOp.h
diff --git a/paddle/function/cross_map_normal_op_gpu.cu b/paddle/function/CrossMapNormalOpGpu.cu
similarity index 99%
rename from paddle/function/cross_map_normal_op_gpu.cu
rename to paddle/function/CrossMapNormalOpGpu.cu
index aae4f461b6..b33dd10834 100644
--- a/paddle/function/cross_map_normal_op_gpu.cu
+++ b/paddle/function/CrossMapNormalOpGpu.cu
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "hl_base.h"
-#include "cross_map_normal_op.h"
+#include "CrossMapNormalOp.h"
 
 namespace paddle {
 
diff --git a/paddle/function/cross_map_normal_op_test.cpp b/paddle/function/CrossMapNormalOpTest.cpp
similarity index 98%
rename from paddle/function/cross_map_normal_op_test.cpp
rename to paddle/function/CrossMapNormalOpTest.cpp
index 22692691bd..d65d9310af 100644
--- a/paddle/function/cross_map_normal_op_test.cpp
+++ b/paddle/function/CrossMapNormalOpTest.cpp
@@ -15,6 +15,8 @@ limitations under the License. */
 #include <gtest/gtest.h>
 #include "FunctionTest.h"
 
+namespace paddle {
+
 TEST(CrossMapNormal, real) {
   for (size_t numSamples : {5, 32}) {
     for (size_t channels : {1, 5, 32}) {
@@ -69,3 +71,5 @@ TEST(CrossMapNormalGrad, real) {
     }
   }
 }
+
+}  // namespace paddle
diff --git a/paddle/function/Function.cpp b/paddle/function/Function.cpp
index 02880e5ea1..614e76b8ac 100644
--- a/paddle/function/Function.cpp
+++ b/paddle/function/Function.cpp
@@ -30,20 +30,52 @@ real FuncConfig::get<real>(const std::string& key) const {
   return it->second.r;
 }
 
+template <>
+int FuncConfig::get<int>(const std::string& key) const {
+  auto it = valueMap_.find(key);
+  CHECK(it != valueMap_.end()) << "Cannot find value: '" << key << "'";
+  return it->second.i;
+}
+
+template <>
+bool FuncConfig::get<bool>(const std::string& key) const {
+  auto it = valueMap_.find(key);
+  CHECK(it != valueMap_.end()) << "Cannot find value: '" << key << "'";
+  return it->second.b;
+}
+
 template <>
 FuncConfig& FuncConfig::set<size_t>(const std::string& key, size_t v) {
-  CHECK(valueMap_.count(key) == 0) << "Duplicated value: " << key;
+  CHECK_EQ(static_cast<int>(valueMap_.count(key)), 0) << "Duplicated value: "
+                                                      << key;
   valueMap_[key].s = v;
   return *this;
 }
 
 template <>
 FuncConfig& FuncConfig::set<real>(const std::string& key, real v) {
-  CHECK(valueMap_.count(key) == 0) << "Duplicated value: " << key;
+  CHECK_EQ(static_cast<int>(valueMap_.count(key)), 0) << "Duplicated value: "
+                                                      << key;
   valueMap_[key].r = v;
   return *this;
 }
 
+template <>
+FuncConfig& FuncConfig::set<int>(const std::string& key, int v) {
+  CHECK_EQ(static_cast<int>(valueMap_.count(key)), 0) << "Duplicated value: "
+                                                      << key;
+  valueMap_[key].i = v;
+  return *this;
+}
+
+template <>
+FuncConfig& FuncConfig::set<bool>(const std::string& key, bool v) {
+  CHECK_EQ(static_cast<int>(valueMap_.count(key)), 0) << "Duplicated value: "
+                                                      << key;
+  valueMap_[key].b = v;
+  return *this;
+}
+
 ClassRegistrar<FunctionBase> FunctionBase::funcRegistrar_;
 
 }  // namespace paddle
diff --git a/paddle/function/Function.h b/paddle/function/Function.h
index 095584c0b1..9e8cbb8e48 100644
--- a/paddle/function/Function.h
+++ b/paddle/function/Function.h
@@ -40,6 +40,19 @@ struct MatrixT<DEVICE_TYPE_GPU> {
   using type = GpuMatrix;
 };
 
+template <DeviceType Device>
+struct SequenceT;
+
+template <>
+struct SequenceT<DEVICE_TYPE_CPU> {
+  using type = CpuIVector;
+};
+
+template <>
+struct SequenceT<DEVICE_TYPE_GPU> {
+  using type = GpuIVector;
+};
+
 typedef std::vector<size_t> Dims;
 
 class Tensor {
@@ -59,6 +72,8 @@ public:
   union value {
     size_t s;
     real r;
+    int i;
+    bool b;
   };
 
   template <typename T>
diff --git a/paddle/function/FunctionTest.h b/paddle/function/FunctionTest.h
index a8c5e412bd..32131037f6 100644
--- a/paddle/function/FunctionTest.h
+++ b/paddle/function/FunctionTest.h
@@ -33,25 +33,33 @@ public:
     // init cpu and gpu arguments
     auto initArgs = [=](
         Arguments& cpuArgs, Arguments& gpuArgs, const Arguments& inArgs) {
-      for (auto arg : inArgs) {
+      for (const auto arg : inArgs) {
         size_t size = sizeof(real);
-        for (auto dim : arg.dims_) {
+        for (const auto dim : arg.dims_) {
           size *= dim;
         }
-        cpuMemory.emplace_back(std::make_shared<CpuMemoryHandle>(size));
-        gpuMemory.emplace_back(std::make_shared<GpuMemoryHandle>(size));
-        cpuArgs.emplace_back(
-            Tensor((real*)cpuMemory.back()->getBuf(), arg.dims_));
-        gpuArgs.emplace_back(
-            Tensor((real*)gpuMemory.back()->getBuf(), arg.dims_));
-
-        // will use an api to refactor this code.
-        CpuVector cpuVector(size / sizeof(real),
-                            (real*)cpuArgs.back().getData());
-        GpuVector gpuVector(size / sizeof(real),
-                            (real*)gpuArgs.back().getData());
-        cpuVector.uniform(0.001, 1);
-        gpuVector.copyFrom(cpuVector);
+        if (arg.getData()) {
+          // todo(tianbing), waste unnecessary mem here
+          cpuMemory.emplace_back(std::make_shared<CpuMemoryHandle>(size));
+          gpuMemory.emplace_back(std::make_shared<GpuMemoryHandle>(size));
+          cpuArgs.emplace_back(Tensor((real*)arg.getData(), arg.dims_));
+          gpuArgs.emplace_back(Tensor((real*)arg.getData(), arg.dims_));
+          // already init outside
+        } else {
+          cpuMemory.emplace_back(std::make_shared<CpuMemoryHandle>(size));
+          gpuMemory.emplace_back(std::make_shared<GpuMemoryHandle>(size));
+          cpuArgs.emplace_back(
+              Tensor((real*)cpuMemory.back()->getBuf(), arg.dims_));
+          gpuArgs.emplace_back(
+              Tensor((real*)gpuMemory.back()->getBuf(), arg.dims_));
+          // will use an api to refactor this code.
+          CpuVector cpuVector(size / sizeof(real),
+                              (real*)cpuArgs.back().getData());
+          GpuVector gpuVector(size / sizeof(real),
+                              (real*)gpuArgs.back().getData());
+          cpuVector.uniform(0.001, 1);
+          gpuVector.copyFrom(cpuVector);
+        }
       }
     };
     initArgs(cpuInputs, gpuInputs, inputs);
@@ -81,6 +89,10 @@ public:
     checkArgs(cpuInouts, gpuInouts);
   }
 
+  std::shared_ptr<FunctionBase> getCpuFunction() const { return cpu; }
+
+  std::shared_ptr<FunctionBase> getGpuFunction() const { return gpu; }
+
 protected:
   std::shared_ptr<FunctionBase> cpu;
   std::shared_ptr<FunctionBase> gpu;
@@ -95,8 +107,3 @@ protected:
 };
 
 }  // namespace paddle
-
-using paddle::FunctionCompare;
-using paddle::FuncConfig;
-using paddle::Dims;
-using paddle::Tensor;
diff --git a/paddle/gserver/dataproviders/DataProvider.h b/paddle/gserver/dataproviders/DataProvider.h
index 9b7f7e36ce..9a2ad7567f 100644
--- a/paddle/gserver/dataproviders/DataProvider.h
+++ b/paddle/gserver/dataproviders/DataProvider.h
@@ -30,11 +30,11 @@ limitations under the License. */
 #include "paddle/math/Vector.h"
 #include "paddle/parameter/Argument.h"
 #include "paddle/utils/ClassRegistrar.h"
+#include "paddle/utils/Common.h"
 #include "paddle/utils/Locks.h"
 #include "paddle/utils/Logging.h"
 #include "paddle/utils/Queue.h"
 #include "paddle/utils/ThreadLocal.h"
-#include "paddle/utils/TypeDefs.h"
 #include "paddle/utils/Util.h"
 
 namespace paddle {
diff --git a/paddle/gserver/dataproviders/PyDataProvider.cpp b/paddle/gserver/dataproviders/PyDataProvider.cpp
index 5bdd55309c..b53790e764 100644
--- a/paddle/gserver/dataproviders/PyDataProvider.cpp
+++ b/paddle/gserver/dataproviders/PyDataProvider.cpp
@@ -13,8 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "PyDataProvider.h"
-#include <fenv.h>
-#include "paddle/utils/Excepts.h"
+#include "paddle/utils/Common.h"
 #include "paddle/utils/PythonUtil.h"
 #include "paddle/utils/Util.h"
 
diff --git a/paddle/gserver/layers/ContextProjection.cpp b/paddle/gserver/layers/ContextProjection.cpp
index 51c0ae5cc9..ee4db21989 100644
--- a/paddle/gserver/layers/ContextProjection.cpp
+++ b/paddle/gserver/layers/ContextProjection.cpp
@@ -38,6 +38,32 @@ ContextProjection::ContextProjection(const ProjectionConfig& config,
     CHECK_EQ(inputDim * totalPad, parameter->getSize());
     weight_.reset(new Weight(totalPad, inputDim, parameter));
   }
+  // init forward_ and backward_ functions
+  init();
+}
+
+bool ContextProjection::init() {
+  size_t context_length = config_.context_length();
+  int context_start = config_.context_start();
+  bool is_padding = config_.trainable_padding();
+  size_t total_pad = is_padding ? beginPad_ + endPad_ : 0;
+
+  createFunction(forward_,
+                 "ContextProjectionForward",
+                 FuncConfig()
+                     .set("context_length", context_length)
+                     .set("context_start", context_start)
+                     .set("begin_pad", beginPad_));
+  createFunction(backward_,
+                 "ContextProjectionBackward",
+                 FuncConfig()
+                     .set("context_length", context_length)
+                     .set("context_start", context_start)
+                     .set("begin_pad", beginPad_)
+                     .set("is_padding", is_padding)
+                     .set("total_pad", total_pad));
+
+  return true;
 }
 
 void ContextProjection::resetState() {
@@ -78,25 +104,30 @@ LayerStatePtr ContextProjection::getState() {
 }
 
 void ContextProjection::forward() {
-  CHECK(in_->value);
+  CHECK(in_->value && out_->value);
   CHECK(in_->sequenceStartPositions);
 
-  auto startPositions = in_->sequenceStartPositions->getVector(useGpu_);
-
-  int64_t inputDim = in_->value->getWidth();
-  int64_t dim = out_->value->getWidth();
-  CHECK_EQ(dim, inputDim * config_.context_length());
+  size_t input_dim = in_->value->getWidth();
+  size_t dim = out_->value->getWidth();
+  CHECK_EQ(dim, input_dim * config_.context_length());
+  size_t batch_size = in_->value->getHeight();
+  CHECK_EQ(static_cast<int>(forward_.size()), 1)
+      << "Only one forward function here";
 
   REGISTER_TIMER_INFO("ContextProjectionForward", getName().c_str());
-  bool isPadding = config_.trainable_padding();
-  out_->value->contextProjectionForward(
-      *(in_->value),
-      state_ ? state_.get() : isPadding ? weight_->getW().get() : nullptr,
-      *startPositions,
-      config_.context_length(),
-      config_.context_start(),
-      beginPad_,
-      state_ ? true : isPadding);
+  bool is_padding = config_.trainable_padding();
+  /// first use state_, otherwise use weight_(padding false === w nullptr)
+  auto w_ptr =
+      state_ ? state_.get() : is_padding ? weight_->getW().get() : nullptr;
+  auto start_pos = in_->sequenceStartPositions;
+  forward_[0]->calc({Tensor(in_->value->getData(), Dims{batch_size, input_dim}),
+                     Tensor(w_ptr ? w_ptr->getData() : nullptr,
+                            Dims{w_ptr ? w_ptr->getHeight() : 0, input_dim}),
+                     Tensor(reinterpret_cast<real*>(
+                                const_cast<int*>(start_pos->getData(useGpu_))),
+                            Dims{start_pos->getSize()})},
+                    {Tensor(out_->value->getData(), Dims{batch_size, dim})},
+                    {});
 
   if (state_ && config_.context_start() < 0) {
     CHECK_EQ(1, in_->getNumSequences());
@@ -118,41 +149,28 @@ void ContextProjection::forward() {
 }
 
 void ContextProjection::backward(const UpdateCallback& callback) {
-  CHECK(in_->value);
-  int64_t inputDim = in_->value->getWidth();
-  int64_t dim = out_->value->getWidth();
-  CHECK_EQ(dim, inputDim * config_.context_length());
-  auto startPositions = in_->sequenceStartPositions->getVector(useGpu_);
+  CHECK(in_->value && out_->value && out_->grad);
+  size_t input_dim = in_->value->getWidth();
+  size_t dim = out_->value->getWidth();
+  CHECK_EQ(dim, input_dim * config_.context_length());
+  size_t batch_size = in_->value->getHeight();
+  CHECK_EQ(batch_size, out_->value->getHeight());
+  CHECK_EQ(static_cast<int>(backward_.size()), 1)
+      << "Only one backward function here";
 
   REGISTER_TIMER_INFO("ContextProjectionBackward", getName().c_str());
-  bool isPadding = config_.trainable_padding();
-  if (!out_->grad->useGpu()) {
-    out_->grad->contextProjectionBackward(
-        in_->grad.get(),
-        isPadding ? weight_->getWGrad().get() : nullptr,
-        *startPositions,
-        config_.context_length(),
-        config_.context_start(),
-        beginPad_,
-        isPadding);
-  } else {
-    if (in_->grad) {
-      out_->grad->contextProjectionBackwardData(*(in_->grad),
-                                                *startPositions,
-                                                config_.context_length(),
-                                                config_.context_start());
-    }
-
-    if (isPadding && weight_->getWGrad()) {
-      out_->grad->contextProjectionBackwardWeight(
-          *(weight_->getWGrad()),
-          *startPositions,
-          config_.context_length(),
-          config_.context_start(),
-          weight_->getWGrad()->getHeight(),
-          beginPad_);
-    }
-  }
+  bool is_padding = config_.trainable_padding();
+  auto start_pos = in_->sequenceStartPositions;
+  auto w_ptr = is_padding ? weight_->getWGrad() : nullptr;
+  backward_[0]->calc({Tensor(in_->grad ? in_->grad->getData() : nullptr,
+                             Dims{batch_size, input_dim}),
+                      Tensor(w_ptr ? w_ptr->getData() : nullptr,
+                             Dims{w_ptr ? w_ptr->getHeight() : 0, input_dim}),
+                      Tensor(reinterpret_cast<real*>(
+                                 const_cast<int*>(start_pos->getData(useGpu_))),
+                             Dims{start_pos->getSize()})},
+                     {Tensor(out_->grad->getData(), Dims{batch_size, dim})},
+                     {});
 
   if (config_.trainable_padding()) {
     weight_->getParameterPtr()->incUpdate(callback);
diff --git a/paddle/gserver/layers/ContextProjection.h b/paddle/gserver/layers/ContextProjection.h
index 2df43bd04f..c87d6ed1d6 100644
--- a/paddle/gserver/layers/ContextProjection.h
+++ b/paddle/gserver/layers/ContextProjection.h
@@ -61,6 +61,8 @@ public:
 
   virtual LayerStatePtr getState();
 
+  virtual bool init();
+
 protected:
   std::unique_ptr<Weight> weight_;
   /// number of extra timesteps added at the beginning
diff --git a/paddle/gserver/layers/ConvProjection.cpp b/paddle/gserver/layers/ConvProjection.cpp
index e1c4b91ace..0281170bc5 100644
--- a/paddle/gserver/layers/ConvProjection.cpp
+++ b/paddle/gserver/layers/ConvProjection.cpp
@@ -130,7 +130,8 @@ void ConvProjection::reshapeTensorDesc(int batchSize) {
 void ConvProjection::reshape(int batchSize) {
   size_t width = calOutputSize();
   CHECK_EQ(width, out_->value->getWidth());
-  CHECK_EQ(channels_ * imageH_ * imageW_, in_->value->getWidth())
+  CHECK_EQ(static_cast<size_t>(channels_ * imageH_ * imageW_),
+           in_->value->getWidth())
       << "Wrong input size for convolution"
       << " channels=" << channels_ << " imageH=" << imageH_
       << " imageW=" << imageW_ << " inputSize=" << in_->value->getWidth();
diff --git a/paddle/gserver/layers/GruCompute.h b/paddle/gserver/layers/GruCompute.h
index 42c0019319..3340e38e62 100644
--- a/paddle/gserver/layers/GruCompute.h
+++ b/paddle/gserver/layers/GruCompute.h
@@ -16,7 +16,7 @@ limitations under the License. */
 
 #include "ModelConfig.pb.h"
 #include "hl_gpu.h"
-#include "paddle/utils/TypeDefs.h"
+#include "paddle/utils/Common.h"
 
 namespace paddle {
 
diff --git a/paddle/gserver/layers/LstmCompute.h b/paddle/gserver/layers/LstmCompute.h
index 140a4c6ecf..2588fad279 100644
--- a/paddle/gserver/layers/LstmCompute.h
+++ b/paddle/gserver/layers/LstmCompute.h
@@ -16,7 +16,7 @@ limitations under the License. */
 
 #include "ModelConfig.pb.h"
 #include "hl_gpu.h"
-#include "paddle/utils/TypeDefs.h"
+#include "paddle/utils/Common.h"
 
 namespace paddle {
 
diff --git a/paddle/gserver/layers/MultinomialSampler.h b/paddle/gserver/layers/MultinomialSampler.h
index 677b047029..546ef9c1f2 100644
--- a/paddle/gserver/layers/MultinomialSampler.h
+++ b/paddle/gserver/layers/MultinomialSampler.h
@@ -16,7 +16,7 @@ limitations under the License. */
 
 #include <memory>
 #include <random>
-#include "paddle/utils/TypeDefs.h"
+#include "paddle/utils/Common.h"
 
 namespace paddle {
 
diff --git a/paddle/gserver/layers/Projection.h b/paddle/gserver/layers/Projection.h
index 8cd8042479..778a7fe13d 100644
--- a/paddle/gserver/layers/Projection.h
+++ b/paddle/gserver/layers/Projection.h
@@ -88,11 +88,37 @@ public:
    */
   virtual LayerStatePtr getState() { return nullptr; }
 
+  /**
+   * init forward_ and backward_ functions
+   */
+  virtual bool init() { return true; }
+
   /**
    * Get output size of projection.
    */
   size_t getOutputSize() const { return config_.output_size(); }
 
+protected:
+  /**
+   * Create layer function. Function is called in forward or backward.
+   * \param function, Layer::forward_ or Layer::backward_
+   * \param name, function name
+   * \param config, initialization configuration for the function
+   */
+  void createFunction(std::vector<std::shared_ptr<FunctionBase>>& function,
+                      const std::string& name,
+                      const FuncConfig& config) {
+    if (useGpu_) {
+      function.emplace_back(
+          FunctionBase::funcRegistrar_.createByType(name + "-GPU"));
+    } else {
+      function.emplace_back(
+          FunctionBase::funcRegistrar_.createByType(name + "-CPU"));
+    }
+    auto& func = function.back();
+    func->init(config);
+  }
+
 protected:
   /// Config of projection
   ProjectionConfig config_;
@@ -106,5 +132,9 @@ protected:
   const Argument* out_;
   /// Store `passType` passed to forward()
   PassType passType_;
+  /// Layer forward function
+  std::vector<std::shared_ptr<FunctionBase>> forward_;
+  /// Layer backward function
+  std::vector<std::shared_ptr<FunctionBase>> backward_;
 };
 }  // namespace paddle
diff --git a/paddle/gserver/tests/CMakeLists.txt b/paddle/gserver/tests/CMakeLists.txt
index c26a2a7f06..0caa5e1e11 100644
--- a/paddle/gserver/tests/CMakeLists.txt
+++ b/paddle/gserver/tests/CMakeLists.txt
@@ -2,8 +2,7 @@
 
 ################### test_ProtoDataProvider ############
 add_unittest_without_exec(test_ProtoDataProvider
-    test_ProtoDataProvider.cpp
-    TestUtil.cpp)
+    test_ProtoDataProvider.cpp)
 
 # test_ProtoDataProvider will mkdir as same name,
 # so if WORKING_DIRECTORY is default directory, then
@@ -15,53 +14,46 @@ add_test(NAME test_ProtoDataProvider
 ################# test_LayerGrad #######################
 add_unittest_without_exec(test_LayerGrad
     test_LayerGrad.cpp
-    LayerGradUtil.cpp
-    TestUtil.cpp)
+    LayerGradUtil.cpp)
 add_test(NAME test_LayerGrad
     COMMAND test_LayerGrad)
 
 add_unittest_without_exec(test_ActivationGrad
     test_ActivationGrad.cpp
-    LayerGradUtil.cpp
-    TestUtil.cpp)
+    LayerGradUtil.cpp)
 add_test(NAME test_ActivationGrad
     COMMAND test_ActivationGrad)
 ################# test_ConvTrans #######################
 add_unittest_without_exec(test_ConvTrans
     test_ConvTrans.cpp
-    LayerGradUtil.cpp
-    TestUtil.cpp)
+    LayerGradUtil.cpp)
 
 add_test(NAME test_ConvTrans
     COMMAND test_ConvTrans)
 ################# test_PriorBox #######################
 add_unittest_without_exec(test_PriorBox
     test_PriorBox.cpp
-    LayerGradUtil.cpp
-    TestUtil.cpp)
+    LayerGradUtil.cpp)
 
 add_test(NAME test_PriorBox
     COMMAND test_PriorBox)
 ################# test_ConvUnify #######################
 add_unittest_without_exec(test_ConvUnify
     test_ConvUnify.cpp
-    LayerGradUtil.cpp
-    TestUtil.cpp)
+    LayerGradUtil.cpp)
     
 add_test(NAME test_ConvUnify
     COMMAND test_ConvUnify)
 ################# test_BatchNorm #######################
 add_unittest_without_exec(test_BatchNorm
     test_BatchNorm.cpp
-    LayerGradUtil.cpp
-    TestUtil.cpp)
+    LayerGradUtil.cpp)
 
 add_test(NAME test_BatchNorm
     COMMAND test_BatchNorm)
 ################## test_Evaluator #######################
 add_unittest(test_Evaluator
-    test_Evaluator.cpp
-    TestUtil.cpp)
+    test_Evaluator.cpp)
 
 ################ test_LinearChainCRF ####################
 add_simple_unittest(test_LinearChainCRF)
@@ -72,8 +64,7 @@ add_simple_unittest(test_MultinomialSampler)
 ############## test_PyDataProvider ########################
 if(WITH_PYTHON)
     add_unittest_without_exec(test_PyDataProvider
-        test_PyDataProvider.cpp
-        TestUtil.cpp)
+        test_PyDataProvider.cpp)
 
     add_test(NAME test_PyDataProvider
         COMMAND .set_python_path.sh -d ./gserver/tests:${PROJ_ROOT}/python/ ${CMAKE_CURRENT_BINARY_DIR}/test_PyDataProvider
@@ -81,18 +72,15 @@ if(WITH_PYTHON)
 endif()
 
 ############### test_RecurrentLayer #######################
-add_unittest(test_RecurrentLayer
-    test_RecurrentLayer.cpp
-    TestUtil.cpp)
+add_simple_unittest(test_RecurrentLayer)
 
 ############### test_WarpCTCLayer #######################
 if(NOT WITH_DOUBLE)
     add_unittest_without_exec(test_WarpCTCLayer
-        test_WarpCTCLayer.cpp
-        TestUtil.cpp)
+        test_WarpCTCLayer.cpp)
 
     add_test(NAME test_WarpCTCLayer
-        COMMAND ${CMAKE_CURRENT_BINARY_DIR}/test_WarpCTCLayer --warpctc_dir=${PROJ_ROOT}/warp-ctc/build
+        COMMAND ${CMAKE_CURRENT_BINARY_DIR}/test_WarpCTCLayer --warpctc_dir=${WARPCTC_LIB_DIR}
         WORKING_DIRECTORY ${PROJ_ROOT}/paddle)
 endif()
 
@@ -108,8 +96,7 @@ add_test(NAME test_RecurrentGradientMachine
     WORKING_DIRECTORY ${PROJ_ROOT}/paddle)
 
 add_unittest_without_exec(test_NetworkCompare
-    test_NetworkCompare.cpp
-    TestUtil.cpp)
+    test_NetworkCompare.cpp)
 if(WITH_GPU)
     add_test(NAME test_NetworkCompare
         COMMAND .set_python_path.sh -d ${PROJ_ROOT}/python ${CMAKE_CURRENT_BINARY_DIR}/test_NetworkCompare --use_gpu=true
diff --git a/paddle/gserver/tests/LayerGradUtil.cpp b/paddle/gserver/tests/LayerGradUtil.cpp
index 57c176810f..ae016e74ea 100644
--- a/paddle/gserver/tests/LayerGradUtil.cpp
+++ b/paddle/gserver/tests/LayerGradUtil.cpp
@@ -310,7 +310,7 @@ void initDataLayer(TestConfig testConf,
         testConf.inputDefs[i].labelSeqStartPositions;
     if (labelSeqStartPositions.size() != 0) {
       CHECK(!sequenceStartPositions);
-      CHECK_GE(labelSeqStartPositions.size(), 2);
+      CHECK_GE(static_cast<int>(labelSeqStartPositions.size()), 2);
 
       sequenceStartPositions =
           ICpuGpuVector::create(labelSeqStartPositions.size(), useGpu);
diff --git a/paddle/gserver/tests/LayerGradUtil.h b/paddle/gserver/tests/LayerGradUtil.h
index 4e88ac0e81..9f68eb64d0 100644
--- a/paddle/gserver/tests/LayerGradUtil.h
+++ b/paddle/gserver/tests/LayerGradUtil.h
@@ -17,7 +17,7 @@ limitations under the License. */
 #include "paddle/gserver/layers/DataLayer.h"
 #include "paddle/trainer/Trainer.h"
 
-#include "TestUtil.h"
+#include "paddle/testing/TestUtil.h"
 using namespace std;  // NOLINT
 
 namespace paddle {
diff --git a/paddle/gserver/tests/test_ActivationGrad.cpp b/paddle/gserver/tests/test_ActivationGrad.cpp
index 7d7e68da5c..b201ba8a5a 100644
--- a/paddle/gserver/tests/test_ActivationGrad.cpp
+++ b/paddle/gserver/tests/test_ActivationGrad.cpp
@@ -20,7 +20,7 @@ limitations under the License. */
 #include "paddle/trainer/Trainer.h"
 
 #include "LayerGradUtil.h"
-#include "TestUtil.h"
+#include "paddle/testing/TestUtil.h"
 
 using namespace paddle;  // NOLINT
 using namespace std;     // NOLINT
diff --git a/paddle/gserver/tests/test_BatchNorm.cpp b/paddle/gserver/tests/test_BatchNorm.cpp
index 7f5fcb670b..d07299bfe3 100644
--- a/paddle/gserver/tests/test_BatchNorm.cpp
+++ b/paddle/gserver/tests/test_BatchNorm.cpp
@@ -22,7 +22,7 @@ limitations under the License. */
 #include "paddle/utils/GlobalConstants.h"
 
 #include "LayerGradUtil.h"
-#include "TestUtil.h"
+#include "paddle/testing/TestUtil.h"
 
 using namespace paddle;  // NOLINT
 using namespace std;     // NOLINT
@@ -114,8 +114,8 @@ TEST(Layer, batchNorm) {
   bnLayer->forward(PASS_GC);
   convLayer->forward(PASS_GC);
 
-  CHECK_EQ(convLayer->getOutputValue()->getHeight(), 100);
-  CHECK_EQ(convLayer->getOutputValue()->getWidth(), 576);
+  CHECK_EQ(static_cast<int>(convLayer->getOutputValue()->getHeight()), 100);
+  CHECK_EQ(static_cast<int>(convLayer->getOutputValue()->getWidth()), 576);
 }
 
 int main(int argc, char** argv) {
diff --git a/paddle/gserver/tests/test_ConvTrans.cpp b/paddle/gserver/tests/test_ConvTrans.cpp
index dd3378304b..40bb1e2d73 100644
--- a/paddle/gserver/tests/test_ConvTrans.cpp
+++ b/paddle/gserver/tests/test_ConvTrans.cpp
@@ -23,7 +23,7 @@ limitations under the License. */
 #include "paddle/utils/GlobalConstants.h"
 
 #include "LayerGradUtil.h"
-#include "TestUtil.h"
+#include "paddle/testing/TestUtil.h"
 
 using namespace paddle;  // NOLINT
 using namespace std;     // NOLINT
diff --git a/paddle/gserver/tests/test_ConvUnify.cpp b/paddle/gserver/tests/test_ConvUnify.cpp
index ad99b50245..207fc0566f 100644
--- a/paddle/gserver/tests/test_ConvUnify.cpp
+++ b/paddle/gserver/tests/test_ConvUnify.cpp
@@ -23,7 +23,7 @@ limitations under the License. */
 #include "paddle/utils/GlobalConstants.h"
 
 #include "LayerGradUtil.h"
-#include "TestUtil.h"
+#include "paddle/testing/TestUtil.h"
 
 using namespace paddle;  // NOLINT
 using namespace std;     // NOLINT
diff --git a/paddle/gserver/tests/test_Evaluator.cpp b/paddle/gserver/tests/test_Evaluator.cpp
index e07066dad8..8165eb8269 100644
--- a/paddle/gserver/tests/test_Evaluator.cpp
+++ b/paddle/gserver/tests/test_Evaluator.cpp
@@ -15,7 +15,7 @@ limitations under the License. */
 #include <gtest/gtest.h>
 #include <vector>
 #include "ModelConfig.pb.h"
-#include "TestUtil.h"
+#include "paddle/testing/TestUtil.h"
 #include "paddle/trainer/Trainer.h"
 
 using namespace paddle;  // NOLINT
diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp
index 2cc25f6b21..66a70ecd41 100644
--- a/paddle/gserver/tests/test_LayerGrad.cpp
+++ b/paddle/gserver/tests/test_LayerGrad.cpp
@@ -21,7 +21,7 @@ limitations under the License. */
 #include "paddle/trainer/Trainer.h"
 
 #include "LayerGradUtil.h"
-#include "TestUtil.h"
+#include "paddle/testing/TestUtil.h"
 
 using namespace paddle;  // NOLINT
 using namespace std;     // NOLINT
diff --git a/paddle/gserver/tests/test_LinearChainCRF.cpp b/paddle/gserver/tests/test_LinearChainCRF.cpp
index 330adee8f7..f046cb0b28 100644
--- a/paddle/gserver/tests/test_LinearChainCRF.cpp
+++ b/paddle/gserver/tests/test_LinearChainCRF.cpp
@@ -65,9 +65,3 @@ TEST(LinearChainCRF, decoding) {
     }
   }
 }
-
-int main(int argc, char** argv) {
-  initMain(argc, argv);
-  testing::InitGoogleTest(&argc, argv);
-  return RUN_ALL_TESTS();
-}
diff --git a/paddle/gserver/tests/test_NetworkCompare.cpp b/paddle/gserver/tests/test_NetworkCompare.cpp
index 0d26105955..4db30f37a5 100644
--- a/paddle/gserver/tests/test_NetworkCompare.cpp
+++ b/paddle/gserver/tests/test_NetworkCompare.cpp
@@ -18,7 +18,7 @@ limitations under the License. */
 #include <algorithm>
 #include <cstdlib>
 
-#include "TestUtil.h"
+#include "paddle/testing/TestUtil.h"
 #include "paddle/trainer/Trainer.h"
 #include "paddle/utils/Stat.h"
 
diff --git a/paddle/gserver/tests/test_PriorBox.cpp b/paddle/gserver/tests/test_PriorBox.cpp
index a6d6a24269..ae0e3bc3d2 100644
--- a/paddle/gserver/tests/test_PriorBox.cpp
+++ b/paddle/gserver/tests/test_PriorBox.cpp
@@ -17,7 +17,7 @@ limitations under the License. */
 #include <vector>
 
 #include "LayerGradUtil.h"
-#include "TestUtil.h"
+#include "paddle/testing/TestUtil.h"
 
 using namespace paddle;  // NOLINT
 using namespace std;     // NOLINT
diff --git a/paddle/gserver/tests/test_ProtoDataProvider.cpp b/paddle/gserver/tests/test_ProtoDataProvider.cpp
index d421b6e2f2..e11bf402c2 100644
--- a/paddle/gserver/tests/test_ProtoDataProvider.cpp
+++ b/paddle/gserver/tests/test_ProtoDataProvider.cpp
@@ -20,7 +20,7 @@ limitations under the License. */
 #include "paddle/gserver/dataproviders/ProtoDataProvider.h"
 #include "paddle/utils/Util.h"
 
-#include "TestUtil.h"
+#include "paddle/testing/TestUtil.h"
 
 using namespace std;  // NOLINT
 
@@ -730,9 +730,3 @@ TEST(ProtoSequenceDataProvider, test) {
     }        // end for (int numIdSlots : numSlotsArray)
   }          // end for (int numSparseNonValueVecSlots : numSlotsArray)
 }
-
-int main(int argc, char** argv) {
-  initMain(argc, argv);
-  testing::InitGoogleTest(&argc, argv);
-  return RUN_ALL_TESTS();
-}
diff --git a/paddle/gserver/tests/test_PyDataProvider.cpp b/paddle/gserver/tests/test_PyDataProvider.cpp
index 0f264ecf91..db883543c3 100644
--- a/paddle/gserver/tests/test_PyDataProvider.cpp
+++ b/paddle/gserver/tests/test_PyDataProvider.cpp
@@ -20,7 +20,7 @@ limitations under the License. */
 #include "paddle/gserver/dataproviders/PyDataProvider.h"
 #include "paddle/utils/Util.h"
 
-#include "TestUtil.h"
+#include "paddle/testing/TestUtil.h"
 
 using namespace std;     // NOLINT
 using namespace paddle;  // NOLINT
diff --git a/paddle/gserver/tests/test_PyDataProvider2.cpp b/paddle/gserver/tests/test_PyDataProvider2.cpp
index 5f8bc5ecd0..7e193eb31a 100644
--- a/paddle/gserver/tests/test_PyDataProvider2.cpp
+++ b/paddle/gserver/tests/test_PyDataProvider2.cpp
@@ -293,7 +293,7 @@ TEST(PyDataProvider2, can_over_batch_size) {
   while (true) {
     int64_t realBatchSize = provider->getNextBatchInternal(batchSize, &batch);
     if (realBatchSize) {
-      CHECK_LE(realBatchSize, batchSize);
+      CHECK_LE(static_cast<size_t>(realBatchSize), batchSize);
     } else {
       break;
     }
diff --git a/paddle/gserver/tests/test_RecurrentLayer.cpp b/paddle/gserver/tests/test_RecurrentLayer.cpp
index f91c788863..16ab0e6aec 100644
--- a/paddle/gserver/tests/test_RecurrentLayer.cpp
+++ b/paddle/gserver/tests/test_RecurrentLayer.cpp
@@ -19,7 +19,7 @@ limitations under the License. */
 #include "paddle/gserver/layers/DataLayer.h"
 #include "paddle/gserver/layers/Layer.h"
 
-#include "TestUtil.h"
+#include "paddle/testing/TestUtil.h"
 
 using namespace paddle;  // NOLINT
 using namespace std;     // NOLINT
diff --git a/paddle/gserver/tests/test_WarpCTCLayer.cpp b/paddle/gserver/tests/test_WarpCTCLayer.cpp
index 0a4a814d52..23ae95852e 100644
--- a/paddle/gserver/tests/test_WarpCTCLayer.cpp
+++ b/paddle/gserver/tests/test_WarpCTCLayer.cpp
@@ -20,7 +20,7 @@ limitations under the License. */
 #include "paddle/gserver/layers/Layer.h"
 #include "paddle/gserver/layers/WarpCTCLayer.h"
 
-#include "TestUtil.h"
+#include "paddle/testing/TestUtil.h"
 
 using namespace paddle;  // NOLINT
 using namespace std;     // NOLINT
@@ -242,9 +242,3 @@ TEST(Layer, WarpCTCLayer) {
     }
   }
 }
-
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  initMain(argc, argv);
-  return RUN_ALL_TESTS();
-}
diff --git a/paddle/math/BaseMatrix.h b/paddle/math/BaseMatrix.h
index 2933c20fba..8691c87ac3 100644
--- a/paddle/math/BaseMatrix.h
+++ b/paddle/math/BaseMatrix.h
@@ -16,7 +16,7 @@ limitations under the License. */
 #include <stdint.h>
 #include <cstddef>
 #include "TensorExpression.h"
-#include "paddle/utils/TypeDefs.h"
+#include "paddle/utils/Common.h"
 
 namespace paddle {
 
diff --git a/paddle/math/Matrix.cpp b/paddle/math/Matrix.cpp
index 50d2e3eb67..90813a8996 100644
--- a/paddle/math/Matrix.cpp
+++ b/paddle/math/Matrix.cpp
@@ -1304,68 +1304,6 @@ void GpuMatrix::maxSequenceBackward(Matrix& outputGrad,
   hl_max_sequence_backward(outGrad, maxIndex, inputGrad, numSequences, dim);
 }
 
-void GpuMatrix::contextProjectionForward(Matrix& input,
-                                         Matrix* weight,
-                                         const IVector& sequence,
-                                         int contextLength,
-                                         int contextStart,
-                                         size_t beginPad,
-                                         bool isPadding) {
-  CHECK(dynamic_cast<GpuMatrix*>(&input));
-  CHECK(dynamic_cast<const GpuIVector*>(&sequence));
-  if (weight) CHECK(dynamic_cast<GpuMatrix*>(weight));
-  CHECK_EQ(getWidth(), input.getWidth() * contextLength);
-
-  hl_context_projection_forward(input.getData(),
-                                sequence.getData(),
-                                isPadding ? weight->getData() : NULL,
-                                getData(),
-                                sequence.getSize() - 1,
-                                input.getWidth(),
-                                contextLength,
-                                contextStart,
-                                beginPad,
-                                isPadding);
-}
-
-void GpuMatrix::contextProjectionBackwardData(Matrix& inputGrad,
-                                              const IVector& sequence,
-                                              int contextLength,
-                                              int contextStart) {
-  CHECK(dynamic_cast<GpuMatrix*>(&inputGrad));
-  CHECK(dynamic_cast<const GpuIVector*>(&sequence));
-  CHECK_EQ(getWidth(), inputGrad.getWidth() * contextLength);
-
-  hl_context_projection_backward_data(getData(),
-                                      sequence.getData(),
-                                      inputGrad.getData(),
-                                      sequence.getSize() - 1,
-                                      inputGrad.getWidth(),
-                                      contextLength,
-                                      contextStart);
-}
-
-void GpuMatrix::contextProjectionBackwardWeight(Matrix& weightGrad,
-                                                const IVector& sequence,
-                                                int contextLength,
-                                                int contextStart,
-                                                int totalPad,
-                                                size_t beginPad) {
-  CHECK(dynamic_cast<GpuMatrix*>(&weightGrad));
-  CHECK(dynamic_cast<const GpuIVector*>(&sequence));
-  CHECK_EQ(getWidth(), weightGrad.getWidth() * contextLength);
-
-  hl_context_projection_backward_weight(getData(),
-                                        sequence.getData(),
-                                        weightGrad.getData(),
-                                        sequence.getSize() - 1,
-                                        weightGrad.getWidth(),
-                                        totalPad,
-                                        contextLength,
-                                        contextStart,
-                                        beginPad);
-}
-
 void GpuMatrix::paramReluForward(Matrix& data, Matrix& W) {
   CHECK(data.useGpu_ == true && W.useGpu_ == true)
       << "Matrix type are not equal";
@@ -2203,113 +2141,6 @@ void CpuMatrix::maxSequenceBackward(Matrix& outputGrad,
   }
 }
 
-void CpuMatrix::contextProjectionForward(Matrix& input,
-                                         Matrix* weight,
-                                         const IVector& sequence,
-                                         int contextLength,
-                                         int contextStart,
-                                         size_t beginPad,
-                                         bool isPadding) {
-  auto input_ptr = dynamic_cast<CpuMatrix*>(&input);
-  auto seq_ptr = dynamic_cast<const CpuIVector*>(&sequence);
-  CHECK(input_ptr && seq_ptr);
-  if (weight) CHECK(dynamic_cast<CpuMatrix*>(weight));
-  CHECK_EQ(getWidth(), input_ptr->getWidth() * contextLength);
-
-  const int* starts = seq_ptr->getData();
-  size_t numSequences = seq_ptr->getSize() - 1;
-  for (size_t i = 0; i < numSequences; ++i) {
-    for (int j = 0; j < contextLength; ++j) {
-      int begin = starts[i] + contextStart + j;
-      int end = starts[i + 1] + contextStart + j;
-      int dstBegin = starts[i];
-      int dstEnd = starts[i + 1];
-      if (begin < starts[i]) {
-        int64_t padSize =
-            std::min(starts[i] - begin, starts[i + 1] - starts[i]);
-        MatrixPtr mat = this->subMatrix(starts[i], padSize);
-        if (isPadding) {
-          MatrixPtr sub = weight->subMatrix(j, padSize);
-          mat->addAtOffset(*sub, j * input_ptr->getWidth());
-        }
-        dstBegin = starts[i] + padSize;
-        begin = starts[i];
-      }
-      if (end > starts[i + 1]) {
-        int64_t padSize =
-            std::min(end - starts[i + 1], starts[i + 1] - starts[i]);
-        MatrixPtr mat = this->subMatrix(starts[i + 1] - padSize, padSize);
-        if (isPadding) {
-          MatrixPtr sub =
-              weight->subMatrix(beginPad + contextStart + j - padSize, padSize);
-          mat->addAtOffset(*sub, j * input_ptr->getWidth());
-        }
-        dstEnd = starts[i + 1] - padSize;
-        end = starts[i + 1];
-      }
-      if (end <= begin) continue;
-      MatrixPtr src = input_ptr->subMatrix(begin, end - begin);
-      MatrixPtr dst = this->subMatrix(dstBegin, dstEnd - dstBegin);
-      dst->addAtOffset(*src, j * input_ptr->getWidth());
-    }
-  }
-}
-
-void CpuMatrix::contextProjectionBackward(Matrix* inputGrad,
-                                          Matrix* weightGrad,
-                                          const IVector& sequence,
-                                          int contextLength,
-                                          int contextStart,
-                                          size_t beginPad,
-                                          bool isPadding) {
-  if (inputGrad) CHECK(dynamic_cast<CpuMatrix*>(inputGrad));
-  if (weightGrad) CHECK(dynamic_cast<CpuMatrix*>(weightGrad));
-  CHECK(dynamic_cast<const CpuIVector*>(&sequence));
-
-  int64_t inputDim = inputGrad ? inputGrad->getWidth()
-                               : weightGrad ? weightGrad->getWidth() : 0;
-  CHECK_EQ(getWidth(), inputDim * contextLength);
-
-  const int* starts = sequence.getData();
-  size_t numSequences = sequence.getSize() - 1;
-  for (size_t i = 0; i < numSequences; ++i) {
-    for (int j = 0; j < contextLength; ++j) {
-      int begin = starts[i] + contextStart + j;
-      int end = starts[i + 1] + contextStart + j;
-      int dstBegin = starts[i];
-      int dstEnd = starts[i + 1];
-      if (begin < starts[i]) {
-        int64_t padSize =
-            std::min(starts[i] - begin, starts[i + 1] - starts[i]);
-        if (isPadding && weightGrad) {
-          MatrixPtr mat = this->subMatrix(starts[i], padSize);
-          MatrixPtr sub = weightGrad->subMatrix(j, padSize);
-          sub->addAtOffset(*mat, j * inputDim);
-        }
-        dstBegin = starts[i] + padSize;
-        begin = starts[i];
-      }
-      if (end > starts[i + 1]) {
-        int64_t padSize =
-            std::min(end - starts[i + 1], starts[i + 1] - starts[i]);
-        if (isPadding && weightGrad) {
-          MatrixPtr mat = this->subMatrix(starts[i + 1] - padSize, padSize);
-          MatrixPtr sub = weightGrad->subMatrix(
-              beginPad + contextStart + j - padSize, padSize);
-          sub->addAtOffset(*mat, j * inputDim);
-        }
-        dstEnd = starts[i + 1] - padSize;
-        end = starts[i + 1];
-      }
-      if (end <= begin) continue;
-      if (!inputGrad) continue;
-      MatrixPtr src = inputGrad->subMatrix(begin, end - begin);
-      MatrixPtr dst = this->subMatrix(dstBegin, dstEnd - dstBegin);
-      src->addAtOffset(*dst, j * inputDim);
-    }
-  }
-}
-
 inline void vecAddTo(real* a, const real* b, size_t len) {
   for (unsigned int i = 0; i < len; ++i) {
     a[i] += b[i];
diff --git a/paddle/math/Matrix.h b/paddle/math/Matrix.h
index 25ce09e346..ceac0212d2 100644
--- a/paddle/math/Matrix.h
+++ b/paddle/math/Matrix.h
@@ -26,8 +26,8 @@ limitations under the License. */
 #include "BaseMatrix.h"
 #include "MemoryHandle.h"
 #include "Vector.h"
+#include "paddle/utils/Common.h"
 #include "paddle/utils/ThreadLocal.h"
-#include "paddle/utils/TypeDefs.h"
 
 namespace paddle {
 
@@ -972,42 +972,6 @@ public:
     LOG(FATAL) << "Not implemeted";
   }
 
-  virtual void contextProjectionForward(Matrix& input,
-                                        Matrix* weight,
-                                        const IVector& sequence,
-                                        int contextLength,
-                                        int contextStart,
-                                        size_t beginPad,
-                                        bool isPadding) {
-    LOG(FATAL) << "Not implemeted";
-  }
-
-  virtual void contextProjectionBackward(Matrix* inputGrad,
-                                         Matrix* weightGrad,
-                                         const IVector& sequence,
-                                         int contextLength,
-                                         int contextStart,
-                                         size_t beginPad,
-                                         bool isPadding) {
-    LOG(FATAL) << "Not implemeted";
-  }
-
-  virtual void contextProjectionBackwardData(Matrix& inputGrad,
-                                             const IVector& sequence,
-                                             int contextLength,
-                                             int contextStart) {
-    LOG(FATAL) << "Not implemeted";
-  }
-
-  virtual void contextProjectionBackwardWeight(Matrix& weightGrad,
-                                               const IVector& sequence,
-                                               int contextLength,
-                                               int contextStart,
-                                               int totalPad,
-                                               size_t beginPad) {
-    LOG(FATAL) << "Not implemeted";
-  }
-
   /**
    * @code
    * this.row[i] += table.row[ids[i]]
@@ -1442,26 +1406,6 @@ public:
                            const IVector& sequence,
                            IVector& index);
 
-  void contextProjectionForward(Matrix& input,
-                                Matrix* weight,
-                                const IVector& sequence,
-                                int contextLength,
-                                int contextStart,
-                                size_t beginPad,
-                                bool isPadding);
-
-  void contextProjectionBackwardData(Matrix& inputGrad,
-                                     const IVector& sequence,
-                                     int contextLength,
-                                     int contextStart);
-
-  void contextProjectionBackwardWeight(Matrix& weightGrad,
-                                       const IVector& sequence,
-                                       int contextLength,
-                                       int contextStart,
-                                       int totalPad,
-                                       size_t beginPad);
-
   void bilinearForward(const Matrix& in,
                        const size_t inImgH,
                        const size_t inImgW,
@@ -1648,22 +1592,6 @@ public:
                            const IVector& sequence,
                            IVector& index);
 
-  void contextProjectionForward(Matrix& input,
-                                Matrix* weight,
-                                const IVector& sequence,
-                                int contextLength,
-                                int contextStart,
-                                size_t beginPad,
-                                bool isPadding);
-
-  void contextProjectionBackward(Matrix* inputGrad,
-                                 Matrix* weightGrad,
-                                 const IVector& sequence,
-                                 int contextLength,
-                                 int contextStart,
-                                 size_t beginPad,
-                                 bool isPadding);
-
   real* getRow(size_t row) { return BaseMatrix::rowBuf(row); }
   virtual real* getRowBuf(size_t row) { return getRow(row); }
 
diff --git a/paddle/math/TensorExpression.h b/paddle/math/TensorExpression.h
index 9bd789e8c5..6fd60e7f3c 100644
--- a/paddle/math/TensorExpression.h
+++ b/paddle/math/TensorExpression.h
@@ -16,8 +16,8 @@ limitations under the License. */
 #include <stdint.h>
 #include <cstddef>
 #include "hl_tensor_ops.h"
+#include "paddle/utils/Common.h"
 #include "paddle/utils/Logging.h"
-#include "paddle/utils/TypeDefs.h"
 
 namespace paddle {
 
diff --git a/paddle/math/Vector.h b/paddle/math/Vector.h
index 8a24103bd4..9af6e30c9e 100644
--- a/paddle/math/Vector.h
+++ b/paddle/math/Vector.h
@@ -21,8 +21,8 @@ limitations under the License. */
 
 #include "BaseMatrix.h"
 #include "MemoryHandle.h"
+#include "paddle/utils/Common.h"
 #include "paddle/utils/Thread.h"
-#include "paddle/utils/TypeDefs.h"
 
 namespace paddle {
 
diff --git a/paddle/math/tests/CMakeLists.txt b/paddle/math/tests/CMakeLists.txt
index a3ea078509..06fc10bae7 100644
--- a/paddle/math/tests/CMakeLists.txt
+++ b/paddle/math/tests/CMakeLists.txt
@@ -7,8 +7,7 @@ add_simple_unittest(test_SparseMatrix)
 
 # TODO(yuyang18): Refactor TestUtil.cpp. Remove this cross module reference.
 add_unittest(test_matrixCompare
-    test_matrixCompare.cpp
-    ../../gserver/tests/TestUtil.cpp)
+    test_matrixCompare.cpp)
 
 add_simple_unittest(test_sparseMatrixCompare)
 add_simple_unittest(test_perturbation)
diff --git a/paddle/math/tests/test_Allocator.cpp b/paddle/math/tests/test_Allocator.cpp
index 33e0952efe..1ca70ea84c 100644
--- a/paddle/math/tests/test_Allocator.cpp
+++ b/paddle/math/tests/test_Allocator.cpp
@@ -120,9 +120,3 @@ TEST(MemoryHandle, Gpu) {
   }
 }
 #endif
-
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  initMain(argc, argv);
-  return RUN_ALL_TESTS();
-}
diff --git a/paddle/math/tests/test_BaseMatrix.cpp b/paddle/math/tests/test_BaseMatrix.cpp
index cc7c1e7eb2..21918b86e1 100644
--- a/paddle/math/tests/test_BaseMatrix.cpp
+++ b/paddle/math/tests/test_BaseMatrix.cpp
@@ -242,10 +242,4 @@ TEST(BaseMatrix, Other) {
   }
 }
 
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  paddle::initMain(argc, argv);
-  return RUN_ALL_TESTS();
-}
-
 #endif
diff --git a/paddle/math/tests/test_CpuGpuVector.cpp b/paddle/math/tests/test_CpuGpuVector.cpp
index 624fa20ca5..58bc43a38b 100644
--- a/paddle/math/tests/test_CpuGpuVector.cpp
+++ b/paddle/math/tests/test_CpuGpuVector.cpp
@@ -77,11 +77,4 @@ TEST(CpuGpuVector, subCreate) {
   checkDataEqual(v1Check->getData() + offset, v2Check->getData(), size2);
 }
 
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  initMain(argc, argv);
-  int ret = RUN_ALL_TESTS();
-  return ret;
-}
-
 #endif
diff --git a/paddle/math/tests/test_ExecViaCpu.cpp b/paddle/math/tests/test_ExecViaCpu.cpp
index 27216ddb58..04c856453d 100644
--- a/paddle/math/tests/test_ExecViaCpu.cpp
+++ b/paddle/math/tests/test_ExecViaCpu.cpp
@@ -114,9 +114,3 @@ TEST(ExecViaCpu, test1) {
   testWrapper(functor);
 }
 #endif
-
-int main(int argc, char** argv) {
-  paddle::initMain(argc, argv);
-  testing::InitGoogleTest(&argc, argv);
-  return RUN_ALL_TESTS();
-}
diff --git a/paddle/math/tests/test_FPException.cpp b/paddle/math/tests/test_FPException.cpp
index 6aa5891bce..3836f7fc0f 100644
--- a/paddle/math/tests/test_FPException.cpp
+++ b/paddle/math/tests/test_FPException.cpp
@@ -28,10 +28,10 @@ limitations under the License. */
  * so we can add some tricks to prevent exp calculate an excessive value.
  *
  */
-#include <fenv.h>
+
 #include <gtest/gtest.h>
 #include "paddle/math/Matrix.h"
-#include "paddle/utils/Excepts.h"
+#include "paddle/utils/Common.h"
 
 using namespace paddle;  // NOLINT
 
diff --git a/paddle/math/tests/test_GpuProfiler.cpp b/paddle/math/tests/test_GpuProfiler.cpp
index d490078d90..e6b5dba446 100644
--- a/paddle/math/tests/test_GpuProfiler.cpp
+++ b/paddle/math/tests/test_GpuProfiler.cpp
@@ -15,9 +15,9 @@ limitations under the License. */
 #ifndef PADDLE_ONLY_CPU
 
 #include <gtest/gtest.h>
-#include "paddle/gserver/tests/TestUtil.h"
 #include "paddle/math/Matrix.h"
 #include "paddle/math/SparseMatrix.h"
+#include "paddle/testing/TestUtil.h"
 #include "paddle/utils/Stat.h"
 #include "paddle/utils/Util.h"
 
diff --git a/paddle/math/tests/test_Matrix.cpp b/paddle/math/tests/test_Matrix.cpp
index adb5fbd9fa..6899769144 100644
--- a/paddle/math/tests/test_Matrix.cpp
+++ b/paddle/math/tests/test_Matrix.cpp
@@ -291,10 +291,4 @@ TEST(Matrix, multiBinaryCrossEntropy) {
   }
 }
 
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  paddle::initMain(argc, argv);
-  return RUN_ALL_TESTS();
-}
-
 #endif
diff --git a/paddle/math/tests/test_SIMDFunctions.cpp b/paddle/math/tests/test_SIMDFunctions.cpp
index f62843310d..e8f9b26ff2 100644
--- a/paddle/math/tests/test_SIMDFunctions.cpp
+++ b/paddle/math/tests/test_SIMDFunctions.cpp
@@ -169,9 +169,3 @@ TEST(SIMDFunction, decayL1_WithoutLR) {
     ASSERT_NEAR(dest[i], simd_dest[i], EPSILON);
   }
 }
-
-int main(int argc, char** argv) {
-  paddle::initMain(argc, argv);
-  testing::InitGoogleTest(&argc, argv);
-  return RUN_ALL_TESTS();
-}
diff --git a/paddle/math/tests/test_SparseMatrix.cpp b/paddle/math/tests/test_SparseMatrix.cpp
index 0949ab7ffb..9d3fbaef43 100644
--- a/paddle/math/tests/test_SparseMatrix.cpp
+++ b/paddle/math/tests/test_SparseMatrix.cpp
@@ -561,9 +561,3 @@ TEST(Matrix, SparseMatrixCSCFormatTrimFrom) {
   checkSMatrixEqual2(matA, matD);
 #endif
 }
-
-int main(int argc, char** argv) {
-  paddle::initMain(argc, argv);
-  testing::InitGoogleTest(&argc, argv);
-  return RUN_ALL_TESTS();
-}
diff --git a/paddle/math/tests/test_Tensor.cu b/paddle/math/tests/test_Tensor.cu
index 1859b9fc13..40e38434fa 100644
--- a/paddle/math/tests/test_Tensor.cu
+++ b/paddle/math/tests/test_Tensor.cu
@@ -1163,11 +1163,3 @@ TEST(Quaternary, CompareOp) {
   TestQuaternaryMatrix<GpuMatrix> testGpu(testQuaternaryCompareOp<GpuMatrix>);
 #endif
 }
-
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  hl_start();
-  hl_init(0);
-  return RUN_ALL_TESTS();
-}
-
diff --git a/paddle/math/tests/test_TrainingAlgorithm.cpp b/paddle/math/tests/test_TrainingAlgorithm.cpp
index 2c458cba9c..4a88844b43 100644
--- a/paddle/math/tests/test_TrainingAlgorithm.cpp
+++ b/paddle/math/tests/test_TrainingAlgorithm.cpp
@@ -459,11 +459,3 @@ void testSparseMomentum(size_t size, bool useGpu) {
 }
 
 TEST(Training, SparseMomentum) { testCase(testSparseMomentum); }
-
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  initMain(argc, argv);
-  hl_start();
-  hl_init(FLAGS_gpu_id);
-  return RUN_ALL_TESTS();
-}
diff --git a/paddle/math/tests/test_batchTranspose.cpp b/paddle/math/tests/test_batchTranspose.cpp
index 9925e24dc1..4eb9837909 100644
--- a/paddle/math/tests/test_batchTranspose.cpp
+++ b/paddle/math/tests/test_batchTranspose.cpp
@@ -53,9 +53,3 @@ TEST(MatrixBatchTransTest, test_batch_matrix_transpose) {
   checkMatrixEqual(cBatchTransMat, cMat_d2h);
 }
 #endif
-
-int main(int argc, char** argv) {
-  paddle::initMain(argc, argv);
-  testing::InitGoogleTest(&argc, argv);
-  return RUN_ALL_TESTS();
-}
diff --git a/paddle/math/tests/test_lazyAssign.cu b/paddle/math/tests/test_lazyAssign.cu
index 16541edb54..786d863a53 100644
--- a/paddle/math/tests/test_lazyAssign.cu
+++ b/paddle/math/tests/test_lazyAssign.cu
@@ -139,11 +139,3 @@ TEST(sgdUpdate, GPU) {
   testMatrixCase(testSgdUpdate<GpuMatrix>);
 }
 #endif
-
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  hl_start();
-  hl_init(0);
-  return RUN_ALL_TESTS();
-}
-
diff --git a/paddle/math/tests/test_matrixCompare.cpp b/paddle/math/tests/test_matrixCompare.cpp
index c6fc849ba0..3a780d26c0 100644
--- a/paddle/math/tests/test_matrixCompare.cpp
+++ b/paddle/math/tests/test_matrixCompare.cpp
@@ -18,9 +18,9 @@ limitations under the License. */
 
 #include <gtest/gtest.h>
 #include "TensorCheck.h"
-#include "paddle/gserver/tests/TestUtil.h"
 #include "paddle/math/Matrix.h"
 #include "paddle/math/SparseMatrix.h"
+#include "paddle/testing/TestUtil.h"
 #include "paddle/utils/Stat.h"
 #include "paddle/utils/Util.h"
 
@@ -29,148 +29,6 @@ using namespace std;     // NOLINT
 using autotest::TensorCheckEqual;
 using autotest::TensorCheckErr;
 
-void testMatrixProjectionForward(int contextStart,
-                                 int contextLength,
-                                 bool padding,
-                                 int batchSize,
-                                 int inputDim) {
-  MatrixPtr cpuInput = std::make_shared<CpuMatrix>(batchSize, inputDim);
-  MatrixPtr gpuInput = std::make_shared<GpuMatrix>(batchSize, inputDim);
-  cpuInput->randomizeUniform();
-  gpuInput->copyFrom(*cpuInput);
-
-  int pad = std::max(0, -contextStart) +
-            std::max(0, contextStart + contextLength - 1);
-  if (pad == 0) padding = false;
-  MatrixPtr cpuWeight = nullptr;
-  MatrixPtr gpuWeight = nullptr;
-  if (padding) {
-    cpuWeight = std::make_shared<CpuMatrix>(pad, inputDim);
-    gpuWeight = std::make_shared<GpuMatrix>(pad, inputDim);
-    cpuWeight->randomizeUniform();
-    gpuWeight->copyFrom(*cpuWeight);
-  }
-
-  IVectorPtr cpuSequence;
-  generateSequenceStartPositions(batchSize, cpuSequence);
-  IVectorPtr gpuSequence = IVector::create(cpuSequence->getSize(), true);
-  gpuSequence->copyFrom(*cpuSequence);
-
-  MatrixPtr cpuOutput =
-      std::make_shared<CpuMatrix>(batchSize, inputDim * contextLength);
-  MatrixPtr gpuOutput =
-      std::make_shared<GpuMatrix>(batchSize, inputDim * contextLength);
-  cpuOutput->randomizeUniform();
-  gpuOutput->copyFrom(*cpuOutput);
-
-  // calculate
-  int beginPad = std::max(0, -contextStart);
-  cpuOutput->contextProjectionForward(*cpuInput,
-                                      cpuWeight.get(),
-                                      *cpuSequence,
-                                      contextLength,
-                                      contextStart,
-                                      beginPad,
-                                      padding);
-
-  gpuOutput->contextProjectionForward(*gpuInput,
-                                      gpuWeight.get(),
-                                      *gpuSequence,
-                                      contextLength,
-                                      contextStart,
-                                      beginPad,
-                                      padding);
-
-  TensorCheckEqual(*cpuOutput, *gpuOutput);
-}
-
-void testMatrixProjectionBackward(int contextStart,
-                                  int contextLength,
-                                  bool padding,
-                                  int batchSize,
-                                  int inputDim) {
-  MatrixPtr cpuOutputGrad =
-      std::make_shared<CpuMatrix>(batchSize, inputDim * contextLength);
-  MatrixPtr gpuOutputGrad =
-      std::make_shared<GpuMatrix>(batchSize, inputDim * contextLength);
-  cpuOutputGrad->randomizeUniform();
-  gpuOutputGrad->copyFrom(*cpuOutputGrad);
-
-  IVectorPtr cpuSequence;
-  generateSequenceStartPositions(batchSize, cpuSequence);
-  IVectorPtr gpuSequence = IVector::create(cpuSequence->getSize(), true);
-  gpuSequence->copyFrom(*cpuSequence);
-
-  MatrixPtr cpuInputGrad = std::make_shared<CpuMatrix>(batchSize, inputDim);
-  MatrixPtr gpuInputGrad = std::make_shared<GpuMatrix>(batchSize, inputDim);
-  cpuInputGrad->randomizeUniform();
-  gpuInputGrad->copyFrom(*cpuInputGrad);
-
-  int pad = std::max(0, -contextStart) +
-            std::max(0, contextStart + contextLength - 1);
-  if (pad == 0) padding = false;
-  MatrixPtr cpuWeightGrad = nullptr;
-  MatrixPtr gpuWeightGrad = nullptr;
-  if (padding) {
-    cpuWeightGrad = std::make_shared<CpuMatrix>(pad, inputDim);
-    gpuWeightGrad = std::make_shared<GpuMatrix>(pad, inputDim);
-    cpuWeightGrad->randomizeUniform();
-    gpuWeightGrad->copyFrom(*cpuWeightGrad);
-  }
-
-  // calculate
-  int beginPad = std::max(0, -contextStart);
-  cpuOutputGrad->contextProjectionBackward(cpuInputGrad.get(),
-                                           cpuWeightGrad.get(),
-                                           *cpuSequence,
-                                           contextLength,
-                                           contextStart,
-                                           beginPad,
-                                           padding);
-  gpuOutputGrad->contextProjectionBackwardData(
-      *gpuInputGrad, *gpuSequence, contextLength, contextStart);
-  if (padding) {
-    gpuOutputGrad->contextProjectionBackwardWeight(*gpuWeightGrad,
-                                                   *gpuSequence,
-                                                   contextLength,
-                                                   contextStart,
-                                                   pad,
-                                                   beginPad);
-  }
-
-  TensorCheckErr(*cpuInputGrad, *gpuInputGrad);
-  if (padding) {
-    TensorCheckErr(*cpuWeightGrad, *gpuWeightGrad);
-  }
-}
-
-TEST(Matrix, projection) {
-  for (auto contextStart : {-5, -3, -1, 0, 3}) {
-    for (auto contextLength : {1, 2, 5, 7}) {
-      for (auto trainablePadding : {false, true}) {
-        for (auto batchSize : {1, 2, 5, 20, 100}) {
-          for (auto inputDim : {15, 32, 63, 128, 200}) {
-            VLOG(3) << " contextStart=" << contextStart
-                    << " contextLength=" << contextLength
-                    << " trainablePadding=" << trainablePadding
-                    << " batchSize=" << batchSize << " inputDim=" << inputDim;
-            testMatrixProjectionForward(contextStart,
-                                        contextLength,
-                                        trainablePadding,
-                                        batchSize,
-                                        inputDim);
-            testMatrixProjectionBackward(contextStart,
-                                         contextLength,
-                                         trainablePadding,
-                                         batchSize,
-                                         inputDim);
-          }
-        }
-      }
-    }
-  }
-}
-
 void testMatrixMaxSequence(int batchSize, int inputDim) {
   // forward
   MatrixPtr cpuInput = std::make_shared<CpuMatrix>(batchSize, inputDim);
@@ -1262,10 +1120,4 @@ TEST(Matrix, MaxOutFwdBwd) {
   }
 }
 
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  initMain(argc, argv);
-  return RUN_ALL_TESTS();
-}
-
 #endif
diff --git a/paddle/math/tests/test_sparseMatrixCompare.cpp b/paddle/math/tests/test_sparseMatrixCompare.cpp
index dcdbccffc3..a9185a4b24 100644
--- a/paddle/math/tests/test_sparseMatrixCompare.cpp
+++ b/paddle/math/tests/test_sparseMatrixCompare.cpp
@@ -171,11 +171,4 @@ TEST(SMatrix, sMatrixCollectBias) {
   }
 }
 
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  initMain(argc, argv);
-  int ret = RUN_ALL_TESTS();
-  return ret;
-}
-
 #endif
diff --git a/paddle/parameter/ParallelParameter.h b/paddle/parameter/ParallelParameter.h
index 417e386dc7..2e7c18b808 100644
--- a/paddle/parameter/ParallelParameter.h
+++ b/paddle/parameter/ParallelParameter.h
@@ -26,9 +26,9 @@ limitations under the License. */
 #include "paddle/math/Vector.h"
 #include "paddle/parameter/Parameter.h"
 #include "paddle/parameter/ParameterUpdateFunctions.h"
+#include "paddle/utils/Common.h"
 #include "paddle/utils/Flags.h"
 #include "paddle/utils/Locks.h"
-#include "paddle/utils/TypeDefs.h"
 
 #include "ParameterConfig.pb.h"
 
diff --git a/paddle/parameter/Parameter.h b/paddle/parameter/Parameter.h
index 532c6770e5..72c8336799 100644
--- a/paddle/parameter/Parameter.h
+++ b/paddle/parameter/Parameter.h
@@ -26,10 +26,10 @@ limitations under the License. */
 #include "ParameterUpdaterHook.h"
 #include "paddle/math/Matrix.h"
 #include "paddle/math/Vector.h"
+#include "paddle/utils/Common.h"
 #include "paddle/utils/GlobalConstants.h"
 #include "paddle/utils/Locks.h"
 #include "paddle/utils/ThreadLocal.h"
-#include "paddle/utils/TypeDefs.h"
 #include "paddle/utils/Util.h"
 
 namespace paddle {
diff --git a/paddle/parameter/ParameterUpdateFunctions.h b/paddle/parameter/ParameterUpdateFunctions.h
index 2d277e47e7..0fca280149 100644
--- a/paddle/parameter/ParameterUpdateFunctions.h
+++ b/paddle/parameter/ParameterUpdateFunctions.h
@@ -15,7 +15,7 @@ limitations under the License. */
 #pragma once
 
 #include "paddle/math/Vector.h"
-#include "paddle/utils/TypeDefs.h"
+#include "paddle/utils/Common.h"
 
 namespace paddle {
 
diff --git a/paddle/parameter/tests/test_common.cpp b/paddle/parameter/tests/test_common.cpp
index aa57a63469..8bab5a6289 100644
--- a/paddle/parameter/tests/test_common.cpp
+++ b/paddle/parameter/tests/test_common.cpp
@@ -23,15 +23,6 @@ limitations under the License. */
 
 using namespace paddle;  // NOLINT
 
-int main(int argc, char** argv) {
-  paddle::initMain(argc, argv);
-  testing::InitGoogleTest(&argc, argv);
-
-  int ret = RUN_ALL_TESTS();
-
-  return ret;
-}
-
 class CommonTest : public ::testing::Test {
 protected:
   CommonTest() : testStat_("test") {}
diff --git a/paddle/pserver/BaseClient.h b/paddle/pserver/BaseClient.h
index 262afafbe2..11d7a147bf 100644
--- a/paddle/pserver/BaseClient.h
+++ b/paddle/pserver/BaseClient.h
@@ -17,8 +17,8 @@ limitations under the License. */
 #include "ParameterService.pb.h"
 #include "paddle/math/Matrix.h"
 #include "paddle/pserver/ProtoServer.h"
+#include "paddle/utils/Common.h"
 #include "paddle/utils/Queue.h"
-#include "paddle/utils/TypeDefs.h"
 
 namespace paddle {
 
diff --git a/paddle/pserver/ParameterClient2.h b/paddle/pserver/ParameterClient2.h
index eed71ccb43..89b3ddd502 100644
--- a/paddle/pserver/ParameterClient2.h
+++ b/paddle/pserver/ParameterClient2.h
@@ -23,10 +23,10 @@ limitations under the License. */
 #include "paddle/math/Vector.h"
 #include "paddle/parameter/Parameter.h"
 #include "paddle/pserver/BaseClient.h"
+#include "paddle/utils/Common.h"
 #include "paddle/utils/Flags.h"
 #include "paddle/utils/Locks.h"
 #include "paddle/utils/Queue.h"
-#include "paddle/utils/TypeDefs.h"
 #include "paddle/utils/Util.h"
 
 #include "ParameterService.pb.h"
diff --git a/paddle/pserver/ParameterServer2.h b/paddle/pserver/ParameterServer2.h
index b0cf22e1fb..0f5a589590 100644
--- a/paddle/pserver/ParameterServer2.h
+++ b/paddle/pserver/ParameterServer2.h
@@ -29,10 +29,10 @@ limitations under the License. */
 #include "paddle/math/Vector.h"
 #include "paddle/parameter/Parameter.h"
 #include "paddle/parameter/ParameterOptimizer.h"
+#include "paddle/utils/Common.h"
 #include "paddle/utils/Locks.h"
 #include "paddle/utils/Stat.h"
 #include "paddle/utils/ThreadLocal.h"
-#include "paddle/utils/TypeDefs.h"
 
 #include "ParameterService.pb.h"
 
diff --git a/paddle/py_paddle/dataprovider_converter.py b/paddle/py_paddle/dataprovider_converter.py
index edcefba6a8..981d10afda 100644
--- a/paddle/py_paddle/dataprovider_converter.py
+++ b/paddle/py_paddle/dataprovider_converter.py
@@ -15,6 +15,7 @@
 import paddle.trainer.PyDataProvider2 as dp2
 import collections
 import swig_paddle
+import numpy
 
 __all__ = ['DataProviderConverter']
 
@@ -35,18 +36,18 @@ class IScanner(object):
 class DenseScanner(IScanner):
     def __init__(self, input_type, pos):
         IScanner.__init__(self, input_type, pos)
-        self.__mat__ = []
-        self.__height__ = 0
+        self.__mat__ = None
 
     def scan(self, dat):
-        self.__mat__.extend(dat)
-        self.__height__ += 1
+        if self.__mat__ is None:
+            self.__mat__ = numpy.array([dat], dtype='float32')
+        else:
+            self.__mat__ = numpy.append(self.__mat__, [dat], axis=0)
 
     def finish_scan(self, argument):
         assert isinstance(argument, swig_paddle.Arguments)
         assert isinstance(self.input_type, dp2.InputType)
-        m = swig_paddle.Matrix.createDense(self.__mat__, self.__height__,
-                                           self.input_type.dim, False)
+        m = swig_paddle.Matrix.createDenseFromNumpy(self.__mat__, True, False)
         argument.setSlotValue(self.pos, m)
 
 
diff --git a/paddle/scripts/docker/Dockerfile b/paddle/scripts/docker/Dockerfile
index b01de499bd..1522be023f 100644
--- a/paddle/scripts/docker/Dockerfile
+++ b/paddle/scripts/docker/Dockerfile
@@ -15,7 +15,7 @@ RUN apt-get update \
     && apt-get clean -y
 RUN cd /usr/src/gtest && cmake . && make && cp *.a /usr/lib
 RUN pip install -U BeautifulSoup docopt PyYAML pillow \
-    sphinx sphinx_rtd_theme recommonmark
+    sphinx sphinx_rtd_theme recommonmark jupyter
 
 ARG WITH_AVX
 ARG WITH_DOC
@@ -43,4 +43,13 @@ RUN echo 'root:root' | chpasswd
 RUN sed -ri 's/^PermitRootLogin\s+.*/PermitRootLogin yes/' /etc/ssh/sshd_config
 RUN sed -ri 's/UsePAM yes/#UsePAM yes/g' /etc/ssh/sshd_config
 EXPOSE 22
-CMD ["/usr/sbin/sshd", "-D"]
+
+# Jupyter Notebook directory.
+RUN mkdir /notes/
+WORKDIR "/notes"
+EXPOSE 8888
+
+RUN mkdir -p /opt/bin
+COPY ./paddle/scripts/docker/entrypoint /opt/bin/
+
+CMD ["/opt/bin/entrypoint"]
diff --git a/paddle/scripts/docker/Dockerfile.gpu b/paddle/scripts/docker/Dockerfile.gpu
index a68cc79b84..09f07043e2 100644
--- a/paddle/scripts/docker/Dockerfile.gpu
+++ b/paddle/scripts/docker/Dockerfile.gpu
@@ -15,7 +15,7 @@ RUN apt-get update \
     && apt-get clean -y
 RUN cd /usr/src/gtest && cmake . && make && cp *.a /usr/lib
 RUN pip install -U BeautifulSoup docopt PyYAML pillow \
-    sphinx sphinx_rtd_theme recommonmark
+    sphinx sphinx_rtd_theme recommonmark jupyter
 
 ARG WITH_AVX
 ARG WITH_DOC
@@ -43,4 +43,13 @@ RUN echo 'root:root' | chpasswd
 RUN sed -ri 's/^PermitRootLogin\s+.*/PermitRootLogin yes/' /etc/ssh/sshd_config
 RUN sed -ri 's/UsePAM yes/#UsePAM yes/g' /etc/ssh/sshd_config
 EXPOSE 22
-CMD ["/usr/sbin/sshd", "-D"]
+
+# Jupyter Notebook directory.
+RUN mkdir /notes/
+WORKDIR "/notes"
+EXPOSE 8888
+
+RUN mkdir -p /opt/bin
+COPY ./paddle/scripts/docker/entrypoint /opt/bin/
+
+CMD ["/opt/bin/entrypoint"]
diff --git a/paddle/scripts/docker/build.sh b/paddle/scripts/docker/build.sh
index ca3f1c3f18..7edba3dd09 100755
--- a/paddle/scripts/docker/build.sh
+++ b/paddle/scripts/docker/build.sh
@@ -43,5 +43,7 @@ cp -rv /woboq/data $WOBOQ_OUT/../data
     -o $WOBOQ_OUT \
     -p paddle:/paddle
 /woboq/indexgenerator/codebrowser_indexgenerator $WOBOQ_OUT
-
+cd /woboq
+make clean
+rm -rf /paddle/build
 trap : 0
diff --git a/paddle/scripts/docker/entrypoint b/paddle/scripts/docker/entrypoint
new file mode 100755
index 0000000000..87083467f5
--- /dev/null
+++ b/paddle/scripts/docker/entrypoint
@@ -0,0 +1,8 @@
+#!/bin/bash
+LOG=/var/log/all
+
+touch $LOG
+
+/usr/sbin/sshd -D >> $LOG &
+jupyter notebook --ip=0.0.0.0 /notes/ >> $LOG &
+tail -f $LOG
diff --git a/paddle/scripts/travis/before_install.linux.sh b/paddle/scripts/travis/before_install.linux.sh
deleted file mode 100755
index 9620bff6bc..0000000000
--- a/paddle/scripts/travis/before_install.linux.sh
+++ /dev/null
@@ -1,18 +0,0 @@
-#!/bin/bash
-set -e
-pip install protobuf
-cd /tmp
-wget https://github.com/google/protobuf/archive/v3.0.2.tar.gz -O protobuf.tar.gz
-tar xf protobuf.tar.gz
-cd protobuf*
-./autogen.sh
-./configure --prefix=/usr/
-make -j 2 install
-cd ..
-rm -rf protobuf*
-
-pushd /usr/src/gtest
-cmake .
-make
-sudo cp *.a /usr/lib
-popd
diff --git a/paddle/scripts/travis/before_install.osx.sh b/paddle/scripts/travis/before_install.osx.sh
index bd88ed3913..7036f971fd 100755
--- a/paddle/scripts/travis/before_install.osx.sh
+++ b/paddle/scripts/travis/before_install.osx.sh
@@ -3,10 +3,4 @@ brew update
 brew tap homebrew/science
 brew install python
 sudo pip install --upgrade protobuf
-brew install cmake python glog gflags openblas wget md5sha1sum protobuf
-
-wget https://github.com/google/googletest/archive/release-1.8.0.tar.gz -O gtest.tar.gz
-tar xf gtest.tar.gz
-cd googletest-release-1.8.0/
-cmake .
-make install
+brew install swig openblas md5sha1sum protobuf
diff --git a/paddle/scripts/travis/build_and_test.sh b/paddle/scripts/travis/build_and_test.sh
index 9caeb21beb..db98504ba4 100755
--- a/paddle/scripts/travis/build_and_test.sh
+++ b/paddle/scripts/travis/build_and_test.sh
@@ -1,27 +1,19 @@
 #!/bin/bash
-./build_submodules.sh
 source ./common.sh
-CMAKE_EXTRA=""
-if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then
-  CMAKE_EXTRA="-DPYTHON_LIBRARY=/usr/local/Cellar/python/2.7.12_1/Frameworks/Python.framework/Versions/2.7/lib/python2.7/config/libpython2.7.dylib"
-else
-  CMAKE_EXTRA="-DWITH_SWIG_PY=ON"
-fi
-
-
-cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_DOC=OFF -DWITH_TESTING=ON -DON_TRAVIS=ON -DON_COVERALLS=ON ${CMAKE_EXTRA}
 
 NPROC=1
 if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then
+  export PYTHONPATH=/opt/python/2.7.12/lib/python2.7/site-packages
+  export PYTHONHOME=/opt/python/2.7.12
+  export PATH=/opt/python/2.7.12/bin:${PATH}
+  cmake .. -DON_TRAVIS=ON -DON_COVERALLS=ON -DCOVERALLS_UPLOAD=ON
   NRPOC=`nproc`
   make -j $NPROC
   make coveralls
+  sudo make install
 elif [[ "$TRAVIS_OS_NAME" == "osx" ]]; then
+  export PYTHONPATH=/usr/local/lib/python2.7/site-packages
+  cmake .. -DON_TRAVIS=ON -DON_COVERALLS=ON -DCOVERALLS_UPLOAD=ON
   NPROC=`sysctl -n hw.ncpu`
   make -j $NPROC
-  env CTEST_OUTPUT_ON_FAILURE=1 make test ARGS="-j $NPROC"
 fi
-
-
-sudo make install
-sudo paddle version
diff --git a/paddle/scripts/travis/build_submodules.sh b/paddle/scripts/travis/build_submodules.sh
deleted file mode 100755
index d458bf92bf..0000000000
--- a/paddle/scripts/travis/build_submodules.sh
+++ /dev/null
@@ -1,20 +0,0 @@
-#!/bin/bash
-set -e
-WORK_DIR=$PWD
-PROJ_ROOT=$(git rev-parse --show-cdup)
-SUBMODULES=$(grep path ${PROJ_ROOT}.gitmodules | sed 's/^.*path = //')
-
-for module in $SUBMODULES
-do
-  case $module in
-    "warp-ctc")
-      if [ -d ${PROJ_ROOT}warp-ctc/build ]; then
-        rm -rf ${PROJ_ROOT}warp-ctc/build
-      fi
-      mkdir ${PROJ_ROOT}warp-ctc/build
-      cd ${PROJ_ROOT}warp-ctc/build
-      cmake ..; make
-    ;;
-  esac
-done
-cd $WORK_DIR
diff --git a/paddle/setup.py.in b/paddle/setup.py.in
index b4c38a41b8..e3650bf1c0 100644
--- a/paddle/setup.py.in
+++ b/paddle/setup.py.in
@@ -14,7 +14,9 @@
 
 # This file is used to build paddle python binding package.
 # It will be invoked by Makefile that generated by COMAKE
+
 from setuptools import setup, Extension
+
 import numpy as np
 import api.paddle_ld_flags
 import platform
@@ -30,8 +32,10 @@ is_lin = (system == 'linux')
 # The extra links will passed from COMAKE
 #   because generate paddle LDFLAGS is too complicated to do in setup.py
 #   it just read COMAKE generated LDFLAGS.
+extra_comps = []
 extra_links = []
 obj = api.paddle_ld_flags.PaddleLDFlag()
+extra_comps = obj.c_flag()
 ldflags = obj.ldflag_str()
 if ldflags is not None:
   extra_links.extend(ldflags.split(" "))
@@ -51,20 +55,15 @@ elif is_osx == True:
 
 include_dirs = [np.get_include(), "../"]    # include numpy and paddle.
 
-extra_c = obj.c_flag()
-
-attr=dict()
-if extra_c is not None:
-  attr["extra_compile_args"] = extra_c
-
 setup(name="py_paddle",
   version="@PADDLE_VERSION@",
   ext_modules=[
     Extension('py_paddle._swig_paddle',      # Build SWIG Extension.
        ['Paddle_wrap.cxx'],
+       language = "c++",
        include_dirs = include_dirs,
        extra_link_args = extra_links,
-       **attr
+       extra_compile_args = extra_comps
     )
   ],
   packages=['py_paddle'],
diff --git a/paddle/testing/CMakeLists.txt b/paddle/testing/CMakeLists.txt
new file mode 100644
index 0000000000..c47add04b0
--- /dev/null
+++ b/paddle/testing/CMakeLists.txt
@@ -0,0 +1,8 @@
+# for paddle test case
+
+if(WITH_TESTING)
+  add_library(paddle_test_main STATIC TestMain.cpp)
+  add_dependencies(paddle_test_main gen_proto_cpp)
+  add_library(paddle_test_util STATIC TestUtil.cpp)
+  add_dependencies(paddle_test_util gen_proto_cpp)
+endif()
diff --git a/paddle/function/TestMain.cpp b/paddle/testing/TestMain.cpp
similarity index 100%
rename from paddle/function/TestMain.cpp
rename to paddle/testing/TestMain.cpp
diff --git a/paddle/gserver/tests/TestUtil.cpp b/paddle/testing/TestUtil.cpp
similarity index 100%
rename from paddle/gserver/tests/TestUtil.cpp
rename to paddle/testing/TestUtil.cpp
diff --git a/paddle/gserver/tests/TestUtil.h b/paddle/testing/TestUtil.h
similarity index 100%
rename from paddle/gserver/tests/TestUtil.h
rename to paddle/testing/TestUtil.h
diff --git a/paddle/trainer/RemoteParameterUpdater.h b/paddle/trainer/RemoteParameterUpdater.h
index 7794b20900..5e82c94475 100644
--- a/paddle/trainer/RemoteParameterUpdater.h
+++ b/paddle/trainer/RemoteParameterUpdater.h
@@ -56,7 +56,7 @@ class RemoteParameterUpdater : public ParameterUpdater {
 public:
   RemoteParameterUpdater(
       const OptimizationConfig& config,
-      int expectedPpassCount,
+      int expectedPassCount,
       std::unique_ptr<ParameterUpdater>&& localUpdater = nullptr);
   ~RemoteParameterUpdater() {
     if (controllerThread_) {
@@ -146,7 +146,7 @@ protected:
   BatchStatus batchStatus_;
   /// controller thread for sync-sgd
   std::unique_ptr<std::thread> controllerThread_;
-  /// passed alread finished
+  /// passed already finished
   int64_t passCount_;
   /// expected passes to finished
   int64_t expectedPassCount_;
diff --git a/paddle/trainer/ThreadParameterUpdater.h b/paddle/trainer/ThreadParameterUpdater.h
index 880f1f9ddc..bc08a9e9f0 100644
--- a/paddle/trainer/ThreadParameterUpdater.h
+++ b/paddle/trainer/ThreadParameterUpdater.h
@@ -33,8 +33,8 @@ namespace paddle {
    because at the current moment, the merging on CPU is happening on the
    main thread, and the its parameter size can be much larger than the one GPU.
    Thus, for GPU, the parameter updates happens in updateImpl() function, which
-   is called by gradient machines as a callback function as a callback function
-   supplied to backward() and forwardBackward().
+   is called by gradient machines as a callback function supplied to backward()
+   and forwardBackward().
    For CPU, the parameter updates happens in separate threads maintained by this
    class.
  */
diff --git a/paddle/trainer/Trainer.cpp b/paddle/trainer/Trainer.cpp
index 09e0a213ab..8465addaf9 100644
--- a/paddle/trainer/Trainer.cpp
+++ b/paddle/trainer/Trainer.cpp
@@ -14,7 +14,6 @@ limitations under the License. */
 
 #include "Trainer.h"
 
-#include <fenv.h>
 #include <stdio.h>
 
 #include <iomanip>
@@ -24,7 +23,7 @@ limitations under the License. */
 
 #include <google/protobuf/text_format.h>
 
-#include "paddle/utils/Excepts.h"
+#include "paddle/utils/Common.h"
 #include "paddle/utils/GlobalConstants.h"
 #include "paddle/utils/PythonUtil.h"
 #include "paddle/utils/Stat.h"
diff --git a/paddle/trainer/TrainerMain.cpp b/paddle/trainer/TrainerMain.cpp
index 947f9cadcc..e2fbd21e14 100644
--- a/paddle/trainer/TrainerMain.cpp
+++ b/paddle/trainer/TrainerMain.cpp
@@ -12,9 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include <fenv.h>
 #include "paddle/pserver/ParameterServer2.h"
-#include "paddle/utils/Excepts.h"
+#include "paddle/utils/Common.h"
 #include "paddle/utils/PythonUtil.h"
 #include "paddle/utils/StringUtil.h"
 
diff --git a/paddle/trainer/tests/CMakeLists.txt b/paddle/trainer/tests/CMakeLists.txt
index 28c3d6f263..22e07bd0e9 100644
--- a/paddle/trainer/tests/CMakeLists.txt
+++ b/paddle/trainer/tests/CMakeLists.txt
@@ -17,9 +17,10 @@ add_test(NAME test_Compare
 ################# test_Trainer ###########################
 add_unittest_without_exec(test_Trainer
     test_Trainer.cpp)
-set(diy_dll_dir ${CMAKE_CURRENT_BINARY_DIR}/../../gserver/tests)
 add_test(NAME test_Trainer
   COMMAND ${PROJ_ROOT}/paddle/.set_python_path.sh -d ${PROJ_ROOT}/python/
+        ${PYTHON_EXECUTABLE} ${PROJ_ROOT}/paddle/trainer/tests/gen_proto_data.py &&
+        ${PROJ_ROOT}/paddle/.set_python_path.sh -d ${PROJ_ROOT}/python/
         ${CMAKE_CURRENT_BINARY_DIR}/test_Trainer
     WORKING_DIRECTORY ${PROJ_ROOT}/paddle/)
 
@@ -82,5 +83,5 @@ add_test(NAME test_PyDataProviderWrapper
 #################### test_config_parser #########################
 add_test(NAME test_config_parser
   COMMAND ${PROJ_ROOT}/paddle/.set_python_path.sh -d ${PROJ_ROOT}/python/
-        python ${PROJ_ROOT}/paddle/trainer/tests/config_parser_test.py
+        ${PYTHON_EXECUTABLE} ${PROJ_ROOT}/paddle/trainer/tests/config_parser_test.py
     WORKING_DIRECTORY ${PROJ_ROOT}/paddle/)
diff --git a/paddle/trainer/tests/test_Trainer.cpp b/paddle/trainer/tests/test_Trainer.cpp
index 371282dd6b..264bc46ebc 100644
--- a/paddle/trainer/tests/test_Trainer.cpp
+++ b/paddle/trainer/tests/test_Trainer.cpp
@@ -96,11 +96,6 @@ TEST(checkGradient, multi) {
 TEST(checkGradient, hsigmoid) { checkGradientTest(configFile2, false, false); }
 
 TEST(checkGradient, chunk) {
-#if defined(__APPLE__) || defined(__OSX__)
-  EXPECT_EQ(0, system("python trainer/tests/gen_proto_data.py"));
-#else
-  EXPECT_EQ(0, system("python2 trainer/tests/gen_proto_data.py"));
-#endif
   checkGradientTest(configFile3, false, false);
 #ifndef PADDLE_ONLY_CPU
   checkGradientTest(configFile3, true, true);
diff --git a/paddle/utils/.gitignore b/paddle/utils/.gitignore
index f2cfd74094..956b606a18 100644
--- a/paddle/utils/.gitignore
+++ b/paddle/utils/.gitignore
@@ -1 +1,2 @@
 enable_virtualenv.c
+PythonUtil.cpp
diff --git a/paddle/utils/CMakeLists.txt b/paddle/utils/CMakeLists.txt
index 45240b5002..10d906ee16 100644
--- a/paddle/utils/CMakeLists.txt
+++ b/paddle/utils/CMakeLists.txt
@@ -1,5 +1,7 @@
 # The utilities for paddle
 
+configure_file(PythonUtil.cpp.in ${PROJ_ROOT}/paddle/utils/PythonUtil.cpp)
+
 file(GLOB UTIL_HEADERS . *.h)
 file(GLOB UTIL_SOURCES . *.cpp)
 create_resources(enable_virtualenv.py enable_virtualenv.c)
diff --git a/paddle/utils/TypeDefs.h b/paddle/utils/Common.h
similarity index 69%
rename from paddle/utils/TypeDefs.h
rename to paddle/utils/Common.h
index c50a05e82d..1f1d0255a5 100644
--- a/paddle/utils/TypeDefs.h
+++ b/paddle/utils/Common.h
@@ -14,13 +14,22 @@ limitations under the License. */
 
 #pragma once
 
+#include "Excepts.h"
+
+/**
+ * Disable copy macro.
+ */
+#define DISABLE_COPY(class_name)                \
+  class_name(class_name &&) = delete;           \
+  class_name(const class_name &other) = delete; \
+  class_name &operator=(const class_name &other) = delete
+
 namespace paddle {
+
 #ifdef PADDLE_TYPE_DOUBLE
-typedef double real;
+using real = double;
 #else
-typedef float real;
+using real = float;
 #endif
 
 }  // namespace paddle
-
-using paddle::real;
diff --git a/paddle/utils/CpuId.h b/paddle/utils/CpuId.h
index 7a354da758..0f3985cc7b 100644
--- a/paddle/utils/CpuId.h
+++ b/paddle/utils/CpuId.h
@@ -11,7 +11,7 @@ limitations under the License. */
 
 #pragma once
 
-#include "DisableCopy.h"
+#include "Common.h"
 
 namespace paddle {
 
diff --git a/paddle/utils/Excepts.h b/paddle/utils/Excepts.h
index dc3369b7e8..5c2c504f53 100644
--- a/paddle/utils/Excepts.h
+++ b/paddle/utils/Excepts.h
@@ -15,6 +15,8 @@ limitations under the License. */
 #ifndef EXCEPTS_H_
 #define EXCEPTS_H_
 
+#include <fenv.h>
+
 #if defined(__APPLE__) || defined(__OSX__)
 
 int fegetexcept(void);
diff --git a/paddle/utils/Locks.h b/paddle/utils/Locks.h
index 0f922f3548..e87abb9139 100644
--- a/paddle/utils/Locks.h
+++ b/paddle/utils/Locks.h
@@ -19,7 +19,7 @@ limitations under the License. */
 #include <condition_variable>
 #include <mutex>
 
-#include "DisableCopy.h"
+#include "Common.h"
 
 namespace paddle {
 
diff --git a/paddle/utils/PythonUtil.cpp b/paddle/utils/PythonUtil.cpp.in
similarity index 98%
rename from paddle/utils/PythonUtil.cpp
rename to paddle/utils/PythonUtil.cpp.in
index 7faeff55c2..66b5795e29 100644
--- a/paddle/utils/PythonUtil.cpp
+++ b/paddle/utils/PythonUtil.cpp.in
@@ -195,6 +195,10 @@ extern const char enable_virtualenv_py[];
 }
 void initPython(int argc, char** argv) {
 #ifndef PADDLE_NO_PYTHON
+  char pyHome[] = "@PYTHON_INSTALL_DIR@"; // NOLINT
+  if (strlen(pyHome)) {
+    Py_SetPythonHome(pyHome);
+  }
   Py_SetProgramName(argv[0]);
   Py_Initialize();
   PySys_SetArgv(argc, argv);
diff --git a/paddle/utils/ThreadLocal.cpp b/paddle/utils/ThreadLocal.cpp
index d27dae33fd..58fe51bd40 100644
--- a/paddle/utils/ThreadLocal.cpp
+++ b/paddle/utils/ThreadLocal.cpp
@@ -37,7 +37,7 @@ unsigned int* ThreadLocalRand::getSeed() {
       p = new unsigned int(defaultSeed_ - 1);
     } else {
       p = new unsigned int(defaultSeed_ + getTID());
-      LOG(INFO) << "thread use undeterministic rand seed:" << *p;
+      VLOG(3) << "thread use undeterministic rand seed:" << *p;
     }
     seed_.set(p);
   }
diff --git a/paddle/utils/Util.cpp b/paddle/utils/Util.cpp
index 0f778dbebf..411a64aa8d 100644
--- a/paddle/utils/Util.cpp
+++ b/paddle/utils/Util.cpp
@@ -125,7 +125,7 @@ void registerInitFunction(std::function<void()> func, int priority) {
 
 void runInitFunctions() {
   std::call_once(g_onceFlag, []() {
-    LOG(INFO) << "Calling runInitFunctions";
+    VLOG(3) << "Calling runInitFunctions";
     if (g_initFuncs) {
       std::sort(g_initFuncs->begin(),
                 g_initFuncs->end(),
@@ -139,7 +139,7 @@ void runInitFunctions() {
       g_initFuncs = nullptr;
     }
     g_initialized = true;
-    LOG(INFO) << "Call runInitFunctions done.";
+    VLOG(3) << "Call runInitFunctions done.";
   });
 }
 
@@ -231,7 +231,7 @@ std::string join(const std::string& part1, const std::string& part2) {
 }  // namespace path
 
 void copyFileToPath(const std::string& file, const std::string& dir) {
-  LOG(INFO) << "copy " << file << " to " << dir;
+  VLOG(3) << "copy " << file << " to " << dir;
   std::string fileName = path::basename(file);
   std::string dst = path::join(dir, fileName);
   std::ifstream source(file, std::ios_base::binary);
diff --git a/paddle/utils/Util.h b/paddle/utils/Util.h
index e5a89070f1..613844669d 100644
--- a/paddle/utils/Util.h
+++ b/paddle/utils/Util.h
@@ -26,12 +26,11 @@ limitations under the License. */
 #include <unordered_map>
 #include <vector>
 
-#include "DisableCopy.h"
+#include "Common.h"
 #include "Logging.h"
 #include "TrainerConfig.pb.h"
 
 #include "Flags.h"
-#include "TypeDefs.h"
 #include "hl_gpu.h"
 
 /**
diff --git a/paddle/utils/Version.h b/paddle/utils/Version.h
index d1a07d9485..f53d6420bb 100644
--- a/paddle/utils/Version.h
+++ b/paddle/utils/Version.h
@@ -15,7 +15,7 @@ limitations under the License. */
 #pragma once
 #include <stddef.h>
 #include <iostream>
-#include "TypeDefs.h"
+#include "Common.h"
 
 namespace paddle {
 
diff --git a/paddle/utils/Excepts.cpp b/paddle/utils/arch/osx/Excepts.cpp
similarity index 97%
rename from paddle/utils/Excepts.cpp
rename to paddle/utils/arch/osx/Excepts.cpp
index 4ddce35ed3..c8e904d8f9 100644
--- a/paddle/utils/Excepts.cpp
+++ b/paddle/utils/arch/osx/Excepts.cpp
@@ -12,12 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "Excepts.h"
+#include "paddle/utils/Excepts.h"
 
 #if defined(__APPLE__) || defined(__OSX__)
 
-#include <fenv.h>
-
 int fegetexcept(void) {
   static fenv_t fenv;
   return fegetenv(&fenv) ? -1 : (fenv.__control & FE_ALL_EXCEPT);
diff --git a/paddle/utils/tests/test_CustomStackTrace.cpp b/paddle/utils/tests/test_CustomStackTrace.cpp
index 18dd0aac43..378788bcec 100644
--- a/paddle/utils/tests/test_CustomStackTrace.cpp
+++ b/paddle/utils/tests/test_CustomStackTrace.cpp
@@ -96,9 +96,3 @@ TEST(CustomStackTrace, normalTest) {
     }
   });
 }
-
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  paddle::initMain(argc, argv);
-  return RUN_ALL_TESTS();
-}
diff --git a/paddle/utils/tests/test_SIMDFlags.cpp b/paddle/utils/tests/test_SIMDFlags.cpp
index 42edede209..8200a24ce7 100644
--- a/paddle/utils/tests/test_SIMDFlags.cpp
+++ b/paddle/utils/tests/test_SIMDFlags.cpp
@@ -44,8 +44,3 @@ TEST(SIMDFlags, normalPrint) {
   LOG(INFO) << "Has AVX2:    " << std::boolalpha << HAS_AVX2;
   LOG(INFO) << "Has AVX512:  " << std::boolalpha << HAS_AVX512;
 }
-
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  return RUN_ALL_TESTS();
-}
diff --git a/paddle/utils/tests/test_SpinLock.cpp b/paddle/utils/tests/test_SpinLock.cpp
index 605bedb6c9..cc34eb1f86 100644
--- a/paddle/utils/tests/test_SpinLock.cpp
+++ b/paddle/utils/tests/test_SpinLock.cpp
@@ -53,9 +53,3 @@ TEST(ThreadSpinLock, normalTest) {
         });
   }
 }
-
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  paddle::initMain(argc, argv);
-  return RUN_ALL_TESTS();
-}
diff --git a/paddle/utils/tests/test_Thread.cpp b/paddle/utils/tests/test_Thread.cpp
index 2f5c5bbce0..6e2580c491 100644
--- a/paddle/utils/tests/test_Thread.cpp
+++ b/paddle/utils/tests/test_Thread.cpp
@@ -79,8 +79,3 @@ TEST(AsyncThreadPool, addBatchJobWithResults) {
     ASSERT_EQ(res[i], i);
   }
 }
-
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  return RUN_ALL_TESTS();
-}
diff --git a/paddle/utils/tests/test_ThreadBarrier.cpp b/paddle/utils/tests/test_ThreadBarrier.cpp
index 1237f1b731..554b1c1d4a 100644
--- a/paddle/utils/tests/test_ThreadBarrier.cpp
+++ b/paddle/utils/tests/test_ThreadBarrier.cpp
@@ -64,9 +64,3 @@ TEST(ThreadBarrier, normalTest) {
                    });
   }
 }
-
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  paddle::initMain(argc, argv);
-  return RUN_ALL_TESTS();
-}
diff --git a/proto/CMakeLists.txt b/proto/CMakeLists.txt
index 2c40070eca..e854b2b427 100644
--- a/proto/CMakeLists.txt
+++ b/proto/CMakeLists.txt
@@ -18,10 +18,10 @@ foreach(filename ${proto_filenames})
         ${PROTO_GEN}
         ${CUR_PROTO_GEN})
     add_custom_command(OUTPUT ${CUR_PROTO_GEN}
-        COMMAND ${PROTOBUF_PROTOC_EXECUTABLE} 
+        COMMAND env ${py_env} ${PROTOBUF_PROTOC_EXECUTABLE} 
                   --cpp_out ${CMAKE_CURRENT_BINARY_DIR}
-		  --proto_path ${PROJ_ROOT}/proto ${PROJ_ROOT}/proto/${filename}
-        DEPENDS ${filename})
+          --proto_path ${PROJ_ROOT}/proto ${PROJ_ROOT}/proto/${filename}
+        DEPENDS ${filename} ${external_project_dependencies})
 
     set(CUR_PROTO_GEN_PY
         ${PROJ_ROOT}/paddle/python/paddle/proto/${base_filename}_pb2.py)
@@ -29,9 +29,9 @@ foreach(filename ${proto_filenames})
         ${CUR_PROTO_GEN_PY}
         ${PROTO_GEN_PY})
     add_custom_command(OUTPUT ${CUR_PROTO_GEN_PY}
-        COMMAND ${PROTOBUF_PROTOC_EXECUTABLE} --python_out ${PROJ_ROOT}/python/paddle/proto
-	--proto_path ${PROJ_ROOT}/proto ${PROJ_ROOT}/proto/${filename}
-        DEPENDS ${filename})
+        COMMAND env ${py_env} ${PROTOBUF_PROTOC_EXECUTABLE} --python_out ${PROJ_ROOT}/python/paddle/proto
+    --proto_path ${PROJ_ROOT}/proto ${PROJ_ROOT}/proto/${filename}
+        DEPENDS ${filename} ${external_project_dependencies})
 endforeach()
 
 include_directories(${CMAKE_CURRENT_BINARY_DIR}/proto)
diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt
index dce0b90952..1cda4762eb 100644
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -10,26 +10,17 @@ set(PY_FILES paddle/__init__.py
              ${HELPERS_PY_FILES}
              ${UTILS_PY_FILES})
 
-set(PADDLE_INTERNAL_PACKAGE "")
-if (PADDLE_WITH_INTERNAL)
-    set(PADDLE_INTERNAL_PACKAGE "paddle.internals")
-endif()
-
 configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in
     ${CMAKE_CURRENT_BINARY_DIR}/setup.py)
 
 add_custom_command(OUTPUT ${OUTPUT_DIR}/.timestamp
-    COMMAND ${PYTHON_EXECUTABLE} setup.py bdist_wheel
+    COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel
     COMMAND ${CMAKE_COMMAND} -E touch ${OUTPUT_DIR}/.timestamp
-    DEPENDS gen_proto_py ${PY_FILES})
+    DEPENDS gen_proto_py ${PY_FILES} ${external_project_dependencies})
 
 add_custom_target(paddle_python ALL DEPENDS
     ${OUTPUT_DIR}/.timestamp)
 
-find_python_module(pip REQUIRED)
-find_python_module(wheel REQUIRED)
-find_python_module(google.protobuf REQUIRED)
-
 add_subdirectory(paddle/trainer_config_helpers/tests)
 
 install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/dist/
diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py
index 2eb7b17a0b..674b5ac58b 100644
--- a/python/paddle/trainer/config_parser.py
+++ b/python/paddle/trainer/config_parser.py
@@ -3416,8 +3416,35 @@ def register_parse_config_hook(f):
     _parse_config_hooks.add(f)
 
 
-def parse_config(config_file, config_arg_str):
+def update_g_config():
     '''
+    Update g_config after execute config_file or config_functions.
+    '''
+    for k, v in settings.iteritems():
+        if v is None:
+            continue
+        g_config.opt_config.__setattr__(k, v)
+
+    for k, v in trainer_settings.iteritems():
+        if v is None:
+            continue
+        g_config.__setattr__(k, v)
+
+    for name in g_config.model_config.input_layer_names:
+        assert name in g_layer_map, \
+            'input name "%s" does not correspond to a layer name' % name
+        assert (g_layer_map[name].type == "data" or g_layer_map[name].type == "data_trim"), \
+            'The type of input layer "%s" is not "data"' % name
+    for name in g_config.model_config.output_layer_names:
+        assert name in g_layer_map, \
+            'input name "%s" does not correspond to a layer name' % name
+    return g_config
+
+
+def parse_config(trainer_config, config_arg_str):
+    '''
+    @param trainer_config: can be a string of config file name or a function name
+    with config logic
     @param config_arg_str: a string of the form var1=val1,var2=val2. It will be
     passed to config script as a dictionary CONFIG_ARGS
     '''
@@ -3451,45 +3478,20 @@ def parse_config(config_file, config_arg_str):
     g_root_submodel.is_recurrent_layer_group = False
     g_current_submodel = g_root_submodel
 
-    # for paddle on spark, need support non-file config.
-    # you can use parse_config like below:
-    #
-    # from paddle.trainer.config_parser import parse_config
-    # def configs():
-    #    #your paddle config code, which is same as config file.
-    #
-    # config = parse_config(configs, "is_predict=1")
-    # # then you get config proto object.
-    if hasattr(config_file, '__call__'):
-        config_file.func_globals.update(
+    if hasattr(trainer_config, '__call__'):
+        trainer_config.func_globals.update(
             make_config_environment("", config_args))
-        config_file()
+        trainer_config()
     else:
-        execfile(config_file, make_config_environment(config_file, config_args))
-    for k, v in settings.iteritems():
-        if v is None:
-            continue
-        g_config.opt_config.__setattr__(k, v)
-
-    for k, v in trainer_settings.iteritems():
-        if v is None:
-            continue
-        g_config.__setattr__(k, v)
+        execfile(trainer_config,
+                 make_config_environment(trainer_config, config_args))
 
-    for name in g_config.model_config.input_layer_names:
-        assert name in g_layer_map, \
-            'input name "%s" does not correspond to a layer name' % name
-        assert (g_layer_map[name].type == "data" or g_layer_map[name].type == "data_trim"), \
-            'The type of input layer "%s" is not "data"' % name
-    for name in g_config.model_config.output_layer_names:
-        assert name in g_layer_map, \
-            'input name "%s" does not correspond to a layer name' % name
-    return g_config
+    return update_g_config()
 
 
-def parse_config_and_serialize(config_file, config_arg_str):
+def parse_config_and_serialize(trainer_config, config_arg_str):
     try:
-        config = parse_config(config_file, config_arg_str)
+        config = parse_config(trainer_config, config_arg_str)
         #logger.info(config)
         return config.SerializeToString()
     except:
diff --git a/python/paddle/trainer_config_helpers/__init__.py b/python/paddle/trainer_config_helpers/__init__.py
index 0ff5edf825..13155ebddb 100644
--- a/python/paddle/trainer_config_helpers/__init__.py
+++ b/python/paddle/trainer_config_helpers/__init__.py
@@ -20,4 +20,6 @@ from layers import *
 from networks import *
 from optimizers import *
 from attrs import *
+from config_parser_utils import *
+# This will enable operator overload for LayerOutput
 import layer_math
diff --git a/python/paddle/trainer_config_helpers/config_parser.py b/python/paddle/trainer_config_helpers/config_parser.py
new file mode 100644
index 0000000000..4b91b8d282
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/config_parser.py
@@ -0,0 +1,38 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.trainer.config_parser as config_parser
+'''
+This file is a wrapper of formal config_parser. The main idea of this file is to 
+separete different config logic into different function, such as network configuration
+ and optimizer configuration.
+'''
+
+__all__ = [
+    "parse_trainer_config", "parse_network_config", "parse_optimizer_config"
+]
+
+
+def parse_trainer_config(trainer_conf, config_arg_str):
+    return config_parser.parse_config(trainer_conf, config_arg_str)
+
+
+def parse_network_config(network_conf):
+    config = config_parser.parse_config(network_conf, '')
+    return config.model_config
+
+
+def parse_optimizer_config(optimizer_conf):
+    config = config_parser.parse_config(optimizer_conf, '')
+    return config.opt_config
diff --git a/python/paddle/trainer_config_helpers/config_parser_utils.py b/python/paddle/trainer_config_helpers/config_parser_utils.py
new file mode 100644
index 0000000000..681b177a55
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/config_parser_utils.py
@@ -0,0 +1,38 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.trainer.config_parser as config_parser
+'''
+This file is a wrapper of formal config_parser. The main idea of this file is to 
+separete different config logic into different function, such as network configuration
+ and optimizer configuration.
+'''
+
+__all__ = [
+    "parse_trainer_config", "parse_network_config", "parse_optimizer_config"
+]
+
+
+def parse_trainer_config(trainer_conf, config_arg_str):
+    return config_parser.parse_config(trainer_conf, config_arg_str)
+
+
+def parse_network_config(network_conf, config_arg_str=''):
+    config = config_parser.parse_config(network_conf, config_arg_str)
+    return config.model_config
+
+
+def parse_optimizer_config(optimizer_conf, config_arg_str=''):
+    config = config_parser.parse_config(optimizer_conf, config_arg_str)
+    return config.opt_config
diff --git a/python/paddle/trainer_config_helpers/tests/CMakeLists.txt b/python/paddle/trainer_config_helpers/tests/CMakeLists.txt
index d1a9843d32..403aafabe9 100644
--- a/python/paddle/trainer_config_helpers/tests/CMakeLists.txt
+++ b/python/paddle/trainer_config_helpers/tests/CMakeLists.txt
@@ -1,12 +1,12 @@
 #################### test_config_parser #########################
 add_test(NAME layers_test
   COMMAND ${PROJ_ROOT}/paddle/.set_python_path.sh -d ${PROJ_ROOT}/python/
-        python ${PROJ_ROOT}/python/paddle/trainer_config_helpers/tests/layers_test.py
+        ${PYTHON_EXECUTABLE} ${PROJ_ROOT}/python/paddle/trainer_config_helpers/tests/layers_test.py
     WORKING_DIRECTORY ${PROJ_ROOT}/python/paddle)
 
 add_test(NAME test_reset_hook
   COMMAND ${PROJ_ROOT}/paddle/.set_python_path.sh -d ${PROJ_ROOT}/python/
-        python ${PROJ_ROOT}/python/paddle/trainer_config_helpers/tests/test_reset_hook.py
+        ${PYTHON_EXECUTABLE} ${PROJ_ROOT}/python/paddle/trainer_config_helpers/tests/test_reset_hook.py
     WORKING_DIRECTORY ${PROJ_ROOT}/python/paddle)
 
 if (PROTOBUF_3)
@@ -14,12 +14,12 @@ if (PROTOBUF_3)
     ProtobufEqualMain.cpp)
   add_test(NAME test_layerHelpers
     COMMAND
-    ${PROJ_ROOT}/python/paddle/trainer_config_helpers/tests/configs/run_tests.sh
+    ${PROJ_ROOT}/python/paddle/trainer_config_helpers/tests/configs/run_tests.sh ${PYTHON_EXECUTABLE}
     ${CMAKE_CURRENT_BINARY_DIR}/protobuf_equal
   )
 else()
   add_test(NAME test_layerHelpers
     COMMAND
-    ${PROJ_ROOT}/python/paddle/trainer_config_helpers/tests/configs/run_tests.sh
+    ${PROJ_ROOT}/python/paddle/trainer_config_helpers/tests/configs/run_tests.sh ${PYTHON_EXECUTABLE}
   )
 endif()
diff --git a/python/paddle/trainer_config_helpers/tests/configs/generate_protostr.sh b/python/paddle/trainer_config_helpers/tests/configs/generate_protostr.sh
index a54af94ce3..ee5961af75 100755
--- a/python/paddle/trainer_config_helpers/tests/configs/generate_protostr.sh
+++ b/python/paddle/trainer_config_helpers/tests/configs/generate_protostr.sh
@@ -10,13 +10,13 @@ protostr=$PWD/protostr
 for conf in ${configs[*]}
 do
     echo "Generating " $conf
-    python -m paddle.utils.dump_config $conf.py > $protostr/$conf.protostr.unittest
-    cat ${conf}.py |python test_config_parser_for_non_file_config.py > $protostr/$conf.protostr.non_file_config.unittest
+    $1 -m paddle.utils.dump_config $conf.py > $protostr/$conf.protostr.unittest
+    cat ${conf}.py |$1 test_config_parser_for_non_file_config.py > $protostr/$conf.protostr.non_file_config.unittest
 done
 
 for conf in ${whole_configs[*]}
 do
     echo "Generating " $conf
-    python -m paddle.utils.dump_config $conf.py "" --whole > $protostr/$conf.protostr.unittest
-    cat ${conf}.py |python test_config_parser_for_non_file_config.py --whole > $protostr/$conf.protostr.non_file_config.unittest
+    $1 -m paddle.utils.dump_config $conf.py "" --whole > $protostr/$conf.protostr.unittest
+    cat ${conf}.py |$1 test_config_parser_for_non_file_config.py --whole > $protostr/$conf.protostr.non_file_config.unittest
 done
diff --git a/python/paddle/trainer_config_helpers/tests/configs/run_tests.sh b/python/paddle/trainer_config_helpers/tests/configs/run_tests.sh
index e984ee7062..a37eb6439e 100755
--- a/python/paddle/trainer_config_helpers/tests/configs/run_tests.sh
+++ b/python/paddle/trainer_config_helpers/tests/configs/run_tests.sh
@@ -7,7 +7,7 @@ protostr=`dirname $0`/protostr
 
 files=`ls $protostr | grep -v "unittest"`
 
-./generate_protostr.sh
+./generate_protostr.sh $1
 
 . ./file_list.sh
 
diff --git a/python/setup.py.in b/python/setup.py.in
index d2fb95f27f..b66a42e87c 100644
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -1,16 +1,11 @@
 from setuptools import setup
 
-INTERNAL_PACKAGE='${PADDLE_INTERNAL_PACKAGE}'
-
 packages=['paddle',
           'paddle.proto',
           'paddle.trainer',
           'paddle.trainer_config_helpers',
           'paddle.utils']
 
-if len(INTERNAL_PACKAGE) != 0:
-    packages.append(INTERNAL_PACKAGE)
-
 setup(name='paddle',
       version='${PADDLE_VERSION}',
       description='Parallel Distributed Deep Learning',
diff --git a/warp-ctc b/warp-ctc
deleted file mode 160000
index bd535c8d44..0000000000
--- a/warp-ctc
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit bd535c8d44e03c8ebd2d768e06c8c05fdccd11d2