diff --git a/.gitignore b/.gitignore
index ee7c6ec370..2b30f7938c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,6 @@
 *.DS_Store
 build/
+build_doc/
 *.user
 
 .vscode
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 3402223b04..9b138576fc 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,15 +1,15 @@
 -   repo: https://github.com/Lucas-C/pre-commit-hooks.git
-    sha: c25201a00e6b0514370501050cf2a8538ac12270
+    sha: v1.0.1
     hooks:
     -   id: remove-crlf
         files: (?!.*third_party)^.*$ | (?!.*book)^.*$
 -   repo: https://github.com/reyoung/mirrors-yapf.git
     sha: v0.13.2
     hooks:
-    - id: yapf
-      files: (.*\.(py|bzl)|BUILD|.*\.BUILD|WORKSPACE)$
+    -   id: yapf
+        files: (.*\.(py|bzl)|BUILD|.*\.BUILD|WORKSPACE)$
 -   repo: https://github.com/pre-commit/pre-commit-hooks
-    sha: 7539d8bd1a00a3c1bfd34cdb606d3a6372e83469
+    sha: 5bf6c09bfa1297d3692cadd621ef95f1284e33c0
     hooks:
     -   id: check-added-large-files
     -   id: check-merge-conflict
diff --git a/.travis.yml b/.travis.yml
index 5a7f45a748..865e21f046 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -12,7 +12,6 @@ env:
   - JOB=DOCS
   - JOB=BUILD_AND_TEST
   - JOB=PRE_COMMIT
-
 addons:
   apt:
     packages:
@@ -49,8 +48,12 @@ before_install:
   # Paddle is using protobuf 3.1 currently. Protobuf 3.2 breaks the compatibility. So we specify the python 
   # protobuf version.
   - pip install numpy wheel 'protobuf==3.1' sphinx recommonmark sphinx-rtd-theme==0.1.9 virtualenv pre-commit requests==2.9.2 LinkChecker
+  - |
+    function timeout() { perl -e 'alarm shift; exec @ARGV' "$@"; }
 script:
-  - paddle/scripts/travis/main.sh
+  - | 
+    timeout 2580 paddle/scripts/travis/main.sh  # 43min timeout
+    RESULT=$?; if [ $RESULT -eq 0 ] || [ $RESULT -eq 142 ]; then true; else false; fi;
 notifications:
   email:
     on_success: change
diff --git a/AUTHORS.md b/AUTHORS.md
new file mode 100644
index 0000000000..d5baee2161
--- /dev/null
+++ b/AUTHORS.md
@@ -0,0 +1,28 @@
+| Github account | name |
+|---|---|
+| reyoung | Yang Yu |
+| gangliao | Gang Liao |
+| luotao01 | Tao Luo |
+| jacquesqiao | Long-Fei Qiao |
+| qingqing01 | Qing-Qing Dang |
+| hedaoyuan | Dao-Yuan He |
+| wangyang59 | Yang Wang |
+| QiJune | Jun Qi |
+| tianbingsz | Tian-Bing Xu |
+| cxwangyi, yiwangbaidu, wangkuiyi | Yi Wang |
+| typhoonzero | Yi Wu |
+| backyes | Yan-Fei Wang |
+| pengli09 | Peng Li |
+| livc | Zhao Li |
+| Xreki | Yi-Qun Liu |
+| Yancey1989 | Xu Yan |
+| emailweixu | Wei Xu |
+| wen-bo-yang | Wen-Bo Yang |
+| helinwang | He-Lin Wang |
+| lcy-seso | Ying Cao |
+| Zrachel | Rui-Qing Zhang |
+| Haichao-Zhang | Hai-Chao Zhang |
+| gongweibao | Wei-Bao Gong |
+| lzhao4ever | Liang Zhao |
+| zhouxiao-coder | Xiao Zhou |
+| lipeng-unisound | Peng Li |
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1a59db8c71..aa4f1eaff9 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,19 +1,19 @@
 # Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 # http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License
 
-set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake")
-set(PROJ_ROOT ${CMAKE_SOURCE_DIR})
+set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
+set(PROJ_ROOT ${CMAKE_CURRENT_SOURCE_DIR})
 
 include(system)
 
@@ -50,6 +50,7 @@ option(WITH_DOC         "Compile PaddlePaddle with documentation"       OFF)
 option(WITH_COVERAGE    "Compile PaddlePaddle with code coverage"       OFF)
 option(COVERALLS_UPLOAD "Package code coverage data to coveralls"       OFF)
 option(ON_TRAVIS        "Exclude special unit test on Travis CI"        OFF)
+option(WITH_C_API       "Compile PaddlePaddle with C-API(Prediction)"   OFF)
 
 # CMAKE_BUILD_TYPE
 if(NOT CMAKE_BUILD_TYPE)
@@ -75,6 +76,13 @@ endif(ANDROID)
 
 set(THIRD_PARTY_PATH "${PROJ_ROOT}/third_party" CACHE STRING
   "A path setting third party libraries download & build directories.")
+
+if (WITH_C_API AND WITH_PYTHON)  # warn only; this combination is still allowed
+  message(WARNING "Embedding a Python interpreter in Paddle is not recommended "
+    "when using the C-API. Behavior is unpredictable if the Python interpreter "
+    "used at runtime differs from the one used at compile time.")
+endif()
+
 ########################################################################################
 
 include(external/zlib)      # download, build, install zlib
diff --git a/Dockerfile b/Dockerfile
index 97947adf45..c3ad0c9c2f 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -13,7 +13,7 @@ ARG WITH_DOC
 ARG WITH_STYLE_CHECK
 
 ENV WOBOQ OFF
-ENV WITH_GPU=${WITH_AVX:-OFF}
+ENV WITH_GPU=${WITH_GPU:-OFF}
 ENV WITH_AVX=${WITH_AVX:-ON}
 ENV WITH_DOC=${WITH_DOC:-OFF}
 ENV WITH_STYLE_CHECK=${WITH_STYLE_CHECK:-OFF}
@@ -27,7 +27,7 @@ RUN apt-get update && \
     apt-get install -y wget unzip tar xz-utils bzip2 gzip coreutils && \
     apt-get install -y curl sed grep graphviz libjpeg-dev zlib1g-dev && \
     apt-get install -y python-numpy python-matplotlib gcc g++ gfortran && \
-    apt-get install -y automake locales clang-format-3.8 swig && \
+    apt-get install -y automake locales clang-format-3.8 swig doxygen && \
     apt-get clean -y
 
 # git credential to skip password typing
@@ -43,7 +43,13 @@ RUN pip install --upgrade pip && \
     pip install -U wheel pillow BeautifulSoup && \
     pip install -U docopt PyYAML sphinx && \
     pip install -U sphinx-rtd-theme==0.1.9 recommonmark && \
-    pip install pre-commit 'requests==2.9.2' 'ipykernel==4.6.0' 'jupyter==1.0.0'
+    pip install pre-commit 'requests==2.9.2' 'ipython==5.3.0' && \
+    pip install 'ipykernel==4.6.0' 'jupyter==1.0.0'
+
+# To fix https://github.com/PaddlePaddle/Paddle/issues/1954, we use
+# the solution in https://urllib3.readthedocs.io/en/latest/user-guide.html#ssl-py2
+RUN apt-get update && apt-get install -y libssl-dev libffi-dev && apt-get clean -y
+RUN pip install certifi 'urllib3[secure]'
 
 RUN curl -sSL https://cmake.org/files/v3.4/cmake-3.4.1.tar.gz | tar -xz && \
     cd cmake-3.4.1 && ./bootstrap && make -j `nproc` && make install && \
diff --git a/RELEASE.md b/RELEASE.md
index a8a245ab44..146f7afa7d 100644
--- a/RELEASE.md
+++ b/RELEASE.md
@@ -1,3 +1,104 @@
+# Release v0.10.0
+
+We are glad to release version 0.10.0.  In this version, we are happy to release the new 
+[Python API](http://research.baidu.com/paddlepaddles-new-api-simplifies-deep-learning-programs/).
+
+- Our old Python API is kind of out of date.  It's hard to learn and hard to
+  use.  To write a PaddlePaddle program using the old API, we'd have to write
+  at least two Python files: one `data provider` and another one that defines
+  the network topology.  Users start a PaddlePaddle job by running the
+  `paddle_trainer` C++ program, which calls the Python interpreter to run the
+  network topology configuration script and then starts the training loop,
+  which iteratively calls the data provider function to load minibatches.
+  This prevents us from writing a Python program in a modern way, e.g., in the
+  Jupyter Notebook.
+  
+- The new API, which we often refer to as the *v2 API*, allows us to write
+  much shorter Python programs to define the network and the data in a single
+  .py file.  Also, this program can run in Jupyter Notebook, since the entry
+  point is in the Python program and PaddlePaddle runs as a shared library loaded
+  and invoked by this Python program.
+  
+Based on the new API, we delivered an online interactive
+book, [Deep Learning 101](http://book.paddlepaddle.org/index.en.html)
+and [its Chinese version](http://book.paddlepaddle.org/).
+
+We also worked on updating our online documentation to describe the new API.
+But this is ongoing work.  We will release more documentation improvements
+in the next version.
+
+We also worked on bringing the new API to distributed model training (via MPI and
+Kubernetes).  This work is ongoing. We will release more about it in the next
+version.
+
+## New Features
+
+* We release [new Python API](http://research.baidu.com/paddlepaddles-new-api-simplifies-deep-learning-programs/).
+* Deep Learning 101 book in [English](http://book.paddlepaddle.org/index.en.html) and [Chinese](http://book.paddlepaddle.org/).
+* Support rectangle input for CNN.
+* Support stride pooling for seqlastin and seqfirstin.
+* Expose `seq_concat_layer/seq_reshape_layer` in `trainer_config_helpers`.
+* Add dataset package: CIFAR, MNIST, IMDB, WMT14, CONLL05, movielens, imikolov.
+* Add Priorbox layer for Single Shot Multibox Detection. 
+* Add smooth L1 cost.
+* Add data reader creator and data reader decorator for v2 API.
+* Add the CPU implementation of cmrnorm projection.
+
+## Improvements
+
+* Support Python virtualenv for `paddle_trainer`.
+* Add pre-commit hooks, used to automatically format our code.
+* Upgrade protobuf to version 3.x.
+* Add an option to check data type in Python data provider.
+* Speedup the backward of average layer on GPU.
+* Documentation refinement.
+* Check dead links in documents using Travis-CI.
+* Add an example for explaining `sparse_vector`.
+* Add ReLU in layer_math.py
+* Simplify data processing flow for Quick Start.
+* Support CUDNN Deconv.
+* Add data feeder in v2 API.
+* Support predicting the samples from sys.stdin for sentiment demo.
+* Provide multi-process interface for image preprocessing.
+* Add benchmark document for v1 API.
+* Add ReLU in `layer_math.py`.
+* Add packages for automatically downloading public datasets.
+* Rename `Argument::sumCost` to `Argument::sum` since class `Argument` has nothing to do with cost.
+* Expose Argument::sum to Python
+* Add a new `TensorExpression` implementation for matrix-related expression evaluations.
+* Add lazy assignment for optimizing the calculation of a batch of multiple expressions.
+* Add abstract class `Function` and its implementation:
+  * `PadFunc` and `PadGradFunc`.
+  * `ContextProjectionForwardFunc` and `ContextProjectionBackwardFunc`.
+  * `CosSimBackward` and `CosSimBackwardFunc`.
+  * `CrossMapNormalFunc` and `CrossMapNormalGradFunc`.
+  * `MulFunc`.
+* Add class `AutoCompare` and `FunctionCompare`, which make it easier to write unit tests for comparing gpu and cpu version of a function.
+* Generate `libpaddle_test_main.a` and remove the main function inside the test file.
+* Support dense numpy vector in PyDataProvider2.
+* Clean code base, remove some copy-n-pasted code snippets:
+  * Extract `RowBuffer` class for `SparseRowMatrix`.
+  * Clean the  interface of `GradientMachine`.
+  * Use `override` keyword in layer.
+  * Simplify `Evaluator::create`, use `ClassRegister` to create `Evaluator`s.
+* Check MD5 checksum when downloading demo's dataset.
+* Add `paddle::Error`, which is intended to replace `LOG(FATAL)` in Paddle.
+
+## Bug Fixes
+
+* Check layer input types for `recurrent_group`.
+* Don't run `clang-format` with .cu source files.
+* Fix bugs with `LogActivation`.
+* Fix the bug that runs `test_layerHelpers` multiple times.
+* Fix the bug that the seq2seq demo exceeds protobuf message size limit.
+* Fix the bug in dataprovider converter in GPU mode.
+* Fix a bug in `GatedRecurrentLayer`.
+* Fix bug for `BatchNorm` when testing more than one model.
+* Fix broken unit test of paramRelu.
+* Fix some compile-time warnings about `CpuSparseMatrix`.
+* Fix `MultiGradientMachine` error when `trainer_count > batch_size`.
+* Fix bugs that prevent asynchronous data loading in `PyDataProvider2`.
+
 # Release v0.9.0
 
 ## New Features:
diff --git a/authors b/authors
deleted file mode 100644
index daac4ec5d8..0000000000
--- a/authors
+++ /dev/null
@@ -1,56 +0,0 @@
-Cao, Ying
-Cheng, Yujuan
-Dang, Qingqing
-Dong, Tengfei
-Du, Dalong
-Feng, Shouqiang
-Gao, Haoyuan
-Han, Baochang
-Han, Jinchen
-Hao, Nanyu
-He, Daoyuan
-He, Zhengyan
-Hou, Jue
-Huang, Chang
-Huang, Zhiheng
-Hu, Na
-Kong, Qi
-Liao, Gang
-Li, Bo
-Li, Jiajie
-Li, Jing
-Li, Lei
-Li, Peng
-Liu, Sheng
-Liu, Yuan
-Li, Yuze
-Luo, Heng
-Luo, Tao
-Lyu, Qin
-Mao, Hongyue
-Qian, Xiaojun
-Qiao, Longfei
-Qi, Jun
-Qin, Duohao
-Shen, Guolong
-Shi, Guangchuan
-Song, Xiang
-Wang, Helin
-Wang, Jiang
-Wang, Yanfei
-Wang, Yi
-Wang, Yong
-Weng, Renliang
-Xu, Tianbing
-Xu, Wei
-Xu, Xingyu
-Yan, Chong
-Yan, Chunwei
-Yang, Yi
-Yu, Yang
-Yu, Yinan
-Zhang, Jian
-Zhang, Ruiqing
-Zhang, Weide
-Zhao, Liang
-Zhou, Jie
diff --git a/cmake/cblas.cmake b/cmake/cblas.cmake
index b8bf1bb07a..0918e6cc63 100644
--- a/cmake/cblas.cmake
+++ b/cmake/cblas.cmake
@@ -5,7 +5,7 @@
 # If any cblas implementation found, the following variable will be set.
 #    CBLAS_PROVIDER  # one of MKL, ATLAS, OPENBLAS, REFERENCE
 #    CBLAS_INC_DIR   # the include directory for cblas.
-#    CBLAS_LIBS      # a list of libraries should be linked by paddle. 
+#    CBLAS_LIBS      # a list of libraries should be linked by paddle.
 #                    # Each library should be full path to object file.
 #
 # User should set one of MKL_ROOT, ATLAS_ROOT, OPENBLAS_ROOT, REFERENCE_CBLAS_ROOT
@@ -44,7 +44,6 @@ if(MKL_INC_DIR AND MKL_CORE_LIB AND MKL_SEQUENTIAL_LIB AND MKL_INTEL_LP64)
   message(STATUS "Found MKL (include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})")
   set(CBLAS_FOUND ON)
   if(${MKL_LAPACK_INC_DIR})
-    add_definitions(-DPADDLE_USE_LAPACK)
     message(STATUS "Found lapack in MKL (include: ${MKL_LAPACK_INC_DIR})")
   endif()
   return() # return file.
@@ -63,11 +62,11 @@ set(ATLAS_LIB_SEARCH_PATHS
         /usr/lib/atlas
         /usr/lib/atlas-base   # special for ubuntu 14.04.
     )
-find_path(ATLAS_INC_DIR NAMES cblas.h 
+find_path(ATLAS_INC_DIR NAMES cblas.h
   PATHS ${ATLAS_INCLUDE_SEARCH_PATHS})
 find_path(ATLAS_CLAPACK_INC_DIR NAMES clapack.h
   PATHS ${ATLAS_INCLUDE_SEARCH_PATHS})
-find_library(ATLAS_CBLAS_LIB NAMES cblas libcblas.so.3 
+find_library(ATLAS_CBLAS_LIB NAMES cblas libcblas.so.3
   PATHS ${ATLAS_LIB_SEARCH_PATHS})
 find_library(ATLAS_LIB NAMES lapack_atlas liblapack_atlas.so.3
   PATHS ${ATLAS_LIB_SEARCH_PATHS})
@@ -76,11 +75,11 @@ if(ATLAS_INC_DIR AND ATLAS_CBLAS_LIB AND ATLAS_LIB AND NOT CBLAS_FOUND)
   set(CBLAS_PROVIDER ATLAS)
   set(CBLAS_INC_DIR ${ATLAS_INC_DIR})
   set(CBLAS_LIBRARIES ${ATLAS_LIB} ${ATLAS_CBLAS_LIB})
-  add_definitions(-DPADDLE_USE_ATLAS)  
+  add_definitions(-DPADDLE_USE_ATLAS)
   message(STATUS "Found ATLAS (include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})")
   set(CBLAS_FOUND ON)
   if(ATLAS_CLAPACK_INC_DIR)
-    add_definitions(-DPADDLE_USE_LAPACK)
+    set(CBLAS_INC_DIR ${CBLAS_INC_DIR} ${ATLAS_CLAPACK_INC_DIR})
     message(STATUS "Found lapack in ATLAS (include: ${ATLAS_CLAPACK_INC_DIR})")
   endif()
   return()
@@ -114,7 +113,6 @@ if(OPENBLAS_INC_DIR AND OPENBLAS_LIB)
   message(STATUS "Found OpenBLAS (include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})")
   set(CBLAS_FOUND ON)
   if(OPENBLAS_LAPACKE_INC_DIR)
-    add_definitions(-DPADDLE_USE_LAPACK)
     message(STATUS "Found lapack in OpenBLAS (include: ${OPENBLAS_LAPACKE_INC_DIR})")
   endif()
   return()
@@ -124,7 +122,7 @@ endif()
 ## Then find the reference-cblas.  www.netlib.org/blas/
 
 
-set(REFERENCE_CBLAS_ROOT $ENV{REFERENCE_CBLAS_ROOT} CACHE PATH 
+set(REFERENCE_CBLAS_ROOT $ENV{REFERENCE_CBLAS_ROOT} CACHE PATH
   "Folder contains reference-cblas")
 set(REFERENCE_CBLAS_INCLUDE_SEARCH_PATHS
   ${REFERENCE_CBLAS_ROOT}/include
diff --git a/cmake/cpplint.cmake b/cmake/cpplint.cmake
index 38c636b30e..02a5c0b2c9 100644
--- a/cmake/cpplint.cmake
+++ b/cmake/cpplint.cmake
@@ -34,7 +34,7 @@ set(IGNORE_PATTERN
 #
 # first argument: target name to attach
 # rest arguments: source list to check code style.
-# 
+#
 # NOTE: If WITH_STYLE_CHECK is OFF, then this macro just do nothing.
 macro(add_style_check_target TARGET_NAME)
     if(WITH_STYLE_CHECK)
@@ -48,13 +48,17 @@ macro(add_style_check_target TARGET_NAME)
                 if(filename MATCHES ${pattern})
                     message(STATUS "DROP LINT ${filename}")
                     set(LINT OFF)
-                endif() 
+                endif()
             endforeach()
             if(LINT MATCHES ON)
-                add_custom_command(TARGET ${TARGET_NAME}
+                get_filename_component(base_filename ${filename} NAME)
+                set(CUR_GEN ${CMAKE_CURRENT_BINARY_DIR}/${base_filename}.cpplint)
+                add_custom_command(OUTPUT ${CUR_GEN}
                     PRE_BUILD
                     COMMAND env ${py_env} "${PYTHON_EXECUTABLE}" "${PROJ_ROOT}/paddle/scripts/cpplint.py"
-                                "--filter=${STYLE_FILTER}" ${filename}
+                                "--filter=${STYLE_FILTER}"
+                                "--write-success=${CUR_GEN}" ${filename}
+                    DEPENDS ${filename}
                     WORKING_DIRECTORY ${CMAKE_CURRENT_LIST_DIR})
             endif()
         endforeach()
diff --git a/cmake/external/openblas.cmake b/cmake/external/openblas.cmake
index 92ea23c763..46398b22c2 100644
--- a/cmake/external/openblas.cmake
+++ b/cmake/external/openblas.cmake
@@ -27,35 +27,6 @@ IF(NOT ${CBLAS_FOUND})
         SET(CBLAS_LIBRARIES "${CBLAS_INSTALL_DIR}/lib/libopenblas.a" CACHE FILEPATH "openblas library" FORCE)
     ENDIF(WIN32)
 
-    IF(CMAKE_COMPILER_IS_GNUCC)
-        ENABLE_LANGUAGE(Fortran)
-        if (NOT CMAKE_Fortran_COMPILER_VERSION)
-          # cmake < 3.4 cannot get CMAKE_Fortran_COMPILER_VERSION directly.
-          execute_process(COMMAND ${CMAKE_Fortran_COMPILER} -dumpversion
-                    OUTPUT_VARIABLE CMAKE_Fortran_COMPILER_VERSION)
-        endif()
-        string(REGEX MATCHALL "[0-9]+" Fortran_VERSION ${CMAKE_Fortran_COMPILER_VERSION})
-        list(GET Fortran_VERSION 0 Fortran_MAJOR)
-        list(GET Fortran_VERSION 1 Fortran_MINOR)
-        find_library(GFORTRAN_LIBRARY NAMES gfortran PATHS 
-                     /lib
-                     /usr/lib
-                     /usr/lib/gcc/x86_64-linux-gnu/${Fortran_MAJOR}.${Fortran_MINOR}/
-                     /usr/lib/gcc/x86_64-linux-gnu/${Fortran_MAJOR}/)
-        if (NOT GFORTRAN_LIBRARY)
-            message(FATAL_ERROR "Cannot found gfortran library which it is used by openblas")
-        endif()
-        find_package(Threads REQUIRED)
-        LIST(APPEND CBLAS_LIBRARIES ${GFORTRAN_LIBRARY} ${CMAKE_THREAD_LIBS_INIT})
-    ENDIF(CMAKE_COMPILER_IS_GNUCC)
-
-    IF(NOT CMAKE_Fortran_COMPILER)
-        MESSAGE(FATAL_ERROR "To build lapack in libopenblas, "
-                "you need to set gfortran compiler: cmake .. -DCMAKE_Fortran_COMPILER=...")
-    ENDIF(NOT CMAKE_Fortran_COMPILER)
-
-    ADD_DEFINITIONS(-DPADDLE_USE_LAPACK)
-
     ExternalProject_Add(
         openblas
         ${EXTERNAL_PROJECT_LOG_ARGS}
@@ -64,7 +35,7 @@ IF(NOT ${CBLAS_FOUND})
         PREFIX              ${CBLAS_SOURCES_DIR}
         INSTALL_DIR         ${CBLAS_INSTALL_DIR}
         BUILD_IN_SOURCE     1
-        BUILD_COMMAND       ${CMAKE_MAKE_PROGRAM} FC=${CMAKE_Fortran_COMPILER} CC=${CMAKE_C_COMPILER} HOSTCC=${CMAKE_C_COMPILER} DYNAMIC_ARCH=1 NO_SHARED=1 libs netlib
+        BUILD_COMMAND       ${CMAKE_MAKE_PROGRAM} FC=${CMAKE_Fortran_COMPILER} CC=${CMAKE_C_COMPILER} HOSTCC=${CMAKE_C_COMPILER} NO_LAPACK=1 DYNAMIC_ARCH=1 NO_SHARED=1 libs netlib
         INSTALL_COMMAND     ${CMAKE_MAKE_PROGRAM} install NO_SHARED=1 PREFIX=<INSTALL_DIR>
         UPDATE_COMMAND      ""
         CONFIGURE_COMMAND   ""
diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake
index 2df042d226..a9db4e8ba4 100644
--- a/cmake/external/protobuf.cmake
+++ b/cmake/external/protobuf.cmake
@@ -1,11 +1,11 @@
 # Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 # http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -20,7 +20,7 @@ FIND_PACKAGE(Protobuf ${PROTOBUF_VERSION})
 IF(PROTOBUF_FOUND)
     EXEC_PROGRAM(${PROTOBUF_PROTOC_EXECUTABLE} ARGS --version OUTPUT_VARIABLE PROTOBUF_VERSION)
     STRING(REGEX MATCH "[0-9]+.[0-9]+" PROTOBUF_VERSION "${PROTOBUF_VERSION}")
-    IF (${PROTOBUF_VERSION} VERSION_LESS "3.1.0")
+    IF ("${PROTOBUF_VERSION}" VERSION_LESS "3.1.0")
         SET(PROTOBUF_FOUND OFF)
     ENDIF()
 ENDIF(PROTOBUF_FOUND)
diff --git a/cmake/flags.cmake b/cmake/flags.cmake
index 7eb92efcb0..7a996dea92 100644
--- a/cmake/flags.cmake
+++ b/cmake/flags.cmake
@@ -197,3 +197,4 @@ if(CUDA_ARCH)
 endif()
 
 set(CUDA_NVCC_FLAGS ${__arch_flags} ${CUDA_NVCC_FLAGS})
+
diff --git a/cmake/package.cmake b/cmake/package.cmake
index 211593f358..ff49a2d08e 100644
--- a/cmake/package.cmake
+++ b/cmake/package.cmake
@@ -1,5 +1,4 @@
 set(CPACK_PACKAGE_NAME paddle)
-set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "")
 set(CPACK_PACKAGE_VERSION_MAJOR ${PADDLE_MAJOR_VERSION})
 set(CPACK_PACKAGE_VERSION_MINOR ${PADDLE_MINOR_VERSION})
 set(CPACK_PACKAGE_VERSION_PATCH ${PADDLE_PATCH_VERSION})
@@ -10,8 +9,9 @@ set(CPACK_DEBIAN_PACKAGE_ARCHITECTURE amd64)
 set(CPACK_DEBIAN_PACKAGE_MAINTAINER PaddlePaddle Dev <paddle-dev@baidu.com>)
 set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "Paddle")
 set(CPACK_PACKAGE_DESCRIPTION "")
-set(CPACK_DEBIAN_PACKAGE_DEPENDS "libatlas3-base, libgflags2, libgoogle-glog0, libprotobuf8, libpython2.7, libstdc++6, python-numpy, python-pip, python-pip-whl, python-protobuf")
+set(CPACK_DEBIAN_PACKAGE_DEPENDS "libpython2.7-dev, libstdc++6, python-pip, curl, libgfortran3, python-pip-whl")
 set(CPACK_DEBIAN_PACKAGE_SECTION Devel)
+set(CPACK_DEBIAN_PACKAGE_VERSION ${PADDLE_VERSION})
 set(CPACK_DEBIAN_PACKAGE_CONTROL_EXTRA "${PROJ_ROOT}/paddle/scripts/deb/postinst")
 #set(CPACK_GENERATOR "DEB")
 # Start cpack
diff --git a/cmake/system.cmake b/cmake/system.cmake
index 3ca06665ab..75a9d8fc25 100644
--- a/cmake/system.cmake
+++ b/cmake/system.cmake
@@ -28,6 +28,11 @@ ELSE(WIN32)
         STRING(REGEX MATCH "[0-9]+.[0-9]+" VERSION "${MACOSX_VERSION}")
         SET(MACOS_VERSION ${VERSION})
         SET(HOST_SYSTEM "macosx")
+        IF(NOT DEFINED ENV{MACOSX_DEPLOYMENT_TARGET})
+            # Set cache variable - end user may change this during ccmake or cmake-gui configure.
+            SET(CMAKE_OSX_DEPLOYMENT_TARGET ${MACOS_VERSION} CACHE STRING
+                "Minimum OS X version to target for deployment (at runtime); newer APIs weak linked. Set to empty string for default value.")
+        ENDIF()
     ELSE(APPLE)
 
         IF(EXISTS "/etc/issue")
diff --git a/demo/sentiment/trainer_config.py b/demo/sentiment/trainer_config.py
index 2defecd178..f1cadaa728 100644
--- a/demo/sentiment/trainer_config.py
+++ b/demo/sentiment/trainer_config.py
@@ -29,7 +29,7 @@ settings(
     batch_size=128,
     learning_rate=2e-3,
     learning_method=AdamOptimizer(),
-    average_window=0.5,
+    model_average=ModelAverage(0.5),
     regularization=L2Regularization(8e-4),
     gradient_clipping_threshold=25)
 
diff --git a/demo/seqToseq/seqToseq_net.py b/demo/seqToseq/seqToseq_net.py
index e523a34d5a..3d1f86ec3b 100644
--- a/demo/seqToseq/seqToseq_net.py
+++ b/demo/seqToseq/seqToseq_net.py
@@ -69,7 +69,8 @@ def gru_encoder_decoder(data_conf,
                         encoder_size=512,
                         decoder_size=512,
                         beam_size=3,
-                        max_length=250):
+                        max_length=250,
+                        error_clipping=50):
     """
     A wrapper for an attention version of GRU Encoder-Decoder network
     is_generating: whether this config is used for generating
@@ -90,9 +91,19 @@ def gru_encoder_decoder(data_conf,
         input=src_word_id,
         size=word_vector_dim,
         param_attr=ParamAttr(name='_source_language_embedding'))
-    src_forward = simple_gru(input=src_embedding, size=encoder_size)
+    src_forward = simple_gru(
+        input=src_embedding,
+        size=encoder_size,
+        naive=True,
+        gru_layer_attr=ExtraLayerAttribute(
+            error_clipping_threshold=error_clipping))
     src_backward = simple_gru(
-        input=src_embedding, size=encoder_size, reverse=True)
+        input=src_embedding,
+        size=encoder_size,
+        reverse=True,
+        naive=True,
+        gru_layer_attr=ExtraLayerAttribute(
+            error_clipping_threshold=error_clipping))
     encoded_vector = concat_layer(input=[src_forward, src_backward])
 
     with mixed_layer(size=decoder_size) as encoded_proj:
@@ -117,11 +128,13 @@ def gru_encoder_decoder(data_conf,
             decoder_inputs += full_matrix_projection(input=context)
             decoder_inputs += full_matrix_projection(input=current_word)
 
-        gru_step = gru_step_layer(
+        gru_step = gru_step_naive_layer(
             name='gru_decoder',
             input=decoder_inputs,
             output_mem=decoder_mem,
-            size=decoder_size)
+            size=decoder_size,
+            layer_attr=ExtraLayerAttribute(
+                error_clipping_threshold=error_clipping))
 
         with mixed_layer(
                 size=target_dict_dim, bias_attr=True,
diff --git a/demo/sequence_tagging/linear_crf.py b/demo/sequence_tagging/linear_crf.py
index 0624b17787..ea012ba1ae 100644
--- a/demo/sequence_tagging/linear_crf.py
+++ b/demo/sequence_tagging/linear_crf.py
@@ -27,7 +27,7 @@ settings(
     learning_method=MomentumOptimizer(),
     batch_size=batch_size,
     regularization=L2Regularization(batch_size * 1e-4),
-    average_window=0.5,
+    model_average=ModelAverage(0.5),
     learning_rate=1e-1,
     learning_rate_decay_a=1e-5,
     learning_rate_decay_b=0.25, )
diff --git a/demo/sequence_tagging/rnn_crf.py b/demo/sequence_tagging/rnn_crf.py
index b9b41b2433..937a34df10 100644
--- a/demo/sequence_tagging/rnn_crf.py
+++ b/demo/sequence_tagging/rnn_crf.py
@@ -27,7 +27,7 @@ settings(
     learning_method=MomentumOptimizer(),
     batch_size=batch_size,
     regularization=L2Regularization(batch_size * 1e-5),
-    average_window=0.5,
+    model_average=ModelAverage(0.5),
     learning_rate=2e-3,
     learning_rate_decay_a=5e-7,
     learning_rate_decay_b=0.5, )
diff --git a/demo/word2vec/train_v2.py b/demo/word2vec/api_train_v2.py
similarity index 76%
rename from demo/word2vec/train_v2.py
rename to demo/word2vec/api_train_v2.py
index 7d952b446f..c0940f0e56 100644
--- a/demo/word2vec/train_v2.py
+++ b/demo/word2vec/api_train_v2.py
@@ -1,27 +1,40 @@
+import gzip
 import math
 
 import paddle.v2 as paddle
 
-dictsize = 1953
 embsize = 32
 hiddensize = 256
 N = 5
 
 
 def wordemb(inlayer):
-    wordemb = paddle.layer.table_projection(
+    wordemb = paddle.layer.embedding(
         input=inlayer,
         size=embsize,
         param_attr=paddle.attr.Param(
             name="_proj",
             initial_std=0.001,
             learning_rate=1,
-            l2_rate=0, ))
+            l2_rate=0,
+            sparse_update=True))
     return wordemb
 
 
 def main():
-    paddle.init(use_gpu=False, trainer_count=1)
+    # for local training
+    cluster_train = False
+
+    if not cluster_train:
+        paddle.init(use_gpu=False, trainer_count=1)
+    else:
+        paddle.init(
+            use_gpu=False,
+            trainer_count=2,
+            port=7164,
+            ports_num=1,
+            ports_num_for_sparse=1,
+            num_gradient_servers=1)
     word_dict = paddle.dataset.imikolov.build_dict()
     dict_size = len(word_dict)
     firstword = paddle.layer.data(
@@ -57,6 +70,9 @@ def main():
     def event_handler(event):
         if isinstance(event, paddle.event.EndIteration):
             if event.batch_id % 100 == 0:
+                with gzip.open("batch-" + str(event.batch_id) + ".tar.gz",
+                               'w') as f:
+                    trainer.save_parameter_to_tar(f)
                 result = trainer.test(
                     paddle.batch(
                         paddle.dataset.imikolov.test(word_dict, N), 32))
@@ -65,11 +81,15 @@ def main():
                     result.metrics)
 
     cost = paddle.layer.classification_cost(input=predictword, label=nextword)
+
     parameters = paddle.parameters.create(cost)
-    adam_optimizer = paddle.optimizer.Adam(
+    adagrad = paddle.optimizer.AdaGrad(
         learning_rate=3e-3,
         regularization=paddle.optimizer.L2Regularization(8e-4))
-    trainer = paddle.trainer.SGD(cost, parameters, adam_optimizer)
+    trainer = paddle.trainer.SGD(cost,
+                                 parameters,
+                                 adagrad,
+                                 is_local=not cluster_train)
     trainer.train(
         paddle.batch(paddle.dataset.imikolov.train(word_dict, N), 32),
         num_passes=30,
diff --git a/doc/api/v1/trainer_config_helpers/layers.rst b/doc/api/v1/trainer_config_helpers/layers.rst
index 24389c2d85..75c1b35246 100644
--- a/doc/api/v1/trainer_config_helpers/layers.rst
+++ b/doc/api/v1/trainer_config_helpers/layers.rst
@@ -498,6 +498,12 @@ hsigmoid
     :members: hsigmoid
     :noindex:
 
+smooth_l1_cost
+--------------
+..  automodule:: paddle.trainer_config_helpers.layers
+    :members: smooth_l1_cost
+    :noindex:
+
 Check Layer 
 ============
 
diff --git a/doc/api/v2/config/layer.rst b/doc/api/v2/config/layer.rst
index 05817ec854..154cfe2443 100644
--- a/doc/api/v2/config/layer.rst
+++ b/doc/api/v2/config/layer.rst
@@ -11,8 +11,7 @@ Data layer
 
 data
 ----
-..  automodule:: paddle.v2.layer
-    :members: data
+..  autoclass:: paddle.v2.layer.data
     :noindex:
 
 Fully Connected Layers
@@ -22,14 +21,12 @@ Fully Connected Layers
 
 fc
 --
-..  automodule:: paddle.v2.layer
-    :members: fc
+..  autoclass:: paddle.v2.layer.fc
     :noindex:
 
 selective_fc
 ------------
-..  automodule:: paddle.v2.layer
-    :members: selective_fc
+..  autoclass:: paddle.v2.layer.selective_fc
     :noindex:
 
 Conv Layers
@@ -37,34 +34,29 @@ Conv Layers
 
 conv_operator
 -------------
-..  automodule:: paddle.v2.layer
-    :members: conv_operator
+..  autoclass:: paddle.v2.layer.conv_operator
     :noindex:
 
 conv_projection
 ---------------
-..  automodule:: paddle.v2.layer
-    :members: conv_projection
+..  autoclass:: paddle.v2.layer.conv_projection
     :noindex:
 
 conv_shift
 ----------
-..  automodule:: paddle.v2.layer
-    :members: conv_shift
+..  autoclass:: paddle.v2.layer.conv_shift
     :noindex:
 
 img_conv
 --------
-..  automodule:: paddle.v2.layer
-    :members: img_conv
+..  autoclass:: paddle.v2.layer.img_conv
     :noindex:
 
 ..  _api_v2.layer_context_projection:
 
 context_projection 
 ------------------
-..  automodule:: paddle.v2.layer
-    :members: context_projection
+..  autoclass:: paddle.v2.layer.context_projection
     :noindex:
 
 Image Pooling Layer
@@ -72,20 +64,17 @@ Image Pooling Layer
 
 img_pool
 --------
-..  automodule:: paddle.v2.layer
-    :members: img_pool
+..  autoclass:: paddle.v2.layer.img_pool
     :noindex:   
 
 spp
 ---
-..  automodule:: paddle.v2.layer
-    :members: spp
+..  autoclass:: paddle.v2.layer.spp
     :noindex:
 
 maxout
 ------
-..  automodule:: paddle.v2.layer
-    :members: maxout
+..  autoclass:: paddle.v2.layer.maxout
     :noindex:
 
 Norm Layer
@@ -93,26 +82,22 @@ Norm Layer
 
 img_cmrnorm
 -----------
-..  automodule:: paddle.v2.layer
-    :members: img_cmrnorm
+..  autoclass:: paddle.v2.layer.img_cmrnorm
     :noindex:
 
 batch_norm
 ----------
-..  automodule:: paddle.v2.layer
-    :members: batch_norm
+..  autoclass:: paddle.v2.layer.batch_norm
     :noindex:
 
 sum_to_one_norm
 ---------------
-..  automodule:: paddle.v2.layer
-    :members: sum_to_one_norm
+..  autoclass:: paddle.v2.layer.sum_to_one_norm
     :noindex:
     
 cross_channel_norm
 ------------------
-..  automodule:: paddle.v2.layer
-    :members: cross_channel_norm
+..  autoclass:: paddle.v2.layer.cross_channel_norm
     :noindex:
     
 Recurrent Layers
@@ -120,20 +105,17 @@ Recurrent Layers
 
 recurrent
 ---------
-..  automodule:: paddle.v2.layer
-    :members: recurrent
+..  autoclass:: paddle.v2.layer.recurrent
     :noindex:
 
 lstmemory
 ---------
-..  automodule:: paddle.v2.layer
-    :members: lstmemory
+..  autoclass:: paddle.v2.layer.lstmemory
     :noindex:
 
 grumemory
 ---------
-..  automodule:: paddle.v2.layer
-    :members: grumemory
+..  autoclass:: paddle.v2.layer.grumemory
     :noindex:
 
 Recurrent Layer Group
@@ -141,38 +123,32 @@ Recurrent Layer Group
 
 memory
 ------
-..  automodule:: paddle.v2.layer
-    :members: memory
+..  autoclass:: paddle.v2.layer.memory
     :noindex:
 
 recurrent_group
 ---------------
-..  automodule:: paddle.v2.layer
-    :members: recurrent_group
+..  autoclass:: paddle.v2.layer.recurrent_group
     :noindex:
     
 lstm_step
 ---------
-..  automodule:: paddle.v2.layer
-    :members: lstm_step
+..  autoclass:: paddle.v2.layer.lstm_step
     :noindex:
 
 gru_step
 --------
-..  automodule:: paddle.v2.layer
-    :members: gru_step
+..  autoclass:: paddle.v2.layer.gru_step
     :noindex:
 
 beam_search
 ------------
-..  automodule:: paddle.v2.layer
-    :members: beam_search
+..  autoclass:: paddle.v2.layer.beam_search
     :noindex:
     
 get_output
 ----------
-..  automodule:: paddle.v2.layer
-    :members: get_output
+..  autoclass:: paddle.v2.layer.get_output
     :noindex:
     
 Mixed Layer
@@ -182,59 +158,50 @@ Mixed Layer
 
 mixed
 -----
-..  automodule:: paddle.v2.layer
-    :members: mixed
+..  autoclass:: paddle.v2.layer.mixed
     :noindex:
 
 ..  _api_v2.layer_embedding:
 
 embedding
 ---------
-..  automodule:: paddle.v2.layer
-    :members: embedding
+..  autoclass:: paddle.v2.layer.embedding
     :noindex:
 
 scaling_projection
 ------------------
-..  automodule:: paddle.v2.layer
-    :members: scaling_projection
+..  autoclass:: paddle.v2.layer.scaling_projection
     :noindex:
 
 dotmul_projection
 -----------------
-..  automodule:: paddle.v2.layer
-    :members: dotmul_projection
+..  autoclass:: paddle.v2.layer.dotmul_projection
     :noindex:
 
 dotmul_operator
 ---------------
-..  automodule:: paddle.v2.layer
-    :members: dotmul_operator
+..  autoclass:: paddle.v2.layer.dotmul_operator
     :noindex:
 
 full_matrix_projection
 ----------------------
-..  automodule:: paddle.v2.layer
-    :members: full_matrix_projection
+..  autoclass:: paddle.v2.layer.full_matrix_projection
     :noindex:
 
 identity_projection
 -------------------
-..  automodule:: paddle.v2.layer
-    :members: identity_projection
+..  autoclass:: paddle.v2.layer.identity_projection
     :noindex:
 
 
 table_projection
 ----------------
-..  automodule:: paddle.v2.layer
-    :members: table_projection
+..  autoclass:: paddle.v2.layer.table_projection
     :noindex:
 
 trans_full_matrix_projection
 ----------------------------
-..  automodule:: paddle.v2.layer
-    :members: trans_full_matrix_projection
+..  autoclass:: paddle.v2.layer.trans_full_matrix_projection
     :noindex:
     
 Aggregate Layers
@@ -244,36 +211,31 @@ Aggregate Layers
 
 pooling
 -------
-..  automodule:: paddle.v2.layer
-    :members: pooling
+..  autoclass:: paddle.v2.layer.pooling
     :noindex:
 
 ..  _api_v2.layer_last_seq:
 
 last_seq
 --------
-..  automodule:: paddle.v2.layer
-    :members: last_seq
+..  autoclass:: paddle.v2.layer.last_seq
     :noindex:
 
 ..  _api_v2.layer_first_seq:
 
 first_seq
 ---------
-..  automodule:: paddle.v2.layer
-    :members: first_seq
+..  autoclass:: paddle.v2.layer.first_seq
     :noindex:
 
 concat
 ------
-..  automodule:: paddle.v2.layer
-    :members: concat
+..  autoclass:: paddle.v2.layer.concat
     :noindex:
 
 seq_concat
 ----------
-..  automodule:: paddle.v2.layer
-    :members: seq_concat
+..  autoclass:: paddle.v2.layer.seq_concat
     :noindex:
 
 Reshaping Layers
@@ -281,34 +243,29 @@ Reshaping Layers
 
 block_expand
 ------------
-..  automodule:: paddle.v2.layer
-    :members: block_expand
+..  autoclass:: paddle.v2.layer.block_expand
     :noindex:
 
 ..  _api_v2.layer_expand:
 
 expand
 ------
-..  automodule:: paddle.v2.layer
-    :members: expand
+..  autoclass:: paddle.v2.layer.expand
     :noindex:
 
 repeat
 ------
-..  automodule:: paddle.v2.layer
-    :members: repeat
+..  autoclass:: paddle.v2.layer.repeat
     :noindex:
 
 rotate
 ------
-..  automodule:: paddle.v2.layer
-    :members: rotate
+..  autoclass:: paddle.v2.layer.rotate
     :noindex:
 
 seq_reshape
 -----------
-..  automodule:: paddle.v2.layer
-    :members: seq_reshape
+..  autoclass:: paddle.v2.layer.seq_reshape
     :noindex:
 
 Math Layers
@@ -316,64 +273,54 @@ Math Layers
 
 addto
 -----
-..  automodule:: paddle.v2.layer
-    :members: addto
+..  autoclass:: paddle.v2.layer.addto
     :noindex:
 
 linear_comb
 -----------
-..  automodule:: paddle.v2.layer
-    :members: linear_comb
+..  autoclass:: paddle.v2.layer.linear_comb
     :noindex:
 
 interpolation
 -------------
-..  automodule:: paddle.v2.layer
-    :members: interpolation
+..  autoclass:: paddle.v2.layer.interpolation
     :noindex:
 
 bilinear_interp
 ---------------
-..  automodule:: paddle.v2.layer
-    :members: bilinear_interp
+..  autoclass:: paddle.v2.layer.bilinear_interp
     :noindex:
 
 power
 -----
-..  automodule:: paddle.v2.layer
-    :members: power
+..  autoclass:: paddle.v2.layer.power
     :noindex:
 
 scaling
 -------
-..  automodule:: paddle.v2.layer
-    :members: scaling
+..  autoclass:: paddle.v2.layer.scaling
     :noindex:
 
 slope_intercept
 ---------------
-..  automodule:: paddle.v2.layer
-    :members: slope_intercept
+..  autoclass:: paddle.v2.layer.slope_intercept
     :noindex:
 
 tensor
 ------
-..  automodule:: paddle.v2.layer
-    :members: tensor
+..  autoclass:: paddle.v2.layer.tensor
     :noindex:
 
 ..  _api_v2.layer_cos_sim:
 
 cos_sim
 -------
-..  automodule:: paddle.v2.layer
-    :members: cos_sim
+..  autoclass:: paddle.v2.layer.cos_sim
     :noindex:
 
 trans
 -----
-..  automodule:: paddle.v2.layer
-    :members: trans
+..  autoclass:: paddle.v2.layer.trans
     :noindex:
 
 Sampling Layers
@@ -381,14 +328,12 @@ Sampling Layers
 
 maxid
 -----
-..  automodule:: paddle.v2.layer
-    :members: maxid
+..  autoclass:: paddle.v2.layer.max_id
     :noindex:
 
 sampling_id
 -----------
-..  automodule:: paddle.v2.layer
-    :members: sampling_id
+..  autoclass:: paddle.v2.layer.sampling_id
     :noindex:
 
 Slicing and Joining Layers
@@ -396,8 +341,7 @@ Slicing and Joining Layers
 
 pad
 ----
-..  automodule:: paddle.v2.layer
-    :members: pad
+..  autoclass:: paddle.v2.layer.pad
     :noindex:
 
 ..  _api_v2.layer_costs:
@@ -407,80 +351,77 @@ Cost Layers
 
 cross_entropy_cost
 ------------------
-..  automodule:: paddle.v2.layer
-    :members: cross_entropy_cost
+..  autoclass:: paddle.v2.layer.cross_entropy_cost
     :noindex:
 
 cross_entropy_with_selfnorm_cost
 --------------------------------
-..  automodule:: paddle.v2.layer
-    :members: cross_entropy_with_selfnorm_cost
+..  autoclass:: paddle.v2.layer.cross_entropy_with_selfnorm_cost
     :noindex:
 
 multi_binary_label_cross_entropy_cost
 -------------------------------------
-..  automodule:: paddle.v2.layer
-    :members: multi_binary_label_cross_entropy_cost
+..  autoclass:: paddle.v2.layer.multi_binary_label_cross_entropy_cost
     :noindex:
 
 huber_cost
 ----------
-..  automodule:: paddle.v2.layer
-    :members: huber_cost
+..  autoclass:: paddle.v2.layer.huber_cost
     :noindex:
 
 lambda_cost
 -----------
-..  automodule:: paddle.v2.layer
-    :members: lambda_cost
+..  autoclass:: paddle.v2.layer.lambda_cost
+    :noindex:
+
+mse_cost
+--------
+..  autoclass:: paddle.v2.layer.mse_cost
     :noindex:
 
 rank_cost
 ---------
-..  automodule:: paddle.v2.layer
-    :members: rank_cost
+..  autoclass:: paddle.v2.layer.rank_cost
     :noindex:
 
 sum_cost
 ---------
-..  automodule:: paddle.v2.layer
-    :members: sum_cost
+..  autoclass:: paddle.v2.layer.sum_cost
     :noindex:
 
 crf
 ---
-..  automodule:: paddle.v2.layer
-    :members: crf
+..  autoclass:: paddle.v2.layer.crf
     :noindex:
 
 crf_decoding
 ------------
-..  automodule:: paddle.v2.layer
-    :members: crf_decoding
+..  autoclass:: paddle.v2.layer.crf_decoding
     :noindex:
 
 ctc
 ---
-..  automodule:: paddle.v2.layer
-    :members: ctc
+..  autoclass:: paddle.v2.layer.ctc
     :noindex:
 
 warp_ctc
 --------
-..  automodule:: paddle.v2.layer
-    :members: warp_ctc
+..  autoclass:: paddle.v2.layer.warp_ctc
     :noindex:
 
 nce
 ---
-..  automodule:: paddle.v2.layer
-    :members: nce
+..  autoclass:: paddle.v2.layer.nce
     :noindex:
 
 hsigmoid
 ---------
-..  automodule:: paddle.v2.layer
-    :members: hsigmoid
+..  autoclass:: paddle.v2.layer.hsigmoid
+    :noindex:
+
+smooth_l1_cost
+--------------
+..  autoclass:: paddle.v2.layer.smooth_l1_cost
     :noindex:
 
 Check Layer 
@@ -488,6 +429,5 @@ Check Layer
 
 eos
 ---
-..  automodule:: paddle.v2.layer
-    :members: eos
+..  autoclass:: paddle.v2.layer.eos
     :noindex:
diff --git a/doc/design/dist/README.md b/doc/design/cluster_train/README.md
similarity index 86%
rename from doc/design/dist/README.md
rename to doc/design/cluster_train/README.md
index 1788208bca..b88a8f382b 100644
--- a/doc/design/dist/README.md
+++ b/doc/design/cluster_train/README.md
@@ -17,12 +17,16 @@ A training job will be created once user asks Paddle cloud to train a model. The
 
 1. the *master process*, which dispatches tasks to
 1. one or more *trainer processes*, which run distributed training and synchronize gradients/models via
-1. one or more *parameter server processes*, where each holds a shard of the global model.
+1. one or more *parameter server processes*, where each holds a shard of the global model and receives the uploaded gradients from every *trainer process*, so that it can run the optimization functions to update its parameters.
 
 Their relation is illustrated in the following graph:
 
 <img src="src/paddle-model-sharding.png"/>
 
+By coordinating these processes, PaddlePaddle supports both Synchronous Stochastic Gradient Descent (sync SGD) and Asynchronous Stochastic Gradient Descent (async SGD) for training user-defined neural network topologies.
+
+When training with sync SGD, parameter servers wait for all trainers to finish uploading their gradients and then send the updated parameters to the trainers; training cannot proceed until the trainers have received the updated parameters. This creates a synchronization point between trainers. When training with async SGD, each trainer uploads gradients and downloads new parameters individually, without synchronizing with other trainers. Using async SGD will be faster in terms of time per pass, but will have more noise in the gradients since trainers are likely to have a stale model.
+
 ### Master Process
 
 The master process will:
@@ -31,7 +35,7 @@ The master process will:
 - Keep track of training progress on the dataset with [task queue](#task-queue). A training job will iterate on the dataset for a full pass until it goes into next pass.
 
 
-#### Task 
+#### Task
 
 A task is a data shard to be trained. The total number of tasks will be much bigger than the total number of trainers. The number of data instances inside a task will be much bigger than the mini-batch size.
 
@@ -78,7 +82,7 @@ The communication pattern between the trainers and the parameter servers depends
 - Synchronous Stochastic Gradient Descent (sync-SGD)
 
 	Parameter server will wait for all trainer finish n-th mini-batch calculation and send their gradients before broadcasting new parameters to every trainer. Every trainer will wait for the new parameters before starting n+1-th mini-batch.
-  
+
 - Asynchronous Stochastic Gradient Descent (async-SGD)
 
 	There will no synchronization between different trainers, and parameter server updates its parameter as soon as it receives new gradient:
@@ -118,8 +122,6 @@ When the master is started by the Kubernetes, it executes the following steps at
 1. Watches the trainer prefix keys `/trainer/` on etcd to find the live trainers.
 1. Starts dispatching the tasks to the trainers, and updates task queue using an etcd transaction to ensure lock is held during the update.
 
-The master process will kill itself if its etcd lease expires.
-
 When the master process is dead for any reason, Kubernetes will restart it. It will be online again with all states recovered from etcd in few minutes.
 
 ### Trainer Process
@@ -132,6 +134,8 @@ When the trainer is started by the Kubernetes, it executes the following steps a
 
 If trainer's etcd lease expires, it will try set key `/trainer/<unique ID>` again so that the master process can discover the trainer again.
 
+When a trainer fails, Kubernetes would try to restart it. The recovered trainer would fetch tasks from the TODO queue and go on training.
+
 ### Parameter Server Process
 
 When the parameter server is started by Kubernetes, it executes the following steps at startup:
@@ -140,11 +144,11 @@ When the parameter server is started by Kubernetes, it executes the following st
 1. Search through etcd keys `/ps/<index>` (`/ps/0`, `/ps/1`, ...) to find the first non-existant key whose index is smaller than the total number of parameter servers. Set the key using a transaction to avoid concurrent writes. The parameter server's index is inferred from the key name.
 
 	The desired number of parameter servers is 3:
-	
+
 	<img src="src/paddle-ps-0.png"/>
-	
+
 	The third parameter server joined:
-	
+
 	<img src="src/paddle-ps-1.png"/>
 
 1. The parameter server can load parameters if there are already saved parameters in the save path (inferred from its index).
@@ -153,6 +157,13 @@ When the parameter server is started by Kubernetes, it executes the following st
 If the parameter server's etcd lease expires, the parameter server will kill itself.
 
 
+## Parameter Server Checkpointing
+See [here](./checkpointing.md)
+
+## Storing and dispatching training data
+See [here](./data_dispatch.md)
+
+
 ## Dynamic Scaling
 
 ### Trainer Scaling
diff --git a/doc/design/cluster_train/checkpointing.md b/doc/design/cluster_train/checkpointing.md
new file mode 100644
index 0000000000..c87ef2c7d2
--- /dev/null
+++ b/doc/design/cluster_train/checkpointing.md
@@ -0,0 +1,44 @@
+## 模型参数检查点（Checkpointing）
+模型数据检查点的实现，可以有效的避免parameter server的单点或多点同时故障。模型参数检查点通过定期向磁盘上保存一份存储在parameter server内存中的模型数据的完整镜像，来保证训练过程可以从中间状态重新启动。在一个不可中断并缺少备份的训练任务中，可以通过阶段性的保存每个parameter server的数据快照（snapshot）到 ***分布式存储服务*** 达到容灾的目的，比如每隔10分钟保存最新的快照，并删除更早的快照。在出现单点故障时，只需要恢复这台节点，或者将这台节点迁移到另一个节点并启动即可恢复训练任务。
+
+<img src="src/checkpointing.png" width="500"/>
+
+### 快照保存的设计如下：
+
+说明：
+
+* parameter server在集群中启动后，自动挂载分布式存储目录，并把快照保存到这个目录下。
+* ***注：每个parameter server的检查点各自独立保存，暂时不考虑多个parameter server同步的保存一个特定时间点的全局检查点，因为这样做也没法保证消除随机性。***
+
+检查点保存程序流程：
+
+1. 如果满足条件"每隔10分钟"时，parameter server会获取parameters内存的`read_lock`，启动一个新的线程开始保存检查点。如果已经正在执行保存检查点的线程，则忽略。由于对parameters的更新需要获取parameters内存的`write_lock`，所以在写入快照的过程中，parameter server会暂停参数更新并等待。
+2. parameter server生成一个UUID，向指定的目录中一个新的文件（文件名为此UUID）写入快照数据。在快照写入完成后，计算这个文件的MD5 sum。然后在etcd的`/checkpoints/[pserver_id]`中写入json内容：`{"uuid": [UUID], "md5": "MD5 sum", "timestamp": xxxx}`。
+3. 删除磁盘目录中不是当前uuid的快照文件。
+4. 释放对parameters内存的锁定，停止保存检查点的线程。
+
+这里需要用户额外注意，在您的实际环境中，训练任务的运行可能会占满trainer和parameter server之间的网络带宽，如果parameter server此时还需要通过网络访问分布式存储以保存快照，可能会造成网络拥塞，而出现阶段性的运行停滞。
+
+### 从快照恢复
+
+在parameter server第一次启动或任意时间parameter server故障后被Kubernetes重新启动，则需要回滚到上一个检查点：
+
+  1. 从etcd中读取节点：`/checkpoints/[pserver_id]`获取最新的检查点的文件uuid
+  1. 从磁盘文件中加载uuid文件名的检查点快照文件，并加载其中的参数
+  1. 如果上面两步出现错误，则使用启动参数定义的初始化方法初始化参数
+  1. 开始提供服务
+
+## TODO List
+### 推测执行/加速执行（TODO）
+在异构集群中，如果存在某些trainer执行速度过慢会影响整体集群的速度（如图中Trainer 1），此时master将负责启动一个新的Trainer（Accelerate Trainer 2），使用同样的训练数据block。哪个trainer先完成block的训练，则把另一个慢速的kill掉。
+
+### 动态扩容/缩容
+目前只考虑动态扩容trainer数量，可以减小系统复杂性。
+
+## 术语
+* model: 指深度学习训练之后得到的所有参数，使用这个神经网络可以完成对新数据的预测
+* parameters: 神经网络中的参数，包括权重w和偏置b。一个神经网络的模型由大量的参数组成
+* shard: 分片，通常指将一个整体拆分成多份的其中的一份。
+* model shard: 将一个神经网络参数拆分成多份，每个shard分别存储在其中一台parameter server之上
+* parameter block: 多个parameter block构成一个model shard
+* 单点故障: 任意时刻只可能同时有一台服务器故障。由于集群中同时存在两台机器故障的概率极低（（平均故障率*平均故障修复时间）^2）只对特殊在线系统考虑两台以上同时故障的容灾。
diff --git a/doc/design/cluster_train/data_dispatch.md b/doc/design/cluster_train/data_dispatch.md
new file mode 100644
index 0000000000..a3eb4e28db
--- /dev/null
+++ b/doc/design/cluster_train/data_dispatch.md
@@ -0,0 +1,120 @@
+## 训练数据的存储和分发
+
+### 流程介绍
+生产环境中的训练数据集通常体积很大，并被存储在诸如Hadoop HDFS，Ceph，AWS S3之类的分布式存储之上。这些分布式存储服务通常会把数据切割成多个分片分布式的存储在多个节点之上。这样就可以在云端执行多种数据类计算任务，包括：
+
+* 数据预处理任务
+* Paddle训练任务
+* 在线模型预测服务
+
+<img src="src/paddle-cloud-in-data-center.png" width="500"/>
+
+在上图中显示了在一个实际生产环境中的应用（人脸识别）的数据流图。生产环境的日志数据会通过实时流的方式（Kafka）和离线数据的方式（HDFS）存储，并在集群中运行多个分布式数据处理任务，比如流式数据处理（online data process），离线批处理（offline data process）完成数据的预处理，提供给paddle作为训练数据。用户也可以上传labeled data到分布式存储补充训练数据。在paddle之上运行的深度学习训练输出的模型会提供给在线人脸识别的应用使用。
+
+### 训练数据的存储
+
+选择CephFS作为训练数据的存储服务。
+
+在Kubernetes上运行的不同的计算框架，可以通过Volume或PersistentVolume挂载存储空间到每个容器中。
+
+在CephFS存储系统中的公开目录，需要保存一些预置的公开数据集（比如MNIST, BOW, ImageNet数据集等），并且可以被提交的job直接使用。
+
+### 文件预处理
+
+在数据集可以被训练之前，文件需要预先被转换成PaddlePaddle集群内部的存储格式（SSTable）。我们提供两个转换方式：
+
+- 提供给用户本地转换的库，用户可以编写程序完成转换。
+- 用户可以上传自己的数据集，在集群运行MapReduce job完成转换。
+
+转换生成的文件名会是以下格式：
+
+```text
+name_prefix-aaaaa-of-bbbbb
+```
+
+"aaaaa"和"bbbbb"都是五位的数字，每一个文件是数据集的一个shard，"aaaaa"代表shard的index，"bbbbb"代表这个shard的最大index。
+
+比如ImageNet这个数据集可能被分成1000个shard，它们的文件名是：
+```text
+imagenet-00000-of-00999
+imagenet-00001-of-00999
+...
+imagenet-00999-of-00999
+```
+
+#### 转换库
+
+无论是在本地或是云端转换，我们都提供Python的转换库，接口是：
+```python
+def convert(output_path, reader, num_shards, name_prefix)
+```
+
+- `output_path`: directory in which output files will be saved.
+- `reader`: a [data reader](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/reader/README.md#data-reader-interface), from which the convert program will read data instances.
+- `num_shards`: the number of shards that the dataset will be partitioned into.
+- `name_prefix`: the name prefix of generated files.
+
+`reader`每次输出一个data instance，这个instance可以是单个值，或者用tuple表示的多个值：
+
+```python
+yield 1 # 单个值
+yield numpy.random.uniform(-1, 1, size=28*28) # 单个值
+yield numpy.random.uniform(-1, 1, size=28*28), 0 # 多个值
+```
+
+每个值的类型可以是整型、浮点型数据、字符串，或者由它们组成的list，以及numpy.ndarray。如果是其它类型，会被Pickle序列化成字符串。
+
+### 示例程序
+
+#### 使用转换库
+
+以下`reader_creator`生成的`reader`每次输出一个data instance，每个data instance包含两个值：numpy.ndarray类型的值和整型的值：
+```python
+def reader_creator():
+	def reader():
+		for i in range(1000):
+			yield numpy.random.uniform(-1, 1, size=28*28), 0 # 多个值
+	return reader
+```
+
+把`reader_creator`生成的`reader`传入`convert`函数即可完成转换：
+```python
+convert("./", reader_creator(), 100, "random_images")
+```
+
+以上命令会在当前目录下生成100个文件：
+```text
+random_images-00000-of-00099
+random_images-00001-of-00099
+...
+random_images-00099-of-00099
+```
+
+#### 进行训练
+
+PaddlePaddle提供专用的[data reader creator](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/reader/README.md#python-data-reader-design-doc)，生成给定SSTable文件对应的data reader。**无论在本地还是在云端，reader的使用方式都是一致的**：
+
+```python
+# ...
+reader = paddle.reader.creator.SSTable("/home/random_images-*-of-*")
+batch_reader = paddle.batch(reader, 128)
+trainer.train(batch_reader, ...)
+```
+
+以上代码的reader输出的data instance与生成数据集时，reader输出的data instance是一模一样的。
+
+### 上传训练文件
+
+使用下面命令，可以把本地的数据上传到存储集群中。
+
+```bash
+paddle cp filenames pfs://home/folder/
+```
+
+比如，把之前示例中转换完毕的random_images数据集上传到云端的`/home/`可以用以下指令：
+```bash
+paddle cp random_images-*-of-* pfs://home/
+```
+## TODO
+
+### 支持用户自定义的数据预处理job
diff --git a/doc/design/cluster_train/src/checkpointing.png b/doc/design/cluster_train/src/checkpointing.png
new file mode 100644
index 0000000000..c221e8474f
Binary files /dev/null and b/doc/design/cluster_train/src/checkpointing.png differ
diff --git a/doc/design/cluster_train/src/data_dispatch.png b/doc/design/cluster_train/src/data_dispatch.png
new file mode 100644
index 0000000000..5bdcc24d6a
Binary files /dev/null and b/doc/design/cluster_train/src/data_dispatch.png differ
diff --git a/doc/design/cluster_train/src/paddle-cloud-in-data-center.png b/doc/design/cluster_train/src/paddle-cloud-in-data-center.png
new file mode 100644
index 0000000000..da5d1a7756
Binary files /dev/null and b/doc/design/cluster_train/src/paddle-cloud-in-data-center.png differ
diff --git a/doc/design/dist/src/paddle-etcd.graffle b/doc/design/cluster_train/src/paddle-etcd.graffle
similarity index 100%
rename from doc/design/dist/src/paddle-etcd.graffle
rename to doc/design/cluster_train/src/paddle-etcd.graffle
diff --git a/doc/design/dist/src/paddle-etcd.png b/doc/design/cluster_train/src/paddle-etcd.png
similarity index 100%
rename from doc/design/dist/src/paddle-etcd.png
rename to doc/design/cluster_train/src/paddle-etcd.png
diff --git a/doc/design/dist/src/paddle-model-sharding.graffle b/doc/design/cluster_train/src/paddle-model-sharding.graffle
similarity index 100%
rename from doc/design/dist/src/paddle-model-sharding.graffle
rename to doc/design/cluster_train/src/paddle-model-sharding.graffle
diff --git a/doc/design/dist/src/paddle-model-sharding.png b/doc/design/cluster_train/src/paddle-model-sharding.png
similarity index 100%
rename from doc/design/dist/src/paddle-model-sharding.png
rename to doc/design/cluster_train/src/paddle-model-sharding.png
diff --git a/doc/design/dist/src/paddle-ps-0.png b/doc/design/cluster_train/src/paddle-ps-0.png
similarity index 100%
rename from doc/design/dist/src/paddle-ps-0.png
rename to doc/design/cluster_train/src/paddle-ps-0.png
diff --git a/doc/design/dist/src/paddle-ps-1.png b/doc/design/cluster_train/src/paddle-ps-1.png
similarity index 100%
rename from doc/design/dist/src/paddle-ps-1.png
rename to doc/design/cluster_train/src/paddle-ps-1.png
diff --git a/doc/design/dist/src/paddle-ps.graffle b/doc/design/cluster_train/src/paddle-ps.graffle
similarity index 100%
rename from doc/design/dist/src/paddle-ps.graffle
rename to doc/design/cluster_train/src/paddle-ps.graffle
diff --git a/doc/design/dist/src/paddle-task-queues.graffle b/doc/design/cluster_train/src/paddle-task-queues.graffle
similarity index 100%
rename from doc/design/dist/src/paddle-task-queues.graffle
rename to doc/design/cluster_train/src/paddle-task-queues.graffle
diff --git a/doc/design/dist/src/paddle-task-queues.png b/doc/design/cluster_train/src/paddle-task-queues.png
similarity index 100%
rename from doc/design/dist/src/paddle-task-queues.png
rename to doc/design/cluster_train/src/paddle-task-queues.png
diff --git a/doc/design/dist/src/paddle-task-states.graffle b/doc/design/cluster_train/src/paddle-task-states.graffle
similarity index 100%
rename from doc/design/dist/src/paddle-task-states.graffle
rename to doc/design/cluster_train/src/paddle-task-states.graffle
diff --git a/doc/design/dist/src/paddle-task-states.png b/doc/design/cluster_train/src/paddle-task-states.png
similarity index 100%
rename from doc/design/dist/src/paddle-task-states.png
rename to doc/design/cluster_train/src/paddle-task-states.png
diff --git a/doc/design/cluster_train/src/trainer.graffle b/doc/design/cluster_train/src/trainer.graffle
new file mode 100644
index 0000000000..42384a3f05
Binary files /dev/null and b/doc/design/cluster_train/src/trainer.graffle differ
diff --git a/doc/design/cluster_train/src/trainer.png b/doc/design/cluster_train/src/trainer.png
new file mode 100644
index 0000000000..6537d3d565
Binary files /dev/null and b/doc/design/cluster_train/src/trainer.png differ
diff --git a/doc/design/images/replica.png b/doc/design/images/replica.png
new file mode 100644
index 0000000000..ef59e56b01
Binary files /dev/null and b/doc/design/images/replica.png differ
diff --git a/doc/design/images/two_phase_commit.png b/doc/design/images/two_phase_commit.png
new file mode 100644
index 0000000000..ef6f7317bd
Binary files /dev/null and b/doc/design/images/two_phase_commit.png differ
diff --git a/doc/design/multi_language_interface/why_plain_c.md b/doc/design/multi_language_interface/00.why_plain_c.md
similarity index 94%
rename from doc/design/multi_language_interface/why_plain_c.md
rename to doc/design/multi_language_interface/00.why_plain_c.md
index a3f41ca7b9..a144309334 100644
--- a/doc/design/multi_language_interface/why_plain_c.md
+++ b/doc/design/multi_language_interface/00.why_plain_c.md
@@ -58,32 +58,32 @@ typedef void* paddle_matrix;
 typedef int paddle_error;
 
 extern "C"
-paddle_error paddle_matrix_shape(paddle_matrix matrix,
-                                 uint64_t* width,
-                                 uint64_t* height);
+paddle_error paddle_matrix_get_shape(paddle_matrix matrix,
+                                     uint64_t* width,
+                                     uint64_t* height);
 ```
 而在CPP里面实现这个C的接口，文件 `paddle_matrix.cpp`
 
 ```cpp
-#include "paddle/math/matrix.hpp"
+#include "paddle/math/matrix.h"
 extern "C"
 paddle_error paddle_matrix_shape(paddle_matrix matrix,
                                  uint64_t *width,
                                  uint64_t *height) {
-  auto m = (paddle::math::matrix*)(matrix);
+  auto m = (paddle::capi::CMatrix*)(matrix);
   *width = m->width();
   *height = m->height();
 }
 ```
 
-其中`paddle/math/matrix.hpp`文件内容为:
+其中`paddle/capi/CMatrix.hpp`文件内容为:
 
 ```cpp
 namespace paddle {
 namespace math {  
 
-class Matrix {
-  //...
+class CMatrix {
+  std::shared_ptr<paddle::Matrix> mat;
 };
 
 }  // namespace math
@@ -113,6 +113,6 @@ class Matrix {
 | 手写多语言绑定 | 不使用SWIG | 使用SWIG需要多语言绑定的开发人员熟练掌握SWIG配置，社区参与困难。SWIG生成的代码不能保证多语言代码风格的一致性 |
 
 
-## 简单实现
+## 实现
 
-TBD
+参考[Inference implementation](01.inference_implementation.md)
diff --git a/doc/design/multi_language_interface/01.inference_implementation.md b/doc/design/multi_language_interface/01.inference_implementation.md
new file mode 100644
index 0000000000..9820284523
--- /dev/null
+++ b/doc/design/multi_language_interface/01.inference_implementation.md
@@ -0,0 +1,131 @@
+# C-API 模型推断实现文档
+
+本文档描述Paddle C-API的实现细节。Paddle C-API是多语言API的基础部分。Paddle需要暴露的API很多。先实现模型推断的API，通过模型推断API的实现作为一个样例，来进行讨论。至于为什么需要C-API，请参考[Why Plain C](./00.why_plain_c.md)。
+
+## Table of Contents
+   * [C-API 模型推断实现文档](#c-api-模型推断实现文档)
+      * [暴露接口原则](#暴露接口原则)
+      * [目录结构](#目录结构)
+      * [实现方式](#实现方式)
+         * [capi.h](#capih)
+         * [具体某种类型的头文件](#具体某种类型的头文件)
+         * [capi_private.h](#capi_privateh)
+         * [具体某种类型的实现文件](#具体某种类型的实现文件)
+         * [libpaddle_capi_shared.{so, dylib}](#libpaddle_capi_sharedso-dylib)
+         * [libpaddle_capi_whole.a](#libpaddle_capi_wholea)
+         * [examples](#examples)
+      * [编译选项](#编译选项)
+
+
+## 暴露接口原则
+
+1. 所有的接口均为C接口。即使用`extern "C"`
+2. 除构造某种类型的函数(`paddle_matrix_create`等)，其他函数均返回`paddle_error`。且调用时不能抛出异常或出现运行时错误。
+3. 所有类型名为`paddle_类型名`，所有与类型相关的函数，函数名为`paddle_类型名_函数名`
+4. 如果某一个Paddle Core概念(GradientMachine/Matrix)需要被暴露到其他语言，那么
+	* 为了暴露的接口尽量简单。只暴露概念的接口，而不暴露概念的实现。即暴露`GradientMachine`或者`Matrix`但不暴露`RecurrentGradientMachine`和`CpuSparseMatrix`。
+	* 暴露这个概念必要函数。`必要`是指，即完成某一个任务的最少函数。
+5. 不在`capi`接口层做过多封装。
+	* 如果某一个Paddle概念必须要暴露，但是又过于琐碎。不在`capi`这一层进行封装，而是直接修改Paddle Core。让Paddle核心中，这一概念不再琐碎。
+
+
+## 目录结构
+
+```text
+Paddle
+  `-- paddle
+        `-- capi
+              `-- examples  # The example project for C-API.
+              `-- tests  # unittests for C-API
+              `-- capi.h  # C-API header file.
+              `-- capi_private.h  # The shared header file between implementation sources.
+              `-- matrix.{h, cpp}
+              `-- gradient_machine.{h, cpp}
+              `-- ...
+```
+
+
+Paddle的C-API目录结构如上图表所示。这个目录中除了`capi_private.h`之外的所有头文件，均会被安装到include/paddle路径下。C-API生成的二进制文件会被安装到`lib`目录下。即，安装后的目录结构为
+
+```text
+`-- include
+      `-- paddle
+             `-- capi.h
+             `-- matrix.h
+             `-- gradient_machine.h
+             `-- ...
+`-- lib
+     `-- libpaddle_capi_shared.{so, dylib}  # In mac, dynamic library's file name extension is `dylib`
+     `-- libpaddle_capi_whole.a  # static library for all symbols of Paddle.
+```
+
+## 实现方式
+
+下面分别介绍某一类文件的实现方式。
+
+### capi.h
+
+`capi.h`是用户使用C-API时所唯一需要引入的头文件。在`capi.h`中，引入了类型的头文件，`matrix.h`, `gradient_machine.h`。在引入其他类型的头文件时，使用相对路径的引用方式。即`#include "matrix.h"`
+
+### 具体某种类型的头文件
+
+具体某种类型的头文件，即例如`matrix.h`，`gradient_machine.h`等。在这些头文件中，包含了某种类型的类型定义和暴露的全部函数。
+
+这个头文件不假设其他文件的引用顺序，即使用户直接引用某种类型的头文件，也不应该报错(虽然不鼓励这样)。如果某一个类型需要引用另一个类型，例如`gradient_machine`需要引用`matrix`，则直接引入另一种类型的头文件，即`#include "matrix.h"`。
+
+### capi_private.h
+
+`capi_private.h`是各个实现中共享的头文件，他主要包含了实际暴露的类型结构。在用户使用C-API时，Paddle的类型全部退化成`void *`，即`typedef void* paddle_matrix`。但，对于每种C-API暴露的类型，均是在`capi_private.h`中实现的结构体。
+
+```cpp
+struct CMatrix {
+   int type = MatrixType;
+   std::shared_ptr<paddle::Matrix> mat;
+};
+```
+
+通常，这个结构体包含两个项目。
+
+* `type`是一个类型的标志。对于每种类型，type字段均不尽相同。这样，即使C-API接受的类型全是`void *`，我们也可以确定每一个参数的类型。
+
+  ```cpp
+  void some_c_api_function(void* some_instance) {
+     int* type = (int *) some_instance;
+     switch (*type) {
+       case MatrixType:
+         CMatrix* mat = (CMatrix *) some_instance;
+         ...
+       ...
+     }
+  }
+  ```
+* 这个结构体中的另一个项目是，Paddle Core中这一类型接口的智能指针(shared_ptr)。
+	* 使用智能指针的原因是: 用户可以安全的释放某个C-API的实例，而不必在意Paddle Core是否还在使用这个实例。
+	* 例如，用户通过C-API获得了神经网络的参数实例。当用户使用完这个参数后，直接删除这个参数即可。即便Paddle Core中的模型还在使用这个参数，这个参数也不会一并删除。
+
+### 具体某种类型的实现文件
+
+具体某种类型的实现文件，即`matrix.cpp`, `gradient_machine.cpp`等文件。在这些文件中，使用C++ 11实现了C-API的接口，并且使用`extern "C"`导出这些接口。在实现过程中，对输入参数的安全性进行了必要的判断，并将C-API接口的参数转发给`Paddle Core`。
+
+### libpaddle\_capi_shared.{so, dylib}
+
+`libpaddle_capi_shared`是C-API导出的动态库。这个动态库的连接参数与Paddle的其他二进制(例如`paddle_trainer`)类似。用户可以直接使用这个动态库来引入Paddle C-API。具体使用方法为`-lpaddle_capi_shared`。
+
+### libpaddle\_capi_whole.a
+
+`libpaddle_capi_whole`是C-API导出的静态库。这个静态库包含了Paddle的全部符号。他是将`libpaddle_gserver.a`, `libpaddle_math.a`, `libpaddle_capi.a`等全部静态库中的目标文件全部打包后产生的文件。具体使用方法为`--whole-archive -lpaddle_capi_whole --no-whole-archive`。
+
+
+### examples
+
+在样例中，使用`C99`开发了模型预测的样例代码。具体请参考[example/README.md](../../../paddle/capi/examples/README.md)。
+
+## 编译选项
+
+C-API的编译选项默认关闭，打开这个编译选项，需要在cmake的时候，设置
+
+```bash
+cmake ${YOUR_SOURCE_ROOT} -DWITH_C_API=ON -DWITH_PYTHON=OFF -DWITH_SWIG_PY=OFF
+```
+
+编译C-API的时候推荐Paddle不嵌入Python解释器，也不生成`SWIG`接口，具体原因参考[Why Plain C](./00.why_plain_c.md)。
diff --git a/doc/design/releasing_process.md b/doc/design/releasing_process.md
new file mode 100644
index 0000000000..3692a5248a
--- /dev/null
+++ b/doc/design/releasing_process.md
@@ -0,0 +1,58 @@
+# Paddle发行规范
+
+Paddle使用git-flow branching model做分支管理，使用[Semantic Versioning](http://semver.org/)标准表示Paddle版本号。
+
+Paddle每次发新的版本，遵循以下流程:
+
+1. 从`develop`分支派生出新的分支，分支名为`release/版本号`。例如，`release/0.10.0`
+2. 将新分支的版本打上tag，tag为`版本号rc.Patch号`。第一个tag为`0.10.0rc1`，第二个为`0.10.0rc2`，依次类推。
+3. 对这个版本的提交，做如下几个操作:
+	* 编译这个版本的Docker发行镜像，发布到dockerhub。如果失败，修复Docker编译镜像问题，Patch号加一，返回第二步
+	* 编译这个版本的Ubuntu Deb包。如果失败，修复Ubuntu Deb包编译问题，Patch号加一，返回第二步。
+	* 使用Regression Test List作为检查列表，测试Docker镜像/ubuntu安装包的功能正确性
+		* 如果失败，记录下所有失败的例子，在这个`release/版本号`分支中，修复所有bug后，Patch号加一，返回第二步
+4. 第三步完成后，将`release/版本号`分支合入master分支，并将master分支的合入commit打上tag，tag为`版本号`。同时再将`master`分支合入`develop`分支。最后删除`release/版本号`分支。
+5. 编译master分支的Docker发行镜像，发布到dockerhub。编译ubuntu的deb包，发布到github release页面
+6. 协同完成Release Note的书写
+
+
+需要注意的是:
+
+* `release/版本号`分支一旦建立，一般不允许再从`develop`分支合入`release/版本号`。这样保证`release/版本号`分支功能的封闭，方便测试人员测试Paddle的行为。
+* 在`release/版本号`分支存在的时候，如果有bugfix的行为，需要将bugfix的分支同时merge到`master`, `develop`和`release/版本号`这三个分支。
+
+# Paddle 分支规范
+
+Paddle开发过程使用[git-flow](http://nvie.com/posts/a-successful-git-branching-model/)分支规范，并适应github的特性做了一些区别。
+
+* Paddle的主版本库遵循[git-flow](http://nvie.com/posts/a-successful-git-branching-model/)分支规范。其中:
+	* `master`分支为稳定(stable branch)版本分支。每一个`master`分支的版本都是经过单元测试和回归测试的版本。
+	* `develop`分支为开发(develop branch)版本分支。每一个`develop`分支的版本都经过单元测试，但并没有经过回归测试。
+	* `release/版本号`分支为每一次Release时建立的临时分支。在这个阶段的代码正在经历回归测试。
+
+* 其他用户的fork版本库并不需要严格遵守[git-flow](http://nvie.com/posts/a-successful-git-branching-model/)分支规范，但所有fork的版本库的所有分支都相当于特性分支。
+	* 建议，开发者fork的版本库使用`develop`分支同步主版本库的`develop`分支
+	* 建议，开发者fork的版本库中，再基于`develop`版本fork出自己的功能分支。
+	* 当功能分支开发完毕后，向Paddle的主版本库提交`Pull Request`，进而进行代码评审。
+		* 在评审过程中，开发者修改自己的代码，可以继续在自己的功能分支提交代码。 
+
+* BugFix分支也是在开发者自己的fork版本库维护，与功能分支不同的是，BugFix分支需要分别给主版本库的`master`、`develop`与可能有的`release/版本号`分支，同时提起`Pull Request`。
+
+# Paddle回归测试列表
+
+本列表说明Paddle发版之前需要测试的功能点。
+
+## Paddle Book中所有章节
+
+Paddle每次发版本首先要保证Paddle Book中所有章节功能的正确性。功能的正确性包括验证Paddle目前的`paddle_trainer`训练和纯使用`Python`训练模型正确性。
+
+| | 新手入门章节 | 识别数字 | 图像分类 | 词向量 | 情感分析 | 语义角色标注 | 机器翻译 | 个性化推荐 |
+| --- | --- | --- | --- | --- | --- | --- | --- | --- |
+| API.V2 + Docker + GPU  |  |  |  |  |  |  |  |  |
+| API.V2 + Docker + CPU  |  |  |  |  |  |  |  |  |
+| `paddle_trainer` + Docker + GPU |  |  |  |  |  |  |  |  |
+| `paddle_trainer` + Docker + CPU |  |  |  |  |  |  |  |  |
+| API.V2 + Ubuntu + GPU |  |  |  |  |  |  |  |  |
+| API.V2 + Ubuntu + CPU |  |  |  |  |  |  |  |  |
+| `paddle_trainer` + Ubuntu + GPU |  |  |  |  |  |  |  |  |
+| `paddle_trainer` + Ubuntu + CPU |  |  |  |  |  |  |  |  |
diff --git a/doc/getstarted/build_and_install/docker_install_cn.rst b/doc/getstarted/build_and_install/docker_install_cn.rst
index 22db1ef658..da2d423465 100644
--- a/doc/getstarted/build_and_install/docker_install_cn.rst
+++ b/doc/getstarted/build_and_install/docker_install_cn.rst
@@ -4,93 +4,112 @@ PaddlePaddle的Docker容器使用方式
 PaddlePaddle目前唯一官方支持的运行的方式是Docker容器。因为Docker能在所有主要操作系统（包括Linux，Mac OS X和Windows）上运行。 请注意，您需要更改 `Dockers设置 <https://github.com/PaddlePaddle/Paddle/issues/627>`_ 才能充分利用Mac OS X和Windows上的硬件资源。
 
 
-PaddlePaddle发布的docker镜像使用说明
+PaddlePaddle发布的Docker镜像使用说明
 ------------------------------
 
-对于每一个PaddlePaddle版本，我们都会发布两种Docker镜像：开发镜像、运行镜像。运行镜像包括纯CPU版本和GPU版本以及其对应的非AVX版本。
-我们会在 `dockerhub.com <https://hub.docker.com/r/paddledev/paddle/>`_ 提供最新的docker镜像，可以在"tags"标签下找到最新的Paddle镜像版本。
+我们把PaddlePaddle的编译环境打包成一个镜像，称为开发镜像，里面涵盖了
+PaddlePaddle需要的所有编译工具。把编译出来的PaddlePaddle也打包成一个镜
+像，称为生产镜像，里面涵盖了PaddlePaddle运行所需的所有环境。每次
+PaddlePaddle发布新版本的时候都会发布对应版本的生产镜像以及开发镜像。生
+产镜像包括纯CPU版本和GPU版本以及其对应的非AVX版本。我们会在
+`dockerhub.com <https://hub.docker.com/r/paddledev/paddle/>`_ 提供最新
+的Docker镜像，可以在"tags"标签下找到最新的Paddle镜像版本。为了方便在国
+内的开发者下载Docker镜像，我们提供了国内的镜像服务器供大家使用。如果您
+在国内，请把文档里命令中的paddlepaddle/paddle替换成
+docker.paddlepaddle.org/paddle。
+
 1. 开发镜像：:code:`paddlepaddle/paddle:<version>-dev`
 
-    这个镜像包含了Paddle相关的开发工具以及编译和运行环境。用户可以使用开发镜像代替配置本地环境，完成开发，编译，发布，
-    文档编写等工作。由于不同的Paddle的版本可能需要不同的依赖和工具，所以如果需要自行配置开发环境需要考虑版本的因素。
-    开发镜像包含了以下工具：
-    - gcc/clang
-    - nvcc
-    - Python
-    - sphinx
-    - woboq
-    - sshd
-    很多开发者会使用远程的安装有GPU的服务器工作，用户可以使用ssh登录到这台服务器上并执行 :code:`docker exec`进入开发镜像并开始工作，
-    也可以在开发镜像中启动一个SSHD服务，方便开发者直接登录到镜像中进行开发:
+   这个镜像包含了Paddle相关的开发工具以及编译和运行环境。用户可以使用开发镜像代替配置本地环境，完成开发，编译，发布，
+   文档编写等工作。由于不同的Paddle的版本可能需要不同的依赖和工具，所以如果需要自行配置开发环境需要考虑版本的因素。
+   开发镜像包含了以下工具：
+   
+   - gcc/clang
+   - nvcc
+   - Python
+   - sphinx
+   - woboq
+   - sshd
+   很多开发者会使用远程的安装有GPU的服务器工作，用户可以使用ssh登录到这台服务器上并执行 :code:`docker exec` 进入开发镜像并开始工作，
+   也可以在开发镜像中启动一个SSHD服务，方便开发者直接登录到镜像中进行开发:
+
+   以交互容器方式运行开发镜像：
+
+   .. code-block:: bash
+
+      docker run -it --rm paddlepaddle/paddle:<version>-dev /bin/bash
+
+   或者，可以以后台进程方式运行容器：
+
+   .. code-block:: bash
+
+      docker run -d -p 2202:22 -p 8888:8888 paddledev/paddle:<version>-dev
 
-    以交互容器方式运行开发镜像：
+   然后用密码 :code:`root` SSH进入容器：
 
-    .. code-block:: bash
+   .. code-block:: bash
 
-        docker run -it --rm paddledev/paddle:<version>-dev /bin/bash
+      ssh -p 2202 root@localhost
 
-    或者，可以以后台进程方式运行容器：
+   SSH方式的一个优点是我们可以从多个终端进入容器。比如，一个终端运行vi，另一个终端运行Python。另一个好处是我们可以把PaddlePaddle容器运行在远程服务器上，并在笔记本上通过SSH与其连接。
 
-    .. code-block:: bash
+2. 生产镜像：根据CPU、GPU和非AVX区分了如下4个镜像：
 
-        docker run -d -p 2202:22 -p 8888:8888 paddledev/paddle:<version>-dev
+   - GPU/AVX：:code:`paddlepaddle/paddle:<version>-gpu`
+   - GPU/no-AVX：:code:`paddlepaddle/paddle:<version>-gpu-noavx`
+   - CPU/AVX：:code:`paddlepaddle/paddle:<version>`
+   - CPU/no-AVX：:code:`paddlepaddle/paddle:<version>-noavx`
 
-    然后用密码 :code:`root` SSH进入容器：
+   纯CPU镜像以及GPU镜像都会用到AVX指令集，但是2008年之前生产的旧电脑不支持AVX。以下指令能检查Linux电脑是否支持AVX：
 
-    .. code-block:: bash
+   .. code-block:: bash
 
-        ssh -p 2202 root@localhost
+      if cat /proc/cpuinfo | grep -i avx; then echo Yes; else echo No; fi
 
-    SSH方式的一个优点是我们可以从多个终端进入容器。比如，一个终端运行vi，另一个终端运行Python。另一个好处是我们可以把PaddlePaddle容器运行在远程服务器上，并在笔记本上通过SSH与其连接。
+   如果输出是No，就需要选择使用no-AVX的镜像
 
-2. 运行镜像：根据CPU、GPU和非AVX区分了如下4个镜像：
-    - GPU/AVX：:code:`paddlepaddle/paddle:<version>-gpu`
-    - GPU/no-AVX：:code:`paddlepaddle/paddle:<version>-gpu-noavx`
-    - CPU/AVX：:code:`paddlepaddle/paddle:<version>`
-    - CPU/no-AVX：:code:`paddlepaddle/paddle:<version>-noavx`
+   以上方法在GPU镜像里也能用，只是请不要忘记提前在物理机上安装GPU最新驱动。
+   为了保证GPU驱动能够在镜像里面正常运行，我们推荐使用 `nvidia-docker <https://github.com/NVIDIA/nvidia-docker>`_ 来运行镜像。
 
-    纯CPU镜像以及GPU镜像都会用到AVX指令集，但是2008年之前生产的旧电脑不支持AVX。以下指令能检查Linux电脑是否支持AVX：
+   .. code-block:: bash
 
-    .. code-block:: bash
+      nvidia-docker run -it --rm paddledev/paddle:0.10.0rc1-gpu /bin/bash
 
-       if cat /proc/cpuinfo | grep -i avx; then echo Yes; else echo No; fi
+   注意: 如果使用nvidia-docker存在问题，你也许可以尝试更老的方法，具体如下，但是我们并不推荐这种方法：
 
-    如果输出是No，就需要选择使用no-AVX的镜像
+   .. code-block:: bash
 
-    以上方法在GPU镜像里也能用，只是请不要忘记提前在物理机上安装GPU最新驱动。
-    为了保证GPU驱动能够在镜像里面正常运行，我们推荐使用[nvidia-docker](https://github.com/NVIDIA/nvidia-docker)来运行镜像。
+      export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')"
+      export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}')
+      docker run ${CUDA_SO} ${DEVICES} -it paddledev/paddle:<version>-gpu
 
-    .. code-block:: bash
+3. 运行以及发布您的AI程序
 
-        nvidia-docker run -it --rm paddledev/paddle:0.10.0rc1-gpu /bin/bash
+   假设您已经完成了一个AI训练的python程序 :code:`a.py`，这个程序是您在开发机上使用开发镜像完成开发。此时您可以运行这个命令在开发机上进行测试运行：
 
-    注意: 如果使用nvidia-docker存在问题，你也许可以尝试更老的方法，具体如下，但是我们并不推荐这种方法。：
+   .. code-block:: bash
 
-    .. code-block:: bash
+      docker run -it -v $PWD:/work paddle /work/a.py
 
-        export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')"
-        export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}')
-        docker run ${CUDA_SO} ${DEVICES} -it paddledev/paddle:<version>-gpu
+   如果要使用GPU，请运行：
 
-3. 使用运行镜像发布你的AI程序
-    假设您已经完成了一个AI训练的python程序 :code:`a.py`，这个程序是您在开发机上使用开发镜像完成开发。此时您可以运行这个命令在开发机上进行测试运行：
+   .. code-block:: bash
 
-    .. code-block:: bash
+      nvidia-docker run -it -v $PWD:/work paddle /work/a.py
 
-        docker run -it -v $PWD:/work paddle /work/a.py
 
-    这里`a.py`包含的所有依赖假设都可以在Paddle的运行容器中。如果需要包含更多的依赖、或者需要发布您的应用的镜像，可以编写`Dockerfile`使用`FROM paddledev/paddle:<version>`
-    创建和发布自己的AI程序镜像。
+   这里假设 :code:`a.py` 的所有依赖都已包含在Paddle的运行容器中。如果需要包含更多的依赖、或者需要发布您的应用的镜像，可以编写 :code:`Dockerfile` 使用 :code:`FROM paddledev/paddle:<version>`
+   创建和发布自己的AI程序镜像。
 
-运行PaddlePaddle书籍
+运行PaddlePaddle Book
 ---------------------
 
 Jupyter Notebook是一个开源的web程序，大家可以通过它制作和分享带有代码、公式、图表、文字的交互式文档。用户可以通过网页浏览文档。
 
-PaddlePaddle书籍是为用户和开发者制作的一个交互式的Jupyter Nodebook。
-如果您想要更深入了解deep learning，PaddlePaddle书籍一定是您最好的选择。
+PaddlePaddle Book是为用户和开发者制作的一个交互式的Jupyter Notebook。
+如果您想要更深入了解deep learning，PaddlePaddle Book一定是您最好的选择。
 
-我们提供可以直接运行PaddlePaddle书籍的docker镜像，直接运行：
+我们提供可以直接运行PaddlePaddle Book的Docker镜像，直接运行：
 
 .. code-block:: bash
 
@@ -109,53 +128,44 @@ PaddlePaddle书籍是为用户和开发者制作的一个交互式的Jupyter Nod
 
 开发人员可以在Docker开发镜像中开发PaddlePaddle。这样开发人员可以以一致的方式在不同的平台上工作 - Linux，Mac OS X和Windows。
 
-1. 构建开发镜像
+1. 制作PaddlePaddle开发镜像
 
-   .. code-block:: bash
+   PaddlePaddle每次发布新版本都会发布对应的开发镜像供开发者直接使用。这里介绍如何生成这个开发镜像。
+   生成Docker镜像的方式有两个，一个是直接把一个容器转换成镜像，另一个是创建Dockerfile并运行docker build指令按照Dockerfile生成镜像。第一个方法的好处是简单快捷，适合自己实验，可以快速迭代。第二个方法的好处是Dockerfile可以把整个生成流程描述很清楚，其他人很容易看懂镜像生成过程，持续集成系统也可以简单地复现这个过程。我们采用第二个方法。Dockerfile位于PaddlePaddle repo的根目录。生成开发镜像只需要运行：
 
-      git clone --recursive https://github.com/PaddlePaddle/Paddle
+   .. code-block:: bash
+      
+      git clone https://github.com/PaddlePaddle/Paddle.git
       cd Paddle
       docker build -t paddle:dev .
 
+   docker build这个命令的-t指定了生成的镜像的名字，这里我们用paddle:dev。到此，PaddlePaddle开发镜像就被构建完毕了。
 
-   请注意，默认情况下，:code:`docker build` 不会将源码导入到镜像中并编译它。如果我们想这样做，需要构建完开发镜像，然后执行：
-
-   .. code-block:: bash
-
-      docker run -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_AVX=ON" -e "TEST=OFF" paddle:dev
-
-
-2. 运行开发环境
+2. 制作PaddlePaddle生产镜像
 
-   当我们编译好了 :code:`paddle:dev`， 我们可以在docker容器里做开发，源代码可以通过挂载本地文件来被载入Docker的开发环境里面：
+   生产镜像的生成分为两步，第一步是运行：
 
    .. code-block:: bash
+      
+      docker run -v $(pwd):/paddle -e "WITH_GPU=OFF" -e "WITH_AVX=OFF" -e "WITH_TEST=ON" paddle:dev
 
-      docker run -d -p 2202:22 -v $PWD:/paddle paddle:dev sshd
+   以上命令会编译PaddlePaddle，生成运行程序，以及生成创建生产镜像的Dockerfile。所有生成的文件都在build目录下。“WITH_GPU”控制生成的生产镜像是否支持GPU，“WITH_AVX”控制生成的生产镜像是否支持AVX，“WITH_TEST”控制是否生成单元测试。
 
-   以上代码会启动一个带有PaddlePaddle开发环境的docker容器，源代码会被挂载到 :code:`/paddle` 。
-
-   以上的 :code:`docker run` 命令其实会启动一个在2202端口监听的SSHD服务器。这样，我们就能SSH进入我们的开发容器了：
+   第二步是运行：
 
    .. code-block:: bash
+      
+      docker build -t paddle:prod -f build/Dockerfile ./build
 
-      ssh root@localhost -p 2202
+   以上命令会按照生成的Dockerfile把生成的程序拷贝到生产镜像中并做相应的配置，最终生成名为paddle:prod的生产镜像。
 
-3. 在Docker开发环境中编译与安装PaddlPaddle代码
+3. 运行单元测试
 
-   当在容器里面的时候，可以用脚本 :code:`paddle/scripts/docker/build.sh` 来编译、安装与测试PaddlePaddle：
+   运行以下指令：
 
    .. code-block:: bash
-
-      /paddle/paddle/scripts/docker/build.sh
-
-   以上指令会在 :code:`/paddle/build` 中编译PaddlePaddle。通过以下指令可以运行单元测试：
-
-   .. code-block:: bash
-
-      cd /paddle/build
-      ctest
-
+      
+      docker run -it -v $(pwd):/paddle paddle:dev bash -c "cd /paddle/build && ctest"
 
 文档
 ----
diff --git a/doc/getstarted/build_and_install/docker_install_en.rst b/doc/getstarted/build_and_install/docker_install_en.rst
index 8fb9369e0e..03df497506 100644
--- a/doc/getstarted/build_and_install/docker_install_en.rst
+++ b/doc/getstarted/build_and_install/docker_install_en.rst
@@ -53,12 +53,20 @@ Docker is simple as long as we understand a few basic concepts:
 Usage of CPU-only and GPU Images
 ----------------------------------
 
-For each version of PaddlePaddle, we release two types of Docker images:
-development image and production image. Production image includes
-CPU-only version and a CUDA GPU version and their no-AVX versions. We
-put the docker images on `dockerhub.com
+We package PaddlePaddle's compile environment into a Docker image
+called the develop image; it contains all the compiling tools that
+PaddlePaddle needs. We also package the compiled PaddlePaddle program
+into a Docker image, called the production image; it contains the
+runtime environment that PaddlePaddle needs. For each version
+of PaddlePaddle, we release both of them. The production image includes
+a CPU-only version and a CUDA GPU version and their no-AVX versions.
+
+We put the docker images on `dockerhub.com
 <https://hub.docker.com/r/paddledev/paddle/>`_. You can find the
-latest versions under "tags" tab at dockerhub.com
+latest versions under "tags" tab at dockerhub.com. If you are in
+China, you can use our Docker image registry mirror to speed up the
+download process. To use it, please replace all paddlepaddle/paddle in
+the commands with docker.paddlepaddle.org/paddle.
 
 1. Production images, this image might have multiple variants:
 
@@ -179,59 +187,40 @@ Develop PaddlePaddle or Train Model Using C++ API
 We will be using PaddlePaddle development image since it contains all
 compiling tools and dependencies.
 
-Let's clone PaddlePaddle repo first:
+1. Build PaddlePaddle develop image
 
-.. code-block:: bash
-
-   git clone https://github.com/PaddlePaddle/Paddle.git && cd Paddle
-
-Mount both workspace folder and paddle code folder into docker
-container, so we can access them inside docker container. There are
-two ways of using PaddlePaddle development docker image:
-
-- run interactive bash directly
+   Use the following commands to build the PaddlePaddle develop image:
 
-  .. code-block:: bash
-
-     # use nvidia-docker instead of docker if you need to use GPU
-     docker run -it -v ~/workspace:/workspace -v $(pwd):/paddle paddlepaddle/paddle:0.10.0rc2-dev /bin/bash
-     # now we are inside docker container
+   .. code-block:: bash
 
-- or, we can run it as a daemon container
+      git clone https://github.com/PaddlePaddle/Paddle.git && cd Paddle
+      docker build -t paddle:dev .
 
-  .. code-block:: bash
+2. Build PaddlePaddle production image
 
-     # use nvidia-docker instead of docker if you need to use GPU
-     docker run -d -p 2202:22 -p 8888:8888 -v ~/workspace:/workspace -v $(pwd):/paddle paddlepaddle/paddle:0.10.0rc2-dev /usr/sbin/sshd -D
+   Building the production image takes two steps. The first step is to run:
 
-  and SSH to this container using password :code:`root`:
-
-  .. code-block:: bash
+   .. code-block:: bash
 
-     ssh -p 2202 root@localhost
+      docker run -v $(pwd):/paddle -e "WITH_GPU=OFF" -e "WITH_AVX=OFF" -e "WITH_TEST=ON" paddle:dev
 
-  An advantage is that we can run the PaddlePaddle container on a
-  remote server and SSH to it from a laptop.
+   The above command will compile PaddlePaddle and create a Dockerfile for building production image. All the generated files are in the build directory. "WITH_GPU" controls if the generated production image supports GPU. "WITH_AVX" controls if the generated production image supports AVX. "WITH_TEST" controls if the unit test will be generated.
 
-When developing PaddlePaddle, you can edit PaddlePaddle source code
-from outside of docker container using your favoriate editor. To
-compile PaddlePaddle, run inside container:
+   The second step is to run:
 
-.. code-block:: bash
+   .. code-block:: bash
 
-   WITH_GPU=OFF WITH_AVX=ON WITH_TEST=ON bash /paddle/paddle/scripts/docker/build.sh
+      docker build -t paddle:prod -f build/Dockerfile ./build
 
-This builds everything about Paddle in :code:`/paddle/build`.  And we
-can run unit tests there:
+   The above command will generate the production image by copying the compiled PaddlePaddle program into the image.
 
-.. code-block:: bash
+3. Run unit test
 
-   cd /paddle/build
-   ctest
+   The following command runs the unit tests:
 
-When training model using C++ API, we can edit paddle program in
-~/workspace outside of docker. And build from /workspace inside of
-docker.
+   .. code-block:: bash
+      
+      docker run -it -v $(pwd):/paddle paddle:dev bash -c "cd /paddle/build && ctest"
 
 PaddlePaddle Book
 ------------------
diff --git a/doc/getstarted/index_cn.rst b/doc/getstarted/index_cn.rst
index c6a4d3121c..cadf092f8f 100644
--- a/doc/getstarted/index_cn.rst
+++ b/doc/getstarted/index_cn.rst
@@ -2,7 +2,8 @@
 ============
 
 ..  toctree::
-  :maxdepth: 2
+  :maxdepth: 1
 
   build_and_install/index_cn.rst
-  basic_usage/index_cn.rst
+
+- `深度学习入门课程 <http://book.paddlepaddle.org/>`_
diff --git a/doc/getstarted/index_en.rst b/doc/getstarted/index_en.rst
index 55d95d8015..9f771e93e8 100644
--- a/doc/getstarted/index_en.rst
+++ b/doc/getstarted/index_en.rst
@@ -2,7 +2,8 @@ GET STARTED
 ============
 
 ..  toctree::
-  :maxdepth: 2
+  :maxdepth: 1
 
   build_and_install/index_en.rst
-  basic_usage/index_en.rst
+
+- `Deep Learning 101 <http://book.paddlepaddle.org/index.en.html>`_
diff --git a/doc/howto/deep_model/rnn/hierarchical_layer_cn.rst b/doc/howto/deep_model/rnn/hierarchical_layer_cn.rst
index 4b328fc9d3..79048e9248 100644
--- a/doc/howto/deep_model/rnn/hierarchical_layer_cn.rst
+++ b/doc/howto/deep_model/rnn/hierarchical_layer_cn.rst
@@ -19,18 +19,18 @@
 
 在 PaddlePaddle中，下面这些Layer能够接受双层序列作为输入，完成相应的计算。
 
-pooling_layer
-==============
+pooling
+========
 
-pooling_layer 的使用示例如下，详细见 :ref:`api_trainer_config_helpers_layers_pooling_layer` 配置API。
+pooling 的使用示例如下，详细见 :ref:`api_v2.layer_pooling` 配置API。
 
 ..	code-block:: bash
 
-        seq_pool = pooling_layer(input=layer,
-                                 pooling_type=AvgPooling(),
-                                 agg_level=AggregateLevel.EACH_SEQUENCE)
+        seq_pool = pooling(input=layer,
+                           pooling_type=pooling.Max(),
+                           agg_level=AggregateLevel.EACH_SEQUENCE)
         
-- `pooling_type` 目前支持两种，分别是：MaxPooling()和AvgPooling()。
+- `pooling_type` 目前支持两种，分别是：pooling.Max()和pooling.Avg()。
 
 - `agg_level=AggregateLevel.EACH_TIMESTEP` 时（默认值）：
 
@@ -47,7 +47,7 @@ pooling_layer 的使用示例如下，详细见 :ref:`api_trainer_config_helpers
 last_seq 和 first_seq
 =====================
 
-last_seq 的使用示例如下（ :ref:`api_trainer_config_helpers_layers_first_seq` 类似），详细见 :ref:`api_trainer_config_helpers_layers_last_seq` 配置API。
+last_seq 的使用示例如下（ :ref:`api_v2.layer_first_seq` 类似），详细见 :ref:`api_v2.layer_last_seq` 配置API。
 
 ..	code-block:: bash
 
@@ -65,16 +65,16 @@ last_seq 的使用示例如下（ :ref:`api_trainer_config_helpers_layers_first_
   - 输入：必须是一个双层序列
   - 输出：一个单层序列，其中每个元素是双层序列中每个subseq最后一个（或第一个）元素。
 
-expand_layer
-============
+expand
+======
 
-expand_layer 的使用示例如下，详细见 :ref:`api_trainer_config_helpers_layers_expand_layer` 配置API。
+expand 的使用示例如下，详细见 :ref:`api_v2.layer_expand` 配置API。
 
 ..	code-block:: bash
 
-        expand = expand_layer(input=layer1,
-                              expand_as=layer2,
-                              expand_level=ExpandLevel.FROM_TIMESTEP)
+        ex = expand(input=layer1,
+                    expand_as=layer2,
+                    expand_level=ExpandLevel.FROM_TIMESTEP)
         
 - `expand_level=ExpandLevel.FROM_TIMESTEP` 时（默认值）：
 
diff --git a/doc/howto/deep_model/rnn/index_cn.rst b/doc/howto/deep_model/rnn/index_cn.rst
index 9ecab5594c..9e805ca851 100644
--- a/doc/howto/deep_model/rnn/index_cn.rst
+++ b/doc/howto/deep_model/rnn/index_cn.rst
@@ -4,7 +4,6 @@ RNN相关模型
 ..  toctree::
   :maxdepth: 1
 
-  rnn_config_cn.rst
   recurrent_group_cn.md
   hierarchical_layer_cn.rst
   hrnn_rnn_api_compare_cn.rst
diff --git a/doc/howto/deep_model/rnn/index_en.rst b/doc/howto/deep_model/rnn/index_en.rst
index 7adc79873d..13a153b05c 100644
--- a/doc/howto/deep_model/rnn/index_en.rst
+++ b/doc/howto/deep_model/rnn/index_en.rst
@@ -1,7 +1,2 @@
 RNN Models
 ==========
-
-..  toctree::
-  :maxdepth: 1
-
-  rnn_config_en.rst
diff --git a/doc/howto/dev/contribute_to_paddle_cn.md b/doc/howto/dev/contribute_to_paddle_cn.md
index ee1b3213ea..775938612e 100644
--- a/doc/howto/dev/contribute_to_paddle_cn.md
+++ b/doc/howto/dev/contribute_to_paddle_cn.md
@@ -1,130 +1,219 @@
 # 如何贡献代码
 
 我们真诚地感谢您的贡献，欢迎通过 GitHub 的 fork 和 pull request 流程来提交代码。
- 
+
 ## 代码要求
-- 你的代码必须完全遵守 [doxygen](http://www.stack.nl/~dimitri/doxygen/) 的样式。
-- 确保编译器选项 WITH\_STYLE\_CHECK 已打开，并且编译能通过代码样式检查。
+- 代码注释请遵守 [Doxygen](http://www.stack.nl/~dimitri/doxygen/) 的样式。
+- 确保编译器选项 `WITH_STYLE_CHECK` 已打开，并且编译能通过代码样式检查。
 - 所有代码必须具有单元测试。
 - 通过所有单元测试。
 
 以下教程将指导您提交代码。
- 
 ## [Fork](https://help.github.com/articles/fork-a-repo/)
- 
-跳转到[PaddlePaddle](https://github.com/PaddlePaddle/Paddle) GitHub首页，然后单击 `Fork` 按钮。
+
+跳转到[PaddlePaddle](https://github.com/PaddlePaddle/Paddle) GitHub首页，然后单击 `Fork` 按钮，生成自己目录下的仓库，比如 <https://github.com/USERNAME/Paddle>。
 
 ## 克隆（Clone）
 
-Paddle 目前使用[git流分支模型](http://nvie.com/posts/a-successful-git-branching-model/)进行开发，测试，发行和维护。
-**develop** 是主分支，其他用户分支是特征分支（feature branches）。
+将远程仓库 clone 到本地：
+
+```bash
+➜  git clone https://github.com/USERNAME/Paddle
+➜  cd Paddle
+```
+
+
+## 创建本地分支
+
+Paddle 目前使用[Git流分支模型](http://nvie.com/posts/a-successful-git-branching-model/)进行开发，测试，发行和维护，具体请参考 [Paddle 分支规范](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/releasing_process.md#paddle-分支规范)。
 
-一旦你创建了一个fork，你可以使用你最喜欢的 git 客户端克隆你的仓库（repo）或只是直接在命令行输入：
+所有的 feature 和 bug fix 的开发工作都应该在一个新的分支上完成，一般从 `develop` 分支上创建新分支。
 
-```shell
-# 克隆 fork 到本地
-git clone --branch develop https://github.com/USERNAME/Paddle.git
+使用 `git checkout -b` 创建并切换到新分支。
+
+```bash
+➜  git checkout -b my-cool-stuff
 ```
-如果你的仓库不包含 **develop** 分支，你只需自己创建它。
 
-```shell
-git clone https://github.com/USERNAME/Paddle.git Paddle
-cd Paddle
-git checkout -b develop  # 创建 develop 分支
-git remote add upstream https://github.com/PaddlePaddle/Paddle.git  # 添加 upstream 到 baidu/Paddle
-git pull upstream develop  # 更新 upstream
+值得注意的是，在 checkout 之前，需要保持当前分支目录 clean，否则会把 untracked 的文件也带到新分支上，这可以通过 `git status` 查看。
+
+## 使用 `pre-commit` 钩子
+
+Paddle 开发人员使用 [pre-commit](http://pre-commit.com/) 工具来管理 Git 预提交钩子。 它可以帮助我们格式化源代码（C++，Python），在提交（commit）前自动检查一些基本事宜（如每个文件只有一个 EOL，Git 中不要添加大文件等）。
+
+`pre-commit`测试是 Travis-CI 中单元测试的一部分，不满足钩子的 PR 不能被提交到 Paddle，首先安装并在当前目录运行它：
+
+```bash
+➜  pip install pre-commit
+➜  pre-commit install
 ```
 
-然后你可以通过做一个本地开发分支开始开发
+Paddle 使用 `clang-format` 来调整 C/C++ 源代码格式，请确保 `clang-format` 版本在 3.8 以上。
 
-```shell
-git checkout -b MY_COOL_STUFF_BRANCH
+## 开始开发
+
+在本例中，我删除了 README.md 中的一行，并创建了一个新文件。
+
+通过 `git status` 查看当前状态，这会提示当前目录的一些变化，同时也可以通过 `git diff` 查看文件具体被修改的内容。
+
+```bash
+➜  git status
+On branch test
+Changes not staged for commit:
+  (use "git add <file>..." to update what will be committed)
+  (use "git checkout -- <file>..." to discard changes in working directory)
+
+	modified:   README.md
+
+Untracked files:
+  (use "git add <file>..." to include in what will be committed)
+
+	test
+
+no changes added to commit (use "git add" and/or "git commit -a")
 ```
 
-## 使用 `pre-commit` 钩子
+## 构建和测试
+
+编译 PaddlePaddle 的源码以及生成文档需要多种开发工具。为了方便大家，我们的标准开发流程是把这些工具都装进一个Docker image，称为*开发镜像*，通常名字是 `paddle:dev`。然后所有用 `cmake && make` 的地方（比如IDE配置里）都用 `docker run paddle:dev`来代替。
 
-Paddle 开发人员使用 [pre-commit](http://pre-commit.com/) 工具来管理git预提交钩子。 它可以帮助我们格式化源代码（cpp，python），在提交前检查一些基本事宜（每个文件只有一个 EOL 
-，git 中不要添加大文件）。 `pre-commit`测试是 Travis-CI 中单元测试的一部分，不满足钩子
-的 PR 不能提交代码到 Paddle。
+如要build这个开发镜像，在源码目录树的根目录中运行：
 
-你可以通过 `pip install pre-commit` 安装 [pre-commit](http://pre-commit.com/)，
-目前 Paddle 使用 `clang-format` 来调整C/C++源代码格式。请确保 clang-format 版本在3.8以上。
+```bash
+➜  docker build -t paddle:dev .
+```
 
-然后只需在 Paddle clone 目录中运行 `pre-commit install` 。当你
-提交你的代码时，pre-commit 钩子会检查本地代码是否存在
-不适合提交的东西，等等。
+随后可以用这个开发镜像来build PaddlePaddle的源码。比如如果要build一个不依赖GPU，但是支持AVX指令集，并且包括unit tests的PaddlePaddle，可以：
 
-## 提交（Commit）
+```bash
+➜  docker run -v $(pwd):/paddle -e "WITH_GPU=OFF" -e "WITH_AVX=ON" -e "WITH_TEST=ON" paddle:dev
+```
 
-提交你的代码：
+这个过程除了编译PaddlePaddle为 `./build/libpaddle.so`，并且输出一个 `./build/paddle.deb`文件之外，还会输出一个 `build/Dockerfile`。我们只需要运行下面命令把编译好的PaddlePaddle打包成一个*生产镜像*（`paddle:prod`）：
 
-```shell
-# 显示工作树状态
-git status
-# 添加修改过的文件
-git add xx
-env EDITOR=vim git commit  # 你可以用 vim/nano/emacs 写下你的注释
+```bash
+➜  docker build -t paddle:prod -f build/Dockerfile .
 ```
-提交信息的第一行是标题，其他行可以添加一些细节（如果有必要的话）。
 
-## 保持 Fork 状态最新
+如果要运行所有的单元测试，可以用如下命令：
 
-在拉（pull）你的请求（request）之前，你应该从最新的 PaddlePaddle 同步代码。
-为此，你需要首先添加远程（remote）：
+```bash
+➜  docker run -it -v $(pwd):/paddle paddle:dev bash -c "cd /paddle/build && ctest"
+```
 
-```shell
-# 观察当前远程仓库配置
-git remote -v
-# 添加上游（upstream）仓库
-git remote add upstream https://github.com/PaddlePaddle/Paddle.git
-# 验证新的 upstream
-git remote -v
+关于构建和测试的更多信息，请参见[这篇文档](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/getstarted/build_and_install/docker_install_cn.rst)。
+
+## 提交（commit）
+
+接下来我们取消对 README.md 文件的改变，然后提交新添加的 test 文件。
+
+```bash
+➜  git checkout -- README.md
+➜  git status
+On branch test
+Untracked files:
+  (use "git add <file>..." to include in what will be committed)
+
+	test
+
+nothing added to commit but untracked files present (use "git add" to track)
+➜  git add test
+```
+
+Git 每次提交代码，都需要写提交说明，这可以让其他人知道这次提交做了哪些改变，这可以通过`git commit` 完成。
+
+```bash
+➜  git commit
+CRLF end-lines remover...............................(no files to check)Skipped
+yapf.................................................(no files to check)Skipped
+Check for added large files..............................................Passed
+Check for merge conflicts................................................Passed
+Check for broken symlinks................................................Passed
+Detect Private Key...................................(no files to check)Skipped
+Fix End of Files.....................................(no files to check)Skipped
+clang-formater.......................................(no files to check)Skipped
+[my-cool-stuff c703c041] add test file
+ 1 file changed, 0 insertions(+), 0 deletions(-)
+ create mode 100644 233
+```
+
+## 保持本地仓库最新
+
+在准备发起 Pull Request 之前，需要同步原仓库（<https://github.com/PaddlePaddle/Paddle>）最新的代码。
+
+首先通过 `git remote` 查看当前远程仓库的名字。
+
+```bash
+➜  git remote
+origin
+➜  git remote -v
+origin	https://github.com/USERNAME/Paddle (fetch)
+origin	https://github.com/USERNAME/Paddle (push)
 ```
 
-用最新的 upstream 更新你的 fork：
+这里 origin 是我们 clone 的远程仓库的名字，也就是自己用户名下的 Paddle，接下来我们创建一个原始 Paddle 仓库的远程主机，命名为 upstream。
 
-```shell
-git pull --rebase upstream develop
+```bash
+➜  git remote add upstream https://github.com/PaddlePaddle/Paddle
+➜  git remote
+origin
+upstream
 ```
-如果本地没有提交，git 将简单地执行快进。但是，如果你一直在做一些改变（绝大多数情况下不应该），你可能要处理冲突。
 
-现在，你的本地主分支与上游修改的一致并是最新的。
+获取 upstream 的最新代码并更新当前分支。
 
-## 推送（Push）到 GitHub
+```bash
+➜  git fetch upstream
+➜  git pull upstream develop
+```
+
+## Push 到远程仓库
+
+将本地的修改推送到 GitHub 上，也就是 <https://github.com/USERNAME/Paddle>。
 
-```shell
-# 在 GitHub 上 push 你的仓库
-git push -u origin MY_COOL_STUFF_BRANCH  # 创建远程分支 MY_COOL_STUFF_BRANCH 到 origin.
+```bash
+# 推送到远程仓库 origin 的 my-cool-stuff 分支上
+➜  git push origin my-cool-stuff
 ```
 
-## 拉取请求（Pull Request）
+## 建立 Issue 并完成 Pull Request
+
+建立一个 Issue 描述问题，并记录它的编号。
+
+切换到所建分支，然后点击 `New pull request`。
+
+<img width="295" alt="screen shot 2017-04-26 at 9 09 28 pm" src="https://cloud.githubusercontent.com/assets/11692045/25436054/a6d98c66-2ac4-11e7-9cb1-18dd13150230.png">
 
-转到 GitHub上 你 fork 的页面，选择你的开发分支并单击 **pull request 按钮**。
+选择目标分支：
 
-## 使用最新版本更新你的 pull 请求
+<img width="750" alt="screen shot 2017-04-26 at 9 11 52 pm" src="https://cloud.githubusercontent.com/assets/11692045/25436139/f83b1e6c-2ac4-11e7-8c0e-add499023c46.png">
 
-在代码审查（code review）期间，由于 baidu/Paddle 中新的提交导致你的 pull 请求可能会失效。如果没有冲突，GitHub允许自动更新。 你可以点击 pull request 页面中的“更新分支（Update Branch）”按钮。 但是如果存在代码冲突，你需要手动进行更新。你需要在本地仓库执行如下命令：
+在 PR 的描述说明中，填写 `resolve #Issue编号` 可以在这个 PR 被 merge 后，自动关闭对应的 Issue，具体请见 <https://help.github.com/articles/closing-issues-via-commit-messages/>。
 
-```shell
-git checkout MY_COOL_STUFF_BRANCH
-git pull upstream develop
-# 你可能需要根据git提示解决冲突
-# 创建并测试你的代码
-git push origin MY_COOL_STUFF_BRANCH
+接下来等待 review，如果有需要修改的地方，参照上述步骤更新 origin 中的对应分支即可。
+
+## 删除远程分支
+
+在 PR 被 merge 进主仓库后，我们可以在 PR 的页面删除远程仓库的分支。
+
+<img width="775" alt="screen shot 2017-04-26 at 9 18 24 pm" src="https://cloud.githubusercontent.com/assets/11692045/25436457/e4cdd472-2ac5-11e7-9272-badc76c4a23e.png">
+
+也可以使用 `git push origin :分支名` 删除远程分支，如：
+
+```bash
+➜  git push origin :my-cool-stuff
 ```
-现在你的 Pull Request 是最新的了。
 
-## 修改你的 pull request
+## 删除本地分支
 
-当根据审阅者的意见修改 pull 请求时，请使用“git commit”而不是“git commit --amend”来提交更改，以便审阅者可以看到新的请求和旧的请求之间的区别。
+最后，删除本地分支。
 
-可能的命令是
+```bash
+# 切换到 develop 分支
+➜  git checkout develop 
 
-```shell
-git checkout MY_COOL_STUFF_BRANCH
-git pull upstream develop   # 将本地更新到最新的代码库
-# 可能会发生一些冲突
-# 开始开发吧！
-env EDITOR=vim git commit  # 添加修改日志
-git push origin MY_COOL_STUFF_BRANCH
+# 删除 my-cool-stuff 分支
+➜  git branch -D my-cool-stuff
 ```
+
+至此，我们就完成了一次代码贡献的过程。
diff --git a/doc/howto/dev/write_docs_cn.rst b/doc/howto/dev/write_docs_cn.rst
index 5051a89230..d536f53abc 100644
--- a/doc/howto/dev/write_docs_cn.rst
+++ b/doc/howto/dev/write_docs_cn.rst
@@ -8,7 +8,8 @@ PaddlePaddle的文档包括英文文档 ``doc`` 和中文文档 ``doc_cn`` 两
 如何构建PaddlePaddle的文档
 ==========================
 
-PaddlePaddle的文档构建有直接构建和基于Docker构建两种方式。构建PaddlePaddle文档需要准备的环境相对较复杂，所以我们推荐使用基于Docker来构建PaddlePaddle的文档。
+PaddlePaddle的文档构建有直接构建和基于Docker构建两种方式，我们提供了一个构建脚本build_docs.sh来进行构建。
+构建PaddlePaddle文档需要准备的环境相对较复杂，所以我们推荐使用基于Docker来构建PaddlePaddle的文档。
 
 
 使用Docker构建PaddlePaddle的文档
@@ -16,39 +17,62 @@ PaddlePaddle的文档构建有直接构建和基于Docker构建两种方式。
 
 使用Docker构建PaddlePaddle的文档，需要在系统里先安装好Docker工具包。Docker安装请参考 `Docker的官网 <https://docs.docker.com/>`_ 。安装好Docker之后可以使用源码目录下的脚本构建文档，即
 
-..	code-block:: bash
+..  code-block:: bash
 
-	cd TO_YOUR_PADDLE_CLONE_PATH
-	cd paddle/scripts/tools/build_docs
-	bash build_docs.sh
+    cd TO_YOUR_PADDLE_CLONE_PATH
+    cd paddle/scripts/tools/build_docs
+    bash build_docs.sh with_docker
 
-编译完成后，该目录下会生成如下两个子目录\:
+编译完成后，会在当前目录生成两个子目录\:
 
 * doc 英文文档目录
 * doc_cn 中文文档目录
 
 打开浏览器访问对应目录下的index.html即可访问本地文档。
 
-..	code-block:: bash
-
-	open doc_cn/index.html
 
 
 直接构建PaddlePaddle的文档
 --------------------------
 
-TBD
+因为PaddlePaddle的v2 api文档生成过程依赖于py_paddle Python包，用户需要首先确认py_paddle包已经安装。
+
+..  code-block:: bash
+
+    python -c "import py_paddle"
+
+如果提示错误，那么用户需要在本地编译安装PaddlePaddle，请参考 `源码编译文档 <http://www.paddlepaddle.org/develop/doc/getstarted/build_and_install/build_from_source_en.html>`_ 。
+注意，用户在首次编译安装PaddlePaddle时，请将WITH_DOC选项关闭。在编译安装正确之后，请再次确认py_paddle包已经安装，即可进行下一步操作。
+
+如果提示正确，可以执行以下命令编译生成文档，即
+
+..  code-block:: bash
+
+    cd TO_YOUR_PADDLE_CLONE_PATH
+    cd paddle/scripts/tools/build_docs
+    bash build_docs.sh local
+
+编译完成之后，会在当前目录生成两个子目录\:
+
+* doc 英文文档目录
+* doc_cn 中文文档目录
+
+打开浏览器访问对应目录下的index.html即可访问本地文档。
+
 
 如何书写PaddlePaddle的文档
 ==========================
 
-TBD
+PaddlePaddle文档使用 `sphinx`_ 自动生成，用户可以参考sphinx教程进行书写。
 
 如何更新www.paddlepaddle.org文档
 ================================
 
-TBD
+开发者给PaddlePaddle代码增加的注释以PR的形式提交到github中，提交方式可参见 `贡献文档 <http://paddlepaddle.org/develop/doc_cn/howto/dev/contribute_to_paddle_cn.html>`_ 。
+目前PaddlePaddle的develop分支的文档是自动触发更新的，用户可以分别查看最新的 `中文文档 <http://www.paddlepaddle.org/develop/doc_cn/>`_ 和
+`英文文档 <http://www.paddlepaddle.org/develop/doc/>`_ 。
+
 
 
-..	_cmake: https://cmake.org/
-..	_sphinx: http://www.sphinx-doc.org/en/1.4.8/
+..  _cmake: https://cmake.org/
+..  _sphinx: http://www.sphinx-doc.org/en/1.4.8/
diff --git a/doc/howto/usage/k8s/k8s_basis_cn.md b/doc/howto/usage/k8s/k8s_basis_cn.md
index 6278dacb17..4c3dc81ed3 100644
--- a/doc/howto/usage/k8s/k8s_basis_cn.md
+++ b/doc/howto/usage/k8s/k8s_basis_cn.md
@@ -14,7 +14,7 @@
 
 - [*PersistentVolume*](https://kubernetes.io/docs/user-guide/persistent-volumes/): 和[*PersistentVolumeClaim*](https://kubernetes.io/docs/user-guide/persistent-volumes/#persistentvolumeclaims)结合，将外部的存储服务在Kubernetes中描述成为统一的资源形式，便于存储资源管理和Pod引用。
 
-# 部署Kubernetes集群
+## 部署Kubernetes集群
 
 Kubernetes提供了多种集群部署的方案，本文档内不重复介绍。这里给出集中常见的部署方法：
 
@@ -25,7 +25,7 @@ Kubernetes提供了多种集群部署的方案，本文档内不重复介绍。
 
 可以参考[这个表格](https://kubernetes.io/docs/getting-started-guides/#table-of-solutions)选择适合您的场景的合适方案。
 
-# 选择存储方案
+## 选择存储方案
 
 容器不会保留在运行时生成的数据，job或者应用程序在容器中运行时生成的数据会在容器销毁时消失。为了完成分布式机器学习训练任务，需要有一个外部的存储服务来保存训练所需数据和训练输出。
 常见的可选存储服务包括：
@@ -35,9 +35,9 @@ Kubernetes提供了多种集群部署的方案，本文档内不重复介绍。
 - [*Ceph*](http://docs.ceph.com/docs/master/): 分布式文件系统，支持rbd，POSIX API接口(ceph fs)和对象存储API，参考[这里](https://kubernetes.io/docs/user-guide/volumes/#rbd)。
 - [*MooseFS*](https://moosefs.com/documentation.html): 一个分布式的存储系统。需要先挂载到服务器Node上再通过kubernetes hostPath Volume挂载到容器中。
 
-# 配置kubectl
+## 配置kubectl
 
-## 安装kubectl
+### 安装kubectl
 ```
 # OS X
 curl -LO https://storage.googleapis.com/kubernetes-release/release/$(curl -s https://storage.googleapis.com/kubernetes-release/release/stable.txt)/bin/darwin/amd64/kubectl
@@ -49,7 +49,7 @@ curl -LO https://storage.googleapis.com/kubernetes-release/release/$(curl -s htt
 curl -LO https://storage.googleapis.com/kubernetes-release/release/$(curl -s https://storage.googleapis.com/kubernetes-release/release/stable.txt)/bin/windows/amd64/kubectl.exe
 ```
 
-## 配置kubectl访问你的kubernetes集群
+### 配置kubectl访问你的kubernetes集群
 
 编辑`~/.kube/config`这个配置文件，修改`Master-IP`的地址。如果使用SSL认证，则需要配置`certificate-authority`和`users`中的用户证书。如果是使用非SSL方式访问（比如通过8080端口），也可以去掉这些证书的配置。
 ```
diff --git a/doc/index_cn.rst b/doc/index_cn.rst
index 460fedb565..9279bac7f4 100644
--- a/doc/index_cn.rst
+++ b/doc/index_cn.rst
@@ -5,7 +5,6 @@ PaddlePaddle 文档
   :maxdepth: 1
 
   getstarted/index_cn.rst
-  tutorials/index_cn.md
   howto/index_cn.rst
   api/index_cn.rst
   faq/index_cn.rst
diff --git a/doc/index_en.rst b/doc/index_en.rst
index 1d9cca7de7..168c7667c6 100644
--- a/doc/index_en.rst
+++ b/doc/index_en.rst
@@ -5,8 +5,6 @@ PaddlePaddle Documentation
   :maxdepth: 1
 
   getstarted/index_en.rst
-  tutorials/index_en.md
   howto/index_en.rst
   api/index_en.rst
   about/index_en.rst
- 
\ No newline at end of file
diff --git a/doc_theme/templates/layout.html b/doc_theme/templates/layout.html
index 034740369e..65e61c5f29 100644
--- a/doc_theme/templates/layout.html
+++ b/doc_theme/templates/layout.html
@@ -114,10 +114,7 @@
           </ul>
         </div>
         <ul class="site-page-links">
-          <li><a>Home</a></li>
-          <li><a>Get Started</a></li>
-          <li class="active"><a>Documentation</a></li>
-          <li><a>About Us</a></li>
+          <li><a href="/">Home</a></li>
         </ul>
       </div>
       <div class="doc-module">
@@ -137,7 +134,7 @@
           {{ toctree }}
         {% endblock %}
     </nav>
-    {% if toc %}
+    {% if False %}
     <nav class="local-toc">{{ toc }}</nav>
     {% endif %}
     <section class="doc-content-wrap">
@@ -168,7 +165,8 @@
             VERSION:'{{ release|e }}',
             COLLAPSE_INDEX:false,
             FILE_SUFFIX:'{{ '' if no_search_suffix else file_suffix }}',
-            HAS_SOURCE:  {{ has_source|lower }}
+            HAS_SOURCE:  {{ has_source|lower }},
+            SOURCELINK_SUFFIX: ".txt",
         };
     </script>
     {%- for scriptfile in script_files %}
diff --git a/paddle/CMakeLists.txt b/paddle/CMakeLists.txt
index 9d6d67e62c..eff296bcb0 100644
--- a/paddle/CMakeLists.txt
+++ b/paddle/CMakeLists.txt
@@ -9,6 +9,10 @@ add_subdirectory(pserver)
 add_subdirectory(trainer)
 add_subdirectory(scripts)
 
+if(WITH_C_API)
+    add_subdirectory(capi)
+endif()
+
 if(WITH_SWIG_PY)
   configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in
           ${CMAKE_CURRENT_SOURCE_DIR}/setup.py)
diff --git a/paddle/api/PaddleAPI.h b/paddle/api/PaddleAPI.h
index c4f5dca26c..d512040121 100644
--- a/paddle/api/PaddleAPI.h
+++ b/paddle/api/PaddleAPI.h
@@ -19,6 +19,7 @@ limitations under the License. */
 #include <stdexcept>
 #include <string>
 #include <vector>
+#include "paddle/gserver/gradientmachines/GradientMachine.h"
 #include "paddle/utils/Common.h"
 #include "paddle/utils/GlobalConstants.h"
 
@@ -468,8 +469,10 @@ private:
 };
 
 enum GradientMatchineCreateMode {
-  CREATE_MODE_NORMAL = 0,
-  CREATE_MODE_TESTING = 4
+  CREATE_MODE_NORMAL = paddle::GradientMachine::kNormal,
+  CREATE_MODE_SGD_SPARSE_CPU_TRAINING =
+      paddle::GradientMachine::kSgdSparseCpuTraining,
+  CREATE_MODE_TESTING = paddle::GradientMachine::kTesting
 };
 
 struct ParameterConfigPrivate;
@@ -817,7 +820,8 @@ private:
 public:
   static ParameterUpdater* createLocalUpdater(OptimizationConfig* config);
   static ParameterUpdater* createRemoteUpdater(OptimizationConfig* config,
-                                               int passCount);
+                                               int passCount,
+                                               bool useSparseUpdater);
   ~ParameterUpdater();
 
   /**
@@ -855,6 +859,13 @@ public:
    */
   void update(Parameter* param);
 
+  /**
+   * @brief only get required sparse rows by default.
+   * @param fullSize: get full matrix parameter if *fullSize* set
+   * @param apply: get PARAMETER_APPLY on pserver if *apply* set
+   */
+  void getParametersRemote(bool fullSize = false, bool apply = false);
+
   /**
    * @brief restore the average parameter.
    * @note It is only used in AverageOptimizer. Restore will get the current
diff --git a/paddle/api/ParameterUpdater.cpp b/paddle/api/ParameterUpdater.cpp
index 75b0ae7cb6..79921ea6e7 100644
--- a/paddle/api/ParameterUpdater.cpp
+++ b/paddle/api/ParameterUpdater.cpp
@@ -29,10 +29,22 @@ ParameterUpdater *ParameterUpdater::createLocalUpdater(
 }
 
 ParameterUpdater *ParameterUpdater::createRemoteUpdater(
-    OptimizationConfig *config, int passCount) {
+    OptimizationConfig *config, int passCount, bool useSparseUpdater) {
   auto updater = new ParameterUpdater();
-  updater->m->updater.reset(new paddle::RemoteParameterUpdater(
-      config->m->getConfig(), passCount, nullptr));
+  auto remoteUpdater = new paddle::RemoteParameterUpdater(
+      config->m->getConfig(), passCount, nullptr);
+  if (useSparseUpdater) {
+    std::unique_ptr<paddle::ParameterUpdater> remoteUpdaterPtr(remoteUpdater);
+    auto sparseRemoteUpdater =
+        new paddle::SparseRemoteParameterUpdaterComposite(
+            config->m->getConfig(),
+            passCount,
+            false,
+            std::move(remoteUpdaterPtr));
+    updater->m->updater.reset(sparseRemoteUpdater);
+  } else {
+    updater->m->updater.reset(remoteUpdater);
+  }
   return updater;
 }
 
@@ -59,6 +71,10 @@ void ParameterUpdater::update(Parameter *param) {
   m->updater->update(paddleParam);
 }
 
+void ParameterUpdater::getParametersRemote(bool fullSize, bool apply) {
+  m->updater->getParametersRemote(fullSize, apply);
+}
+
 void ParameterUpdater::restore() { m->updater->restore(); }
 
 void ParameterUpdater::apply() { m->updater->apply(); }
diff --git a/paddle/capi/Arguments.cpp b/paddle/capi/Arguments.cpp
new file mode 100644
index 0000000000..8b81ec69e6
--- /dev/null
+++ b/paddle/capi/Arguments.cpp
@@ -0,0 +1,117 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "arguments.h"
+#include "capi_private.h"
+
+using paddle::capi::cast;
+
+#define castArg(v) cast<paddle::capi::CArguments>(v)
+#define castIVec(v) cast<paddle::capi::CIVector>(v)
+
+extern "C" {
+paddle_arguments paddle_arguments_create_none() {
+  return new paddle::capi::CArguments();
+}
+
+paddle_error paddle_arguments_destroy(paddle_arguments args) {
+  if (args == nullptr) return kPD_NULLPTR;
+  delete castArg(args);
+  return kPD_NO_ERROR;
+}
+
+paddle_error paddle_arguments_get_size(paddle_arguments args, uint64_t* size) {
+  if (args == nullptr || size == nullptr) return kPD_NULLPTR;
+  *size = castArg(args)->args.size();
+  return kPD_NO_ERROR;
+}
+
+paddle_error paddle_arguments_resize(paddle_arguments args, uint64_t size) {
+  if (args == nullptr) return kPD_NULLPTR;
+  castArg(args)->args.resize(size);
+  return kPD_NO_ERROR;
+}
+
+paddle_error paddle_arguments_set_value(paddle_arguments args,
+                                        uint64_t ID,
+                                        paddle_matrix mat) {
+  if (args == nullptr || mat == nullptr) return kPD_NULLPTR;
+  auto m = paddle::capi::cast<paddle::capi::CMatrix>(mat);
+  if (m->mat == nullptr) return kPD_NULLPTR;
+  auto a = castArg(args);
+  if (ID >= a->args.size()) return kPD_OUT_OF_RANGE;
+  a->args[ID].value = m->mat;
+  return kPD_NO_ERROR;
+}
+
+paddle_error paddle_arguments_get_value(paddle_arguments args,
+                                        uint64_t ID,
+                                        paddle_matrix mat) {
+  if (args == nullptr || mat == nullptr) return kPD_NULLPTR;
+  auto m = paddle::capi::cast<paddle::capi::CMatrix>(mat);
+  auto a = castArg(args);
+  if (ID >= a->args.size()) return kPD_OUT_OF_RANGE;
+  m->mat = a->args[ID].value;
+  return kPD_NO_ERROR;
+}
+
+paddle_error paddle_arguments_get_ids(paddle_arguments args,
+                                      uint64_t ID,
+                                      paddle_ivector ids) {
+  if (args == nullptr || ids == nullptr) return kPD_NULLPTR;
+  auto iv = castIVec(ids);
+  auto a = castArg(args);
+  if (ID >= a->args.size()) return kPD_OUT_OF_RANGE;
+  iv->vec = a->args[ID].ids;
+  return kPD_NO_ERROR;
+}
+
+paddle_error paddle_arguments_set_ids(paddle_arguments args,
+                                      uint64_t ID,
+                                      paddle_ivector ids) {
+  //! TODO(lizhao): Complete this method.
+  if (args == nullptr || ids == nullptr) return kPD_NULLPTR;
+  auto iv = paddle::capi::cast<paddle::capi::CIVector>(ids);
+  if (iv->vec == nullptr) return kPD_NULLPTR;
+  auto a = castArg(args);
+  if (ID >= a->args.size()) return kPD_OUT_OF_RANGE;
+  a->args[ID].ids = iv->vec;
+  return kPD_NO_ERROR;
+}
+
+paddle_error paddle_arguments_set_sequence_start_pos(paddle_arguments args,
+                                                     uint64_t ID,
+                                                     uint32_t nestedLevel,
+                                                     paddle_ivector seqPos) {
+  if (args == nullptr || seqPos == nullptr) return kPD_NULLPTR;
+  auto iv = paddle::capi::cast<paddle::capi::CIVector>(seqPos);
+  if (iv->vec == nullptr) return kPD_NULLPTR;
+  auto a = castArg(args);
+  return a->accessSeqPos(ID, nestedLevel, [&iv](paddle::ICpuGpuVectorPtr& ptr) {
+    ptr = std::make_shared<paddle::ICpuGpuVector>(iv->vec);
+  });
+}
+
+paddle_error paddle_arguments_get_sequence_start_pos(paddle_arguments args,
+                                                     uint64_t ID,
+                                                     uint32_t nestedLevel,
+                                                     paddle_ivector seqPos) {
+  if (args == nullptr || seqPos == nullptr) return kPD_NULLPTR;
+  auto iv = paddle::capi::cast<paddle::capi::CIVector>(seqPos);
+  auto a = castArg(args);
+  return a->accessSeqPos(ID, nestedLevel, [&iv](paddle::ICpuGpuVectorPtr& ptr) {
+    iv->vec = ptr ? ptr->getMutableVector(false) : nullptr;  // unset => null
+  });
+}
+}
diff --git a/paddle/capi/CMakeLists.txt b/paddle/capi/CMakeLists.txt
new file mode 100644
index 0000000000..1b52a79ceb
--- /dev/null
+++ b/paddle/capi/CMakeLists.txt
@@ -0,0 +1,73 @@
+if (WITH_DOUBLE)
+  set(PADDLE_FLOAT_TYPE double)
+else ()
+  set(PADDLE_FLOAT_TYPE float)
+endif()
+
+# config.h is used by the C-API. It stores Paddle's build configuration in a
+# header, so users who just include capi.h get the build configuration
+# without explicitly setting -DPADDLE_WITH_DOUBLE when building their own
+# libraries.
+configure_file(config.h.in config.h @ONLY)
+
+# PaddleCAPI.h is the only header we exposed. It currently only used for model
+# inference.
+file(GLOB CAPI_HEADERS *.h)  # file(GLOB) yields absolute paths
+set(CAPI_PRIVATE_HEADER ${CMAKE_CURRENT_SOURCE_DIR}/capi_private.h)
+list(REMOVE_ITEM CAPI_HEADERS ${CAPI_PRIVATE_HEADER})  # keep it out of install
+file(GLOB CAPI_SOURCES *.cpp)
+
+# building paddle_capi
+add_library(paddle_capi STATIC ${CAPI_HEADERS} ${CAPI_PRIVATE_HEADER}
+  ${CAPI_SOURCES})
+
+target_include_directories(paddle_capi PUBLIC ${CMAKE_CURRENT_BINARY_DIR})
+
+add_style_check_target(paddle_capi ${CAPI_SOURCES} ${CAPI_HEADERS}
+  ${CAPI_PRIVATE_HEADER})  # check sources, public headers and private header
+
+add_dependencies(paddle_capi gen_proto_cpp)
+
+
+# combine all paddle static libraries together, into libpaddle_capi_whole.a
+# user should use PaddleCAPI as -lpaddle_capi_whole
+set(capi_whole_library libpaddle_capi_whole.a)
+add_custom_target(paddle_capi_whole ALL
+        COMMAND mkdir -p o_files/capi && cd o_files/capi/ && ar -x $<TARGET_FILE:paddle_capi>
+        COMMAND mkdir -p o_files/utils && cd o_files/utils/ && ar -x $<TARGET_FILE:paddle_utils>
+        COMMAND mkdir -p o_files/parameter && cd o_files/parameter/ && ar -x $<TARGET_FILE:paddle_parameter>
+        COMMAND mkdir -p o_files/math && cd o_files/math/  && ar -x $<TARGET_FILE:paddle_math>
+        COMMAND mkdir -p o_files/cuda && cd o_files/cuda/ && ar -x $<TARGET_FILE:paddle_cuda>
+        COMMAND mkdir -p o_files/function && cd o_files/function/ && ar -x $<TARGET_FILE:paddle_function>
+        COMMAND mkdir -p o_files/gserver && cd o_files/gserver/ && ar -x $<TARGET_FILE:paddle_gserver>
+        COMMAND mkdir -p o_files/proto && cd o_files/proto/ && ar -x $<TARGET_FILE:paddle_proto>
+        COMMAND mkdir -p o_files/network && cd o_files/network/ && ar -x $<TARGET_FILE:paddle_network>
+        COMMAND mkdir -p o_files/pserver && cd o_files/pserver/ && ar -x $<TARGET_FILE:paddle_pserver>
+        COMMAND ar crs ${capi_whole_library} `find ./o_files -name '*.o'`
+        COMMAND rm -rf o_files
+        WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
+        DEPENDS paddle_capi paddle_utils paddle_parameter paddle_math
+                paddle_cuda paddle_function paddle_gserver
+                paddle_proto paddle_pserver paddle_network
+        )
+set_target_properties(paddle_capi_whole
+  PROPERTIES IMPORTED_LOCATION ${CMAKE_CURRENT_BINARY_DIR}/${capi_whole_library})
+
+add_library(paddle_capi_shared SHARED ${CAPI_SOURCES})
+target_include_directories(paddle_capi_shared PUBLIC ${CMAKE_CURRENT_BINARY_DIR})
+link_paddle_exe(paddle_capi_shared)
+
+# install library & headers.
+install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${capi_whole_library} DESTINATION lib)
+install(FILES ${CAPI_HEADERS} DESTINATION include/paddle)
+install(FILES ${CMAKE_CURRENT_BINARY_DIR}/config.h DESTINATION include/paddle)
+install(TARGETS paddle_capi_shared DESTINATION lib)
+
+# this variable used for unittest
+set(PADDLE_CAPI_INC_PATH
+  ${CMAKE_CURRENT_BINARY_DIR}
+  ${CMAKE_CURRENT_SOURCE_DIR})
+
+if (WITH_TESTING)
+  add_subdirectory(tests)
+endif()
diff --git a/paddle/capi/Main.cpp b/paddle/capi/Main.cpp
new file mode 100644
index 0000000000..7f24561e9a
--- /dev/null
+++ b/paddle/capi/Main.cpp
@@ -0,0 +1,43 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <fenv.h>
+#include <stdlib.h>
+#include <string.h>
+#include <vector>
+#include "capi_private.h"
+#include "main.h"
+#include "paddle/trainer/TrainerConfigHelper.h"
+#include "paddle/utils/Excepts.h"
+#include "paddle/utils/PythonUtil.h"
+
+static void initPaddle(int argc, char** argv) {
+  paddle::initMain(argc, argv);
+  paddle::initPython(argc, argv);
+  feenableexcept(FE_INVALID | FE_DIVBYZERO | FE_OVERFLOW);
+}
+
+extern "C" {
+paddle_error paddle_init(int argc, char** argv) {
+  std::vector<char*> realArgv;
+  realArgv.reserve(argc + 1);
+  realArgv.push_back(strdup(""));
+  for (int i = 0; i < argc; ++i) {
+    realArgv.push_back(argv[i]);
+  }
+  initPaddle(argc + 1, realArgv.data());
+  free(realArgv[0]);
+  return kPD_NO_ERROR;
+}
+}
diff --git a/paddle/capi/Matrix.cpp b/paddle/capi/Matrix.cpp
new file mode 100644
index 0000000000..d898ebe261
--- /dev/null
+++ b/paddle/capi/Matrix.cpp
@@ -0,0 +1,123 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "capi_private.h"
+#include "hl_cuda.h"
+#include "matrix.h"
+
+#define cast(v) paddle::capi::cast<paddle::capi::CMatrix>(v)
+extern "C" {
+paddle_matrix paddle_matrix_create(uint64_t height,
+                                   uint64_t width,
+                                   bool useGpu) {
+  auto ptr = new paddle::capi::CMatrix();
+  ptr->mat = paddle::Matrix::create(height, width, false, useGpu);
+  return ptr;
+}
+
+paddle_matrix paddle_matrix_create_none() {
+  return new paddle::capi::CMatrix();
+}
+
+paddle_error paddle_matrix_destroy(paddle_matrix mat) {
+  if (mat == nullptr) return kPD_NULLPTR;
+  auto ptr = cast(mat);
+  delete ptr;
+  return kPD_NO_ERROR;
+}
+
+paddle_error paddle_matrix_set_row(paddle_matrix mat,
+                                   uint64_t rowID,
+                                   paddle_real* rowArray) {
+  if (mat == nullptr || rowArray == nullptr) return kPD_NULLPTR;
+  auto ptr = cast(mat);
+  if (ptr->mat == nullptr) return kPD_NULLPTR;  // handle holds no matrix
+  if (rowID >= ptr->mat->getHeight()) return kPD_OUT_OF_RANGE;
+  paddle::real* buf = ptr->mat->getRowBuf(rowID);
+  size_t width = ptr->mat->getWidth();  // rowArray must hold width values
+#ifndef PADDLE_ONLY_CPU
+  hl_memcpy(buf, rowArray, sizeof(paddle::real) * width);
+#else
+  std::copy(rowArray, rowArray + width, buf);
+#endif
+  return kPD_NO_ERROR;
+}
+
+paddle_error paddle_matrix_get_row(paddle_matrix mat,
+                                   uint64_t rowID,
+                                   paddle_real** rawRowBuffer) {
+  if (mat == nullptr || rawRowBuffer == nullptr) return kPD_NULLPTR;
+  auto ptr = cast(mat);
+  if (ptr->mat == nullptr) return kPD_NULLPTR;  // handle holds no matrix
+  if (rowID >= ptr->mat->getHeight()) return kPD_OUT_OF_RANGE;
+  *rawRowBuffer = ptr->mat->getRowBuf(rowID);  // points into the matrix
+  return kPD_NO_ERROR;
+}
+
+// Writes the matrix dimensions to whichever of `height`/`width` is non-null.
+paddle_error paddle_matrix_get_shape(paddle_matrix mat,
+                                     uint64_t* height,
+                                     uint64_t* width) {
+  if (mat == nullptr) return kPD_NULLPTR;
+  auto ptr = cast(mat);
+  // A handle from paddle_matrix_create_none() has no underlying matrix yet.
+  if (ptr->mat == nullptr) return kPD_NULLPTR;
+  if (height != nullptr) *height = ptr->mat->getHeight();
+  if (width != nullptr) *width = ptr->mat->getWidth();
+  return kPD_NO_ERROR;
+}
+}
+
+paddle_matrix paddle_matrix_create_sparse(
+    uint64_t height, uint64_t width, uint64_t nnz, bool isBinary, bool useGpu) {
+  auto ptr = new paddle::capi::CMatrix();
+  ptr->mat = paddle::Matrix::createSparseMatrix(
+      height,
+      width,
+      nnz,
+      isBinary ? paddle::NO_VALUE : paddle::FLOAT_VALUE,
+      paddle::SPARSE_CSR,
+      false,
+      useGpu);
+  return ptr;
+}
+
+paddle_error paddle_matrix_sparse_copy_from(paddle_matrix mat,
+                                            int* rowArray,
+                                            uint64_t rowSize,
+                                            int* colArray,
+                                            uint64_t colSize,
+                                            float* valueArray,
+                                            uint64_t valueSize) {
+  if (mat == nullptr) return kPD_NULLPTR;
+  auto ptr = cast(mat);
+  if (rowArray == nullptr || colArray == nullptr ||
+      (valueSize != 0 && valueArray == nullptr) || ptr->mat == nullptr) {
+    return kPD_NULLPTR;
+  }
+  if (auto sparseMat = dynamic_cast<paddle::CpuSparseMatrix*>(ptr->mat.get())) {
+    std::vector<int> row(rowSize);
+    row.assign(rowArray, rowArray + rowSize);
+    std::vector<int> col(colSize);
+    col.assign(colArray, colArray + colSize);
+    std::vector<paddle_real> val(valueSize);
+    if (valueSize) {
+      val.assign(valueArray, valueArray + valueSize);
+    }
+    sparseMat->copyFrom(row, col, val);
+    return kPD_NO_ERROR;
+  } else {
+    return kPD_NOT_SUPPORTED;
+  }
+}
diff --git a/paddle/capi/Vector.cpp b/paddle/capi/Vector.cpp
new file mode 100644
index 0000000000..564708e963
--- /dev/null
+++ b/paddle/capi/Vector.cpp
@@ -0,0 +1,69 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "capi_private.h"
+#include "vector.h"
+
+using paddle::capi::cast;
+
+extern "C" {
+
+paddle_ivector paddle_ivector_create_none() {
+  return new paddle::capi::CIVector();
+}
+
+paddle_ivector paddle_ivector_create(int* array,
+                                     uint64_t size,
+                                     bool copy,
+                                     bool useGPU) {
+  auto ptr = new paddle::capi::CIVector();
+  if (copy) {
+    ptr->vec = paddle::IVector::create(size, useGPU);
+    ptr->vec->copyFrom(array, size);
+  } else {
+    ptr->vec = paddle::IVector::create(array, size, useGPU);
+  }
+  return ptr;
+}
+
+paddle_error paddle_ivector_destroy(paddle_ivector ivec) {
+  if (ivec == nullptr) return kPD_NULLPTR;
+  delete cast<paddle::capi::CIVector>(ivec);
+  return kPD_NO_ERROR;
+}
+
+paddle_error paddle_ivector_get(paddle_ivector ivec, int** buffer) {
+  if (ivec == nullptr || buffer == nullptr) return kPD_NULLPTR;
+  auto v = cast<paddle::capi::CIVector>(ivec);
+  if (v->vec == nullptr) return kPD_NULLPTR;
+  *buffer = v->vec->getData();
+  return kPD_NO_ERROR;
+}
+
+paddle_error paddle_ivector_resize(paddle_ivector ivec, uint64_t size) {
+  if (ivec == nullptr) return kPD_NULLPTR;
+  auto v = cast<paddle::capi::CIVector>(ivec);
+  if (v->vec == nullptr) return kPD_NULLPTR;
+  v->vec->resize(size);
+  return kPD_NO_ERROR;
+}
+
+paddle_error paddle_ivector_get_size(paddle_ivector ivec, uint64_t* size) {
+  if (ivec == nullptr || size == nullptr) return kPD_NULLPTR;
+  auto v = cast<paddle::capi::CIVector>(ivec);
+  if (v->vec == nullptr) return kPD_NULLPTR;  // handle holds no vector
+  *size = v->vec->getSize();
+  return kPD_NO_ERROR;
+}
+}
diff --git a/paddle/capi/arguments.h b/paddle/capi/arguments.h
new file mode 100644
index 0000000000..d71ea26a5d
--- /dev/null
+++ b/paddle/capi/arguments.h
@@ -0,0 +1,145 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifndef __PADDLE_CAPI_ARGUMENTS_H__
+#define __PADDLE_CAPI_ARGUMENTS_H__
+
+#include <stdint.h>
+#include "config.h"
+#include "error.h"
+#include "matrix.h"
+#include "vector.h"
+
+/**
+ * Arguments functions. Each argument represents a layer output. Arguments is
+ * an array of argument.
+ */
+typedef void* paddle_arguments;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * @brief paddle_arguments_create_none Create an array of arguments whose size
+ * is zero.
+ * @return Arguments
+ */
+PD_API paddle_arguments paddle_arguments_create_none();
+
+/**
+ * @brief paddle_arguments_destroy Destroy the arguments
+ * @param args arguments to destroy
+ * @return paddle_error
+ */
+PD_API paddle_error paddle_arguments_destroy(paddle_arguments args);
+
+/**
+ * @brief paddle_arguments_get_size Get size of arguments array
+ * @param [in] args arguments array
+ * @param [out] size array size
+ * @return paddle_error
+ */
+PD_API paddle_error paddle_arguments_get_size(paddle_arguments args,
+                                              uint64_t* size);
+
+/**
+ * @brief PDArgsResize Resize a arguments array.
+ * @param args arguments array.
+ * @param size target size of array
+ * @return paddle_error
+ */
+PD_API paddle_error paddle_arguments_resize(paddle_arguments args,
+                                            uint64_t size);
+
+/**
+ * @brief PDArgsSetValue Set value matrix of one argument in array, which index
+ *        is `ID`.
+ * @param args arguments array
+ * @param ID array index
+ * @param mat matrix pointer
+ * @return paddle_error
+ */
+PD_API paddle_error paddle_arguments_set_value(paddle_arguments args,
+                                               uint64_t ID,
+                                               paddle_matrix mat);
+
+/**
+ * @brief PDArgsGetValue Get value matrix of one argument in array, which index
+ *        is `ID`.
+ * @param [in] args arguments array
+ * @param [in] ID array index
+ * @param [out] mat matrix pointer
+ * @return paddle_error
+ */
+PD_API paddle_error paddle_arguments_get_value(paddle_arguments args,
+                                               uint64_t ID,
+                                               paddle_matrix mat);
+
+/**
+ * @brief PDArgsGetIds Get the integer vector of one argument in array, which
+ *        index is `ID`.
+ * @param args arguments array
+ * @param ID array index
+ * @param ids integer vector pointer
+ * @return paddle_error
+ */
+PD_API paddle_error paddle_arguments_get_ids(paddle_arguments args,
+                                             uint64_t ID,
+                                             paddle_ivector ids);
+
+/**
+ * @brief PDArgsSetIds Set the integer vector of one argument in array, which
+ *        index is `ID`.
+ * @param [in] args arguments array
+ * @param [in] ID array index
+ * @param [in] ids integer vector pointer
+ * @return paddle_error
+ */
+PD_API paddle_error paddle_arguments_set_ids(paddle_arguments args,
+                                             uint64_t ID,
+                                             paddle_ivector ids);
+
+/**
+ * @brief PDArgsSetSequenceStartPos Set sequence start position vector of one
+ *        argument in array, which index is `ID`.
+ * @param args arguments array
+ * @param ID array index
+ * @param seqPos sequence position array.
+ * @return paddle_error
+ */
+PD_API paddle_error
+paddle_arguments_set_sequence_start_pos(paddle_arguments args,
+                                        uint64_t ID,
+                                        uint32_t nestedLevel,
+                                        paddle_ivector seqPos);
+/**
+ * @brief PDArgsGetSequenceStartPos Get sequence start position vector of one
+ *        argument in array, which index is `ID`.
+ * @param [in] args arguments array
+ * @param [in] ID array index
+ * @param [out] seqPos sequence position array
+ * @return paddle_error
+ */
+PD_API paddle_error
+paddle_arguments_get_sequence_start_pos(paddle_arguments args,
+                                        uint64_t ID,
+                                        uint32_t nestedLevel,
+                                        paddle_ivector seqPos);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/paddle/capi/capi.h b/paddle/capi/capi.h
new file mode 100644
index 0000000000..4097a1a35a
--- /dev/null
+++ b/paddle/capi/capi.h
@@ -0,0 +1,32 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifndef __PADDLE_CAPI_H__
+#define __PADDLE_CAPI_H__
+
+/**
+ * Paddle C API. It will replace SWIG as Multiple Language API for model
+ * training & inference. Currently it is only used in model inference.
+ *
+ * NOTE: This is an experimental API, it could be changed.
+ */
+#include "arguments.h"
+#include "config.h"
+#include "error.h"
+#include "gradient_machine.h"
+#include "main.h"
+#include "matrix.h"
+#include "vector.h"
+
+#endif  // __PADDLE_CAPI_H__
diff --git a/paddle/capi/capi_private.h b/paddle/capi/capi_private.h
new file mode 100644
index 0000000000..c7cdbd5f6f
--- /dev/null
+++ b/paddle/capi/capi_private.h
@@ -0,0 +1,82 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "capi.h"
+#include "paddle/gserver/gradientmachines/GradientMachine.h"
+#include "paddle/math/Matrix.h"
+#include "paddle/math/Vector.h"
+#include "paddle/parameter/Argument.h"
+#pragma once
+
+namespace paddle {
+namespace capi {
+
+enum CType { kIVECTOR = 0, kMATRIX, kARGUMENTS, kGRADIENT_MACHINE };
+
+#define STRUCT_HEADER CType type;
+
+struct CHeader {
+  STRUCT_HEADER
+};
+
+struct CIVector {
+  STRUCT_HEADER
+  IVectorPtr vec;
+
+  CIVector() : type(kIVECTOR) {}
+};
+
+struct CMatrix {
+  STRUCT_HEADER
+  MatrixPtr mat;
+
+  CMatrix() : type(kMATRIX) {}
+};
+
+struct CArguments {
+  STRUCT_HEADER
+  std::vector<paddle::Argument> args;
+
+  CArguments() : type(kARGUMENTS) {}
+
+  template <typename T>
+  paddle_error accessSeqPos(uint64_t ID, uint32_t nestedLevel, T callback) {
+    if (ID >= args.size()) return kPD_OUT_OF_RANGE;
+    switch (nestedLevel) {
+      case 0:
+        callback(args[ID].sequenceStartPositions);
+        break;
+      case 1:
+        callback(args[ID].subSequenceStartPositions);
+        break;
+      default:
+        return kPD_OUT_OF_RANGE;
+    }
+    return kPD_NO_ERROR;
+  }
+};
+
+struct CGradientMachine {
+  STRUCT_HEADER
+  paddle::GradientMachinePtr machine;
+
+  CGradientMachine() : type(kGRADIENT_MACHINE) {}
+};
+
+template <typename T>
+inline T* cast(void* ptr) {
+  return reinterpret_cast<T*>(ptr);
+}
+}  // namespace capi
+}  // namespace paddle
diff --git a/paddle/capi/config.h.in b/paddle/capi/config.h.in
new file mode 100644
index 0000000000..d205307588
--- /dev/null
+++ b/paddle/capi/config.h.in
@@ -0,0 +1,10 @@
+#ifndef __PADDLE_PADDLE_CAPI_CONFIG_H_INCLUDED__
+#define __PADDLE_PADDLE_CAPI_CONFIG_H_INCLUDED__
+
+typedef @PADDLE_FLOAT_TYPE@ paddle_real;
+
+// Only Linux and macOS builds are supported, so the compiler is always clang
+// or gcc 4.8+; exporting a symbol only needs the visibility attribute below.
+#define PD_API __attribute__((visibility("default")))
+
+#endif
diff --git a/paddle/capi/error.h b/paddle/capi/error.h
new file mode 100644
index 0000000000..44d8c2040d
--- /dev/null
+++ b/paddle/capi/error.h
@@ -0,0 +1,30 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifndef __PADDLE_CAPI_ERROR_H__
+#define __PADDLE_CAPI_ERROR_H__
+
+/**
+ * Error codes returned by Paddle C-API functions; kPD_NO_ERROR means success.
+ */
+typedef enum {
+  kPD_NO_ERROR = 0,         /* the call succeeded */
+  kPD_NULLPTR = 1,          /* a required pointer or handle was null */
+  kPD_OUT_OF_RANGE = 2,     /* an index or nesting level was out of range */
+  kPD_PROTOBUF_ERROR = 3,   /* protobuf data failed to parse or was incomplete */
+  kPD_NOT_SUPPORTED = 4,    /* NOTE(review): no use visible here; confirm meaning */
+  kPD_UNDEFINED_ERROR = -1, /* NOTE(review): no use visible here; confirm meaning */
+} paddle_error;
+
+#endif
diff --git a/paddle/capi/examples/.gitignore b/paddle/capi/examples/.gitignore
new file mode 100644
index 0000000000..2caa0a5a29
--- /dev/null
+++ b/paddle/capi/examples/.gitignore
@@ -0,0 +1,2 @@
+*.bin
+build-*
diff --git a/paddle/capi/examples/README.md b/paddle/capi/examples/README.md
new file mode 100644
index 0000000000..14013e281f
--- /dev/null
+++ b/paddle/capi/examples/README.md
@@ -0,0 +1,3 @@
+# C-API Example Usage
+
+* [Model Inference](./model_inference/README.md)
diff --git a/paddle/capi/examples/model_inference/README.md b/paddle/capi/examples/model_inference/README.md
new file mode 100644
index 0000000000..58e6c83140
--- /dev/null
+++ b/paddle/capi/examples/model_inference/README.md
@@ -0,0 +1,42 @@
+# Use C-API for Model Inference
+
+There are several examples in this directory about how to use Paddle C-API for model inference.
+
+## Convert configuration file to protobuf binary.
+
+Firstly, the user should convert Paddle's model configuration file into a protobuf binary file. In each example directory, there is a file named `convert_protobin.sh`. It will convert `trainer_config.conf` into `trainer_config.bin`.
+
+`convert_protobin.sh` is very simple: it just invokes the `dump_config` Python module to dump the binary file. The command-line usage is:
+
+```bash
+python -m paddle.utils.dump_config YOUR_CONFIG_FILE 'CONFIG_EXTRA_ARGS' --binary > YOUR_CONFIG_FILE.bin
+```
+
+## Initialize paddle
+
+```c++
+char* argv[] = {"--use_gpu=False"};
+paddle_init(1, (char**)argv);
+```
+
+We must initialize the global context before invoking any other Paddle interface. The initialization arguments are the same as the `paddle_trainer` command-line arguments; `paddle train --help` shows the full list. The most important argument is `use_gpu`, which selects whether the GPU is used.
+
+## Load network and parameters
+
+```c
+paddle_gradient_machine machine;
+paddle_gradient_machine_create_for_inference(&machine, config_file_content, content_size);
+paddle_gradient_machine_load_parameter_from_disk(machine, "./some_where_to_params");
+```
+
+The gradient machine is a Paddle concept that represents a neural network which can run forward and backward passes. We can create a gradient machine for model inference and load the parameter files from disk.
+
+Moreover, if we want to run inference in multiple threads, we can create a thread-local gradient machine that shares the same parameters by using the `paddle_gradient_machine_create_shared_param` API. Please refer to `multi_thread` for an example.
+
+## Create input
+
+The input of a neural network is an `arguments`. The examples in this directory will show how to construct different types of inputs for prediction. Please look at `dense`, `sparse_binary`, `sequence` for details.
+
+## Get inference
+
+After invoking `paddle_gradient_machine_forward`, we can get the output of the neural network. The `value` matrix of the output arguments stores the network's output values. If the output layer uses `SoftmaxActivation`, the `value` matrix holds the class probabilities for each input sample: its height is the number of samples and its width is the number of categories.
diff --git a/paddle/capi/examples/model_inference/common/common.h b/paddle/capi/examples/model_inference/common/common.h
new file mode 100644
index 0000000000..a78522e4a7
--- /dev/null
+++ b/paddle/capi/examples/model_inference/common/common.h
@@ -0,0 +1,52 @@
+#ifndef __CAPI_EXAMPLE_COMMON_H__
+#define __CAPI_EXAMPLE_COMMON_H__
+#include <stdio.h>
+#include <stdlib.h>
+
+/**
+ * Evaluate a Paddle C-API call and abort the example if it fails.
+ *
+ * `stmt` must yield a paddle_error. For any value other than kPD_NO_ERROR the
+ * error code and the text of `stmt` are printed to stderr and the process
+ * exits with the error code as its status.
+ */
+#define CHECK(stmt)                                               \
+  do {                                                            \
+    paddle_error check_err_ = (stmt);                             \
+    if (check_err_ != kPD_NO_ERROR) {                             \
+      fprintf(stderr, "Invoke paddle error %d: %s\n", check_err_, \
+              #stmt);                                             \
+      exit(check_err_);                                           \
+    }                                                             \
+  } while (0)
+
+/**
+ * Read a whole file into a newly allocated buffer.
+ *
+ * @param filename path of the file to read.
+ * @param size [out] set to the file length in bytes on success.
+ * @return buffer obtained from malloc() holding the file content (the caller
+ *         releases it with free()), or NULL if the file cannot be opened,
+ *         measured or read completely, or if allocation fails.
+ */
+void* read_config(const char* filename, long* size) {
+  // Binary mode: the content is copied byte for byte, without text translation.
+  FILE* file = fopen(filename, "rb");
+  if (file == NULL) return NULL;
+
+  void* buf = NULL;
+  long length = -1;
+  if (fseek(file, 0L, SEEK_END) == 0) length = ftell(file);
+  if (length >= 0 && fseek(file, 0L, SEEK_SET) == 0) {
+    // malloc(0) may legitimately return NULL, so request at least one byte.
+    buf = malloc(length > 0 ? (size_t)length : 1);
+    if (buf != NULL && fread(buf, 1, (size_t)length, file) != (size_t)length) {
+      free(buf);
+      buf = NULL;
+    }
+  }
+  fclose(file);
+  if (buf != NULL) *size = length;
+  return buf;
+}
+#endif
diff --git a/paddle/capi/examples/model_inference/dense/CMakeLists.txt b/paddle/capi/examples/model_inference/dense/CMakeLists.txt
new file mode 100644
index 0000000000..008a488fd9
--- /dev/null
+++ b/paddle/capi/examples/model_inference/dense/CMakeLists.txt
@@ -0,0 +1,9 @@
+# Builds the `dense` example; the Paddle C-API shared library must be on the
+# linker search path. cmake_minimum_required() comes first so that policies
+# are set before project() runs.
+cmake_minimum_required(VERSION 2.8)
+project(dense)
+add_executable(${PROJECT_NAME} main.c)
+# main.c relies on C99; the C_STANDARD property takes effect with CMake >= 3.1.
+set_property(TARGET ${PROJECT_NAME} PROPERTY C_STANDARD 99)
+target_link_libraries(${PROJECT_NAME} paddle_capi_shared)
diff --git a/paddle/capi/examples/model_inference/dense/convert_protobin.sh b/paddle/capi/examples/model_inference/dense/convert_protobin.sh
new file mode 100755
index 0000000000..30ffc316ec
--- /dev/null
+++ b/paddle/capi/examples/model_inference/dense/convert_protobin.sh
@@ -0,0 +1,2 @@
+#!/bin/bash
+python -m paddle.utils.dump_config trainer_config.py '' --binary > trainer_config.bin  # '' = no extra config args; writes the binary protobuf read by main.c
diff --git a/paddle/capi/examples/model_inference/dense/main.c b/paddle/capi/examples/model_inference/dense/main.c
new file mode 100644
index 0000000000..3e6bd52850
--- /dev/null
+++ b/paddle/capi/examples/model_inference/dense/main.c
@@ -0,0 +1,69 @@
+#include <paddle/capi.h>
+#include <time.h>
+#include "../common/common.h"
+
+#define CONFIG_BIN "./trainer_config.bin"
+
+int main() {
+  // Initialize Paddle.
+  char* argv[] = {"--use_gpu=False"};
+  CHECK(paddle_init(1, (char**)argv));
+
+  // Read the binary config generated by `convert_protobin.sh`.
+  long size;
+  void* buf = read_config(CONFIG_BIN, &size);
+
+  // Create a gradient machine for inference.
+  paddle_gradient_machine machine;
+  CHECK(paddle_gradient_machine_create_for_inference(&machine, buf, (int)size));
+  free(buf);  // only needed while the machine was being created
+  CHECK(paddle_gradient_machine_randomize_param(machine));
+
+  // Loading parameter. Uncomment the following line and change the directory.
+  // CHECK(paddle_gradient_machine_load_parameter_from_disk(machine,
+  //                                                "./some_where_to_params"));
+  paddle_arguments in_args = paddle_arguments_create_none();
+
+  // There is only one input of this network.
+  CHECK(paddle_arguments_resize(in_args, 1));
+
+  // Create input matrix.
+  paddle_matrix mat = paddle_matrix_create(/* sample_num */ 1,
+                                           /* size */ 784,
+                                           /* useGPU */ false);
+  srand(time(0));
+  paddle_real* array;
+
+  // Fill the first (and only) row with random values in [0, 1].
+  CHECK(paddle_matrix_get_row(mat, 0, &array));
+  for (int i = 0; i < 784; ++i) {
+    array[i] = rand() / ((float)RAND_MAX);
+  }
+
+  CHECK(paddle_arguments_set_value(in_args, 0, mat));
+
+  paddle_arguments out_args = paddle_arguments_create_none();
+  CHECK(paddle_gradient_machine_forward(machine,
+                                        in_args,
+                                        out_args,
+                                        /* isTrain */ false));
+  paddle_matrix prob = paddle_matrix_create_none();
+
+  CHECK(paddle_arguments_get_value(out_args, 0, prob));
+
+  CHECK(paddle_matrix_get_row(prob, 0, &array));
+
+  printf("Prob: ");
+  for (int i = 0; i < 10; ++i) {
+    printf("%.2f ", array[i]);
+  }
+  printf("\n");
+
+  CHECK(paddle_matrix_destroy(prob));
+  CHECK(paddle_arguments_destroy(out_args));
+  CHECK(paddle_matrix_destroy(mat));
+  CHECK(paddle_arguments_destroy(in_args));
+  CHECK(paddle_gradient_machine_destroy(machine));
+
+  return 0;
+}
diff --git a/paddle/capi/examples/model_inference/dense/trainer_config.py b/paddle/capi/examples/model_inference/dense/trainer_config.py
new file mode 100644
index 0000000000..873ec119e7
--- /dev/null
+++ b/paddle/capi/examples/model_inference/dense/trainer_config.py
@@ -0,0 +1,18 @@
+from paddle.trainer_config_helpers import *
+# Network input: a data layer named 'pixel' of size 784.
+img = data_layer(name='pixel', size=784)
+# Fully connected layer of size 200 with explicitly named weight and bias.
+hidden = fc_layer(
+    input=img,
+    size=200,
+    param_attr=ParamAttr(name='hidden.w'),
+    bias_attr=ParamAttr(name='hidden.b'))
+# Fully connected softmax layer of size 10, fed by `hidden`.
+prob = fc_layer(
+    input=hidden,
+    size=10,
+    act=SoftmaxActivation(),
+    param_attr=ParamAttr(name='prob.w'),
+    bias_attr=ParamAttr(name='prob.b'))
+# Declare `prob` as the output of the network.
+outputs(prob)
diff --git a/paddle/capi/examples/model_inference/multi_thread/.gitignore b/paddle/capi/examples/model_inference/multi_thread/.gitignore
new file mode 100644
index 0000000000..fab7372d79
--- /dev/null
+++ b/paddle/capi/examples/model_inference/multi_thread/.gitignore
@@ -0,0 +1,73 @@
+# This file is used to ignore files which are generated
+# ----------------------------------------------------------------------------
+
+*~
+*.autosave
+*.a
+*.core
+*.moc
+*.o
+*.obj
+*.orig
+*.rej
+*.so
+*.so.*
+*_pch.h.cpp
+*_resource.rc
+*.qm
+.#*
+*.*#
+core
+!core/
+tags
+.DS_Store
+.directory
+*.debug
+Makefile*
+*.prl
+*.app
+moc_*.cpp
+ui_*.h
+qrc_*.cpp
+Thumbs.db
+*.res
+*.rc
+/.qmake.cache
+/.qmake.stash
+
+# qtcreator generated files
+*.pro.user*
+
+# xemacs temporary files
+*.flc
+
+# Vim temporary files
+.*.swp
+
+# Visual Studio generated files
+*.ib_pdb_index
+*.idb
+*.ilk
+*.pdb
+*.sln
+*.suo
+*.vcproj
+*vcproj.*.*.user
+*.ncb
+*.sdf
+*.opensdf
+*.vcxproj
+*vcxproj.*
+
+# MinGW generated files
+*.Debug
+*.Release
+
+# Python byte code
+*.pyc
+
+# Binaries
+# --------
+*.dll
+*.exe
+
diff --git a/paddle/capi/examples/model_inference/multi_thread/CMakeLists.txt b/paddle/capi/examples/model_inference/multi_thread/CMakeLists.txt
new file mode 100644
index 0000000000..98e411ddc0
--- /dev/null
+++ b/paddle/capi/examples/model_inference/multi_thread/CMakeLists.txt
@@ -0,0 +1,12 @@
+# Builds the `multi_thread` example, which runs inference from several
+# pthreads. cmake_minimum_required() comes first so that policies are set
+# before project() runs.
+cmake_minimum_required(VERSION 2.8)
+project(multi_thread)
+# main.c uses pthreads, so a missing thread library is a configure-time error.
+find_package(Threads REQUIRED)
+add_executable(${PROJECT_NAME} main.c)
+# main.c relies on C99; the C_STANDARD property takes effect with CMake >= 3.1.
+set_property(TARGET ${PROJECT_NAME} PROPERTY C_STANDARD 99)
+target_link_libraries(${PROJECT_NAME} paddle_capi_shared
+  ${CMAKE_THREAD_LIBS_INIT})
diff --git a/paddle/capi/examples/model_inference/multi_thread/convert_protobin.sh b/paddle/capi/examples/model_inference/multi_thread/convert_protobin.sh
new file mode 120000
index 0000000000..3c1b353352
--- /dev/null
+++ b/paddle/capi/examples/model_inference/multi_thread/convert_protobin.sh
@@ -0,0 +1 @@
+../dense/convert_protobin.sh
\ No newline at end of file
diff --git a/paddle/capi/examples/model_inference/multi_thread/main.c b/paddle/capi/examples/model_inference/multi_thread/main.c
new file mode 100644
index 0000000000..d7675cd80a
--- /dev/null
+++ b/paddle/capi/examples/model_inference/multi_thread/main.c
@@ -0,0 +1,98 @@
+#include <paddle/capi.h>
+#include <pthread.h>
+#include <time.h>
+#include "../common/common.h"
+
+#define CONFIG_BIN "./trainer_config.bin"
+#define NUM_THREAD 4
+#define NUM_ITER 1000
+
+pthread_mutex_t mutex;
+
+void* thread_main(void* gm_ptr) {  // gm_ptr: this thread's gradient machine
+  paddle_gradient_machine machine = (paddle_gradient_machine)(gm_ptr);
+  paddle_arguments in_args = paddle_arguments_create_none();
+  // Create input matrix.
+  paddle_matrix mat = paddle_matrix_create(/* sample_num */ 1,
+                                           /* size */ 784,
+                                           /* useGPU */ false);
+  paddle_arguments out_args = paddle_arguments_create_none();
+  paddle_matrix prob = paddle_matrix_create_none();
+  for (int iter = 0; iter < NUM_ITER; ++iter) {
+    // There is only one input of this network.
+    CHECK(paddle_arguments_resize(in_args, 1));
+
+    paddle_real* array;
+
+    // Get the first row and fill it with random values in [0, 1].
+    CHECK(paddle_matrix_get_row(mat, 0, &array));
+    // NOTE(review): rand() is not guaranteed to be thread-safe (POSIX).
+    for (int i = 0; i < 784; ++i) {
+      array[i] = rand() / ((float)RAND_MAX);
+    }
+
+    CHECK(paddle_arguments_set_value(in_args, 0, mat));
+
+    CHECK(paddle_gradient_machine_forward(machine,
+                                          in_args,
+                                          out_args,
+                                          /* isTrain */ false));
+
+    CHECK(paddle_arguments_get_value(out_args, 0, prob));
+
+    CHECK(paddle_matrix_get_row(prob, 0, &array));
+
+    pthread_mutex_lock(&mutex);  // keep each thread's output line intact
+    printf("Prob: ");
+    for (int i = 0; i < 10; ++i) {
+      printf("%.2f ", array[i]);
+    }
+    printf("\n");
+    pthread_mutex_unlock(&mutex);
+  }
+
+  CHECK(paddle_matrix_destroy(prob));
+  CHECK(paddle_arguments_destroy(out_args));
+  CHECK(paddle_matrix_destroy(mat));
+  CHECK(paddle_arguments_destroy(in_args));
+  CHECK(paddle_gradient_machine_destroy(machine));  // the thread owns gm_ptr
+  return NULL;
+}
+
+int main() {
+  // Initialize Paddle.
+  char* argv[] = {"--use_gpu=False"};
+  CHECK(paddle_init(1, (char**)argv));
+
+  // Read the binary config generated by `convert_protobin.sh`.
+  long size;
+  void* buf = read_config(CONFIG_BIN, &size);
+
+  // Create the gradient machine whose parameter values the threads share.
+  paddle_gradient_machine machine;
+  CHECK(paddle_gradient_machine_create_for_inference(&machine, buf, (int)size));
+  CHECK(paddle_gradient_machine_randomize_param(machine));
+
+  // Loading parameter. Uncomment the following line and change the directory.
+  // CHECK(paddle_gradient_machine_load_parameter_from_disk(machine,
+  //                                                "./some_where_to_params"));
+  srand(time(0));
+  pthread_mutex_init(&mutex, NULL);
+  pthread_t threads[NUM_THREAD];
+  for (int i = 0; i < NUM_THREAD; ++i) {
+    paddle_gradient_machine thread_local_machine;
+    CHECK(paddle_gradient_machine_create_shared_param(
+        machine, buf, (int)size, &thread_local_machine));
+    if (pthread_create(&threads[i], NULL, thread_main, thread_local_machine)) {
+      fprintf(stderr, "Failed to create thread %d\n", i);
+      exit(1);
+    }
+  }
+  for (int i = 0; i < NUM_THREAD; ++i) pthread_join(threads[i], NULL);
+
+  // Every thread destroyed its own machine; release what main() still owns.
+  pthread_mutex_destroy(&mutex);
+  free(buf);
+  CHECK(paddle_gradient_machine_destroy(machine));
+  return 0;
+}
diff --git a/paddle/capi/examples/model_inference/multi_thread/trainer_config.py b/paddle/capi/examples/model_inference/multi_thread/trainer_config.py
new file mode 120000
index 0000000000..70cfb1f7f4
--- /dev/null
+++ b/paddle/capi/examples/model_inference/multi_thread/trainer_config.py
@@ -0,0 +1 @@
+../dense/trainer_config.py
\ No newline at end of file
diff --git a/paddle/capi/examples/model_inference/sequence/.gitignore b/paddle/capi/examples/model_inference/sequence/.gitignore
new file mode 100644
index 0000000000..fab7372d79
--- /dev/null
+++ b/paddle/capi/examples/model_inference/sequence/.gitignore
@@ -0,0 +1,73 @@
+# This file is used to ignore files which are generated
+# ----------------------------------------------------------------------------
+
+*~
+*.autosave
+*.a
+*.core
+*.moc
+*.o
+*.obj
+*.orig
+*.rej
+*.so
+*.so.*
+*_pch.h.cpp
+*_resource.rc
+*.qm
+.#*
+*.*#
+core
+!core/
+tags
+.DS_Store
+.directory
+*.debug
+Makefile*
+*.prl
+*.app
+moc_*.cpp
+ui_*.h
+qrc_*.cpp
+Thumbs.db
+*.res
+*.rc
+/.qmake.cache
+/.qmake.stash
+
+# qtcreator generated files
+*.pro.user*
+
+# xemacs temporary files
+*.flc
+
+# Vim temporary files
+.*.swp
+
+# Visual Studio generated files
+*.ib_pdb_index
+*.idb
+*.ilk
+*.pdb
+*.sln
+*.suo
+*.vcproj
+*vcproj.*.*.user
+*.ncb
+*.sdf
+*.opensdf
+*.vcxproj
+*vcxproj.*
+
+# MinGW generated files
+*.Debug
+*.Release
+
+# Python byte code
+*.pyc
+
+# Binaries
+# --------
+*.dll
+*.exe
+
diff --git a/paddle/capi/examples/model_inference/sequence/CMakeLists.txt b/paddle/capi/examples/model_inference/sequence/CMakeLists.txt
new file mode 100644
index 0000000000..71b73acba7
--- /dev/null
+++ b/paddle/capi/examples/model_inference/sequence/CMakeLists.txt
@@ -0,0 +1,9 @@
+# Builds the `sequence` example; the Paddle C-API shared library must be on
+# the linker search path. cmake_minimum_required() comes first so that
+# policies are set before project() runs.
+cmake_minimum_required(VERSION 2.8)
+project(sequence)
+add_executable(${PROJECT_NAME} main.c)
+# main.c relies on C99; the C_STANDARD property takes effect with CMake >= 3.1.
+set_property(TARGET ${PROJECT_NAME} PROPERTY C_STANDARD 99)
+target_link_libraries(${PROJECT_NAME} paddle_capi_shared)
diff --git a/paddle/capi/examples/model_inference/sequence/convert_protobin.sh b/paddle/capi/examples/model_inference/sequence/convert_protobin.sh
new file mode 120000
index 0000000000..3c1b353352
--- /dev/null
+++ b/paddle/capi/examples/model_inference/sequence/convert_protobin.sh
@@ -0,0 +1 @@
+../dense/convert_protobin.sh
\ No newline at end of file
diff --git a/paddle/capi/examples/model_inference/sequence/main.c b/paddle/capi/examples/model_inference/sequence/main.c
new file mode 100644
index 0000000000..50bc0c9201
--- /dev/null
+++ b/paddle/capi/examples/model_inference/sequence/main.c
@@ -0,0 +1,70 @@
+#include <paddle/capi.h>
+#include <time.h>
+#include "../common/common.h"
+
+#define CONFIG_BIN "./trainer_config.bin"
+
+int main() {
+  // Initialize Paddle.
+  char* argv[] = {"--use_gpu=False"};
+  CHECK(paddle_init(1, (char**)argv));
+
+  // Read the binary config generated by `convert_protobin.sh`.
+  long size;
+  void* buf = read_config(CONFIG_BIN, &size);
+
+  // Create a gradient machine for inference.
+  paddle_gradient_machine machine;
+  CHECK(paddle_gradient_machine_create_for_inference(&machine, buf, (int)size));
+  free(buf);  // only needed while the machine was being created
+  CHECK(paddle_gradient_machine_randomize_param(machine));
+
+  // Loading parameter. Uncomment the following line and change the directory.
+  // CHECK(paddle_gradient_machine_load_parameter_from_disk(machine,
+  //                                                "./some_where_to_params"));
+  paddle_arguments in_args = paddle_arguments_create_none();
+
+  // There is only one input of this network.
+  CHECK(paddle_arguments_resize(in_args, 1));
+
+  // Create input ids.
+  int sentence_ids[] = {83, 48, 20, 84, 394, 853, 64, 53, 64};
+
+  paddle_ivector sentence = paddle_ivector_create(
+      sentence_ids, sizeof(sentence_ids) / sizeof(int), false, false);
+  CHECK(paddle_arguments_set_ids(in_args, 0, sentence));
+
+  // Sequence start positions: 0 and the number of ids.
+  int seq_pos_array[] = {0, sizeof(sentence_ids) / sizeof(int)};
+  paddle_ivector seq_pos = paddle_ivector_create(
+      seq_pos_array, sizeof(seq_pos_array) / sizeof(int), false, false);
+  CHECK(paddle_arguments_set_sequence_start_pos(in_args, 0, 0, seq_pos));
+
+  paddle_arguments out_args = paddle_arguments_create_none();
+  CHECK(paddle_gradient_machine_forward(machine,
+                                        in_args,
+                                        out_args,
+                                        /* isTrain */ false));
+  paddle_matrix prob = paddle_matrix_create_none();
+
+  CHECK(paddle_arguments_get_value(out_args, 0, prob));
+
+  paddle_real* array;
+
+  CHECK(paddle_matrix_get_row(prob, 0, &array));
+
+  printf("Prob: ");
+  for (int i = 0; i < 2; ++i) {
+    printf("%.2f ", array[i]);
+  }
+  printf("\n");
+
+  CHECK(paddle_matrix_destroy(prob));
+  CHECK(paddle_arguments_destroy(out_args));
+  CHECK(paddle_ivector_destroy(seq_pos));
+  CHECK(paddle_ivector_destroy(sentence));
+  CHECK(paddle_arguments_destroy(in_args));
+  CHECK(paddle_gradient_machine_destroy(machine));
+
+  return 0;
+}
diff --git a/paddle/capi/examples/model_inference/sequence/trainer_config.py b/paddle/capi/examples/model_inference/sequence/trainer_config.py
new file mode 100644
index 0000000000..6bbc7a909a
--- /dev/null
+++ b/paddle/capi/examples/model_inference/sequence/trainer_config.py
@@ -0,0 +1,13 @@
+from paddle.trainer_config_helpers import *
+# Size given to the 'sentence' data layer below.
+WORD_DIM = 3000
+# Layers: data -> embedding (64) -> simple_lstm (64) -> last_seq -> softmax fc (2).
+sentence = data_layer(name='sentence', size=WORD_DIM)
+sentence_embedding = embedding_layer(
+    input=sentence,
+    size=64,
+    param_attr=ParameterAttribute(
+        initial_max=1.0, initial_min=0.5))
+lstm = simple_lstm(input=sentence_embedding, size=64)
+lstm_last = last_seq(input=lstm)
+outputs(fc_layer(input=lstm_last, size=2, act=SoftmaxActivation()))
diff --git a/paddle/capi/examples/model_inference/sparse_binary/.gitignore b/paddle/capi/examples/model_inference/sparse_binary/.gitignore
new file mode 100644
index 0000000000..fab7372d79
--- /dev/null
+++ b/paddle/capi/examples/model_inference/sparse_binary/.gitignore
@@ -0,0 +1,73 @@
+# This file is used to ignore files which are generated
+# ----------------------------------------------------------------------------
+
+*~
+*.autosave
+*.a
+*.core
+*.moc
+*.o
+*.obj
+*.orig
+*.rej
+*.so
+*.so.*
+*_pch.h.cpp
+*_resource.rc
+*.qm
+.#*
+*.*#
+core
+!core/
+tags
+.DS_Store
+.directory
+*.debug
+Makefile*
+*.prl
+*.app
+moc_*.cpp
+ui_*.h
+qrc_*.cpp
+Thumbs.db
+*.res
+*.rc
+/.qmake.cache
+/.qmake.stash
+
+# qtcreator generated files
+*.pro.user*
+
+# xemacs temporary files
+*.flc
+
+# Vim temporary files
+.*.swp
+
+# Visual Studio generated files
+*.ib_pdb_index
+*.idb
+*.ilk
+*.pdb
+*.sln
+*.suo
+*.vcproj
+*vcproj.*.*.user
+*.ncb
+*.sdf
+*.opensdf
+*.vcxproj
+*vcxproj.*
+
+# MinGW generated files
+*.Debug
+*.Release
+
+# Python byte code
+*.pyc
+
+# Binaries
+# --------
+*.dll
+*.exe
+
diff --git a/paddle/capi/examples/model_inference/sparse_binary/CMakeLists.txt b/paddle/capi/examples/model_inference/sparse_binary/CMakeLists.txt
new file mode 100644
index 0000000000..c821956889
--- /dev/null
+++ b/paddle/capi/examples/model_inference/sparse_binary/CMakeLists.txt
@@ -0,0 +1,9 @@
+# Builds the `sparse_binary` example; the Paddle C-API shared library must be
+# on the linker search path. cmake_minimum_required() comes first so that
+# policies are set before project() runs.
+cmake_minimum_required(VERSION 2.8)
+project(sparse_binary)
+add_executable(${PROJECT_NAME} main.c)
+# main.c relies on C99; the C_STANDARD property takes effect with CMake >= 3.1.
+set_property(TARGET ${PROJECT_NAME} PROPERTY C_STANDARD 99)
+target_link_libraries(${PROJECT_NAME} paddle_capi_shared)
diff --git a/paddle/capi/examples/model_inference/sparse_binary/convert_protobin.sh b/paddle/capi/examples/model_inference/sparse_binary/convert_protobin.sh
new file mode 120000
index 0000000000..3c1b353352
--- /dev/null
+++ b/paddle/capi/examples/model_inference/sparse_binary/convert_protobin.sh
@@ -0,0 +1 @@
+../dense/convert_protobin.sh
\ No newline at end of file
diff --git a/paddle/capi/examples/model_inference/sparse_binary/main.c b/paddle/capi/examples/model_inference/sparse_binary/main.c
new file mode 100644
index 0000000000..8ba67aee56
--- /dev/null
+++ b/paddle/capi/examples/model_inference/sparse_binary/main.c
@@ -0,0 +1,70 @@
+#include <paddle/capi.h>
+#include <time.h>
+#include "../common/common.h"
+
+#define CONFIG_BIN "./trainer_config.bin"
+
+int main() {
+  // Initialize Paddle.
+  char* argv[] = {"--use_gpu=False"};
+  CHECK(paddle_init(1, (char**)argv));
+
+  // Read the binary config generated by `convert_protobin.sh`.
+  long size;
+  void* buf = read_config(CONFIG_BIN, &size);
+
+  // Create a gradient machine for inference.
+  paddle_gradient_machine machine;
+  CHECK(paddle_gradient_machine_create_for_inference(&machine, buf, (int)size));
+  free(buf);  // only needed while the machine was being created
+  CHECK(paddle_gradient_machine_randomize_param(machine));
+
+  // Loading parameter. Uncomment the following line and change the directory.
+  // CHECK(paddle_gradient_machine_load_parameter_from_disk(machine,
+  //                                                "./some_where_to_params"));
+  paddle_arguments in_args = paddle_arguments_create_none();
+
+  // There is only one input of this network.
+  CHECK(paddle_arguments_resize(in_args, 1));
+
+  // Create input matrix; column indices are copied in below, without values.
+  paddle_matrix mat = paddle_matrix_create_sparse(1, 784, 3, true, false);
+  paddle_real* array;
+  int colBuf[] = {9, 93, 109};
+  int rowBuf[] = {0, sizeof(colBuf) / sizeof(int)};
+
+  CHECK(paddle_matrix_sparse_copy_from(mat,
+                                       rowBuf,
+                                       sizeof(rowBuf) / sizeof(int),
+                                       colBuf,
+                                       sizeof(colBuf) / sizeof(int),
+                                       NULL,
+                                       0));
+
+  CHECK(paddle_arguments_set_value(in_args, 0, mat));
+
+  paddle_arguments out_args = paddle_arguments_create_none();
+  CHECK(paddle_gradient_machine_forward(machine,
+                                        in_args,
+                                        out_args,
+                                        /* isTrain */ false));
+  paddle_matrix prob = paddle_matrix_create_none();
+
+  CHECK(paddle_arguments_get_value(out_args, 0, prob));
+
+  CHECK(paddle_matrix_get_row(prob, 0, &array));
+
+  printf("Prob: ");
+  for (int i = 0; i < 10; ++i) {
+    printf("%.2f ", array[i]);
+  }
+  printf("\n");
+
+  CHECK(paddle_matrix_destroy(prob));
+  CHECK(paddle_arguments_destroy(out_args));
+  CHECK(paddle_matrix_destroy(mat));
+  CHECK(paddle_arguments_destroy(in_args));
+  CHECK(paddle_gradient_machine_destroy(machine));
+
+  return 0;
+}
diff --git a/paddle/capi/examples/model_inference/sparse_binary/trainer_config.py b/paddle/capi/examples/model_inference/sparse_binary/trainer_config.py
new file mode 120000
index 0000000000..70cfb1f7f4
--- /dev/null
+++ b/paddle/capi/examples/model_inference/sparse_binary/trainer_config.py
@@ -0,0 +1 @@
+../dense/trainer_config.py
\ No newline at end of file
diff --git a/paddle/capi/gradient_machine.cpp b/paddle/capi/gradient_machine.cpp
new file mode 100644
index 0000000000..00f76e0152
--- /dev/null
+++ b/paddle/capi/gradient_machine.cpp
@@ -0,0 +1,123 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "gradient_machine.h"
+#include "capi_private.h"
+#include "paddle/gserver/gradientmachines/NeuralNetwork.h"
+
+#define cast(v) paddle::capi::cast<paddle::capi::CGradientMachine>(v)
+
+enum GradientMatchineCreateMode {
+  CREATE_MODE_NORMAL = 0,
+  CREATE_MODE_TESTING = 4  // mode handed to GradientMachine::create() below
+};
+
+namespace paddle {
+// NeuralNetwork subclass that only forwards its constructor arguments.
+class MyNeuralNetwork : public NeuralNetwork {
+public:
+  MyNeuralNetwork(const std::string& name, NeuralNetwork* network)
+      : NeuralNetwork(name, network) {}
+};
+// NOTE(review): unused in this file; presumably linked by name elsewhere.
+NeuralNetwork* newCustomNerualNetwork(const std::string& name,
+                                      NeuralNetwork* network) {
+  return new MyNeuralNetwork(name, network);
+}
+}  // namespace paddle
+
+extern "C" {
+paddle_error paddle_gradient_machine_create_for_inference(
+    paddle_gradient_machine* machine, void* modelConfigProtobuf, int size) {
+  if (machine == nullptr || modelConfigProtobuf == nullptr) return kPD_NULLPTR;
+  paddle::ModelConfig config;
+  if (!config.ParseFromArray(modelConfigProtobuf, size) ||
+      !config.IsInitialized()) {
+    return kPD_PROTOBUF_ERROR;
+  }
+  // Build the machine in testing mode, requesting only the parameter values.
+  auto ptr = new paddle::capi::CGradientMachine();
+  ptr->machine.reset(paddle::GradientMachine::create(
+      config, CREATE_MODE_TESTING, {paddle::PARAMETER_VALUE}));
+  *machine = ptr;
+  return kPD_NO_ERROR;
+}
+
+paddle_error paddle_gradient_machine_destroy(paddle_gradient_machine machine) {
+  delete cast(machine);  // a null handle is accepted: deleting null is a no-op
+  return kPD_NO_ERROR;
+}
+
+paddle_error paddle_gradient_machine_load_parameter_from_disk(
+    paddle_gradient_machine machine, const char* path) {
+  auto wrapper = cast(machine);
+  const bool usable = wrapper != nullptr && wrapper->machine != nullptr;
+  if (!usable || path == nullptr) return kPD_NULLPTR;  // nothing to load
+  wrapper->machine->loadParameters(path);
+  return kPD_NO_ERROR;
+}
+
+paddle_error paddle_gradient_machine_forward(paddle_gradient_machine machine,
+                                             paddle_arguments inArgs,
+                                             paddle_arguments outArgs,
+                                             bool isTrain) {
+  auto gm = cast(machine);
+  auto input = paddle::capi::cast<paddle::capi::CArguments>(inArgs);
+  auto output = paddle::capi::cast<paddle::capi::CArguments>(outArgs);
+  if (!gm || !input || !output || !gm->machine) return kPD_NULLPTR;
+  // Run as a training pass or as a test pass, depending on `isTrain`.
+  const auto pass = isTrain ? paddle::PASS_TRAIN : paddle::PASS_TEST;
+  gm->machine->forward(input->args, &output->args, pass);
+  return kPD_NO_ERROR;
+}
+
+paddle_error paddle_gradient_machine_create_shared_param(
+    paddle_gradient_machine origin,
+    void* modelConfigProtobuf,
+    int size,
+    paddle_gradient_machine* slave) {
+  auto o = cast(origin);
+  if (origin == nullptr || slave == nullptr || modelConfigProtobuf == nullptr ||
+      o->machine == nullptr)
+    return kPD_NULLPTR;
+  paddle::ModelConfig config;
+  if (!config.ParseFromArray(modelConfigProtobuf, size) ||
+      !config.IsInitialized()) {
+    return kPD_PROTOBUF_ERROR;
+  }
+  // Give each parameter the value buffer of origin's parameter with that id.
+  std::unique_ptr<paddle::capi::CGradientMachine> ptr(
+      new paddle::capi::CGradientMachine());
+  auto nn = paddle::NeuralNetwork::create(config);
+  nn->init(config,
+           [&o](int paramId, paddle::Parameter* param) {
+             auto p = o->machine->getParameters()[paramId];
+             param->enableSharedType(paddle::PARAMETER_VALUE,
+                                     p->getBuf(paddle::PARAMETER_VALUE));
+           },
+           {paddle::PARAMETER_VALUE},
+           false);
+  ptr->machine.reset(nn);
+  *slave = ptr.release();
+  return kPD_NO_ERROR;
+}
+}
+// Defined outside the extern "C" block, so C linkage is requested explicitly.
+extern "C" paddle_error paddle_gradient_machine_randomize_param(
+    paddle_gradient_machine machine) {
+  auto m = cast(machine);
+  if (m == nullptr || m->machine == nullptr) return kPD_NULLPTR;
+  m->machine->randParameters();
+  return kPD_NO_ERROR;
+}
diff --git a/paddle/capi/gradient_machine.h b/paddle/capi/gradient_machine.h
new file mode 100644
index 0000000000..d7e2dd9bf8
--- /dev/null
+++ b/paddle/capi/gradient_machine.h
@@ -0,0 +1,91 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifndef __PADDLE_CAPI_GRADIENT_MACHINE_H__
+#define __PADDLE_CAPI_GRADIENT_MACHINE_H__
+#include "arguments.h"
+#include "config.h"
+#include "error.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+/**
+ * @brief GradientMachine means a neural network.
+ */
+typedef void* paddle_gradient_machine;
+
+/**
+ * @brief Create a gradient machine used for model inference.
+ * @param [out] machine the gradient machine created for model inference.
+ * @param [in] modelConfigProtobuf
+ * @param [in] size
+ * @return paddle_error
+ */
+PD_API paddle_error paddle_gradient_machine_create_for_inference(
+    paddle_gradient_machine* machine, void* modelConfigProtobuf, int size);
+
+/**
+ * @brief Load parameter from disk.
+ * @param machine Gradient Machine.
+ * @param path local directory path.
+ * @return paddle_error
+ */
+PD_API paddle_error paddle_gradient_machine_load_parameter_from_disk(
+    paddle_gradient_machine machine, const char* path);
+
+/**
+ * @brief Forward a gradient machine
+ * @param machine Gradient machine
+ * @param inArgs input arguments
+ * @param outArgs output arguments
+ * @param isTrain is train or not
+ * @return paddle_error
+ */
+PD_API paddle_error
+paddle_gradient_machine_forward(paddle_gradient_machine machine,
+                                paddle_arguments inArgs,
+                                paddle_arguments outArgs,
+                                bool isTrain);
+
+/**
+ * @brief Create a gradient machine whose parameters are shared with another
+ *        gradient machine.
+ * @param [in] origin gradient machine
+ * @param [in] modelConfigProtobuf model config protobuf
+ * @param [in] size of model config buffer.
+ * @param [out] slave gradient machine, the output value.
+ * @return paddle_error
+ */
+PD_API paddle_error
+paddle_gradient_machine_create_shared_param(paddle_gradient_machine origin,
+                                            void* modelConfigProtobuf,
+                                            int size,
+                                            paddle_gradient_machine* slave);
+
+PD_API paddle_error
+paddle_gradient_machine_randomize_param(paddle_gradient_machine machine);
+
+/**
+ * @brief Destroy a gradient machine
+ * @param machine the gradient machine to destroy
+ * @return paddle_error
+ */
+PD_API paddle_error
+paddle_gradient_machine_destroy(paddle_gradient_machine machine);
+
+#ifdef __cplusplus
+}
+#endif
+#endif
diff --git a/paddle/capi/main.h b/paddle/capi/main.h
new file mode 100644
index 0000000000..893ebcbd58
--- /dev/null
+++ b/paddle/capi/main.h
@@ -0,0 +1,33 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifndef __PADDLE_CAPI_MAIN_H__
+#define __PADDLE_CAPI_MAIN_H__
+#include "config.h"
+#include "error.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * Initialize Paddle.
+ */
+PD_API paddle_error paddle_init(int argc, char** argv);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/paddle/capi/matrix.h b/paddle/capi/matrix.h
new file mode 100644
index 0000000000..f15f7f3bbb
--- /dev/null
+++ b/paddle/capi/matrix.h
@@ -0,0 +1,125 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifndef __PADDLE_CAPI_MATRIX_H__
+#define __PADDLE_CAPI_MATRIX_H__
+
+#include <stdbool.h>
+#include <stdint.h>
+#include "config.h"
+#include "error.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * Matrix functions. Return will be a paddle_error type.
+ */
+typedef void* paddle_matrix;
+
+/**
+ * @brief paddle_matrix_create Create a dense matrix
+ * @param height matrix height.
+ * @param width matrix width
+ * @param useGpu use GPU or not
+ * @return Matrix handler
+ */
+PD_API paddle_matrix paddle_matrix_create(uint64_t height,
+                                          uint64_t width,
+                                          bool useGpu);
+
+/**
+ * @brief paddle_matrix_create_sparse Create a sparse matrix.
+ * @param height the matrix height.
+ * @param width the matrix width.
+ * @param nnz the number of non-zero elements.
+ * @param isBinary is binary (either 1 or 0 in matrix) or not.
+ * @param useGpu is using GPU or not.
+ * @return paddle_matrix.
+ */
+PD_API paddle_matrix paddle_matrix_create_sparse(
+    uint64_t height, uint64_t width, uint64_t nnz, bool isBinary, bool useGpu);
+
+/**
+ * @brief paddle_matrix_destroy Destroy a matrix.
+ * @param mat
+ * @return paddle_error
+ */
+PD_API paddle_error paddle_matrix_destroy(paddle_matrix mat);
+
+/**
+ * @brief paddle_matrix_set_row Set a row to matrix.
+ * @param mat Target Matrix
+ * @param rowID Index of row
+ * @param rowArray Row data.
+ * @return paddle_error
+ */
+PD_API paddle_error paddle_matrix_set_row(paddle_matrix mat,
+                                          uint64_t rowID,
+                                          paddle_real* rowArray);
+
+/**
+ * @brief paddle_matrix_get_row Get raw row buffer from matrix
+ * @param [in] mat Target matrix
+ * @param [in] rowID Index of row.
+ * @param [out] rawRowBuffer Row Buffer
+ * @return paddle_error
+ */
+PD_API paddle_error paddle_matrix_get_row(paddle_matrix mat,
+                                          uint64_t rowID,
+                                          paddle_real** rawRowBuffer);
+
+/**
+ * @brief paddle_matrix_create_none Create None Matrix
+ * @return paddle_matrix
+ */
+PD_API paddle_matrix paddle_matrix_create_none();
+
+/**
+ * @brief paddle_matrix_get_shape get the shape of matrix
+ * @param mat target matrix
+ * @param height The height of matrix
+ * @param width The width of matrix
+ * @return paddle_error
+ */
+PD_API paddle_error paddle_matrix_get_shape(paddle_matrix mat,
+                                            uint64_t* height,
+                                            uint64_t* width);
+
+/**
+ * @brief paddle_matrix_sparse_copy_from Copy from a CSR format matrix
+ * @param [out] mat output matrix
+ * @param [in] rowArray row array. The array slices in column array.
+ * @param [in] rowSize length of row array.
+ * @param [in] colArray the column array. It means the non-zero element indices
+ * in each row.
+ * @param [in] colSize length of column array.
+ * @param [in] valueArray the value array. It means the non-zero element values.
+ * NULL if the matrix is binary.
+ * @param [in] valueSize length of value array. Zero if the matrix is binary.
+ * @return paddle_error
+ */
+PD_API paddle_error paddle_matrix_sparse_copy_from(paddle_matrix mat,
+                                                   int* rowArray,
+                                                   uint64_t rowSize,
+                                                   int* colArray,
+                                                   uint64_t colSize,
+                                                   float* valueArray,
+                                                   uint64_t valueSize);
+
+#ifdef __cplusplus
+}
+#endif
+#endif
diff --git a/paddle/capi/tests/.gitignore b/paddle/capi/tests/.gitignore
new file mode 100644
index 0000000000..7ab6be95e3
--- /dev/null
+++ b/paddle/capi/tests/.gitignore
@@ -0,0 +1,2 @@
+w
+b
diff --git a/paddle/capi/tests/CMakeLists.txt b/paddle/capi/tests/CMakeLists.txt
new file mode 100644
index 0000000000..d73f6b7733
--- /dev/null
+++ b/paddle/capi/tests/CMakeLists.txt
@@ -0,0 +1,14 @@
+add_unittest(capi_test_mats test_Vector.cpp
+  test_Matrix.cpp test_Arguments.cpp)
+
+target_include_directories(capi_test_mats PUBLIC ${PADDLE_CAPI_INC_PATH})
+target_link_libraries(capi_test_mats paddle_capi)
+
+# Registered by hand: the test reads ./test_predict_network.py from this dir.
+add_unittest_without_exec(capi_test_gradientMachine test_GradientMachine.cpp)
+target_include_directories(capi_test_gradientMachine PUBLIC
+  ${PADDLE_CAPI_INC_PATH})
+target_link_libraries(capi_test_gradientMachine paddle_capi)
+add_test(NAME capi_test_gradientMachine
+  COMMAND ${PROJ_ROOT}/paddle/.set_python_path.sh -d ${PROJ_ROOT}/python $<TARGET_FILE:capi_test_gradientMachine>
+  WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
diff --git a/paddle/capi/tests/test_Arguments.cpp b/paddle/capi/tests/test_Arguments.cpp
new file mode 100644
index 0000000000..4792ceb49a
--- /dev/null
+++ b/paddle/capi/tests/test_Arguments.cpp
@@ -0,0 +1,129 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <functional>
+#include "capi.h"
+#include "gtest/gtest.h"
+#include "paddle/utils/ThreadLocal.h"
+
+static std::vector<paddle_real> randomBuffer(size_t bufSize) {
+  auto& eng = paddle::ThreadLocalRandomEngine::get();
+  std::uniform_real_distribution<paddle_real> dist(-1.0, 1.0);
+  std::vector<paddle_real> retv;
+  retv.reserve(bufSize);
+  for (size_t i = 0; i < bufSize; ++i) {
+    retv.push_back(dist(eng));
+  }
+  return retv;
+}
+
+TEST(CAPIArguments, create) {
+  //! TODO(yuyang18): Test GPU Code.
+  paddle_arguments args = paddle_arguments_create_none();
+  uint64_t size;
+  ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_get_size(args, &size));
+  ASSERT_EQ(0UL, size);
+  ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_destroy(args));
+}
+
+TEST(CAPIArguments, value) {
+  paddle_arguments args = paddle_arguments_create_none();
+  ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_resize(args, 1));
+
+  paddle_matrix mat = paddle_matrix_create(128, 64, false);
+  for (size_t i = 0; i < 128; ++i) {
+    std::vector<paddle_real> sampleBuf = randomBuffer(64);
+    ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_set_row(mat, i, sampleBuf.data()));
+  }
+  ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_set_value(args, 0, mat));
+
+  paddle_matrix val = paddle_matrix_create_none();
+
+  ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_get_value(args, 0, val));
+
+  for (size_t i = 0; i < 128; ++i) {
+    paddle_real* row1;
+    paddle_real* row2;
+
+    ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_get_row(mat, i, &row1));
+    ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_get_row(val, i, &row2));
+    ASSERT_EQ(row1, row2);  // the same row buffer is expected, not a copy
+  }
+
+  paddle_ivector ivec = paddle_ivector_create_none();
+  ASSERT_EQ(kPD_NO_ERROR, paddle_ivector_destroy(ivec));
+  ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_destroy(val));
+  ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_destroy(mat));
+  ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_destroy(args));
+}
+
+TEST(CAPIArguments, ids) {
+  paddle_arguments args = paddle_arguments_create_none();
+  ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_resize(args, 1));
+  int array[3] = {1, 2, 3};
+  paddle_ivector ivec = paddle_ivector_create(array, 3, true, false);
+  ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_set_ids(args, 0, ivec));
+  paddle_ivector val = paddle_ivector_create_none();
+  ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_get_ids(args, 0, val));
+  uint64_t size;  // the ids read back must be the three that were stored
+  ASSERT_EQ(kPD_NO_ERROR, paddle_ivector_get_size(val, &size));
+  ASSERT_EQ(3UL, size);
+  ASSERT_EQ(kPD_NO_ERROR, paddle_ivector_destroy(ivec));
+  ASSERT_EQ(kPD_NO_ERROR, paddle_ivector_destroy(val));
+  ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_destroy(args));
+}
+
+template <typename T1, typename T2>
+void testSequenceHelper(T1 setter, T2 getter) {
+  paddle_arguments args = paddle_arguments_create_none();
+  ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_resize(args, 1));
+
+  int array[3] = {1, 2, 3};
+  paddle_ivector ivec = paddle_ivector_create(array, 3, true, false);
+  ASSERT_EQ(kPD_NO_ERROR, setter(args, 0, ivec));
+
+  paddle_ivector val = paddle_ivector_create_none();
+  ASSERT_EQ(kPD_NO_ERROR, getter(args, 0, val));
+  uint64_t size;
+  ASSERT_EQ(kPD_NO_ERROR, paddle_ivector_get_size(val, &size));
+  ASSERT_EQ(3UL, size);  // keeps the loop below inside array's bounds
+
+  int* rawBuf;
+  ASSERT_EQ(kPD_NO_ERROR, paddle_ivector_get(val, &rawBuf));
+  for (size_t i = 0; i < size; ++i) {
+    ASSERT_EQ(array[i], rawBuf[i]);
+  }
+
+  ASSERT_EQ(kPD_NO_ERROR, paddle_ivector_destroy(ivec));
+  ASSERT_EQ(kPD_NO_ERROR, paddle_ivector_destroy(val));
+  ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_destroy(args));
+}
+
+TEST(CAPIArguments, Sequence) {
+  auto testSequence = [](uint32_t nestedLevel) {
+    testSequenceHelper(std::bind(paddle_arguments_set_sequence_start_pos,
+                                 std::placeholders::_1,
+                                 std::placeholders::_2,
+                                 nestedLevel,
+                                 std::placeholders::_3),
+                       std::bind(paddle_arguments_get_sequence_start_pos,
+                                 std::placeholders::_1,
+                                 std::placeholders::_2,
+                                 nestedLevel,
+                                 std::placeholders::_3));
+  };
+  for (uint32_t i = 0; i < 2; ++i) {  // test seq and sub-seq.
+    testSequence(i);
+  }
+}
diff --git a/paddle/capi/tests/test_GradientMachine.cpp b/paddle/capi/tests/test_GradientMachine.cpp
new file mode 100644
index 0000000000..89aa64608d
--- /dev/null
+++ b/paddle/capi/tests/test_GradientMachine.cpp
@@ -0,0 +1,117 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include <paddle/gserver/gradientmachines/GradientMachine.h>
+#include <paddle/trainer/TrainerConfigHelper.h>
+#include <stdlib.h>
+#include <string.h>
+#include <type_traits>
+#include "capi.h"
+#include "paddle/utils/ThreadLocal.h"
+
+static std::vector<paddle_real> randomBuffer(size_t bufSize) {
+  auto& eng = paddle::ThreadLocalRandomEngine::get();
+  std::uniform_real_distribution<paddle_real> dist(-1.0, 1.0);
+  std::vector<paddle_real> retv;
+  retv.reserve(bufSize);
+  for (size_t i = 0; i < bufSize; ++i) {
+    retv.push_back(dist(eng));
+  }
+  return retv;
+}
+
+TEST(GradientMachine, testPredict) {
+  //! TODO(yuyang18): Test GPU Code.
+  paddle::TrainerConfigHelper config("./test_predict_network.py");
+  std::string buffer;
+  ASSERT_TRUE(config.getModelConfig().SerializeToString(&buffer));
+  paddle_gradient_machine machine;
+
+  ASSERT_EQ(kPD_NO_ERROR,
+            paddle_gradient_machine_create_for_inference(
+                &machine, &buffer[0], (int)buffer.size()));
+  std::unique_ptr<paddle::GradientMachine> gm(
+      paddle::GradientMachine::create(config.getModelConfig()));
+  ASSERT_NE(nullptr, gm);
+  gm->randParameters();
+  gm->saveParameters("./");
+
+  ASSERT_EQ(kPD_NO_ERROR,
+            paddle_gradient_machine_load_parameter_from_disk(machine, "./"));
+
+  paddle_gradient_machine machineSlave;
+  ASSERT_EQ(kPD_NO_ERROR,
+            paddle_gradient_machine_create_shared_param(
+                machine, &buffer[0], (int)buffer.size(), &machineSlave));
+  std::swap(machineSlave, machine);
+  paddle_arguments outArgs = paddle_arguments_create_none();
+
+  paddle_arguments inArgs = paddle_arguments_create_none();
+  ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_resize(inArgs, 1));
+  paddle_matrix mat = paddle_matrix_create(1, 100, false);
+  static_assert(std::is_same<paddle_real, paddle::real>::value, "");
+
+  auto data = randomBuffer(100);
+  paddle_real* rowPtr;
+  ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_get_row(mat, 0, &rowPtr));
+  memcpy(rowPtr, data.data(), data.size() * sizeof(paddle_real));
+
+  ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_set_value(inArgs, 0, mat));
+  ASSERT_EQ(kPD_NO_ERROR,
+            paddle_gradient_machine_forward(machine, inArgs, outArgs, false));
+
+  uint64_t sz;
+  ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_get_size(outArgs, &sz));
+  ASSERT_EQ(1UL, sz);
+
+  ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_get_value(outArgs, 0, mat));
+  std::vector<paddle::Argument> paddleInArgs;
+  std::vector<paddle::Argument> paddleOutArgs;
+  paddleInArgs.resize(1);
+  paddleInArgs[0].value =
+      paddle::Matrix::create(data.data(), 1, 100, false, false);
+
+  gm->forward(paddleInArgs, &paddleOutArgs, paddle::PASS_TEST);
+
+  auto matPaddle = paddleOutArgs[0].value;
+
+  uint64_t height, width;
+  ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_get_shape(mat, &height, &width));
+  ASSERT_EQ(matPaddle->getHeight(), height);
+  ASSERT_EQ(matPaddle->getWidth(), width);
+
+  ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_get_row(mat, 0, &rowPtr));
+  for (size_t i = 0; i < width; ++i) {
+    ASSERT_NEAR(matPaddle->getData()[i], rowPtr[i], 1e-5);
+  }
+
+  ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_destroy(mat));
+  ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_destroy(inArgs));
+  ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_destroy(outArgs));
+  std::swap(machineSlave, machine);
+  ASSERT_EQ(kPD_NO_ERROR, paddle_gradient_machine_destroy(machineSlave));
+  ASSERT_EQ(kPD_NO_ERROR, paddle_gradient_machine_destroy(machine));
+}
+
+int main(int argc, char** argv) {
+  testing::InitGoogleTest(&argc, argv);
+  std::vector<char*> argvs;
+  argvs.push_back(strdup("--use_gpu=false"));
+  paddle_init((int)argvs.size(), argvs.data());
+  for (auto each : argvs) {
+    free(each);
+  }
+  return RUN_ALL_TESTS();
+}
diff --git a/paddle/capi/tests/test_Matrix.cpp b/paddle/capi/tests/test_Matrix.cpp
new file mode 100644
index 0000000000..4bf9a9d6a9
--- /dev/null
+++ b/paddle/capi/tests/test_Matrix.cpp
@@ -0,0 +1,47 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "capi.h"
+#include "gtest/gtest.h"
+
+TEST(CAPIMatrix, create) {
+  //! TODO(yuyang18): Test GPU Code.
+  paddle_matrix mat = paddle_matrix_create(128, 32, false);
+  std::vector<paddle_real> sampleRow;
+  sampleRow.resize(32);
+  for (size_t i = 0; i < sampleRow.size(); ++i) {
+    sampleRow[i] = 1.0 / (i + 1.0);
+  }
+  ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_set_row(mat, 0, sampleRow.data()));
+  ASSERT_EQ(kPD_OUT_OF_RANGE,
+            paddle_matrix_set_row(mat, 128, sampleRow.data()));
+
+  paddle_real* arrayPtr;
+
+  ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_get_row(mat, 0, &arrayPtr));
+  for (size_t i = 0; i < sampleRow.size(); ++i) {
+    ASSERT_NEAR(sampleRow[i], arrayPtr[i], 1e-5);
+  }
+
+  uint64_t height, width;
+  ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_get_shape(mat, &height, &width));
+  ASSERT_EQ(128UL, height);
+  ASSERT_EQ(32UL, width);
+  ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_destroy(mat));
+}
+
+TEST(CAPIMatrix, createNone) {
+  paddle_matrix mat = paddle_matrix_create_none();
+  ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_destroy(mat));
+}
diff --git a/paddle/capi/tests/test_Vector.cpp b/paddle/capi/tests/test_Vector.cpp
new file mode 100644
index 0000000000..365160dc9a
--- /dev/null
+++ b/paddle/capi/tests/test_Vector.cpp
@@ -0,0 +1,32 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "capi.h"
+#include "gtest/gtest.h"
+
+TEST(CAPIVector, create) {
+  //! TODO(yuyang18): Test GPU Code.
+  int array[3] = {1, 2, 3};
+  paddle_ivector vec = paddle_ivector_create(array, 3, true, false);
+  ASSERT_EQ(kPD_NO_ERROR, paddle_ivector_resize(vec, 1000));
+  uint64_t size;
+  ASSERT_EQ(kPD_NO_ERROR, paddle_ivector_get_size(vec, &size));
+  ASSERT_EQ(1000UL, size);  // the resize must be visible through get_size
+  ASSERT_EQ(kPD_NO_ERROR, paddle_ivector_destroy(vec));
+}
+
+TEST(CAPIVector, createNone) {
+  paddle_ivector vec = paddle_ivector_create_none();
+  ASSERT_EQ(kPD_NO_ERROR, paddle_ivector_destroy(vec));
+}
diff --git a/paddle/capi/tests/test_predict_network.py b/paddle/capi/tests/test_predict_network.py
new file mode 100644
index 0000000000..82ef5cb1a7
--- /dev/null
+++ b/paddle/capi/tests/test_predict_network.py
@@ -0,0 +1,13 @@
+from paddle.trainer_config_helpers import *
+
+settings(batch_size=100)
+
+x = data_layer(name='x', size=100)
+
+y = fc_layer(
+    input=x,
+    size=100,
+    bias_attr=ParamAttr(name='b'),
+    param_attr=ParamAttr(name='w'))
+
+outputs(y)
diff --git a/paddle/capi/vector.h b/paddle/capi/vector.h
new file mode 100644
index 0000000000..a92aeff164
--- /dev/null
+++ b/paddle/capi/vector.h
@@ -0,0 +1,89 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifndef __PADDLE_CAPI_VECTOR_H__
+#define __PADDLE_CAPI_VECTOR_H__
+
+#include <stdbool.h>
+#include <stdint.h>
+#include "config.h"
+#include "error.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * Int Vector Functions. Return will be a paddle_error type.
+ */
+typedef void* paddle_ivector;
+
+/**
+ * @brief Create a none int vector. It is just a handle and stores nothing.
+ *        Used to get output from other APIs.
+ * @return None int vector.
+ */
+PD_API paddle_ivector paddle_ivector_create_none();
+
+/**
+ * @brief paddle_ivector_create create a paddle int vector
+ * @param array: input array.
+ * @param size: input array size.
+ * @param copy: memory copy or just use same memory. True if copy.
+ * @param useGPU: True if use GPU
+ * @return paddle_ivector
+ */
+PD_API paddle_ivector paddle_ivector_create(int* array,
+                                            uint64_t size,
+                                            bool copy,
+                                            bool useGPU);
+
+/**
+ * @brief paddle_ivector_destroy destroy an int vector.
+ * @param ivec vector to be destroyed.
+ * @return paddle_error
+ */
+PD_API paddle_error paddle_ivector_destroy(paddle_ivector ivec);
+
+/**
+ * @brief paddle_ivector_get get raw buffer stored inside this int vector. It
+ * could be GPU memory if this int vector is stored in GPU.
+ * @param [in] ivec int vector
+ * @param [out] buffer the return buffer pointer.
+ * @return paddle_error
+ */
+PD_API paddle_error paddle_ivector_get(paddle_ivector ivec, int** buffer);
+
+/**
+ * @brief paddle_ivector_resize resize the int vector.
+ * @param [in] ivec: int vector
+ * @param [in] size: size to change
+ * @return paddle_error
+ */
+PD_API paddle_error paddle_ivector_resize(paddle_ivector ivec, uint64_t size);
+
+/**
+ * @brief paddle_ivector_get_size get the size of int vector.
+ * @param [in] ivec: int vector
+ * @param [out] size: return size of this int vector.
+ * @return paddle_error
+ */
+PD_API paddle_error paddle_ivector_get_size(paddle_ivector ivec,
+                                            uint64_t* size);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/paddle/cuda/CMakeLists.txt b/paddle/cuda/CMakeLists.txt
index a28ccd6f07..f9061e96de 100755
--- a/paddle/cuda/CMakeLists.txt
+++ b/paddle/cuda/CMakeLists.txt
@@ -21,16 +21,13 @@ set(CUDA_CXX_WITH_GPU_SOURCES
 
 if(WITH_GPU)
     set(CUDA_CXX_SOURCES
-        src/hl_dso_loader.cc
         src/hl_warpctc_wrap.cc
         ${CUDA_CXX_WITH_GPU_SOURCES})
 
     set_source_files_properties(${CUDA_CXX_SOURCES}
                                 PROPERTIES COMPILE_FLAGS "-D__NVCC__")
 else()
-    set(CUDA_CXX_SOURCES
-        src/hl_dso_loader.cc
-        src/hl_warpctc_wrap.cc)
+    set(CUDA_CXX_SOURCES src/hl_warpctc_wrap.cc)
 endif()
 
 set(CUDA_CU_SOURCES
@@ -47,7 +44,6 @@ set(CUDA_CU_SOURCES
 
 set(CUDA_HEADERS
     include/hl_time.h
-    include/hl_dso_loader.h
     include/hl_warpctc_wrap.h
     include/hl_sequence.h
     include/hl_cuda_cublas.h
diff --git a/paddle/cuda/include/hl_activation_functions.h b/paddle/cuda/include/hl_activation_functions.h
index cdb2dba06c..93957fd964 100644
--- a/paddle/cuda/include/hl_activation_functions.h
+++ b/paddle/cuda/include/hl_activation_functions.h
@@ -40,18 +40,18 @@ public:
 namespace gpu {
 static __device__ Active<real>::forward forward[] = HPPL_ACTIVE_FUNCTION;
 static __device__ Active<real>::backward backward[] = HPPL_ACTIVE_FUNCTION;
-}
+}  // namespace gpu
 #else
 namespace cpu {
 static Active<real>::forward forward[] = HPPL_ACTIVE_FUNCTION;
 static Active<real>::backward backward[] = HPPL_ACTIVE_FUNCTION;
-}
+}  // namespace cpu
 
 #ifdef __AVX__
 namespace avx {
 static Active<__m256>::forward forward[] = HPPL_ACTIVE_FUNCTION;
 static Active<__m256>::backward backward[] = HPPL_ACTIVE_FUNCTION;
-}
+}  // namespace avx
 #endif
 #endif
 
diff --git a/paddle/cuda/include/hl_cnn.h b/paddle/cuda/include/hl_cnn.h
index c5787630ab..f55197c8c9 100644
--- a/paddle/cuda/include/hl_cnn.h
+++ b/paddle/cuda/include/hl_cnn.h
@@ -273,23 +273,23 @@ extern void hl_bilinear_forward(const real* inData,
                                 const real ratioW);
 
 /**
-* @brief   Bilinear interpolation backward.
-*
-* @param[out]  inGrad      input gradient.
-* @param[in]   inImgH      input image height.
-* @param[in]   inImgW      input image width.
-* @param[in]   inputH      input batchSize.
-* @param[in]   inputW      input image data dim.
-* @param[in]   outGrad     output gradient.
-* @param[in]   outImgH     output image height.
-* @param[in]   outImgW     output image width.
-* @param[in]   outputH     output batchSize.
-* @param[in]   outputW     output image data dim.
-* @param[in]   numChannels number of channels.
-* @param[in]   ratioH      inImgH / outImgH.
-* @param[in]   ratioW      inImgW / outImgW.
-*
-*/
+ * @brief   Bilinear interpolation backward.
+ *
+ * @param[out]  inGrad      input gradient.
+ * @param[in]   inImgH      input image height.
+ * @param[in]   inImgW      input image width.
+ * @param[in]   inputH      input batchSize.
+ * @param[in]   inputW      input image data dim.
+ * @param[in]   outGrad     output gradient.
+ * @param[in]   outImgH     output image height.
+ * @param[in]   outImgW     output image width.
+ * @param[in]   outputH     output batchSize.
+ * @param[in]   outputW     output image data dim.
+ * @param[in]   numChannels number of channels.
+ * @param[in]   ratioH      inImgH / outImgH.
+ * @param[in]   ratioW      inImgW / outImgW.
+ *
+ */
 extern void hl_bilinear_backward(real* inGrad,
                                  const size_t inImgH,
                                  const size_t inImgW,
diff --git a/paddle/cuda/src/hl_cuda_cublas.cc b/paddle/cuda/src/hl_cuda_cublas.cc
index 182e8ab218..6163209e9b 100644
--- a/paddle/cuda/src/hl_cuda_cublas.cc
+++ b/paddle/cuda/src/hl_cuda_cublas.cc
@@ -14,10 +14,9 @@ limitations under the License. */
 
 #include "hl_cuda_cublas.h"
 #include <sys/time.h>
-#include <mutex>
 #include "hl_cuda.h"
-#include "hl_dso_loader.h"
 #include "hl_thread.ph"
+#include "paddle/utils/DynamicLoader.h"
 #include "paddle/utils/Logging.h"
 
 namespace dynload {
diff --git a/paddle/cuda/src/hl_cuda_cudnn.cc b/paddle/cuda/src/hl_cuda_cudnn.cc
index 6198f067ba..c53a563682 100644
--- a/paddle/cuda/src/hl_cuda_cudnn.cc
+++ b/paddle/cuda/src/hl_cuda_cudnn.cc
@@ -15,10 +15,9 @@ limitations under the License. */
 #include "hl_cuda_cudnn.h"
 #include <cudnn.h>
 #include <gflags/gflags.h>
-#include <mutex>
 #include "hl_cuda_cudnn.ph"
-#include "hl_dso_loader.h"
 #include "hl_thread.ph"
+#include "paddle/utils/DynamicLoader.h"
 #include "paddle/utils/Logging.h"
 
 DEFINE_int32(cudnn_conv_workspace_limit_in_mb,
diff --git a/paddle/cuda/src/hl_cuda_device.cc b/paddle/cuda/src/hl_cuda_device.cc
index 6dfb12e00b..4042d9742a 100644
--- a/paddle/cuda/src/hl_cuda_device.cc
+++ b/paddle/cuda/src/hl_cuda_device.cc
@@ -21,11 +21,10 @@ limitations under the License. */
 #include <sys/syscall.h>
 #include <sys/time.h>
 #include <unistd.h>
-#include <mutex>
 #include "hl_cuda.ph"
 #include "hl_thread.ph"
-#include "hl_dso_loader.h"
 #include "paddle/utils/Logging.h"
+#include "paddle/utils/DynamicLoader.h"
 // clang-format on
 
 namespace dynload {
diff --git a/paddle/cuda/src/hl_warpctc_wrap.cc b/paddle/cuda/src/hl_warpctc_wrap.cc
index f57efb2b46..9f812dd0de 100644
--- a/paddle/cuda/src/hl_warpctc_wrap.cc
+++ b/paddle/cuda/src/hl_warpctc_wrap.cc
@@ -14,7 +14,7 @@ limitations under the License. */
 
 #include "hl_warpctc_wrap.h"
 #include <mutex>
-#include "hl_dso_loader.h"
+#include "paddle/utils/DynamicLoader.h"
 #include "paddle/utils/Logging.h"
 
 namespace dynload {
diff --git a/paddle/function/CMakeLists.txt b/paddle/function/CMakeLists.txt
index 1522510e8b..233a53709a 100644
--- a/paddle/function/CMakeLists.txt
+++ b/paddle/function/CMakeLists.txt
@@ -12,7 +12,7 @@ endif()
 
 add_library(paddle_function STATIC ${cpp_files} ${cu_objs})
 add_dependencies(paddle_function ${external_project_dependencies})
-
+add_dependencies(paddle_function gen_proto_cpp)
 
 if(WITH_GPU)
 if(WITH_TESTING)
diff --git a/paddle/function/ContextProjectionOpTest.cpp b/paddle/function/ContextProjectionOpTest.cpp
index 0f5d6a848d..1b25172ca5 100644
--- a/paddle/function/ContextProjectionOpTest.cpp
+++ b/paddle/function/ContextProjectionOpTest.cpp
@@ -28,11 +28,12 @@ void testMatrixProjectionForward(int context_start,
                std::max(0, (int)(context_start + context_length - 1));
   if (pad == 0) is_padding = false;
 
-  FunctionCompare test("ContextProjectionForward",
-                       FuncConfig()
-                           .set("context_length", context_length)
-                           .set("context_start", context_start)
-                           .set("begin_pad", std::max(0, -context_start)));
+  FunctionCompare test(
+      "ContextProjectionForward",
+      FuncConfig()
+          .set("context_length", context_length)
+          .set("context_start", context_start)
+          .set("begin_pad", (size_t)std::max(0, -context_start)));
 
   // prepare input arguments
   test.addSequence(SequenceIdArg(TensorShape{batch_size}));
@@ -51,7 +52,7 @@ void testMatrixProjectionForward(int context_start,
 }
 
 void testMatrixProjectionBackward(int context_start,
-                                  int context_length,
+                                  size_t context_length,
                                   bool is_padding,
                                   size_t batch_size,
                                   size_t input_dim) {
@@ -59,13 +60,14 @@ void testMatrixProjectionBackward(int context_start,
                std::max(0, (int)(context_start + context_length - 1));
   if (pad == 0) is_padding = false;
 
-  FunctionCompare test("ContextProjectionBackward",
-                       FuncConfig()
-                           .set("context_length", context_length)
-                           .set("context_start", context_start)
-                           .set("begin_pad", std::max(0, -context_start))
-                           .set("is_padding", is_padding)
-                           .set("total_pad", pad));
+  FunctionCompare test(
+      "ContextProjectionBackward",
+      FuncConfig()
+          .set("context_length", context_length)
+          .set("context_start", context_start)
+          .set("begin_pad", (size_t)std::max(0, -context_start))
+          .set("is_padding", is_padding)
+          .set("total_pad", pad));
 
   // prepare input arguments
   test.addSequence(SequenceIdArg(TensorShape{batch_size}));
diff --git a/paddle/function/MulOpTest.cpp b/paddle/function/MulOpTest.cpp
index 8748eb0d79..8753057ebf 100644
--- a/paddle/function/MulOpTest.cpp
+++ b/paddle/function/MulOpTest.cpp
@@ -74,9 +74,9 @@ TEST(MulOp, DDDMatrixMul) {
 }
 
 /**
-  * C += A * B, B, C dense, A sparse
-  * dense = sparse * dense
-  */
+ * C += A * B, B, C dense, A sparse
+ * dense = sparse * dense
+ */
 void testFuncDSparseDMatrix(
     size_t dimM, size_t dimN, size_t dimK, size_t nnz, SparseFormat FORMAT) {
   real scaleT = 1.0;
@@ -119,9 +119,9 @@ TEST(MuLOp, DSparseDMul) {
 }
 
 /**
-  * C += A * B, A, C dense, B sparse
-  * dense = dense * sparse
-  */
+ * C += A * B, A, C dense, B sparse
+ * dense = dense * sparse
+ */
 void testFuncDDSparseMatrix(
     size_t dimM, size_t dimN, size_t dimK, size_t nnz, SparseFormat FORMAT) {
   real scaleT = 1.0;
@@ -165,9 +165,9 @@ TEST(MulOp, DDSparseMul) {
 }
 
 /**
-  * C += A * B, A sparse, B, C dense
-  * sparse = dense * dense
-  */
+ * C += A * B, A sparse, B, C dense
+ * sparse = dense * dense
+ */
 void testFuncSparseDDMatrix(
     size_t dimM, size_t dimN, size_t dimK, size_t nnz, SparseFormat FORMAT) {
   real scaleT = 1.0;
diff --git a/paddle/gserver/gradientmachines/GradientMachine.cpp b/paddle/gserver/gradientmachines/GradientMachine.cpp
index 3eb87d9b85..b44e4dc202 100644
--- a/paddle/gserver/gradientmachines/GradientMachine.cpp
+++ b/paddle/gserver/gradientmachines/GradientMachine.cpp
@@ -21,7 +21,6 @@ limitations under the License. */
 #include "MultiGradientMachine.h"
 #include "MultiNetwork.h"
 #include "NeuralNetwork.h"
-#include "NeuralNetwork.h"
 #include "ParallelNeuralNetwork.h"
 #include "hl_gpu.h"
 
diff --git a/paddle/gserver/gradientmachines/MultiGradientMachine.cpp b/paddle/gserver/gradientmachines/MultiGradientMachine.cpp
index 6ae60102b3..3159026e6b 100644
--- a/paddle/gserver/gradientmachines/MultiGradientMachine.cpp
+++ b/paddle/gserver/gradientmachines/MultiGradientMachine.cpp
@@ -518,7 +518,7 @@ void TrainerThread::computeThread() {
         backward();
         break;
       case MultiGradientMachine::TASK_COPY_IN_ARGS:
-        copyInArgs();
+        batchSize_ = copyInArgs();
         inArgsCopied_ = true;
         multiMachine_->waitForCopyInArgs();
         break;
diff --git a/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp b/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp
index 2ab964b8fc..01158d1dce 100644
--- a/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp
+++ b/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp
@@ -637,7 +637,7 @@ void RecurrentGradientMachine::removeBeamSearchStatisticsCallbacks() {
 /* create scattered id infomation for all realLayer of inFrameLines one time.
  * If hasSubseq, will also create scattered sequenceStartPositions infomation
  * for all realLayer of inFrameLines one time.
-*/
+ */
 
 void RecurrentGradientMachine::createInFrameInfo(int inlinkId,
                                                  const Argument& input,
diff --git a/paddle/gserver/gradientmachines/RecurrentGradientMachine.h b/paddle/gserver/gradientmachines/RecurrentGradientMachine.h
index 910ca4376b..c2bc52709a 100644
--- a/paddle/gserver/gradientmachines/RecurrentGradientMachine.h
+++ b/paddle/gserver/gradientmachines/RecurrentGradientMachine.h
@@ -107,18 +107,18 @@ public:
       DropCallback;
 
   /**
-    * @brief NormOrDropNodeCallback
-    *
-    * Normalize a path's probabilities or just drop it by modifying path.logProb
-    *
-    * The first parameter is sequence index in a batch
-    *
-    * The second parameter is path.ids
-    *
-    * The third parameter is probabilites for each node in this path.
-    *
-    * The fourth parameter is the probability of the whole path.
-    */
+   * @brief NormOrDropNodeCallback
+   *
+   * Normalize a path's probabilities or just drop it by modifying path.logProb
+   *
+   * The first parameter is sequence index in a batch
+   *
+   * The second parameter is path.ids
+   *
+   * The third parameter is probabilities for each node in this path.
+   *
+   * The fourth parameter is the probability of the whole path.
+   */
   typedef std::function<void(
       int seqId, const std::vector<int>&, std::vector<real>&, real*)>
       NormOrDropNodeCallback;
@@ -348,9 +348,9 @@ protected:
   int targetInfoInlinkId_;
 
   /* create scattered id infomation for all realLayer of inFrameLines one time.
-  *  If hasSubseq, will also create scattered sequenceStartPositions infomation
-  *  for all realLayer of inFrameLines one time.
-  */
+   *  If hasSubseq, will also create scattered sequenceStartPositions infomation
+   *  for all realLayer of inFrameLines one time.
+   */
   void createInFrameInfo(int inlinks_id,
                          const Argument& input,
                          PassType passType);
diff --git a/paddle/gserver/layers/CostLayer.cpp b/paddle/gserver/layers/CostLayer.cpp
index 4ae5b82870..69d5830dd2 100644
--- a/paddle/gserver/layers/CostLayer.cpp
+++ b/paddle/gserver/layers/CostLayer.cpp
@@ -217,7 +217,7 @@ void SmoothL1CostLayer::forwardImp(Matrix& output,
     targetCpu->copyFrom(target);
     outputCpu->copyFrom(output);
     labelCpu->copyFrom(*label.value);
-    targetCpu->smoothL1(*outputCpu, *(labelCpu));
+    targetCpu->smoothL1(*outputCpu, *labelCpu);
     target.copyFrom(*targetCpu);
   } else {
     target.smoothL1(output, *label.value);
diff --git a/paddle/gserver/layers/CostLayer.h b/paddle/gserver/layers/CostLayer.h
index 569a6840f0..14c0b33ec1 100644
--- a/paddle/gserver/layers/CostLayer.h
+++ b/paddle/gserver/layers/CostLayer.h
@@ -164,9 +164,11 @@ public:
  * tasks.
  * \f[
  * L =
- *   (output - label)^2 * 0.5  / -1 < (output - label) < 1 /
- *   (output - label) - 0.5    / otherwise  /
+ *   0.5 * x^2    / if |x| < 1 /
+ *   |x| - 0.5    / otherwise /
  * \f]
+ *
+ * x = output - label
  */
 class SmoothL1CostLayer : public CostLayer {
 public:
diff --git a/paddle/gserver/layers/Layer.h b/paddle/gserver/layers/Layer.h
index 7c4bea0721..0ed482889d 100644
--- a/paddle/gserver/layers/Layer.h
+++ b/paddle/gserver/layers/Layer.h
@@ -14,20 +14,18 @@ limitations under the License. */
 
 #pragma once
 
-#include <paddle/parameter/Argument.h>
 #include <functional>
 #include <memory>
 #include "ModelConfig.pb.h"
 #include "paddle/function/Function.h"
+#include "paddle/gserver/activations/ActivationFunction.h"
 #include "paddle/math/CpuSparseMatrix.h"
+#include "paddle/parameter/Argument.h"
 #include "paddle/parameter/Parameter.h"
+#include "paddle/parameter/Weight.h"
 #include "paddle/utils/ClassRegistrar.h"
 #include "paddle/utils/Util.h"
 
-#include <paddle/parameter/ParallelParameter.h>
-#include <paddle/parameter/Weight.h>
-#include "paddle/gserver/activations/ActivationFunction.h"
-
 /// Macro for registering a layer type.
 /// Example: REGISTER_LAYER(crf_error, CRFDecodingErrorLayer);
 #define REGISTER_LAYER(__type_name, __class_name) \
@@ -108,9 +106,9 @@ protected:
 
 public:
   /**
-    * Wait until all input value ready.
-    * Called before Layer::forward() function.
-    */
+   * Wait until all input value ready.
+   * Called before Layer::forward() function.
+   */
   virtual void waitInputValue();
 
   /**
@@ -120,9 +118,9 @@ public:
   virtual void copyOutputToOtherDevice();
 
   /**
-    * Wait until all output grad ready and merge them to output_.grad.
-    * Called before Layer::backward() function.
-    */
+   * Wait until all output grad ready and merge them to output_.grad.
+   * Called before Layer::backward() function.
+   */
   virtual void waitAndMergeOutputGrad();
 
   /**
diff --git a/paddle/gserver/layers/RotateLayer.h b/paddle/gserver/layers/RotateLayer.h
index 1a64d4d5a5..d05c2065cb 100644
--- a/paddle/gserver/layers/RotateLayer.h
+++ b/paddle/gserver/layers/RotateLayer.h
@@ -29,7 +29,7 @@ namespace paddle {
  *
  * The config file api is rotate_layer
  *
-*/
+ */
 
 class RotateLayer : public Layer {
 public:
diff --git a/paddle/gserver/layers/SequencePoolLayer.cpp b/paddle/gserver/layers/SequencePoolLayer.cpp
index 8c49502011..235d9a9b0f 100644
--- a/paddle/gserver/layers/SequencePoolLayer.cpp
+++ b/paddle/gserver/layers/SequencePoolLayer.cpp
@@ -60,7 +60,7 @@ void SequencePoolLayer::forward(PassType passType) {
    * thus, in this case, output_ has no sequenceStartPositions.
    * If type_ = kSeq, seq has sub-seq degrades to a seq, thus, only in this
    * case, we should compute the new sequenceStartPositions.
-  */
+   */
   if (type_) {
     CHECK(input.subSequenceStartPositions)
         << "when trans_type = seq, input must hasSubseq";
diff --git a/paddle/gserver/tests/sequence_layer_group.conf b/paddle/gserver/tests/sequence_layer_group.conf
index 68d150d553..50f2d89d02 100644
--- a/paddle/gserver/tests/sequence_layer_group.conf
+++ b/paddle/gserver/tests/sequence_layer_group.conf
@@ -48,8 +48,7 @@ lstm = lstmemory_group(
     size=hidden_dim,
     act=TanhActivation(),
     gate_act=SigmoidActivation(),
-    state_act=TanhActivation(),
-    lstm_layer_attr=ExtraLayerAttribute(error_clipping_threshold=50))
+    state_act=TanhActivation())
 
 lstm_last = last_seq(input=lstm)
 
diff --git a/paddle/gserver/tests/sequence_nest_layer_group.conf b/paddle/gserver/tests/sequence_nest_layer_group.conf
index 88cb42798b..c01b95f7a2 100644
--- a/paddle/gserver/tests/sequence_nest_layer_group.conf
+++ b/paddle/gserver/tests/sequence_nest_layer_group.conf
@@ -51,8 +51,7 @@ def lstm_group(lstm_group_input):
         size=hidden_dim,
         act=TanhActivation(),
         gate_act=SigmoidActivation(),
-        state_act=TanhActivation(),
-        lstm_layer_attr=ExtraLayerAttribute(error_clipping_threshold=50))
+        state_act=TanhActivation())
     return lstm_output
 
 
diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp
index 0d7bd8c3b8..e1e8e7fae7 100644
--- a/paddle/gserver/tests/test_LayerGrad.cpp
+++ b/paddle/gserver/tests/test_LayerGrad.cpp
@@ -1679,13 +1679,13 @@ TEST(Layer, smooth_l1) {
   TestConfig config;
   config.layerConfig.set_type("smooth_l1");
 
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 1, 0});
-  config.inputDefs.push_back({INPUT_DATA_TARGET, "layer_1", 1, 0});
+  config.inputDefs.push_back({INPUT_DATA, "layer_0", 200, 0});
+  config.inputDefs.push_back({INPUT_DATA_TARGET, "layer_1", 200, 0});
   config.layerConfig.add_inputs();
   config.layerConfig.add_inputs();
 
   for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "smooth_l1", 100, false, useGpu, false, 2.0);
+    testLayerGrad(config, "smooth_l1", 100, false, useGpu, false);
   }
 }
 
diff --git a/paddle/gserver/tests/test_RecurrentGradientMachine.cpp b/paddle/gserver/tests/test_RecurrentGradientMachine.cpp
index 150850da4d..4a846397e6 100644
--- a/paddle/gserver/tests/test_RecurrentGradientMachine.cpp
+++ b/paddle/gserver/tests/test_RecurrentGradientMachine.cpp
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #include <gtest/gtest.h>
 #include <paddle/gserver/gradientmachines/GradientMachine.h>
+#include <paddle/parameter/ParameterUpdateFunctions.h>
 #include <paddle/trainer/Trainer.h>
 #include <paddle/trainer/TrainerInternal.h>
 #include <paddle/utils/PythonUtil.h>
diff --git a/paddle/math/MathFunctions.cpp b/paddle/math/MathFunctions.cpp
index 6203cd3b9a..178fce5b0a 100644
--- a/paddle/math/MathFunctions.cpp
+++ b/paddle/math/MathFunctions.cpp
@@ -15,6 +15,54 @@ limitations under the License. */
 #include "MathFunctions.h"
 #include "hl_matrix_apply.cuh"
 #include "hl_matrix_ops.cuh"
+#include "paddle/utils/DynamicLoader.h"
+
+namespace dynload {
+
+std::once_flag lapack_dso_flag;
+void* lapack_dso_handle = nullptr;
+
+/**
+ * The following macro definition can generate structs
+ * (for each function) to dynamic load lapack routine
+ * via operator overloading.
+ *
+ * note: default dynamic linked libs
+ */
+#define DYNAMIC_LOAD_LAPACK_WRAP(__name)                                       \
+  struct DynLoad__##__name {                                                   \
+    template <typename... Args>                                                \
+    auto operator()(Args... args) -> decltype(__name(args...)) {               \
+      using lapack_func = decltype(__name(args...)) (*)(Args...);              \
+      std::call_once(lapack_dso_flag, GetLapackDsoHandle, &lapack_dso_handle); \
+      void* p_##__name = dlsym(lapack_dso_handle, #__name);                    \
+      return reinterpret_cast<lapack_func>(p_##__name)(args...);               \
+    }                                                                          \
+  } __name;  // struct DynLoad__##__name
+
+// clang-format off
+#ifdef PADDLE_USE_ATLAS
+  #define  PADDLE_SGETRF  clapack_sgetrf
+  #define  PADDLE_DGETRF  clapack_dgetrf
+  #define  PADDLE_SGETRI  clapack_sgetri
+  #define  PADDLE_DGETRI  clapack_dgetri
+#else
+  #define  PADDLE_SGETRF  LAPACKE_sgetrf
+  #define  PADDLE_DGETRF  LAPACKE_dgetrf
+  #define  PADDLE_SGETRI  LAPACKE_sgetri
+  #define  PADDLE_DGETRI  LAPACKE_dgetri
+#endif  
+
+#define LAPACK_ROUTINE_EACH(__macro)       \
+  __macro(PADDLE_SGETRF)                   \
+  __macro(PADDLE_DGETRF)                   \
+  __macro(PADDLE_SGETRI)                   \
+  __macro(PADDLE_DGETRI)
+// clang-format on
+
+LAPACK_ROUTINE_EACH(DYNAMIC_LOAD_LAPACK_WRAP)
+
+}  // namespace dynload
 
 namespace paddle {
 
@@ -85,16 +133,7 @@ int getrf<float>(const CBLAS_ORDER order,
                  float* A,
                  const int lda,
                  int* ipiv) {
-#ifdef PADDLE_USE_LAPACK
-#ifdef PADDLE_USE_ATLAS
-  return clapack_sgetrf(order, M, N, A, lda, ipiv);
-#else
-  return LAPACKE_sgetrf(order, M, N, A, lda, ipiv);
-#endif
-#else
-  LOG(FATAL) << "Not implemented";
-#endif
-  return 0;
+  return dynload::PADDLE_SGETRF(order, M, N, A, lda, ipiv);
 }
 
 template <>
@@ -104,16 +143,7 @@ int getrf<double>(const CBLAS_ORDER order,
                   double* A,
                   const int lda,
                   int* ipiv) {
-#ifdef PADDLE_USE_LAPACK
-#ifdef PADDLE_USE_ATLAS
-  return clapack_dgetrf(order, M, N, A, lda, ipiv);
-#else
-  return LAPACKE_dgetrf(order, M, N, A, lda, ipiv);
-#endif
-#else
-  LOG(FATAL) << "Not implemented";
-#endif
-  return 0;
+  return dynload::PADDLE_DGETRF(order, M, N, A, lda, ipiv);
 }
 
 template <>
@@ -122,16 +152,7 @@ int getri<float>(const CBLAS_ORDER order,
                  float* A,
                  const int lda,
                  const int* ipiv) {
-#ifdef PADDLE_USE_LAPACK
-#ifdef PADDLE_USE_ATLAS
-  return clapack_sgetri(order, N, A, lda, ipiv);
-#else
-  return LAPACKE_sgetri(order, N, A, lda, ipiv);
-#endif
-#else
-  LOG(FATAL) << "Not implemented";
-#endif
-  return 0;
+  return dynload::PADDLE_SGETRI(order, N, A, lda, ipiv);
 }
 
 template <>
@@ -140,15 +161,6 @@ int getri<double>(const CBLAS_ORDER order,
                   double* A,
                   const int lda,
                   const int* ipiv) {
-#ifdef PADDLE_USE_LAPACK
-#ifdef PADDLE_USE_ATLAS
-  return clapack_dgetri(order, N, A, lda, ipiv);
-#else
-  return LAPACKE_dgetri(order, N, A, lda, ipiv);
-#endif
-#else
-  LOG(FATAL) << "Not implemented";
-#endif
-  return 0;
+  return dynload::PADDLE_DGETRI(order, N, A, lda, ipiv);
 }
 
diff --git a/paddle/math/MathFunctions.h b/paddle/math/MathFunctions.h
index 9f8f84a87c..c8559eefd8 100644
--- a/paddle/math/MathFunctions.h
+++ b/paddle/math/MathFunctions.h
@@ -17,14 +17,11 @@ limitations under the License. */
 
 #ifdef PADDLE_USE_MKL
 #include <mkl.h>
-#ifdef PADDLE_USE_LAPACK
 #include <mkl_lapacke.h>
-#endif
 #else
 extern "C" {
 #include <cblas.h>
 }
-#ifdef PADDLE_USE_LAPACK
 #ifdef PADDLE_USE_ATLAS
 extern "C" {
 #include <clapack.h>
@@ -33,7 +30,6 @@ extern "C" {
 #include <lapacke.h>
 #endif
 #endif
-#endif
 
 #include <cmath>
 
diff --git a/paddle/math/Matrix.cpp b/paddle/math/Matrix.cpp
index 55a7344495..6ac61be0bf 100644
--- a/paddle/math/Matrix.cpp
+++ b/paddle/math/Matrix.cpp
@@ -3616,17 +3616,18 @@ void CpuMatrix::smoothL1(Matrix& output, Matrix& label) {
   CHECK_EQ(output.getHeight(), numSamples);
   CHECK_EQ(label.getWidth(), dim);
   CHECK_EQ(getWidth(), (size_t)1);
-  real* out = output.getData();
+
   real* cost = getData();
+  real* out = output.getData();
   real* lbl = label.getData();
 
-  for (size_t i = 0; i < numSamples; ++i, out += dim, cost += dim, lbl += dim) {
+  for (size_t i = 0; i < numSamples; ++i, out += dim, lbl += dim) {
     for (size_t j = 0; j < dim; ++j) {
-      cost[j] = std::fabs(out[j] - lbl[j]);
-      if (cost[j] < 1.0)
-        cost[j] = 0.5 * cost[j] * cost[j];
+      real absVal = std::fabs(out[j] - lbl[j]);
+      if (absVal < 1.0)
+        cost[i] += 0.5 * absVal * absVal;
       else
-        cost[j] = cost[j] - 0.5;
+        cost[i] += absVal - 0.5;
     }
   }
 }
@@ -3640,17 +3641,20 @@ void CpuMatrix::smoothL1Bp(Matrix& output, Matrix& label) {
   CHECK_EQ(label.getHeight(), numSamples);
   CHECK_EQ(output.getHeight(), numSamples);
   CHECK_EQ(label.getWidth(), dim);
-  CHECK_EQ(getWidth(), (size_t)1);
+  CHECK_EQ(getWidth(), dim);
+
   real* out = output.getData();
-  real* cost = getData();
   real* lbl = label.getData();
+  real* grad = getData();
 
-  // f'(x) = x         if |x| < 1
-  //       = sign(x)   otherwise
-  for (size_t i = 0; i < numSamples; ++i, out += dim, cost += dim, lbl += dim) {
+  for (size_t i = 0; i < numSamples; ++i, out += dim, grad += dim, lbl += dim) {
     for (size_t j = 0; j < dim; ++j) {
-      cost[j] = out[j] - lbl[j];
-      if (std::fabs(cost[j]) >= 1) cost[j] = (0 < cost[j]) - (cost[j] < 0);
+      real val = out[j] - lbl[j];
+      if (std::fabs(val) < 1) {
+        grad[j] += val;
+      } else {
+        grad[j] += (real(0) < val) - (val < real(0));
+      }
     }
   }
 }
diff --git a/paddle/math/tests/TestUtils.h b/paddle/math/tests/TestUtils.h
index c302096188..713f407f49 100644
--- a/paddle/math/tests/TestUtils.h
+++ b/paddle/math/tests/TestUtils.h
@@ -37,7 +37,7 @@ limitations under the License. */
  *
  *  AutoCompare test;
  *  test.cmpWithoutArg<I...>(function, height, width)
-*/
+ */
 
 #include <gtest/gtest.h>
 #include "TensorCheck.h"
diff --git a/paddle/math/tests/test_matrixCompare.cpp b/paddle/math/tests/test_matrixCompare.cpp
index dd19fe516f..3b1b0065af 100644
--- a/paddle/math/tests/test_matrixCompare.cpp
+++ b/paddle/math/tests/test_matrixCompare.cpp
@@ -21,6 +21,7 @@ limitations under the License. */
 #include "paddle/math/Matrix.h"
 #include "paddle/math/SparseMatrix.h"
 #include "paddle/testing/TestUtil.h"
+#include "paddle/utils/DynamicLoader.h"
 #include "paddle/utils/Stat.h"
 #include "paddle/utils/Util.h"
 
@@ -235,8 +236,15 @@ TEST(Matrix, unary) {
       testMatrixTranspose(height, width);
       testMatrixRotate(height, width);
     }
-    // inverse
-    testMatrixInverse(height);
+    // inverse matrix: requires LAPACK, so skip when its DSO cannot be loaded
+    void* dso_handler = nullptr;
+    GetLapackDsoHandle(&dso_handler);
+    if (nullptr == dso_handler) {
+      LOG(WARNING) << "Failed to find liblapack.so, please specify its path "
+                      "using LD_LIBRARY_PATH.";
+    } else {
+      testMatrixInverse(height);
+    }
   }
 }
 
diff --git a/paddle/parameter/Argument.cpp b/paddle/parameter/Argument.cpp
index 645bf73799..6d9365af2d 100644
--- a/paddle/parameter/Argument.cpp
+++ b/paddle/parameter/Argument.cpp
@@ -570,7 +570,7 @@ void Argument::poolSequenceWithStride(const Argument& input,
 
   CHECK(input.sequenceStartPositions);
   CHECK_EQ(input.hasSubseq(), 0UL);
-  CHECK_GT(stride, 0) << "stride must larger than 0";
+  CHECK_GT(stride, 0UL) << "stride must larger than 0";
   size_t numSequences = input.getNumSequences();
   ICpuGpuVector::resizeOrCreate(
       sequenceStartPositions, numSequences + 1, false);
diff --git a/paddle/parameter/FirstOrderOptimizer.h b/paddle/parameter/FirstOrderOptimizer.h
index 095019b74f..caa78acd98 100644
--- a/paddle/parameter/FirstOrderOptimizer.h
+++ b/paddle/parameter/FirstOrderOptimizer.h
@@ -126,7 +126,7 @@ protected:
 /*
  * AdaDelta Optimization.
  * http://www.matthewzeiler.com/pubs/googleTR2012/googleTR2012.pdf
-*/
+ */
 class AdaDeltaParameterOptimizer : public ParameterOptimizer {
 public:
   explicit AdaDeltaParameterOptimizer(const OptimizationConfig& optConfig)
diff --git a/paddle/parameter/ParallelParameter.cpp b/paddle/parameter/ParallelParameter.cpp
deleted file mode 100644
index cea77e5b17..0000000000
--- a/paddle/parameter/ParallelParameter.cpp
+++ /dev/null
@@ -1,209 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <fstream>
-#include "paddle/utils/Logging.h"
-
-#include "ParallelParameter.h"
-
-namespace paddle {
-
-UpdateFunction paramUpdateFunctions[UPDATE_TYPE_NUM] = {
-    nullptr,  // &ParallelParameter::singleUpdate,  /* single thread */
-    nullptr,  // &ParallelParameter::controlUpdate,    /* controller thread */
-    &ParallelParameter::majorUpdate, /* major thread */
-    &ParallelParameter::minorUpdate, /* minor thread */
-
-    nullptr,                         /* master */
-    &ParallelParameter::slaveUpdate, /* slave */
-};
-ParallelParameterPtr ParallelParameter::create(TrainerRole role,
-                                               ParameterPtr localParam,
-                                               int asyncCount) {
-  ParallelParameterPtr ptr = nullptr;
-  switch (role) {
-    case TRAINER_ROLE_CONTROL:
-    case TRAINER_ROLE_MAJOR:
-    case TRAINER_ROLE_MINOR:
-      ptr = std::make_shared<SyncParameter>(role, localParam);
-      break;
-    case TRAINER_ROLE_MASTER:
-    case TRAINER_ROLE_SLAVE:
-      ptr = std::make_shared<AsyncParameter>(role, asyncCount, localParam);
-      break;
-    default:
-      LOG(FATAL) << "unknown role " << role << "\n";
-  }
-  return ptr;
-}
-void ParallelParameter::syncUpdate(TrainerRole role, real learnRate) {
-  if (paramUpdateFunctions[role]) {
-    (this->*paramUpdateFunctions[role])(learnRate);
-  }
-}
-
-void SyncParameter::attachControlParam(ParallelParameterPtr controler) {
-  controlParam_ = controler;
-}
-
-void SyncParameter::attachMajorParam(ParallelParameterPtr partner) {
-  majorPartners_.push_back(partner);
-  if (role_ == TRAINER_ROLE_CONTROL) {
-    localParam_->setSharedCount(majorPartners_.size());
-  }
-  // partnerParam_ = partner;
-}
-
-void SyncParameter::attachMinorParam(ParallelParameterPtr partner,
-                                     int deviceId) {
-  minorPartners_.push_back(partner);
-  minorDeviceIds_.push_back(deviceId);
-  // partnerParam_ = partner;
-}
-
-void SyncParameter::waitAllMajorGradReady() {
-  for (size_t i = 0; i < majorPartners_.size(); i++) {
-    majorPartners_[i]->waitGradReady();
-    partnerParam_ = majorPartners_[i]->getLocalParameter();
-    VectorPtr localGrad = localParam_->getBuf(PARAMETER_GRADIENT);
-    VectorPtr patnrGrad = partnerParam_->getBuf(PARAMETER_GRADIENT);
-    if (FLAGS_use_gpu) hl_set_device(minorDeviceIds_[i]);
-    localGrad->add(*patnrGrad);
-  }
-}
-
-void SyncParameter::synchronizeParamter() {
-  valueSem_->wait();
-  if (role_ == TRAINER_ROLE_MINOR) {
-    /* copy the value from controller */
-    VectorPtr cntrlVec =
-        (controlParam_->getLocalParameter())->getBuf(PARAMETER_VALUE);
-    VectorPtr localVec = localParam_->getBuf(PARAMETER_VALUE);
-    localVec->copyFrom(*cntrlVec);
-
-    /* dispatch the value to major */
-    for (size_t i = 0; i < majorPartners_.size(); i++) {
-      VectorPtr majorVec =
-          (majorPartners_[i]->getLocalParameter())->getBuf(PARAMETER_VALUE);
-      majorVec->copyFrom(*localVec);
-      majorPartners_[i]->postValueReady();
-    }
-  }
-}
-
-void SyncParameter::singleUpdate(real learnRate) {
-  CHECK(role_ == TRAINER_ROLE_SINGLE);
-  localParam_->updateWithGradient(learnRate);
-}
-
-void SyncParameter::controlUpdate(const UpdateCallback &callBack) {
-  CHECK(role_ == TRAINER_ROLE_CONTROL);
-  CHECK(gradSem_ != NULL && valueSem_ != NULL);
-  CHECK(majorPartners_.size());
-
-  /* update */
-  if (callBack) {
-    callBack(localParam_.get());
-    localParam_->clearGradient();
-  }
-
-  for (size_t i = 0; i < minorPartners_.size(); i++) {
-    minorPartners_[i]->postValueReady();
-  }
-}
-
-void SyncParameter::majorUpdate(real learnRate) {
-  (void)learnRate;
-  CHECK(role_ == TRAINER_ROLE_MAJOR);
-  CHECK(gradSem_ != NULL && valueSem_ != NULL);
-  CHECK(minorPartners_.size() && controlParam_);
-
-  /* wait the minor-Gradient is ready */
-  for (size_t i = 0; i < minorPartners_.size(); i++) {
-    minorPartners_[i]->waitGradReady();
-    partnerParam_ = minorPartners_[i]->getLocalParameter();
-    VectorPtr localGrad = localParam_->getBuf(PARAMETER_GRADIENT);
-    VectorPtr minorGrad = partnerParam_->getBuf(PARAMETER_GRADIENT);
-    localGrad->add(*minorGrad);
-  }
-
-  /* notice the controller that the gradient is ready */
-  gradSem_->post();
-}
-
-void SyncParameter::minorUpdate(real learnRate) {
-  (void)learnRate;
-  CHECK(role_ == TRAINER_ROLE_MINOR);
-  CHECK(gradSem_ != NULL && valueSem_ != NULL);
-
-  // notice the major that the gradient is ready
-  gradSem_->post();
-}
-
-AsyncParameter::AsyncParameter(TrainerRole role,
-                               int asyncCount,
-                               ParameterPtr localParam)
-    : ParallelParameter(role, localParam) {
-  asyncCount_ = asyncCount;
-  accumCounter_ = 0;
-  gradientAccum_ = Vector::create(localParam->getSize(), localParam->useGpu());
-  gradientAccum_->zeroMem();
-}
-
-void AsyncParameter::slaveUpdate(real learnRate) {
-  /* increase the accumCounter_ */
-  accumCounter_++;
-
-  /* accumulate the gradient to the buffer */
-  VectorPtr grad = localParam_->getBuf(PARAMETER_GRADIENT);
-  gradientAccum_->add(*grad);
-
-  /* if need to be synchronized with the master */
-  if (accumCounter_ == asyncCount_) {
-    gradSem_->post();
-    // accumCounter_ = 0; NOTICE: the upper-function need to reset the counter
-  } else {  // self update
-    localParam_->updateWithGradient(learnRate);
-  }
-  localParam_->clearGradient();
-}
-
-bool AsyncParameter::masterUpdate(ParallelParameterPtr slaveParam,
-                                  const UpdateCallback &callback) {
-  CHECK(slaveParam && callback);
-
-  /* wait the slave is ready */
-  if (!slaveParam->timeWaitGradReady(5)) {
-    return false;
-  }
-
-  AsyncParameter *asyncParam = dynamic_cast<AsyncParameter *>(slaveParam.get());
-
-  /* get the accum-gradient to update local parameter */
-  VectorPtr slaveVec = asyncParam->getAccum();
-  localParam_->getBuf(PARAMETER_GRADIENT)->copyFrom(*slaveVec);
-  callback(localParam_.get());
-  // slaveVec->zeroMem();
-
-  /* copy the newest parameter-value to the slave */
-  slaveVec = (slaveParam->getLocalParameter())->getBuf(PARAMETER_VALUE);
-  slaveVec->copyFrom(*(localParam_->getBuf(PARAMETER_VALUE)));
-
-  /* release the semphore */
-  slaveParam->postValueReady();
-
-  return true;
-}
-
-}  // namespace paddle
diff --git a/paddle/parameter/ParallelParameter.h b/paddle/parameter/ParallelParameter.h
deleted file mode 100644
index 2e7c18b808..0000000000
--- a/paddle/parameter/ParallelParameter.h
+++ /dev/null
@@ -1,244 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <stdint.h>
-
-#include <sys/time.h>
-#include <unistd.h>
-#include <iostream>
-#include <string>
-#include <vector>
-
-#include "hl_gpu.h"
-#include "paddle/math/Vector.h"
-#include "paddle/parameter/Parameter.h"
-#include "paddle/parameter/ParameterUpdateFunctions.h"
-#include "paddle/utils/Common.h"
-#include "paddle/utils/Flags.h"
-#include "paddle/utils/Locks.h"
-
-#include "ParameterConfig.pb.h"
-
-namespace paddle {
-
-class ParallelParameter;
-class SyncParameter;
-class AsyncParameter;
-
-typedef std::shared_ptr<ParallelParameter> ParallelParameterPtr;
-
-const int UPDATE_TYPE_NUM = 32;
-
-/**
- * TrainRole denotes the role of current training, different roles have
- * different jobs.
- *
- * control, major, minor are three kinds of role to support mutiple GPUs
- * parallel SGD training. SM on GPU card has two groups, each group
- * consist of a major and a minor.
- *
- * @param    single  single GPU card single thread training.
- *
- *
- * @param    control current parameter updates via control role,
- *                   not participate in real training. control role is
- *                   responsible for merging all major's gradient and
- *                   update parameter value.
- *
- * @param    major   major role paticipates in real training, when local
- *                   gradient is ready, merge its corresponding minor's
- *                   gradient and notify controller: this group's gradient
- *                   is already ready.
- *
- * @param    minor   minor role participates in real training, when local
- *                   gradient is ready, only notify its corresponding major.
- *                   In order to maximum apportion jobs, after controller
- *                   updates the paramemter value, each group's minior
- *                   reponses to dispatch the latest model into local and
- *                   major.
- */
-enum TrainerRole {
-  TRAINER_ROLE_SINGLE,
-  TRAINER_ROLE_CONTROL,
-  TRAINER_ROLE_MAJOR,
-  TRAINER_ROLE_MINOR,
-  TRAINER_ROLE_MASTER,
-  TRAINER_ROLE_SLAVE
-};
-typedef void (ParallelParameter::*UpdateFunction)(real learnRate);
-
-class ParallelParameter {
-public:
-  static ParallelParameterPtr create(TrainerRole role,
-                                     ParameterPtr localParam,
-                                     int asyncCount = 1);
-
-  ParallelParameter(TrainerRole role, ParameterPtr localParam) {
-    role_ = role;
-    gradSem_.reset(new Semaphore(0));
-    valueSem_.reset(new Semaphore(0));
-    localParam_ = localParam;
-  }
-
-  virtual ~ParallelParameter() {}
-
-  ParameterPtr getLocalParameter() { return localParam_; }
-  bool timeWaitGradReady(int sec) {
-    struct timespec ts;
-    ts.tv_nsec = 0;
-    ts.tv_sec = time(NULL) + sec;
-    return gradSem_->timeWait(&ts);
-  }
-  void waitGradReady() { gradSem_->wait(); }
-  void postValueReady() { valueSem_->post(); }
-
-  void syncUpdate(TrainerRole role, real learnRate);
-
-  virtual void synchronizeParamter() = 0;
-
-  /**
-   * for synchronous
-   */
-  virtual void singleUpdate(real learnRate) { (void)learnRate; }
-
-  virtual void controlUpdate(const UpdateCallback& callback) { (void)callback; }
-
-  virtual void majorUpdate(real learnRate) { (void)learnRate; }
-
-  virtual void minorUpdate(real learnRate) { (void)learnRate; }
-
-  /**
-   * for asynchronous
-   */
-  virtual void slaveUpdate(real learnRate) { (void)learnRate; }
-
-protected:
-  TrainerRole role_;
-  ParameterPtr localParam_;
-  std::unique_ptr<Semaphore>
-      gradSem_;  /// wether the local parameter-gradient is ready
-  std::unique_ptr<Semaphore>
-      valueSem_;  /// wether the local parameter-value is updated
-};
-
-/**
- * this class is designed for multi-threading training.
- *
- * "Synchronous" means multiple GPUs calculate 1/4 mini-Batch,
- * but will get only one gradient
- */
-class SyncParameter : public ParallelParameter {
-public:
-  SyncParameter(TrainerRole role, ParameterPtr localParam)
-      : ParallelParameter(role, localParam) {
-    controlParam_ = nullptr;
-    majorPartners_.clear();
-    minorPartners_.clear();
-  }
-  ~SyncParameter() {
-    majorPartners_.clear();
-    minorPartners_.clear();
-  }
-  void attachControlParam(ParallelParameterPtr controler);
-
-  void attachMajorParam(ParallelParameterPtr partner);
-
-  void attachMinorParam(ParallelParameterPtr partner, int deviceId);
-
-  void waitAllMajorGradReady();
-
-  void synchronizeParamter();
-
-  void singleUpdate(real learnRate);
-
-  void controlUpdate(const UpdateCallback& callback);
-
-  void majorUpdate(real learnRate);
-
-  void minorUpdate(real learnRate);
-
-  std::vector<ParallelParameterPtr>& getMajorPartners() {
-    return majorPartners_;
-  }
-
-  std::vector<ParallelParameterPtr>& getMinorPartners() {
-    return minorPartners_;
-  }
-
-private:
-  // The following variables are used in a multithreaded training situation
-  // partnerParam_ is local-parameter's partner
-  // controlParam_ is the controller-thread 's parameter
-  ParameterPtr partnerParam_;
-  std::vector<ParallelParameterPtr> majorPartners_;
-  std::vector<ParallelParameterPtr> minorPartners_;
-  std::vector<int> minorDeviceIds_;
-  ParallelParameterPtr controlParam_;
-};
-
-class AsyncParameter : public ParallelParameter {
-public:
-  AsyncParameter(TrainerRole role, int asyncCount, ParameterPtr localParam);
-
-  void clearCounter() { accumCounter_ = 0; }
-
-  VectorPtr getAccum() { return gradientAccum_; }
-
-  void synchronizeParamter() {
-    if (accumCounter_ == asyncCount_) {
-      valueSem_->wait();
-      clearCounter();
-      gradientAccum_->zeroMem();
-    }
-  }
-
-  /**
-   * When asynchronous training, update strategy including slave and master.
-   *
-   * slave: If in range asyncCount, adopting self-update method.
-   *        If beyond asyncCount, waiting for master to update.
-   */
-  void slaveUpdate(real learnRate);
-
-  /**
-   * When asynchronous training, update strategy including slave and master.
-   *
-   * master: it only polls slaves, do not training data.
-   *         If slave's gradient is ready, fetch it.
-   *         Update master's parameter, then copy it into
-   *         corresponding slave.
-   */
-  bool masterUpdate(ParallelParameterPtr slaveParam,
-                    const UpdateCallback& callback);
-
-private:
-  /**
-   * When asynchronous training, every aysnc trainer needs to
-   * accumulate a number of batch gradient.
-   *
-   * gradientAccum_ is used to save the sum of gradients.
-   */
-  VectorPtr gradientAccum_;
-
-  /// Asynchronous count.
-  int asyncCount_;
-  /// Accumulate counter of current gradients.
-  int accumCounter_;
-};
-
-typedef std::map<std::string, ParallelParameterPtr> ParallelParameterMap;
-
-}  // namespace paddle
diff --git a/paddle/parameter/Parameter.cpp b/paddle/parameter/Parameter.cpp
index 1ccded8187..b8efabbe2a 100644
--- a/paddle/parameter/Parameter.cpp
+++ b/paddle/parameter/Parameter.cpp
@@ -271,55 +271,6 @@ SparsePrefetchRowCpuMatrix* Parameter::getPrefetchMatrix() {
   return nullptr;
 }
 
-void Parameter::updateWithGradient(real learningRate) {
-  sgdUpdate(learningRate * config_.learning_rate(),
-            config_.momentum(),
-            config_.decay_rate(),
-            bufs_[PARAMETER_VALUE].get(),
-            bufs_[PARAMETER_GRADIENT].get(),
-            bufs_[PARAMETER_MOMENTUM].get());
-}
-
-void Parameter::updateWithGradient(real learningRate,
-                                   MatrixPtr gradMat,
-                                   IVectorPtr t0,
-                                   int currentTime,
-                                   bool fini) {
-  SparseRowCpuMatrix* sparseMat =
-      dynamic_cast<SparseRowCpuMatrix*>(gradMat.get());
-  CHECK(sparseMat);
-  CHECK_EQ(config_.momentum(), 0.0f)
-      << "not support momentum in sparse input sgd";
-  bool useL1 = (config_.decay_rate_l1() != 0.0f);
-  sparseMat->sgdUpdate(*bufs_[PARAMETER_VALUE],
-                       *t0,
-                       learningRate * config_.learning_rate(),
-                       currentTime,
-                       useL1 ? config_.decay_rate_l1() : config_.decay_rate(),
-                       useL1,
-                       fini);
-}
-
-void Parameter::updateWithGradient(real learningRate,
-                                   VectorPtr gradVec,
-                                   bool normalUpdate) {
-  if (normalUpdate) {
-    sgdUpdate(learningRate * config_.learning_rate(),
-              config_.momentum(),
-              config_.decay_rate(),
-              bufs_[PARAMETER_VALUE].get(),
-              gradVec.get(),
-              bufs_[PARAMETER_MOMENTUM].get());
-  } else {
-    size_t size = gradVec->getSize();
-    real* mom = bufs_[PARAMETER_MOMENTUM]->getData();
-    real* grad = gradVec->getData();
-    real* value = bufs_[PARAMETER_VALUE]->getData();
-    hl_matrix_add(mom, grad, mom, 1, size, 1.0f, learningRate);
-    hl_matrix_add(value, grad, value, 1, size, 1.0f, learningRate);
-  }
-}
-
 void Parameter::incUpdate(const UpdateCallback& callback) {
   // Static parameter is fixed, and does not need to be updated
   if (isStatic()) {
diff --git a/paddle/parameter/Parameter.h b/paddle/parameter/Parameter.h
index 72c8336799..36d2b65f3b 100644
--- a/paddle/parameter/Parameter.h
+++ b/paddle/parameter/Parameter.h
@@ -223,29 +223,6 @@ public:
 
   bool isValueUpdated() const { return updated_; }
 
-  /**
-   * Update bufs_[PARAMETER_VALUE] using bufs_[PARAMETER_GRADIENT]
-   */
-  void updateWithGradient(real learningRate);
-
-  /**
-   * Update bufs_[PARAMETER_VALUE] using sparse row grad matrix.
-   *
-   * @see SparseRowCpuMatrix::sgdUpdate for more information.
-   */
-  void updateWithGradient(real learningRate,
-                          MatrixPtr gradMat,
-                          IVectorPtr t0,
-                          int currentTime,
-                          bool fini = false);
-
-  /**
-   * This function is used to calculate multiple gpus, but only as a candidate
-   */
-  void updateWithGradient(real learningRate,
-                          VectorPtr grad,
-                          bool normalUpdate = true);
-
   /**
    * Save parameter value to a file
    */
diff --git a/paddle/scripts/cpplint.py b/paddle/scripts/cpplint.py
index 157ce7b44a..dff4339ea3 100644
--- a/paddle/scripts/cpplint.py
+++ b/paddle/scripts/cpplint.py
@@ -58,6 +58,7 @@ _USAGE = """
 Syntax: cpplint.py [--verbose=#] [--output=vs7] [--filter=-x,+y,...]
                    [--counting=total|toplevel|detailed] [--root=subdir]
                    [--linelength=digits]
+                   [--write-success=success_status_file]
         <file> [file] ...
 
   The style guidelines this tries to follow are those in
@@ -499,6 +500,8 @@ _line_length = 80
 # This is set by --extensions flag.
 _valid_extensions = set(['cc', 'h', 'cpp', 'cu', 'cuh'])
 
+_write_success = None
+
 
 def ParseNolintSuppressions(filename, raw_line, linenum, error):
     """Updates the global list of error-suppressions.
@@ -6337,7 +6340,7 @@ def ParseArguments(args):
     try:
         (opts, filenames) = getopt.getopt(args, '', [
             'help', 'output=', 'verbose=', 'counting=', 'filter=', 'root=',
-            'linelength=', 'extensions='
+            'linelength=', 'extensions=', 'write-success='
         ])
     except getopt.GetoptError:
         PrintUsage('Invalid arguments.')
@@ -6382,6 +6385,9 @@ def ParseArguments(args):
                 _valid_extensions = set(val.split(','))
             except ValueError:
                 PrintUsage('Extensions must be comma seperated list.')
+        elif opt == '--write-success':
+            global _write_success
+            _write_success = val
 
     if not filenames:
         PrintUsage('No files were specified.')
@@ -6408,6 +6414,10 @@ def main():
         ProcessFile(filename, _cpplint_state.verbose_level)
     _cpplint_state.PrintErrorCounts()
 
+    if _cpplint_state.error_count == 0 and _write_success is not None:
+        with open(_write_success, 'a'):
+            os.utime(_write_success, None)
+
     sys.exit(_cpplint_state.error_count > 0)
 
 
diff --git a/paddle/scripts/deb/postinst b/paddle/scripts/deb/postinst
new file mode 100644
index 0000000000..91620b1ee7
--- /dev/null
+++ b/paddle/scripts/deb/postinst
@@ -0,0 +1,6 @@
+#!/bin/bash
+set -e
+echo "Post install paddle debian package."
+echo "Install some python package used for paddle. You can run "
+echo "  pip install /usr/opt/paddle/share/wheels/*.whl to install them."
+find /usr/ -name '*paddle*.whl' | xargs pip install
diff --git a/paddle/scripts/docker/README.md b/paddle/scripts/docker/README.md
index 132f8cd8aa..76bc30e59b 100644
--- a/paddle/scripts/docker/README.md
+++ b/paddle/scripts/docker/README.md
@@ -94,7 +94,7 @@ docker build -t paddle:dev --build-arg UBUNTU_MIRROR=mirror://mirrors.ubuntu.com
 Given the development image `paddle:dev`, the following command builds PaddlePaddle from the source tree on the development computer (host):
 
 ```bash
-docker run --rm -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_AVX=ON" -e "WITH_TEST=OFF" -e "RUN_TEST=OFF" paddle:dev
+docker run --rm -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_AVX=ON" -e "WITH_TESTING=OFF" -e "RUN_TEST=OFF" paddle:dev
 ```
 
 This command mounts the source directory on the host into `/paddle` in the container, so the default entry point of `paddle:dev`, `build.sh`, could build the source code with possible local changes.  When it writes to `/paddle/build` in the container, it writes to `$PWD/build` on the host indeed.
@@ -119,7 +119,7 @@ Users can specify the following Docker build arguments with either "ON" or "OFF"
 The following command builds the production image:
 
 ```bash
-docker build -t paddle -f build/Dockerfile .
+docker build -t paddle -f build/Dockerfile ./build
 ```
 
 This production image is minimal -- it includes binary `paddle`, the shared library `libpaddle.so`, and Python runtime.
diff --git a/paddle/scripts/docker/build.sh b/paddle/scripts/docker/build.sh
index e4c322bb18..4172063d92 100644
--- a/paddle/scripts/docker/build.sh
+++ b/paddle/scripts/docker/build.sh
@@ -5,13 +5,8 @@ set -e
 # Set BASE_IMAGE according to env variables
 if [ ${WITH_GPU} == "ON" ]; then
   BASE_IMAGE="nvidia/cuda:8.0-cudnn5-runtime-ubuntu14.04"
-  # additional packages to install when building gpu images
-  GPU_DOCKER_PKG="python-pip python-dev"
 else
-  BASE_IMAGE="python:2.7.13-slim"
-  # FIXME: python base image uses different python version than WITH_GPU
-  # need to change PYTHONHOME to /usr/local when using python base image
-  CPU_DOCKER_PYTHON_HOME_ENV="ENV PYTHONHOME /usr/local"
+  BASE_IMAGE="ubuntu:14.04"
 fi
 
 DOCKERFILE_GPU_ENV=""
@@ -29,25 +24,44 @@ rm *.deb 2>/dev/null || true
 
 cmake .. \
       -DCMAKE_BUILD_TYPE=Release \
-      -DWITH_DOC=${WITH_DOC:-OFF} \
+      -DWITH_DOC=OFF \
       -DWITH_GPU=${WITH_GPU:-OFF} \
       -DWITH_AVX=${WITH_AVX:-OFF} \
       -DWITH_SWIG_PY=ON \
       -DCUDNN_ROOT=/usr/ \
       -DWITH_STYLE_CHECK=${WITH_STYLE_CHECK:-OFF} \
-      -DON_COVERALLS=${WITH_TEST:-OFF} \
+      -DWITH_TESTING=${WITH_TESTING:-OFF} \
       -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
 make -j `nproc`
-if [[ ${RUN_TEST:-OFF} == "ON" ]]; then
-    make coveralls
+if [ ${WITH_TESTING:-OFF} == "ON" ] && [ ${RUN_TEST:-OFF} == "ON" ] ; then
+    make test
 fi
 make install
+pip install /usr/local/opt/paddle/share/wheels/*.whl
 
+# To build documentation, we need to run cmake twice.
+# This awkwardness is due to https://github.com/PaddlePaddle/Paddle/issues/1854.
+# It also describes a solution. An unset WITH_DOC is treated as OFF.
+if [ "${WITH_DOC:-OFF}" == "ON" ]; then
+    mkdir -p /paddle/build_doc
+    pushd /paddle/build_doc
+    cmake .. \
+          -DWITH_DOC=ON \
+          -DWITH_GPU=OFF \
+          -DWITH_AVX=${WITH_AVX:-OFF} \
+          -DWITH_SWIG_PY=ON \
+          -DWITH_STYLE_CHECK=OFF
+    make paddle_docs paddle_docs_cn
+    # Publish the generated HTML under DOC_DIR.
+    DOC_DIR="/paddle/paddle/scripts/tools/build_docs/"
+    mkdir -p "$DOC_DIR/doc" "$DOC_DIR/doc_cn"
+    cp -r /paddle/build_doc/doc/en/html/* "$DOC_DIR/doc"
+    cp -r /paddle/build_doc/doc/cn/html/* "$DOC_DIR/doc_cn"
+    popd
+    rm -rf /paddle/build_doc
+fi
 # generate deb package for current build
-# FIXME(typhoonzero): should we remove paddle/scripts/deb ?
-# FIXME: CPACK_DEBIAN_PACKAGE_DEPENDS removes all dev dependencies, must
-# install them in docker
-cpack -D CPACK_GENERATOR='DEB' -D CPACK_DEBIAN_PACKAGE_DEPENDS="" ..
+cpack -D CPACK_GENERATOR='DEB' ..
 
 if [[ ${WOBOQ:-OFF} == 'ON' ]]; then
     apt-get install -y clang-3.8 llvm-3.8 libclang-3.8-dev
@@ -75,31 +89,30 @@ fi
 
 paddle version
 
-if [[ -n ${APT_MIRROR} ]]; then
-  MIRROR_UPDATE="sed -i '${APT_MIRROR}' /etc/apt/sources.list && \\"
-else
-  MIRROR_UPDATE="\\"
-fi
-
 cat > /paddle/build/Dockerfile <<EOF
 FROM ${BASE_IMAGE}
 MAINTAINER PaddlePaddle Authors <paddle-dev@baidu.com>
 ENV HOME /root
 ENV LANG en_US.UTF-8
 # Use Fix locales to en_US.UTF-8
-RUN ${MIRROR_UPDATE}
-    apt-get update && \
-    apt-get install -y libgfortran3 libpython2.7 ${GPU_DOCKER_PKG} && \
-    apt-get clean -y && \
-    pip install --upgrade pip && \
-    pip install -U 'protobuf==3.1.0' requests numpy
+EOF
+
+if [[ -n ${APT_MIRROR} ]]; then
+cat >> /paddle/build/Dockerfile <<EOF
+RUN sed -i '${APT_MIRROR}' /etc/apt/sources.list
+EOF
+fi
+
+cat >> /paddle/build/Dockerfile <<EOF
 # Use different deb file when building different type of images
-ADD build/*.deb /usr/local/opt/paddle/deb/
+ADD *.deb /
 # run paddle version to install python packages first
-RUN dpkg -i /usr/local/opt/paddle/deb/*.deb && \
-    rm -f /usr/local/opt/paddle/deb/*.deb && \
+RUN apt-get update &&\
+    apt-get install -y python-pip && pip install -U pip && \
+    dpkg -i /*.deb ; apt-get install -f -y && \
+    apt-get clean -y && \
+    rm -f /*.deb && \
     paddle version
-${CPU_DOCKER_PYTHON_HOME_ENV}
 ${DOCKERFILE_CUDNN_DSO}
 ${DOCKERFILE_GPU_ENV}
 # default command shows the paddle version and exit
diff --git a/paddle/scripts/tools/build_docs/Dockerfile b/paddle/scripts/tools/build_docs/Dockerfile
deleted file mode 100644
index 78dc756bd1..0000000000
--- a/paddle/scripts/tools/build_docs/Dockerfile
+++ /dev/null
@@ -1,7 +0,0 @@
-FROM paddledev/paddle:cpu-devel-latest
-COPY build.sh /
-RUN pip install sphinx &&\
-    pip install sphinx_rtd_theme &&\
-    apt install -y doxygen graphviz &&\
-    pip install recommonmark numpy protobuf==2.6.1
-CMD /build.sh
diff --git a/paddle/scripts/tools/build_docs/build.sh b/paddle/scripts/tools/build_docs/build.sh
deleted file mode 100755
index a23b6e61d4..0000000000
--- a/paddle/scripts/tools/build_docs/build.sh
+++ /dev/null
@@ -1,13 +0,0 @@
-#!/bin/bash
-set -ex
-
-mkdir -p /build
-cd /build
-cmake /paddle -DWITH_DOC=ON
-make paddle_docs paddle_docs_cn -j `nproc`
-mkdir -p /output/doc
-mkdir -p /output/doc_cn
-cp -r doc/html/* /output/doc/
-cp -r doc_cn/html/* /output/doc_cn/
-cd /
-rm -rf /paddle/build
diff --git a/paddle/scripts/tools/build_docs/build_docs.sh b/paddle/scripts/tools/build_docs/build_docs.sh
index 9f8b80435c..00123dcb87 100755
--- a/paddle/scripts/tools/build_docs/build_docs.sh
+++ b/paddle/scripts/tools/build_docs/build_docs.sh
@@ -1,4 +1,36 @@
 #!/bin/bash
 set -e
-docker build . -t paddle_build_doc
-docker run --rm -v $PWD/../../../../:/paddle -v $PWD:/output paddle_build_doc
+# Print the list of supported sub-commands.
+function usage(){
+        echo "usage: build_doc [--help] [<args>]"
+        echo "This script generates doc and doc_cn in the script's directory."
+        echo "These are common commands used in various situations:"
+        echo "    with_docker       build doc and doc_cn with docker"
+        echo "    local             build doc and doc_cn locally"
+}
+
+case "$1" in
+    "with_docker")
+        docker run --rm -v "$PWD/../../../../:/paddle" \
+            -e "WITH_GPU=OFF" -e "WITH_AVX=ON" -e "WITH_DOC=ON" paddledev/paddle:dev
+        ;;
+    "local")
+        # Quote every path so a checkout directory containing spaces works.
+        mkdir -p doc doc_cn
+        PADDLE_SOURCE_DIR="$PWD/../../../../"
+        mkdir -p "$PADDLE_SOURCE_DIR/build_doc"
+        pushd "$PADDLE_SOURCE_DIR/build_doc"
+        cmake .. -DWITH_DOC=ON
+        make paddle_docs paddle_docs_cn
+        popd
+        cp -r "$PADDLE_SOURCE_DIR"/build_doc/doc/en/html/* doc
+        cp -r "$PADDLE_SOURCE_DIR"/build_doc/doc/cn/html/* doc_cn
+        rm -rf "$PADDLE_SOURCE_DIR/build_doc"
+        ;;
+    "--help")
+        usage
+        ;;
+    *)
+        usage
+        ;;
+esac
diff --git a/paddle/scripts/travis/docs.sh b/paddle/scripts/travis/docs.sh
index 53e998ef6c..67b89adb4d 100755
--- a/paddle/scripts/travis/docs.sh
+++ b/paddle/scripts/travis/docs.sh
@@ -5,8 +5,8 @@ source ./common.sh
 # Compile Documentation only.
 cmake .. -DCMAKE_BUILD_TYPE=Debug -DCMAKE_Fortran_COMPILER=/usr/bin/gfortran-4.8 -DWITH_GPU=OFF -DWITH_DOC=OFF -DWITH_STYLE_CHECK=OFF ${EXTRA_CMAKE_OPTS}
 mkdir output
-make DESTDIR=./output install -j `nproc`
-pip install ./output/usr/local/opt/paddle/share/wheels/*
+make -j `nproc`
+find .. -name '*whl' | xargs pip install  # install all wheels.
 rm -rf *
 cmake .. -DCMAKE_BUILD_TYPE=Debug -DCMAKE_Fortran_COMPILER=/usr/bin/gfortran-4.8 -DWITH_GPU=OFF -DWITH_DOC=ON ${EXTRA_CMAKE_OPTS}
 make paddle_docs paddle_docs_cn
@@ -60,6 +60,7 @@ function deploy_docs() {
 
 deploy_docs "master" "." 
 deploy_docs "develop" "./develop/"
+deploy_docs "release/0.10.0" "./release/0.10.0/"
 
 # Check is there anything changed.
 set +e
diff --git a/paddle/setup.py.in b/paddle/setup.py.in
index 0b62436a7f..06d55d3abc 100644
--- a/paddle/setup.py.in
+++ b/paddle/setup.py.in
@@ -23,7 +23,7 @@ setup(name="py_paddle",
       install_requires = [
         'nltk>=3.2.2',
         'numpy>=1.8.0',      # The numpy is required.
-        'protobuf>=${PROTOBUF_VERSION}'    # The paddle protobuf version
+        'protobuf==${PROTOBUF_VERSION}'    # The paddle protobuf version
       ],
       url='http://www.paddlepaddle.org/',
       license='Apache 2.0',
diff --git a/paddle/trainer/tests/CMakeLists.txt b/paddle/trainer/tests/CMakeLists.txt
index c5c76a030d..08b2d8a38e 100644
--- a/paddle/trainer/tests/CMakeLists.txt
+++ b/paddle/trainer/tests/CMakeLists.txt
@@ -17,14 +17,17 @@ add_test(NAME test_Trainer
     WORKING_DIRECTORY ${PROJ_ROOT}/paddle/)
 
 ############### test_TrainerOnePass ##########################
-add_unittest_without_exec(test_TrainerOnePass
-    test_TrainerOnePass.cpp)
-add_test(NAME test_TrainerOnePass
-  COMMAND  ${PROJ_ROOT}/paddle/.set_python_path.sh -d
-        ${PROJ_ROOT}/python/:${PROJ_ROOT}/paddle/trainer/tests
-        ${PROJ_ROOT}/paddle/.set_port.sh -p port ${CMAKE_CURRENT_BINARY_DIR}/test_TrainerOnePass
-    WORKING_DIRECTORY ${PROJ_ROOT}/paddle/)
-
+if(WITH_PYTHON)
+  # Only build and run test_TrainerOnePass when Python is enabled, because
+  # training one pass uses PyDataProvider2.
+  add_unittest_without_exec(test_TrainerOnePass
+      test_TrainerOnePass.cpp)
+  add_test(NAME test_TrainerOnePass
+    COMMAND  ${PROJ_ROOT}/paddle/.set_python_path.sh -d
+          ${PROJ_ROOT}/python/:${PROJ_ROOT}/paddle/trainer/tests
+          ${PROJ_ROOT}/paddle/.set_port.sh -p port ${CMAKE_CURRENT_BINARY_DIR}/test_TrainerOnePass
+      WORKING_DIRECTORY ${PROJ_ROOT}/paddle/)
+endif()
 ################ test_CompareTwoNets ######################
 add_unittest_without_exec(test_CompareTwoNets
     test_CompareTwoNets.cpp)
diff --git a/paddle/trainer/tests/picojson.h b/paddle/trainer/tests/picojson.h
index 23bfa16408..4aa64961d0 100644
--- a/paddle/trainer/tests/picojson.h
+++ b/paddle/trainer/tests/picojson.h
@@ -1059,14 +1059,14 @@ inline bool operator==(const value& x, const value& y) {
 }
 
 inline bool operator!=(const value& x, const value& y) { return !(x == y); }
-}
+}  // namespace picojson
 
 namespace std {
 template <>
 inline void swap(picojson::value& x, picojson::value& y) {
   x.swap(y);
 }
-}
+}  // namespace std
 
 inline std::istream& operator>>(std::istream& is, picojson::value& x) {
   picojson::set_last_error(std::string());
diff --git a/paddle/cuda/src/hl_dso_loader.cc b/paddle/utils/DynamicLoader.cpp
similarity index 94%
rename from paddle/cuda/src/hl_dso_loader.cc
rename to paddle/utils/DynamicLoader.cpp
index 53164dd27c..368c35e151 100644
--- a/paddle/cuda/src/hl_dso_loader.cc
+++ b/paddle/utils/DynamicLoader.cpp
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "hl_dso_loader.h"
+#include "DynamicLoader.h"
 #include <gflags/gflags.h>
-#include "paddle/utils/Logging.h"
+#include "Logging.h"
 
 DEFINE_string(cudnn_dir,
               "",
@@ -30,6 +30,8 @@ DEFINE_string(cuda_dir,
 
 DEFINE_string(warpctc_dir, "", "Specify path for loading libwarpctc.so.");
 
+DEFINE_string(lapack_dir, "", "Specify path for loading liblapack.so.");
+
 static inline std::string join(const std::string& part1,
                                const std::string& part2) {
   // directory separator
@@ -160,3 +162,11 @@ void GetWarpCTCDsoHandle(void** dso_handle) {
   GetDsoHandleFromSearchPath(FLAGS_warpctc_dir, "libwarpctc.so", dso_handle);
 #endif
 }
+
+void GetLapackDsoHandle(void** dso_handle) {
+#if defined(__APPLE__) || defined(__OSX__)
+  GetDsoHandleFromSearchPath(FLAGS_lapack_dir, "liblapack.dylib", dso_handle);
+#else
+  GetDsoHandleFromSearchPath(FLAGS_lapack_dir, "liblapack.so", dso_handle);
+#endif
+}
diff --git a/paddle/cuda/include/hl_dso_loader.h b/paddle/utils/DynamicLoader.h
similarity index 83%
rename from paddle/cuda/include/hl_dso_loader.h
rename to paddle/utils/DynamicLoader.h
index 276a07d3c7..9b5ad21724 100644
--- a/paddle/cuda/include/hl_dso_loader.h
+++ b/paddle/utils/DynamicLoader.h
@@ -12,13 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#ifndef HL_DSO_LOADER_H_
-#define HL_DSO_LOADER_H_
+#ifndef DYNAMIC_LOAD_H_
+#define DYNAMIC_LOAD_H_
 
 #include <dlfcn.h>
 #include <memory>
+#include <mutex>
 #include <string>
-#include "hl_base.h"
 
 /**
  * @brief    load the DSO of CUBLAS
@@ -52,4 +52,12 @@ void GetCurandDsoHandle(void** dso_handle);
  */
 void GetWarpCTCDsoHandle(void** dso_handle);
 
-#endif  // HL_DSO_LOADER_H_
+/**
+ * @brief    load the DSO of lapack
+ *
+ * @param    **dso_handle   dso handler
+ *
+ */
+void GetLapackDsoHandle(void** dso_handle);
+
+#endif  // DYNAMIC_LOAD_H_
diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt
index e7a0895533..bfa19d5ecc 100644
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -24,9 +24,12 @@ add_custom_target(paddle_python ALL DEPENDS
     ${OUTPUT_DIR}/.timestamp)
 
 add_subdirectory(paddle/trainer_config_helpers/tests)
-add_subdirectory(paddle/v2/tests)
-add_subdirectory(paddle/v2/reader/tests)
-add_subdirectory(paddle/v2/plot/tests)
+if (WITH_SWIG_PY)
+  # enable v2 API unittest only when paddle swig api is compiled
+  add_subdirectory(paddle/v2/tests)
+  add_subdirectory(paddle/v2/reader/tests)
+  add_subdirectory(paddle/v2/plot/tests)
+endif()
 
 install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/dist/
     DESTINATION opt/paddle/share/wheels
diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py
index dc89419c40..32e31fe2c4 100644
--- a/python/paddle/trainer/config_parser.py
+++ b/python/paddle/trainer/config_parser.py
@@ -2119,6 +2119,7 @@ define_cost('MultiBinaryLabelCrossEntropy', 'multi_binary_label_cross_entropy')
 define_cost('SoftBinaryClassCrossEntropy', 'soft_binary_class_cross_entropy')
 define_cost('HuberTwoClass', 'huber')
 define_cost('SumCost', 'sum_cost')
+define_cost('SmoothL1Cost', 'smooth_l1')
 
 
 @config_layer('hsigmoid')
diff --git a/python/paddle/trainer_config_helpers/attrs.py b/python/paddle/trainer_config_helpers/attrs.py
index bf02088346..7ae9e5cb30 100644
--- a/python/paddle/trainer_config_helpers/attrs.py
+++ b/python/paddle/trainer_config_helpers/attrs.py
@@ -196,7 +196,7 @@ class ExtraLayerAttribute(object):
                       <https://www.cs.toronto.edu/~hinton/absps/
                       JMLRdropout.pdf>`_.
     :type drop_rate: float
-    :param device: device ID of layer. device=-1, use CPU. device>0, use GPU.
+    :param device: device ID of layer. device=-1, use CPU. device>=0, use GPU.
                    The details allocation in parallel_nn please refer to `here
                    <http://www.paddlepaddle.org/doc/ui/cmd_argument/
                    use_case.html#case-2-specify-layers-in-different-devices>`_.
@@ -208,12 +208,15 @@ class ExtraLayerAttribute(object):
                  drop_rate=None,
                  device=None):
         self.attr = dict()
-        if isinstance(error_clipping_threshold, float):
-            assert error_clipping_threshold > 0
-            self.attr["error_clipping_threshold"] = error_clipping_threshold
-
-        if isinstance(drop_rate, float):
-            assert drop_rate > 0
+        if error_clipping_threshold is not None:
+            error_clipping_threshold = float(error_clipping_threshold)
+            if error_clipping_threshold < 0:
+                raise ValueError("Error clipping threshold must be >= 0")
+            self.attr['error_clipping_threshold'] = error_clipping_threshold
+        if drop_rate is not None:
+            drop_rate = float(drop_rate)
+            if drop_rate < 0:
+                raise ValueError("Dropout rate must be >= 0")
             self.attr["drop_rate"] = drop_rate
 
         if isinstance(device, int):
diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py
index 97db3c2d4c..31652613fb 100755
--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -84,6 +84,7 @@ __all__ = [
     'GeneratedInput',
     'SubsequenceInput',
     'gru_step_layer',
+    'gru_step_naive_layer',
     'recurrent_layer',
     'BaseGeneratedInput',
     'conv_operator',
@@ -116,6 +117,7 @@ __all__ = [
     'spp_layer',
     'pad_layer',
     'eos_layer',
+    'smooth_l1_cost',
     'layer_support',
 ]
 
@@ -201,6 +203,7 @@ class LayerType(object):
     SOFT_BIN_CLASS_CROSS_ENTROPY = "soft_binary_class_cross_entropy"
     MULTI_BIN_LABEL_CROSS_ENTROPY = "multi_binary_label_cross_entropy"
     SUM_COST = "sum_cost"
+    SMOOTH_L1 = "smooth_l1"
 
     @staticmethod
     def is_layer_type(type_name):
@@ -567,7 +570,7 @@ def dotmul_operator(a=None, b=None, scale=1, **kwargs):
     DotMulOperator takes two inputs and performs element-wise multiplication:
 
     .. math::
-       out.row[i] += scale * (x.row[i] .* y.row[i])
+       out.row[i] += scale * (a.row[i] .* b.row[i])
 
     where :math:`.*` means element-wise multiplication, and
     scale is a config scalar, its default value is one.
@@ -576,7 +579,7 @@ def dotmul_operator(a=None, b=None, scale=1, **kwargs):
 
     .. code-block:: python
 
-       op = dotmul_operator(x=layer1, y=layer2, scale=0.5)
+       op = dotmul_operator(a=layer1, b=layer2, scale=0.5)
 
     :param a: Input layer1
     :type a: LayerOutput
@@ -1347,9 +1350,9 @@ def last_seq(input,
     """
     Get Last Timestamp Activation of a sequence.
 
-    If stride > 0, this layer slides a window whose size is determined by stride, 
-    and return the last value of the window as the output. Thus, a long sequence 
-    will be shorten. Note that for sequence with sub-sequence, the default value 
+    If stride > 0, this layer slides a window whose size is determined by stride,
+    and return the last value of the window as the output. Thus, a long sequence
+    will be shorten. Note that for sequence with sub-sequence, the default value
     of stride is -1.
 
     The simple usage is:
@@ -1363,7 +1366,7 @@ def last_seq(input,
     :type name: basestring
     :param input: Input layer name.
     :type input: LayerOutput
-    :param stride: window size.  
+    :param stride: window size.
     :type stride: Int
     :param layer_attr: extra layer attributes.
     :type layer_attr: ExtraLayerAttribute.
@@ -1403,9 +1406,9 @@ def first_seq(input,
     """
     Get First Timestamp Activation of a sequence.
 
-    If stride > 0, this layer slides a window whose size is determined by stride, 
-    and return the first value of the window as the output. Thus, a long sequence 
-    will be shorten. Note that for sequence with sub-sequence, the default value 
+    If stride > 0, this layer slides a window whose size is determined by stride,
+    and return the first value of the window as the output. Thus, a long sequence
+    will be shorten. Note that for sequence with sub-sequence, the default value
     of stride is -1.
 
     The simple usage is:
@@ -1419,7 +1422,7 @@ def first_seq(input,
     :type name: basestring
     :param input: Input layer name.
     :type input: LayerOutput
-    :param stride: window size.  
+    :param stride: window size.
     :type stride: Int
     :param layer_attr: extra layer attributes.
     :type layer_attr: ExtraLayerAttribute.
@@ -1559,7 +1562,7 @@ def seq_reshape_layer(input,
                       bias_attr=None):
     """
     A layer for reshaping the sequence. Assume the input sequence has T instances,
-    the dimension of each instance is M, and the input reshape_size is N, then the 
+    the dimension of each instance is M, and the input reshape_size is N, then the
     output sequence has T*M/N instances, the dimension of each instance is N.
 
     Note that T*M/N must be an integer.
@@ -2116,8 +2119,8 @@ def img_conv_layer(input,
     :param trans: true if it is a convTransLayer, false if it is a convLayer
     :type trans: bool
     :param layer_type: specify the layer_type, default is None. If trans=True,
-                       layer_type has to be "exconvt" or "cudnn_convt", 
-                       otherwise layer_type has to be either "exconv" or 
+                       layer_type has to be "exconvt" or "cudnn_convt",
+                       otherwise layer_type has to be either "exconv" or
                        "cudnn_conv"
     :type layer_type: String
     :return: LayerOutput object.
@@ -2284,7 +2287,7 @@ def img_pool_layer(input,
 
     type_name = pool_type.name + '-projection' \
         if (
-    isinstance(pool_type, AvgPooling) or isinstance(pool_type, MaxPooling)) \
+        isinstance(pool_type, AvgPooling) or isinstance(pool_type, MaxPooling)) \
         else pool_type.name
 
     pool_size_y = pool_size if pool_size_y is None else pool_size_y
@@ -2335,9 +2338,9 @@ def spp_layer(input,
 
     ..  code-block:: python
 
-        spp = spp_layer(input=data, 
-                        pyramid_height=2, 
-                        num_channels=16, 
+        spp = spp_layer(input=data,
+                        pyramid_height=2,
+                        num_channels=16,
                         pool_type=MaxPooling())
 
     :param name: layer name.
@@ -2431,7 +2434,7 @@ def img_cmrnorm_layer(input,
     The example usage is:
 
     ..  code-block:: python
-    
+
         norm = img_cmrnorm_layer(input=net, size=5)
 
     :param name: layer name.
@@ -2492,7 +2495,7 @@ def batch_norm_layer(input,
     The example usage is:
 
     ..  code-block:: python
-    
+
         norm = batch_norm_layer(input=net, act=ReluActivation())
 
     :param name: layer name.
@@ -2793,11 +2796,11 @@ def seq_concat_layer(a, b, act=None, name=None, layer_attr=None,
     """
     Concat sequence a with sequence b.
 
-    Inputs: 
+    Inputs:
       - a = [a1, a2, ..., an]
       - b = [b1, b2, ..., bn]
       - Note that the length of a and b should be the same.
-        
+
     Output: [a1, b1, a2, b2, ..., an, bn]
 
     The example usage is:
@@ -3084,6 +3087,78 @@ def gru_step_layer(input,
         activation=act)
 
 
+@wrap_bias_attr_default()
+@wrap_param_attr_default()
+@wrap_act_default(param_names=['gate_act'], act=SigmoidActivation())
+@wrap_act_default(act=TanhActivation())
+@wrap_name_default('gru_step_naive')
+@layer_support(ERROR_CLIPPING, DROPOUT)
+def gru_step_naive_layer(input,
+                         output_mem,
+                         size=None,
+                         name=None,
+                         act=None,
+                         gate_act=None,
+                         bias_attr=None,
+                         param_attr=None,
+                         layer_attr=None):
+    """
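+    GRU step layer built from mixed layers. It supports ERROR_CLIPPING
+    and DROPOUT.
+
+    :param input: Input layer; its size must be divisible by 3.
+    :param output_mem: Memory of this layer's output, e.g. made by gru_unit.
+    :param size: Gate/candidate size; defaults to input.size / 3.
+    :param name: Layer name, also used as prefix of the inner layer names.
+    :param act: Activation of the output candidate.
+    :param gate_act: Activation of the update and reset gates.
+    :param bias_attr: Bias attribute of the gates and the output candidate.
+    :param param_attr: Parameter attribute of the full matrix projections.
+    :param layer_attr: Extra attribute of the gates and the output candidate.
+    :return: The mixed layer named `name` that holds the step output.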
+    """
+    if input.size % 3 != 0:
+        raise ValueError("GruStep input size must be divided by 3")
+    if size is None:
+        size = input.size / 3
+
+    def __gate__(gate_name, offset):
+        with mixed_layer(
+                name=name + "_" + gate_name,
+                size=size,
+                layer_attr=layer_attr,
+                bias_attr=bias_attr,
+                act=gate_act) as gate:
+            gate += identity_projection(input=input, offset=offset)
+            gate += full_matrix_projection(
+                input=output_mem, param_attr=param_attr)
+        return gate
+
+    update_gate = __gate__("update", 0)
+    reset_gate = __gate__("reset", size)
+
+    with mixed_layer(
+            name=name + "_reset_output", bias_attr=False) as reset_output:
+        reset_output += dotmul_operator(a=output_mem, b=reset_gate)
+
+    with mixed_layer(
+            name=name + "_output_candidate",
+            size=size,
+            layer_attr=layer_attr,
+            bias_attr=bias_attr,
+            act=act) as output_candidate:
+        output_candidate += identity_projection(input=input, offset=2 * size)
+        output_candidate += full_matrix_projection(
+            input=reset_output, param_attr=param_attr)
+
+    with mixed_layer(name=name) as output:
+        output += identity_projection(output_mem)
+        output += dotmul_operator(a=output_mem, b=update_gate, scale=-1.0)
+        output += dotmul_operator(a=output_candidate, b=update_gate)
+
+    return output
+
+
 @wrap_name_default()
 @layer_support()
 def get_output_layer(input, arg_name, name=None, layer_attr=None):
@@ -3561,9 +3636,15 @@ def beam_search(step,
                 simple_rnn += last_time_step_output
             return simple_rnn
 
+        generated_word_embedding = GeneratedInput(
+                               size=target_dictionary_dim,
+                               embedding_name="target_language_embedding",
+                               embedding_size=word_vector_dim)
+
         beam_gen = beam_search(name="decoder",
                                step=rnn_step,
-                               input=[StaticInput(encoder_last)],
+                               input=[StaticInput(encoder_last),
+                                      generated_word_embedding],
                                bos_id=0,
                                eos_id=1,
                                beam_size=5)
@@ -3582,7 +3663,8 @@ def beam_search(step,
                  You can refer to the first parameter of recurrent_group, or
                  demo/seqToseq/seqToseq_net.py for more details.
     :type step: callable
-    :param input: Input data for the recurrent unit
+    :param input: Input data for the recurrent unit, which should include the
+                  previously generated words as a GeneratedInput object.
     :type input: list
     :param bos_id: Index of the start symbol in the dictionary. The start symbol
                    is a special token for NLP task, which indicates the
@@ -3689,8 +3771,7 @@ def mse_cost(input, label, weight=None, name=None, layer_attr=None):
 
     ..  math::
 
-       $\frac{1}{N}\sum_{i=1}^N(t _i- y_i)^2$
-
+        \\frac{1}{N}\sum_{i=1}^N(t_i-y_i)^2
 
     :param name: layer name.
     :type name: basestring
@@ -5250,8 +5331,6 @@ def multi_binary_label_cross_entropy(input,
     :type input: LayerOutput
     :param label: The input label.
     :type input: LayerOutput
-    :param type: The type of cost.
-    :type type: basestring
     :param name: The name of this layers. It is not necessary.
     :type name: None|basestring
     :param coeff: The coefficient affects the gradient in the backward.
@@ -5280,3 +5359,52 @@ def multi_binary_label_cross_entropy(input,
         LayerType.MULTI_BIN_LABEL_CROSS_ENTROPY,
         parents=[input, label],
         size=1)
+
+
+@wrap_name_default()
+@layer_support()
+def smooth_l1_cost(input, label, name=None, layer_attr=None):
+    """
+    This is a smoothed variant of the L1 loss. It requires that the
+    sizes of input and label are equal. The formula is as follows:
+
+    .. math::
+
+        L = \sum_{i} smooth_{L1}(input_i - label_i)
+
+    in which
+
+    .. math::
+
+        smooth_{L1}(x) = \\begin{cases} 0.5x^2& \\text{if}  \\ |x| < 1 \\\\ |x|-0.5& \\text{otherwise} \\end{cases}
+
+    More details can be found by referring to `Fast R-CNN
+    <https://arxiv.org/pdf/1504.08083v2.pdf>`_
+
+    .. code-block:: python
+
+       cost = smooth_l1_cost(input=input_layer,
+                             label=label_layer)
+
+    :param input: The input layer.
+    :type input: LayerOutput
+    :param label: The input label.
+    :type label: LayerOutput
+    :param name: The name of this layer. It is optional.
+    :type name: None|basestring
+    :param layer_attr: Extra Layer Attribute.
+    :type layer_attr: ExtraLayerAttribute
+    :return: LayerOutput object.
+    :rtype: LayerOutput
+    """
+    assert isinstance(input, LayerOutput)
+    assert isinstance(label, LayerOutput)
+    assert input.size == label.size
+
+    Layer(
+        name=name,
+        type=LayerType.SMOOTH_L1,
+        inputs=[input.name, label.name],
+        **ExtraLayerAttribute.to_kwargs(layer_attr))
+    return LayerOutput(
+        name, LayerType.SMOOTH_L1, parents=[input, label], size=1)
diff --git a/python/paddle/trainer_config_helpers/networks.py b/python/paddle/trainer_config_helpers/networks.py
index cadde11ff8..fb533a47e0 100755
--- a/python/paddle/trainer_config_helpers/networks.py
+++ b/python/paddle/trainer_config_helpers/networks.py
@@ -825,7 +825,8 @@ def gru_unit(input,
              gru_param_attr=None,
              act=None,
              gate_act=None,
-             gru_layer_attr=None):
+             gru_layer_attr=None,
+             naive=False):
     """
     Define calculations that a gated recurrent unit performs in a single time
     step. This function itself is not a recurrent layer, so that it can not be
@@ -857,7 +858,12 @@ def gru_unit(input,
 
     out_mem = memory(name=name, size=size)
 
-    gru_out = gru_step_layer(
+    if naive:
+        __step__ = gru_step_naive_layer
+    else:
+        __step__ = gru_step_layer
+
+    gru_out = __step__(
         name=name,
         input=input,
         output_mem=out_mem,
@@ -879,7 +885,8 @@ def gru_group(input,
               gru_param_attr=None,
               act=None,
               gate_act=None,
-              gru_layer_attr=None):
+              gru_layer_attr=None,
+              naive=False):
     """
     gru_group is a recurrent layer group version of Gated Recurrent Unit. It
     does exactly the same calculation as the grumemory layer does. A promising
@@ -928,7 +935,8 @@ def gru_group(input,
             gru_param_attr=gru_param_attr,
             act=act,
             gate_act=gate_act,
-            gru_layer_attr=gru_layer_attr)
+            gru_layer_attr=gru_layer_attr,
+            naive=naive)
 
     return recurrent_group(
         name='%s_recurrent_group' % name,
@@ -949,7 +957,8 @@ def simple_gru(input,
                gru_param_attr=None,
                act=None,
                gate_act=None,
-               gru_layer_attr=None):
+               gru_layer_attr=None,
+               naive=False):
     """
     You maybe see gru_step_layer, grumemory in layers.py, gru_unit, gru_group,
     simple_gru in network.py. The reason why there are so many interfaces is
@@ -1018,7 +1027,8 @@ def simple_gru(input,
         gru_param_attr=gru_param_attr,
         act=act,
         gate_act=gate_act,
-        gru_layer_attr=gru_layer_attr)
+        gru_layer_attr=gru_layer_attr,
+        naive=naive)
 
 
 @wrap_name_default('simple_gru2')
diff --git a/python/paddle/trainer_config_helpers/tests/CMakeLists.txt b/python/paddle/trainer_config_helpers/tests/CMakeLists.txt
index 93dd7796c2..6c860fd497 100644
--- a/python/paddle/trainer_config_helpers/tests/CMakeLists.txt
+++ b/python/paddle/trainer_config_helpers/tests/CMakeLists.txt
@@ -9,8 +9,7 @@ add_test(NAME test_reset_hook
         ${PYTHON_EXECUTABLE} ${PROJ_ROOT}/python/paddle/trainer_config_helpers/tests/test_reset_hook.py
     WORKING_DIRECTORY ${PROJ_ROOT}/python/paddle)
 
-add_paddle_exe(protobuf_equal
-  ProtobufEqualMain.cpp)
+add_paddle_exe(protobuf_equal ProtobufEqualMain.cpp)
 add_test(NAME test_layerHelpers
   COMMAND
   ${PROJ_ROOT}/python/paddle/trainer_config_helpers/tests/configs/run_tests.sh ${PYTHON_EXECUTABLE}
diff --git a/python/paddle/trainer_config_helpers/tests/configs/file_list.sh b/python/paddle/trainer_config_helpers/tests/configs/file_list.sh
index c9178e3c6a..c5dc8e1aab 100755
--- a/python/paddle/trainer_config_helpers/tests/configs/file_list.sh
+++ b/python/paddle/trainer_config_helpers/tests/configs/file_list.sh
@@ -5,6 +5,6 @@ last_first_seq test_expand_layer test_ntm_layers test_hsigmoid
 img_layers img_trans_layers util_layers simple_rnn_layers unused_layers test_cost_layers
 test_rnn_group shared_fc shared_lstm shared_gru test_cost_layers_with_weight
 test_spp_layer test_bilinear_interp test_maxout test_bi_grumemory math_ops
-test_seq_concat_reshape)
+test_seq_concat_reshape test_pad test_smooth_l1)
 
 export whole_configs=(test_split_datasource)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/generate_protostr.sh b/python/paddle/trainer_config_helpers/tests/configs/generate_protostr.sh
index ee5961af75..8a31887963 100755
--- a/python/paddle/trainer_config_helpers/tests/configs/generate_protostr.sh
+++ b/python/paddle/trainer_config_helpers/tests/configs/generate_protostr.sh
@@ -11,6 +11,9 @@ for conf in ${configs[*]}
 do
     echo "Generating " $conf
     $1 -m paddle.utils.dump_config $conf.py > $protostr/$conf.protostr.unittest
+    if [ ! -f "$protostr/$conf.protostr" ]; then 
+        cp $protostr/$conf.protostr.unittest $protostr/$conf.protostr
+    fi
     cat ${conf}.py |$1 test_config_parser_for_non_file_config.py > $protostr/$conf.protostr.non_file_config.unittest
 done
 
@@ -18,5 +21,8 @@ for conf in ${whole_configs[*]}
 do
     echo "Generating " $conf
     $1 -m paddle.utils.dump_config $conf.py "" --whole > $protostr/$conf.protostr.unittest
+    if [ ! -f "$protostr/$conf.protostr" ]; then 
+        cp $protostr/$conf.protostr.unittest $protostr/$conf.protostr
+    fi
     cat ${conf}.py |$1 test_config_parser_for_non_file_config.py --whole > $protostr/$conf.protostr.non_file_config.unittest
 done
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/projections.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/projections.protostr
index 2afc3afef6..d8bd7b9dfb 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/projections.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/projections.protostr
@@ -320,6 +320,7 @@ layers {
     }
   }
   drop_rate: 0.5
+  error_clipping_threshold: 40.0
 }
 parameters {
   name: "___embedding_0__.w0"
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_pad.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_pad.protostr
new file mode 100644
index 0000000000..15c6ab4dc8
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_pad.protostr
@@ -0,0 +1,120 @@
+type: "nn"
+layers {
+  name: "data"
+  type: "data"
+  size: 2016
+  active_type: ""
+  height: 48
+  width: 42
+}
+layers {
+  name: "__conv_0__"
+  type: "exconv"
+  size: 32256
+  active_type: ""
+  inputs {
+    input_layer_name: "data"
+    input_parameter_name: "___conv_0__.w0"
+    conv_conf {
+      filter_size: 3
+      channels: 1
+      stride: 1
+      padding: 1
+      groups: 1
+      filter_channels: 1
+      output_x: 42
+      img_size: 42
+      caffe_mode: true
+      filter_size_y: 3
+      padding_y: 1
+      stride_y: 1
+      output_y: 48
+      img_size_y: 48
+    }
+  }
+  bias_parameter_name: "___conv_0__.wbias"
+  num_filters: 16
+  shared_biases: true
+  height: 48
+  width: 42
+}
+layers {
+  name: "__pool_0__"
+  type: "pool"
+  size: 8064
+  active_type: ""
+  inputs {
+    input_layer_name: "__conv_0__"
+    pool_conf {
+      pool_type: "max-projection"
+      channels: 16
+      size_x: 2
+      stride: 2
+      output_x: 21
+      img_size: 42
+      padding: 0
+      size_y: 2
+      stride_y: 2
+      output_y: 24
+      img_size_y: 48
+      padding_y: 0
+    }
+  }
+  height: 24
+  width: 21
+}
+layers {
+  name: "__pad_0__"
+  type: "pad"
+  size: 14175
+  active_type: ""
+  inputs {
+    input_layer_name: "__pool_0__"
+    pad_conf {
+      image_conf {
+        channels: 16
+        img_size: 21
+        img_size_y: 24
+      }
+      pad_c: 2
+      pad_c: 3
+      pad_h: 1
+      pad_h: 2
+      pad_w: 3
+      pad_w: 1
+    }
+  }
+  height: 27
+  width: 25
+}
+parameters {
+  name: "___conv_0__.w0"
+  size: 144
+  initial_mean: 0.0
+  initial_std: 0.471404520791
+  initial_strategy: 0
+  initial_smart: false
+}
+parameters {
+  name: "___conv_0__.wbias"
+  size: 16
+  initial_mean: 0.0
+  initial_std: 0.0
+  dims: 16
+  dims: 1
+  initial_strategy: 0
+  initial_smart: false
+}
+input_layer_names: "data"
+output_layer_names: "__pad_0__"
+sub_models {
+  name: "root"
+  layer_names: "data"
+  layer_names: "__conv_0__"
+  layer_names: "__pool_0__"
+  layer_names: "__pad_0__"
+  input_layer_names: "data"
+  output_layer_names: "__pad_0__"
+  is_recurrent_layer_group: false
+}
+
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_smooth_l1.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_smooth_l1.protostr
new file mode 100644
index 0000000000..4aa041ea2e
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_smooth_l1.protostr
@@ -0,0 +1,40 @@
+type: "nn"
+layers {
+  name: "input"
+  type: "data"
+  size: 300
+  active_type: ""
+}
+layers {
+  name: "label"
+  type: "data"
+  size: 300
+  active_type: ""
+}
+layers {
+  name: "__smooth_l1_cost_0__"
+  type: "smooth_l1"
+  size: 1
+  active_type: ""
+  inputs {
+    input_layer_name: "input"
+  }
+  inputs {
+    input_layer_name: "label"
+  }
+  coeff: 1.0
+}
+input_layer_names: "input"
+input_layer_names: "label"
+output_layer_names: "__smooth_l1_cost_0__"
+sub_models {
+  name: "root"
+  layer_names: "input"
+  layer_names: "label"
+  layer_names: "__smooth_l1_cost_0__"
+  input_layer_names: "input"
+  input_layer_names: "label"
+  output_layer_names: "__smooth_l1_cost_0__"
+  is_recurrent_layer_group: false
+}
+
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_pad.py b/python/paddle/trainer_config_helpers/tests/configs/test_pad.py
index bb5f13410d..491e8c8caa 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/test_pad.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_pad.py
@@ -2,7 +2,7 @@ from paddle.trainer_config_helpers import *
 
 settings(batch_size=1000, learning_rate=1e-5)
 
-data = data_layer(name='data', size=2304, height=48, width=42)
+data = data_layer(name='data', size=2016, height=48, width=42)
 
 conv = img_conv_layer(
     input=data,
@@ -13,8 +13,7 @@ conv = img_conv_layer(
     act=LinearActivation(),
     bias_attr=True)
 
-pool = img_pool_layer(
-    input=conv, num_channels=8, pool_size=2, stride=2, pool_type=MaxPooling())
+pool = img_pool_layer(input=conv, pool_size=2, stride=2, pool_type=MaxPooling())
 
 pad = pad_layer(input=pool, pad_c=[2, 3], pad_h=[1, 2], pad_w=[3, 1])
 
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_smooth_l1.py b/python/paddle/trainer_config_helpers/tests/configs/test_smooth_l1.py
new file mode 100644
index 0000000000..66629662dd
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_smooth_l1.py
@@ -0,0 +1,7 @@
+from paddle.trainer_config_helpers import *
+
+data = data_layer(name='input', size=300)
+lbl = data_layer(name='label', size=300)
+smooth_l1 = smooth_l1_cost(input=data, label=lbl)
+
+outputs(smooth_l1)
diff --git a/python/paddle/utils/dump_config.py b/python/paddle/utils/dump_config.py
index 73bf349c46..d27af7f762 100644
--- a/python/paddle/utils/dump_config.py
+++ b/python/paddle/utils/dump_config.py
@@ -20,6 +20,7 @@ __all__ = []
 
 if __name__ == '__main__':
     whole_conf = False
+    binary = False
     if len(sys.argv) == 2:
         conf = parse_config(sys.argv[1], '')
     elif len(sys.argv) == 3:
@@ -28,6 +29,8 @@ if __name__ == '__main__':
         conf = parse_config(sys.argv[1], sys.argv[2])
         if sys.argv[3] == '--whole':
             whole_conf = True
+        elif sys.argv[3] == '--binary':
+            binary = True
     else:
         raise RuntimeError()
 
@@ -36,4 +39,7 @@ if __name__ == '__main__':
     if whole_conf:
         print conf
     else:
-        print conf.model_config
+        if binary:
+            sys.stdout.write(conf.model_config.SerializeToString())
+        else:
+            print conf.model_config
diff --git a/python/paddle/v2/dataset/cifar.py b/python/paddle/v2/dataset/cifar.py
index 41fda1e8f2..81af0a8e66 100644
--- a/python/paddle/v2/dataset/cifar.py
+++ b/python/paddle/v2/dataset/cifar.py
@@ -80,7 +80,7 @@ def train100():
 
 def test100():
     """
-    CIFAR-100 test set cretor.
+    CIFAR-100 test set creator.
 
     It returns a reader creator, each sample in the reader is image pixels in
     [0, 1] and label in [0, 9].
@@ -107,7 +107,7 @@ def train10():
 
 def test10():
     """
-    CIFAR-10 test set cretor.
+    CIFAR-10 test set creator.
 
     It returns a reader creator, each sample in the reader is image pixels in
     [0, 1] and label in [0, 9].
diff --git a/python/paddle/v2/dataset/imikolov.py b/python/paddle/v2/dataset/imikolov.py
index 41ca27e236..bf88fe1557 100644
--- a/python/paddle/v2/dataset/imikolov.py
+++ b/python/paddle/v2/dataset/imikolov.py
@@ -41,7 +41,7 @@ def word_count(f, word_freq=None):
     return word_freq
 
 
-def build_dict():
+def build_dict(typo_freq=50):
     """
     Build a word dictionary from the corpus,  Keys of the dictionary are words,
     and values are zero-based IDs of these words.
@@ -59,8 +59,7 @@ def build_dict():
             # remove <unk> for now, since we will set it as last index
             del word_freq['<unk>']
 
-        TYPO_FREQ = 50
-        word_freq = filter(lambda x: x[1] > TYPO_FREQ, word_freq.items())
+        word_freq = filter(lambda x: x[1] > typo_freq, word_freq.items())
 
         word_freq_sorted = sorted(word_freq, key=lambda x: (-x[1], x[0]))
         words, _ = list(zip(*word_freq_sorted))
diff --git a/python/paddle/v2/dataset/mnist.py b/python/paddle/v2/dataset/mnist.py
index c1347d3c66..435556b292 100644
--- a/python/paddle/v2/dataset/mnist.py
+++ b/python/paddle/v2/dataset/mnist.py
@@ -93,7 +93,7 @@ def train():
 
 def test():
     """
-    MNIST test set cretor.
+    MNIST test set creator.
 
     It returns a reader creator, each sample in the reader is image pixels in
     [0, 1] and label in [0, 9].
diff --git a/python/paddle/v2/layer.py b/python/paddle/v2/layer.py
index 384de9b9d5..89cca7acd3 100644
--- a/python/paddle/v2/layer.py
+++ b/python/paddle/v2/layer.py
@@ -356,6 +356,9 @@ def mixed(size=0,
     return MixedLayerV2(size, input, name, act, bias_attr, layer_attr)
 
 
+mixed.__doc__ = conf_helps.mixed_layer.__doc__
+
+
 class RecurrentLayerInput(Layer):
     def __init__(self, recurrent_name, index, parent_layers):
         parents_len = len(parent_layers)
@@ -404,6 +407,8 @@ data.__name__ = 'data'
 AggregateLevel = conf_helps.layers.AggregateLevel
 ExpandLevel = conf_helps.layers.ExpandLevel
 memory = MemoryV2
+memory.__name__ = 'memory'
+memory.__doc__ = conf_helps.memory.__doc__
 
 
 def __layer_name_mapping__(inname):
@@ -512,6 +517,9 @@ def recurrent_group(step, input, name=None):
         return retv
 
 
+recurrent_group.__doc__ = conf_helps.recurrent_group.__doc__
+
+
 @wrap_name_default()
 def beam_search(step,
                 input,
@@ -579,6 +587,8 @@ def beam_search(step,
     return tmp
 
 
+beam_search.__doc__ = conf_helps.beam_search.__doc__
+
 __projection_names__ = filter(lambda x: x.endswith('_projection'),
                               dir(conf_helps))
 
diff --git a/python/paddle/v2/optimizer.py b/python/paddle/v2/optimizer.py
index feefd7d758..5e99d4a241 100644
--- a/python/paddle/v2/optimizer.py
+++ b/python/paddle/v2/optimizer.py
@@ -38,12 +38,35 @@ class Optimizer(object):
         assert isinstance(tmp, swig_api.ParameterOptimizer)
         return tmp.getParameterTypes()
 
-    def create_local_updater(self):
+    def __create_local_updater__(self):
         return swig_api.ParameterUpdater.createLocalUpdater(self.__opt_conf__)
 
-    def create_remote_updater(self, pass_num):
-        return swig_api.ParameterUpdater.createRemoteUpdater(self.__opt_conf__,
-                                                             pass_num)
+    def __create_remote_updater__(self, pass_num, use_sparse_updater):
+        return swig_api.ParameterUpdater.createRemoteUpdater(
+            self.__opt_conf__, pass_num, use_sparse_updater)
+
+    def create_updater(self, is_local, num_passes, use_sparse_updater):
+        """
+        Create the proper parameter updater according to the configuration.
+        :param is_local: True for a local updater, False for a remote one.
+        :param num_passes: The remote parameter updater uses this to configure
+        the parameter server.
+        :param use_sparse_updater: When a remote updater is used and some
+        parameter is sparse, extra work is needed before each batch:
+
+        ..  code-block:: python
+
+            if use_sparse_remote_updater:
+                gradient_machine.prefetch(in_args)
+                parameter_updater.getParametersRemote()
+        :return: parameter_updater
+        """
+        if is_local:
+            parameter_updater = self.__create_local_updater__()
+        else:
+            parameter_updater = self.__create_remote_updater__(
+                num_passes, use_sparse_updater)
+        return parameter_updater
 
 
 class Momentum(Optimizer):
diff --git a/python/paddle/v2/topology.py b/python/paddle/v2/topology.py
index 737b6bf1e2..ff28c85c53 100644
--- a/python/paddle/v2/topology.py
+++ b/python/paddle/v2/topology.py
@@ -73,6 +73,18 @@ class Topology(object):
 
         assert isinstance(self.__model_config__, ModelConfig)
 
+    def use_sparse_updater(self):
+        """
+        Check whether any parameter requires a sparse update.
+        :return: True if any parameter uses sparse (remote) update.
+        """
+        use_sparse = False
+        for parameter in self.__model_config__.parameters:
+            if parameter.sparse_update or parameter.sparse_remote_update:
+                use_sparse = True
+                break
+        return use_sparse
+
     def proto(self):
         return self.__model_config__
 
diff --git a/python/paddle/v2/trainer.py b/python/paddle/v2/trainer.py
index 68b4967cc0..ec9fcfb749 100644
--- a/python/paddle/v2/trainer.py
+++ b/python/paddle/v2/trainer.py
@@ -2,6 +2,8 @@
 Module Trainer
 """
 import collections
+import gzip
+import os
 
 import py_paddle.swig_paddle as api
 
@@ -42,7 +44,12 @@ class SGD(object):
     :type extra_layers: paddle.v2.config_base.Layer
     """
 
-    def __init__(self, cost, parameters, update_equation, extra_layers=None):
+    def __init__(self,
+                 cost,
+                 parameters,
+                 update_equation,
+                 extra_layers=None,
+                 is_local=True):
 
         if not isinstance(parameters, v2_parameters.Parameters):
             raise TypeError('parameters should be parameters')
@@ -55,20 +62,48 @@ class SGD(object):
         self.__topology__ = topology
         self.__parameters__ = parameters
         self.__topology_in_proto__ = topology.proto()
+        self.__is_local__ = is_local
 
-        # In local mode, disable sparse_remote_update.
-        for param in self.__topology_in_proto__.parameters:
-            if param.sparse_remote_update:
-                param.sparse_remote_update = False
+        self.__use_sparse_updater__ = self.__topology__.use_sparse_updater()
+        # In local mode, disable sparse_remote_update.
+        if is_local:
+            for param in self.__topology_in_proto__.parameters:
+                if param.sparse_remote_update:
+                    param.sparse_remote_update = False
 
+        self.__gm_create_mode__ = api.CREATE_MODE_NORMAL if not \
+            self.__use_sparse_updater__ else api.CREATE_MODE_SGD_SPARSE_CPU_TRAINING
         self.__data_types__ = topology.data_type()
         gm = api.GradientMachine.createFromConfigProto(
-            self.__topology_in_proto__, api.CREATE_MODE_NORMAL,
+            self.__topology_in_proto__, self.__gm_create_mode__,
             self.__optimizer__.enable_types())
         assert isinstance(gm, api.GradientMachine)
         self.__gradient_machine__ = gm
         self.__gradient_machine__.randParameters()
-        parameters.append_gradient_machine(gm)
+        self.__parameters__.append_gradient_machine(gm)
+        self.__parameter_updater__ = None
+
+    def __use_remote_sparse_updater__(self):
+        return self.__use_sparse_updater__ and not self.__is_local__
+
+    def __prepare_parameter__(self, in_args):
+        """
+        Prepare parameters before forward/backward.
+        When the remote sparse updater is used, the parameters needed by the
+        input arguments are prefetched and pulled from the parameter server.
+        :param in_args: input arguments of this batch.
+        :return: None
+        """
+        if self.__use_remote_sparse_updater__():
+            self.__gradient_machine__.prefetch(in_args)
+            self.__parameter_updater__.getParametersRemote()
+
+    def save_parameter_to_tar(self, f):
+        self.__parameter_updater__.catchUpWith()
+        self.__parameter_updater__.apply()
+        self.__parameter_updater__.getParametersRemote(True, True)
+        self.__parameters__.to_tar(f)
+        self.__parameter_updater__.restore()
 
     def train(self, reader, num_passes=1, event_handler=None, feeding=None):
         """
@@ -90,8 +125,9 @@ class SGD(object):
             event_handler = default_event_handler
         __check_train_args__(**locals())
 
-        updater = self.__optimizer__.create_local_updater()
-        updater.init(self.__gradient_machine__)
+        self.__parameter_updater__ = self.__optimizer__.create_updater(
+            self.__is_local__, num_passes, self.__use_sparse_updater__)
+        self.__parameter_updater__.init(self.__gradient_machine__)
 
         self.__gradient_machine__.start()
         batch_evaluator = self.__gradient_machine__.makeEvaluator()
@@ -103,23 +139,26 @@ class SGD(object):
         for pass_id in xrange(num_passes):
             event_handler(v2_event.BeginPass(pass_id))
             pass_evaluator.start()
-            updater.startPass()
+            self.__parameter_updater__.startPass()
             for batch_id, data_batch in enumerate(reader()):
                 batch_evaluator.start()
                 event_handler(
                     v2_event.BeginIteration(
                         pass_id=pass_id, batch_id=batch_id))
-                pass_type = updater.startBatch(len(data_batch))
-                self.__gradient_machine__.forwardBackward(
-                    feeder(data_batch), out_args, pass_type)
+                pass_type = self.__parameter_updater__.startBatch(
+                    len(data_batch))
+                in_args = feeder(data_batch)
+                self.__prepare_parameter__(in_args)
+                self.__gradient_machine__.forwardBackward(in_args, out_args,
+                                                          pass_type)
                 self.__gradient_machine__.eval(pass_evaluator)
                 self.__gradient_machine__.eval(batch_evaluator)
                 for each_param in self.__gradient_machine__.getNonStaticParameters(
                 ):
-                    updater.update(each_param)
+                    self.__parameter_updater__.update(each_param)
                 cost_sum = out_args.sum()
                 cost = cost_sum / len(data_batch)
-                updater.finishBatch(cost)
+                self.__parameter_updater__.finishBatch(cost)
                 batch_evaluator.finish()
                 event_handler(
                     v2_event.EndIteration(
@@ -128,7 +167,7 @@ class SGD(object):
                         cost=cost,
                         evaluator=batch_evaluator))
 
-            updater.finishPass()
+            self.__parameter_updater__.finishPass()
             pass_evaluator.finish()
             event_handler(v2_event.EndPass(pass_id, evaluator=pass_evaluator))
         self.__gradient_machine__.finish()
@@ -152,8 +191,9 @@ class SGD(object):
         num_samples = 0.0
         for data_batch in reader():
             num_samples += len(data_batch)
-            self.__gradient_machine__.forward(
-                feeder(data_batch), out_args, api.PASS_TEST)
+            in_args = feeder(data_batch)
+            self.__prepare_parameter__(in_args)
+            self.__gradient_machine__.forward(in_args, out_args, api.PASS_TEST)
             total_cost += out_args.sum()
             self.__gradient_machine__.eval(evaluator)
 
diff --git a/python/setup.py.in b/python/setup.py.in
index 4ac35e3b8d..5dfb46192a 100644
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -13,6 +13,12 @@ packages=['paddle',
 setup(name='paddle',
       version='${PADDLE_VERSION}',
       description='Parallel Distributed Deep Learning',
+      install_requires=[
+          "requests",
+          "numpy",
+          "protobuf==${PROTOBUF_VERSION}",
+          "matplotlib",
+      ],
       packages=packages,
       package_dir={
           '': '${CMAKE_CURRENT_SOURCE_DIR}'