diff --git a/.gitignore b/.gitignore index 00368ede67..7e21ba0b75 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,6 @@ *.DS_Store build/ +*.user + +.vscode +.idea \ No newline at end of file diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 0000000000..d3dae9efd4 --- /dev/null +++ b/.travis.yml @@ -0,0 +1,38 @@ +language: cpp +cache: ccache +sudo: required +dist: trusty +env: + - JOB=DOCS + - JOB=BUILD_AND_TEST +addons: + apt: + packages: + - gcc-4.8 + - g++-4.8 + - wget + - git + - build-essential + - libatlas-base-dev + - python + - python-pip + - python2.7-dev + - m4 + - libprotobuf-dev + - doxygen + - protobuf-compiler + - python-protobuf + - python-numpy + - python-wheel + - libgoogle-glog-dev + - libgflags-dev + - libgtest-dev +before_install: + - pip install wheel protobuf sphinx breathe recommonmark + - sudo paddle/scripts/travis/before_install.sh +script: + - paddle/scripts/travis/main.sh +notifications: + email: + on_success: change + on_failure: always diff --git a/CMakeLists.txt b/CMakeLists.txt index 0b83ab256e..af6a13efbd 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -3,7 +3,7 @@ cmake_minimum_required(VERSION 2.8) project(paddle CXX C) set(PADDLE_MAJOR_VERSION 0) set(PADDLE_MINOR_VERSION 8) -set(PADDLE_PATCH_VERSION 0b) +set(PADDLE_PATCH_VERSION 0b1) set(PADDLE_VERSION ${PADDLE_MAJOR_VERSION}.${PADDLE_MINOR_VERSION}.${PADDLE_PATCH_VERSION}) set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake") @@ -14,8 +14,10 @@ find_package(CUDA QUIET) find_package(Protobuf REQUIRED) find_package(PythonLibs 2.7 REQUIRED) find_package(PythonInterp 2.7 REQUIRED) -find_package(NumPy) +find_package(ZLIB REQUIRED) +find_package(NumPy REQUIRED) find_package(Threads REQUIRED) +find_package(AVX QUIET) find_package(Glog) find_package(Gflags QUIET) find_package(GTest) @@ -27,7 +29,7 @@ find_program(M4_EXECUTABLE m4) option(WITH_DSO "Compile PaddlePaddle with dynamic linked libraries" ON) option(WITH_GPU "Compile PaddlePaddle with gpu" ${CUDA_FOUND}) option(WITH_DOUBLE "Compile PaddlePaddle with double precision, otherwise use single precision" OFF) -option(WITH_AVX "Compile PaddlePaddle with avx intrinsics" ON) # TODO(yuyang18): Check AVX is supported or not as default value +option(WITH_AVX "Compile PaddlePaddle with avx intrinsics" ${AVX_FOUND}) option(WITH_PYTHON "Compile PaddlePaddle with python interpreter" ON) option(WITH_STYLE_CHECK "Style Check for PaddlePaddle" ${PYTHONINTERP_FOUND}) option(WITH_RDMA "Compile PaddlePaddle with rdma support" OFF) @@ -37,6 +39,7 @@ option(WITH_TIMER "Compile PaddlePaddle use timer" OFF) option(WITH_TESTING "Compile and run unittest for PaddlePaddle" ${GTEST_FOUND}) option(WITH_DOC "Compile PaddlePaddle with documentation" OFF) option(WITH_SWIG_PY "Compile PaddlePaddle with py PaddlePaddle prediction api" ${SWIG_FOUND}) +option(ON_TRAVIS "Running test on travis-ci or not." 
OFF) if(NOT CMAKE_BUILD_TYPE) set(CMAKE_BUILD_TYPE "RelWithDebInfo" CACHE STRING "Choose the type of build, options are: Debug Release RelWithDebInfo MinSizeRel" @@ -99,8 +102,8 @@ if(NOT WITH_TIMER) endif(NOT WITH_TIMER) if(WITH_AVX) - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${AVX_FLAGS}") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${AVX_FLAGS}") else(WITH_AVX) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -msse3") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse3") diff --git a/README.md b/README.md index cba47e87bc..cc2fc68ac3 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,11 @@ # PaddlePaddle +[![Build Status](https://travis-ci.org/baidu/Paddle.svg?branch=master)](https://travis-ci.org/baidu/Paddle) + +Welcome to the PaddlePaddle GitHub. + +The software will be released on Sept. 30 with full documentation and installation support. + +A pre-release version is available now for those who are eager to take a look. PaddlePaddle (PArallel Distributed Deep LEarning) is an easy-to-use, efficient, flexible and scalable deep learning platform, which is originally diff --git a/cmake/FindAVX.cmake b/cmake/FindAVX.cmake new file mode 100644 index 0000000000..58b89918ec --- /dev/null +++ b/cmake/FindAVX.cmake @@ -0,0 +1,65 @@ +# This file is used to check all supported levels of AVX on your machine +# so that PaddlePaddle can unleash the vectorization power of multicore CPUs. + +INCLUDE(CheckCXXSourceRuns) + +SET(FIND_AVX_10) +SET(FIND_AVX_20) +SET(AVX_FLAGS) +SET(AVX_FOUND) + +# Check AVX 2 +SET(CMAKE_REQUIRED_FLAGS) +IF(CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "Clang") + SET(CMAKE_REQUIRED_FLAGS "-mavx2") +ELSEIF(MSVC AND NOT CMAKE_CL_64) # reserve for WINDOWS + SET(CMAKE_REQUIRED_FLAGS "/arch:AVX2") +ENDIF() + +CHECK_CXX_SOURCE_RUNS(" +#include <immintrin.h> +int main() +{ + __m256i a = _mm256_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4); + __m256i result = _mm256_abs_epi32 (a); + return 0; +}" FIND_AVX_20) + +# Check AVX +SET(CMAKE_REQUIRED_FLAGS) +IF(CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "Clang") + SET(CMAKE_REQUIRED_FLAGS "-mavx") +ELSEIF(MSVC AND NOT CMAKE_CL_64) + SET(CMAKE_REQUIRED_FLAGS "/arch:AVX") +endif() + +CHECK_CXX_SOURCE_RUNS(" +#include <immintrin.h> +int main() +{ + __m256 a = _mm256_set_ps (-1.0f, 2.0f, -3.0f, 4.0f, -1.0f, 2.0f, -3.0f, 4.0f); + __m256 b = _mm256_set_ps (1.0f, 2.0f, 3.0f, 4.0f, 1.0f, 2.0f, 3.0f, 4.0f); + __m256 result = _mm256_add_ps (a, b); + return 0; +}" FIND_AVX_10) + +IF(${FIND_AVX_20}) + IF(CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "Clang") + SET(AVX_FLAGS "${AVX_FLAGS} -mavx2") + ELSEIF(MSVC) + SET(AVX_FLAGS "${AVX_FLAGS} /arch:AVX2") + ENDIF() +ENDIF() + +IF(${FIND_AVX_10}) + IF(CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "Clang") + SET(AVX_FLAGS "${AVX_FLAGS} -mavx") + ELSEIF(MSVC) + SET(AVX_FLAGS "${AVX_FLAGS} /arch:AVX") + ENDIF() +ENDIF() + +IF("${FIND_AVX_10}" OR "${FIND_AVX_20}") + SET(AVX_FOUND TRUE) + MESSAGE(STATUS "Found CPU support for ${AVX_FLAGS}.") +ENDIF() diff --git a/cmake/cblas.cmake b/cmake/cblas.cmake index 617bd7ea71..529b4b9d15 100644 --- a/cmake/cblas.cmake +++ b/cmake/cblas.cmake @@ -65,12 +65,14 @@ set(OPENBLAS_ROOT $ENV{OPENBLAS_ROOT} CACHE PATH "Folder contains Openblas") set(OPENBLAS_INCLUDE_SEARCH_PATHS ${OPENBLAS_ROOT}/include /usr/include - /usr/include/openblas) + /usr/include/openblas + /usr/local/opt/openblas/include)
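# NOTE (added for clarity, not part of the original patch): /usr/local/opt/openblas is the default Homebrew install prefix for OpenBLAS on OS X, which is why it is added to both search path lists.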
set(OPENBLAS_LIB_SEARCH_PATHS ${OPENBLAS_ROOT}/lib /usr/lib /usr/lib/blas/openblas - /usr/lib/openblas) + /usr/lib/openblas + /usr/local/opt/openblas/lib) find_path(OPENBLAS_INC_DIR NAMES cblas.h PATHS ${OPENBLAS_INCLUDE_SEARCH_PATHS}) diff --git a/cmake/cudnn.cmake b/cmake/cudnn.cmake index e2ff923a22..e5b59be193 100644 --- a/cmake/cudnn.cmake +++ b/cmake/cudnn.cmake @@ -15,7 +15,7 @@ list(APPEND CUDNN_CHECK_LIBRARY_DIRS $ENV{CUDNN_ROOT}/lib64 $ENV{CUDNN_ROOT}/lib /usr/lib) -find_library(CUDNN_LIBRARY NAMES libcudnn.so # libcudnn_static.a +find_library(CUDNN_LIBRARY NAMES libcudnn.so libcudnn.dylib # libcudnn_static.a PATHS ${CUDNN_CHECK_LIBRARY_DIRS} ${CUDNN_INCLUDE_DIR} ${__libpath_hist} NO_DEFAULT_PATH DOC "Path to cuDNN library.") diff --git a/cmake/flags.cmake b/cmake/flags.cmake index 351af42ee6..4b99e7f7fb 100644 --- a/cmake/flags.cmake +++ b/cmake/flags.cmake @@ -8,7 +8,7 @@ include(CheckCXXSymbolExists) # is_c: is C flag or C++ flag, bool type. # src_list: The list name which the flag name will be append to. # flag_name: the flag name for compiler, such as '-Werror' '-Wall' etc -# rest arguments: not used. +# rest arguments: not used. function(safe_set_flag is_c src_list flag_name) string(REPLACE "-" "_" safe_name ${flag_name}) string(REPLACE "=" "_" safe_name ${safe_name}) @@ -44,7 +44,7 @@ CHECK_CXX_SYMBOL_EXISTS(UINT64_MAX "stdint.h" UINT64_MAX_EXISTS) if(NOT UINT64_MAX_EXISTS) set(CMAKE_REQUIRED_DEFINITIONS -D__STDC_LIMIT_MACROS) CHECK_CXX_SYMBOL_EXISTS(UINT64_MAX "stdint.h" UINT64_MAX_EXISTS_HERE) - if(UINT64_MAX_EXISTS_HERE) + if(UINT64_MAX_EXISTS_HERE) set(CMAKE_REQUIRED_DEFINITIONS) add_definitions(-D__STDC_LIMIT_MACROS) else() @@ -74,13 +74,37 @@ endforeach() # Release/Debug flags set by cmake. Such as -O3 -g -DNDEBUG etc. # So, don't set these flags here. +function(specify_cuda_arch cuda_version cuda_arch) + if(${cuda_version} VERSION_GREATER "8.0") + foreach(capability 61 62) + if(${cuda_arch} STREQUAL ${capability}) + list(APPEND __arch_flags " -gencode arch=compute_${cuda_arch},code=sm_${cuda_arch}") + endif() + endforeach() + elseif(${cuda_version} VERSION_GREATER "7.0" AND ${cuda_arch} STREQUAL "53") + list(APPEND __arch_flags " -gencode arch=compute_${cuda_arch},code=sm_${cuda_arch}") + endif() +endfunction() + +# Common gpu architectures: Kepler, Maxwell foreach(capability 30 35 50) - list(APPEND __arch_flags "-gencode arch=compute_${capability},code=sm_${capability}") + list(APPEND __arch_flags " -gencode arch=compute_${capability},code=sm_${capability}") endforeach() if (CUDA_VERSION VERSION_GREATER "7.0") - list(APPEND __arch_flags "-gencode arch=compute_52,code=sm_52") + list(APPEND __arch_flags " -gencode arch=compute_52,code=sm_52") endif() -set(CUDA_NVCC_FLAGS ${__arch_flags} ${CUDA_NVCC_FLAGS}) +# Modern gpu architectures: Pascal +if (CUDA_VERSION VERSION_GREATER "8.0") + list(APPEND __arch_flags " -gencode arch=compute_60,code=sm_60") +endif() +# Custom gpu architecture +set(CUDA_ARCH) + +if(CUDA_ARCH) + specify_cuda_arch(${CUDA_VERSION} ${CUDA_ARCH}) +endif() + +set(CUDA_NVCC_FLAGS ${__arch_flags} ${CUDA_NVCC_FLAGS}) diff --git a/cmake/util.cmake b/cmake/util.cmake index e0e372fed0..d776c3ae49 100644 --- a/cmake/util.cmake +++ b/cmake/util.cmake @@ -1,16 +1,55 @@ # Some common routine for paddle compile. - # target_circle_link_libraries # Link libraries to target which has circle dependencies. # # First Argument: target name want to be linked with libraries # Rest Arguments: libraries which link together.
function(target_circle_link_libraries TARGET_NAME) - target_link_libraries(${TARGET_NAME} - -Wl,--start-group - ${ARGN} - -Wl,--end-group) + if(APPLE) + set(LIBS) + set(inArchive OFF) + set(libsInArgn) + + foreach(arg ${ARGN}) + if(${arg} STREQUAL "ARCHIVE_START") + set(inArchive ON) + elseif(${arg} STREQUAL "ARCHIVE_END") + set(inArchive OFF) + else() + if(inArchive) + list(APPEND LIBS "-Wl,-force_load") + endif() + list(APPEND LIBS ${arg}) + list(APPEND libsInArgn ${arg}) + endif() + endforeach() + if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") + list(APPEND LIBS "-undefined dynamic_lookup") + endif() + list(REVERSE libsInArgn) + target_link_libraries(${TARGET_NAME} + ${LIBS} + ${libsInArgn}) + + else() # LINUX + set(LIBS) + + foreach(arg ${ARGN}) + if(${arg} STREQUAL "ARCHIVE_START") + list(APPEND LIBS "-Wl,--whole-archive") + elseif(${arg} STREQUAL "ARCHIVE_END") + list(APPEND LIBS "-Wl,--no-whole-archive") + else() + list(APPEND LIBS ${arg}) + endif() + endforeach() + + target_link_libraries(${TARGET_NAME} + "-Wl,--start-group" + ${LIBS} + "-Wl,--end-group") + endif() endfunction() # compile_cu_as_cpp @@ -41,20 +80,20 @@ function(link_paddle_exe TARGET_NAME) if(PADDLE_WITH_INTERNAL) set(INTERAL_LIBS paddle_internal_gserver paddle_internal_parameter) target_circle_link_libraries(${TARGET_NAME} - -Wl,--whole-archive + ARCHIVE_START paddle_internal_gserver paddle_internal_owlqn - -Wl,--no-whole-archive + ARCHIVE_END paddle_internal_parameter) else() set(INTERAL_LIBS "") endif() target_circle_link_libraries(${TARGET_NAME} - -Wl,--whole-archive + ARCHIVE_START paddle_gserver ${METRIC_LIBS} - -Wl,--no-whole-archive + ARCHIVE_END paddle_pserver paddle_trainer_lib paddle_network @@ -67,9 +106,9 @@ function(link_paddle_exe TARGET_NAME) ${PROTOBUF_LIBRARY} ${CMAKE_THREAD_LIBS_INIT} ${CBLAS_LIBS} - ${CMAKE_DL_LIBS} + ${ZLIB_LIBRARIES} ${INTERAL_LIBS} - -lz) + ${CMAKE_DL_LIBS}) if(WITH_PYTHON) target_link_libraries(${TARGET_NAME} diff --git a/demo/image_classification/prediction.py b/demo/image_classification/prediction.py index 6d585ee094..5d9e932658 100755 --- a/demo/image_classification/prediction.py +++ b/demo/image_classification/prediction.py @@ -20,9 +20,8 @@ from optparse import OptionParser import paddle.utils.image_util as image_util -from py_paddle import swig_paddle, util -from py_paddle import DataProviderWrapperConverter -from paddle.trainer.PyDataProviderWrapper import DenseSlot +from py_paddle import swig_paddle, DataProviderConverter +from paddle.trainer.PyDataProvider2 import dense_vector from paddle.trainer.config_parser import parse_config logging.basicConfig(format='[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s') @@ -75,8 +74,8 @@ class ImageClassifier(): self.network.loadParameters(self.model_dir) data_size = 3 * self.crop_dims[0] * self.crop_dims[1] - slots = [DenseSlot(data_size)] - self.converter = util.DataProviderWrapperConverter(False, slots) + slots = [dense_vector(data_size)] + self.converter = DataProviderConverter(slots) def get_data(self, img_path): """ diff --git a/demo/image_classification/preprocess.sh b/demo/image_classification/preprocess.sh index fe89c8f4bb..dfe3eb95d1 100755 --- a/demo/image_classification/preprocess.sh +++ b/demo/image_classification/preprocess.sh @@ -14,8 +14,6 @@ # limitations under the License. 
set -e -export PYTHONPATH=$PYTHONPATH:../../ - data_dir=./data/cifar-out python preprocess.py -i $data_dir -s 32 -c 1 diff --git a/demo/model_zoo/resnet/classify.py b/demo/model_zoo/resnet/classify.py index fbc30d30e6..06d471722f 100755 --- a/demo/model_zoo/resnet/classify.py +++ b/demo/model_zoo/resnet/classify.py @@ -22,9 +22,8 @@ from optparse import OptionParser import paddle.utils.image_util as image_util -from py_paddle import swig_paddle, util -from py_paddle import DataProviderWrapperConverter -from paddle.trainer.PyDataProviderWrapper import DenseSlot +from py_paddle import swig_paddle, DataProviderConverter +from paddle.trainer.PyDataProvider2 import dense_vector from paddle.trainer.config_parser import parse_config logging.basicConfig(format='[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s') @@ -85,9 +84,8 @@ class ImageClassifier(): self.network.loadParameters(self.model_dir) data_size = 3 * self.crop_dims[0] * self.crop_dims[1] - slots = [DenseSlot(data_size)] - is_sequence = False - self.converter = util.DataProviderWrapperConverter(is_sequence, slots) + slots = [dense_vector(data_size)] + self.converter = DataProviderConverter(slots) def get_data(self, img_path): """ diff --git a/demo/quick_start/preprocess.sh b/demo/quick_start/preprocess.sh index 516b655e4b..fb2bee98be 100755 --- a/demo/quick_start/preprocess.sh +++ b/demo/quick_start/preprocess.sh @@ -1,4 +1,4 @@ -#!/bin/sh +#!/bin/bash # Copyright (c) 2016 Baidu, Inc. All Rights Reserved # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -33,7 +33,7 @@ test_num=$((min_len/10)) if [ $test_num -gt 12500 ];then test_num=12500 fi -train_num=((min_len-test_num)) +train_num=$((min_len-test_num)) head -n$train_num pos.shuffed >train.pos head -n$train_num neg.shuffed >train.neg diff --git a/demo/recommendation/dataprovider.py b/demo/recommendation/dataprovider.py index 29cfd72248..454467f40b 100755 --- a/demo/recommendation/dataprovider.py +++ b/demo/recommendation/dataprovider.py @@ -12,15 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. -try: - import cPickle as pickle -except ImportError: - import pickle - from paddle.trainer.PyDataProvider2 import * import common_utils # parse - def hook(settings, meta, **kwargs): """ Init hook is invoked before process data. It will set obj.slots and store @@ -47,7 +41,6 @@ def hook(settings, meta, **kwargs): settings.input_types = headers settings.meta = meta - @provider(init_hook=hook, cache=CacheType.CACHE_PASS_IN_MEM) def process(settings, filename): with open(filename, 'r') as f: diff --git a/demo/semantic_role_labeling/predict.py b/demo/semantic_role_labeling/predict.py index 5250ec6dc6..9a27112828 100644 --- a/demo/semantic_role_labeling/predict.py +++ b/demo/semantic_role_labeling/predict.py @@ -15,12 +15,12 @@ import os import numpy as np from optparse import OptionParser -from py_paddle import swig_paddle, util, DataProviderWrapperConverter -from paddle.trainer.PyDataProviderWrapper import IndexSlot +from py_paddle import swig_paddle, DataProviderConverter +from paddle.trainer.PyDataProvider2 import integer_value_sequence from paddle.trainer.config_parser import parse_config """ Usage: run following command to show help message. 
- python predict.py -h + python predict.py -h """ UNK_IDX = 0 @@ -43,16 +43,22 @@ class Prediction(): conf = parse_config( train_conf, - 'dict_len=' + str(len_dict) + + 'dict_len=' + str(len_dict) + ',label_len=' + str(len_label) + ',is_predict=True') self.network = swig_paddle.GradientMachine.createFromConfigProto( conf.model_config) self.network.loadParameters(model_dir) - slots = [IndexSlot(len_dict), IndexSlot(len_dict), IndexSlot(len_dict), - IndexSlot(len_dict), IndexSlot(len_dict), IndexSlot(2)] - self.converter = util.DataProviderWrapperConverter(True, slots) + slots = [ + integer_value_sequence(len_dict), + integer_value_sequence(len_dict), + integer_value_sequence(len_dict), + integer_value_sequence(len_dict), + integer_value_sequence(len_dict), + integer_value_sequence(2) + ] + self.converter = DataProviderConverter(slots) def load_dict_label(self, dict_file, label_file): """ @@ -109,7 +115,7 @@ class Prediction(): def option_parser(): - usage = ("python predict.py -c config -w model_dir " + usage = ("python predict.py -c config -w model_dir " "-d word dictionary -l label_file -i input_file") parser = OptionParser(usage="usage: %s [options]" % usage) parser.add_option( diff --git a/demo/sentiment/predict.py b/demo/sentiment/predict.py index 4ece6bb06d..c61628d34d 100755 --- a/demo/sentiment/predict.py +++ b/demo/sentiment/predict.py @@ -15,13 +15,13 @@ import os import numpy as np from optparse import OptionParser -from py_paddle import swig_paddle, util, DataProviderWrapperConverter -from paddle.trainer.PyDataProviderWrapper import IndexSlot +from py_paddle import swig_paddle, DataProviderConverter +from paddle.trainer.PyDataProvider2 import integer_value_sequence from paddle.trainer.config_parser import parse_config """ Usage: run following command to show help message. 
- python predict.py -h + python predict.py -h """ class SentimentPrediction(): @@ -46,8 +46,8 @@ class SentimentPrediction(): conf = parse_config(train_conf, "is_predict=1") self.network = swig_paddle.GradientMachine.createFromConfigProto(conf.model_config) self.network.loadParameters(self.model_dir) - slots = [IndexSlot(self.dict_dim)] - self.converter = util.DataProviderWrapperConverter(True, slots) + slots = [integer_value_sequence(self.dict_dim)] + self.converter = DataProviderConverter(slots) def load_dict(self): """ diff --git a/demo/sentiment/sentiment_net.py b/demo/sentiment/sentiment_net.py index f9f784c1f0..31e585edca 100644 --- a/demo/sentiment/sentiment_net.py +++ b/demo/sentiment/sentiment_net.py @@ -65,7 +65,7 @@ def bidirectional_lstm_net(input_dim, bi_lstm = bidirectional_lstm(input=emb, size=lstm_dim) dropout = dropout_layer(input=bi_lstm, dropout_rate=0.5) output = fc_layer(input=dropout, size=class_dim, - act_type=SoftmaxActivation()) + act=SoftmaxActivation()) if not is_predict: lbl = data_layer("label", 1) diff --git a/demo/seqToseq/seqToseq_net.py b/demo/seqToseq/seqToseq_net.py index 479a64fa00..2b0c3f3464 100644 --- a/demo/seqToseq/seqToseq_net.py +++ b/demo/seqToseq/seqToseq_net.py @@ -128,12 +128,16 @@ def gru_encoder_decoder(data_conf, return out decoder_group_name = "decoder_group" + group_inputs=[StaticInput(input=encoded_vector,is_seq=True), + StaticInput(input=encoded_proj,is_seq=True)] + if not is_generating: trg_embedding = embedding_layer( input=data_layer(name='target_language_word', size=target_dict_dim), size=word_vector_dim, param_attr=ParamAttr(name='_target_language_embedding')) + group_inputs.append(trg_embedding) # For decoder equipped with attention mechanism, in training, # target embeding (the groudtruth) is the data input, @@ -142,22 +146,13 @@ def gru_encoder_decoder(data_conf, # for the recurrent_group. decoder = recurrent_group(name=decoder_group_name, step=gru_decoder_with_attention, - input=[ - StaticInput(input=encoded_vector, - is_seq=True), - StaticInput(input=encoded_proj, - is_seq=True), trg_embedding - ]) + input=group_inputs) lbl = data_layer(name='target_language_next_word', size=target_dict_dim) - cost = classification_cost(input=decoder, label=lbl, ) + cost = classification_cost(input=decoder, label=lbl) outputs(cost) else: - gen_inputs = [StaticInput(input=encoded_vector, - is_seq=True), - StaticInput(input=encoded_proj, - is_seq=True), ] # In generation, the decoder predicts a next target word based on # the encoded source sequence and the last generated target word. 
@@ -171,16 +166,18 @@ def gru_encoder_decoder(data_conf, size=target_dict_dim, embedding_name='_target_language_embedding', embedding_size=word_vector_dim) - gen_inputs.append(trg_embedding) + group_inputs.append(trg_embedding) + beam_gen = beam_search(name=decoder_group_name, step=gru_decoder_with_attention, - input=gen_inputs, - id_input=data_layer(name="sent_id", - size=1), - dict_file=trg_dict_path, + input=group_inputs, bos_id=0, eos_id=1, beam_size=beam_size, - max_length=max_length, - result_file=gen_trans_file) + max_length=max_length) + + seqtext_printer_evaluator(input=beam_gen, + id_input=data_layer(name="sent_id", size=1), + dict_file=trg_dict_path, + result_file=gen_trans_file) outputs(beam_gen) diff --git a/doc/CMakeLists.txt b/doc/CMakeLists.txt index b8ccfc6be5..ef4e9d102d 100644 --- a/doc/CMakeLists.txt +++ b/doc/CMakeLists.txt @@ -1,6 +1,3 @@ - - - if(NOT DEFINED SPHINX_THEME) set(SPHINX_THEME default) endif() @@ -46,4 +43,4 @@ sphinx_add_target(paddle_docs add_dependencies(paddle_docs gen_proto_py - paddle_doxygen_docs) \ No newline at end of file + paddle_doxygen_docs) diff --git a/doc/algorithm/rnn/index.rst b/doc/algorithm/rnn/rnn.rst similarity index 91% rename from doc/algorithm/rnn/index.rst rename to doc/algorithm/rnn/rnn.rst index a918f02ab1..343f55a20e 100644 --- a/doc/algorithm/rnn/index.rst +++ b/doc/algorithm/rnn/rnn.rst @@ -30,7 +30,7 @@ Then at the :code:`process` function, each :code:`yield` function will return th yield src_ids, trg_ids, trg_ids_next -For more details description of how to write a data provider, please refer to :doc:`Python Data Provider <../py_data_provider_wrapper>`. The full data provider file is located at :code:`demo/seqToseq/dataprovider.py`. +For a more detailed description of how to write a data provider, please refer to `PyDataProvider2 <../../ui/data_provider/index.html>`_. The full data provider file is located at :code:`demo/seqToseq/dataprovider.py`. =============================================== Configure Recurrent Neural Network Architecture =============================================== @@ -106,7 +106,7 @@ We will use the sequence to sequence model with attention as an example to demon In this model, the source sequence :math:`S = \{s_1, \dots, s_T\}` is encoded with a bidirectional gated recurrent neural network. The hidden states of the bidirectional gated recurrent neural network :math:`H_S = \{H_1, \dots, H_T\}` are called the *encoder vector*. The decoder is a gated recurrent neural network. When decoding each token :math:`y_t`, the gated recurrent neural network generates a set of weights :math:`W_S^t = \{W_1^t, \dots, W_T^t\}`, which are used to compute a weighted sum of the encoder vector. The weighted sum of the encoder vector is utilized to condition the generation of the token :math:`y_t`. -The encoder part of the model is listed below. It calls :code:`grumemory` to represent gated recurrent neural network. It is the recommended way of using recurrent neural network if the network architecture is simple, because it is faster than :code:`recurrent_group`. We have implemented most of the commonly used recurrent neural network architectures, you can refer to :doc:`Layers <../trainer_config_helpers/layers>` for more details. +The encoder part of the model is listed below. It calls :code:`grumemory` to represent a gated recurrent neural network. It is the recommended way of using a recurrent neural network if the network architecture is simple, because it is faster than :code:`recurrent_group`.
We have implemented most of the commonly used recurrent neural network architectures; you can refer to `Layers <../../ui/api/trainer_config_helpers/layers_index.html>`_ for more details. We also project the encoder vector to :code:`decoder_size` dimensional space, get the first instance of the backward recurrent network, and project it to :code:`decoder_size` dimensional space: @@ -143,11 +143,15 @@ The decoder uses :code:`recurrent_group` to define the recurrent neural network. .. code-block:: python + group_inputs=[StaticInput(input=encoded_vector,is_seq=True), + StaticInput(input=encoded_proj,is_seq=True)] trg_embedding = embedding_layer( input=data_layer(name='target_language_word', size=target_dict_dim), size=word_vector_dim, param_attr=ParamAttr(name='_target_language_embedding')) + group_inputs.append(trg_embedding) + # For decoder equipped with attention mechanism, in training, # target embedding (the groundtruth) is the data input, # while encoded source sequence is accessed as an unbounded memory. @@ -156,13 +160,7 @@ The decoder uses :code:`recurrent_group` to define the recurrent neural network. # All sequence inputs should have the same length. decoder = recurrent_group(name=decoder_group_name, step=gru_decoder_with_attention, - input=[ - StaticInput(input=encoded_vector, - is_seq=True), - StaticInput(input=encoded_proj, - is_seq=True), - trg_embedding - ]) + input=group_inputs) The implementation of the step function is listed below. First, it defines the **memory** of the decoder network. Then it defines attention, the gated recurrent unit step function, and the output function: @@ -205,22 +203,23 @@ After training the model, we can use it to generate sequences. A common practice * use :code:`GeneratedInput` for trg_embedding. :code:`GeneratedInput` computes the embedding of the generated token at the last time step for the input at the current time step. * use :code:`beam_search` function. This function needs to set: - - :code:`id_input`: the integer ID of the data, used to identify the corresponding output in the generated files. - - :code:`dict_file`: the dictionary file for converting word id to word. - :code:`bos_id`: the start token. Every sentence starts with the start token. - :code:`eos_id`: the end token. Every sentence ends with the end token. - :code:`beam_size`: the beam size used in beam search. - :code:`max_length`: the maximum length of the generated sentences. - - :code:`result_file`: the path of the generation result file. +* use :code:`seqtext_printer_evaluator` to print text according to the index matrix and dictionary. This function needs to set: + + - :code:`id_input`: the integer ID of the data, used to identify the corresponding output in the generated files. + - :code:`dict_file`: the dictionary file for converting word id to word. + - :code:`result_file`: the path of the generation result file. + The code is listed below: .. code-block:: python + group_inputs=[StaticInput(input=encoded_vector,is_seq=True), + StaticInput(input=encoded_proj,is_seq=True)] # In generation, decoder predicts a next target word based on # the encoded source sequence and the last generated target word.
# The encoded source sequence (encoder's output) must be specified by @@ -231,21 +230,22 @@ The code is listed below: size=target_dict_dim, embedding_name='_target_language_embedding', embedding_size=word_vector_dim) - gen_inputs.append(trg_embedding) + group_inputs.append(trg_embedding) beam_gen = beam_search(name=decoder_group_name, step=gru_decoder_with_attention, - input=gen_inputs, - id_input=data_layer(name="sent_id", - size=1), - dict_file=trg_dict_path, + input=group_inputs, bos_id=0, # Beginning token. eos_id=1, # End of sentence token. beam_size=beam_size, - max_length=max_length, - result_file=gen_trans_file) + max_length=max_length) + + seqtext_printer_evaluator(input=beam_gen, + id_input=data_layer(name="sent_id", size=1), + dict_file=trg_dict_path, + result_file=gen_trans_file) outputs(beam_gen) -Notice that this generation technique is only useful for decoder like generation process. If you are working on sequence tagging tasks, please refer to :doc:`Semantic Role Labeling Demo <../../../demo/semantic_role_labeling>` for more details. +Notice that this generation technique is only useful for a decoder-like generation process. If you are working on sequence tagging tasks, please refer to `Semantic Role Labeling Demo <../../demo/semantic_role_labeling/index.html>`_ for more details. The full configuration file is located at :code:`demo/seqToseq/seqToseq_net.py`. diff --git a/doc/build/build_from_source.md b/doc/build/build_from_source.md index 9d3b030802..c671f48386 100644 --- a/doc/build/build_from_source.md +++ b/doc/build/build_from_source.md @@ -1,138 +1,337 @@ -Build and Install +Installing from Sources ================= -## Requirement +* [1. Download and Setup](#download) +* [2. Requirements](#requirements) +* [3. Build on Ubuntu](#ubuntu) +* [4. Build on Mac OS X](#mac) -### Dependents +## Download and Setup +You can download PaddlePaddle from the [github source](https://github.com/baidu/Paddle). -- **CMake**: required for 2.8+ version -- **g++**: a recent c++ compiler supporting c++11, >= 4.6, < 5 -- **BLAS library**: such as openBLAS, MKL, ATLAS -- **protobuf**: required for 2.4+ version, 3.x is not supported -- **python**: currently only 2.7 version is supported +```bash +git clone https://github.com/baidu/Paddle paddle +cd paddle +``` + +## Requirements + +To compile the source code, your computer must be equipped with GCC >=4.6 or Clang compiler. +### Dependencies + +- **CMake**: version >= 2.8 +- **BLAS**: MKL, OpenBlas or ATLAS +- **protobuf**: version >= 2.4, **Note: 3.x is not supported** +- **python**: only python 2.7 is supported currently + +### Options + +PaddlePaddle supports some build options. To enable them, you first need to install the related libraries. +
+| Optional | Description |
+| -------- | ----------- |
+| WITH_GPU | Compile with GPU mode. |
+| WITH_DOUBLE | Compile with double precision floating-point, default: single precision. |
+| WITH_GLOG | Compile with glog. If not found, default: an internal log implementation. |
+| WITH_GFLAGS | Compile with gflags. If not found, default: an internal flag implementation. |
+| WITH_TESTING | Compile with gtest for PaddlePaddle's unit testing. |
+| WITH_DOC | Compile to generate PaddlePaddle's docs, default: disabled (OFF). |
+| WITH_SWIG_PY | Compile with python predict API, default: disabled (OFF). |
+| WITH_STYLE_CHECK | Compile with code style check, default: enabled (ON). |
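+For example, several of these options can be combined in a single configure step, e.g. `cmake .. -DWITH_GPU=ON -DWITH_GLOG=ON -DWITH_TESTING=ON` (an illustrative invocation; each `ON` option assumes the corresponding library above is installed).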
+ +**Note:** + - The GPU version works best with CUDA Toolkit 7.5 and cuDNN v5. + - Other versions like CUDA Toolkit 6.5, 7.0, 8.0 and cuDNN v2, v3, v4 are also supported. + - **To utilize cuDNN v5, CUDA Toolkit 7.5 is a prerequisite and vice versa.** + +As a simple example, consider the following: + +1. **Python Dependencies (optional)** + + To compile PaddlePaddle with the python predict API, make sure swig is installed and set `-DWITH_SWIG_PY=ON` as follows: + + ```bash + # install swig on ubuntu + sudo apt-get install swig + # install swig on Mac OS X + brew install swig -### Optional + # activate swig in cmake + cmake .. -DWITH_SWIG_PY=ON + ``` -PaddlePaddle also support some build options, you have to install related libraries. +2. **Doc Dependencies (optional)** -- **WITH_GPU**: Compile with gpu mode - - The GPU version works best with Cuda Toolkit 7.5 and cuDNN v5 - - Other versions Cuda Toolkit 6.5, 7.0 and cuDNN v2, v3, v4 are also supported - - Note: to utilize cuDNN v5, Cuda Toolkit 7.5 is prerequisite and vice versa -- **WITH_DOUBLE**: Compile with double precision, otherwise use single precision -- **WITH_GLOG**: Compile with glog, otherwise use a log implement internally -- **WITH_GFLAGS**: Compile with gflags, otherwise use a flag implement internally -- **WITH_TESTING**: Compile with gtest and run unittest for PaddlePaddle -- **WITH_DOC**: Compile with documentation -- **WITH_SWIG_PY**: Compile with python predict api -- **WITH_STYLE_CHECK**: Style check for source code + To generate PaddlePaddle's documentation, install dependencies and set `-DWITH_DOC=ON` as follows: + ```bash + pip install 'sphinx>=1.4.0' + pip install sphinx_rtd_theme breathe recommonmark -## Building on Ubuntu14.04 + # install doxygen on Ubuntu + sudo apt-get install doxygen + # install doxygen on Mac OS X + brew install doxygen + + # activate docs in cmake + cmake .. -DWITH_DOC=ON + ``` + ## Build on Ubuntu 14.04 ### Install Dependencies - **CPU Dependencies** -```bash -# necessary -sudo apt-get update -sudo apt-get install -y g++ make cmake build-essential libatlas-base-dev python python-pip libpython-dev m4 libprotobuf-dev protobuf-compiler python-protobuf python-numpy git -# optional -sudo apt-get install libgoogle-glog-dev -sudo apt-get install libgflags-dev -sudo apt-get install libgtest-dev -pushd /usr/src/gtest -cmake . -make -sudo cp *.a /usr/lib -popd -``` - + ```bash + # necessary + sudo apt-get update + sudo apt-get install -y g++ make cmake build-essential libatlas-base-dev python python-pip libpython-dev m4 libprotobuf-dev protobuf-compiler python-protobuf python-numpy git + # optional + sudo apt-get install libgoogle-glog-dev + sudo apt-get install libgflags-dev + sudo apt-get install libgtest-dev + sudo pip install wheel + pushd /usr/src/gtest + cmake . + make + sudo cp *.a /usr/lib + popd + ``` -- **GPU Dependencies(optional)** +- **GPU Dependencies (optional)** -If you need to build GPU version, the first thing you need is a machine that has GPU and CUDA installed. -And you also need to install cuDNN. + To build the GPU version, you will need the following installed: -You can download CUDA toolkit and cuDNN from nvidia website: - -```bash -https://developer.nvidia.com/cuda-downloads -https://developer.nvidia.com/cudnn -``` -You can copy cuDNN files into the CUDA toolkit directory, such as: + 1. a CUDA-capable GPU + 2. A supported version of Linux with a gcc compiler and toolchain + 3. NVIDIA CUDA Toolkit (available at http://developer.nvidia.com/cuda-downloads) + 4.
NVIDIA cuDNN Library (available at https://developer.nvidia.com/cudnn) + + The CUDA development environment relies on tight integration with the host development environment, + including the host compiler and C runtime libraries, and is therefore only supported on + distribution versions that have been qualified for this CUDA Toolkit release. + + After downloading the cuDNN library, issue the following commands: + + ```bash + sudo tar -xzf cudnn-7.5-linux-x64-v5.1.tgz -C /usr/local + sudo chmod a+r /usr/local/cuda/include/cudnn.h /usr/local/cuda/lib64/libcudnn* + ``` + Then you need to set the LD\_LIBRARY\_PATH and PATH environment variables in ~/.bashrc. + + ```bash + export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH + export PATH=/usr/local/cuda/bin:$PATH + ``` + +### Build and Install + +As usual, the best option is to create a build folder under the paddle project directory. ```bash +mkdir build && cd build +cmake .. ``` + +CMake first checks PaddlePaddle's dependencies in the system default paths. After installing some optional +libraries, the corresponding build options will be set automatically (for instance, glog, gtest and gflags). +If a dependency is still not found, you can set its location manually based on the CMake error information on your screen. + +As a simple example, consider the following: + +- **Only CPU** + + ```bash + cmake .. -DWITH_GPU=OFF -DWITH_DOC=OFF + ``` +- **GPU** + + ```bash + cmake .. -DWITH_GPU=ON -DWITH_DOC=OFF + ``` + +- **GPU with doc and swig** + + ```bash + cmake .. -DWITH_GPU=ON -DWITH_DOC=ON -DWITH_SWIG_PY=ON + ``` + +Finally, you can build PaddlePaddle: ```bash -export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH -export CUDA_HOME=/usr/local/cuda -export PATH=/usr/local/cuda/bin:$PATH +# you can add build option here, such as: +cmake .. -DWITH_GPU=ON -DWITH_DOC=OFF -DCMAKE_INSTALL_PREFIX=<path to install> +# please use sudo make install if you want to install PaddlePaddle into the system +make -j `nproc` && make install +# set PaddlePaddle installation path in ~/.bashrc +export PATH=<path to install>/bin:$PATH ``` -- **Python Dependencies(optional)** -If you want to compile PaddlePaddle with python predict api, you need to add -DWITH_SWIG_PY=ON in cmake command and install these first: +**Note:** + +If you set `WITH_SWIG_PY=ON`, related python dependencies also need to be installed. +Otherwise, PaddlePaddle will automatically install the python dependencies +the first time a user runs paddle commands, such as `paddle version`, `paddle train`. +It may require sudo privileges: ```bash -sudo apt-get install swig +# you can run +sudo pip install /opt/paddle/share/wheels/*.whl +# or just run +sudo paddle version ``` -- **Doc Dependencies(optional)** ## Building on Mac OS X -If you want to compile PaddlePaddle with doc, you need to add -DWITH_DOC=ON in cmake command and install these first: +### Prerequisites +This guide is based on Mac OS X 10.11 (El Capitan). Note that if you are running an up-to-date version of OS X, +you will already have Python 2.7.10 and Numpy 1.8 installed. + +The best option is to use the package manager Homebrew to handle installations and upgrades for you.
+To install [homebrew](http://brew.sh/), first open a terminal window (you can find Terminal in the Utilities folder in Applications), and issue the command: ```bash +# install brew +/usr/bin/ruby -e "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/master/install)" +# install pip +easy_install pip ``` ### Install Dependencies -- **CPU Dependencies** +- **CPU Dependencies** + ```bash + # Install fundamental dependencies + brew install glog gflags cmake protobuf openblas + + # Install google test on Mac OS X + # Download gtest 1.7.0 + wget https://github.com/google/googletest/archive/release-1.7.0.tar.gz + tar -xvf release-1.7.0.tar.gz && cd googletest-release-1.7.0 + # Build gtest + mkdir build && cd build && cmake .. + make + # Install gtest library + sudo cp -r ../include/gtest /usr/local/include/ + sudo cp lib*.a /usr/local/lib + ``` - **GPU Dependencies(optional)** + To build the GPU version, you will need the following installed: + + 1. a CUDA-capable GPU + 2. Mac OS X 10.11 or later + 3. the Clang compiler and toolchain installed using Xcode + 4. NVIDIA CUDA Toolkit (available at http://developer.nvidia.com/cuda-downloads) + 5. NVIDIA cuDNN Library (available at https://developer.nvidia.com/cudnn) + + The CUDA development environment relies on tight integration with the host development environment, + including the host compiler and C runtime libraries, and is therefore only supported on + distribution versions that have been qualified for this CUDA Toolkit release. + + 1. After downloading the cuDNN library, issue the following commands: + + ```bash + sudo tar -xzf cudnn-7.5-osx-x64-v5.0-ga.tgz -C /usr/local + sudo chmod a+r /usr/local/cuda/include/cudnn.h /usr/local/cuda/lib64/libcudnn* + ``` + 2. Then you need to set the DYLD\_LIBRARY\_PATH and PATH environment variables in ~/.bashrc. ```bash + export DYLD_LIBRARY_PATH=/usr/local/cuda/lib:$DYLD_LIBRARY_PATH + export PATH=/usr/local/cuda/bin:$PATH + ``` + ### Build and Install + +As usual, the best option is to create a build folder under the paddle project directory. ```bash +mkdir build && cd build +cmake .. ``` + +CMake first checks PaddlePaddle's dependencies in the system default paths. After installing some optional +libraries, the corresponding build options will be set automatically (for instance, glog, gtest and gflags). +If a dependency is still not found, you can set its location manually based on the CMake error information on your screen. + +As a simple example, consider the following: + +- **Only CPU** + + ```bash + cmake .. -DWITH_GPU=OFF -DWITH_DOC=OFF + ``` +- **GPU** + + ```bash + cmake .. -DWITH_GPU=ON -DWITH_DOC=OFF + ``` + +- **GPU with doc and swig** + + ```bash + cmake ..
-DWITH_GPU=ON -DWITH_DOC=ON -DWITH_SWIG_PY=ON + ``` + +Finally, you can build PaddlePaddle: ```bash # you can add build option here, such as: -cmake -DWITH_GPU=ON -DWITH_DOC=OFF -DCMAKE_INSTALL_PREFIX=<path to install> .. +cmake .. -DWITH_GPU=ON -DWITH_DOC=OFF -DCMAKE_INSTALL_PREFIX=<path to install> +# please use sudo make install if you want to install PaddlePaddle into the system make -j `sysctl -n hw.ncpu` && make install -# PaddlePaddle installation path -export PATH=<path to install>/bin:$PATH +# set PaddlePaddle installation path in ~/.bashrc +export PATH=<path to install>/bin:$PATH ``` -**Note** +**Note:** -And if you set WITH_SWIG_PY=ON, you have to install related python predict api at the same time: +If you set `WITH_SWIG_PY=ON`, related python dependencies also need to be installed. +Otherwise, PaddlePaddle will automatically install the python dependencies +the first time a user runs paddle commands, such as `paddle version`, `paddle train`. +It may require sudo privileges: ```bash -pip install /opt/paddle/share/wheels/*.whl -``` +# you can run +sudo pip install /opt/paddle/share/wheels/*.whl +# or just run +sudo paddle version +``` \ No newline at end of file diff --git a/doc/build/contribute_to_paddle.md b/doc/build/contribute_to_paddle.md index b3d5fa7c9f..06fcff6172 100644 --- a/doc/build/contribute_to_paddle.md +++ b/doc/build/contribute_to_paddle.md @@ -25,9 +25,12 @@ repo or just head straight to the command line: ```shell # Clone your fork to your local machine -git clone git@github.com:USERNAME/paddle.git +git clone https://github.com/USERNAME/Paddle.git +``` +Then you can start to develop by making a local development branch: +```shell +git checkout -b MY_COOL_STUFF_BRANCH origin/master ``` -Then you can start to develop. ## Commit @@ -45,14 +48,14 @@ are the details if any. ## Keeping Fork Up to Date -Before pull your request, you shold sync you code from the latest PaddlePaddle. +Before creating a pull request, you should sync your code with the latest PaddlePaddle. To do this, you'll need to add a remote at first: ```shell # see the current configured remote repository git remote -v # add upstream repository -git remote add upstream https://github.com/paddle/paddle.git +git remote add upstream https://github.com/baidu/Paddle.git # verify the new upstream git remote -v ``` @@ -60,8 +63,7 @@ git remote -v Update your fork with the latest upstream changes: ```shell -git fetch upstream -git pull upstream master +git pull --rebase upstream HEAD ``` If there are no unique commits locally, git will simply perform a fast-forward. @@ -74,10 +76,26 @@ Now, your local master branch is up-to-date with everything modified upstream. ```shell # push to your repository in Github -git push origin master +git push origin HEAD ``` ## Pull Request Go to the page for your fork on GitHub, select your development branch, and click the **pull request button**. + +## Update your pull request with the latest version + +During the code review, your pull request may become stale because of new commits in +baidu/Paddle. GitHub allows automatic updates if there is no conflict. You can do this +by clicking the "Update Branch" button on your pull request page. However, in the case +of conflict, you need to do the update manually. You need to do the following on +your local repository: +```shell +git checkout MY_COOL_STUFF_BRANCH +git pull --rebase upstream HEAD +# You may need to resolve the conflict according to the git prompt. +# Make and test your code.
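+# (If the rebase stops on a conflict: fix the conflicted files, then run
+#  git add <the resolved files> && git rebase --continue)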
+git push -f origin HEAD +``` +Now your pull request is updated with the latest version. diff --git a/doc/build/docker_install.md b/doc/build/docker_install.md new file mode 100644 index 0000000000..3cd9d1730a --- /dev/null +++ b/doc/build/docker_install.md @@ -0,0 +1,91 @@ +Docker installation guide +==================== +PaddlePaddle provides some pre-compiled binaries, including Docker images and Ubuntu deb packages. Contributions of installation packages for other Linux distributions (such as Ubuntu, CentOS, Debian, Gentoo and so on) are welcome. We recommend using Docker images to deploy PaddlePaddle. +## Docker installation + +Docker is a tool designed to make it easier to create, deploy, and run applications by using containers. + +### PaddlePaddle Docker images +There are six Docker images: + +- paddledev/paddle:cpu-latest: PaddlePaddle CPU binary image. +- paddledev/paddle:gpu-latest: PaddlePaddle GPU binary image. +- paddledev/paddle:cpu-devel-latest: PaddlePaddle CPU binary image plus source code. +- paddledev/paddle:gpu-devel-latest: PaddlePaddle GPU binary image plus source code. +- paddledev/paddle:cpu-demo-latest: PaddlePaddle CPU binary image plus source code and demo +- paddledev/paddle:gpu-demo-latest: PaddlePaddle GPU binary image plus source code and demo + +Tags with latest will be replaced by a released version. + +### Download and Run Docker images + +You first have to install Docker on a machine with a Linux kernel version of 3.10+. You can refer to the official guide https://docs.docker.com/engine/installation/ for further information. + +You can use ```docker pull``` to download images first, or just launch a container with ```docker run```: +```bash +docker run -it paddledev/paddle:cpu-latest +``` + +If you want to launch a container with GPU support, you need to set some environment variables at the same time: + +```bash +export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')" +export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}') +docker run ${CUDA_SO} ${DEVICES} -it paddledev/paddle:gpu-latest +``` + +### Notice + +#### Performance + +Since Docker is based on lightweight virtual containers, CPU computing performance is maintained well. The GPU driver and devices are all mapped into the container, so GPU computing performance is not seriously affected either. + +If you use a high-performance NIC, such as RDMA (RoCE 40GbE or IB 56GbE) or Ethernet (10GbE), it is recommended to use the `--net=host` configuration. + + + + +#### Remote access +If you want to enable SSH access in the background, you need to build the image yourself. Please refer to the official guide https://docs.docker.com/engine/reference/builder/ for further information. + +The following is a simple Dockerfile with SSH: +```bash +FROM paddledev/paddle + +MAINTAINER PaddlePaddle dev team + +RUN apt-get update +RUN apt-get install -y openssh-server +RUN mkdir /var/run/sshd +RUN echo 'root:root' | chpasswd + +RUN sed -ri 's/^PermitRootLogin\s+.*/PermitRootLogin yes/' /etc/ssh/sshd_config +RUN sed -ri 's/UsePAM yes/#UsePAM yes/g' /etc/ssh/sshd_config + +EXPOSE 22 + +CMD ["/usr/sbin/sshd", "-D"] +``` + +Then you can build an image with the Dockerfile and launch a container: + +```bash +# cd into Dockerfile directory docker build .
-t paddle_ssh +# run container, and map host machine port 8022 to container port 22 +docker run -d -p 8022:22 --name paddle_ssh_machine paddle_ssh +``` +Now you can ssh into the container on port 8022; the username is root, and the password is also root: + +```bash +ssh -p 8022 root@YOUR_HOST_MACHINE +``` + + +You can stop and delete the container as follows: +```bash +# stop +docker stop paddle_ssh_machine +# delete +docker rm paddle_ssh_machine +``` diff --git a/doc/build/index.rst b/doc/build/index.rst index 2227234d77..d6d0d19e11 100644 --- a/doc/build/index.rst +++ b/doc/build/index.rst @@ -5,9 +5,11 @@ Install PaddlePaddle ---------------------- .. toctree:: + :maxdepth: 1 :glob: install_* + internal/install_from_jumbo.md Build from Source ----------------- @@ -15,20 +17,24 @@ If you want to hack and contribute PaddlePaddle source code, following guides can help you\: .. toctree:: + :maxdepth: 1 :glob: build_from_source.md contribute_to_paddle.md -Build Docker Images ------------------- -Note: The intallation packages are still in pre-release +Docker and Debian Package installation +-------------------------------------- +Note: The installation packages are still in pre-release state and your experience of installation may not be smooth. If you want to pack docker image, the following guide can help you\: .. toctree:: + :maxdepth: 1 :glob: - docker/* + docker_install.md + ubuntu_install.md + diff --git a/doc/build/ubuntu_install.md b/doc/build/ubuntu_install.md new file mode 100644 index 0000000000..c30a8f6db5 --- /dev/null +++ b/doc/build/ubuntu_install.md @@ -0,0 +1,21 @@ +Debian Package installation guide +================================= + +## Debian Package installation +Currently, PaddlePaddle only provides Ubuntu 14.04 deb packages. +There are two package versions, CPU and GPU. The download address is: + +https://github.com/baidu/Paddle/releases/tag/V0.8.0b0 + + +After downloading the PaddlePaddle deb packages, you can run: + +```bash +dpkg -i paddle-0.8.0b-cpu.deb +apt-get install -f +``` +If you use the GPU version of the deb package, you need to install the CUDA toolkit and cuDNN, and set related environment variables (such as LD_LIBRARY_PATH) first. It is normal for `dpkg -i` to report errors; `apt-get install -f` will continue installing paddle and its dependencies. + +**Note** + +The PaddlePaddle package only supports x86 CPUs with AVX instructions. If your CPU does not support AVX, you have to download the source code and build it yourself. diff --git a/doc/cluster/index.rst b/doc/cluster/index.rst index cf1ea97715..9062f85f98 100644 --- a/doc/cluster/index.rst +++ b/doc/cluster/index.rst @@ -5,3 +5,4 @@ Cluster Train :glob: opensource/cluster_train.md + internal/index.md diff --git a/doc/conf.py.in b/doc/conf.py.in index 8515042747..6c221f598b 100644 --- a/doc/conf.py.in +++ b/doc/conf.py.in @@ -23,6 +23,8 @@ AutoStructify = transform.AutoStructify # documentation root, use os.path.abspath to make it absolute, like shown here. sys.path.insert(0, '@PROJ_ROOT@/python') +templates_path = ["@PROJ_ROOT@/doc/templates"] + # -- Doxygen Settings breathe_projects = { 'paddle': '@PADDLE_DOXYGEN_OUTPUT@/xml' @@ -66,8 +68,6 @@ extensions = [ autodoc_member_order = 'bysource' -# Add any paths that contain templates here, relative to this directory. -templates_path = ['_templates'] # The suffix(es) of source filenames.
# You can specify multiple suffix as a list of string: diff --git a/doc/demo/embedding_model/index.md b/doc/demo/embedding_model/index.md index 45992ad856..06f3ff1f00 100644 --- a/doc/demo/embedding_model/index.md +++ b/doc/demo/embedding_model/index.md @@ -93,7 +93,7 @@ where `train.sh` is almost the same as `demo/seqToseq/translation/train.sh`, the - `--init_model_path`: path of the initialization model, here is `data/paraphrase_model` - `--load_missing_parameter_strategy`: operations when model file is missing, here use a normal distribution to initialize the other parameters except for the embedding layer -For users who want to understand the dataset format, model architecture and training procedure in detail, please refer to [Text generation Tutorial](text_generation.md). +For users who want to understand the dataset format, model architecture and training procedure in detail, please refer to [Text generation Tutorial](../text_generation/text_generation.md). ## Optional Function ## ### Embedding Parameters Observation diff --git a/doc/demo/image_classification/image_classification.md b/doc/demo/image_classification/image_classification.md index 069100d3a1..29cfc99702 100644 --- a/doc/demo/image_classification/image_classification.md +++ b/doc/demo/image_classification/image_classification.md @@ -1,4 +1,5 @@ -#Image Classification Tutorial +Image Classification Tutorial +============================== This tutorial will guide you through training a convolutional neural network to classify objects using the CIFAR-10 image classification dataset. As shown in the following figure, the convolutional neural network can recognize the main object in images, and output the classification result. @@ -172,7 +173,7 @@ python -m paddle.utils.plotcurve -i $log > plot.png - The script `plotcurve.py` requires the python module of `matplotlib`, so if it fails, maybe you need to install `matplotlib`. -After training finishes, the training and testing error curve will be saved to `plot.png` using `plotcurve.py` script. An example of the plot is shown below: +After training finishes, the training and testing error curves will be saved to `plot.png` using the `plotcurve.py` script. An example of the plot is shown below:
![Training and testing curves.](./plot.png)
diff --git a/doc/demo/imagenet_model/resnet_model.md b/doc/demo/imagenet_model/resnet_model.md index 76dddd1ec0..5403ab9f17 100644 --- a/doc/demo/imagenet_model/resnet_model.md +++ b/doc/demo/imagenet_model/resnet_model.md @@ -1,6 +1,6 @@ # Model Zoo - ImageNet # -[ImageNet](http://www.image-net.org/) is a popular dataset for generic object classification. This tutorial provided convolutional neural network(CNN) models for ImageNet. +[ImageNet](http://www.image-net.org/) is a popular dataset for generic object classification. This tutorial provides convolutional neural network (CNN) models for ImageNet. ## ResNet Introduction @@ -48,11 +48,11 @@ We present three ResNet models, which are converted from the models provided by ## ResNet Model -See ```demo/model_zoo/resnet/resnet.py```. This confgiure contains network of 50, 101 and 152 layers. You can specify layer number by adding argument like this ```--config_args=layer_num=50``` in command line arguments. +See ```demo/model_zoo/resnet/resnet.py```. This config contains networks of 50, 101 and 152 layers. You can specify the layer number by adding an argument like ```--config_args=layer_num=50``` to the command line arguments. ### Network Visualization -You can get a diagram of ResNet network by running the following command. The script generates dot file and then converts dot file to PNG file, which uses installed draw_dot tool in our server. If you can not access the server, just install graphviz to convert dot file. +You can get a diagram of the ResNet network by running the following commands. The script generates a dot file and then converts it to a PNG file, using the draw_dot tool installed on our server. If you cannot access the server, just install graphviz to convert the dot file. ``` cd demo/model_zoo/resnet @@ -165,7 +165,7 @@ We provide both C++ and Python interfaces to extract features. The following exa ### C++ Interface -First, specify image data list in `define_py_data_sources` in the config, see example `demo/model_zoo/resnet/resnet.py`. +First, specify the image data list in `define_py_data_sources2` in the config; see the example `demo/model_zoo/resnet/resnet.py`. ``` train_list = 'train.list' if not is_test else None @@ -190,8 +190,7 @@ Second, specify layers to extract features in `Outputs()` of `resnet.py`. For ex Outputs("res5_3_branch2c_conv", "res5_3_branch2c_bn") ``` -Third, specify model path and output directory in `extract_fea_c++.sh -`, and then run following commands +Third, specify the model path and output directory in `extract_fea_c++.sh`, and then run the following commands. ``` cd demo/model_zoo/resnet diff --git a/doc/demo/index.md b/doc/demo/index.md index 4d0e4554cb..289199d496 100644 --- a/doc/demo/index.md +++ b/doc/demo/index.md @@ -9,7 +9,7 @@ There are serveral examples and demos here. * [Sentiment Analysis](sentiment_analysis/index.rst) * [Text Generation](text_generation/index.rst) -* [Semantic Role Labeling](semantic_role_labeling/index.md) +* [Semantic Role Labeling](semantic_role_labeling/index.rst) ## Recommendation @@ -19,6 +19,3 @@ There are serveral examples and demos here.
## Model Zoo * [ImageNet: ResNet](imagenet_model/resnet_model.md) * [Embedding: Chinese Word](embedding_model/index.md) - -## Customization -* [Writing New Layers](new_layer/index.rst) diff --git a/doc/demo/quick_start/index_en.md b/doc/demo/quick_start/index_en.md index b537d8c834..ee3fa2a216 100644 --- a/doc/demo/quick_start/index_en.md +++ b/doc/demo/quick_start/index_en.md @@ -59,7 +59,7 @@ To build your text classification system, your code will need to perform five st ## Preprocess data into standardized format In this example, you are going to use [Amazon electronic product review dataset](http://jmcauley.ucsd.edu/data/amazon/) to build a bunch of deep neural network models for text classification. Each text in this dataset is a product review. This dataset has two categories: “positive” and “negative”. Positive means the reviewer likes the product, while negative means the reviewer does not like the product. -`demo/quick_start` provides scripts for downloading data and preprocessing data as shown below. The data process takes several minutes (about 3 minutes in our machine). +`demo/quick_start` in the [source code](https://github.com/baidu/Paddle) provides scripts for downloading data and preprocessing data as shown below. The data process takes several minutes (about 3 minutes in our machine). ```bash cd demo/quick_start @@ -157,9 +157,7 @@ define_py_data_sources2(train_list='data/train.list', obj="process", args={"dictionary": word_dict}) ``` - -You can refer to the following link for more detailed examples -: Python Use Case,The detailed documentation on data format is: PyDataProviderWrapper。 +You can refer to the following link for more detailed examples and data formats: PyDataProvider2. ## Network Architecture You will describe four kinds of network architectures in this section. @@ -425,7 +423,7 @@ paddle train \ mv rank-00000 result.txt ``` -There are several differences between training and inference network configurations. +User can choose the best model base on the training log instead of model `output/pass-00003`. There are several differences between training and inference network configurations. - You do not need labels during inference. - Outputs need to be specified to the classification probability layer (the output of softmax layer), or the id of maximum probability (`max_id` layer). An example to output the id and probability is given in the code snippet. - batch_size = 1. diff --git a/doc/demo/rec/ml_regression.rst b/doc/demo/rec/ml_regression.rst index 47bcef2d6d..0c14e4f5bb 100644 --- a/doc/demo/rec/ml_regression.rst +++ b/doc/demo/rec/ml_regression.rst @@ -219,9 +219,9 @@ The network structure shows below. The demo's neural network config file "trainer_config.py" show as below. -.. include:: ../../../demo/recommendation/trainer_config.py - :code: python - :literal: +.. literalinclude:: ../../../demo/recommendation/trainer_config.py + :language: python + :lines: 15- In this :code:`trainer_config.py`, we just map each feature type to a feature vector, following shows how to map each feature to a vector shows below. @@ -257,15 +257,15 @@ In these network, we use several api in `trainer_config_helpers * Text Convolution Pooling Layer, `text_conv_pool <../../ui/api/trainer_config_helpers/networks.html #trainer_config_helpers.networks.text_conv_pool>`_ -* Declare Python Data Sources, `define_py_data_sources +* Declare Python Data Sources, `define_py_data_sources2 <../../ui/api/trainer_config_helpers/data_sources.html>`_ Data Provider ''''''''''''' -.. 
include:: ../../../demo/recommendation/dataprovider.py - :code: python - :literal: +.. literalinclude:: ../../../demo/recommendation/dataprovider.py + :language: python + :lines: 15- The data provider just read the meta.bin and rating file, yield each sample for training. In this :code:`dataprovider.py`, we should set\: @@ -274,7 +274,7 @@ In this :code:`dataprovider.py`, we should set\: * use_seq\: Whether this :code:`dataprovider.py` in sequence mode or not. * process\: Return each sample of data to :code:`paddle`. -The data provider details document see `there <../../ui/DataProvider.html>`_. +The data provider details document see `there <../../ui/data_provider/pydataprovider2.html>`_. Train ````` @@ -283,15 +283,15 @@ After prepare data, config network, writting data provider, now we can run paddl The run.sh is shown as follow: -.. include:: ../../../demo/recommendation/run.sh - :code: bash - :literal: +.. literalinclude:: ../../../demo/recommendation/run.sh + :language: bash + :lines: 16- It just start a paddle training process, write the log to `log.txt`, then print it on screen. Each command line argument in :code:`run.sh`, please refer to the `command line -arguments `_ page. The short description of these arguments is shown as follow. +arguments <../../ui/index.html#command-line-argument>`_ page. The short description of these arguments is shown as follow. * config\: Tell paddle which file is neural network configuration. * save_dir\: Tell paddle save model into './output' @@ -303,8 +303,6 @@ arguments `_ page. The short description of these arguments is shown as fol * dot_period\: Print a :code:`.` after train :code:`dot_period` batches. * num_passes\: Train at most :code:`num_passes`. - - If training process starts successfully, the output likes follow: .. code-block:: text diff --git a/doc/demo/semantic_role_labeling/index.rst b/doc/demo/semantic_role_labeling/index.rst new file mode 100644 index 0000000000..ff3035059b --- /dev/null +++ b/doc/demo/semantic_role_labeling/index.rst @@ -0,0 +1,7 @@ +Semantic Role Labeling Tutorial +=============================== + +.. toctree:: + :maxdepth: 3 + + semantic_role_labeling.md diff --git a/doc/demo/semantic_role_labeling/index.md b/doc/demo/semantic_role_labeling/semantic_role_labeling.md similarity index 97% rename from doc/demo/semantic_role_labeling/index.md rename to doc/demo/semantic_role_labeling/semantic_role_labeling.md index 58b1b8abbc..05fbc8278d 100644 --- a/doc/demo/semantic_role_labeling/index.md +++ b/doc/demo/semantic_role_labeling/semantic_role_labeling.md @@ -1,6 +1,6 @@ -# Semantic Role Labelling Tutorial -Semantic role labeling (SRL) is a form of shallow semantic parsing whose goal is to discover the predicate-argument structure of each predicate in a given input sentence. SRL is useful as an intermediate step in a wide range of natural language processing tasks, such as information extraction. automatic document categorization and question answering. An instance is as following [1]: +# Semantic Role labeling Tutorial # +Semantic role labeling (SRL) is a form of shallow semantic parsing whose goal is to discover the predicate-argument structure of each predicate in a given input sentence. SRL is useful as an intermediate step in a wide range of natural language processing tasks, such as information extraction. automatic document categorization and question answering. An instance is as following [1]: [ A0 He ] [ AM-MOD would ][ AM-NEG n’t ] [ V accept] [ A1 anything of value ] from [A2 those he was writing about ]. 
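The tutorial treats the SRL task as a sequence labelling problem; to make that view concrete, the sketch below flattens the bracketed annotation above into per-token labels. The IOB-style tags are an assumption for illustration and are not necessarily the label scheme used by the demo's data.

```python
# Illustration only: one way to express the bracketed SRL annotation above
# as per-token sequence labels (IOB scheme assumed, not the demo's format).
tokens = ['He', 'would', "n't", 'accept', 'anything', 'of', 'value']
labels = ['B-A0', 'B-AM-MOD', 'B-AM-NEG', 'B-V', 'B-A1', 'I-A1', 'I-A1']

for token, label in zip(tokens, labels):
    print('%s\t%s' % (token, label))
```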
@@ -12,12 +12,10 @@ Semantic role labeling (SRL) is a form of shallow semantic parsing whose goal is - AM-MOD: modal - AM-NEG: negation - Given the verb "accept", the chunks in sentence would play certain semantic roles. Here, the label scheme is from Penn Proposition Bank. To this date, most of the successful SRL systems are built on top of some form of parsing results where pre-defined feature templates over the syntactic structure are used. This tutorial will present an end-to-end system using deep bidirectional long short-term memory (DB-LSTM)[2] for solving the SRL task, which largely outperforms the previous state-of-the-art systems. The system regards SRL task as the sequence labelling problem. - ## Data Description The relevant paper[2] takes the data set in CoNLL-2005&2012 Shared Task for training and testing. Accordingto data license, the demo adopts the test data set of CoNLL-2005, which can be reached on website. @@ -36,7 +34,6 @@ src.dict:the dictionary of words in sentences tgt.dict:the labels dictionary feature: the extracted features from data set ``` - ## Training ### DB-LSTM @@ -49,8 +46,6 @@ The following figure shows a temporal expanded 2-layer DB-LSTM network. ![pic](./network_arch.png) - - ### Features Two input features play an essential role in this pipeline: predicate (pred) and argument (argu). Two other features: predicate context (ctx-p) and region mark (mr) are also adopted. Because a single predicate word can not exactly describe the predicate information, especially when the same words appear more than one times in a sentence. With the predicate context, the ambiguity can be largely eliminated. Similarly, we use region mark mr = 1 to denote the argument position if it locates in the predicate context region, or mr = 0 if does not. These four simple features are all we need for our SRL system. Features of one sample with context size set to 1 is showed as following[2]:
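As a concrete illustration of these four features, a minimal sketch of one way to extract them is shown below; the function name and the tuple representation are assumptions for illustration, not the demo's actual code.

```python
# Illustrative sketch (not the demo's code) of the four SRL features above:
# argument (argu), predicate (pred), predicate context (ctx-p) and region
# mark (mr), with the context size set to 1.
def extract_features(tokens, pred_idx, ctx_size=1):
    pred = tokens[pred_idx]
    lo = max(0, pred_idx - ctx_size)
    hi = min(len(tokens), pred_idx + ctx_size + 1)
    ctx_p = ' '.join(tokens[lo:hi])       # words around the predicate
    samples = []
    for i, argu in enumerate(tokens):
        mr = 1 if lo <= i < hi else 0     # 1 if inside the context region
        samples.append((argu, pred, ctx_p, mr))
    return samples

# Predicate "accept" (index 3) in the example sentence used earlier.
for sample in extract_features(
        ['He', 'would', "n't", 'accept', 'anything', 'of', 'value'], 3):
    print(sample)
```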
@@ -130,7 +125,6 @@ paddle train \ 2>&1 | tee 'train.log' ``` - - \--config=./db_lstm.py : network config file. - \--save_di=./output: output path to save models. - \--trainer_count=4 : set thread number (or GPU count). @@ -183,12 +177,7 @@ python predict.py After prediction, the result is saved in `predict.res`. - - - - ## Reference [1] Martha Palmer, Dan Gildea, and Paul Kingsbury. The Proposition Bank: An Annotated Corpus of Semantic Roles , Computational Linguistics, 31(1), 2005. [2] Zhou, Jie, and Wei Xu. "End-to-end learning of semantic role labeling using recurrent neural networks." Proceedings of the Annual Meeting of the Association for Computational Linguistics. 2015. - diff --git a/doc/source/math/utils/utils.rst b/doc/source/math/utils/utils.rst index e00dc6229c..3df721a47b 100644 --- a/doc/source/math/utils/utils.rst +++ b/doc/source/math/utils/utils.rst @@ -1,10 +1,6 @@ Utils ======= -Bits -------- -.. doxygenfile:: paddle/math/Bits.h - Memory Handle -------------- .. doxygenfile:: paddle/math/MemoryHandle.h diff --git a/doc/templates/layout.html b/doc/templates/layout.html new file mode 100644 index 0000000000..47329c2a92 --- /dev/null +++ b/doc/templates/layout.html @@ -0,0 +1,16 @@ +{# layout.html #} +{# Import the theme's layout. #} +{% extends "!layout.html" %} + + +{%- block extrahead %} + +{% endblock %} diff --git a/doc/ui/api/trainer_config_helpers/activations.rst b/doc/ui/api/trainer_config_helpers/activations.rst index 294f6e4d31..c4e14ed779 100644 --- a/doc/ui/api/trainer_config_helpers/activations.rst +++ b/doc/ui/api/trainer_config_helpers/activations.rst @@ -12,6 +12,13 @@ AbsActivation :members: AbsActivation :noindex: +ExpActivation +=============== + +.. automodule:: paddle.trainer_config_helpers.activations + :members: ExpActivation + :noindex: + IdentityActivation ================== diff --git a/doc/ui/api/trainer_config_helpers/layers.rst b/doc/ui/api/trainer_config_helpers/layers.rst index a09d5e3d4d..c1d7a7ce81 100644 --- a/doc/ui/api/trainer_config_helpers/layers.rst +++ b/doc/ui/api/trainer_config_helpers/layers.rst @@ -82,12 +82,6 @@ img_cmrnorm_layer :members: img_cmrnorm_layer :noindex: -img_rnorm_layer ------------------ -.. automodule:: paddle.trainer_config_helpers.layers - :members: img_rnorm_layer - :noindex: - batch_norm_layer --------------------- .. automodule:: paddle.trainer_config_helpers.layers @@ -175,6 +169,12 @@ dotmul_projection :members: dotmul_projection :noindex: +dotmul_operator +--------------- +.. automodule:: paddle.trainer_config_helpers.layers + :members: dotmul_operator + :noindex: + full_matrix_projection ---------------------- .. automodule:: paddle.trainer_config_helpers.layers @@ -251,10 +251,10 @@ addto_layer :members: addto_layer :noindex: -convex_comb_layer +linear_comb_layer ----------------- .. automodule:: paddle.trainer_config_helpers.layers - :members: convex_comb_layer + :members: linear_comb_layer :noindex: interpolation_layer @@ -286,7 +286,13 @@ tensor_layer .. automodule:: paddle.trainer_config_helpers.layers :members: tensor_layer :noindex: - + +cos_sim +------- +.. automodule:: paddle.trainer_config_helpers.layers + :members: cos_sim + :noindex: + trans_layer ------------ .. automodule:: paddle.trainer_config_helpers.layers @@ -347,12 +353,6 @@ rank_cost :members: rank_cost :noindex: -cos_sim -------- -.. automodule:: paddle.trainer_config_helpers.layers - :members: cos_sim - :noindex: - crf_layer ----------------- .. 
automodule:: paddle.trainer_config_helpers.layers diff --git a/doc/ui/api/trainer_config_helpers/optimizers.rst b/doc/ui/api/trainer_config_helpers/optimizers.rst index 3c683914f4..b487fec64c 100644 --- a/doc/ui/api/trainer_config_helpers/optimizers.rst +++ b/doc/ui/api/trainer_config_helpers/optimizers.rst @@ -4,6 +4,12 @@ BaseSGDOptimizer :members: BaseSGDOptimizer :noindex: +MomentumOptimizer +================= +.. automodule:: paddle.trainer_config_helpers.optimizers + :members: MomentumOptimizer + :noindex: + AdamOptimizer ============= .. automodule:: paddle.trainer_config_helpers.optimizers diff --git a/doc/ui/data_provider/index.rst b/doc/ui/data_provider/index.rst index db890c2ab1..3db5b57376 100644 --- a/doc/ui/data_provider/index.rst +++ b/doc/ui/data_provider/index.rst @@ -1,5 +1,5 @@ -PaddlePaddle DataProvider Introduction -================================ +DataProvider Introduction +========================= DataProvider is a module that loads training or testing data into cpu or gpu memory for the following triaining or testing process. @@ -10,7 +10,7 @@ customized, with sacrificing the efficiency only a little. This is extremly useful when you have to dynamically generate certain kinds of data according to, for example, the training performance. -Besides, users also can also customize a C++ :code:`DataProvider` for a more +Besides, users also can customize a C++ :code:`DataProvider` for a more complex usage, or for a higher efficiency. The following parameters are required to define in the PaddlePaddle network diff --git a/doc/ui/data_provider/pydataprovider2.rst b/doc/ui/data_provider/pydataprovider2.rst index c4897a7bfc..e105d3be30 100644 --- a/doc/ui/data_provider/pydataprovider2.rst +++ b/doc/ui/data_provider/pydataprovider2.rst @@ -17,24 +17,23 @@ how to write a simple PyDataProvider. MNIST is a handwriting classification data set. It contains 70,000 digital grayscale images. Labels of the training sample range from 0 to 9. All the -images have been size-normalized and centered into images with a same size +images have been size-normalized and centered into images with the same size of 28 x 28 pixels. -A small part of the original data as an example can be found in the path below: +A small part of the original data as an example is shown as below: .. literalinclude:: ../../../doc_cn/ui/data_provider/mnist_train.txt -Each line of the data contains two parts, separated by ';'. The first part is +Each line of the data contains two parts, separated by :code:`;`. The first part is label of an image. The second part contains 28x28 pixel float values. Just write path of the above data into train.list. It looks like this: .. literalinclude:: ../../../doc_cn/ui/data_provider/train.list -The corresponding dataprovider can be found in the path below: +The corresponding dataprovider is shown as below: .. literalinclude:: ../../../doc_cn/ui/data_provider/mnist_provider.py - : linenos: The first line imports PyDataProvider2 package. The main function is the process function, that has two parameters. @@ -45,8 +44,8 @@ This parameter is passed to the process function by PaddlePaddle. :code:`@provider` is a Python `Decorator `_ . It sets some properties to DataProvider, and constructs a real PaddlePaddle -DataProvider from a very sample user implemented python function. It does not -matter if you are not familiar with `Decorator`_. You can keep it sample by +DataProvider from a very simple user implemented python function. It does not +matter if you are not familiar with `Decorator`_. 
You can keep it simple by just taking :code:`@provider` as a fixed mark above the provider function you implemented. @@ -59,9 +58,9 @@ document of `input_types`_ for more details. The process method is the core part to construct a real DataProvider in PaddlePaddle. It implements how to open the text file, how to read one sample -from the original text file, converted them into `input_types`_, and give them +from the original text file, convert them into `input_types`_, and give them back to PaddlePaddle process at line 23. -Note that data yields by the process function must follow a same order that +Note that data yielded by the process function must follow the same order that `input_types`_ are defined. @@ -75,7 +74,20 @@ you can take this as an example. .. literalinclude:: ../../../doc_cn/ui/data_provider/mnist_config.py -Here we specify training data by 'train.list', and no testing data is specified. +Here we specify training data by :code:`train.list`, and no testing data is specified. +The method that actually provides data is :code:`process`. + +Users can also use another style to provide data, which specifies the +:code:`data_layer`'s name explicitly in each `yield`. For example, +such a :code:`dataprovider` is shown below. + +.. literalinclude:: ../../../doc_cn/ui/data_provider/mnist_provider.dict.py + :linenos: + +If the user does not give the :code:`data_layer`'s name, PaddlePaddle will roughly +use the order of the :code:`data_layer` definitions to determine which feature goes to +which :code:`data_layer`. This order may be incorrect, so DEFINING THE +:code:`data_layer`'s NAMES EXPLICITLY IS THE RECOMMENDED WAY TO PROVIDE DATA. Now, this simple example of using PyDataProvider is finished. The only thing that the user should know is how to generate **one sample** from @@ -94,7 +106,7 @@ DataProvider for the sequential model ------------------------------------- A sequence model takes sequences as its input. A sequence is made up of several timesteps. The so-called timestep, is not necessary to have something to do -with 'time'. It can also be explained to that the order of data are taken into +with time. It can also be understood to mean that the order of data is taken into consideration in model design and training. For example, the sentence can be interpreted as a kind of sequence data in NLP tasks. @@ -111,7 +123,7 @@ The corresponding data provider can be found in the path below: .. literalinclude:: ../../../doc_cn/ui/data_provider/sentimental_provider.py -This data provider for sequential model is a little bit complex than that +This data provider for the sequential model is a little more complex than that for MNIST dataset. A new initialization method is introduced here. The method :code:`on_init` is configured to DataProvider by :code:`@provider`'s @@ -153,49 +165,29 @@ Please refer to the following section reference for details. Reference --------- -.. _@provider:: @provider +++++++++ -'@provider' is a Python `Decorator`_, it can construct a PyDataProvider in -PaddlePaddle from a user defined function. Its parameters are: - -* `input_types`_ defines format of the data input. -* should_shuffle defines whether to shuffle data or not. By default, it is set - true during training, and false during testing. -* pool_size is the memory pool size (in sample number) in DataProvider. - -1 means no limit. -* can_over_batch_size defines whether PaddlePaddle can store little more - samples than pool_size. It is better to set True to avoid some deadlocks. 
-* calc_batch_size is a function define how to calculate batch size. This is - usefull in sequential model, that defines batch size is counted upon sequence - or token. By default, each sample or sequence counts to 1 when calculating - batch size. -* cache is a data cache strategy, see `cache`_ -* Init_hook function is invoked once the data provider is initialized, - see `init_hook`_ - -.. _input_types:: +.. autofunction:: paddle.trainer.PyDataProvider2.provider + input_types +++++++++++ PaddlePaddle has four data types, and three sequence types. The four data types are: -* dense_vector represents dense float vector. -* sparse_binary_vector sparse binary vector, most of the value is 0, and +* :code:`dense_vector`: dense float vector. +* :code:`sparse_binary_vector`: sparse binary vector, most of the value is 0, and the non zero elements are fixed to 1. -* sparse_float_vector sparse float vector, most of the value is 0, and some - non zero elements that can be any float value. They are given by the user. -* integer represents an integer scalar, that is especially used for label or - word index. - +* :code:`sparse_float_vector`: sparse float vector, most of the value is 0, and some + non zero elements can be any float value. They are given by the user. +* :code:`integer`: an integer scalar, that is especially used for label or word index. -The three sequence types are +The three sequence types are: -* SequenceType.NO_SEQUENCE means the sample is not a sequence -* SequenceType.SEQUENCE means the sample is a sequence -* SequenceType.SUB_SEQUENCE means it is a nested sequence, that each timestep of +* :code:`SequenceType.NO_SEQUENCE` means the sample is not a sequence. +* :code:`SequenceType.SEQUENCE` means the sample is a sequence. +* :code:`SequenceType.SUB_SEQUENCE` means it is a nested sequence, that each timestep of the input sequence is also a sequence. Different input type has a defferenct input format. Their formats are shown @@ -215,36 +207,39 @@ in the above table. where f represents a float value, i represents an integer value. -.. _init_hook:: -.. _settings:: init_hook +++++++++ init_hook is a function that is invoked once the data provoder is initialized. Its parameters lists as follows: -* The first parameter is a settings object, which is the same to :code:'settings' - in :code:`process` method. The object contains several attributes, including: - * settings.input_types the input types. Reference `input_types`_ - * settings.logger a logging object +* The first parameter is a settings object, which is the same to :code:`settings` + in :code:`process` method. The object contains several attributes, including: + + * :code:`settings.input_types`: the input types. Reference `input_types`_. + * :code:`settings.logger`: a logging object. + * The rest parameters are the key word arguments. It is made up of PaddpePaddle pre-defined parameters and user defined parameters. - * PaddlePaddle defines parameters including: - * is_train is a bool parameter that indicates the DataProvider is used in - training or testing - * file_list is the list of all files. + + * PaddlePaddle-defined parameters including: + + * :code:`is_train` is a bool parameter that indicates the DataProvider is used in + training or testing. + * :code:`file_list` is the list of all files. + * User-defined parameters args can be set in training configuration. 
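For example, an :code:`init_hook` that uses these parameters might look like the sketch below; the log message and the trivial provider body are illustrative assumptions, not code from the demos.

.. code-block:: python

    # A minimal init_hook sketch; accepting **kwargs keeps the hook
    # compatible if PaddlePaddle adds new pre-defined parameters later.
    from paddle.trainer.PyDataProvider2 import provider, integer_value

    def my_init_hook(settings, is_train, file_list, **kwargs):
        settings.logger.info("%d data files, is_train=%s" %
                             (len(file_list), is_train))

    @provider(input_types=[integer_value(10)], init_hook=my_init_hook)
    def process(settings, filename):
        # Illustrative provider: each line of the file is one integer label.
        for line in open(filename):
            yield [int(line)]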
Note, PaddlePaddle reserves the right to add pre-defined parameter, so please use :code:`**kwargs` in init_hook to ensure compatibility by accepting the parameters which your init_hook does not use. -.. _cache :: cache +++++ -DataProvider provides two simple cache strategy. They are -* CacheType.NO_CACHE means do not cache any data, then data is read runtime by +DataProvider provides two simple cache strategy. They are: + +* :code:`CacheType.NO_CACHE` means do not cache any data, then data is read at runtime by the user implemented python module every pass. -* CacheType.CACHE_PASS_IN_MEM means the first pass reads data by the user +* :code:`CacheType.CACHE_PASS_IN_MEM` means the first pass reads data by the user implemented python module, and the rest passes will directly read data from memory. diff --git a/doc/ui/index.md b/doc/ui/index.md index 829994d56b..9c1ba27bdc 100644 --- a/doc/ui/index.md +++ b/doc/ui/index.md @@ -7,7 +7,7 @@ ## API Reference -* [Trainer Config Helpers](api/trainer_config_helpers/index.md) +* [Model Config Interface](api/trainer_config_helpers/index.md) ## Command Line Argument diff --git a/doc/ui/predict/predict_sample.py b/doc/ui/predict/predict_sample.py index ac16b2b48b..d55d2c730d 100644 --- a/doc/ui/predict/predict_sample.py +++ b/doc/ui/predict/predict_sample.py @@ -12,8 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -from py_paddle import swig_paddle, DataProviderWrapperConverter -from paddle.trainer.PyDataProviderWrapper import DenseSlot +from py_paddle import swig_paddle, DataProviderConverter +from paddle.trainer.PyDataProvider2 import dense_vector from paddle.trainer.config_parser import parse_config TEST_DATA = [[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, @@ -89,12 +89,12 @@ TEST_DATA = [[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, def main(): - conf = parse_config("./mnist_model/trainer_config.conf.norm", "") + conf = parse_config("./mnist_model/trainer_config.py", "") print conf.data_config.load_data_args network = swig_paddle.GradientMachine.createFromConfigProto(conf.model_config) assert isinstance(network, swig_paddle.GradientMachine) # For code hint. network.loadParameters("./mnist_model/") - converter = DataProviderWrapperConverter(False, [DenseSlot(784)]) + converter = DataProviderConverter([dense_vector(784)]) inArg = converter(TEST_DATA) print network.forwardTest(inArg) diff --git a/doc/ui/predict/swig_py_paddle_en.rst b/doc/ui/predict/swig_py_paddle_en.rst index e22d0bff33..b743fc4569 100644 --- a/doc/ui/predict/swig_py_paddle_en.rst +++ b/doc/ui/predict/swig_py_paddle_en.rst @@ -10,27 +10,35 @@ SWIG. The main steps of predict values in python are: * Predict Here is a sample python script that shows the typical prediction process for the -MNIST classification problem. +MNIST classification problem. A complete sample code could be found at +:code:`src_root/doc/ui/predict/predict_sample.py`. .. literalinclude:: ./predict_sample.py :language: python - :linenos: + :lines: 15-18,90-100,101-104 The module that does the most of the job is py_paddle.swig_paddle, it's generated by SWIG and has complete documents, for more details you can use python's :code:`help()` function. Let's walk through the above python script: -* At the beginning, initialize PaddlePaddle with command line arguments(line 90). -* Parse the configuration file that is used in training(line 93). 
-* Create a neural network at line 95 according the parsed configuration, then - load the trained parameters from model at line 97. -* A utility class for data transformation is created at line 98. +* At the beginning, use :code:`swig_paddle.initPaddle()` to initialize + PaddlePaddle with command line arguments; for more about command line arguments, + see `Command Line Arguments <../cmd_argument/detail_introduction.html>`_. +* Parse the configuration file that is used in training with :code:`parse_config()`. + Because the data to predict usually has no label, and the prediction output is + normally the output layer rather than the cost layer, you should modify + the configuration file accordingly before using it for prediction. +* Create a neural network with + :code:`swig_paddle.GradientMachine.createFromConfigProto()`, which takes the + parsed configuration :code:`conf.model_config` as its argument. Then load the + trained parameters from the model with :code:`network.loadParameters()`. +* Create a data converter object of the utility class :code:`DataProviderConverter`. - Note: As swig_paddle can only accept C++ matrices, we offer a utility - class DataProviderWraaperConverter that can accept the same input data with - PyDataProviderWrapper, for more information please refer to document - of `PyDataProviderWrapper <../py_data_provider_wrapper_api.html>`_. -* Do the prediction and output the result at line 100, forwardTest is another - utility class that directly takes the activations of the output layer. + class DataProviderConverter that accepts the same input data as + PyDataProvider2; for more information please refer to the documentation + of `PyDataProvider2 <../data_provider/pydataprovider2.html>`_. +* Do the prediction with :code:`forwardTest()`, which takes the converted + input data and outputs the activations of the output layer. Here is a typical output: diff --git a/doc_cn/algorithm/rnn/rnn.rst b/doc_cn/algorithm/rnn/rnn.rst deleted file mode 100644 index f073ac4e20..0000000000 --- a/doc_cn/algorithm/rnn/rnn.rst +++ /dev/null @@ -1,7 +0,0 @@ -RNN 配置 -======== - -.. toctree:: - :maxdepth: 3 - -* `RNN配置 <../../../doc/algorithm/rnn/rnn.html>`_ diff --git a/doc_cn/build_and_install/index.rst b/doc_cn/build_and_install/index.rst index 80cb31fe0f..e21fc98c63 100644 --- a/doc_cn/build_and_install/index.rst +++ b/doc_cn/build_and_install/index.rst @@ -1,7 +1,19 @@ 编译与安装 ======================== -.. toctree:: - - install/index.rst - cmake/index.rst +PaddlePaddle提供数个预编译的二进制来进行安装,包括Docker镜像,ubuntu的deb安装包等。我们推荐使用Docker镜像来部署环境,同时欢迎贡献更多的安装包。 + +Note: The installation packages are still in pre-release state and your experience of installation may not be smooth. + +注意:目前PaddlePaddle的安装包还处在pre-release的状态,使用起来或许会不是很顺畅。 + +.. 
toctree:: + :maxdepth: 1 + :glob: + + 源码下载(对内) <../build/internal/download_paddle_source_zh_cn.rst> + 使用Jumbo安装(对内) <../build/internal/install_from_jumbo.rst> + 从源码编译安装(对内) <../build/internal/build_from_source_zh_cn.rst> + install/docker_install.rst + install/ubuntu_install.rst + cmake/index.rst diff --git a/doc_cn/build_and_install/install/index.rst b/doc_cn/build_and_install/install/index.rst deleted file mode 100644 index ce463728c7..0000000000 --- a/doc_cn/build_and_install/install/index.rst +++ /dev/null @@ -1,15 +0,0 @@ -安装PaddlePaddle -========== - -PaddlePaddle提供数个预编译的二进制来进行安装。他们包括Docker镜像,ubuntu的deb安装包等 -。欢迎贡献更多的安装包。我们更推荐使用Docker镜像来部署PaddlePaddle环境。 - -Note: The intallation packages are still in pre-release -state and your experience of installation may not be smooth. - -注意!目前PaddlePaddle的安装包还处在pre-release的状态, -使用起来或许会不是很顺畅。 - -.. toctree:: - docker_install.rst - ubuntu_install.rst diff --git a/doc_cn/build_and_install/install/ubuntu_install.rst b/doc_cn/build_and_install/install/ubuntu_install.rst index e9e3bf28d7..7cdd470677 100644 --- a/doc_cn/build_and_install/install/ubuntu_install.rst +++ b/doc_cn/build_and_install/install/ubuntu_install.rst @@ -4,10 +4,8 @@ PaddlePaddle目前支持ubuntu 14.04版本使用deb包安装。更多的安装包PaddlePaddle会在近期提供。 欢迎大家贡献各个发行版的安装包(例如,ubuntu,centos,debian,gentoo)。 -PaddlePaddle的ubuntu安装包分为两个版本,即CPU版本,和GPU版本,他们的下载地址是: - -* CPU版本的PaddlePaddle安装包: TBD -* GPU版本的PaddlePaddle安装包: TBD +PaddlePaddle的ubuntu安装包分为两个版本,即CPU版本,和GPU版本,他们的下载地址是\: +https://github.com/baidu/Paddle/releases/tag/V0.8.0b0 需要注意的是,目前PaddlePaddle的安装包只支持 `AVX `_ @@ -21,8 +19,10 @@ PaddlePaddle的ubuntu安装包分为两个版本,即CPU版本,和GPU版本 dpkg -i paddle-0.8.0b-cpu.deb apt-get install -f -需要注意的是,如果使用GPU版本的PaddlePaddle,请安装CUDA 7.5 和CUDNN 5到本地环境中,并 -设置好对应的环境变量(LD_LIBRARY_PATH等等)。 +在 :code:`dpkg -i` 的时候如果报一些依赖未找到的错误是正常的, +在 :code:`apt-get install -f` 里会继续安装 PaddlePaddle。 +需要注意的是,如果使用GPU版本的PaddlePaddle,请安装CUDA 7.5 和CUDNN 5到本地环境中, +并设置好对应的环境变量(LD_LIBRARY_PATH等等)。 可能遇到的问题 -------------- diff --git a/doc_cn/cluster/index.rst b/doc_cn/cluster/index.rst index 16c1f0e37b..25313a9635 100644 --- a/doc_cn/cluster/index.rst +++ b/doc_cn/cluster/index.rst @@ -1,4 +1,11 @@ 集群训练 ======== -参见 `集群训练 <../../doc/cluster/index.html>`_ +* `集群训练 <../../doc/cluster/index.html>`_ + +.. toctree:: + :maxdepth: 2 + :glob: + + 集群训练(对内) + diff --git a/doc_cn/conf.py.in b/doc_cn/conf.py.in index e1c63cf9f1..391f7981ea 100644 --- a/doc_cn/conf.py.in +++ b/doc_cn/conf.py.in @@ -22,6 +22,7 @@ AutoStructify = transform.AutoStructify # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. sys.path.insert(0, '@PROJ_ROOT@/python') +templates_path = ["@PROJ_ROOT@/doc/templates"] # -- General configuration ------------------------------------------------ @@ -51,9 +52,6 @@ table_styling_embed_css = True autodoc_member_order = 'bysource' -# Add any paths that contain templates here, relative to this directory. -templates_path = ['_templates'] - # The suffix(es) of source filenames. 
# You can specify multiple suffix as a list of string: # source_suffix = ['.rst', '.md'] diff --git a/doc_cn/demo/embedding_model/index.md b/doc_cn/demo/embedding_model/index.md deleted file mode 100644 index 5894a4de5a..0000000000 --- a/doc_cn/demo/embedding_model/index.md +++ /dev/null @@ -1 +0,0 @@ -# Embedding Demo diff --git a/doc_cn/demo/image_classification/index.rst b/doc_cn/demo/image_classification/index.rst deleted file mode 100644 index 98cbdc29b9..0000000000 --- a/doc_cn/demo/image_classification/index.rst +++ /dev/null @@ -1,4 +0,0 @@ -图片分类教程 -============ - -TBD diff --git a/doc_cn/demo/imagenet_model/index.md b/doc_cn/demo/imagenet_model/index.md deleted file mode 100644 index b54b28401c..0000000000 --- a/doc_cn/demo/imagenet_model/index.md +++ /dev/null @@ -1,2 +0,0 @@ -# Resnet - TBD diff --git a/doc_cn/demo/index.rst b/doc_cn/demo/index.rst index f7f6185cf3..71f54bc18f 100644 --- a/doc_cn/demo/index.rst +++ b/doc_cn/demo/index.rst @@ -21,5 +21,6 @@ 常用模型 '''''''' + * `ImageNet: ResNet <../../doc/demo/imagenet_model/resnet_model.html>`_ * `Embedding: Chinese Word <../../doc/demo/embedding_model/index.html>`_ diff --git a/doc_cn/demo/quick_start/index.md b/doc_cn/demo/quick_start/index.md index e799e454f2..aa6b66ca8c 100644 --- a/doc_cn/demo/quick_start/index.md +++ b/doc_cn/demo/quick_start/index.md @@ -4,7 +4,7 @@ ## 安装(Install) -首先请参考安装教程安装PaddlePaddle。 +首先请参考安装教程安装PaddlePaddle。 ## 使用概述(Overview) @@ -32,7 +32,7 @@ ## 数据格式准备(Data Preparation) 在本问题中,我们使用[Amazon电子产品评论数据](http://jmcauley.ucsd.edu/data/amazon/), -将评论分为好评(正样本)和差评(负样本)两类。`demo/quick_start`里提供了数据下载脚本 +将评论分为好评(正样本)和差评(负样本)两类。[源码](https://github.com/baidu/Paddle)的`demo/quick_start`里提供了数据下载脚本 和预处理脚本。 ```bash @@ -134,8 +134,8 @@ define_py_data_sources2(train_list='data/train.list', * obj="process": 指定生成数据的函数 * args={"dictionary": word_dict}: 额外的参数,这里指定词典 -更详细用例请参考文档Python Use Case, -数据格式和详细文档请参考 +更详细用例请参考文档Python Use Case, +数据格式和详细文档请参考 PyDataProviderWrapper。 ## 网络结构(Network Architecture) @@ -143,8 +143,8 @@ PyDataProviderWrapper。
![](./PipelineNetwork.jpg)
我们将以基本的逻辑回归网络作为起点,并逐渐展示更加深入的功能。更详细的网络配置 -连接请参考Layer文档。 -所有配置在`demo/quick_start`目录,首先列举逻辑回归网络。 +连接请参考Layer文档。 +所有配置在[源码](https://github.com/baidu/Paddle)`demo/quick_start`目录,首先列举逻辑回归网络。 ### 逻辑回归模型(Logistic Regression) @@ -350,7 +350,7 @@ lstm = simple_lstm(input=emb, size=lstm_size)
## 优化算法(Optimization Algorithm) -优化算法包括 +优化算法包括 Momentum, RMSProp,AdaDelta,AdaGrad,ADAM,Adamax等,这里采用Adam优化方法,加了L2正则和梯度截断。 ```python @@ -375,7 +375,7 @@ paddle train \ --num_passes=15 \ --use_gpu=false ``` -这里没有介绍多机分布式训练,可以参考分布式训练的demo学习如何进行多机训练。 +这里没有介绍多机分布式训练,可以参考分布式训练的demo学习如何进行多机训练。 ## 预测(Prediction) 可以使用训练好的模型评估带有label的验证集,也可以预测没有label的测试集。 @@ -407,7 +407,7 @@ paddle train \ mv rank-00000 result.txt ``` -与训练网络配置不同的是:无需label相关的层,指定outputs输出概率层(softmax输出), +这里以`output/pass-00003`为例进行预测,用户可以根据训练log选择test结果最好的模型来预测。与训练网络配置不同的是:无需label相关的层,指定outputs输出概率层(softmax输出), 指定batch_size=1,数据传输无需label数据,预测数据指定test_list的位置。 预测结果以文本的形式保存在`result.txt`中,一行为一个样本,格式如下: diff --git a/doc_cn/demo/semantic_role_labeling/index.md b/doc_cn/demo/semantic_role_labeling/index.md deleted file mode 100644 index a1594577bb..0000000000 --- a/doc_cn/demo/semantic_role_labeling/index.md +++ /dev/null @@ -1,2 +0,0 @@ -# 语义标注 -TBD diff --git a/doc_cn/demo/sentiment_analysis/index.md b/doc_cn/demo/sentiment_analysis/index.md deleted file mode 100644 index d95f2803a4..0000000000 --- a/doc_cn/demo/sentiment_analysis/index.md +++ /dev/null @@ -1,2 +0,0 @@ -# 情感分析 -TBD diff --git a/doc_cn/demo/text_generation/index.rst b/doc_cn/demo/text_generation/index.rst deleted file mode 100644 index 147b776465..0000000000 --- a/doc_cn/demo/text_generation/index.rst +++ /dev/null @@ -1,3 +0,0 @@ -文本生成 -======== -TBD diff --git a/doc_cn/dev/new_layer/index.rst b/doc_cn/dev/new_layer/index.rst deleted file mode 100644 index aafeceff5b..0000000000 --- a/doc_cn/dev/new_layer/index.rst +++ /dev/null @@ -1,4 +0,0 @@ -新写Layer -========= - -* `新写Layer <../../../doc/dev/new_layer/index.html>`_ diff --git a/doc_cn/index.md b/doc_cn/index.md deleted file mode 100644 index f21f60e146..0000000000 --- a/doc_cn/index.md +++ /dev/null @@ -1,19 +0,0 @@ -PaddlePaddle文档 -================ - -使用指南 --------- -* [快速入门](demo/quick_start/index.md) -* [编译与安装](build_and_install/index.rst) -* [用户接口](ui/index.rst) -* [使用示例](demo/index.rst) -* [模型配置](ui/model.rst) -* [集群训练](cluster/index.rst) - -开发指南 --------- -* [新写Layer](dev/new_layer/index.rst) - -算法教程 --------- -* [RNN配置](algorithm/rnn/rnn.rst) diff --git a/doc_cn/index.rst b/doc_cn/index.rst new file mode 100644 index 0000000000..6cf5588b5b --- /dev/null +++ b/doc_cn/index.rst @@ -0,0 +1,19 @@ +PaddlePaddle文档 +================ + +使用指南 +-------- +* `快速入门 `_ +* `编译与安装 `_ +* `用户接口 `_ +* `使用示例 `_ +* `模型配置 <../doc/ui/api/trainer_config_helpers/index.html>`_ +* `集群训练 `_ + +开发指南 +-------- +* `新写Layer <../doc/dev/new_layer/index.html>`_ + +算法教程 +-------- +* `RNN配置 <../doc/algorithm/rnn/rnn.html>`_ diff --git a/doc_cn/ui/data_provider/index.rst b/doc_cn/ui/data_provider/index.rst index 681a131b66..ec8f8e5dc5 100644 --- a/doc_cn/ui/data_provider/index.rst +++ b/doc_cn/ui/data_provider/index.rst @@ -1,24 +1,15 @@ PaddlePaddle的数据提供(DataProvider)介绍 -================================== +======================================== -数据提供(DataProvider,后用DataProvider代替)是PaddlePaddle负责提供数据的模块。其作用是将训练数据 -传入内存或者显存,让神经网络可以进行训练。简单的使用,用户可以使用Python的 -:code:`PyDataProvider` 来自定义传数据的过程。如果有更复杂的使用,或者需要更高的效率, -用户也可以在C++端自定义一个 :code:`DataProvider` 。 +数据提供(DataProvider)是PaddlePaddle负责提供数据的模块。其作用是将训练数据传入内存或者显存,让神经网络可以进行训练。简单的使用,用户可以使用Python的 :code:`PyDataProvider` 来自定义传数据的过程。如果有更复杂的使用,或者需要更高的效率,用户也可以在C++端自定义一个 :code:`DataProvider` 。 -PaddlePaddle需要用户在网络配置(trainer_config.py)中定义使用什么DataProvider,和DataProvider -的一些参数,训练文件列表(train.list)和测试文件列表(test.list)。 
+PaddlePaddle需要用户在网络配置(trainer_config.py)中定义使用哪种DataProvider及其参数,训练文件列表(train.list)和测试文件列表(test.list)。 -其中,train.list和test.list均为本地的两个文件(推荐直接放置到训练目录,以相对路径引用)。如果 -test.list不设置,或者设置为None的话,那么在训练过程中,不会执行测试操作。否则,则会根据命令行 -参数指定的测试方式,在训练过程中进行测试,从而防止过拟合。 +其中,train.list和test.list均为本地的两个文件(推荐直接放置到训练目录,以相对路径引用)。如果test.list不设置,或者设置为None,那么在训练过程中,不会执行测试操作。否则,会根据命令行参数指定的测试方式,在训练过程中进行测试,从而防止过拟合。 -一般情况下,train.list和test.list为纯文本文件,其每一行对应这每一个数据文件。数据文件存放在 -本地磁盘中,将文件的绝对路径或相对路径(相对于PaddlePaddle程序运行时的路径)的方式写在train.list和 -test.list中。当然,train.list和test.list也可以放置hdfs文件路径,或者数据库连接地址等等。 -用户在DataProvider中需要实现如何访问其中每一个文件。 +一般情况下,train.list和test.list为纯文本文件,一行对应一个数据文件,数据文件存放在本地磁盘中。将文件的绝对路径或相对路径(相对于PaddlePaddle程序运行时的路径)写在train.list和test.list中。当然,train.list和test.list也可以放置hdfs文件路径,或者数据库连接地址等等。 -DataProvider的具体用法和如何实现一个新的DataProvider,请参考下述文章: +用户在DataProvider中需要实现如何访问其中每一个文件。DataProvider的具体用法和如何实现一个新的DataProvider,请参考下述文章: .. toctree:: diff --git a/doc_cn/ui/data_provider/mnist_config.py b/doc_cn/ui/data_provider/mnist_config.py index 0f9094cd27..7ba344338c 100644 --- a/doc_cn/ui/data_provider/mnist_config.py +++ b/doc_cn/ui/data_provider/mnist_config.py @@ -4,3 +4,5 @@ define_py_data_sources2(train_list='train.list', test_list=None, module='mnist_provider', obj='process') +img = data_layer(name='pixel', size=784) +label = data_layer(name='label', size=10) diff --git a/doc_cn/ui/data_provider/mnist_provider.dict.py b/doc_cn/ui/data_provider/mnist_provider.dict.py new file mode 100644 index 0000000000..4eab5b1fd3 --- /dev/null +++ b/doc_cn/ui/data_provider/mnist_provider.dict.py @@ -0,0 +1,25 @@ +from paddle.trainer.PyDataProvider2 import * + + +# Define a py data provider +@provider(input_types=[ + dense_vector(28 * 28), + integer_value(10) +]) +def process(settings, filename): # settings is not used currently. + f = open(filename, 'r') # open one of training file + + for line in f: # read each line + label, pixel = line.split(';') + + # get features and label + pixels_str = pixel.split(' ') + + pixels_float = [] + for each_pixel_str in pixels_str: + pixels_float.append(float(each_pixel_str)) + + # give data to paddle. + yield { "pixel": pixels_float, 'label': int(label) } + + f.close() # close file diff --git a/doc_cn/ui/data_provider/pydataprovider2.rst b/doc_cn/ui/data_provider/pydataprovider2.rst index 766f583538..9e1d8c531f 100644 --- a/doc_cn/ui/data_provider/pydataprovider2.rst +++ b/doc_cn/ui/data_provider/pydataprovider2.rst @@ -56,6 +56,14 @@ process函数调用多次 :code:`yield` 即可。 :code:`yield` 是Python的一 这里说明了训练数据是 'train.list',而没有测试数据。引用的DataProvider是 'mnist_provider' 这个模块中的 'process' 函数。 +同时,根据模型配置文件中 :code:`data_layer` 的名字,用户也可以显式指定返回的数据对应关系。例如: + +.. literalinclude:: mnist_provider.dict.py + :linenos: + +如果用户不指定返回数据的对应关系,那么PaddlePaddle会粗略的根据layer的声明顺序, +来确定对应关系。这个对应关系可能不正确。所以推荐使用显式指定返回值和数据对应关系。 + 至此,简单的PyDataProvider样例就说明完毕了。对于用户来说,讲数据发送给PaddlePaddle,仅仅需要 知道如何从 **一个文件** 里面读取 **一条** 样本。而PaddlePaddle进程帮助用户做了 @@ -116,16 +124,16 @@ DataProvider创建的时候执行。这个初始化函数具有如下参数: 参考(Reference) --------------- -.. 
_@provider:: - @provider +++++++++ -'@provider'是一个Python的 `Decorator`_ ,他可以将某一个函数标记成一个PyDataProvider。它包含的参数有: +:code:`@provider` 是一个Python的 `Decorator`_ ,他可以将某一个函数标记成一个PyDataProvider。它包含的参数有: * `input_types`_ 是数据输入格式。具体有哪些格式,参考 `input_types`_ 。 * should_shuffle 是个DataProvider是不是要做shuffle,如果不设置的话,训练的时候默认shuffle, - 测试的时候默认不shuffle + 测试的时候默认不shuffle。 +* min_pool_size 是设置DataProvider在内存中最小暂存的数据条数。这个也是PaddlePaddle所能够保证的shuffle粒度。 + 设置成-1的话,会预先读取全部数据到内存中。 * pool_size 是设置DataProvider在内存中暂存的数据条数。设置成-1的话,即不在乎内存暂存多少条数据。 * can_over_batch_size 表示是否允许Paddle暂存略微多余pool_size的数据。这样做可以避免很多死锁问题。 一般推荐设置成True @@ -133,9 +141,11 @@ DataProvider创建的时候执行。这个初始化函数具有如下参数: 是一个batch size,但是有时为了计算均衡性,可以将一条数据设置成多个batch size * cache 是数据缓存的策略,参考 `cache`_ * init_hook 是初始化时调用的函数,参考 `init_hook`_ - - -.. _input_types:: +* use_dynamic_order 如果是true的话,可以返回一个dict,key是data_layer的名字,value是特征值。同时,也可以 + 返回一个list或者tuple。如果是false的话,只能够返回list或者tuple +* check 设置成true的话,会根据input_types检查数据的合法性。 +* check_fail_continue 如果设置成true的话,即使在check中数据不合法,也会扔到这条数据,继续训练。 如果 + check是false的话,没有作用。 input_types +++++++++++ @@ -169,16 +179,11 @@ PaddlePaddle的数据包括四种主要类型,和三种序列模式。其中 其中,f代表一个浮点数,i代表一个整数。 -.. _init_hook:: -.. _settings:: - init_hook +++++++++ init_hook可以传入一个函数。这个函数在初始化的时候会被调用。这个函数的参数是: - - * 第一个参数是 settings 对象。这个对象和process的第一个参数一致。具有的属性有 * settings.input_types 设置输入类型。参考 `input_types`_ * settings.logger 一个logging对象 @@ -192,8 +197,6 @@ init_hook可以传入一个函数。这个函数在初始化的时候会被调 注意,PaddlePaddle保留添加参数的权力,所以init_hook尽量使用 :code:`**kwargs` , 来接受不使用的 函数来保证兼容性。 -.. _cache:: - cache +++++ @@ -202,3 +205,55 @@ DataProvider提供了两种简单的Cache策略。他们是 * CacheType.NO_CACHE 不缓存任何数据,每次都会从python端读取数据 * CacheType.CACHE_PASS_IN_MEM 第一个pass会从python端读取数据,剩下的pass会直接从内存里 读取数据。 + + +注意事项 +-------- + +可能的内存泄露问题 +++++++++++++++++++ + +PaddlePaddle将train.list中的每一行,都传递给process函数,从而生成多个generator。 +即如果train.list中,有100个训练文件,即会生成100个generator。这个本身不是一个很 +严重的问题。 + +但是,如果在训练时,每一条训练数据都是一个文件,并且,训练数据非常多的情况下,就 +会生成多个generator。每个generator在没有调用的时候,是几乎不占内存的。但是,当调 +用过一次的时候,generator便会存下当前的上下文(Context)。而这个Context可能会非常 +大。并且,generator至少调用两次才会知道是否停止。所以,即使在process里面只会有一 +个yield,也需要两次随机选择到同样的generator的时候,才会释放该段内存。 + +.. code-block:: python + + def func(): + yield 0 + + f = func() # 创建generator + tmp = next(f) # 调用一次,返回0 + tmp = next(f) # 调用第二次的时候,才会Stop Iteration + +而如果按顺序调用这些generator就不会出现这个问题。 + +所以最佳实践推荐不要将每一个样本都放入train.list。而是将样本的地址放入另一个文本 +文件,train.list写入那个文本文件的地址。 或者在python generator的上下文中尽量留 +下非常少的变量引用。例如 + +.. code-block:: python + + def real_process(fn): + # ... read from fn + return result # 当函数返回的时候,python可以解除掉内部变量的引用。 + + def process(fn): + yield real_process(fn) + +这个问题是PyDataProvider读数据时候的逻辑问题,基本上不能整体修正。 + + +内存不够用的情况 +++++++++++++++++ + +PyDataProvider2会尽量使用内存。所以如果对于内存比较小的机器,推荐设置 +:code:`pool_size` 变量,而这个变量推荐大于训练的batch size,并且在内存足够 +的情况下越大越好。 + diff --git a/doc_cn/ui/index.rst b/doc_cn/ui/index.rst index 14080eba17..5aba272c62 100644 --- a/doc_cn/ui/index.rst +++ b/doc_cn/ui/index.rst @@ -5,6 +5,7 @@ '''''''' .. 
toctree:: + :maxdepth: 1 data_provider/index.rst diff --git a/doc_cn/ui/model.rst b/doc_cn/ui/model.rst deleted file mode 100644 index 7a81236d6f..0000000000 --- a/doc_cn/ui/model.rst +++ /dev/null @@ -1,4 +0,0 @@ -模型配置 -======== - -* `Model Config Interface <../../doc/ui/api/trainer_config_helpers/index.html>`_ diff --git a/doc_cn/ui/predict/swig_py_paddle.rst b/doc_cn/ui/predict/swig_py_paddle.rst index 284c60686d..012ac4ff6e 100644 --- a/doc_cn/ui/predict/swig_py_paddle.rst +++ b/doc_cn/ui/predict/swig_py_paddle.rst @@ -9,22 +9,30 @@ PaddlePaddle目前使用Swig对其常用的预测接口进行了封装,使在P * 准备数据 * 预测 -典型的预测代码如下,使用mnist手写识别作为样例。 +典型的预测代码如下,使用mnist手写识别作为样例, 完整代码见 +:code:`src_root/doc/ui/predict/predict_sample.py` 。 .. literalinclude:: ../../../doc/ui/predict/predict_sample.py :language: python - :linenos: - -主要的软件包为py_paddle.swig_paddle,这个软件包文档相对完善。可以使用python的 :code:`help()` 函数查询文档。主要步骤为: - -* 在程序开始阶段,使用命令行参数初始化PaddlePaddle -* 在98行载入PaddlePaddle的训练文件。读取config -* 在100行创建神经网络,并在83行载入参数。 -* 103行创建一个从工具类,用来转换数据。 + :lines: 15-18,90-100,101-104 + +主要的软件包为py_paddle.swig_paddle,这个软件包文档相对完善。可以使用python的 +:code:`help()` 函数查询文档。主要步骤为: + +* 在程序开始阶段,使用 :code:`swig_paddle.initPaddle()` 传入命令行参数初始化 + PaddlePaddle。详细的命令行参数请参考 + `命令行参数 <../cmd_argument/detail_introduction.html>`_ 。 +* 接下来使用 :code:`parse_config()` 解析训练时的配置文件。这里要注意预测数据通常 + 不包含label, 而且预测网络通常直接输出最后一层的结果而不是像训练时一样以cost + layer作为输出,所以用于预测的配置文件要做相应的修改。 +* 使用 :code:`swig_paddle.GradientMachine.createFromConfigproto()` 根据上一步解 + 析好的配置创建神经网络。 +* 创建一个 :code:`DataProviderConverter` 对象converter。 - swig_paddle接受的原始数据是C++的Matrix,也就是直接写内存的float数组。 - - 这个接口并不用户友好。所以,我们提供了一个工具类DataProviderWrapperConverter. - - 这个工具类接收和PyDataProviderWrapper一样的输入数据,请参考PyDataProviderWrapper的文档。 -* 在第105行执行预测。forwardTest是一个工具类,直接提取出神经网络Output层的输出结果。典型的输出结果为\: + 这个接口并不用户友好。所以,我们提供了一个工具类DataProviderConverter。 + 这个工具类接收和PyDataProvider2一样的输入数据,详情请参考 + `PyDataProvider2文档 <../../../doc/ui/data_provider/pydataprovider2.html>`_ 。 +* 最后使用 :code:`forwardTest()` 直接提取出神经网络Output层的输出结果。典型的输出结果为\: .. code-block:: text @@ -37,4 +45,4 @@ PaddlePaddle目前使用Swig对其常用的预测接口进行了封装,使在P 2.70634608e-08, 3.48565123e-08, 5.25639710e-09, 4.48684503e-08]], dtype=float32)}] -其中,value即为softmax层的输出。由于数据是两个,所以输出的value。 +其中,value即为softmax层的输出。由于数据是两条,所以输出的value包含两个向量 。 diff --git a/paddle/.gitignore b/paddle/.gitignore index b89bd9d946..f921eef141 100644 --- a/paddle/.gitignore +++ b/paddle/.gitignore @@ -40,3 +40,4 @@ HPPL_ERROR_LOG unittest.list proto dist +setup.py diff --git a/paddle/.set_python_path.sh b/paddle/.set_python_path.sh index afde3e51db..f7019b27f8 100755 --- a/paddle/.set_python_path.sh +++ b/paddle/.set_python_path.sh @@ -22,15 +22,21 @@ # It same as PYTHONPATH=${YOUR_PYTHON_PATH}:$PYTHONPATH {exec...} # -PYPATH="" -set -x -while getopts "d:" opt; do - case $opt in - d) - PYPATH=$OPTARG - ;; - esac -done -shift $(($OPTIND - 1)) -export PYTHONPATH=$PYPATH -$@ +if ! python -c "import paddle" >/dev/null 2>/dev/null; then + PYPATH="" + set -x + while getopts "d:" opt; do + case $opt in + d) + PYPATH=$OPTARG + ;; + esac + done + shift $(($OPTIND - 1)) + export PYTHONPATH=$PYPATH + $@ +else + echo "paddle package is already in your PYTHONPATH. But unittest need a clean environment." + echo "Please uninstall paddle package before start unittest. 
Try to 'pip uninstall paddle'" + exit 1 +fi diff --git a/paddle/CMakeLists.txt b/paddle/CMakeLists.txt index c6fa7dc2b1..cae0f64400 100644 --- a/paddle/CMakeLists.txt +++ b/paddle/CMakeLists.txt @@ -7,6 +7,9 @@ add_subdirectory(pserver) add_subdirectory(trainer) add_subdirectory(scripts) +configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in + ${CMAKE_CURRENT_SOURCE_DIR}/setup.py) + if(WITH_PREDICT_SDK) add_subdirectory(predict) endif() diff --git a/paddle/api/Arguments.cpp b/paddle/api/Arguments.cpp index f9f191c711..8f73e76260 100644 --- a/paddle/api/Arguments.cpp +++ b/paddle/api/Arguments.cpp @@ -110,8 +110,8 @@ IVector* Arguments::getSlotSequenceStartPositions(size_t idx) const } } -IVector*Arguments::getSlotSubSequenceStartPositions(size_t idx) const - throw (RangeError){ +IVector* Arguments::getSlotSubSequenceStartPositions(size_t idx) const + throw(RangeError) { auto& a = m->getArg(idx); if (a.subSequenceStartPositions) { return IVector::createByPaddleVectorPtr( @@ -129,7 +129,7 @@ void Arguments::setSlotSequenceStartPositions(size_t idx, } void Arguments::setSlotSubSequenceStartPositions( - size_t idx, IVector *vec) throw (RangeError) { + size_t idx, IVector *vec) throw(RangeError) { auto& a = m->getArg(idx); auto& v = m->cast(vec->getSharedPtr()); a.subSequenceStartPositions = std::make_shared(v); diff --git a/paddle/api/PaddleAPI.h b/paddle/api/PaddleAPI.h index c92ed205d5..b3140617af 100644 --- a/paddle/api/PaddleAPI.h +++ b/paddle/api/PaddleAPI.h @@ -20,6 +20,7 @@ limitations under the License. */ #include #include #include "paddle/utils/GlobalConstants.h" +#include "paddle/utils/TypeDefs.h" /// Import PaddlePaddle's enumeration into global namespace. using namespace paddle::enumeration_wrapper; // NOLINT @@ -392,7 +393,7 @@ public: void setSlotSequenceStartPositions(size_t idx, IVector* vec) throw(RangeError); void setSlotSubSequenceStartPositions(size_t idx, - IVector* vec) throw (RangeError); + IVector* vec) throw(RangeError); void setSlotSequenceDim(size_t idx, IVector* vec) throw(RangeError); private: diff --git a/paddle/api/Util.cpp b/paddle/api/Util.cpp index 4e655c324a..8a6741078f 100644 --- a/paddle/api/Util.cpp +++ b/paddle/api/Util.cpp @@ -17,6 +17,7 @@ limitations under the License. 
*/ #include "paddle/utils/Util.h" #include "paddle/utils/PythonUtil.h" #include "paddle/utils/Flags.h" +#include "paddle/utils/Excepts.h" #include "paddle/parameter/Parameter.h" #include diff --git a/paddle/api/paddle_ld_flags.py b/paddle/api/paddle_ld_flags.py index 21b4ca1dd6..bc1afc5898 100644 --- a/paddle/api/paddle_ld_flags.py +++ b/paddle/api/paddle_ld_flags.py @@ -15,6 +15,19 @@ try: from paddle_api_config import * import os.path + import platform + + system = platform.system().lower() + is_osx = (system == 'darwin') + is_win = (system == 'windows') + is_lin = (system == 'linux') + + if is_lin: + whole_start = "-Wl,--whole-archive" + whole_end = "-Wl,--no-whole-archive" + elif is_osx: + whole_start = "" + whole_end = "" LIB_DIRS = ["math", 'utils', 'parameter', "gserver", "api", "cuda", "pserver", "trainer"] PARENT_LIB_DIRS = ['proto'] @@ -56,9 +69,9 @@ try: def libs_str(self): libs = [ - "-Wl,--whole-archive", + whole_start, "-lpaddle_gserver", - "-Wl,--no-whole-archive", + whole_end, "-lpaddle_pserver", "-lpaddle_trainer_lib", "-lpaddle_network", diff --git a/paddle/cuda/CMakeLists.txt b/paddle/cuda/CMakeLists.txt index 2ccbf311bf..e03a9a1baa 100644 --- a/paddle/cuda/CMakeLists.txt +++ b/paddle/cuda/CMakeLists.txt @@ -1,8 +1,11 @@ +set(AVX_SOURCES + src/hl_math.cc + src/hl_avx_functions.cc +) set(CUDA_SOURCES src/hl_time.cc - src/hl_math.cc src/hl_cpu_functions.cc - src/hl_avx_functions.cc) + ${AVX_SOURCES}) set(CUDA_CXX_WITH_GPU_SOURCES src/hl_cuda_cublas.cc @@ -12,7 +15,7 @@ set(CUDA_CXX_WITH_GPU_SOURCES set_source_files_properties(${CUDA_CXX_WITH_GPU_SOURCES} PROPERTIES COMPILE_FLAGS "-D__NVCC__") -set_source_files_properties(${CUDA_SOURCES} +set_source_files_properties(${AVX_SOURCES} PROPERTIES COMPILE_FLAGS "-mavx") set(CUDA_DSO_SOURCES @@ -73,4 +76,3 @@ endif() add_style_check_target(paddle_cuda ${CUDA_SOURCES}) add_style_check_target(paddle_cuda ${CUDA_HEADERS}) -# add_style_check_target(hppl ${HPPL_CU_SOURCES}) # TODO(yuyang18): Format hppl style diff --git a/paddle/cuda/include/hl_cuda.h b/paddle/cuda/include/hl_cuda.h index ffdf71229a..3196db67f6 100644 --- a/paddle/cuda/include/hl_cuda.h +++ b/paddle/cuda/include/hl_cuda.h @@ -321,13 +321,14 @@ extern const char* hl_get_device_error_string(size_t err); extern int hl_get_device_last_error(); /** - * @brief hppl query event. + * @brief check cuda event is ready * - * @param[in] event cuda event to query. - * @param[out] isNotReady this work under device has not yet been - * completed, vice versa. + * @param[in] event cuda event to query. + * + * @return true cuda event is ready. + * false cuda event is not ready. */ -extern void hl_cuda_event_query(hl_event_t event, bool& isNotReady); +extern bool hl_cuda_event_is_ready(hl_event_t event); /** * @brief hppl device synchronization. diff --git a/paddle/cuda/include/hl_device_functions.cuh b/paddle/cuda/include/hl_device_functions.cuh old mode 100644 new mode 100755 index 408ff35d96..88d950d6c1 --- a/paddle/cuda/include/hl_device_functions.cuh +++ b/paddle/cuda/include/hl_device_functions.cuh @@ -16,26 +16,37 @@ limitations under the License. 
*/ #ifndef HL_DEVICE_FUNCTIONS_CUH_ #define HL_DEVICE_FUNCTIONS_CUH_ -namespace hppl { - -static __inline__ __device__ double atomicAdd(double* address, double val) { - // NOLINTNEXTLINE - unsigned long long int* address_as_ull = (unsigned long long int*)address; - unsigned long long int old = *address_as_ull, assumed; // NOLINT - - do { - assumed = old; - old = atomicCAS(address_as_ull, - assumed, - __double_as_longlong(val + - __longlong_as_double(assumed))); - } while (assumed != old); - - return __longlong_as_double(old); +namespace paddle { + +template +inline __device__ T paddleAtomicAdd(T* address, T val); + +template <> +inline __device__ float paddleAtomicAdd(float* address, float val) { + return atomicAdd(address, val); } -} // namespace hppl +template <> +inline __device__ double paddleAtomicAdd(double* address, double val) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600 + return atomicAdd(address, val); +#else + // NOLINTNEXTLINE + unsigned long long int* address_as_ull = (unsigned long long int*)address; + unsigned long long int old = *address_as_ull, assumed; // NOLINT + + do { + assumed = old; + old = atomicCAS(address_as_ull, + assumed, + __double_as_longlong(val + + __longlong_as_double(assumed))); + } while (assumed != old); + + return __longlong_as_double(old); +#endif +} +} // namespace paddle -using hppl::atomicAdd; #endif /* HL_DEVICE_FUNCTIONS_CUH_ */ diff --git a/paddle/cuda/include/hl_gpu_lstm.cuh b/paddle/cuda/include/hl_gpu_lstm.cuh index 2ca33f2b13..07806e11c1 100644 --- a/paddle/cuda/include/hl_gpu_lstm.cuh +++ b/paddle/cuda/include/hl_gpu_lstm.cuh @@ -192,10 +192,10 @@ __global__ void KeLstmBackward(Op op, if (isBatch) { if (value.prevStateValue) { - if (grad.checkIgGrad) atomicAdd(grad.checkIgGrad+frameIdx, rCheckIGrad); - if (grad.checkFgGrad) atomicAdd(grad.checkFgGrad+frameIdx, rCheckFGrad); + if (grad.checkIgGrad) paddle::paddleAtomicAdd(grad.checkIgGrad+frameIdx, rCheckIGrad); + if (grad.checkFgGrad) paddle::paddleAtomicAdd(grad.checkFgGrad+frameIdx, rCheckFGrad); } - if (grad.checkOgGrad) atomicAdd(grad.checkOgGrad+frameIdx, rCheckOGrad); + if (grad.checkOgGrad) paddle::paddleAtomicAdd(grad.checkOgGrad+frameIdx, rCheckOGrad); } else { if (value.prevStateValue) { if (grad.checkIgGrad) grad.checkIgGrad[frameIdx] += rCheckIGrad; diff --git a/paddle/cuda/include/hl_matrix_type.cuh b/paddle/cuda/include/hl_matrix_type.cuh index 85b60cc313..6917f36290 100644 --- a/paddle/cuda/include/hl_matrix_type.cuh +++ b/paddle/cuda/include/hl_matrix_type.cuh @@ -27,6 +27,8 @@ typedef float4 vecType; typedef double2 vecType; #endif #else +#include +#include #include #ifndef HPPL_TYPE_DOUBLE typedef __m128 vecType; diff --git a/paddle/cuda/include/hl_sse_matrix_kernel.cuh b/paddle/cuda/include/hl_sse_matrix_kernel.cuh index d774150c21..c90d49e4ad 100644 --- a/paddle/cuda/include/hl_sse_matrix_kernel.cuh +++ b/paddle/cuda/include/hl_sse_matrix_kernel.cuh @@ -25,6 +25,9 @@ limitations under the License. 
*/ #define VECTOR_LEN 4 #define VECTOR_SET _mm_set_ps1 #else +#if defined(__APPLE__) || defined(__OSX__) +#define _mm_set_pd1 _mm_set1_pd +#endif /* number of double in vector */ #define VECTOR_LEN 2 #define VECTOR_SET _mm_set_pd1 diff --git a/paddle/cuda/include/stub/hl_cuda_stub.h b/paddle/cuda/include/stub/hl_cuda_stub.h index 395101c6f7..675ac03b0e 100644 --- a/paddle/cuda/include/stub/hl_cuda_stub.h +++ b/paddle/cuda/include/stub/hl_cuda_stub.h @@ -89,7 +89,7 @@ inline const char* hl_get_device_error_string() { return NULL; } inline const char* hl_get_device_error_string(size_t err) { return NULL; } -inline void hl_cuda_event_query(hl_event_t event, bool& isNotReady) {} +inline bool hl_cuda_event_is_ready(hl_event_t event) { return true; } inline void hl_device_synchronize() {} diff --git a/paddle/cuda/src/hl_cuda_aggregate.cu b/paddle/cuda/src/hl_cuda_aggregate.cu index c0b84b087b..4eb775eb79 100644 --- a/paddle/cuda/src/hl_cuda_aggregate.cu +++ b/paddle/cuda/src/hl_cuda_aggregate.cu @@ -261,11 +261,7 @@ void hl_vector_sum(real *A_d, real *C_h, int dimM) { struct _hl_event_st hl_event_st = {.cu_event = t_resource.event}; hl_event_t hl_event = &hl_event_st; - - bool isNotReady = false; - do { - hl_cuda_event_query(hl_event, isNotReady); - } while (isNotReady == cudaErrorNotReady); + while (!hl_cuda_event_is_ready(hl_event)) {} KeVectorSum<128><<< grid, threads, 0, STREAM_DEFAULT >>> (A_d, t_resource.gpu_mem, dimM); @@ -275,7 +271,10 @@ void hl_vector_sum(real *A_d, real *C_h, int dimM) { hl_memcpy_async(C_h, t_resource.cpu_mem, sizeof(real), HPPL_STREAM_DEFAULT); hl_stream_record_event(HPPL_STREAM_DEFAULT, hl_event); - CHECK_SYNC("hl_vector_sum failed"); + hl_stream_synchronize(HPPL_STREAM_DEFAULT); + cudaError_t err = (cudaError_t)hl_get_device_last_error(); + CHECK_EQ(cudaSuccess, err) + << "CUDA error: " << hl_get_device_error_string((size_t)err); } template @@ -317,11 +316,7 @@ void hl_vector_abs_sum(real *A_d, real *C_h, int dimM) { struct _hl_event_st hl_event_st = {.cu_event = t_resource.event}; hl_event_t hl_event = &hl_event_st; - - bool isNotReady = false; - do { - hl_cuda_event_query(hl_event, isNotReady); - } while (isNotReady == cudaErrorNotReady); + while (!hl_cuda_event_is_ready(hl_event)) {} KeVectorAbsSum<128><<< grid, threads, 0, STREAM_DEFAULT >>> (A_d, t_resource.gpu_mem, dimM); @@ -331,5 +326,8 @@ void hl_vector_abs_sum(real *A_d, real *C_h, int dimM) { hl_memcpy_async(C_h, t_resource.cpu_mem, sizeof(real), HPPL_STREAM_DEFAULT); hl_stream_record_event(HPPL_STREAM_DEFAULT, hl_event); - CHECK_SYNC("hl_vector_abs_sum failed"); + hl_stream_synchronize(HPPL_STREAM_DEFAULT); + cudaError_t err = (cudaError_t)hl_get_device_last_error(); + CHECK_EQ(cudaSuccess, err) + << "CUDA error: " << hl_get_device_error_string((size_t)err); } diff --git a/paddle/cuda/src/hl_cuda_cublas.cc b/paddle/cuda/src/hl_cuda_cublas.cc index 445279fa01..dc109487de 100644 --- a/paddle/cuda/src/hl_cuda_cublas.cc +++ b/paddle/cuda/src/hl_cuda_cublas.cc @@ -217,7 +217,7 @@ void hl_matrix_mul(real *A_d, hl_trans_op_t transa, } else { LOG(FATAL) << "parameter transa error!"; } - CHECK_EQ(stat, CUBLAS_STATUS_SUCCESS); + CHECK_EQ(stat, CUBLAS_STATUS_SUCCESS) << hl_cublas_get_error_string(stat); CHECK_SYNC("hl_matrix_mul failed"); } @@ -266,7 +266,7 @@ void hl_matrix_mul_vector(real *A_d, hl_trans_op_t trans, LOG(FATAL) << "parameter transa error!"; } - CHECK_EQ(stat, CUBLAS_STATUS_SUCCESS); + CHECK_EQ(stat, CUBLAS_STATUS_SUCCESS) << hl_cublas_get_error_string(stat); CHECK_SYNC("hl_matrix_mul_vector"); } 
diff --git a/paddle/cuda/src/hl_cuda_cudnn.cc b/paddle/cuda/src/hl_cuda_cudnn.cc index 19c94b2453..c2dce1977b 100644 --- a/paddle/cuda/src/hl_cuda_cudnn.cc +++ b/paddle/cuda/src/hl_cuda_cudnn.cc @@ -150,7 +150,7 @@ CUDNN_DNN_ROUTINE_EACH_AFTER_R3(DYNAMIC_LOAD_CUDNN_WRAP) // APIs available after R4: -#if CUDNN_VERSION >= 4000 +#if CUDNN_VERSION >= 4007 #define CUDNN_DNN_ROUTINE_EACH_AFTER_R4(__macro) \ __macro(cudnnBatchNormalizationForwardTraining) \ __macro(cudnnBatchNormalizationForwardInference) \ @@ -999,7 +999,7 @@ void hl_batch_norm_forward_training(hl_tensor_descriptor inputDesc, double epsilon, real *savedMean, real *savedVar) { -#if CUDNN_VERSION >= 4000 +#if CUDNN_VERSION >= 4007 if ((NULL != runningMean && NULL == runningInvVar) || (NULL == runningMean && NULL != runningInvVar)) { LOG(FATAL) << "runningMean and runningInvVar can be NULL " @@ -1024,7 +1024,7 @@ void hl_batch_norm_forward_training(hl_tensor_descriptor inputDesc, CHECK_SYNC("hl_batch_norm_forward_training failed"); #else - LOG(FATAL) << "CudnnBatchNorm requires cudnn version >= 4000. " + LOG(FATAL) << "CudnnBatchNorm requires cudnn version >= 4007. " << "But cudnn lib version is " << g_cudnn_lib_version; #endif } @@ -1039,7 +1039,7 @@ void hl_batch_norm_forward_inference(hl_tensor_descriptor inputDesc, real *estimatedMean, real *estimatedInvVar, double epsilon) { -#if CUDNN_VERSION >= 4000 +#if CUDNN_VERSION >= 4007 cudnnTensorDescriptor_t xDesc = GET_TENSOR_DESCRIPTOR(inputDesc); cudnnTensorDescriptor_t yDesc = GET_TENSOR_DESCRIPTOR(outputDesc); cudnnTensorDescriptor_t bnDesc = GET_TENSOR_DESCRIPTOR(bnParamDesc); @@ -1053,7 +1053,7 @@ void hl_batch_norm_forward_inference(hl_tensor_descriptor inputDesc, CHECK_SYNC("hl_batch_norm_forward_inference failed"); #else - LOG(FATAL) << "CudnnBatchNorm requires cudnn version >= 4000. " + LOG(FATAL) << "CudnnBatchNorm requires cudnn version >= 4007. " << "But cudnn lib version is " << g_cudnn_lib_version; #endif } @@ -1071,7 +1071,7 @@ void hl_batch_norm_backward(hl_tensor_descriptor inputDesc, double epsilon, real *savedMean, real *savedInvVar) { -#if CUDNN_VERSION >= 4000 +#if CUDNN_VERSION >= 4007 if ((NULL != savedMean && NULL == savedInvVar) || (NULL == savedMean && NULL != savedInvVar)) { LOG(FATAL) << "savedMean and savedVar can be NULL " @@ -1087,16 +1087,14 @@ void hl_batch_norm_backward(hl_tensor_descriptor inputDesc, cudnnBatchNormMode_t mode = CUDNN_BATCHNORM_SPATIAL; CHECK_CUDNN(dynload::cudnnBatchNormalizationBackward( t_resource.cudnn_handle, mode, &alpha, &beta, -#if CUDNN_VERSION >= 5000 &alpha, &beta, -#endif xDesc, input, dyDesc, outGrad, dxDesc, inGrad, bnDesc, scale, scaleGrad, biasGrad, epsilon, savedMean, savedInvVar)); CHECK_SYNC("hl_batch_norm_backward failed"); #else - LOG(FATAL) << "CudnnBatchNorm requires cudnn version >= 4000. " + LOG(FATAL) << "CudnnBatchNorm requires cudnn version >= 4007. 
" << "But cudnn lib version is " << g_cudnn_lib_version; #endif } diff --git a/paddle/cuda/src/hl_cuda_device.cc b/paddle/cuda/src/hl_cuda_device.cc index 774eef8b89..acd8e2fe6a 100644 --- a/paddle/cuda/src/hl_cuda_device.cc +++ b/paddle/cuda/src/hl_cuda_device.cc @@ -209,7 +209,18 @@ __thread cudaStream_t default_stream = 0; __thread bool g_sync_flag = true; bool hl_start_flag = false; -#define gettid() syscall(SYS_gettid) +inline pid_t gettid() { +#if defined(__APPLE__) || defined(__OSX__) + pid_t tid = syscall(SYS_thread_selfid); +#else + #ifndef __NR_gettid + #define __NR_gettid 224 + #endif + pid_t tid = syscall(__NR_gettid); +#endif + CHECK_NE(tid, -1); + return tid; +} void hl_init(int device) { CHECK(hl_start_flag) @@ -751,11 +762,12 @@ void hl_set_device_flags_block() { cudaDeviceScheduleBlockingSync)); } -void hl_cuda_event_query(hl_event_t event, bool& isNotReady) { +bool hl_cuda_event_is_ready(hl_event_t event) { cudaError_t err = dynload::cudaEventQuery(event->cu_event); CHECK(cudaSuccess == err || cudaErrorNotReady == err); if (cudaErrorNotReady == err) { - isNotReady = true; + return false; } + return true; } diff --git a/paddle/cuda/src/hl_cuda_lstm.cu b/paddle/cuda/src/hl_cuda_lstm.cu index 64699c9f6d..cf009620bf 100644 --- a/paddle/cuda/src/hl_cuda_lstm.cu +++ b/paddle/cuda/src/hl_cuda_lstm.cu @@ -564,11 +564,11 @@ __global__ void KeLstmBackward(real *gateValue, /* TODO: Temporary save & merger in another kernel */ if (frameIdy == 1) { - if (checkIgGrad) atomicAdd(checkIgGrad+frameIdx, rCheckGrad); + if (checkIgGrad) paddle::paddleAtomicAdd(checkIgGrad+frameIdx, rCheckGrad); } else if (frameIdy == 2) { - if (checkFgGrad) atomicAdd(checkFgGrad+frameIdx, rCheckGrad); + if (checkFgGrad) paddle::paddleAtomicAdd(checkFgGrad+frameIdx, rCheckGrad); } else if (frameIdy == 3) { - if (checkOgGrad) atomicAdd(checkOgGrad+frameIdx, rCheckGrad); + if (checkOgGrad) paddle::paddleAtomicAdd(checkOgGrad+frameIdx, rCheckGrad); } } diff --git a/paddle/cuda/src/hl_cuda_matrix.cu b/paddle/cuda/src/hl_cuda_matrix.cu index 15799919fa..38e4f16217 100644 --- a/paddle/cuda/src/hl_cuda_matrix.cu +++ b/paddle/cuda/src/hl_cuda_matrix.cu @@ -19,6 +19,7 @@ limitations under the License. */ #include "hl_matrix_apply.cuh" #include "hl_sequence.h" #include "paddle/utils/Logging.h" +#include "hl_device_functions.cuh" DEFINE_MATRIX_UNARY_OP(Zero, a = 0); DEFINE_MATRIX_TERNARY_PARAMETER_OP(_add, TWO_PARAMETER, c = p1*a + p2*b); @@ -266,25 +267,21 @@ template __global__ void KeMatrixClassificationError(real* in_A, int* in_B, real* out_C, - int dimM, int dimN) { __shared__ real max_s[blockSize]; __shared__ int max_l[blockSize]; - int cnt = (dimN + blockSize -1) / blockSize; - int tid = threadIdx.x; - int lmt = tid; - int index = 0; - real t; + const int tid = threadIdx.x; + const int rowId = blockIdx.x; max_s[tid] = -1e30f; - for (int ii = 0; ii < cnt && lmt < dimN; ii++) { - index = blockIdx.y*dimN + lmt; - t = in_A[index]; - if (max_s[tid] < t) { - max_s[tid] = t; - max_l[tid] = lmt; + in_A += rowId * dimN; + real tmp; + for (int colId = tid; colId < dimN; colId += blockSize) { + tmp = in_A[colId]; + if (max_s[tid] < tmp) { + max_s[tid] = tmp; + max_l[tid] = colId; } - lmt += blockSize; } __syncthreads(); @@ -300,7 +297,7 @@ __global__ void KeMatrixClassificationError(real* in_A, __syncthreads(); if (tid == 0) { - out_C[blockIdx.y] = (max_l[0] == in_B[blockIdx.y] ? 0 : 1.0f); + out_C[rowId] = (max_l[0] == in_B[rowId] ? 
0 : 1.0f); } } @@ -313,12 +310,9 @@ void hl_matrix_classification_error(real* A_d, CHECK_NOTNULL(B_d); CHECK_NOTNULL(C_d); - int blocksX = 1; - int blocksY = dimM; - dim3 threads(1024, 1); - dim3 grid(blocksX, blocksY); - KeMatrixClassificationError<1024><<< grid, threads, 0, STREAM_DEFAULT >>> - (A_d, B_d, C_d, dimM, dimN); + // each sample is calculated by one block + KeMatrixClassificationError<1024><<< dimM, 1024, 0, STREAM_DEFAULT >>> + (A_d, B_d, C_d, dimN); CHECK_SYNC("hl_matrix_classification_error"); } @@ -629,7 +623,7 @@ __global__ void KeCosSimDerivative(real* grad, prevGradY[index] += scale * grad[ty] * prevOutX[index] * reciprocal; } else { - atomicAdd(prevGradY + index, + paddle::paddleAtomicAdd(prevGradY + index, scale * grad[ty] * prevOutX[index] * reciprocal); } } @@ -646,7 +640,7 @@ __global__ void KeCosSimDerivative(real* grad, (prevOutX[index] * reciprocalXY - prevOutY[index] * reciprocalSquareSumY); } else { - atomicAdd(prevGradY + index, output[ty] * grad[ty] * + paddle::paddleAtomicAdd(prevGradY + index, output[ty] * grad[ty] * (prevOutX[index] * reciprocalXY - prevOutY[index] * reciprocalSquareSumY)); } diff --git a/paddle/cuda/src/hl_cuda_sequence.cu b/paddle/cuda/src/hl_cuda_sequence.cu index f88a2682fd..e028880156 100644 --- a/paddle/cuda/src/hl_cuda_sequence.cu +++ b/paddle/cuda/src/hl_cuda_sequence.cu @@ -362,7 +362,7 @@ __global__ void KeMatrixAddRows(real* output, if (AddRow == 0) { outputData[i] += tableData[i]; } else { - atomicAdd(&tableData[i], outputData[i]); + paddle::paddleAtomicAdd(&tableData[i], outputData[i]); } } } diff --git a/paddle/cuda/src/hl_cuda_sparse.cuh b/paddle/cuda/src/hl_cuda_sparse.cuh index becb6c6649..db5c9ce979 100644 --- a/paddle/cuda/src/hl_cuda_sparse.cuh +++ b/paddle/cuda/src/hl_cuda_sparse.cuh @@ -280,7 +280,7 @@ __global__ void KeSMatrixCscMulDense(real *C_d, if (index_n_t < dimN) { real tmp; tmp = alpha*a_r*b_r[n]; - atomicAdd(C_d_r, tmp); + paddle::paddleAtomicAdd(C_d_r, tmp); C_d_r += CU_CSC_MUL_DENSE_THREAD_X; index_n_t += CU_CSC_MUL_DENSE_THREAD_X; } @@ -328,7 +328,7 @@ __global__ void KeSMatrixCscMulDense(real *C_d, if (index_n_t < dimN) { real tmp; tmp = alpha*a_r*b_r[n]; - atomicAdd(C_d_r, tmp); + paddle::paddleAtomicAdd(C_d_r, tmp); C_d_r += CU_CSC_MUL_DENSE_THREAD_X; index_n_t += CU_CSC_MUL_DENSE_THREAD_X; } @@ -629,7 +629,7 @@ __global__ void KeSMatrixDenseMulCsr(real *C_d, for (int n=0; n < CU_DM_CSR_N; n++) { if (index_m_t++ < dimM) { tmp = alpha * b_r * a_r[n]; - atomicAdd(C_d_r, tmp); + paddle::paddleAtomicAdd(C_d_r, tmp); C_d_r += dimN; } } @@ -660,7 +660,7 @@ __global__ void KeSMatrixDenseMulCsr(real *C_d, for (int n=0; n < CU_DM_CSR_N; n++) { if (index_m_t++ < dimM) { tmp = alpha * b_r * a_r[n]; - atomicAdd(C_d_r, tmp); + paddle::paddleAtomicAdd(C_d_r, tmp); C_d_r += dimN; } } @@ -912,7 +912,7 @@ __global__ void KeSMatrixCsrColumnSum(real* a_val, real* csr_val, for (int idx = gid; idx < dimNNZ; idx += gridDim.x * blockDim.x) { int colIdx = csr_col[idx]; real val = csr_val[idx]; - atomicAdd(a_val + colIdx, val); + paddle::paddleAtomicAdd(a_val + colIdx, val); } } diff --git a/paddle/cuda/src/hl_dso_loader.cc b/paddle/cuda/src/hl_dso_loader.cc index 3558b163b5..eee9984e07 100644 --- a/paddle/cuda/src/hl_dso_loader.cc +++ b/paddle/cuda/src/hl_dso_loader.cc @@ -69,23 +69,40 @@ static inline void GetDsoHandleWithSearchPath( CHECK(nullptr != *dso_handle) << "For Gpu version of PaddlePaddle, it couldn't find CUDA library: " - << dlPath.c_str() << " Please make sure you already specify its path." 
- << "Note: for training data on Cpu using Gpu version of PaddlePaddle," - << "you must specify libcudart.so via LD_LIBRARY_PATH."; + << dlPath.c_str() << ". Please make sure you already specify its path. " + << "Note: for training data on Cpu using Gpu version of PaddlePaddle, " + << "you must specify libcudart via export LD_LIBRARY_PATH for Linux or " + << "export DYLD_LIBRARY_PATH for MAC OS."; } void GetCublasDsoHandle(void** dso_handle) { +#if defined(__APPLE__) || defined(__OSX__) + GetDsoHandleWithSearchPath(FLAGS_cuda_dir, "libcublas.dylib", dso_handle); +#else GetDsoHandleWithSearchPath(FLAGS_cuda_dir, "libcublas.so", dso_handle); +#endif } void GetCudnnDsoHandle(void** dso_handle) { +#if defined(__APPLE__) || defined(__OSX__) + GetDsoHandleWithSearchPath(FLAGS_cudnn_dir, "libcudnn.dylib", dso_handle); +#else GetDsoHandleWithSearchPath(FLAGS_cudnn_dir, "libcudnn.so", dso_handle); +#endif } void GetCudartDsoHandle(void** dso_handle) { +#if defined(__APPLE__) || defined(__OSX__) + GetDsoHandleWithSearchPath("", "libcudart.dylib", dso_handle); +#else GetDsoHandleWithSearchPath("", "libcudart.so", dso_handle); +#endif } void GetCurandDsoHandle(void** dso_handle) { +#if defined(__APPLE__) || defined(__OSX__) + GetDsoHandleWithSearchPath(FLAGS_cuda_dir, "libcurand.dylib", dso_handle); +#else GetDsoHandleWithSearchPath(FLAGS_cuda_dir, "libcurand.so", dso_handle); +#endif } diff --git a/paddle/cuda/src/hl_table_apply.cu b/paddle/cuda/src/hl_table_apply.cu index 05335c5f83..52ee4610ed 100644 --- a/paddle/cuda/src/hl_table_apply.cu +++ b/paddle/cuda/src/hl_table_apply.cu @@ -35,7 +35,7 @@ __global__ void KeMatrixAddRows(real* output, int ldo, real *tab = table + tableId * ldt; for (int i = idx; i < dim; i += blockDimX) { if (AddRow) { - atomicAdd(&tab[i], out[i]); + paddle::paddleAtomicAdd(&tab[i], out[i]); } else { out[i] += tab[i]; } diff --git a/paddle/gserver/dataproviders/DataProvider.cpp b/paddle/gserver/dataproviders/DataProvider.cpp index ba05b70fe9..c3b4769f76 100644 --- a/paddle/gserver/dataproviders/DataProvider.cpp +++ b/paddle/gserver/dataproviders/DataProvider.cpp @@ -149,9 +149,13 @@ void DoubleBuffer::startAsyncLoad() { taskReadySem_.post(); } -ClassRegistrar DataProvider::registrar_; -DataProvider* DataProvider::create(const DataConfig& config, bool useGpu) { - return registrar_.createByType(config.type(), config, useGpu); +ClassRegistrar +DataProvider::registrar_; + +DataProvider* DataProvider::create(const DataConfig& config, + const ModelConfig& modelConfig, + bool useGpu) { + return registrar_.createByType(config.type(), config, modelConfig, useGpu); } REGISTER_DATA_PROVIDER(simple, SimpleDataProvider); diff --git a/paddle/gserver/dataproviders/DataProvider.h b/paddle/gserver/dataproviders/DataProvider.h index aab5d93fca..534491d70d 100644 --- a/paddle/gserver/dataproviders/DataProvider.h +++ b/paddle/gserver/dataproviders/DataProvider.h @@ -39,15 +39,30 @@ limitations under the License. */ #include "paddle/parameter/Argument.h" namespace paddle { - /** * @def REGISTER_DATA_PROVIDER - * @brief Macro for registering a data provider + * @brief Macro for registering a data provider. The class type should contain + * a consturctor with parameter (DataConfig, bool). 
*/ -#define REGISTER_DATA_PROVIDER(__type_name, __class_name) \ - static InitFunction __reg_type_##__type_name([]() { \ - DataProvider::registrar_.registerClass<__class_name>(#__type_name); \ - }) +#define REGISTER_DATA_PROVIDER(__type_name, __class_name)\ + static InitFunction __reg_type_##__type_name([]() {\ + DataProvider::registrar_.registerClass(\ + #__type_name, \ + [](DataConfig conf, ModelConfig, bool useGpu) -> DataProvider* { \ + DataProvider* dp = new __class_name (conf, useGpu);\ + return dp;\ + });\ +}) + +/** + * @def REGISTER_DATA_PROVIDER_EX + * @brief Macro for registering a data provider, which contains a constructor + * with parameters (DataConfig, ModelConfig, bool). + */ +#define REGISTER_DATA_PROVIDER_EX(__type_name, __class_name) \ + static InitFunction __reg_type_##__type_name([] { \ + DataProvider::registrar_.registerClass<__class_name>(#__type_name); \ +}) class DataBatch; class BufferBatch; @@ -285,10 +300,18 @@ protected: */ class DataProvider { public: - static ClassRegistrar<DataProvider, DataConfig, bool> registrar_; + static ClassRegistrar<DataProvider, DataConfig, ModelConfig, bool> registrar_; static DataProvider* create(const DataConfig& config, + const ModelConfig& modelConfig, bool useGpu = FLAGS_use_gpu); + /** + * @brief A create() overload that is only used by unit tests. + */ + inline static DataProvider* create(const DataConfig &config, bool useGpu) { + return create(config, ModelConfig(), useGpu); + } + DataProvider(const DataConfig& config, bool useGpu) : config_(config), skipShuffle_(false), @@ -336,13 +359,13 @@ public: * @note return -1 to indicate unlimited number of samples. */ virtual int64_t getSize() = 0; + /** * @brief Get next batch training samples internally * @param[in] size size of training samples to get * @param[out] batch a batch of training samples * @return actual size of obtained training samples */ - virtual int64_t getNextBatchInternal(int64_t size, DataBatch* batch) = 0; protected: diff --git a/paddle/gserver/dataproviders/DataProviderGroup.h b/paddle/gserver/dataproviders/DataProviderGroup.h index decbde6c91..0689f90f3e 100644 --- a/paddle/gserver/dataproviders/DataProviderGroup.h +++ b/paddle/gserver/dataproviders/DataProviderGroup.h @@ -65,7 +65,8 @@ void DataProviderGroup<T>::reset() { provider_ = nullptr; // shuffle file list - std::random_shuffle(fileList_.begin(), fileList_.end()); + std::shuffle(fileList_.begin(), fileList_.end(), + ThreadLocalRandomEngine::get()); startLoader(); DataProvider::reset(); diff --git a/paddle/gserver/dataproviders/MultiDataProvider.cpp b/paddle/gserver/dataproviders/MultiDataProvider.cpp index c3d14a7069..8e4f53978a 100644 --- a/paddle/gserver/dataproviders/MultiDataProvider.cpp +++ b/paddle/gserver/dataproviders/MultiDataProvider.cpp @@ -22,7 +22,9 @@ namespace paddle { using namespace std; -MultiDataProvider::MultiDataProvider(const DataConfig& config, bool useGpu) +MultiDataProvider::MultiDataProvider(const DataConfig& config, + const ModelConfig& modelConfig, + bool useGpu) : DataProvider(config, useGpu) { bool atLeastOneMainDataFlag = false; totalDataRatio_ = 0; @@ -58,7 +60,9 @@ MultiDataProvider::MultiDataProvider(const DataConfig& config, bool useGpu) subConfig.set_async_load_data(false); } subDataProviders_[i] = - std::unique_ptr<DataProvider>(DataProvider::create(subConfig, useGpu_)); + std::unique_ptr<DataProvider>(DataProvider::create(subConfig, + modelConfig, + useGpu_)); } } @@ -116,6 +120,6 @@ int64_t MultiDataProvider::getNextBatchInternal(int64_t size, return batch->getSize(); } -REGISTER_DATA_PROVIDER(multi, MultiDataProvider); +REGISTER_DATA_PROVIDER_EX(multi, MultiDataProvider); } // namespace 
paddle diff --git a/paddle/gserver/dataproviders/MultiDataProvider.h b/paddle/gserver/dataproviders/MultiDataProvider.h index 7144212863..b498ba6516 100644 --- a/paddle/gserver/dataproviders/MultiDataProvider.h +++ b/paddle/gserver/dataproviders/MultiDataProvider.h @@ -24,7 +24,9 @@ protected: std::vector> subDataProviders_; public: - MultiDataProvider(const DataConfig& config, bool useGpu); + MultiDataProvider(const DataConfig& config, + const ModelConfig& modelConfig, + bool useGpu); ~MultiDataProvider() {} virtual void reset(); virtual void shuffle(); diff --git a/paddle/gserver/dataproviders/ProtoDataProvider.cpp b/paddle/gserver/dataproviders/ProtoDataProvider.cpp index b0c14c85b2..344644755f 100644 --- a/paddle/gserver/dataproviders/ProtoDataProvider.cpp +++ b/paddle/gserver/dataproviders/ProtoDataProvider.cpp @@ -374,7 +374,8 @@ void ProtoDataProvider::reset() { } void ProtoDataProvider::shuffle() { - std::random_shuffle(shuffledSequenceIds_.begin(), shuffledSequenceIds_.end()); + std::shuffle(shuffledSequenceIds_.begin(), shuffledSequenceIds_.end(), + ThreadLocalRandomEngine::get()); } /* diff --git a/paddle/gserver/dataproviders/PyDataProvider.cpp b/paddle/gserver/dataproviders/PyDataProvider.cpp index aeefd16063..1332c0ab63 100644 --- a/paddle/gserver/dataproviders/PyDataProvider.cpp +++ b/paddle/gserver/dataproviders/PyDataProvider.cpp @@ -17,6 +17,8 @@ limitations under the License. */ #include "paddle/utils/PythonUtil.h" #include #include "paddle/utils/Util.h" +#include "paddle/utils/Excepts.h" + namespace paddle { @@ -44,7 +46,6 @@ PyDataProvider::PyDataProvider(const DataConfig& config, bool useGpu, } void PyDataProvider::loadData(const std::vector& fileList) { - int feFlag = fegetexcept(); VLOG(1) << "module:" << pyModuleName_ << " class:" << pyClassName_; classInstance_ = createPythonClass(pyModuleName_, pyClassName_, fileList, pyUserArgs_); @@ -55,7 +56,7 @@ void PyDataProvider::loadData(const std::vector& fileList) { std::string headerInfo = std::string(PyString_AsString(obj.get()), PyString_Size(obj.get())); parseHeaderData(headerInfo); - feenableexcept(feFlag); + feenableexcept(FE_INVALID | FE_DIVBYZERO | FE_OVERFLOW); } void PyDataProvider::parseHeaderData(const std::string& headerData) { diff --git a/paddle/gserver/dataproviders/PyDataProvider2.cpp b/paddle/gserver/dataproviders/PyDataProvider2.cpp index f7886c4e01..2f9a1223c6 100644 --- a/paddle/gserver/dataproviders/PyDataProvider2.cpp +++ b/paddle/gserver/dataproviders/PyDataProvider2.cpp @@ -24,6 +24,27 @@ limitations under the License. 
*/ namespace paddle { +namespace unittest { + +static std::unique_ptr> + OnPoolFilled; + +namespace pydp2 { + +void setOnPoolFilledHook(const std::function& callback) { + OnPoolFilled.reset(new std::function()); + *OnPoolFilled = callback; +} + +void clearOnPoolFilledHook() { + OnPoolFilled.reset(); +} + +} // namespace pydp2 +} // namespace unittest + + + /** * Slot type */ @@ -179,6 +200,7 @@ public: * Ctor */ PyDataProvider2(const DataConfig& config, + const ModelConfig& modelConfig, bool useGpu) :DataProvider(config, useGpu), callingContextCreated_(2) { auto& args = config.load_data_args(); @@ -192,6 +214,12 @@ public: py::DictHelper kwargsDict(kwargs); kwargsDict.setBool("is_train", !config.for_test()); + std::vector inputs; + inputs.reserve(modelConfig.input_layer_names().size()); + std::copy(modelConfig.input_layer_names().begin(), + modelConfig.input_layer_names().end(), + std::back_inserter(inputs)); + kwargsDict.setStringList("input_order", inputs); // kwargs is keyword arguemts to create object. this->createPyDataObj(config.load_data_module(), @@ -199,7 +227,7 @@ public: config.files(), std::move(kwargs)); DBG << "Instance " << instance_.get() << " loaded."; - this->readPyFields(); + this->readPyFields(config.for_test()); DBG << "Py Field Done"; } @@ -253,14 +281,28 @@ private: CHECK_PY(instance_) << "Cannot Create instance"; } - void readPyFields() { + void readPyFields(bool testing) { py::ObjectHelper self(this->instance_); - this->skipShuffle_ = !self.getBoolAttr("should_shuffle"); bool ok; + + this->skipShuffle_ = !self.getBoolAttr("should_shuffle", + &ok /*isBoolType*/); + if (!ok) { + this->skipShuffle_ = testing; // shuffle when is training, skip shuffle + // when is testing. + } + DBG << "Provider Skip Shuffle " << this->skipShuffle_; + this->poolSize_ = self.getIntAttr("pool_size", &ok); if (!ok) { this->poolSize_ = -1UL; } + this->minPoolSize_ = self.getIntAttr("min_pool_size", &ok); + if (!ok) { + this->minPoolSize_ = -1UL; + } + this->minPoolSize_ = std::min(this->poolSize_, this->minPoolSize_); + this->canOverBatchSize_ = self.getBoolAttr("can_over_batch_size"); calcBatchSize_.reset(self.getAttr("calc_batch_size")); @@ -307,7 +349,6 @@ private: } void loadThread() { - callingContexts_.reserve(fileLists_.size()); DBG << "Creating context"; for (auto& filename : fileLists_) { PyGuard g; @@ -332,7 +373,20 @@ private: bool atEnd; data = py::iterNext(callingContexts_[cid], &atEnd); if (atEnd || data == nullptr) { - callingContexts_.erase(callingContexts_.begin() + cid); + if (cid != 0) { + std::swap(callingContexts_[cid], callingContexts_[0]); + cid = 0; + } + + PyObjectPtr front; + { + std::unique_lock l(mtx_); + front = pop_get_front(callingContexts_); + } + { + PyGuard g; + front.reset(); + } this->pullCV_.notify_all(); continue; } @@ -340,6 +394,7 @@ private: size_t additionalBatchSize = 1; if (calcBatchSize_) { + PyGuard guard; py::CallableHelper calcBatchSize(this->calcBatchSize_); calcBatchSize.setArgsSize(1); calcBatchSize.getArgs().set(0, data); @@ -353,11 +408,7 @@ private: if (this->loadThread_){ // wait poolActualSize < poolSize; std::unique_lock l(mtx_); pushCV_.wait(l, [this, additionalBatchSize] { - if (this->canOverBatchSize_) { - return this->poolActualSize_ < poolSize_; - } else { - return this->poolActualSize_ + additionalBatchSize < poolSize_; - } + return this->poolActualSize_ < poolSize_; }); } @@ -366,10 +417,7 @@ private: poolActualSize_ += additionalBatchSize; dataPool_.emplace_back(data); } - - { - pullCV_.notify_all(); - } + 
pullCV_.notify_all(); } DBG << "load thread end"; } @@ -401,7 +449,7 @@ private: private: std::unique_ptr loadThread_; std::atomic exit_; - std::vector callingContexts_; + std::deque callingContexts_; std::deque dataPool_; size_t poolActualSize_; std::condition_variable pushCV_; @@ -412,6 +460,7 @@ private: PyObjectPtr instance_; size_t poolSize_; + size_t minPoolSize_; bool canOverBatchSize_; PyObjectPtr calcBatchSize_; PyObjectPtr generator_; @@ -477,8 +526,13 @@ public: // data pool ready. std::unique_lock l(mtx_); pullCV_.wait(l, [this, &size] { - return this->poolActualSize_ >= size || callingContexts_.empty(); + return this->poolActualSize_ >= std::max(size, this->minPoolSize_) + || callingContexts_.empty(); }); + + if (unittest::OnPoolFilled) { + (*unittest::OnPoolFilled)(this->poolActualSize_); + } } std::deque data; size_t bsize = 0; @@ -494,7 +548,8 @@ public: std::deque& pool = *poolPtr; while (bsize < size && !pool.empty()) { - { // move data from pool to data + { + // move data from pool to data std::lock_guard guard(mtx_); if (skipShuffle_) { size_t i = 0; @@ -504,23 +559,32 @@ public: } else { // when shuffle, use swap to drop only last pool element. size_t i = ThreadLocalRand::rand() % pool.size(); CHECK(pool[i] != nullptr); - if (i != pool.size() - 1) { - std::swap(pool[i], pool.back()); + if (i != 0) { + std::swap(pool[i], pool.front()); } - data.emplace_back(std::move(pool.back())); - pool.pop_back(); + data.emplace_back(std::move(pool.front())); + pool.pop_front(); } - } - { + if (calcBatchSize_) { // custom calc batch size. + PyGuard guard; Py_INCREF(data.back().get()); py::CallableHelper calcBatchSize(calcBatchSize_); calcBatchSize.setArgsSize(1); calcBatchSize.getArgs().set(0, data.back()); PyObjectPtr customBatchSize(calcBatchSize()); bool ok; - bsize += py::castInt(customBatchSize.get(), &ok); + size_t tmp = py::castInt(customBatchSize.get(), &ok); CHECK(ok) << "calc_batch_size must return int"; + + if (bsize + tmp > size && !canOverBatchSize_) { + // Put data back. + pool.push_front(std::move(data.back())); + data.pop_back(); + break; + } else { + bsize += tmp; + } } else { bsize += 1; } @@ -575,6 +639,11 @@ public: scanners[i]->finishFill(inArgs[i]); } + { + PyGuard g; + cache_->drop(&data); + } + DBG << "Reading CPU Batch Done."; if (useGpu_) { @@ -591,11 +660,6 @@ public: } else { *batch = cpuBatch; } - - { - PyGuard g; - cache_->drop(&data); - } return bsize; } }; @@ -603,7 +667,8 @@ public: std::unordered_set PyDataProvider2::gModuleClsPtrs_; PyObjectPtr PyDataProvider2::zeroTuple_(PyTuple_New(0)); -REGISTER_DATA_PROVIDER(py2, PyDataProvider2); +REGISTER_DATA_PROVIDER_EX(py2, PyDataProvider2); + /** * Scanner for dense slot. 
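Editorial note: taken together, the PyDataProvider2 changes above turn the load thread and `getNextBatchInternal` into a bounded producer/consumer pair — the loader blocks on `pushCV_` while `poolActualSize_` is at `pool_size`, and the reader blocks on `pullCV_` until at least `max(size, min_pool_size)` samples are buffered or the generators (`callingContexts_`) are exhausted. A condensed sketch of that handshake, assuming a generic item type and illustrative names rather than Paddle's actual classes:

```cpp
#include <algorithm>
#include <condition_variable>
#include <deque>
#include <mutex>

// Illustrative bounded sample pool in the style of the pushCV_/pullCV_
// handshake in PyDataProvider2.
class SamplePool {
public:
  SamplePool(size_t poolSize, size_t minPoolSize)
      : poolSize_(poolSize), minPoolSize_(minPoolSize) {}

  // Producer side (the load thread): block while the pool is full.
  void push(int sample) {
    std::unique_lock<std::mutex> lock(mtx_);
    pushCV_.wait(lock, [this] { return pool_.size() < poolSize_; });
    pool_.push_back(sample);
    pullCV_.notify_all();
  }

  // Consumer side: block until a batch of `size` can be drawn; honoring
  // minPoolSize_ keeps early batches from being starved of shuffle diversity.
  std::deque<int> popBatch(size_t size) {
    std::unique_lock<std::mutex> lock(mtx_);
    pullCV_.wait(lock, [this, size] {
      return pool_.size() >= std::max(size, minPoolSize_) || done_;
    });
    std::deque<int> batch;
    while (batch.size() < size && !pool_.empty()) {
      batch.push_back(pool_.front());
      pool_.pop_front();
    }
    pushCV_.notify_all();
    return batch;
  }

  // Producer signals that no more samples will arrive.
  void markDone() {
    std::lock_guard<std::mutex> lock(mtx_);
    done_ = true;
    pullCV_.notify_all();
  }

private:
  std::mutex mtx_;
  std::condition_variable pushCV_, pullCV_;
  std::deque<int> pool_;
  size_t poolSize_, minPoolSize_;
  bool done_ = false;
};
```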
diff --git a/paddle/gserver/evaluators/CTCErrorEvaluator.cpp b/paddle/gserver/evaluators/CTCErrorEvaluator.cpp index d0b1c0447d..e397c71c87 100644 --- a/paddle/gserver/evaluators/CTCErrorEvaluator.cpp +++ b/paddle/gserver/evaluators/CTCErrorEvaluator.cpp @@ -194,8 +194,8 @@ public: virtual real evalImp(std::vector& arguments) { CHECK_EQ(arguments.size(), (size_t)2); Argument output, label; - output.resizeAndCopyFrom(arguments[0], false); - label.resizeAndCopyFrom(arguments[1], false); + output.resizeAndCopyFrom(arguments[0], false, HPPL_STREAM_DEFAULT); + label.resizeAndCopyFrom(arguments[1], false, HPPL_STREAM_DEFAULT); hl_stream_synchronize(HPPL_STREAM_DEFAULT); CHECK(label.sequenceStartPositions); CHECK(label.ids); @@ -207,7 +207,7 @@ public: real err = 0; err = editDistance( output.value->getData() + output.value->getWidth() * outputStarts[i], - output.value->getHeight(), output.value->getWidth(), + outputStarts[i+1] - outputStarts[i], output.value->getWidth(), label.ids->getData() + labelStarts[i], labelStarts[i + 1] - labelStarts[i]); @@ -224,6 +224,9 @@ public: for (const std::string& name : config_.input_layers()) { arguments.push_back(nn.getLayer(name)->getOutput()); } + } + + virtual void updateSamplesNum(const std::vector& arguments) { numSequences_ += arguments[1].getNumSequences(); } diff --git a/paddle/gserver/gradientmachines/NeuralNetwork.cpp b/paddle/gserver/gradientmachines/NeuralNetwork.cpp index fca5282895..3127b4dd9a 100644 --- a/paddle/gserver/gradientmachines/NeuralNetwork.cpp +++ b/paddle/gserver/gradientmachines/NeuralNetwork.cpp @@ -277,6 +277,7 @@ void NeuralNetwork::getState(MachineState& machineState) { } void NeuralNetwork::backward(const UpdateCallback& callback) { + gLayerStackTrace.pop(""); // tell layer trace is during backward. FOR_EACH_R(layer, layers_) { REGISTER_TIMER_INFO("BackwardTimer", (*layer)->getName().c_str()); if ((*layer)->needGradient()) { @@ -384,17 +385,17 @@ void NeuralNetwork::setOutputGrad(const std::vector& args) { } } -extern NeuralNetwork* newCustomNeuralNetwork( - const std::string& name, NeuralNetwork* network) __attribute__((weak)); +extern NeuralNetwork* newCustomNerualNetwork( + const std::string& name, NeuralNetwork* network) __attribute__((weak)); NeuralNetwork* NeuralNetwork::newNeuralNetwork( const std::string& name, NeuralNetwork* rootNetwork) { - if (newCustomNeuralNetwork) { - return newCustomNeuralNetwork(name, rootNetwork); - } else { - return new NeuralNetwork(name, rootNetwork); - } + if (newCustomNerualNetwork) { + return newCustomNerualNetwork(name, rootNetwork); + } else { + return new NeuralNetwork(name, rootNetwork); + } } } // namespace paddle diff --git a/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp b/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp index 7bc5fe5181..fc38bca3c4 100644 --- a/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp +++ b/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ - #include "paddle/utils/Stat.h" #include "paddle/utils/Util.h" #include "paddle/utils/Flags.h" @@ -291,6 +290,8 @@ void RecurrentGradientMachine::init( if (subModelConfig->evaluator_names_size() > 0) { evaluator_.reset(frames_[0]->makeEvaluator()); } + + targetInfoInlinkId_ = subModelConfig->target_inlinkid(); } void RecurrentGradientMachine::resizeOrCreateFrames(int numFrames) { @@ -325,7 +326,7 @@ void RecurrentGradientMachine::resizeOrCreateFrames(int numFrames) { for (int i = frames_.size(); i < numFrames; ++i) { std::unique_ptr frame( - NeuralNetwork::newNeuralNetwork(subModelName_)); + NeuralNetwork::newNeuralNetwork(subModelName_)); frame->init(config_, subParamInitCb); for (auto& inFrameLine : inFrameLines_) { @@ -382,6 +383,16 @@ void RecurrentGradientMachine::forward(const std::vector& inArgs, size_t numSequences = input.getNumSequences(); const int* starts = input.sequenceStartPositions->getData(false); bool hasSubseq = input.hasSubseq(); + + // In case of !hasSubseq or targetInfoInlinkId_ == -1, all inlinks share the + // same inframe info + bool shareInlinkInfo = !hasSubseq || targetInfoInlinkId_ == -1; + + // Defaultly, share info with the first inlink + if (shareInlinkInfo) { + targetInfoInlinkId_ = 0; + } + // check hasSubseq in both config and input are the same CHECK_EQ(hasSubseq, inFrameLines_[0].hasSubseq); @@ -394,9 +405,17 @@ void RecurrentGradientMachine::forward(const std::vector& inArgs, CHECK_EQ((size_t)input1.getNumSequences(), numSequences); // check all inputs should have same hasSubseq flag CHECK_EQ(input.hasSubseq(), inFrameLines_[0].hasSubseq); - CHECK_EQ(input1.getBatchSize(), batchSize); - CHECK(std::equal(starts, starts + numSequences + 1, - input1.sequenceStartPositions->getData(false))); + + // if shareInlinkInfo, checks: + // 1. all inlinks have same number of total tokens + // 2. all inlinks have same number of tokens for each sentence of each + // sample. 
If hasSubseq, one sample has multiple sentence, else, one + // sample is one sentence + if (shareInlinkInfo) { + CHECK_EQ(input1.getBatchSize(), batchSize); + CHECK(std::equal(starts, starts + numSequences + 1, + input1.sequenceStartPositions->getData(false))); + } } if (hasSubseq) { @@ -408,19 +427,46 @@ void RecurrentGradientMachine::forward(const std::vector& inArgs, for (size_t i = 1; i < inFrameLines_.size(); ++i) { const Argument& input1 = inFrameLines_[i].inLayer->getOutput(); CHECK_EQ((size_t)input1.getNumSubSequences(), numSubSequences); - CHECK(std::equal(subStarts, subStarts + numSubSequences + 1, - input1.subSequenceStartPositions->getData(false))); + if (shareInlinkInfo) { + CHECK(std::equal(subStarts, subStarts + numSubSequences + 1, + input1.subSequenceStartPositions->getData(false))); + } } } - seqLengthAndStart_.clear(); - input.getSeqLengthAndStart(&seqLengthAndStart_, &maxSequenceLength_); + info_.clear(); + info_.resize(inFrameLines_.size()); + + seqInfos_.clear(); + seqInfos_.resize(inFrameLines_.size()); + + { + AsyncGpuBlock asyncGpuBlock; + // if shareInlinkInfo, only calculate info of the first inlink + // else, calculate info for each inlink + if (shareInlinkInfo) { + input.getSeqInfo(&seqInfos_[0]); + maxSequenceLength_ = seqInfos_[0][0].topLevelLength; + createInFrameInfo(0, input, passType); + } else { + for (size_t i = 0; i < inFrameLines_.size(); i++) { + const Argument& input1 = inFrameLines_[i].inLayer->getOutput(); + input1.getSeqInfo(&seqInfos_[i]); + maxSequenceLength_ = seqInfos_[i][0].topLevelLength; + createInFrameInfo(i, input1, passType); + } + } + + // inFrameLine select rows in real layer one time + for (size_t i = 0; i < inFrameLines_.size(); i++) { + int curInlinkId = shareInlinkInfo ? 0 : i; + selectRowsOneTime(inFrameLines_[i].inLayer, info_[curInlinkId].allIds, + &(inFrameLines_[i].outArg), passType); + } + } resizeOrCreateFrames(maxSequenceLength_); resizeBootFrame(numSequences); - AsyncGpuBlock asyncGpuBlock; - createInFrameInfo(input, passType); - for (auto& memoryFrameLine : memoryFrameLines_) { if (memoryFrameLine.rootAgent) { auto scatterAgent = @@ -443,23 +489,30 @@ void RecurrentGradientMachine::forward(const std::vector& inArgs, auto gatherAgent = dynamic_cast(outFrameLine.agentLayer.get()); CHECK_NOTNULL(gatherAgent); - gatherAgent->copyIdAndSequenceInfo(input, info_.allIds, info_.idIndex); + gatherAgent->copyIdAndSequenceInfo(input, info_[targetInfoInlinkId_].allIds, + info_[targetInfoInlinkId_].idIndex); } for (int i = 0; i < maxSequenceLength_; ++i) { - int idSize = info_.idIndex[i + 1] - info_.idIndex[i]; - + int idSize = 0; // connect in_links - for (auto& inFrameLine : inFrameLines_) { + for (size_t j = 0; j < inFrameLines_.size(); ++j) { + Info& info = info_[shareInlinkInfo ? 
0 : j]; + // idSize denotes the sum number of tokens in each length i + idSize = info.idIndex[i + 1] - info.idIndex[i]; + InFrameLine inFrameLine = inFrameLines_[j]; auto scatterAgent = dynamic_cast(inFrameLine.agents[i].get()); scatterAgent->setRealLayerAndOutput(inFrameLine.inLayer, - inFrameLine.outArg, info_.allIds, - info_.idIndex[i], idSize); + inFrameLine.outArg, info.allIds, + info.idIndex[i], idSize); if (hasSubseq) { - int size = info_.seqStartPosIndex[i + 1] - info_.seqStartPosIndex[i]; - scatterAgent->setSequenceStartPositions( - info_.sequenceStartPositions, info_.seqStartPosIndex[i], size); + // size: the length of subsequence + int size = + info.seqStartPosIndex[i + 1] - info.seqStartPosIndex[i]; + scatterAgent->setSequenceStartPositions(info.sequenceStartPositions, + info.seqStartPosIndex[i], + size); } } @@ -469,13 +522,16 @@ void RecurrentGradientMachine::forward(const std::vector& inArgs, dynamic_cast(outFrameLine.agentLayer.get()); gatherAgent->addRealLayer(outFrameLine.frames[i]); } - // connect memory links + // Adopt info_[0].idIndex because seq which has_subseq=True + // doesn't support Memory with !hasSubseq bootlayer; + // And inlinks that !hasSubSeq must have same inlink length. + idSize = info_[0].idIndex[i + 1] - info_[0].idIndex[i]; for (auto& memoryFrameLine : memoryFrameLines_) { NeuralNetwork::connect( memoryFrameLine.agents[i], i == 0 ? memoryFrameLine.bootLayer : memoryFrameLine.frames[i - 1], - idSize /*height of agent*/); + numSeqs_[i] /*height of agent*/); } } @@ -560,62 +616,78 @@ void RecurrentGradientMachine::removeBeamSearchStatisticsCallbacks() { * If hasSubseq, will also create scattered sequenceStartPositions infomation * for all realLayer of inFrameLines one time. */ -void RecurrentGradientMachine::createInFrameInfo(const Argument& input, + +void RecurrentGradientMachine::createInFrameInfo(int inlinkId, + const Argument& input, PassType passType) { bool hasSubseq = input.hasSubseq(); + // numSequences: # samples(sequences) in a batch size_t numSequences = input.getNumSequences(); std::vector allIds; - info_.idIndex.clear(); - info_.idIndex.push_back(0); // first idIndex = 0 - if (hasSubseq) { // for sequenceScatterAgentLayer - size_t numSubSequences = input.getNumSubSequences(); - std::vector sequenceStartPositions; - info_.seqStartPosIndex.clear(); - info_.seqStartPosIndex.push_back(0); // first seqStartPosIndex = 0 - for (int i = 0; i < maxSequenceLength_; ++i) { - sequenceStartPositions.push_back(0); // first element = 0 - for (size_t j = 0; j < numSubSequences; ++j) { - if (std::get<3>(seqLengthAndStart_[j]) == i) { - int subSeqStart = std::get<1>(seqLengthAndStart_[j]); - int subSeqLength = std::get<0>(seqLengthAndStart_[j]); - for (int k = subSeqStart; k < subSeqStart + subSeqLength; ++k) { - allIds.push_back(k); - } - sequenceStartPositions.push_back(sequenceStartPositions.back() + - subSeqLength); - } - } - info_.idIndex.push_back(allIds.size()); - info_.seqStartPosIndex.push_back(sequenceStartPositions.size()); + + auto& seqInfo = seqInfos_[inlinkId]; + + numSeqs_.clear(); + Info* inlinkInfo = &info_[inlinkId]; + inlinkInfo->idIndex.clear(); + inlinkInfo->idIndex.push_back(0); // first idIndex = 0 + + std::vector sequenceStartPositions; + const int* subSequenceStartPositions = nullptr; + + if (hasSubseq) { // for sequenceScatterAgentLayer + subSequenceStartPositions = + input.subSequenceStartPositions->getData(false); + inlinkInfo->seqStartPosIndex.clear(); + inlinkInfo->seqStartPosIndex.push_back(0); // first seqStartPosIndex = 0 + } 
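+  // Worked example (illustrative, assuming reversed_ == false and no
+  // subsequences, with sequences sorted by decreasing length): two sequences
+  // with (seqStart, topLevelLength) of (0, 3) and (3, 2) produce, step by
+  // step,
+  //   i=0 -> ids {0, 3},  i=1 -> ids {1, 4},  i=2 -> ids {2}
+  // so allIds = {0, 3, 1, 4, 2}, idIndex = {0, 2, 4, 5}, and
+  // numSeqs_ = {2, 2, 1}; step i feeds frame i one row per live sequence.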
+ // maxSequenceLength_: max topLevelLength in allsamples + for (int i = 0; i < maxSequenceLength_; ++i) { + if (hasSubseq) { + sequenceStartPositions.push_back(0); // first element = 0 } - // inFrameLine create sequenceStartPositions one time - CHECK_EQ(sequenceStartPositions.size(), - maxSequenceLength_ + numSubSequences); - CHECK_EQ(info_.seqStartPosIndex.size(), - static_cast(maxSequenceLength_ + 1)); - createSeqPos(sequenceStartPositions, &info_.sequenceStartPositions); - } else { // for scatterAgentLayer - for (int i = 0; i < maxSequenceLength_; ++i) { - for (size_t j = 0; j < numSequences; ++j) { - int seqLength = std::get<0>(seqLengthAndStart_[j]); - if (i >= seqLength) { - break; + int numSeqs = 0; + for (size_t j = 0; j < numSequences; ++j) { + int seqLength = seqInfo[j].topLevelLength; + if (i >= seqLength) { + break; + } + ++numSeqs; + if (hasSubseq) { + int subSeqStart = subSequenceStartPositions[seqInfo[j].subSeqStart + i]; + int subSeqEnd = + subSequenceStartPositions[seqInfo[j].subSeqStart + i + 1]; + for (int k = subSeqStart; k < subSeqEnd; ++k) { + allIds.push_back(k); } - int seqStart = std::get<1>(seqLengthAndStart_[j]); + sequenceStartPositions.push_back(sequenceStartPositions.back() + + subSeqEnd - subSeqStart); + } else { + int seqStart = seqInfo[j].seqStart; allIds.push_back(reversed_ ? (seqStart + seqLength - 1 - i) : (seqStart + i)); } - info_.idIndex.push_back(allIds.size()); + } + inlinkInfo->idIndex.push_back(allIds.size()); + numSeqs_.push_back(numSeqs); + if (hasSubseq) { + inlinkInfo->seqStartPosIndex.push_back(sequenceStartPositions.size()); } } - // copy and check scatterId - copyScattedId(allIds, &info_.allIds, input.getBatchSize()); - CHECK_EQ(info_.idIndex.size(), static_cast(maxSequenceLength_ + 1)); - // inFrameLine select rows in real layer one time - for (auto& inFrameLine : inFrameLines_) { - selectRowsOneTime(inFrameLine.inLayer, info_.allIds, &inFrameLine.outArg, - passType); + if (hasSubseq) { + // inFrameLine create sequenceStartPositions one time + CHECK_EQ(sequenceStartPositions.size(), + static_cast(maxSequenceLength_ + + input.getNumSubSequences())); + CHECK_EQ(inlinkInfo->seqStartPosIndex.size(), + static_cast(maxSequenceLength_ + 1)); + createSeqPos(sequenceStartPositions, &inlinkInfo->sequenceStartPositions); } + + // copy and check scatterId + copyScattedId(allIds, &inlinkInfo->allIds, input.getBatchSize()); + CHECK_EQ(inlinkInfo->idIndex.size(), + static_cast(maxSequenceLength_ + 1)); } /* like createInFrameInfo, but for all realLayer of memoryFrameLines*/ @@ -633,19 +705,20 @@ void RecurrentGradientMachine::createMemoryFrameInfo( sequenceStartPositions.push_back(0); // first element = 0 const int* starts = input.sequenceStartPositions->getData(false); for (size_t i = 0; i < numSequences; ++i) { - int seqId = std::get<2>(seqLengthAndStart_[i]); + // memory info adopt info of inlinks[0] + int seqId = seqInfos_[0][i].seqId; for (int k = starts[seqId]; k < starts[seqId + 1]; ++k) { allIds.push_back(k); } sequenceStartPositions.push_back(sequenceStartPositions.back() + - starts[seqId + 1] - starts[seqId]); + starts[seqId + 1] - starts[seqId]); } createSeqPos(sequenceStartPositions, &(*memoryFrameLine).sequenceStartPositions); } else { // for scatterAgentLayer for (size_t i = 0; i < numSequences; ++i) { - allIds.push_back(std::get<2>(seqLengthAndStart_[i])); + allIds.push_back(seqInfos_[0][i].seqId); } } // copy and check scatterId @@ -672,16 +745,24 @@ void RecurrentGradientMachine::selectRowsOneTime(LayerPtr layer, const IVectorPtr& 
allIds, Argument* arg, PassType passType) { - const MatrixPtr& realV = layer->getOutputValue(); - int height = realV->getHeight(); - int width = realV->getWidth(); - Matrix::resizeOrCreate(arg->value, height, width, /* trans */ false, useGpu_); - arg->value->zeroMem(); - arg->value->selectRows(*realV, *allIds); - if (passType != PASS_TEST) { - Matrix::resizeOrCreate(arg->grad, height, width, /* trans */ false, - useGpu_); - arg->grad->zeroMem(); + Argument& src = layer->getOutput(); + if (src.value) { + const MatrixPtr& realV = src.value; + int height = realV->getHeight(); + int width = realV->getWidth(); + Matrix::resizeOrCreate( + arg->value, height, width, /* trans */ false, useGpu_); + arg->value->zeroMem(); + arg->value->selectRows(*realV, *allIds); + if (passType != PASS_TEST) { + Matrix::resizeOrCreate(arg->grad, height, width, /* trans */ false, + useGpu_); + arg->grad->zeroMem(); + } + } + if (src.ids) { + IVector::resizeOrCreate(arg->ids, src.ids->getSize(), useGpu_); + arg->ids->selectFrom(*src.ids, *allIds); } } @@ -699,18 +780,19 @@ size_t RecurrentGradientMachine::getGenBatchSize() { for (auto& memoryFrameLine : memoryFrameLines_) { if (!memoryFrameLine.rootLayer) continue; Argument& bootArg = memoryFrameLine.rootLayer->getOutput(); - size_t batchSize = memoryFrameLine.is_sequence ? - bootArg.getNumSequences() : bootArg.getBatchSize(); + size_t batchSize = memoryFrameLine.is_sequence ? bootArg.getNumSequences() + : bootArg.getBatchSize(); if (numSequences) { CHECK_EQ(numSequences, batchSize); } else { numSequences = batchSize; } } - CHECK(numSequences) << "Fail to get batch size in generation. " - "At least one of the Memory layer MUST have a layer that is NOT in " - "the layer group to boot it, and this boot layer is used to " - "decide batch_size in generation process."; + CHECK(numSequences) + << "Fail to get batch size in generation. 
" + "At least one of the Memory layer MUST have a layer that is NOT in " + "the layer group to boot it, and this boot layer is used to " + "decide batch_size in generation process."; return numSequences; } @@ -732,7 +814,9 @@ void RecurrentGradientMachine::generateSequence() { // connect boot frame memory links std::vector ids(numSequences); - for (size_t i = 0; i < numSequences; ++i) { ids[i] = i; } + for (size_t i = 0; i < numSequences; ++i) { + ids[i] = i; + } for (auto& memoryFrameLine : memoryFrameLines_) { if (memoryFrameLine.rootAgent) { auto scatterAgent = @@ -756,7 +840,8 @@ void RecurrentGradientMachine::generateSequence() { // init outArg size_t resultNum = generator_.config.num_results_per_sample(); - IVector::resizeOrCreate(generator_.outArg.ids, + IVector::resizeOrCreate( + generator_.outArg.ids, generator_.config.max_num_frames() * numSequences * resultNum, false); if (resultNum > 1) { CHECK_LE(resultNum, static_cast(generator_.config.beam_size())); @@ -847,7 +932,9 @@ void RecurrentGradientMachine::oneWaySearch(size_t batchSize) { // path.seqId = -1 indicates end of generation // of an input sequence finalPaths[seqIds_[j]].seqId = -1; - } else { scatterIds.push_back(j); } + } else { + scatterIds.push_back(j); + } } } @@ -856,13 +943,12 @@ void RecurrentGradientMachine::oneWaySearch(size_t batchSize) { starts[0] = 0; generator_.ids.clear(); for (size_t i = 0; i < batchSize; ++i) { - generator_.ids.insert(generator_.ids.end(), - finalPaths[i].ids.begin(), + generator_.ids.insert(generator_.ids.end(), finalPaths[i].ids.begin(), finalPaths[i].ids.end()); starts[i + 1] = generator_.ids.size(); batchMachineIdVec_.insert(batchMachineIdVec_.end(), - finalPaths[i].machineIdVec.begin(), - finalPaths[i].machineIdVec.end()); + finalPaths[i].machineIdVec.begin(), + finalPaths[i].machineIdVec.end()); } } @@ -920,9 +1006,9 @@ void RecurrentGradientMachine::forwardFrame(int machineCur) { } } -void RecurrentGradientMachine::singlePathExpand( - Path& curPath, size_t curPathId, std::vector& newPaths, - size_t expandWidth) { +void RecurrentGradientMachine::singlePathExpand(Path& curPath, size_t curPathId, + std::vector& newPaths, + size_t expandWidth) { int calc_id = gDiyProbStart ? gDiyProbStart(curPath.ids.size(), curPath.ids.data()) : 0; @@ -946,19 +1032,20 @@ void RecurrentGradientMachine::singlePathExpand( if (id == -1) break; real newLogProb = generator_.config.log_prob() ? std::log(prob) : prob; - Path newPath(curPath, id, newLogProb, - curPathId /*machineId*/, k /*topIndex*/); + Path newPath(curPath, id, newLogProb, curPathId /*machineId*/, + k /*topIndex*/); if (this->beamSearchCtrlCallbacks_) { if (beamSearchCtrlCallbacks_->stopDetermineCandidates( - newPath.seqId, newPath.ids, newPath.probHistory)) return; + newPath.seqId, newPath.ids, newPath.probHistory)) + return; } // outFrameLines_.size() > 1UL if (dataArgsSize_) { newPath.machineIdVec = curPath.machineIdVec; newPath.machineIdVec.push_back(curPathId); } - bool atEos = eosVec[index] == 1U || - newPath.ids.size() >= (size_t)maxSequenceLength_; + bool atEos = + eosVec[index] == 1U || newPath.ids.size() >= (size_t)maxSequenceLength_; // adjustNewPath newPath.adjustProb(calc_id, atEos); if (this->beamSearchCtrlCallbacks_) { @@ -966,16 +1053,18 @@ void RecurrentGradientMachine::singlePathExpand( newPath.seqId, newPath.ids, newPath.probHistory, &newPath.logProb); } if (!newPath.isDropable()) { - atEos ? finalPaths_[curPath.seqId].push_back(newPath) : - newPaths.push_back(newPath); + atEos ? 
finalPaths_[curPath.seqId].push_back(newPath) + : newPaths.push_back(newPath); } } // for expandWidth - if (gDiyProbStop) { gDiyProbStop(calc_id); } + if (gDiyProbStop) { + gDiyProbStop(calc_id); + } } -void RecurrentGradientMachine::beamExpand( - std::vector& paths, std::vector& newPaths) { +void RecurrentGradientMachine::beamExpand(std::vector& paths, + std::vector& newPaths) { size_t candidatePathCount = paths.size(); // idVec.size() could be larger than candidatePathCount * beam, // so user can drop some node customly. @@ -988,7 +1077,7 @@ void RecurrentGradientMachine::beamExpand( int curSeqId = 0; for (size_t j = 0; j <= candidatePathCount; j++) { // expansions of a single sequence are all processed - curSeqId = (j < candidatePathCount? paths[j].seqId : curSeqId + 1); + curSeqId = (j < candidatePathCount ? paths[j].seqId : curSeqId + 1); if (prevSeqId != -1 && curSeqId != prevSeqId) { totalExpandCount += beamShrink(newPaths, prevSeqId, totalExpandCount); } @@ -1000,11 +1089,14 @@ void RecurrentGradientMachine::beamExpand( } // Drop extra nodes to beam size. -size_t RecurrentGradientMachine::beamShrink( - std::vector& newPaths, size_t seqId, size_t totalExpandCount) { - size_t minNewPathSize = std::min(getBeamSize(), - newPaths.size() - totalExpandCount); - if (!minNewPathSize) { return 0; } +size_t RecurrentGradientMachine::beamShrink(std::vector& newPaths, + size_t seqId, + size_t totalExpandCount) { + size_t minNewPathSize = + std::min(getBeamSize(), newPaths.size() - totalExpandCount); + if (!minNewPathSize) { + return 0; + } std::nth_element(newPaths.begin() + totalExpandCount, newPaths.begin() + totalExpandCount + minNewPathSize, newPaths.end(), Path::greaterPath); @@ -1017,11 +1109,8 @@ size_t RecurrentGradientMachine::beamShrink( // Remove the already formed paths that are relatively short finalPaths_[seqId].erase( - std::remove_if(finalPaths_[seqId].begin(), - finalPaths_[seqId].end(), - [&](Path& p) { - return p.logProb < minPathLogProb; - }), + std::remove_if(finalPaths_[seqId].begin(), finalPaths_[seqId].end(), + [&](Path& p) { return p.logProb < minPathLogProb; }), finalPaths_[seqId].end()); for (auto p : finalPaths_[seqId]) { if (minFinalPathLogProb_[seqId] > p.logProb) { @@ -1030,7 +1119,7 @@ size_t RecurrentGradientMachine::beamShrink( } if (finalPaths_[seqId].size() >= getBeamSize() && - minFinalPathLogProb_[seqId] >= maxPathLogProb) { + minFinalPathLogProb_[seqId] >= maxPathLogProb) { newPaths.resize(totalExpandCount); return 0; } @@ -1067,7 +1156,8 @@ void RecurrentGradientMachine::fillGenOutputs() { // in beam search, here only reserved the top 1 generated result // for out_links that are not the generated word indices. batchMachineIdVec_.insert(batchMachineIdVec_.end(), - path.machineIdVec.begin(), path.machineIdVec.end()); + path.machineIdVec.begin(), + path.machineIdVec.end()); } } starts[i + 1] = generator_.ids.size(); @@ -1091,21 +1181,21 @@ void RecurrentGradientMachine::copyDataOutlinkFrame(size_t machineCur) { void RecurrentGradientMachine::createDataOutlink( std::vector& machineIdVec) { - size_t seqNum = getBeamSize() > 1UL ? - finalPaths_.size() : finalPaths_[0].size(); + size_t seqNum = + getBeamSize() > 1UL ? finalPaths_.size() : finalPaths_[0].size(); std::vector starts(seqNum + 1, 0); for (size_t i = 0; i < seqNum; ++i) { - size_t seqLen = getBeamSize() > 1UL ? finalPaths_[i][0].ids.size() : - finalPaths_[0][i].ids.size(); + size_t seqLen = getBeamSize() > 1UL ? 
finalPaths_[i][0].ids.size() + : finalPaths_[0][i].ids.size(); starts[i + 1] = starts[i] + seqLen; } for (size_t i = 0; i < dataArgsSize_; i++) { - dataArgs_[i].concat(dataArgsFrame_[i], machineIdVec, - starts, useGpu_, HPPL_STREAM_1, PASS_TEST); + dataArgs_[i].concat(dataArgsFrame_[i], machineIdVec, starts, useGpu_, + HPPL_STREAM_1, PASS_TEST); - auto dataAgent = dynamic_cast( - outFrameLines_[i + 1].agentLayer.get()); + auto dataAgent = + dynamic_cast(outFrameLines_[i + 1].agentLayer.get()); CHECK_NOTNULL(dataAgent); dataAgent->setData(dataArgs_[i]); } diff --git a/paddle/gserver/gradientmachines/RecurrentGradientMachine.h b/paddle/gserver/gradientmachines/RecurrentGradientMachine.h index cc49d13952..6328213793 100644 --- a/paddle/gserver/gradientmachines/RecurrentGradientMachine.h +++ b/paddle/gserver/gradientmachines/RecurrentGradientMachine.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include "GradientMachine.h" @@ -101,7 +100,7 @@ public: * Return true if this prefix or candidate is expected to be dropped. */ typedef std::function<bool(int seqId, const std::vector<int>&, - const std::vector<real>&)> DropCallback; + const std::vector<real>&)> DropCallback; /** * @brief NormOrDropNodeCallback @@ -117,7 +116,7 @@ public: * The fourth parameter is the probability of the whole path. */ typedef std::function<void(int seqId, std::vector<int>&, - std::vector<real>&, real*)> NormOrDropNodeCallback; + std::vector<real>&, real*)> NormOrDropNodeCallback; /** * @brief Register beam search control callbacks. Used for prediction. @@ -192,7 +191,7 @@ public: int machineId; // index of sample in frame int topIndex; // index of MaxIdLayer output in one sample - int seqId; // index of sequence in batch generation + int seqId; // index of sequence in batch generation std::vector<int> machineIdVec; /** * @brief Path default ctor, first logProb is 0. */ - Path() { logProb = 0; seqId = 0; } + Path() { + logProb = 0; + seqId = 0; + } explicit Path(size_t seqId) : seqId(seqId) { logProb = 0; } /** @@ -319,21 +321,33 @@ protected: }; std::vector<MemoryFrameLine> memoryFrameLines_; - // All inFrameLines and outFrameLines have the same element as follows. + // Each inFrameLine (inlink) has its own info (elements) below, + // and all outFrameLines (outlinks) share the info with one inFrameLine, + // which is assigned by targetInfoInlinkId_. struct Info { IVectorPtr allIds; // scattered id of realLayer std::vector<int> idIndex; // index of allIds ICpuGpuVectorPtr - sequenceStartPositions; // scattered sequenceStartPositions + sequenceStartPositions; // scattered sequenceStartPositions std::vector<int> seqStartPosIndex; // index of sequenceStartPositions }; - Info info_; + std::vector<Info> info_; + + // numSeqs_[i] is the number of sequences which are longer than i (for + // sequence data) or have more than i subsequences (for subsequence data) + std::vector<int> numSeqs_; - // if no subSeq, tuple of (seqLength, seqStart, seqIndex, seqIndex) - // else, tuple of (subSeqLength, subSeqStart, seqIndex, subSeqIndex) - std::vector<std::tuple<int, int, int, int>> seqLengthAndStart_; + std::vector<std::vector<Argument::SeqInfo>> seqInfos_; - void createInFrameInfo(const Argument& input, PassType passType); + // the id of the inlink which shares info with outlinks + int targetInfoInlinkId_; + + /* create scattered id information for all realLayer of inFrameLines one time. * If hasSubseq, will also create scattered sequenceStartPositions information * for all realLayer of inFrameLines one time. 
+ */ + void createInFrameInfo(int inlinks_id, const Argument& input, + PassType passType); void createMemoryFrameInfo(MemoryFrameLine* memoryFrameLine, PassType passType); @@ -363,6 +377,9 @@ protected: NeuralNetwork* rootNetwork_; bool reversed_; + + // if hasSubseq: max number of sentences(subseq)in batchsize samples + // else: max number of tokens in batchsize samples(sentences) int maxSequenceLength_; bool useGpu_; bool stopBeamSearch_; @@ -415,7 +432,7 @@ private: * @param machineIdVec : select a row of output matrix in each frame * that the generation process expanded. */ - void createDataOutlink(std::vector & machineIdVec); + void createDataOutlink(std::vector& machineIdVec); /* * @brief used in beam search, connect previous frame to form recurrent link diff --git a/paddle/gserver/layers/AgentLayer.cpp b/paddle/gserver/layers/AgentLayer.cpp index c1bef18ed3..056e956885 100644 --- a/paddle/gserver/layers/AgentLayer.cpp +++ b/paddle/gserver/layers/AgentLayer.cpp @@ -139,15 +139,16 @@ void ScatterAgentLayer::forward(PassType passType) { Layer::forward(passType); CHECK_EQ(realLayer_->getDeviceId(), this->getDeviceId()); - if (realLayer_->getOutput().ids) { // ids scatter - IVector::resizeOrCreate(output_.ids, ids_->getSize(), useGpu_); - output_.ids->selectFrom(*realLayer_->getOutput().ids, *ids_); - } else { // value scatter - int width = this->getSize(); - if (realOutArg_.value) { - output_.subArgFrom(realOutArg_, /* offset */ idIndex_ * width, idSize_, - width, useGpu_); - } else { // used in generation + int width = this->getSize(); + if (realOutArg_.value || realOutArg_.ids) { + output_.subArgFrom(realOutArg_, /* offset */ idIndex_, idSize_, + width, useGpu_); + } else { // used in generation + if (realLayer_->getOutput().ids) { + IVector::resizeOrCreate(output_.ids, ids_->getSize(), useGpu_); + output_.ids->selectFrom(*realLayer_->getOutput().ids, *ids_); + } + if (realLayer_->getOutput().value) { int height = ids_->getSize(); resetOutput(height, width); @@ -213,18 +214,17 @@ void SequenceGatherAgentLayer::forward(PassType passType) { void SequenceScatterAgentLayer::forward(PassType passType) { Layer::forward(passType); CHECK_EQ(realLayer_->getDeviceId(), this->getDeviceId()); - CHECK(!realLayer_->getOutput().ids) << "Not supported"; const Argument& input = realLayer_->getOutput(); - CHECK_EQ(input.value->getWidth(), this->getSize()); + CHECK_EQ(realLayer_->getSize(), this->getSize()); int width = this->getSize(); AsyncGpuBlock asyncGpuBlock; REGISTER_TIMER_INFO("SequenceAgentLayerForward", getName().c_str()); - if (realOutArg_.value) { + if (realOutArg_.value || realOutArg_.ids) { CHECK(realOutArg_.sequenceStartPositions); - output_.subArgFrom(realOutArg_, /* offset */ idIndex_ * width, idSize_, + output_.subArgFrom(realOutArg_, /* offset */ idIndex_, idSize_, width, useGpu_, /* trans */ false, /* seqFlag */ true, /* seqStart */ seqStartPosIndex_, /* seqSize */ numSequences_); diff --git a/paddle/gserver/layers/CRFLayer.cpp b/paddle/gserver/layers/CRFLayer.cpp index df8a2b0314..c1dcad2b5f 100644 --- a/paddle/gserver/layers/CRFLayer.cpp +++ b/paddle/gserver/layers/CRFLayer.cpp @@ -31,7 +31,7 @@ bool CRFLayer::init(const LayerMap& layerMap, } // coeff only affect bp, keep consistent with CostLayer - coeff_ = config_.has_coeff() ? 
config_.coeff() : real(1.0); + coeff_ = config_.coeff(); if (inputLayers_.size() == 3) { weightLayer_ = inputLayers_[2]; } @@ -47,81 +47,40 @@ bool CRFLayer::init(const LayerMap& layerMap, // We don't need sequenceStartPositions because each sample of output_ is // for the cost of one sequence. setNeedSequenceInfo(false); - if (useGpu_) { - tmpCpuInput_.reserve(inputLayers_.size()); - for (size_t i = 0; i < inputLayers_.size(); i++) { - tmpCpuInput_.push_back(Argument()); - } - } + return true; } void CRFLayer::forward(PassType passType) { Layer::forward(passType); - if (useGpu_) { - for (size_t i = 0; i < inputLayers_.size(); i++) { - tmpCpuInput_[i].resizeAndCopyFrom(getInput(i), false, HPPL_STREAM_1); - } - VectorPtr cpuParameterValue; - VectorPtr cpuParameterGradient; - cpuParameterValue = - Vector::create(parameter_->getBuf(PARAMETER_VALUE)->getSize(), false); - cpuParameterValue-> - copyFrom(*parameter_->getBuf(PARAMETER_VALUE), HPPL_STREAM_1); - if (parameter_->getBuf(PARAMETER_GRADIENT)) { - cpuParameterGradient = - Vector::create(parameter_->getBuf(PARAMETER_GRADIENT)->getSize(), - false); - cpuParameterGradient-> - copyFrom(*parameter_->getBuf(PARAMETER_GRADIENT), HPPL_STREAM_1); - } else { - cpuParameterGradient = nullptr; - } - forwardImp(tmpCpuInput_[0], tmpCpuInput_[1], cpuParameterValue, - cpuParameterGradient); - parameter_->getBuf(PARAMETER_VALUE)->copyFrom(*cpuParameterValue, - HPPL_STREAM_1); - if (parameter_->getBuf(PARAMETER_GRADIENT)) { - parameter_->getBuf(PARAMETER_GRADIENT)->copyFrom(*cpuParameterGradient, - HPPL_STREAM_1); - } - } else { - forwardImp(getInput(0), getInput(1), parameter_->getBuf(PARAMETER_VALUE), - parameter_->getBuf(PARAMETER_GRADIENT)); - } -} -void CRFLayer::forwardImp(const Argument&output, - const Argument& label, - VectorPtr parameterValue, - VectorPtr parameterGradient) { + CHECK(!useGpu_) << "GPU is not supported"; + + const Argument& output = getInput(0); + const Argument& label = getInput(1); CHECK(label.sequenceStartPositions); CHECK(label.ids); int batchSize = output.getBatchSize(); size_t numSequences = label.sequenceStartPositions->getSize() - 1; resizeOutput(numSequences, 1); - std::vector out(numSequences); const int* starts = label.sequenceStartPositions->getData(false); CHECK_EQ(starts[numSequences], batchSize); - VectorPtr cpuParameterValue; - VectorPtr cpuParameterGradient; - for (size_t i = 0; i < numSequences; ++i) { if (i >= crfs_.size()) { crfs_.emplace_back(numClasses_, - parameterValue->getData(), - parameterGradient - ? parameterGradient->getData() + parameter_->getBuf(PARAMETER_VALUE)->getData(), + parameter_->getBuf(PARAMETER_GRADIENT) + ? parameter_->getBuf(PARAMETER_GRADIENT)->getData() : nullptr); } - out[i] = crfs_[i].forward( + output_.value->getData()[i] = crfs_[i].forward( output.value->getData() + numClasses_ * starts[i], label.ids->getData() + starts[i], starts[i + 1] - starts[i]); } - output_.value->copyFrom(out.data(), numSequences); + if (weightLayer_) { const MatrixPtr& weight = getInputValue(*weightLayer_); getOutputValue()->dotMul(*getOutputValue(), *weight); @@ -129,22 +88,8 @@ void CRFLayer::forwardImp(const Argument&output, } void CRFLayer::backward(const UpdateCallback &callback) { - (void)callback; - if (useGpu_) { - backwardImp(callback, tmpCpuInput_[0], tmpCpuInput_[1]); - const_cast(getInput(0)). - resizeAndCopyFrom(tmpCpuInput_[0], true, HPPL_STREAM_1); - const_cast(getInput(1)). 
- resizeAndCopyFrom(tmpCpuInput_[1], true, HPPL_STREAM_1); - - } else { - backwardImp(callback, getInput(0), getInput(1)); - } -} - -void CRFLayer::backwardImp(const UpdateCallback& callback, - const Argument&output, - const Argument& label) { + const Argument& output = getInput(0); + const Argument& label = getInput(1); const int* starts = label.sequenceStartPositions->getData(false); int numSequences = label.sequenceStartPositions->getSize() - 1; @@ -159,9 +104,11 @@ void CRFLayer::backwardImp(const UpdateCallback& callback, grad->mulScalar(weight); } } + if (coeff_ != real(1.0f)) { output.grad->mulScalar(coeff_); } + parameter_->incUpdate(callback); } diff --git a/paddle/gserver/layers/CRFLayer.h b/paddle/gserver/layers/CRFLayer.h index 5facb9b548..58902a0d3b 100644 --- a/paddle/gserver/layers/CRFLayer.h +++ b/paddle/gserver/layers/CRFLayer.h @@ -25,18 +25,14 @@ namespace paddle { /** * A layer for calculating the cost of sequential conditional random field * model. - * See LinearChainCRF.h for the detail of the CRF formulation. + * See class LinearChainCRF for the detail of the CRF formulation. */ class CRFLayer : public Layer { public: explicit CRFLayer(const LayerConfig& config) : Layer(config) {} virtual bool init(const LayerMap& layerMap, const ParameterMap& parameterMap); virtual void forward(PassType passType); - void forwardImp(const Argument&output, const Argument& label, - VectorPtr parameterValue, VectorPtr parameterGradient); virtual void backward(const UpdateCallback& callback); - void backwardImp(const UpdateCallback& callback, const Argument&output, - const Argument& label); protected: size_t numClasses_; @@ -44,7 +40,6 @@ protected: std::vector crfs_; LayerPtr weightLayer_; // weight for each sequence real coeff_; // weight for the layer - std::vector tmpCpuInput_; }; } // namespace paddle diff --git a/paddle/gserver/layers/CTCLayer.cpp b/paddle/gserver/layers/CTCLayer.cpp index db1450694e..6b9ffc5c74 100644 --- a/paddle/gserver/layers/CTCLayer.cpp +++ b/paddle/gserver/layers/CTCLayer.cpp @@ -49,8 +49,10 @@ void CTCLayer::forward(PassType passType) { Layer::forward(passType); if (useGpu_) { for (size_t i = 0; i < inputLayers_.size(); i++) { - tmpCpuInput_[i].resizeAndCopyFrom(getInput(i), false, HPPL_STREAM_1); + tmpCpuInput_[i].resizeAndCopyFrom( + getInput(i), false, HPPL_STREAM_DEFAULT); } + hl_stream_synchronize(HPPL_STREAM_DEFAULT); forwardImp(tmpCpuInput_[0], tmpCpuInput_[1]); } else { forwardImp(getInput(0), getInput(1)); @@ -92,9 +94,9 @@ void CTCLayer::backward(const UpdateCallback &callback) { if (useGpu_) { backwardImp(callback, tmpCpuInput_[0], tmpCpuInput_[1]); const_cast(getInput(0)). - resizeAndCopyFrom(tmpCpuInput_[0], true, HPPL_STREAM_1); + resizeAndCopyFrom(tmpCpuInput_[0], true, HPPL_STREAM_DEFAULT); const_cast(getInput(1)). 
- resizeAndCopyFrom(tmpCpuInput_[1], true, HPPL_STREAM_1); + resizeAndCopyFrom(tmpCpuInput_[1], true, HPPL_STREAM_DEFAULT); } else { backwardImp(callback, getInput(0), getInput(1)); } } diff --git a/paddle/gserver/layers/ConvOperator.cpp b/paddle/gserver/layers/ConvOperator.cpp index d08c422764..8c72c17784 100644 --- a/paddle/gserver/layers/ConvOperator.cpp +++ b/paddle/gserver/layers/ConvOperator.cpp @@ -248,7 +248,7 @@ void ConvOperator::forward() { CHECK_EQ(ins_[1]->value->getHeight(), batchSize); checkFilterSize(ins_[1]->value); Matrix::resizeOrCreate(out_->value, batchSize, - outputH_ * outputW_ * numFilters_); + outputH_ * outputW_ * numFilters_, false, useGpu_); { AsyncGpuBlock block; for (size_t batchId = 0; batchId < batchSize; ++batchId) { diff --git a/paddle/gserver/layers/ConvexCombinationLayer.cpp b/paddle/gserver/layers/ConvexCombinationLayer.cpp index e092b2e390..a81cf939af 100644 --- a/paddle/gserver/layers/ConvexCombinationLayer.cpp +++ b/paddle/gserver/layers/ConvexCombinationLayer.cpp @@ -21,18 +21,20 @@ limitations under the License. */ namespace paddle { /** - * @brief A layer for convex weighted average of vectors, + * @brief A layer for weighted sum of vectors, * which is used in NEURAL MACHINE TRANSLATION BY JOINTLY LEARNING TO ALIGN AND * TRANSLATE - * - Input: the first input contains the convex weights (batchSize x weightDim), - * and the shape of second input is (batchSize x (weightdim*dataDim)). - * - Output: the shape of output is (batchSize x dataDim). + * - Input: the size of the first input is weightDim, + * and the size of the second input is weightDim * dataDim. + * - Output: the size of the output is dataDim. * \f[ - * out[i][j] = \sum_{j}(in0(i, j) * in1(i,j + i * dataDim)), - * i = 0,1,...,(batchSize-1); j = 0, 1,...,(dataDim-1) + * out(j) = \sum_{i}(in0(i) * in1(i,j + i * dataDim)), + * i = 0,1,...,(weightDim-1); j = 0, 1,...,(dataDim-1) + * \f] + * Note that the above computation is for one sample. Multiple samples are + * processed in one batch. * - * The config file api is convex_comb_layer. + * The config file api is linear_comb_layer.
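+ * + * For example, with a hypothetical single sample, weightDim = 2 and dataDim = 3: + * @code + * in0 = [0.25, 0.75] // weights, weightDim elements + * in1 = [1, 2, 3, 5, 6, 7] // weightDim * dataDim elements + * out(j) = in0(0) * in1(j) + in0(1) * in1(j + dataDim) + * out = [4.0, 5.0, 6.0] + * @endcode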
*/ class ConvexCombinationLayer : public Layer { protected: diff --git a/paddle/gserver/layers/CosSimLayer.cpp b/paddle/gserver/layers/CosSimLayer.cpp index b10bd1d886..05a70aeff5 100644 --- a/paddle/gserver/layers/CosSimLayer.cpp +++ b/paddle/gserver/layers/CosSimLayer.cpp @@ -48,7 +48,7 @@ void CosSimLayer::forward(PassType passType) { REGISTER_TIMER_INFO("CosFwAtvTimer", getName().c_str()); MatrixPtr prevOut1 = getInputValue(0); MatrixPtr prevOut2 = getInputValue(1); - outV->cosSim(*prevOut1, *prevOut2, kCosSimScale_); + outV->cosSim(*prevOut1, *prevOut2, config_.cos_scale()); } } @@ -59,7 +59,7 @@ void CosSimLayer::backward(const UpdateCallback& callback) { outG->cosSimDerivative(*this->getOutputValue(), *getInputValue(0), *getInputValue(1), *getInputGrad(0), - *getInputGrad(1), kCosSimScale_); + *getInputGrad(1), config_.cos_scale()); } } diff --git a/paddle/gserver/layers/CosSimLayer.h b/paddle/gserver/layers/CosSimLayer.h index 9b0e53335b..65eb807ab2 100644 --- a/paddle/gserver/layers/CosSimLayer.h +++ b/paddle/gserver/layers/CosSimLayer.h @@ -36,7 +36,7 @@ namespace paddle { class CosSimLayer : public Layer { public: explicit CosSimLayer(const LayerConfig& config) - : Layer(config), kCosSimScale_(5.0f) {} + : Layer(config) {} ~CosSimLayer() {} @@ -44,8 +44,6 @@ public: void forward(PassType passType); void backward(const UpdateCallback& callback = nullptr); - - const real kCosSimScale_; }; } // namespace paddle diff --git a/paddle/gserver/layers/CostLayer.cpp b/paddle/gserver/layers/CostLayer.cpp index f353afabb3..14ff8510f7 100644 --- a/paddle/gserver/layers/CostLayer.cpp +++ b/paddle/gserver/layers/CostLayer.cpp @@ -26,11 +26,7 @@ namespace paddle { bool CostLayer::init(const LayerMap& layerMap, const ParameterMap& parameterMap) { bool ret = Layer::init(layerMap, parameterMap); - if (config_.has_coeff()) { - coeff_ = config_.coeff(); // coeff only affact bp - } else { - coeff_ = real(1.0); - } + coeff_ = config_.coeff(); if (!ret) return ret; CHECK_GE(inputLayers_.size(), 2UL); CHECK_LE(inputLayers_.size(), 3UL); @@ -509,8 +505,10 @@ void HuberTwoClass::forwardImp(Matrix &output, Argument &label, Matrix &cost) { if (useGpu_) { for (size_t i = 0; i < inputLayers_.size(); i++) { - tmpCpuInput_[i].resizeAndCopyFrom(getInput(i), false, HPPL_STREAM_1); + tmpCpuInput_[i].resizeAndCopyFrom( + getInput(i), false, HPPL_STREAM_DEFAULT); } + hl_stream_synchronize(HPPL_STREAM_DEFAULT); } forwardImpIn(output, label, cost); } diff --git a/paddle/gserver/layers/CudnnBatchNormLayer.cpp b/paddle/gserver/layers/CudnnBatchNormLayer.cpp index e1762e8d36..3c6d13b0bf 100644 --- a/paddle/gserver/layers/CudnnBatchNormLayer.cpp +++ b/paddle/gserver/layers/CudnnBatchNormLayer.cpp @@ -114,27 +114,12 @@ void CudnnBatchNormLayer::backward(const UpdateCallback& callback) { } else { create(tmpBiasGrad_, 1, channels_, &betaGrad); } -#if CUDNN_VERSION < 5000 - // because of the different api of cudnn v4 and v5. - if (weight_->getWGrad()) { - create(tmpWGrad_, 1, channels_, &gammaGrad); - } - if (biases_ && biases_->getWGrad()) { - create(tmpBiasGrad_, 1, channels_, &betaGrad); - } -#endif + hl_batch_norm_backward(ioDesc_, input, ioDesc_, outGrad, ioDesc_, inGrad, bnParamDesc_, gamma, gammaGrad, betaGrad, EPS, savedMean, savedInvVar); -#if CUDNN_VERSION < 5000 - // because of the different api of cudnn v4 and v5. 
- if (weight_->getWGrad() && biases_->getWGrad()) { - weight_->getWGrad()->add(*tmpWGrad_); - biases_->getWGrad()->add(*tmpBiasGrad_); - } -#endif { REGISTER_TIMER_INFO("WeightUpdate", getName().c_str()); biases_->getParameterPtr()->incUpdate(callback); diff --git a/paddle/gserver/layers/CudnnConvLayer.cpp b/paddle/gserver/layers/CudnnConvLayer.cpp index a74e6ba38d..0f932f960f 100644 --- a/paddle/gserver/layers/CudnnConvLayer.cpp +++ b/paddle/gserver/layers/CudnnConvLayer.cpp @@ -85,6 +85,7 @@ bool CudnnConvLayer::init(const LayerMap &layerMap, biasOffset_ = numFilters_ / groups_[0]; } + batchNum_ = 0; isSelectAlgo_ = false; return true; } @@ -132,6 +133,11 @@ void CudnnConvLayer::reshape(int batchSize) { getOutput().setFrameHeight(outputH_); getOutput().setFrameWidth(outputW_); + // if the batchSize remains the same, set isSelectAlgo_ true. + // Otherwise, set isSelectAlgo_ false and select algo again. + isSelectAlgo_ = (batchSize == batchNum_); + batchNum_ = batchSize; + size_t maxWorkSpace = 0; for (size_t i = 0; i < inputLayers_.size(); i++) { CHECK_EQ(inputLayers_[i]->getOutput().value->getWidth(), @@ -160,6 +166,10 @@ void CudnnConvLayer::reshape(int batchSize) { maxWorkSpace = std::max(fwdLimitBytes_[i], bwdDataLimitBytes_[i]); maxWorkSpace = std::max(maxWorkSpace, bwdFilterLimitBytes_[i]); + + VLOG(3) << getName() << " Fwd / BwdData / BwdFilter algo: " << fwdAlgo_[i] + << " / " << bwdDataAlgo_[i] + << " / " << bwdFilterAlgo_[i]; } } diff --git a/paddle/gserver/layers/CudnnConvLayer.h b/paddle/gserver/layers/CudnnConvLayer.h index 2c72ba885e..a6dadba10d 100644 --- a/paddle/gserver/layers/CudnnConvLayer.h +++ b/paddle/gserver/layers/CudnnConvLayer.h @@ -87,6 +87,10 @@ protected: /// Whether or not to select the conv algorithm. bool isSelectAlgo_; + /// batchNum is used to record batch size. If the batch size is changed, + /// the selection algorithm will be called. + int batchNum_; + public: explicit CudnnConvLayer(const LayerConfig& config) : ConvBaseLayer(config) {} diff --git a/paddle/gserver/layers/HierarchicalSigmoidLayer.cpp b/paddle/gserver/layers/HierarchicalSigmoidLayer.cpp index fc9832af86..7091c6aa22 100644 --- a/paddle/gserver/layers/HierarchicalSigmoidLayer.cpp +++ b/paddle/gserver/layers/HierarchicalSigmoidLayer.cpp @@ -14,8 +14,7 @@ limitations under the License. */ #include "HierarchicalSigmoidLayer.h" - -#include "paddle/math/Bits.h" +#include "paddle/utils/Util.h" namespace paddle { diff --git a/paddle/gserver/layers/LinearChainCRF.h b/paddle/gserver/layers/LinearChainCRF.h index 3bde1aa415..c33c83b259 100644 --- a/paddle/gserver/layers/LinearChainCRF.h +++ b/paddle/gserver/layers/LinearChainCRF.h @@ -21,39 +21,39 @@ namespace paddle { class LinearChainCRF { public: - /* - The size of para and grad must be (numClasses + 2) * numClasses. - The first numClasses values of para are for starting weights (a). - The next numClasses values of para are for ending weights (b), - The remaning values are for transition weights (w). - - The probability of a state sequence s of length L is defined as: - P(s) = (1/Z) exp(a_{s_1} + b_{s_L} - + \sum_{l=1}^L x_{s_l} - + \sum_{l=2}^L w_{s_{l-1},s_l}) - where Z is a normalization value so that the sum of P(s) over all possible - sequences is 1, and x is the input feature to the CRF. + /** + * The size of para and grad must be \f$(numClasses + 2) * numClasses\f$. + * The first numClasses values of para are for starting weights (\f$a\f$).
+ * The next numClasses values of para are for ending weights (\f$b\f$), + * The remaining values are for transition weights (\f$w\f$). + * + * The probability of a state sequence s of length \f$L\f$ is defined as: + * \f$P(s) = (1/Z) exp(a_{s_1} + b_{s_L} + * + \sum_{l=1}^L x_{s_l} + * + \sum_{l=2}^L w_{s_{l-1},s_l})\f$ + * where \f$Z\f$ is a normalization value so that the sum of \f$P(s)\f$ over all possible + * sequences is \f$1\f$, and \f$x\f$ is the input feature to the CRF. */ LinearChainCRF(int numClasses, real* para, real* grad); - /* - Calculate the negative log likelihood of s given x. - The size of x must be length * numClasses. Each consecutive numClasses - values are the features for one time step. + /** + * Calculate the negative log likelihood of s given x. + * The size of x must be length * numClasses. Each consecutive numClasses + * values are the features for one time step. */ real forward(real* x, int* s, int length); - /* - Calculate the gradient with respect to x, a, b, and w. - The gradient of x will be stored in dx. - backward() can only be called after a corresponding call to forward() with - the same x, s and length. - NOTE: The gradient is added to dx and grad (provided at constructor). + /** + * Calculate the gradient with respect to x, a, b, and w. + * The gradient of x will be stored in dx. + * backward() can only be called after a corresponding call to forward() with + * the same x, s and length. + * @note The gradient is added to dx and grad (provided in the constructor). */ void backward(real* x, real* dx, int* s, int length); - /* - Find the most probable sequence given x. The result will be stored in s. + /** + * Find the most probable sequence given x. The result will be stored in s. */ void decode(real* x, int* s, int length); diff --git a/paddle/gserver/layers/MultinomialSampler.cpp b/paddle/gserver/layers/MultinomialSampler.cpp index 710772c0cf..518dc0c60c 100644 --- a/paddle/gserver/layers/MultinomialSampler.cpp +++ b/paddle/gserver/layers/MultinomialSampler.cpp @@ -19,7 +19,7 @@ namespace paddle { MultinomialSampler::MultinomialSampler(const real* prob, int size) : rand_(0.0, size) { - intervals_.reserve(size + 1); + intervals_.resize(size + 1); double sum = 0; for (int i = 0; i < size; ++i) { sum += prob[i]; @@ -50,12 +50,13 @@ MultinomialSampler::MultinomialSampler(const real* prob, int size) int bigPos = nextBigPos(0); auto fillIntervals = [&]() { - while (bigPos < size && smallPos < size) { + while (bigPos < size) { while (intervals_[bigPos].thresh > 1 && smallPos < size) { intervals_[smallPos].otherId = bigPos; intervals_[bigPos].thresh -= 1 - intervals_[smallPos].thresh; smallPos = nextSmallPos(smallPos + 1); } + if (smallPos >= size) break; bigPos = nextBigPos(bigPos + 1); // If intervals_[bigPos].thresh < 1, it becomes a small interval } diff --git a/paddle/gserver/layers/NormProjectionLayer.cpp b/paddle/gserver/layers/NormProjectionLayer.cpp index f30a3e8df0..eab6e904ee 100644 --- a/paddle/gserver/layers/NormProjectionLayer.cpp +++ b/paddle/gserver/layers/NormProjectionLayer.cpp @@ -46,9 +46,6 @@ bool CMRProjectionNormLayer::init(const LayerMap& layerMap, /* the size of inputs for norm-layer is 1 */ CHECK_EQ(config_.inputs_size(), 1); - auto& inputConfig = config_.inputs(0); - blocked_ = inputConfig.norm_conf().blocked(); - return true; } @@ -69,7 +66,7 @@ void CMRProjectionNormLayer::forward(PassType passType) { denoms_->zeroMem(); outV->crossMapNormalFwd(*input, imgSizeH_, imgSizeW_, *denoms_, channels_, - size_, scale_, pow_, blocked_); +
size_, scale_, pow_); } void CMRProjectionNormLayer::backward(const UpdateCallback& callback) { @@ -86,6 +83,6 @@ void CMRProjectionNormLayer::backward(const UpdateCallback& callback) { preOutGrad->crossMapNormalBwd(*localGrad, *denoms_, *preOutV, *localOutV, channels_, imgSizeH_, imgSizeW_, size_, scale_, - pow_, blocked_); + pow_); } } // namespace paddle diff --git a/paddle/gserver/layers/NormProjectionLayer.h b/paddle/gserver/layers/NormProjectionLayer.h index a5e8dce029..728806ea76 100644 --- a/paddle/gserver/layers/NormProjectionLayer.h +++ b/paddle/gserver/layers/NormProjectionLayer.h @@ -23,15 +23,12 @@ namespace paddle { /** * @brief response normalization across feature maps - * namely normalize in number of size_ channels + * namely normalize in number of size_ channels */ class CMRProjectionNormLayer : public ResponseNormLayer { size_t imgSizeH_, imgSizeW_; size_t outputH_, outputW_; -protected: - bool blocked_; - public: explicit CMRProjectionNormLayer(const LayerConfig& config) : ResponseNormLayer(config) {} diff --git a/paddle/gserver/layers/PrintLayer.cpp b/paddle/gserver/layers/PrintLayer.cpp new file mode 100644 index 0000000000..68fee69f44 --- /dev/null +++ b/paddle/gserver/layers/PrintLayer.cpp @@ -0,0 +1,58 @@ +/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "Layer.h" + +namespace paddle { + +class PrintLayer : public Layer { +public: + explicit PrintLayer(const LayerConfig& config) + : Layer(config) {} + void forward(PassType passType); + void backward(const UpdateCallback& callback) {} +}; + +void PrintLayer::forward(PassType passType) { + Layer::forward(passType); + for (size_t i = 0; i != inputLayers_.size(); ++i) { + const auto& argu = getInput(i); + const std::string& name = inputLayers_[i]->getName(); + if (argu.value) { + std::ostringstream os; + argu.value->print(os); + LOG(INFO) << "layer=" << name << " value matrix:\n" << os.str(); + } + if (argu.ids) { + std::ostringstream os; + argu.ids->print(os, argu.ids->getSize()); + LOG(INFO) << "layer=" << name << " ids vector:\n" << os.str(); + } + if (auto startPos = argu.sequenceStartPositions) { + std::ostringstream os; + startPos->getVector(false)->print(os, startPos->getSize()); + LOG(INFO) << "layer=" << name << " sequence pos vector:\n" << os.str(); + } + if (auto subStartPos = argu.subSequenceStartPositions) { + std::ostringstream os; + subStartPos->getVector(false)->print(os, subStartPos->getSize()); + LOG(INFO) << "layer=" << name << " sub-sequence pos vector:\n" + << os.str(); + } + } +} + +REGISTER_LAYER(print, PrintLayer); + +} // namespace paddle diff --git a/paddle/gserver/layers/SamplingIdLayer.cpp b/paddle/gserver/layers/SamplingIdLayer.cpp index 41c1461967..b39c9948b5 100644 --- a/paddle/gserver/layers/SamplingIdLayer.cpp +++ b/paddle/gserver/layers/SamplingIdLayer.cpp @@ -52,8 +52,10 @@ public: Layer::forward(passType); if (useGpu_) { for (size_t i = 0; i < inputLayers_.size(); i++) { - tmpCpuInput_[i].resizeAndCopyFrom(getInput(i), false, HPPL_STREAM_1); + tmpCpuInput_[i].resizeAndCopyFrom( + getInput(i), false, HPPL_STREAM_DEFAULT); } + hl_stream_synchronize(HPPL_STREAM_DEFAULT); forwardImp(tmpCpuInput_[0]); } else { forwardImp(getInput(0)); diff --git a/paddle/gserver/tests/CMakeLists.txt b/paddle/gserver/tests/CMakeLists.txt index 129f10fac1..ff2abf7697 100644 --- a/paddle/gserver/tests/CMakeLists.txt +++ b/paddle/gserver/tests/CMakeLists.txt @@ -56,7 +56,6 @@ add_test(NAME test_RecurrentGradientMachine COMMAND .set_python_path.sh -d ${PROJ_ROOT}/python:${PROJ_ROOT}/paddle/gserver/tests ${CMAKE_CURRENT_BINARY_DIR}/test_RecurrentGradientMachine - --use_gpu=false WORKING_DIRECTORY ${PROJ_ROOT}/paddle) add_unittest_without_exec(test_NetworkCompare diff --git a/paddle/gserver/tests/LayerGradUtil.cpp b/paddle/gserver/tests/LayerGradUtil.cpp index f72011ae16..552a6c5b41 100644 --- a/paddle/gserver/tests/LayerGradUtil.cpp +++ b/paddle/gserver/tests/LayerGradUtil.cpp @@ -92,7 +92,6 @@ void testState(LayerPtr testLayer, vector& dataLayers, testLayer->forward(PASS_TEST); Argument out; out.resizeAndCopyFrom(testLayer->getOutput(), /* useGpu= */ false); - hl_stream_synchronize(HPPL_STREAM_DEFAULT); if (batchOut.value) { size_t dim = batchOut.value->getWidth(); ASSERT_TRUE((bool)out.value); @@ -220,7 +219,6 @@ void testBatchState(LayerPtr testLayer, vector& dataLayers, testLayer->forward(PASS_TEST); Argument out; out.resizeAndCopyFrom(testLayer->getOutput(), /* useGpu= */ false); - hl_stream_synchronize(HPPL_STREAM_DEFAULT); if (batchOut.value) { size_t dim = batchOut.value->getWidth(); ASSERT_TRUE((bool)out.value); diff --git a/paddle/gserver/tests/Sequence/dummy.list b/paddle/gserver/tests/Sequence/dummy.list new file mode 100644 index 0000000000..0e52665e11 --- /dev/null +++ b/paddle/gserver/tests/Sequence/dummy.list @@ -0,0 +1 @@ +dummy_file_no_use diff --git 
a/paddle/gserver/tests/concat_table_a.conf b/paddle/gserver/tests/concat_table_a.conf index 2e3c518883..a8ff70f883 100644 --- a/paddle/gserver/tests/concat_table_a.conf +++ b/paddle/gserver/tests/concat_table_a.conf @@ -16,9 +16,9 @@ from paddle.trainer_config_helpers import * -settings(batch_size=1000) +settings(batch_size=300) -data = data_layer(name ="input", size=100000) +data = data_layer(name ="input", size=10000) # emb1 is equal to emb2, note that bias_attr=false # and act=LinearActivation() in default. diff --git a/paddle/gserver/tests/concat_table_b.conf b/paddle/gserver/tests/concat_table_b.conf index 6da24a5fbc..95d7c10f7b 100644 --- a/paddle/gserver/tests/concat_table_b.conf +++ b/paddle/gserver/tests/concat_table_b.conf @@ -16,9 +16,9 @@ from paddle.trainer_config_helpers import * -settings(batch_size=1000) +settings(batch_size=300) -data = data_layer(name ="input", size=100000) +data = data_layer(name ="input", size=10000) proj1 = table_projection(input=data, size=128) diff --git a/paddle/gserver/tests/rnn_data_provider.py b/paddle/gserver/tests/rnn_data_provider.py new file mode 100644 index 0000000000..347d5891b9 --- /dev/null +++ b/paddle/gserver/tests/rnn_data_provider.py @@ -0,0 +1,39 @@ +# Copyright (c) 2016 Baidu, Inc. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from paddle.trainer.PyDataProvider2 import * + +data = [ + [[[1, 3, 2], [4, 5, 2]], 0], + [[[0, 2], [2, 5], [0, 1, 2]], 1], +] + + +@provider(input_types=[integer_value_sub_sequence(10), + integer_value(2)], + should_shuffle=False) +def process_subseq(settings, file_name): + for d in data: + yield d + + +@provider(input_types=[integer_value_sequence(10), + integer_value(2)], + should_shuffle=False) +def process_seq(settings, file_name): + for d in data: + seq = [] + for subseq in d[0]: + seq += subseq + yield seq, d[1] diff --git a/paddle/gserver/tests/sequenceGen.py b/paddle/gserver/tests/sequenceGen.py index dd2b90dd49..cbed1f15fc 100644 --- a/paddle/gserver/tests/sequenceGen.py +++ b/paddle/gserver/tests/sequenceGen.py @@ -1,6 +1,3 @@ -#!/usr/bin/env python -#coding=utf-8 - # Copyright (c) 2016 Baidu, Inc. 
All Rights Reserved # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -18,36 +15,48 @@ import os import sys -from paddle.trainer.PyDataProviderWrapper import * +from paddle.trainer.PyDataProvider2 import * + + +def hook(settings, dict_file, **kwargs): + settings.word_dict = dict_file + settings.input_types = [integer_value_sequence(len(settings.word_dict)), + integer_value_sequence(3)] + settings.logger.info('dict len : %d' % (len(settings.word_dict))) -@init_hook_wrapper -def hook(obj, dict_file, **kwargs): - obj.word_dict = dict_file - obj.slots = [IndexSlot(len(obj.word_dict)), IndexSlot(3)] - obj.logger.info('dict len : %d' % (len(obj.word_dict))) -@provider(use_seq=True, init_hook=hook) -def process(obj, file_name): +@provider(init_hook=hook, should_shuffle=False) +def process(settings, file_name): with open(file_name, 'r') as fdata: for line in fdata: label, comment = line.strip().split('\t') label = int(''.join(label.split())) words = comment.split() - word_slot = [obj.word_dict[w] for w in words if w in obj.word_dict] + word_slot = [settings.word_dict[w] for w in words if + w in settings.word_dict] yield word_slot, [label] + ## for hierarchical sequence network -@provider(use_seq=True, init_hook=hook) -def process2(obj, file_name): +def hook2(settings, dict_file, **kwargs): + settings.word_dict = dict_file + settings.input_types = [integer_value_sub_sequence(len(settings.word_dict)), + integer_value_sub_sequence(3)] + settings.logger.info('dict len : %d' % (len(settings.word_dict))) + + +@provider(init_hook=hook2, should_shuffle=False) +def process2(settings, file_name): with open(file_name) as fdata: label_list = [] word_slot_list = [] for line in fdata: if (len(line)) > 1: - label,comment = line.strip().split('\t') + label, comment = line.strip().split('\t') label = int(''.join(label.split())) words = comment.split() - word_slot = [obj.word_dict[w] for w in words if w in obj.word_dict] + word_slot = [settings.word_dict[w] for w in words if + w in settings.word_dict] label_list.append([label]) word_slot_list.append(word_slot) else: diff --git a/paddle/gserver/tests/sequence_layer_group.conf b/paddle/gserver/tests/sequence_layer_group.conf index 9ad2b37628..ac031b3128 100644 --- a/paddle/gserver/tests/sequence_layer_group.conf +++ b/paddle/gserver/tests/sequence_layer_group.conf @@ -21,11 +21,11 @@ dict_file = dict() for line_count, line in enumerate(open(dict_path, "r")): dict_file[line.strip()] = line_count -define_py_data_sources(train_list='gserver/tests/Sequence/train.list', - test_list=None, - module='sequenceGen', - obj='process', - args={"dict_file":dict_file}) +define_py_data_sources2(train_list='gserver/tests/Sequence/train.list', + test_list=None, + module='sequenceGen', + obj='process', + args={"dict_file":dict_file}) settings(batch_size=5) ######################## network configure ################################ diff --git a/paddle/gserver/tests/sequence_nest_layer_group.conf b/paddle/gserver/tests/sequence_nest_layer_group.conf index 8c3a08f16c..38c60b657b 100644 --- a/paddle/gserver/tests/sequence_nest_layer_group.conf +++ b/paddle/gserver/tests/sequence_nest_layer_group.conf @@ -21,11 +21,11 @@ dict_file = dict() for line_count, line in enumerate(open(dict_path, "r")): dict_file[line.strip()] = line_count -define_py_data_sources(train_list='gserver/tests/Sequence/train.list.nest', - test_list=None, - module='sequenceGen', - obj='process2', - args={"dict_file":dict_file}) 
+define_py_data_sources2(train_list='gserver/tests/Sequence/train.list.nest', + test_list=None, + module='sequenceGen', + obj='process2', + args={"dict_file":dict_file}) settings(batch_size=2) ######################## network configure ################################ diff --git a/paddle/gserver/tests/sequence_nest_rnn.conf b/paddle/gserver/tests/sequence_nest_rnn.conf new file mode 100644 index 0000000000..62b8c5d072 --- /dev/null +++ b/paddle/gserver/tests/sequence_nest_rnn.conf @@ -0,0 +1,76 @@ +#edit-mode: -*- python -*- +# Copyright (c) 2016 Baidu, Inc. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from paddle.trainer_config_helpers import * + +######################## data source ################################ +define_py_data_sources2(train_list='gserver/tests/Sequence/dummy.list', + test_list=None, + module='rnn_data_provider', + obj='process_subseq') + + +settings(batch_size=2, learning_rate=0.01) +######################## network configure ################################ +dict_dim = 10 +word_dim = 8 +hidden_dim = 8 +label_dim = 3 + +data = data_layer(name="word", size=dict_dim) + +emb = embedding_layer(input=data, size=word_dim) + +# This hierarchical RNN is designed to be equivalent to the simple RNN in +# sequence_rnn.conf + +def outer_step(x): + outer_mem = memory(name="outer_rnn_state", size=hidden_dim) + def inner_step(y): + inner_mem = memory(name="inner_rnn_state", + size=hidden_dim, + boot_layer=outer_mem) + out = fc_layer(input=[y, inner_mem], + size=hidden_dim, + act=TanhActivation(), + bias_attr=True, + name="inner_rnn_state") + return out + + inner_rnn_output = recurrent_group( + step=inner_step, + name="inner", + input=x) + last = last_seq(input=inner_rnn_output, name="outer_rnn_state") + + # "return last" should also work. But currently RecurrentGradientMachine + # does not handle it correctly. Current implementation requires that + # all the out links are from sequences. However, it does not report error + # when the out links are not sequences. + return inner_rnn_output + +out = recurrent_group( + name="outer", + step=outer_step, + input=SubsequenceInput(emb)) + +rep = last_seq(input=out) +prob = fc_layer(size=label_dim, + input=rep, + act=SoftmaxActivation(), + bias_attr=True) + +outputs(classification_cost(input=prob, + label=data_layer(name="label", size=label_dim))) diff --git a/paddle/gserver/tests/sequence_nest_rnn_multi_input.conf b/paddle/gserver/tests/sequence_nest_rnn_multi_input.conf new file mode 100644 index 0000000000..e01b3f8e7a --- /dev/null +++ b/paddle/gserver/tests/sequence_nest_rnn_multi_input.conf @@ -0,0 +1,77 @@ +#edit-mode: -*- python -*- +# Copyright (c) 2016 Baidu, Inc. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from paddle.trainer_config_helpers import * + +######################## data source ################################ +define_py_data_sources2(train_list='gserver/tests/Sequence/dummy.list', + test_list=None, + module='rnn_data_provider', + obj='process_subseq') + + +settings(batch_size=2, learning_rate=0.01) +######################## network configure ################################ +dict_dim = 10 +word_dim = 8 +hidden_dim = 8 +label_dim = 3 + +data = data_layer(name="word", size=dict_dim) + +emb = embedding_layer(input=data, size=word_dim) + +# This hierarchical RNN is designed to be equivalent to the simple RNN in +# sequence_rnn_multi_input.conf + +def outer_step(wid, x): + outer_mem = memory(name="outer_rnn_state", size=hidden_dim) + def inner_step(y, wid): + z = embedding_layer(input=wid, size=word_dim) + inner_mem = memory(name="inner_rnn_state", + size=hidden_dim, + boot_layer=outer_mem) + out = fc_layer(input=[y, z, inner_mem], + size=hidden_dim, + act=TanhActivation(), + bias_attr=True, + name="inner_rnn_state") + return out + + inner_rnn_output = recurrent_group( + step=inner_step, + name="inner", + input=[x, wid]) + last = last_seq(input=inner_rnn_output, name="outer_rnn_state") + + # "return last" should also work. But currently RecurrentGradientMachine + # does not handle it correctly. Current implementation requires that + # all the out links are from sequences. However, it does not report error + # when the out links are not sequences. + return inner_rnn_output + +out = recurrent_group( + name="outer", + step=outer_step, + input=[SubsequenceInput(data), SubsequenceInput(emb)]) + +rep = last_seq(input=out) +prob = fc_layer(size=label_dim, + input=rep, + act=SoftmaxActivation(), + bias_attr=True) + +outputs(classification_cost(input=prob, + label=data_layer(name="label", size=label_dim))) diff --git a/paddle/gserver/tests/sequence_rnn.conf b/paddle/gserver/tests/sequence_rnn.conf new file mode 100644 index 0000000000..3294c2c3fc --- /dev/null +++ b/paddle/gserver/tests/sequence_rnn.conf @@ -0,0 +1,57 @@ +#edit-mode: -*- python -*- +# Copyright (c) 2016 Baidu, Inc. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
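+ +# A note on the pairing (see test_RecurrentGradientMachine.cpp): this flat conf +# is compared against sequence_nest_rnn.conf, and the equivalence rests on how +# rnn_data_provider.py prepares two views of the same sample. For the first +# entry of its `data` list: +# +# process_subseq yields [[1, 3, 2], [4, 5, 2]], 0 # sub-sequences kept +# process_seq yields [1, 3, 2, 4, 5, 2], 0 # sub-sequences joined +# +# so the single-level recurrent_group below walks the joined token stream that +# the nested conf walks as an outer group over sub-sequences.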
+ +from paddle.trainer_config_helpers import * + +######################## data source ################################ +define_py_data_sources2(train_list='gserver/tests/Sequence/dummy.list', + test_list=None, + module='rnn_data_provider', + obj='process_seq') + + +settings(batch_size=2, learning_rate=0.01) +######################## network configure ################################ +dict_dim = 10 +word_dim = 8 +hidden_dim = 8 +label_dim = 3 + +data = data_layer(name="word", size=dict_dim) + +emb = embedding_layer(input=data, size=word_dim) + +def step(y): + mem = memory(name="rnn_state", size=hidden_dim) + out = fc_layer(input=[y, mem], + size=hidden_dim, + act=TanhActivation(), + bias_attr=True, + name="rnn_state") + return out + +out = recurrent_group( + name="rnn", + step=step, + input=emb) + +rep = last_seq(input=out) +prob = fc_layer(size=label_dim, + input=rep, + act=SoftmaxActivation(), + bias_attr=True) + +outputs(classification_cost(input=prob, + label=data_layer(name="label", size=label_dim))) diff --git a/paddle/gserver/tests/sequence_rnn_multi_input.conf b/paddle/gserver/tests/sequence_rnn_multi_input.conf new file mode 100644 index 0000000000..968621cab5 --- /dev/null +++ b/paddle/gserver/tests/sequence_rnn_multi_input.conf @@ -0,0 +1,58 @@ +#edit-mode: -*- python -*- +# Copyright (c) 2016 Baidu, Inc. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
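+ +# A sketch of the wiring exercised below (names as defined in this conf): the +# recurrent_group receives the word both pre-embedded (emb) and as raw ids +# (data), and the step embeds the ids a second time inside the group: +# +# def step(y, wid): # y = emb slice, wid = raw word id +# z = embedding_layer(wid) # embedded again inside the group +# out = fc_layer([y, z, mem]) # mem carries rnn_state across steps +# return out +# +# sequence_nest_rnn_multi_input.conf is the nested counterpart of this conf.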
+ +from paddle.trainer_config_helpers import * + +######################## data source ################################ +define_py_data_sources2(train_list='gserver/tests/Sequence/dummy.list', + test_list=None, + module='rnn_data_provider', + obj='process_seq') + + +settings(batch_size=2, learning_rate=0.01) +######################## network configure ################################ +dict_dim = 10 +word_dim = 8 +hidden_dim = 8 +label_dim = 3 + +data = data_layer(name="word", size=dict_dim) + +emb = embedding_layer(input=data, size=word_dim) + +def step(y, wid): + z = embedding_layer(input=wid, size=word_dim) + mem = memory(name="rnn_state", size=hidden_dim) + out = fc_layer(input=[y, z, mem], + size=hidden_dim, + act=TanhActivation(), + bias_attr=True, + name="rnn_state") + return out + +out = recurrent_group( + name="rnn", + step=step, + input=[emb, data]) + +rep = last_seq(input=out) +prob = fc_layer(size=label_dim, + input=rep, + act=SoftmaxActivation(), + bias_attr=True) + +outputs(classification_cost(input=prob, + label=data_layer(name="label", size=label_dim))) diff --git a/paddle/gserver/tests/test_Evaluator.cpp b/paddle/gserver/tests/test_Evaluator.cpp index 8e85778146..3a591a316b 100644 --- a/paddle/gserver/tests/test_Evaluator.cpp +++ b/paddle/gserver/tests/test_Evaluator.cpp @@ -87,18 +87,31 @@ void testEvaluator(TestConfig testConf, string testEvaluatorName, return; } + ICpuGpuVectorPtr sequenceStartPositions; + if (testConf.inputDefs[i].inputType == INPUT_SEQUENCE_DATA || + testConf.inputDefs[i].inputType == INPUT_SEQUENCE_LABEL) { + if (!sequenceStartPositions) { + generateSequenceStartPositions(batchSize, sequenceStartPositions); + } + data.sequenceStartPositions = sequenceStartPositions; + } + arguments.push_back(data); } Evaluator* testEvaluator = Evaluator::create(testConf.evaluatorConfig); double totalScore = 0.0; + testEvaluator->start(); totalScore += testEvaluator->evalImp(arguments); testEvaluator->updateSamplesNum(arguments); + testEvaluator->finish(); LOG(INFO) << *testEvaluator; double totalScore2 = 0.0; if (testConf.testAccumulate) { + testEvaluator->start(); totalScore2 += testEvaluator->evalImp(arguments); + testEvaluator->finish(); EXPECT_LE(fabs(totalScore - totalScore2), 1.0e-5); } } @@ -202,6 +215,15 @@ TEST(Evaluator, precision_recall) { false); } +TEST(Evaluator, ctc_error_evaluator) { + TestConfig config; + config.evaluatorConfig.set_type("ctc_edit_distance"); + + config.inputDefs.push_back({INPUT_SEQUENCE_DATA, "output", 32}); + config.inputDefs.push_back({INPUT_SEQUENCE_LABEL, "label", 1}); + testEvaluatorAll(config, "ctc_error_evaluator", 100); +} + int main(int argc, char** argv) { initMain(argc, argv); FLAGS_thread_local_rand_use_global_seed = true; diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp index 7bb79ff5b7..3150c31e49 100644 --- a/paddle/gserver/tests/test_LayerGrad.cpp +++ b/paddle/gserver/tests/test_LayerGrad.cpp @@ -50,7 +50,7 @@ TEST(Operator, dot_mul) { TEST(Projection, context) { for (auto contextStart : {-5, -3, -1, 0, 3}) { for (auto contextLength : {1, 2, 5, 7}) { - for (auto batchSize : {1, 2, 5, 20, 100}) { + for (auto batchSize : {1, 2, 5, 20, 50}) { for (auto trainablePadding : {false, true}) { LOG(INFO) << " contextStart=" << contextStart << " contextLength=" << contextLength @@ -179,10 +179,9 @@ TEST(Layer, CRFLayer) { config.layerConfig.add_inputs(); config.layerConfig.add_inputs(); - for (auto useGpu : {false, true}) { - testLayerGrad(config, "crf", 100, /* trans */ false, /* 
useGpu */ useGpu, - false /*useWeight*/, 0.03 /*epsilon*/); - } + // GPU is not supported yet + testLayerGrad(config, "crf", 100, /* trans */ false, /* useGpu */ false, + false /*useWeight*/, 0.03 /*epsilon*/); } TEST(Layer, CTCLayer) { diff --git a/paddle/gserver/tests/test_MultinomialSampler.cpp b/paddle/gserver/tests/test_MultinomialSampler.cpp index 39a9095833..73b4d0b8b7 100644 --- a/paddle/gserver/tests/test_MultinomialSampler.cpp +++ b/paddle/gserver/tests/test_MultinomialSampler.cpp @@ -41,39 +41,42 @@ public: TEST(MultinomialSampler, gen) { int numGrids = 1024 * 1024; int size = 1024 * 4; - default_random_engine reng; - uniform_int_distribution<int> rand(1, numGrids / size * 1.8); - vector<real> prob; - int sum = 0; - for (int i = 0; i < size; ++i) { - prob.push_back(rand(reng)); - sum += prob.back(); - } - CHECK_LE(sum, numGrids); - prob.back() += numGrids - sum; - vector<int> counts(size); - MultinomialSamplerTester sampler(&prob[0], size); - counts.assign(size, 0); - { - double s = (double)size / (double)numGrids; - REGISTER_TIMER("MultinomialSampler"); - for (double i = 0; i < numGrids; ++i) { - int ret = sampler.testGen([i, s]() { return s * i; }); - if (ret < 0 || ret >= size) { - EXPECT_GE(ret, 0); - EXPECT_LT(ret, size); - break; + for (size_t iter=0; iter < 256; ++iter) { + uniform_int_distribution<int> rand(1, numGrids / size * 1.8); + vector<real> prob; + int sum = 0; + for (int i = 0; i < size; ++i) { + prob.push_back(rand(reng)); + sum += prob.back(); + } + + CHECK_LE(sum, numGrids); + prob.back() += numGrids - sum; + + vector<int> counts(size); + MultinomialSamplerTester sampler(&prob[0], size); + counts.assign(size, 0); + { + double s = (double)size / (double)numGrids; + REGISTER_TIMER("MultinomialSampler"); + for (double i = 0; i < numGrids; ++i) { + int ret = sampler.testGen([i, s]() { return s * i; }); + if (ret < 0 || ret >= size) { + EXPECT_GE(ret, 0); + EXPECT_LT(ret, size); + break; + } + ++counts[ret]; } - ++counts[ret]; } - } - for (int i = 0; i < size; ++i) { - if (prob[i] != counts[i]) { - EXPECT_EQ(prob[i], counts[i]); - LOG(INFO) << "i=" << i; - break; + for (int i = 0; i < size; ++i) { + if (prob[i] != counts[i]) { + EXPECT_EQ(prob[i], counts[i]); + LOG(INFO) << iter; + break; + } } } } @@ -135,6 +138,7 @@ void benchmarkRandom() { LOG(INFO) << "sum1=" << sum1; } + int main(int argc, char** argv) { initMain(argc, argv); testing::InitGoogleTest(&argc, argv); diff --git a/paddle/gserver/tests/test_PyDataProvider2.cpp b/paddle/gserver/tests/test_PyDataProvider2.cpp index 824295eb6e..e75e53ab7f 100644 --- a/paddle/gserver/tests/test_PyDataProvider2.cpp +++ b/paddle/gserver/tests/test_PyDataProvider2.cpp @@ -20,6 +20,18 @@ limitations under the License.
*/ #include "paddle/gserver/dataproviders/DataProvider.h" P_DEFINE_string(train_list, "unittest.list", "file list for unittest"); + +namespace paddle { +namespace unittest { +namespace pydp2 { +extern void setOnPoolFilledHook(const std::function& func); +extern void clearOnPoolFilledHook(); + +} // namespace pydp2 +} // namespace unittest +} // namespace paddle + + const paddle::real epsilon = 1e-5; static inline int64_t readDataBatch( @@ -235,6 +247,112 @@ TEST(PyDataProvider2, index_sub_seq) { } } +TEST(PyDataProvider2, min_pool_size) { + paddle::DataConfig config; + config.set_type("py2"); + config.set_files(FLAGS_train_list.c_str()); + config.set_load_data_module("test_PyDataProvider2"); + config.set_load_data_object("test_min_pool_size"); + config.set_load_data_args(""); + size_t totalData = 1 << 14; + constexpr size_t batchSize = 100; + constexpr size_t minPoolSize = 1000; + paddle::DataBatch batch; + std::unique_ptr provider( + paddle::DataProvider::create(config, false)); + provider->reset(); + + paddle::unittest::pydp2::setOnPoolFilledHook([&](size_t poolSize) { + if (totalData > batchSize) { + CHECK_GE(poolSize, std::min(totalData-batchSize, minPoolSize)); + } + }); + while (true) { + size_t realBatchSize = provider->getNextBatchInternal(batchSize, &batch); + if (realBatchSize) { + totalData -= realBatchSize; + } else { + break; + } + } + paddle::unittest::pydp2::clearOnPoolFilledHook(); +} + +TEST(PyDataProvider2, can_over_batch_size) { + paddle::DataConfig config; + config.set_type("py2"); + config.set_files(FLAGS_train_list.c_str()); + config.set_load_data_module("test_PyDataProvider2"); + config.set_load_data_object("test_can_over_batch_size"); + config.set_load_data_args(""); + paddle::DataBatch batch; + std::unique_ptr provider( + paddle::DataProvider::create(config, false)); + provider->reset(); + constexpr size_t batchSize = 100; + while (true) { + size_t realBatchSize = provider->getNextBatchInternal(batchSize, &batch); + if (realBatchSize) { + CHECK_LE(realBatchSize, batchSize); + } else { + break; + } + } +} + +TEST(PyDataProvider2, input_order) { + paddle::DataConfig config; + config.set_type("py2"); + config.set_files(FLAGS_train_list.c_str()); + config.set_load_data_module("test_PyDataProvider2"); + config.set_load_data_object("test_input_order"); + config.set_load_data_args(""); + + paddle::ModelConfig modelConfig; + *modelConfig.add_input_layer_names() = "input1"; + *modelConfig.add_input_layer_names() = "input2"; + paddle::DataBatch batch; + std::unique_ptr provider( + paddle::DataProvider::create(config, modelConfig, false)); + provider->reset(); + constexpr size_t batchSize = 100; + while (true) { + size_t realBatchSize = provider->getNextBatchInternal(batchSize, &batch); + if (!realBatchSize) { + break; + } + ASSERT_EQ(batch.getStreams().size(), (size_t)2); + for (size_t i = 0; i < realBatchSize; ++i) { + ASSERT_EQ(batch.getStream(0).ids->getData()[i], 0); + ASSERT_EQ(batch.getStream(1).ids->getData()[i], 1); + } + } +} + +TEST(PyDataProvider2, test_check) { + paddle::DataConfig config; + config.set_type("py2"); + config.set_files(FLAGS_train_list.c_str()); + config.set_load_data_module("test_PyDataProvider2"); + config.set_load_data_object("test_check"); + config.set_load_data_args(""); + paddle::DataBatch batch; + std::unique_ptr provider( + paddle::DataProvider::create(config, false)); + provider->reset(); + while (true) { + size_t realBatchSize = provider->getNextBatchInternal(100, &batch); + if (!realBatchSize) { + break; + } else { + auto& ivec = 
batch.getStream(0).ids; + for (size_t i=0; i < ivec->getSize(); ++i) { + CHECK_LT(ivec->getData()[i], 10); + } + } + } +} + int main(int argc, char** argv) { testing::InitGoogleTest(&argc, argv); paddle::initMain(argc, argv); diff --git a/paddle/gserver/tests/test_PyDataProvider2.py b/paddle/gserver/tests/test_PyDataProvider2.py index a88c48cb4e..145fe85cff 100644 --- a/paddle/gserver/tests/test_PyDataProvider2.py +++ b/paddle/gserver/tests/test_PyDataProvider2.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +import random + from paddle.trainer.PyDataProvider2 import * @@ -39,7 +41,8 @@ def test_init_hook(setting, filename): @provider( - input_types=[sparse_binary_vector(30000, seq_type=SequenceType.NO_SEQUENCE)]) + input_types=[ + sparse_binary_vector(30000, seq_type=SequenceType.NO_SEQUENCE)]) def test_sparse_non_value_no_seq(setting, filename): for i in xrange(200): yield [(i + 1) * (j + 1) for j in xrange(10)] @@ -66,3 +69,43 @@ def test_index_sub_seq(setting, filename): for i in xrange(200): yield list(gen_sub_seq(i)) + + +@provider(input_types=[index_slot(100)], min_pool_size=1000) +def test_min_pool_size(setting, filename): + for _ in xrange(1 << 14): + yield random.randint(0, 100 - 1) + + +@provider(input_types=[index_slot(100, seq_type=SequenceType.SEQUENCE)], + can_over_batch_size=False, + calc_batch_size=lambda x: len(x[0])) +def test_can_over_batch_size(setting, filename): + for _ in xrange(1 << 10): + seq_len = random.randint(0, 99) + yield [random.randint(0, 100 - 1) for _ in xrange(seq_len)] + + +@provider(input_types=[index_slot(10), index_slot(10)]) +def test_input_order(setting, filename): + for _ in xrange(1000): + yield { + 'input1': 0, + 'input2': 1 + } + + +@provider(input_types=[index_slot(10)], + check=True, + check_fail_continue=True, + should_shuffle="123") # also test should shuffle +def test_check(settings, filename): + yield_good_value = False + + while not yield_good_value: + for _ in xrange(10000): + i = random.randint(0, 100) + if i < 10: + yield_good_value = True + yield i + diff --git a/paddle/gserver/tests/test_RecurrentGradientMachine.cpp b/paddle/gserver/tests/test_RecurrentGradientMachine.cpp index 35d6ee7f4a..550df0a318 100644 --- a/paddle/gserver/tests/test_RecurrentGradientMachine.cpp +++ b/paddle/gserver/tests/test_RecurrentGradientMachine.cpp @@ -21,6 +21,8 @@ limitations under the License. */ #include #include +P_DECLARE_int32(seed); + using namespace paddle; // NOLINT using namespace std; // NOLINT class TrainerForTest : public paddle::Trainer { @@ -68,7 +70,9 @@ void CalCost(const string& conf, const string& dir, real* cost, CpuVector vecMomentum(dim); // vecW needs to be assigned, otherwise the variable is an uncertain value. 
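+ // Seed the thread-local RNG from FLAGS_seed and use small random normal + // values rather than zeros, so every CalCost() run starts from the same + // non-zero weights and the two configs stay comparable.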
- vecW.zeroMem(); + + *ThreadLocalRand::getSeed() = FLAGS_seed; + vecW.randnorm(0, 0.1); trainer.startTrain(); for (int i = 0; i < num_passes; ++i) { @@ -88,27 +92,54 @@ void CalCost(const string& conf, const string& dir, real* cost, rmDir(dir.c_str()); } -TEST(RecurrentGradientMachine, HasSubSequence) { +void test(const string& conf1, const string& conf2, double eps, bool useGpu) { + if (!paddle::version::isWithGpu() && useGpu) { + return; + } + FLAGS_use_gpu = useGpu; int num_passes = 5; real* cost1 = new real[num_passes]; - const string conf1 = "gserver/tests/sequence_layer_group.conf"; const string dir1 = "gserver/tests/t1"; CalCost(conf1, dir1, cost1, num_passes); real* cost2 = new real[num_passes]; - const string conf2 = "gserver/tests/sequence_nest_layer_group.conf"; const string dir2 = "gserver/tests/t2"; CalCost(conf2, dir2, cost2, num_passes); for (int i = 0; i < num_passes; i++) { LOG(INFO) << "num_passes: " << i << ", cost1=" << cost1[i] - << ", cost2=" << cost2[i]; - ASSERT_NEAR(cost1[i], cost2[i], 1e-3); + << ", cost2=" << cost2[i] + << ", diff=" << std::abs(cost1[i] - cost2[i]); + ASSERT_NEAR(cost1[i], cost2[i], eps); } delete[] cost1; delete[] cost2; } +TEST(RecurrentGradientMachine, HasSubSequence) { + for (bool useGpu : {false, true}) { + test("gserver/tests/sequence_layer_group.conf", + "gserver/tests/sequence_nest_layer_group.conf", + 1e-5, useGpu); + } +} + +TEST(RecurrentGradientMachine, rnn) { + for (bool useGpu : {false, true}) { + test("gserver/tests/sequence_rnn.conf", + "gserver/tests/sequence_nest_rnn.conf", + 1e-6, useGpu); + } +} + +TEST(RecurrentGradientMachine, rnn_multi_input) { + for (bool useGpu : {false, true}) { + test("gserver/tests/sequence_rnn_multi_input.conf", + "gserver/tests/sequence_nest_rnn_multi_input.conf", + 1e-6, useGpu); + } +} + int main(int argc, char** argv) { if (paddle::version::isWithPyDataProvider()) { if (!paddle::version::isWithGpu()) { diff --git a/paddle/gserver/tests/test_RecurrentLayer.cpp b/paddle/gserver/tests/test_RecurrentLayer.cpp index 2cea190b85..9b933b153d 100644 --- a/paddle/gserver/tests/test_RecurrentLayer.cpp +++ b/paddle/gserver/tests/test_RecurrentLayer.cpp @@ -299,7 +299,6 @@ void checkRecurrentLayer(LayerConfig layerConfig, size_t batchSize, Argument& cpuInput = testCpu.dataLayer_->getOutput(); Argument& gpuInput = testGpu.dataLayer_->getOutput(); gpuInput.resizeAndCopyFrom(cpuInput, true); - hl_stream_synchronize(HPPL_STREAM_DEFAULT); const VectorPtr& cpuVec = testCpu.para_->getBuf(PARAMETER_VALUE); const VectorPtr& gpuVec = testGpu.para_->getBuf(PARAMETER_VALUE); diff --git a/paddle/math/Allocator.h b/paddle/math/Allocator.h index 36166236e9..f7aa60380f 100644 --- a/paddle/math/Allocator.h +++ b/paddle/math/Allocator.h @@ -16,7 +16,7 @@ limitations under the License. */ #pragma once #include -#include <malloc.h> +#include <stdlib.h> #include "hl_gpu.h" #include "paddle/utils/Logging.h" @@ -48,9 +48,10 @@ public: * @return Pointer to the allocated memory */ virtual void* alloc(size_t size) { - void* ptr = memalign(32ul, size); - CHECK(ptr) << "Fail to allocate CPU memory: size=" << size; - return ptr; + void* ptr; + CHECK_EQ(posix_memalign(&ptr, 32ul, size), 0); + CHECK(ptr) << "Fail to allocate CPU memory: size=" << size; + return ptr; } /** diff --git a/paddle/math/Bits.h b/paddle/math/Bits.h deleted file mode 100644 index 4114149f6c..0000000000 --- a/paddle/math/Bits.h +++ /dev/null @@ -1,53 +0,0 @@ -/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - - -#pragma once - -#include - -namespace paddle { - -/** - * From Facebook folly: - * https://github.com/facebook/folly/blob/master/folly/Bits.h - * - * findLastSet: return the 1-based index of the highest bit set - * - * for x > 0: - * \f[ - * findLastSet(x) = 1 + \floor*{\log_{2}x} - * \f] - */ -template -inline constexpr typename std::enable_if<(std::is_integral::value && - std::is_unsigned::value && - sizeof(T) <= sizeof(unsigned int)), - unsigned int>::type -findLastSet(T x) { - return x ? 8 * sizeof(unsigned int) - __builtin_clz(x) : 0; -} - -template -inline constexpr - typename std::enable_if<(std::is_integral::value && - std::is_unsigned::value && - sizeof(T) > sizeof(unsigned int) && - sizeof(T) <= sizeof(unsigned long)), // NOLINT - unsigned int>::type - findLastSet(T x) { - return x ? 8 * sizeof(unsigned long) - __builtin_clzl(x) : 0; // NOLINT -} - -} // namespace paddle diff --git a/paddle/math/MathFunctions.h b/paddle/math/MathFunctions.h index fe486c741d..43075977dc 100644 --- a/paddle/math/MathFunctions.h +++ b/paddle/math/MathFunctions.h @@ -23,6 +23,8 @@ extern "C" { } #endif +#include + namespace paddle { template diff --git a/paddle/math/Matrix.cpp b/paddle/math/Matrix.cpp index 7e18400768..e351bede72 100644 --- a/paddle/math/Matrix.cpp +++ b/paddle/math/Matrix.cpp @@ -146,6 +146,7 @@ void Matrix::resizeOrCreate(MatrixPtr& matrix, size_t height, size_t width, if (!matrix) { matrix = Matrix::create(height, width, trans, useGpu); } else { + CHECK_EQ(matrix->useGpu(), useGpu); matrix->resize(height, width); } } @@ -161,6 +162,7 @@ void Matrix::resizeOrCreateSparseMatrix(MatrixPtr& matrix, size_t height, } else { CHECK(dynamic_cast(matrix.get()) || dynamic_cast(matrix.get())); + CHECK_EQ(matrix->useGpu(), useGpu); matrix->resize(height, width, nnz, valueType, format); } } @@ -943,7 +945,7 @@ void GpuMatrix::avgPoolBackward(Matrix& outGrad, size_t imgSizeH, void GpuMatrix::crossMapNormalFwd(Matrix& input, size_t imgSizeH, size_t imgSizeW, Matrix& denoms, size_t channels, size_t sizeX, float scale, - float pow, bool blocked) { + float pow) { size_t num = input.getHeight(); size_t height = imgSizeH; size_t width = imgSizeW; @@ -960,7 +962,7 @@ void GpuMatrix::crossMapNormalBwd(Matrix& localGrad, Matrix& denoms, Matrix& preOutV, Matrix& localOutV, size_t channels, size_t imgSizeH, size_t imgSizeW, size_t sizeX, float scale, - float pow, bool blocked) { + float pow) { size_t num = preOutV.getHeight(); size_t height = imgSizeH; size_t width = imgSizeW; @@ -1602,7 +1604,7 @@ void CpuMatrix::avgPoolBackward(Matrix& input, size_t imgSizeH, size_t imgSizeW, void CpuMatrix::crossMapNormalFwd(Matrix& input, size_t imgSizeH, size_t imgSizeW, Matrix& denoms, size_t channels, size_t sizeX, float scale, - float pow, bool blocked) { + float pow) { size_t num = input.getHeight(); size_t height = imgSizeH; size_t width = imgSizeW; @@ -1655,7 +1657,7 @@ void CpuMatrix::crossMapNormalBwd(Matrix& localGrad, Matrix& denoms, Matrix& preOutV, Matrix& localOutV, size_t 
channels, size_t imgSizeH, size_t imgSizeW, size_t size, float scale, - float pow, bool blocked) { + float pow) { LOG(FATAL) << "Not implemented"; CHECK(imgSizeH * imgSizeW * channels == preOutV.getWidth()); @@ -2512,7 +2514,8 @@ void SharedCpuMatrix::mul(CpuSparseMatrix* a, CpuMatrix* b, real scaleAB, for (int k = 0; k < blockNum_; ++k) { blockSeq.push_back(k); } - std::random_shuffle(blockSeq.begin(), blockSeq.end()); + std::shuffle(blockSeq.begin(), blockSeq.end(), + ThreadLocalRandomEngine::get()); } std::vector& localBufRows = *localBufRows_; int* cols = a->getCols(); diff --git a/paddle/math/Matrix.h b/paddle/math/Matrix.h index f27773d110..cfb30797fc 100644 --- a/paddle/math/Matrix.h +++ b/paddle/math/Matrix.h @@ -585,7 +585,7 @@ public: * \f[ * a[i] = \sum_{j=-(N-1)/2}^{(N-1)/2} b_{i+j} * c_{j} * \f] - * + * * b contains M elements, * c contains N elements (N is odd), * b's index arithmetic is computed modulo M, @@ -774,7 +774,7 @@ public: virtual void crossMapNormalFwd(Matrix& input, size_t imgSizeH, size_t imgSizeW, Matrix& denoms, size_t channels, size_t sizeX, float scale, - float pow, bool blocked) { + float pow) { LOG(FATAL) << "Not implemeted"; } @@ -782,7 +782,7 @@ public: Matrix& preOutV, Matrix& localOutV, size_t channels, size_t imgSizeH, size_t imgSizeW, size_t size, float scale, - float pow, bool blocked) { + float pow) { LOG(FATAL) << "Not implemeted"; } @@ -883,7 +883,7 @@ public: * @code * this[i] = -sum(label[i][j]*log(output[i][j]) * + (1-label[i][j])*log(1-output[i][j])) - * @endcode + * @endcode */ virtual void multiBinaryLabelCrossEntropy(Matrix& output, Matrix& label) { LOG(FATAL) << "Not implemented"; @@ -895,7 +895,7 @@ public: * @code * this[i][j] = -label[i][j]/output[i][j] * + (1-label[i][j])/(1-output[i][j]) - * @endcode + * @endcode */ virtual void multiBinaryLabelCrossEntropyBp(Matrix& output, Matrix& label) { LOG(FATAL) << "Not implemented"; @@ -903,12 +903,12 @@ public: /** * @brief Calculate the classification error for multi binary labels - * + * * @code * this[i] = sum((output[i][j] >= threshold && label[i][j] == 0) * || (output[i][j] < threshold && label[i][j] == 1)) * / output->getWidth() - * @endcode + * @endcode */ virtual void classificationErrorMulti(Matrix& output, Matrix& label, real threshold) { @@ -1149,12 +1149,12 @@ public: void crossMapNormalFwd(Matrix& input, size_t imgSizeH, size_t imgSizeW, Matrix& denoms, size_t channels, size_t sizeX, - float scale, float pow, bool blocked); + float scale, float pow); void crossMapNormalBwd(Matrix& localGrad, Matrix& denoms, Matrix& preOutV, Matrix& localOutV, size_t channels, size_t imgSizeH, - size_t imgSizeW, size_t sizeX, float scale, float pow, - bool blocked); + size_t imgSizeW, size_t sizeX, + float scale, float pow); void maxSequenceForward(Matrix& input, const IVector& sequence, IVector& index); @@ -1260,12 +1260,12 @@ public: void crossMapNormalFwd(Matrix& input, size_t imgSizeH, size_t imgSizeW, Matrix& denoms, size_t channels, size_t sizeX, - float scale, float pow, bool blocked); + float scale, float pow); void crossMapNormalBwd(Matrix& localGrad, Matrix& denoms, Matrix& preOutV, Matrix& localOutV, size_t channels, size_t imgSizeH, - size_t imgSizeW, size_t sizeX, float scale, float pow, - bool blocked); + size_t imgSizeW, size_t sizeX, + float scale, float pow); void maxSequenceForward(Matrix& input, const IVector& sequence, IVector& index); @@ -1307,14 +1307,14 @@ public: * @code * table.row[ids[i]] += this.row[i] * @endcode - */ + */ virtual void addToRows(Matrix& table, IVector& 
/** * @code * this[i] = table[i, id[i]] * @endcode - */ + */ virtual void selectElements(Matrix& table, IVector& ids); /** diff --git a/paddle/math/MatrixBitCode.cpp b/paddle/math/MatrixBitCode.cpp index d179ac1f53..8497c26e35 100644 --- a/paddle/math/MatrixBitCode.cpp +++ b/paddle/math/MatrixBitCode.cpp @@ -14,7 +14,7 @@ limitations under the License. */ #include "paddle/utils/Logging.h" -#include "Bits.h" +#include "paddle/utils/Util.h" #include "Matrix.h" #include "hl_gpu.h" diff --git a/paddle/math/PoolAllocator.h b/paddle/math/PoolAllocator.h index 22af0eb893..aca8ffb0ab 100644 --- a/paddle/math/PoolAllocator.h +++ b/paddle/math/PoolAllocator.h @@ -19,6 +19,7 @@ limitations under the License. */ #include #include #include +#include #include "Allocator.h" namespace paddle { diff --git a/paddle/math/Storage.cpp b/paddle/math/Storage.cpp index 9a879a964e..0403c3521c 100644 --- a/paddle/math/Storage.cpp +++ b/paddle/math/Storage.cpp @@ -25,8 +25,8 @@ namespace paddle { // Initialization StorageEngine singleton. // Other modules may rely on storage management, // so StorageEngine needs to be initialized before other modules. -static InitFunction __init_storage_engine( - StorageEngine::singleton, std::numeric_limits<int>::max()); +static InitFunction __init_storage_engine([](){StorageEngine::singleton();}, + std::numeric_limits<int>::max()); StorageEngine::StorageEngine() : cpuAllocator_(nullptr) { } diff --git a/paddle/math/Vector.cpp b/paddle/math/Vector.cpp index b1a459b86a..7553ea25e0 100644 --- a/paddle/math/Vector.cpp +++ b/paddle/math/Vector.cpp @@ -800,6 +800,7 @@ void CpuGpuVectorT<T>::resizeOrCreate(size_t size, bool useGpu) { } else if ((!useGpu) && (!cpuVectorT_)) { cpuVectorT_ = VectorT<T>::create(size, false); } else { + CHECK((useGpu && gpuVectorT_) || (!useGpu && cpuVectorT_)); this->resize(size, useGpu); } } diff --git a/paddle/math/tests/test_SIMDFunctions.cpp b/paddle/math/tests/test_SIMDFunctions.cpp index 631d0516cf..491b0cda7b 100644 --- a/paddle/math/tests/test_SIMDFunctions.cpp +++ b/paddle/math/tests/test_SIMDFunctions.cpp @@ -24,7 +24,7 @@ limitations under the License.
*/ #include #include -#include +#include #include static constexpr size_t VECTOR_LEN = 3072; @@ -37,7 +37,9 @@ static std::mt19937 RandomEngine(time(0)); inline static std::unique_ptr<float[]> NewVector(size_t len = VECTOR_LEN, size_t align = ALIGN) { - return std::unique_ptr<float[]>((float*)memalign(align, len * sizeof(float))); + float* ptr; + CHECK_EQ(posix_memalign((void**)&ptr, align, len * sizeof(float)), 0); + return std::unique_ptr<float[]>(ptr); } inline static std::unique_ptr<float[]> NewRandomVector(size_t len = VECTOR_LEN, diff --git a/paddle/math/tests/test_matrixCompare.cpp b/paddle/math/tests/test_matrixCompare.cpp index 7caade444b..fe8eacc2ef 100644 --- a/paddle/math/tests/test_matrixCompare.cpp +++ b/paddle/math/tests/test_matrixCompare.cpp @@ -1697,7 +1697,6 @@ TEST(Matrix, cosSimDerivate) { } } - void testParamReluForward(int height, int width, int w_height, int w_width) { MatrixPtr output = CpuMatrix::create(height, width, false, false); @@ -1736,7 +1735,6 @@ TEST(Matrix, paramReluForward) { } } - void testParamReluBackwardW(int height, int width, int w_height, int w_width) { MatrixPtr oGrad = CpuMatrix::create(height, width, false, false); @@ -1775,7 +1773,6 @@ TEST(Matrix, paramReluBackwardW) { } } - void testParamReluBackwardDiff(int height, int width, int w_height, int w_width) { MatrixPtr oGrad = CpuMatrix::create(height, width, false, false); @@ -1819,6 +1816,36 @@ TEST(Matrix, paramReluBackwardDiff) { } } +void testClassificationError(int numSamples, int dim) { + MatrixPtr cpuError = std::make_shared<CpuMatrix>(numSamples, 1); + MatrixPtr gpuError = std::make_shared<GpuMatrix>(numSamples, 1); + MatrixPtr cpuOutput = std::make_shared<CpuMatrix>(numSamples, dim); + MatrixPtr gpuOutput = std::make_shared<GpuMatrix>(numSamples, dim); + IVectorPtr cpuLabel = std::make_shared<CpuIVector>(numSamples); + IVectorPtr gpuLabel = std::make_shared<GpuIVector>(numSamples); + + cpuOutput->randomizeUniform(); + cpuLabel->rand(dim); + gpuOutput->copyFrom(*cpuOutput); + gpuLabel->copyFrom(*cpuLabel); + + cpuError->classificationError(cpuOutput, cpuLabel); + gpuError->classificationError(gpuOutput, gpuLabel); + + MatrixPtr check = std::make_shared<CpuMatrix>(numSamples, 1); + check->copyFrom(*gpuError); + MatrixCheckEqual(*cpuError, *check); +} + +TEST(Matrix, classificationError) { + for (auto numSamples : {1, 10, 100, 1000, 70000}) { + for (auto dim : {1, 10, 100, 1000}) { + VLOG(3) << " numSamples=" << numSamples << " dim=" << dim; + testClassificationError(numSamples, dim); + } + } +} + int main(int argc, char** argv) { testing::InitGoogleTest(&argc, argv); initMain(argc, argv); diff --git a/paddle/math/tests/test_matrixUtil.h b/paddle/math/tests/test_matrixUtil.h index 1310e50987..fa682164aa 100644 --- a/paddle/math/tests/test_matrixUtil.h +++ b/paddle/math/tests/test_matrixUtil.h @@ -124,8 +124,8 @@ void checkSMatrixErr(const CpuSparseMatrixPtr& a, if (a->getValueType() == FLOAT_VALUE) { real aVal = a->getValue()[r]; real bVal = b->getValue()[r]; - if (fabs(aVal - bVal) > err) { - if ((fabsf(aVal - bVal) / fabsf(aVal)) > (err / 10.0f)) { + if (std::abs(aVal - bVal) > err) { + if ((std::abs(aVal - bVal) / std::abs(aVal)) > (err / 10.0f)) { LOG(INFO) << "a=" << aVal << "\t" << "b=" << bVal; count++; } @@ -141,8 +141,8 @@ void checkSMatrixErr(const CpuSparseMatrixPtr& a, if (a->getValueType() == FLOAT_VALUE) { real aVal = a->getValue()[r]; real bVal = b->getValue()[r]; - if (fabs(aVal - bVal) > err) { - if ((fabsf(aVal - bVal) / fabsf(aVal)) > (err / 10.0f)) { + if (std::abs(aVal - bVal) > err) { + if ((std::abs(aVal - bVal) / std::abs(aVal)) > (err / 10.0f)) { count++; } } @@ -173,8 +173,8
@@ void checkMatrixErr(const Matrix& matrix1, const Matrix& matrix2) { for (int j = 0; j < width; j++) { real a = data1[i * width + j]; real b = data2[i * width + j]; - if (fabs(a - b) > err) { - if ((fabsf(a - b) / fabsf(a)) > (err / 10.0f)) { + if (std::abs(a - b) > err) { + if ((std::abs(a - b) / std::abs(a)) > (err / 10.0f)) { count++; } } diff --git a/paddle/parameter/Argument.cpp b/paddle/parameter/Argument.cpp index 93f86ceccf..42c74661d2 100644 --- a/paddle/parameter/Argument.cpp +++ b/paddle/parameter/Argument.cpp @@ -25,6 +25,7 @@ static void resizeAndCopy(MatrixPtr& dest, const MatrixPtr& src, bool useGpu, if (!dest) { dest = src->clone(0, 0, useGpu); } else { + CHECK_EQ(dest->useGpu(), useGpu); dest->resize(src->getHeight(), src->getWidth()); } dest->copyFrom(*src, stream); @@ -60,12 +61,12 @@ static void resizeAndCopy(MatrixPtr& dest, const MatrixPtr& src, hl_stream_t stream = HPPL_STREAM_DEFAULT) { if (src) { CHECK_LE((size_t)startRow + copySize, src->getHeight()); - int height = copySize; int width = src->getWidth(); if (!dest) { dest = src->clone(height, width, useGpu); } else { + CHECK_EQ(dest->useGpu(), useGpu); dest->resize(height, width); } MatrixPtr submat = src->subMatrix(startRow, copySize); @@ -182,6 +183,11 @@ static void resizeAndCopy(SVectorPtr& dest, const SVectorPtr& src, } } +void Argument::resizeAndCopyFrom(const Argument& src, bool useGpu) { + resizeAndCopyFrom(src, useGpu, HPPL_STREAM_DEFAULT); + hl_stream_synchronize(HPPL_STREAM_DEFAULT); +} + void Argument::resizeAndCopyFrom(const Argument& src, bool useGpu, hl_stream_t stream) { dataId = src.dataId; @@ -199,6 +205,14 @@ void Argument::resizeAndCopyFrom(const Argument& src, bool useGpu, resizeAndCopy(strs, src.strs, useGpu, stream); } +int32_t Argument::resizeAndCopyFrom(const Argument& src, int32_t startSeq, + int32_t copySize, bool useGpu) { + int32_t size = resizeAndCopyFrom(src, startSeq, copySize, useGpu, + HPPL_STREAM_DEFAULT); + hl_stream_synchronize(HPPL_STREAM_DEFAULT); + return size; +} + int32_t Argument::resizeAndCopyFrom(const Argument& src, int32_t startSeq, int32_t copySize, bool useGpu, hl_stream_t stream) { @@ -269,6 +283,9 @@ void Argument::concat(const std::vector& args, const std::vector& selectRows, const std::vector& seqStartPos, bool useGpu, hl_stream_t stream, PassType passType) { + CHECK(!subSequenceStartPositions) + << "undefined behavior for subsequence positions"; + size_t batchSize = selectRows.size(); auto copyArg = [batchSize, stream](MatrixPtr& dst, MatrixPtr src, int startRow, int pos, int size, @@ -347,9 +364,11 @@ void Argument::concat(const std::vector& args, bool useGpu, hl_stream_t stream, PassType passType) { int32_t batchSize = 0; int64_t numSequences = 0; + int64_t numSubSequences = 0; for (auto& arg : args) { batchSize += arg.getBatchSize(); numSequences += arg.getNumSequences(); + numSubSequences += arg.getNumSubSequences(); } auto copyArg = [batchSize, stream](MatrixPtr& dst, MatrixPtr src, @@ -393,8 +412,26 @@ void Argument::concat(const std::vector& args, bool useGpu, std::copy(src->begin(), src->end(), dst->begin() + startRow); }; + auto copySequencePos = [] + (ICpuGpuVectorPtr& dstSeq, const ICpuGpuVectorPtr& srcSeq, + int dstNumSequences, int srcNumSequences, + int& startSequences, int startRow) { + if (srcSeq) { + ICpuGpuVector::resizeOrCreate(dstSeq, dstNumSequences + 1, false); + const int* src = srcSeq->getData(false); + int* dest = dstSeq->getMutableData(false); + for (int i = 0; i < srcNumSequences + 1; ++i) { + dest[i + startSequences] = src[i] + 
startRow; + } + startSequences += srcNumSequences; + } else { + dstSeq.reset(); + } + }; + int startRow = 0; int startSequences = 0; + int startSubSequences = 0; dataId = args[0].dataId; for (auto& arg : args) { CHECK_EQ(arg.dataId, dataId) << "Arguments in concat should have" @@ -403,17 +440,18 @@ void Argument::concat(const std::vector& args, bool useGpu, copyArg(value, arg.value, startRow, useGpu); if (passType != PASS_TEST) copyArg(grad, arg.grad, startRow, useGpu); copyIds(ids, arg.ids, startRow, useGpu); - if (arg.sequenceStartPositions) { - ICpuGpuVector::resizeOrCreate(sequenceStartPositions, - numSequences + 1, - false); - const int* src = arg.sequenceStartPositions->getData(false); - int* dest = sequenceStartPositions->getMutableData(false); - for (int i = 0; i < arg.getNumSequences() + 1; ++i) { - dest[i + startSequences] = src[i] + startRow; - } - startSequences += arg.getNumSequences(); - } + copySequencePos(sequenceStartPositions, + arg.sequenceStartPositions, + numSequences, + arg.getNumSequences(), + startSequences, + startRow); + copySequencePos(subSequenceStartPositions, + arg.subSequenceStartPositions, + numSubSequences, + arg.getNumSubSequences(), + startSubSequences, + startRow); copyStrs(strs, arg.strs, startRow, useGpu); startRow += arg.getBatchSize(); } @@ -439,51 +477,34 @@ void Argument::splitByDataId(const std::vector& argus, } } -void Argument::getSeqLengthAndStart( - std::vector>* seqLengthAndStart, - int* maxSequenceLength) const { +void Argument::getSeqInfo(std::vector* seqInfo) const { const int* starts = sequenceStartPositions->getData(false); - if (hasSubseq()) { - size_t numSubSequences = getNumSubSequences(); - (*seqLengthAndStart).reserve(numSubSequences); - const int* subStarts = subSequenceStartPositions->getData(false); - int seqIndex = 0; - int subSeqIndex = 0; - *maxSequenceLength = 0; - for (size_t i = 0; i < numSubSequences; ++i) { - if (subStarts[i] == starts[seqIndex]) { - subSeqIndex = 0; - (*seqLengthAndStart) - .push_back(std::make_tuple( - subStarts[i + 1] - subStarts[i], (int)subStarts[i], - (int)seqIndex, (int)subSeqIndex)); - ++subSeqIndex; - ++seqIndex; - } else if (subStarts[i] < starts[seqIndex]) { - (*seqLengthAndStart) - .push_back(std::make_tuple( - subStarts[i + 1] - subStarts[i], (int)subStarts[i], - (int)seqIndex - 1, (int)subSeqIndex)); - ++subSeqIndex; + const int* subStarts = hasSubseq() + ? subSequenceStartPositions->getData(false) : nullptr; + size_t numSequences = getNumSequences(); + seqInfo->reserve(numSequences); + int subSeqEnd = 0; + for (size_t i = 0; i < numSequences; ++i) { + SeqInfo info; + info.seqStart = starts[i]; + info.subLevelLength = starts[i + 1] - starts[i]; + info.seqId = i; + if (hasSubseq()) { + info.subSeqStart = subSeqEnd; + while (subStarts[subSeqEnd] < starts[i + 1]) { + ++subSeqEnd; } - // maxSequenceLength_ = 1 + max(subSeqIndex) in each Seq. 
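
To make the intent of this rewrite concrete, here is a self-contained sketch of the scan the new getSeqInfo performs: one pass over the sequence start positions, counting how many subsequence boundaries fall inside each top-level sequence, then sorting by topLevelLength. SeqInfoSketch and getSeqInfoSketch are hypothetical names for illustration only, assuming the usual convention that both position arrays end with the total length:

#include <algorithm>
#include <cstdio>
#include <vector>

// Hypothetical mirror of Argument::SeqInfo, for illustration only.
struct SeqInfoSketch {
  int topLevelLength, seqStart, seqId, subLevelLength, subSeqStart;
};

std::vector<SeqInfoSketch> getSeqInfoSketch(const std::vector<int>& starts,
                                            const std::vector<int>& subStarts) {
  std::vector<SeqInfoSketch> out;
  int subSeqEnd = 0;
  for (int i = 0; i + 1 < (int)starts.size(); ++i) {
    SeqInfoSketch info{};
    info.seqStart = starts[i];
    info.subLevelLength = starts[i + 1] - starts[i];
    info.seqId = i;
    if (!subStarts.empty()) {
      // Advance past every subsequence boundary inside this sequence.
      info.subSeqStart = subSeqEnd;
      while (subStarts[subSeqEnd] < starts[i + 1]) ++subSeqEnd;
      info.topLevelLength = subSeqEnd - info.subSeqStart;
    } else {
      info.topLevelLength = info.subLevelLength;
      info.subSeqStart = 0;  // not used without subsequences
    }
    out.push_back(info);
  }
  // Longest-first ordering, as the real getSeqInfo guarantees.
  std::sort(out.begin(), out.end(),
            [](const SeqInfoSketch& a, const SeqInfoSketch& b) {
              return a.topLevelLength > b.topLevelLength;
            });
  return out;
}

int main() {
  // Two sequences [0,4) and [4,10); the second splits into three subsequences.
  std::vector<int> starts = {0, 4, 10};
  std::vector<int> subStarts = {0, 4, 6, 8, 10};
  for (const auto& s : getSeqInfoSketch(starts, subStarts)) {
    std::printf("seqId=%d topLevelLength=%d subLevelLength=%d subSeqStart=%d\n",
                s.seqId, s.topLevelLength, s.subLevelLength, s.subSeqStart);
  }
  return 0;
}

On this data the second sequence sorts first with topLevelLength 3 (three subsequences) and subLevelLength 6, which is exactly the distinction the old tuple-based code obscured.
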
- if (*maxSequenceLength < std::get<3>((*seqLengthAndStart)[i])) - *maxSequenceLength = std::get<3>((*seqLengthAndStart)[i]); - } - *maxSequenceLength += 1; - } else { - size_t numSequences = getNumSequences(); - (*seqLengthAndStart).reserve(numSequences); - for (size_t i = 0; i < numSequences; ++i) { - (*seqLengthAndStart) - .push_back(std::make_tuple( - starts[i + 1] - starts[i], (int)starts[i], (int)i, (int)i)); + info.topLevelLength = subSeqEnd - info.subSeqStart; + } else { + info.topLevelLength = info.subLevelLength; + info.subSeqStart = 0; // not used } - std::sort((*seqLengthAndStart).begin(), (*seqLengthAndStart).end(), - std::greater>()); - - *maxSequenceLength = std::get<0>((*seqLengthAndStart)[0]); + seqInfo->push_back(info); } + std::sort(seqInfo->begin(), seqInfo->end(), + [](const SeqInfo& a, const SeqInfo& b) { + return a.topLevelLength > b.topLevelLength; + }); } void Argument::checkSubset() const { @@ -533,11 +554,16 @@ void Argument::degradeSequence(const Argument& input, bool useGpu) { void Argument::subArgFrom(const Argument& input, size_t offset, size_t height, size_t width, bool useGpu, bool trans, bool seqFlag, size_t seqStart, size_t seqSize) { - value = Matrix::create(input.value->getData() + offset, height, width, trans, - useGpu); + if (input.value) { + value = Matrix::create(input.value->getData() + offset * width, + height, width, trans, useGpu); + } + if (input.ids) { + ids = IVector::create(input.ids->getData() + offset, height, useGpu); + } if (input.grad) { - grad = Matrix::create(input.grad->getData() + offset, height, width, trans, - useGpu); + grad = Matrix::create(input.grad->getData() + offset * width, + height, width, trans, useGpu); } if (seqFlag) { sequenceStartPositions = std::make_shared( diff --git a/paddle/parameter/Argument.h b/paddle/parameter/Argument.h index c444ebaf12..81ff9029bc 100644 --- a/paddle/parameter/Argument.h +++ b/paddle/parameter/Argument.h @@ -177,11 +177,11 @@ struct Argument { } /** - * @brief (value, grad, sequenceStartPositions) of output are subset of + * @brief (value, ids, grad, sequenceStartPositions) of output are subset of * input. Note that, output share the same memory of input. 
* * @param input[in] input - * @param offset[in] offset of input.value + * @param offset[in] offset in terms of rows * @param height[in] height of output.value * @param width[in] width of output.value * @param useGpu[in] @@ -203,13 +203,28 @@ struct Argument { * startSeq: the sample id of start * copySize: how many samples need to be copied * return value: how many samples are copied + * Note that when the stream is specified explicitly, the caller + * must also synchronize it at some point after this function returns */ int32_t resizeAndCopyFrom(const Argument& src, int32_t startSeq, - int32_t copySize, bool useGpu = FLAGS_use_gpu, - hl_stream_t stream = HPPL_STREAM_DEFAULT); + int32_t copySize, bool useGpu, hl_stream_t stream); - void resizeAndCopyFrom(const Argument& src, bool useGpu = FLAGS_use_gpu, - hl_stream_t stream = HPPL_STREAM_DEFAULT); + /* + * same as the above function, except that the stream is + * HPPL_STREAM_DEFAULT and synchronize is automatically called + * inside it + */ + int32_t resizeAndCopyFrom(const Argument& src, int32_t startSeq, + int32_t copySize, bool useGpu = FLAGS_use_gpu); + + void resizeAndCopyFrom(const Argument& src, bool useGpu, hl_stream_t stream); + + /* + * same as the above function, except that the stream is + * HPPL_STREAM_DEFAULT and synchronize is automatically called + * inside it + */ + void resizeAndCopyFrom(const Argument& src, bool useGpu = FLAGS_use_gpu); /* @brief Concatenate several arguments into one and put the result into it. @@ -238,12 +253,29 @@ struct Argument { static void splitByDataId(const std::vector<Argument>& argus, std::vector<std::vector<Argument>>* arguGroups); + struct SeqInfo { + // Equal to sequence length for sequence data + // Equal to number of subsequences for subsequence data + int topLevelLength; + + int seqStart; + int seqId; + + // Equal to topLevelLength for sequence data + // Equal to sum of the length of subsequences for subsequence data + int subLevelLength; + + // Only used for subsequence data, start position of this sequence + // is subSequenceStartPositions, i.e. + // subSequenceStartPositions[subSeqStart] == seqStart + int subSeqStart; + }; /* - Get Sequence Length, startPositions and max Length according to input - */ - void getSeqLengthAndStart( - std::vector<std::tuple<int, int, int, int>>* seqLengthAndStart, - int* maxSequenceLength) const; + Get SeqInfo for each sequence of this argument. + Elements in *seqInfo are sorted by topLevelLength in descending order. + */ + void getSeqInfo(std::vector<SeqInfo>* seqInfo) const; + /* Check whether sequenceStartPositions is subset of subSequenceStartPositions. diff --git a/paddle/parameter/tests/test_common.cpp b/paddle/parameter/tests/test_common.cpp index 3db96ccf94..1a22abf7cf 100644 --- a/paddle/parameter/tests/test_common.cpp +++ b/paddle/parameter/tests/test_common.cpp @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License.
*/ -#include +#include #include #include @@ -124,9 +124,13 @@ void CommonTest::test_sgdUpadate(real* gradientBuffer, real* valueBuffer, TEST_F(CommonTest, sgdUpdate) { const size_t alignHeader[] = {0, 2, 3, 5, 7, 8}; for (auto& size : sizeVec_) { - real* gradientBuffer = (real*)memalign(32, sizeof(real) * size); - real* valueBuffer = (real*)memalign(32, sizeof(real) * size); - real* momentumBuffer = (real*)memalign(32, sizeof(real) * size); + real *gradientBuffer, *valueBuffer, *momentumBuffer; + CHECK_EQ(posix_memalign((void**)&gradientBuffer, 32, sizeof(real) * size), + 0); + CHECK_EQ(posix_memalign((void**)&valueBuffer, 32, sizeof(real) * size), 0); + CHECK_EQ(posix_memalign((void**)&momentumBuffer, 32, sizeof(real) * size), + 0); + for (size_t i = 0; i < size; i++) { gradientBuffer[i] = 1.0; valueBuffer[i] = 2.0; diff --git a/paddle/pserver/LightNetwork.cpp b/paddle/pserver/LightNetwork.cpp index fb427832fa..ff2875fc70 100644 --- a/paddle/pserver/LightNetwork.cpp +++ b/paddle/pserver/LightNetwork.cpp @@ -17,6 +17,7 @@ limitations under the License. */ #include #include #include +#include #include #include @@ -24,7 +25,6 @@ limitations under the License. */ #include #include #include -#include #include "LightNetwork.h" #include "paddle/utils/Util.h" @@ -79,6 +79,7 @@ std::string getIpAddr(std::string &device) { * @note adjust some default sock option for better performance */ void setOption(int sockfd) { +#if !defined(__APPLE__) && !defined(__OSX__) int sendSize = FLAGS_sock_send_buf_size; int recvSize = FLAGS_sock_recv_buf_size; CHECK_GE( @@ -87,15 +88,19 @@ void setOption(int sockfd) { CHECK_GE( setsockopt(sockfd, SOL_SOCKET, SO_SNDBUF, &sendSize, sizeof(sendSize)), 0); +#endif + if (FLAGS_small_messages) { int optval = 1; CHECK_GE( setsockopt(sockfd, IPPROTO_TCP, TCP_NODELAY, &optval, sizeof(optval)), 0); +#ifdef TCP_QUICKACK optval = 1; CHECK_GE( setsockopt(sockfd, IPPROTO_TCP, TCP_QUICKACK, &optval, sizeof(optval)), 0); +#endif } int reuse = 1; CHECK_GE(setsockopt(sockfd, SOL_SOCKET, SO_REUSEADDR, &reuse, sizeof(reuse)), @@ -340,17 +345,27 @@ void SocketWorker::run() { */ void SocketClient::TcpClient(const std::string &serverAddr, int serverPort) { struct sockaddr_in serv_addr; - struct hostent hostinfo, *server; - char buf[1024]; // temp for gethostbyname_r + struct hostent *server; + int errRet; // temp for gethostbyname_r /// Create a socket point int sockfd = socket(AF_INET, SOCK_STREAM, 0); PCHECK(sockfd >= 0) << "ERROR opening socket"; - CHECK_EQ(0, gethostbyname_r(serverAddr.c_str(), &hostinfo, buf, sizeof(buf), - &server, &errRet)) - << "ERROR, no such host: " << serverAddr << " ret = " << errRet; - CHECK(server) << "gethostbyname_r err"; + +#if defined(__OSX__) || defined(__APPLE__) + server = getipnodebyname(serverAddr.c_str(), AF_INET, AI_DEFAULT, &errRet); + CHECK_NE(HOST_NOT_FOUND, errRet) + << "ERROR, no such host: " << serverAddr << " ret = " << errRet; + CHECK(server) << "getipnodebyname error!"; +#else + struct hostent hostinfo; + char buf[1024]; // temp for gethostbyname_r + CHECK_EQ(0, gethostbyname_r(serverAddr.c_str(), &hostinfo, buf, sizeof(buf), + &server, &errRet)) + << "ERROR, no such host: " << serverAddr << " ret = " << errRet; + CHECK(server) << "gethostbyname_r error!"; +#endif bzero((char *)&serv_addr, sizeof(serv_addr)); serv_addr.sin_family = AF_INET; diff --git a/paddle/pserver/ParameterClient2.cpp b/paddle/pserver/ParameterClient2.cpp index 07961cbdcc..d0e5352c82 100644 --- a/paddle/pserver/ParameterClient2.cpp +++ 
b/paddle/pserver/ParameterClient2.cpp @@ -278,7 +278,7 @@ void ParameterClient2::prepareSendData( if (sendingPara) { sendJob->parallelInputIovs[serverId].push_back( - {sendMat->getLocalRow(row), sizeof(real) * blockSize}); + {sendMat->getLocalRow(row), sizeof(real) * (size_t) blockSize}); /// detect sparse parameter distribution sparseDistribution_->probeDistribution(serverId, sizeof(real) * blockSize); @@ -302,8 +302,8 @@ void ParameterClient2::prepareSendData( block->set_begin_pos(beginDim); block->set_block_size(endDim - beginDim); if (buf) { - sendJob->parallelInputIovs[serverId].push_back( - {buf + beginDim, sizeof(real) * (endDim - beginDim)}); + sendJob->parallelInputIovs[serverId].push_back({buf + beginDim, + sizeof(real) * ((size_t) (endDim - beginDim))}); } } } diff --git a/paddle/pserver/ParameterServer2.cpp b/paddle/pserver/ParameterServer2.cpp index bb3caeb728..8f72c1988d 100644 --- a/paddle/pserver/ParameterServer2.cpp +++ b/paddle/pserver/ParameterServer2.cpp @@ -724,7 +724,7 @@ void ParameterServer2::sendBackParameter(const ParameterBlock& block, << " id=" << block.para_id() << " block id=" << block.block_id(); real* valueBuffer = vectors_[parameterType]->getPoint(offset); - outputBuffers->push_back({valueBuffer, block.block_size()}); + outputBuffers->push_back({valueBuffer, (size_t) block.block_size()}); } void ParameterServer2::sendBackParameter(const ParameterBlock& block, diff --git a/paddle/pserver/SocketChannel.cpp b/paddle/pserver/SocketChannel.cpp index ebb4245b9a..b9d542a296 100644 --- a/paddle/pserver/SocketChannel.cpp +++ b/paddle/pserver/SocketChannel.cpp @@ -27,6 +27,15 @@ limitations under the License. */ namespace paddle { +/** + * UIO_MAXIOV is documented in writev(2), but only + * declares it on osx/ios if defined(KERNEL) + */ +#ifndef UIO_MAXIOV +#define UIO_MAXIOV 512 +#endif + + SocketChannel::~SocketChannel() { if (tcpRdma_ == F_TCP) close(tcpSocket_); diff --git a/paddle/scripts/docker/cpu/Dockerfile b/paddle/scripts/docker/Dockerfile.cpu similarity index 94% rename from paddle/scripts/docker/cpu/Dockerfile rename to paddle/scripts/docker/Dockerfile.cpu index 119154200a..3aa8cb1a3a 100644 --- a/paddle/scripts/docker/cpu/Dockerfile +++ b/paddle/scripts/docker/Dockerfile.cpu @@ -7,4 +7,5 @@ ENV WITH_DEMO=OFF ENV PIP_INSTALL_ARGS "" ENV PIP_GENERAL_ARGS "" ENV USE_UBUNTU_MIRROR OFF +ENV WITH_AVX=ON RUN cd /root/ && bash build.sh diff --git a/paddle/scripts/docker/cpu-demo/Dockerfile b/paddle/scripts/docker/Dockerfile.cpu-demo similarity index 93% rename from paddle/scripts/docker/cpu-demo/Dockerfile rename to paddle/scripts/docker/Dockerfile.cpu-demo index b229120382..22c0b9e701 100644 --- a/paddle/scripts/docker/cpu-demo/Dockerfile +++ b/paddle/scripts/docker/Dockerfile.cpu-demo @@ -7,4 +7,5 @@ ENV WITH_DEMO=ON ENV PIP_INSTALL_ARGS "" ENV PIP_GENERAL_ARGS "" ENV USE_UBUNTU_MIRROR OFF +ENV WITH_AVX=ON RUN cd /root/ && bash build.sh diff --git a/paddle/scripts/docker/cpu-devel/Dockerfile b/paddle/scripts/docker/Dockerfile.cpu-devel similarity index 94% rename from paddle/scripts/docker/cpu-devel/Dockerfile rename to paddle/scripts/docker/Dockerfile.cpu-devel index 1bfa202d0c..b40f3c0a30 100644 --- a/paddle/scripts/docker/cpu-devel/Dockerfile +++ b/paddle/scripts/docker/Dockerfile.cpu-devel @@ -7,4 +7,5 @@ ENV WITH_DEMO=OFF ENV PIP_INSTALL_ARGS "" ENV PIP_GENERAL_ARGS "" ENV USE_UBUNTU_MIRROR OFF +ENV WITH_AVX=ON RUN cd /root/ && bash build.sh diff --git a/paddle/scripts/docker/Dockerfile.cpu-noavx b/paddle/scripts/docker/Dockerfile.cpu-noavx new file mode 
100644 index 0000000000..5cb5ac7dc4 --- /dev/null +++ b/paddle/scripts/docker/Dockerfile.cpu-noavx @@ -0,0 +1,11 @@ +FROM ubuntu:14.04 +MAINTAINER PaddlePaddle Dev Team +COPY build.sh /root/ +ENV WITH_GPU=OFF +ENV IS_DEVEL=OFF +ENV WITH_DEMO=OFF +ENV PIP_INSTALL_ARGS "" +ENV PIP_GENERAL_ARGS "" +ENV USE_UBUNTU_MIRROR OFF +ENV WITH_AVX=OFF +RUN cd /root/ && bash build.sh diff --git a/paddle/scripts/docker/Dockerfile.cpu-noavx-demo b/paddle/scripts/docker/Dockerfile.cpu-noavx-demo new file mode 100644 index 0000000000..bec401960e --- /dev/null +++ b/paddle/scripts/docker/Dockerfile.cpu-noavx-demo @@ -0,0 +1,11 @@ +FROM ubuntu:14.04 +MAINTAINER PaddlePaddle Dev Team +COPY build.sh /root/ +ENV WITH_GPU=OFF +ENV IS_DEVEL=ON +ENV WITH_DEMO=ON +ENV PIP_INSTALL_ARGS "" +ENV PIP_GENERAL_ARGS "" +ENV USE_UBUNTU_MIRROR OFF +ENV WITH_AVX=OFF +RUN cd /root/ && bash build.sh diff --git a/paddle/scripts/docker/Dockerfile.cpu-noavx-devel b/paddle/scripts/docker/Dockerfile.cpu-noavx-devel new file mode 100644 index 0000000000..b7c3eaed97 --- /dev/null +++ b/paddle/scripts/docker/Dockerfile.cpu-noavx-devel @@ -0,0 +1,11 @@ +FROM ubuntu:14.04 +MAINTAINER PaddlePaddle Dev Team +COPY build.sh /root/ +ENV WITH_GPU=OFF +ENV IS_DEVEL=ON +ENV WITH_DEMO=OFF +ENV PIP_INSTALL_ARGS "" +ENV PIP_GENERAL_ARGS "" +ENV USE_UBUNTU_MIRROR OFF +ENV WITH_AVX=OFF +RUN cd /root/ && bash build.sh diff --git a/paddle/scripts/docker/gpu/Dockerfile b/paddle/scripts/docker/Dockerfile.gpu similarity index 94% rename from paddle/scripts/docker/gpu/Dockerfile rename to paddle/scripts/docker/Dockerfile.gpu index 62d6f1f987..b7f5b6d93d 100644 --- a/paddle/scripts/docker/gpu/Dockerfile +++ b/paddle/scripts/docker/Dockerfile.gpu @@ -7,4 +7,5 @@ ENV WITH_DEMO=OFF ENV PIP_INSTALL_ARGS "" ENV PIP_GENERAL_ARGS "" ENV USE_UBUNTU_MIRROR OFF +ENV WITH_AVX=ON RUN cd /root/ && bash build.sh diff --git a/paddle/scripts/docker/gpu-demo/Dockerfile b/paddle/scripts/docker/Dockerfile.gpu-demo similarity index 94% rename from paddle/scripts/docker/gpu-demo/Dockerfile rename to paddle/scripts/docker/Dockerfile.gpu-demo index f3b8cd568d..2d1411de09 100644 --- a/paddle/scripts/docker/gpu-demo/Dockerfile +++ b/paddle/scripts/docker/Dockerfile.gpu-demo @@ -7,4 +7,5 @@ ENV WITH_DEMO=ON ENV PIP_INSTALL_ARGS "" ENV PIP_GENERAL_ARGS "" ENV USE_UBUNTU_MIRROR OFF +ENV WITH_AVX=ON RUN cd /root/ && bash build.sh diff --git a/paddle/scripts/docker/gpu-devel/Dockerfile b/paddle/scripts/docker/Dockerfile.gpu-devel similarity index 94% rename from paddle/scripts/docker/gpu-devel/Dockerfile rename to paddle/scripts/docker/Dockerfile.gpu-devel index 2e600f34d0..eb13f4304f 100644 --- a/paddle/scripts/docker/gpu-devel/Dockerfile +++ b/paddle/scripts/docker/Dockerfile.gpu-devel @@ -7,4 +7,5 @@ ENV WITH_DEMO=OFF ENV PIP_INSTALL_ARGS "" ENV PIP_GENERAL_ARGS "" ENV USE_UBUNTU_MIRROR OFF +ENV WITH_AVX=ON RUN cd /root/ && bash build.sh diff --git a/paddle/scripts/docker/Dockerfile.gpu-noavx b/paddle/scripts/docker/Dockerfile.gpu-noavx new file mode 100644 index 0000000000..0944b0e152 --- /dev/null +++ b/paddle/scripts/docker/Dockerfile.gpu-noavx @@ -0,0 +1,11 @@ +FROM nvidia/cuda:7.5-cudnn5-devel-ubuntu14.04 +MAINTAINER PaddlePaddle Dev Team +COPY build.sh /root/ +ENV WITH_GPU=ON +ENV IS_DEVEL=OFF +ENV WITH_DEMO=OFF +ENV PIP_INSTALL_ARGS "" +ENV PIP_GENERAL_ARGS "" +ENV USE_UBUNTU_MIRROR OFF +ENV WITH_AVX=OFF +RUN cd /root/ && bash build.sh diff --git a/paddle/scripts/docker/Dockerfile.gpu-noavx-demo b/paddle/scripts/docker/Dockerfile.gpu-noavx-demo new file mode 100644 index 
0000000000..2da2a55d69 --- /dev/null +++ b/paddle/scripts/docker/Dockerfile.gpu-noavx-demo @@ -0,0 +1,11 @@ +FROM nvidia/cuda:7.5-cudnn5-devel-ubuntu14.04 +MAINTAINER PaddlePaddle Dev Team +COPY build.sh /root/ +ENV WITH_GPU=ON +ENV IS_DEVEL=ON +ENV WITH_DEMO=ON +ENV PIP_INSTALL_ARGS "" +ENV PIP_GENERAL_ARGS "" +ENV USE_UBUNTU_MIRROR OFF +ENV WITH_AVX=OFF +RUN cd /root/ && bash build.sh diff --git a/paddle/scripts/docker/Dockerfile.gpu-noavx-devel b/paddle/scripts/docker/Dockerfile.gpu-noavx-devel new file mode 100644 index 0000000000..9f551462f2 --- /dev/null +++ b/paddle/scripts/docker/Dockerfile.gpu-noavx-devel @@ -0,0 +1,11 @@ +FROM nvidia/cuda:7.5-cudnn5-devel-ubuntu14.04 +MAINTAINER PaddlePaddle Dev Team +COPY build.sh /root/ +ENV WITH_GPU=ON +ENV IS_DEVEL=ON +ENV WITH_DEMO=OFF +ENV PIP_INSTALL_ARGS "" +ENV PIP_GENERAL_ARGS "" +ENV USE_UBUNTU_MIRROR OFF +ENV WITH_AVX=OFF +RUN cd /root/ && bash build.sh diff --git a/paddle/scripts/docker/Dockerfile.m4 b/paddle/scripts/docker/Dockerfile.m4 index 89a1147103..129d21b36a 100644 --- a/paddle/scripts/docker/Dockerfile.m4 +++ b/paddle/scripts/docker/Dockerfile.m4 @@ -7,4 +7,5 @@ ENV WITH_DEMO=PADDLE_WITH_DEMO ENV PIP_INSTALL_ARGS "" ENV PIP_GENERAL_ARGS "" ENV USE_UBUNTU_MIRROR OFF +ENV WITH_AVX=PADDLE_WITH_AVX RUN cd /root/ && bash build.sh diff --git a/paddle/scripts/docker/build.sh b/paddle/scripts/docker/build.sh index 1f74e1f1af..33689e736c 100644 --- a/paddle/scripts/docker/build.sh +++ b/paddle/scripts/docker/build.sh @@ -26,7 +26,7 @@ cd paddle mkdir build cd build cmake .. -DWITH_DOC=OFF -DWITH_GPU=${WITH_GPU} -DWITH_SWIG_PY=ON\ - -DCUDNN_ROOT=/usr/ -DWITH_STYLE_CHECK=OFF + -DCUDNN_ROOT=/usr/ -DWITH_STYLE_CHECK=OFF -DWITH_AVX=${WITH_AVX} make -j `nproc` # because durning make install, there are several warning, so set +e, do not cause abort make install diff --git a/paddle/scripts/docker/cpu-demo/build.sh b/paddle/scripts/docker/cpu-demo/build.sh deleted file mode 100644 index 1f74e1f1af..0000000000 --- a/paddle/scripts/docker/cpu-demo/build.sh +++ /dev/null @@ -1,48 +0,0 @@ -#!/bin/bash - -function abort(){ - echo "An error occurred. Exiting..." 1>&2 - exit 1 -} - -trap 'abort' 0 -set -e -if [ ${USE_UBUNTU_MIRROR} == "ON" ]; then - sed -i 's#http://archive\.ubuntu\.com/ubuntu/#mirror://mirrors\.ubuntu\.com/mirrors\.txt#g'\ - /etc/apt/sources.list -fi -apt-get update -apt-get install -y cmake libprotobuf-dev protobuf-compiler git \ - libgoogle-glog-dev libgflags-dev libatlas-dev libatlas3-base g++ m4 python-pip\ - python-protobuf python-numpy python-dev swig - -if [ ${WITH_GPU} == 'ON' ]; then - ln -s /usr/lib/x86_64-linux-gnu/libcudnn.so /usr/lib/libcudnn.so -fi - -cd ~ -git clone https://github.com/baidu/Paddle.git paddle -cd paddle -mkdir build -cd build -cmake .. -DWITH_DOC=OFF -DWITH_GPU=${WITH_GPU} -DWITH_SWIG_PY=ON\ - -DCUDNN_ROOT=/usr/ -DWITH_STYLE_CHECK=OFF -make -j `nproc` -# because durning make install, there are several warning, so set +e, do not cause abort -make install -echo 'export LD_LIBRARY_PATH=/usr/lib64:${LD_LIBRARY_PATH}' >> /etc/profile -pip ${PIP_GENERAL_ARGS} install ${PIP_INSTALL_ARGS} /usr/local/opt/paddle/share/wheels/*.whl -paddle version # print version after build - -if [ ${WITH_DEMO} == "ON" ]; then - apt-get install -y wget unzip perl python-matplotlib tar xz-utils bzip2 gzip coreutils\ - sed grep graphviz libjpeg-dev zlib1g-dev - pip ${PIP_GENERAL_ARGS} install ${PIP_INSTALL_ARGS} BeautifulSoup docopt \ - PyYAML pillow -fi -if [ ${IS_DEVEL} == "OFF" ]; then # clean build packages. 
- cd ~ - rm -rf paddle -fi -apt-get clean -y -trap : 0 diff --git a/paddle/scripts/docker/cpu-devel/build.sh b/paddle/scripts/docker/cpu-devel/build.sh deleted file mode 100644 index 1f74e1f1af..0000000000 --- a/paddle/scripts/docker/cpu-devel/build.sh +++ /dev/null @@ -1,48 +0,0 @@ -#!/bin/bash - -function abort(){ - echo "An error occurred. Exiting..." 1>&2 - exit 1 -} - -trap 'abort' 0 -set -e -if [ ${USE_UBUNTU_MIRROR} == "ON" ]; then - sed -i 's#http://archive\.ubuntu\.com/ubuntu/#mirror://mirrors\.ubuntu\.com/mirrors\.txt#g'\ - /etc/apt/sources.list -fi -apt-get update -apt-get install -y cmake libprotobuf-dev protobuf-compiler git \ - libgoogle-glog-dev libgflags-dev libatlas-dev libatlas3-base g++ m4 python-pip\ - python-protobuf python-numpy python-dev swig - -if [ ${WITH_GPU} == 'ON' ]; then - ln -s /usr/lib/x86_64-linux-gnu/libcudnn.so /usr/lib/libcudnn.so -fi - -cd ~ -git clone https://github.com/baidu/Paddle.git paddle -cd paddle -mkdir build -cd build -cmake .. -DWITH_DOC=OFF -DWITH_GPU=${WITH_GPU} -DWITH_SWIG_PY=ON\ - -DCUDNN_ROOT=/usr/ -DWITH_STYLE_CHECK=OFF -make -j `nproc` -# because durning make install, there are several warning, so set +e, do not cause abort -make install -echo 'export LD_LIBRARY_PATH=/usr/lib64:${LD_LIBRARY_PATH}' >> /etc/profile -pip ${PIP_GENERAL_ARGS} install ${PIP_INSTALL_ARGS} /usr/local/opt/paddle/share/wheels/*.whl -paddle version # print version after build - -if [ ${WITH_DEMO} == "ON" ]; then - apt-get install -y wget unzip perl python-matplotlib tar xz-utils bzip2 gzip coreutils\ - sed grep graphviz libjpeg-dev zlib1g-dev - pip ${PIP_GENERAL_ARGS} install ${PIP_INSTALL_ARGS} BeautifulSoup docopt \ - PyYAML pillow -fi -if [ ${IS_DEVEL} == "OFF" ]; then # clean build packages. - cd ~ - rm -rf paddle -fi -apt-get clean -y -trap : 0 diff --git a/paddle/scripts/docker/cpu/build.sh b/paddle/scripts/docker/cpu/build.sh deleted file mode 100644 index 1f74e1f1af..0000000000 --- a/paddle/scripts/docker/cpu/build.sh +++ /dev/null @@ -1,48 +0,0 @@ -#!/bin/bash - -function abort(){ - echo "An error occurred. Exiting..." 1>&2 - exit 1 -} - -trap 'abort' 0 -set -e -if [ ${USE_UBUNTU_MIRROR} == "ON" ]; then - sed -i 's#http://archive\.ubuntu\.com/ubuntu/#mirror://mirrors\.ubuntu\.com/mirrors\.txt#g'\ - /etc/apt/sources.list -fi -apt-get update -apt-get install -y cmake libprotobuf-dev protobuf-compiler git \ - libgoogle-glog-dev libgflags-dev libatlas-dev libatlas3-base g++ m4 python-pip\ - python-protobuf python-numpy python-dev swig - -if [ ${WITH_GPU} == 'ON' ]; then - ln -s /usr/lib/x86_64-linux-gnu/libcudnn.so /usr/lib/libcudnn.so -fi - -cd ~ -git clone https://github.com/baidu/Paddle.git paddle -cd paddle -mkdir build -cd build -cmake .. -DWITH_DOC=OFF -DWITH_GPU=${WITH_GPU} -DWITH_SWIG_PY=ON\ - -DCUDNN_ROOT=/usr/ -DWITH_STYLE_CHECK=OFF -make -j `nproc` -# because durning make install, there are several warning, so set +e, do not cause abort -make install -echo 'export LD_LIBRARY_PATH=/usr/lib64:${LD_LIBRARY_PATH}' >> /etc/profile -pip ${PIP_GENERAL_ARGS} install ${PIP_INSTALL_ARGS} /usr/local/opt/paddle/share/wheels/*.whl -paddle version # print version after build - -if [ ${WITH_DEMO} == "ON" ]; then - apt-get install -y wget unzip perl python-matplotlib tar xz-utils bzip2 gzip coreutils\ - sed grep graphviz libjpeg-dev zlib1g-dev - pip ${PIP_GENERAL_ARGS} install ${PIP_INSTALL_ARGS} BeautifulSoup docopt \ - PyYAML pillow -fi -if [ ${IS_DEVEL} == "OFF" ]; then # clean build packages. 
- cd ~ - rm -rf paddle -fi -apt-get clean -y -trap : 0 diff --git a/paddle/scripts/docker/generate.sh b/paddle/scripts/docker/generate.sh index 009c4a8a56..8a50aefd34 100644 --- a/paddle/scripts/docker/generate.sh +++ b/paddle/scripts/docker/generate.sh @@ -2,33 +2,60 @@ set -e cd `dirname $0` m4 -DPADDLE_WITH_GPU=OFF -DPADDLE_IS_DEVEL=OFF -DPADDLE_WITH_DEMO=OFF \ - -DPADDLE_BASE_IMAGE=ubuntu:14.04\ - Dockerfile.m4 > cpu/Dockerfile -cp build.sh cpu/ + -DPADDLE_BASE_IMAGE=ubuntu:14.04 -DPADDLE_WITH_AVX=ON\ + Dockerfile.m4 > Dockerfile.cpu + +m4 -DPADDLE_WITH_GPU=OFF -DPADDLE_IS_DEVEL=OFF -DPADDLE_WITH_DEMO=OFF \ + -DPADDLE_BASE_IMAGE=ubuntu:14.04 -DPADDLE_WITH_AVX=OFF\ + Dockerfile.m4 > Dockerfile.cpu-noavx m4 -DPADDLE_WITH_GPU=OFF -DPADDLE_IS_DEVEL=ON -DPADDLE_WITH_DEMO=OFF \ - -DPADDLE_BASE_IMAGE=ubuntu:14.04\ - Dockerfile.m4 > cpu-devel/Dockerfile -cp build.sh cpu-devel/ + -DPADDLE_BASE_IMAGE=ubuntu:14.04 -DPADDLE_WITH_AVX=OFF\ + Dockerfile.m4 > Dockerfile.cpu-noavx-devel + +m4 -DPADDLE_WITH_GPU=OFF -DPADDLE_IS_DEVEL=ON -DPADDLE_WITH_DEMO=OFF \ + -DPADDLE_BASE_IMAGE=ubuntu:14.04 -DPADDLE_WITH_AVX=ON\ + Dockerfile.m4 > Dockerfile.cpu-devel + m4 -DPADDLE_WITH_GPU=OFF -DPADDLE_IS_DEVEL=ON -DPADDLE_WITH_DEMO=ON \ - -DPADDLE_BASE_IMAGE=ubuntu:14.04\ - Dockerfile.m4 > cpu-demo/Dockerfile -cp build.sh cpu-demo/ + -DPADDLE_BASE_IMAGE=ubuntu:14.04 -DPADDLE_WITH_AVX=ON\ + Dockerfile.m4 > Dockerfile.cpu-demo + +m4 -DPADDLE_WITH_GPU=OFF -DPADDLE_IS_DEVEL=ON -DPADDLE_WITH_DEMO=ON \ + -DPADDLE_BASE_IMAGE=ubuntu:14.04 -DPADDLE_WITH_AVX=OFF\ + Dockerfile.m4 > Dockerfile.cpu-noavx-demo + + +m4 -DPADDLE_WITH_GPU=ON -DPADDLE_IS_DEVEL=OFF -DPADDLE_WITH_DEMO=OFF \ + -DPADDLE_BASE_IMAGE=nvidia/cuda:7.5-cudnn5-devel-ubuntu14.04 \ + -DPADDLE_WITH_AVX=ON \ + Dockerfile.m4 > Dockerfile.gpu m4 -DPADDLE_WITH_GPU=ON -DPADDLE_IS_DEVEL=OFF -DPADDLE_WITH_DEMO=OFF \ -DPADDLE_BASE_IMAGE=nvidia/cuda:7.5-cudnn5-devel-ubuntu14.04 \ - Dockerfile.m4 > gpu/Dockerfile -cp build.sh gpu/ + -DPADDLE_WITH_AVX=OFF \ + Dockerfile.m4 > Dockerfile.gpu-noavx + m4 -DPADDLE_WITH_GPU=ON -DPADDLE_IS_DEVEL=ON -DPADDLE_WITH_DEMO=OFF \ -DPADDLE_BASE_IMAGE=nvidia/cuda:7.5-cudnn5-devel-ubuntu14.04 \ - Dockerfile.m4 > gpu-devel/Dockerfile -cp build.sh gpu-devel/ + -DPADDLE_WITH_AVX=ON \ + Dockerfile.m4 > Dockerfile.gpu-devel + +m4 -DPADDLE_WITH_GPU=ON -DPADDLE_IS_DEVEL=ON -DPADDLE_WITH_DEMO=OFF \ + -DPADDLE_BASE_IMAGE=nvidia/cuda:7.5-cudnn5-devel-ubuntu14.04 \ + -DPADDLE_WITH_AVX=OFF \ + Dockerfile.m4 > Dockerfile.gpu-noavx-devel m4 -DPADDLE_WITH_GPU=ON -DPADDLE_IS_DEVEL=ON -DPADDLE_WITH_DEMO=ON \ -DPADDLE_BASE_IMAGE=nvidia/cuda:7.5-cudnn5-devel-ubuntu14.04 \ - Dockerfile.m4 > gpu-demo/Dockerfile -cp build.sh gpu-demo/ + -DPADDLE_WITH_AVX=ON \ + Dockerfile.m4 > Dockerfile.gpu-demo + +m4 -DPADDLE_WITH_GPU=ON -DPADDLE_IS_DEVEL=ON -DPADDLE_WITH_DEMO=ON \ + -DPADDLE_BASE_IMAGE=nvidia/cuda:7.5-cudnn5-devel-ubuntu14.04 \ + -DPADDLE_WITH_AVX=OFF \ + Dockerfile.m4 > Dockerfile.gpu-noavx-demo diff --git a/paddle/scripts/docker/gpu-demo/build.sh b/paddle/scripts/docker/gpu-demo/build.sh deleted file mode 100644 index 1f74e1f1af..0000000000 --- a/paddle/scripts/docker/gpu-demo/build.sh +++ /dev/null @@ -1,48 +0,0 @@ -#!/bin/bash - -function abort(){ - echo "An error occurred. Exiting..." 
1>&2 - exit 1 -} - -trap 'abort' 0 -set -e -if [ ${USE_UBUNTU_MIRROR} == "ON" ]; then - sed -i 's#http://archive\.ubuntu\.com/ubuntu/#mirror://mirrors\.ubuntu\.com/mirrors\.txt#g'\ - /etc/apt/sources.list -fi -apt-get update -apt-get install -y cmake libprotobuf-dev protobuf-compiler git \ - libgoogle-glog-dev libgflags-dev libatlas-dev libatlas3-base g++ m4 python-pip\ - python-protobuf python-numpy python-dev swig - -if [ ${WITH_GPU} == 'ON' ]; then - ln -s /usr/lib/x86_64-linux-gnu/libcudnn.so /usr/lib/libcudnn.so -fi - -cd ~ -git clone https://github.com/baidu/Paddle.git paddle -cd paddle -mkdir build -cd build -cmake .. -DWITH_DOC=OFF -DWITH_GPU=${WITH_GPU} -DWITH_SWIG_PY=ON\ - -DCUDNN_ROOT=/usr/ -DWITH_STYLE_CHECK=OFF -make -j `nproc` -# because durning make install, there are several warning, so set +e, do not cause abort -make install -echo 'export LD_LIBRARY_PATH=/usr/lib64:${LD_LIBRARY_PATH}' >> /etc/profile -pip ${PIP_GENERAL_ARGS} install ${PIP_INSTALL_ARGS} /usr/local/opt/paddle/share/wheels/*.whl -paddle version # print version after build - -if [ ${WITH_DEMO} == "ON" ]; then - apt-get install -y wget unzip perl python-matplotlib tar xz-utils bzip2 gzip coreutils\ - sed grep graphviz libjpeg-dev zlib1g-dev - pip ${PIP_GENERAL_ARGS} install ${PIP_INSTALL_ARGS} BeautifulSoup docopt \ - PyYAML pillow -fi -if [ ${IS_DEVEL} == "OFF" ]; then # clean build packages. - cd ~ - rm -rf paddle -fi -apt-get clean -y -trap : 0 diff --git a/paddle/scripts/docker/gpu-devel/build.sh b/paddle/scripts/docker/gpu-devel/build.sh deleted file mode 100644 index 1f74e1f1af..0000000000 --- a/paddle/scripts/docker/gpu-devel/build.sh +++ /dev/null @@ -1,48 +0,0 @@ -#!/bin/bash - -function abort(){ - echo "An error occurred. Exiting..." 1>&2 - exit 1 -} - -trap 'abort' 0 -set -e -if [ ${USE_UBUNTU_MIRROR} == "ON" ]; then - sed -i 's#http://archive\.ubuntu\.com/ubuntu/#mirror://mirrors\.ubuntu\.com/mirrors\.txt#g'\ - /etc/apt/sources.list -fi -apt-get update -apt-get install -y cmake libprotobuf-dev protobuf-compiler git \ - libgoogle-glog-dev libgflags-dev libatlas-dev libatlas3-base g++ m4 python-pip\ - python-protobuf python-numpy python-dev swig - -if [ ${WITH_GPU} == 'ON' ]; then - ln -s /usr/lib/x86_64-linux-gnu/libcudnn.so /usr/lib/libcudnn.so -fi - -cd ~ -git clone https://github.com/baidu/Paddle.git paddle -cd paddle -mkdir build -cd build -cmake .. -DWITH_DOC=OFF -DWITH_GPU=${WITH_GPU} -DWITH_SWIG_PY=ON\ - -DCUDNN_ROOT=/usr/ -DWITH_STYLE_CHECK=OFF -make -j `nproc` -# because durning make install, there are several warning, so set +e, do not cause abort -make install -echo 'export LD_LIBRARY_PATH=/usr/lib64:${LD_LIBRARY_PATH}' >> /etc/profile -pip ${PIP_GENERAL_ARGS} install ${PIP_INSTALL_ARGS} /usr/local/opt/paddle/share/wheels/*.whl -paddle version # print version after build - -if [ ${WITH_DEMO} == "ON" ]; then - apt-get install -y wget unzip perl python-matplotlib tar xz-utils bzip2 gzip coreutils\ - sed grep graphviz libjpeg-dev zlib1g-dev - pip ${PIP_GENERAL_ARGS} install ${PIP_INSTALL_ARGS} BeautifulSoup docopt \ - PyYAML pillow -fi -if [ ${IS_DEVEL} == "OFF" ]; then # clean build packages. - cd ~ - rm -rf paddle -fi -apt-get clean -y -trap : 0 diff --git a/paddle/scripts/docker/gpu/build.sh b/paddle/scripts/docker/gpu/build.sh deleted file mode 100644 index 1f74e1f1af..0000000000 --- a/paddle/scripts/docker/gpu/build.sh +++ /dev/null @@ -1,48 +0,0 @@ -#!/bin/bash - -function abort(){ - echo "An error occurred. Exiting..." 
1>&2 - exit 1 -} - -trap 'abort' 0 -set -e -if [ ${USE_UBUNTU_MIRROR} == "ON" ]; then - sed -i 's#http://archive\.ubuntu\.com/ubuntu/#mirror://mirrors\.ubuntu\.com/mirrors\.txt#g'\ - /etc/apt/sources.list -fi -apt-get update -apt-get install -y cmake libprotobuf-dev protobuf-compiler git \ - libgoogle-glog-dev libgflags-dev libatlas-dev libatlas3-base g++ m4 python-pip\ - python-protobuf python-numpy python-dev swig - -if [ ${WITH_GPU} == 'ON' ]; then - ln -s /usr/lib/x86_64-linux-gnu/libcudnn.so /usr/lib/libcudnn.so -fi - -cd ~ -git clone https://github.com/baidu/Paddle.git paddle -cd paddle -mkdir build -cd build -cmake .. -DWITH_DOC=OFF -DWITH_GPU=${WITH_GPU} -DWITH_SWIG_PY=ON\ - -DCUDNN_ROOT=/usr/ -DWITH_STYLE_CHECK=OFF -make -j `nproc` -# because durning make install, there are several warning, so set +e, do not cause abort -make install -echo 'export LD_LIBRARY_PATH=/usr/lib64:${LD_LIBRARY_PATH}' >> /etc/profile -pip ${PIP_GENERAL_ARGS} install ${PIP_INSTALL_ARGS} /usr/local/opt/paddle/share/wheels/*.whl -paddle version # print version after build - -if [ ${WITH_DEMO} == "ON" ]; then - apt-get install -y wget unzip perl python-matplotlib tar xz-utils bzip2 gzip coreutils\ - sed grep graphviz libjpeg-dev zlib1g-dev - pip ${PIP_GENERAL_ARGS} install ${PIP_INSTALL_ARGS} BeautifulSoup docopt \ - PyYAML pillow -fi -if [ ${IS_DEVEL} == "OFF" ]; then # clean build packages. - cd ~ - rm -rf paddle -fi -apt-get clean -y -trap : 0 diff --git a/paddle/scripts/submit_local.sh.in b/paddle/scripts/submit_local.sh.in index eed2d31593..4cf5f41f19 100644 --- a/paddle/scripts/submit_local.sh.in +++ b/paddle/scripts/submit_local.sh.in @@ -43,6 +43,41 @@ fi export PYTHONPATH=${PWD}:${PYTHONPATH} + +# Check python lib installed or not. +pip --help > /dev/null +if [ $? -ne 0 ]; then + echo "pip should be installed to run paddle." + exit 1 +fi + +INSTALLED_VERSION=`pip freeze 2>/dev/null | grep '^paddle' | sed 's/.*==//g'` + +if [ -z ${INSTALLED_VERSION} ]; then + INSTALLED_VERSION="0.0.0" # not installed +fi +cat </dev/null +if [ $? -eq 0 ]; then + echo "No changes to the output on this push; exiting." + exit 0 +fi +set -e + +# Commit +git add . 
+git config user.name "Travis CI" +git config user.email "paddle-dev@baidu.com" +git commit -m "Deploy to GitHub Pages: ${SHA}" + +# Set ssh private key +openssl aes-256-cbc -K $SSL_KEY -iv $SSL_IV -in ../../paddle/scripts/travis/deploy_key.enc -out deploy_key -d +chmod 600 deploy_key +eval `ssh-agent -s` +ssh-add deploy_key + +# Push +git push $SSH_REPO $TARGET_BRANCH diff --git a/paddle/scripts/travis/main.sh b/paddle/scripts/travis/main.sh new file mode 100755 index 0000000000..c49d4546c2 --- /dev/null +++ b/paddle/scripts/travis/main.sh @@ -0,0 +1,11 @@ +#!/bin/bash +cd `dirname $0` + +if [ ${JOB} == "BUILD_AND_TEST" ]; then + ./build_and_test.sh +elif [ ${JOB} == "DOCS" ]; then + ./docs.sh +else + echo Unknown job ${JOB} + exit 1 +fi diff --git a/paddle/setup.py b/paddle/setup.py.in similarity index 73% rename from paddle/setup.py rename to paddle/setup.py.in index fabe2a6b4c..02ea906743 100644 --- a/paddle/setup.py +++ b/paddle/setup.py.in @@ -17,6 +17,14 @@ from setuptools import setup, Extension import numpy as np import api.paddle_ld_flags +import platform + +system = platform.system().lower() + +is_osx = (system == 'darwin') +is_win = (system == 'windows') +is_lin = (system == 'linux') + # The extra links will passed from COMAKE # because generate paddle LDFLAGS is too complicated to do in setup.py @@ -34,17 +42,24 @@ try: except: pass +if is_lin == True: + extra_links = ["-Xlinker", '-start-group'] + extra_links + ["-Xlinker", "-end-group"] +elif is_osx == True: + extra_links = ["-Wl,-all_load"] + extra_links + +include_dirs = [np.get_include(), "../"] # include numpy and paddle. + setup(name="py_paddle", - version="0.8.0b", # TODO(yuyang18): Make this version same as CMake + version="@PADDLE_VERSION@", ext_modules=[ Extension('py_paddle._swig_paddle', # Build SWIG Extension. - ['Paddle_wrap.cxx'], - extra_link_args=["-Xlinker", '-start-group'] + - extra_links + ["-Xlinker", "-end-group"] + ['Paddle_wrap.cxx'], + include_dirs = include_dirs, + extra_link_args = extra_links ) ], packages=['py_paddle'], - include_dirs = [np.get_include(), "../"], # include numpy and paddle. + include_dirs = include_dirs, install_requires = [ 'numpy>=1.8.0', # The numpy is required. 'protobuf>=2.4.1' # The paddle protobuf version diff --git a/paddle/trainer/ThreadParameterUpdater.cpp b/paddle/trainer/ThreadParameterUpdater.cpp index 65d827787e..91f7f4d29d 100644 --- a/paddle/trainer/ThreadParameterUpdater.cpp +++ b/paddle/trainer/ThreadParameterUpdater.cpp @@ -141,7 +141,7 @@ void SgdThreadUpdater::traverse(GetTraverseCallback getTraverseCallback) { } else if (hasCpuPara) { getGlobalSyncThreadPool()->exec(cpuTraverse); } else if (hasGpuPara) { - cpuTraverse(0, 0); + gpuTraverse(0, 0); } } diff --git a/paddle/trainer/ThreadParameterUpdater.h b/paddle/trainer/ThreadParameterUpdater.h index f47d3b08c1..d8a7a5dd4f 100644 --- a/paddle/trainer/ThreadParameterUpdater.h +++ b/paddle/trainer/ThreadParameterUpdater.h @@ -79,7 +79,7 @@ protected: // The update function for after update operations, such as averager. void threadTraverse(const ParameterOptimizer::TraverseCallback& callback, int tid, size_t numThreads, Parameter* para); - typedef std::function + typedef std::function GetTraverseCallback; void traverse(GetTraverseCallback getTraverseCallback); }; diff --git a/paddle/trainer/Trainer.cpp b/paddle/trainer/Trainer.cpp index 2890f5b5d7..275150e12d 100644 --- a/paddle/trainer/Trainer.cpp +++ b/paddle/trainer/Trainer.cpp @@ -28,6 +28,7 @@ limitations under the License. 
*/ #include "paddle/utils/PythonUtil.h" #include "paddle/utils/Stat.h" #include "paddle/utils/Util.h" +#include "paddle/utils/Excepts.h" #include "paddle/utils/GlobalConstants.h" #include "paddle/gserver/gradientmachines/NeuralNetwork.h" @@ -193,7 +194,7 @@ void Trainer::init(const std::shared_ptr &config, dataProvider_ = dataProvider; if (!dataProvider_ && config_->hasDataConfig()) { - dataProvider_.reset(DataProvider::create(*config_, gpuData)); + dataProvider_.reset(DataProvider::create(*config_, *config_, gpuData)); } if (dataProvider_) { evaluator_.reset(trainerInternal_.getGradientMachine()->makeEvaluator()); @@ -211,7 +212,7 @@ void Trainer::init(const std::shared_ptr &config, testDataProvider_ = testDataProvider; if (!testDataProvider_ && config_->hasTestDataConfig()) { testDataProvider_.reset( - DataProvider::create(config_->getTestDataConfig(), gpuData)); + DataProvider::create(config_->getTestDataConfig(), *config_, gpuData)); } if (testDataProvider_) { tester_.reset(new Tester(config_, createTesterConfig(), diff --git a/paddle/trainer/TrainerInternal.cpp b/paddle/trainer/TrainerInternal.cpp index 76b6b9bc3e..6029a4b2c1 100644 --- a/paddle/trainer/TrainerInternal.cpp +++ b/paddle/trainer/TrainerInternal.cpp @@ -101,6 +101,7 @@ void TrainerInternal::trainOneBatch(int64_t batchId, // it //! to ParameterHook. auto& grad = para->getBuf(PARAMETER_GRADIENT); + SetDevice device(para->getDeviceId()); paraStats[para->getID()].avgAbsGrad = grad->getAbsSum() / para->getSize(); paraStats[para->getID()].maxAbsGrad = grad->getAbsMax(); } diff --git a/paddle/trainer/TrainerMain.cpp b/paddle/trainer/TrainerMain.cpp index dd30b2c8a5..94266639f9 100644 --- a/paddle/trainer/TrainerMain.cpp +++ b/paddle/trainer/TrainerMain.cpp @@ -16,6 +16,7 @@ limitations under the License. 
*/ #include #include "paddle/utils/PythonUtil.h" #include "paddle/utils/StringUtil.h" +#include "paddle/utils/Excepts.h" #include "paddle/pserver/ParameterServer2.h" #include "ParamUtil.h" diff --git a/paddle/trainer/tests/.gitignore b/paddle/trainer/tests/.gitignore index 79f7012036..aedb0ef22e 100644 --- a/paddle/trainer/tests/.gitignore +++ b/paddle/trainer/tests/.gitignore @@ -1,2 +1,3 @@ dump_text.test test_pydata_provider_wrapper.json +*proto.bin diff --git a/paddle/trainer/tests/CMakeLists.txt b/paddle/trainer/tests/CMakeLists.txt index 370f0b4b41..60c129f4e2 100644 --- a/paddle/trainer/tests/CMakeLists.txt +++ b/paddle/trainer/tests/CMakeLists.txt @@ -47,18 +47,19 @@ add_test(NAME test_CompareTwoOpts COMMAND ${PROJ_ROOT}/paddle/.set_python_path.sh -d ${PROJ_ROOT}/python/ ${CMAKE_CURRENT_BINARY_DIR}/test_CompareTwoOpts --config_file_a=trainer/tests/sample_trainer_config_opt_a.conf --config_file_b=trainer/tests/sample_trainer_config_opt_b.conf - --num_passes=1 --need_high_accuracy=1 + --num_passes=1 --need_high_accuracy=0 WORKING_DIRECTORY ${PROJ_ROOT}/paddle/) ################# test_CompareSparse ################## add_unittest_without_exec(test_CompareSparse test_CompareSparse.cpp) -add_test(NAME test_CompareSparse - COMMAND ${PROJ_ROOT}/paddle/.set_python_path.sh -d ${PROJ_ROOT}/python/ - ./.set_port.sh -p port -n 6 - ${CMAKE_CURRENT_BINARY_DIR}/test_CompareSparse +if(NOT ON_TRAVIS) + add_test(NAME test_CompareSparse + COMMAND ${PROJ_ROOT}/paddle/.set_python_path.sh -d ${PROJ_ROOT}/python/ + ./.set_port.sh -p port -n 6 + ${CMAKE_CURRENT_BINARY_DIR}/test_CompareSparse WORKING_DIRECTORY ${PROJ_ROOT}/paddle/) - +endif() ################# test_recurrent_machine_generation ############### add_unittest_without_exec(test_recurrent_machine_generation test_recurrent_machine_generation.cpp) diff --git a/paddle/trainer/tests/mnist.list b/paddle/trainer/tests/mnist.list new file mode 100644 index 0000000000..703e87753d --- /dev/null +++ b/paddle/trainer/tests/mnist.list @@ -0,0 +1 @@ +trainer/tests/mnist_bin_part diff --git a/paddle/trainer/tests/mnist_bin_part b/paddle/trainer/tests/mnist_bin_part new file mode 100644 index 0000000000..08b93a0ebb Binary files /dev/null and b/paddle/trainer/tests/mnist_bin_part differ diff --git a/paddle/trainer/tests/rnn_gen_test_model_dir/r1.test.nest b/paddle/trainer/tests/rnn_gen_test_model_dir/r1.test.nest new file mode 100644 index 0000000000..02c7f142a3 --- /dev/null +++ b/paddle/trainer/tests/rnn_gen_test_model_dir/r1.test.nest @@ -0,0 +1,16 @@ +0 1 2 3 4 + 1 2 3 4 + 1 2 3 4 + 1 2 3 4 + 1 2 3 4 + 1 2 3 4 + 1 2 3 4 + 1 2 3 4 + 1 2 3 4 + 1 2 3 4 + 1 2 3 4 + 1 2 3 4 + 1 2 3 4 + 1 2 3 4 + 1 2 3 4 + diff --git a/paddle/trainer/tests/sample_trainer_config_opt_a.conf b/paddle/trainer/tests/sample_trainer_config_opt_a.conf index 61d2c62d42..f5b1988dda 100644 --- a/paddle/trainer/tests/sample_trainer_config_opt_a.conf +++ b/paddle/trainer/tests/sample_trainer_config_opt_a.conf @@ -12,32 +12,29 @@ # See the License for the specific language governing permissions and # limitations under the License. -#Todo(luotao02) This config is only used for unitest. It is out of date now, and will be updated later. 
+from paddle.trainer_config_helpers import * ################################### Data Configuration ################################### -TrainData(ProtoData(files = "train.list")) +TrainData(ProtoData(files = "trainer/tests/mnist.list")) ################################### Algorithm Configuration ################################### -Settings( - learning_rate_decay_a = 0.0, - learning_rate_decay_b = 0.0, - learning_rate = 1e-03, - batch_size = 1000, - algorithm = 'sgd', - num_batches_per_send_parameter = 1, - num_batches_per_get_parameter = 1, - learning_method='sparse_momentum', -) -default_momentum(0.5) +settings(batch_size = 1000, + learning_method = MomentumOptimizer(momentum=0.5, sparse=False)) ################################### Network Configuration ################################### -Layer(type = "data", name = "input", size = 784) -Layer(inputs = [Input("input", parameter_name = "_layer1.w")], name = "layer1", bias = Bias(parameter_name = "_layer1.bias"), active_type = "sigmoid", type = "fc", size = 800) -Layer(inputs = [Input("layer1", parameter_name = "_layer2.w")], name = "layer2", bias = Bias(parameter_name = "_layer2.bias"), active_type = "sigmoid", type = "fc", size = 800) -#Layer(inputs = [Input("layer2", parameter_name = "_layer_output.w", decay_rate = 0.02)], name = "output", bias = Bias(parameter_name = "_layer_output.bias"), active_type = "margin", type = "fc", size = 10) -#Layer(inputs = [Input("layer2", parameter_name = "_layer_output.w", decay_rate = 0.02)], name = "output", bias = Bias(parameter_name = "_layer_output.bias"), type = "fc", size = 10) -Layer(inputs = [Input("layer2", parameter_name = "_layer_output.w")], name = "output", bias = Bias(parameter_name = "_layer_output.bias"), active_type = "softmax", type = "fc", size = 10) -Layer(type = "data", name = "label", size = 1) -Layer(inputs = [Input("output"), Input("label")], type = "multi-class-cross-entropy", name = "cost") -#Layer(inputs = [Input("output"), Input("label")], type = "huber", name = "cost") -Evaluator(inputs=["output", "label"], type = "classification_error", name = "classification_error") -Inputs("input", "label") -Outputs("cost") +data = data_layer(name ="input", size=784) + +fc1 = fc_layer(input=data, size=800, + bias_attr=True, + act=SigmoidActivation()) + +fc2 = fc_layer(input=fc1, size=800, + bias_attr=True, + act=SigmoidActivation()) + +output = fc_layer(input=[fc1, fc2], size=10, + bias_attr=True, + act=SoftmaxActivation()) + +lbl = data_layer(name ="label", size=1) + +cost = classification_cost(input=output, label=lbl) +outputs(cost) diff --git a/paddle/trainer/tests/sample_trainer_config_opt_b.conf b/paddle/trainer/tests/sample_trainer_config_opt_b.conf index 82d547dd8a..f5b1988dda 100644 --- a/paddle/trainer/tests/sample_trainer_config_opt_b.conf +++ b/paddle/trainer/tests/sample_trainer_config_opt_b.conf @@ -12,32 +12,29 @@ # See the License for the specific language governing permissions and # limitations under the License. -#Todo(luotao02) This config is only used for unitest. It is out of date now, and will be updated later. 
+from paddle.trainer_config_helpers import * ################################### Data Configuration ################################### -TrainData(ProtoData(files = "train.list")) +TrainData(ProtoData(files = "trainer/tests/mnist.list")) ################################### Algorithm Configuration ################################### -Settings( - learning_rate_decay_a = 0.0, - learning_rate_decay_b = 0.0, - learning_rate = 1e-03, - batch_size = 1000, - algorithm = 'sgd', - num_batches_per_send_parameter = 1, - num_batches_per_get_parameter = 1, - learning_method='momentum', -) -default_momentum(0.5) +settings(batch_size = 1000, + learning_method = MomentumOptimizer(momentum=0.5, sparse=False)) ################################### Network Configuration ################################### -Layer(type = "data", name = "input", size = 784) -Layer(inputs = [Input("input", parameter_name = "_layer1.w")], name = "layer1", bias = Bias(parameter_name = "_layer1.bias"), active_type = "sigmoid", type = "fc", size = 800) -Layer(inputs = [Input("layer1", parameter_name = "_layer2.w")], name = "layer2", bias = Bias(parameter_name = "_layer2.bias"), active_type = "sigmoid", type = "fc", size = 800) -#Layer(inputs = [Input("layer2", parameter_name = "_layer_output.w", decay_rate = 0.02)], name = "output", bias = Bias(parameter_name = "_layer_output.bias"), active_type = "margin", type = "fc", size = 10) -#Layer(inputs = [Input("layer2", parameter_name = "_layer_output.w", decay_rate = 0.02)], name = "output", bias = Bias(parameter_name = "_layer_output.bias"), type = "fc", size = 10) -Layer(inputs = [Input("layer2", parameter_name = "_layer_output.w")], name = "output", bias = Bias(parameter_name = "_layer_output.bias"), active_type = "softmax", type = "fc", size = 10) -Layer(type = "data", name = "label", size = 1) -Layer(inputs = [Input("output"), Input("label")], type = "multi-class-cross-entropy", name = "cost") -#Layer(inputs = [Input("output"), Input("label")], type = "huber", name = "cost") -Evaluator(inputs=["output", "label"], type = "classification_error", name = "classification_error") -Inputs("input", "label") -Outputs("cost") +data = data_layer(name ="input", size=784) + +fc1 = fc_layer(input=data, size=800, + bias_attr=True, + act=SigmoidActivation()) + +fc2 = fc_layer(input=fc1, size=800, + bias_attr=True, + act=SigmoidActivation()) + +output = fc_layer(input=[fc1, fc2], size=10, + bias_attr=True, + act=SoftmaxActivation()) + +lbl = data_layer(name ="label", size=1) + +cost = classification_cost(input=output, label=lbl) +outputs(cost) diff --git a/paddle/trainer/tests/sample_trainer_config_parallel.conf b/paddle/trainer/tests/sample_trainer_config_parallel.conf index 3563fede1c..e35a1f26da 100644 --- a/paddle/trainer/tests/sample_trainer_config_parallel.conf +++ b/paddle/trainer/tests/sample_trainer_config_parallel.conf @@ -13,137 +13,74 @@ # See the License for the specific language governing permissions and # limitations under the License. -#Todo(luotao02) This config is only used for unitest. It is out of date now, and will be updated later. 
+from paddle.trainer_config_helpers import * -TrainData( - SimpleData( - files = "trainer/tests/sample_filelist.txt", - feat_dim = 3, - context_len = 0, - buffer_capacity = 1000000, - ) -) +TrainData(SimpleData( + files = "trainer/tests/sample_filelist.txt", + feat_dim = 3, + context_len = 0, + buffer_capacity = 1000000)) -TestData( - SimpleData( - files = "trainer/tests/sample_filelist.txt", - feat_dim = 3, - context_len = 0, - buffer_capacity = 1000000, - ) -) +TestData(SimpleData( + files = "trainer/tests/sample_filelist.txt", + feat_dim = 3, + context_len = 0, + buffer_capacity = 1000000)) -Settings( - algorithm = "sgd", - num_batches_per_send_parameter = 1, - num_batches_per_get_parameter = 1, - batch_size = 100, - learning_rate = 0.001, - learning_rate_decay_a = 1e-5, - learning_rate_decay_b = 0.5, -) +settings(batch_size = 100) -default_initial_std(0.2) # Output layer, label layer, cost layer, preferably set to the same environment. output_device = 0 -model_type("nn") - # Input Layer does not need to specify the device number. -Layer( - name = "input", - type = "data", - size = 3, -) +data = data_layer(name='input', size=3) # Calculate in the CPU. -Layer( - name = "layer1_1", - type = "fc", - size = 5, - active_type = "sigmoid", - device = -1, - inputs = "input", -) +fc1 = fc_layer(input=data, size=5, + bias_attr=True, + layer_attr=ExtraAttr(device=-1), + act=SigmoidActivation()) # Calculate in the GPU 0. -Layer( - name = "layer2_1", - type = "fc", - size = 10, - active_type = "sigmoid", - device = 0, - inputs = "layer1_1", -) +fc2 = fc_layer(input=fc1, size=10, + bias_attr=True, + layer_attr=ExtraAttr(device=0), + act=SigmoidActivation()) # Calculate in the GPU 1. -Layer( - name = "layer2_2", - type = "fc", - size = 10, - active_type = "sigmoid", - device = 1, - inputs = "layer1_1", -) +fc3 = fc_layer(input=fc1, size=10, + bias_attr=True, + layer_attr=ExtraAttr(device=1), + act=SigmoidActivation()) # Calculate in the GPU 0. -Layer( - name = "layer3_1", - type = "fc", - size = 10, - device = 0, - active_type = "sigmoid", - inputs = ["layer2_1", "layer2_2"], -) +fc4 = fc_layer(input=[fc2,fc3], size=10, + bias_attr=True, + layer_attr=ExtraAttr(device=0), + act=SigmoidActivation()) # Calculate in the GPU 1. -Layer( - name = "layer3_2", - type = "fc", - size = 10, - device = 1, - active_type = "sigmoid", - inputs = ["layer2_1", "layer2_2"], -) - +fc5 = fc_layer(input=[fc2,fc3], size=10, + bias_attr=True, + layer_attr=ExtraAttr(device=1), + act=SigmoidActivation()) -Layer( - name = "output", - type = "fc", - size = 10, - device = output_device, - active_type = "sigmoid", - inputs = ["layer3_1", "layer3_2"], -) +output = fc_layer(input=[fc4,fc5], size=10, + bias_attr=True, + layer_attr=ExtraAttr(device=output_device), + act=SoftmaxActivation()) if get_config_arg('with_cost', bool, True): # This is for training the neural network. 
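+    # (with_cost is read from the trainer's --config_args command-line
+    # arguments via get_config_arg; it defaults to True above.)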
# We need to have another data layer for label # and a layer for calculating cost - Layer( - name = "label", - type = "data", - device = output_device, - size = 1, - ) - - Layer( - name = "cost", - type = "multi-class-cross-entropy", - device = output_device, - inputs = ["output", "label"], - ) - - Evaluator( - name = "error", - type = "classification_error", - inputs = ["output", "label"]) - - Inputs("input", "label") - Outputs("cost") - + lbl = data_layer(name='label', size=1, + layer_attr=ExtraAttr(device=output_device)) + + outputs(classification_cost(input=output, + label=lbl, + layer_attr=ExtraAttr(device=output_device))) else: # This is for prediction where we don't have label # and don't need to calculate cost - Inputs("input") - Outputs("output") + outputs(output) diff --git a/paddle/trainer/tests/sample_trainer_nest_rnn_gen.conf b/paddle/trainer/tests/sample_trainer_nest_rnn_gen.conf new file mode 100644 index 0000000000..613fd325e1 --- /dev/null +++ b/paddle/trainer/tests/sample_trainer_nest_rnn_gen.conf @@ -0,0 +1,73 @@ +#edit-mode: -*- python -*- +# Copyright (c) 2016 Baidu, Inc. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from paddle.trainer_config_helpers import * + +settings(batch_size=15, learning_rate=0) + +num_words = 5 +beam_flag = get_config_arg('beam_search', bool, False) + +sent_id = data_layer(name="sent_id", size=1) + +# This layer has no actual use, but only to decide batch_size in generation. +# When generating, at least one Memory in RecurrentLayer MUST have a boot layer. +dummy_data = data_layer(name="dummy_data_input", size=2) + +def outer_step(dummy_data): + + gen_inputs = [StaticInput(input=dummy_data, size=2, is_seq=True), + GeneratedInput(size=num_words, + embedding_name="wordvec", + embedding_size=num_words)] + + def inner_step(dummy_memory, predict_word): + + # simplified RNN for testing + with mixed_layer(size=num_words) as layer: + layer += full_matrix_projection(input=predict_word, + param_attr=ParamAttr(name="transtable")) + + with mixed_layer(size=num_words, act=ExpActivation()) as out: + out += trans_full_matrix_projection(input=layer, + param_attr=ParamAttr(name="wordvec")) + + return out + + beam_gen = beam_search(name="rnn_gen", + step=inner_step, + input=gen_inputs, + bos_id=0, + eos_id=num_words-1, + beam_size=2 if beam_flag else 1, + num_results_per_sample=2 if beam_flag else 1, + max_length=10) + return beam_gen + +beam_gen_concat = recurrent_group(name="rnn_gen_concat", + step=outer_step, + input=[SubsequenceInput(dummy_data)]) + +seqtext_printer_evaluator(input=beam_gen_concat, + id_input=sent_id, + dict_file="./trainer/tests/test_gen_dict.txt", + result_file="./trainer/tests/dump_text.test") +#outputs(beam_gen_concat) +# In this config, as dummy_data_input doesn't work on beam_gen (we can find dummy_memory +# is read-only memory, and isn't used by other layers of step), we show the Inputs and Outputs +# as follows. Note that "__beam_search_predict__" is the default output name of beam_search. 
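+# Conceptually, the outer group runs the inner generator once per subsequence
+# and concatenates the generated inner-results, e.g. for subsequences
+# [s0, s1]: result = generate(s0) + generate(s1). Hence beam and greedy runs
+# share the same outer-result layout.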
+Inputs("sent_id","dummy_data_input") +Outputs("__beam_search_predict__") diff --git a/paddle/trainer/tests/sample_trainer_rnn_gen.conf b/paddle/trainer/tests/sample_trainer_rnn_gen.conf index 5b65310e76..ec1c12cc89 100644 --- a/paddle/trainer/tests/sample_trainer_rnn_gen.conf +++ b/paddle/trainer/tests/sample_trainer_rnn_gen.conf @@ -13,96 +13,54 @@ # See the License for the specific language governing permissions and # limitations under the License. -#Todo(luotao02) This config is only used for unitest. It is out of date now, and will be updated later. -import math +from paddle.trainer_config_helpers import * -beam_search = get_config_arg('beam_search', bool, False) - -model_type("recurrent_nn") - -Settings(learning_rate=0, batch_size=15, algorithm='sgd') - -Inputs("sent_id", "dummy_data_input") -Outputs("predict_word") +settings(batch_size=15, learning_rate=0) num_words = 5 +beam_flag = get_config_arg('beam_search', bool, False) -DataLayer(name="sent_id", size=1, ) +sent_id = data_layer(name="sent_id", size=1) # This layer has no actual use, but only to decide batch_size in generation. # When generating, at least one Memory in RecurrentLayer MUST have a boot layer. -DataLayer(name="dummy_data_input", size=2, ) - -if beam_search: - RecurrentLayerGroupBegin("decoding_layer_group", - in_links=[], - out_links=["predict_word"], - generator=Generator(max_num_frames=10, - beam_size=2, - num_results_per_sample=2, )) -else: - RecurrentLayerGroupBegin("decoding_layer_group", - in_links=[], - out_links=["predict_word"], - generator=Generator(max_num_frames=10, )) -dummy_memory = Memory(name="dummy_memory", - size=2, - boot_layer="dummy_data_input") -MixedLayer(name="dummy_memory", - size=2, - bias=False, - inputs=[IdentityProjection(dummy_memory)], ) -state_memory = Memory(name="state", - size=num_words, - #boot_bias=True, - #boot_bias_active_type = "tanh", - ) - -predict_word_memory = Memory(name="predict_word", - size=num_words, - boot_with_const_id=0, ) - -MixedLayer( - name = "word_embedding", - size = num_words, # word embedding dim is the same as num_words in this test. 
- bias = False, - inputs = TableProjection(predict_word_memory, - initial_std=1, - learning_rate=0, - parameter_name="wordvec")) - -Layer( # simplified RNN for testing - name="state", - type="mixed", - size=num_words, - bias=False, - inputs=[FullMatrixProjection("word_embedding", - parameter_name="transtable")]) - -Layer(name="output", - type="mixed", - size=num_words, - active_type="exponential", - bias=False, - inputs=TransposedFullMatrixProjection("state", - initial_std=1, - learning_rate=0, - parameter_name="wordvec"), ) - -Layer(name="predict_word", type="maxid", inputs=["output"], ) - -Layer(name="eos_check", - type="eos_id", - eos_id=num_words - 1, - inputs=["predict_word"], ) -RecurrentLayerGroupEnd("decoding_layer_group") - -Evaluator(name="answer_printer", - type="seq_text_printer", - dict_file="./trainer/tests/test_gen_dict.txt", - result_file="./trainer/tests/dump_text.test", - inputs=[ - "sent_id", - "predict_word", - ], ) +dummy_data = data_layer(name="dummy_data_input", size=2) + +gen_inputs = [StaticInput(input=dummy_data, size=2), + GeneratedInput(size=num_words, + embedding_name="wordvec", + embedding_size=num_words)] + +def step(dummy_memory, predict_word): + + # simplified RNN for testing + with mixed_layer(size=num_words) as layer: + layer += full_matrix_projection(input=predict_word, + param_attr=ParamAttr(name="transtable")) + + with mixed_layer(size=num_words, act=ExpActivation()) as out: + out += trans_full_matrix_projection(input=layer, + param_attr=ParamAttr(name="wordvec")) + + return out + +beam_gen = beam_search(name="rnn_gen", + step=step, + input=gen_inputs, + bos_id=0, + eos_id=num_words-1, + beam_size=2 if beam_flag else 1, + num_results_per_sample=2 if beam_flag else 1, + max_length=10) + +seqtext_printer_evaluator(input=beam_gen, + id_input=sent_id, + dict_file="./trainer/tests/test_gen_dict.txt", + result_file="./trainer/tests/dump_text.test") +#outputs(beam_gen) +# In this config, as dummy_data_input doesn't work on beam_gen (we can find dummy_memory +# is read-only memory, and isn't used by other layers of step), we show the Inputs and Outputs +# as follows. Note that "__beam_search_predict__" is the default output name of beam_search. +Inputs("sent_id","dummy_data_input") +Outputs("__beam_search_predict__") diff --git a/paddle/trainer/tests/test_CompareSparse.cpp b/paddle/trainer/tests/test_CompareSparse.cpp index 3070682c0a..ff37d7b364 100644 --- a/paddle/trainer/tests/test_CompareSparse.cpp +++ b/paddle/trainer/tests/test_CompareSparse.cpp @@ -146,12 +146,12 @@ TEST(compareSparse, remote_cpu) { TEST(compareSparse, cpu10_local_vs_remote) { FLAGS_local = 1; // disable remote sparse update in parameter config std::vector localParameters = - trainerOnePassTest(configFile1, true, 10); + trainerOnePassTest(configFile1, true, 2); FLAGS_local = 0; // will enable remote sparse update FLAGS_ports_num_for_sparse = 5; std::vector remoteParameters = - trainerOnePassTest(configFile1, true, 10); + trainerOnePassTest(configFile1, true, 2); compareValue(localParameters, remoteParameters); } @@ -174,7 +174,7 @@ TEST(compareSparse, multiGradientMachine) { FLAGS_parallel_nn = useGpu; LOG(INFO) << " local=" << local << " useGpu=" << useGpu; - int trainerCount = useGpu ? numGpu : 10; + int trainerCount = useGpu ? 
numGpu : 2; std::vector parameters = trainerOnePassTest(configFile1, true, trainerCount, useGpu); compareValue(getDenseParameters(), parameters, eps); diff --git a/paddle/trainer/tests/test_Trainer.cpp b/paddle/trainer/tests/test_Trainer.cpp index 8ca9be71de..ad2a715ef8 100644 --- a/paddle/trainer/tests/test_Trainer.cpp +++ b/paddle/trainer/tests/test_Trainer.cpp @@ -62,7 +62,11 @@ TEST(checkGradient, multiGpu) { } } -TEST(checkGradient, parallel) { checkGradientTest(configFile4, true, true); } +TEST(checkGradient, parallel) { + if (hl_get_device_count() >= 2) { + checkGradientTest(configFile4, true, true); + } +} TEST(checkGradient, multiParallel) { FLAGS_allow_only_one_model_on_one_gpu = false; @@ -90,7 +94,11 @@ TEST(checkGradient, multi) { TEST(checkGradient, hsigmoid) { checkGradientTest(configFile2, false, false); } TEST(checkGradient, chunk) { +#if defined(__APPLE__) || defined (__OSX__) + EXPECT_EQ(0, system("python trainer/tests/gen_proto_data.py")); +#else EXPECT_EQ(0, system("python2 trainer/tests/gen_proto_data.py")); +#endif checkGradientTest(configFile3, false, false); #ifndef PADDLE_ONLY_CPU checkGradientTest(configFile3, true, true); diff --git a/paddle/trainer/tests/test_TrainerOnePass.cpp b/paddle/trainer/tests/test_TrainerOnePass.cpp index 6d8b8e0ca5..4554b94485 100644 --- a/paddle/trainer/tests/test_TrainerOnePass.cpp +++ b/paddle/trainer/tests/test_TrainerOnePass.cpp @@ -82,7 +82,11 @@ TEST(trainerOnePass, gpu2) { trainerOnePassTest(configFile1, true, false, 2); } TEST(trainerOnePass, gpu4) { trainerOnePassTest(configFile1, true, false, 4); } -TEST(trainerOnePass, parallel) { trainerOnePassTest(configFile2, true, true); } +TEST(trainerOnePass, parallel) { + if (hl_get_device_count() >= 2) { + trainerOnePassTest(configFile2, true, true); + } +} #endif // 2. test average_window. diff --git a/paddle/trainer/tests/test_recurrent_machine_generation.cpp b/paddle/trainer/tests/test_recurrent_machine_generation.cpp index cf52c568e5..fcee318d16 100644 --- a/paddle/trainer/tests/test_recurrent_machine_generation.cpp +++ b/paddle/trainer/tests/test_recurrent_machine_generation.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include #include @@ -24,6 +23,8 @@ using namespace paddle; // NOLINT using namespace std; // NOLINT static const string& CONFIG_FILE = "trainer/tests/sample_trainer_rnn_gen.conf"; +static const string& NEST_CONFIG_FILE = + "trainer/tests/sample_trainer_nest_rnn_gen.conf"; static const string& OUTPUT_DIR = "trainer/tests/dump_text.test"; static string modelDir = "trainer/tests/rnn_gen_test_model_dir/t1"; // NOLINT static string expectFile = // NOLINT @@ -50,32 +51,52 @@ void checkOutput(const string& expRetFile) { } } -void prepareInArgs(vector& inArgs, - const size_t batchSize, bool useGpu) { +void prepareInArgs(vector& inArgs, const size_t batchSize, + bool useGpu, bool hasSubseq) { inArgs.clear(); // sentence id Argument sentId; sentId.value = nullptr; - IVector::resizeOrCreate(sentId.ids, batchSize, useGpu); - for (size_t i = 0; i < batchSize; ++i) sentId.ids->setElement(i, i); + if (hasSubseq) { + // as there is only one sequence, there is only one label. + IVector::resizeOrCreate(sentId.ids, 1, useGpu); + sentId.ids->setElement(0, 0); + } else { + // as there is batchSize word, there is batchSize label. 
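+    // e.g. batchSize = 15 yields sentId.ids = [0, 1, ..., 14], one id per
+    // independently generated sequence.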
+ IVector::resizeOrCreate(sentId.ids, batchSize, useGpu); + for (size_t i = 0; i < batchSize; ++i) sentId.ids->setElement(i, i); + } inArgs.emplace_back(sentId); // a dummy layer to decide batch size Argument dummyInput; dummyInput.value = Matrix::create(batchSize, 2, false, useGpu); dummyInput.value->randomizeUniform(); + if (hasSubseq) { + // generate one sequence with batchSize subsequence, + // and each subsequence has only one word. + dummyInput.sequenceStartPositions = ICpuGpuVector::create(2, false); + int* buf = dummyInput.sequenceStartPositions->getMutableData(false); + dummyInput.subSequenceStartPositions = + ICpuGpuVector::create(batchSize + 1, false); + int* subBuf = dummyInput.subSequenceStartPositions->getMutableData(false); + buf[0] = 0; + buf[1] = batchSize; + for (size_t i = 0; i < batchSize + 1; i++) subBuf[i] = i; + } inArgs.emplace_back(dummyInput); } -void testGeneration(bool useGpu, const string& expRetFile) { +void testGeneration(const string& configFile, bool useGpu, bool hasSubseq, + const string& expRetFile) { FLAGS_use_gpu = useGpu; - auto config = std::make_shared(CONFIG_FILE); + auto config = std::make_shared(configFile); unique_ptr gradientMachine(GradientMachine::create(*config)); gradientMachine->loadParameters(modelDir); vector inArgs(2); const size_t batchSize = 15; - prepareInArgs(inArgs, batchSize, useGpu); + prepareInArgs(inArgs, batchSize, useGpu, hasSubseq); vector outArgs; unique_ptr testEvaluator(gradientMachine->makeEvaluator()); testEvaluator->start(); @@ -93,16 +114,21 @@ TEST(RecurrentGradientMachine, test_generation) { #else const auto useGpuConfs = {true, false}; #endif - FLAGS_config_args = "beam_search=0"; // no beam search - string expectRetFileNoBeam = expectFile + ".nobeam"; - for (auto useGpu : useGpuConfs) { - testGeneration(useGpu, expectRetFileNoBeam); - } - FLAGS_config_args = "beam_search=1"; // no beam search - string expectRetFileBeam = expectFile + ".beam"; - for (auto useGpu : useGpuConfs) { - testGeneration(useGpu, expectRetFileBeam); - } + auto testGen = [&](const string& configFile, bool hasSubseq, + const string& expRetFile, bool beam_search) { + FLAGS_config_args = beam_search ? "beam_search=1" : "beam_search=0"; + for (auto useGpu : useGpuConfs) { + testGeneration(configFile, useGpu, hasSubseq, expRetFile); + } + }; + testGen(CONFIG_FILE, false, expectFile + ".nobeam", false); // no beam search + testGen(CONFIG_FILE, false, expectFile + ".beam", true); // beam search + // In hierarchical RNN, beam search and one way search are only in inner-RNN, + // outer-RNN will concat the generated inner-results (first for beam search) + // from inner-RNN. Thus, they have the same outer-results. + testGen(NEST_CONFIG_FILE, true, expectFile + ".nest", + false); // no beam search + testGen(NEST_CONFIG_FILE, true, expectFile + ".nest", true); // beam search } #endif diff --git a/paddle/utils/CMakeLists.txt b/paddle/utils/CMakeLists.txt index 3c08f1e305..0557b01e36 100644 --- a/paddle/utils/CMakeLists.txt +++ b/paddle/utils/CMakeLists.txt @@ -2,12 +2,18 @@ file(GLOB UTIL_HEADERS . *.h) file(GLOB UTIL_SOURCES . *.cpp) - +if(APPLE) + file(GLOB UTIL_ARCH_SOURCES . arch/osx/*.cpp) +else() + file(GLOB UTIL_ARCH_SOURCES . 
arch/linux/*.cpp) +endif() add_library(paddle_utils STATIC - ${UTIL_SOURCES}) + ${UTIL_SOURCES} + ${UTIL_ARCH_SOURCES}) add_style_check_target(paddle_utils ${UTIL_HEADERS}) -add_style_check_target(paddle_utils ${UTIL_SOURCES}) +add_style_check_target(paddle_utils ${UTIL_SOURCES} + ${UTIL_ARCH_SOURCES}) add_dependencies(paddle_utils gen_proto_cpp) if(WITH_TESTING) add_subdirectory(tests) -endif() \ No newline at end of file +endif() diff --git a/paddle/utils/CustomStackTrace.cpp b/paddle/utils/CustomStackTrace.cpp index 50d7f5402f..232a478ecd 100644 --- a/paddle/utils/CustomStackTrace.cpp +++ b/paddle/utils/CustomStackTrace.cpp @@ -14,9 +14,44 @@ limitations under the License. */ #include "CustomStackTrace.h" +#include "CommandLineParser.h" +#include + +P_DEFINE_bool(layer_stack_error_only_current_thread, + true, + "Dump current thread or whole process layer stack when signal error " + "occurred. true means only dump current thread layer stack"); namespace paddle { CustomStackTrace gLayerStackTrace; +static std::mutex gLayerStackTraceMtx; +void installLayerStackTracer() { + logging::installFailureWriter([](const char* data, int sz) { + std::lock_guard guard(gLayerStackTraceMtx); + if (!gLayerStackTrace.empty()) { + size_t curTid = -1UL; + std::hash hasher; + gLayerStackTrace.dump([&curTid, &hasher](std::thread::id tid, + bool* isForwarding, + const std::string& layerName) { + if (curTid != hasher(tid)) { + if (curTid != -1UL) { + std::cerr << std::endl; + } + curTid = hasher(tid); + std::cerr << "Thread [" << tid << "] "; + if (isForwarding) { + std::cerr << (*isForwarding ? "Forwarding ": "Backwarding "); + } + } + std::cerr << layerName << ", "; + }, FLAGS_layer_stack_error_only_current_thread); + std::cerr << std::endl; + } + std::cerr.write(data, sz); + }); +} + } // namespace paddle diff --git a/paddle/utils/CustomStackTrace.h b/paddle/utils/CustomStackTrace.h index e1b2d2d8e5..774c4db2b9 100644 --- a/paddle/utils/CustomStackTrace.h +++ b/paddle/utils/CustomStackTrace.h @@ -15,6 +15,9 @@ limitations under the License. */ #pragma once #include +#include +#include +#include #include "ThreadLocal.h" @@ -29,25 +32,18 @@ namespace paddle { * @code{.cpp} * * paddle::CustomStackTrace stack; - * PASS_TEST=0; * for (auto& layer : layers){ * stack.push(layer->getName()); - * layer->forward(passType); + * layer->forward(); * } - * for (auto& layer : layers){ + * + * stack.pop(""); // mark under pop stage. + * + * for (auto it = layers.rbegin(); it != layers.rend(); ++it){ + * auto& layer = *it; * layer->backward(passType); * stack.pop(layer->getName()); * } - * - * if(passType == PASS_TEST) { - * stack.clear(); - * } - * else { - * stack.dump([](const std::string& layername){ - * LOG(INFO) << "LayerName: " << layername; - * }) - * } - * * * @endcode */ @@ -55,45 +51,141 @@ template class CustomStackTrace{ public: /** - * @brief Pop out an item from the top of the stack. For safety the item - * will be poped should equal to ip. + * @brief Pop out an item from the top of the stack if item == top. + * Else, just set status to popping. */ - void pop(const T& ip) { - auto& p = *logstack_; - CHECK_EQ(ip, p.top()); - p.pop(); + void pop(const T& item) { + pushing() = false; + auto& s = this->stack(); + if (item == s.top()) { + s.pop(); + } } + /** - * @brief Empty the stack by sequence from top to button. - * @param[in] callback A function deal with each item while dumping. - * It must have and only have a in parameter which is the stack item. + * @brief clear current thread stack. 
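+   *
+   * For example, after push("layer1"); push("layer2"); clear(); the
+   * current thread's stack is empty again, regardless of pop order.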
*/ - template - void dump(Callback callback) { - auto& p = *logstack_; - while (!p.empty()) { - callback(p.top()); - p.pop(); + void clear() { + auto& s = stack(); + while (!s.empty()) { + s.pop(); } } + /** - * @brief Only empty the stack. + * @brief return true if all thread's stack is empty. + * @return true if empty */ - void clear() { - dump([](const T& ip){}); + bool empty() const { + std::lock_guard g(this->mtx_); + for (auto p : this->stackBuffers_) { + std::stack& s = *p.second; + if (!s.empty()) { + return false; + } + } + return true; + } + + + /** + * @brief DumpCallback Type. It will be invoked many times by dump method. + * + * The first parameter is stack thread id. + * The second parameter is the last action of stack is push or not. + * The third parameter is the item in stack. + */ + typedef std::function DumpCallback; + + /** + * Dump all thread stack, and all stack will be cleared. + */ + void dump(const DumpCallback& callback, bool onlyCurrentThread = false) { + std::lock_guard g(this->mtx_); + for (auto p : this->stackBuffers_) { + std::thread::id tid = p.first; + if (onlyCurrentThread && tid != std::this_thread::get_id()) { + continue; + } + std::stack& s = *p.second; + bool* isPush = nullptr; + auto it = this->pushingBuffers_.find(tid); + if (it != this->pushingBuffers_.end()) { + isPush = it->second; + } + + while (!s.empty()) { + callback(tid, isPush, s.top()); + s.pop(); + } + } } + /** - * @brief Push item ip to the top of the stack. + * @brief Push item to current thread stack. */ - void push(const T& ip) { - auto& p = *logstack_; - p.push(ip); + void push(const T& item) { + pushing() = true; + auto& p = this->stack(); + p.push(item); } private: - ThreadLocalD > logstack_; + /** + * Get thread local attribute, and save them into a map (threadId => TYPE*) + * + * @tparam TYPE thread local attribute type. + * @param threadLocal Thread Local object. + * @param buffers a map from threadId to TYPE* + */ + template + inline TYPE& getThreadLocal( + ThreadLocal& threadLocal, + std::unordered_map& buffers) { + TYPE* retv = threadLocal.get(false); + if (retv) { + return *retv; + } else { + std::lock_guard guard(this->mtx_); + retv = threadLocal.get(); + auto id = std::this_thread::get_id(); + buffers.insert({id, retv}); + return *retv; + } + } + + /** + * @brief Get thread local stack reference. + */ + std::stack& stack() { + return this->getThreadLocal(this->logStack_, + this->stackBuffers_); + } + + /** + * @brief Get thread local pushing flag. + */ + bool& pushing() { + return this->getThreadLocal(this->isPushing_, + this->pushingBuffers_); + } + +private: + mutable std::mutex mtx_; + + std::unordered_map* > stackBuffers_; + std::unordered_map pushingBuffers_; + ThreadLocal isPushing_; + ThreadLocal > logStack_; }; extern CustomStackTrace gLayerStackTrace; +/** + * @brief Install a failure handler to print layer stack when error. + */ +extern void installLayerStackTracer(); + } // namespace paddle diff --git a/paddle/utils/Excepts.cpp b/paddle/utils/Excepts.cpp new file mode 100644 index 0000000000..9123508fc7 --- /dev/null +++ b/paddle/utils/Excepts.cpp @@ -0,0 +1,54 @@ +/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "Excepts.h" + +#if defined(__APPLE__) || defined(__OSX__) + +#include + +int fegetexcept(void) { + static fenv_t fenv; + return fegetenv(&fenv) ? -1 : (fenv.__control & FE_ALL_EXCEPT); +} + +int feenableexcept(unsigned int excepts) { + static fenv_t fenv; + unsigned int new_excepts = excepts & FE_ALL_EXCEPT, old_excepts; + + if ( fegetenv (&fenv) ) return -1; + old_excepts = fenv.__control & FE_ALL_EXCEPT; + + // unmask + fenv.__control &= ~new_excepts; + fenv.__mxcsr &= ~(new_excepts << 7); + + return ( fesetenv (&fenv) ? -1 : old_excepts ); +} + +int fedisableexcept(unsigned int excepts) { + static fenv_t fenv; + unsigned int new_excepts = excepts & FE_ALL_EXCEPT, old_excepts; + + if ( fegetenv (&fenv) ) return -1; + old_excepts = fenv.__control & FE_ALL_EXCEPT; + + // mask + fenv.__control |= new_excepts; + fenv.__mxcsr |= new_excepts << 7; + + return ( fesetenv (&fenv) ? -1 : old_excepts ); +} + +#endif diff --git a/paddle/utils/Excepts.h b/paddle/utils/Excepts.h new file mode 100644 index 0000000000..a84a2d33a6 --- /dev/null +++ b/paddle/utils/Excepts.h @@ -0,0 +1,26 @@ +/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifndef EXCEPTS_H_ +#define EXCEPTS_H_ + +#if defined(__APPLE__) || defined(__OSX__) + +int fegetexcept(void); +int feenableexcept(unsigned int excepts); +int fedisableexcept(unsigned int excepts); + +#endif + +#endif // EXCEPTS_H_ diff --git a/paddle/utils/Locks.h b/paddle/utils/Locks.h index 085aca508d..1fc0363d34 100644 --- a/paddle/utils/Locks.h +++ b/paddle/utils/Locks.h @@ -16,13 +16,12 @@ limitations under the License. */ #pragma once #include -#include #include -#include - #include #include +#include "DisableCopy.h" + namespace paddle { /** @@ -98,35 +97,44 @@ protected: * which means it will keep trying to lock until lock on successfully. * The SpinLock disable copy. */ +class SpinLockPrivate; class SpinLock { public: - SpinLock() { pthread_spin_init(&lock_, 0); } - ~SpinLock() { pthread_spin_destroy(&lock_); } - SpinLock(const SpinLock&) = delete; - SpinLock& operator=(const SpinLock&) = delete; + DISABLE_COPY(SpinLock); + SpinLock(); + ~SpinLock(); // std::mutext interface - void lock() { pthread_spin_lock(&lock_); } - void unlock() { pthread_spin_unlock(&lock_); } + void lock(); + void unlock(); -protected: - pthread_spinlock_t lock_; - char padding_[64 - sizeof(pthread_spinlock_t)]; +private: + SpinLockPrivate* m; }; /** * A simple wapper of semaphore which can only be shared in the same process. */ +class SemaphorePrivate; class Semaphore { +public: + //! 
Disable copy & assign + Semaphore(const Semaphore& other) = delete; + Semaphore& operator= (const Semaphore&& other) = delete; + + //! Enable move. + Semaphore(Semaphore&& other): m(std::move(other.m)) { + } + public: /** * @brief Construct Function. * @param[in] initValue the initial value of the * semaphore, default 0. */ - explicit Semaphore(int initValue = 0) { sem_init(&sem_, 0, initValue); } + explicit Semaphore(int initValue = 0); - ~Semaphore() { sem_destroy(&sem_); } + ~Semaphore(); /** * @brief The same as wait(), except if the decrement can not @@ -136,41 +144,38 @@ public: * @return ture if the decrement proceeds before ts, * else return false. */ - bool timeWait(struct timespec* ts) { return (0 == sem_timedwait(&sem_, ts)); } + bool timeWait(struct timespec* ts); /** * @brief decrement the semaphore. If the semaphore's value is 0, then call blocks. */ - void wait() { sem_wait(&sem_); } + void wait(); /** * @brief increment the semaphore. If the semaphore's value * greater than 0, wake up a thread blocked in wait(). */ - void post() { sem_post(&sem_); } + void post(); -protected: - sem_t sem_; +private: + SemaphorePrivate* m; }; -static_assert(sizeof(SpinLock) == 64, "Wrong padding"); - /** * A simple wrapper of thread barrier. * The ThreadBarrier disable copy. */ +class ThreadBarrierPrivate; class ThreadBarrier { public: + DISABLE_COPY(ThreadBarrier); + /** * @brief Construct Function. Initialize the barrier should * wait for count threads in wait(). */ - explicit ThreadBarrier(int count) { - pthread_barrier_init(&barrier_, NULL, count); - } - ~ThreadBarrier() { pthread_barrier_destroy(&barrier_); } - ThreadBarrier(const ThreadBarrier&) = delete; - ThreadBarrier& operator=(const ThreadBarrier&) = delete; + explicit ThreadBarrier(int count); + ~ThreadBarrier(); /** * @brief . @@ -178,10 +183,10 @@ public: * then wake up all the count - 1 threads and continue run together. * Else block the thread until waked by other thread . */ - void wait() { pthread_barrier_wait(&barrier_); } + void wait(); -protected: - pthread_barrier_t barrier_; +private: + ThreadBarrierPrivate* m; }; /** diff --git a/paddle/utils/Logging.h b/paddle/utils/Logging.h index 7fdfa3240c..b3f4398046 100644 --- a/paddle/utils/Logging.h +++ b/paddle/utils/Logging.h @@ -191,7 +191,7 @@ void installFailureWriter(void(*callback)(const char*, int)); } #endif // PADDLE_USE_GLOG -#ifndef NDEBUG +#ifdef NDEBUG #define DEBUG_LEVEL 5 #define DBG VLOG(DEBUG_LEVEL) #else diff --git a/paddle/utils/PythonUtil.h b/paddle/utils/PythonUtil.h index 4467fd784e..db02d1252b 100644 --- a/paddle/utils/PythonUtil.h +++ b/paddle/utils/PythonUtil.h @@ -18,6 +18,12 @@ limitations under the License. */ #ifndef PADDLE_NO_PYTHON // must include the following two blocks, otherwise, // gcc compiler may produce warning +#ifdef __APPLE__ +#define _POSIX_SOURCE +#define _POSIX_C_SOURCE 200809L +#define _XOPEN_SOURCE 700 +#endif + #ifdef _POSIX_C_SOURCE #define __TEMP_POSIX_C_SOURCE _POSIX_C_SOURCE #undef _POSIX_C_SOURCE @@ -28,12 +34,7 @@ limitations under the License. */ #endif #include #include -#ifndef _POSIX_C_SOURCE -#warning "no _POSIX_C_SOURCE defined in Python.h" -#endif -#ifndef _XOPEN_SOURCE -#warning "no _XOPEN_SOURCE defined in Python.h" -#endif + #endif #include "paddle/utils/Util.h" @@ -175,10 +176,21 @@ public: /** * Get bool attribute. * @param field + * @param [out] isBoolType return true if attribute is bool type. 
If the + * attribute is not bool type, then an implicit + * conversion will happens, and will return the + * conversion result. + * + * Such as, if the attribute is 1, then the return + * value of function will be true, but the isBoolType + * will return false. * @return */ - bool getBoolAttr(const std::string& field) const { + bool getBoolAttr(const std::string& field, bool* isBoolType = nullptr) const { PyObjectPtr tmp(getAttr(field)); + if (isBoolType) { + *isBoolType = PyBool_Check(tmp.get()); + } return PyObject_IsTrue(tmp.get()); } @@ -258,6 +270,15 @@ public: this->set(key, PyBool_FromLong(b)); } + void setStringList(const std::string& key, + const std::vector& items) { + auto * list = PyList_New(items.size()); + for (size_t i=0; i < items.size(); ++i) { + PyList_SetItem(list, i, PyString_FromString(items[i].c_str())); + } + this->set(key, list); + } + private: inline void checkDict() { CHECK(PyDict_Check(this->dict_)); diff --git a/paddle/utils/Stat.cpp b/paddle/utils/Stat.cpp index 14aae6909d..d7b20ca5eb 100644 --- a/paddle/utils/Stat.cpp +++ b/paddle/utils/Stat.cpp @@ -13,24 +13,12 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "Stat.h" - -#include // for syscall() -#include +#include "Util.h" #include #include namespace paddle { -// return the thread id used by glog -pid_t getTID() { -#ifndef __NR_gettid -#define __NR_gettid 224 -#endif - pid_t tid = syscall(__NR_gettid); - CHECK_NE(tid, -1); - return tid; -} - StatSet globalStat("GlobalStatInfo"); void Stat::addSample(uint64_t value) { diff --git a/paddle/utils/Thread.h b/paddle/utils/Thread.h index 3e1d95ab1f..f6c826a1ee 100644 --- a/paddle/utils/Thread.h +++ b/paddle/utils/Thread.h @@ -13,13 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once +#include "Util.h" #include "Logging.h" #include -#include -#include -inline pid_t gettid() { return syscall(SYS_gettid); } - #include "Queue.h" #include "ThreadLocal.h" @@ -175,7 +172,7 @@ public: jobFinishBarrier_(numWorkers + 1), jobFunc_(nullptr), checkOwner_(checkOwner) { - ownerThreadId_ = ::gettid(); + ownerThreadId_ = getTID(); workers_.resize(numWorkers); start(); } @@ -199,7 +196,7 @@ public: */ void exec(JobFunc jobFunc, JobFunc ownerFunc = nullptr) { if (checkOwner_) { - CHECK_EQ(ownerThreadId_, ::gettid()) + CHECK_EQ(ownerThreadId_, getTID()) << "this sync thread pool should be used in one thread"; } diff --git a/paddle/utils/ThreadLocal.cpp b/paddle/utils/ThreadLocal.cpp index a4b399d144..0f948f1029 100644 --- a/paddle/utils/ThreadLocal.cpp +++ b/paddle/utils/ThreadLocal.cpp @@ -12,10 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#include "Util.h" #include "ThreadLocal.h" - -#include "Thread.h" - #include "CommandLineParser.h" P_DEFINE_bool(thread_local_rand_use_global_seed, false, @@ -31,11 +29,11 @@ unsigned int* ThreadLocalRand::getSeed() { if (!p) { // init seed if (FLAGS_thread_local_rand_use_global_seed) { p = new unsigned int(defaultSeed_); - } else if (getpid() == gettid()) { // main thread + } else if (getpid() == getTID()) { // main thread // deterministic, but differs from global srand() p = new unsigned int(defaultSeed_ - 1); } else { - p = new unsigned int(defaultSeed_ + gettid()); + p = new unsigned int(defaultSeed_ + getTID()); LOG(INFO) << "thread use undeterministic rand seed:" << *p; } seed_.set(p); @@ -51,7 +49,7 @@ std::default_random_engine& ThreadLocalRandomEngine::get() { int defaultSeed = ThreadLocalRand::getDefaultSeed(); engine->seed(FLAGS_thread_local_rand_use_global_seed ? defaultSeed - : defaultSeed + gettid()); + : defaultSeed + getTID()); engine_.set(engine); } return *engine; diff --git a/paddle/utils/ThreadLocal.h b/paddle/utils/ThreadLocal.h index e782868f69..686a1a99a4 100644 --- a/paddle/utils/ThreadLocal.h +++ b/paddle/utils/ThreadLocal.h @@ -156,7 +156,15 @@ private: static void dataDestructor(void* p) { delete (T*)p; } void updateMap(T* p) { - pid_t tid = syscall(SYS_gettid); +#if defined(__APPLE__) || defined(__OSX__) + pid_t tid = syscall(SYS_thread_selfid); +#else + #ifndef __NR_gettid + #define __NR_gettid 224 + #endif + pid_t tid = syscall(__NR_gettid); +#endif + CHECK_NE(tid, -1); std::lock_guard guard(mutex_); auto ret = threadMap_.insert(std::make_pair(tid, p)); if (!ret.second) { diff --git a/paddle/utils/Util.cpp b/paddle/utils/Util.cpp index 1c1d75dc5b..c3c76f907d 100644 --- a/paddle/utils/Util.cpp +++ b/paddle/utils/Util.cpp @@ -93,6 +93,19 @@ static void installProfilerSwitch() {} namespace paddle { +pid_t getTID() { + #if defined(__APPLE__) || defined(__OSX__) + pid_t tid = syscall(SYS_thread_selfid); + #else + #ifndef __NR_gettid + #define __NR_gettid 224 + #endif + pid_t tid = syscall(__NR_gettid); + #endif + CHECK_NE(tid, -1); + return tid; +} + static bool g_initialized = false; typedef std::pair> PriorityFuncPair; typedef std::vector InitFuncList; @@ -129,13 +142,7 @@ void runInitFunctions() { void initMain(int argc, char** argv) { initializeLogging(argc, argv); - logging::installFailureWriter([](const char* data, int sz) { - std::cerr << "Current Layer forward/backward stack is " << std::endl; - gLayerStackTrace.dump([](const std::string& layername){ - std::cerr << "LayerName: " << layername << std::endl; - }); - std::cerr.write(data, sz); - }); + installLayerStackTracer(); std::string line; for (int i = 0; i < argc; ++i) { line += argv[i]; diff --git a/paddle/utils/Util.h b/paddle/utils/Util.h index 3729c5c433..2adb626c83 100644 --- a/paddle/utils/Util.h +++ b/paddle/utils/Util.h @@ -24,6 +24,8 @@ limitations under the License. */ #include #include #include +#include // for syscall() +#include #include "CommandLineParser.h" #include "Logging.h" @@ -63,6 +65,25 @@ limitations under the License. */ namespace paddle { +// return the thread id used by glog +pid_t getTID(); + +/** + * return the 1-based index of the highest bit set + * + * for x > 0: + * \f[ + * findLastSet(x) = 1 + \floor*{\log_{2}x} + * \f] + */ +inline constexpr size_t findLastSet(size_t x) { + return std::is_same::value ? + (x ? 8 * sizeof(x) - __builtin_clz(x) : 0) + : (std::is_same::value ? // NOLINT + (x ? 8 * sizeof(x) - __builtin_clzl(x) : 0) + : (x ? 
8 * sizeof(x) - __builtin_clzll(x) : 0)); +} + /** * calculate the non-negative remainder of a/b * @param[in] a @@ -96,6 +117,17 @@ static bool contains(const Container& container, const T& val) { return std::find(container.begin(), container.end(), val) != container.end(); } +/** + * pop and get the front element of a container + */ +template +typename Container::value_type pop_get_front(Container& c) { + typename Container::value_type v; + swap(v, c.front()); + c.pop_front(); + return v; +} + #define ARRAYSIZE(a) (sizeof(a) / sizeof(*(a))) /** diff --git a/paddle/utils/arch/linux/Locks.cpp b/paddle/utils/arch/linux/Locks.cpp new file mode 100644 index 0000000000..347ae64c26 --- /dev/null +++ b/paddle/utils/arch/linux/Locks.cpp @@ -0,0 +1,85 @@ +/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/utils/Locks.h" +#include +#include + +namespace paddle { +class SemaphorePrivate { +public: + sem_t sem; +}; + +Semaphore::Semaphore(int initValue): m(new SemaphorePrivate()) { + sem_init(&m->sem, 0, initValue); +} + +Semaphore::~Semaphore() { + sem_destroy(&m->sem); +} + +bool Semaphore::timeWait(struct timespec* ts) { + return (0 == sem_timedwait(&m->sem, ts)); +} + +void Semaphore::wait() { + sem_wait(&m->sem); +} + +void Semaphore::post() { + sem_post(&m->sem); +} + + +class SpinLockPrivate { +public: + inline SpinLockPrivate() { pthread_spin_init(&lock_, 0); } + inline ~SpinLockPrivate() { pthread_spin_destroy(&lock_); } + pthread_spinlock_t lock_; + char padding_[64 - sizeof(pthread_spinlock_t)]; +}; + +SpinLock::SpinLock():m(new SpinLockPrivate()) {} + + +SpinLock::~SpinLock() { delete m; } + +void SpinLock::lock() { + pthread_spin_lock(&m->lock_); +} + +void SpinLock::unlock() { + pthread_spin_unlock(&m->lock_); +} + +class ThreadBarrierPrivate { +public: + pthread_barrier_t barrier_; +}; + +ThreadBarrier::ThreadBarrier(int count): m(new ThreadBarrierPrivate()) { + pthread_barrier_init(&m->barrier_, nullptr, count); +} + +ThreadBarrier::~ThreadBarrier() { + pthread_barrier_destroy(&m->barrier_); + delete m; +} + +void ThreadBarrier::wait() { + pthread_barrier_wait(&m->barrier_); +} + +} // namespace paddle diff --git a/paddle/utils/arch/osx/Locks.cpp b/paddle/utils/arch/osx/Locks.cpp new file mode 100644 index 0000000000..47e44e9d7c --- /dev/null +++ b/paddle/utils/arch/osx/Locks.cpp @@ -0,0 +1,113 @@ +/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/utils/Locks.h" +#include "paddle/utils/Logging.h" +#include +#include +namespace paddle { + +class SemaphorePrivate { +public: + ~SemaphorePrivate() { + dispatch_release(sem); + } + + dispatch_semaphore_t sem; +}; + +Semaphore::Semaphore(int initValue): m(new SemaphorePrivate()) { + m->sem = dispatch_semaphore_create(initValue); +} + +Semaphore::~Semaphore() { + delete m; +} + +bool Semaphore::timeWait(timespec *ts) { + dispatch_time_t tm = dispatch_walltime(ts, 0); + return (0 == dispatch_semaphore_wait(m->sem, tm)); +} + +void Semaphore::wait() { + dispatch_semaphore_wait(m->sem, DISPATCH_TIME_FOREVER); +} + +void Semaphore::post() { + dispatch_semaphore_signal(m->sem); +} + +class SpinLockPrivate { +public: + SpinLockPrivate(): lock_(OS_SPINLOCK_INIT) {} + + OSSpinLock lock_; + char padding_[64 - sizeof(OSSpinLock)]; // Padding to cache line size +}; + +SpinLock::SpinLock(): m(new SpinLockPrivate()) {} +SpinLock::~SpinLock() { delete m; } + +void SpinLock::lock() { + OSSpinLockLock(&m->lock_); +} + +void SpinLock::unlock() { + OSSpinLockUnlock(&m->lock_); +} + + +class ThreadBarrierPrivate { +public: + pthread_mutex_t mutex_; + pthread_cond_t cond_; + int count_; + int tripCount_; + + inline explicit ThreadBarrierPrivate(int cnt):count_(0), tripCount_(cnt) { + CHECK_NE(cnt, 0); + CHECK_GE(pthread_mutex_init(&mutex_, 0), 0); + CHECK_GE(pthread_cond_init(&cond_, 0), 0); + } + + inline ~ThreadBarrierPrivate() { + pthread_cond_destroy(&cond_); + pthread_mutex_destroy(&mutex_); + } + + /** + * @brief wait + * @return true if the last wait + */ + inline bool wait() { + pthread_mutex_lock(&mutex_); + ++count_; + if (count_ >= tripCount_) { + count_ = 0; + pthread_cond_broadcast(&cond_); + pthread_mutex_unlock(&mutex_); + return true; + } else { + pthread_cond_wait(&cond_, &mutex_); + pthread_mutex_unlock(&mutex_); + return false; + } + } +}; + +ThreadBarrier::ThreadBarrier(int count): m(new ThreadBarrierPrivate(count)) {} +ThreadBarrier::~ThreadBarrier() { delete m; } +void ThreadBarrier::wait() { m->wait(); } + +} // namespace paddle diff --git a/paddle/utils/tests/CMakeLists.txt b/paddle/utils/tests/CMakeLists.txt index 147ee3f6d6..51f1889392 100644 --- a/paddle/utils/tests/CMakeLists.txt +++ b/paddle/utils/tests/CMakeLists.txt @@ -2,3 +2,16 @@ add_simple_unittest(test_CommandLineParser) add_simple_unittest(test_Logging) add_simple_unittest(test_Thread) add_simple_unittest(test_StringUtils) +add_simple_unittest(test_CustomStackTrace) +add_simple_unittest(test_ThreadBarrier) + +add_executable( + test_CustomStackTracePrint + test_CustomStackTracePrint.cpp +) +link_paddle_exe(test_CustomStackTracePrint) +if(NOT APPLE) + add_test(NAME test_CustomStackTracePrint + COMMAND ${PROJ_ROOT}/paddle/utils/tests/test_CustomStackTracePrint.sh + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) +endif() diff --git a/paddle/utils/tests/test_CustomStackTrace.cpp b/paddle/utils/tests/test_CustomStackTrace.cpp new file mode 100644 index 0000000000..3e66502147 --- /dev/null +++ b/paddle/utils/tests/test_CustomStackTrace.cpp @@ -0,0 +1,96 @@ +/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include + +#include "paddle/utils/CustomStackTrace.h" +#include "paddle/utils/CommandLineParser.h" +#include "paddle/utils/Util.h" +#include "paddle/utils/Locks.h" + +P_DEFINE_int32(test_thread_num, 10, "testing thread number"); + +void testNormalImpl(const std::function&, + size_t, size_t, + paddle::ThreadBarrier&, + paddle::ThreadBarrier&)>& callback) { + paddle::CustomStackTrace tracer; + paddle::ThreadBarrier doneBarrier(FLAGS_test_thread_num + 1); + paddle::ThreadBarrier startBarrier(FLAGS_test_thread_num + 1); + constexpr size_t countDown = 10; + constexpr size_t layerSize = 1000; + std::vector> threads; + threads.reserve(FLAGS_test_thread_num); + + for (int32_t i=0; i < FLAGS_test_thread_num; ++i) { + threads.emplace_back(new std::thread([&tracer, &countDown, &layerSize, + &startBarrier, &doneBarrier, + &callback]{ + callback(tracer, countDown, layerSize, startBarrier, doneBarrier); + })); + } + size_t cntDown = countDown; + while (cntDown-- > 0) { + startBarrier.wait(); + sleep(1); + doneBarrier.wait(); + ASSERT_TRUE(tracer.empty()); + } + + for (auto& thread : threads) { + thread->join(); + } +} + + +TEST(CustomStackTrace, normalTrain) { + testNormalImpl([](paddle::CustomStackTrace& tracer, + size_t countDown, size_t layerSize, + paddle::ThreadBarrier& start, paddle::ThreadBarrier& finish){ + while (countDown-- > 0) { + start.wait(); + for (size_t i=0; i < layerSize; ++i) { + tracer.push("layer_" + std::to_string(i)); + } + tracer.pop(""); + for (size_t i=0; i < layerSize; ++i) { + tracer.pop("layer_" + std::to_string(layerSize - 1 - i)); + } + finish.wait(); + } + }); +} + +TEST(CustomStackTrace, normalTest) { + testNormalImpl([] (paddle::CustomStackTrace& tracer, + size_t countDown, size_t layerSize, + paddle::ThreadBarrier& start, paddle::ThreadBarrier& finish){ + while (countDown-- > 0) { + start.wait(); + for (size_t i=0; i < layerSize; ++i) { + tracer.push("layer_" + std::to_string(i)); + } + tracer.clear(); // in forward test, tracer will clear after forward. + finish.wait(); + } + }); +} + +int main(int argc, char** argv) { + testing::InitGoogleTest(&argc, argv); + paddle::initMain(argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/paddle/utils/tests/test_CustomStackTracePrint.cpp b/paddle/utils/tests/test_CustomStackTracePrint.cpp new file mode 100644 index 0000000000..c19c98614e --- /dev/null +++ b/paddle/utils/tests/test_CustomStackTracePrint.cpp @@ -0,0 +1,29 @@ +/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/utils/Util.h" +#include "paddle/utils/CustomStackTrace.h" + +int main(int argc, char** argv) { + paddle::initMain(argc, argv); + + for (size_t i=0; i < 1000; ++i) { + paddle::gLayerStackTrace.push("layer_" + std::to_string(i)); + if (i == 998) { + throw "Unhandle exception"; + } + } + + return 0; +} diff --git a/paddle/utils/tests/test_CustomStackTracePrint.sh b/paddle/utils/tests/test_CustomStackTracePrint.sh new file mode 100755 index 0000000000..b5543485f3 --- /dev/null +++ b/paddle/utils/tests/test_CustomStackTracePrint.sh @@ -0,0 +1,15 @@ +#!/bin/bash +echo "Test Custom Stack Trace print correct result when fail" +./test_CustomStackTracePrint >customStackTraceLog 2>&1 +if [ $? -eq 0 ]; then + exit 1 +else + set -e + TEXT="" + for ((i=0; i<=998; i++)) + do + TEXT="layer_$i, "$TEXT + done + TEXT="Forwarding "$TEXT + grep -q "$TEXT" customStackTraceLog +fi diff --git a/paddle/utils/tests/test_ThreadBarrier.cpp b/paddle/utils/tests/test_ThreadBarrier.cpp new file mode 100644 index 0000000000..90bd6c21bc --- /dev/null +++ b/paddle/utils/tests/test_ThreadBarrier.cpp @@ -0,0 +1,68 @@ +/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include +#include "paddle/utils/Logging.h" +#include "paddle/utils/CommandLineParser.h" +#include "paddle/utils/Util.h" +#include "paddle/utils/Locks.h" + +P_DEFINE_int32(test_thread_num, 100, "testing thread number"); + +void testNormalImpl(size_t thread_num, + const std::function&, + paddle::ThreadBarrier&)>& callback) { + std::mutex mutex; + std::set tids; + paddle::ThreadBarrier barrier(thread_num); + + std::vector threads; + threads.reserve(thread_num); + for (size_t i = 0; i < thread_num; ++i) { + threads.emplace_back([&thread_num, &mutex, + &tids, &barrier, &callback]{ + callback(thread_num, mutex, tids, barrier); + }); + } + + for (auto& thread : threads) { + thread.join(); + } +} + +TEST(ThreadBarrier, normalTest) { + for (auto &thread_num : {10, 30, 50 , 100 , 300, 1000}) { + testNormalImpl(thread_num, + [](size_t thread_num, std::mutex& mutex, + std::set& tids, + paddle::ThreadBarrier& barrier){ + { + std::lock_guard guard(mutex); + tids.insert(std::this_thread::get_id()); + } + barrier.wait(); + // Check whether all threads reach this point or not + CHECK_EQ(tids.size(), thread_num); + }); + } +} + +int main(int argc, char** argv) { + testing::InitGoogleTest(&argc, argv); + paddle::initMain(argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/proto/ModelConfig.proto.m4 b/proto/ModelConfig.proto.m4 index d04620d363..b32f8b1ee9 100644 --- a/proto/ModelConfig.proto.m4 +++ b/proto/ModelConfig.proto.m4 @@ -299,7 +299,7 @@ sinclude(`ModelConfigLayer.proto.m4') optional bool norm_by_times = 25; // for CostLayers - optional real coeff = 26; + optional real coeff = 26 [default = 1.0]; // for AverageLayer // can be set to: 'average', 'sum' or 'squarerootn' @@ -452,6 +452,9 @@ message SubModelConfig { repeated LinkConfig out_links = 10; optional GeneratorConfig generator = 11; + + 
// the id of inlink which share info with outlinks, used in recurrent layer group + optional int32 target_inlinkid = 12; } message ModelConfig { diff --git a/proto/ParameterConfig.proto.m4 b/proto/ParameterConfig.proto.m4 index 222e070089..e8d512445e 100644 --- a/proto/ParameterConfig.proto.m4 +++ b/proto/ParameterConfig.proto.m4 @@ -31,8 +31,8 @@ message ParameterUpdaterHookConfig { message ParameterConfig { required string name = 1; required uint64 size = 2; - required real learning_rate = 3; - required real momentum = 4; + optional real learning_rate = 3 [default = 1.0]; + optional real momentum = 4 [default = 0.0]; optional real initial_mean = 5 [default = 0.0]; optional real initial_std = 6 [default = 0.01]; // use L2-regularization if decay_rate set and decay_rate_l1 not set @@ -54,8 +54,8 @@ message ParameterConfig { optional int32 num_batches_regularization = 13 [default = 1]; // if is_sparse is true, para is sparse, else para is dense optional bool is_sparse = 14[default = false]; - // if para is sparse, format should be "csc" or "csr" - optional string format = 15[default = "csr"]; + // if para is sparse, format should be "csc" or "csr", empty means is not sparse + optional string format = 15 [default = ""]; // sparse remote update or not optional bool sparse_remote_update = 16 [default = false]; // gradient clipping threshold, no clipping by default diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index c8e3ecd41c..dce0b90952 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -1,6 +1,14 @@ set(OUTPUT_DIR "${CMAKE_CURRENT_BINARY_DIR}/build") +file(GLOB TRAINER_PY_FILES . ./paddle/trainer/*.py) +file(GLOB HELPERS_PY_FILES . ./paddle/trainer_config_helpers/*.py) +file(GLOB UTILS_PY_FILES . ./paddle/utils/*.py) + +set(PY_FILES paddle/__init__.py + ${TRAINER_PY_FILES} + ${HELPERS_PY_FILES} + ${UTILS_PY_FILES}) set(PADDLE_INTERNAL_PACKAGE "") if (PADDLE_WITH_INTERNAL) @@ -13,7 +21,7 @@ configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in add_custom_command(OUTPUT ${OUTPUT_DIR}/.timestamp COMMAND ${PYTHON_EXECUTABLE} setup.py bdist_wheel COMMAND ${CMAKE_COMMAND} -E touch ${OUTPUT_DIR}/.timestamp - DEPENDS gen_proto_py) + DEPENDS gen_proto_py ${PY_FILES}) add_custom_target(paddle_python ALL DEPENDS ${OUTPUT_DIR}/.timestamp) @@ -22,9 +30,7 @@ find_python_module(pip REQUIRED) find_python_module(wheel REQUIRED) find_python_module(google.protobuf REQUIRED) -install(CODE "execute_process(COMMAND ${PYTHON_EXECUTABLE} setup.py install -f - WORKING_DIRECTORY - ${CMAKE_CURRENT_BINARY_DIR})") +add_subdirectory(paddle/trainer_config_helpers/tests) install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/dist/ DESTINATION opt/paddle/share/wheels diff --git a/python/paddle/trainer/PyDataProvider2.py b/python/paddle/trainer/PyDataProvider2.py index c4f6147393..34f5dd41b7 100644 --- a/python/paddle/trainer/PyDataProvider2.py +++ b/python/paddle/trainer/PyDataProvider2.py @@ -14,6 +14,13 @@ import cPickle import logging +import collections +import functools +import itertools + +logging.basicConfig( + format="[%(levelname)s %(asctime)s %(filename)s:%(lineno)s]" + " %(message)s") class SequenceType(object): @@ -68,30 +75,39 @@ sparse_binary_vector = sparse_non_value_slot sparse_vector = sparse_value_slot integer_value = index_slot + def dense_vector_sequence(dim): return dense_vector(dim, seq_type=SequenceType.SEQUENCE) + def dense_vector_sub_sequence(dim): return dense_vector(dim, seq_type=SequenceType.SUB_SEQUENCE) + def sparse_binary_vector_sequence(dim): return 
sparse_binary_vector(dim, seq_type=SequenceType.SEQUENCE)
+
+
 def sparse_binary_vector_sub_sequence(dim):
     return sparse_binary_vector(dim, seq_type=SequenceType.SUB_SEQUENCE)
+
+
 def sparse_vector_sequence(dim):
     return sparse_vector(dim, seq_type=SequenceType.SEQUENCE)
+
+
 def sparse_vector_sub_sequence(dim):
     return sparse_vector(dim, seq_type=SequenceType.SUB_SEQUENCE)
+
+
 def integer_value_sequence(dim):
     return integer_value(dim, seq_type=SequenceType.SEQUENCE)
+
+
 def integer_value_sub_sequence(dim):
     return integer_value(dim, seq_type=SequenceType.SUB_SEQUENCE)
+
+
 def integer_sequence(dim):
     return index_slot(dim, seq_type=SequenceType.SEQUENCE)
@@ -102,13 +118,97 @@ class SingleSlotWrapper(object):
 
     def __call__(self, obj, filename):
         for item in self.generator(obj, filename):
-            yield [item]
+            if isinstance(item, dict):
+                yield item
+            else:
+                yield [item]
 
 
-def provider(input_types=None, should_shuffle=True, pool_size=-1,
+class InputOrderWrapper(object):
+    def __init__(self, generator, input_order):
+        self.generator = generator
+        self.input_order = input_order
+
+    def __call__(self, obj, filename):
+        for item in self.generator(obj, filename):
+            if isinstance(item, dict):
+                yield [item.get(input_name, None) for input_name in
+                       self.input_order]
+            else:
+                yield item
+
+
+class CheckWrapper(object):
+    def __init__(self, generator, input_types, check_fail_continue, logger):
+        self.generator = generator
+        self.input_types = input_types
+        self.check_fail_continue = check_fail_continue
+        self.logger = logger
+
+    def __call__(self, obj, filename):
+        for items in self.generator(obj, filename):
+            try:
+                assert len(items) == len(self.input_types)
+                assert len(filter(lambda x: x is None, items)) == 0
+                for item, input_type in itertools.izip(items, self.input_types):
+                    callback = functools.partial(CheckWrapper.loop_callback,
+                                                 input_type)
+
+                    for _ in xrange(input_type.seq_type):
+                        callback = functools.partial(CheckWrapper.loop_check,
+                                                     callback)
+                    callback(item)
+
+                yield items
+            except AssertionError as e:
+                self.logger.warning(
+                    "Item (%s) does not fit the input type, error: %s"
+                    % (repr(item), repr(e)))
+
+                if self.check_fail_continue:
+                    continue
+                else:
+                    raise
+
+    @staticmethod
+    def loop_callback(input_type, each):
+        assert isinstance(input_type, InputType)
+        if input_type.type == DataType.Dense:
+            assert isinstance(each, collections.Sequence)
+            for d in each:
+                assert isinstance(d, float)
+            assert len(each) == input_type.dim
+        elif input_type.type == DataType.Index:
+            assert isinstance(each, int)
+            assert each < input_type.dim
+        elif input_type.type == DataType.SparseNonValue \
+                or input_type.type == DataType.SparseValue:
+            assert isinstance(each, collections.Sequence)
+            sparse_id = set()
+            for k in each:
+                if input_type.type == DataType.SparseValue:
+                    k, v = k
+                    assert isinstance(v, float)
+                assert isinstance(k, int)
+                assert k < input_type.dim
+                sparse_id.add(k)
+            assert len(sparse_id) == len(each)
+        else:
+            raise RuntimeError("Unsupported input type")
+
+    @staticmethod
+    def loop_check(callback, item):
+        for each in item:
+            callback(each)
+
+
+def provider(input_types=None, should_shuffle=None, pool_size=-1,
+             min_pool_size=-1,
              can_over_batch_size=True,
              calc_batch_size=None,
              cache=CacheType.NO_CACHE,
+             check=False, check_fail_continue=False,
+             use_dynamic_order=True,
              init_hook=None, **kwargs):
     """
     Provider decorator. Use it to make a function into PyDataProvider2 object.
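Taken together, the wrappers above let a user-defined generator yield dictionaries keyed by input layer name (InputOrderWrapper reorders them to match the network) and have every sample type-checked (CheckWrapper). A minimal sketch of such a provider, using only decorator arguments and input types that appear in this patch; the read_samples helper and the 'input'/'label' layer names are hypothetical:

    from paddle.trainer.PyDataProvider2 import *

    # Sketch only: check=True routes each sample through CheckWrapper;
    # check_fail_continue=True drops malformed samples instead of raising;
    # dict-style yields rely on use_dynamic_order (default True) to reorder
    # values by input layer name.
    @provider(input_types=[dense_vector(784), integer_value(10)],
              check=True, check_fail_continue=True)
    def process(settings, filename):
        for image, label in read_samples(filename):  # hypothetical reader
            yield {'input': image, 'label': label}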
@@ -130,30 +230,63 @@ def provider(input_types=None, should_shuffle=True, pool_size=-1, :param input_types: Specify the input types, can also be set in init_hook. It is a list of InputType objects. For example, input_types= \ [dense_vector(9), integer_value(2)]. - :param should_shuffle: True if data should shuffle. + :type input_types: list|tuple + + :param should_shuffle: True if data should be shuffled. Passing None means + shuffle during training and no shuffling during testing. :type should_shuffle: bool + :param pool_size: Max number of samples in the data pool. :type pool_size: int + + :param min_pool_size: Set the minimal number of samples in the data pool. + PaddlePaddle picks samples from the pool at random, so + min_pool_size affects how well the data are randomized. + :type min_pool_size: int + :param can_over_batch_size: True if paddle can return a mini-batch larger than batch size in settings. It is useful when customizing one sample's batch_size. It is dangerous to set it to false while using calc_batch_size. Default is true. + :type can_over_batch_size: bool + :param calc_batch_size: a method to calculate each sample's batch size. By default each sample's batch size is 1, but you can customize it. + :type calc_batch_size: callable + :param cache: Cache strategy of Data Provider. Default is CacheType.NO_CACHE + :type cache: int :param init_hook: Initialize hook. Useful when the data provider needs to load some external data such as a dictionary. The parameter is (settings, file_list, \*\*kwargs). - - settings\: Is the global settings. User can set - settings.input_types here. - - file_list\: All file names for passed to data provider. - - kwargs: Other keyword arguments passed from + - settings. It is the global settings object. User can set + settings.input_types here. + - file_list. All file names passed to the data provider. + - is_train. Whether this data provider is used for training. + - kwargs. Other keyword arguments passed from trainer_config's args parameter. + :type init_hook: callable + + :param check: Check whether the yielded data format matches input_types. + Enabling this slows down the data providing process but is very + useful for debugging. Default is disabled. + :type check: bool + + :param check_fail_continue: Whether to continue training when a check fails. + If True, wrongly formatted data are simply + dropped. Has no effect when check is set to False. + :type check_fail_continue: bool + + :param use_dynamic_order: Allow the provider to yield a dictionary object, + whose keys are input data layer names and whose + values are the feature values. Tuples are still + allowed when use_dynamic_order is True. + :type use_dynamic_order: bool """ def __wrapper__(generator): @@ -168,12 +301,38 @@ def provider(input_types=None, should_shuffle=True, pool_size=-1, self.slots = kwargs['slots'] self.slots = input_types self.should_shuffle = should_shuffle + + true_table = [1, 't', 'true', 'on'] + false_table = [0, 'f', 'false', 'off'] + if not isinstance(self.should_shuffle, bool) and \ + self.should_shuffle is not None: + + if isinstance(self.should_shuffle, basestring): + self.should_shuffle = self.should_shuffle.lower() + + if self.should_shuffle in true_table: + self.should_shuffle = True + elif self.should_shuffle in false_table: + self.should_shuffle = False + else: + self.logger.warning( + "Could not recognize should_shuffle (%s), " + "just use default value of should_shuffle."
+ " Please set should_shuffle to bool value or " + "something in %s" % ( + repr(self.should_shuffle), + repr(true_table + false_table))) + self.should_shuffle = None + self.pool_size = pool_size self.can_over_batch_size = can_over_batch_size self.calc_batch_size = calc_batch_size self.file_list = file_list self.generator = generator self.cache = cache + self.min_pool_size = min_pool_size + self.input_order = kwargs['input_order'] + self.check = check if init_hook is not None: init_hook(self, file_list=file_list, **kwargs) if self.input_types is not None: @@ -184,6 +343,15 @@ def provider(input_types=None, should_shuffle=True, pool_size=-1, if len(self.slots) == 1: self.generator = SingleSlotWrapper(self.generator) + if use_dynamic_order: + self.generator = InputOrderWrapper(self.generator, + self.input_order) + if self.check: + self.generator = CheckWrapper(self.generator, + self.slots, + check_fail_continue, + self.logger) + return DataProvider return __wrapper__ @@ -196,3 +364,4 @@ def deserialize_args(args): :return: """ return cPickle.loads(args) + diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py index 53d8bb98f0..1f55298f24 100644 --- a/python/paddle/trainer/config_parser.py +++ b/python/paddle/trainer/config_parser.py @@ -114,15 +114,15 @@ g_layer_type_map = {} # Initialize global variables. We use this function so that we can # call parse_config() multiple times def init_config_environment( - g_default_momentum = 0., - g_default_decay_rate = 0., + g_default_momentum = None, + g_default_decay_rate = None, g_default_initial_mean = 0., g_default_initial_std = 0.01, - g_default_num_batches_regularization = 1, + g_default_num_batches_regularization = None, g_default_initial_strategy = 0, g_default_initial_smart = False, - g_default_gradient_clipping_threshold = 0., - g_default_device = -1, + g_default_gradient_clipping_threshold = None, + g_default_device = None, g_default_update_hooks = None, g_default_compact_func = None, @@ -262,8 +262,8 @@ def SubModelEnd(name = None): def MakeLayerNameInParentSubmodel(name): suffix = "" - for submodel in g_submodel_stack[1:]: - suffix = "@" + submodel.name + suffix + if len(g_submodel_stack) > 1: + suffix = "@" + g_submodel_stack[-1].name return name + suffix def GetLayerBaseName(name): @@ -303,7 +303,8 @@ def MakeLayerNameInSubmodel(name, submodel_name = None): @config_func def RecurrentLayerGroupWithoutOutLinksBegin(name, in_links, - seq_reversed=False): + seq_reversed=False, + target_inlinkname=""): global g_current_submodel config_assert(g_config.model_config.type == "recurrent_nn", "RecurrentLayerGroup should be used only in recurrent_nn") @@ -311,14 +312,19 @@ def RecurrentLayerGroupWithoutOutLinksBegin(name, SubModelBegin(name) g_current_submodel.is_recurrent_layer_group = True g_current_submodel.reversed = seq_reversed + g_current_submodel.target_inlinkid = -1 in_links_count = 0 - for link in in_links: + for linkid, link in enumerate(in_links): if isinstance(link, basestring): name = link has_subseq = False else: name = link.link_name has_subseq = link.has_subseq + # assign target_inlinkid according to target_inlinkname + if target_inlinkname == name: + g_current_submodel.target_inlinkid = linkid + if in_links_count == 0: in_links_has_subseq = has_subseq else: @@ -331,6 +337,7 @@ def RecurrentLayerGroupWithoutOutLinksBegin(name, SequenceScatterAgentLayer(name=name, size=layer.size) else: ScatterAgentLayer(name=name, size=layer.size) + pair = g_current_submodel.in_links.add() pair.layer_name = 
layer_name pair.link_name = MakeLayerNameInSubmodel(name) @@ -362,10 +369,12 @@ def RecurrentLayerGroupBegin(name, in_links, out_links, generator=None, + target_inlinkname="", seq_reversed=False): RecurrentLayerGroupWithoutOutLinksBegin(name, in_links, - seq_reversed) + seq_reversed, + target_inlinkname) for link in out_links: RecurrentLayerGroupSetOutLink(link) @@ -627,7 +636,6 @@ class Operator(Cfg): input_layer_names, ): self.add_keys(locals()) - self.operator_conf = OperatorConfig() self.operator_conf.type = self.type @@ -677,12 +685,15 @@ class ConvOperator(Operator): if num_filters is not None: self.operator_conf.num_filters = num_filters - parse_conv(conv_conf, input_layer_names[0], self.operator_conf.conv_conf, True) + parse_conv(conv_conf, + MakeLayerNameInSubmodel(input_layer_names[0]), + self.operator_conf.conv_conf) self.operator_conf.output_size = (self.operator_conf.conv_conf.output_x ** 2) * num_filters config_assert(len(input_layer_names) == 2, "Conv is binary operator") - + def calc_output_size(self, input_sizes): + return self.operator_conf.output_size # please refer to the comments in proto/ModelConfig.proto @@ -1090,12 +1101,12 @@ def Evaluator( inputs, chunk_scheme = None, num_chunk_types = None, - classification_threshold = 0.5, - positive_label = -1, - dict_file = "", - result_file = "", - num_results = 1, - delimited = True, + classification_threshold = None, + positive_label = None, + dict_file = None, + result_file = None, + num_results = None, + delimited = None, ): evaluator = g_config.model_config.evaluators.add() evaluator.type = type @@ -1111,12 +1122,19 @@ def Evaluator( evaluator.num_chunk_types = num_chunk_types g_current_submodel.evaluator_names.append(evaluator.name) - evaluator.classification_threshold = classification_threshold - evaluator.positive_label = positive_label - evaluator.dict_file = dict_file - evaluator.result_file = result_file - evaluator.num_results = num_results - evaluator.delimited = delimited + if classification_threshold is not None: + evaluator.classification_threshold = classification_threshold + if positive_label is not None: + evaluator.positive_label = positive_label + if dict_file is not None: + evaluator.dict_file = dict_file + + if result_file is not None: + evaluator.result_file = result_file + if num_results is not None: + evaluator.num_results = num_results + if delimited is not None: + evaluator.delimited = delimited class LayerBase(object): def __init__( @@ -1128,7 +1146,7 @@ class LayerBase(object): device=None, active_type="", drop_rate=0., - coeff=1.): + coeff=None): config_assert('@' not in name, "layer name: %s contain special character @" % name) global g_current_submodel @@ -1146,10 +1164,12 @@ class LayerBase(object): self.inputs = [self.inputs] self.config = g_config.model_config.layers.add() + assert isinstance(self.config, LayerConfig) self.config.name = name self.config.type = type self.config.active_type = active_type - self.config.coeff = coeff + if coeff is not None: + self.config.coeff = float(coeff) if size != 0: self.config.size = size if drop_rate != 0: @@ -1157,7 +1177,7 @@ class LayerBase(object): if device is not None: self.config.device = device - else: + elif g_default_device is not None: self.config.device = g_default_device for input_index in xrange(len(self.inputs)): @@ -1227,10 +1247,12 @@ class LayerBase(object): if bias.parameter_name is None: bias.parameter_name = gen_bias_parameter_name(self.config.name) if bias.parameter_name not in g_parameter_map: + assert isinstance(self.config, 
LayerConfig) + Parameter( bias.parameter_name, size, - self.config.device, + self.config.device if self.config.HasField('device') else None, dims, bias.learning_rate, bias.momentum, @@ -1256,8 +1278,8 @@ class LayerBase(object): input_index, size, dims=None, - sparse = False, - format = "csr"): + sparse = None, + format = None): if dims is None: # TODO(yuyang18): print warning and callstack here! dims = list() @@ -1284,7 +1306,7 @@ class LayerBase(object): Parameter( input_config.parameter_name, size, - self.config.device, + self.config.device if self.config.HasField("device") else None, dims, input_config.learning_rate, input_config.momentum, @@ -1344,6 +1366,8 @@ class FCLayer(LayerBase): if sparse: psize = self.inputs[input_index].nnz + else: + sparse = None self.create_input_parameter(input_index, psize, dims, sparse, format) self.create_bias_parameter(bias, self.config.size) @@ -1399,6 +1423,14 @@ class SelectiveFCLayer(LayerBase): input_index, psize, dims, sparse, format) self.create_bias_parameter(bias, self.config.size) +@config_layer('print') +class PrintLayer(LayerBase): + def __init__( + self, + name, + inputs): + super(PrintLayer, self).__init__(name, 'print', 0, inputs) + @config_layer('data') class DataLayer(LayerBase): def __init__( @@ -1614,7 +1646,7 @@ class BatchNormLayer(LayerBase): # Also based on cudnn version. use_cudnn = use_gpu and batch_norm_type != "batch_norm" and \ ((not parallel_nn) or self.config.device > -1) and \ - cudnn_version >= 4000 + cudnn_version >= 4007 self.layer_type = "cudnn_batch_norm" if use_cudnn else "batch_norm" super(BatchNormLayer, self).__init__(name, self.layer_type, 0, active_type=active_type, @@ -2042,7 +2074,7 @@ class MaxLayer(LayerBase): active_type='linear', device=None, bias=False, - output_max_index=False): + output_max_index=None): super(MaxLayer, self).__init__(name, 'max', 0, inputs=inputs, device=device) config_assert(len(self.inputs) == 1, 'MaxLayer must have 1 input') self.config.trans_type = trans_type @@ -2051,7 +2083,8 @@ class MaxLayer(LayerBase): input_layer = self.get_input_layer(input_index) self.set_layer_size(input_layer.size) self.create_bias_parameter(bias, self.config.size) - self.config.output_max_index=output_max_index + if output_max_index is not None: + self.config.output_max_index = output_max_index @config_layer('maxid') @@ -2264,6 +2297,9 @@ class ConvexCombinationLayer(LayerBase): name, 'convex_comb', size, inputs=inputs, device=device) config_assert(len(self.inputs) == 2, 'ConvexCombinationLayer must have 2 inputs') + config_assert( + size * self.get_input_layer(0).size == self.get_input_layer(1).size, + 'Wrong input size for ConvexCombinationLayer') self.set_layer_size(size) @config_layer('interpolation') @@ -2313,6 +2349,9 @@ class CosSimVecMatLayer(LayerBase): self.config.cos_scale = cos_scale config_assert(len(self.inputs) == 2, 'CosSimVecMatLayer must have 2 inputs') + config_assert( + size * self.get_input_layer(0).size == self.get_input_layer(1).size, + 'Wrong input size for CosSimVecMatLayer') @config_layer('sampling_id') class SamplingIdLayer(LayerBase): @@ -2361,6 +2400,7 @@ class CosSimLayer(LayerBase): self, name, inputs, + cos_scale=5, device=None): super(CosSimLayer, self).__init__( name, 'cos', 1, inputs=inputs, device=device) @@ -2368,6 +2408,7 @@ class CosSimLayer(LayerBase): config_assert( self.get_input_layer(0).size == self.get_input_layer(1).size, 'inputs of CosSimLayer must have same dim') + self.config.cos_scale = cos_scale @config_layer('tensor') @@ -2400,12 +2441,11 @@ class 
MixedLayer(LayerBase): inputs, size=0, bias=True, - error_clipping_threshold=0.0, + error_clipping_threshold=None, **xargs): config_assert(inputs, 'inputs cannot be empty') super(MixedLayer, self).__init__( name, 'mixed', size, inputs=inputs, **xargs) - operator_input_index = [] for operator in self.operators: operator_conf = operator.operator_conf @@ -2420,21 +2460,31 @@ class MixedLayer(LayerBase): input_layer = self.get_input_layer(input_index) operator_conf.input_sizes.append(input_layer.size) operator_input_index.append(input_index) - if self.config.size == 0: + if self.config.size == 0: size = operator.calc_output_size(operator_conf.input_sizes) if size != 0: self.set_layer_size(size) - + else: + sz = operator.calc_output_size(operator_conf.input_sizes) + if sz != 0: + config_assert(sz == self.config.size, + "different inputs have different size: %s vs. %s" % + (sz, self.config.size)) for input_index in xrange(len(self.inputs)): input_layer = self.get_input_layer(input_index) input = self.inputs[input_index] if input_index not in operator_input_index: config_assert(isinstance(input, Projection), "input should be projection or operation") - if self.config.size == 0 and isinstance(input, Projection): + if self.config.size == 0 and isinstance(input, Projection): size = input.calc_output_size(input_layer) if size != 0: self.set_layer_size(size) - + elif isinstance(input, Projection): + sz = input.calc_output_size(input_layer) + if sz != 0: + config_assert(sz == self.config.size, + "different inputs have different size: %s vs. %s" % + (sz, self.config.size)) config_assert(size != 0, "size is not set") for input_index in xrange(len(self.inputs)): @@ -2461,7 +2511,8 @@ class MixedLayer(LayerBase): self.create_bias_parameter(bias, self.config.size) - self.config.error_clipping_threshold = error_clipping_threshold + if error_clipping_threshold is not None: + self.config.error_clipping_threshold = error_clipping_threshold # like MixedLayer, but no bias parameter @config_func @@ -2802,27 +2853,44 @@ def Parameter( para = g_config.model_config.parameters.add() para.name = name para.size = size - para.device = device - para.dims.extend(dims); - para.learning_rate = default(learning_rate, 1.) 
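A recurring pattern in this commit is to stop writing Python-side defaults into optional proto fields and to set a field only when the caller actually supplied a value, so that the `[default = ...]` declared in the .proto files stays authoritative. A minimal sketch of the idiom (`set_if_given` is an illustrative helper, not part of the patch):

.. code-block:: python

    def set_if_given(message, field_name, value):
        # Leave the optional proto field unset so its declared default
        # applies; only write it when a value was actually supplied.
        if value is not None:
            setattr(message, field_name, value)

    # On the reading side, HasField() distinguishes "unset" from
    # "explicitly set to the default value", as used in this patch:
    #     device = config.device if config.HasField('device') else None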
- para.momentum = default(momentum, g_default_momentum) + if device is not None: + para.device = int(device) + para.dims.extend(dims) + + if learning_rate is not None: + para.learning_rate = float(learning_rate) + + momentum = default(momentum, g_default_momentum) + if momentum is not None: + para.momentum = float(momentum) + config_assert(not momentum or not decay_rate_l1, "momentum and decay_rate_l1 cannot both be non-zero") - para.decay_rate = default(decay_rate, g_default_decay_rate) + + decay_rate = default(decay_rate, g_default_decay_rate) + if decay_rate is not None: + para.decay_rate = decay_rate + if decay_rate_l1 is not None: para.decay_rate_l1 = decay_rate_l1 para.initial_std = default(initial_std, g_default_initial_std) para.initial_mean = default(initial_mean, g_default_initial_mean) - para.num_batches_regularization = default( + + num_batches_regularization = default( num_batches_regularization, g_default_num_batches_regularization) + if num_batches_regularization is not None: + para.num_batches_regularization = int(num_batches_regularization) + if sparse_remote_update is not None: para.sparse_remote_update = sparse_remote_update if sparse_remote_update: g_config.opt_config.use_sparse_remote_updater = True if sparse_update is not None: para.sparse_update = sparse_update - para.gradient_clipping_threshold = default( - gradient_clipping_threshold, g_default_gradient_clipping_threshold); + gradient_clipping_threshold = default( + gradient_clipping_threshold, g_default_gradient_clipping_threshold) + if gradient_clipping_threshold is not None: + para.gradient_clipping_threshold = gradient_clipping_threshold para.initial_strategy = default(initial_strategy, g_default_initial_strategy) para.initial_smart = default(initial_smart, g_default_initial_smart) if para.initial_smart: @@ -2835,15 +2903,19 @@ def Parameter( para.initial_std = 1. 
/ math.sqrt(para.size) if g_default_compact_func is not None: sparse, format, need_compact = g_default_compact_func(para.name) - para.is_sparse = default(sparse, False) - para.format = default(format, "") - para.need_compact = default(need_compact, False) + + if sparse is not None: + para.is_sparse = sparse + if format is not None: + para.format = format + if need_compact is not None: + para.need_compact = need_compact if is_static is not None: para.is_static = is_static config_assert(not para.sparse_remote_update or not para.is_static, "sparse_remote_update and is_static cannot both be true") - - para.is_shared = default(is_shared, False) + if is_shared is not None: + para.is_shared = is_shared update_hooks = default(update_hooks, g_default_update_hooks) diff --git a/python/paddle/trainer_config_helpers/activations.py b/python/paddle/trainer_config_helpers/activations.py index 24defb06a6..2920145193 100644 --- a/python/paddle/trainer_config_helpers/activations.py +++ b/python/paddle/trainer_config_helpers/activations.py @@ -14,9 +14,11 @@ __all__ = ["TanhActivation", "SigmoidActivation", "SoftmaxActivation", "IdentityActivation", "LinearActivation", - 'SequenceSoftmaxActivation', - "ReluActivation", "BReluActivation", "SoftReluActivation", "STanhActivation", - "AbsActivation", "SquareActivation", "BaseActivation"] + 'SequenceSoftmaxActivation', 'ExpActivation', + "ReluActivation", "BReluActivation", "SoftReluActivation", + "STanhActivation", + "AbsActivation", "SquareActivation", + "BaseActivation"] class BaseActivation(object): @@ -36,6 +38,9 @@ class BaseActivation(object): self.name = name self.support_hppl = support_hppl + def __repr__(self): + return self.name + class TanhActivation(BaseActivation): """ @@ -185,3 +190,12 @@ class SquareActivation(BaseActivation): """ def __init__(self): BaseActivation.__init__(self, 'square', False) + +class ExpActivation(BaseActivation): + """ + Exponential Activation. + + .. math:: + f(z) = e^z. + """ + def __init__(self): BaseActivation.__init__(self, 'exponential', False) diff --git a/python/paddle/trainer_config_helpers/attrs.py b/python/paddle/trainer_config_helpers/attrs.py index 7b0a398d19..d263441247 100644 --- a/python/paddle/trainer_config_helpers/attrs.py +++ b/python/paddle/trainer_config_helpers/attrs.py @@ -17,6 +17,42 @@ __all__ = ['ParamAttr', 'ExtraAttr', 'ParameterAttribute', 'ExtraLayerAttribute'] +def convert_and_compare(x, Type): + """ + Convert x to be the same type as Type and then convert back to + check whether there is a loss of information + :param x: object to be checked + :param Type: target type to check x over + + """ + return type(x)(Type(x))==x + +def is_compatible_with(x, Type): + """ + Check if x has a type compatible with Type + :param x: object to be checked + :param Type: target type to check x over + + """ + if type(x) == Type: + return True + try: + if float == Type or int == Type: + # avoid those types that can be converted to float/int but not very + # meaningful and could potentially lead to error + # i.e., str and bool typed value should not be used for initializing float/int variable + if not isinstance(x, str) and not isinstance(x, bool): + return convert_and_compare(x, Type) + elif bool == Type: + # should not use string type to initialize bool variable + if not isinstance(x, str): + return convert_and_compare(x, Type) + else: + return False + except: + return False + + class ParameterAttribute(object): """ Parameter Attributes object. 
To fine-tune the network training process, the user @@ -65,14 +101,18 @@ class ParameterAttribute(object): elif initial_std is None and initial_mean is None and initial_max \ is None and initial_min is None: self.attr = {'initial_smart': True} - elif isinstance(initial_std, float) or isinstance(initial_mean, float): + elif is_compatible_with(initial_std, float) or \ + is_compatible_with(initial_mean, float): self.attr = dict() if initial_std is not None: self.attr['initial_std'] = initial_std if initial_mean is not None: self.attr['initial_mean'] = initial_mean self.attr['initial_strategy'] = 0 # Gauss Random - elif isinstance(initial_max, float) and isinstance(initial_min, float): + elif is_compatible_with(initial_max, float) and \ + is_compatible_with(initial_min, float): + initial_max = initial_max + initial_min = initial_min assert initial_min < initial_max initial_mean = (initial_max + initial_min) / 2 initial_std = initial_mean - initial_min @@ -83,16 +123,16 @@ class ParameterAttribute(object): else: raise RuntimeError("Unexpected branch.") - if not is_static and isinstance(l1_rate, float): + if not is_static and is_compatible_with(l1_rate, float): self.attr['decay_rate_l1'] = l1_rate - if not is_static and isinstance(l2_rate, float): + if not is_static and is_compatible_with(l2_rate, float): self.attr['decay_rate'] = l2_rate - if not is_static and isinstance(learning_rate, float): + if not is_static and is_compatible_with(learning_rate, float): self.attr['learning_rate'] = learning_rate - if not is_static and isinstance(momentum, float): + if not is_static and is_compatible_with(momentum, float): self.attr['momentum'] = momentum if name is not None: @@ -134,12 +174,16 @@ class ExtraLayerAttribute(object): The dropout rate is the zero rate of this mask. The details of what dropout is please refer to `here `_ + JMLRdropout.pdf>`_. :type drop_rate: float - + :param device: device ID of the layer. device=-1, use CPU; device>0, use GPU. + For details of device allocation in parallel_nn, please refer to + `here `_. + :type device: int """ - def __init__(self, error_clipping_threshold=None, drop_rate=None): + def __init__(self, error_clipping_threshold=None, drop_rate=None, device=None): self.attr = dict() if isinstance(error_clipping_threshold, float): assert error_clipping_threshold > 0 @@ -149,6 +193,9 @@ class ExtraLayerAttribute(object): assert drop_rate > 0 self.attr["drop_rate"] = drop_rate + if isinstance(device, int): + self.attr["device"] = device + def check(self, layer_name): for key in self.attr: if not hasattr(self, 'can_%s' % key) or \ diff --git a/python/paddle/trainer_config_helpers/data_sources.py b/python/paddle/trainer_config_helpers/data_sources.py index 8f3dcb96a9..8ada3903dc 100644 --- a/python/paddle/trainer_config_helpers/data_sources.py +++ b/python/paddle/trainer_config_helpers/data_sources.py @@ -14,10 +14,6 @@ """ Data Sources are helpers to define paddle training data or testing data.
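The `is_compatible_with` helper added to attrs.py above accepts values that survive a round-trip conversion, which is what allows integer literals to be used where float attributes are expected. A short illustration, with results reasoned from the code above rather than taken from a test:

.. code-block:: python

    is_compatible_with(10, float)    # True: float(10) round-trips back to 10
    is_compatible_with(1.5, int)     # False: int(1.5) == 1 loses information
    is_compatible_with(True, float)  # falsy: bool is explicitly rejected
    is_compatible_with('1', int)     # falsy: strings are never accepted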
-There are several data attributes will be used by paddle: - -- Data ProviderType\: such as Python, Protobuf -- Data File list\: a single file that contains all data file paths " from paddle.trainer.config_parser import * from .utils import deprecated @@ -27,8 +23,7 @@ try: except ImportError: import pickle -__all__ = ['define_py_data_sources', - 'define_py_data_sources2'] +__all__ = ['define_py_data_sources2'] def define_py_data_source(file_list, cls, module, @@ -50,11 +45,8 @@ def define_py_data_source(file_list, cls, module, define_py_data_source("train.list", TrainData, "data_provider", "process", args={"dictionary": dict_name}) - The related data provider can refer to - `here `__. - :param data_cls: - :param file_list: file list name. + :param file_list: file list name, which contains all data file paths :type file_list: basestring :param cls: Train or Test Class. :type cls: TrainData or TestData @@ -105,27 +97,10 @@ def define_py_data_sources(train_list, test_list, module, obj, args=None, train_async=False, data_cls=PyData): """ - Define python Train/Test data sources in one method. If train/test use - the same Data Provider configuration, module/obj/args contain one argument, - otherwise contain a list or tuple of arguments. For example\: - - .. code-block:: python - - define_py_data_sources("train.list", "test.list", module="data_provider" - obj="process", args={"dictionary": dict_name}) - - Or. - - .. code-block:: python + The usage is almost the same as define_py_data_sources2, except that + it can additionally specify train_async and data_cls. - define_py_data_sources("train.list", "test.list", module="data_provider" - obj=["process_train", "process_test"], - args=[{"dictionary": dict_train}, {"dictionary": dict_test}]) - - The related data provider can refer to - `here `__. - - :param data_cls: + :param data_cls: :param train_list: Train list name. :type train_list: basestring :param test_list: Test list name. @@ -183,6 +158,43 @@ def define_py_data_sources(train_list, test_list, module, obj, args=None, def define_py_data_sources2(train_list, test_list, module, obj, args=None): + """ + Define python Train/Test data sources in one method. If train/test use + the same Data Provider configuration, module/obj/args contain one argument, + otherwise contain a list or tuple of arguments. For example\: + + .. code-block:: python + + define_py_data_sources2(train_list="train.list", + test_list="test.list", + module="data_provider", + # if train/test use different configurations, + # obj=["process_train", "process_test"] + obj="process", + args={"dictionary": dict_name}) + + The related data provider can refer to + `here <../../data_provider/pydataprovider2.html#dataprovider-for-the-sequential-model>`__. + + :param train_list: Train list name. + :type train_list: basestring + :param test_list: Test list name. + :type test_list: basestring + :param module: python module name. If train and test are different, then + pass a tuple or list to this argument. + :type module: basestring or tuple or list + :param obj: python object name. May be a function name if using + PyDataProviderWrapper. If train and test are different, then pass + a tuple or list to this argument. + :type obj: basestring or tuple or list + :param args: The best practice is using dict() to pass arguments into + DataProvider, and use :code:`@init_hook_wrapper` to receive + arguments. If train and test are different, then pass a tuple + or list to this argument.
+ :type args: string or picklable object or list or tuple. + :return: None + :rtype: None + """ define_py_data_sources(train_list=train_list, test_list=test_list, module=module, diff --git a/python/paddle/trainer_config_helpers/evaluators.py b/python/paddle/trainer_config_helpers/evaluators.py index 956bedadd7..ded124a5c8 100644 --- a/python/paddle/trainer_config_helpers/evaluators.py +++ b/python/paddle/trainer_config_helpers/evaluators.py @@ -65,12 +65,12 @@ def evaluator_base( name=None, chunk_scheme=None, num_chunk_types=None, - classification_threshold=0.5, - positive_label=-1, - dict_file="", - result_file="", - num_results=1, - delimited=True): + classification_threshold=None, + positive_label=None, + dict_file=None, + result_file=None, + num_results=None, + delimited=None): """ Evaluator will evaluate the network status while training/testing. @@ -94,7 +94,7 @@ def evaluator_base( Batch=200 samples=20000 AvgCost=0.679655 CurrentCost=0.662179 Eval: classification_error_evaluator=0.4486 CurrentEval: ErrorRate=0.3964 - + :param input: Input layers, an object of LayerOutput or a list of LayerOutput. :type input: list|LayerOutput :param weight: An input layer which is a weight for each sample. Each evaluator may use this weight differently. :type weight: LayerOutput. """ # inputs type assertions. - assert isinstance(classification_threshold, float) - assert isinstance(positive_label, int) - assert isinstance(num_results, int) + assert classification_threshold is None or isinstance( + classification_threshold, float) + assert positive_label is None or isinstance(positive_label, int) + assert num_results is None or isinstance(num_results, int) if not isinstance(input, list): input = [input] @@ -136,7 +137,7 @@ def classification_error_evaluator( label, name=None, weight=None, - threshold=0.5): + threshold=None): """ Classification Error Evaluator. It will print error rate for classification. @@ -253,7 +254,7 @@ def pnpair_evaluator( def precision_recall_evaluator( input, label, - positive_label=-1, + positive_label=None, weight=None, name=None, ): @@ -296,6 +297,7 @@ def precision_recall_evaluator( @wrap_name_default() def ctc_error_evaluator( input, + label, name=None, ): """ @@ -305,16 +307,20 @@ .. code-block:: python - eval = ctc_error_evaluator(input) + eval = ctc_error_evaluator(input=input, label=lbl) :param name: Evaluator name. :type name: None|basestring - :param input: Input Layer. + :param input: Input Layer. Should be the same as the input for ctc_layer. :type input: LayerOutput + :param label: input label, which is a data_layer.
Should be the same as the + label for ctc_layer. + :type label: LayerOutput """ evaluator_base(name=name, type="ctc_edit_distance", - input=input) + input=input, + label=label) @evaluator(EvaluatorAttribute.FOR_CLASSIFICATION) @wrap_name_default() @@ -489,7 +495,7 @@ def gradient_printer_evaluator( @wrap_name_default() def maxid_printer_evaluator( input, - num_results=1, + num_results=None, name=None, ): """ @@ -513,13 +519,14 @@ """ evaluator_base(name=name, type="max_id_printer", - input=input) + input=input, + num_results=num_results) @evaluator(EvaluatorAttribute.FOR_PRINT) @wrap_name_default() def maxframe_printer_evaluator( input, - num_results=1, + num_results=None, name=None, ): """ @@ -551,20 +558,20 @@ @wrap_name_default() def seqtext_printer_evaluator( input, - dict_file="", - result_file="", - delimited=True, + result_file, + id_input=None, + dict_file=None, + delimited=None, name=None, ): """ Sequence text printer will print text according to index matrix and a dictionary. There can be multiple input to this layer: - 1. If there is only one input, the input must be a matrix containing + 1. If there is no id_input, the input must be a matrix containing the sequence of indices; - 2. If there are more than one input, the first input should be ids, - and are interpreted as sample ids. + 2. If there is id_input, it should contain ids, and is interpreted as sample ids. The output format will be: @@ -595,25 +602,43 @@ .. code-block:: python - eval = seqtext_printer_evaluator(input, + eval = seqtext_printer_evaluator(input=maxid_layer, + id_input=sample_id, dict_file=dict_file, result_file=result_file) :param input: Input Layer name. :type input: LayerOutput|list - :param dict_file: The input dictionary which contains a list of tokens. - :type dict_file: basestring - :param result_file: The file is to save the results. + :param result_file: Path of the file to store the generated results. :type result_file: basestring + :param id_input: Index of the input sequence, and the specified index will + be printed in the generated results. This is an optional + parameter. + :type id_input: LayerOutput + :param dict_file: Path of dictionary. This is an optional parameter. + Every line is a word in the dictionary with + (line number - 1) as the word index. + If this parameter is set to None, or to an empty string, + only word indices are printed in the generated results. + :type dict_file: basestring :param delimited: Whether to use space to separate output tokens. Default is True. No space is added if set to False. :type delimited: bool :param name: Evaluator name. :type name: None|basestring + :return: The seq_text_printer that prints the generated sequence to a file. + :rtype: evaluator """ + assert isinstance(result_file, basestring) + if id_input is None: + inputs = [input] + else: + inputs = [id_input, input] + input.parents.append(id_input) + evaluator_base(name=name, type="seq_text_printer", - input=input, + input=inputs, dict_file=dict_file, result_file=result_file, delimited=delimited) diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py index 6e7964c12c..c355dc042a 100644 --- a/python/paddle/trainer_config_helpers/layers.py +++ b/python/paddle/trainer_config_helpers/layers.py @@ -13,6 +13,7 @@ # limitations under the License.
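A wiring sketch for the reworked printer above (the `sample_id`, `output_prob`, and `gen_ids` layer names and the file paths are illustrative). When `id_input` is given, the evaluator internally reorders its inputs to `[id_input, input]`:

.. code-block:: python

    sample_id = data_layer(name='sample_id', size=1)  # ids of the samples
    gen_ids = maxid_layer(input=output_prob)          # index matrix to print

    seqtext_printer_evaluator(input=gen_ids,
                              id_input=sample_id,
                              dict_file='data/dict.txt',  # None prints raw indices
                              result_file='gen_result.txt')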
import functools +import collections from paddle.trainer.config_parser import * from .activations import LinearActivation, SigmoidActivation, TanhActivation, \ @@ -29,14 +30,14 @@ except ImportError: import copy __all__ = ["full_matrix_projection", "AggregateLevel", "ExpandLevel", - "identity_projection", "dotmul_projection", + "identity_projection", "dotmul_projection", "dotmul_operator", "table_projection", "mixed_layer", "data_layer", "embedding_layer", "fc_layer", "grumemory", "pooling_layer", "lstmemory", "last_seq", "first_seq", "cos_sim", "hsigmoid", "regression_cost", 'classification_cost', "LayerOutput", 'img_conv_layer', 'img_pool_layer', 'batch_norm_layer', - 'img_cmrnorm_layer', 'img_rnorm_layer', 'addto_layer', + 'img_cmrnorm_layer', 'addto_layer', 'concat_layer', 'lstm_step_layer', 'recurrent_group', 'memory', 'StaticInput', 'expand_layer', 'scaling_layer', 'power_layer', 'interpolation_layer', 'trans_layer', @@ -47,11 +48,13 @@ __all__ = ["full_matrix_projection", "AggregateLevel", "ExpandLevel", 'BaseGeneratedInput', 'conv_operator', 'conv_shift_layer', 'tensor_layer', 'selective_fc_layer', 'sampling_id_layer', 'slope_intercept_layer', 'trans_full_matrix_projection', + 'linear_comb_layer', 'convex_comb_layer', 'ctc_layer', 'crf_layer', 'crf_decoding_layer', 'cross_entropy_with_selfnorm', 'cross_entropy', 'multi_binary_label_cross_entropy', 'rank_cost', 'lambda_cost', 'huber_cost', - 'block_expand_layer', + # 'block_expand_layer', # TODO(yuyang18): this layer is not correct + 'out_prod_layer', 'print_layer' ] @@ -70,7 +73,8 @@ class LayerType(object): POOLING_AVG = 'average' FC_LAYER = "fc" COST = 'cost' - COSINE_SIM = 'cos_vm' + COSINE_SIM_VEC = 'cos_vm' + COSINE_SIM = 'cos' HSIGMOID = 'hsigmoid' CONV_LAYER = "conv" POOL_LAYER = "pool" @@ -91,6 +95,7 @@ class LayerType(object): POWER_LAYER = 'power' SCALING_LAYER = 'scaling' TRANS_LAYER = 'trans' + OUT_PROD_LAYER = 'out_prod' MEMORY = 'memory' MAXID_LAYER = 'maxid' @@ -102,9 +107,11 @@ class LayerType(object): SEL_FC_LAYER = "selective_fc" SAMPLING_ID_LAYER = "sampling_id" SLOPE_INTERCEPT_LAYER = "slope_intercept" - CONVEX_COMBINATION_LAYER = "convex_comb" + LINEAR_COMBINATION_LAYER = "convex_comb" BLOCK_EXPAND = "blockexpand" + PRINT_LAYER = "print" + CTC_LAYER = "ctc" CRF_LAYER = "crf" CRF_DECODING_LAYER = "crf_decoding" @@ -161,16 +168,19 @@ class LayerOutput(object): :param activation: Layer Activation. :type activation: BaseActivation. :param parents: Layer's parents. 
- :type parents: list|tuple + :type parents: list|tuple|collection.Sequence """ def __init__(self, name, layer_type, parents=None, activation=None, - num_filters=None, img_norm_type=None, size=None, outputs=None): + num_filters=None, img_norm_type=None, size=None, outputs=None, + reverse=None): assert isinstance(name, basestring) assert isinstance(layer_type, basestring) assert LayerType.is_layer_type(layer_type) self.name = name self.layer_type = layer_type + if parents is not None and type(parents) != list: + parents = [parents] self.parents = [] if parents is None else parents self.activation = activation self.num_filters = num_filters @@ -179,6 +189,7 @@ class LayerOutput(object): if outputs is None: outputs = ['default'] self.outputs = outputs + self.reverse = reverse def __repr__(self): """ @@ -195,13 +206,16 @@ class LayerOutput(object): ERROR_CLIPPING = 'error_clipping_threshold' DROPOUT = 'drop_rate' +DEVICE = 'device' def layer_support(*attrs): + attrs_list = list(attrs) + attrs_list.append(DEVICE) def decorator(method): @functools.wraps(method) def wrapper(*args, **kwargs): - for attr in attrs: + for attr in attrs_list: for each in args: if isinstance(each, ExtraLayerAttribute): setattr(each, '_'.join(['can', attr]), True) @@ -261,7 +275,43 @@ def full_matrix_projection(input, size=0, param_attr=None): size=size, **param_attr.attr) proj.origin = input - proj.origin.projection = "matrix" + return proj + + +@wrap_param_attr_default() +def trans_full_matrix_projection(input, size=0, param_attr=None): + """ + Different from full_matrix_projection, this projection performs matrix + multiplication, using transpose of weight. + + .. math:: + out.row[i] += in.row[i] * w^\mathrm{T} + + :math:`w^\mathrm{T}` means transpose of weight. + The simply usage is: + + .. code-block:: python + + proj = trans_full_matrix_projection(input=layer, + size=100, + param_attr=ParamAttr( + name='_proj', + initial_mean=0.0, + initial_std=0.01)) + + :param input: input layer + :type input: LayerOutput + :param size: The parameter size. Means the width of parameter. + :type size: int + :param param_attr: Parameter config, None if use default. + :type param_attr: ParameterAttribute + :return: A TransposedFullMatrixProjection Object. + :rtype: TransposedFullMatrixProjection + """ + proj = TransposedFullMatrixProjection(input_layer_name=input.name, + size=size, + **param_attr.attr) + proj.origin = input return proj @@ -308,7 +358,6 @@ def table_projection(input, size=0, param_attr=None): size=size, **param_attr.attr) proj.origin = input - proj.origin.projection = "table" return proj @@ -343,7 +392,7 @@ def identity_projection(input, offset=None): Note that both of two projections should not have any parameter. :param input: Input Layer. - :type input: LayerOutput. + :type input: LayerOutput :param offset: Offset, None if use default. :type offset: int :return: A IdentityProjection or IdentityOffsetProjection Object @@ -352,19 +401,17 @@ def identity_projection(input, offset=None): if offset is None: proj = IdentityProjection(input_layer_name=input.name) proj.origin = input - proj.origin.projection = 'identity' else: proj = IdentityOffsetProjection(input_layer_name=input.name, offset=offset) proj.origin = input - proj.origin.projection = 'identity_offset' return proj @wrap_param_attr_default() -def dotmul_projection(input, param_attr=None, scale=1): +def dotmul_projection(input, param_attr=None): """ - 1. DotMulProjection if input is a layer. + DotMulProjection with a layer as input. 
It performs element-wise multiplication with weight. .. math:: @@ -378,11 +425,26 @@ def dotmul_projection(input, param_attr=None, scale=1): proj = dotmul_projection(input=layer) - 2. DotMulOperator if input is a list or tuple. - It takes two inputs, performs element-wise multiplication: + :param input: Input layer. + :type input: LayerOutput + :param param_attr: Parameter config, None if use default. + :type param_attr: ParameterAttribute + :return: A DotMulProjection Object. + :rtype: DotMulProjection + """ + proj = DotMulProjection(input_layer_name=input.name, + size=input.size, + **param_attr.attr) + proj.origin = input + return proj + + +def dotmul_operator(a=None, b=None, scale=1, **kwargs): + """ + DotMulOperator takes two inputs and performs element-wise multiplication: .. math:: - out.row[i] += scale * (in1.row[i] .* in2.row[i]) + out.row[i] += scale * (x.row[i] .* y.row[i]) where :math:`.*` means element-wise multiplication, and scale is a config scalar, its default value is one. @@ -391,34 +453,31 @@ def dotmul_projection(input, param_attr=None, scale=1): .. code-block:: python - op = dotmul_projection(input=[layer1, layer2], - scale=2.0) + op = dotmul_operator(a=layer1, b=layer2, scale=0.5) - :param input: Input layer. - :type input: LayerOutput|list|tuple - :param param_attr: Parameter config, None if use default. - :type param_attr: ParameterAttribute + :param a: Input layer1 + :type a: LayerOutput + :param b: Input layer2 + :type b: LayerOutput :param scale: config scalar, default value is one. :type scale: float - :return: A DotMulProjection or DotMulOperator Object. - :rtype: DotMulProjection or DotMulOperator - """ - if isinstance(input, LayerOutput): - proj = DotMulProjection(input_layer_name=input.name, - size=input.size, - **param_attr.attr) - proj.origin = input - proj.origin.projection = "dot_mul" - return proj - else: - assert isinstance(input, list) or isinstance(input, tuple) - assert len(input) == 2 - assert param_attr is None - op = DotMulOperator(input_layer_name=[x.name for x in input], - scale=scale) - op.origin = input - op.origin.operator = "dot_mul" - return op + :return: A DotMulOperator Object. + :rtype: DotMulOperator + """ + if 'x' in kwargs or 'y' in kwargs: + logger.warning('x and y arguments for dotmul_operator are deprecated. ' + 'Please use a and b as parameters.') + a = kwargs.get('x', a) # For backward compatibility.
+ b = kwargs.get('y', b) + assert isinstance(a, LayerOutput) + assert isinstance(b, LayerOutput) + if a.size is not None and b.size is not None: + assert a.size == b.size + + op = DotMulOperator(input_layer_names=[a.name, b.name], + scale=scale) + op.origin = [a, b] + return op @wrap_bias_attr_default(['padding_attr']) @@ -465,7 +524,6 @@ def context_projection(input, context_len, context_start=None, trainable_padding=trainable, **extra_dict) proj.origin = input - proj.origin.projection = 'context' return proj @@ -512,9 +570,12 @@ class MixedLayerType(LayerOutput): :rtype: MixedLayerType """ if not self.finalized: - assert isinstance(other, Projection) + assert isinstance(other, Projection) or isinstance(other, Operator) self.inputs.append(other) - self.parents.append(other.origin) + if isinstance(other, Projection): + self.parents.append(other.origin) + else: + self.parents.extend(other.origin) return self else: raise MixedLayerType.AddToSealedMixedLayerException() @@ -540,7 +601,7 @@ class MixedLayerType(LayerOutput): @wrap_act_default(act=LinearActivation()) @wrap_bias_attr_default(has_bias=False) @layer_support(ERROR_CLIPPING, DROPOUT) -def mixed_layer(size, input=None, name=None, act=None, bias_attr=False, +def mixed_layer(size=0, input=None, name=None, act=None, bias_attr=False, layer_attr=None): """ Mixed Layer. A mixed layer will add all inputs together, then activate. @@ -587,7 +648,7 @@ def mixed_layer(size, input=None, name=None, act=None, bias_attr=False, else: with mixed_layer(name=name, size=size, act=act, bias_attr=bias_attr, layer_attr=layer_attr) as m: - if isinstance(input, list) or isinstance(input, tuple): + if isinstance(input, collections.Sequence): for each in input: m += each else: @@ -613,7 +674,7 @@ def data_layer(name, size, layer_attr=None): :type size: int :param layer_attr: Extra Layer Attribute. :type layer_attr: ExtraLayerAttribute. - :return: Layer Output Object. + :return: LayerOutput object. :rtype: LayerOutput """ Layer(type=LayerType.DATA, name=name, size=size, @@ -640,7 +701,7 @@ def embedding_layer(input, size, name=None, param_attr=None, layer_attr=None): :type param_attr: ParameterAttribute|None :param layer_attr: Extra layer Config. Default is None. :type layer_attr: ExtraLayerAttribute|None - :return: Embedding Layer output + :return: LayerOutput object. :rtype: LayerOutput """ with mixed_layer(name=name, size=size, act=LinearActivation(), @@ -692,28 +753,24 @@ def fc_layer(input, size, act=None, name=None, :type bias_attr: ParameterAttribute|None|Any :param layer_attr: Extra Layer config. :type layer_attr: ExtraLayerAttribute|None - :return: Layer Name. + :return: LayerOutput object. 
:rtype: LayerOutput """ if isinstance(input, LayerOutput): input = [input] - assert not isinstance(param_attr, list) + assert not isinstance(param_attr, collections.Sequence) param_attr = [param_attr] else: - if isinstance(param_attr, list) or isinstance(param_attr, tuple): + if isinstance(param_attr, collections.Sequence): assert len(input) == len(param_attr) else: param_attr = [copy.deepcopy(param_attr) for _ in range(len(input))] - assert isinstance(input, list) - - def __idx_to_input__(i): - attr = param_attr[i] - assert isinstance(attr, ParameterAttribute) - return Input(input[i].name, **attr.attr) + assert isinstance(input, collections.Sequence) Layer( - inputs=map(__idx_to_input__, range(len(input))), + inputs=[Input(ipt.name, **attr.attr) for ipt, attr in zip( + input, param_attr)], name=name, type=LayerType.FC_LAYER, size=size, @@ -725,6 +782,31 @@ def fc_layer(input, size, act=None, name=None, size=size) +@wrap_name_default("print") +def print_layer(input, name=None): + """ + Print the output value of input layers. This layer is useful for debugging. + + :param name: The Layer Name. + :type name: basestring + :param input: The input layer. Could be a list/tuple of input layer. + :type input: LayerOutput|list|tuple + :return: LayerOutput + """ + if isinstance(input, LayerOutput): + input = [input] + assert isinstance(input, collections.Sequence) # list or tuple + for each in input: + assert isinstance(each, LayerOutput) + + Layer( + name=name, + type=LayerType.PRINT_LAYER, + inputs=[l.name for l in input], + ) + # this layer don't return anything, can not be input of other layer. + + @wrap_name_default("seq_pooling") @wrap_bias_attr_default(has_bias=False) @wrap_param_default(['pooling_type'], default_factory=lambda _: MaxPooling()) @@ -743,7 +825,8 @@ def pooling_layer(input, pooling_type=None, name=None, bias_attr=None, pooling_type=AvgPooling(), agg_level=AggregateLevel.EACH_SEQUENCE) - :param agg_level: AggregateLevel.EACH_TIMESTEP or AggregateLevel.EACH_SEQUENCE + :param agg_level: AggregateLevel.EACH_TIMESTEP or + AggregateLevel.EACH_SEQUENCE :type agg_level: AggregateLevel :param name: layer name. :type name: basestring @@ -756,12 +839,17 @@ def pooling_layer(input, pooling_type=None, name=None, bias_attr=None, :type bias_attr: ParameterAttribute|None|False :param layer_attr: The Extra Attributes for layer, such as dropout. :type layer_attr: ExtraLayerAttribute|None - :return: layer name. + :return: LayerOutput object. 
:rtype: LayerOutput """ extra_dict = dict() + # noinspection PyUnresolvedReferences if isinstance(pooling_type, AvgPooling): extra_dict['average_strategy'] = pooling_type.strategy + elif isinstance(pooling_type, MaxPooling) and \ + pooling_type.output_max_index is not None: + assert isinstance(pooling_type.output_max_index, bool) + extra_dict['output_max_index'] = pooling_type.output_max_index extra_dict.update(ExtraLayerAttribute.to_kwargs(layer_attr)) Layer( @@ -785,7 +873,7 @@ def pooling_layer(input, pooling_type=None, name=None, bias_attr=None, @wrap_name_default("lstmemory") @layer_support(DROPOUT) def lstmemory(input, name=None, reverse=False, act=None, - gate_act=None, + gate_act=None, size=None, state_act=None, bias_attr=None, param_attr=None, layer_attr=None): """ @@ -806,21 +894,22 @@ def lstmemory(input, name=None, reverse=False, act=None, h_t & = o_t tanh(c_t) - NOTE: In paddle's implementation, the multiply operation :math:`W_{xi}x_{t}` , :math:`W_{xf}x_{t}`, - :math:`W_{xc}x_t`, :math:`W_{xo}x_{t}` is not done by - lstmemory layer, so it must use a mixed_layer do this full_matrix_projection - before lstm is used. + NOTE: In PaddlePaddle's implementation, the multiplications + :math:`W_{xi}x_{t}` , :math:`W_{xf}x_{t}`, + :math:`W_{xc}x_t`, :math:`W_{xo}x_{t}` are not done in the lstmemory layer, + so an additional mixed_layer with full_matrix_projection or a fc_layer must + be included in the configuration file to complete the input-to-hidden + mappings before lstmemory is called. - NOTE: This is a low level user interface. You may use network.simple_lstm + NOTE: This is a low level user interface. You can use network.simple_lstm to configure a simple plain lstm layer. - Please refer **Generating Sequences With Recurrent Neural Networks** if you - want to know what lstm is. Link_ is here. + Please refer to **Generating Sequences With Recurrent Neural Networks** for + more details about LSTM. - .. _Link: http://arxiv.org/abs/1308.0850 + The Link_ is given below. - TODO(yuyang18): Check lstm can input multiple values or not? + .. _Link: http://arxiv.org/abs/1308.0850 :param name: The lstmemory layer name. :type name: basestring @@ -842,13 +931,23 @@ def lstmemory(input, name=None, reverse=False, act=None, :type param_attr: ParameterAttribute|None|False :param layer_attr: Extra Layer attribute :type layer_attr: ExtraLayerAttribute|None - :return: Layer name. + :return: LayerOutput object. :rtype: LayerOutput """ assert gate_act.support_hppl assert state_act.support_hppl assert act.support_hppl + assert input.size is not None and input.size % 4 == 0 + if size is not None: + if input.size / 4 == size: + plog = logger.warning + else: + plog = logger.fatal + + plog("NOTE: The lstmemory layer[%s]'s size is set by the previous input " + "layer. The lstm size should be equal to input layer size/4. The" + " size set explicitly will be ignored."
% name) Layer(name=name, type=LayerType.LSTMEMORY, @@ -860,8 +959,9 @@ def lstmemory(input, name=None, reverse=False, act=None, inputs=[Input(input.name, **param_attr.attr)], **ExtraLayerAttribute.to_kwargs(layer_attr)) - return LayerOutput(name, LayerType.LSTMEMORY, [input], - size=input.size / 4 if input.size is not None else None) + return LayerOutput(name, LayerType.LSTMEMORY, [input], size=input.size / 4, + reverse=reverse) + @wrap_bias_attr_default() @wrap_param_attr_default() @@ -871,7 +971,7 @@ def lstmemory(input, name=None, reverse=False, act=None, @wrap_name_default("gru") @layer_support(DROPOUT) def grumemory(input, name=None, reverse=False, act=None, - gate_act=None, + gate_act=None, size=None, bias_attr=None, param_attr=None, layer_attr=None): """ @@ -894,28 +994,30 @@ def grumemory(input, name=None, reverse=False, act=None, r_t = \\sigma(W_{r}x_{t} + U_{r}h_{t-1} + b_r) - 3. The candidate activation :math:`\\tilde{h_t}` is computed similarly to that - of the traditional recurrent unit: + 3. The candidate activation :math:`\\tilde{h_t}` is computed similarly to + that of the traditional recurrent unit: .. math:: {\\tilde{h_t}} = tanh(W x_{t} + U (r_{t} \odot h_{t-1}) + b) - 4. The hidden activation :math:`h_t` of the GRU at time t is a linear interpolation - between the previous activation :math:`h_{t-1}` and the candidate activation - :math:`\\tilde{h_t}`: + 4. The hidden activation :math:`h_t` of the GRU at time t is a linear + interpolation between the previous activation :math:`h_{t-1}` and the + candidate activation :math:`\\tilde{h_t}`: .. math:: h_t = (1 - z_t) h_{t-1} + z_t {\\tilde{h_t}} - NOTE: In paddle's implementation, the multiply operation :math:`W_{r}x_{t}`, :math:`W_{z}x_{t}` and :math:`W x_t` are not computed in - gate_recurrent layer. So it must use a mixed_layer with full_matrix_projection - or fc_layer to compute them before GRU. + NOTE: In PaddlePaddle's implementation, the multiplication operations :math:`W_{r}x_{t}`, :math:`W_{z}x_{t}` and :math:`W x_t` are not computed in + gate_recurrent layer. Consequently, an additional mixed_layer with + full_matrix_projection or a fc_layer must be included before grumemory + is called. - The details can refer to `Empirical Evaluation of Gated Recurrent - Neural Networks on Sequence Modeling. `_ + More details can be found by referring to `Empirical Evaluation of Gated + Recurrent Neural Networks on Sequence Modeling. + `_ The simple usage is: @@ -927,7 +1029,7 @@ def grumemory(input, name=None, reverse=False, act=None, :type name: None|basestring :param input: input layer. :type input: LayerOutput. - :param reverse: Wether sequence process is reversed or not. + :param reverse: Whether sequence process is reversed or not. :type reverse: bool :param act: activation type, TanhActivation by default. This activation affects the :math:`{\\tilde{h_t}}`. @@ -943,12 +1045,23 @@ def grumemory(input, name=None, reverse=False, act=None, :type param_attr: ParameterAttribute|None|False :param layer_attr: Extra Layer attribute :type layer_attr: ExtraLayerAttribute|None - :return: Layer name. + :param size: Stub parameter of size, not actually used. If set, this + will trigger a warning. + :type size: None + :return: LayerOutput object. :rtype: LayerOutput """ - assert act.support_hppl assert gate_act.support_hppl + assert input.size is not None and input.size % 3 == 0 + if size is not None: + if input.size / 3 == size: + plog = logger.warning + else: + plog = logger.fatal + plog("NOTE: the gru memory layer's size is set by the previous input layer," + " and should be input size / 3. The size set explicitly will be "
Set size explicitly will be " + "ignored.") Layer(name=name, type=LayerType.GRUMEMORY, @@ -960,8 +1073,9 @@ def grumemory(input, name=None, reverse=False, act=None, **ExtraLayerAttribute.to_kwargs(layer_attr) ) - return LayerOutput(name, LayerType.GRUMEMORY, [input], - size=input.size / 3 if input.size is not None else None) + return LayerOutput(name, LayerType.GRUMEMORY, [input], size=input.size / 3, + reverse=reverse) + @wrap_name_default() @layer_support() @@ -977,9 +1091,15 @@ def last_seq(input, name=None, agg_level=AggregateLevel.EACH_TIMESTEP, :type input: LayerOutput :param layer_attr: extra layer attributes. :type layer_attr: ExtraLayerAttribute. - :return: layer name. + :return: LayerOutput object. :rtype: LayerOutput """ + if input.reverse is not None and input.reverse: + logger.warning("You are getting the last instance of a sequence that" + " is a output of a REVERSED layer. There is no time" + " series information at all. Maybe you want to use" + " first_seq instead.") + Layer( name=name, type=LayerType.SEQUENCE_LAST_INSTANCE, @@ -1005,9 +1125,16 @@ def first_seq(input, name=None, agg_level=AggregateLevel.EACH_TIMESTEP, :type input: LayerOutput :param layer_attr: extra layer attributes. :type layer_attr: ExtraLayerAttribute. - :return: layer name. + :return: LayerOutput object. :rtype: LayerOutput """ + + if input.reverse is not None and not input.reverse: + logger.warning('You are getting the first instance for a time series,' + ' and it is a normal recurrent layer output. There is no' + ' time series information at all. Maybe you want to use' + ' last_seq instead.') + Layer( name=name, type=LayerType.SEQUENCE_FIRST_INSTANCE, @@ -1023,6 +1150,7 @@ class ExpandLevel(object): FROM_TIMESTEP = AggregateLevel.EACH_TIMESTEP FROM_SEQUENCE = AggregateLevel.EACH_SEQUENCE + @wrap_name_default() @layer_support() def expand_layer(input, expand_as, @@ -1055,7 +1183,7 @@ def expand_layer(input, expand_as, :type expand_level: ExpandLevel :param layer_attr: extra layer attributes. :type layer_attr: ExtraLayerAttribute. - :return: layer name + :return: LayerOutput object. :rtype: LayerOutput """ @@ -1073,7 +1201,6 @@ def expand_layer(input, expand_as, parents=[input, expand_as]) - @wrap_name_default() @layer_support() def interpolation_layer(input, weight, name=None, layer_attr=None): @@ -1102,13 +1229,18 @@ def interpolation_layer(input, weight, name=None, layer_attr=None): :type name: basestring :param layer_attr: extra layer attributes. :type layer_attr: ExtraLayerAttribute. - :return: layer name. + :return: LayerOutput object. :rtype: LayerOutput """ - assert isinstance(input, list) or isinstance(input, tuple) + assert isinstance(input, collections.Sequence) assert len(input) == 2 - assert input[0].size == input[1].size - assert weight.size == 1 + assert isinstance(input[0], LayerOutput) and isinstance(input[1], + LayerOutput) + if input[0].size is not None and input[1].size is not None: + assert input[0].size == input[1].size + assert isinstance(weight, LayerOutput) + if weight.size is not None: + assert weight.size == 1 Layer( name=name, type=LayerType.INTERPOLATION_LAYER, @@ -1147,14 +1279,16 @@ def power_layer(input, weight, name=None, layer_attr=None): :type name: basestring :param layer_attr: extra layer attributes. :type layer_attr: ExtraLayerAttribute. - :return: layer name. + :return: LayerOutput object. 
:rtype: LayerOutput """ - assert weight.size == 1 + assert isinstance(input, LayerOutput) and isinstance(weight, LayerOutput) + if weight.size is not None: + assert weight.size == 1 Layer( name=name, type=LayerType.POWER_LAYER, - inputs=[input.name, weight.name], + inputs=[weight.name, input.name], **ExtraAttr.to_kwargs(layer_attr) ) return LayerOutput(name, LayerType.POWER_LAYER, @@ -1165,13 +1299,16 @@ def power_layer(input, weight, name=None, layer_attr=None): @layer_support() def scaling_layer(input, weight, name=None, layer_attr=None): """ - A layer for each row of a matrix, multiplying with a element of a vector. + A layer for multiplying input vector by weight scalar. .. math:: - y.row[i] = w[i] * x.row[i] + y = w x - where :math:`x` is (batchSize x dataDim) input, :math:`w` is - (batchSize x 1) weight vector, and :math:`y` is (batchSize x dataDim) output. + where :math:`x` is size=dataDim input, :math:`w` is size=1 weight, + and :math:`y` is size=dataDim output. + + Note that the above computation is for one sample. Multiple samples are + processed in one batch. The example usage is: @@ -1187,10 +1324,12 @@ def scaling_layer(input, weight, name=None, layer_attr=None): :type name: basestring :param layer_attr: extra layer attributes. :type layer_attr: ExtraLayerAttribute. - :return: layer name. + :return: LayerOutput object. :rtype: LayerOutput """ - assert weight.size == 1 + assert isinstance(weight, LayerOutput) and isinstance(input, LayerOutput) + if weight.size is not None: + assert weight.size == 1 Layer( name=name, type=LayerType.SCALING_LAYER, @@ -1224,7 +1363,7 @@ def trans_layer(input, name=None, layer_attr=None): :type name: basestring :param layer_attr: extra layer attributes. :type layer_attr: ExtraLayerAttribute. - :return: layer name. + :return: LayerOutput object. :rtype: LayerOutput """ Layer( @@ -1244,12 +1383,15 @@ def cos_sim(a, b, scale=5, size=1, name=None, layer_attr=None): Cosine Similarity Layer. The cosine similarity equation is here. .. math:: - similarity = cos(\\theta) = {\\mathbf{A} \\cdot \\mathbf{B} - \\over \\|\\mathbf{A}\\| \\|\\mathbf{B}\\|} + similarity = cos(\\theta) = {\\mathbf{a} \\cdot \\mathbf{b} + \\over \\|\\mathbf{a}\\| \\|\\mathbf{b}\\|} + + The size of a is M, size of b is M*N, + Similarity will be calculated N times by step M. The output size is + N. The scale will be multiplied to similarity. - And the input dimension is :math:`a \in R^M`, :math:`b \in R^{MN}`. The - similarity will be calculated N times by step M. The output dimension is - :math:`R^N`. The scale will be multiplied to similarity. + Note that the above computation is for one sample. Multiple samples are + processed in one batch. :param name: layer name :type name: basestring @@ -1263,23 +1405,38 @@ def cos_sim(a, b, scale=5, size=1, name=None, layer_attr=None): :type size: int :param layer_attr: Extra Layer Attribute. :type layer_attr: ExtraLayerAttribute - :return: layer name. + :return: LayerOutput object. 
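To make the cos_sim size contract above concrete (a of size M, b of size M*N, output of size N, scale multiplied in), here is a plain NumPy sketch of the documented semantics; it is a reference only, not the layer's actual implementation:

.. code-block:: python

    import numpy as np

    def cos_sim_ref(a, b, scale=5):
        # a: shape (M,); b: shape (M * N,); returns shape (N,).
        m = a.shape[0]
        n = b.shape[0] // m
        out = np.empty(n)
        for i in range(n):
            chunk = b[i * m:(i + 1) * m]        # i-th length-M slice of b
            out[i] = scale * np.dot(a, chunk) / (
                np.linalg.norm(a) * np.linalg.norm(chunk) + 1e-12)
        return out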
:rtype: LayerOutput """ - Layer( - name=name, - type=LayerType.COSINE_SIM, - size=size, - cos_scale=scale, - inputs=[a.name, b.name], - **ExtraLayerAttribute.to_kwargs(layer_attr) - ) + assert isinstance(a, LayerOutput) and isinstance(b, LayerOutput) + if size == 1: + Layer( + name=name, + type=LayerType.COSINE_SIM, + cos_scale=scale, + inputs=[a.name, b.name], + **ExtraLayerAttribute.to_kwargs(layer_attr) + ) + else: + if a.size is not None and b.size is not None: + assert size == b.size / a.size + Layer( + name=name, + type=LayerType.COSINE_SIM_VEC, + size=size, + cos_scale=scale, + inputs=[a.name, b.name], + **ExtraLayerAttribute.to_kwargs(layer_attr) + ) return LayerOutput(name, LayerType.COSINE_SIM, parents=[a, b]) + @wrap_name_default() @wrap_bias_attr_default(has_bias=True) +@wrap_param_attr_default() @layer_support() -def hsigmoid(input, label, num_classes, name=None, bias_attr=None, layer_attr=None): +def hsigmoid(input, label, num_classes, name=None, bias_attr=None, + param_attr=None, layer_attr=None): """ Organize the classes into a binary tree. At each node, a sigmoid function is used to calculate the probability of belonging to the right branch. @@ -1308,20 +1465,28 @@ def hsigmoid(input, label, num_classes, name=None, bias_attr=None, layer_attr=No :type bias_attr: ParameterAttribute|False :param layer_attr: Extra Layer Attribute. :type layer_attr: ExtraLayerAttribute - :return: layer name. + :return: LayerOutput object. :rtype: LayerOutput """ if isinstance(input, LayerOutput): input = [input] - assert isinstance(input, list) or isinstance(input, tuple) + if not isinstance(param_attr, collections.Sequence): + param_attr = [param_attr] + else: + if not isinstance(param_attr, collections.Sequence): + param_attr = [param_attr] * len(input) + else: + assert len(param_attr) == len(input) + + assert isinstance(input, collections.Sequence) assert isinstance(label, LayerOutput) assert label.layer_type == LayerType.DATA ipts_for_layer = [] parents = [] - for each_input in input: + for each_input, each_param_attr in zip(input, param_attr): assert isinstance(each_input, LayerOutput) - ipts_for_layer.append(each_input.name) + ipts_for_layer.append(Input(each_input.name, **each_param_attr.attr)) parents.append(each_input) ipts_for_layer.append(label.name) parents.append(label) @@ -1336,6 +1501,7 @@ def hsigmoid(input, label, num_classes, name=None, bias_attr=None, layer_attr=No ) return LayerOutput(name, LayerType.HSIGMOID, parents=parents) + @wrap_name_default("conv") @wrap_param_attr_default() @wrap_bias_attr_default() @@ -1358,34 +1524,37 @@ def img_conv_layer(input, filter_size, num_filters, input is raw pixels of image(mono or RGB), or it may be the previous layer's num_filters * num_group. - There are several group of filter in paddle - implementation. Each group will process some channel of inputs. For example, - if input num_channel = 256, group = 4, num_filter=32, the paddle will create + There are several group of filter in PaddlePaddle implementation. + Each group will process some channel of the inputs. For example, if an input + num_channel = 256, group = 4, num_filter=32, the PaddlePaddle will create 32*4 = 128 filters to process inputs. The channels will be split into 4 - pieces. First 256/4 = 64 channels will process by first 32 filters. The rest - channels will be processed by rest group of filters. + pieces. First 256/4 = 64 channels will process by first 32 filters. The + rest channels will be processed by rest group of filters. :param name: Layer name. 
:type name: basestring :param input: Layer Input. :type input: LayerOutput - :param filter_size: The x dimension of a filter kernel. - :type filter_size: int - :param filter_size_y: The y dimension of a filter kernel. Since paddle now - support rectangular filters, the filter's shape - will be (filter_size, filter_size_y). - :type filter_size_y: int + :param filter_size: The x dimension of a filter kernel. Or input a tuple for + two image dimension. + :type filter_size: int|tuple|list + :param filter_size_y: The y dimension of a filter kernel. Since PaddlePaddle + currently supports rectangular filters, the filter's + shape will be (filter_size, filter_size_y). + :type filter_size_y: int|None :param num_filters: Each filter group's number of filter :param act: Activation type. Default is tanh :type act: BaseActivation :param groups: Group size of filters. :type groups: int - :param stride: The x dimension of the stride. - :type stride: int + :param stride: The x dimension of the stride. Or input a tuple for two image + dimension. + :type stride: int|tuple|list :param stride_y: The y dimension of the stride. :type stride_y: int - :param padding: The x dimension of the padding. - :type padding: int + :param padding: The x dimension of the padding. Or input a tuple for two + image dimension + :type padding: int|tuple|list :param padding_y: The y dimension of the padding. :type padding_y: int :param bias_attr: Convolution bias attribute. None means default bias. @@ -1400,28 +1569,49 @@ def img_conv_layer(input, filter_size, num_filters, :type shared_biases: bool :param layer_attr: Layer Extra Attribute. :type layer_attr: ExtraLayerAttribute - :return: Layer output. + :return: LayerOutput object. :rtype: LayerOutput """ if num_channels is None: assert input.num_filters is not None num_channels = input.num_filters + if filter_size_y is None: - filter_size_y = filter_size + if isinstance(filter_size, collections.Sequence): + assert len(filter_size) == 2 + filter_size, filter_size_y = filter_size + else: + filter_size_y = filter_size + if stride_y is None: - stride_y = stride + if isinstance(stride, collections.Sequence): + assert len(stride) == 2 + stride, stride_y = stride + else: + stride_y = stride + if padding_y is None: - padding_y = padding - if param_attr.attr.get('initial_smart') == True: # special initial for conv layers. + if isinstance(padding, collections.Sequence): + assert len(padding) == 2 + padding, padding_y = padding + else: + padding_y = padding + + if param_attr.attr.get('initial_smart'): + # special initial for conv layers. init_w = (2.0 / (filter_size ** 2 * num_channels)) ** 0.5 - param_attr = ParameterAttribute(initial_mean=0.0, initial_std=init_w) + param_attr.attr["initial_mean"] = 0.0 + param_attr.attr["initial_std"] = init_w + param_attr.attr["initial_strategy"] = 0 + param_attr.attr["initial_smart"] = False Layer( name=name, inputs=Input(input.name, conv=Conv( filter_size=filter_size, padding=padding, stride=stride, channels=num_channels, groups=groups, - filter_size_y=filter_size_y, padding_y=padding_y, stride_y=stride_y), - **param_attr.attr), + filter_size_y=filter_size_y, padding_y=padding_y, + stride_y=stride_y), + **param_attr.attr), active_type=act.name, num_filters=num_filters, bias=ParamAttr.to_bias(bias_attr), @@ -1464,7 +1654,8 @@ def img_pool_layer(input, pool_size, name=None, :type start: int :param layer_attr: Extra Layer attribute. :type layer_attr: ExtraLayerAttribute - :return: LayerOutput + :return: LayerOutput object. 
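With the tuple form of filter_size, stride and padding that img_conv_layer now accepts above, a rectangular-kernel convolution can be declared as below; the input layer `img` and the concrete numbers are hypothetical:

.. code-block:: python

    conv = img_conv_layer(input=img,
                          filter_size=(3, 5),   # x dimension 3, y dimension 5
                          num_filters=64,
                          num_channels=8,
                          stride=(1, 2),
                          padding=(1, 2),
                          act=ReluActivation())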
+ :rtype: LayerOutput """ if num_channels is None: assert input.num_filters is not None @@ -1480,7 +1671,7 @@ def img_pool_layer(input, pool_size, name=None, type=LayerType.POOL_LAYER, inputs=[Input(input.name, pool=Pool( - pool_type=pool_type.name + '-projection', + pool_type=''.join([pool_type.name, '-projection']), channels=num_channels, size_x=pool_size, start=start, @@ -1514,56 +1705,33 @@ def __img_norm_layer__(name, input, size, norm_type, scale, power, @wrap_name_default("crmnorm") @layer_support() -def img_cmrnorm_layer(input, size, scale, power, name=None, num_channels=None, - blocked=0, layer_attr=None): +def img_cmrnorm_layer(input, size, scale=0.0128, power=0.75, + name=None, num_channels=None, + layer_attr=None): """ - Convolution cross-map-response-normalize layer. - - TODO(yuyang18): Add reference and equations, to explain why cmr is work? + Response normalization across feature maps. + The details please refer to + `Alex's paper `_. :param name: layer name. - :type name: basestring + :type name: None|basestring :param input: layer's input. :type input: LayerOutput - :param size: cross map response size. + :param size: Normalize in number of :math:`size` feature maps. :type size: int - :param scale: TODO(yuyang18) + :param scale: The hyper-parameter. :type scale: float - :param power: TODO(yuyang18) + :param power: The hyper-parameter. :type power: float :param num_channels: input layer's filers number or channels. If num_channels is None, it will be set automatically. - :param blocked: TODO(yuyang18) :param layer_attr: Extra Layer Attribute. :type layer_attr: ExtraLayerAttribute - :return: Layer's output + :return: LayerOutput object. :rtype: LayerOutput """ return __img_norm_layer__(name, input, size, "cmrnorm-projection", scale, - power, num_channels, blocked, layer_attr) - - -@wrap_name_default("rnorm") -@layer_support() -def img_rnorm_layer(input, size, scale, power, name=None, num_channels=None, - layer_attr=None): - """ - TODO(yuyang18): add comments - - TODO(yuyang18): Why it is always not implemented whenever use_gpu or not? - - - :param name: - :param input: - :param size: - :param scale: - :param power: - :param num_channels: - :param layer_attr: - :return: - """ - return __img_norm_layer__(name, input, size, 'rnorm', scale, power, - num_channels, 0, layer_attr) + power, num_channels, 0, layer_attr) @wrap_bias_attr_default() @@ -1609,7 +1777,7 @@ def batch_norm_layer(input, act=None, name=None, num_channels=None, batch_norm for CPU. Otherwise, select batch norm type based on the specified type. If you use cudnn_batch_norm, we suggested you use latest version, such as v5.1. - :type type: None|string, None or "batch_norm" or "cudnn_batch_norm" + :type batch_norm_type: None|string, None or "batch_norm" or "cudnn_batch_norm" :param act: Activation Type. Better be relu. Because batch normalization will normalize input near zero. :type act: BaseActivation @@ -1637,7 +1805,7 @@ def batch_norm_layer(input, act=None, name=None, num_channels=None, :math:`runningMean = newMean*(1-factor) + runningMean*factor` :type moving_average_fraction: float. - :return: Layer's output + :return: LayerOutput object. :rtype: LayerOutput """ if not isinstance(act, ReluActivation): @@ -1701,7 +1869,7 @@ def sum_to_one_norm_layer(input, name=None, layer_attr=None): :type name: basestring :param layer_attr: extra layer attributes. :type layer_attr: ExtraLayerAttribute. - :return: layer name. + :return: LayerOutput object. 
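For img_cmrnorm_layer above, the cited paper damps each response by a sum of squares over `size` neighbouring feature maps. A NumPy sketch of that AlexNet-style scheme, assuming `scale` and `power` play the roles of alpha and beta (the exact constants PaddlePaddle folds in internally may differ):

.. code-block:: python

    import numpy as np

    def cmr_norm_ref(x, size=5, scale=0.0128, power=0.75):
        # x: float array of shape (C, H, W); each map is divided by a term
        # accumulated over up to `size` neighbouring maps, per position.
        channels = x.shape[0]
        half = size // 2
        out = np.empty_like(x)
        for c in range(channels):
            lo, hi = max(0, c - half), min(channels, c + half + 1)
            denom = (1.0 + scale * (x[lo:hi] ** 2).sum(axis=0)) ** power
            out[c] = x[c] / denom
        return out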
:rtype: LayerOutput """ Layer( @@ -1742,11 +1910,13 @@ def addto_layer(input, act=None, name=None, bias_attr=None, inputs. Each input of this layer should be the same size, which is also the output size of this layer. - There is no weight matrix for each input, because it just a simple add operation. - If you want to a complicated operation before add, please use mixed_layer. + There is no weight matrix for each input, because it just a simple add + operation. If you want a complicated operation before add, please use + mixed_layer. It is a very good way to set dropout outside the layers. Since not all - paddle layer support dropout, you can add an add_to layer, set dropout here. + PaddlePaddle layer support dropout, you can add an add_to layer, set + dropout here. Please refer to dropout_layer for details. :param name: Layer name. @@ -1761,14 +1931,14 @@ def addto_layer(input, act=None, name=None, bias_attr=None, :type bias_attr: ParameterAttribute|bool :param layer_attr: Extra Layer attribute. :type layer_attr: ExtraLayerAttribute - :return: layer's output + :return: LayerOutput object. :rtype: LayerOutput """ num_filters = None if isinstance(input, LayerOutput): input = [input] - assert isinstance(input, list) or isinstance(input, tuple) + assert isinstance(input, collections.Sequence) ipts_for_layer = [] for each_input in input: assert isinstance(each_input, LayerOutput) @@ -1782,7 +1952,7 @@ def addto_layer(input, act=None, name=None, bias_attr=None, active_type=act.name, **ExtraLayerAttribute.to_kwargs(layer_attr) ) - assert isinstance(input, list) or isinstance(input, tuple) + return LayerOutput(name, LayerType.ADDTO_LAYER, parents=input, activation=act, num_filters=num_filters) @@ -1798,12 +1968,12 @@ def concat_layer(input, act=None, name=None, layer_attr=None): :param name: Layer name. :type name: basestring :param input: input layers or projections - :type input: list|tuple + :type input: list|tuple|collection.Sequence :param act: Activation type. :type act: BaseActivation :param layer_attr: Extra Layer Attribute. :type layer_attr: ExtraLayerAttribute - :return: layer's output + :return: LayerOutput object. :rtype: LayerOutput """ @@ -1812,10 +1982,10 @@ def concat_layer(input, act=None, name=None, layer_attr=None): elif isinstance(input, Projection): input = [input] else: - assert isinstance(input, list) or isinstance(input, tuple) + assert isinstance(input, collections.Sequence) def __is_type__(o, tp): - if not isinstance(o, list) and not isinstance(o, tuple): + if not isinstance(o, collections.Sequence): if o == tp: return True elif len(o.__bases__) == 0: @@ -1901,7 +2071,7 @@ def memory(name, size, is_seq=False, boot_layer=None, :type boot_bias_active_type: BaseActivation :param boot_with_const_id: boot layer's id. :type boot_with_const_id: int - :return: Memory layer's output + :return: LayerOutput object which is a memory. :rtype: LayerOutput """ if boot_bias_active_type is None: @@ -1993,7 +2163,7 @@ def lstm_step_layer(input, state, size, act=None, :type bias_attr: ParameterAttribute :param layer_attr: layer's extra attribute. :type layer_attr: ExtraLayerAttribute - :return: lstm step's layer output + :return: LayerOutput object. :rtype: LayerOutput """ Layer( @@ -2032,7 +2202,7 @@ def gru_step_layer(input, output_mem, size=None, act=None, :param gate_act: :param bias_attr: :param layer_attr: - :return: + :return: LayerOutput object. 
:rtype: LayerOutput """ assert input.size % 3 == 0 @@ -2061,9 +2231,10 @@ def gru_step_layer(input, output_mem, size=None, act=None, @layer_support() def get_output_layer(input, arg_name, name=None, layer_attr=None): """ - Get layer's output by name. In paddle, a layer might return multiple value, - but return one layer output. If user want to reference another output beside - default output, use get_output_layer first to get another output from input. + Get layer's output by name. In PaddlePaddle, a layer might return multiple + values, but returns one layer's output. If the user wants to use another + output besides the default one, please use get_output_layer first to get + the output from input. :param name: Layer's name. :type name: basestring @@ -2073,7 +2244,7 @@ def get_output_layer(input, arg_name, name=None, layer_attr=None): :param arg_name: Output name from input. :type arg_name: basestring :param layer_attr: Layer's extra attribute. - :return: Layer's output + :return: LayerOutput object. :rtype: LayerOutput """ # GetOutputLayer @@ -2096,28 +2267,51 @@ def get_output_layer(input, arg_name, name=None, layer_attr=None): @wrap_param_attr_default() @layer_support() def recurrent_layer(input, act=None, bias_attr=None, - param_attr=None, name=None, layer_attr=None): + param_attr=None, name=None, reverse=False, layer_attr=None): """ - TODO(yuyang18): Add docs + Simple recurrent unit layer. It is just a fully connect layer through both + time and neural network. - :param input: - :param size: - :param act: - :param bias_attr: - :param param_attr: - :param name: - :param layer_attr: - :return: + For each sequence [start, end] it performs the following computation\: + + .. math:: + + out_{i} = act(in_{i}) \\ \\ \\text{for} \\ i = start \\\\ + out_{i} = act(in_{i} + out_{i-1} * W) \\ \\ \\text{for} \\ start < i <= end + + If reversed is true, the order is reversed\: + + .. math:: + + out_{i} = act(in_{i}) \\ \\ \\text{for} \\ i = end \\\\ + out_{i} = act(in_{i} + out_{i+1} * W) \\ \\ \\text{for} \\ start <= i < end + + + :param input: Input Layer + :type input: LayerOutput + :param act: activation. + :type act: BaseActivation + :param bias_attr: bias attribute. + :type bias_attr: ParameterAttribute + :param param_attr: parameter attribute. + :type param_attr: ParameterAttribute + :param name: name of the layer + :type name: basestring + :param layer_attr: Layer Attribute. + :type layer_attr: ExtraLayerAttribute + :return: LayerOutput object. + :rtype: LayerOutput """ Layer(name=name, type=LayerType.RECURRENT_LAYER, inputs=Input(input.name, **param_attr.attr), active_type=act.name, - size=input.size, bias=ParamAttr.to_bias(bias_attr), + reversed=reverse, **ExtraAttr.to_kwargs(layer_attr)) return LayerOutput(name=name, layer_type=LayerType.RECURRENT_LAYER, - parents=[input], size=input.size, activation=act) + parents=[input], size=input.size, activation=act, + reverse=reverse) class StaticInput(object): @@ -2125,6 +2319,7 @@ class StaticInput(object): StaticInput is only used in recurrent_group which defines a read-only memory that can be a sequence or non-sequence. 
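The recurrence documented for recurrent_layer above transcribes directly to NumPy; a reference sketch of both orders, assuming the input projection has already been applied:

.. code-block:: python

    import numpy as np

    def simple_recurrent_ref(x, w, act=np.tanh, reverse=False):
        # x: shape (T, D), already-projected inputs; w: shape (D, D).
        # Implements out_i = act(in_i + out_{i-1} * W) over one sequence.
        out = np.zeros_like(x)
        prev = np.zeros(x.shape[1])
        steps = reversed(range(x.shape[0])) if reverse else range(x.shape[0])
        for t in steps:
            prev = act(x[t] + np.dot(prev, w))
            out[t] = prev
        return out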
""" + def __init__(self, input, is_seq=False, size=None): assert isinstance(input, LayerOutput) self.input = input @@ -2144,6 +2339,7 @@ class SubsequenceInput(object): input = SubsequenceInput(layer) """ + def __init__(self, input): assert isinstance(input, LayerOutput) assert input.size is not None @@ -2153,7 +2349,11 @@ class SubsequenceInput(object): @wrap_name_default("recurrent_group") def recurrent_group(step, input, reverse=False, name=None): """ - Recurrent Group. It supports time steps and sequence steps mechanisms. + Recurrent layer group is an extremely flexible recurrent unit in + PaddlePaddle. As long as the user defines the calculation done within a + time step, PaddlePaddle will iterate such a recurrent calculation over + sequence input. This is extremely usefull for attention based model, or + Neural Turning Machine like models. The basic usage (time steps) is: @@ -2201,7 +2401,7 @@ def recurrent_group(step, input, reverse=False, name=None): :param reverse: If reverse is set true, the recurrent unit will process the input sequence in a reverse order. :type reverse: bool - :return: Layer output object + :return: LayerOutput object. :rtype: LayerOutput """ model_type('recurrent_nn') @@ -2212,7 +2412,7 @@ def recurrent_group(step, input, reverse=False, name=None): if is_single_input(input): input = [input] - assert isinstance(input, list) or isinstance(input, tuple) + assert isinstance(input, collections.Sequence) def is_in_links(x): return isinstance(x, LayerOutput) or isinstance(x, SubsequenceInput) @@ -2256,6 +2456,7 @@ def recurrent_group(step, input, reverse=False, name=None): for ot in layer_outs: assert isinstance(ot, LayerOutput) + ot.reverse = reverse if contains_sub_seq[0]: RecurrentLayerGroupSetOutLink(Link(ot.name, has_subseq=True)) else: @@ -2268,6 +2469,7 @@ def recurrent_group(step, input, reverse=False, name=None): else: return layer_outs + class BaseGeneratedInput(object): def __init__(self): self.bos_id = None @@ -2296,6 +2498,7 @@ class GeneratedInput(BaseGeneratedInput): return trg_emb def __init__(self, size, embedding_name, embedding_size): + super(GeneratedInput, self).__init__() self.size = size self.embedding_name = embedding_name self.embedding_size = embedding_size @@ -2319,7 +2522,7 @@ def maxid_layer(input, name=None, layer_attr=None): :type name: basestring :param layer_attr: extra layer attributes. :type layer_attr: ExtraLayerAttribute. - :return: layer name. + :return: LayerOutput object. :rtype: LayerOutput """ @@ -2333,6 +2536,41 @@ def maxid_layer(input, name=None, layer_attr=None): parents=[input]) +@wrap_name_default() +def out_prod_layer(input1, input2, name=None, layer_attr=None): + """ + A layer for computing the outer product of two vectors + The result is a matrix of size(input1) x size(input2) + + The example usage is: + + .. code-block:: python + + out_prod = out_prod_layer(input1=vec1, input2=vec2) + + :param name: Layer name. + :type name: basestring + :param input1: The first input layer name. + :type input: LayerOutput + :param input2: The second input layer name. + :type input2: LayerOutput + :param layer_attr: extra layer attributes. + :type layer_attr: ExtraLayerAttribute. + :return: LayerOutput object. 
+ :rtype: LayerOutput + """ + + assert isinstance(input1, LayerOutput) + assert isinstance(input2, LayerOutput) + Layer(name=name, + type="out_prod", + inputs=[input1.name, input2.name], + **ExtraLayerAttribute.to_kwargs(layer_attr)) + return LayerOutput(name=name, + layer_type=LayerType.OUT_PROD_LAYER, + parents=[input1, input2]) + + @wrap_name_default() def eos_layer(input, eos_id, name=None, layer_attr=None): """ @@ -2356,7 +2594,7 @@ def eos_layer(input, eos_id, name=None, layer_attr=None): :type eos_id: int :param layer_attr: extra layer attributes. :type layer_attr: ExtraLayerAttribute. - :return: layer name. + :return: LayerOutput object. :rtype: LayerOutput """ Layer(name=name, @@ -2370,7 +2608,6 @@ def eos_layer(input, eos_id, name=None, layer_attr=None): @wrap_name_default() def beam_search(step, input, bos_id, eos_id, beam_size, - result_file, dict_file="", id_input=None, max_length=500, name=None, num_results_per_sample=None): """ @@ -2384,18 +2621,17 @@ def beam_search(step, input, bos_id, eos_id, beam_size, def rnn_step(input): last_time_step_output = memory(name='rnn', size=512) - with mixed_layer(size=512) as simple_rnn: + with mixed_layer(size=512, name='rnn') as simple_rnn: simple_rnn += full_matrix_projection(input) simple_rnn += last_time_step_output return simple_rnn beam_gen = beam_search(name="decoder", step=rnn_step, - input=[StaticInput("encoder_last")], + input=[StaticInput(encoder_last)], bos_id=0, eos_id=1, - beam_size=5, - result_file="./generated_sequences.txt") + beam_size=5) Please see the following demo for more details: @@ -2405,18 +2641,18 @@ def beam_search(step, input, bos_id, eos_id, beam_size, :param name: Name of the recurrent unit that generates sequences. :type name: base string :param step: A callable function that defines the calculation in a time - step, and it is appled to sequences with arbitrary length by + step, and it is applied to sequences with arbitrary length by sharing a same set of weights. You can refer to the first parameter of recurrent_group, or demo/seqToseq/seqToseq_net.py for more details. :type step: callable :param input: Input data for the recurrent unit - :type input: StaticInput|GeneratedInput + :type input: list :param bos_id: Index of the start symbol in the dictionary. The start symbol is a special token for NLP task, which indicates the beginning of a sequence. In the generation task, the start - symbol is ensential, since it is used to initialize the RNN + symbol is essential, since it is used to initialize the RNN internal state. :type bos_id: int :param eos_id: Index of the end symbol in the dictionary. The end symbol is @@ -2425,30 +2661,20 @@ def beam_search(step, input, bos_id, eos_id, beam_size, symbol is generated, or a pre-defined max iteration number is exceeded. :type eos_id: int + :param max_length: Max generated sequence length. + :type max_length: int :param beam_size: Beam search for sequence generation is an iterative search algorithm. To maintain tractability, every iteration only only stores a predetermined number, called the beam_size, of the most promising next words. The greater the beam size, the fewer candidate words are pruned. :type beam_size: int - :param result_file: Path of the file to store the generated results. - :type result_file: basestring - :param dict_file: Path of dictionary. This is an optional parameter. - Every line is a word in the dictionary with - (line number - 1) as the word index. 
- If this parameter is set to None, or to an empty string, - only word index are printed in the generated results. - :type dict_file: basestring :param num_results_per_sample: Number of the generated results per input sequence. This number must always be less than beam size. :type num_results_per_sample: int - :param id_input: Index of the input sequence, and the specified index will - be prited in the gereated results. This an optional - parameter. - :type id_input: LayerOutput - :return: The seq_text_printer that prints the generated sequence to a file. - :rtype: evaluator + :return: The generated word index. + :rtype: LayerOutput """ if num_results_per_sample is None: @@ -2464,9 +2690,8 @@ def beam_search(step, input, bos_id, eos_id, beam_size, real_input = [] for i, each_input in enumerate(input): - # print type(each_input) - assert isinstance(each_input, StaticInput) or isinstance(each_input, - BaseGeneratedInput) + assert isinstance(each_input, StaticInput) or isinstance( + each_input, BaseGeneratedInput) if isinstance(each_input, BaseGeneratedInput): assert generated_input_index == -1 generated_input_index = i @@ -2500,20 +2725,7 @@ def beam_search(step, input, bos_id, eos_id, beam_size, tmp = recurrent_group(step=__real_step__, input=real_input, reverse=False, name=name) - - if id_input is None: - inputs = [tmp.name] - else: - assert isinstance(id_input, LayerOutput) - inputs = [id_input.name, tmp.name] - tmp.parents.append(id_input) - - Evaluator(name='target_printer', - type='seq_text_printer', - dict_file=dict_file, - result_file=result_file, - inputs=inputs - ) + return tmp @@ -2528,7 +2740,7 @@ def regression_cost(input, label, cost='square_error', name=None): :param input: Network prediction. :param label: Data label. :param cost: Cost method. - :return: layer name. + :return: LayerOutput object. """ Layer(inputs=[Input(input.name), Input(label.name)], type=cost, name=name) return LayerOutput( @@ -2537,9 +2749,11 @@ def regression_cost(input, label, cost='square_error', name=None): @wrap_name_default("cost") +@layer_support() def classification_cost(input, label, name=None, cost="multi-class-cross-entropy", - evaluator=classification_error_evaluator): + evaluator=classification_error_evaluator, + layer_attr=None): """ classification cost Layer. @@ -2552,13 +2766,16 @@ def classification_cost(input, label, name=None, :param cost: cost method. :type cost: basestring :param evaluator: Evaluator method. - :return: layer name. + :param layer_attr: layer's extra attribute. + :type layer_attr: ExtraLayerAttribute + :return: LayerOutput object. 
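Because classification_cost here requires a softmax-activated, non-data input and a data-layer label, the usual pairing is an fc_layer prediction head; a sketch with hypothetical names:

.. code-block:: python

    lbl = data_layer(name='label', size=num_classes)
    prob = fc_layer(input=hidden, size=num_classes,
                    act=SoftmaxActivation())
    cost = classification_cost(input=prob, label=lbl)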
:rtype: LayerOutput """ assert input.layer_type != LayerType.DATA assert isinstance(input.activation, SoftmaxActivation) assert label.layer_type == LayerType.DATA - Layer(name=name, type=cost, inputs=[Input(input.name), Input(label.name)]) + Layer(name=name, type=cost, inputs=[Input(input.name), Input(label.name)], + **ExtraLayerAttribute.to_kwargs(layer_attr)) def __add_evaluator__(e): assert callable(e) @@ -2571,7 +2788,7 @@ def classification_cost(input, label, name=None, e(name=e.__name__, input=input, label=label) - if not isinstance(evaluator, list) and not isinstance(evaluator, tuple): + if not isinstance(evaluator, collections.Sequence): evaluator = [evaluator] for each_evaluator in evaluator: @@ -2579,7 +2796,8 @@ def classification_cost(input, label, name=None, return LayerOutput(name, LayerType.COST, parents=[input, label]) -def conv_operator(input, filter_size, num_filters, + +def conv_operator(img, filter, filter_size, num_filters, num_channel=None, stride=1, padding=0, filter_size_y=None, stride_y=None, padding_y=None): """ @@ -2592,21 +2810,24 @@ def conv_operator(input, filter_size, num_filters, .. code-block:: python - op = conv_operator(input=[layer1, layer2], - filter_size=3.0, + op = conv_operator(img=input1, + filter=input2, + filter_size=3, num_filters=64, num_channels=64) - :param input: Input layer. - :type input: LayerOutput|list|tuple + :param img: input image + :type img: LayerOutput + :param filter: input filter + :type filter: LayerOutput :param filter_size: The x dimension of a filter kernel. :type filter_size: int - :param filter_size_y: The y dimension of a filter kernel. Since paddle now - support rectangular filters, the filter's shape - will be (filter_size, filter_size_y). + :param filter_size_y: The y dimension of a filter kernel. Since + PaddlePaddle now supports rectangular filters, + the filter's shape can be (filter_size, filter_size_y). :type filter_size_y: int - :param num_filter: channel of output data. - :type num_filter: int + :param num_filters: channel of output data. + :type num_filters: int :param num_channel: channel of input data. :type num_channel: int :param stride: The x dimension of the stride. @@ -2620,29 +2841,36 @@ def conv_operator(input, filter_size, num_filters, :return: A ConvOperator Object. :rtype: ConvOperator """ - assert isinstance(input, list) or isinstance(input, tuple) if filter_size_y is None: filter_size_y = filter_size if stride_y is None: stride_y = stride if padding_y is None: padding_y = padding - op = ConvOperator(input_layer_name=[x.name for x in input], - num_filters = num_filter, + + if num_channel is None: + num_channel = img.num_filters + + assert isinstance(filter, LayerOutput) + if filter.size is not None: + filter.size = filter_size * filter_size_y * num_filters * num_channel + + op = ConvOperator(input_layer_names=[img.name, filter.name], + num_filters=num_filters, conv_conf=Conv(filter_size=filter_size, padding=padding, stride=stride, channels=num_channel, filter_size_y=filter_size_y, padding_y=padding_y, - stride_y=stride_y)) - op.origin = input - op.origin.operator = "conv_op" + stride_y=stride_y, + groups=1)) + op.origin = [img, filter] return op @wrap_name_default() -def conv_shift_layer(input, name=None): +def conv_shift_layer(a, b, name=None): """ This layer performs cyclic convolution for two input. For example: - a[in]: contains M elements. 
@@ -2654,136 +2882,106 @@ def conv_shift_layer(input, name=None): c[i] = \sum_{j=-(N-1)/2}^{(N-1)/2}a_{i+j} * b_{j} In this formula: - a's index is computed modulo M. - b's index is computed modulo N. + - a's index is computed modulo M. When it is negative, the item is taken + from the right side (the end of the array). + - b's index is computed modulo N. When it is negative, the item is taken + from the right side (the end of the array). The example usage is: .. code-block:: python - conv_shift = conv_shif_layer(input=[layer1, layer2]) + conv_shift = conv_shift_layer(a=layer1, b=layer2) :param name: layer name :type name: basestring - :param input: Input layer. - :type input: LayerOutput|list|tuple. - :return: a object of LayerOutput. + :param a: Input layer a. + :type a: LayerOutput + :param b: input layer b. + :type b: LayerOutput + :return: LayerOutput object. :rtype: LayerOutput """ - assert isinstance(input, list) or isinstance(input, tuple) + assert isinstance(a, LayerOutput) and isinstance(b, LayerOutput) + assert b.size is None or b.size % 2 == 1 # size of b must be odd. Layer( name=name, type=LayerType.CONV_SHIFT_LAYER, - inputs=[x.name for x in input], + inputs=[a.name, b.name], ) - return LayerOutput(name, LayerType.CONV_SHIFT_LAYER, parents=input) + return LayerOutput(name, LayerType.CONV_SHIFT_LAYER, parents=[a, b], + size=a.size) @wrap_name_default() @wrap_param_attr_default() @wrap_bias_attr_default() +@wrap_act_default(act=LinearActivation()) @layer_support(ERROR_CLIPPING, DROPOUT) -def tensor_layer(input, size, act=None, name=None, +def tensor_layer(a, b, size, act=None, name=None, param_attr=None, bias_attr=None, layer_attr=None): """ This layer performs tensor operations on two inputs. For example, each sample: .. math:: - y_{i} = x_{1} * W_{i} * {x_{2}^\mathrm{T}}, i=0,1,...,K-1 + y_{i} = a * W_{i} * {b^\mathrm{T}}, i=0,1,...,K-1 In this formula: - - :math:`x_{1}`: the first input contains M elements. - - :math:`x_{2}`: the second input contains N elements. + - :math:`a`: the first input contains M elements. + - :math:`b`: the second input contains N elements. - :math:`y_{i}`: the i-th element of y. - :math:`W_{i}`: the i-th learned weight, shape is [M, N] - - :math:`{x_{2}}^\mathrm{T}`: the transpose of :math:`x_{2}`. + - :math:`b^\mathrm{T}`: the transpose of :math:`b`. The simple usage is: .. code-block:: python - tensor = tensor_layer(input=[layer1, layer2]) + tensor = tensor_layer(a=layer1, b=layer2, size=1000) :param name: layer name :type name: basestring - :param input: Input layer. - :type input: LayerOutput|list|tuple. + :param a: Input layer a. + :type a: LayerOutput + :param b: input layer b. + :type b: LayerOutput :param size: the layer dimension. :type size: int. :param act: Activation Type. Default is tanh. :type act: BaseActivation :param param_attr: The Parameter Attribute. - :type param_attr: ParameterAttribute|list + :type param_attr: ParameterAttribute :param bias_attr: The Bias Attribute. If no bias, then pass False or something not type of ParameterAttribute. None will get a default Bias. :type bias_attr: ParameterAttribute|None|Any :param layer_attr: Extra Layer config. :type layer_attr: ExtraLayerAttribute|None - :return: a object of LayerOutput. + :return: LayerOutput object.
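The cyclic convolution defined for conv_shift_layer above, with both indices taken modulo their lengths, reads in NumPy as follows; a transcription of the documented formula, not the layer's implementation:

.. code-block:: python

    import numpy as np

    def conv_shift_ref(a, b):
        # a: shape (M,); b: shape (N,) with N odd; returns shape (M,).
        m, n = a.shape[0], b.shape[0]
        assert n % 2 == 1
        half = (n - 1) // 2
        c = np.zeros(m)
        for i in range(m):
            for j in range(-half, half + 1):
                c[i] += a[(i + j) % m] * b[j % n]
        return c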
:rtype: LayerOutput """ - assert isinstance(input, list) or isinstance(input, tuple) - assert len(input) == 2 + assert isinstance(a, LayerOutput) and isinstance(b, LayerOutput) Layer( name=name, size=size, type=LayerType.TENSOR_LAYER, active_type=act.name, bias=ParamAttr.to_bias(bias_attr), - inputs=[Input(input[0].name, **param_attr), - Input(input[1].name)], + inputs=[Input(a.name, **param_attr.attr), + Input(b.name)], **ExtraLayerAttribute.to_kwargs(layer_attr) ) - return LayerOutput(name, LayerType.TENSOR_LAYER, parents=input, + return LayerOutput(name, LayerType.TENSOR_LAYER, parents=[a, b], activation=act, size=size) -@wrap_param_attr_default() -def trans_full_matrix_projection(input, size=0, param_attr=None): - """ - Different from full_matrix_projection, this projection performs matrix - multiplication, using transpose of weight. - - .. math:: - out.row[i] += in.row[i] * w^\mathrm{T} - - :math:`w^\mathrm{T}` means transpose of weight. - The simply usage is: - - .. code-block:: python - - proj = trans_full_matrix_projection(input=layer, - size=100, - param_attr=ParamAttr( - name='_proj', - initial_mean=0.0, - initial_std=0.01)) - - :param input: input layer - :type input: LayerOutput - :param size: The parameter size. Means the width of parameter. - :type size: int - :param param_attr: Parameter config, None if use default. - :type param_attr: ParameterAttribute - :return: A TransposedFullMatrixProjection Object. - :rtype: TransposedFullMatrixProjection - """ - proj = TransposedFullMatrixProjection(input_layer_name=input.name, - size=size, - **param_attr.attr) - proj.origin = input - proj.origin.projection = "trans_matrix" - return proj - - @wrap_name_default() @wrap_param_attr_default() @wrap_bias_attr_default() @wrap_act_default() -def selective_fc_layer(input, size, act=None, name=None, +def selective_fc_layer(input, select, size, act=None, name=None, pass_generation=False, has_selected_colums=True, mul_ratio=0.02, @@ -2798,12 +2996,15 @@ def selective_fc_layer(input, size, act=None, name=None, .. code-block:: python - sel_fc = selective_fc_layer(input=input, 128, act=TanhActivation()) + sel_fc = selective_fc_layer(input=input, size=128, act=TanhActivation()) :param name: The Layer Name. :type name: basestring :param input: The input layer. :type input: LayerOutput|list|tuple + :param select: The select layer. The output of select layer should be a + sparse binary matrix, and treat as the mask of selective fc. + :type select: LayerOutput :param size: The layer dimension. :type size: int :param act: Activation Type. Default is tanh. @@ -2816,38 +3017,38 @@ def selective_fc_layer(input, size, act=None, name=None, :type bias_attr: ParameterAttribute|None|Any :param layer_attr: Extra Layer config. :type layer_attr: ExtraLayerAttribute|None - :return: a object of LayerOutput. + :return: LayerOutput object. 
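Per sample, the tensor_layer formula above is one bilinear form per output unit; in NumPy/einsum terms (reference semantics only, with the weight tensor `w` hypothetical):

.. code-block:: python

    import numpy as np

    def tensor_layer_ref(a, b, w):
        # a: shape (M,); b: shape (N,); w: shape (K, M, N); returns (K,).
        # y[i] = a . W[i] . b^T for i = 0 .. K-1, as in the docstring.
        return np.einsum('m,kmn,n->k', a, w, b)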
:rtype: LayerOutput """ if isinstance(input, LayerOutput): input = [input] - assert not isinstance(param_attr, list) + assert not isinstance(param_attr, collections.Sequence) param_attr = [param_attr] else: - if isinstance(param_attr, list) or isinstance(param_attr, tuple): + if isinstance(param_attr, collections.Sequence): assert len(input) == len(param_attr) else: param_attr = [copy.deepcopy(param_attr) for _ in range(len(input))] - assert isinstance(input, list) - - def __idx_to_input__(i): - attr = param_attr[i] - assert isinstance(attr, ParameterAttribute) - return Input(input[i].name, **attr.attr) - + assert isinstance(input, collections.Sequence) + assert isinstance(select, LayerOutput) + if select.size is not None: + assert select.size == size Layer( - inputs=map(__idx_to_input__, range(len(input))), + inputs=[Input(ipt.name, **attr.attr) for ipt, attr in zip( + input, param_attr)] + [select.name], name=name, type=LayerType.SEL_FC_LAYER, size=size, + bias=ParameterAttribute.to_bias(bias_attr), active_type=act.name, selective_fc_pass_generation=pass_generation, has_selected_colums=has_selected_colums, selective_fc_full_mul_ratio=mul_ratio, **ExtraLayerAttribute.to_kwargs(layer_attr) ) - return LayerOutput(name, LayerType.SEL_FC_LAYER, input, activation=act, + return LayerOutput(name, LayerType.SEL_FC_LAYER, list(input) + [select], + activation=act, size=size) @@ -2867,7 +3068,7 @@ def sampling_id_layer(input, name=None): :type input: LayerOutput :param name: The Layer Name. :type name: basestring - :return: a object of LayerOutput. + :return: LayerOutput object. :rtype: LayerOutput """ Layer( @@ -2901,7 +3102,7 @@ def slope_intercept_layer(input, name=None, slope=1.0, intercept=0.0): :type slope: float. :param intercept: the offset. :type intercept: float. - :return: a object of LayerOutput. + :return: LayerOutput object. :rtype: LayerOutput """ Layer( @@ -2915,50 +3116,69 @@ def slope_intercept_layer(input, name=None, slope=1.0, intercept=0.0): @wrap_name_default() -def convex_comb_layer(input, size, name=None): +def linear_comb_layer(weights, vectors, size=None, name=None): """ - A layer for convex weighted average of vectors takes two inputs. - - Input: a vector containing the convex weights (batchSize x weightdim), - and a matrix in a vector form (batchSize x (weightdim * datadim)). - - Output: a vector (batchSize * datadim). + A layer for weighted sum of vectors takes two inputs. + - Input: size of weights is M + size of vectors is M*N + - Output: a vector of size=N .. math:: - y[i][j] = \sum_{j}(x_{1}(i, j) * x_{2}(i,j + i * dataDim)), + z(i) = \sum_{j=0}^{M-1} x(j) y(i+Nj) + where :math:`0 \le i \le N-1` + + Or in the matrix notation: - i = 0,1,...,(batchSize-1); j = 0, 1,...,(dataDim-1) + .. math:: + + z = x^\mathrm{T} Y In this formular: - - :math:`x_{1}`: the first input. - - :math:`x_{2}`: the second input. - - :math:`y`: the output. + - :math:`x`: weights + - :math:`y`: vectors. + - :math:`z`: the output. + + Note that the above computation is for one sample. Multiple samples are + processed in one batch. The simple usage is: .. code-block:: python - convex_comb = convex_comb_layer(input=inputs, + linear_comb = linear_comb_layer(weights=weight, vectors=vectors, size=elem_dim) - :param input: The input layers. - :type input: LayerOutput + :param weights: The weight layer. + :type weights: LayerOutput + :param vectors: The vector layer. + :type vectors: LayerOutput :param size: the dimension of this layer. :type size: int :param name: The Layer Name. 
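The matrix form z = x^T Y given for linear_comb_layer above corresponds to the following NumPy sketch (reference semantics only):

.. code-block:: python

    import numpy as np

    def linear_comb_ref(weights, vectors):
        # weights: shape (M,); vectors: shape (M * N,); returns shape (N,).
        m = weights.shape[0]
        y = vectors.reshape(m, -1)   # row j holds y(N*j) .. y(N*j + N - 1)
        return np.dot(weights, y)    # z(i) = sum_j x(j) * y(i + N*j)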
:type name: basestring - :return: a object of LayerOutput. + :return: LayerOutput object. :rtype: LayerOutput """ - - assert isinstance(input, list) or isinstance(input, tuple) - assert len(input) == 2 + assert isinstance(weights, LayerOutput) and isinstance(vectors, LayerOutput) + if vectors.size is not None and weights.size is not None: + assert vectors.size % weights.size == 0 + if size is None: + size = vectors.size / weights.size + else: + assert size == vectors.size / weights.size Layer( name=name, - type=LayerType.CONVEX_COMBINATION_LAYER, + type=LayerType.LINEAR_COMBINATION_LAYER, size=size, - inputs=[Input(input[0].name), Input(input[1].name)], + inputs=[Input(weights.name), Input(vectors.name)], ) - return LayerOutput(name, LayerType.CONVEX_COMBINATION_LAYER, input, size=size) + return LayerOutput(name, LayerType.LINEAR_COMBINATION_LAYER, + [weights, vectors], size=size) + + +convex_comb_layer = linear_comb_layer + @wrap_name_default() def block_expand_layer(input, @@ -3016,32 +3236,44 @@ def block_expand_layer(input, :type padding_y: int :param name: The name of this layer, which can not specify. :type name: None|basestring. - :return: a object of LayerOutput. + :return: LayerOutput object. :rtype: LayerOutput """ Layer(name=name, input=Input(input.name, - block_expand=BlockExpand(channel=channel, + block_expand=BlockExpand(channels=channel, block_x=block_x, block_y=block_y, stride_x=stride_x, stride_y=stride_y, padding_x=padding_x, padding_y=padding_y) - ), + ), type=LayerType.BLOCK_EXPAND, - ) + ) + + return LayerOutput(name, LayerType.BLOCK_EXPAND, parents=[input]) - return LayerOutput(name, LayerType.BLOCK_EXPAND, - parents=[input], size=size) @wrap_name_default() -def ctc_layer(input, label, size, name=None, norm_by_times=False): +def ctc_layer(input, label, size=None, name=None, norm_by_times=False): """ Connectionist Temporal Classification (CTC) is designed for temporal classication task. That is, for sequence labeling problems where the alignment between the inputs and the target labels is unknown. + More details can be found by referring to `Connectionist Temporal + Classification: Labelling Unsegmented Sequence Data with Recurrent + Neural Networks `_ + + Note: + Considering the 'blank' label needed by CTC, you need to use + (num_classes + 1) as the input size. num_classes is the category number. + And the 'blank' is the last category index. So the size of 'input' layer, such as + fc_layer with softmax activation, should be num_classes + 1. The size of ctc_layer + should also be num_classes + 1. + The simple usage: .. code-block:: python @@ -3051,32 +3283,39 @@ def ctc_layer(input, label, size, name=None, norm_by_times=False): size=9055, norm_by_times=True) - :param input: The input layers. + :param input: The input layer. :type input: LayerOutput :param label: The data layer of label with variable length. :type label: LayerOutput - :param size: category numbers. + :param size: category numbers + 1. :type size: int - :param name: The name of this layer, which can not specify. - :type name: string|None + :param name: The name of this layer + :type name: basestring|None :param norm_by_times: Whether to normalization by times. False by default. :type norm_by_times: bool - :return: a object of LayerOutput. + :return: LayerOutput object. 
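The blank-label note above pins down the sizes on both ends: the softmax that feeds ctc_layer and the ctc_layer itself are both num_classes + 1 wide. A sketch reusing the docstring's 9055 (so num_classes = 9054), with `rnn_out` and `label` hypothetical:

.. code-block:: python

    num_classes = 9054   # real categories; index 9054 is the CTC blank
    prob = fc_layer(input=rnn_out, size=num_classes + 1,
                    act=SoftmaxActivation())
    ctc = ctc_layer(input=prob, label=label, size=num_classes + 1)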
:rtype: LayerOutput """ assert isinstance(input, LayerOutput) assert isinstance(label, LayerOutput) + if label.size is not None: + if size is not None: + assert size == label.size + 1 + else: + size = label.size + 1 Layer( - name = name, - type = LayerType.CTC_LAYER, - size = size, - norm_by_times = norm_by_times, - inputs = [input.name, label.name] + name=name, + type=LayerType.CTC_LAYER, + size=size, + norm_by_times=norm_by_times, + inputs=[input.name, label.name] ) return LayerOutput(name, LayerType.CTC_LAYER, [input, label], size=size) + @wrap_name_default() -def crf_layer(input, label, size, weight=None, param_attr=None, name=None): +@wrap_param_attr_default() +def crf_layer(input, label, size=None, weight=None, param_attr=None, name=None): """ A layer for calculating the cost of sequential conditional random field model. @@ -3092,7 +3331,7 @@ def crf_layer(input, label, size, weight=None, param_attr=None, name=None): :param input: The first input layer is the feature. :type input: LayerOutput :param label: The second input layer is label. - :type input: LayerOutput + :type label: LayerOutput :param size: The category number. :type size: int :param weight: The third layer is "weight" of each sample, which is an @@ -3102,30 +3341,38 @@ def crf_layer(input, label, size, weight=None, param_attr=None, name=None): :type param_attr: ParameterAttribute :param name: The name of this layers. It is not necessary. :type name: None|basestring - :return: a object of LayerOutput. + :return: LayerOutput object. :rtype: LayerOutput """ assert isinstance(input, LayerOutput) assert isinstance(label, LayerOutput) assert weight is None or isinstance(weight, LayerOutput) + if input.size is not None and label.size is not None: + assert input.size == label.size + if size is None: + size = input.size + else: + assert size == input.size - ipts = [Input(input.name, **param_attr), + ipts = [Input(input.name, **param_attr.attr), Input(label.name)] if weight is not None: ipts.append(Input(weight.name)) Layer( - name = name, - type = LayerType.CRF_LAYER, - size = size, - inputs = ipts, + name=name, + type=LayerType.CRF_LAYER, + size=size, + inputs=ipts, ) parents = [input, label] if weight is not None: parents.append(weight) return LayerOutput(name, LayerType.CRF_LAYER, parents, size=size) + @wrap_name_default() +@wrap_param_attr_default() def crf_decoding_layer(input, size, label=None, param_attr=None, name=None): """ A layer for calculating the decoding sequence of sequential conditional @@ -3144,36 +3391,40 @@ def crf_decoding_layer(input, size, label=None, param_attr=None, name=None): :type param_attr: ParameterAttribute :param name: The name of this layers. It is not necessary. :type name: None|basestring - :return: a object of LayerOutput. + :return: LayerOutput object. :rtype: LayerOutput """ assert isinstance(input, LayerOutput) assert label is None or isinstance(label, LayerOutput) - ipts = [Input(input.name, **param_attr)] + ipts = [Input(input.name, **param_attr.attr)] if label is not None: ipts.append(Input(label.name)) Layer( - name = name, - type = LayerType.CRF_DECODING_LAYER, - size = size, - inputs = ipts, + name=name, + type=LayerType.CRF_DECODING_LAYER, + size=size, + inputs=ipts, ) parents = [input] if label is not None: parents.append(label) return LayerOutput(name, LayerType.CRF_DECODING_LAYER, parents, size=size) + """ following are cost Layers. 
""" + + @wrap_name_default() -def rank_cost(left, right, lable, weight=None, name=None, coeff=1.0): +def rank_cost(left, right, label, weight=None, name=None, coeff=1.0): """ A cost Layer for learning to rank using gradient descent. Details can refer - to `papers `_. + to `papers `_. This layer contains at least three inputs. The weight is an optional argument, which affects the cost. @@ -3213,7 +3464,7 @@ def rank_cost(left, right, lable, weight=None, name=None, coeff=1.0): :type name: None|basestring :param coeff: The coefficient affects the gradient in the backward. :type coeff: float - :return: a object of LayerOutput. + :return: LayerOutput object. :rtype: LayerOutput """ assert left.size == 1 @@ -3230,12 +3481,13 @@ def rank_cost(left, right, lable, weight=None, name=None, coeff=1.0): type=LayerType.RANK_COST, inputs=ipts, coeff=coeff, - ) + ) return LayerOutput(name, LayerType.RANK_COST, parents=parents) + @wrap_name_default() -def lambda_cost(input, score, NDCG_num=5, max_sort_size=-1, coeff=1.0): +def lambda_cost(input, score, name, NDCG_num=5, max_sort_size=-1): """ lambdaCost for lambdaRank LTR approach. @@ -3248,9 +3500,7 @@ def lambda_cost(input, score, NDCG_num=5, max_sort_size=-1, coeff=1.0): NDCG_num=8, max_sort_size=-1) - :param input: The 1st input. Samples of the same query should be loaded - as sequence. User should provided socres for each sample. - The score should be the 2nd input of this layer. + :param input: Samples of the same query should be loaded as sequence. :type input: LayerOutput :param score: The 2nd input. Score of each sample. :type input: LayerOutput @@ -3262,27 +3512,28 @@ def lambda_cost(input, score, NDCG_num=5, max_sort_size=-1, coeff=1.0): If max_sort_size = -1, then for each list, the algorithm will sort the entire list to get gradient. In other cases, max_sort_size must be greater than or - equal to NDCG_num. And if max_sort_size is greater than - the size of a list, the algorithm will sort the entire - list of get gradient. + equal to NDCG_num. And if max_sort_size is greater + than the size of a list, the algorithm will sort the + entire list of get gradient. :type max_sort_size: int :param name: The name of this layers. It is not necessary. :type name: None|basestring - :param coeff: The coefficient affects the gradient in the backward. - :type coeff: float - :return: a object of LayerOutput. + :return: LayerOutput object. :rtype: LayerOutput """ + assert isinstance(input, LayerOutput) and isinstance(score, LayerOutput) + if score.size is not None: + assert score.size == 1 Layer(name=name, type=LayerType.LAMBDA_COST, inputs=[input.name, score.name], NDCG_num=NDCG_num, - max_sort_size=max_sort_size, - coeff=coeff, - ) + max_sort_size=max_sort_size + ) return LayerOutput(name, LayerType.LAMBDA_COST, parents=[input, score]) + @wrap_name_default() def cross_entropy(input, label, name=None, coeff=1.0): """ @@ -3302,7 +3553,7 @@ def cross_entropy(input, label, name=None, coeff=1.0): :type name: None|basestring. :param coeff: The coefficient affects the gradient in the backward. :type coeff: float. - :return: a object of LayerOutput. + :return: LayerOutput object. :rtype: LayerOutput. 
""" @@ -3310,9 +3561,10 @@ def cross_entropy(input, label, name=None, coeff=1.0): type=LayerType.CROSS_ENTROPY, inputs=[input.name, label.name], coeff=coeff, - ) + ) return LayerOutput(name, LayerType.CROSS_ENTROPY, parents=[input, label]) + @wrap_name_default() def cross_entropy_with_selfnorm(input, label, name=None, coeff=1.0, softmax_selfnorm_alpha=0.1): @@ -3335,7 +3587,7 @@ def cross_entropy_with_selfnorm(input, label, name=None, coeff=1.0, :type coeff: float. :param softmax_selfnorm_alpha: The scale factor affects the cost. :type softmax_selfnorm_alpha: float. - :return: a object of LayerOutput. + :return: LayerOutput object. :rtype: LayerOutput. """ Layer(name=name, @@ -3343,12 +3595,13 @@ def cross_entropy_with_selfnorm(input, label, name=None, coeff=1.0, inputs=[input.name, label.name], coeff=coeff, softmax_selfnorm_alpha=softmax_selfnorm_alpha, - ) + ) return LayerOutput(name, LayerType.CROSS_ENTROPY_WITH_SELFNORM, parents=[input, label]) + @wrap_name_default() def huber_cost(input, label, name=None, coeff=1.0): """ @@ -3362,23 +3615,24 @@ def huber_cost(input, label, name=None, coeff=1.0): :type input: LayerOutput. :param label: The input label. :type input: LayerOutput. - :param type: The type of cost. - :type type: basestring. :param name: The name of this layers. It is not necessary. :type name: None|basestring. :param coeff: The coefficient affects the gradient in the backward. :type coeff: float. - :return: a object of LayerOutput. + :return: LayerOutput object. :rtype: LayerOutput. """ - + assert isinstance(input, LayerOutput) + if input.size is not None: + assert input.size == 1 Layer(name=name, type=LayerType.HUBER, inputs=[input.name, label.name], coeff=coeff, - ) + ) return LayerOutput(name, LayerType.HUBER, parents=[input, label]) + @wrap_name_default() def multi_binary_label_cross_entropy(input, label, name=None, coeff=1.0): """ @@ -3398,19 +3652,20 @@ def multi_binary_label_cross_entropy(input, label, name=None, coeff=1.0): :type name: None|basestring :param coeff: The coefficient affects the gradient in the backward. :type coeff: float - :return: a object of LayerOutput. + :return: LayerOutput object. :rtype: LayerOutput """ - if not isinstance(input.act, SigmoidActivation): + if input.activation is None or \ + not isinstance(input.activation, SigmoidActivation): logger.log(logging.WARN, "%s is not recommend for batch normalization's activation, " - "maybe the relu is better" % act.name) + "maybe the relu is better" % repr(input.activation)) Layer(name=name, type=LayerType.MULTI_BIN_LABEL_CROSS_ENTROPY, inputs=[input.name, label.name], coeff=coeff, - ) + ) return LayerOutput(name, LayerType.MULTI_BIN_LABEL_CROSS_ENTROPY, parents=[input, label]) diff --git a/python/paddle/trainer_config_helpers/networks.py b/python/paddle/trainer_config_helpers/networks.py index 1d0a1d52a9..e59e93acbe 100644 --- a/python/paddle/trainer_config_helpers/networks.py +++ b/python/paddle/trainer_config_helpers/networks.py @@ -466,7 +466,7 @@ def simple_lstm(input, size, name=None, reverse=False, mat_param_attr=None, :type input: LayerOutput :param size: lstm layer size. :type size: int - :param reverse: is lstm reversed + :param reverse: whether to process the input data in a reverse order :type reverse: bool :param mat_param_attr: mixed layer's matrix projection parameter attribute. 
:type mat_param_attr: ParameterAttribute @@ -475,11 +475,11 @@ :type bias_param_attr: ParameterAttribute|False :param inner_param_attr: lstm cell parameter attribute. :type inner_param_attr: ParameterAttribute - :param act: lstm final activate type + :param act: lstm final activation type :type act: BaseActivation - :param gate_act: lstm gate activate type + :param gate_act: lstm gate activation type :type gate_act: BaseActivation - :param state_act: lstm state activate type. + :param state_act: lstm state activation type. :type state_act: BaseActivation :param mixed_layer_attr: mixed layer's extra attribute. :type mixed_layer_attr: ExtraLayerAttribute @@ -503,12 +503,43 @@ @wrap_name_default('lstm_unit') def lstmemory_unit(input, name=None, size=None, param_attr=None, - act=None, gate_act=None, state_act=None, + act=None, gate_act=None, state_act=None, mixed_bias_attr=None, lstm_bias_attr=None, mixed_layer_attr=None, lstm_layer_attr=None, get_output_layer_attr=None): """ - TODO(yuyang18): complete docs + Define calculations that an LSTM unit performs in a single time step. + This function itself is not a recurrent layer, so it cannot be + directly applied to sequence input. This function is always used in + recurrent_group (see layers.py for more details) to implement attention + mechanisms. + + Please refer to **Generating Sequences With Recurrent Neural Networks** + for more details about LSTM. The link goes as follows: + .. _Link: https://arxiv.org/abs/1308.0850 + + .. math:: + + i_t & = \\sigma(W_{xi}x_{t} + W_{hi}h_{t-1} + W_{ci}c_{t-1} + b_i) + + f_t & = \\sigma(W_{xf}x_{t} + W_{hf}h_{t-1} + W_{cf}c_{t-1} + b_f) + + c_t & = f_tc_{t-1} + i_t tanh (W_{xc}x_t+W_{hc}h_{t-1} + b_c) + + o_t & = \\sigma(W_{xo}x_{t} + W_{ho}h_{t-1} + W_{co}c_t + b_o) + + h_t & = o_t tanh(c_t) + + The example usage is: + + .. code-block:: python + + lstm_step = lstmemory_unit(input=[layer1], + size=256, + act=TanhActivation(), + gate_act=SigmoidActivation(), + state_act=TanhActivation()) + :param input: input layer name. :type input: LayerOutput @@ -518,11 +549,11 @@ def lstmemory_unit(input, name=None, size=None, param_attr=None, :type size: int :param param_attr: Parameter config, None if use default. :type param_attr: ParameterAttribute - :param act: lstm final activate type + :param act: lstm final activation type :type act: BaseActivation - :param gate_act: lstm gate activate type + :param gate_act: lstm gate activation type :type gate_act: BaseActivation - :param state_act: lstm state activate type. + :param state_act: lstm state activation type. :type state_act: BaseActivation :param mixed_bias_attr: bias parameter attribute of mixed layer. False means no bias, None means default bias. :type mixed_bias_attr: ParameterAttribute|False @@ -579,7 +610,31 @@ def lstmemory_group(input, size=None, name=None, mixed_layer_attr=None, lstm_layer_attr=None, get_output_layer_attr=None): """ - TODO(yuyang18): complete docs + lstm_group is a recurrent layer group version of Long Short-Term Memory. It + does exactly the same calculation as the lstmemory layer (see lstmemory in + layers.py for the maths). A promising benefit is that LSTM memory + cell states, or hidden states at every time step, are accessible to the + user. This is especially useful in attention models.
If you do not need to + access the internal states of the lstm, but merely use its outputs, + it is recommended to use the lstmemory, which is relatively faster than + lstmemory_group. + + NOTE: In PaddlePaddle's implementation, the following input-to-hidden + multiplications: + :math:`W_{xi}x_{t}` , :math:`W_{xf}x_{t}`, + :math:`W_{xc}x_t`, :math:`W_{xo}x_{t}` are not done in lstmemory_unit to + speed up the calculations. Consequently, an additional mixed_layer with + full_matrix_projection must be included before lstmemory_unit is called. + + The example usage is: + + .. code-block:: python + + lstm_step = lstmemory_group(input=[layer1], + size=256, + act=TanhActivation(), + gate_act=SigmoidActivation(), + state_act=TanhActivation()) :param input: input layer name. :type input: LayerOutput @@ -591,13 +646,13 @@ def lstmemory_group(input, size=None, name=None, :type reverse: bool :param param_attr: Parameter config, None if use default. :type param_attr: ParameterAttribute - :param act: lstm final activate type + :param act: lstm final activation type :type act: BaseActivation - :param gate_act: lstm gate activate type + :param gate_act: lstm gate activation type :type gate_act: BaseActivation - :param state_act: lstm state activate type. + :param state_act: lstm state activation type. :type state_act: BaseActivation - :param mixed_bias_attr: bias parameter attribute of mixed layer. + :param mixed_bias_attr: bias parameter attribute of mixed layer. False means no bias, None means default bias. :type mixed_bias_attr: ParameterAttribute|False :param lstm_bias_attr: bias parameter attribute of lstm layer. @@ -609,7 +664,7 @@ def lstmemory_group(input, size=None, name=None, :type lstm_layer_attr: ExtraLayerAttribute :param get_output_layer_attr: get output layer's extra attribute. :type get_output_layer_attr: ExtraLayerAttribute - :return: lstmemory group name. + :return: the lstmemory group. :rtype: LayerOutput """ @@ -639,16 +694,28 @@ def gru_unit(input, gate_act=None, gru_layer_attr=None): """ + Define calculations that a gated recurrent unit performs in a single time + step. This function itself is not a recurrent layer, so it cannot be + directly applied to sequence input. This function is almost always used in + the recurrent_group (see layers.py for more details) to implement an attention + mechanism. - :param input: + Please see grumemory in layers.py for the details about the maths. + + :param input: input layer name. :type input: LayerOutput - :param name: - :param size: - :param gru_bias_attr: - :param act: - :param gate_act: - :param gru_layer_attr: - :return: + :param name: name of the gru group. + :type name: basestring + :param size: hidden size of the gru. + :type size: int + :param act: type of the activation + :type act: BaseActivation + :param gate_act: type of the gate activation + :type gate_act: BaseActivation + :param gru_layer_attr: Extra parameter attribute of the gru layer. + :type gru_layer_attr: ParameterAttribute|False + :return: the gru output layer. + :rtype: LayerOutput """ assert input.size % 3 == 0 @@ -678,6 +745,46 @@ def gru_group(input, gru_bias_attr=None, act=None, gate_act=None, gru_layer_attr=None): + + """ + gru_group is a recurrent layer group version of the Gated Recurrent Unit. It + does exactly the same calculation as the grumemory layer does. A promising + benefit is that the gru hidden states are accessible to the user. This is + especially useful in attention models.
If you do not need to access + any internal state, but merely use the outputs of a GRU, it is recommended + to use the grumemory, which is relatively faster. + + Please see grumemory in layers.py for more details about the maths. + + The example usage is: + + .. code-block:: python + + gru = gru_group(input=[layer1], + size=256, + act=TanhActivation(), + gate_act=SigmoidActivation()) + + :param input: input layer name. + :type input: LayerOutput + :param name: name of the gru group. + :type name: basestring + :param size: hidden size of the gru. + :type size: int + :param reverse: whether to process the input data in a reverse order + :type reverse: bool + :param act: type of the activation + :type act: BaseActivation + :param gate_act: type of the gate activation + :type gate_act: BaseActivation + :param gru_bias_attr: bias. False means no bias, None means default bias. + :type gru_bias_attr: ParameterAttribute|False + :param gru_layer_attr: Extra parameter attribute of the gru layer. + :type gru_layer_attr: ParameterAttribute|False + :return: the gru group. + :rtype: LayerOutput + """ + def __gru_step__(ipt): return gru_unit( input=ipt, @@ -708,6 +815,43 @@ def simple_gru(input, gate_act=None, gru_layer_attr=None ): + """ + simple_gru is also a recurrent layer group version of the Gated Recurrent Unit, + just like gru_group. The difference only lies in implementation details. + As for computational speed, grumemory is relatively faster than + gru_group, and gru_group is relatively faster than simple_gru. + + simple_gru does exactly the same calculation as the grumemory layer does. + Please see grumemory in layers.py for more details about the maths. + + The example usage is: + + .. code-block:: python + + gru = simple_gru(input=[layer1], + size=256, + act=TanhActivation(), + gate_act=SigmoidActivation()) + + :param input: input layer name. + :type input: LayerOutput + :param name: name of the gru group. + :type name: basestring + :param size: hidden size of the gru. + :type size: int + :param reverse: whether to process the input data in a reverse order + :type reverse: bool + :param act: type of the activation + :type act: BaseActivation + :param gate_act: type of the gate activation + :type gate_act: BaseActivation + :param gru_bias_attr: bias. False means no bias, None means default bias. + :type gru_bias_attr: ParameterAttribute|False + :param gru_layer_attr: Extra parameter attribute of the gru layer. + :type gru_layer_attr: ParameterAttribute|False + :return: the gru group. + :rtype: LayerOutput + """ with mixed_layer(name='%s_transform' % name, size=size * 3, bias_attr=mixed_bias_param_attr, @@ -739,7 +883,22 @@ def bidirectional_lstm(input, size, name=None, return_seq=False, last_seq_attr=None, first_seq_attr=None, concat_attr=None, concat_act=None): """ - TODO(yuyang18): Complete docs + A bidirectional_lstm is a recurrent unit that iterates over the input + sequence both in forward and backward orders, and then concatenates the two + outputs to form a final output. However, concatenation of the two outputs + is not the only way to form the final output; you can also, for example, + just add them together. + + Please refer to **Neural Machine Translation by Jointly Learning to Align + and Translate** for more details about the bidirectional lstm. + The link goes as follows: + .. _Link: https://arxiv.org/pdf/1409.0473v3.pdf + + The example usage is: + + .. code-block:: python + + bi_lstm = bidirectional_lstm(input=[input1], size=512) :param name: bidirectional lstm layer name.
:type name: basestring @@ -747,8 +906,11 @@ def bidirectional_lstm(input, size, name=None, return_seq=False, :type input: LayerOutput :param size: lstm layer size. :type size: int - :param return_seq: If False, concat word in last time step and return. - If True, concat sequnce in all time step and return. + :param return_seq: If set to False, outputs of the last time step are + concatenated and returned. + If set to True, the entire output sequences that are + processed in forward and backward directions are + concatenated and returned. :type return_seq: bool :return: lstm layer name. :rtype: LayerOutput @@ -890,7 +1052,7 @@ def dropout_layer(input, dropout_rate, name=None): layer_attr=ExtraAttr(drop_rate=dropout_rate)) -def outputs(layers): +def outputs(layers, *args): """ Declare the end of network. Currently it will only calculate the input/output order of network. It will calculate the predict network or @@ -927,9 +1089,12 @@ def outputs(layers): if isinstance(layers, LayerOutput): layers = [layers] + if len(args) != 0: + layers.extend(args) + assert len(layers) > 0 if len(layers) != 1: - logger.warning("EndOfNetwork routine try to calculate network's" + logger.warning("`outputs` routine tries to calculate the network's" " inputs and outputs order. It might not work well." "Please see follow log carefully." inputs = [] @@ -959,12 +1124,13 @@ def outputs(layers): logger.info( "".join(["The input order is [", ", ".join(final_inputs), "]"]) ) + + if len(final_outputs) == 0: + final_outputs = map(lambda x: x.name, layers) + logger.info( "".join(["The output order is [", ", ".join(final_outputs), "]" ])) Inputs(*final_inputs) - if len(final_outputs) != 0: - Outputs(*final_outputs) - else: - Outputs(*map(lambda x: x.name, layers)) + Outputs(*final_outputs) diff --git a/python/paddle/trainer_config_helpers/optimizers.py b/python/paddle/trainer_config_helpers/optimizers.py index f0e51c3de5..4660a6b500 100644 --- a/python/paddle/trainer_config_helpers/optimizers.py +++ b/python/paddle/trainer_config_helpers/optimizers.py @@ -64,14 +64,6 @@ class BaseSGDOptimizer(Optimizer): w = w - \\eta \\nabla Q(w) = w - \\eta \\sum_{i}^{n} \\nabla Q_i(w) where :math:`\\eta` is learning rate. And :math:`n` is batch size. - - The SGD method is implemented by paddle with multiple extensions. Such as - momentum, adagrad, rmsprop, adam. Please use method 'use_xxx', such as - use_adam, to enhance the SGD method. - - WARNING: IN PADDLE'S IMPLEMENTATION, BATCH_SIZE IS SET FOR ONE COMPUTE - PROCESS(NODE). IF YOU USE MULTIPLE MACHINE TO TRAIN YOUR NETWORK, THE GLOBAL - BATCH SIZE WILL BE (BATCH_SIZE * MACHINE_COUNT). """ def to_setting_kwargs(self): @@ -79,16 +71,41 @@ class BaseSGDOptimizer(Optimizer): class MomentumOptimizer(BaseSGDOptimizer): + """ + MomentumOptimizer. + + When sparse=True, the update scheme is: + + .. math:: + + \\alpha_t &= \\alpha_{t-1} / k \\\\ + \\beta_t &= \\beta_{t-1} / (1 + \\lambda \\gamma_t) \\\\ + u_t &= u_{t-1} - \\alpha_t \\gamma_t g_t \\\\ + v_t &= v_{t-1} + \\tau_{t-1} \\alpha_t \\gamma_t g_t \\\\ + \\tau_t &= \\tau_{t-1} + \\beta_t / \\alpha_t + + where :math:`k` is the momentum, :math:`\\lambda` is the decay rate, and + :math:`\\gamma_t` is the learning rate at the t'th step. + + :param sparse: whether to use the sparse update scheme or not.
+ :type sparse: bool + """ def extra_settings(self): default_momentum(self.momentum) def to_setting_kwargs(self): - return { - 'learning_method': 'momentum' - } + if self.sparse: + return { + 'learning_method': 'sparse_momentum' + } + else: + return { + 'learning_method': 'momentum' + } - def __init__(self, momentum=1e-3): + def __init__(self, momentum=None, sparse=False): self.momentum = momentum + self.sparse = sparse class AdamOptimizer(BaseSGDOptimizer): @@ -352,17 +369,35 @@ def settings(batch_size, gradient_clipping_threshold=None ): """ - TODO(yuyang18): Complete docs. - - - :param batch_size: - :param learning_rate: - :param learning_method: - :param regularization: - :param is_async: - :param model_average: - :param gradient_clipping_threshold: - :return: + Set the optimization method, learning rate, batch size, and other training + settings. The currently supported algorithms are SGD and Async-SGD. + + .. warning:: + + Note that the 'batch_size' in PaddlePaddle is not equal to the global + training batch size. It represents the single training process's batch + size. If you use N processes to train one model, for example on three + GPU machines, the global batch size is N*'batch_size'. + + :param batch_size: batch size for one training process. + :type batch_size: int + :param learning_rate: learning rate for SGD + :type learning_rate: float + :param learning_method: The extended optimization algorithm of gradient + descent, such as momentum, adagrad, rmsprop, etc. + Note that it should be an instance of a subclass + of BaseSGDOptimizer. + :type learning_method: BaseSGDOptimizer + :param regularization: The regularization method. + :type regularization: BaseRegularization + :param is_async: Is Async-SGD or not. Default value is False. + :type is_async: bool + :param model_average: Model Average Settings. + :type model_average: ModelAverage + :param gradient_clipping_threshold: gradient clipping threshold. If a + gradient value is larger than this + threshold, it will be clipped. + :type gradient_clipping_threshold: float """ if isinstance(regularization, BaseRegularization): regularization = [regularization] diff --git a/python/paddle/trainer_config_helpers/poolings.py b/python/paddle/trainer_config_helpers/poolings.py index 5e06d82005..d627daab0c 100644 --- a/python/paddle/trainer_config_helpers/poolings.py +++ b/python/paddle/trainer_config_helpers/poolings.py @@ -47,9 +47,14 @@ class MaxPooling(BasePoolingType): .. math:: max(samples\\_of\\_a\\_sequence) + + :param output_max_index: True to output the index of the max value in the + sequence instead of the max value itself. None + means use the default value in the proto.
+ :type output_max_index: bool|None """ - def __init__(self): + def __init__(self, output_max_index=None): BasePoolingType.__init__(self, "max") + self.output_max_index = output_max_index class AvgPooling(BasePoolingType): diff --git a/python/paddle/trainer_config_helpers/tests/CMakeLists.txt b/python/paddle/trainer_config_helpers/tests/CMakeLists.txt new file mode 100644 index 0000000000..cf52b06bfe --- /dev/null +++ b/python/paddle/trainer_config_helpers/tests/CMakeLists.txt @@ -0,0 +1,10 @@ +#################### test_config_parser ######################### +add_test(NAME layers_test + COMMAND ${PROJ_ROOT}/paddle/.set_python_path.sh -d ${PROJ_ROOT}/python/ + python ${PROJ_ROOT}/python/paddle/trainer_config_helpers/tests/layers_test.py + WORKING_DIRECTORY ${PROJ_ROOT}/python/paddle) + +add_test(NAME test_layerHelpers + COMMAND + ${PROJ_ROOT}/python/paddle/trainer_config_helpers/tests/configs/run_tests.sh +) diff --git a/python/paddle/trainer_config_helpers/tests/configs/.gitignore b/python/paddle/trainer_config_helpers/tests/configs/.gitignore new file mode 100644 index 0000000000..52378fe7a4 --- /dev/null +++ b/python/paddle/trainer_config_helpers/tests/configs/.gitignore @@ -0,0 +1 @@ +*protostr diff --git a/python/paddle/trainer_config_helpers/tests/configs/check.md5 b/python/paddle/trainer_config_helpers/tests/configs/check.md5 new file mode 100644 index 0000000000..29928b6f7b --- /dev/null +++ b/python/paddle/trainer_config_helpers/tests/configs/check.md5 @@ -0,0 +1,17 @@ +7e6919d17562516e9a1d9a88de1fb3b9 img_layers.protostr +a5d9259ff1fd7ca23d0ef090052cb1f2 last_first_seq.protostr +9c038249ec8ff719753a746cdb04c026 layer_activations.protostr +5913f87b39cee3b2701fa158270aca26 projections.protostr +6b39e34beea8dfb782bee9bd3dea9eb5 simple_rnn_layers.protostr +0fc1409600f1a3301da994ab9d28b0bf test_cost_layers.protostr +144bc6d3a509de74115fa623741797ed test_expand_layer.protostr +2378518bdb71e8c6e888b1842923df58 test_fc.protostr +8bb44e1e5072d0c261572307e7672bda test_grumemory_layer.protostr +1f3510672dce7a9ed25317fc58579ac7 test_hsigmoid.protostr +d350bd91a0dc13e854b1364c3d9339c6 test_lstmemory_layer.protostr +251a948ba41c1071afcd3d9cf9c233f7 test_ntm_layers.protostr +e6ff04e70aea27c7b06d808cc49c9497 test_print_layer.protostr +2a75dd33b640c49a8821c2da6e574577 test_rnn_group.protostr +67d6fde3afb54f389d0ce4ff14726fe1 test_sequence_pooling.protostr +f586a548ef4350ba1ed47a81859a64cb unused_layers.protostr +8122477f4f65244580cec09edc590041 util_layers.protostr diff --git a/python/paddle/trainer_config_helpers/tests/configs/generate_protostr.sh b/python/paddle/trainer_config_helpers/tests/configs/generate_protostr.sh new file mode 100755 index 0000000000..fc2acbd41e --- /dev/null +++ b/python/paddle/trainer_config_helpers/tests/configs/generate_protostr.sh @@ -0,0 +1,18 @@ +#!/bin/bash + +set -e +cd `dirname $0` +export PYTHONPATH=$PWD/../../../../ + +configs=(test_fc layer_activations projections test_print_layer +test_sequence_pooling test_lstmemory_layer test_grumemory_layer +last_first_seq test_expand_layer test_ntm_layers test_hsigmoid +img_layers util_layers simple_rnn_layers unused_layers test_cost_layers +test_rnn_group) + + +for conf in ${configs[*]} +do + echo "Generating " $conf + python -m paddle.utils.dump_config $conf.py > $conf.protostr +done diff --git a/python/paddle/trainer_config_helpers/tests/configs/img_layers.py b/python/paddle/trainer_config_helpers/tests/configs/img_layers.py new file mode 100644 index 0000000000..6c8ba8be84 --- /dev/null +++ 
b/python/paddle/trainer_config_helpers/tests/configs/img_layers.py @@ -0,0 +1,20 @@ +from paddle.trainer_config_helpers import * + +settings( + learning_rate=1e-3, + batch_size=1000 +) + +img = data_layer(name='image', size=256*256) + +img_conv = img_conv_layer(input=img, num_channels=1, num_filters=64, + filter_size=(32, 64), padding=(1, 0), stride=(1, 1), + act=LinearActivation()) +img_bn = batch_norm_layer(input=img_conv, act=ReluActivation()) + +img_norm = img_cmrnorm_layer(input=img_bn, size=32) + +img_pool = img_pool_layer(input=img_conv, pool_size=32, pool_type=MaxPooling()) + + +outputs(img_pool, img_norm) \ No newline at end of file diff --git a/python/paddle/trainer_config_helpers/tests/configs/last_first_seq.py b/python/paddle/trainer_config_helpers/tests/configs/last_first_seq.py new file mode 100644 index 0000000000..d54a1c49fd --- /dev/null +++ b/python/paddle/trainer_config_helpers/tests/configs/last_first_seq.py @@ -0,0 +1,26 @@ +from paddle.trainer_config_helpers import * + +settings( + batch_size=1000, + learning_rate=1e-5 +) + +din = data_layer(name='data', size=30) + +seq_op = [ + first_seq, + last_seq +] + +agg_level = [ + AggregateLevel.EACH_SEQUENCE, + AggregateLevel.EACH_TIMESTEP +] + +opts = [] + +for op in seq_op: + for al in agg_level: + opts.append(op(input=din, agg_level=al)) + +outputs(opts) \ No newline at end of file diff --git a/python/paddle/trainer_config_helpers/tests/configs/layer_activations.py b/python/paddle/trainer_config_helpers/tests/configs/layer_activations.py new file mode 100644 index 0000000000..ba10dc78e1 --- /dev/null +++ b/python/paddle/trainer_config_helpers/tests/configs/layer_activations.py @@ -0,0 +1,21 @@ +''' +Test all activations. +''' + +from paddle.trainer_config_helpers import * + +settings( + learning_rate=1e-4, + batch_size=1000 +) + +din = data_layer(name='input', size=100) + +acts = [ + TanhActivation, SigmoidActivation, SoftmaxActivation, IdentityActivation, + LinearActivation, ExpActivation, ReluActivation, BReluActivation, + SoftReluActivation, STanhActivation, AbsActivation, SquareActivation] + +outputs( + [fc_layer(input=din, size=100, act=act(), name="layer_%d" % i) for i, act in + enumerate(acts)]) diff --git a/python/paddle/trainer_config_helpers/tests/configs/projections.py b/python/paddle/trainer_config_helpers/tests/configs/projections.py new file mode 100644 index 0000000000..4066c5bc6e --- /dev/null +++ b/python/paddle/trainer_config_helpers/tests/configs/projections.py @@ -0,0 +1,47 @@ +''' +Test mixed layer, projections and operators. 
+''' +from paddle.trainer_config_helpers import * + +settings( + batch_size=1000, + learning_rate=1e-4 +) + +din = data_layer(name='test', size=100) + +din = embedding_layer(input=din, size=256) + +with mixed_layer(size=100) as m1: + m1 += full_matrix_projection(input=din) + +with mixed_layer(size=100) as m2: + m2 += table_projection(input=m1) + +with mixed_layer(size=100) as m3: + m3 += identity_projection(input=m2) + +with mixed_layer(size=100) as m4: + m4 += dotmul_projection(input=m3) + +with mixed_layer() as m5: + m5 += context_projection(input=m4, context_len=3) + +with mixed_layer() as m6: + m6 += dotmul_operator(a=m3, b=m4) + +img = data_layer(name='img', size=32*32) +flt = data_layer(name='filter', size=3*3*1*64) + +with mixed_layer() as m7: + m7 += conv_operator(img=img, filter=flt, num_filters=64, + num_channel=1, filter_size=3) + +end = mixed_layer(input=[full_matrix_projection(input=m5), + trans_full_matrix_projection(input=m6), + full_matrix_projection(input=m7)], + size=100, + layer_attr=ExtraAttr(drop_rate=0.5, + error_clipping_threshold=40)) + +outputs(end) diff --git a/python/paddle/trainer_config_helpers/tests/configs/run_tests.sh b/python/paddle/trainer_config_helpers/tests/configs/run_tests.sh new file mode 100755 index 0000000000..78114ce32b --- /dev/null +++ b/python/paddle/trainer_config_helpers/tests/configs/run_tests.sh @@ -0,0 +1,5 @@ +#!/bin/bash +cd `dirname $0` +set -e +./generate_protostr.sh +md5sum -c check.md5 diff --git a/python/paddle/trainer_config_helpers/tests/configs/simple_rnn_layers.py b/python/paddle/trainer_config_helpers/tests/configs/simple_rnn_layers.py new file mode 100644 index 0000000000..87c2a85cf9 --- /dev/null +++ b/python/paddle/trainer_config_helpers/tests/configs/simple_rnn_layers.py @@ -0,0 +1,36 @@ +from paddle.trainer_config_helpers import * + +settings( + batch_size=1000, + learning_rate=1e-4 +) + +din = data_layer(name='data', size=200) + +hidden = fc_layer(input=din, size=200, act=SigmoidActivation()) + +rnn = recurrent_layer(input=hidden, act=SigmoidActivation()) + +rnn2 = recurrent_layer(input=hidden, act=SigmoidActivation(), reverse=True) + +lstm1_param = fc_layer(input=hidden, size=200*4, act=LinearActivation(), + bias_attr=False) + +lstm1 = lstmemory(input=lstm1_param, act=SigmoidActivation()) + +lstm2_param = fc_layer(input=hidden, size=200*4, act=LinearActivation(), + bias_attr=False) + +lstm2 = lstmemory(input=lstm2_param, act=SigmoidActivation(), reverse=True) + +gru1_param = fc_layer(input=hidden, size=200*3, act=LinearActivation(), + bias_attr=False) +gru1 = grumemory(input=gru1_param, act=SigmoidActivation()) + +gru2_param = fc_layer(input=hidden, size=200*3, act=LinearActivation(), + bias_attr=False) +gru2 = grumemory(input=gru2_param, act=SigmoidActivation(), reverse=True) + +outputs(last_seq(input=rnn), first_seq(input=rnn2), + last_seq(input=lstm1), first_seq(input=lstm2), + last_seq(input=gru1), first_seq(gru2)) diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_cost_layers.py b/python/paddle/trainer_config_helpers/tests/configs/test_cost_layers.py new file mode 100644 index 0000000000..64b45f4ded --- /dev/null +++ b/python/paddle/trainer_config_helpers/tests/configs/test_cost_layers.py @@ -0,0 +1,26 @@ +from paddle.trainer_config_helpers import * + +settings( + learning_rate=1e-4, + batch_size=1000 +) + +seq_in = data_layer(name='input', size=200) +labels = data_layer(name='labels', size=5000) + +probs = data_layer(name='probs', size=10) +xe_label = data_layer(name='xe-label', size=10) + 
+outputs(ctc_layer(input=seq_in, label=labels), + crf_layer(input=fc_layer(input=seq_in, size=4), + label=data_layer(name='crf_label', size=4)), + rank_cost(left=data_layer(name='left', size=1), + right=data_layer(name='right', size=1), + label=data_layer(name='label', size=1)), + lambda_cost(input=data_layer(name='list_feature', size=100), + score=data_layer(name='list_scores', size=1)), + cross_entropy(input=probs, label=xe_label), + cross_entropy_with_selfnorm(input=probs, label=xe_label), + huber_cost(input=data_layer(name='huber_probs', size=1), + label=data_layer(name='huber_label', size=1)), + multi_binary_label_cross_entropy(input=probs, label=xe_label)) diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_expand_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_expand_layer.py new file mode 100644 index 0000000000..d9c841ab27 --- /dev/null +++ b/python/paddle/trainer_config_helpers/tests/configs/test_expand_layer.py @@ -0,0 +1,14 @@ +from paddle.trainer_config_helpers import * + +settings( + batch_size=1000, + learning_rate=1e-5 +) + +din = data_layer(name='data', size=30) +data_seq = data_layer(name='data_seq', size=30) + +outputs(expand_layer(input=din, expand_as=data_seq, + expand_level=ExpandLevel.FROM_SEQUENCE), + expand_layer(input=din, expand_as=data_seq, + expand_level=ExpandLevel.FROM_TIMESTEP)) diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_fc.py b/python/paddle/trainer_config_helpers/tests/configs/test_fc.py new file mode 100644 index 0000000000..a6d033f291 --- /dev/null +++ b/python/paddle/trainer_config_helpers/tests/configs/test_fc.py @@ -0,0 +1,20 @@ +from paddle.trainer_config_helpers import * + +settings( + batch_size=1000, + learning_rate=1e-5 +) + +din = data_layer(name='data', size=100) + +trans = trans_layer(input=din) + +hidden = fc_layer(input=trans, size=100, + bias_attr=False) + +mask = data_layer(name='mask', size=100) + +hidden_sel = selective_fc_layer(input=din, select=mask, size=100, + act=SigmoidActivation()) + +outputs(hidden, hidden_sel) diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_grumemory_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_grumemory_layer.py new file mode 100644 index 0000000000..8d9fd9df51 --- /dev/null +++ b/python/paddle/trainer_config_helpers/tests/configs/test_grumemory_layer.py @@ -0,0 +1,11 @@ +from paddle.trainer_config_helpers import * + +settings( + batch_size=1000, + learning_rate=1e-4 +) + +din = data_layer(name='data', size=120) + +outputs(grumemory(input=din, size=40, reverse=True, gate_act=TanhActivation(), + act=SigmoidActivation())) diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_hsigmoid.py b/python/paddle/trainer_config_helpers/tests/configs/test_hsigmoid.py new file mode 100644 index 0000000000..46069074de --- /dev/null +++ b/python/paddle/trainer_config_helpers/tests/configs/test_hsigmoid.py @@ -0,0 +1,11 @@ +from paddle.trainer_config_helpers import * + +settings( + learning_rate=1e-4, + batch_size=1000 +) + +din = data_layer(name='data', size=100) +label = data_layer(name='label', size=10) + +outputs(hsigmoid(input=din, label=label, num_classes=10)) \ No newline at end of file diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_lstmemory_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_lstmemory_layer.py new file mode 100644 index 0000000000..56304addb1 --- /dev/null +++ b/python/paddle/trainer_config_helpers/tests/configs/test_lstmemory_layer.py @@ -0,0 +1,11 
@@ +from paddle.trainer_config_helpers import * + +settings( + batch_size=1000, + learning_rate=1e-5 +) + +din = data_layer(name='data', size=128) + +outputs(lstmemory(input=din, reverse=True, gate_act=TanhActivation(), + act=TanhActivation(), size=32)) diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_ntm_layers.py b/python/paddle/trainer_config_helpers/tests/configs/test_ntm_layers.py new file mode 100644 index 0000000000..4d8e1fdc6b --- /dev/null +++ b/python/paddle/trainer_config_helpers/tests/configs/test_ntm_layers.py @@ -0,0 +1,23 @@ +from paddle.trainer_config_helpers import * + +settings( + batch_size=1000, + learning_rate=1e-5 +) + +weight = data_layer(name='w', size=1) +a = data_layer(name='a', size=100) +b = data_layer(name='b', size=100) +c = data_layer(name='c', size=200) +d = data_layer(name='d', size=31) + +outputs(interpolation_layer(input=[a, b], weight=weight), + power_layer(input=a, weight=weight), + scaling_layer(input=a, weight=weight), + cos_sim(a=a, b=b), + cos_sim(a=a, b=c, size=2), + sum_to_one_norm_layer(input=a), + conv_shift_layer(a=a, b=d), + tensor_layer(a=a, b=b, size=1000), + slope_intercept_layer(input=a, slope=0.7, intercept=0.9), + linear_comb_layer(weights=b, vectors=c)) diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_print_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_print_layer.py new file mode 100644 index 0000000000..f6b2661c7b --- /dev/null +++ b/python/paddle/trainer_config_helpers/tests/configs/test_print_layer.py @@ -0,0 +1,12 @@ +from paddle.trainer_config_helpers import * + +settings( + learning_rate=1e-4, + batch_size=1000 +) + +din = data_layer(name='input', size=100) + +print_layer(input=din) + +outputs(din) diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_rnn_group.py b/python/paddle/trainer_config_helpers/tests/configs/test_rnn_group.py new file mode 100644 index 0000000000..53f5c5d249 --- /dev/null +++ b/python/paddle/trainer_config_helpers/tests/configs/test_rnn_group.py @@ -0,0 +1,35 @@ +from paddle.trainer_config_helpers import * + +settings( + learning_rate=1e-4, + batch_size=1000 +) + +seq = data_layer(name='seq_input', size=100) +sub_seq = data_layer(name='sub_seq_input', size=100) +lbl = data_layer(name='label', size=1) + + +def generate_rnn_simple(name): + def rnn_simple(s): + m = memory(name=name, size=200) + fc = fc_layer(input=[s, m], size=200, name=name) + return fc + + return rnn_simple + + +with mixed_layer() as lstm_param: # test lstm unit, rnn group + lstm_param += full_matrix_projection(input=seq, size=100 * 4) + +with mixed_layer() as gru_param: + gru_param += full_matrix_projection(input=seq, size=100 * 3) + +outputs(last_seq(input=recurrent_group(step=generate_rnn_simple('rnn_forward'), + input=seq)), + first_seq(input=recurrent_group(step=generate_rnn_simple('rnn_back'), + input=seq, reverse=True)), + last_seq(input=recurrent_group(step=generate_rnn_simple( + 'rnn_subseq_forward'), input=SubsequenceInput(input=sub_seq))), + last_seq(input=lstmemory_group(input=lstm_param, size=100)), + last_seq(input=gru_group(input=gru_param, size=100))) diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_sequence_pooling.py b/python/paddle/trainer_config_helpers/tests/configs/test_sequence_pooling.py new file mode 100644 index 0000000000..2e24164b55 --- /dev/null +++ b/python/paddle/trainer_config_helpers/tests/configs/test_sequence_pooling.py @@ -0,0 +1,30 @@ +from paddle.trainer_config_helpers import * + +settings( + 
learning_rate=1e-4, + batch_size=1000 +) + +din = data_layer(name='dat_in', size=100) + +POOL_TYPE = [ + MaxPooling, + AvgPooling, + SumPooling +] + +AGG_LEVEL = [ + AggregateLevel.EACH_SEQUENCE, + AggregateLevel.EACH_TIMESTEP +] + +opts = [] + +for pt in POOL_TYPE: + for al in AGG_LEVEL: + opts.append(pooling_layer(input=din, agg_level=al, pooling_type=pt())) + +opts.append(pooling_layer(input=din, + pooling_type=MaxPooling(output_max_index=True))) + +outputs(opts) diff --git a/python/paddle/trainer_config_helpers/tests/configs/unused_layers.py b/python/paddle/trainer_config_helpers/tests/configs/unused_layers.py new file mode 100644 index 0000000000..a6a3d09a43 --- /dev/null +++ b/python/paddle/trainer_config_helpers/tests/configs/unused_layers.py @@ -0,0 +1,14 @@ +from paddle.trainer_config_helpers import * +settings( + batch_size=1000, + learning_rate=1e-4 +) + +probs = data_layer(name='probs', size=100) + +outputs( + sampling_id_layer(input=probs), # It seems it does not support training + + # It seems this layer is not correct, and should be rewritten. + # block_expand_layer(input=probs, channel=1, block_x=1, block_y=3), +) \ No newline at end of file diff --git a/python/paddle/trainer_config_helpers/tests/configs/util_layers.py b/python/paddle/trainer_config_helpers/tests/configs/util_layers.py new file mode 100644 index 0000000000..aadb3f3f5e --- /dev/null +++ b/python/paddle/trainer_config_helpers/tests/configs/util_layers.py @@ -0,0 +1,15 @@ +from paddle.trainer_config_helpers import * + +settings(learning_rate=1e-4, batch_size=1000) + +a = data_layer(name='a', size=10) +b = data_layer(name='b', size=10) + +result = addto_layer(input=[a, b]) +concat1 = concat_layer(input=[a, b]) +concat2 = concat_layer(input=[ + identity_projection(input=a), + identity_projection(input=b) +]) + +outputs(result, concat1, concat2) \ No newline at end of file diff --git a/python/paddle/trainer_config_helpers/tests/layers_test.py b/python/paddle/trainer_config_helpers/tests/layers_test.py new file mode 100644 index 0000000000..3b55667354 --- /dev/null +++ b/python/paddle/trainer_config_helpers/tests/layers_test.py @@ -0,0 +1,19 @@ +# Copyright (c) 2016 Baidu, Inc. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from paddle.trainer.config_parser import parse_config_and_serialize + +if __name__ == '__main__': + parse_config_and_serialize( 'trainer_config_helpers/tests/layers_test_config.py', '') diff --git a/python/paddle/trainer_config_helpers/tests/layers_test_config.py b/python/paddle/trainer_config_helpers/tests/layers_test_config.py new file mode 100644 index 0000000000..faaab9107d --- /dev/null +++ b/python/paddle/trainer_config_helpers/tests/layers_test_config.py @@ -0,0 +1,78 @@ +# Copyright (c) 2016 Baidu, Inc. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from paddle.trainer_config_helpers import * + +num_classes = 5 + +x = data_layer(name="input1", size=3) +y = data_layer(name="input2", size=5) + +z = out_prod_layer(input1=x, input2=y) + +x1 = fc_layer(input=x, size=5) +y1 = fc_layer(input=y, size=5) + +z1 = mixed_layer(act=LinearActivation(), + input=[conv_operator(img=x1, + filter=y1, + filter_size=1, + num_filters=5, + num_channel=5, + stride=1)]) + +y2 = fc_layer(input=y, size=15) + +cos1 = cos_sim(a=x1, b=y1) +cos3 = cos_sim(a=x1, b=y2, size=3) + +linear_comb = linear_comb_layer(weights=x1, vectors=y2, size=3) + +out = fc_layer(input=[cos1, cos3, linear_comb, z, z1], + size=num_classes, + act=SoftmaxActivation()) + +print_layer(input=[out]) + +outputs(classification_cost(out, data_layer(name="label", size=num_classes))) + +dotmul = mixed_layer(input=[dotmul_operator(a=x1, b=x1), + dotmul_projection(input=y1)]) + +proj_with_attr_init = mixed_layer(input=full_matrix_projection(input=y1, + param_attr=ParamAttr(learning_rate=0, + initial_mean=0, + initial_std=0)), + bias_attr=ParamAttr(initial_mean=0, initial_std=0, learning_rate=0), + act=LinearActivation(), + size=5, + name='proj_with_attr_init') + + +# for ctc +tmp = fc_layer(input=[x1, dotmul, proj_with_attr_init], + size=num_classes + 1, + act=SoftmaxActivation()) +ctc = ctc_layer(input=tmp, + label=y, + size=num_classes + 1) +ctc_eval = ctc_error_evaluator(input=tmp, label=y) + +settings( + batch_size=10, + learning_rate=2e-3, + learning_method=AdamOptimizer(), + regularization=L2Regularization(8e-4), + gradient_clipping_threshold=25 +) diff --git a/python/setup.py.in b/python/setup.py.in index 3f906ed46f..d2fb95f27f 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -8,7 +8,7 @@ packages=['paddle', 'paddle.trainer_config_helpers', 'paddle.utils'] -if len(INTERNAL_PACKAGE) == 0: +if len(INTERNAL_PACKAGE) != 0: packages.append(INTERNAL_PACKAGE) setup(name='paddle',