From 2da240c7ec776b44ffe6e06fa551fbff960c3b18 Mon Sep 17 00:00:00 2001
From: Helin Wang <helinwang@baidu.com>
Date: Mon, 14 Aug 2017 15:13:23 -0700
Subject: [PATCH 01/23] fix local recordio reader

---
 python/paddle/v2/reader/creator.py            |  12 ++++++----
 python/paddle/v2/reader/tests/creator_test.py |  22 ++++++++++++++++++
 .../v2/reader/tests/test_reader_recordio.dat  | Bin 0 -> 76 bytes
 3 files changed, 30 insertions(+), 4 deletions(-)
 create mode 100644 python/paddle/v2/reader/tests/test_reader_recordio.dat

diff --git a/python/paddle/v2/reader/creator.py b/python/paddle/v2/reader/creator.py
index d0f18e4b66..97e844b92c 100644
--- a/python/paddle/v2/reader/creator.py
+++ b/python/paddle/v2/reader/creator.py
@@ -57,7 +57,7 @@ def text_file(path):
     return reader
 
 
-def recordio_local(paths, buf_size=100):
+def recordio(paths, buf_size=100):
     """
     Creates a data reader from given RecordIO file paths separated by ",",
         glob pattern is supported.
@@ -67,15 +67,19 @@ def recordio_local(paths, buf_size=100):
 
     import recordio as rec
     import paddle.v2.reader.decorator as dec
+    import cPickle as pickle
 
     def reader():
-        a = ','.join(paths)
-        f = rec.reader(a)
+        if isinstance(paths, basestring):
+            path = paths
+        else:
+            path = ",".join(paths)
+        f = rec.reader(path)
         while True:
             r = f.read()
             if r is None:
                 break
-            yield r
+            yield pickle.loads(r)
         f.close()
 
     return dec.buffered(reader, buf_size)
diff --git a/python/paddle/v2/reader/tests/creator_test.py b/python/paddle/v2/reader/tests/creator_test.py
index 359f3eeefb..cf190aa664 100644
--- a/python/paddle/v2/reader/tests/creator_test.py
+++ b/python/paddle/v2/reader/tests/creator_test.py
@@ -34,5 +34,27 @@ class TestTextFile(unittest.TestCase):
             self.assertEqual(e, str(idx * 2) + " " + str(idx * 2 + 1))
 
 
+class TestRecordIO(unittest.TestCase):
+    def do_test(self, path):
+        reader = paddle.v2.reader.creator.recordio(path)
+        idx = 0
+        for e in reader():
+            if idx == 0:
+                self.assertEqual(e, (1, 2, 3))
+            elif idx == 1:
+                self.assertEqual(e, (4, 5, 6))
+            idx += 1
+        self.assertEqual(idx, 2)
+
+    def test_recordIO(self):
+        self.do_test(
+            os.path.join(
+                os.path.dirname(__file__), "test_reader_recordio.dat"))
+        self.do_test([
+            os.path.join(
+                os.path.dirname(__file__), "test_reader_recordio.dat")
+        ])
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/v2/reader/tests/test_reader_recordio.dat b/python/paddle/v2/reader/tests/test_reader_recordio.dat
new file mode 100644
index 0000000000000000000000000000000000000000..a99a35bb829e066c4845d0b85b96cd1eb3a12491
GIT binary patch
literal 76
zcmZQ!W@4P2Bs!asfq}sSh?#)+KN|x>v0q|9K_sIV14Bftj}1RiRKwGd%hQO<)0nHI
Tz>rH1B4onlY0Bkk1`z@P(}N7c

literal 0
HcmV?d00001


From c3bda2acf854c75a13fa96fe2cd7511d17a28f0f Mon Sep 17 00:00:00 2001
From: Helin Wang <helinwang@baidu.com>
Date: Mon, 14 Aug 2017 15:16:41 -0700
Subject: [PATCH 02/23] Add recordio as paddle's dependency.

---
 Dockerfile              | 1 +
 python/requirements.txt | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/Dockerfile b/Dockerfile
index 41b6729124..ea2a00d6cd 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -64,6 +64,7 @@ RUN pip install --upgrade pip && \
     pip install -U sphinx-rtd-theme==0.1.9 recommonmark && \
     pip install pre-commit 'requests==2.9.2' 'ipython==5.3.0' && \
     pip install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
+    pip install 'recordio>=0.1.0' && \
     pip install opencv-python rarfile 'scipy>=0.19.0' 'nltk>=3.2.2'
 
 # To fix https://github.com/PaddlePaddle/Paddle/issues/1954, we use
diff --git a/python/requirements.txt b/python/requirements.txt
index 3df822bd76..e19453c25d 100644
--- a/python/requirements.txt
+++ b/python/requirements.txt
@@ -1,7 +1,7 @@
 requests==2.9.2
 numpy>=1.12
 protobuf==3.1
-recordio
+recordio>=0.1.0
 matplotlib
 rarfile
 scipy>=0.19.0

From d1cda3331646806afd07c18d4ef9f6a6b88ce72b Mon Sep 17 00:00:00 2001
From: Luo Tao <luotao02@baidu.com>
Date: Tue, 15 Aug 2017 11:03:17 +0800
Subject: [PATCH 03/23] build documentation don't need install Paddle before

---
 paddle/scripts/docker/build.sh | 12 ++----------
 1 file changed, 2 insertions(+), 10 deletions(-)

diff --git a/paddle/scripts/docker/build.sh b/paddle/scripts/docker/build.sh
index 2f0205b770..a382d4368c 100644
--- a/paddle/scripts/docker/build.sh
+++ b/paddle/scripts/docker/build.sh
@@ -82,10 +82,6 @@ EOF
 fi
 
 
-# To build documentation, we need to run cmake again after installing
-# PaddlePaddle.  This awkwardness is due to
-# https://github.com/PaddlePaddle/Paddle/issues/1854.  It also
-# describes a solution.
 if [[ ${WITH_DOC:-OFF} == "ON" ]]; then
     cat <<EOF
 ========================================
@@ -93,11 +89,6 @@ Building documentation ...
    In /paddle/build_doc
 ========================================
 EOF
-    # build documentation need install Paddle before
-    make install -j `nproc`
-    pip install /usr/local/opt/paddle/share/wheels/*.whl
-    paddle version
-
     mkdir -p /paddle/build_doc
     pushd /paddle/build_doc
     cmake .. \
@@ -106,7 +97,8 @@ EOF
           -DWITH_AVX=${WITH_AVX:-ON} \
           -DWITH_SWIG_PY=ON \
           -DWITH_STYLE_CHECK=OFF
-    make paddle_docs paddle_docs_cn
+    make -j `nproc` gen_proto_py
+    make -j `nproc` paddle_docs paddle_docs_cn
     popd
 fi
 

From 72be3b920a949ae86089b9be2d9d6346212e6d81 Mon Sep 17 00:00:00 2001
From: Luo Tao <luotao02@baidu.com>
Date: Tue, 15 Aug 2017 12:16:06 +0800
Subject: [PATCH 04/23] add install infomation in build.sh

---
 paddle/scripts/docker/build.sh | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/paddle/scripts/docker/build.sh b/paddle/scripts/docker/build.sh
index a382d4368c..6c2f5fed40 100644
--- a/paddle/scripts/docker/build.sh
+++ b/paddle/scripts/docker/build.sh
@@ -174,3 +174,7 @@ ADD go/cmd/master/master /usr/bin/
 # default command shows the paddle version and exit
 CMD ["paddle", "version"]
 EOF
+
+set +xe
+printf "If you need to install PaddlePaddle in develop docker image,"
+printf "please make install or pip install build/python/dist/*.whl.\n"

From 95fe318e3ee19004419eb5aff09bca7ddaacad46 Mon Sep 17 00:00:00 2001
From: qijun <qijun1994@hotmail.com>
Date: Tue, 15 Aug 2017 14:08:20 +0800
Subject: [PATCH 05/23] init

---
 Dockerfile                        | 14 ------
 cmake/flags.cmake                 |  7 ---
 paddle/platform/CMakeLists.txt    |  2 +-
 paddle/platform/device_context.cc | 79 +++++++++++++++++++++++++------
 paddle/platform/device_context.h  | 12 +++--
 5 files changed, 74 insertions(+), 40 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index da00471025..98f61ba586 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -71,20 +71,6 @@ RUN pip install -r /root/requirements.txt
 RUN apt-get install -y libssl-dev libffi-dev
 RUN pip install certifi urllib3[secure]
 
-# TODO(qijun) The template library Eigen doesn't work well with GCC 5 
-# coming with the default Docker image, so we switch to use GCC 4.8 
-# by default. And I will check Eigen library later.
-
-RUN ln -sf gcc-4.8 /usr/bin/gcc && \
-    ln -sf gcc-ar-4.8 /usr/bin/gcc-ar && \
-    ln -sf gcc-nm-4.8 /usr/bin/gcc-nm && \
-    ln -sf gcc-ranlib-4.8 /usr/bin/gcc-ranlib && \
-    ln -sf gcc-4.8 /usr/bin/x86_64-linux-gnu-gcc && \
-    ln -sf gcc-ar-4.8 /usr/bin/x86_64-linux-gnu-gcc-ar && \
-    ln -sf gcc-nm-4.8 /usr/bin/x86_64-linux-gnu-gcc-nm && \
-    ln -sf gcc-ranlib-4.8 /usr/bin/x86_64-linux-gnu-gcc-ranlib && \
-    ln -sf g++-4.8 /usr/bin/g++ && \
-    ln -sf g++-4.8 /usr/bin/x86_64-linux-gnu-g++ 
 
 # Install woboq_codebrowser to /woboq
 RUN git clone https://github.com/woboq/woboq_codebrowser /woboq && \
diff --git a/cmake/flags.cmake b/cmake/flags.cmake
index b27eb71550..47bb83b00a 100644
--- a/cmake/flags.cmake
+++ b/cmake/flags.cmake
@@ -9,13 +9,6 @@ function(CheckCompilerCXX11Flag)
         if(${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 4.8)
             message(FATAL_ERROR "Unsupported GCC version. GCC >= 4.8 required.")
         endif()
-        if(NOT ANDROID)
-            # TODO(qijun) gcc 4.9 or later versions raise SEGV due to the optimization problem.
-            # Use Debug mode instead for now.
-            if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 4.9 OR CMAKE_CXX_COMPILER_VERSION VERSION_EQUAL 4.9)
-                set(CMAKE_BUILD_TYPE "Debug" CACHE STRING "" FORCE)
-            endif()
-        endif()
     elseif(CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
         # cmake >= 3.0 compiler id "AppleClang" on Mac OS X, otherwise "Clang"
         # Apple Clang is a different compiler than upstream Clang which havs different version numbers.
diff --git a/paddle/platform/CMakeLists.txt b/paddle/platform/CMakeLists.txt
index 4154aad15c..c1ad60d160 100644
--- a/paddle/platform/CMakeLists.txt
+++ b/paddle/platform/CMakeLists.txt
@@ -16,5 +16,5 @@ ELSE()
     set(GPU_CTX_DEPS)
 ENDIF()
 
-cc_library(device_context SRCS device_context.cc DEPS place eigen3 ${GPU_CTX_DEPS})
+cc_library(device_context SRCS device_context.cc DEPS memory place eigen3 ${GPU_CTX_DEPS})
 nv_test(device_context_test SRCS device_context_test.cc DEPS device_context gpu_info)
diff --git a/paddle/platform/device_context.cc b/paddle/platform/device_context.cc
index a928e09778..dc345bdd57 100644
--- a/paddle/platform/device_context.cc
+++ b/paddle/platform/device_context.cc
@@ -10,6 +10,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/platform/device_context.h"
+#include "paddle/memory/memory.h"
 
 namespace paddle {
 namespace platform {
@@ -36,6 +37,59 @@ Place CPUDeviceContext::GetPlace() const { return CPUPlace(); }
 
 #ifndef PADDLE_ONLY_CPU
 
+class EigenCudaStreamDevice : public Eigen::StreamInterface {
+ public:
+  EigenCudaStreamDevice() : scratch_(nullptr), semaphore_(nullptr) {
+    Eigen::initializeDeviceProp();
+  }
+  ~EigenCudaStreamDevice() override {}
+
+  void Reinitialize(const cudaStream_t* cuda_stream, GPUPlace place) {
+    stream_ = cuda_stream;
+    place_ = place;
+    device_prop_ = &Eigen::m_deviceProperties[place.device];
+  }
+
+  const cudaStream_t& stream() const override { return *stream_; }
+
+  const cudaDeviceProp& deviceProperties() const override {
+    return *device_prop_;
+  }
+
+  void* allocate(size_t num_bytes) const override {
+    paddle::memory::Alloc(place_, num_bytes);
+  }
+
+  void deallocate(void* buffer) const override {
+    paddle::memory::Free(place_, buffer);
+  }
+
+  void* scratchpad() const override {
+    if (scratch_ == NULL) {
+      scratch_ = allocate(Eigen::kCudaScratchSize + sizeof(unsigned int));
+    }
+    return scratch_;
+  }
+
+  unsigned int* semaphore() const override {
+    if (semaphore_ == NULL) {
+      char* scratch =
+          static_cast<char*>(scratchpad()) + Eigen::kCudaScratchSize;
+      semaphore_ = reinterpret_cast<unsigned int*>(scratch);
+      PADDLE_ENFORCE(
+          cudaMemsetAsync(semaphore_, 0, sizeof(unsigned int), *stream_));
+    }
+    return semaphore_;
+  }
+
+ private:
+  GPUPlace place_;
+  const cudaStream_t* stream_;         // not owned;
+  const cudaDeviceProp* device_prop_;  // not owned;
+  mutable char* scratch_;
+  mutable unsigned int* semaphore_;
+};
+
 template <>
 Eigen::GpuDevice* DeviceContext::get_eigen_device<Eigen::GpuDevice>() const {
   return reinterpret_cast<const CUDADeviceContext*>(this)->eigen_device();
@@ -43,19 +97,9 @@ Eigen::GpuDevice* DeviceContext::get_eigen_device<Eigen::GpuDevice>() const {
 
 CUDADeviceContext::CUDADeviceContext(GPUPlace place) : place_(place) {
   SetDeviceId(place_.device);
-  // TODO(qijun) Pass a created cuda stream to Eigen::CudaStreamDevice directly
-  // here will cause segment fault. We must implement a class derived from
-  // Eigen::StreamInterface, and reinitialize it with a cuda stream and a gpu id
-  // later. Please refer to the implementation of class EigenCudaStreamDevice
-  // in TensorFlow.
-  //
-  // We find that CUDA 7 introduces a new option, the per-thread default stream,
-  // that has two effects. Please refer to https://devblogs.nvidia.com/
-  // parallelforall/gpu-pro-tip-cuda-7-streams-simplify-concurrency/
-  //
-  // So, we decide to use default stream and add –default-stream per-thread nvcc
-  // flag. Than, two threads with two CUDADeviceContexts will run parallelly.
-  eigen_stream_.reset(new Eigen::CudaStreamDevice());
+  PADDLE_ENFORCE(cudaStreamCreate(&stream_));
+  eigen_stream_.reset(new EigenCudaStreamDevice());
+  eigen_stream_->Reinitialize(&stream_, place);
   eigen_device_.reset(new Eigen::GpuDevice(eigen_stream_.get()));
 }
 
@@ -75,12 +119,13 @@ CUDADeviceContext::~CUDADeviceContext() {
   }
   eigen_stream_.reset();
   eigen_device_.reset();
+  PADDLE_ENFORCE(cudaStreamDestroy(stream_));
 }
 
 Place CUDADeviceContext::GetPlace() const { return place_; }
 
 void CUDADeviceContext::Wait() const {
-  PADDLE_ENFORCE(cudaStreamSynchronize(0));
+  PADDLE_ENFORCE(cudaStreamSynchronize(stream_));
 }
 
 Eigen::GpuDevice* CUDADeviceContext::eigen_device() const {
@@ -91,6 +136,7 @@ cublasHandle_t CUDADeviceContext::cublas_handle() {
   if (!cublas_handle_) {
     SetDeviceId(place_.device);
     PADDLE_ENFORCE(dynload::cublasCreate(&cublas_handle_));
+    PADDLE_ENFORCE(dynload::cublasSetStream(cublas_handle_, stream_));
   }
   return cublas_handle_;
 }
@@ -99,10 +145,13 @@ cudnnHandle_t CUDADeviceContext::cudnn_handle() {
   if (!cudnn_handle_) {
     SetDeviceId(place_.device);
     PADDLE_ENFORCE(dynload::cudnnCreate(&cudnn_handle_));
+    PADDLE_ENFORCE(dynload::cudnnSetStream(cudnnHandle_t, stream_));
   }
   return cudnn_handle_;
 }
 
+cudaStream_t CUDADeviceContext::stream() { return stream_; }
+
 curandGenerator_t CUDADeviceContext::curand_generator() {
   if (!curand_generator_) {
     SetDeviceId(place_.device);
@@ -110,6 +159,8 @@ curandGenerator_t CUDADeviceContext::curand_generator() {
                                                   CURAND_RNG_PSEUDO_DEFAULT));
     PADDLE_ENFORCE(
         dynload::curandSetPseudoRandomGeneratorSeed(curand_generator_, seed_));
+
+    PADDLE_ENFORCE(dynload::curandSetStream(curandGenerator_t, stream_));
   }
   return curand_generator_;
 }
diff --git a/paddle/platform/device_context.h b/paddle/platform/device_context.h
index 08b5b2cff9..b68e177c0a 100644
--- a/paddle/platform/device_context.h
+++ b/paddle/platform/device_context.h
@@ -76,6 +76,9 @@ class CUDADeviceContext : public DeviceContext {
 
   /*! \brief  Return curand handle in the device context. */
   curandGenerator_t curand_generator();
+
+  /*! \brief  Return cuda stream in the device context. */
+  cudaStream_t      stream();
   // clang-format on
 
  private:
@@ -83,15 +86,16 @@ class CUDADeviceContext : public DeviceContext {
 
  private:
   std::unique_ptr<Eigen::GpuDevice> eigen_device_;
-  std::unique_ptr<Eigen::CudaStreamDevice> eigen_stream_;
+  std::unique_ptr<EigenCudaStreamDevice> eigen_stream_;
 
  private:
   uint64_t seed_;
 
   // clang-format off
-  cudnnHandle_t     cudnn_handle_     = nullptr;
-  cublasHandle_t    cublas_handle_    = nullptr;
-  curandGenerator_t curand_generator_ = nullptr;
+  cudaStream_t       stream_{nullptr}
+  cudnnHandle_t      cudnn_handle_{nullptr};
+  cublasHandle_t     cublas_handle_{nullptr};
+  curandGenerator_t  curand_generator_{nullptr};
   // clang-format on
 };
 

From 962cb25c9d4b29ae4e61bfa891faaead5e140633 Mon Sep 17 00:00:00 2001
From: tensor-tang <jian.j.tang@intel.com>
Date: Tue, 15 Aug 2017 14:10:15 +0800
Subject: [PATCH 06/23] fix crash when disable WITH_SWIG_PY

---
 python/CMakeLists.txt | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt
index d2f064bea0..7bd6d59b00 100644
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -50,8 +50,11 @@ add_custom_command(OUTPUT ${PADDLE_PYTHON_BUILD_DIR}/.timestamp
     COMMAND ${CMAKE_COMMAND} -E copy_directory ${PADDLE_PYTHON_BUILD_DIR}/lib* ${PADDLE_PYTHON_BUILD_DIR}/lib-python
     DEPENDS gen_proto_py copy_paddle_pybind framework_py_proto ${PY_FILES} ${external_project_dependencies} ${COPY_PADDLE_MASTER})
 
-add_custom_target(paddle_python ALL DEPENDS
-    ${PADDLE_PYTHON_BUILD_DIR}/.timestamp paddle_pserver_main paddle_trainer paddle_merge_model python_api_wheel ${MKL_DEPENDS})
+set(paddle_python_deps ${PADDLE_PYTHON_BUILD_DIR}/.timestamp paddle_pserver_main paddle_trainer paddle_merge_model ${MKL_DEPENDS})
+if(WITH_SWIG_PY)
+    list(APPEND paddle_python_deps python_api_wheel)
+endif()
+add_custom_target(paddle_python ALL DEPENDS ${paddle_python_deps})
 
 set(PADDLE_PYTHON_PACKAGE_DIR ${CMAKE_CURRENT_BINARY_DIR}/dist/)
 

From f168843e47df6cee8a81a30408ba4c2d092893fa Mon Sep 17 00:00:00 2001
From: qijun <qijun1994@hotmail.com>
Date: Tue, 15 Aug 2017 06:59:05 +0000
Subject: [PATCH 07/23] fix gpu build error

---
 paddle/memory/CMakeLists.txt      | 2 +-
 paddle/platform/CMakeLists.txt    | 5 ++++-
 paddle/platform/device_context.cc | 8 ++++----
 paddle/platform/device_context.h  | 3 ++-
 4 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/paddle/memory/CMakeLists.txt b/paddle/memory/CMakeLists.txt
index 8035d93bfe..9cc4233e43 100644
--- a/paddle/memory/CMakeLists.txt
+++ b/paddle/memory/CMakeLists.txt
@@ -1,7 +1,7 @@
 add_subdirectory(detail)
 
 cc_library(memory SRCS memory.cc)
-cc_library(memcpy SRCS memcpy.cc DEPS device_context)
+cc_library(memcpy SRCS memcpy.cc)
 
 cc_library(paddle_memory
     DEPS
diff --git a/paddle/platform/CMakeLists.txt b/paddle/platform/CMakeLists.txt
index c1ad60d160..acfc063973 100644
--- a/paddle/platform/CMakeLists.txt
+++ b/paddle/platform/CMakeLists.txt
@@ -16,5 +16,8 @@ ELSE()
     set(GPU_CTX_DEPS)
 ENDIF()
 
-cc_library(device_context SRCS device_context.cc DEPS memory place eigen3 ${GPU_CTX_DEPS})
+# memcpy deoends on device_context, here add deps individually for
+# avoiding cycle dependencies
+cc_library(device_context SRCS device_context.cc DEPS memory buddy_allocator
+    system_allocator memory_block meta_data meta_cache place eigen3 ${GPU_CTX_DEPS})
 nv_test(device_context_test SRCS device_context_test.cc DEPS device_context gpu_info)
diff --git a/paddle/platform/device_context.cc b/paddle/platform/device_context.cc
index dc345bdd57..f92c15ae45 100644
--- a/paddle/platform/device_context.cc
+++ b/paddle/platform/device_context.cc
@@ -57,7 +57,7 @@ class EigenCudaStreamDevice : public Eigen::StreamInterface {
   }
 
   void* allocate(size_t num_bytes) const override {
-    paddle::memory::Alloc(place_, num_bytes);
+    return paddle::memory::Alloc(place_, num_bytes);
   }
 
   void deallocate(void* buffer) const override {
@@ -86,7 +86,7 @@ class EigenCudaStreamDevice : public Eigen::StreamInterface {
   GPUPlace place_;
   const cudaStream_t* stream_;         // not owned;
   const cudaDeviceProp* device_prop_;  // not owned;
-  mutable char* scratch_;
+  mutable void* scratch_;
   mutable unsigned int* semaphore_;
 };
 
@@ -145,7 +145,7 @@ cudnnHandle_t CUDADeviceContext::cudnn_handle() {
   if (!cudnn_handle_) {
     SetDeviceId(place_.device);
     PADDLE_ENFORCE(dynload::cudnnCreate(&cudnn_handle_));
-    PADDLE_ENFORCE(dynload::cudnnSetStream(cudnnHandle_t, stream_));
+    PADDLE_ENFORCE(dynload::cudnnSetStream(cudnn_handle_, stream_));
   }
   return cudnn_handle_;
 }
@@ -160,7 +160,7 @@ curandGenerator_t CUDADeviceContext::curand_generator() {
     PADDLE_ENFORCE(
         dynload::curandSetPseudoRandomGeneratorSeed(curand_generator_, seed_));
 
-    PADDLE_ENFORCE(dynload::curandSetStream(curandGenerator_t, stream_));
+    PADDLE_ENFORCE(dynload::curandSetStream(curand_generator_, stream_));
   }
   return curand_generator_;
 }
diff --git a/paddle/platform/device_context.h b/paddle/platform/device_context.h
index b68e177c0a..c5042ae33e 100644
--- a/paddle/platform/device_context.h
+++ b/paddle/platform/device_context.h
@@ -52,6 +52,7 @@ class CPUDeviceContext : public DeviceContext {
 };
 
 #ifndef PADDLE_ONLY_CPU
+class EigenCudaStreamDevice;
 
 class CUDADeviceContext : public DeviceContext {
  public:
@@ -92,7 +93,7 @@ class CUDADeviceContext : public DeviceContext {
   uint64_t seed_;
 
   // clang-format off
-  cudaStream_t       stream_{nullptr}
+  cudaStream_t       stream_{nullptr};
   cudnnHandle_t      cudnn_handle_{nullptr};
   cublasHandle_t     cublas_handle_{nullptr};
   curandGenerator_t  curand_generator_{nullptr};

From 2403045cbd57eb837d5ab82e2acc66767c1d3224 Mon Sep 17 00:00:00 2001
From: qijun <qijun1994@hotmail.com>
Date: Tue, 15 Aug 2017 07:03:54 +0000
Subject: [PATCH 08/23] refine device_context_test

---
 paddle/platform/device_context_test.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/paddle/platform/device_context_test.cc b/paddle/platform/device_context_test.cc
index 65345c433c..8b764bdcd9 100644
--- a/paddle/platform/device_context_test.cc
+++ b/paddle/platform/device_context_test.cc
@@ -45,6 +45,7 @@ TEST(Device, CUDADeviceContext) {
     ASSERT_NE(nullptr, cublas_handle);
     curandGenerator_t curand_handle = device_context->curand_generator();
     ASSERT_NE(nullptr, curand_handle);
+    ASSERT_NE(nullptr, device_context->stream());
     delete device_context;
   }
 }

From d08550fdd22453227e9a3f3f5e061c2849290304 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=AD=A6=E6=AF=85?= <typhoonzero1986@gmail.com>
Date: Tue, 15 Aug 2017 15:53:24 +0800
Subject: [PATCH 09/23] Large model train doc (#3445)

* large model train note

* update

* update doc
---
 .../cluster_train/large_model_dist_train.md   | 101 ++++++++++++++++++
 1 file changed, 101 insertions(+)
 create mode 100644 doc/design/cluster_train/large_model_dist_train.md

diff --git a/doc/design/cluster_train/large_model_dist_train.md b/doc/design/cluster_train/large_model_dist_train.md
new file mode 100644
index 0000000000..0c4b5bc24c
--- /dev/null
+++ b/doc/design/cluster_train/large_model_dist_train.md
@@ -0,0 +1,101 @@
+# Alalysis of large model distributed training in Paddle
+
+***NOTE: This is only some note for how we implemeted this scheme in V1, not a new design.***
+
+## What is it
+
+We often encounter cases that the embedding layer parameters(sparse) are so large that we can not store it in the trainer's memory when training. So we need to put them to several servers, and fetch them row by row instead of fetch all of the parameters.
+
+## How to use
+
+Specify command-line argument like  `--loadsave_parameters_in_pserver=true --ports_num_for_sparse=1  --use_old_updater=1` when starting the paddle trainer. And also add something like `--ports_num_for_sparse=1 --pserver_num_threads=5` when starting pserver processes.
+
+Accrodingly, configure your embedding layers like:
+
+```python
+SPARSE_REMOTE=True
+
+w1 = data_layer(name="w1", size=dict_size)
+emb1 = embedding_layer(input=w1, size=32, param_attr=ParameterAttribute(sparse_update=SPARSE_REMOTE))
+w2 = data_layer(name="w2", size=dict_size)
+emb2 = embedding_layer(input=w2, size=32, param_attr=ParameterAttribute(sparse_update=SPARSE_REMOTE))
+...
+```
+
+## Implementation details
+
+```c++
+enum MatType {
+  MAT_NORMAL,
+  MAT_NORMAL_SHARED,
+  MAT_VALUE_SHARED,
+  MAT_SPARSE_ROW_IDS,
+  MAT_SPARSE_ROW_AUTO_GROW,
+  MAT_CACHE_ROW,
+  MAT_SPARSE_ROW,
+  MAT_SPARSE_ROW_PREFETCH,
+  MAT_SPARSE_ROW_PREFETCH_FULL_SIZE,
+};
+```
+
+`MAT_SPARSE_ROW_PREFETCH` is what we use when configured to fetch only row of matrix when training.
+
+In `trainer_internal.cpp:L93 trainOneBatch`:
+
+```c++
+  if (config_->getOptConfig().use_sparse_remote_updater()) {
+    REGISTER_TIMER("prefetch");
+    gradientMachine_->prefetch(inArgs);
+    parameterUpdater_->getParametersRemote();
+  }
+```
+
+When doing actual network forward and backward, at the beginning of each batch, the trainer will try to download one row of data from pserver.
+
+In `trainer/RemoteParameterUpdater.cpp`: `parameterUpdater_->getParametersRemote();`:
+
+```c++
+if (fullSize) {
+    ...
+} else {
+getParams = [&] {
+    parameterClient_->getParameterSparse(
+        /* recvParameterType= */ PARAMETER_VALUE, sendBackParameterType);
+};
+applyL1 = [](Parameter& para, real decayRate) {
+    para.getMat(PARAMETER_VALUE)->applyL1(/*lr=*/1.0f, decayRate);
+};
+}
+```
+
+Calling `parameterClient_->getParameterSparse` will do remote call to pserver's `getParameterSparse`:
+
+```c++
+void ParameterServer2::getParameterSparse(const SendParameterRequest& request,
+                                          std::vector<Buffer>& inputBuffers,
+                                          SendParameterResponse* response,
+                                          std::vector<Buffer>* outputBuffers) {
+  (void)inputBuffers;
+  auto& buffer = *readWriteBuffer_;
+  size_t numReals = 0;
+  for (const auto& block : request.blocks()) {
+    numReals += getParameterConfig(block).dims(1);
+  }
+  buffer.resize(numReals);
+
+  VLOG(3) << "pserver: getParameterSparse, numReals=" << numReals;
+
+  ReadLockGuard guard(parameterMutex_);
+  size_t offset = 0;
+  for (const auto& block : request.blocks()) {
+    size_t width = getParameterConfig(block).dims(1);
+    Buffer buf = {buffer.data() + offset, width};
+    int type = request.send_back_parameter_type();
+    sendBackParameterSparse(block, type, response, &buf, width, outputBuffers);
+    offset += width;
+  }
+}
+```
+
+`getParameterConfig(block).dims(1)` returns the width of the current "parameter block"(a shard of parameter object),
+then `getParameterSparse` remote call returns only one row of data to the client.

From 881c5d00d0e44cd957eeea2532c556a0eef8117a Mon Sep 17 00:00:00 2001
From: qijun <qijun1994@hotmail.com>
Date: Tue, 15 Aug 2017 16:08:11 +0800
Subject: [PATCH 10/23] remove device_context deps in memcpy

---
 paddle/memory/memcpy.cc | 2 --
 1 file changed, 2 deletions(-)

diff --git a/paddle/memory/memcpy.cc b/paddle/memory/memcpy.cc
index aaab1142ca..a19a3e3675 100644
--- a/paddle/memory/memcpy.cc
+++ b/paddle/memory/memcpy.cc
@@ -16,8 +16,6 @@ limitations under the License. */
 
 #include <cstring>  // for memcpy
 
-#include "paddle/platform/device_context.h"
-
 namespace paddle {
 namespace memory {
 

From 9eaef75397926819294edda04dbed34aa069f5f4 Mon Sep 17 00:00:00 2001
From: Yan Chunwei <yanchunwei@outlook.com>
Date: Tue, 15 Aug 2017 17:44:08 +0800
Subject: [PATCH 11/23] RNN backward create (#3490)

* insert rnn's backward into Backward()

* add device_context into backward_test
---
 paddle/framework/CMakeLists.txt |  2 +-
 paddle/framework/backward.cc    | 17 +++++++++++++++++
 paddle/operators/recurrent_op.h |  4 ++--
 3 files changed, 20 insertions(+), 3 deletions(-)

diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt
index 0398526024..68304c9fc8 100644
--- a/paddle/framework/CMakeLists.txt
+++ b/paddle/framework/CMakeLists.txt
@@ -38,7 +38,7 @@ add_custom_command(TARGET framework_py_proto POST_BUILD
     WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
 
 cc_library(backward SRCS backward.cc DEPS net_op)
-cc_test(backward_test SRCS backward_test.cc DEPS backward)
+cc_test(backward_test SRCS backward_test.cc DEPS backward recurrent_op device_context)
 
 if(WITH_PYTHON)
 cc_library(paddle_pybind SHARED
diff --git a/paddle/framework/backward.cc b/paddle/framework/backward.cc
index 83b7e4cdac..c226e4e3d2 100644
--- a/paddle/framework/backward.cc
+++ b/paddle/framework/backward.cc
@@ -17,6 +17,7 @@
 #include <list>
 #include "paddle/framework/op_registry.h"
 #include "paddle/operators/net_op.h"
+#include "paddle/operators/recurrent_op.h"
 
 namespace paddle {
 namespace framework {
@@ -178,6 +179,22 @@ std::shared_ptr<OperatorBase> BackwardRecursive(
                      return false;
                    });
 
+    // process recurrent gradient op as a special operator.
+    if (forwardOp.Type() == "recurrent_op") {
+      // NOTE clean up cycle call somewhere (RNN's stepnet constains itself), or
+      // this will result in infinite loop.
+      const auto& rnnop =
+          *static_cast<const operators::RecurrentOp*>(&forwardOp);
+      auto rnn_grad_op =
+          static_cast<operators::RecurrentGradientOp*>(grad_op.get());
+      const auto& stepnet_op =
+          *static_cast<const OperatorBase*>(&rnnop.stepnet());
+      // create stepnet's gradient op
+      auto grad_stepnet = BackwardRecursive(stepnet_op, no_grad_names, uniq_id);
+      rnn_grad_op->set_stepnet(
+          std::static_pointer_cast<operators::NetOp>(grad_stepnet));
+    }
+
     if (net->ops_.empty()) {  // Current no aux op is added to network
       return grad_op;
     }
diff --git a/paddle/operators/recurrent_op.h b/paddle/operators/recurrent_op.h
index caca644c96..171a0bd2ae 100644
--- a/paddle/operators/recurrent_op.h
+++ b/paddle/operators/recurrent_op.h
@@ -127,7 +127,7 @@ class RecurrentOp final : public framework::OperatorBase {
   }
 
   void set_stepnet(std::shared_ptr<NetOp> net) { stepnet_ = net; }
-  const NetOp* stepnet() const { return stepnet_.get(); }
+  const NetOp& stepnet() const { return *stepnet_; }
 
   static const rnn::ArgumentName kArgName;
 
@@ -158,7 +158,7 @@ class RecurrentGradientOp final : public framework::OperatorBase {
   static const rnn::ArgumentName kArgName;
 
   void set_stepnet(const std::shared_ptr<NetOp>& net) { stepnet_ = net; }
-  const NetOp* stepnet() const { return stepnet_.get(); }
+  const NetOp& stepnet() const { return *stepnet_; }
 
  private:
   RecurrentGradientAlgorithm alg_;

From 79a336b78f707ba04076e3130e7b7c0d87a484a7 Mon Sep 17 00:00:00 2001
From: Luo Tao <luotao02@baidu.com>
Date: Tue, 15 Aug 2017 18:39:28 +0800
Subject: [PATCH 12/23] add shared warpctc lib in whl

---
 python/setup.py.in | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/python/setup.py.in b/python/setup.py.in
index 287442e013..db15f6d950 100644
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -31,7 +31,9 @@ paddle_bins = ['${PADDLE_BINARY_DIR}/paddle/scripts/paddle_usage',
                '${PADDLE_BINARY_DIR}/paddle/pserver/paddle_pserver_main']
 
 paddle_rt_lib_dir = 'local/lib'
-paddle_rt_libs = [] if '${MKL_SHARED_LIBS}'== '' else '${MKL_SHARED_LIBS}'.split(';')
+paddle_rt_libs = ['${WARPCTC_LIBRARIES}']
+if '${MKL_SHARED_LIBS}'!= '':
+  paddle_rt_libs += '${MKL_SHARED_LIBS}'.split(';')
 
 setup(name='paddlepaddle',
       version='${PADDLE_VERSION}',

From 7bc60b02737ba3695997086ac96d6915b1acb3f9 Mon Sep 17 00:00:00 2001
From: fengjiayi <fengjiayi@baidu.com>
Date: Tue, 15 Aug 2017 14:21:35 -0700
Subject: [PATCH 13/23] Move OpRegistry functions to .cc file and move OpMaker
 to Op module

---
 paddle/framework/op_registry.cc |  46 ++++++++++-
 paddle/framework/op_registry.h  | 138 ++------------------------------
 paddle/framework/operator.cc    |  38 +++++++++
 paddle/framework/operator.h     |  68 ++++++++++++++++
 4 files changed, 156 insertions(+), 134 deletions(-)

diff --git a/paddle/framework/op_registry.cc b/paddle/framework/op_registry.cc
index 1caa02a2a1..f801f970f2 100644
--- a/paddle/framework/op_registry.cc
+++ b/paddle/framework/op_registry.cc
@@ -17,5 +17,49 @@ limitations under the License. */
 #include <vector>
 
 namespace paddle {
-namespace framework {}  // namespace framework
+namespace framework {
+
+std::shared_ptr<OperatorBase> OpRegistry::CreateOp(const std::string& type,
+                                                   const VarNameMap& inputs,
+                                                   const VarNameMap& outputs,
+                                                   AttributeMap attrs) {
+  auto it = op_info_map().find(type);
+  PADDLE_ENFORCE(it != op_info_map().end(),
+                 "Operator '%s' has not been registered.", type);
+  it->second.checker_->Check(attrs);
+  auto op = it->second.creator_(type, inputs, outputs, attrs);
+  return std::shared_ptr<OperatorBase>(op);
+}
+
+std::shared_ptr<OperatorBase> OpRegistry::CreateOp(const OpDesc& op_desc) {
+  VarNameMap inputs = ConvertOpDescVarsToVarNameMap(op_desc.inputs());
+  VarNameMap outputs = ConvertOpDescVarsToVarNameMap(op_desc.outputs());
+  AttributeMap attrs;
+  for (auto& attr : op_desc.attrs()) {
+    attrs[attr.name()] = GetAttrValue(attr);
+  }
+
+  return CreateOp(op_desc.type(), inputs, outputs, attrs);
+}
+
+OperatorBase::VarNameMap OpRegistry::ConvertOpDescVarsToVarNameMap(
+    const google::protobuf::RepeatedPtrField<OpDesc::Var>& op_desc_vars) {
+  VarNameMap ret_val;
+  for (auto& var : op_desc_vars) {
+    auto& var_names = ret_val[var.parameter()];
+    auto& var_names_in_proto = var.arguments();
+    var_names.reserve(static_cast<size_t>(var_names_in_proto.size()));
+    std::copy(var_names_in_proto.begin(), var_names_in_proto.end(),
+              std::back_inserter(var_names));
+  }
+  return ret_val;
+}
+
+std::shared_ptr<OperatorBase> OpRegistry::CreateGradOp(const OperatorBase& op) {
+  PADDLE_ENFORCE(!op.IsNetOp(), "Use framework::Backward to get backward ops");
+  std::shared_ptr<OperatorBase> grad_op(BuildGradOp(&op));
+  return grad_op;
+}
+
+}  // namespace framework
 }  // namespace paddle
diff --git a/paddle/framework/op_registry.h b/paddle/framework/op_registry.h
index 120f4ede6b..cc2234d50e 100644
--- a/paddle/framework/op_registry.h
+++ b/paddle/framework/op_registry.h
@@ -29,103 +29,6 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
 
-// this class not only make proto but also init attribute checkers.
-class OpProtoAndCheckerMaker {
- public:
-  OpProtoAndCheckerMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : proto_(proto), op_checker_(op_checker) {}
-
-  ~OpProtoAndCheckerMaker() {
-    PADDLE_ENFORCE(validated_, "should call Validate after build");
-  }
-
-  void Validate() {
-    validated_ = true;
-    CheckNoDuplicatedInOutAttrs();
-  }
-
- protected:
-  struct VariableBuilder {
-    OpProto::Var* var_;
-
-    VariableBuilder& AsDuplicable() {
-      var_->set_duplicable(true);
-      return *this;
-    }
-
-    VariableBuilder& AsIntermediate() {
-      var_->set_intermediate(true);
-      return *this;
-    }
-
-    // TODO(FengJiayi, yuyang18): `AsNoGradient` is a very bad name, because it
-    // means that input/output is not needed when calculate gradient. It does
-    // not mean no gradient when backward. It should be changed soon.
-    VariableBuilder& AsNoGradient() {
-      var_->set_no_gradient(true);
-      return *this;
-    }
-  };
-
-  VariableBuilder AddInput(const std::string& name,
-                           const std::string& comment) {
-    auto* input = proto_->add_inputs();
-    input->set_name(name);
-    input->set_comment(comment);
-    return VariableBuilder{input};
-  }
-
-  VariableBuilder AddOutput(const std::string& name,
-                            const std::string& comment) {
-    auto* output = proto_->add_outputs();
-    output->set_name(name);
-    output->set_comment(comment);
-    return VariableBuilder{output};
-  }
-
-  template <typename T>
-  TypedAttrChecker<T>& AddAttr(const std::string& name,
-                               const std::string& comment,
-                               bool generated = false) {
-    auto* attr = proto_->add_attrs();
-    attr->set_name(name);
-    attr->set_comment(comment);
-    attr->set_generated(generated);
-    attr->set_type(AttrTypeID<T>());
-    return op_checker_->AddAttrChecker<T>(name);
-  }
-
-  void AddComment(const std::string& comment) { proto_->set_comment(comment); }
-
- private:
-  void CheckNoDuplicatedInOutAttrs() {
-    std::unordered_set<std::string> names;
-    auto checker = [&](const std::string& name) {
-      PADDLE_ENFORCE(!names.count(name), "[%s] is duplicated", name);
-      names.insert(name);
-    };
-    for (auto& attr : proto_->attrs()) {
-      checker(attr.name());
-    }
-    for (auto& input : proto_->inputs()) {
-      checker(input.name());
-    }
-    for (auto& output : proto_->outputs()) {
-      checker(output.name());
-    }
-  }
-
-  OpProto* proto_;
-  OpAttrChecker* op_checker_;
-  bool validated_{false};
-};
-
-class NOPMaker : public OpProtoAndCheckerMaker {
- public:
-  NOPMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {}
-};
-
 class OpRegistry {
   using VarNameMap = OperatorBase::VarNameMap;
   using OpCreator = std::function<OperatorBase*(
@@ -177,45 +80,14 @@ class OpRegistry {
   static std::shared_ptr<OperatorBase> CreateOp(const std::string& type,
                                                 const VarNameMap& inputs,
                                                 const VarNameMap& outputs,
-                                                AttributeMap attrs) {
-    auto it = op_info_map().find(type);
-    PADDLE_ENFORCE(it != op_info_map().end(),
-                   "Operator '%s' has not been registered.", type);
-    it->second.checker_->Check(attrs);
-    auto op = it->second.creator_(type, inputs, outputs, attrs);
-    return std::shared_ptr<OperatorBase>(op);
-  }
-
-  static VarNameMap ConvertOpDescVarsToVarNameMap(
-      const google::protobuf::RepeatedPtrField<OpDesc::Var>& op_desc_vars) {
-    VarNameMap ret_val;
-    for (auto& var : op_desc_vars) {
-      auto& var_names = ret_val[var.parameter()];
-      auto& var_names_in_proto = var.arguments();
-      var_names.reserve(static_cast<size_t>(var_names_in_proto.size()));
-      std::copy(var_names_in_proto.begin(), var_names_in_proto.end(),
-                std::back_inserter(var_names));
-    }
-    return ret_val;
-  }
+                                                AttributeMap attrs);
 
-  static std::shared_ptr<OperatorBase> CreateOp(const OpDesc& op_desc) {
-    VarNameMap inputs = ConvertOpDescVarsToVarNameMap(op_desc.inputs());
-    VarNameMap outputs = ConvertOpDescVarsToVarNameMap(op_desc.outputs());
-    AttributeMap attrs;
-    for (auto& attr : op_desc.attrs()) {
-      attrs[attr.name()] = GetAttrValue(attr);
-    }
+  static std::shared_ptr<OperatorBase> CreateOp(const OpDesc& op_desc);
 
-    return CreateOp(op_desc.type(), inputs, outputs, attrs);
-  }
+  static VarNameMap ConvertOpDescVarsToVarNameMap(
+      const google::protobuf::RepeatedPtrField<OpDesc::Var>& op_desc_vars);
 
-  static std::shared_ptr<OperatorBase> CreateGradOp(const OperatorBase& op) {
-    PADDLE_ENFORCE(!op.IsNetOp(),
-                   "Use framework::Backward to get backward ops");
-    std::shared_ptr<OperatorBase> grad_op(BuildGradOp(&op));
-    return grad_op;
-  }
+  static std::shared_ptr<OperatorBase> CreateGradOp(const OperatorBase& op);
 
   static std::unordered_map<std::string, const OpInfo>& op_info_map() {
     static std::unordered_map<std::string, const OpInfo> op_info_map_;
diff --git a/paddle/framework/operator.cc b/paddle/framework/operator.cc
index 0daf12e7f5..eadd8f3316 100644
--- a/paddle/framework/operator.cc
+++ b/paddle/framework/operator.cc
@@ -164,5 +164,43 @@ std::vector<std::string> OperatorBase::OutputVars(bool has_intermediate) const {
   return ret_val;
 }
 
+void OpProtoAndCheckerMaker::Validate() {
+  validated_ = true;
+  CheckNoDuplicatedInOutAttrs();
+}
+
+OpProtoAndCheckerMaker::VariableBuilder OpProtoAndCheckerMaker::AddInput(
+    const std::string& name, const std::string& comment) {
+  auto* input = proto_->add_inputs();
+  input->set_name(name);
+  input->set_comment(comment);
+  return OpProtoAndCheckerMaker::VariableBuilder{input};
+}
+
+OpProtoAndCheckerMaker::VariableBuilder OpProtoAndCheckerMaker::AddOutput(
+    const std::string& name, const std::string& comment) {
+  auto* output = proto_->add_outputs();
+  output->set_name(name);
+  output->set_comment(comment);
+  return OpProtoAndCheckerMaker::VariableBuilder{output};
+}
+
+void OpProtoAndCheckerMaker::CheckNoDuplicatedInOutAttrs() {
+  std::unordered_set<std::string> names;
+  auto checker = [&](const std::string& name) {
+    PADDLE_ENFORCE(!names.count(name), "[%s] is duplicated", name);
+    names.insert(name);
+  };
+  for (auto& attr : proto_->attrs()) {
+    checker(attr.name());
+  }
+  for (auto& input : proto_->inputs()) {
+    checker(input.name());
+  }
+  for (auto& output : proto_->outputs()) {
+    checker(output.name());
+  }
+}
+
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/framework/operator.h b/paddle/framework/operator.h
index 60d4f06c7e..2c8620a7ce 100644
--- a/paddle/framework/operator.h
+++ b/paddle/framework/operator.h
@@ -138,6 +138,74 @@ class NOP : public OperatorBase {
            const platform::DeviceContext& dev_ctx) const override {}
 };
 
+// this class not only make proto but also init attribute checkers.
+class OpProtoAndCheckerMaker {
+ public:
+  OpProtoAndCheckerMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : proto_(proto), op_checker_(op_checker) {}
+
+  ~OpProtoAndCheckerMaker() {
+    PADDLE_ENFORCE(validated_, "should call Validate after build");
+  }
+
+  void Validate();
+
+ protected:
+  struct VariableBuilder {
+    OpProto::Var* var_;
+
+    VariableBuilder& AsDuplicable() {
+      var_->set_duplicable(true);
+      return *this;
+    }
+
+    VariableBuilder& AsIntermediate() {
+      var_->set_intermediate(true);
+      return *this;
+    }
+
+    // TODO(FengJiayi, yuyang18): `AsNoGradient` is a very bad name, because it
+    // means that input/output is not needed when calculate gradient. It does
+    // not mean no gradient when backward. It should be changed soon.
+    VariableBuilder& AsNoGradient() {
+      var_->set_no_gradient(true);
+      return *this;
+    }
+  };
+
+  VariableBuilder AddInput(const std::string& name, const std::string& comment);
+
+  VariableBuilder AddOutput(const std::string& name,
+                            const std::string& comment);
+
+  template <typename T>
+  TypedAttrChecker<T>& AddAttr(const std::string& name,
+                               const std::string& comment,
+                               bool generated = false) {
+    auto* attr = proto_->add_attrs();
+    attr->set_name(name);
+    attr->set_comment(comment);
+    attr->set_generated(generated);
+    attr->set_type(AttrTypeID<T>());
+    return op_checker_->AddAttrChecker<T>(name);
+  }
+
+  void AddComment(const std::string& comment) { proto_->set_comment(comment); }
+
+ private:
+  void CheckNoDuplicatedInOutAttrs();
+
+  OpProto* proto_;
+  OpAttrChecker* op_checker_;
+  bool validated_{false};
+};
+
+class NOPMaker : public OpProtoAndCheckerMaker {
+ public:
+  NOPMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {}
+};
+
 class InferShapeContext {
  public:
   InferShapeContext(const OperatorBase& op, const Scope& scope)

From c307ee303b982c97ee66f91981f81c606c62ec63 Mon Sep 17 00:00:00 2001
From: Yan Chunwei <yanchunwei@outlook.com>
Date: Wed, 16 Aug 2017 11:31:21 +0800
Subject: [PATCH 14/23] clang format with version check (#3513)

* add clang-format with version check 3.8

* improve doc
---
 .clang_format.hook      | 15 +++++++++++++++
 .pre-commit-config.yaml |  4 ++--
 2 files changed, 17 insertions(+), 2 deletions(-)
 create mode 100755 .clang_format.hook

diff --git a/.clang_format.hook b/.clang_format.hook
new file mode 100755
index 0000000000..1d92821686
--- /dev/null
+++ b/.clang_format.hook
@@ -0,0 +1,15 @@
+#!/bin/bash
+set -e
+
+readonly VERSION="3.8"
+
+version=$(clang-format -version)
+
+if ! [[ $version == *"$VERSION"* ]]; then
+    echo "clang-format version check failed."
+    echo "a version contains '$VERSION' is needed, but get '$version'"
+    echo "you can install the right version, and make an soft-link to '\$PATH' env"
+    exit -1
+fi
+
+clang-format $@
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index bb8c88787d..a772125df6 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -19,10 +19,10 @@
     -   id: end-of-file-fixer
 -   repo: local
     hooks:
-    -   id: clang-format
+    -   id: clang-format-with-version-check
         name: clang-format
         description: Format files with ClangFormat.
-        entry: clang-format -i
+        entry: ./.clang_format.hook -i
         language: system
         files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto)$
 -   repo: https://github.com/PaddlePaddle/pre-commit-golang

From 13c20ad39e23f0d377bab05c7fea0621d46abd07 Mon Sep 17 00:00:00 2001
From: qijun <qijun1994@hotmail.com>
Date: Wed, 16 Aug 2017 11:53:46 +0800
Subject: [PATCH 15/23] remove --default-stream per-thread nvcc flag

---
 cmake/flags.cmake | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cmake/flags.cmake b/cmake/flags.cmake
index 47bb83b00a..ff246b2eb4 100644
--- a/cmake/flags.cmake
+++ b/cmake/flags.cmake
@@ -153,7 +153,7 @@ set(CUDA_PROPAGATE_HOST_FLAGS OFF)
 
 # Release/Debug flags set by cmake. Such as -O3 -g -DNDEBUG etc.
 # So, don't set these flags here.
-LIST(APPEND CUDA_NVCC_FLAGS -std=c++11 --default-stream per-thread)
+LIST(APPEND CUDA_NVCC_FLAGS -std=c++11)
 LIST(APPEND CUDA_NVCC_FLAGS --use_fast_math)
 
 if(CMAKE_BUILD_TYPE  STREQUAL "Debug")

From 0d2ab5e993c9dd16ada677a8ea9de563553a7428 Mon Sep 17 00:00:00 2001
From: tensor-tang <jian.j.tang@intel.com>
Date: Wed, 16 Aug 2017 11:50:11 +0800
Subject: [PATCH 16/23] use param header to save mkldnn format info

---
 doc/design/mkldnn/README.MD | 1 +
 1 file changed, 1 insertion(+)

diff --git a/doc/design/mkldnn/README.MD b/doc/design/mkldnn/README.MD
index e956994431..2929514b08 100644
--- a/doc/design/mkldnn/README.MD
+++ b/doc/design/mkldnn/README.MD
@@ -101,6 +101,7 @@ if use_mkldnn
 5. 在**Argument**里添加两个`MkldnnMatrixPtr`，取名为`mkldnnValue`和`mkldnnGrad`，用于存放`MkldnnLayer`会用到的memory buffer。 并且添加函数cvt(会修改为一个更加合适的函数名)，用于处理"CPU device"和"MKL-DNN device"之间memory的相互转化。
 6. 在父类`Layer`中的`getOutput`函数中添加一段逻辑，用于判断`deviceId`，并针对device在MKL-DNN和CPU之间不统一的情况，做一个前期转换。 也就是调用`Argument`的cvt函数把output统一到需要的device上。
 7. 在原来的`FLAGS`中添加一个`use_mkldnn`的flag，用于选择是否使用MKL-DNN的相关功能。
+8. 关于MKLDNN参数的保存。由于MKLDNN参数的格式与PaddlePaddle原有的格式存在不一样的情况，所以需要在保存参数时同时保存该格式信息。目前准备扩展`Header`里面的`int32_t version;     // = 0, file format version`信息。这个`version`值，不管是在v1还是在v2里面，一直保存的是0。所以可以充分利用这个信息，定义一个枚举处理所有MKLDNN的参数格式，`MKLDNNLayer`就可以知道得到的参数是哪种格式的了。只不过目前v2里面是写的固定值0，而不是保存的`Header`本身，这一点相信v2未来应该会优化的。
 
 ## References
 

From 137a05eb752f33d2529437c08bf6e58a7010c03d Mon Sep 17 00:00:00 2001
From: tensor-tang <jian.j.tang@intel.com>
Date: Wed, 16 Aug 2017 13:53:07 +0800
Subject: [PATCH 17/23] update

---
 doc/design/mkldnn/README.MD | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/design/mkldnn/README.MD b/doc/design/mkldnn/README.MD
index 2929514b08..fe8da907d9 100644
--- a/doc/design/mkldnn/README.MD
+++ b/doc/design/mkldnn/README.MD
@@ -101,7 +101,7 @@ if use_mkldnn
 5. 在**Argument**里添加两个`MkldnnMatrixPtr`，取名为`mkldnnValue`和`mkldnnGrad`，用于存放`MkldnnLayer`会用到的memory buffer。 并且添加函数cvt(会修改为一个更加合适的函数名)，用于处理"CPU device"和"MKL-DNN device"之间memory的相互转化。
 6. 在父类`Layer`中的`getOutput`函数中添加一段逻辑，用于判断`deviceId`，并针对device在MKL-DNN和CPU之间不统一的情况，做一个前期转换。 也就是调用`Argument`的cvt函数把output统一到需要的device上。
 7. 在原来的`FLAGS`中添加一个`use_mkldnn`的flag，用于选择是否使用MKL-DNN的相关功能。
-8. 关于MKLDNN参数的保存。由于MKLDNN参数的格式与PaddlePaddle原有的格式存在不一样的情况，所以需要在保存参数时同时保存该格式信息。目前准备扩展`Header`里面的`int32_t version;     // = 0, file format version`信息。这个`version`值，不管是在v1还是在v2里面，一直保存的是0。所以可以充分利用这个信息，定义一个枚举处理所有MKLDNN的参数格式，`MKLDNNLayer`就可以知道得到的参数是哪种格式的了。只不过目前v2里面是写的固定值0，而不是保存的`Header`本身，这一点相信v2未来应该会优化的。
+8. 关于MKLDNN参数的保存。由于MKLDNN参数的格式与PaddlePaddle原有的格式存在不一样的情况，所以需要在保存参数时同时保存该格式信息。目前准备扩展[Header](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/parameter/Parameter.h#L247)里面的`int32_t version`。这个值不管是在v1还是在v2里面，一直保存的是0，所以可以充分利用这个信息，定义一个枚举处理所有MKLDNN的参数格式，从而`MKLDNNLayer`就可以从输入的参数中获取需要的格式信息。
 
 ## References
 

From 0f8688192cfd4892c379c5f994a2d7149fa3c63d Mon Sep 17 00:00:00 2001
From: Yancey <yancey1989@gmail.com>
Date: Wed, 16 Aug 2017 16:09:09 +0800
Subject: [PATCH 18/23] Fix invalid paddle binary file path (#3421)

Fix invalid paddle executable file path with pip install
---
 .../build_and_install/build_from_source_en.md | 13 +++++----
 paddle/scripts/docker/build.sh                | 26 ++---------------
 paddle/scripts/submit_local.sh.in             | 29 ++++---------------
 python/setup.py.in                            | 12 ++++----
 4 files changed, 21 insertions(+), 59 deletions(-)

diff --git a/doc/getstarted/build_and_install/build_from_source_en.md b/doc/getstarted/build_and_install/build_from_source_en.md
index c0608ede8e..2f14614894 100644
--- a/doc/getstarted/build_and_install/build_from_source_en.md
+++ b/doc/getstarted/build_and_install/build_from_source_en.md
@@ -68,7 +68,7 @@ As a simple example, consider the following:
 
 1. **BLAS Dependencies(optional)**
   
-    CMake will search BLAS libraries from system. If not found, OpenBLAS will be downloaded, built and installed automatically.
+    CMake will search BLAS libraries from the system. If not found, OpenBLAS will be downloaded, built and installed automatically.
     To utilize preinstalled BLAS， you can simply specify MKL, OpenBLAS or ATLAS via `MKL_ROOT`, `OPENBLAS_ROOT` or `ATLAS_ROOT`.
 
     ```bash
@@ -131,9 +131,9 @@ As a simple example, consider the following:
     To build GPU version, you will need the following installed:
 
         1. a CUDA-capable GPU
-        2. A supported version of Linux with a gcc compiler and toolchain
+        2. A supported version of Linux with a GCC compiler and toolchain
         3. NVIDIA CUDA Toolkit (available at http://developer.nvidia.com/cuda-downloads)
-        4. NVIDIA cuDNN Library (availabel at https://developer.nvidia.com/cudnn)
+        4. NVIDIA cuDNN Library (available at https://developer.nvidia.com/cudnn)
 
     The CUDA development environment relies on tight integration with the host development environment,
     including the host compiler and C runtime libraries, and is therefore only supported on
@@ -172,6 +172,7 @@ export PATH=<path to install>/bin:$PATH
 # install PaddlePaddle Python modules.
 sudo pip install <path to install>/opt/paddle/share/wheels/*.whl
 ```
+
 ## <span id="centos">Build on Centos 7</span>
 
 ### Install Dependencies
@@ -192,9 +193,9 @@ sudo pip install <path to install>/opt/paddle/share/wheels/*.whl
     To build GPU version, you will need the following installed:
 
         1. a CUDA-capable GPU
-        2. A supported version of Linux with a gcc compiler and toolchain
+        2. A supported version of Linux with a GCC compiler and toolchain
         3. NVIDIA CUDA Toolkit (available at http://developer.nvidia.com/cuda-downloads)
-        4. NVIDIA cuDNN Library (availabel at https://developer.nvidia.com/cudnn)
+        4. NVIDIA cuDNN Library (available at https://developer.nvidia.com/cudnn)
 
     The CUDA development environment relies on tight integration with the host development environment,
     including the host compiler and C runtime libraries, and is therefore only supported on
@@ -222,7 +223,7 @@ mkdir build && cd build
 ``` 
 
 Finally, you can build and install PaddlePaddle:
-
+  
 ```bash
 # you can add build option here, such as:    
 cmake3 .. -DCMAKE_INSTALL_PREFIX=<path to install>
diff --git a/paddle/scripts/docker/build.sh b/paddle/scripts/docker/build.sh
index 6c2f5fed40..7c12664aed 100644
--- a/paddle/scripts/docker/build.sh
+++ b/paddle/scripts/docker/build.sh
@@ -120,25 +120,6 @@ EOF
     /woboq/indexgenerator/codebrowser_indexgenerator $WOBOQ_OUT
 fi
 
-# generate deb package for current build
-# FIXME(typhoonzero): should we remove paddle/scripts/deb ?
-if [[ ${WITH_DEB:-ON} == "ON" ]]; then
-    cat <<EOF
-========================================
-Generating .deb package ...
-========================================
-EOF
-    set +e
-    cpack -D CPACK_GENERATOR='DEB' -j `nproc` ..
-    err_code=$?
-    if [ ${err_code} -ne 0 ]; then
-        # cat error logs if cpack failed.
-        cat /paddle/build/_CPack_Packages/Linux/DEB/PreinstallOutput.log
-        exit ${err_code}
-    fi
-    set -e
-fi
-
 cat <<EOF
 ========================================
 Generate /paddle/build/Dockerfile ...
@@ -158,14 +139,13 @@ EOF
 fi
 
 cat >> /paddle/build/Dockerfile <<EOF
-# Use different deb file when building different type of images
-ADD *.deb /
+ADD python/dist/*.whl /
 # run paddle version to install python packages first
 RUN apt-get update &&\
     apt-get install -y wget python-pip && pip install -U pip && \
-    dpkg -i /*.deb ; apt-get install -f -y && \
+    pip install /*.whl; apt-get install -f -y && \
     apt-get clean -y && \
-    rm -f /*.deb && \
+    rm -f /*.whl && \
     paddle version
 ${DOCKERFILE_CUDNN_DSO}
 ${DOCKERFILE_GPU_ENV}
diff --git a/paddle/scripts/submit_local.sh.in b/paddle/scripts/submit_local.sh.in
index 2ab7d5b52f..26f9c0fcd4 100755
--- a/paddle/scripts/submit_local.sh.in
+++ b/paddle/scripts/submit_local.sh.in
@@ -56,8 +56,7 @@ if [ -z "${PADDLE_NO_STAT+x}" ]; then
     fi
 fi
 
-
-MYDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+PADDLE_BIN_PATH="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
 
 if [ ! -z "${DEBUGGER}" ]; then
     echo "Using debug command ${DEBUGGER}"
@@ -93,34 +92,16 @@ else:
   sys.exit(0)
 EOF
 
-if [ $? -eq 1 ]; then  # Older version installed, or not installed at all
-    echo "First time run paddle, need to install some python dependencies."
-    # setuptools normalizes package version, so we need to use normalized
-    # package version for paddle python package
-    PYTHON_PADDLE_VERSION=$(python -c 'import packaging.version
-import setuptools
-print str(packaging.version.Version("@PADDLE_VERSION@"))
-' 2>/dev/null)
-    BASEDIR=$(dirname "$0")
-    pip install ${BASEDIR}/../opt/paddle/share/wheels/*-${PYTHON_PADDLE_VERSION}-*.whl
-    if [ $? -ne 0 ]; then
-	echo "pip install wheels failed. "
-	echo "Please use 'sudo paddle' at the first time you use PaddlePaddle"
-	echo "PaddlePaddle will install some python dependencies automatically."
-	exit 1
-    fi
-    echo "Python dependencies are installed."
-fi
 
 case "$1" in
     "train")
-        ${DEBUGGER} $MYDIR/../opt/paddle/bin/paddle_trainer ${@:2}
+        ${DEBUGGER} $PADDLE_BIN_PATH/paddle_trainer ${@:2}
         ;;
     "merge_model")
-        ${DEBUGGER} $MYDIR/../opt/paddle/bin/paddle_merge_model ${@:2}
+        ${DEBUGGER} $PADDLE_BIN_PATH/paddle_merge_model ${@:2}
         ;;
     "pserver")
-        ${DEBUGGER} $MYDIR/../opt/paddle/bin/paddle_pserver_main ${@:2}
+        ${DEBUGGER} $PADDLE_BIN_PATH/paddle_pserver_main ${@:2}
         ;;
     "dump_config")
         python -m paddle.utils.dump_config ${@:2}
@@ -129,7 +110,7 @@ case "$1" in
         python -m paddle.utils.make_model_diagram ${@:2}
         ;;
     "usage")
-        $MYDIR/../opt/paddle/bin/paddle_usage ${@:2}
+        $PADDLE_BIN_PATH/paddle_usage ${@:2}
         ;;
     "version")
         version
diff --git a/python/setup.py.in b/python/setup.py.in
index 287442e013..82f5006121 100644
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -24,13 +24,14 @@ if '${CMAKE_SYSTEM_PROCESSOR}' not in ['arm', 'armv7-a', 'aarch64']:
     setup_requires+=["opencv-python"]
 
 # the prefix is sys.prefix which should always be usr
-paddle_bin_dir = 'local/opt/paddle/bin'
+paddle_bin_dir = 'opt/paddle/bin'
 paddle_bins = ['${PADDLE_BINARY_DIR}/paddle/scripts/paddle_usage',
                '${PADDLE_BINARY_DIR}/paddle/trainer/paddle_trainer',
                '${PADDLE_BINARY_DIR}/paddle/trainer/paddle_merge_model',
-               '${PADDLE_BINARY_DIR}/paddle/pserver/paddle_pserver_main']
+               '${PADDLE_BINARY_DIR}/paddle/pserver/paddle_pserver_main',
+               '${PADDLE_BINARY_DIR}/paddle/scripts/paddle']
 
-paddle_rt_lib_dir = 'local/lib'
+paddle_rt_lib_dir = 'lib'
 paddle_rt_libs = [] if '${MKL_SHARED_LIBS}'== '' else '${MKL_SHARED_LIBS}'.split(';')
 
 setup(name='paddlepaddle',
@@ -50,8 +51,7 @@ setup(name='paddlepaddle',
           'paddle.v2.framework.proto': '${PADDLE_BINARY_DIR}/paddle/framework',
           'py_paddle': '${PADDLE_SOURCE_DIR}/paddle/py_paddle'
       },
-      scripts=['${PADDLE_BINARY_DIR}/paddle/scripts/paddle'],
+      scripts=paddle_bins,
       distclass=BinaryDistribution,
-      data_files=[(paddle_bin_dir, paddle_bins),
-                  (paddle_rt_lib_dir, paddle_rt_libs)]
+      data_files=[(paddle_rt_lib_dir, paddle_rt_libs)]
 )

From 57d96f88e1d59f4ed6173602a44b1380fed30a4e Mon Sep 17 00:00:00 2001
From: wanghaoshuang <wanghaoshuang@baidu.com>
Date: Wed, 16 Aug 2017 16:15:12 +0800
Subject: [PATCH 19/23] Fix document error.

---
 python/paddle/v2/trainer.py | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/python/paddle/v2/trainer.py b/python/paddle/v2/trainer.py
index 9c4dd5f250..1daf23a738 100644
--- a/python/paddle/v2/trainer.py
+++ b/python/paddle/v2/trainer.py
@@ -27,16 +27,21 @@ class SGD(object):
     SGD Trainer combines data reader, network topolopy and update_equation together
     to train/test a neural network.
 
-    :param update_equation: The optimizer object.
-    :type update_equation: paddle.v2.optimizer.Optimizer
     :param cost: Target cost that neural network should be optimized.
     :type cost: paddle.v2.config_base.Layer
     :param parameters: The parameters dictionary.
     :type parameters: paddle.v2.parameters.Parameters
+    :param update_equation: The optimizer object.
+    :type update_equation: paddle.v2.optimizer.Optimizer
     :param extra_layers: Some layers in the neural network graph are not
                          in the path of cost layer.
-    :param pserver_spec: pserver location, eg: localhost:3000
     :type extra_layers: paddle.v2.config_base.Layer
+    :param is_local: Whether trainning locally
+    :type is_local: bool
+    :param pserver_spec: pserver location, eg: localhost:3000
+    :type pserver_spec: string
+    :param use_etcd: Whether using etcd pserver.
+    :param use_etcd: bool
     """
 
     def __init__(self,

From fd107ae550be7e93e45a88bc2826a9be803dd710 Mon Sep 17 00:00:00 2001
From: wanghaoshuang <wanghaoshuang@baidu.com>
Date: Wed, 16 Aug 2017 17:00:57 +0800
Subject: [PATCH 20/23] Modify pserver_spec's doc.

---
 python/paddle/v2/trainer.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/python/paddle/v2/trainer.py b/python/paddle/v2/trainer.py
index 1daf23a738..4cf4d8b11d 100644
--- a/python/paddle/v2/trainer.py
+++ b/python/paddle/v2/trainer.py
@@ -38,7 +38,9 @@ class SGD(object):
     :type extra_layers: paddle.v2.config_base.Layer
     :param is_local: Whether trainning locally
     :type is_local: bool
-    :param pserver_spec: pserver location, eg: localhost:3000
+    :param pserver_spec: pserver location, eg: localhost:3000,
+                         if use_etcd is true, pserver_spec indicates
+                         the etcd endpoints, eg: http://127.0.0.1:2379
     :type pserver_spec: string
     :param use_etcd: Whether using etcd pserver.
     :param use_etcd: bool

From 5d18aaf8223ef7de420e09ad1de8fd93dbdf6db7 Mon Sep 17 00:00:00 2001
From: Liu Yiqun <liuyiqun01@baidu.com>
Date: Wed, 16 Aug 2017 09:11:03 +0000
Subject: [PATCH 21/23] Add a c-api interface to get the output of a specified
 layer.

---
 paddle/capi/gradient_machine.cpp | 16 ++++++++++++++++
 paddle/capi/gradient_machine.h   | 18 +++++++++++++++++-
 2 files changed, 33 insertions(+), 1 deletion(-)

diff --git a/paddle/capi/gradient_machine.cpp b/paddle/capi/gradient_machine.cpp
index b3287552db..629449bbd4 100644
--- a/paddle/capi/gradient_machine.cpp
+++ b/paddle/capi/gradient_machine.cpp
@@ -146,3 +146,19 @@ paddle_error paddle_gradient_machine_randomize_param(
   m->machine->randParameters();
   return kPD_NO_ERROR;
 }
+
+paddle_error paddle_gradient_machine_get_layer_output(
+    paddle_gradient_machine machine,
+    const char* layerName,
+    paddle_arguments args) {
+  auto m = cast(machine);
+  auto out = paddle::capi::cast<paddle::capi::CArguments>(args);
+  if (m == nullptr || layerName == nullptr || out == nullptr ||
+      m->machine == nullptr) {
+    return kPD_NULLPTR;
+  }
+
+  auto layerOutput = m->machine->getLayerOutput(layerName);
+  out->args.push_back(layerOutput);
+  return kPD_NO_ERROR;
+}
diff --git a/paddle/capi/gradient_machine.h b/paddle/capi/gradient_machine.h
index c613ade5b2..28eeb23e3b 100644
--- a/paddle/capi/gradient_machine.h
+++ b/paddle/capi/gradient_machine.h
@@ -39,7 +39,11 @@ PD_API paddle_error paddle_gradient_machine_create_for_inference(
 /**
  * @brief Create a gradient machine used for model inference, using config with
  *        parameters which is generated by `paddle merge_model`.
- * @param [out] machine that used for model inference.
+ *        Example:
+ *          paddle merge_model \
+ *                 --model_dir="pass-00000" \
+ *                 --model_file="merged_model.paddle"
+ * @param [out] machine that used for model inference
  * @param [in] mergedModel
  * @param [in] size
  * @return paddle_error
@@ -97,6 +101,18 @@ paddle_gradient_machine_randomize_param(paddle_gradient_machine machine);
 PD_API paddle_error
 paddle_gradient_machine_destroy(paddle_gradient_machine machine);
 
+/**
+ * @brief Get the output of the layer named `layerName`.
+ * @param [in] gradient machine that have run a inference
+ * @param [in] layerName name of specified layer
+ * @param [out] args output of the specified layer
+ * @return paddle_error
+ */
+PD_API paddle_error
+paddle_gradient_machine_get_layer_output(paddle_gradient_machine machine,
+                                         const char* layerName,
+                                         paddle_arguments args);
+
 #ifdef __cplusplus
 }
 #endif

From f7d32c614dc047faa3e19eb471d3bca6269d2d03 Mon Sep 17 00:00:00 2001
From: Yu Yang <yuyang18@baidu.com>
Date: Wed, 16 Aug 2017 17:21:59 +0800
Subject: [PATCH 22/23] Fix bug of enforce when dladdr not found

* Wrong Printf format before
---
 paddle/platform/enforce.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddle/platform/enforce.h b/paddle/platform/enforce.h
index 15fdf7a94f..81448897e9 100644
--- a/paddle/platform/enforce.h
+++ b/paddle/platform/enforce.h
@@ -86,7 +86,7 @@ struct EnforceNotMet : public std::exception {
                                   2 + sizeof(void*) * 2, call_stack[i],
                                   demangled, addr_offset);
         } else {
-          sout << string::Sprintf("%-3d %*0p %s\n", i, 2 + sizeof(void*) * 2,
+          sout << string::Sprintf("%-3d %*0p\n", i, 2 + sizeof(void*) * 2,
                                   call_stack[i]);
         }
       }

From 4be8189a8033010cf3517d14bed30d991780285b Mon Sep 17 00:00:00 2001
From: wanghaoshuang <wanghaoshuang@baidu.com>
Date: Wed, 16 Aug 2017 19:50:40 +0800
Subject: [PATCH 23/23] Modify pserver_spec's doc.

---
 python/paddle/v2/trainer.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/python/paddle/v2/trainer.py b/python/paddle/v2/trainer.py
index 4cf4d8b11d..0654a30104 100644
--- a/python/paddle/v2/trainer.py
+++ b/python/paddle/v2/trainer.py
@@ -38,9 +38,10 @@ class SGD(object):
     :type extra_layers: paddle.v2.config_base.Layer
     :param is_local: Whether trainning locally
     :type is_local: bool
-    :param pserver_spec: pserver location, eg: localhost:3000,
-                         if use_etcd is true, pserver_spec indicates
-                         the etcd endpoints, eg: http://127.0.0.1:2379
+    :param pserver_spec: comma string for pserver location,
+                         eg:127.10.0.10:3000,127.10.0.11:3000,
+                         and this parameter is only used for fault
+                         tolerant mode cluster training.
     :type pserver_spec: string
     :param use_etcd: Whether using etcd pserver.
     :param use_etcd: bool