From 0e45f952a29d9e9e02545e06cf81218c2992cc11 Mon Sep 17 00:00:00 2001
From: hedaoyuan <hedaoyuan@github.com>
Date: Wed, 14 Jun 2017 13:44:05 +0800
Subject: [PATCH 01/86] Add a NNPACKConvFunction.

---
 paddle/function/nnpack/NNPACKConvOp.cpp | 224 ++++++++++++++++++++++++
 1 file changed, 224 insertions(+)
 create mode 100644 paddle/function/nnpack/NNPACKConvOp.cpp
diff --git a/paddle/function/nnpack/NNPACKConvOp.cpp b/paddle/function/nnpack/NNPACKConvOp.cpp
new file mode 100644
index 0000000000..57a6681f29
--- /dev/null
+++ b/paddle/function/nnpack/NNPACKConvOp.cpp
@@ -0,0 +1,224 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "ConvOp.h"
+#include "nnpack.h"
+
+DEFINE_bool(nnpack_allocate_outside,
+            false,
+            "Allocate and free workspace memory outside the NNPACK interface.");
+DEFINE_int32(nnpack_num_threads,
+             0,
+             "The number of nnpack threads"
+             "default: 0; 0 to disable threadpool.");
+
+namespace paddle {
+
+// Maps the textual "algo" setting to the NNPACK algorithm enum. Any name
+// that is not listed in the table resolves to nnp_convolution_algorithm_auto.
+nnp_convolution_algorithm get_nnp_convolution_algorithm(
+    const std::string& algorithm) {
+  static const struct {
+    const char* name;
+    nnp_convolution_algorithm value;
+  } kKnown[] = {{"auto", nnp_convolution_algorithm_auto},
+                {"ft8x8", nnp_convolution_algorithm_ft8x8},
+                {"ft16x16", nnp_convolution_algorithm_ft16x16},
+                {"wt8x8", nnp_convolution_algorithm_wt8x8},
+                {"implicit-gemm", nnp_convolution_algorithm_implicit_gemm},
+                {"direct", nnp_convolution_algorithm_direct}};
+  for (const auto& entry : kKnown) {
+    if (algorithm == entry.name) return entry.value;
+  }
+  return nnp_convolution_algorithm_auto;
+}
+
+/**
+ * Forward convolution on CPU implemented with the NNPACK library.
+ *
+ * Only groups == 1 is accepted (checked in init). A batch of one image is
+ * dispatched to nnp_convolution_inference; larger batches go to
+ * nnp_convolution_output, which this class only uses with stride 1.
+ *
+ * When FLAGS_nnpack_allocate_outside is set, the scratch workspace is owned
+ * by this object and reused across calc() calls; otherwise no workspace is
+ * passed and NNPACK manages its own memory.
+ */
+template <DeviceType Device>
+class NNPACKConvFunction : public ConvFunctionBase {
+public:
+  /// Reads the "algo" entry of config, initializes NNPACK and creates the
+  /// thread pool when FLAGS_nnpack_num_threads is non-zero.
+  void init(const FuncConfig& config) override {
+    ConvFunctionBase::init(config);
+    CHECK_EQ(groups_, (size_t)1);
+    algorithm_ = get_nnp_convolution_algorithm(config.get<std::string>("algo"));
+    transform_strategy_ = nnp_convolution_transform_strategy_compute;
+    nnp_status status = nnp_initialize();
+    CHECK_EQ(status, nnp_status_success);
+    workspaceBuffer_ = nullptr;
+    workspaceSize_ = 0;
+
+    threadpool_ = nullptr;
+    if (FLAGS_nnpack_num_threads) {
+      threadpool_ = pthreadpool_create(FLAGS_nnpack_num_threads);
+      VLOG(3) << "Number of threads "
+              << pthreadpool_get_threads_count(threadpool_);
+    }
+  }
+
+  /// Destroys the thread pool and frees the self-managed workspace, if any.
+  ~NNPACKConvFunction() {
+    if (threadpool_) {
+      pthreadpool_destroy(threadpool_);
+    }
+    free(workspaceBuffer_);  // free(nullptr) is a no-op
+  }
+
+  /// Writes conv(inputs[0], inputs[1]) into outputs[0] (ASSIGN_TO only).
+  /// inputs[1] is the filter; no bias is passed to NNPACK.
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK_EQ(numInputs_, inputs.size());
+    CHECK_EQ(numOutputs_, outputs.size());
+    CHECK_EQ(outputs[0].getArgType(), ASSIGN_TO);
+    const TensorShape& input = inputs[0].shape();
+    const TensorShape& filter = inputs[1].shape();
+    const TensorShape& output = outputs[0].shape();
+    check(input, filter, output);
+
+    // input is indexed as [batch, channels, height, width].
+    size_t batchSize = input[0];
+    size_t inputChannels = input[1];
+    size_t inputHeight = input[2];
+    size_t inputWidth = input[3];
+    size_t filterHeight = getFilterHeight(filter);
+    size_t filterWidth = getFilterWidth(filter);
+    size_t outputChannels = output[1];
+
+    nnp_size inputSize = {.width = inputWidth, .height = inputHeight};
+    nnp_padding padding = {.top = paddingH(),
+                           .right = paddingW(),
+                           .bottom = paddingH(),
+                           .left = paddingW()};
+    nnp_size kernelSize = {.width = filterWidth, .height = filterHeight};
+    nnp_size outputSubsampling = {.width = strideW(), .height = strideH()};
+
+    float* inputData = inputs[0].data<float>();
+    float* filterData = inputs[1].data<float>();
+    float* outputData = outputs[0].data<float>();
+
+    void* bufferPtr = nullptr;
+    size_t* sizePtr = nullptr;
+    size_t needSize = 0;
+    if (FLAGS_nnpack_allocate_outside) {
+      // Size query: with a null workspace buffer and a non-null size pointer
+      // NNPACK only stores the required workspace size in needSize.
+      nnp_status status = nnpConvolution(
+          batchSize, inputChannels, outputChannels, inputSize, padding,
+          kernelSize, outputSubsampling, nullptr, nullptr, nullptr,
+          nullptr /* workspace */, &needSize, nullptr /* threadpool */);
+      CHECK_EQ(status, nnp_status_success);
+
+      LOG(INFO) << "workspace size is " << needSize;
+      if (needSize > workspaceSize_) {
+        // Grow the workspace: release the old block (if any) and always
+        // allocate the replacement, 64-byte aligned for NNPACK.
+        free(workspaceBuffer_);
+        workspaceBuffer_ = nullptr;
+        workspaceSize_ = 0;
+        int rc = posix_memalign(&workspaceBuffer_, 64, needSize);
+        CHECK_EQ(rc, 0);
+        workspaceSize_ = needSize;
+      }
+
+      if (needSize) {
+        bufferPtr = workspaceBuffer_;
+        sizePtr = &needSize;
+      }
+    }
+
+    nnp_status status = nnpConvolution(
+        batchSize, inputChannels, outputChannels, inputSize, padding,
+        kernelSize, outputSubsampling, inputData, filterData, outputData,
+        bufferPtr, sizePtr, threadpool_);
+    CHECK_EQ(status, nnp_status_success);
+  }
+
+private:
+  /// Calls the NNPACK entry point that matches batchSize. bufferPtr/sizePtr
+  /// follow NNPACK's workspace_buffer/workspace_size convention.
+  nnp_status nnpConvolution(size_t batchSize,
+                            size_t inputChannels,
+                            size_t outputChannels,
+                            nnp_size inputSize,
+                            nnp_padding padding,
+                            nnp_size kernelSize,
+                            nnp_size outputSubsampling,
+                            const float* inputData,
+                            const float* filterData,
+                            float* outputData,
+                            void* bufferPtr,
+                            size_t* sizePtr,
+                            pthreadpool_t threadpool) {
+    if (batchSize == 1) {
+      return nnp_convolution_inference(algorithm_,
+                                       transform_strategy_,
+                                       inputChannels,
+                                       outputChannels,
+                                       inputSize,
+                                       padding,
+                                       kernelSize,
+                                       outputSubsampling,
+                                       inputData,
+                                       filterData,
+                                       nullptr, /* bias */
+                                       outputData,
+                                       bufferPtr,
+                                       sizePtr,
+                                       nnp_activation_identity,
+                                       nullptr,
+                                       threadpool,
+                                       nullptr);
+    }
+    // only supports stride = 1
+    CHECK_EQ(stride_, 1);
+    return nnp_convolution_output(algorithm_,
+                                  batchSize,
+                                  inputChannels,
+                                  outputChannels,
+                                  inputSize,
+                                  padding,
+                                  kernelSize,
+                                  inputData,
+                                  filterData,
+                                  nullptr, /* bias */
+                                  outputData,
+                                  bufferPtr,
+                                  sizePtr,
+                                  nnp_activation_identity,
+                                  nullptr,
+                                  threadpool,
+                                  nullptr);
+  }
+
+  nnp_convolution_algorithm algorithm_;
+  nnp_convolution_transform_strategy transform_strategy_;
+  void* workspaceBuffer_ = nullptr;
+  size_t workspaceSize_ = 0;
+  pthreadpool_t threadpool_ = nullptr;
+};
+
+REGISTER_TYPED_FUNC(NNPACKConv, CPU, NNPACKConvFunction);
+
+}  // namespace paddle

From 85e42cbeaa105b34fd3dba864ba75b95adcfe73f Mon Sep 17 00:00:00 2001
From: liaogang <liaogang@baidu.com>
Date: Fri, 23 Jun 2017 15:16:37 +0800
Subject: [PATCH 02/86] ENH: support command `make target_name`

---
 cmake/generic.cmake | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

diff --git a/cmake/generic.cmake b/cmake/generic.cmake
index 69e8164a00..19f0db5273 100644
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -97,15 +97,12 @@ function(merge_static_libs TARGET_NAME)
   endforeach()
 
   if(APPLE) # Use OSX's libtool to merge archives
-    add_custom_target(${TARGET_NAME}_archive
-      COMMAND libtool -static -o "${CMAKE_CURRENT_BINARY_DIR}/lib${TARGET_NAME}.a" ${libfiles}
-      WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
-      DEPENDS ${libs}
-      )
-    add_library(${TARGET_NAME} STATIC IMPORTED GLOBAL)
-    set_property(TARGET ${TARGET_NAME} PROPERTY
-      IMPORTED_LOCATION "${CMAKE_CURRENT_BINARY_DIR}/lib${TARGET_NAME}.a")
-    add_dependencies(${TARGET_NAME} ${TARGET_NAME}_archive)
+    set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}_dummy.c)
+    file(WRITE ${dummyfile} "const char * dummy = \"${dummyfile}\";")
+    add_library(${TARGET_NAME} STATIC ${dummyfile})
+		add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
+      COMMAND rm "${CMAKE_CURRENT_BINARY_DIR}/lib${TARGET_NAME}.a"
+      COMMAND /usr/bin/libtool -static -o "${CMAKE_CURRENT_BINARY_DIR}/lib${TARGET_NAME}.a" ${libfiles})
 	else() # general UNIX: use "ar" to extract objects and re-add to a common lib
     foreach(lib ${libs})
       set(objlistfile ${lib}.objlist) # list of objects in the input library

From 84d1c734ca2fe7a17e000467823d49891507cf0b Mon Sep 17 00:00:00 2001
From: Yi Wang <yi.wang.2005@gmail.com>
Date: Sun, 25 Jun 2017 15:40:45 -0700
Subject: [PATCH 03/86] add paddle/memory/detail/cpu_allocator*

---
 paddle/CMakeLists.txt                      |  1 +
 paddle/memory/CMakeLists.txt               |  1 +
 paddle/memory/README.md                    | 14 ++---
 paddle/memory/detail/CMakeLists.txt        |  1 +
 paddle/memory/detail/cpu_allocator.h       | 63 ++++++++++++++++++++++
 paddle/memory/detail/cpu_allocator_test.cc | 32 +++++++++++
 paddle/memory/memory.cc                    | 51 ++++++++++++++++++
 paddle/memory/memory.h                     | 27 ++++++++++
 paddle/platform/place.cc                   | 12 ++---
 paddle/platform/place.h                    | 45 ++++++++++------
 paddle/platform/place_test.cc              | 14 ++---
 11 files changed, 224 insertions(+), 37 deletions(-)
 create mode 100644 paddle/memory/CMakeLists.txt
 create mode 100644 paddle/memory/detail/CMakeLists.txt
 create mode 100644 paddle/memory/detail/cpu_allocator.h
 create mode 100644 paddle/memory/detail/cpu_allocator_test.cc
 create mode 100644 paddle/memory/memory.cc
 create mode 100644 paddle/memory/memory.h

diff --git a/paddle/CMakeLists.txt b/paddle/CMakeLists.txt
index 573bd937a3..0cddb95244 100644
--- a/paddle/CMakeLists.txt
+++ b/paddle/CMakeLists.txt
@@ -10,6 +10,7 @@ add_subdirectory(trainer)
 add_subdirectory(scripts)
 add_subdirectory(optimizer)
 add_subdirectory(strings)
+add_subdirectory(memory)
 
 # Do not build go directory until go cmake is working smoothly.
 # if(CMAKE_Go_COMPILER)
diff --git a/paddle/memory/CMakeLists.txt b/paddle/memory/CMakeLists.txt
new file mode 100644
index 0000000000..3943c3cfad
--- /dev/null
+++ b/paddle/memory/CMakeLists.txt
@@ -0,0 +1 @@
+add_subdirectory(detail)
diff --git a/paddle/memory/README.md b/paddle/memory/README.md
index fd32d07ef4..e5f7880e4c 100644
--- a/paddle/memory/README.md
+++ b/paddle/memory/README.md
@@ -31,7 +31,7 @@ In `paddle/memory/memory.h` we have:
 namespace memory {
 template <typename Place> void* Alloc(Place, size_t);
 template <typename Place> void Free(Place, void*);
-template <typename Place> void Used(Place);
+template <typename Place> size_t Used(Place);
 }  // namespace memory
 ```
 
@@ -39,7 +39,7 @@ These function templates have specializations on either `platform::CPUPlace` or
 
 ```cpp
 template<>
-void Alloc<CPUPlace>(CPUPlace p, size_t size) {
+void* Alloc<CPUPlace>(CPUPlace p, size_t size) {
   return GetCPUBuddyAllocator()->Alloc(size);
 }
 ```
@@ -102,15 +102,11 @@ class BuddyAllocator {
 };
 ```
 
-#### System Allocators
-
-The `GPUAllocator` and `CPUAllocator` are calls *system allocators*.  They work as the fallback allocators of `BuddyAllocator`.  A system allocator holds information about a device, including the amount of memory has been allocated, so we can call
+Because BuddyAllocator has the meta-data of each block, it can trace the used memory -- it records the amount returned by `Alloc` and freed in `Free`.  In contrast, `CPUAllocator` and `GPUAllocator` don't know the size of a freed memory block and cannot do the trace.
 
-- `GPUAllocator::Used()` and
-- `CPUAllocator::Used()`
-
-to get the amount of memory that has been allocated so far.
+#### System Allocators
 
+The `GPUAllocator` and `CPUAllocator` are called *system allocators*.  They work as the fallback allocators of `BuddyAllocator`.
 
 ## Justification
 
diff --git a/paddle/memory/detail/CMakeLists.txt b/paddle/memory/detail/CMakeLists.txt
new file mode 100644
index 0000000000..fb8a11062d
--- /dev/null
+++ b/paddle/memory/detail/CMakeLists.txt
@@ -0,0 +1 @@
+cc_test(cpu_allocator_test SRCS cpu_allocator_test.cc)
diff --git a/paddle/memory/detail/cpu_allocator.h b/paddle/memory/detail/cpu_allocator.h
new file mode 100644
index 0000000000..8a872d3800
--- /dev/null
+++ b/paddle/memory/detail/cpu_allocator.h
@@ -0,0 +1,63 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <malloc.h>  // for malloc and free
+#include <stddef.h>  // for size_t
+
+namespace paddle {
+namespace memory {
+namespace detail {
+
+// CPUAllocator<staging=true> calls cudaMallocHost, which returns
+// pinned and mlocked memory as staging areas for data exchange
+// between host and device.  Allocates too much would reduce the
+// amount of memory available to the system for paging.  So, by
+// default, we should use CPUAllocator<staging=false>.
+template <bool staging>
+class CPUAllocator {
+public:
+  void* Alloc(size_t size);
+  void Free(void* p);
+};
+
+template <>
+class CPUAllocator<false> {
+public:
+  void* Alloc(size_t size) { return malloc(size); }
+  void Free(void* p) { free(p); }
+};
+
+// If CMake macro WITH_GPU is OFF, C++ compiler won't generate the
+// following specialization that depends on the CUDA library.
+#ifdef WITH_GPU
+template <>
+class CPUAllocator<true> {
+public:
+  void* Alloc(size_t size) {
+    void* p;
+    if (cudaMallocHost(&p, size) != cudaSuccess) {
+      return NULL;
+    }
+    return *p;
+  }
+
+  void Free(void* p) { cudaFreeHost(p); }
+};
+#endif  // WITH_GPU
+
+}  // namespace detail
+}  // namespace memory
+}  // namespace paddle
diff --git a/paddle/memory/detail/cpu_allocator_test.cc b/paddle/memory/detail/cpu_allocator_test.cc
new file mode 100644
index 0000000000..0aa33a22fd
--- /dev/null
+++ b/paddle/memory/detail/cpu_allocator_test.cc
@@ -0,0 +1,32 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/memory/detail/cpu_allocator.h"
+#include "gtest/gtest.h"
+
+TEST(CPUAllocator, NonStaging) {
+  paddle::memory::detail::CPUAllocator<false> a;
+  void* p = a.Alloc(4096);
+  EXPECT_NE(p, nullptr);
+  a.Free(p);
+}
+
+#ifdef WITH_GPU
+TEST(CPUAllocator, Staging) {
+  paddle::memory::detail::CPUAllocator<true> a;
+  void* p = a.Alloc(4096);
+  EXPECT_NE(p, nullptr);
+  a.Free(p);
+}
+#endif  // WITH_GPU
diff --git a/paddle/memory/memory.cc b/paddle/memory/memory.cc
new file mode 100644
index 0000000000..5f1253ede6
--- /dev/null
+++ b/paddle/memory/memory.cc
@@ -0,0 +1,51 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/memory/memory.h"
+
+namespace paddle {
+namespace memory {
+
+template <>
+void* Alloc<CPUPlace>(CPUPlace, size_t size) {
+  return GetCPUBuddyAllocator()->Alloc(size);
+}
+
+template <>
+void* Alloc<GPUPlace>(GPUPlace pl, size_t size) {
+  return GetGPUBuddyAllocator(pl.device)->Alloc(size);
+}
+
+template <>
+void Free<CPUPlace>(CPUPlace, void* p) {
+  return GetCPUBuddyAllocator()->Free(p);
+}
+
+template <>
+void* Alloc<GPUPlace>(GPUPlace pl, void* p) {
+  return GetGPUBuddyAllocator(pl.device)->Free(p);
+}
+
+template <>
+size_t Used<CPUPlace>(CPUPlace) {
+  return GetCPUBuddyAllocator()->Used();
+}
+
+template <>
+size_t Alloc<GPUPlace>(GPUPlace pl) {
+  return GetGPUBuddyAllocator(pl.device)->Used();
+}
+
+}  // namespace memory
+}  // namespace paddle
diff --git a/paddle/memory/memory.h b/paddle/memory/memory.h
new file mode 100644
index 0000000000..ae8ac6ca52
--- /dev/null
+++ b/paddle/memory/memory.h
@@ -0,0 +1,27 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/frameowork/place.h"
+
+namespace paddle {
+namespace memory {
+
+typename<typename paddle::framework::Place> void* Alloc(Place, size_t);
+typename<typename paddle::framework::Place> void Free(Place, void*);
+typename<typename paddle::framework::Place> size_t Used(Place);
+
+}  // namespace memory
+}  // namespace paddle
diff --git a/paddle/platform/place.cc b/paddle/platform/place.cc
index 1afd03c011..0704820aa0 100644
--- a/paddle/platform/place.cc
+++ b/paddle/platform/place.cc
@@ -8,8 +8,8 @@ namespace detail {
 class PlacePrinter : public boost::static_visitor<> {
  public:
   PlacePrinter(std::ostream &os) : os_(os) {}
-  void operator()(const CpuPlace &) { os_ << "CpuPlace"; }
-  void operator()(const GpuPlace &p) { os_ << "GpuPlace(" << p.device << ")"; }
+  void operator()(const CPUPlace &) { os_ << "CPUPlace"; }
+  void operator()(const GPUPlace &p) { os_ << "GPUPlace(" << p.device << ")"; }
 
  private:
   std::ostream &os_;
@@ -22,14 +22,14 @@ static Place the_default_place;
 void set_place(const Place &place) { the_default_place = place; }
 const Place &get_place() { return the_default_place; }
 
-const GpuPlace default_gpu() { return GpuPlace(0); }
-const CpuPlace default_cpu() { return CpuPlace(); }
+const GPUPlace default_gpu() { return GPUPlace(0); }
+const CPUPlace default_cpu() { return CPUPlace(); }
 
 bool is_gpu_place(const Place &p) {
-  return boost::apply_visitor(IsGpuPlace(), p);
+  return boost::apply_visitor(IsGPUPlace(), p);
 }
 bool is_cpu_place(const Place &p) {
-  return !boost::apply_visitor(IsGpuPlace(), p);
+  return !boost::apply_visitor(IsGPUPlace(), p);
 }
 
 bool places_are_same_class(const Place &p1, const Place &p2) {
diff --git a/paddle/platform/place.h b/paddle/platform/place.h
index 489572c526..7cead18388 100644
--- a/paddle/platform/place.h
+++ b/paddle/platform/place.h
@@ -1,43 +1,58 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
 #pragma once
+
 #include <boost/variant.hpp>
 #include <iostream>
 
 namespace paddle {
 namespace platform {
 
-struct CpuPlace {
+struct CPUPlace {
   // WORKAROUND: for some reason, omitting this constructor
   // causes errors with boost 1.59 and OSX
-  CpuPlace() {}
+  CPUPlace() {}
 
   // needed for variant equality comparison
-  inline bool operator==(const CpuPlace &) const { return true; }
-  inline bool operator!=(const CpuPlace &) const { return false; }
+  inline bool operator==(const CPUPlace &) const { return true; }
+  inline bool operator!=(const CPUPlace &) const { return false; }
 };
 
-struct GpuPlace {
-  GpuPlace() : GpuPlace(0) {}
-  GpuPlace(int d) : device(d) {}
+struct GPUPlace {
+  GPUPlace() : GPUPlace(0) {}
+  GPUPlace(int d) : device(d) {}
 
   // needed for variant equality comparison
-  inline bool operator==(const GpuPlace &o) const { return device == o.device; }
-  inline bool operator!=(const GpuPlace &o) const { return !(*this == o); }
+  inline bool operator==(const GPUPlace &o) const { return device == o.device; }
+  inline bool operator!=(const GPUPlace &o) const { return !(*this == o); }
 
   int device;
 };
 
-struct IsGpuPlace : public boost::static_visitor<bool> {
-  bool operator()(const CpuPlace &) const { return false; }
-  bool operator()(const GpuPlace &gpu) const { return true; }
+struct IsGPUPlace : public boost::static_visitor<bool> {
+  bool operator()(const CPUPlace &) const { return false; }
+  bool operator()(const GPUPlace &gpu) const { return true; }
 };
 
-typedef boost::variant<GpuPlace, CpuPlace> Place;
+typedef boost::variant<GPUPlace, CPUPlace> Place;
 
 void set_place(const Place &);
 const Place &get_place();
 
-const GpuPlace default_gpu();
-const CpuPlace default_cpu();
+const GPUPlace default_gpu();
+const CPUPlace default_cpu();
 
 bool is_gpu_place(const Place &);
 bool is_cpu_place(const Place &);
diff --git a/paddle/platform/place_test.cc b/paddle/platform/place_test.cc
index 73fccceedf..33e2e5a439 100644
--- a/paddle/platform/place_test.cc
+++ b/paddle/platform/place_test.cc
@@ -3,8 +3,8 @@
 #include "gtest/gtest.h"
 
 TEST(Place, Equality) {
-  paddle::platform::CpuPlace cpu;
-  paddle::platform::GpuPlace g0(0), g1(1), gg0(0);
+  paddle::platform::CPUPlace cpu;
+  paddle::platform::GPUPlace g0(0), g1(1), gg0(0);
 
   EXPECT_EQ(cpu, cpu);
   EXPECT_EQ(g0, g0);
@@ -22,19 +22,19 @@ TEST(Place, Default) {
   EXPECT_TRUE(paddle::platform::is_gpu_place(paddle::platform::default_gpu()));
   EXPECT_TRUE(paddle::platform::is_cpu_place(paddle::platform::default_cpu()));
 
-  paddle::platform::set_place(paddle::platform::CpuPlace());
+  paddle::platform::set_place(paddle::platform::CPUPlace());
   EXPECT_TRUE(paddle::platform::is_cpu_place(paddle::platform::get_place()));
 }
 
 TEST(Place, Print) {
   {
     std::stringstream ss;
-    ss << paddle::platform::GpuPlace(1);
-    EXPECT_EQ("GpuPlace(1)", ss.str());
+    ss << paddle::platform::GPUPlace(1);
+    EXPECT_EQ("GPUPlace(1)", ss.str());
   }
   {
     std::stringstream ss;
-    ss << paddle::platform::CpuPlace();
-    EXPECT_EQ("CpuPlace", ss.str());
+    ss << paddle::platform::CPUPlace();
+    EXPECT_EQ("CPUPlace", ss.str());
   }
 }

From db128c4586c3c925a6c53a9ae770cb07cdbea1bf Mon Sep 17 00:00:00 2001
From: Yi Wang <yi.wang.2005@gmail.com>
Date: Sun, 25 Jun 2017 17:54:06 -0700
Subject: [PATCH 04/86] Pass cpu_allocator_test

---
 CMakeLists.txt                             |  2 +-
 cmake/generic.cmake                        |  4 ++++
 paddle/memory/detail/CMakeLists.txt        |  6 +++++-
 paddle/memory/detail/cpu_allocator.h       | 13 +++++++++----
 paddle/memory/detail/cpu_allocator_test.cc | 16 +++++++++++-----
 paddle/memory/memory.cc                    | 14 ++++++++++++--
 paddle/memory/memory.h                     | 16 +++++++++++++---
 7 files changed, 55 insertions(+), 16 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index c5d7f2c7ec..3c719d35ec 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -71,7 +71,7 @@ if(ANDROID)
         "Disable RDMA when cross-compiling for Android" FORCE)
 endif(ANDROID)
 
-set(THIRD_PARTY_PATH "${PROJ_ROOT}/third_party" CACHE STRING
+set(THIRD_PARTY_PATH "${CMAKE_BINARY_DIR}/third_party" CACHE STRING
   "A path setting third party libraries download & build directories.")
 
 if (WITH_C_API AND WITH_PYTHON)
diff --git a/cmake/generic.cmake b/cmake/generic.cmake
index 69e8164a00..840155750e 100644
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -78,6 +78,10 @@
 #
 #   cc_test(example_test SRCS example_test.cc DEPS example glog gflags)
 
+if(WITH_GPU)
+  add_definitions(-DPADDLE_WITH_GPU)
+endif()
+
 if(NOT APPLE)
     find_package(Threads REQUIRED)
     link_libraries(${CMAKE_THREAD_LIBS_INIT})
diff --git a/paddle/memory/detail/CMakeLists.txt b/paddle/memory/detail/CMakeLists.txt
index fb8a11062d..c425e9f947 100644
--- a/paddle/memory/detail/CMakeLists.txt
+++ b/paddle/memory/detail/CMakeLists.txt
@@ -1 +1,5 @@
-cc_test(cpu_allocator_test SRCS cpu_allocator_test.cc)
+if(${WITH_GPU})
+  nv_test(cpu_allocator_test SRCS cpu_allocator_test.cc) # nv_test links CUDA, but
+else(${WITH_GPU})
+  cc_test(cpu_allocator_test SRCS cpu_allocator_test.cc) # cc_test doesn't.
+endif(${WITH_GPU})
diff --git a/paddle/memory/detail/cpu_allocator.h b/paddle/memory/detail/cpu_allocator.h
index 8a872d3800..0d8ea3f52b 100644
--- a/paddle/memory/detail/cpu_allocator.h
+++ b/paddle/memory/detail/cpu_allocator.h
@@ -17,6 +17,11 @@ limitations under the License. */
 #include <malloc.h>  // for malloc and free
 #include <stddef.h>  // for size_t
 
+#ifdef PADDLE_WITH_GPU
+#include <cuda.h>
+#include <cuda_runtime_api.h>
+#endif  // PADDLE_WITH_GPU
+
 namespace paddle {
 namespace memory {
 namespace detail {
@@ -40,9 +45,9 @@ public:
   void Free(void* p) { free(p); }
 };
 
-// If CMake macro WITH_GPU is OFF, C++ compiler won't generate the
+// If CMake macro PADDLE_WITH_GPU is OFF, C++ compiler won't generate the
 // following specialization that depends on the CUDA library.
-#ifdef WITH_GPU
+#ifdef PADDLE_WITH_GPU
 template <>
 class CPUAllocator<true> {
 public:
@@ -51,12 +56,12 @@ public:
     if (cudaMallocHost(&p, size) != cudaSuccess) {
       return NULL;
     }
-    return *p;
+    return p;
   }
 
   void Free(void* p) { cudaFreeHost(p); }
 };
-#endif  // WITH_GPU
+#endif  // PADDLE_WITH_GPU
 
 }  // namespace detail
 }  // namespace memory
diff --git a/paddle/memory/detail/cpu_allocator_test.cc b/paddle/memory/detail/cpu_allocator_test.cc
index 0aa33a22fd..464bc84e5c 100644
--- a/paddle/memory/detail/cpu_allocator_test.cc
+++ b/paddle/memory/detail/cpu_allocator_test.cc
@@ -22,11 +22,17 @@ TEST(CPUAllocator, NonStaging) {
   a.Free(p);
 }
 
-#ifdef WITH_GPU
+#ifdef PADDLE_WITH_GPU
 TEST(CPUAllocator, Staging) {
   paddle::memory::detail::CPUAllocator<true> a;
-  void* p = a.Alloc(4096);
-  EXPECT_NE(p, nullptr);
-  a.Free(p);
+
+  int devices;
+  if (cudaGetDeviceCount(&devices) == cudaSuccess && devices > 0) {
+    void* p = a.Alloc(4096);
+    EXPECT_NE(p, nullptr);
+    a.Free(p);
+  } else {
+    EXPECT_EQ(a.Alloc(4096), nullptr);
+  }
 }
-#endif  // WITH_GPU
+#endif  // PADDLE_WITH_GPU
diff --git a/paddle/memory/memory.cc b/paddle/memory/memory.cc
index 5f1253ede6..b617923731 100644
--- a/paddle/memory/memory.cc
+++ b/paddle/memory/memory.cc
@@ -19,7 +19,11 @@ namespace memory {
 
 template <>
 void* Alloc<CPUPlace>(CPUPlace, size_t size) {
-  return GetCPUBuddyAllocator()->Alloc(size);
+  return GetCPUBuddyAllocator(false /*non-staging*/)->Alloc(size);
+}
+
+void* AllocStaging(CPUPlace, size_t size) {
+  return GetCPUBuddyAllocator(true /*staging*/)->Alloc(size);
 }
 
 template <>
@@ -29,9 +33,14 @@ void* Alloc<GPUPlace>(GPUPlace pl, size_t size) {
 
 template <>
 void Free<CPUPlace>(CPUPlace, void* p) {
-  return GetCPUBuddyAllocator()->Free(p);
+  return GetCPUBuddyAllocator(false /*non-staging*/)->Free(p);
+}
+
+void FreeStaging(CPUPlace, void* p) {  // releases memory from AllocStaging
+  return GetCPUBuddyAllocator(true /*staging*/)->Free(p);
+}
 
+#ifdef PADDLE_WITH_GPU
 template <>
 void* Alloc<GPUPlace>(GPUPlace pl, void* p) {
   return GetGPUBuddyAllocator(pl.device)->Free(p);
@@ -46,6 +55,7 @@ template <>
 size_t Alloc<GPUPlace>(GPUPlace pl) {
   return GetGPUBuddyAllocator(pl.device)->Used();
 }
+#endif  // PADDLE_WITH_GPU
 
 }  // namespace memory
 }  // namespace paddle
diff --git a/paddle/memory/memory.h b/paddle/memory/memory.h
index ae8ac6ca52..8c15a133bb 100644
--- a/paddle/memory/memory.h
+++ b/paddle/memory/memory.h
@@ -19,9 +19,19 @@ limitations under the License. */
 namespace paddle {
 namespace memory {
 
-typename<typename paddle::framework::Place> void* Alloc(Place, size_t);
-typename<typename paddle::framework::Place> void Free(Place, void*);
-typename<typename paddle::framework::Place> size_t Used(Place);
+template <typename paddle::framework::Place>
+void* Alloc(Place, size_t);
+template <typename paddle::framework::Place>
+void Free(Place, void*);
+template <typename paddle::framework::Place>
+size_t Used(Place);
+
+// Staging memory means "pinned" host memory that can be mapped into
+// the CUDA memory space and accessed by the device rapidly.  Don't
+// allocate too much staging memory; otherwise system performance will
+// degrade because the OS cannot find enough swap memory space.
+void* AllocStaging(CPUPlace, size_t);
+void* FreeStaging(CPUPlace, size_t);
 
 }  // namespace memory
 }  // namespace paddle

From ce938ae5f9baea2b2d136154ee9a696b394929e1 Mon Sep 17 00:00:00 2001
From: liaogang <liaogang@baidu.com>
Date: Mon, 26 Jun 2017 23:32:46 +0800
Subject: [PATCH 05/86] FIX: Pinned memory

---
 paddle/memory/README.md                    |  1 +
 paddle/memory/detail/CMakeLists.txt        |  6 +---
 paddle/memory/detail/cpu_allocator.h       | 39 ++++++++++++----------
 paddle/memory/detail/cpu_allocator_test.cc | 16 +++------
 4 files changed, 27 insertions(+), 35 deletions(-)

diff --git a/paddle/memory/README.md b/paddle/memory/README.md
index e5f7880e4c..96a331a486 100644
--- a/paddle/memory/README.md
+++ b/paddle/memory/README.md
@@ -97,6 +97,7 @@ class BuddyAllocator {
   struct Block {
     size_t size;
     Block* left, right;
+    size_t index; // allocator id
   };
   ...
 };
diff --git a/paddle/memory/detail/CMakeLists.txt b/paddle/memory/detail/CMakeLists.txt
index c425e9f947..fb8a11062d 100644
--- a/paddle/memory/detail/CMakeLists.txt
+++ b/paddle/memory/detail/CMakeLists.txt
@@ -1,5 +1 @@
-if(${WITH_GPU})
-  nv_test(cpu_allocator_test SRCS cpu_allocator_test.cc) # nv_test links CUDA, but
-else(${WITH_GPU})
-  cc_test(cpu_allocator_test SRCS cpu_allocator_test.cc) # cc_test doesn't.
-endif(${WITH_GPU})
+cc_test(cpu_allocator_test SRCS cpu_allocator_test.cc)
diff --git a/paddle/memory/detail/cpu_allocator.h b/paddle/memory/detail/cpu_allocator.h
index 0d8ea3f52b..a487fecef4 100644
--- a/paddle/memory/detail/cpu_allocator.h
+++ b/paddle/memory/detail/cpu_allocator.h
@@ -14,20 +14,19 @@ limitations under the License. */
 
 #pragma once
 
-#include <malloc.h>  // for malloc and free
 #include <stddef.h>  // for size_t
+#include <cstdlib>   // for malloc and free
 
-#ifdef PADDLE_WITH_GPU
-#include <cuda.h>
-#include <cuda_runtime_api.h>
-#endif  // PADDLE_WITH_GPU
+#ifndef _WIN32
+#include <sys/mman.h>  // for mlock and munlock
+#endif
 
 namespace paddle {
 namespace memory {
 namespace detail {
 
-// CPUAllocator<staging=true> calls cudaMallocHost, which returns
-// pinned and mlocked memory as staging areas for data exchange
+// CPUAllocator<staging=true> calls mlock, which returns
+// pinned and locked memory as staging areas for data exchange
 // between host and device.  Allocates too much would reduce the
 // amount of memory available to the system for paging.  So, by
 // default, we should use CPUAllocator<staging=false>.
@@ -35,33 +34,37 @@ template <bool staging>
 class CPUAllocator {
 public:
   void* Alloc(size_t size);
-  void Free(void* p);
+  void Free(void* p, size_t size);
 };
 
 template <>
 class CPUAllocator<false> {
 public:
-  void* Alloc(size_t size) { return malloc(size); }
-  void Free(void* p) { free(p); }
+  void* Alloc(size_t size) { return std::malloc(size); }
+  void Free(void* p, size_t size) { std::free(p); }
 };
 
-// If CMake macro PADDLE_WITH_GPU is OFF, C++ compiler won't generate the
-// following specialization that depends on the CUDA library.
-#ifdef PADDLE_WITH_GPU
 template <>
 class CPUAllocator<true> {
 public:
   void* Alloc(size_t size) {
-    void* p;
-    if (cudaMallocHost(&p, size) != cudaSuccess) {
-      return NULL;
+    void* p = std::malloc(size);
+    if (p == nullptr) {
+      return p;
     }
+#ifndef _WIN32
+    mlock(p, size);
+#endif
     return p;
   }
 
-  void Free(void* p) { cudaFreeHost(p); }
+  void Free(void* p, size_t size) {
+#ifndef _WIN32
+    munlock(p, size);
+#endif
+    std::free(p);
+  }
 };
-#endif  // PADDLE_WITH_GPU
 
 }  // namespace detail
 }  // namespace memory
diff --git a/paddle/memory/detail/cpu_allocator_test.cc b/paddle/memory/detail/cpu_allocator_test.cc
index 464bc84e5c..4e45266cd8 100644
--- a/paddle/memory/detail/cpu_allocator_test.cc
+++ b/paddle/memory/detail/cpu_allocator_test.cc
@@ -19,20 +19,12 @@ TEST(CPUAllocator, NonStaging) {
   paddle::memory::detail::CPUAllocator<false> a;
   void* p = a.Alloc(4096);
   EXPECT_NE(p, nullptr);
-  a.Free(p);
+  a.Free(p, 4096);
 }
 
-#ifdef PADDLE_WITH_GPU
 TEST(CPUAllocator, Staging) {
   paddle::memory::detail::CPUAllocator<true> a;
-
-  int devices;
-  if (cudaGetDeviceCount(&devices) == cudaSuccess && devices > 0) {
-    void* p = a.Alloc(4096);
-    EXPECT_NE(p, nullptr);
-    a.Free(p);
-  } else {
-    EXPECT_EQ(a.Alloc(4096), nullptr);
-  }
+  void* p = a.Alloc(4096);
+  EXPECT_NE(p, nullptr);
+  a.Free(p, 4096);
 }
-#endif  // PADDLE_WITH_GPU

From ce70df86b1e8c892cdde5312caa0c2699f368f7d Mon Sep 17 00:00:00 2001
From: liaogang <liaogang@baidu.com>
Date: Tue, 27 Jun 2017 00:15:36 +0800
Subject: [PATCH 06/86] Add gpu_allocator

---
 paddle/memory/.clang-format                |  5 ++
 paddle/memory/detail/CMakeLists.txt        |  1 +
 paddle/memory/detail/cpu_allocator.h       |  6 +-
 paddle/memory/detail/gpu_allocator.h       | 92 ++++++++++++++++++++++
 paddle/memory/detail/gpu_allocator_test.cc | 30 +++++++
 5 files changed, 131 insertions(+), 3 deletions(-)
 create mode 100644 paddle/memory/.clang-format
 create mode 100644 paddle/memory/detail/gpu_allocator.h
 create mode 100644 paddle/memory/detail/gpu_allocator_test.cc

diff --git a/paddle/memory/.clang-format b/paddle/memory/.clang-format
new file mode 100644
index 0000000000..29282dc87e
--- /dev/null
+++ b/paddle/memory/.clang-format
@@ -0,0 +1,5 @@
+---
+Language:        Cpp
+BasedOnStyle:  Google
+Standard:  Cpp11 
+...
diff --git a/paddle/memory/detail/CMakeLists.txt b/paddle/memory/detail/CMakeLists.txt
index fb8a11062d..81ca8a0bbf 100644
--- a/paddle/memory/detail/CMakeLists.txt
+++ b/paddle/memory/detail/CMakeLists.txt
@@ -1 +1,2 @@
 cc_test(cpu_allocator_test SRCS cpu_allocator_test.cc)
+nv_test(gpu_allocator_test SRCS gpu_allocator_test.cc)
diff --git a/paddle/memory/detail/cpu_allocator.h b/paddle/memory/detail/cpu_allocator.h
index a487fecef4..17753ccef7 100644
--- a/paddle/memory/detail/cpu_allocator.h
+++ b/paddle/memory/detail/cpu_allocator.h
@@ -32,21 +32,21 @@ namespace detail {
 // default, we should use CPUAllocator<staging=false>.
 template <bool staging>
 class CPUAllocator {
-public:
+ public:
   void* Alloc(size_t size);
   void Free(void* p, size_t size);
 };
 
 template <>
 class CPUAllocator<false> {
-public:
+ public:
   void* Alloc(size_t size) { return std::malloc(size); }
   void Free(void* p, size_t size) { std::free(p); }
 };
 
 template <>
 class CPUAllocator<true> {
-public:
+ public:
   void* Alloc(size_t size) {
     void* p = std::malloc(size);
     if (p == nullptr) {
diff --git a/paddle/memory/detail/gpu_allocator.h b/paddle/memory/detail/gpu_allocator.h
new file mode 100644
index 0000000000..9452c41fb8
--- /dev/null
+++ b/paddle/memory/detail/gpu_allocator.h
@@ -0,0 +1,92 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <stddef.h>  // for size_t
+
+#include <thrust/system_error.h>
+#include <thrust/system/cuda/error.h>
+
+namespace paddle {
+namespace memory {
+namespace detail {
+
+inline void throw_on_error(cudaError_t e, const char* message) {
+  if (e) {
+    throw thrust::system_error(e, thrust::cuda_category(), message);
+  }
+}
+
+// GPUAllocator<staging=true> calls cudaHostMalloc, which returns
+// pinned and locked memory as staging areas for data exchange
+// between host and device.  Allocates too much would reduce the
+// amount of memory available to the system for paging.  So, by
+// default, we should use GPUAllocator<staging=false>.
+template <bool staging>
+class GPUAllocator {
+public:
+  void* Alloc(size_t size);
+  void Free(void* p, size_t size);
+};
+
+template <>
+class GPUAllocator<false> {
+public:
+  void* Alloc(size_t size) {
+    void* p = 0;
+    cudaError_t result = cudaMalloc(&p, size);
+    if (result == cudaSuccess) {
+      return p;
+    }
+    // clear last error
+    cudaGetLastError();
+    return nullptr;
+  }
+
+  void Free(void* p, size_t size) {
+    // Purposefully allow cudaErrorCudartUnloading, because
+    // that is returned if you ever call cudaFree after the
+    // driver has already shutdown. This happens only if the
+    // process is terminating, in which case we don't care if
+    // cudaFree succeeds. 
+    auto err = cudaFree(p);
+    if (err != cudaErrorCudartUnloading) {
+        throw_on_error(err, "cudaFree failed");
+    }
+  }
+};
+
+template <>
+class GPUAllocator<true> {
+public:
+  void* Alloc(size_t size) {
+    void* p = 0;
+    cudaError_t result = cudaMallocHost(&p, size);
+    if (result == cudaSuccess) {
+        return p;
+    }
+    // clear last error
+    cudaGetLastError();
+    return nullptr;
+  }
+
+  void Free(void* p, size_t size) {
+    throw_on_error(cudaFreeHost(p), "cudaFreeHost failed");
+  }
+};
+
+}  // namespace detail
+}  // namespace memory
+}  // namespace paddle
diff --git a/paddle/memory/detail/gpu_allocator_test.cc b/paddle/memory/detail/gpu_allocator_test.cc
new file mode 100644
index 0000000000..18c1c9ab43
--- /dev/null
+++ b/paddle/memory/detail/gpu_allocator_test.cc
@@ -0,0 +1,30 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/memory/detail/gpu_allocator.h"
+#include "gtest/gtest.h"
+
+TEST(GPUAllocator, NonStaging) {
+  paddle::memory::detail::GPUAllocator<false> a;
+  void* p = a.Alloc(4096);
+  EXPECT_NE(p, nullptr);
+  a.Free(p, 4096);
+}
+
+TEST(GPUAllocator, Staging) {
+  paddle::memory::detail::GPUAllocator<true> a;
+  void* p = a.Alloc(4096);
+  EXPECT_NE(p, nullptr);
+  a.Free(p, 4096);
+}

From e02859c0f53dfe4616976b015d4fefd8aaa6eb39 Mon Sep 17 00:00:00 2001
From: Yi Wang <yiwang01@baidu.com>
Date: Mon, 26 Jun 2017 15:27:01 -0700
Subject: [PATCH 07/86] Replace {cpu,gpu}_allocator.h and
 {cpu,gpu}_allocator_test.cc by system_allocator{.h,_test.cc}

---
 paddle/memory/CMakeLists.txt                  |  6 ++
 paddle/memory/detail/CMakeLists.txt           |  3 +-
 paddle/memory/detail/cpu_allocator.h          | 71 -----------------
 paddle/memory/detail/cpu_allocator_test.cc    | 30 -------
 .../{gpu_allocator.h => system_allocator.h}   | 79 +++++++++++--------
 ...cator_test.cc => system_allocator_test.cc} | 20 ++++-
 paddle/memory/memory.cc                       | 67 +++++++---------
 paddle/memory/memory.h                        | 16 +---
 8 files changed, 106 insertions(+), 186 deletions(-)
 delete mode 100644 paddle/memory/detail/cpu_allocator.h
 delete mode 100644 paddle/memory/detail/cpu_allocator_test.cc
 rename paddle/memory/detail/{gpu_allocator.h => system_allocator.h} (58%)
 rename paddle/memory/detail/{gpu_allocator_test.cc => system_allocator_test.cc} (69%)

diff --git a/paddle/memory/CMakeLists.txt b/paddle/memory/CMakeLists.txt
index 3943c3cfad..8662512496 100644
--- a/paddle/memory/CMakeLists.txt
+++ b/paddle/memory/CMakeLists.txt
@@ -1 +1,7 @@
 add_subdirectory(detail)
+
+if(${WITH_GPU})
+  nv_library(memory SRCS memory.cc)
+else(${WITH_GPU})
+  cc_library(memory SRCS memory.cc)  # same source file as the GPU branch
+endif(${WITH_GPU})
diff --git a/paddle/memory/detail/CMakeLists.txt b/paddle/memory/detail/CMakeLists.txt
index 81ca8a0bbf..3b5bbd7a12 100644
--- a/paddle/memory/detail/CMakeLists.txt
+++ b/paddle/memory/detail/CMakeLists.txt
@@ -1,2 +1 @@
-cc_test(cpu_allocator_test SRCS cpu_allocator_test.cc)
-nv_test(gpu_allocator_test SRCS gpu_allocator_test.cc)
+cc_test(system_allocator_test SRCS system_allocator_test.cc)
diff --git a/paddle/memory/detail/cpu_allocator.h b/paddle/memory/detail/cpu_allocator.h
deleted file mode 100644
index 17753ccef7..0000000000
--- a/paddle/memory/detail/cpu_allocator.h
+++ /dev/null
@@ -1,71 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <stddef.h>  // for size_t
-#include <cstdlib>   // for malloc and free
-
-#ifndef _WIN32
-#include <sys/mman.h>  // for mlock and munlock
-#endif
-
-namespace paddle {
-namespace memory {
-namespace detail {
-
-// CPUAllocator<staging=true> calls mlock, which returns
-// pinned and locked memory as staging areas for data exchange
-// between host and device.  Allocates too much would reduce the
-// amount of memory available to the system for paging.  So, by
-// default, we should use CPUAllocator<staging=false>.
-template <bool staging>
-class CPUAllocator {
- public:
-  void* Alloc(size_t size);
-  void Free(void* p, size_t size);
-};
-
-template <>
-class CPUAllocator<false> {
- public:
-  void* Alloc(size_t size) { return std::malloc(size); }
-  void Free(void* p, size_t size) { std::free(p); }
-};
-
-template <>
-class CPUAllocator<true> {
- public:
-  void* Alloc(size_t size) {
-    void* p = std::malloc(size);
-    if (p == nullptr) {
-      return p;
-    }
-#ifndef _WIN32
-    mlock(p, size);
-#endif
-    return p;
-  }
-
-  void Free(void* p, size_t size) {
-#ifndef _WIN32
-    munlock(p, size);
-#endif
-    std::free(p);
-  }
-};
-
-}  // namespace detail
-}  // namespace memory
-}  // namespace paddle
diff --git a/paddle/memory/detail/cpu_allocator_test.cc b/paddle/memory/detail/cpu_allocator_test.cc
deleted file mode 100644
index 4e45266cd8..0000000000
--- a/paddle/memory/detail/cpu_allocator_test.cc
+++ /dev/null
@@ -1,30 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/memory/detail/cpu_allocator.h"
-#include "gtest/gtest.h"
-
-TEST(CPUAllocator, NonStaging) {
-  paddle::memory::detail::CPUAllocator<false> a;
-  void* p = a.Alloc(4096);
-  EXPECT_NE(p, nullptr);
-  a.Free(p, 4096);
-}
-
-TEST(CPUAllocator, Staging) {
-  paddle::memory::detail::CPUAllocator<true> a;
-  void* p = a.Alloc(4096);
-  EXPECT_NE(p, nullptr);
-  a.Free(p, 4096);
-}
diff --git a/paddle/memory/detail/gpu_allocator.h b/paddle/memory/detail/system_allocator.h
similarity index 58%
rename from paddle/memory/detail/gpu_allocator.h
rename to paddle/memory/detail/system_allocator.h
index 9452c41fb8..0a64553188 100644
--- a/paddle/memory/detail/gpu_allocator.h
+++ b/paddle/memory/detail/system_allocator.h
@@ -14,20 +14,58 @@ limitations under the License. */
 
 #pragma once
 
-#include <stddef.h>  // for size_t
+#include <stddef.h>    // for size_t
+#include <sys/mman.h>  // for mlock and munlock
+#include <cstdlib>     // for malloc and free
 
-#include <thrust/system_error.h>
+#ifndef PADDLE_ONLY_CPU
 #include <thrust/system/cuda/error.h>
+#include <thrust/system_error.h>
+#endif  // PADDLE_ONLY_CPU
 
 namespace paddle {
 namespace memory {
 namespace detail {
 
+class SystemAllocator {
+ public:
+  virtual void* Alloc(size_t size) = 0;
+  virtual void* Free(void* p) = 0;
+};
+
+// CPUAllocator<lock_memory=true> calls mlock, which returns pinned
+// and locked memory as staging areas for data exchange between host
+// and device.  Allocates too much would reduce the amount of memory
+// available to the system for paging.  So, by default, we should use
+// CPUAllocator<staging=false>.
+template <bool lock_memory>
+class CPUAllocator : public SystemAllocator {
+ public:
+  virtual void* Alloc(size_t size) {
+    void* p = std::malloc(size);
+    if (p != nullptr && lock_memory) {
+      mlock(p, size);
+    }
+    return p;
+  }
+
+  virtual void Free(void* p, size_t size) {
+    if (p != nullptr && lock_memory) {
+      munlock(p, size);
+    }
+    std::free(p);
+  }
+};
+
+#ifndef PADDLE_ONLY_CPU  // The following code are for CUDA.
+
+namespace {
 inline void throw_on_error(cudaError_t e, const char* message) {
   if (e) {
     throw thrust::system_error(e, thrust::cuda_category(), message);
   }
 }
+}  // namespace
 
 // GPUAllocator<staging=true> calls cudaHostMalloc, which returns
 // pinned and locked memory as staging areas for data exchange
@@ -36,17 +74,11 @@ inline void throw_on_error(cudaError_t e, const char* message) {
 // default, we should use GPUAllocator<staging=false>.
 template <bool staging>
 class GPUAllocator {
-public:
-  void* Alloc(size_t size);
-  void Free(void* p, size_t size);
-};
-
-template <>
-class GPUAllocator<false> {
-public:
+ public:
   void* Alloc(size_t size) {
     void* p = 0;
-    cudaError_t result = cudaMalloc(&p, size);
+    cudaError_t result =
+        staging ? cudaMallocHost(&p, size) : cudaMalloc(&p, size);
     if (result == cudaSuccess) {
       return p;
     }
@@ -60,32 +92,15 @@ public:
     // that is returned if you ever call cudaFree after the
     // driver has already shutdown. This happens only if the
     // process is terminating, in which case we don't care if
-    // cudaFree succeeds. 
-    auto err = cudaFree(p);
+    // cudaFree succeeds.
+    auto err = staging ? cudaFreeHost(p) : cudaFree(p);
     if (err != cudaErrorCudartUnloading) {
-        throw_on_error(err, "cudaFree failed");
+      throw_on_error(err, "cudaFree failed");
     }
   }
 };
 
-template <>
-class GPUAllocator<true> {
-public:
-  void* Alloc(size_t size) {
-    void* p = 0;
-    cudaError_t result = cudaMallocHost(&p, size);
-    if (result == cudaSuccess) {
-        return p;
-    }
-    // clear last error
-    cudaGetLastError();
-    return nullptr;
-  }
-
-  void Free(void* p, size_t size) {
-    throw_on_error(cudaFreeHost(p), "cudaFreeHost failed");
-  }
-};
+#endif  // PADDLE_ONLY_CPU
 
 }  // namespace detail
 }  // namespace memory
diff --git a/paddle/memory/detail/gpu_allocator_test.cc b/paddle/memory/detail/system_allocator_test.cc
similarity index 69%
rename from paddle/memory/detail/gpu_allocator_test.cc
rename to paddle/memory/detail/system_allocator_test.cc
index 18c1c9ab43..4e7b8018b6 100644
--- a/paddle/memory/detail/gpu_allocator_test.cc
+++ b/paddle/memory/detail/system_allocator_test.cc
@@ -12,9 +12,25 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/memory/detail/gpu_allocator.h"
+#include "paddle/memory/detail/system_allocator.h"
 #include "gtest/gtest.h"
 
+TEST(CPUAllocator, NoLockMem) {
+  paddle::memory::detail::CPUAllocator<false> a;
+  void* p = a.Alloc(4096);
+  EXPECT_NE(p, nullptr);
+  a.Free(p, 4096);
+}
+
+TEST(CPUAllocator, LockMem) {
+  paddle::memory::detail::CPUAllocator<true> a;
+  void* p = a.Alloc(4096);
+  EXPECT_NE(p, nullptr);
+  a.Free(p, 4096);
+}
+
+#ifndef PADDLE_ONLY_CPU
+
 TEST(GPUAllocator, NonStaging) {
   paddle::memory::detail::GPUAllocator<false> a;
   void* p = a.Alloc(4096);
@@ -28,3 +44,5 @@ TEST(GPUAllocator, Staging) {
   EXPECT_NE(p, nullptr);
   a.Free(p, 4096);
 }
+
+#endif  // PADDLE_ONLY_CPU
diff --git a/paddle/memory/memory.cc b/paddle/memory/memory.cc
index b617923731..ca3c01ebdb 100644
--- a/paddle/memory/memory.cc
+++ b/paddle/memory/memory.cc
@@ -14,48 +14,46 @@ limitations under the License. */
 
 #include "paddle/memory/memory.h"
 
+#include "paddle/memory/detail/system_allocator.h"
+
 namespace paddle {
 namespace memory {
 
-template <>
-void* Alloc<CPUPlace>(CPUPlace, size_t size) {
-  return GetCPUBuddyAllocator(false /*non-staging*/)->Alloc(size);
-}
-
-void* AllocStaging(CPUPlace, size_t size) {
-  return GetCPUBuddyAllocator(true /*staging*/)->Alloc(size);
-}
-
-template <>
-void* Alloc<GPUPlace>(GPUPlace pl, size_t size) {
-  return GetGPUBuddyAllocator(pl.device)->Alloc(size);
-}
-
-template <>
-void Free<CPUPlace>(CPUPlace, void* p) {
-  return GetCPUBuddyAllocator(false /*non-staging*/)->Free(p);
-}
-
-void FreeStaging(CPUPlace, void* p) {
-  return GetCPUBuddyAllocator(false /*non-staging*/)->Free(p);
-}
-
-#ifdef PADDLE_WITH_GPU
-template <>
-void* Alloc<GPUPlace>(GPUPlace pl, void* p) {
-  return GetGPUBuddyAllocator(pl.device)->Free(p);
-}
-
-template <>
-size_t Used<CPUPlace>(CPUPlace) {
+// Allocates `size` bytes on `pl`: GPU places are served by the buddy
+// allocator of device `pl.device`, CPU places by the CPU buddy allocator.
+void* Alloc(paddle::platform::Place pl, size_t size) {
+#ifndef PADDLE_ONLY_CPU
+  if (paddle::platform::is_gpu_place(pl)) {
+    return GetGPUBuddyAllocator(pl.device)->Alloc(size);
+  }
+#endif  // PADDLE_ONLY_CPU
+  PADDLE_ASSERT(paddle::platform::is_cpu_place(pl));
+  return GetCPUBuddyAllocator()->Alloc(size);
+}
+
+// Releases `p` through the buddy allocator selected by `pl`.  The GPU branch
+// returns early so a GPU place never reaches the CPU-place assertion below.
+void Free(paddle::platform::Place pl, void* p) {
+#ifndef PADDLE_ONLY_CPU
+  if (paddle::platform::is_gpu_place(pl)) {
+    GetGPUBuddyAllocator(pl.device)->Free(p);
+    return;
+  }
+#endif  // PADDLE_ONLY_CPU
+  PADDLE_ASSERT(paddle::platform::is_cpu_place(pl));
+  GetCPUBuddyAllocator()->Free(p);
+}
+
+// Reports the Used() figure of the buddy allocator selected by `pl`.
+size_t Used(paddle::platform::Place pl) {
+#ifndef PADDLE_ONLY_CPU
+  if (paddle::platform::is_gpu_place(pl)) {
+    return GetGPUBuddyAllocator(pl.device)->Used();
+  }
+#endif  // PADDLE_ONLY_CPU
+  PADDLE_ASSERT(paddle::platform::is_cpu_place(pl));
   return GetCPUBuddyAllocator()->Used();
 }
 
-template <>
-size_t Alloc<GPUPlace>(GPUPlace pl) {
-  return GetGPUBuddyAllocator(pl.device)->Used();
-}
-#endif  // PADDLE_WITH_GPU
-
 }  // namespace memory
 }  // namespace paddle
diff --git a/paddle/memory/memory.h b/paddle/memory/memory.h
index 8c15a133bb..0bc609205e 100644
--- a/paddle/memory/memory.h
+++ b/paddle/memory/memory.h
@@ -19,19 +19,11 @@ limitations under the License. */
 namespace paddle {
 namespace memory {
 
-template <typename paddle::framework::Place>
-void* Alloc(Place, size_t);
-template <typename paddle::framework::Place>
-void Free(Place, void*);
-template <typename paddle::framework::Place>
-size_t Used(Place);
-
-// Staging memory means "pinned" host memory that can be mapped into
-// the CUDA memory space and accessed by the device rapidly.  Don't
-// allocate too much staging memory; otherwise system performance will
-// degrade because the OS cannot find enough swap memory space.
-void* AllocStaging(CPUPlace, size_t);
-void* FreeStaging(CPUPlace, size_t);
+// Place-dispatched memory interface.  memory.cc defines these functions on
+// paddle::platform::Place, so the declarations name the same type.
+void* Alloc(paddle::platform::Place, size_t);
+void Free(paddle::platform::Place, void*);
+size_t Used(paddle::platform::Place);
 
 }  // namespace memory
 }  // namespace paddle

From 6250d108bfd39afb3b2beba438ecb22eca8991bc Mon Sep 17 00:00:00 2001
From: liaogang <liaogang@baidu.com>
Date: Tue, 27 Jun 2017 09:51:55 +0800
Subject: [PATCH 08/86] FIX: clang-format

---
 paddle/memory/detail/gpu_allocator.h | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/paddle/memory/detail/gpu_allocator.h b/paddle/memory/detail/gpu_allocator.h
index 9452c41fb8..682afdf7d3 100644
--- a/paddle/memory/detail/gpu_allocator.h
+++ b/paddle/memory/detail/gpu_allocator.h
@@ -16,8 +16,8 @@ limitations under the License. */
 
 #include <stddef.h>  // for size_t
 
-#include <thrust/system_error.h>
 #include <thrust/system/cuda/error.h>
+#include <thrust/system_error.h>
 
 namespace paddle {
 namespace memory {
@@ -36,14 +36,14 @@ inline void throw_on_error(cudaError_t e, const char* message) {
 // default, we should use GPUAllocator<staging=false>.
 template <bool staging>
 class GPUAllocator {
-public:
+ public:
   void* Alloc(size_t size);
   void Free(void* p, size_t size);
 };
 
 template <>
 class GPUAllocator<false> {
-public:
+ public:
   void* Alloc(size_t size) {
     void* p = 0;
     cudaError_t result = cudaMalloc(&p, size);
@@ -60,22 +60,22 @@ public:
     // that is returned if you ever call cudaFree after the
     // driver has already shutdown. This happens only if the
     // process is terminating, in which case we don't care if
-    // cudaFree succeeds. 
+    // cudaFree succeeds.
     auto err = cudaFree(p);
     if (err != cudaErrorCudartUnloading) {
-        throw_on_error(err, "cudaFree failed");
+      throw_on_error(err, "cudaFree failed");
     }
   }
 };
 
 template <>
 class GPUAllocator<true> {
-public:
+ public:
   void* Alloc(size_t size) {
     void* p = 0;
     cudaError_t result = cudaMallocHost(&p, size);
     if (result == cudaSuccess) {
-        return p;
+      return p;
     }
     // clear last error
     cudaGetLastError();

From f149d183f7d78fdaa171f2afabaf8a138596c8ff Mon Sep 17 00:00:00 2001
From: Yi Wang <yiwang01@baidu.com>
Date: Mon, 26 Jun 2017 20:41:33 -0700
Subject: [PATCH 09/86] Add system_allocator

---
 paddle/memory/detail/CMakeLists.txt           |  6 +-
 paddle/memory/detail/system_allocator.h       | 84 ++++++++++++-------
 paddle/memory/detail/system_allocator_test.cc | 44 +++++-----
 3 files changed, 81 insertions(+), 53 deletions(-)

diff --git a/paddle/memory/detail/CMakeLists.txt b/paddle/memory/detail/CMakeLists.txt
index 3b5bbd7a12..c16dfadeb2 100644
--- a/paddle/memory/detail/CMakeLists.txt
+++ b/paddle/memory/detail/CMakeLists.txt
@@ -1 +1,5 @@
-cc_test(system_allocator_test SRCS system_allocator_test.cc)
+if(${WITH_GPU})
+  nv_test(system_allocator_test SRCS system_allocator_test.cc)
+else(${WITH_GPU})
+  cc_test(system_allocator_test SRCS system_allocator_test.cc)
+endif(${WITH_GPU})
diff --git a/paddle/memory/detail/system_allocator.h b/paddle/memory/detail/system_allocator.h
index 0a64553188..1768f9a0da 100644
--- a/paddle/memory/detail/system_allocator.h
+++ b/paddle/memory/detail/system_allocator.h
@@ -23,14 +23,31 @@ limitations under the License. */
 #include <thrust/system_error.h>
 #endif  // PADDLE_ONLY_CPU
 
+#include "paddle/platform/assert.h"
+
 namespace paddle {
 namespace memory {
 namespace detail {
 
-class SystemAllocator {
+class CPUDeleter {
  public:
-  virtual void* Alloc(size_t size) = 0;
-  virtual void* Free(void* p) = 0;
+  CPUDeleter(void* ptr, size_t size, bool locked)
+      : ptr_(ptr), size_(size), locked_(locked) {}
+
+  void* Ptr() { return ptr_; }
+
+  void operator()(void* ptr) {
+    PADDLE_ASSERT(ptr == ptr_);
+    if (ptr_ != nullptr && locked_) {
+      munlock(ptr_, size_);
+    }
+    std::free(ptr_);
+  }
+
+ private:
+  void* ptr_;
+  size_t size_;
+  bool locked_;
 };
 
 // CPUAllocator<lock_memory=true> calls mlock, which returns pinned
@@ -39,21 +56,14 @@ class SystemAllocator {
 // available to the system for paging.  So, by default, we should use
 // CPUAllocator<staging=false>.
 template <bool lock_memory>
-class CPUAllocator : public SystemAllocator {
+class CPUAllocator {
  public:
-  virtual void* Alloc(size_t size) {
-    void* p = std::malloc(size);
-    if (p != nullptr && lock_memory) {
-      mlock(p, size);
-    }
-    return p;
-  }
-
-  virtual void Free(void* p, size_t size) {
-    if (p != nullptr && lock_memory) {
-      munlock(p, size);
-    }
-    std::free(p);
+  // Alloc(0) yields nullptr rather than whatever std::malloc(0) returns; the
+  // deleter is told the block is locked only if mlock() succeeded (== 0).
+  static CPUDeleter Alloc(size_t size) {
+    void* p = size > 0 ? std::malloc(size) : nullptr;
+    const bool locked = p != nullptr && lock_memory && mlock(p, size) == 0;
+    return CPUDeleter(p, size, locked);
   }
 };
 
@@ -67,6 +77,32 @@ inline void throw_on_error(cudaError_t e, const char* message) {
 }
 }  // namespace
 
+class GPUDeleter {
+ public:
+  GPUDeleter(void* ptr, size_t size, bool staging)
+      : ptr_(ptr), size_(size), staging_(staging) {}
+
+  void* Ptr() { return ptr_; }
+
+  void operator()(void* ptr) {
+    PADDLE_ASSERT(ptr == ptr_);
+    // Purposefully allow cudaErrorCudartUnloading, because
+    // that is returned if you ever call cudaFree after the
+    // driver has already shutdown. This happens only if the
+    // process is terminating, in which case we don't care if
+    // cudaFree succeeds.
+    cudaError_t err = staging_ ? cudaFreeHost(ptr) : cudaFree(ptr);
+    if (err != cudaErrorCudartUnloading) {
+      throw_on_error(err, "cudaFree{Host} failed");
+    }
+  }
+
+ private:
+  void* ptr_;
+  size_t size_;
+  bool staging_;
+};
+
 // GPUAllocator<staging=true> calls cudaHostMalloc, which returns
 // pinned and locked memory as staging areas for data exchange
 // between host and device.  Allocates too much would reduce the
@@ -75,28 +111,14 @@ inline void throw_on_error(cudaError_t e, const char* message) {
 template <bool staging>
 class GPUAllocator {
  public:
-  void* Alloc(size_t size) {
+  static GPUDeleter Alloc(size_t size) {
     void* p = 0;
     cudaError_t result =
         staging ? cudaMallocHost(&p, size) : cudaMalloc(&p, size);
-    if (result == cudaSuccess) {
-      return p;
-    }
-    // clear last error
-    cudaGetLastError();
-    return nullptr;
-  }
-
-  void Free(void* p, size_t size) {
-    // Purposefully allow cudaErrorCudartUnloading, because
-    // that is returned if you ever call cudaFree after the
-    // driver has already shutdown. This happens only if the
-    // process is terminating, in which case we don't care if
-    // cudaFree succeeds.
-    auto err = staging ? cudaFreeHost(p) : cudaFree(p);
-    if (err != cudaErrorCudartUnloading) {
-      throw_on_error(err, "cudaFree failed");
+    if (result != cudaSuccess) {
+      cudaGetLastError();  // clear error if there is any.
     }
+    return GPUDeleter(result == cudaSuccess ? p : nullptr, size, staging);
   }
 };
 
diff --git a/paddle/memory/detail/system_allocator_test.cc b/paddle/memory/detail/system_allocator_test.cc
index 4e7b8018b6..fec70a65b7 100644
--- a/paddle/memory/detail/system_allocator_test.cc
+++ b/paddle/memory/detail/system_allocator_test.cc
@@ -13,36 +13,38 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/memory/detail/system_allocator.h"
+
+#include <memory>
+#include <vector>
+
 #include "gtest/gtest.h"
 
-TEST(CPUAllocator, NoLockMem) {
-  paddle::memory::detail::CPUAllocator<false> a;
-  void* p = a.Alloc(4096);
-  EXPECT_NE(p, nullptr);
-  a.Free(p, 4096);
+template <typename Allocator>
+void TestAllocator() {
+  {
+    auto d = Allocator::Alloc(sizeof(int));
+    EXPECT_NE(d.Ptr(), nullptr);
+    std::unique_ptr<int> p(static_cast<int*>(d.Ptr()), d);
+  }
+  {
+    auto d = Allocator::Alloc(0);
+    EXPECT_EQ(d.Ptr(), nullptr);
+    std::unique_ptr<int> p(static_cast<int*>(d.Ptr()), d);
+  }
 }
 
+TEST(CPUAllocator, NoLockMem) {
+  TestAllocator<paddle::memory::detail::CPUAllocator<false>>();
+}
 TEST(CPUAllocator, LockMem) {
-  paddle::memory::detail::CPUAllocator<true> a;
-  void* p = a.Alloc(4096);
-  EXPECT_NE(p, nullptr);
-  a.Free(p, 4096);
+  TestAllocator<paddle::memory::detail::CPUAllocator<true>>();
 }
 
 #ifndef PADDLE_ONLY_CPU
-
-TEST(GPUAllocator, NonStaging) {
-  paddle::memory::detail::GPUAllocator<false> a;
-  void* p = a.Alloc(4096);
-  EXPECT_NE(p, nullptr);
-  a.Free(p, 4096);
+TEST(GPUAllocator, NoStaging) {
+  TestAllocator<paddle::memory::detail::GPUAllocator<false>>();
 }
-
 TEST(GPUAllocator, Staging) {
-  paddle::memory::detail::GPUAllocator<true> a;
-  void* p = a.Alloc(4096);
-  EXPECT_NE(p, nullptr);
-  a.Free(p, 4096);
+  TestAllocator<paddle::memory::detail::GPUAllocator<true>>();
 }
-
 #endif  // PADDLE_ONLY_CPU

From 2f47562df8e9cbd81e96bba642646b0036e7bab6 Mon Sep 17 00:00:00 2001
From: qiaolongfei <qiaolongfei@baidu.com>
Date: Tue, 27 Jun 2017 23:39:59 +0800
Subject: [PATCH 10/86] scope-impl

---
 doc/design/scope.md             |  2 +-
 paddle/framework/CMakeLists.txt |  5 +++
 paddle/framework/scope.cc       | 54 +++++++++++++++++++++++++++++++++
 paddle/framework/scope.h        | 51 +++++++++++++++++++++++++++++++
 paddle/framework/scope_test.cc  | 47 ++++++++++++++++++++++++++++
 5 files changed, 158 insertions(+), 1 deletion(-)
 create mode 100644 paddle/framework/scope.cc
 create mode 100644 paddle/framework/scope.h
 create mode 100644 paddle/framework/scope_test.cc

diff --git a/doc/design/scope.md b/doc/design/scope.md
index 2ff416f06e..4d14a64977 100644
--- a/doc/design/scope.md
+++ b/doc/design/scope.md
@@ -41,7 +41,7 @@ class Scope {
   const Variable* GetVariable(const std::string& name) const;
 
  private:
-    std::unordered_map<std::string, std::unique_ptr<Vairable>> vars_;
+    std::unordered_map<std::string, std::unique_ptr<Variable>> vars_;
 };
 ```
 
diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt
index e3c3155aa9..7ea17f7114 100644
--- a/paddle/framework/CMakeLists.txt
+++ b/paddle/framework/CMakeLists.txt
@@ -1,6 +1,11 @@
+# ddim lib
 cc_library(ddim SRCS ddim.cc)
 cc_test(ddim_test SRCS ddim_test.cc DEPS ddim)
 
 nv_test(dim_test SRCS dim_test.cu DEPS ddim)
 
 cc_test(variable_test SRCS variable_test.cc)
+
+# scope lib
+cc_library(scope SRCS scope.cc)
+cc_test(scope_test SRCS scope_test.cc DEPS scope)
diff --git a/paddle/framework/scope.cc b/paddle/framework/scope.cc
new file mode 100644
index 0000000000..ed75aece01
--- /dev/null
+++ b/paddle/framework/scope.cc
@@ -0,0 +1,54 @@
+#include "paddle/framework/scope.h"
+
+namespace paddle {
+namespace framework {
+
+Error Scope::CreateVariable(const std::string &name) {
+  if (name == "") {
+    return Error("Variable name should not be empty");
+  }
+
+  if (HaveVariable(name)) {
+    return AlreadyCreated;
+  }
+  vars_[name] = std::unique_ptr<Variable>(new Variable());
+  return Error();
+}
+
+Variable* Scope::GetVarLocally(const std::string& name) const {
+  // Look in this scope only (no parent walk), using a single hash lookup.
+  auto it = vars_.find(name);
+  if (it != vars_.end()) return it->second.get();
+  return nullptr;
+}
+
+Variable* Scope::GetVariable(const std::string &name) const {
+  Variable* var = GetVarLocally(name);
+  if (var != nullptr) {
+    return var;
+  } else if (parent_ != nullptr) {
+    return parent_->GetVariable(name);
+  } else {
+    return nullptr;
+  }
+}
+
+Variable* Scope::GetOrCreateVariable(const std::string &name) {
+  Variable* var;
+  var = GetVariable(name);
+  if (var == nullptr) {
+    auto err = CreateVariable(name);
+    if (!err.isOK()) {
+      return nullptr;
+    }
+  }
+  return GetVariable(name);
+}
+
+bool Scope::HaveVariable(const std::string &name) {
+  return vars_.count(name) != 0;
+}
+
+}  // namespace framework
+}  // namespace paddle
+
diff --git a/paddle/framework/scope.h b/paddle/framework/scope.h
new file mode 100644
index 0000000000..ad1ed2ddab
--- /dev/null
+++ b/paddle/framework/scope.h
@@ -0,0 +1,51 @@
+#pragma once
+
+#include <vector>
+#include <unordered_map>
+#include "paddle/framework/variable.h"
+#include "paddle/utils/Error.h"
+
+namespace paddle {
+namespace framework {
+
+const static Error AlreadyCreated("Variable has already been created");
+
+/**
+ * Scope is an association of a name to Variable. All variables belong to `Scope`.
+ * You need to specify a scope to run a Net, i.e., `net.Run(&scope)`. One net can
+ * run in different scopes and update different variable in the scope.
+ */
+class Scope {
+ public:
+  Scope() {}
+
+  explicit Scope(const std::shared_ptr<Scope> &scope):
+          parent_(scope) {}
+
+  ~Scope() {}
+
+  // Create a Variable in this Scope. Returns an error if the name is empty
+  // or the Variable has already been created.
+  Error __must_check CreateVariable(const std::string& name);
+
+  // Get Variable from this Scope, this function will recursive find Variable
+  // from it's parent scope.
+  // Return nullptr if not found.
+  Variable* GetVariable(const std::string& name) const;
+
+  // find and return Variables in the scope it self.
+  Variable* GetVarLocally(const std::string& name) const;
+
+  // Get a Variable from the Scope; if it does not exist, create it first.
+  // Users should call this function most of the time.
+  Variable* GetOrCreateVariable(const std::string& name);
+
+  bool HaveVariable(const std::string& name);
+
+ private:
+  std::unordered_map<std::string, std::unique_ptr<Variable>> vars_;
+  std::shared_ptr<Scope> parent_ {nullptr};
+};
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/scope_test.cc b/paddle/framework/scope_test.cc
new file mode 100644
index 0000000000..09fbb78d69
--- /dev/null
+++ b/paddle/framework/scope_test.cc
@@ -0,0 +1,47 @@
+#include "paddle/framework/scope.h"
+#include "gtest/gtest.h"
+
+TEST(Scope, Create) {
+  using paddle::framework::Scope;
+  using paddle::Error;
+  using paddle::framework::Variable;
+  using paddle::framework::AlreadyCreated;
+
+  std::unique_ptr<Scope> scope(new Scope());  // owned: no leak at test exit
+
+  Error err = scope->CreateVariable("");
+  EXPECT_FALSE(err.isOK());
+
+  Variable* var1 = scope->GetVariable("a");
+  EXPECT_EQ(var1, nullptr);
+
+  Error err1 = scope->CreateVariable("a");
+  EXPECT_TRUE(err1.isOK());
+
+  Error err2 = scope->CreateVariable("a");
+  EXPECT_EQ(err2, AlreadyCreated);
+
+  Variable* var2 = scope->GetVariable("a");
+  EXPECT_NE(var2, nullptr);
+
+  Variable* var3 = scope->GetOrCreateVariable("b");
+  EXPECT_NE(var3, nullptr);
+}
+
+TEST(Scope, Parent) {
+  using paddle::framework::Scope;
+  using paddle::framework::Variable;
+  using paddle::Error;
+
+  const auto parent_scope_ptr = std::shared_ptr<Scope>(new Scope());
+  std::unique_ptr<Scope> scope(new Scope(parent_scope_ptr));  // no leak
+
+  Error err = parent_scope_ptr->CreateVariable("a");
+  EXPECT_TRUE(err.isOK());
+
+  Variable* var1 = scope->GetVarLocally("a");
+  EXPECT_EQ(var1, nullptr);
+
+  Variable* var2 = scope->GetVariable("a");
+  EXPECT_NE(var2, nullptr);
+}
\ No newline at end of file

From 5e8d8e073bf0e208927c20103d3fc383f9e90316 Mon Sep 17 00:00:00 2001
From: qiaolongfei <qiaolongfei@baidu.com>
Date: Tue, 27 Jun 2017 23:53:37 +0800
Subject: [PATCH 11/86] refine GetOrCreateVariable

---
 paddle/framework/scope.cc | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/paddle/framework/scope.cc b/paddle/framework/scope.cc
index ed75aece01..31956aecb9 100644
--- a/paddle/framework/scope.cc
+++ b/paddle/framework/scope.cc
@@ -34,15 +34,17 @@ Variable* Scope::GetVariable(const std::string &name) const {
 }
 
 Variable* Scope::GetOrCreateVariable(const std::string &name) {
-  Variable* var;
-  var = GetVariable(name);
-  if (var == nullptr) {
-    auto err = CreateVariable(name);
-    if (!err.isOK()) {
-      return nullptr;
-    }
+  Variable* var = GetVariable(name);
+  if (var != nullptr) {
+    return var;
+  }
+
+  Error err = CreateVariable(name);
+  if (!err.isOK()) {
+    return nullptr;
+  } else {
+    return GetVariable(name);
   }
-  return GetVariable(name);
 }
 
 bool Scope::HaveVariable(const std::string &name) {

From 7d138593d206f8bc16ddacb2a9aa072c08cb829d Mon Sep 17 00:00:00 2001
From: qiaolongfei <qiaolongfei@baidu.com>
Date: Wed, 28 Jun 2017 00:15:00 +0800
Subject: [PATCH 12/86] refine code of scope with style check

---
 paddle/framework/scope.cc |  9 ++++-----
 paddle/framework/scope.h  | 17 ++++++++---------
 2 files changed, 12 insertions(+), 14 deletions(-)

diff --git a/paddle/framework/scope.cc b/paddle/framework/scope.cc
index 31956aecb9..e985598849 100644
--- a/paddle/framework/scope.cc
+++ b/paddle/framework/scope.cc
@@ -3,7 +3,7 @@
 namespace paddle {
 namespace framework {
 
-Error Scope::CreateVariable(const std::string &name) {
+Error Scope::CreateVariable(const std::string& name) {
   if (name == "") {
     return Error("Variable name should not be empty");
   }
@@ -22,7 +22,7 @@ Variable* Scope::GetVarLocally(const std::string& name) const {
   return nullptr;
 }
 
-Variable* Scope::GetVariable(const std::string &name) const {
+Variable* Scope::GetVariable(const std::string& name) const {
   Variable* var = GetVarLocally(name);
   if (var != nullptr) {
     return var;
@@ -33,7 +33,7 @@ Variable* Scope::GetVariable(const std::string &name) const {
   }
 }
 
-Variable* Scope::GetOrCreateVariable(const std::string &name) {
+Variable* Scope::GetOrCreateVariable(const std::string& name) {
   Variable* var = GetVariable(name);
   if (var != nullptr) {
     return var;
@@ -47,10 +47,9 @@ Variable* Scope::GetOrCreateVariable(const std::string &name) {
   }
 }
 
-bool Scope::HaveVariable(const std::string &name) {
+bool Scope::HaveVariable(const std::string& name) {
   return vars_.count(name) != 0;
 }
 
 }  // namespace framework
 }  // namespace paddle
-
diff --git a/paddle/framework/scope.h b/paddle/framework/scope.h
index ad1ed2ddab..90c8141e4f 100644
--- a/paddle/framework/scope.h
+++ b/paddle/framework/scope.h
@@ -1,7 +1,7 @@
 #pragma once
 
-#include <vector>
 #include <unordered_map>
+#include <vector>
 #include "paddle/framework/variable.h"
 #include "paddle/utils/Error.h"
 
@@ -11,16 +11,16 @@ namespace framework {
 const static Error AlreadyCreated("Variable has already been created");
 
 /**
- * Scope is an association of a name to Variable. All variables belong to `Scope`.
- * You need to specify a scope to run a Net, i.e., `net.Run(&scope)`. One net can
- * run in different scopes and update different variable in the scope.
+ * Scope is an association of a name to Variable. All variables belong to
+ * `Scope`. You need to specify a scope to run a Net, i.e., `net.Run(&scope)`.
+ * One net can run in different scopes and update different variable in the
+ * scope.
  */
 class Scope {
  public:
   Scope() {}
 
-  explicit Scope(const std::shared_ptr<Scope> &scope):
-          parent_(scope) {}
+  explicit Scope(const std::shared_ptr<Scope>& scope) : parent_(scope) {}
 
   ~Scope() {}
 
@@ -29,8 +29,7 @@ class Scope {
   Error __must_check CreateVariable(const std::string& name);
 
   // Get Variable from this Scope, this function will recursive find Variable
-  // from it's parent scope.
-  // Return nullptr if not found.
+  // from it's parent scope. Return nullptr if not found.
   Variable* GetVariable(const std::string& name) const;
 
   // find and return Variables in the scope it self.
@@ -44,7 +43,7 @@ class Scope {
 
  private:
   std::unordered_map<std::string, std::unique_ptr<Variable>> vars_;
-  std::shared_ptr<Scope> parent_ {nullptr};
+  std::shared_ptr<Scope> parent_{nullptr};
 };
 
 }  // namespace framework

From dd08d337c0138c9def5f7ce95f88bae5599e5f92 Mon Sep 17 00:00:00 2001
From: liaogang <liaogang@baidu.com>
Date: Wed, 28 Jun 2017 01:30:57 +0800
Subject: [PATCH 13/86] FIX: fix cmake type error

---
 CMakeLists.txt               |  2 ++
 paddle/CMakeLists.txt        | 10 +---------
 paddle/memory/CMakeLists.txt |  2 +-
 3 files changed, 4 insertions(+), 10 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3c719d35ec..b779caefb9 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -27,6 +27,7 @@ if(NOT CMAKE_CROSSCOMPILING)
 endif(NOT CMAKE_CROSSCOMPILING)
 find_package(Git REQUIRED)
 find_package(Threads REQUIRED)
+find_package(Boost QUIET)
 
 include(simd)
 
@@ -109,6 +110,7 @@ include_directories("${PROJ_ROOT}")
 include_directories("${PROJ_ROOT}/paddle/cuda/include")
 include_directories("${CMAKE_CURRENT_BINARY_DIR}/proto")
 include_directories("${CMAKE_CURRENT_BINARY_DIR}/go/pserver/cclient")
+include_directories(${Boost_INCLUDE_DIRS})
 
 set(EXTERNAL_LIBS
     ${GFLAGS_LIBRARIES}
diff --git a/paddle/CMakeLists.txt b/paddle/CMakeLists.txt
index 0cddb95244..979b68e827 100644
--- a/paddle/CMakeLists.txt
+++ b/paddle/CMakeLists.txt
@@ -10,17 +10,9 @@ add_subdirectory(trainer)
 add_subdirectory(scripts)
 add_subdirectory(optimizer)
 add_subdirectory(strings)
-add_subdirectory(memory)
-
-# Do not build go directory until go cmake is working smoothly.
-# if(CMAKE_Go_COMPILER)
-#   add_subdirectory(go)
-# endif()
-
-find_package(Boost QUIET)
 
 if(Boost_FOUND)
-  include_directories(${Boost_INCLUDE_DIRS})
+  add_subdirectory(memory)
   add_subdirectory(platform)
   add_subdirectory(framework)
 endif()
diff --git a/paddle/memory/CMakeLists.txt b/paddle/memory/CMakeLists.txt
index 8662512496..e74ce75c93 100644
--- a/paddle/memory/CMakeLists.txt
+++ b/paddle/memory/CMakeLists.txt
@@ -3,5 +3,5 @@ add_subdirectory(detail)
 if(${WITH_GPU})
   nv_library(memory SRCS memory.cc)
 else(${WITH_GPU})
-  cc_library(memory SRCS memroy.cc)
+  cc_library(memory SRCS memory.cc)
 endif(${WITH_GPU})

From dde0da9e0ffee7a49510061a139ab2abc7ab55b9 Mon Sep 17 00:00:00 2001
From: liaogang <liaogang@baidu.com>
Date: Wed, 28 Jun 2017 01:31:24 +0800
Subject: [PATCH 14/86] ENH: Add cuda.h in platform

---
 paddle/platform/cuda.h | 41 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 41 insertions(+)
 create mode 100644 paddle/platform/cuda.h

diff --git a/paddle/platform/cuda.h b/paddle/platform/cuda.h
new file mode 100644
index 0000000000..864a5d3340
--- /dev/null
+++ b/paddle/platform/cuda.h
@@ -0,0 +1,41 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#ifndef PADDLE_ONLY_CPU
+
+#include <thrust/system/cuda/error.h>
+#include <thrust/system_error.h>
+
+namespace paddle {
+namespace platform {
+
+inline void throw_on_error(cudaError_t e, const char* message) {
+  if (e) {
+    throw thrust::system_error(e, thrust::cuda_category(), message);
+  }
+}
+
+inline int GetDeviceCount(void) {
+  // Must be inline: a header-defined function would otherwise break the ODR.
+  int count = 0;
+  throw_on_error(cudaGetDeviceCount(&count), "cudaGetDeviceCount failed");
+  return count;
+}
+
+}  // namespace platform
+}  // namespace paddle
+
+#endif  // PADDLE_ONLY_CPU 

From 29c7512b3ce13ca7b89d3ff3f4aea2c7d7f27478 Mon Sep 17 00:00:00 2001
From: liaogang <liaogang@baidu.com>
Date: Wed, 28 Jun 2017 01:31:46 +0800
Subject: [PATCH 15/86] FIX: fix memory.h/cc

---
 paddle/memory/memory.cc | 23 ++++++++++++++---------
 paddle/memory/memory.h  |  8 ++++----
 2 files changed, 18 insertions(+), 13 deletions(-)

diff --git a/paddle/memory/memory.cc b/paddle/memory/memory.cc
index ca3c01ebdb..0d123d99e2 100644
--- a/paddle/memory/memory.cc
+++ b/paddle/memory/memory.cc
@@ -13,41 +13,46 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/memory/memory.h"
+#include "paddle/memory/detail/buddy_allocator.h"
+#include "paddle/memory/detail/system_allocator.h"
+#include "paddle/platform/assert.h"
 
-#include "paddle/memory/detail/cpu_allocator.h"
-#include "paddle/memory/detail/gpu_allocator.h"
+#include <boost/variant.hpp>
 
 namespace paddle {
 namespace memory {
 
-void Alloc(paddle::platform::Place pl, size_t size) {
+void* Alloc(platform::Place pl, size_t size) {
 #ifndef PADDLE_ONLY_CPU
   if (paddle::platform::is_gpu_place(pl)) {
-    return GetGPUBuddyAllocator(pl.device)->Alloc(size);
+    size_t gpu_id = boost::get<platform::GPUPlace>(pl).device;
+    return detail::GetGPUBuddyAllocator(gpu_id)->Alloc(size);
   }
 #endif  // PADDLE_ONLY_CPU
   PADDLE_ASSERT(paddle::platform::is_cpu_place(pl));
-  return GetCPUBuddyAllocator()->Alloc(size);
+  return detail::GetCPUBuddyAllocator()->Alloc(size);
 }
 
 void Free(paddle::platform::Place pl, void* p) {
 #ifndef PADDLE_ONLY_CPU
   if (paddle::platform::is_gpu_place(pl)) {
-    GetGPUBuddyAllocator(pl.device)->Free(p);
+    size_t gpu_id = boost::get<platform::GPUPlace>(pl).device;
+    return detail::GetGPUBuddyAllocator(gpu_id)->Free(p);  // skip CPU path
   }
 #endif  // PADDLE_ONLY_CPU
   PADDLE_ASSERT(paddle::platform::is_cpu_place(pl));
-  GetCPUBuddyAllocator()->Free(p);
+  detail::GetCPUBuddyAllocator()->Free(p);
 }
 
 size_t Used(paddle::platform::Place pl) {
 #ifndef PADDLE_ONLY_CPU
   if (paddle::platform::is_gpu_place(pl)) {
-    return GetGPUBuddyAllocator(pl.device)->Used();
+    size_t gpu_id = boost::get<platform::GPUPlace>(pl).device;
+    return detail::GetGPUBuddyAllocator(gpu_id)->Used();
   }
 #endif  // PADDLE_ONLY_CPU
   PADDLE_ASSERT(paddle::platform::is_cpu_place(pl));
-  return GetCPUBuddyAllocator()->Used();
+  return detail::GetCPUBuddyAllocator()->Used();
 }
 
 }  // namespace memory
diff --git a/paddle/memory/memory.h b/paddle/memory/memory.h
index 0bc609205e..a33092bade 100644
--- a/paddle/memory/memory.h
+++ b/paddle/memory/memory.h
@@ -14,14 +14,14 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/frameowork/place.h"
+#include "paddle/platform/place.h"
 
 namespace paddle {
 namespace memory {
 
-void* Alloc(paddle::framework::Place, size_t);
-void Free(paddle::framework::Place, void*);
-size_t Used(paddle::framework::Place);
+void* Alloc(paddle::platform::Place, size_t);
+void Free(paddle::platform::Place, void*);
+size_t Used(paddle::platform::Place);
 
 }  // namespace memory
 }  // namespace paddle

From b22dd12854150c31b9cb9e3e550bdee4b5df5977 Mon Sep 17 00:00:00 2001
From: liaogang <liaogang@baidu.com>
Date: Wed, 28 Jun 2017 01:32:06 +0800
Subject: [PATCH 16/86] ENH: Add buddy allocator draft

---
 paddle/memory/detail/CMakeLists.txt    |  4 +-
 paddle/memory/detail/buddy_allocator.h | 79 ++++++++++++++++++++++++++
 2 files changed, 81 insertions(+), 2 deletions(-)
 create mode 100644 paddle/memory/detail/buddy_allocator.h

diff --git a/paddle/memory/detail/CMakeLists.txt b/paddle/memory/detail/CMakeLists.txt
index c16dfadeb2..cd5622203f 100644
--- a/paddle/memory/detail/CMakeLists.txt
+++ b/paddle/memory/detail/CMakeLists.txt
@@ -1,5 +1,5 @@
 if(${WITH_GPU})
-  nv_test(system_allocator_test SRCS system_allocator_test.cc)
+  nv_test(system_allocator_test SRCS system_allocator_test.cc DEPS gflags glog)
 else(${WITH_GPU})
-  cc_test(system_allocator_test SRCS system_allocator_test.cc)
+  cc_test(system_allocator_test SRCS system_allocator_test.cc DEPS gflags glog)
 endif(${WITH_GPU})
diff --git a/paddle/memory/detail/buddy_allocator.h b/paddle/memory/detail/buddy_allocator.h
new file mode 100644
index 0000000000..35e96fd507
--- /dev/null
+++ b/paddle/memory/detail/buddy_allocator.h
@@ -0,0 +1,79 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/memory/detail/system_allocator.h"
+
+namespace paddle {
+namespace memory {
+namespace detail {
+
+template<typename Allocator>
+class BuddyAllocator {
+  public:
+    // TODO(gangliao): This is a draft, add Buddy Allocator Algorithm soon
+    BuddyAllocator() {}
+    ~BuddyAllocator() {}
+
+  public:
+    void* Alloc(size_t size) {
+        return Allocator::Alloc(size); 
+    }
+    void Free(void*) {
+      // All bookkeeping info, such as the block size, is stored in
+      // metadata, so adding a `size` parameter to the `Free(void*)`
+      // interface would be redundant.
+    }
+    size_t Used();
+
+  public:
+    BuddyAllocator(const BuddyAllocator&) = delete;
+    BuddyAllocator& operator=(const BuddyAllocator&) = delete;
+
+  private:
+    size_t min_alloc_size_;
+    size_t max_alloc_size_;
+
+  private:
+    std::mutex mutex_;
+};
+
+inline BuddyAllocator<CPUAllocator>* GetCPUBuddyAllocator() {
+  // A function-local static is initialized exactly once, thread-safely
+  // (C++11), replacing the racy null check.  Never deleted, as before.
+  static BuddyAllocator<CPUAllocator>* const a =
+      new BuddyAllocator<CPUAllocator>();
+  return a;
+}
+
+#ifndef PADDLE_ONLY_CPU  // The following code are for CUDA.
+
+inline BuddyAllocator<GPUAllocator>* GetGPUBuddyAllocator(int gpu_id) {
+  using Buddy = BuddyAllocator<GPUAllocator>;
+  // One allocator per device, built once; static init is thread-safe (C++11).
+  static Buddy** const as = [] {
+    const int gpu_num = platform::GetDeviceCount();
+    Buddy** table = new Buddy*[gpu_num];
+    for (int gpu = 0; gpu < gpu_num; gpu++) table[gpu] = new Buddy();
+    return table;
+  }();
+  return as[gpu_id];
+}
+
+#endif  // PADDLE_ONLY_CPU 
+
+}  // namespace detail
+}  // namespace memory
+}  // namespace paddle

From 79373dabc8d2e4edc87fbef40efdfa1f54b35a9f Mon Sep 17 00:00:00 2001
From: liaogang <liaogang@baidu.com>
Date: Wed, 28 Jun 2017 01:33:06 +0800
Subject: [PATCH 17/86] TEST: Add test for system allocator and deleter

---
 paddle/memory/detail/system_allocator.h       | 108 ++++++------------
 paddle/memory/detail/system_allocator_test.cc |  40 ++++---
 2 files changed, 60 insertions(+), 88 deletions(-)

diff --git a/paddle/memory/detail/system_allocator.h b/paddle/memory/detail/system_allocator.h
index 1768f9a0da..f411019854 100644
--- a/paddle/memory/detail/system_allocator.h
+++ b/paddle/memory/detail/system_allocator.h
@@ -18,107 +18,69 @@ limitations under the License. */
 #include <sys/mman.h>  // for mlock and munlock
 #include <cstdlib>     // for malloc and free
 
-#ifndef PADDLE_ONLY_CPU
-#include <thrust/system/cuda/error.h>
-#include <thrust/system_error.h>
-#endif  // PADDLE_ONLY_CPU
-
+#include <gflags/gflags.h>
 #include "paddle/platform/assert.h"
+#include "paddle/platform/cuda.h"
+
+DEFINE_bool(uses_pinned_memory, false,
+            "If set, allocate cpu/gpu pinned memory.");
 
 namespace paddle {
 namespace memory {
 namespace detail {
 
-class CPUDeleter {
- public:
-  CPUDeleter(void* ptr, size_t size, bool locked)
-      : ptr_(ptr), size_(size), locked_(locked) {}
-
-  void* Ptr() { return ptr_; }
-
-  void operator()(void* ptr) {
-    PADDLE_ASSERT(ptr == ptr_);
-    if (ptr_ != nullptr && locked_) {
-      munlock(ptr_, size_);
-    }
-    std::free(ptr_);
-  }
-
- private:
-  void* ptr_;
-  size_t size_;
-  bool locked_;
-};
-
-// CPUAllocator<lock_memory=true> calls mlock, which returns pinned
-// and locked memory as staging areas for data exchange between host
-// and device.  Allocates too much would reduce the amount of memory
-// available to the system for paging.  So, by default, we should use
-// CPUAllocator<staging=false>.
-template <bool lock_memory>
+// If uses_pinned_memory is true, CPUAllocator calls mlock, which
+// returns pinned and locked memory as staging areas for data exchange
+// between host and device.  Allocates too much would reduce the amount
+// of memory available to the system for paging.  So, by default, we
+// should set false to uses_pinned_memory.
 class CPUAllocator {
  public:
-  static CPUDeleter Alloc(size_t size) {
+  static void* Alloc(size_t size) {
     void* p = std::malloc(size);
-    if (p != nullptr && lock_memory) {
+    if (p != nullptr && FLAGS_uses_pinned_memory) {
       mlock(p, size);
     }
-    return CPUDeleter(p, size, lock_memory);
+    return p;
   }
-};
-
-#ifndef PADDLE_ONLY_CPU  // The following code are for CUDA.
-
-namespace {
-inline void throw_on_error(cudaError_t e, const char* message) {
-  if (e) {
-    throw thrust::system_error(e, thrust::cuda_category(), message);
-  }
-}
-}  // namespace
-
-class GPUDeleter {
- public:
-  GPUDeleter(void* ptr, size_t size, bool staging)
-      : ptr_(ptr), size_(size), staging_(staging) {}
-
-  void* Ptr() { return ptr_; }
 
-  void operator()(void* ptr) {
-    PADDLE_ASSERT(ptr == ptr_);
-    // Purposefully allow cudaErrorCudartUnloading, because
-    // that is returned if you ever call cudaFree after the
-    // driver has already shutdown. This happens only if the
-    // process is terminating, in which case we don't care if
-    // cudaFree succeeds.
-    cudaError_t err = staging_ ? cudaFreeHost(ptr) : cudaFree(ptr);
-    if (err != cudaErrorCudartUnloading) {
-      throw_on_error(err, "cudaFree{Host} failed");
+  static void Free(void* p, size_t size) {
+    if (p != nullptr && FLAGS_uses_pinned_memory) {
+      munlock(p, size);
     }
+    std::free(p);
   }
-
- private:
-  void* ptr_;
-  size_t size_;
-  bool staging_;
 };
 
+#ifndef PADDLE_ONLY_CPU  // The following code are for CUDA.
+
 // GPUAllocator<staging=true> calls cudaHostMalloc, which returns
 // pinned and locked memory as staging areas for data exchange
 // between host and device.  Allocates too much would reduce the
 // amount of memory available to the system for paging.  So, by
 // default, we should use GPUAllocator<staging=false>.
-template <bool staging>
 class GPUAllocator {
  public:
-  static GPUDeleter Alloc(size_t size) {
+  static void* Alloc(size_t size) {
     void* p = 0;
-    cudaError_t result =
-        staging ? cudaMallocHost(&p, size) : cudaMalloc(&p, size);
+    cudaError_t result = FLAGS_uses_pinned_memory ? cudaMallocHost(&p, size)
+                                                  : cudaMalloc(&p, size);
     if (result != cudaSuccess) {
       cudaGetLastError();  // clear error if there is any.
     }
-    return GPUDeleter(result == cudaSuccess ? p : nullptr, size, staging);
+    return result == cudaSuccess ? p : nullptr;
+  }
+
+  static void Free(void* p, size_t size) {
+    // Purposefully allow cudaErrorCudartUnloading, because
+    // that is returned if you ever call cudaFree after the
+    // driver has already shutdown. This happens only if the
+    // process is terminating, in which case we don't care if
+    // cudaFree succeeds.
+    cudaError_t err = FLAGS_uses_pinned_memory ? cudaFreeHost(p) : cudaFree(p);
+    if (err != cudaErrorCudartUnloading) {
+      platform::throw_on_error(err, "cudaFree{Host} failed");
+    }
   }
 };
 
diff --git a/paddle/memory/detail/system_allocator_test.cc b/paddle/memory/detail/system_allocator_test.cc
index fec70a65b7..829d3558ba 100644
--- a/paddle/memory/detail/system_allocator_test.cc
+++ b/paddle/memory/detail/system_allocator_test.cc
@@ -17,34 +17,44 @@ limitations under the License. */
 #include <memory>
 #include <vector>
 
+#include "glog/logging.h"
 #include "gtest/gtest.h"
 
 template <typename Allocator>
-void TestAllocator() {
-  {
-    auto d = Allocator::Alloc(sizeof(int));
-    EXPECT_NE(d.Ptr(), nullptr);
-    std::unique_ptr<int> p(static_cast<int*>(d.Ptr()), d);
-  }
-  {
-    auto d = Allocator::Alloc(0);
-    EXPECT_EQ(d.Ptr(), nullptr);
-    std::unique_ptr<int> p(static_cast<int*>(d.Ptr()), d);
-  }
+void TestAllocator(void* p) {
+  p = Allocator::Alloc(1024);
+
+  int* i = static_cast<int*>(p);
+  std::shared_ptr<int> ptr(i, [](int* p) { Allocator::Free(p, 1024); });
+
+  EXPECT_NE(p, nullptr);
 }
 
 TEST(CPUAllocator, NoLockMem) {
-  TestAllocator<paddle::memory::detail::CPUAllocator<false>>();
+  void* p = nullptr;
+  FLAGS_uses_pinned_memory = false;
+  TestAllocator<paddle::memory::detail::CPUAllocator>(p);
+  EXPECT_EQ(p, nullptr);
 }
+
 TEST(CPUAllocator, LockMem) {
-  TestAllocator<paddle::memory::detail::CPUAllocator<true>>();
+  void* p = nullptr;
+  FLAGS_uses_pinned_memory = true;
+  TestAllocator<paddle::memory::detail::CPUAllocator>(p);
+  EXPECT_EQ(p, nullptr);
 }
 
 #ifndef PADDLE_ONLY_CPU
 TEST(GPUAllocator, NoStaging) {
-  TestAllocator<paddle::memory::detail::GPUAllocator<false>>();
+  void* p = nullptr;
+  FLAGS_uses_pinned_memory = false;
+  TestAllocator<paddle::memory::detail::GPUAllocator>(p);
+  EXPECT_EQ(p, nullptr);
 }
 TEST(GPUAllocator, Staging) {
-  TestAllocator<paddle::memory::detail::GPUAllocator<true>>();
+  void* p = nullptr;
+  FLAGS_uses_pinned_memory = true;
+  TestAllocator<paddle::memory::detail::GPUAllocator>(p);
+  EXPECT_EQ(p, nullptr);
 }
 #endif  // PADDLE_ONLY_CPU

From b8f5922d88e5f7949eb9a469f761ad49981d677a Mon Sep 17 00:00:00 2001
From: Yi Wang <yiwang01@baidu.com>
Date: Tue, 27 Jun 2017 16:32:24 -0700
Subject: [PATCH 18/86] Make CPUAllocator and GPUAllocator subclasses of
 SystemAllocator

---
 paddle/memory/detail/CMakeLists.txt           |  6 +-
 paddle/memory/detail/system_allocator.h       | 80 +++++--------------
 paddle/memory/detail/system_allocator_test.cc | 57 +++++++------
 3 files changed, 59 insertions(+), 84 deletions(-)

diff --git a/paddle/memory/detail/CMakeLists.txt b/paddle/memory/detail/CMakeLists.txt
index cd5622203f..72d3749ad7 100644
--- a/paddle/memory/detail/CMakeLists.txt
+++ b/paddle/memory/detail/CMakeLists.txt
@@ -1,5 +1,7 @@
 if(${WITH_GPU})
-  nv_test(system_allocator_test SRCS system_allocator_test.cc DEPS gflags glog)
+  nv_library(system_allocator SRCS system_allocator.cc DEPS gflags)
+  nv_test(system_allocator_test SRCS system_allocator_test.cc DEPS system_allocator gflags)
 else(${WITH_GPU})
-  cc_test(system_allocator_test SRCS system_allocator_test.cc DEPS gflags glog)
+  cc_library(system_allocator SRCS system_allocator.cc DEPS gflags)
+  cc_test(system_allocator_test SRCS system_allocator_test.cc DEPS system_allocator gflags)
 endif(${WITH_GPU})
diff --git a/paddle/memory/detail/system_allocator.h b/paddle/memory/detail/system_allocator.h
index f411019854..184b383f7f 100644
--- a/paddle/memory/detail/system_allocator.h
+++ b/paddle/memory/detail/system_allocator.h
@@ -14,76 +14,38 @@ limitations under the License. */
 
 #pragma once
 
-#include <stddef.h>    // for size_t
-#include <sys/mman.h>  // for mlock and munlock
-#include <cstdlib>     // for malloc and free
-
-#include <gflags/gflags.h>
-#include "paddle/platform/assert.h"
-#include "paddle/platform/cuda.h"
-
-DEFINE_bool(uses_pinned_memory, false,
-            "If set, allocate cpu/gpu pinned memory.");
+#include <stddef.h>  // for size_t
 
 namespace paddle {
 namespace memory {
 namespace detail {
 
-// If uses_pinned_memory is true, CPUAllocator calls mlock, which
-// returns pinned and locked memory as staging areas for data exchange
-// between host and device.  Allocates too much would reduce the amount
-// of memory available to the system for paging.  So, by default, we
-// should set false to uses_pinned_memory.
-class CPUAllocator {
+// SystemAllocator is the parent class of CPUAllocator and
+// GPUAllocator.  A BuddyAllocator object uses a SystemAllocator*
+// pointing to the underlying system allocator.  An alternative to
+// this class hierarchy is to pass a system allocator class to
+// BuddyAllocator as a template parameter.  That alternative would make
+// BuddyAllocator a class template, and its very complicated
+// algorithm would make buddy_allocator.h messy.
+class SystemAllocator {
  public:
-  static void* Alloc(size_t size) {
-    void* p = std::malloc(size);
-    if (p != nullptr && FLAGS_uses_pinned_memory) {
-      mlock(p, size);
-    }
-    return p;
-  }
-
-  static void Free(void* p, size_t size) {
-    if (p != nullptr && FLAGS_uses_pinned_memory) {
-      munlock(p, size);
-    }
-    std::free(p);
-  }
+  virtual ~SystemAllocator() {}
+  virtual void* Alloc(size_t size) = 0;
+  virtual void Free(void* p, size_t size) = 0;
 };
 
-#ifndef PADDLE_ONLY_CPU  // The following code are for CUDA.
-
-// GPUAllocator<staging=true> calls cudaHostMalloc, which returns
-// pinned and locked memory as staging areas for data exchange
-// between host and device.  Allocates too much would reduce the
-// amount of memory available to the system for paging.  So, by
-// default, we should use GPUAllocator<staging=false>.
-class GPUAllocator {
+class CPUAllocator : public SystemAllocator {
  public:
-  static void* Alloc(size_t size) {
-    void* p = 0;
-    cudaError_t result = FLAGS_uses_pinned_memory ? cudaMallocHost(&p, size)
-                                                  : cudaMalloc(&p, size);
-    if (result != cudaSuccess) {
-      cudaGetLastError();  // clear error if there is any.
-    }
-    return result == cudaSuccess ? p : nullptr;
-  }
-
-  static void Free(void* p, size_t size) {
-    // Purposefully allow cudaErrorCudartUnloading, because
-    // that is returned if you ever call cudaFree after the
-    // driver has already shutdown. This happens only if the
-    // process is terminating, in which case we don't care if
-    // cudaFree succeeds.
-    cudaError_t err = FLAGS_uses_pinned_memory ? cudaFreeHost(p) : cudaFree(p);
-    if (err != cudaErrorCudartUnloading) {
-      platform::throw_on_error(err, "cudaFree{Host} failed");
-    }
-  }
+  virtual void* Alloc(size_t size);
+  virtual void Free(void* p, size_t size);
 };
 
+#ifndef PADDLE_ONLY_CPU
+class GPUAllocator : public SystemAllocator {
+ public:
+  virtual void* Alloc(size_t size);
+  virtual void Free(void* p, size_t size);
+};
 #endif  // PADDLE_ONLY_CPU
 
 }  // namespace detail
diff --git a/paddle/memory/detail/system_allocator_test.cc b/paddle/memory/detail/system_allocator_test.cc
index 829d3558ba..c461d8ac62 100644
--- a/paddle/memory/detail/system_allocator_test.cc
+++ b/paddle/memory/detail/system_allocator_test.cc
@@ -17,44 +17,55 @@ limitations under the License. */
 #include <memory>
 #include <vector>
 
-#include "glog/logging.h"
+#include "gflags/gflags.h"
 #include "gtest/gtest.h"
 
-template <typename Allocator>
-void TestAllocator(void* p) {
-  p = Allocator::Alloc(1024);
+DECLARE_bool(use_pinned_memory);
 
-  int* i = static_cast<int*>(p);
-  std::shared_ptr<int> ptr(i, [](int* p) { Allocator::Free(p, 1024); });
+void TestAllocator(paddle::memory::detail::SystemAllocator* a, size_t size) {
+  bool freed = false;
+  {
+    void* p = a->Alloc(size);
+    if (size > 0) {
+      EXPECT_NE(p, nullptr);
+    } else {
+      EXPECT_EQ(p, nullptr);
+    }
 
-  EXPECT_NE(p, nullptr);
+    int* i = static_cast<int*>(p);
+    std::shared_ptr<int> ptr(i, [&freed, a, size](void* p) {
+      freed = true;
+      a->Free(p, size);
+    });
+  }
+  EXPECT_TRUE(freed);
 }
 
 TEST(CPUAllocator, NoLockMem) {
-  void* p = nullptr;
-  FLAGS_uses_pinned_memory = false;
-  TestAllocator<paddle::memory::detail::CPUAllocator>(p);
-  EXPECT_EQ(p, nullptr);
+  FLAGS_use_pinned_memory = false;
+  paddle::memory::detail::CPUAllocator a;
+  TestAllocator(&a, 2048);
+  TestAllocator(&a, 0);
 }
 
 TEST(CPUAllocator, LockMem) {
-  void* p = nullptr;
-  FLAGS_uses_pinned_memory = true;
-  TestAllocator<paddle::memory::detail::CPUAllocator>(p);
-  EXPECT_EQ(p, nullptr);
+  FLAGS_use_pinned_memory = true;
+  paddle::memory::detail::CPUAllocator a;
+  TestAllocator(&a, 2048);
+  TestAllocator(&a, 0);
 }
 
 #ifndef PADDLE_ONLY_CPU
 TEST(GPUAllocator, NoStaging) {
-  void* p = nullptr;
-  FLAGS_uses_pinned_memory = false;
-  TestAllocator<paddle::memory::detail::GPUAllocator>(p);
-  EXPECT_EQ(p, nullptr);
+  FLAGS_use_pinned_memory = false;
+  paddle::memory::detail::GPUAllocator a;
+  TestAllocator(&a, 2048);
+  TestAllocator(&a, 0);
 }
 TEST(GPUAllocator, Staging) {
-  void* p = nullptr;
-  FLAGS_uses_pinned_memory = true;
-  TestAllocator<paddle::memory::detail::GPUAllocator>(p);
-  EXPECT_EQ(p, nullptr);
+  FLAGS_use_pinned_memory = true;
+  paddle::memory::detail::GPUAllocator a;
+  TestAllocator(&a, 2048);
+  TestAllocator(&a, 0);
 }
 #endif  // PADDLE_ONLY_CPU

From 2f52cb7909c5e8f372015454e3af33166713bfa7 Mon Sep 17 00:00:00 2001
From: Helin Wang <ustc.harry@gmail.com>
Date: Tue, 27 Jun 2017 23:46:54 +0000
Subject: [PATCH 19/86] fix pserver test

---
 go/pserver/client_test.go  | 2 +-
 go/pserver/service_test.go | 8 ++++----
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/go/pserver/client_test.go b/go/pserver/client_test.go
index 6ecf1fa08a..4a62ae88a4 100644
--- a/go/pserver/client_test.go
+++ b/go/pserver/client_test.go
@@ -31,7 +31,7 @@ func init() {
 		port[i] = p
 
 		go func(l net.Listener) {
-			s, err := pserver.NewService("", time.Second*5)
+			s, err := pserver.NewService("", 1, time.Second*5)
 			if err != nil {
 				panic(err)
 			}
diff --git a/go/pserver/service_test.go b/go/pserver/service_test.go
index f317535592..1d84f15d78 100644
--- a/go/pserver/service_test.go
+++ b/go/pserver/service_test.go
@@ -10,7 +10,7 @@ import (
 )
 
 func TestFull(t *testing.T) {
-	s, err := pserver.NewService("", time.Second*5)
+	s, err := pserver.NewService("", 1, time.Second*5)
 	if err != nil {
 		t.Error(err)
 	}
@@ -75,7 +75,7 @@ func TestFull(t *testing.T) {
 }
 
 func TestMultipleInit(t *testing.T) {
-	s, err := pserver.NewService("", time.Second*5)
+	s, err := pserver.NewService("", 1, time.Second*5)
 	if err != nil {
 		t.Error(err)
 	}
@@ -91,7 +91,7 @@ func TestMultipleInit(t *testing.T) {
 }
 
 func TestUninitialized(t *testing.T) {
-	s, err := pserver.NewService("", time.Second*5)
+	s, err := pserver.NewService("", 1, time.Second*5)
 	err = s.SendGrad(pserver.Gradient{}, nil)
 	if err.Error() != pserver.Uninitialized {
 		t.FailNow()
@@ -99,7 +99,7 @@ func TestUninitialized(t *testing.T) {
 }
 
 func TestBlockUntilInitialized(t *testing.T) {
-	s, err := pserver.NewService("", time.Second*5)
+	s, err := pserver.NewService("", 1, time.Second*5)
 	if err != nil {
 		t.Error(err)
 	}

From 3e087f763e9c6c15a4f1d542fb3bdc327f7441c7 Mon Sep 17 00:00:00 2001
From: Yi Wang <yiwang01@baidu.com>
Date: Tue, 27 Jun 2017 16:48:25 -0700
Subject: [PATCH 20/86] Add buddy_allocator.cc and system_allocator.cc

---
 paddle/memory/detail/buddy_allocator.cc       | 35 ++++++++
 paddle/memory/detail/buddy_allocator.h        | 76 ++++++++--------
 paddle/memory/detail/system_allocator.cc      | 90 +++++++++++++++++++
 paddle/memory/detail/system_allocator_test.cc | 24 ++---
 4 files changed, 177 insertions(+), 48 deletions(-)
 create mode 100644 paddle/memory/detail/buddy_allocator.cc
 create mode 100644 paddle/memory/detail/system_allocator.cc

diff --git a/paddle/memory/detail/buddy_allocator.cc b/paddle/memory/detail/buddy_allocator.cc
new file mode 100644
index 0000000000..895bf319d7
--- /dev/null
+++ b/paddle/memory/detail/buddy_allocator.cc
@@ -0,0 +1,35 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+
+#include "paddle/memory/detail/system_allocator.h"
+
+namespace paddle {
+namespace memory {
+namespace detail {
+
+BuddyAllocator::BuddyAllocator(size_t pool_size, size_t max_pools,
+                               SystemAllocator* system_allocator)
+    : pool_size_(pool_size),
+      max_pools_(max_pools),
+      system_allocator_(system_allocator) {
+  PADDLE_ASSERT(pool_size > 0);
+  PADDLE_ASSERT(max_pools > 0);
+  PADDLE_ASSERT(system_allocator != nullptr);
+}
+
+}  // namespace detail
+}  // namespace memory
+}  // namespace paddle
diff --git a/paddle/memory/detail/buddy_allocator.h b/paddle/memory/detail/buddy_allocator.h
index 35e96fd507..129b137ed7 100644
--- a/paddle/memory/detail/buddy_allocator.h
+++ b/paddle/memory/detail/buddy_allocator.h
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
 
-    http://www.apache.org/licenses/LICENSE-2.0
+   http://www.apache.org/licenses/LICENSE-2.0
 
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
 
 #pragma once
 
@@ -20,34 +20,38 @@ namespace paddle {
 namespace memory {
 namespace detail {
 
-template<typename Allocator>
 class BuddyAllocator {
-  public:
-    // TODO(gangliao): This is a draft, add Buddy Allocator Algorithm soon
-    BuddyAllocator() {}
-    ~BuddyAllocator() {}
-
-  public:
-    void* Alloc(size_t size) {
-        return Allocator::Alloc(size); 
-    }
-    void Free(void*) {
-      // Because all info like size are stored in meta data,
-      // thus it's duplicate if add the parameter `size` in
-      // `Free(void*)` interface.
-    }
-    size_t Used();
+ public:
+  BuddyAllocator(size_t pool_size, size_t max_pools,
+                 SystemAllocator* system_allocator);
+  ~BuddyAllocator();
+
+  void* Alloc(size_t size);
+  void Free(void*);
+  size_t Used();
+
+ private:
+  struct Block {
+    size_t size_;
+    Block* left_;   // left buddy
+    Block* right_;  // right buddy
+  };
+
+  // Initially, there is only one pool.  If Alloc cannot find enough
+  // memory in that pool, and there are fewer than max_num_pools_ pools,
+  // create a new pool by calling system_allocator_->Alloc(pool_size_).
+  std::vector<void*> pools_;
+
+  size_t pool_size_;      // the size of each pool
+  size_t max_num_pools_;  // the maximum number of pools
 
-  public:
-    BuddyAllocator(const BuddyAllocator&) = delete;
-    BuddyAllocator& operator=(const BuddyAllocator&) = delete;
+  SystemAllocator* system_allocator_;
 
-  private:
-    size_t min_alloc_size_;
-    size_t max_alloc_size_;
+  std::mutex mutex_;
 
-  private:
-    std::mutex mutex_;
+  // Disable copy and assignment.
+  BuddyAllocator(const BuddyAllocator&) = delete;
+  BuddyAllocator& operator=(const BuddyAllocator&) = delete;
 };
 
 BuddyAllocator<CPUAllocator>* GetCPUBuddyAllocator() {
@@ -63,16 +67,16 @@ BuddyAllocator<CPUAllocator>* GetCPUBuddyAllocator() {
 BuddyAllocator<GPUAllocator>* GetGPUBuddyAllocator(int gpu_id) {
   static BuddyAllocator<GPUAllocator>** as = NULL;
   if (as == NULL) {
-    int gpu_num = platform::GetDeviceCount(); 
+    int gpu_num = platform::GetDeviceCount();
     as = new BuddyAllocator<GPUAllocator>*[gpu_num];
     for (int gpu = 0; gpu < gpu_num; gpu++) {
-        as[gpu] = new BuddyAllocator<GPUAllocator>();
+      as[gpu] = new BuddyAllocator<GPUAllocator>();
     }
   }
   return as[gpu_id];
 }
 
-#endif  // PADDLE_ONLY_CPU 
+#endif  // PADDLE_ONLY_CPU
 
 }  // namespace detail
 }  // namespace memory
diff --git a/paddle/memory/detail/system_allocator.cc b/paddle/memory/detail/system_allocator.cc
new file mode 100644
index 0000000000..50bec926f8
--- /dev/null
+++ b/paddle/memory/detail/system_allocator.cc
@@ -0,0 +1,90 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/memory/detail/system_allocator.h"
+
+#include <stdlib.h>    // for malloc and free
+#include <sys/mman.h>  // for mlock and munlock
+
+#include "gflags/gflags.h"
+#include "paddle/platform/assert.h"
+#include "paddle/platform/cuda.h"
+
+// If use_pinned_memory is true, CPUAllocator calls mlock to lock the
+// allocated memory so that it can serve as a staging area for data
+// exchange between host and device.  Locking too much memory reduces
+// the amount of memory available to the system for paging, so
+// use_pinned_memory defaults to false.
+DEFINE_bool(use_pinned_memory, false,
+            "If set, allocate cpu/gpu pinned memory.");
+
+namespace paddle {
+namespace memory {
+namespace detail {
+
+void* CPUAllocator::Alloc(size_t size) {
+  // According to http://www.cplusplus.com/reference/cstdlib/malloc/,
+  // malloc might not return nullptr if size is zero, but the returned
+  // pointer shall not be dereferenced -- so we make it nullptr.
+  if (size <= 0) return nullptr;
+
+  void* p = malloc(size);
+  if (p != nullptr && FLAGS_use_pinned_memory) {
+    mlock(p, size);
+  }
+  return p;
+}
+
+void CPUAllocator::Free(void* p, size_t size) {
+  if (p != nullptr && FLAGS_use_pinned_memory) {
+    munlock(p, size);
+  }
+  free(p);
+}
+
+#ifndef PADDLE_ONLY_CPU
+
+void* GPUAllocator::Alloc(size_t size) {
+  // CUDA documentation doesn't explain if cudaMalloc returns nullptr
+  // if size is 0.  We just make sure it does.
+  if (size <= 0) {
+    return nullptr;
+  }
+
+  void* p = 0;
+  cudaError_t result =
+      FLAGS_use_pinned_memory ? cudaMallocHost(&p, size) : cudaMalloc(&p, size);
+  if (result != cudaSuccess) {
+    cudaGetLastError();  // clear error if there is any.
+  }
+  return result == cudaSuccess ? p : nullptr;
+}
+
+void GPUAllocator::Free(void* p, size_t size) {
+  // Purposefully allow cudaErrorCudartUnloading, because
+  // that is returned if you ever call cudaFree after the
+  // driver has already shutdown. This happens only if the
+  // process is terminating, in which case we don't care if
+  // cudaFree succeeds.
+  cudaError_t err = FLAGS_use_pinned_memory ? cudaFreeHost(p) : cudaFree(p);
+  if (err != cudaErrorCudartUnloading) {
+    platform::throw_on_error(err, "cudaFree{Host} failed");
+  }
+}
+
+#endif  // PADDLE_ONLY_CPU
+
+}  // namespace detail
+}  // namespace memory
+}  // namespace paddle
diff --git a/paddle/memory/detail/system_allocator_test.cc b/paddle/memory/detail/system_allocator_test.cc
index c461d8ac62..9bd5706a4e 100644
--- a/paddle/memory/detail/system_allocator_test.cc
+++ b/paddle/memory/detail/system_allocator_test.cc
@@ -22,10 +22,10 @@ limitations under the License. */
 
 DECLARE_bool(use_pinned_memory);
 
-void TestAllocator(paddle::memory::detail::SystemAllocator* a, size_t size) {
+void TestAllocator(paddle::memory::detail::SystemAllocator& a, size_t size) {
   bool freed = false;
   {
-    void* p = a->Alloc(size);
+    void* p = a.Alloc(size);
     if (size > 0) {
       EXPECT_NE(p, nullptr);
     } else {
@@ -33,9 +33,9 @@ void TestAllocator(paddle::memory::detail::SystemAllocator* a, size_t size) {
     }
 
     int* i = static_cast<int*>(p);
-    std::shared_ptr<int> ptr(i, [&freed, a, size](void* p) {
+    std::shared_ptr<int> ptr(i, [&](void* p) {
       freed = true;
-      a->Free(p, size);
+      a.Free(p, size);
     });
   }
   EXPECT_TRUE(freed);
@@ -44,28 +44,28 @@ void TestAllocator(paddle::memory::detail::SystemAllocator* a, size_t size) {
 TEST(CPUAllocator, NoLockMem) {
   FLAGS_use_pinned_memory = false;
   paddle::memory::detail::CPUAllocator a;
-  TestAllocator(&a, 2048);
-  TestAllocator(&a, 0);
+  TestAllocator(a, 2048);
+  TestAllocator(a, 0);
 }
 
 TEST(CPUAllocator, LockMem) {
   FLAGS_use_pinned_memory = true;
   paddle::memory::detail::CPUAllocator a;
-  TestAllocator(&a, 2048);
-  TestAllocator(&a, 0);
+  TestAllocator(a, 2048);
+  TestAllocator(a, 0);
 }
 
 #ifndef PADDLE_ONLY_CPU
 TEST(GPUAllocator, NoStaging) {
   FLAGS_use_pinned_memory = false;
   paddle::memory::detail::GPUAllocator a;
-  TestAllocator(&a, 2048);
-  TestAllocator(&a, 0);
+  TestAllocator(a, 2048);
+  TestAllocator(a, 0);
 }
 TEST(GPUAllocator, Staging) {
   FLAGS_use_pinned_memory = true;
   paddle::memory::detail::GPUAllocator a;
-  TestAllocator(&a, 2048);
-  TestAllocator(&a, 0);
+  TestAllocator(a, 2048);
+  TestAllocator(a, 0);
 }
 #endif  // PADDLE_ONLY_CPU

From 80642bee00c3f723d213b0475749aeee60d89795 Mon Sep 17 00:00:00 2001
From: wanghaoshuang <wanghaoshuang>
Date: Wed, 28 Jun 2017 08:22:58 +0800
Subject: [PATCH 21/86] fix_xmap and refine flowers dataset

---
 python/paddle/v2/dataset/__init__.py          |  3 +-
 python/paddle/v2/dataset/flowers.py           | 67 ++++++++++---------
 .../paddle/v2/dataset/tests/flowers_test.py   |  4 +-
 python/paddle/v2/reader/decorator.py          | 47 +++++++------
 .../paddle/v2/reader/tests/decorator_test.py  | 18 ++---
 5 files changed, 72 insertions(+), 67 deletions(-)

diff --git a/python/paddle/v2/dataset/__init__.py b/python/paddle/v2/dataset/__init__.py
index 26252d5bbd..2e4beb6882 100644
--- a/python/paddle/v2/dataset/__init__.py
+++ b/python/paddle/v2/dataset/__init__.py
@@ -25,8 +25,9 @@ import uci_housing
 import sentiment
 import wmt14
 import mq2007
+import flowers
 
 __all__ = [
     'mnist', 'imikolov', 'imdb', 'cifar', 'movielens', 'conll05', 'sentiment'
-    'uci_housing', 'wmt14', 'mq2007'
+    'uci_housing', 'wmt14', 'mq2007', 'flowers'
 ]
diff --git a/python/paddle/v2/dataset/flowers.py b/python/paddle/v2/dataset/flowers.py
index 07c13cf719..a181f3881a 100644
--- a/python/paddle/v2/dataset/flowers.py
+++ b/python/paddle/v2/dataset/flowers.py
@@ -13,18 +13,18 @@
 # limitations under the License.
 """
 This module will download dataset from
-http://www.robots.ox.ac.uk/~vgg/data/flowers/102/index.html 
+http://www.robots.ox.ac.uk/~vgg/data/flowers/102/index.html
 and parse train/test set intopaddle reader creators.
 
-This set contains images of flowers belonging to 102 different categories. 
+This set contains images of flowers belonging to 102 different categories.
 The images were acquired by searching the web and taking pictures. There are a
 minimum of 40 images for each category.
 
 The database was used in:
 
 Nilsback, M-E. and Zisserman, A. Automated flower classification over a large
- number of classes.Proceedings of the Indian Conference on Computer Vision, 
-Graphics and Image Processing (2008) 
+ number of classes.Proceedings of the Indian Conference on Computer Vision,
+Graphics and Image Processing (2008)
 http://www.robots.ox.ac.uk/~vgg/publications/papers/nilsback08.{pdf,ps.gz}.
 
 """
@@ -34,9 +34,9 @@ from common import download
 import tarfile
 import scipy.io as scio
 from paddle.v2.image import *
+from paddle.v2.reader import *
 import os
 import numpy as np
-import paddle.v2 as paddle
 from multiprocessing import cpu_count
 __all__ = ['train', 'test', 'valid']
 
@@ -53,8 +53,8 @@ def default_mapper(sample):
     map image bytes data to type needed by model input layer
     '''
     img, label = sample
-    img = paddle.image.load_image_bytes(img)
-    img = paddle.image.simple_transform(img, 256, 224, True)
+    img = load_image_bytes(img)
+    img = simple_transform(img, 256, 224, True)
     return img.flatten().astype('float32'), label
 
 
@@ -63,22 +63,23 @@ def reader_creator(data_file,
                    setid_file,
                    dataset_name,
                    mapper=default_mapper,
-                   buffered_size=1024):
+                   buffered_size=1024,
+                   useXmap=True):
     '''
-    1. read images from tar file and 
+    1. read images from tar file and
         merge images into batch files in 102flowers.tgz_batch/
     2. get a reader to read sample from batch file
-    
-    :param data_file: downloaded data file 
+
+    :param data_file: downloaded data file
     :type data_file: string
-    :param label_file: downloaded label file 
+    :param label_file: downloaded label file
     :type label_file: string
     :param setid_file: downloaded setid file containing information
                         about how to split dataset
     :type setid_file: string
     :param dataset_name: data set name (tstid|trnid|valid)
     :type dataset_name: string
-    :param mapper: a function to map image bytes data to type 
+    :param mapper: a function to map image bytes data to type
                     needed by model input layer
     :type mapper: callable
     :param buffered_size: the size of buffer used to process images
@@ -105,15 +106,17 @@ def reader_creator(data_file,
             for sample, label in itertools.izip(data, batch['label']):
                 yield sample, int(label)
 
-    return paddle.reader.xmap_readers(mapper, reader,
-                                      cpu_count(), buffered_size)
+    if useXmap:
+        return xmap_readers(mapper, reader, cpu_count(), buffered_size)
+    else:
+        return map_readers(mapper, reader)
 
 
-def train(mapper=default_mapper, buffered_size=1024):
+def train(mapper=default_mapper, buffered_size=1024, useXmap=True):
     '''
-    Create flowers training set reader. 
-    It returns a reader, each sample in the reader is   
-    image pixels in [0, 1] and label in [1, 102] 
+    Create flowers training set reader.
+    It returns a reader, each sample in the reader is
+    image pixels in [0, 1] and label in [1, 102]
     translated from original color image by steps:
     1. resize to 256*256
     2. random crop to 224*224
@@ -128,15 +131,15 @@ def train(mapper=default_mapper, buffered_size=1024):
     return reader_creator(
         download(DATA_URL, 'flowers', DATA_MD5),
         download(LABEL_URL, 'flowers', LABEL_MD5),
-        download(SETID_URL, 'flowers', SETID_MD5), 'trnid', mapper,
-        buffered_size)
+        download(SETID_URL, 'flowers', SETID_MD5), 'tstid', mapper,
+        buffered_size, useXmap)
 
 
-def test(mapper=default_mapper, buffered_size=1024):
+def test(mapper=default_mapper, buffered_size=1024, useXmap=True):
     '''
-    Create flowers test set reader. 
-    It returns a reader, each sample in the reader is   
-    image pixels in [0, 1] and label in [1, 102] 
+    Create flowers test set reader.
+    It returns a reader, each sample in the reader is
+    image pixels in [0, 1] and label in [1, 102]
     translated from original color image by steps:
     1. resize to 256*256
     2. random crop to 224*224
@@ -151,15 +154,15 @@ def test(mapper=default_mapper, buffered_size=1024):
     return reader_creator(
         download(DATA_URL, 'flowers', DATA_MD5),
         download(LABEL_URL, 'flowers', LABEL_MD5),
-        download(SETID_URL, 'flowers', SETID_MD5), 'tstid', mapper,
-        buffered_size)
+        download(SETID_URL, 'flowers', SETID_MD5), 'trnid', mapper,
+        buffered_size, useXmap)
 
 
-def valid(mapper=default_mapper, buffered_size=1024):
+def valid(mapper=default_mapper, buffered_size=1024, useXmap=True):
     '''
-    Create flowers validation set reader. 
-    It returns a reader, each sample in the reader is   
-    image pixels in [0, 1] and label in [1, 102] 
+    Create flowers validation set reader.
+    It returns a reader, each sample in the reader is
+    image pixels in [0, 1] and label in [1, 102]
     translated from original color image by steps:
     1. resize to 256*256
     2. random crop to 224*224
@@ -175,7 +178,7 @@ def valid(mapper=default_mapper, buffered_size=1024):
         download(DATA_URL, 'flowers', DATA_MD5),
         download(LABEL_URL, 'flowers', LABEL_MD5),
         download(SETID_URL, 'flowers', SETID_MD5), 'valid', mapper,
-        buffered_size)
+        buffered_size, useXmap)
 
 
 def fetch():
diff --git a/python/paddle/v2/dataset/tests/flowers_test.py b/python/paddle/v2/dataset/tests/flowers_test.py
index cc0626f4fe..a8ae9a07ac 100644
--- a/python/paddle/v2/dataset/tests/flowers_test.py
+++ b/python/paddle/v2/dataset/tests/flowers_test.py
@@ -31,13 +31,13 @@ class TestFlowers(unittest.TestCase):
     def test_train(self):
         instances, max_label_value = self.check_reader(
             paddle.v2.dataset.flowers.train())
-        self.assertEqual(instances, 1020)
+        self.assertEqual(instances, 6149)
         self.assertEqual(max_label_value, 102)
 
     def test_test(self):
         instances, max_label_value = self.check_reader(
             paddle.v2.dataset.flowers.test())
-        self.assertEqual(instances, 6149)
+        self.assertEqual(instances, 1020)
         self.assertEqual(max_label_value, 102)
 
     def test_valid(self):
diff --git a/python/paddle/v2/reader/decorator.py b/python/paddle/v2/reader/decorator.py
index e432003129..45a4288751 100644
--- a/python/paddle/v2/reader/decorator.py
+++ b/python/paddle/v2/reader/decorator.py
@@ -166,12 +166,12 @@ def buffered(reader, size):
     The buffered data reader will read and save data entries into a
     buffer. Reading from the buffered data reader will proceed as long
     as the buffer is not empty.
-    
+
     :param reader: the data reader to read from.
     :type reader: callable
     :param size: max buffer size.
     :type size: int
-    
+
     :returns: the buffered data reader.
     """
 
@@ -238,7 +238,7 @@ def xmap_readers(mapper, reader, process_num, buffer_size, order=False):
     :type mapper: callable
     :param reader: the data reader to read from
     :type reader: callable
-    :param process_num: process number to handle original sample 
+    :param process_num: process number to handle original sample
     :type process_num: int
     :param buffer_size: max buffer size
     :type buffer_size: int
@@ -248,9 +248,6 @@ def xmap_readers(mapper, reader, process_num, buffer_size, order=False):
     :rtype: callable
     """
     end = XmapEndSignal()
-    in_queue = Queue(buffer_size)
-    out_queue = Queue(buffer_size)
-    out_order = [0]
 
     # define a worker to read samples from reader to in_queue
     def read_worker(reader, in_queue):
@@ -266,12 +263,6 @@ def xmap_readers(mapper, reader, process_num, buffer_size, order=False):
             in_order += 1
         in_queue.put(end)
 
-    # start a read worker in a thread
-    target = order_read_worker if order else read_worker
-    t = Thread(target=target, args=(reader, in_queue))
-    t.daemon = True
-    t.start()
-
     # define a worker to handle samples from in_queue by mapper
     # and put mapped samples into out_queue
     def handle_worker(in_queue, out_queue, mapper):
@@ -298,19 +289,27 @@ def xmap_readers(mapper, reader, process_num, buffer_size, order=False):
         in_queue.put(end)
         out_queue.put(end)
 
-    # start several handle_workers
-    target = order_handle_worker if order else handle_worker
-    args = (in_queue, out_queue, mapper, out_order) if order else (
-        in_queue, out_queue, mapper)
-    workers = []
-    for i in xrange(process_num):
-        worker = Thread(target=target, args=args)
-        worker.daemon = True
-        workers.append(worker)
-    for w in workers:
-        w.start()
-
     def xreader():
+        in_queue = Queue(buffer_size)
+        out_queue = Queue(buffer_size)
+        out_order = [0]
+        # start a read worker in a thread
+        target = order_read_worker if order else read_worker
+        t = Thread(target=target, args=(reader, in_queue))
+        t.daemon = True
+        t.start()
+        # start several handle_workers
+        target = order_handle_worker if order else handle_worker
+        args = (in_queue, out_queue, mapper, out_order) if order else (
+            in_queue, out_queue, mapper)
+        workers = []
+        for i in xrange(process_num):
+            worker = Thread(target=target, args=args)
+            worker.daemon = True
+            workers.append(worker)
+        for w in workers:
+            w.start()
+
         sample = out_queue.get()
         while not isinstance(sample, XmapEndSignal):
             yield sample
diff --git a/python/paddle/v2/reader/tests/decorator_test.py b/python/paddle/v2/reader/tests/decorator_test.py
index bb3c5d220b..5a92951b10 100644
--- a/python/paddle/v2/reader/tests/decorator_test.py
+++ b/python/paddle/v2/reader/tests/decorator_test.py
@@ -132,15 +132,17 @@ class TestXmap(unittest.TestCase):
         for order in orders:
             for tNum in thread_nums:
                 for size in buffered_size:
-                    result = []
-                    for i in paddle.v2.reader.xmap_readers(mapper,
+                    reader = paddle.v2.reader.xmap_readers(mapper,
                                                            reader_creator_10(0),
-                                                           tNum, size, order)():
-                        result.append(i)
-                    if not order:
-                        result.sort()
-                    for idx, e in enumerate(result):
-                        self.assertEqual(e, mapper(idx))
+                                                           tNum, size, order)
+                    for n in xrange(3):
+                        result = []
+                        for i in reader():
+                            result.append(i)
+                        if not order:
+                            result.sort()
+                        for idx, e in enumerate(result):
+                            self.assertEqual(e, mapper(idx))
 
 
 if __name__ == '__main__':

From 4cc9680cc60296f6071fa34893fda4f3d6806b97 Mon Sep 17 00:00:00 2001
From: Helin Wang <ustc.harry@gmail.com>
Date: Wed, 28 Jun 2017 01:16:28 +0000
Subject: [PATCH 22/86] Make pserver able to get server index without etcd
 (decouple pserver with etcd)

The pserver needs a server index for saving the model on the
distributed file system. The server index comes from etcd if etcd is
used; otherwise the user can specify it manually. So we need
pserver.NewService() to take the index as an argument. Since the index
could come from etcd, it would be strange for pserver to take an index
as an argument and at the same time get the index from etcd, so we
need to decouple pserver from etcd.
---
 go/cmd/pserver/pserver.go  |   8 +-
 go/master/etcd_client.go   |   4 +-
 go/pserver/client_test.go  |   3 +-
 go/pserver/etcd_client.go  | 181 +++++++++++++++++++++++++++++++++++++
 go/pserver/service.go      | 156 ++------------------------------
 go/pserver/service_test.go |   8 +-
 6 files changed, 201 insertions(+), 159 deletions(-)
 create mode 100644 go/pserver/etcd_client.go

diff --git a/go/cmd/pserver/pserver.go b/go/cmd/pserver/pserver.go
index 6c85b1804b..8a42d4f8af 100644
--- a/go/cmd/pserver/pserver.go
+++ b/go/cmd/pserver/pserver.go
@@ -30,7 +30,13 @@ func main() {
 	log.SetLevel(level)
 
 	timeout := time.Second * time.Duration((*etcdTimeout))
-	s, err := pserver.NewService(*etcdEndpoint, *numPservers, timeout)
+	e := pserver.NewEtcdClient(*etcdEndpoint, *numPservers, timeout)
+	idx, err := e.Register()
+	if err != nil {
+		panic(err)
+	}
+
+	s, err := pserver.NewService(idx)
 	if err != nil {
 		panic(err)
 	}
diff --git a/go/master/etcd_client.go b/go/master/etcd_client.go
index b7293a7598..f7b4638577 100644
--- a/go/master/etcd_client.go
+++ b/go/master/etcd_client.go
@@ -18,8 +18,8 @@ const (
 	DefaultAddrPath = "/master/addr"
 )
 
-// EtcdClient is the etcd client that master uses for fault tolerance
-// and service registry.
+// EtcdClient is the etcd client that the master uses for fault
+// tolerance and service registry.
 type EtcdClient struct {
 	lockPath  string
 	statePath string
diff --git a/go/pserver/client_test.go b/go/pserver/client_test.go
index 4a62ae88a4..5bd16118a7 100644
--- a/go/pserver/client_test.go
+++ b/go/pserver/client_test.go
@@ -7,7 +7,6 @@ import (
 	"strconv"
 	"strings"
 	"testing"
-	"time"
 
 	"github.com/PaddlePaddle/Paddle/go/pserver"
 )
@@ -31,7 +30,7 @@ func init() {
 		port[i] = p
 
 		go func(l net.Listener) {
-			s, err := pserver.NewService("", 1, time.Second*5)
+			s, err := pserver.NewService(0)
 			if err != nil {
 				panic(err)
 			}
diff --git a/go/pserver/etcd_client.go b/go/pserver/etcd_client.go
new file mode 100644
index 0000000000..4d88243edd
--- /dev/null
+++ b/go/pserver/etcd_client.go
@@ -0,0 +1,181 @@
+package pserver
+
+import (
+	"context"
+	"errors"
+	"strconv"
+	"strings"
+	"time"
+
+	"github.com/PaddlePaddle/Paddle/go/utils/networkhelper"
+	"github.com/coreos/etcd/clientv3"
+	"github.com/coreos/etcd/clientv3/concurrency"
+	log "github.com/sirupsen/logrus"
+)
+
+// EtcdClient is the etcd client that the pserver uses for fault
+// tolerance, service registry and coordination.
+type EtcdClient struct {
+	numPservers   int
+	etcdEndpoints string
+	etcdClient    *clientv3.Client
+	// etcdTimeout is also used as retry intervals.
+	etcdTimeout time.Duration
+	// FIXME: ensure GetExternalIP gets the correct ip for trainers to connect.
+	externalIP string
+	// desired number of pservers in the job.
+	// assume desired will not change during one training job.
+	desired int
+}
+
+// NewEtcdClient creates an EtcdClient
+func NewEtcdClient(endpoints string, numPservers int, timeout time.Duration) *EtcdClient {
+	return &EtcdClient{
+		etcdTimeout:   timeout,
+		numPservers:   numPservers,
+		etcdEndpoints: endpoints,
+	}
+}
+
+// Register connects to etcd, initializes and reads the desired pserver count
+// (PsDesired), then claims a free pserver slot, retrying each etcd step until
+// it succeeds. Register returns the index of the current pserver.
+func (e *EtcdClient) Register() (int, error) {
+
+	var err error
+	e.externalIP, err = networkhelper.GetExternalIP()
+	if err != nil {
+		return 0, err
+	}
+
+	// initialize connection to etcd, retrying until a client is created.
+	ep := strings.Split(e.etcdEndpoints, ",")
+	for {
+		cli, err := clientv3.New(clientv3.Config{
+			Endpoints:   ep,
+			DialTimeout: e.etcdTimeout,
+		})
+		if err != nil {
+			log.Errorf("connect to etcd error: %v", err)
+			time.Sleep(e.etcdTimeout)
+			continue
+		}
+		e.etcdClient = cli
+		log.Debugf("inited client to %s", e.etcdEndpoints)
+		break
+	}
+	// init /ps_desired using a transaction, because multiple pservers may try
+	// to write it at the same time.
+	for {
+		ctx, cancel := context.WithTimeout(context.Background(), time.Second)
+		_, err := e.initDesiredPsercers(ctx, e.numPservers)
+		cancel()
+		if err != nil {
+			log.Warn(err)
+			time.Sleep(e.etcdTimeout)
+			continue
+		}
+		break
+	}
+	// TODO: when implementing extending or reducing pservers, /ps_desired is
+	// changed, then we need to watch /ps_desired node for events. For now, just
+	// write once when init and read from it.
+	// wait and set the initial value of e.desired
+	for {
+		ctx, cancel := context.WithTimeout(context.Background(), time.Second)
+		resp, err := e.etcdClient.Get(ctx, PsDesired)
+		cancel()
+		if err != nil {
+			log.Errorf("getting %s error: %v", PsDesired, err)
+			time.Sleep(e.etcdTimeout)
+			continue
+		}
+		if len(resp.Kvs) != 0 {
+			e.desired, err = strconv.Atoi(string(resp.Kvs[0].Value))
+			if err == nil {
+				break
+			}
+			// NOTE: wait until the ps_desired value changes.
+			log.Errorf("value of %s invalid %v\n", PsDesired, err)
+		}
+		// Key missing or unparsable: back off instead of spinning on etcd.
+		time.Sleep(e.etcdTimeout)
+	}
+
+	var pserverIdx int
+	// try to register the pserver node on etcd
+	for {
+		ctx, cancel := context.WithTimeout(context.Background(), time.Second)
+		var err error
+		pserverIdx, err = e.registerPserverEtcd(ctx)
+		cancel()
+		if err != nil {
+			log.Warn(err)
+			time.Sleep(e.etcdTimeout)
+			continue
+		}
+		break
+	}
+
+	return pserverIdx, nil
+}
+
+func (e *EtcdClient) initDesiredPsercers(ctx context.Context, numPservers int) (*clientv3.TxnResponse, error) {
+	return concurrency.NewSTM(e.etcdClient, func(c concurrency.STM) error {
+		dsStr := c.Get(PsDesired)
+		if dsStr == "" {
+			c.Put(PsDesired, strconv.Itoa(numPservers))
+		}
+		return nil
+	}, concurrency.WithAbortContext(ctx), concurrency.WithIsolation(concurrency.RepeatableReads))
+}
+
+// registerPserverEtcd claims the first free "/ps/<i>" key (i < e.desired) in
+// an etcd STM transaction, storing e.externalIP under a lease (TTL 5) that is
+// kept alive in the background. It returns the claimed index i.
+func (e *EtcdClient) registerPserverEtcd(ctx context.Context) (int, error) {
+	var idx int
+	_, err := concurrency.NewSTM(e.etcdClient, func(c concurrency.STM) error {
+		registered := false
+		for i := 0; i < e.desired; i++ {
+			psKey := "/ps/" + strconv.Itoa(i)
+			log.Debugf("checking %s", psKey)
+			ps := c.Get(psKey)
+			log.Debugf("got value (%s) for key: %s", ps, psKey)
+
+			if ps == "" {
+				resp, err := e.etcdClient.Grant(context.TODO(), 5)
+				if err != nil {
+					log.Fatal(err)
+				}
+				// found the first free id: write our address under the lease
+				c.Put(psKey, e.externalIP, clientv3.WithLease(resp.ID))
+				log.Debugf("set pserver node %s with value %s", psKey, e.externalIP)
+				ch, kaerr := e.etcdClient.KeepAlive(context.TODO(), resp.ID)
+				if kaerr != nil {
+					log.Errorf("keepalive etcd node error: %v", kaerr)
+					return kaerr
+				}
+				// Keep draining keep-alive responses. Ranging stops once ch is
+				// closed; a single receive would yield nil and panic on ka.TTL.
+				go func(ch <-chan *clientv3.LeaseKeepAliveResponse) {
+					for ka := range ch {
+						log.Debugf("keepalive: %d\n", ka.TTL)
+					}
+				}(ch)
+				log.Debug("register finished")
+				idx = i
+				registered = true
+				break
+			}
+		}
+		if registered {
+			return nil
+		}
+		return errors.New("not registerd, may due to already have enough pservers")
+	}, concurrency.WithAbortContext(ctx), concurrency.WithIsolation(concurrency.RepeatableReads))
+	if err != nil {
+		return 0, err
+	}
+	return idx, nil
+}
diff --git a/go/pserver/service.go b/go/pserver/service.go
index f966595fdc..f386ebea1e 100644
--- a/go/pserver/service.go
+++ b/go/pserver/service.go
@@ -1,18 +1,9 @@
 package pserver
 
 import (
-	"context"
 	"errors"
 	"fmt"
-	"strconv"
-	"strings"
 	"sync"
-	"time"
-
-	"github.com/PaddlePaddle/Paddle/go/utils/networkhelper"
-	"github.com/coreos/etcd/clientv3"
-	"github.com/coreos/etcd/clientv3/concurrency"
-	log "github.com/sirupsen/logrus"
 )
 
 // ElementType is the type of elements of a Parameter.
@@ -55,160 +46,25 @@ type Gradient Parameter
 // Service is the RPC service for pserver.
 type Service struct {
 	initialized chan struct{}
+	idx         int
 
 	mu       sync.Mutex
 	opt      *optimizer
 	paramMap map[string]Parameter
-
-	etcdEndpoints string
-	etcdClient    *clientv3.Client
-	// etcdTimeout is also used as retry intervals.
-	etcdTimeout time.Duration
-	// desired number of pservers in the job.
-	// assume desired will not change during one training job.
-	desired int
-	// FIXME: ensure GetExternalIP gets the correct ip for trainers to connect.
-	externalIP string
 }
 
 // NewService creates a new service, will bypass etcd registration if no
 // endpoints specified.
-func NewService(endpoints string, numPservers int, timeout time.Duration) (*Service, error) {
-	s := &Service{opt: newOptimizer(sgd, 0.005)}
+func NewService(idx int) (*Service, error) {
+	s := &Service{
+		idx: idx,
+		opt: newOptimizer(sgd, 0.005),
+	}
 	s.paramMap = make(map[string]Parameter)
 	s.initialized = make(chan struct{})
-	s.etcdEndpoints = endpoints
-	s.etcdTimeout = timeout
-
-	var err error
-	s.externalIP, err = networkhelper.GetExternalIP()
-	if err != nil {
-		return nil, err
-	}
-
-	if endpoints != "" {
-		// initialize connection to etcd, try
-		ep := strings.Split(s.etcdEndpoints, ",")
-		for {
-			cli, err := clientv3.New(clientv3.Config{
-				Endpoints:   ep,
-				DialTimeout: s.etcdTimeout,
-			})
-			if err != nil {
-				log.Errorf("connect to etcd error: %v", err)
-				time.Sleep(s.etcdTimeout)
-				continue
-			}
-			s.etcdClient = cli
-			log.Debugf("inited client to %s", s.etcdEndpoints)
-			break
-		}
-		// init /ps_desired using transaction, for multiple pservers may want to write
-		// it at the same time.
-		for {
-			ctx, cancel := context.WithTimeout(context.Background(), time.Second)
-			_, err := s.initDesiredPsercers(ctx, numPservers)
-			cancel()
-			if err != nil {
-				log.Warn(err)
-				time.Sleep(s.etcdTimeout)
-				continue
-			}
-			break
-		}
-		// TODO: when implementing extending or reducing pservers, /ps_desired is
-		// changed, then we need to watch /ps_desired node for events. For now, just
-		// write once when init and read from it.
-		// wait and set s.desired init value
-		for {
-			ctx, cancel := context.WithTimeout(context.Background(), time.Second)
-			resp, err := s.etcdClient.Get(ctx, PsDesired)
-			cancel()
-			if err != nil {
-				log.Errorf("getting %s error: %v", PsDesired, err)
-				time.Sleep(s.etcdTimeout)
-				continue
-			}
-			if len(resp.Kvs) != 0 {
-				s.desired, err = strconv.Atoi(string(resp.Kvs[0].Value))
-				if err != nil {
-					log.Errorf("value of %s invalid %v\n", PsDesired, err)
-					time.Sleep(s.etcdTimeout)
-					// NOTE: wait util ps_desired value change
-					continue
-				}
-				break
-			}
-		}
-		// try register pserver node on etcd
-		for {
-			ctx, cancel := context.WithTimeout(context.Background(), time.Second)
-			_, err := s.registerPserverEtcd(ctx)
-			cancel()
-			if err != nil {
-				log.Warn(err)
-				time.Sleep(s.etcdTimeout)
-				continue
-			}
-			break
-		}
-	} // if endpoints != ""
-	// Bypass etcd registration if no endpoints specified
 	return s, nil
 }
 
-func (s *Service) initDesiredPsercers(ctx context.Context, numPservers int) (*clientv3.TxnResponse, error) {
-	return concurrency.NewSTM(s.etcdClient, func(c concurrency.STM) error {
-		dsStr := c.Get(PsDesired)
-		if dsStr == "" {
-			c.Put(PsDesired, strconv.Itoa(numPservers))
-		}
-		return nil
-	}, concurrency.WithAbortContext(ctx), concurrency.WithIsolation(concurrency.RepeatableReads))
-}
-
-// registerPserverEtcd registers pserver node on etcd using transaction.
-func (s *Service) registerPserverEtcd(ctx context.Context) (*clientv3.TxnResponse, error) {
-	return concurrency.NewSTM(s.etcdClient, func(c concurrency.STM) error {
-		registered := false
-		for i := 0; i < s.desired; i++ {
-			psKey := "/ps/" + strconv.Itoa(i)
-			log.Debugf("checking %s", psKey)
-			ps := c.Get(psKey)
-			log.Debugf("got value (%s) for key: %s", ps, psKey)
-
-			if ps == "" {
-				resp, err := s.etcdClient.Grant(context.TODO(), 5)
-				if err != nil {
-					log.Fatal(err)
-				}
-				// find the first id and write info
-				c.Put(psKey, s.externalIP, clientv3.WithLease(resp.ID))
-				log.Debugf("set pserver node %s with value %s", psKey, s.externalIP)
-				ch, kaerr := s.etcdClient.KeepAlive(context.TODO(), resp.ID)
-				if kaerr != nil {
-					log.Errorf("keepalive etcd node error: %v", kaerr)
-					return kaerr
-				}
-
-				// Eat the keep alive message so etcd
-				// will not expire the lease.
-				go func(ch <-chan *clientv3.LeaseKeepAliveResponse) {
-					ka := <-ch
-					log.Debugf("keepalive: %d\n", ka.TTL)
-				}(ch)
-				log.Debug("register finished")
-				registered = true
-				break
-			}
-		}
-		if registered == true {
-			return nil
-		}
-		return errors.New("not registerd, may due to already have enough pservers")
-	}, concurrency.WithAbortContext(ctx), concurrency.WithIsolation(concurrency.RepeatableReads))
-}
-
 // InitParam initializes a parameter.
 func (s *Service) InitParam(paramWithConfigs ParameterWithConfig, dummy *int) error {
 	select {
diff --git a/go/pserver/service_test.go b/go/pserver/service_test.go
index 1d84f15d78..d9d887cffd 100644
--- a/go/pserver/service_test.go
+++ b/go/pserver/service_test.go
@@ -10,7 +10,7 @@ import (
 )
 
 func TestFull(t *testing.T) {
-	s, err := pserver.NewService("", 1, time.Second*5)
+	s, err := pserver.NewService(0)
 	if err != nil {
 		t.Error(err)
 	}
@@ -75,7 +75,7 @@ func TestFull(t *testing.T) {
 }
 
 func TestMultipleInit(t *testing.T) {
-	s, err := pserver.NewService("", 1, time.Second*5)
+	s, err := pserver.NewService(0)
 	if err != nil {
 		t.Error(err)
 	}
@@ -91,7 +91,7 @@ func TestMultipleInit(t *testing.T) {
 }
 
 func TestUninitialized(t *testing.T) {
-	s, err := pserver.NewService("", 1, time.Second*5)
+	s, err := pserver.NewService(0)
 	err = s.SendGrad(pserver.Gradient{}, nil)
 	if err.Error() != pserver.Uninitialized {
 		t.FailNow()
@@ -99,7 +99,7 @@ func TestUninitialized(t *testing.T) {
 }
 
 func TestBlockUntilInitialized(t *testing.T) {
-	s, err := pserver.NewService("", 1, time.Second*5)
+	s, err := pserver.NewService(0)
 	if err != nil {
 		t.Error(err)
 	}

From 3e9aa7fd8bfac7434057afcdd6ae62ea7a92bff1 Mon Sep 17 00:00:00 2001
From: liaogang <liaogang@baidu.com>
Date: Wed, 28 Jun 2017 10:42:54 +0800
Subject: [PATCH 23/86] FIX: Pass CI

---
 cmake/generic.cmake                     | 4 ----
 paddle/memory/CMakeLists.txt            | 6 ------
 paddle/memory/detail/buddy_allocator.cc | 2 +-
 paddle/memory/detail/buddy_allocator.h  | 3 +++
 4 files changed, 4 insertions(+), 11 deletions(-)

diff --git a/cmake/generic.cmake b/cmake/generic.cmake
index 840155750e..69e8164a00 100644
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -78,10 +78,6 @@
 #
 #   cc_test(example_test SRCS example_test.cc DEPS example glog gflags)
 
-if(WITH_GPU)
-  add_definitions(-DPADDLE_WITH_GPU)
-endif()
-
 if(NOT APPLE)
     find_package(Threads REQUIRED)
     link_libraries(${CMAKE_THREAD_LIBS_INIT})
diff --git a/paddle/memory/CMakeLists.txt b/paddle/memory/CMakeLists.txt
index e74ce75c93..3943c3cfad 100644
--- a/paddle/memory/CMakeLists.txt
+++ b/paddle/memory/CMakeLists.txt
@@ -1,7 +1 @@
 add_subdirectory(detail)
-
-if(${WITH_GPU})
-  nv_library(memory SRCS memory.cc)
-else(${WITH_GPU})
-  cc_library(memory SRCS memory.cc)
-endif(${WITH_GPU})
diff --git a/paddle/memory/detail/buddy_allocator.cc b/paddle/memory/detail/buddy_allocator.cc
index 895bf319d7..ebe680f5ee 100644
--- a/paddle/memory/detail/buddy_allocator.cc
+++ b/paddle/memory/detail/buddy_allocator.cc
@@ -14,7 +14,7 @@
 
 #pragma once
 
-#include "paddle/memory/detail/system_allocator.h"
+#include "paddle/memory/detail/buddy_allocator.h"
 
 namespace paddle {
 namespace memory {
diff --git a/paddle/memory/detail/buddy_allocator.h b/paddle/memory/detail/buddy_allocator.h
index 129b137ed7..702c7d28ee 100644
--- a/paddle/memory/detail/buddy_allocator.h
+++ b/paddle/memory/detail/buddy_allocator.h
@@ -16,6 +16,9 @@
 
 #include "paddle/memory/detail/system_allocator.h"
 
+#include <vector>
+#include <mutex>
+
 namespace paddle {
 namespace memory {
 namespace detail {

From 9490d243dd0255021b288f9c2e43c57b30264b9b Mon Sep 17 00:00:00 2001
From: liaogang <liaogang@baidu.com>
Date: Wed, 28 Jun 2017 10:46:14 +0800
Subject: [PATCH 24/86] ENH: clang-format

---
 paddle/memory/detail/buddy_allocator.h | 2 +-
 paddle/platform/cuda.h                 | 9 ++++-----
 2 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/paddle/memory/detail/buddy_allocator.h b/paddle/memory/detail/buddy_allocator.h
index 702c7d28ee..82e6aaedc7 100644
--- a/paddle/memory/detail/buddy_allocator.h
+++ b/paddle/memory/detail/buddy_allocator.h
@@ -16,8 +16,8 @@
 
 #include "paddle/memory/detail/system_allocator.h"
 
-#include <vector>
 #include <mutex>
+#include <vector>
 
 namespace paddle {
 namespace memory {
diff --git a/paddle/platform/cuda.h b/paddle/platform/cuda.h
index 864a5d3340..8fe891f9ce 100644
--- a/paddle/platform/cuda.h
+++ b/paddle/platform/cuda.h
@@ -29,13 +29,12 @@ inline void throw_on_error(cudaError_t e, const char* message) {
 }
 
 int GetDeviceCount(void) {
-    int count;
-    throw_on_error(cudaGetDeviceCount(&count),
-                   "cudaGetDeviceCount failed");
-    return count;
+  int count;
+  throw_on_error(cudaGetDeviceCount(&count), "cudaGetDeviceCount failed");
+  return count;
 }
 
 }  // namespace platform
 }  // namespace paddle
 
-#endif  // PADDLE_ONLY_CPU 
+#endif  // PADDLE_ONLY_CPU

From 60a65b5d90052d27fbd5928b71304a8f91fce181 Mon Sep 17 00:00:00 2001
From: wuyi05 <wuyi05@baidu.com>
Date: Wed, 28 Jun 2017 13:33:00 +0800
Subject: [PATCH 25/86] design doc for go package management

---
 doc/design/build_system/README.md | 34 +++++++++++++++++++++++++++++++
 1 file changed, 34 insertions(+)

diff --git a/doc/design/build_system/README.md b/doc/design/build_system/README.md
index 310739f37a..96af6566d0 100644
--- a/doc/design/build_system/README.md
+++ b/doc/design/build_system/README.md
@@ -105,3 +105,37 @@ shared_library(api
 ### Implementation
 
 As above example CMakeLists.txt executes, each function invocation adds "nodes" to a dependency graph.  It also use this graph to generate CMake commands including `add_executable`, `add_dependencies`, `target_link_libraries`, and `add_test`.
+
+### Using Package Manager For Go
+
+Building Go binaries and libraries requires satisfying their dependencies. Generally
+we can run `go get ./...` to download and compile all external dependencies. The
+problems are:
+
+1. `go get` always fetches the latest code from the master branch, so when an external
+    project deprecates something or changes its APIs, our builds
+    may break. This is very different from what we already have in `cmake/external`,
+    which downloads a specific version or commit id of each dependency.
+1. Some locations cannot access external dependencies through the internet, as mentioned
+   in https://github.com/PaddlePaddle/Paddle/issues/2605. A package management
+   tool can bundle the dependencies as a "vendor" package, which can be mirrored
+   on many cloud file hosting services, so users who want to compile paddle themselves can
+   download this "vendor" package from a mirror site.
+
+#### Godep vs. Glide
+
+Here's a brief comparison of package managers in the current Go ecosystem: https://github.com/Masterminds/glide/wiki/Go-Package-Manager-Comparison. There are
+also many complaints about `Godep`. A new "official" package management tool,
+https://github.com/golang/dep, has been started to resolve such problems, but it is currently
+at the alpha stage. So glide is the best choice for now.
+
+#### Manage Go Packages
+
+- Dependencies: `go/glide.yaml` will store the dependencies that are directly imported
+  by paddle, along with their versions. `go/glide.lock` will store all dependencies recursively
+  with their commit ids. Builds will stay "locked" to these packages unless we `glide up`
+  them.
+- Vendor package: the `go/vendor` directory will be generated when running the `cmake` command. `cmake`
+  will download the code corresponding to `go/glide.lock`. If we put a vendor folder
+  under `go/`, cmake will just check the commit ids of the packages under that folder;
+  if the commit ids match, there will be no download at all.

From 6ad1d21c4b22adcb6fb970875256a08622d1af6e Mon Sep 17 00:00:00 2001
From: qiaolongfei <qiaolongfei@baidu.com>
Date: Wed, 28 Jun 2017 14:12:13 +0800
Subject: [PATCH 26/86] refine code of operator

---
 paddle/framework/scope.cc      | 45 +++++++++++++++-------------------
 paddle/framework/scope.h       | 30 +++++++++++++++--------
 paddle/framework/scope_test.cc | 42 ++++++++++++++++++-------------
 3 files changed, 65 insertions(+), 52 deletions(-)

diff --git a/paddle/framework/scope.cc b/paddle/framework/scope.cc
index e985598849..5c197cec2a 100644
--- a/paddle/framework/scope.cc
+++ b/paddle/framework/scope.cc
@@ -1,18 +1,27 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
 #include "paddle/framework/scope.h"
 
 namespace paddle {
 namespace framework {
 
-Error Scope::CreateVariable(const std::string& name) {
-  if (name == "") {
-    return Error("Variable name should not be empty");
+Variable* Scope::CreateVariable(const std::string& name) {
+  if (!HasVariable(name)) {
+    vars_[name] = std::unique_ptr<Variable>(new Variable());
   }
-
-  if (HaveVariable(name)) {
-    return AlreadyCreated;
-  }
-  vars_[name] = std::unique_ptr<Variable>(new Variable());
-  return Error();
+  return GetVariable(name);
 }
 
 Variable* Scope::GetVarLocally(const std::string& name) const {
@@ -33,22 +42,8 @@ Variable* Scope::GetVariable(const std::string& name) const {
   }
 }
 
-Variable* Scope::GetOrCreateVariable(const std::string& name) {
-  Variable* var = GetVariable(name);
-  if (var != nullptr) {
-    return var;
-  }
-
-  Error err = CreateVariable(name);
-  if (!err.isOK()) {
-    return nullptr;
-  } else {
-    return GetVariable(name);
-  }
-}
-
-bool Scope::HaveVariable(const std::string& name) {
-  return vars_.count(name) != 0;
+bool Scope::HasVariable(const std::string &name) {
+  return (vars_.count(name) > 0 || (parent_ && parent_->HasVariable(name)));
 }
 
 }  // namespace framework
diff --git a/paddle/framework/scope.h b/paddle/framework/scope.h
index 90c8141e4f..81491f34d8 100644
--- a/paddle/framework/scope.h
+++ b/paddle/framework/scope.h
@@ -1,15 +1,28 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
 #pragma once
 
 #include <unordered_map>
 #include <vector>
+#include <string>
+
 #include "paddle/framework/variable.h"
-#include "paddle/utils/Error.h"
 
 namespace paddle {
 namespace framework {
 
-const static Error AlreadyCreated("Variable has already been created");
-
 /**
  * Scope is an association of a name to Variable. All variables belong to
  * `Scope`. You need to specify a scope to run a Net, i.e., `net.Run(&scope)`.
@@ -26,20 +39,17 @@ class Scope {
 
   // Create Variable in this Scope. Return error if Variable already been
   // created.
-  Error __must_check CreateVariable(const std::string& name);
+  Variable* CreateVariable(const std::string& name);
 
   // Get Variable from this Scope, this function will recursive find Variable
   // from it's parent scope. Return nullptr if not found.
   Variable* GetVariable(const std::string& name) const;
 
-  // find and return Variables in the scope it self.
+  // Find and return Variables in the scope it self.
   Variable* GetVarLocally(const std::string& name) const;
 
-  // Get a Variable from Scope, if the Variable is not exist then create it.
-  // User should call this function most of time.
-  Variable* GetOrCreateVariable(const std::string& name);
-
-  bool HaveVariable(const std::string& name);
+  // Find if there is a Variable in this scope and it's parent scope
+  bool HasVariable(const std::string &name);
 
  private:
   std::unordered_map<std::string, std::unique_ptr<Variable>> vars_;
diff --git a/paddle/framework/scope_test.cc b/paddle/framework/scope_test.cc
index 09fbb78d69..25c144868b 100644
--- a/paddle/framework/scope_test.cc
+++ b/paddle/framework/scope_test.cc
@@ -1,47 +1,55 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
 #include "paddle/framework/scope.h"
 #include "gtest/gtest.h"
 
 TEST(Scope, Create) {
   using paddle::framework::Scope;
-  using paddle::Error;
   using paddle::framework::Variable;
-  using paddle::framework::AlreadyCreated;
 
   Scope* scope = new Scope();
 
-  Error err = scope->CreateVariable("");
-  EXPECT_FALSE(err.isOK());
+  Variable* var0 = scope->CreateVariable("");
+  EXPECT_NE(var0, nullptr);
 
   Variable* var1 = scope->GetVariable("a");
   EXPECT_EQ(var1, nullptr);
 
-  Error err1 = scope->CreateVariable("a");
-  EXPECT_TRUE(err1.isOK());
-
-  Error err2 = scope->CreateVariable("a");
-  EXPECT_EQ(err2, AlreadyCreated);
-
-  Variable* var2 = scope->GetVariable("a");
+  Variable* var2 = scope->CreateVariable("a");
   EXPECT_NE(var2, nullptr);
 
-  Variable* var3 = scope->GetOrCreateVariable("b");
-  EXPECT_NE(var3, nullptr);
+  Variable* var3 = scope->CreateVariable("a");
+  EXPECT_EQ(var2, var3);
+
+  Variable* var4 = scope->GetVariable("a");
+  EXPECT_EQ(var2, var4);
 }
 
 TEST(Scope, Parent) {
   using paddle::framework::Scope;
   using paddle::framework::Variable;
-  using paddle::Error;
 
   const auto parent_scope_ptr = std::shared_ptr<Scope>(new Scope());
   Scope* scope = new Scope(parent_scope_ptr);
 
-  Error err = parent_scope_ptr->CreateVariable("a");
-  EXPECT_TRUE(err.isOK());
+  Variable* var0 = parent_scope_ptr->CreateVariable("a");
+  EXPECT_NE(var0, nullptr);
 
   Variable* var1 = scope->GetVarLocally("a");
   EXPECT_EQ(var1, nullptr);
 
   Variable* var2 = scope->GetVariable("a");
-  EXPECT_NE(var2, nullptr);
+  EXPECT_EQ(var2, var0);
 }
\ No newline at end of file

From 3e16f5fdfbb52d4381d2eea2d4a60d414e72cf0d Mon Sep 17 00:00:00 2001
From: qiaolongfei <qiaolongfei@baidu.com>
Date: Wed, 28 Jun 2017 14:15:54 +0800
Subject: [PATCH 27/86] clang format

---
 paddle/framework/scope.cc | 2 +-
 paddle/framework/scope.h  | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/paddle/framework/scope.cc b/paddle/framework/scope.cc
index 5c197cec2a..72cb744707 100644
--- a/paddle/framework/scope.cc
+++ b/paddle/framework/scope.cc
@@ -42,7 +42,7 @@ Variable* Scope::GetVariable(const std::string& name) const {
   }
 }
 
-bool Scope::HasVariable(const std::string &name) {
+bool Scope::HasVariable(const std::string& name) {
   return (vars_.count(name) > 0 || (parent_ && parent_->HasVariable(name)));
 }
 
diff --git a/paddle/framework/scope.h b/paddle/framework/scope.h
index 81491f34d8..a624fe3bbe 100644
--- a/paddle/framework/scope.h
+++ b/paddle/framework/scope.h
@@ -14,9 +14,9 @@ limitations under the License. */
 
 #pragma once
 
+#include <string>
 #include <unordered_map>
 #include <vector>
-#include <string>
 
 #include "paddle/framework/variable.h"
 
@@ -49,7 +49,7 @@ class Scope {
   Variable* GetVarLocally(const std::string& name) const;
 
   // Find if there is a Variable in this scope and it's parent scope
-  bool HasVariable(const std::string &name);
+  bool HasVariable(const std::string& name);
 
  private:
   std::unordered_map<std::string, std::unique_ptr<Variable>> vars_;

From 7062be0fbdbe2a916d7dd81e81c013bc7ee4a914 Mon Sep 17 00:00:00 2001
From: hedaoyuan <hedaoyuan@github.com>
Date: Wed, 28 Jun 2017 15:12:30 +0800
Subject: [PATCH 28/86] Add cmake for compile NNPACKConvOp.cpp.

---
 CMakeLists.txt                          |  5 +++++
 paddle/function/CMakeLists.txt          |  5 +++++
 paddle/function/nnpack/NNPACKConvOp.cpp | 29 +++++++++++++++++--------
 paddle/function/nnpack/nnpack.cmake     | 16 ++++++++++++++
 4 files changed, 46 insertions(+), 9 deletions(-)
 create mode 100644 paddle/function/nnpack/nnpack.cmake

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3c719d35ec..f645ed04a1 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -48,6 +48,7 @@ option(COVERALLS_UPLOAD "Package code coverage data to coveralls"       OFF)
 option(ON_TRAVIS        "Exclude special unit test on Travis CI"        OFF)
 option(WITH_C_API       "Compile PaddlePaddle with C-API(Prediction)"   OFF)
 option(WITH_GOLANG      "Compile PaddlePaddle with GOLANG"              OFF)
+option(USE_NNPACK       "Compile PaddlePaddle with NNPACK library"      OFF)
 
 # CMAKE_BUILD_TYPE
 if(NOT CMAKE_BUILD_TYPE)
@@ -126,6 +127,10 @@ if(WITH_GPU)
     endif(NOT WITH_DSO)
 endif(WITH_GPU)
 
+if(USE_NNPACK)
+  list(APPEND EXTERNAL_LIBS ${NNPACK_LIB} ${PTHREADPOOL_LIB} "rt")
+endif(USE_NNPACK)
+
 add_subdirectory(proto)
 add_subdirectory(paddle)
 add_subdirectory(python)
diff --git a/paddle/function/CMakeLists.txt b/paddle/function/CMakeLists.txt
index 5e170714cf..daa2aa150e 100644
--- a/paddle/function/CMakeLists.txt
+++ b/paddle/function/CMakeLists.txt
@@ -10,6 +10,11 @@ if(WITH_GPU)
     cuda_compile(cu_objs ${cu_files})
 endif()
 
+if(USE_NNPACK)
+  include(nnpack/nnpack.cmake)
+  list(APPEND cpp_files nnpack/NNPACKConvOp.cpp)
+endif()
+
 add_library(paddle_function STATIC ${cpp_files} ${cu_objs})
 add_dependencies(paddle_function ${external_project_dependencies})
 add_dependencies(paddle_function gen_proto_cpp)
diff --git a/paddle/function/nnpack/NNPACKConvOp.cpp b/paddle/function/nnpack/NNPACKConvOp.cpp
index 57a6681f29..5e4de55469 100644
--- a/paddle/function/nnpack/NNPACKConvOp.cpp
+++ b/paddle/function/nnpack/NNPACKConvOp.cpp
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "ConvOp.h"
 #include "nnpack.h"
+#include "paddle/function/ConvOp.h"
 
 DEFINE_bool(nnpack_allocate_outside,
             false,
@@ -72,14 +72,22 @@ public:
     }
   }
 
+  // Checks the input/filter/output shapes, mapped the same way as in calc():
+  // inputs[0] is the input, inputs[1] the filter, outputs[0] the output.
+  void check(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    const TensorShape& input = inputs[0].shape();
+    const TensorShape& filter = inputs[1].shape();
+    const TensorShape& output = outputs[0].shape();
+    checkShape(input, filter, output);
+
   void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
     CHECK_EQ(numInputs_, inputs.size());
     CHECK_EQ(numOutputs_, outputs.size());
     CHECK_EQ(outputs[0].getArgType(), ASSIGN_TO);
+    check(inputs, outputs);
     const TensorShape& input = inputs[0].shape();
     const TensorShape& filter = inputs[1].shape();
     const TensorShape& output = outputs[0].shape();
-    check(input, filter, output);
 
     size_t batchSize = input[0];
     size_t inputChannels = input[1];
@@ -92,12 +100,13 @@ public:
     // size_t outputWidth = output[3];
 
     nnp_size inputSize = {.width = inputWidth, .height = inputHeight};
-    nnp_padding padding = {.top = paddingH(),
-                           .right = paddingW(),
-                           .bottom = paddingH(),
-                           .left = paddingW()};
+    nnp_padding padding = {.top = (size_t)paddingH(),
+                           .right = (size_t)paddingW(),
+                           .bottom = (size_t)paddingH(),
+                           .left = (size_t)paddingW()};
     nnp_size kernelSize = {.width = filterWidth, .height = filterHeight};
-    nnp_size outputSubsampling = {.width = strideW(), .height = strideH()};
+    nnp_size outputSubsampling = {.width = (size_t)strideW(),
+                                  .height = (size_t)strideH()};
 
     float* inputData = inputs[0].data<float>();
     float* filterData = inputs[1].data<float>();
@@ -129,7 +138,8 @@ public:
         CHECK_EQ(status, nnp_status_success);
       } else {
         // only supports stride = 1
-        CHECK_EQ(stride_, 1);
+        CHECK_EQ(strideH(), 1);
+        CHECK_EQ(strideW(), 1);
         nnp_status status = nnp_convolution_output(algorithm_,
                                                    batchSize,
                                                    inputChannels,
@@ -189,7 +199,8 @@ public:
       CHECK_EQ(status, nnp_status_success);
     } else {
       // only supports stride = 1
-      CHECK_EQ(stride_, 1);
+      CHECK_EQ(strideH(), 1);
+      CHECK_EQ(strideW(), 1);
       nnp_status status = nnp_convolution_output(algorithm_,
                                                  batchSize,
                                                  inputChannels,
diff --git a/paddle/function/nnpack/nnpack.cmake b/paddle/function/nnpack/nnpack.cmake
new file mode 100644
index 0000000000..7182730ae8
--- /dev/null
+++ b/paddle/function/nnpack/nnpack.cmake
@@ -0,0 +1,16 @@
+# Find the NNPACK library
+#  NNPACK_ROOT - where to find NNPACK include and library.
+#
+
+set(NNPACK_FOUND OFF)
+set(NNPACK_ROOT $ENV{NNPACK_ROOT} CACHE PATH "Folder contains NNPACK")
+find_path(NNPACK_INC_DIR nnpack.h PATHS ${NNPACK_ROOT}/include)
+find_library(NNPACK_LIB NAMES nnpack PATHS ${NNPACK_ROOT}/lib)
+find_library(PTHREADPOOL_LIB NAMES pthreadpool PATHS ${NNPACK_ROOT}/lib)
+
+if(NNPACK_INC_DIR AND NNPACK_LIB AND PTHREADPOOL_LIB)
+  set(NNPACK_FOUND ON)
+  INCLUDE_DIRECTORIES(${NNPACK_INC_DIR})
+else()
+  message(FATAL_ERROR "Cannot find NNPACK in (${NNPACK_ROOT})")
+endif()

From 2d9113dac13000851d0d95818299f3e7c0d532c4 Mon Sep 17 00:00:00 2001
From: hedaoyuan <hedaoyuan@github.com>
Date: Wed, 28 Jun 2017 15:47:23 +0800
Subject: [PATCH 29/86] Add test for NNPACKConvFunc.

---
 paddle/function/CMakeLists.txt              |  3 +
 paddle/function/nnpack/NNPACKConvOpTest.cpp | 96 +++++++++++++++++++++
 2 files changed, 99 insertions(+)
 create mode 100644 paddle/function/nnpack/NNPACKConvOpTest.cpp

diff --git a/paddle/function/CMakeLists.txt b/paddle/function/CMakeLists.txt
index daa2aa150e..4ef8d80ff1 100644
--- a/paddle/function/CMakeLists.txt
+++ b/paddle/function/CMakeLists.txt
@@ -13,6 +13,9 @@ endif()
 if(USE_NNPACK)
   include(nnpack/nnpack.cmake)
   list(APPEND cpp_files nnpack/NNPACKConvOp.cpp)
+  if(WITH_TESTING)
+    add_unittest(NNPACKConvOpTest nnpack/NNPACKConvOpTest.cpp)
+  endif()
 endif()
 
 add_library(paddle_function STATIC ${cpp_files} ${cu_objs})
diff --git a/paddle/function/nnpack/NNPACKConvOpTest.cpp b/paddle/function/nnpack/NNPACKConvOpTest.cpp
new file mode 100644
index 0000000000..e7ce61cc6c
--- /dev/null
+++ b/paddle/function/nnpack/NNPACKConvOpTest.cpp
@@ -0,0 +1,96 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include "paddle/function/Function.h"
+#include "paddle/function/FunctionTest.h"
+
+DEFINE_string(algo,
+              "auto",
+              "The algorithm (auto, ft8x8, ft16x16, wt8x8, "
+              "implicit-gemm, or direct) for computing convolution of NNPACK.");
+
+namespace paddle {
+
+#define IS_NNPACK_SUPPORT(algo, filterSize, stride)        \
+  if (algo == "direct" && filterSize != 1) continue;       \
+  if (algo == "direct" && batchSize != 1) continue;        \
+  if (algo == "wt8x8" && filterSize != 3) continue;        \
+  if (algo == "implicit-gemm" && batchSize != 1) continue; \
+  if (algo != "auto" && algo != "implicit-gemm" && stride > 1) continue;
+
+class ConvolutionTest {
+public:
+  ConvolutionTest(const std::string& conv1,
+                  const std::string& conv2,
+                  std::string algo = "auto") {
+    for (size_t batchSize : {1, 32}) {
+      for (size_t inputSize : {7, 14, 54}) {
+        for (size_t filterSize : {1, 3, 5}) {
+          for (size_t inputChannels : {3, 64}) {
+            for (size_t outputChannels : {3, 64, 128}) {
+              if (inputChannels < outputChannels) break;
+              for (size_t stride : {1, 2}) {
+                // if batchSize > 1 NNPACKConv only supports stride = 1
+                if (batchSize > 1 && stride > 1) break;
+                for (size_t padding : {0, 1}) {
+                  if (padding >= filterSize) break;
+                  size_t outputSize =
+                      (inputSize - filterSize + 2 * padding + stride) / stride;
+                  IS_NNPACK_SUPPORT(algo, filterSize, stride);
+                  LOG(INFO) << " batchSize=" << batchSize
+                            << " inputChannels=" << inputChannels
+                            << " inputHeight=" << inputSize
+                            << " inputWidth=" << inputSize
+                            << " outputChannels=" << outputChannels
+                            << " filterHeight=" << filterSize
+                            << " filterWidth=" << filterSize
+                            << " outputHeight=" << outputSize
+                            << " outputWidth=" << outputSize
+                            << " stride=" << stride << " padding=" << padding;
+
+                  Compare2Function<DEVICE_TYPE_CPU, DEVICE_TYPE_CPU> test(
+                      conv1,
+                      conv2,
+                      FuncConfig()
+                          .set("padding", padding)
+                          .set("stride", stride)
+                          .set("algo", algo));
+
+                  TensorShape shape0{
+                      batchSize, inputChannels, inputSize, inputSize};
+                  TensorShape shape1{
+                      outputChannels, inputChannels, filterSize, filterSize};
+                  TensorShape shape2{
+                      batchSize, outputChannels, outputSize, outputSize};
+                  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, shape0));
+                  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, shape1));
+                  test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, shape2));
+                  test.run();
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+};
+
+TEST(Convolution, NNPACK) {
+  // NNPACK only supports stride = 1 when batchSize > 1
+  ConvolutionTest test("GemmConv-CPU", "NNPACKConv-CPU", FLAGS_algo);
+}
+
+}  // namespace paddle

From b8ffa8b9e9f468f79fea7f0bd452be2f8c64d17a Mon Sep 17 00:00:00 2001
From: qiaolongfei <qiaolongfei@baidu.com>
Date: Wed, 28 Jun 2017 15:57:00 +0800
Subject: [PATCH 30/86] move code to scope.h, remove scope.cc

---
 paddle/framework/CMakeLists.txt |  4 +--
 paddle/framework/scope.cc       | 50 ---------------------------
 paddle/framework/scope.h        | 61 ++++++++++++++++++++++-----------
 paddle/framework/scope_test.cc  | 16 ++++-----
 4 files changed, 49 insertions(+), 82 deletions(-)
 delete mode 100644 paddle/framework/scope.cc

diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt
index 7ea17f7114..6caeb1be3a 100644
--- a/paddle/framework/CMakeLists.txt
+++ b/paddle/framework/CMakeLists.txt
@@ -6,6 +6,4 @@ nv_test(dim_test SRCS dim_test.cu DEPS ddim)
 
 cc_test(variable_test SRCS variable_test.cc)
 
-# scope lib
-cc_library(scope SRCS scope.cc)
-cc_test(scope_test SRCS scope_test.cc DEPS scope)
+cc_test(scope_test SRCS scope_test.cc)
diff --git a/paddle/framework/scope.cc b/paddle/framework/scope.cc
deleted file mode 100644
index 72cb744707..0000000000
--- a/paddle/framework/scope.cc
+++ /dev/null
@@ -1,50 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/framework/scope.h"
-
-namespace paddle {
-namespace framework {
-
-Variable* Scope::CreateVariable(const std::string& name) {
-  if (!HasVariable(name)) {
-    vars_[name] = std::unique_ptr<Variable>(new Variable());
-  }
-  return GetVariable(name);
-}
-
-Variable* Scope::GetVarLocally(const std::string& name) const {
-  if (vars_.count(name)) {
-    return vars_.at(name).get();
-  }
-  return nullptr;
-}
-
-Variable* Scope::GetVariable(const std::string& name) const {
-  Variable* var = GetVarLocally(name);
-  if (var != nullptr) {
-    return var;
-  } else if (parent_ != nullptr) {
-    return parent_->GetVariable(name);
-  } else {
-    return nullptr;
-  }
-}
-
-bool Scope::HasVariable(const std::string& name) {
-  return (vars_.count(name) > 0 || (parent_ && parent_->HasVariable(name)));
-}
-
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/framework/scope.h b/paddle/framework/scope.h
index a624fe3bbe..2f8d6dbd97 100644
--- a/paddle/framework/scope.h
+++ b/paddle/framework/scope.h
@@ -19,37 +19,58 @@ limitations under the License. */
 #include <vector>
 
 #include "paddle/framework/variable.h"
+#include "paddle/platform/assert.h"
 
 namespace paddle {
 namespace framework {
 
 /**
  * Scope is an association of a name to Variable. All variables belong to
- * `Scope`. You need to specify a scope to run a Net, i.e., `net.Run(&scope)`.
+ * Scope. You need to specify a scope to run a Net, i.e., `net.Run(&scope)`.
  * One net can run in different scopes and update different variable in the
  * scope.
  */
 class Scope {
  public:
-  Scope() {}
-
-  explicit Scope(const std::shared_ptr<Scope>& scope) : parent_(scope) {}
-
-  ~Scope() {}
-
-  // Create Variable in this Scope. Return error if Variable already been
-  // created.
-  Variable* CreateVariable(const std::string& name);
-
-  // Get Variable from this Scope, this function will recursive find Variable
-  // from it's parent scope. Return nullptr if not found.
-  Variable* GetVariable(const std::string& name) const;
-
-  // Find and return Variables in the scope it self.
-  Variable* GetVarLocally(const std::string& name) const;
-
-  // Find if there is a Variable in this scope and it's parent scope
-  bool HasVariable(const std::string& name);
+  explicit Scope(const std::shared_ptr<Scope>& parent = nullptr)
+      : parent_(parent) {}
+
+  /// Create Variable in this Scope. Failed if Variable already been
+  /// created.
+  Variable* CreateVariable(const std::string& name) {
+    PADDLE_ASSERT(!HasVariable(name));
+    vars_[name] = std::unique_ptr<Variable>(new Variable());
+    return GetVariable(name);
+  }
+
+  /// Get Variable from this Scope; this function also searches the parent
+  /// scopes recursively. Return nullptr if not found.
+  Variable* GetVariable(const std::string& name) const {
+    auto it = vars_.find(name);
+    if (it != vars_.end()) {
+      return it->second.get();
+    } else if (parent_ != nullptr) {
+      return parent_->GetVariable(name);
+    } else {
+      return nullptr;
+    }
+  }
+
+  /// Get Variable from scope; if it does not exist, create one and return it.
+  Variable* GetOrCreateVariable(const std::string& name) {
+    auto var = GetVariable(name);
+    if (var) {
+      return var;
+    } else {
+      return CreateVariable(name);
+    }
+  }
+
+  /// Find if there is a Variable in this scope or its parent scope
+  bool HasVariable(const std::string& name) const {
+    return (vars_.find(name) != vars_.end() ||
+            (parent_ && parent_->HasVariable(name)));
+  }
 
  private:
   std::unordered_map<std::string, std::unique_ptr<Variable>> vars_;
diff --git a/paddle/framework/scope_test.cc b/paddle/framework/scope_test.cc
index 25c144868b..34ee21e1aa 100644
--- a/paddle/framework/scope_test.cc
+++ b/paddle/framework/scope_test.cc
@@ -28,12 +28,13 @@ TEST(Scope, Create) {
   EXPECT_EQ(var1, nullptr);
 
   Variable* var2 = scope->CreateVariable("a");
-  EXPECT_NE(var2, nullptr);
 
-  Variable* var3 = scope->CreateVariable("a");
+  ASSERT_DEATH({ scope->CreateVariable("a"); }, "");
+
+  Variable* var3 = scope->GetVariable("a");
   EXPECT_EQ(var2, var3);
 
-  Variable* var4 = scope->GetVariable("a");
+  Variable* var4 = scope->GetOrCreateVariable("a");
   EXPECT_EQ(var2, var4);
 }
 
@@ -47,9 +48,6 @@ TEST(Scope, Parent) {
   Variable* var0 = parent_scope_ptr->CreateVariable("a");
   EXPECT_NE(var0, nullptr);
 
-  Variable* var1 = scope->GetVarLocally("a");
-  EXPECT_EQ(var1, nullptr);
-
-  Variable* var2 = scope->GetVariable("a");
-  EXPECT_EQ(var2, var0);
-}
\ No newline at end of file
+  Variable* var1 = scope->GetVariable("a");
+  EXPECT_EQ(var0, var1);
+}

From 3a119efedad1a15f587c9415c70f661853a8d579 Mon Sep 17 00:00:00 2001
From: Yu Yang <yuyang18@baidu.com>
Date: Wed, 28 Jun 2017 16:18:22 +0800
Subject: [PATCH 31/86] Adding Enforce to platform

Basically from caffe2::logging.h, but only expose `PADDLE_ENFORCE`
interface.
---
 paddle/platform/CMakeLists.txt  |   1 +
 paddle/platform/enforce.h       | 116 ++++++++++++++++++++++++++++++++
 paddle/platform/enforce_test.cc |  25 +++++++
 3 files changed, 142 insertions(+)
 create mode 100644 paddle/platform/enforce.h
 create mode 100644 paddle/platform/enforce_test.cc

diff --git a/paddle/platform/CMakeLists.txt b/paddle/platform/CMakeLists.txt
index 7abe2ab89e..8435410564 100644
--- a/paddle/platform/CMakeLists.txt
+++ b/paddle/platform/CMakeLists.txt
@@ -3,3 +3,4 @@ nv_test(cuda_test SRCS cuda_test.cu)
 cc_library(place SRCS place.cc)
 cc_test(place_test SRCS place_test.cc DEPS place glog gflags)
 cc_test(must_check_test SRCS must_check_test.cc)
+cc_test(enforce_test SRCS enforce_test.cc)
diff --git a/paddle/platform/enforce.h b/paddle/platform/enforce.h
new file mode 100644
index 0000000000..e501e80c55
--- /dev/null
+++ b/paddle/platform/enforce.h
@@ -0,0 +1,116 @@
+/*
+  Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+  Licensed under the Apache License, Version 2.0 (the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+  http://www.apache.org/licenses/LICENSE-2.0
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+*/
+
+#pragma once
+#include <exception>
+#include <sstream>
+
+namespace paddle {
+namespace platform {
+
+/**
+ * @brief Enforce exception. Inherits std::exception
+ *
+ * Any enforce condition that is not met throws an EnforceNotMet exception.
+ */
+class EnforceNotMet : public std::exception {
+ public:
+  EnforceNotMet(const std::string& msg, const char* file, int fileline)
+      : file_(file), fileline_(fileline) {
+    std::ostringstream sout;
+    sout << msg << " at [" << file_ << ":" << fileline_ << "];";
+    all_msg_ = sout.str();
+  }
+
+  const char* what() const noexcept override { return all_msg_.c_str(); }
+
+ private:
+  std::string all_msg_;
+  const char* file_;
+  int fileline_;
+};
+
+namespace details {
+
+inline void MakeStringInternal(std::ostringstream& stream) {}
+
+template <typename T>
+inline void MakeStringInternal(std::ostringstream& stream, T v) {
+  stream << v;
+}
+
+template <typename T, typename... ARGS>
+inline void MakeStringInternal(std::ostringstream& stream, T v, ARGS... args) {
+  MakeStringInternal(stream, v);
+  MakeStringInternal(stream, args...);
+};
+
+/**
+ * @brief MakeString concatenates all args into a single string.
+ */
+template <typename... ARGS>
+inline std::string MakeString(ARGS... args) {
+  std::ostringstream sout;
+  details::MakeStringInternal(sout, args...);
+  return sout.str();
+}
+
+/**
+ * @brief special handle string
+ */
+template <>
+inline std::string MakeString<std::string>(std::string str) {
+  return str;
+}
+
+/**
+ * @brief special handle const char*
+ */
+template <>
+inline std::string MakeString<const char*>(const char* str) {
+  return std::string(str);
+}
+}  // namespace details
+
+// From https://stackoverflow.com/questions/30130930/
+// __builtin_expect is a GCC/Clang builtin, not part of the C++ standard. Since
+// an enforced condition should hold in most situations, marking its failure as
+// `UNLIKELY` lets the compiler generate faster code for the common path.
+#define UNLIKELY(condition) __builtin_expect(static_cast<bool>(condition), 0)
+
+/**
+ * @brief Throw an EnforceNotMet exception, automatically filled with __FILE__
+ * and __LINE__.
+ *
+ * This macro takes __VA_ARGS__; the user can pass values of any type that can
+ * be serialized to a std::ostream.
+ */
+#define PADDLE_THROW(...)                                               \
+  do {                                                                  \
+    throw ::paddle::platform::EnforceNotMet(                            \
+        ::paddle::platform::details::MakeString(__VA_ARGS__), __FILE__, \
+        __LINE__);                                                      \
+  } while (0)
+
+/**
+ * @brief Enforce a condition, otherwise throw an EnforceNotMet
+ */
+#define PADDLE_ENFORCE(condition, ...) \
+  do {                                 \
+    if (UNLIKELY(!(condition))) {      \
+      PADDLE_THROW(__VA_ARGS__);       \
+    }                                  \
+  } while (0)
+
+}  // namespace platform
+}  // namespace paddle
diff --git a/paddle/platform/enforce_test.cc b/paddle/platform/enforce_test.cc
new file mode 100644
index 0000000000..d3e945e972
--- /dev/null
+++ b/paddle/platform/enforce_test.cc
@@ -0,0 +1,25 @@
+#include <gtest/gtest.h>
+#include <paddle/platform/enforce.h>
+
+TEST(ENFORCE, OK) {
+  PADDLE_ENFORCE(true, "Enforce is ok", 123, "now", 0.345);
+  size_t val = 1;
+  const size_t limit = 10;
+  PADDLE_ENFORCE(val < limit, "Enforce is OK too");
+}
+
+TEST(ENFORCE, FAILED) {
+  bool in_catch = false;
+  try {
+    PADDLE_ENFORCE(false, "Enforce is not ok ", 123, " at all");
+  } catch (const paddle::platform::EnforceNotMet& err) {
+    in_catch = true;
+    std::string msg = "Enforce is not ok 123 at all";
+    const char* what = err.what();
+    for (size_t i = 0; i < msg.length(); ++i) {
+      ASSERT_EQ(what[i], msg[i]);
+    }
+  }
+  // The handler must have run, i.e. the failed enforce really threw.
+  ASSERT_TRUE(in_catch);
+}
\ No newline at end of file

From cdf8d99080c1c36c505cd5dbe7572fe2f71bac6d Mon Sep 17 00:00:00 2001
From: hedaoyuan <hedaoyuan@github.com>
Date: Wed, 28 Jun 2017 16:26:16 +0800
Subject: [PATCH 32/86] Bug fix.

---
 paddle/function/nnpack/NNPACKConvOp.cpp     | 4 ++--
 paddle/function/nnpack/NNPACKConvOpTest.cpp | 7 +++++--
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/paddle/function/nnpack/NNPACKConvOp.cpp b/paddle/function/nnpack/NNPACKConvOp.cpp
index 5e4de55469..d75fab0403 100644
--- a/paddle/function/nnpack/NNPACKConvOp.cpp
+++ b/paddle/function/nnpack/NNPACKConvOp.cpp
@@ -74,9 +74,9 @@ public:
 
   virtual void check(const BufferArgs& inputs,
                      const BufferArgs& outputs) override {
-    const TensorShape& output = inputs[0].shape();
+    const TensorShape& input = inputs[0].shape();
     const TensorShape& filter = inputs[1].shape();
-    const TensorShape& input = outputs[0].shape();
+    const TensorShape& output = outputs[0].shape();
     checkShape(input, filter, output);
   }
 
diff --git a/paddle/function/nnpack/NNPACKConvOpTest.cpp b/paddle/function/nnpack/NNPACKConvOpTest.cpp
index e7ce61cc6c..4818011211 100644
--- a/paddle/function/nnpack/NNPACKConvOpTest.cpp
+++ b/paddle/function/nnpack/NNPACKConvOpTest.cpp
@@ -60,12 +60,15 @@ public:
                             << " outputWidth=" << outputSize
                             << " stride=" << stride << " padding=" << padding;
 
+                  std::vector<size_t> paddings = {padding, padding};
+                  std::vector<size_t> strides = {stride, stride};
                   Compare2Function<DEVICE_TYPE_CPU, DEVICE_TYPE_CPU> test(
                       conv1,
                       conv2,
                       FuncConfig()
-                          .set("padding", padding)
-                          .set("stride", stride)
+                          .set("paddings", paddings)
+                          .set("strides", strides)
+                          .set("groups", (size_t)1)
                           .set("algo", algo));
 
                   TensorShape shape0{

From 1678ad7b3067a8c72ac504fd8cb00e83766cbba2 Mon Sep 17 00:00:00 2001
From: qiaolongfei <qiaolongfei@baidu.com>
Date: Wed, 28 Jun 2017 16:33:43 +0800
Subject: [PATCH 33/86] add Create for scope

---
 paddle/framework/scope.h       | 8 +++++++-
 paddle/framework/scope_test.cc | 8 ++++----
 2 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/paddle/framework/scope.h b/paddle/framework/scope.h
index 2f8d6dbd97..bb22c4b834 100644
--- a/paddle/framework/scope.h
+++ b/paddle/framework/scope.h
@@ -31,10 +31,16 @@ namespace framework {
  * scope.
  */
 class Scope {
- public:
+ private:
   explicit Scope(const std::shared_ptr<Scope>& parent = nullptr)
       : parent_(parent) {}
 
+ public:
+  static std::shared_ptr<Scope> Create(
+      const std::shared_ptr<Scope>& parent = nullptr) {
+    return std::make_shared<Scope>(Scope(parent));
+  }
+
   /// Create Variable in this Scope. Failed if Variable already been
   /// created.
   Variable* CreateVariable(const std::string& name) {
diff --git a/paddle/framework/scope_test.cc b/paddle/framework/scope_test.cc
index 34ee21e1aa..d73391d977 100644
--- a/paddle/framework/scope_test.cc
+++ b/paddle/framework/scope_test.cc
@@ -19,7 +19,7 @@ TEST(Scope, Create) {
   using paddle::framework::Scope;
   using paddle::framework::Variable;
 
-  Scope* scope = new Scope();
+  auto scope = Scope::Create();
 
   Variable* var0 = scope->CreateVariable("");
   EXPECT_NE(var0, nullptr);
@@ -42,10 +42,10 @@ TEST(Scope, Parent) {
   using paddle::framework::Scope;
   using paddle::framework::Variable;
 
-  const auto parent_scope_ptr = std::shared_ptr<Scope>(new Scope());
-  Scope* scope = new Scope(parent_scope_ptr);
+  auto parent_scope = Scope::Create();
+  auto scope = Scope::Create(parent_scope);
 
-  Variable* var0 = parent_scope_ptr->CreateVariable("a");
+  Variable* var0 = parent_scope->CreateVariable("a");
   EXPECT_NE(var0, nullptr);
 
   Variable* var1 = scope->GetVariable("a");

From d2581f34e8179bdd7e0b9ce8a9d3e847758ff52d Mon Sep 17 00:00:00 2001
From: Yu Yang <yuyang18@baidu.com>
Date: Wed, 28 Jun 2017 17:48:20 +0800
Subject: [PATCH 34/86] change copy right format

---
 paddle/platform/enforce.h       | 22 ++++++++++------------
 paddle/platform/enforce_test.cc | 11 +++++++++++
 2 files changed, 21 insertions(+), 12 deletions(-)

diff --git a/paddle/platform/enforce.h b/paddle/platform/enforce.h
index e501e80c55..fbd3405a24 100644
--- a/paddle/platform/enforce.h
+++ b/paddle/platform/enforce.h
@@ -1,15 +1,13 @@
-/*
-  Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-  Licensed under the Apache License, Version 2.0 (the "License");
-  you may not use this file except in compliance with the License.
-  You may obtain a copy of the License at
-  http://www.apache.org/licenses/LICENSE-2.0
-  Unless required by applicable law or agreed to in writing, software
-  distributed under the License is distributed on an "AS IS" BASIS,
-  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-  See the License for the specific language governing permissions and
-  limitations under the License.
-*/
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #pragma once
 #include <exception>
diff --git a/paddle/platform/enforce_test.cc b/paddle/platform/enforce_test.cc
index d3e945e972..23b32444ad 100644
--- a/paddle/platform/enforce_test.cc
+++ b/paddle/platform/enforce_test.cc
@@ -1,3 +1,14 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
 #include <gtest/gtest.h>
 #include <paddle/platform/enforce.h>
 

From fc5972ba2c0c2565d4255fda19f1b68f02c18e62 Mon Sep 17 00:00:00 2001
From: wanghaoshuang <wanghaoshuang>
Date: Wed, 28 Jun 2017 19:54:25 +0800
Subject: [PATCH 35/86] fix requirement config for flowers dataset

---
 python/setup.py.in | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/python/setup.py.in b/python/setup.py.in
index 86fc0fc5c0..aa6771709c 100644
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -15,7 +15,8 @@ setup_requires=["requests",
                 "protobuf==3.1",
                 "recordio",
                 "matplotlib",
-                "rarfile"]
+                "rarfile",
+                "scipy>=0.19.0"]
 
 if '${CMAKE_SYSTEM_PROCESSOR}' not in ['arm', 'armv7-a', 'aarch64']:
     setup_requires+=["opencv-python"]

From b93e863a1c5f31e9404dee8a2a6684119b876a2a Mon Sep 17 00:00:00 2001
From: dangqingqing <dangqingqing@baidu.com>
Date: Wed, 28 Jun 2017 20:02:52 +0800
Subject: [PATCH 36/86] Fix bug in MultiGradientMachine.

---
 paddle/gserver/gradientmachines/MultiGradientMachine.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddle/gserver/gradientmachines/MultiGradientMachine.cpp b/paddle/gserver/gradientmachines/MultiGradientMachine.cpp
index 8ef5e9d0c1..018da6c76d 100644
--- a/paddle/gserver/gradientmachines/MultiGradientMachine.cpp
+++ b/paddle/gserver/gradientmachines/MultiGradientMachine.cpp
@@ -601,7 +601,7 @@ void TrainerThread::backward() {
 
 void TrainerThread::backwardCallback(Parameter* para) {
   // CPU parameters are merged in the end
-  if (!para->useGpu()) return;
+  if (!para->useGpu() || para->isStatic()) return;
 
   int paramId = para->getID();
   if (multiMachine_->getNumThreads() == 1) {

From 9ad846ecee27ff1860debc4658090f1cfa75140f Mon Sep 17 00:00:00 2001
From: Yu Yang <yuyang18@baidu.com>
Date: Wed, 28 Jun 2017 21:20:33 +0800
Subject: [PATCH 37/86] Remove must_check in paddle::platform

---
 paddle/platform/CMakeLists.txt     |  1 -
 paddle/platform/must_check.h       | 26 --------------------------
 paddle/platform/must_check_test.cc | 10 ----------
 paddle/utils/Error.h               | 16 +++++++++++++++-
 4 files changed, 15 insertions(+), 38 deletions(-)
 delete mode 100644 paddle/platform/must_check.h
 delete mode 100644 paddle/platform/must_check_test.cc

diff --git a/paddle/platform/CMakeLists.txt b/paddle/platform/CMakeLists.txt
index 7abe2ab89e..c7d7b14518 100644
--- a/paddle/platform/CMakeLists.txt
+++ b/paddle/platform/CMakeLists.txt
@@ -2,4 +2,3 @@ nv_test(cuda_test SRCS cuda_test.cu)
 
 cc_library(place SRCS place.cc)
 cc_test(place_test SRCS place_test.cc DEPS place glog gflags)
-cc_test(must_check_test SRCS must_check_test.cc)
diff --git a/paddle/platform/must_check.h b/paddle/platform/must_check.h
deleted file mode 100644
index 4fcc62afc0..0000000000
--- a/paddle/platform/must_check.h
+++ /dev/null
@@ -1,26 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-/**
- * __must_check macro. It make the function's return value must be used,
- * otherwise it will raise a compile warning. And also Paddle treat all compile
- * warnings as errors.
- */
-#ifdef __GNUC__
-#if (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__) >= 30400
-#define __must_check __attribute__((warn_unused_result))
-#else
-#define __must_check
-#endif
-#else
-#define __must_check
-#endif
diff --git a/paddle/platform/must_check_test.cc b/paddle/platform/must_check_test.cc
deleted file mode 100644
index 6ee3ea49ac..0000000000
--- a/paddle/platform/must_check_test.cc
+++ /dev/null
@@ -1,10 +0,0 @@
-#include <gtest/gtest.h>
-#include <paddle/platform/must_check.h>
-
-int __must_check SomeFunctionMustCheck() { return 0; }
-
-TEST(MustCheck, all) {
-  //  This line should not be compiled, because the
-  //  return value of SomeFunctionMustCheck marked as __must_check
-  //  SomeFunctionMustCheck();
-}
\ No newline at end of file
diff --git a/paddle/utils/Error.h b/paddle/utils/Error.h
index f3d535c69c..27ddaab3f0 100644
--- a/paddle/utils/Error.h
+++ b/paddle/utils/Error.h
@@ -19,7 +19,21 @@ limitations under the License. */
 #include <stdio.h>
 #include <memory>
 #include <string>
-#include "paddle/platform/must_check.h"
+
+/**
+ * __must_check macro. It requires the function's return value to be used;
+ * otherwise the compiler raises a warning. Paddle also treats all compile
+ * warnings as errors.
+ */
+#ifdef __GNUC__
+#if (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__) >= 30400
+#define __must_check __attribute__((warn_unused_result))
+#else
+#define __must_check
+#endif
+#else
+#define __must_check
+#endif
 
 namespace paddle {
 

From b1a311c44d9554a1710d26c78f487f9786dd1934 Mon Sep 17 00:00:00 2001
From: Yu Yang <yuyang18@baidu.com>
Date: Wed, 28 Jun 2017 21:48:40 +0800
Subject: [PATCH 38/86] Add pb_cc_library in generic.cmake

Fix #2567
---
 cmake/external/protobuf.cmake   |  8 ++++++--
 cmake/generic.cmake             | 33 +++++++++++++++++++++++++++++++++
 paddle/api/CMakeLists.txt       |  2 +-
 paddle/capi/CMakeLists.txt      |  2 +-
 paddle/function/CMakeLists.txt  |  2 +-
 paddle/gserver/CMakeLists.txt   |  2 +-
 paddle/math/CMakeLists.txt      |  2 +-
 paddle/optimizer/CMakeLists.txt |  2 +-
 paddle/parameter/CMakeLists.txt |  2 +-
 paddle/pserver/CMakeLists.txt   |  4 ++--
 paddle/testing/CMakeLists.txt   |  4 ++--
 paddle/trainer/CMakeLists.txt   |  2 +-
 paddle/utils/CMakeLists.txt     |  2 +-
 proto/CMakeLists.txt            | 16 +---------------
 14 files changed, 53 insertions(+), 30 deletions(-)

diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake
index d43badc1da..891fb29118 100644
--- a/cmake/external/protobuf.cmake
+++ b/cmake/external/protobuf.cmake
@@ -39,12 +39,16 @@ macro(PROMPT_PROTOBUF_LIB)
     ADD_LIBRARY(protobuf_lite ${protobuf_LIBTYPE} IMPORTED GLOBAL)
     SET_PROPERTY(TARGET protobuf_lite PROPERTY IMPORTED_LOCATION ${PROTOBUF_LITE_LIBRARY})
 
-    ADD_LIBRARY(protoc ${protobuf_LIBTYPE} IMPORTED GLOBAL)
-    SET_PROPERTY(TARGET protoc PROPERTY IMPORTED_LOCATION ${PROTOC_LIBRARY})
+    ADD_LIBRARY(libprotoc ${protobuf_LIBTYPE} IMPORTED GLOBAL)
+    SET_PROPERTY(TARGET libprotoc PROPERTY IMPORTED_LOCATION ${PROTOC_LIBRARY})
+
+    ADD_EXECUTABLE(protoc IMPORTED GLOBAL)
+    SET_PROPERTY(TARGET protoc PROPERTY IMPORTED_LOCATION ${PROTOBUF_PROTOC_EXECUTABLE})
 
     FOREACH(dep ${protobuf_DEPS})
         ADD_DEPENDENCIES(protobuf ${dep})
         ADD_DEPENDENCIES(protobuf_lite ${dep})
+        ADD_DEPENDENCIES(libprotoc ${dep})
         ADD_DEPENDENCIES(protoc ${dep})
     ENDFOREACH()
 
diff --git a/cmake/generic.cmake b/cmake/generic.cmake
index 11c1f677ae..0370ab31f3 100644
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -331,3 +331,36 @@ function(go_test TARGET_NAME)
   add_custom_target(${TARGET_NAME} ALL DEPENDS ${TARGET_NAME}_timestamp ${go_test_DEPS})
   add_test(${TARGET_NAME} ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME})
 endfunction(go_test)
+
+function(pb_cc_library TARGET_NAME)
+  set(oneValueArgs "")
+  set(multiValueArgs SRCS)
+  cmake_parse_arguments(pb_cc_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+  set(proto_srcs)
+  set(proto_hdrs)
+  foreach(FIL ${pb_cc_library_SRCS})
+    get_filename_component(ABS_FIL ${FIL} ABSOLUTE)
+    get_filename_component(FIL_WE ${FIL} NAME_WE)
+    if(NOT PROTOBUF_GENERATE_CPP_APPEND_PATH)
+      get_filename_component(FIL_DIR ${FIL} DIRECTORY)
+      if(FIL_DIR)
+        set(FIL_WE "${FIL_DIR}/${FIL_WE}")
+      endif()
+    endif()
+
+    list(APPEND proto_srcs "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}.pb.cc")
+    list(APPEND proto_hdrs "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}.pb.h")
+
+    add_custom_command(
+            OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}.pb.cc"
+            "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}.pb.h"
+            COMMAND  ${PROTOBUF_PROTOC_EXECUTABLE}
+            ARGS "--cpp_out=${DLL_EXPORT_DECL}${CMAKE_CURRENT_BINARY_DIR}" "-I" ${CMAKE_CURRENT_SOURCE_DIR} ${ABS_FIL}
+            DEPENDS ${ABS_FIL} protoc
+            COMMENT "Running C++ protocol buffer compiler on ${FIL}"
+            VERBATIM )
+  endforeach()
+  set_source_files_properties(${proto_srcs} ${proto_hdrs} PROPERTIES GENERATED TRUE)
+  include_directories(${CMAKE_CURRENT_BINARY_DIR})
+  cc_library(${TARGET_NAME} SRCS ${proto_srcs})
+endfunction()
\ No newline at end of file
diff --git a/paddle/api/CMakeLists.txt b/paddle/api/CMakeLists.txt
index f2315e31cc..39d8aa075b 100644
--- a/paddle/api/CMakeLists.txt
+++ b/paddle/api/CMakeLists.txt
@@ -16,7 +16,7 @@ set(API_HEADER
     Internal.h)
 
 add_library(paddle_api STATIC ${API_SOURCES})
-add_dependencies(paddle_api gen_proto_cpp paddle_trainer_lib)
+add_dependencies(paddle_api paddle_proto paddle_trainer_lib)
 
 INCLUDE(${SWIG_USE_FILE})
 INCLUDE_DIRECTORIES(${PROJ_ROOT}/paddle)
diff --git a/paddle/capi/CMakeLists.txt b/paddle/capi/CMakeLists.txt
index 206f512563..11022d1754 100644
--- a/paddle/capi/CMakeLists.txt
+++ b/paddle/capi/CMakeLists.txt
@@ -26,7 +26,7 @@ target_include_directories(paddle_capi PUBLIC ${CMAKE_CURRENT_BINARY_DIR})
 add_style_check_target(paddle_capi ${CAPI_SOURCES} ${CAPI_HEADER}
   ${CAPI_PRIVATE_HEADER})
 
-add_dependencies(paddle_capi gen_proto_cpp)
+add_dependencies(paddle_capi paddle_proto)
 
 
 # combine all paddle static libraries together, into libpaddle_capi_whole.a
diff --git a/paddle/function/CMakeLists.txt b/paddle/function/CMakeLists.txt
index 5e170714cf..1c39ced3c9 100644
--- a/paddle/function/CMakeLists.txt
+++ b/paddle/function/CMakeLists.txt
@@ -12,7 +12,7 @@ endif()
 
 add_library(paddle_function STATIC ${cpp_files} ${cu_objs})
 add_dependencies(paddle_function ${external_project_dependencies})
-add_dependencies(paddle_function gen_proto_cpp)
+add_dependencies(paddle_function paddle_proto)
 
 if(WITH_TESTING)
 if(WITH_GPU)
diff --git a/paddle/gserver/CMakeLists.txt b/paddle/gserver/CMakeLists.txt
index 93a6a99848..3bd583773a 100644
--- a/paddle/gserver/CMakeLists.txt
+++ b/paddle/gserver/CMakeLists.txt
@@ -58,7 +58,7 @@ endif()
 
 add_style_check_target(paddle_gserver ${GSERVER_SOURCES})
 add_style_check_target(paddle_gserver ${GSERVER_HEADER})
-add_dependencies(paddle_gserver gen_proto_cpp)
+add_dependencies(paddle_gserver paddle_proto)
 if(WITH_TESTING)
     add_subdirectory(tests)
 endif()
diff --git a/paddle/math/CMakeLists.txt b/paddle/math/CMakeLists.txt
index f5657c4690..326cdb156c 100644
--- a/paddle/math/CMakeLists.txt
+++ b/paddle/math/CMakeLists.txt
@@ -33,7 +33,7 @@ endif()
 add_style_check_target(paddle_math ${MATH_SOURCES})
 add_style_check_target(paddle_math ${MATH_HEADERS})
 
-add_dependencies(paddle_math gen_proto_cpp)  # depends
+add_dependencies(paddle_math paddle_proto)  # depends
 if(WITH_TESTING)
     add_subdirectory(tests)
 endif()
diff --git a/paddle/optimizer/CMakeLists.txt b/paddle/optimizer/CMakeLists.txt
index 4536f62ec7..bf878baaf0 100644
--- a/paddle/optimizer/CMakeLists.txt
+++ b/paddle/optimizer/CMakeLists.txt
@@ -10,7 +10,7 @@ set(OPITMIZER_SRCS
   )
 
 add_library(paddle_optimizer STATIC ${OPITMIZER_SRCS})
-add_dependencies(paddle_optimizer gen_proto_cpp)
+add_dependencies(paddle_optimizer paddle_proto)
 
 if(WITH_TESTING)
   add_simple_unittest(serialization_test)
diff --git a/paddle/parameter/CMakeLists.txt b/paddle/parameter/CMakeLists.txt
index a35e46997f..a9e344afdc 100644
--- a/paddle/parameter/CMakeLists.txt
+++ b/paddle/parameter/CMakeLists.txt
@@ -7,7 +7,7 @@ add_library(paddle_parameter STATIC
         ${PARAMETERS_SOURCES})
 add_style_check_target(paddle_parameter ${PARAMETERS_SOURCES})
 add_style_check_target(paddle_parameter ${PARAMETERS_HEADERS})
-add_dependencies(paddle_parameter gen_proto_cpp)
+add_dependencies(paddle_parameter paddle_proto)
 if(WITH_TESTING)
     add_subdirectory(tests)
 endif()
diff --git a/paddle/pserver/CMakeLists.txt b/paddle/pserver/CMakeLists.txt
index b7f85ea1a6..92dd286f04 100644
--- a/paddle/pserver/CMakeLists.txt
+++ b/paddle/pserver/CMakeLists.txt
@@ -17,7 +17,7 @@ add_library(paddle_network STATIC
 add_style_check_target(paddle_network ${NETWORK_SOURCES})
 add_style_check_target(paddle_network ${NETWORK_HEADERS})
 
-add_dependencies(paddle_network gen_proto_cpp)
+add_dependencies(paddle_network paddle_proto)
 
 ################### paddle_pserver ######################
 set(PSERVER_SOURCES
@@ -40,7 +40,7 @@ add_library(paddle_pserver STATIC
 add_style_check_target(paddle_pserver ${PSERVER_SOURCES})
 add_style_check_target(paddle_pserver ${PSERVER_HEADERS})
 
-add_dependencies(paddle_pserver gen_proto_cpp)
+add_dependencies(paddle_pserver paddle_proto)
 
 set(PSERVER_MAIN_SOURCES
     ParameterServer2Main.cpp)
diff --git a/paddle/testing/CMakeLists.txt b/paddle/testing/CMakeLists.txt
index c47add04b0..4aa6eae681 100644
--- a/paddle/testing/CMakeLists.txt
+++ b/paddle/testing/CMakeLists.txt
@@ -2,7 +2,7 @@
 
 if(WITH_TESTING)
   add_library(paddle_test_main STATIC TestMain.cpp)
-  add_dependencies(paddle_test_main gen_proto_cpp)
+  add_dependencies(paddle_test_main paddle_proto)
   add_library(paddle_test_util STATIC TestUtil.cpp)
-  add_dependencies(paddle_test_util gen_proto_cpp)
+  add_dependencies(paddle_test_util paddle_proto)
 endif()
diff --git a/paddle/trainer/CMakeLists.txt b/paddle/trainer/CMakeLists.txt
index f34d53ae99..b8f03fa7e7 100644
--- a/paddle/trainer/CMakeLists.txt
+++ b/paddle/trainer/CMakeLists.txt
@@ -41,7 +41,7 @@ add_style_check_target(paddle_trainer_lib
 add_style_check_target(paddle_trainer_lib
     ${TRAINER_HEADERS})
 add_dependencies(paddle_trainer_lib
-    gen_proto_cpp)
+    paddle_proto)
 
 macro(add_paddle_exe TARGET_NAME)
   add_executable(${TARGET_NAME} ${ARGN})
diff --git a/paddle/utils/CMakeLists.txt b/paddle/utils/CMakeLists.txt
index af59951752..f5c399256a 100644
--- a/paddle/utils/CMakeLists.txt
+++ b/paddle/utils/CMakeLists.txt
@@ -17,7 +17,7 @@ add_library(paddle_utils STATIC
 add_style_check_target(paddle_utils ${UTIL_HEADERS})
 add_style_check_target(paddle_utils ${UTIL_SOURCES}
     ${UTIL_ARCH_SOURCES})
-add_dependencies(paddle_utils gen_proto_cpp)
+add_dependencies(paddle_utils paddle_proto)
 if(WITH_TESTING)
     add_subdirectory(tests)
 endif()
diff --git a/proto/CMakeLists.txt b/proto/CMakeLists.txt
index c942620990..948d7db6b2 100644
--- a/proto/CMakeLists.txt
+++ b/proto/CMakeLists.txt
@@ -13,18 +13,6 @@ set(PROTO_GEN_PY)
 
 foreach(filename ${proto_filenames})
     get_filename_component(base_filename ${filename} NAME_WE)
-    set(CUR_PROTO_GEN
-        ${CMAKE_CURRENT_BINARY_DIR}/${base_filename}.pb.h
-        ${CMAKE_CURRENT_BINARY_DIR}/${base_filename}.pb.cc)
-    set(PROTO_GEN
-        ${PROTO_GEN}
-        ${CUR_PROTO_GEN})
-    add_custom_command(OUTPUT ${CUR_PROTO_GEN}
-        COMMAND env ${py_env} ${PROTOBUF_PROTOC_EXECUTABLE} 
-                  --cpp_out ${CMAKE_CURRENT_BINARY_DIR}
-          --proto_path ${PROJ_ROOT}/proto ${PROJ_ROOT}/proto/${filename}
-        DEPENDS ${filename} ${external_project_dependencies})
-
     set(CUR_PROTO_GEN_PY
         ${PROJ_ROOT}/paddle/python/paddle/proto/${base_filename}_pb2.py)
     set(PROTO_GEN_PY
@@ -36,8 +24,6 @@ foreach(filename ${proto_filenames})
         DEPENDS ${filename} ${external_project_dependencies})
 endforeach()
 
-add_custom_target(gen_proto_cpp ALL DEPENDS ${PROTO_GEN})
 add_custom_target(gen_proto_py ALL DEPENDS ${PROTO_GEN_PY})
 
-add_library(paddle_proto STATIC ${PROTO_GEN})
-target_include_directories(paddle_proto PUBLIC ${CMAKE_CURRENT_BINARY_DIR})
+pb_cc_library(paddle_proto SRCS ${proto_filenames})

From 30b75a51035dee978225f5e5eff2c0d4b9c09aec Mon Sep 17 00:00:00 2001
From: Yu Yang <yuyang18@baidu.com>
Date: Wed, 28 Jun 2017 22:05:04 +0800
Subject: [PATCH 39/86] Also add pb_py_library

---
 cmake/generic.cmake  | 31 ++++++++++++++++++++++++-------
 proto/CMakeLists.txt | 30 ++----------------------------
 2 files changed, 26 insertions(+), 35 deletions(-)

diff --git a/cmake/generic.cmake b/cmake/generic.cmake
index 0370ab31f3..14b6909829 100644
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -341,13 +341,6 @@ function(pb_cc_library TARGET_NAME)
   foreach(FIL ${pb_cc_library_SRCS})
     get_filename_component(ABS_FIL ${FIL} ABSOLUTE)
     get_filename_component(FIL_WE ${FIL} NAME_WE)
-    if(NOT PROTOBUF_GENERATE_CPP_APPEND_PATH)
-      get_filename_component(FIL_DIR ${FIL} DIRECTORY)
-      if(FIL_DIR)
-        set(FIL_WE "${FIL_DIR}/${FIL_WE}")
-      endif()
-    endif()
-
     list(APPEND proto_srcs "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}.pb.cc")
     list(APPEND proto_hdrs "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}.pb.h")
 
@@ -363,4 +356,28 @@ function(pb_cc_library TARGET_NAME)
   set_source_files_properties(${proto_srcs} ${proto_hdrs} PROPERTIES GENERATED TRUE)
   include_directories(${CMAKE_CURRENT_BINARY_DIR})
   cc_library(${TARGET_NAME} SRCS ${proto_srcs})
+endfunction()
+
+function(pb_py_library TARGET_NAME)
+  set(oneValueArgs TARGET_DIR)
+  set(multiValueArgs SRCS)
+  cmake_parse_arguments(pb_py_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+  if (NOT ${pb_py_library_TARGET_DIR})
+    set(pb_py_library_TARGET_DIR ${CMAKE_CURRENT_BINARY_DIR})
+  endif()
+
+  set(py_srcs)
+  foreach(FIL ${pb_py_library_SRCS})
+    get_filename_component(ABS_FIL ${FIL} ABSOLUTE)
+    get_filename_component(FIL_WE ${FIL} NAME_WE)
+    set(cur_py_src ${pb_py_library_TARGET_DIR}/${FIL_WE}_pb2.py)
+    list(APPEND py_srcs "${cur_py_src}")
+    add_custom_command(OUTPUT ${cur_py_src}
+            COMMAND ${PROTOBUF_PROTOC_EXECUTABLE}
+            ARGS "--python_out=${pb_py_library_TARGET_DIR}" "-I" ${CMAKE_CURRENT_SOURCE_DIR} ${ABS_FIL}
+            DEPENDS ${ABS_FIL} protoc
+            COMMENT "Running Python protocol buffer compiler on ${FIL}")
+  endforeach()
+
+  add_custom_target(${TARGET_NAME} ALL DEPENDS ${py_srcs})
 endfunction()
\ No newline at end of file
diff --git a/proto/CMakeLists.txt b/proto/CMakeLists.txt
index 948d7db6b2..9b29d43d73 100644
--- a/proto/CMakeLists.txt
+++ b/proto/CMakeLists.txt
@@ -1,29 +1,3 @@
-set(proto_filenames
-    DataConfig.proto
-    DataFormat.proto
-    ModelConfig.proto
-    ParameterConfig.proto
-    ParameterService.proto
-    TrainerConfig.proto
-    OptimizerConfig.proto
-    ParameterServerConfig.proto)
-
-set(PROTO_GEN)
-set(PROTO_GEN_PY)
-
-foreach(filename ${proto_filenames})
-    get_filename_component(base_filename ${filename} NAME_WE)
-    set(CUR_PROTO_GEN_PY
-        ${PROJ_ROOT}/paddle/python/paddle/proto/${base_filename}_pb2.py)
-    set(PROTO_GEN_PY
-        ${CUR_PROTO_GEN_PY}
-        ${PROTO_GEN_PY})
-    add_custom_command(OUTPUT ${CUR_PROTO_GEN_PY}
-        COMMAND env ${py_env} ${PROTOBUF_PROTOC_EXECUTABLE} --python_out ${PROJ_ROOT}/python/paddle/proto
-    --proto_path ${PROJ_ROOT}/proto ${PROJ_ROOT}/proto/${filename}
-        DEPENDS ${filename} ${external_project_dependencies})
-endforeach()
-
-add_custom_target(gen_proto_py ALL DEPENDS ${PROTO_GEN_PY})
-
+file(GLOB proto_filenames . *.proto)
 pb_cc_library(paddle_proto SRCS ${proto_filenames})
+pb_py_library(gen_proto_py SRCS ${proto_filenames} TARGET_DIR ${CMAKE_CURRENT_SOURCE_DIR})

From 64b78b1656bd023e916447e7ea6c08de3d5c1f88 Mon Sep 17 00:00:00 2001
From: Yu Yang <yuyang18@baidu.com>
Date: Wed, 28 Jun 2017 22:27:50 +0800
Subject: [PATCH 40/86] Fix TravisCI

---
 cmake/generic.cmake             | 3 +--
 doc/CMakeLists.txt              | 7 -------
 paddle/gserver/CMakeLists.txt   | 2 +-
 paddle/math/CMakeLists.txt      | 2 +-
 paddle/optimizer/CMakeLists.txt | 2 +-
 paddle/parameter/CMakeLists.txt | 2 +-
 paddle/pserver/CMakeLists.txt   | 2 +-
 paddle/trainer/CMakeLists.txt   | 3 ++-
 paddle/utils/CMakeLists.txt     | 2 +-
 proto/CMakeLists.txt            | 2 +-
 10 files changed, 10 insertions(+), 17 deletions(-)

diff --git a/cmake/generic.cmake b/cmake/generic.cmake
index 14b6909829..24a07c0a24 100644
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -362,10 +362,9 @@ function(pb_py_library TARGET_NAME)
   set(oneValueArgs TARGET_DIR)
   set(multiValueArgs SRCS)
   cmake_parse_arguments(pb_py_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
-  if (NOT ${pb_py_library_TARGET_DIR})
+  if (NOT pb_py_library_TARGET_DIR)
     set(pb_py_library_TARGET_DIR ${CMAKE_CURRENT_BINARY_DIR})
   endif()
-
   set(py_srcs)
   foreach(FIL ${pb_py_library_SRCS})
     get_filename_component(ABS_FIL ${FIL} ABSOLUTE)
diff --git a/doc/CMakeLists.txt b/doc/CMakeLists.txt
index 6fa42fd0c7..94dd3457fb 100644
--- a/doc/CMakeLists.txt
+++ b/doc/CMakeLists.txt
@@ -27,10 +27,6 @@ sphinx_add_target(paddle_docs
                   ${CMAKE_CURRENT_SOURCE_DIR}
                   ${SPHINX_HTML_DIR_EN})
 
-add_dependencies(paddle_docs
-  gen_proto_py)
-
-
 # configured documentation tools and intermediate build results
 set(BINARY_BUILD_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/_build")
 
@@ -51,6 +47,3 @@ sphinx_add_target(paddle_docs_cn
                   ${SPHINX_CACHE_DIR_CN}
                   ${CMAKE_CURRENT_SOURCE_DIR}
                   ${SPHINX_HTML_DIR_CN})
-
-add_dependencies(paddle_docs_cn
-  gen_proto_py)
diff --git a/paddle/gserver/CMakeLists.txt b/paddle/gserver/CMakeLists.txt
index 3bd583773a..0012636b8f 100644
--- a/paddle/gserver/CMakeLists.txt
+++ b/paddle/gserver/CMakeLists.txt
@@ -58,7 +58,7 @@ endif()
 
 add_style_check_target(paddle_gserver ${GSERVER_SOURCES})
 add_style_check_target(paddle_gserver ${GSERVER_HEADER})
-add_dependencies(paddle_gserver paddle_proto)
+add_dependencies(paddle_gserver paddle_proto ${external_project_dependencies})
 if(WITH_TESTING)
     add_subdirectory(tests)
 endif()
diff --git a/paddle/math/CMakeLists.txt b/paddle/math/CMakeLists.txt
index 326cdb156c..9981de6160 100644
--- a/paddle/math/CMakeLists.txt
+++ b/paddle/math/CMakeLists.txt
@@ -33,7 +33,7 @@ endif()
 add_style_check_target(paddle_math ${MATH_SOURCES})
 add_style_check_target(paddle_math ${MATH_HEADERS})
 
-add_dependencies(paddle_math paddle_proto)  # depends
+add_dependencies(paddle_math paddle_proto ${external_project_dependencies})  # depends
 if(WITH_TESTING)
     add_subdirectory(tests)
 endif()
diff --git a/paddle/optimizer/CMakeLists.txt b/paddle/optimizer/CMakeLists.txt
index bf878baaf0..9996d01d18 100644
--- a/paddle/optimizer/CMakeLists.txt
+++ b/paddle/optimizer/CMakeLists.txt
@@ -10,7 +10,7 @@ set(OPITMIZER_SRCS
   )
 
 add_library(paddle_optimizer STATIC ${OPITMIZER_SRCS})
-add_dependencies(paddle_optimizer paddle_proto)
+add_dependencies(paddle_optimizer paddle_proto ${external_project_dependencies})
 
 if(WITH_TESTING)
   add_simple_unittest(serialization_test)
diff --git a/paddle/parameter/CMakeLists.txt b/paddle/parameter/CMakeLists.txt
index a9e344afdc..d2ae1c16c6 100644
--- a/paddle/parameter/CMakeLists.txt
+++ b/paddle/parameter/CMakeLists.txt
@@ -7,7 +7,7 @@ add_library(paddle_parameter STATIC
         ${PARAMETERS_SOURCES})
 add_style_check_target(paddle_parameter ${PARAMETERS_SOURCES})
 add_style_check_target(paddle_parameter ${PARAMETERS_HEADERS})
-add_dependencies(paddle_parameter paddle_proto)
+add_dependencies(paddle_parameter paddle_proto ${external_project_dependencies})
 if(WITH_TESTING)
     add_subdirectory(tests)
 endif()
diff --git a/paddle/pserver/CMakeLists.txt b/paddle/pserver/CMakeLists.txt
index 92dd286f04..f2e0b4b76b 100644
--- a/paddle/pserver/CMakeLists.txt
+++ b/paddle/pserver/CMakeLists.txt
@@ -40,7 +40,7 @@ add_library(paddle_pserver STATIC
 add_style_check_target(paddle_pserver ${PSERVER_SOURCES})
 add_style_check_target(paddle_pserver ${PSERVER_HEADERS})
 
-add_dependencies(paddle_pserver paddle_proto)
+add_dependencies(paddle_pserver paddle_proto ${external_project_dependencies})
 
 set(PSERVER_MAIN_SOURCES
     ParameterServer2Main.cpp)
diff --git a/paddle/trainer/CMakeLists.txt b/paddle/trainer/CMakeLists.txt
index b8f03fa7e7..6414c39956 100644
--- a/paddle/trainer/CMakeLists.txt
+++ b/paddle/trainer/CMakeLists.txt
@@ -41,7 +41,8 @@ add_style_check_target(paddle_trainer_lib
 add_style_check_target(paddle_trainer_lib
     ${TRAINER_HEADERS})
 add_dependencies(paddle_trainer_lib
-    paddle_proto)
+    paddle_proto
+    ${external_project_dependencies})
 
 macro(add_paddle_exe TARGET_NAME)
   add_executable(${TARGET_NAME} ${ARGN})
diff --git a/paddle/utils/CMakeLists.txt b/paddle/utils/CMakeLists.txt
index f5c399256a..7a4977935e 100644
--- a/paddle/utils/CMakeLists.txt
+++ b/paddle/utils/CMakeLists.txt
@@ -17,7 +17,7 @@ add_library(paddle_utils STATIC
 add_style_check_target(paddle_utils ${UTIL_HEADERS})
 add_style_check_target(paddle_utils ${UTIL_SOURCES}
     ${UTIL_ARCH_SOURCES})
-add_dependencies(paddle_utils paddle_proto)
+add_dependencies(paddle_utils paddle_proto ${external_project_dependencies})
 if(WITH_TESTING)
     add_subdirectory(tests)
 endif()
diff --git a/proto/CMakeLists.txt b/proto/CMakeLists.txt
index 9b29d43d73..4402f2c899 100644
--- a/proto/CMakeLists.txt
+++ b/proto/CMakeLists.txt
@@ -1,3 +1,3 @@
 file(GLOB proto_filenames . *.proto)
 pb_cc_library(paddle_proto SRCS ${proto_filenames})
-pb_py_library(gen_proto_py SRCS ${proto_filenames} TARGET_DIR ${CMAKE_CURRENT_SOURCE_DIR})
+pb_py_library(gen_proto_py SRCS ${proto_filenames} TARGET_DIR ${PROJ_ROOT}/python/paddle/proto)

From 44e39246639fe5b3ba1dbf5158531f7eb4fc6175 Mon Sep 17 00:00:00 2001
From: dongzhihong <dzhwinter@gmail.com>
Date: Wed, 28 Jun 2017 23:04:35 +0800
Subject: [PATCH 41/86] "fix client send empty gradients bug"

---
 go/pserver/client.go | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/go/pserver/client.go b/go/pserver/client.go
index dda9159772..a8d2d710d8 100644
--- a/go/pserver/client.go
+++ b/go/pserver/client.go
@@ -123,6 +123,10 @@ func (c *Client) FinishInitParams() error {
 // SendGrads sends gradients to parameter servers for updating
 // parameters.
 func (c *Client) SendGrads(grads []Gradient) error {
+	if len(grads) == 0 {
+		log.Info("Send Empty Gradient")
+		return nil
+	}
 	errCh := make(chan error, len(grads))
 	for _, g := range grads {
 		go func(g Gradient) {

From 01f44bff669442ffdb67a5baac14aa693cba08c6 Mon Sep 17 00:00:00 2001
From: wanghaoshuang <wanghaoshuang>
Date: Wed, 28 Jun 2017 23:12:19 +0800
Subject: [PATCH 42/86] rename args and add comments 1. rename 'useXmap' to
 'use_xmap' 2. add comments about exchanging train data and test data

---
 python/paddle/v2/dataset/flowers.py | 28 +++++++++++++++++-----------
 1 file changed, 17 insertions(+), 11 deletions(-)

diff --git a/python/paddle/v2/dataset/flowers.py b/python/paddle/v2/dataset/flowers.py
index a181f3881a..158cfe158c 100644
--- a/python/paddle/v2/dataset/flowers.py
+++ b/python/paddle/v2/dataset/flowers.py
@@ -46,6 +46,12 @@ SETID_URL = 'http://www.robots.ox.ac.uk/~vgg/data/flowers/102/setid.mat'
 DATA_MD5 = '52808999861908f626f3c1f4e79d11fa'
 LABEL_MD5 = 'e0620be6f572b9609742df49c70aed4d'
 SETID_MD5 = 'a5357ecc9cb78c4bef273ce3793fc85c'
+# In the official 'readme', tstid is the flag of test data and trnid is
+# the flag of train data. But the test set is larger than the train set,
+# so we exchange the train data and test data.
+TRAIN_FLAG = 'tstid'
+TEST_FLAG = 'trnid'
+VALID_FLAG = 'valid'
 
 
 def default_mapper(sample):
@@ -64,7 +70,7 @@ def reader_creator(data_file,
                    dataset_name,
                    mapper=default_mapper,
                    buffered_size=1024,
-                   useXmap=True):
+                   use_xmap=True):
     '''
     1. read images from tar file and
         merge images into batch files in 102flowers.tgz_batch/
@@ -106,13 +112,13 @@ def reader_creator(data_file,
             for sample, label in itertools.izip(data, batch['label']):
                 yield sample, int(label)
 
-    if useXmap:
+    if use_xmap:
         return xmap_readers(mapper, reader, cpu_count(), buffered_size)
     else:
         return map_readers(mapper, reader)
 
 
-def train(mapper=default_mapper, buffered_size=1024, useXmap=True):
+def train(mapper=default_mapper, buffered_size=1024, use_xmap=True):
     '''
     Create flowers training set reader.
     It returns a reader, each sample in the reader is
@@ -131,11 +137,11 @@ def train(mapper=default_mapper, buffered_size=1024, useXmap=True):
     return reader_creator(
         download(DATA_URL, 'flowers', DATA_MD5),
         download(LABEL_URL, 'flowers', LABEL_MD5),
-        download(SETID_URL, 'flowers', SETID_MD5), 'tstid', mapper,
-        buffered_size, useXmap)
+        download(SETID_URL, 'flowers', SETID_MD5), TRAIN_FLAG, mapper,
+        buffered_size, use_xmap)
 
 
-def test(mapper=default_mapper, buffered_size=1024, useXmap=True):
+def test(mapper=default_mapper, buffered_size=1024, use_xmap=True):
     '''
     Create flowers test set reader.
     It returns a reader, each sample in the reader is
@@ -154,11 +160,11 @@ def test(mapper=default_mapper, buffered_size=1024, useXmap=True):
     return reader_creator(
         download(DATA_URL, 'flowers', DATA_MD5),
         download(LABEL_URL, 'flowers', LABEL_MD5),
-        download(SETID_URL, 'flowers', SETID_MD5), 'trnid', mapper,
-        buffered_size, useXmap)
+        download(SETID_URL, 'flowers', SETID_MD5), TEST_FLAG, mapper,
+        buffered_size, use_xmap)
 
 
-def valid(mapper=default_mapper, buffered_size=1024, useXmap=True):
+def valid(mapper=default_mapper, buffered_size=1024, use_xmap=True):
     '''
     Create flowers validation set reader.
     It returns a reader, each sample in the reader is
@@ -177,8 +183,8 @@ def valid(mapper=default_mapper, buffered_size=1024, useXmap=True):
     return reader_creator(
         download(DATA_URL, 'flowers', DATA_MD5),
         download(LABEL_URL, 'flowers', LABEL_MD5),
-        download(SETID_URL, 'flowers', SETID_MD5), 'valid', mapper,
-        buffered_size, useXmap)
+        download(SETID_URL, 'flowers', SETID_MD5), VALID_FLAG, mapper,
+        buffered_size, use_xmap)
 
 
 def fetch():

From c9865824a718e8361941f669e4ca879be6c24bcb Mon Sep 17 00:00:00 2001
From: dangqingqing <dangqingqing@baidu.com>
Date: Thu, 29 Jun 2017 01:10:30 +0800
Subject: [PATCH 43/86] Support to init partial network parameters from the tar
 file.

---
 python/paddle/v2/parameters.py            | 23 +++++----
 python/paddle/v2/tests/test_parameters.py | 57 +++++++++++++++++++++--
 2 files changed, 65 insertions(+), 15 deletions(-)

diff --git a/python/paddle/v2/parameters.py b/python/paddle/v2/parameters.py
index ad20241b98..f730ea10bb 100644
--- a/python/paddle/v2/parameters.py
+++ b/python/paddle/v2/parameters.py
@@ -51,7 +51,7 @@ class Parameters(object):
     def __init__(self):
         self.__param_conf__ = dict()
         self.__gradient_machines__ = []
-        self.__tmp_params__ = []
+        self.__tmp_params__ = dict()
 
     def __append_config__(self, param_conf):
         """
@@ -128,13 +128,10 @@ class Parameters(object):
 
         if len(self.__gradient_machines__) == 0:
             # create new parameter in python numpy.
-            if len(self.__tmp_params__) != 0:
-                ret_list = [
-                    mat for name, mat in self.__tmp_params__ if name == key
-                ]
-                if len(ret_list) == 1:
-                    return ret_list[0]
-            return np.ndarray(shape=shape, dtype=np.float32)
+            if key in self.__tmp_params__:
+                return self.__tmp_params__[key]
+            else:
+                return np.ndarray(shape=shape, dtype=np.float32)
         else:
             for each_gradient_machine in self.__gradient_machines__:
                 param = __get_parameter_in_gradient_machine__(
@@ -187,7 +184,7 @@ class Parameters(object):
                              (shape, value.shape))
 
         if len(self.__gradient_machines__) == 0:
-            self.__tmp_params__.append((key, value))
+            self.__tmp_params__[key] = value
         else:
             for each_gradient_machine in self.__gradient_machines__:
                 __copy_parameter_to_gradient_machine__(each_gradient_machine,
@@ -231,7 +228,7 @@ class Parameters(object):
             raise ValueError("gradient_machine should be api.GradientMachine")
 
         if len(self.__tmp_params__) != 0:
-            for name, val in self.__tmp_params__:
+            for name, val in self.__tmp_params__.iteritems():
                 try:
                     __copy_parameter_to_gradient_machine__(gradient_machine,
                                                            name, val)
@@ -302,6 +299,12 @@ class Parameters(object):
             params.deserialize(param_name, f)
         return params
 
+    def init_from_tar(self, f):
+        tar_param = self.from_tar(f)
+        for pname in tar_param.names():
+            if pname in self.names():
+                self.set(pname, tar_param.get(pname))
+
 
 def __get_parameter_in_gradient_machine__(gradient_machine, name):
     """
diff --git a/python/paddle/v2/tests/test_parameters.py b/python/paddle/v2/tests/test_parameters.py
index 45372e7dd0..7ba8a939fb 100644
--- a/python/paddle/v2/tests/test_parameters.py
+++ b/python/paddle/v2/tests/test_parameters.py
@@ -20,14 +20,17 @@ import cStringIO
 import numpy
 
 
-def __rand_param_config__(name):
+def __rand_param_config__(name, psize=None):
     conf = ParameterConfig()
     conf.name = name
     size = 1
-    for i in xrange(2):
-        dim = random.randint(1, 1000)
-        conf.dims.append(dim)
-        size *= dim
+    if psize is None:
+        for i in xrange(2):
+            dim = random.randint(1, 1000)
+            conf.dims.append(dim)
+            size *= dim
+    else:
+        size = psize
     conf.size = size
     assert conf.IsInitialized()
     return conf
@@ -77,6 +80,50 @@ class TestParameters(unittest.TestCase):
         expected = numpy.array([[1, 1], [1, 2], [1, 1]], numpy.float32)
         assert numpy.logical_and.reduce(numpy.reshape(val == expected, 6))
 
+    def test_init_from_tar(self):
+        def get_param(names, size):
+            p = parameters.Parameters()
+            for k, v in zip(names, size):
+                p.__append_config__(__rand_param_config__(k, v))
+            for name in p.names():
+                param = p.get(name)
+                param[:] = numpy.random.uniform(
+                    -1.0, 1.0, size=p.get_shape(name))
+                p.set(name, param)
+            return p
+
+        def get_parames():
+            name1 = ['param_0', 'param_1']
+            size1 = [128, 256]
+            p1 = get_param(name1, size1)
+            file1 = cStringIO.StringIO()
+            p1.to_tar(file1)
+            file1.seek(0)
+
+            name2 = ['param_0', 'param_1', 'param_2']
+            size2 = [128, 256, 288]
+            p2 = get_param(name2, size2)
+            file2 = cStringIO.StringIO()
+            p2.to_tar(file2)
+            file2.seek(0)
+            return p1, file1, p2, file2
+
+        p1, file1, p2, file2 = get_parames()
+        p2.init_from_tar(file1)
+        for name in p1.names():
+            self.assertEqual(p1.get_shape(name), p2.get_shape(name))
+            v1 = p1.get(name)
+            v2 = p2.get(name)
+            self.assertTrue(numpy.isclose(v1, v2).all())
+
+        p1, file1, p2, file2 = get_parames()
+        p1.init_from_tar(file2)
+        for name in p1.names():
+            self.assertEqual(p1.get_shape(name), p2.get_shape(name))
+            v1 = p1.get(name)
+            v2 = p2.get(name)
+            self.assertTrue(numpy.isclose(v1, v2).all())
+
 
 if __name__ == '__main__':
     unittest.main()

From 555540fcc1b44323161c3dfd56a6f3fc7307433c Mon Sep 17 00:00:00 2001
From: dangqingqing <dangqingqing@baidu.com>
Date: Thu, 29 Jun 2017 01:11:58 +0800
Subject: [PATCH 44/86] fix typo

---
 paddle/py_paddle/dataprovider_converter.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddle/py_paddle/dataprovider_converter.py b/paddle/py_paddle/dataprovider_converter.py
index 218cb5ec56..43614b9779 100644
--- a/paddle/py_paddle/dataprovider_converter.py
+++ b/paddle/py_paddle/dataprovider_converter.py
@@ -144,7 +144,7 @@ class DenseScanner(IScanner):
         if len(self.__shape__) > 1:
             # The last-two dimenstions are the frame height and width.
             # For example, the layout is CHW for 3-D feature of image.
-            # The H and W are the fram height and width.
+            # The H and W are the frame height and width.
             h, w = self.__shape__[-2:]
             argument.setSlotFrameHeight(self.pos, h)
             argument.setSlotFrameWidth(self.pos, w)

From 6215f47c7c572edd94900a9ef4b90fce6726ee70 Mon Sep 17 00:00:00 2001
From: Yi Wang <yiwang01@baidu.com>
Date: Wed, 28 Jun 2017 14:44:40 -0700
Subject: [PATCH 45/86] Rename paddle/strings/ to paddle/string/

---
 paddle/{strings => string}/CMakeLists.txt      | 0
 paddle/{strings => string}/stringpiece.cc      | 0
 paddle/{strings => string}/stringpiece.h       | 0
 paddle/{strings => string}/stringpiece_test.cc | 0
 4 files changed, 0 insertions(+), 0 deletions(-)
 rename paddle/{strings => string}/CMakeLists.txt (100%)
 rename paddle/{strings => string}/stringpiece.cc (100%)
 rename paddle/{strings => string}/stringpiece.h (100%)
 rename paddle/{strings => string}/stringpiece_test.cc (100%)

diff --git a/paddle/strings/CMakeLists.txt b/paddle/string/CMakeLists.txt
similarity index 100%
rename from paddle/strings/CMakeLists.txt
rename to paddle/string/CMakeLists.txt
diff --git a/paddle/strings/stringpiece.cc b/paddle/string/stringpiece.cc
similarity index 100%
rename from paddle/strings/stringpiece.cc
rename to paddle/string/stringpiece.cc
diff --git a/paddle/strings/stringpiece.h b/paddle/string/stringpiece.h
similarity index 100%
rename from paddle/strings/stringpiece.h
rename to paddle/string/stringpiece.h
diff --git a/paddle/strings/stringpiece_test.cc b/paddle/string/stringpiece_test.cc
similarity index 100%
rename from paddle/strings/stringpiece_test.cc
rename to paddle/string/stringpiece_test.cc

From ea1d3acfb4012f491703266fa4caaf8e7e99e8c3 Mon Sep 17 00:00:00 2001
From: Yi Wang <yiwang01@baidu.com>
Date: Wed, 28 Jun 2017 14:52:54 -0700
Subject: [PATCH 46/86] Rename string/stringpiece* into string/piece

---
 paddle/CMakeLists.txt                         |   2 +-
 paddle/string/CMakeLists.txt                  |   4 +-
 paddle/string/piece.cc                        | 138 +++++++++++++++++
 paddle/string/{stringpiece.h => piece.h}      |  64 ++++----
 .../{stringpiece_test.cc => piece_test.cc}    | 100 +++++++------
 paddle/string/stringpiece.cc                  | 141 ------------------
 6 files changed, 225 insertions(+), 224 deletions(-)
 create mode 100644 paddle/string/piece.cc
 rename paddle/string/{stringpiece.h => piece.h} (57%)
 rename paddle/string/{stringpiece_test.cc => piece_test.cc} (77%)
 delete mode 100644 paddle/string/stringpiece.cc

diff --git a/paddle/CMakeLists.txt b/paddle/CMakeLists.txt
index 979b68e827..307e99bbe3 100644
--- a/paddle/CMakeLists.txt
+++ b/paddle/CMakeLists.txt
@@ -9,7 +9,7 @@ add_subdirectory(pserver)
 add_subdirectory(trainer)
 add_subdirectory(scripts)
 add_subdirectory(optimizer)
-add_subdirectory(strings)
+add_subdirectory(string)
 
 if(Boost_FOUND)
   add_subdirectory(memory)
diff --git a/paddle/string/CMakeLists.txt b/paddle/string/CMakeLists.txt
index 4e55eecd48..0f39660a90 100644
--- a/paddle/string/CMakeLists.txt
+++ b/paddle/string/CMakeLists.txt
@@ -1,2 +1,2 @@
-cc_library(stringpiece SRCS stringpiece.cc)
-cc_test(stringpiece_test SRCS stringpiece_test.cc DEPS stringpiece glog gflags)
+cc_library(stringpiece SRCS piece.cc)
+cc_test(stringpiece_test SRCS piece_test.cc DEPS stringpiece glog gflags)
diff --git a/paddle/string/piece.cc b/paddle/string/piece.cc
new file mode 100644
index 0000000000..b80afdec82
--- /dev/null
+++ b/paddle/string/piece.cc
@@ -0,0 +1,138 @@
+/*
+  Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+  Licensed under the Apache License, Version 2.0 (the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+*/
+
+#include "paddle/string/piece.h"
+
+#include <string.h>
+
+#include <algorithm>
+#include <iosfwd>
+#include <stdexcept>
+
+namespace paddle {
+namespace string {
+
+Piece::Piece() : data_(NULL), size_(0) {}
+
+Piece::Piece(const char* d, size_t n) : data_(d), size_(n) {
+  if (d == NULL && n != 0)
+    throw std::invalid_argument("Piece requires len to be 0 for NULL data");
+}
+
+Piece::Piece(const char* s) : data_(s) { size_ = (s == NULL) ? 0 : strlen(s); }
+
+Piece::Piece(const std::string& s) : data_(s.data()), size_(s.size()) {}
+
+char Piece::operator[](size_t n) const {
+  if (n >= len()) throw std::invalid_argument("index out of Piece length");
+  return data_[n];
+}
+
+int Compare(Piece a, Piece b) {
+  const size_t min_len = (a.len() < b.len()) ? a.len() : b.len();
+  int r = memcmp(a.data(), b.data(), min_len);
+  if (r == 0) {
+    if (a.len() < b.len())
+      return -1;
+    else if (a.len() > b.len())
+      return 1;
+  }
+  return r;
+}
+
+bool operator==(Piece x, Piece y) {
+  return ((x.len() == y.len()) &&
+          (x.data() == y.data() || memcmp(x.data(), y.data(), x.len()) == 0));
+}
+
+bool operator!=(Piece x, Piece y) { return !(x == y); }
+
+bool operator<(Piece x, Piece y) { return Compare(x, y) < 0; }
+bool operator>(Piece x, Piece y) { return Compare(x, y) > 0; }
+
+bool operator<=(Piece x, Piece y) { return Compare(x, y) <= 0; }
+bool operator>=(Piece x, Piece y) { return Compare(x, y) >= 0; }
+
+bool HasPrefix(Piece s, Piece x) {
+  return ((s.len() >= x.len()) && (memcmp(s.data(), x.data(), x.len()) == 0));
+}
+
+bool HasSuffix(Piece s, Piece x) {
+  return ((s.len() >= x.len()) &&
+          (memcmp(s.data() + (s.len() - x.len()), x.data(), x.len()) == 0));
+}
+
+Piece SkipPrefix(Piece s, size_t n) {
+  if (n > s.len())
+    throw std::invalid_argument("Skip distance larger than Piece length");
+  return Piece(s.data() + n, s.len() - n);
+}
+
+Piece SkipSuffix(Piece s, size_t n) {
+  if (n > s.len())
+    throw std::invalid_argument("Skip distance larger than Piece length");
+  return Piece(s.data(), s.len() - n);
+}
+
+Piece TrimPrefix(Piece s, Piece x) {
+  return HasPrefix(s, x) ? SkipPrefix(s, x.len()) : s;
+}
+
+Piece TrimSuffix(Piece s, Piece x) {
+  return HasSuffix(s, x) ? SkipSuffix(s, x.len()) : s;
+}
+
+bool Contains(Piece s, Piece sub) {
+  return std::search(s.begin(), s.end(), sub.begin(), sub.end()) != s.end();
+}
+
+size_t Index(Piece s, Piece sub) {
+  auto e = std::search(s.begin(), s.end(), sub.begin(), sub.end());
+  return e != s.end() ? e - s.data() : Piece::npos;
+}
+
+size_t Find(Piece s, char c, size_t pos) {
+  if (pos >= s.len()) {
+    return Piece::npos;
+  }
+  const char* result =
+      reinterpret_cast<const char*>(memchr(s.data() + pos, c, s.len() - pos));
+  return result != nullptr ? result - s.data() : Piece::npos;
+}
+
+size_t RFind(Piece s, char c, size_t pos) {
+  if (s.len() == 0) return Piece::npos;
+  // Walk an index from the end so no pointer is ever formed before data().
+  for (size_t i = std::min(pos, s.len() - 1) + 1; i > 0; --i) {
+    if (s.data()[i - 1] == c) {
+      return i - 1;
+    }
+  }
+  return Piece::npos;
+}
+
+Piece SubStr(Piece s, size_t pos, size_t n) {
+  if (pos > s.len()) pos = s.len();
+  if (n > s.len() - pos) n = s.len() - pos;
+  return Piece(s.data() + pos, n);
+}
+
+std::ostream& operator<<(std::ostream& o, Piece piece) {
+  return o << piece.ToString();
+}
+
+}  // namespace string
+}  // namespace paddle
diff --git a/paddle/string/stringpiece.h b/paddle/string/piece.h
similarity index 57%
rename from paddle/string/stringpiece.h
rename to paddle/string/piece.h
index adff713e86..db7c3e6980 100644
--- a/paddle/string/stringpiece.h
+++ b/paddle/string/piece.h
@@ -20,33 +20,34 @@
 #include <string>
 
 namespace paddle {
+namespace string {
 
-// StringPiece points into a std::string object but doesn't own the
+// Piece points into a std::string object but doesn't own the
 // string.  It is for efficient access to strings.  Like Go's string
-// type.  Not that StringPiece doesn't mutate the underlying string,
+// type.  Note that Piece doesn't mutate the underlying string,
 // so it is thread-safe given that the underlying string doesn't
-// change.  Because StringPiece contains a little data members, and
+// change.  Because Piece contains only a few data members, and
 // its syntax is simple as it doesn't own/manage the string, it is
-// cheap to construct StringPieces and pass them around.
-class StringPiece {
+// cheap to construct Pieces and pass them around.
+class Piece {
 public:
   static const size_t npos = static_cast<size_t>(-1);
 
   // We provide non-explicit singleton constructors so users can
-  // pass in a "const char*" or a "string" wherever a "StringPiece"
+  // pass in a "const char*" or a "string" wherever a "Piece"
   // is expected.  These contructors ensure that if data_ is NULL,
   // size_ is 0.
-  StringPiece();
-  StringPiece(const char* d, size_t n);
-  StringPiece(const char* d);
-  StringPiece(const std::string& s);
+  Piece();
+  Piece(const char* d, size_t n);
+  Piece(const char* d);
+  Piece(const std::string& s);
 
   const char* data() const { return data_; }
   size_t len() const { return size_; }
 
   char operator[](size_t n) const;
 
-  // StringPiece doesn't own the string, so both iterator and const
+  // Piece doesn't own the string, so both iterator and const
   // iterator are const char* indeed.
   typedef const char* const_iterator;
   typedef const char* iterator;
@@ -63,43 +64,44 @@ private:
   // Intentionally copyable
 };
 
-int Compare(StringPiece a, StringPiece b);
+int Compare(Piece a, Piece b);
 
-bool operator==(StringPiece x, StringPiece y);
-bool operator!=(StringPiece x, StringPiece y);
-bool operator<(StringPiece x, StringPiece y);
-bool operator>(StringPiece x, StringPiece y);
-bool operator<=(StringPiece x, StringPiece y);
-bool operator>=(StringPiece x, StringPiece y);
+bool operator==(Piece x, Piece y);
+bool operator!=(Piece x, Piece y);
+bool operator<(Piece x, Piece y);
+bool operator>(Piece x, Piece y);
+bool operator<=(Piece x, Piece y);
+bool operator>=(Piece x, Piece y);
 
-bool HasPrefix(StringPiece s, StringPiece prefix);
-bool HasSuffix(StringPiece s, StringPiece suffix);
+bool HasPrefix(Piece s, Piece prefix);
+bool HasSuffix(Piece s, Piece suffix);
 
-StringPiece SkipPrefix(StringPiece s, size_t n);
-StringPiece SkipSuffix(StringPiece s, size_t n);
+Piece SkipPrefix(Piece s, size_t n);
+Piece SkipSuffix(Piece s, size_t n);
 
 // Skip the prefix (or suffix) if it matches with the string.
-StringPiece TrimPrefix(StringPiece s, StringPiece prefix);
-StringPiece TrimSuffix(StringPiece s, StringPiece suffix);
+Piece TrimPrefix(Piece s, Piece prefix);
+Piece TrimSuffix(Piece s, Piece suffix);
 
 // Returns if s contains sub.  Any s except for empty s contains an
 // empty sub.
-bool Contains(StringPiece s, StringPiece sub);
+bool Contains(Piece s, Piece sub);
 
 // Return the first occurrence of sub in s, or npos.  If both s and
 // sub is empty, it returns npos; otherwise, if only sub is empty, it
 // returns 0.
-size_t Index(StringPiece s, StringPiece sub);
+size_t Index(Piece s, Piece sub);
 
 // Return the first occurrence of c in s[pos:end], or npos.
-size_t Find(StringPiece s, char c, size_t pos);
+size_t Find(Piece s, char c, size_t pos);
 
 // Search range is [0..pos] inclusive.  If pos == npos, search everything.
-size_t RFind(StringPiece s, char c, size_t pos);
+size_t RFind(Piece s, char c, size_t pos);
 
-StringPiece SubStr(StringPiece s, size_t pos, size_t n);
+Piece SubStr(Piece s, size_t pos, size_t n);
 
-// allow StringPiece to be logged
-std::ostream& operator<<(std::ostream& o, StringPiece piece);
+// allow Piece to be logged
+std::ostream& operator<<(std::ostream& o, Piece piece);
 
+}  // namespace string
 }  // namespace paddle
diff --git a/paddle/string/stringpiece_test.cc b/paddle/string/piece_test.cc
similarity index 77%
rename from paddle/string/stringpiece_test.cc
rename to paddle/string/piece_test.cc
index 2ba66a04f6..cf5152ff5a 100644
--- a/paddle/string/stringpiece_test.cc
+++ b/paddle/string/piece_test.cc
@@ -14,7 +14,7 @@
   limitations under the License.
 */
 
-#include "paddle/strings/stringpiece.h"
+#include "paddle/string/piece.h"
 
 #include <sstream>
 
@@ -22,42 +22,44 @@
 
 TEST(StringPiece, Construct) {
   {
-    paddle::StringPiece s;
+    paddle::string::Piece s;
     EXPECT_EQ(NULL, s.data());
     EXPECT_EQ(0U, s.len());
   }
-  { EXPECT_THROW(paddle::StringPiece s(NULL, 10000U), std::invalid_argument); }
   {
-    paddle::StringPiece s(NULL);
+    EXPECT_THROW(paddle::string::Piece s(NULL, 10000U), std::invalid_argument);
+  }
+  {
+    paddle::string::Piece s(NULL);
     EXPECT_EQ(0U, s.len());
   }
   {
     std::string a;
     EXPECT_EQ(0U, a.size());
-    paddle::StringPiece s(a);
+    paddle::string::Piece s(a);
     EXPECT_EQ(0U, s.len());
   }
 }
 
 TEST(StringPiece, CopyAndAssign) {
-  paddle::StringPiece empty;
+  paddle::string::Piece empty;
   EXPECT_EQ(0U, empty.len());
 
-  paddle::StringPiece a("hello");
-  paddle::StringPiece b = a;
+  paddle::string::Piece a("hello");
+  paddle::string::Piece b = a;
   EXPECT_EQ(b.len(), strlen("hello"));
   EXPECT_EQ(a, b);
 
   std::string storage("hello");
-  paddle::StringPiece c(storage);
+  paddle::string::Piece c(storage);
   EXPECT_EQ(a, c);
   EXPECT_NE(a.data(), c.data());
 }
 
 TEST(StringPiece, Compare) {
   {
-    paddle::StringPiece a("hello");
-    paddle::StringPiece b("world");
+    paddle::string::Piece a("hello");
+    paddle::string::Piece b("world");
     EXPECT_TRUE(a != b);
     EXPECT_FALSE(a == b);
     EXPECT_TRUE(a < b);
@@ -68,7 +70,7 @@ TEST(StringPiece, Compare) {
     EXPECT_GT(Compare(b, a), 0);
   }
   {
-    paddle::StringPiece a, b;
+    paddle::string::Piece a, b;
     EXPECT_TRUE(a == b);
     EXPECT_FALSE(a != b);
     EXPECT_FALSE(a < b);
@@ -82,31 +84,31 @@ TEST(StringPiece, Compare) {
 
 TEST(StringPiece, ToString) {
   {
-    paddle::StringPiece s;
+    paddle::string::Piece s;
     EXPECT_EQ(std::string(""), s.ToString());
   }
   {
-    paddle::StringPiece s(NULL);
+    paddle::string::Piece s(NULL);
     EXPECT_EQ(std::string(""), s.ToString());
   }
   {
-    paddle::StringPiece s("hello");
+    paddle::string::Piece s("hello");
     EXPECT_EQ(std::string("hello"), s.ToString());
   }
 }
 
 TEST(StringPiece, HasPrefixSuffix) {
-  using paddle::HasPrefix;
-  using paddle::HasSuffix;
+  using paddle::string::HasPrefix;
+  using paddle::string::HasSuffix;
   {
-    paddle::StringPiece s;
+    paddle::string::Piece s;
     EXPECT_FALSE(HasPrefix(s, "something"));
     EXPECT_TRUE(HasPrefix(s, ""));
     EXPECT_FALSE(HasSuffix(s, "something"));
     EXPECT_TRUE(HasSuffix(s, ""));
   }
   {
-    paddle::StringPiece s("app");
+    paddle::string::Piece s("app");
     EXPECT_TRUE(HasPrefix(s, ""));
     EXPECT_TRUE(HasPrefix(s, "a"));
     EXPECT_TRUE(HasPrefix(s, "ap"));
@@ -120,10 +122,10 @@ TEST(StringPiece, HasPrefixSuffix) {
 }
 
 TEST(StringPiece, SkipPrefixSuffix) {
-  using paddle::SkipPrefix;
-  using paddle::SkipSuffix;
+  using paddle::string::SkipPrefix;
+  using paddle::string::SkipSuffix;
   {
-    paddle::StringPiece s;
+    paddle::string::Piece s;
     EXPECT_EQ("", SkipPrefix(s, 0));
     EXPECT_THROW(SkipPrefix(s, 1), std::invalid_argument);
 
@@ -131,7 +133,7 @@ TEST(StringPiece, SkipPrefixSuffix) {
     EXPECT_THROW(SkipSuffix(s, 1), std::invalid_argument);
   }
   {
-    paddle::StringPiece s("app");
+    paddle::string::Piece s("app");
     EXPECT_EQ("app", SkipPrefix(s, 0));
     EXPECT_EQ("pp", SkipPrefix(s, 1));
     EXPECT_EQ("p", SkipPrefix(s, 2));
@@ -147,10 +149,10 @@ TEST(StringPiece, SkipPrefixSuffix) {
 }
 
 TEST(StringPiece, TrimPrefixSuffix) {
-  using paddle::TrimPrefix;
-  using paddle::TrimSuffix;
+  using paddle::string::TrimPrefix;
+  using paddle::string::TrimSuffix;
   {
-    paddle::StringPiece s;
+    paddle::string::Piece s;
     EXPECT_EQ("", TrimPrefix(s, ""));
     EXPECT_EQ("", TrimPrefix(s, "something"));
 
@@ -158,7 +160,7 @@ TEST(StringPiece, TrimPrefixSuffix) {
     EXPECT_EQ("", TrimSuffix(s, "something"));
   }
   {
-    paddle::StringPiece s("app");
+    paddle::string::Piece s("app");
     EXPECT_EQ("app", TrimPrefix(s, ""));
     EXPECT_EQ("pp", TrimPrefix(s, "a"));
     EXPECT_EQ("p", TrimPrefix(s, "ap"));
@@ -174,14 +176,14 @@ TEST(StringPiece, TrimPrefixSuffix) {
 }
 
 TEST(StringPiece, Contains) {
-  using paddle::Contains;
+  using paddle::string::Contains;
   {
-    paddle::StringPiece s;
+    paddle::string::Piece s;
     EXPECT_FALSE(Contains(s, ""));
     EXPECT_FALSE(Contains(s, "something"));
   }
   {
-    paddle::StringPiece s("app");
+    paddle::string::Piece s("app");
     EXPECT_TRUE(Contains(s, ""));
     EXPECT_TRUE(Contains(s, "a"));
     EXPECT_TRUE(Contains(s, "p"));
@@ -193,15 +195,15 @@ TEST(StringPiece, Contains) {
 }
 
 TEST(StringPiece, Index) {
-  using paddle::Index;
-  auto npos = paddle::StringPiece::npos;
+  using paddle::string::Index;
+  auto npos = paddle::string::Piece::npos;
   {
-    paddle::StringPiece s;
+    paddle::string::Piece s;
     EXPECT_EQ(npos, Index(s, ""));
     EXPECT_EQ(npos, Index(s, "something"));
   }
   {
-    paddle::StringPiece s("app");
+    paddle::string::Piece s("app");
     EXPECT_EQ(0U, Index(s, ""));
     EXPECT_EQ(0U, Index(s, "a"));
     EXPECT_EQ(1U, Index(s, "p"));
@@ -213,14 +215,14 @@ TEST(StringPiece, Index) {
 }
 
 TEST(StringPiece, Find) {
-  using paddle::Find;
-  auto npos = paddle::StringPiece::npos;
+  using paddle::string::Find;
+  auto npos = paddle::string::Piece::npos;
   {
-    paddle::StringPiece s;
+    paddle::string::Piece s;
     EXPECT_EQ(npos, Find(s, 'a', 0U));
   }
   {
-    paddle::StringPiece s("app");
+    paddle::string::Piece s("app");
     EXPECT_EQ(0U, Find(s, 'a', 0U));
     EXPECT_EQ(1U, Find(s, 'p', 0U));
     EXPECT_EQ(1U, Find(s, 'p', 1U));
@@ -230,14 +232,14 @@ TEST(StringPiece, Find) {
 }
 
 TEST(StringPiece, RFind) {
-  using paddle::RFind;
-  auto npos = paddle::StringPiece::npos;
+  using paddle::string::RFind;
+  auto npos = paddle::string::Piece::npos;
   {
-    paddle::StringPiece s;
+    paddle::string::Piece s;
     EXPECT_EQ(npos, RFind(s, 'a', 0U));
   }
   {
-    paddle::StringPiece s("app");
+    paddle::string::Piece s("app");
     EXPECT_EQ(2U, RFind(s, 'p', 2U));
     EXPECT_EQ(0U, RFind(s, 'a', 2U));
     EXPECT_EQ(1U, RFind(s, 'p', 1U));
@@ -247,15 +249,15 @@ TEST(StringPiece, RFind) {
 }
 
 TEST(StringPiece, SubStr) {
-  using paddle::SubStr;
+  using paddle::string::SubStr;
   {
-    paddle::StringPiece s;
+    paddle::string::Piece s;
     EXPECT_EQ("", SubStr(s, 0, 0));
     EXPECT_EQ("", SubStr(s, 0, 1));
     EXPECT_EQ("", SubStr(s, 1, 0));
   }
   {
-    paddle::StringPiece s("app");
+    paddle::string::Piece s("app");
     EXPECT_EQ("", SubStr(s, 0, 0));
     EXPECT_EQ("", SubStr(s, 1, 0));
     EXPECT_EQ("", SubStr(s, 2, 0));
@@ -279,15 +281,15 @@ TEST(StringPiece, SubStr) {
 }
 
 TEST(StringPiece, StreamOutput) {
-  using paddle::StringPiece;
+  using paddle::string::Piece;
 
   std::stringstream o;
-  o << StringPiece();
+  o << paddle::string::Piece();
   EXPECT_EQ("", o.str());
 
-  o << StringPiece("hello");
+  o << paddle::string::Piece("hello");
   EXPECT_EQ("hello", o.str());
 
-  o << StringPiece();
+  o << paddle::string::Piece();
   EXPECT_EQ("hello", o.str());
 }
diff --git a/paddle/string/stringpiece.cc b/paddle/string/stringpiece.cc
deleted file mode 100644
index 415b3558d5..0000000000
--- a/paddle/string/stringpiece.cc
+++ /dev/null
@@ -1,141 +0,0 @@
-/*
-  Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-  Licensed under the Apache License, Version 2.0 (the "License");
-  you may not use this file except in compliance with the License.
-  You may obtain a copy of the License at
-
-  http://www.apache.org/licenses/LICENSE-2.0
-
-  Unless required by applicable law or agreed to in writing, software
-  distributed under the License is distributed on an "AS IS" BASIS,
-  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-  See the License for the specific language governing permissions and
-  limitations under the License.
-*/
-
-#include "paddle/strings/stringpiece.h"
-
-#include <string.h>
-
-#include <algorithm>
-#include <iosfwd>
-#include <stdexcept>
-
-namespace paddle {
-
-StringPiece::StringPiece() : data_(NULL), size_(0) {}
-
-StringPiece::StringPiece(const char* d, size_t n) : data_(d), size_(n) {
-  if (d == NULL && n != 0)
-    throw std::invalid_argument(
-        "StringPiece requires len to be 0 for NULL data");
-}
-
-StringPiece::StringPiece(const char* s) : data_(s) {
-  size_ = (s == NULL) ? 0 : strlen(s);
-}
-
-StringPiece::StringPiece(const std::string& s)
-    : data_(s.data()), size_(s.size()) {}
-
-char StringPiece::operator[](size_t n) const {
-  if (n >= len())
-    throw std::invalid_argument("index out of StringPiece length");
-  return data_[n];
-}
-
-int Compare(StringPiece a, StringPiece b) {
-  const size_t min_len = (a.len() < b.len()) ? a.len() : b.len();
-  int r = memcmp(a.data(), b.data(), min_len);
-  if (r == 0) {
-    if (a.len() < b.len())
-      return -1;
-    else if (a.len() > b.len())
-      return 1;
-  }
-  return r;
-}
-
-bool operator==(StringPiece x, StringPiece y) {
-  return ((x.len() == y.len()) &&
-          (x.data() == y.data() || memcmp(x.data(), y.data(), x.len()) == 0));
-}
-
-bool operator!=(StringPiece x, StringPiece y) { return !(x == y); }
-
-bool operator<(StringPiece x, StringPiece y) { return Compare(x, y) < 0; }
-bool operator>(StringPiece x, StringPiece y) { return Compare(x, y) > 0; }
-
-bool operator<=(StringPiece x, StringPiece y) { return Compare(x, y) <= 0; }
-bool operator>=(StringPiece x, StringPiece y) { return Compare(x, y) >= 0; }
-
-bool HasPrefix(StringPiece s, StringPiece x) {
-  return ((s.len() >= x.len()) && (memcmp(s.data(), x.data(), x.len()) == 0));
-}
-
-bool HasSuffix(StringPiece s, StringPiece x) {
-  return ((s.len() >= x.len()) &&
-          (memcmp(s.data() + (s.len() - x.len()), x.data(), x.len()) == 0));
-}
-
-StringPiece SkipPrefix(StringPiece s, size_t n) {
-  if (n > s.len())
-    throw std::invalid_argument("Skip distance larger than StringPiece length");
-  return StringPiece(s.data() + n, s.len() - n);
-}
-
-StringPiece SkipSuffix(StringPiece s, size_t n) {
-  if (n > s.len())
-    throw std::invalid_argument("Skip distance larger than StringPiece length");
-  return StringPiece(s.data(), s.len() - n);
-}
-
-StringPiece TrimPrefix(StringPiece s, StringPiece x) {
-  return HasPrefix(s, x) ? SkipPrefix(s, x.len()) : s;
-}
-
-StringPiece TrimSuffix(StringPiece s, StringPiece x) {
-  return HasSuffix(s, x) ? SkipSuffix(s, x.len()) : s;
-}
-
-bool Contains(StringPiece s, StringPiece sub) {
-  return std::search(s.begin(), s.end(), sub.begin(), sub.end()) != s.end();
-}
-
-size_t Index(StringPiece s, StringPiece sub) {
-  auto e = std::search(s.begin(), s.end(), sub.begin(), sub.end());
-  return e != s.end() ? e - s.data() : StringPiece::npos;
-}
-
-size_t Find(StringPiece s, char c, size_t pos) {
-  if (pos >= s.len()) {
-    return StringPiece::npos;
-  }
-  const char* result =
-      reinterpret_cast<const char*>(memchr(s.data() + pos, c, s.len() - pos));
-  return result != nullptr ? result - s.data() : StringPiece::npos;
-}
-
-size_t RFind(StringPiece s, char c, size_t pos) {
-  if (s.len() == 0) return StringPiece::npos;
-  for (const char* p = s.data() + std::min(pos, s.len() - 1); p >= s.data();
-       p--) {
-    if (*p == c) {
-      return p - s.data();
-    }
-  }
-  return StringPiece::npos;
-}
-
-StringPiece SubStr(StringPiece s, size_t pos, size_t n) {
-  if (pos > s.len()) pos = s.len();
-  if (n > s.len() - pos) n = s.len() - pos;
-  return StringPiece(s.data() + pos, n);
-}
-
-std::ostream& operator<<(std::ostream& o, StringPiece piece) {
-  return o << piece.ToString();
-}
-
-}  // namespace paddle

From 6cb7cb36911ec36be344a5800c142284983ae2f6 Mon Sep 17 00:00:00 2001
From: Yi Wang <yiwang01@baidu.com>
Date: Wed, 28 Jun 2017 17:23:17 -0700
Subject: [PATCH 47/86] Add paddle/string/printf and tests

---
 paddle/string/CMakeLists.txt          |   2 +
 paddle/string/printf.h                | 105 +++
 paddle/string/printf_test.cc          |  16 +
 paddle/string/tinyformat/tinyformat.h | 902 ++++++++++++++++++++++++++
 4 files changed, 1025 insertions(+)
 create mode 100644 paddle/string/printf.h
 create mode 100644 paddle/string/printf_test.cc
 create mode 100644 paddle/string/tinyformat/tinyformat.h

diff --git a/paddle/string/CMakeLists.txt b/paddle/string/CMakeLists.txt
index 0f39660a90..5becf62672 100644
--- a/paddle/string/CMakeLists.txt
+++ b/paddle/string/CMakeLists.txt
@@ -1,2 +1,4 @@
 cc_library(stringpiece SRCS piece.cc)
 cc_test(stringpiece_test SRCS piece_test.cc DEPS stringpiece glog gflags)
+
+cc_test(stringprintf_test SRCS printf_test.cc DEPS glog gflags)
diff --git a/paddle/string/printf.h b/paddle/string/printf.h
new file mode 100644
index 0000000000..0767f8f5b5
--- /dev/null
+++ b/paddle/string/printf.h
@@ -0,0 +1,105 @@
+/*
+  Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+  Licensed under the Apache License, Version 2.0 (the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+*/
+
+// Compared with std::stringstream, string::Printf has two primary
+// purposes:
+//
+// 1. Type-safe printing, with why and how explained in
+//    http://www.drdobbs.com/stringprintf-a-typesafe-printf-family-fo/184401999.
+//    Implementation includes
+//
+//    https://github.com/c42f/tinyformat
+//    boost::format
+//    std::stringstream
+//
+//    std::stringstream is not convenient enough in many cases.  For example:
+//
+//      std::cout << std::setprecision(2) << std::fixed << 1.23456 << "\n";
+//
+//    boost::format is the most convenient one.  We can have
+//
+//      std::cout << format("%2% %1%") % 36 % 77;
+//
+//    or
+//
+//      format fmter("%2% %1%");
+//      fmter % 36; fmter % 77;
+//      std::cout << fmter.c_str();
+//
+//    But the overloading of % might be overkill, and it would be
+//    more efficient if it could write to std::cout directly.
+//
+//    tinyformat has an interface compatible with the C-printf style,
+//    and it can write to a stream or return a std::string:
+//
+//      std::cout << tfm::format(
+//                  "%s, %s %d, %.2d:%.2d\n",
+//                  weekday, month, day, hour, min);
+//
+//    or
+//
+//      tfm::format(std::cout,
+//                  "%s, %s %d, %.2d:%.2d\n",
+//                  weekday, month, day, hour, min);
+//
+// 2. High-performance -- most printed strings are not too long and
+//    don't need dynamic memory allocation.  Many StringPrintf
+//    implementations don't enforce type safety, but are
+//    high-performance, including
+//
+//    https://developers.google.com/optimization/reference/base/stringprintf/
+//    https://github.com/adobe/chromium/blob/master/base/stringprintf.h
+//    https://github.com/google/protobuf/blob/master/src/google/protobuf/stubs/stringprintf.h
+//
+// According to
+// https://github.com/c42f/tinyformat#compile-time-and-code-bloat,
+// boost::format runs too slow and results in large executable binary
+// files.  So here we port tinyformat.
+
+#pragma once
+
+#include <iostream>
+#include <sstream>
+#include "paddle/string/tinyformat/tinyformat.h"  // https://github.com/c42f/tinyformat
+
+namespace paddle {
+namespace string {
+
+template <typename... Args>  // Formats args per fmt into the stream out.
+void Fprintf(std::ostream& out, const char* fmt, const Args&... args) {
+  tinyformat::vformat(out, fmt, tinyformat::makeFormatList(args...));
+}
+
+template <typename... Args>
+std::string Sprintf(const char* fmt, const Args&... args) {
+  std::ostringstream oss;
+  tinyformat::format(oss, fmt, args...);
+  return oss.str();
+}
+
+template <typename... Args>
+void printf(const char* fmt, const Args&... args) {
+  tinyformat::format(std::cout, fmt, args...);
+}
+
+template <typename... Args>
+void printfln(const char* fmt, const Args&... args) {
+  tinyformat::format(std::cout, fmt, args...);
+  std::cout << '\n';
+}
+
+}  // namespace string
+}  // namespace paddle
diff --git a/paddle/string/printf_test.cc b/paddle/string/printf_test.cc
new file mode 100644
index 0000000000..d8f2454165
--- /dev/null
+++ b/paddle/string/printf_test.cc
@@ -0,0 +1,16 @@
+#include "paddle/string/printf.h"
+
+#include <string>
+
+#include "gtest/gtest.h"
+
+TEST(StringPrintf, StringPrintf) {
+  std::string weekday = "Wednesday";
+  const char* month = "July";
+  size_t day = 27;
+  long hour = 14;
+  int min = 44;
+  EXPECT_EQ(std::string("Wednesday, July 27, 14:44"),
+            paddle::string::Sprintf(
+                "%s, %s %d, %.2d:%.2d", weekday, month, day, hour, min));
+}
diff --git a/paddle/string/tinyformat/tinyformat.h b/paddle/string/tinyformat/tinyformat.h
new file mode 100644
index 0000000000..f0e5e0160f
--- /dev/null
+++ b/paddle/string/tinyformat/tinyformat.h
@@ -0,0 +1,902 @@
+// tinyformat.h
+// Copyright (C) 2011, Chris Foster [chris42f (at) gmail (d0t) com]
+//
+// Boost Software License - Version 1.0
+//
+// Permission is hereby granted, free of charge, to any person or organization
+// obtaining a copy of the software and accompanying documentation covered by
+// this license (the "Software") to use, reproduce, display, distribute,
+// execute, and transmit the Software, and to prepare derivative works of the
+// Software, and to permit third-parties to whom the Software is furnished to
+// do so, all subject to the following:
+//
+// The copyright notices in the Software and this entire statement, including
+// the above license grant, this restriction and the following disclaimer,
+// must be included in all copies of the Software, in whole or in part, and
+// all derivative works of the Software, unless such copies or derivative
+// works are solely in the form of machine-executable object code generated by
+// a source language processor.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
+// SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
+// FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS IN THE SOFTWARE.
+
+//------------------------------------------------------------------------------
+// Tinyformat: A minimal type safe printf replacement
+//
+// tinyformat.h is a type safe printf replacement library in a single C++
+// header file.  Design goals include:
+//
+// * Type safety and extensibility for user defined types.
+// * C99 printf() compatibility, to the extent possible using std::ostream
+// * Simplicity and minimalism.  A single header file to include and distribute
+//   with your projects.
+// * Augment rather than replace the standard stream formatting mechanism
+// * C++98 support, with optional C++11 niceties
+//
+//
+// Main interface example usage
+// ----------------------------
+//
+// To print a date to std::cout:
+//
+//   std::string weekday = "Wednesday";
+//   const char* month = "July";
+//   size_t day = 27;
+//   long hour = 14;
+//   int min = 44;
+//
+//   tfm::printf("%s, %s %d, %.2d:%.2d\n", weekday, month, day, hour, min);
+//
+// The strange types here emphasize the type safety of the interface; it is
+// possible to print a std::string using the "%s" conversion, and a
+// size_t using the "%d" conversion.  A similar result could be achieved
+// using either of the tfm::format() functions.  One prints on a user provided
+// stream:
+//
+//   tfm::format(std::cerr, "%s, %s %d, %.2d:%.2d\n",
+//               weekday, month, day, hour, min);
+//
+// The other returns a std::string:
+//
+//   std::string date = tfm::format("%s, %s %d, %.2d:%.2d\n",
+//                                  weekday, month, day, hour, min);
+//   std::cout << date;
+//
+// These are the three primary interface functions.  There is also a
+// convenience function printfln() which appends a newline to the usual result
+// of printf() for super simple logging.
+//
+//
+// User defined format functions
+// -----------------------------
+//
+// Simulating variadic templates in C++98 is pretty painful since it requires
+// writing out the same function for each desired number of arguments.  To make
+// this bearable tinyformat comes with a set of macros which are used
+// internally to generate the API, but which may also be used in user code.
+//
+// The three macros TINYFORMAT_ARGTYPES(n), TINYFORMAT_VARARGS(n) and
+// TINYFORMAT_PASSARGS(n) will generate a list of n argument types,
+// type/name pairs and argument names respectively when called with an integer
+// n between 1 and 16.  We can use these to define a macro which generates the
+// desired user defined function with n arguments.  To generate all 16 user
+// defined function bodies, use the macro TINYFORMAT_FOREACH_ARGNUM.  For an
+// example, see the implementation of printf() at the end of the source file.
+//
+// Sometimes it's useful to be able to pass a list of format arguments through
+// to a non-template function.  The FormatList class is provided as a way to do
+// this by storing the argument list in a type-opaque way.  Continuing the
+// example from above, we construct a FormatList using makeFormatList():
+//
+//   FormatListRef formatList =
+//       tfm::makeFormatList(weekday, month, day, hour, min);
+//
+// The format list can now be passed into any non-template function and used
+// via a call to the vformat() function:
+//
+//   tfm::vformat(std::cout, "%s, %s %d, %.2d:%.2d\n", formatList);
+//
+//
+// Additional API information
+// --------------------------
+//
+// Error handling: Define TINYFORMAT_ERROR to customize the error handling for
+// format strings which are unsupported or have the wrong number of format
+// specifiers (calls assert() by default).
+//
+// User defined types: Uses operator<< for user defined types by default.
+// Overload formatValue() for more control.
+
+#pragma once
+
+#include <algorithm>
+#include <cassert>
+#include <iostream>
+#include <sstream>
+
+namespace paddle {
+namespace string {
+namespace tinyformat {
+
+#ifndef TINYFORMAT_ERROR
+#define TINYFORMAT_ERROR(reason) assert(0 && reason)
+#endif
+
+//------------------------------------------------------------------------------
+namespace detail {
+
+// Compile-time test of whether a T1 can be implicitly converted to a T2.
+template <typename T1, typename T2>
+struct is_convertible {
+private:
+  // Result tags which sizeof can tell apart (two chars versus one).
+  struct no_tag {
+    char pad[2];
+  };
+  struct yes_tag {
+    char pad;
+  };
+  // Overload probe; declared only, since it is never actually called.
+  static no_tag probe(...);
+  static yes_tag probe(const T2 &);
+  static const T1 &source();
+
+public:
+  // The (...) probe is picked from the overload set only when the overload
+  // taking a T2 is not viable, so the size of the selected overload's return
+  // type reveals whether the conversion exists.  Everything sits inside an
+  // unevaluated sizeof operand.
+  static const bool value = sizeof(yes_tag) == sizeof(probe(source()));
+};
+
+// Stream an argument after casting it to fmtT.  The primary template covers
+// types that cannot be converted to fmtT and must never actually run.
+template <typename T,
+          typename fmtT,
+          bool convertible = is_convertible<T, fmtT>::value>
+struct formatValueAsType {
+  static void invoke(std::ostream & /*os*/, const T & /*arg*/) { assert(0); }
+};
+// Partial specialization selected when "convertible" is true, i.e. when T
+// really does convert to fmtT.
+template <typename T, typename fmtT>
+struct formatValueAsType<T, fmtT, true> {
+  static void invoke(std::ostream &os, const T &arg) {
+    os << static_cast<fmtT>(arg);
+  }
+};
+
+// Convert an argument to int for use as a variable width or precision.  The
+// primary template (convertible=false) raises TINYFORMAT_ERROR and yields 0.
+template <typename T, bool convertible = is_convertible<T, int>::value>
+struct convertToInt {
+  static int invoke(const T & /*arg*/) {
+    TINYFORMAT_ERROR(
+        "tinyformat: Cannot convert from argument type to "
+        "integer for use as variable width or precision");
+    return 0;
+  }
+};
+// Partial specialization used when T converts to int.
+template <typename T>
+struct convertToInt<T, true> {
+  static int invoke(const T &arg) { return static_cast<int>(arg); }
+};
+
+// Write at most ntrunc characters of the streamed form of value to out.
+template <typename T>
+inline void formatTruncated(std::ostream &out, const T &value, int ntrunc) {
+  std::ostringstream buffer;
+  buffer << value;
+  const std::string text = buffer.str();
+  const int available = static_cast<int>(text.size());
+  out.write(text.data(), available < ntrunc ? available : ntrunc);
+}
+#define TINYFORMAT_DEFINE_FORMAT_TRUNCATED_CSTR(type)                       \
+  inline void formatTruncated(std::ostream &out, type *value, int ntrunc) { \
+    std::streamsize len = 0;                                                \
+    while (len < ntrunc && value[len] != 0) ++len;                          \
+    out.write(value, len);                                                  \
+  }
+// Overloads for const char* and char*, which read at most ntrunc characters
+// when looking for the terminator.  Could overload for signed & unsigned char
+TINYFORMAT_DEFINE_FORMAT_TRUNCATED_CSTR(const char)  // too, but these are
+TINYFORMAT_DEFINE_FORMAT_TRUNCATED_CSTR(char)  // unneeded for printf compat.
+#undef TINYFORMAT_DEFINE_FORMAT_TRUNCATED_CSTR
+
+}  // namespace detail
+
+//------------------------------------------------------------------------------
+// Variable formatting functions.  May be overridden for user-defined types if
+// desired.
+
+/// Write one argument to the stream; operator<< is the default mechanism.
+///
+/// Users may overload this for their own types.  When it is called, the
+/// stream flags have already been set up from the format string, and the
+/// format specification is the range [fmtBegin, fmtEnd).  For truncating
+/// conversions ntrunc is the maximum number of characters to emit, for
+/// example "%.7s" calls formatValue with ntrunc = 7.
+///
+/// This generic version streams the value with operator<<, special-casing
+/// the %c and %p conversions and truncating conversions.
+template <typename T>
+inline void formatValue(std::ostream &out,
+                        const char * /*fmtBegin*/,
+                        const char *fmtEnd,
+                        int ntrunc,
+                        const T &value) {
+  // %c and %p: when T converts to char / const void*, format the converted
+  // value rather than the value itself.  For %p this also ensures the pointer
+  // is never dereferenced, which could crash on a dangling (const char*).
+  const bool asChar =
+      detail::is_convertible<T, char>::value && *(fmtEnd - 1) == 'c';
+  const bool asVoidPtr = !asChar &&
+                         detail::is_convertible<T, const void *>::value &&
+                         *(fmtEnd - 1) == 'p';
+  if (asChar) {
+    detail::formatValueAsType<T, char>::invoke(out, value);
+  } else if (asVoidPtr) {
+    detail::formatValueAsType<T, const void *>::invoke(out, value);
+  } else if (ntrunc >= 0) {
+    // Truncating conversion such as "%.4s": read at most ntrunc characters
+    // so that C strings are not overread.
+    detail::formatTruncated(out, value, ntrunc);
+  } else {
+    out << value;
+  }
+}
+
+// Overloads for the char types: integer conversions print the numeric value.
+#define TINYFORMAT_DEFINE_FORMATVALUE_CHAR(charType) \
+  inline void formatValue(std::ostream &out,         \
+                          const char * /*fmtBegin*/, \
+                          const char *fmtEnd,        \
+                          int /**/,                  \
+                          charType value) {          \
+    switch (*(fmtEnd - 1)) {                         \
+      case 'u':                                      \
+      case 'd':                                      \
+      case 'i':                                      \
+      case 'o':                                      \
+      case 'X':                                      \
+      case 'x':                                      \
+        out << static_cast<int>(value);              \
+        break;                                       \
+      default:                                       \
+        out << value;                                \
+        break;                                       \
+    }                                                \
+  }
+// per 3.9.1: char, signed char and unsigned char are all distinct types, so
+TINYFORMAT_DEFINE_FORMATVALUE_CHAR(char)  // each one needs its own overload.
+TINYFORMAT_DEFINE_FORMATVALUE_CHAR(signed char)
+TINYFORMAT_DEFINE_FORMATVALUE_CHAR(unsigned char)
+#undef TINYFORMAT_DEFINE_FORMATVALUE_CHAR
+
+//------------------------------------------------------------------------------
+// Tools for emulating variadic templates in C++98.  The basic idea here is
+// stolen from the boost preprocessor metaprogramming library and cut down to
+// be just general enough for what we need.
+
+#define TINYFORMAT_ARGTYPES(n) TINYFORMAT_ARGTYPES_##n  // class T1, ..., Tn
+#define TINYFORMAT_VARARGS(n) TINYFORMAT_VARARGS_##n    // const T1 &v1, ...
+#define TINYFORMAT_PASSARGS(n) TINYFORMAT_PASSARGS_##n  // v1, ..., vn
+#define TINYFORMAT_PASSARGS_TAIL(n) TINYFORMAT_PASSARGS_TAIL_##n  // , v2, ...
+
+// To keep it as transparent as possible, the macros below have been generated
+// using python via the excellent cog.py code generation script.  This avoids
+// the need for a bunch of complex (but more general) preprocessor tricks as
+// used in boost.preprocessor.
+//
+// To rerun the code generation in place, use `cog.py -r tinyformat.h`
+// (see http://nedbatchelder.com/code/cog).  Alternatively you can just create
+// extra versions by hand.
+
+/*[[[cog
+maxParams = 16
+
+def makeCommaSepLists(lineTemplate, elemTemplate, startInd=1):
+    for j in range(startInd,maxParams+1):
+        list = ', '.join([elemTemplate % {'i':i} for i in range(startInd,j+1)])
+        cog.outl(lineTemplate % {'j':j, 'list':list})
+
+makeCommaSepLists('#define TINYFORMAT_ARGTYPES_%(j)d %(list)s',
+                  'class T%(i)d')
+
+cog.outl()
+makeCommaSepLists('#define TINYFORMAT_VARARGS_%(j)d %(list)s',
+                  'const T%(i)d& v%(i)d')
+
+cog.outl()
+makeCommaSepLists('#define TINYFORMAT_PASSARGS_%(j)d %(list)s', 'v%(i)d')
+
+cog.outl()
+cog.outl('#define TINYFORMAT_PASSARGS_TAIL_1')
+makeCommaSepLists('#define TINYFORMAT_PASSARGS_TAIL_%(j)d , %(list)s',
+                  'v%(i)d', startInd = 2)
+
+cog.outl()
+cog.outl('#define TINYFORMAT_FOREACH_ARGNUM(m) \\\n    ' +
+         ' '.join(['m(%d)' % (j,) for j in range(1,maxParams+1)]))
+]]]*/
+#define TINYFORMAT_ARGTYPES_1 class T1
+#define TINYFORMAT_ARGTYPES_2 class T1, class T2
+#define TINYFORMAT_ARGTYPES_3 class T1, class T2, class T3
+#define TINYFORMAT_ARGTYPES_4 class T1, class T2, class T3, class T4
+#define TINYFORMAT_ARGTYPES_5 class T1, class T2, class T3, class T4, class T5
+#define TINYFORMAT_ARGTYPES_6 \
+  class T1, class T2, class T3, class T4, class T5, class T6
+#define TINYFORMAT_ARGTYPES_7 \
+  class T1, class T2, class T3, class T4, class T5, class T6, class T7
+#define TINYFORMAT_ARGTYPES_8 \
+  class T1, class T2, class T3, class T4, class T5, class T6, class T7, class T8
+#define TINYFORMAT_ARGTYPES_9                                           \
+  class T1, class T2, class T3, class T4, class T5, class T6, class T7, \
+      class T8, class T9
+#define TINYFORMAT_ARGTYPES_10                                          \
+  class T1, class T2, class T3, class T4, class T5, class T6, class T7, \
+      class T8, class T9, class T10
+#define TINYFORMAT_ARGTYPES_11                                          \
+  class T1, class T2, class T3, class T4, class T5, class T6, class T7, \
+      class T8, class T9, class T10, class T11
+#define TINYFORMAT_ARGTYPES_12                                          \
+  class T1, class T2, class T3, class T4, class T5, class T6, class T7, \
+      class T8, class T9, class T10, class T11, class T12
+#define TINYFORMAT_ARGTYPES_13                                          \
+  class T1, class T2, class T3, class T4, class T5, class T6, class T7, \
+      class T8, class T9, class T10, class T11, class T12, class T13
+#define TINYFORMAT_ARGTYPES_14                                          \
+  class T1, class T2, class T3, class T4, class T5, class T6, class T7, \
+      class T8, class T9, class T10, class T11, class T12, class T13,   \
+      class T14
+#define TINYFORMAT_ARGTYPES_15                                          \
+  class T1, class T2, class T3, class T4, class T5, class T6, class T7, \
+      class T8, class T9, class T10, class T11, class T12, class T13,   \
+      class T14, class T15
+#define TINYFORMAT_ARGTYPES_16                                          \
+  class T1, class T2, class T3, class T4, class T5, class T6, class T7, \
+      class T8, class T9, class T10, class T11, class T12, class T13,   \
+      class T14, class T15, class T16
+
+#define TINYFORMAT_VARARGS_1 const T1 &v1
+#define TINYFORMAT_VARARGS_2 const T1 &v1, const T2 &v2
+#define TINYFORMAT_VARARGS_3 const T1 &v1, const T2 &v2, const T3 &v3
+#define TINYFORMAT_VARARGS_4 \
+  const T1 &v1, const T2 &v2, const T3 &v3, const T4 &v4
+#define TINYFORMAT_VARARGS_5 \
+  const T1 &v1, const T2 &v2, const T3 &v3, const T4 &v4, const T5 &v5
+#define TINYFORMAT_VARARGS_6                                            \
+  const T1 &v1, const T2 &v2, const T3 &v3, const T4 &v4, const T5 &v5, \
+      const T6 &v6
+#define TINYFORMAT_VARARGS_7                                            \
+  const T1 &v1, const T2 &v2, const T3 &v3, const T4 &v4, const T5 &v5, \
+      const T6 &v6, const T7 &v7
+#define TINYFORMAT_VARARGS_8                                            \
+  const T1 &v1, const T2 &v2, const T3 &v3, const T4 &v4, const T5 &v5, \
+      const T6 &v6, const T7 &v7, const T8 &v8
+#define TINYFORMAT_VARARGS_9                                            \
+  const T1 &v1, const T2 &v2, const T3 &v3, const T4 &v4, const T5 &v5, \
+      const T6 &v6, const T7 &v7, const T8 &v8, const T9 &v9
+#define TINYFORMAT_VARARGS_10                                           \
+  const T1 &v1, const T2 &v2, const T3 &v3, const T4 &v4, const T5 &v5, \
+      const T6 &v6, const T7 &v7, const T8 &v8, const T9 &v9, const T10 &v10
+#define TINYFORMAT_VARARGS_11                                                 \
+  const T1 &v1, const T2 &v2, const T3 &v3, const T4 &v4, const T5 &v5,       \
+      const T6 &v6, const T7 &v7, const T8 &v8, const T9 &v9, const T10 &v10, \
+      const T11 &v11
+#define TINYFORMAT_VARARGS_12                                                 \
+  const T1 &v1, const T2 &v2, const T3 &v3, const T4 &v4, const T5 &v5,       \
+      const T6 &v6, const T7 &v7, const T8 &v8, const T9 &v9, const T10 &v10, \
+      const T11 &v11, const T12 &v12
+#define TINYFORMAT_VARARGS_13                                                 \
+  const T1 &v1, const T2 &v2, const T3 &v3, const T4 &v4, const T5 &v5,       \
+      const T6 &v6, const T7 &v7, const T8 &v8, const T9 &v9, const T10 &v10, \
+      const T11 &v11, const T12 &v12, const T13 &v13
+#define TINYFORMAT_VARARGS_14                                                 \
+  const T1 &v1, const T2 &v2, const T3 &v3, const T4 &v4, const T5 &v5,       \
+      const T6 &v6, const T7 &v7, const T8 &v8, const T9 &v9, const T10 &v10, \
+      const T11 &v11, const T12 &v12, const T13 &v13, const T14 &v14
+#define TINYFORMAT_VARARGS_15                                                 \
+  const T1 &v1, const T2 &v2, const T3 &v3, const T4 &v4, const T5 &v5,       \
+      const T6 &v6, const T7 &v7, const T8 &v8, const T9 &v9, const T10 &v10, \
+      const T11 &v11, const T12 &v12, const T13 &v13, const T14 &v14,         \
+      const T15 &v15
+#define TINYFORMAT_VARARGS_16                                                 \
+  const T1 &v1, const T2 &v2, const T3 &v3, const T4 &v4, const T5 &v5,       \
+      const T6 &v6, const T7 &v7, const T8 &v8, const T9 &v9, const T10 &v10, \
+      const T11 &v11, const T12 &v12, const T13 &v13, const T14 &v14,         \
+      const T15 &v15, const T16 &v16
+
+#define TINYFORMAT_PASSARGS_1 v1
+#define TINYFORMAT_PASSARGS_2 v1, v2
+#define TINYFORMAT_PASSARGS_3 v1, v2, v3
+#define TINYFORMAT_PASSARGS_4 v1, v2, v3, v4
+#define TINYFORMAT_PASSARGS_5 v1, v2, v3, v4, v5
+#define TINYFORMAT_PASSARGS_6 v1, v2, v3, v4, v5, v6
+#define TINYFORMAT_PASSARGS_7 v1, v2, v3, v4, v5, v6, v7
+#define TINYFORMAT_PASSARGS_8 v1, v2, v3, v4, v5, v6, v7, v8
+#define TINYFORMAT_PASSARGS_9 v1, v2, v3, v4, v5, v6, v7, v8, v9
+#define TINYFORMAT_PASSARGS_10 v1, v2, v3, v4, v5, v6, v7, v8, v9, v10
+#define TINYFORMAT_PASSARGS_11 v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11
+#define TINYFORMAT_PASSARGS_12 v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12
+#define TINYFORMAT_PASSARGS_13 \
+  v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13
+#define TINYFORMAT_PASSARGS_14 \
+  v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14
+#define TINYFORMAT_PASSARGS_15 \
+  v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15
+#define TINYFORMAT_PASSARGS_16 \
+  v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16
+
+#define TINYFORMAT_PASSARGS_TAIL_1
+#define TINYFORMAT_PASSARGS_TAIL_2 , v2
+#define TINYFORMAT_PASSARGS_TAIL_3 , v2, v3
+#define TINYFORMAT_PASSARGS_TAIL_4 , v2, v3, v4
+#define TINYFORMAT_PASSARGS_TAIL_5 , v2, v3, v4, v5
+#define TINYFORMAT_PASSARGS_TAIL_6 , v2, v3, v4, v5, v6
+#define TINYFORMAT_PASSARGS_TAIL_7 , v2, v3, v4, v5, v6, v7
+#define TINYFORMAT_PASSARGS_TAIL_8 , v2, v3, v4, v5, v6, v7, v8
+#define TINYFORMAT_PASSARGS_TAIL_9 , v2, v3, v4, v5, v6, v7, v8, v9
+#define TINYFORMAT_PASSARGS_TAIL_10 , v2, v3, v4, v5, v6, v7, v8, v9, v10
+#define TINYFORMAT_PASSARGS_TAIL_11 , v2, v3, v4, v5, v6, v7, v8, v9, v10, v11
+#define TINYFORMAT_PASSARGS_TAIL_12 \
+  , v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12
+#define TINYFORMAT_PASSARGS_TAIL_13 \
+  , v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13
+#define TINYFORMAT_PASSARGS_TAIL_14 \
+  , v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14
+#define TINYFORMAT_PASSARGS_TAIL_15 \
+  , v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15
+#define TINYFORMAT_PASSARGS_TAIL_16 \
+  , v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16
+
+#define TINYFORMAT_FOREACH_ARGNUM(m)                                         \
+  m(1) m(2) m(3) m(4) m(5) m(6) m(7) m(8) m(9) m(10) m(11) m(12) m(13) m(14) \
+      m(15) m(16)
+//[[[end]]]
+
+namespace detail {
+
+// Type-opaque holder for one format() argument: a pointer to the value plus
+// function pointers for the actions on its real type.  Unlike an inheritance
+// based design this lets FormatList keep all args in one homogeneous array.
+// Only the address is stored, so the value must outlive the FormatArg.
+class FormatArg {
+public:
+  FormatArg() : m_value(0), m_formatImpl(0), m_toIntImpl(0) {}  // empty arg
+
+  template <typename T>
+  FormatArg(const T &value)
+      : m_value(static_cast<const void *>(&value)),
+        m_formatImpl(&formatImpl<T>),
+        m_toIntImpl(&toIntImpl<T>) {}
+
+  void format(std::ostream &out,  // formats the held value via formatValue()
+              const char *fmtBegin,
+              const char *fmtEnd,
+              int ntrunc) const {
+    m_formatImpl(out, fmtBegin, fmtEnd, ntrunc, m_value);
+  }
+
+  int toInt() const { return m_toIntImpl(m_value); }  // via convertToInt<T>
+
+private:
+  template <typename T>  // restores the erased type T, then formats the value
+  static void formatImpl(std::ostream &out,
+                         const char *fmtBegin,
+                         const char *fmtEnd,
+                         int ntrunc,
+                         const void *value) {
+    formatValue(out, fmtBegin, fmtEnd, ntrunc, *static_cast<const T *>(value));
+  }
+
+  template <typename T>  // restores the erased type T, then converts to int
+  static int toIntImpl(const void *value) {
+    return convertToInt<T>::invoke(*static_cast<const T *>(value));
+  }
+
+  const void *m_value;  // address of the caller's argument (not owned)
+  void (*m_formatImpl)(std::ostream &out,
+                       const char *fmtBegin,
+                       const char *fmtEnd,
+                       int ntrunc,
+                       const void *value);
+  int (*m_toIntImpl)(const void *value);
+};
+
+// Read the run of decimal digits at c and return its value, as atoi() (0 if
+// none; no overflow check).  On return c is one past the last digit read.
+inline int parseIntAndAdvance(const char *&c) {
+  int parsed = 0;
+  while (*c >= '0' && *c <= '9') parsed = 10 * parsed + (*c++ - '0');
+  return parsed;
+}
+
+// Emit the literal text at the front of fmt and locate the next format spec.
+//
+// Every "%%" met on the way is written as a single literal '%'.  Returns
+// the position of the first '%' that starts a nontrivial format spec, or
+// the position of the terminating NUL if the rest of the string is purely
+// literal.
+inline const char *printFormatStringLiteral(std::ostream &out,
+                                            const char *fmt) {
+  const char *literalStart = fmt;
+  const char *cursor = fmt;
+  while (*cursor != '\0') {
+    if (*cursor != '%') {
+      ++cursor;
+      continue;
+    }
+    // Flush the literal text preceding this '%'.
+    out.write(literalStart, cursor - literalStart);
+    if (cursor[1] != '%') return cursor;
+    // "%%": the second '%' becomes the first character of the next literal.
+    literalStart = cursor + 1;
+    cursor += 2;
+  }
+  out.write(literalStart, cursor - literalStart);
+  return cursor;
+}
+
+// Parse the format spec at fmtStart and set the state of out accordingly.
+//
+// The mini-language recognized is meant to be the C99 one, of the form
+// "%[flags][width][.precision][length]type".  Options that ostream state
+// cannot express are returned through spacePadPositive (the ' ' flag) and
+// ntrunc (the precision of a %s conversion).  argIndex is advanced past any
+// arguments consumed for a '*' width or precision.  Returns a pointer to the
+// character after the spec; on error returns fmtStart (when it is not at a
+// '%') or the terminating NUL (when the spec is cut short by end of string).
+// The %a, %A and %n conversions are rejected via TINYFORMAT_ERROR.
+inline const char *streamStateFromFormat(std::ostream &out,
+                                         bool &spacePadPositive,
+                                         int &ntrunc,
+                                         const char *fmtStart,
+                                         const detail::FormatArg *formatters,
+                                         int &argIndex,
+                                         int numFormatters) {
+  if (*fmtStart != '%') {
+    TINYFORMAT_ERROR(
+        "tinyformat: Not enough conversion specifiers in format string");
+    return fmtStart;
+  }
+  // Reset stream state to defaults.
+  out.width(0);
+  out.precision(6);
+  out.fill(' ');
+  // Reset most flags; ignore irrelevant unitbuf & skipws.
+  out.unsetf(std::ios::adjustfield | std::ios::basefield |
+             std::ios::floatfield | std::ios::showbase | std::ios::boolalpha |
+             std::ios::showpoint | std::ios::showpos | std::ios::uppercase);
+  bool precisionSet = false;
+  bool widthSet = false;
+  int widthExtra = 0;
+  const char *c = fmtStart + 1;
+  // 1) Parse flags
+  for (;; ++c) {
+    switch (*c) {
+      case '#':
+        out.setf(std::ios::showpoint | std::ios::showbase);
+        continue;
+      case '0':
+        // overridden by left alignment ('-' flag)
+        if (!(out.flags() & std::ios::left)) {
+          // Use internal padding so that numeric values are
+          // formatted correctly, eg -00010 rather than 000-10
+          out.fill('0');
+          out.setf(std::ios::internal, std::ios::adjustfield);
+        }
+        continue;
+      case '-':
+        out.fill(' ');
+        out.setf(std::ios::left, std::ios::adjustfield);
+        continue;
+      case ' ':
+        // overridden by show positive sign, '+' flag.
+        if (!(out.flags() & std::ios::showpos)) spacePadPositive = true;
+        continue;
+      case '+':
+        out.setf(std::ios::showpos);
+        spacePadPositive = false;
+        widthExtra = 1;  // added to the precision-derived width in step 5
+        continue;
+      default:
+        break;
+    }
+    break;  // reached only via "default": the first non-flag character
+  }
+  // 2) Parse width
+  if (*c >= '0' && *c <= '9') {
+    widthSet = true;
+    out.width(parseIntAndAdvance(c));
+  }
+  if (*c == '*') {
+    widthSet = true;
+    int width = 0;
+    if (argIndex < numFormatters)
+      width = formatters[argIndex++].toInt();
+    else
+      TINYFORMAT_ERROR(
+          "tinyformat: Not enough arguments to read variable width");
+    if (width < 0) {
+      // negative widths correspond to '-' flag set
+      out.fill(' ');
+      out.setf(std::ios::left, std::ios::adjustfield);
+      width = -width;
+    }
+    out.width(width);
+    ++c;
+  }
+  // 3) Parse precision
+  if (*c == '.') {
+    ++c;
+    int precision = 0;
+    if (*c == '*') {
+      ++c;
+      if (argIndex < numFormatters)
+        precision = formatters[argIndex++].toInt();
+      else
+        TINYFORMAT_ERROR(
+            "tinyformat: Not enough arguments to read variable precision");
+    } else {
+      if (*c >= '0' && *c <= '9')
+        precision = parseIntAndAdvance(c);
+      else if (*c == '-')  // negative precisions ignored, treated as zero.
+        parseIntAndAdvance(++c);
+    }
+    out.precision(precision);
+    precisionSet = true;
+  }
+  // 4) Ignore any C99 length modifier
+  while (*c == 'l' || *c == 'h' || *c == 'L' || *c == 'j' || *c == 'z' ||
+         *c == 't')
+    ++c;
+  // 5) We're up to the conversion specifier character.
+  // Set stream flags based on conversion specifier (thanks to the
+  // boost::format class for forging the way here).
+  bool intConversion = false;
+  switch (*c) {
+    case 'u':
+    case 'd':
+    case 'i':
+      out.setf(std::ios::dec, std::ios::basefield);
+      intConversion = true;
+      break;
+    case 'o':
+      out.setf(std::ios::oct, std::ios::basefield);
+      intConversion = true;
+      break;
+    case 'X':
+      out.setf(std::ios::uppercase);  // deliberate fall through to 'x'
+    case 'x':
+    case 'p':
+      out.setf(std::ios::hex, std::ios::basefield);
+      intConversion = true;
+      break;
+    case 'E':
+      out.setf(std::ios::uppercase);  // deliberate fall through to 'e'
+    case 'e':
+      out.setf(std::ios::scientific, std::ios::floatfield);
+      out.setf(std::ios::dec, std::ios::basefield);
+      break;
+    case 'F':
+      out.setf(std::ios::uppercase);  // deliberate fall through to 'f'
+    case 'f':
+      out.setf(std::ios::fixed, std::ios::floatfield);
+      break;
+    case 'G':
+      out.setf(std::ios::uppercase);  // deliberate fall through to 'g'
+    case 'g':
+      out.setf(std::ios::dec, std::ios::basefield);
+      // As in boost::format, let stream decide float format.
+      out.flags(out.flags() & ~std::ios::floatfield);
+      break;
+    case 'a':
+    case 'A':
+      TINYFORMAT_ERROR(
+          "tinyformat: the %a and %A conversion specs "
+          "are not supported");
+      break;
+    case 'c':
+      // Handled as special case inside formatValue()
+      break;
+    case 's':
+      if (precisionSet) ntrunc = static_cast<int>(out.precision());
+      // Make %s print booleans as "true" and "false"
+      out.setf(std::ios::boolalpha);
+      break;
+    case 'n':
+      // Not supported - will cause problems!
+      TINYFORMAT_ERROR("tinyformat: %n conversion spec not supported");
+      break;
+    case '\0':
+      TINYFORMAT_ERROR(
+          "tinyformat: Conversion spec incorrectly "
+          "terminated by end of string");
+      return c;
+    default:
+      break;
+  }
+  if (intConversion && precisionSet && !widthSet) {
+    // "precision" for integers gives the minimum number of digits (to be
+    // padded with zeros on the left).  This isn't really supported by the
+    // iostreams, but we can approximately simulate it with the width if
+    // the width isn't otherwise used.
+    out.width(out.precision() + widthExtra);
+    out.setf(std::ios::internal, std::ios::adjustfield);
+    out.fill('0');
+  }
+  return c + 1;
+}
+
+//------------------------------------------------------------------------------
+// Expand fmt with the type-erased arguments and write the result to out.
+// The stream's formatting state is saved first and restored before returning.
+inline void formatImpl(std::ostream &out,
+                       const char *fmt,
+                       const detail::FormatArg *formatters,
+                       int numFormatters) {
+  // Saved stream state
+  std::streamsize origWidth = out.width();
+  std::streamsize origPrecision = out.precision();
+  std::ios::fmtflags origFlags = out.flags();
+  char origFill = out.fill();
+  bool argsExhausted = false;
+
+  for (int argIndex = 0; argIndex < numFormatters; ++argIndex) {
+    fmt = printFormatStringLiteral(out, fmt);
+    bool spacePadPositive = false;
+    int ntrunc = -1;
+    const char *fmtEnd = streamStateFromFormat(out,
+                                               spacePadPositive,
+                                               ntrunc,
+                                               fmt,
+                                               formatters,
+                                               argIndex,
+                                               numFormatters);
+    if (argIndex >= numFormatters) {
+      // '*' width/precision used up the args; break so state is restored.
+      TINYFORMAT_ERROR("tinyformat: Not enough format arguments");
+      argsExhausted = true;
+      break;
+    }
+    const FormatArg &arg = formatters[argIndex];
+    if (!spacePadPositive) {
+      arg.format(out, fmt, fmtEnd, ntrunc);
+    } else {
+      // The ' ' flag has no iostream equivalent: format with showpos into a
+      // temporary stream, then turn every '+' into a space.
+      std::ostringstream tmpStream;
+      tmpStream.copyfmt(out);
+      tmpStream.setf(std::ios::showpos);
+      arg.format(tmpStream, fmt, fmtEnd, ntrunc);
+      std::string result = tmpStream.str();  // allocates... yuck.
+      std::replace(result.begin(), result.end(), '+', ' ');
+      out << result;
+    }
+    fmt = fmtEnd;
+  }
+
+  if (!argsExhausted) {
+    // Print remaining part of format string.
+    fmt = printFormatStringLiteral(out, fmt);
+    if (*fmt != '\0')
+      TINYFORMAT_ERROR(
+          "tinyformat: Too many conversion specifiers in format string");
+  }
+  // Restore stream state
+  out.width(origWidth);
+  out.precision(origPrecision);
+  out.flags(origFlags);
+  out.fill(origFill);
+}
+
+}  // namespace detail
+
+/// List of template arguments format(), held in a type-opaque way.
+///
+/// A const reference to FormatList (typedef'd as FormatListRef) may be
+/// conveniently used to pass arguments to non-template functions: All type
+/// information has been stripped from the arguments, leaving just enough of a
+/// common interface to perform formatting as required.
+class FormatList {
+public:
+  FormatList(detail::FormatArg *formatters, int N)
+      : m_formatters(formatters), m_N(N) {}
+
+  friend void vformat(std::ostream &out,
+                      const char *fmt,
+                      const FormatList &list);
+
+private:
+  const detail::FormatArg *m_formatters;
+  int m_N;
+};
+
+/// Reference to type-opaque format list for passing to vformat()
+typedef const FormatList &FormatListRef;
+
+namespace detail {
+
+// Format list subclass with fixed storage to avoid dynamic allocation
+template <int N>
+class FormatListN : public FormatList {
+public:
+  template <typename... Args>
+  FormatListN(const Args &... args)
+      : FormatList(&m_formatterStore[0], N),
+        m_formatterStore{FormatArg(args)...} {
+    static_assert(sizeof...(args) == N, "Number of args must be N");
+  }
+
+private:
+  FormatArg m_formatterStore[N];
+};
+
+// Special 0-arg version - MSVC says zero-sized C array in struct is nonstandard
+template <>
+class FormatListN<0> : public FormatList {
+public:
+  FormatListN() : FormatList(0, 0) {}
+};
+
+}  // namespace detail
+
+//------------------------------------------------------------------------------
+// Primary API functions
+
+/// Make type-agnostic format list from list of template arguments.
+///
+/// The exact return type of this function is an implementation detail and
+/// shouldn't be relied upon.  Instead it should be stored as a FormatListRef:
+///
+///   FormatListRef formatList = makeFormatList( /*...*/ );
+template <typename... Args>
+detail::FormatListN<sizeof...(Args)> makeFormatList(const Args &... args) {
+  return detail::FormatListN<sizeof...(args)>(args...);
+}
+
+/// Format list of arguments to the stream according to the given format string.
+///
+/// The name vformat() is chosen for the semantic similarity to vprintf(): the
+/// list of format arguments is held in a single function argument.
+inline void vformat(std::ostream &out, const char *fmt, FormatListRef list) {
+  detail::formatImpl(out, fmt, list.m_formatters, list.m_N);
+}
+
+/// Format list of arguments to the stream according to given format string.
+template <typename... Args>
+void format(std::ostream &out, const char *fmt, const Args &... args) {
+  vformat(out, fmt, makeFormatList(args...));
+}
+
+/// Format list of arguments according to the given format string and return
+/// the result as a string.
+template <typename... Args>
+std::string format(const char *fmt, const Args &... args) {
+  std::ostringstream oss;
+  format(oss, fmt, args...);
+  return oss.str();
+}
+
+/// Format list of arguments to std::cout, according to the given format string
+template <typename... Args>
+void printf(const char *fmt, const Args &... args) {
+  format(std::cout, fmt, args...);
+}
+
+template <typename... Args>
+void printfln(const char *fmt, const Args &... args) {
+  format(std::cout, fmt, args...);
+  std::cout << '\n';
+}
+
+}  // namespace tinyformat
+}  // namespace string
+}  // namespace paddle

From 098f956c098f5980a231575ed663b4704b6df5c5 Mon Sep 17 00:00:00 2001
From: Yi Wang <yiwang01@baidu.com>
Date: Wed, 28 Jun 2017 17:36:32 -0700
Subject: [PATCH 48/86] Correct function implementation dependencies

---
 paddle/string/printf.h | 14 ++++----------
 1 file changed, 4 insertions(+), 10 deletions(-)

diff --git a/paddle/string/printf.h b/paddle/string/printf.h
index 0767f8f5b5..8b5ce63a8e 100644
--- a/paddle/string/printf.h
+++ b/paddle/string/printf.h
@@ -80,25 +80,19 @@ namespace string {
 
 template <typename... Args>
 void Fprintf(std::ostream& out, const char* fmt, const Args&... args) {
-  tinyformat::vformat(out, fmt, makeFormatList(args...));
+  tinyformat::vformat(out, fmt, tinyformat::makeFormatList(args...));
 }
 
 template <typename... Args>
 std::string Sprintf(const char* fmt, const Args&... args) {
   std::ostringstream oss;
-  tinyformat::format(oss, fmt, args...);
+  Fprintf(oss, fmt, args...);
   return oss.str();
 }
 
 template <typename... Args>
-void printf(const char* fmt, const Args&... args) {
-  tinyformat::format(std::cout, fmt, args...);
-}
-
-template <typename... Args>
-void printfln(const char* fmt, const Args&... args) {
-  tinyformat::format(std::cout, fmt, args...);
-  std::cout << '\n';
+void Printf(const char* fmt, const Args&... args) {
+  Fprintf(std::cout, fmt, args...);
 }
 
 }  // namespace string

From 5157ba692d53657c96f41c0a380219fe7a7a6b5a Mon Sep 17 00:00:00 2001
From: Helin Wang <ustc.harry@gmail.com>
Date: Wed, 28 Jun 2017 20:25:56 +0000
Subject: [PATCH 49/86] create save model design doc

---
 doc/design/cluster_train/save_model.md | 100 +++++++++++++++++++++++++
 1 file changed, 100 insertions(+)
 create mode 100644 doc/design/cluster_train/save_model.md

diff --git a/doc/design/cluster_train/save_model.md b/doc/design/cluster_train/save_model.md
new file mode 100644
index 0000000000..3a9a24fb9c
--- /dev/null
+++ b/doc/design/cluster_train/save_model.md
@@ -0,0 +1,100 @@
+# Design Doc: Save Model
+
+## Overview
+
+The model is the output of the training process. There are two
+ways in which a user can obtain a model:
+
+- Save model triggered by user code: user code asks PaddlePaddle to
+  save a model.
+- Convert model from the snapshot: the model is converted from the
+  pservers' periodic snapshot. In this way, the user can cancel a job
+  at any time, and still have a relatively fresh model (we snapshot
+  around every 5 minutes).
+
+### Save Model Triggered by User Code
+
+Both trainers and pservers have access to the model. So the model can
+be saved from a trainer or pservers. We need to decide on where the
+model is saved from.
+
+#### Dense Model vs. Sparse Model
+
+There are two types of model: dense models and sparse models (when
+the parameter is configured to be sparse). Pservers always jointly
+have the entire model at any given time. Trainers have the entire
+dense model, but only a fraction of the sparse model, at any given
+time.
+
+#### Pservers Saving Model
+
+The benefit of letting pservers save the model is that they have the
+entire model all the time. However, since pservers are on different
+nodes, a merging process is required to merge the model shards into a
+single model. This in turn requires the pservers to write models to a
+distributed filesystem, making the shards visible to the merge program.
+
+#### Trainer Saving Model
+
+The benefit of letting one trainer save the model is that it does not
+require a distributed filesystem. It also reuses the same save-model
+logic as when the trainer is training locally - except that when
+training a sparse model, the trainer needs to download the entire
+sparse model during the saving process.
+
+#### Conclusion
+
+Given that letting a trainer save the model does not require a
+distributed filesystem, and is an intuitive extension of training
+locally, we decide to let the trainer save the model.
+
+
+### Convert Model from Snapshot
+
+TODO
+
+
+## Timeline
+
+We will first implement saving the model from a trainer. Converting the
+latest snapshot to a model is left as a TODO for the future.
+
+
+## Trainer Save Model
+
+### Trainer Election
+
+One trainer will be elected as the one to save the model. When using
+etcd, the trainer ID is a randomly generated UUID, and we will use etcd
+to elect one trainer. When not using etcd, unique trainer IDs will be
+given by the administrator, and the trainer whose ID is "0" is elected
+to save the model.
+
+### Model Save Path
+
+Each trainer will be given the directory to save the model. The
+elected trainer will save the model to
+`given-directory/trainerID`. Since the trainerID is unique, this
+prevents concurrent saves to the same file if multiple trainers are
+elected to save the model when a split-brain problem happens.
+
+### What Happens When Model Is Saving
+
+It takes some time to save a model, so we need to define what happens
+while the model is being saved.
+
+When saving a dense model, the trainer uses the local model. Pservers
+do not need to pause model updates.
+
+When saving a sparse model, the trainer needs to download the entire
+sparse model while saving. To get the most accurate model, the model
+update needs to be paused before the download starts and resumed after
+the download finishes. Otherwise, the trainer gets a model that is
+"polluted": some parts of the model are old, and some parts are
+new.
+
+It's unclear whether the "polluted" model will be inferior, due to the
+stochastic nature of deep learning, and pausing the model update will
+add more complexity to the system. Since supporting sparse models is a
+TODO item, we defer to the future the evaluation of whether to pause
+the model update while saving the model.

From 48fdfd5a0350fea69d070716bd9c74ff56ebbc24 Mon Sep 17 00:00:00 2001
From: gongweibao <weibao.gong@gmail.com>
Date: Thu, 29 Jun 2017 09:40:29 +0800
Subject: [PATCH 50/86] fix bugs

---
 python/paddle/v2/dataset/uci_housing.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/paddle/v2/dataset/uci_housing.py b/python/paddle/v2/dataset/uci_housing.py
index c715ea9681..ec10ce646e 100644
--- a/python/paddle/v2/dataset/uci_housing.py
+++ b/python/paddle/v2/dataset/uci_housing.py
@@ -14,7 +14,7 @@
 """
 UCI Housing dataset.
 
-This module will paddle.v2.dataset.common.download dataset from
+This module will download dataset from
 https://archive.ics.uci.edu/ml/machine-learning-databases/housing/ and
 parse training set and test set into paddle reader creators.
 """

From c5e8e27ba6783a947965900931ed41c9ef2123fb Mon Sep 17 00:00:00 2001
From: dongzhihong <dzhwinter@gmail.com>
Date: Thu, 29 Jun 2017 10:02:20 +0800
Subject: [PATCH 51/86] "change empty gradient to error"

---
 go/pserver/client.go | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/go/pserver/client.go b/go/pserver/client.go
index a8d2d710d8..6938b9d5ce 100644
--- a/go/pserver/client.go
+++ b/go/pserver/client.go
@@ -1,6 +1,7 @@
 package pserver
 
 import (
+	"errors"
 	"hash/fnv"
 	"sort"
 	"time"
@@ -124,8 +125,7 @@ func (c *Client) FinishInitParams() error {
 // parameters.
 func (c *Client) SendGrads(grads []Gradient) error {
 	if len(grads) == 0 {
-		log.Info("Send Empty Gradient")
-		return nil
+		return errors.New("no gradient received")
 	}
 	errCh := make(chan error, len(grads))
 	for _, g := range grads {

From 11a8dfe78e1626a6535a1d0ba8220c2dd3fa050c Mon Sep 17 00:00:00 2001
From: dangqingqing <dangqingqing@baidu.com>
Date: Thu, 29 Jun 2017 10:57:40 +0800
Subject: [PATCH 52/86] Use Parameters.from_tar for static method.

---
 python/paddle/v2/parameters.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/paddle/v2/parameters.py b/python/paddle/v2/parameters.py
index f730ea10bb..4c4ff4c7c2 100644
--- a/python/paddle/v2/parameters.py
+++ b/python/paddle/v2/parameters.py
@@ -300,7 +300,7 @@ class Parameters(object):
         return params
 
     def init_from_tar(self, f):
-        tar_param = self.from_tar(f)
+        tar_param = Parameters.from_tar(f)
         for pname in tar_param.names():
             if pname in self.names():
                 self.set(pname, tar_param.get(pname))

From f0a3fb6e36e06512d537068ecd7c5f553a88da83 Mon Sep 17 00:00:00 2001
From: Yu Yang <yuyang18@baidu.com>
Date: Thu, 29 Jun 2017 11:10:59 +0800
Subject: [PATCH 53/86] Using paddle::string in enforce

---
 paddle/framework/CMakeLists.txt               |  3 +-
 paddle/{platform => framework}/enforce.h      | 63 +++----------------
 .../{platform => framework}/enforce_test.cc   |  9 ++-
 paddle/platform/CMakeLists.txt                |  1 -
 4 files changed, 14 insertions(+), 62 deletions(-)
 rename paddle/{platform => framework}/enforce.h (60%)
 rename paddle/{platform => framework}/enforce_test.cc (82%)

diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt
index e3c3155aa9..b06ecc2628 100644
--- a/paddle/framework/CMakeLists.txt
+++ b/paddle/framework/CMakeLists.txt
@@ -1,6 +1,5 @@
 cc_library(ddim SRCS ddim.cc)
 cc_test(ddim_test SRCS ddim_test.cc DEPS ddim)
-
 nv_test(dim_test SRCS dim_test.cu DEPS ddim)
-
 cc_test(variable_test SRCS variable_test.cc)
+cc_test(enforce_test SRCS enforce_test.cc)
diff --git a/paddle/platform/enforce.h b/paddle/framework/enforce.h
similarity index 60%
rename from paddle/platform/enforce.h
rename to paddle/framework/enforce.h
index fbd3405a24..56cb7f9564 100644
--- a/paddle/platform/enforce.h
+++ b/paddle/framework/enforce.h
@@ -10,11 +10,12 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
+#include <paddle/string/printf.h>
 #include <exception>
 #include <sstream>
 
 namespace paddle {
-namespace platform {
+namespace framework {
 
 /**
  * @brief Enforce exception. Inherits std::exception
@@ -23,10 +24,9 @@ namespace platform {
  */
 class EnforceNotMet : public std::exception {
  public:
-  EnforceNotMet(const std::string& msg, const char* file, int fileline)
-      : file_(file), fileline_(fileline) {
+  EnforceNotMet(const std::string& msg, const char* file, int fileline) {
     std::ostringstream sout;
-    sout << msg << " at [" << file_ << ":" << fileline_ << "];";
+    sout << msg << " at [" << file << ":" << fileline << "];";
     all_msg_ = sout.str();
   }
 
@@ -34,52 +34,8 @@ class EnforceNotMet : public std::exception {
 
  private:
   std::string all_msg_;
-  const char* file_;
-  int fileline_;
 };
 
-namespace details {
-
-inline void MakeStringInternal(std::ostringstream& stream) {}
-
-template <typename T>
-inline void MakeStringInternal(std::ostringstream& stream, T v) {
-  stream << v;
-}
-
-template <typename T, typename... ARGS>
-inline void MakeStringInternal(std::ostringstream& stream, T v, ARGS... args) {
-  MakeStringInternal(stream, v);
-  MakeStringInternal(stream, args...);
-};
-
-/**
- * @brief Make string will concat all args into a string.
- */
-template <typename... ARGS>
-inline std::string MakeString(ARGS... args) {
-  std::ostringstream sout;
-  details::MakeStringInternal(sout, args...);
-  return sout.str();
-}
-
-/**
- * @brief special handle string
- */
-template <>
-inline std::string MakeString<std::string>(std::string str) {
-  return str;
-}
-
-/**
- * @brief special handle const char*
- */
-template <>
-inline std::string MakeString<const char*>(const char* str) {
-  return std::string(str);
-}
-}  // namespace details
-
 // From https://stackoverflow.com/questions/30130930/
 // __buildin_expect is in C++ 11 standard. Since the condition which enforced
 // should be true in most situation, it will make the compiler generate faster
@@ -93,11 +49,10 @@ inline std::string MakeString<const char*>(const char* str) {
  * This macro take __VA_ARGS__, user can pass any type if that type can
  * serialize to std::ostream
  */
-#define PADDLE_THROW(...)                                               \
-  do {                                                                  \
-    throw ::paddle::platform::EnforceNotMet(                            \
-        ::paddle::platform::details::MakeString(__VA_ARGS__), __FILE__, \
-        __LINE__);                                                      \
+#define PADDLE_THROW(...)                                            \
+  do {                                                               \
+    throw ::paddle::framework::EnforceNotMet(                        \
+        ::paddle::string::Sprintf(__VA_ARGS__), __FILE__, __LINE__); \
   } while (0)
 
 /**
@@ -110,5 +65,5 @@ inline std::string MakeString<const char*>(const char* str) {
     }                                  \
   } while (0)
 
-}  // namespace platform
+}  // namespace framework
 }  // namespace paddle
diff --git a/paddle/platform/enforce_test.cc b/paddle/framework/enforce_test.cc
similarity index 82%
rename from paddle/platform/enforce_test.cc
rename to paddle/framework/enforce_test.cc
index 23b32444ad..f8da1a192f 100644
--- a/paddle/platform/enforce_test.cc
+++ b/paddle/framework/enforce_test.cc
@@ -10,10 +10,10 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <gtest/gtest.h>
-#include <paddle/platform/enforce.h>
+#include <paddle/framework/enforce.h>
 
 TEST(ENFORCE, OK) {
-  PADDLE_ENFORCE(true, "Enforce is ok", 123, "now", 0.345);
+  PADDLE_ENFORCE(true, "Enforce is ok %d now %f", 123, 0.345);
   size_t val = 1;
   const size_t limit = 10;
   PADDLE_ENFORCE(val < limit, "Enforce is OK too");
@@ -22,8 +22,8 @@ TEST(ENFORCE, OK) {
 TEST(ENFORCE, FAILED) {
   bool in_catch = false;
   try {
-    PADDLE_ENFORCE(false, "Enforce is not ok ", 123, " at all");
-  } catch (paddle::platform::EnforceNotMet err) {
+    PADDLE_ENFORCE(false, "Enforce is not ok %d at all", 123);
+  } catch (paddle::framework::EnforceNotMet err) {
     in_catch = true;
     std::string msg = "Enforce is not ok 123 at all";
     const char* what = err.what();
@@ -31,6 +31,5 @@ TEST(ENFORCE, FAILED) {
       ASSERT_EQ(what[i], msg[i]);
     }
   }
-
   ASSERT_TRUE(in_catch);
 }
\ No newline at end of file
diff --git a/paddle/platform/CMakeLists.txt b/paddle/platform/CMakeLists.txt
index bc72e62be4..c7d7b14518 100644
--- a/paddle/platform/CMakeLists.txt
+++ b/paddle/platform/CMakeLists.txt
@@ -2,4 +2,3 @@ nv_test(cuda_test SRCS cuda_test.cu)
 
 cc_library(place SRCS place.cc)
 cc_test(place_test SRCS place_test.cc DEPS place glog gflags)
-cc_test(enforce_test SRCS enforce_test.cc)

From aabe171566ca455925dd5a597497106e240656cb Mon Sep 17 00:00:00 2001
From: qiaolongfei <qiaolongfei@baidu.com>
Date: Thu, 29 Jun 2017 11:32:25 +0800
Subject: [PATCH 54/86] merge CreateVar and GetOrCreateVar

---
 paddle/framework/scope.h       | 21 +++++++--------------
 paddle/framework/scope_test.cc | 13 +++++++++----
 2 files changed, 16 insertions(+), 18 deletions(-)

diff --git a/paddle/framework/scope.h b/paddle/framework/scope.h
index bb22c4b834..88a13145ca 100644
--- a/paddle/framework/scope.h
+++ b/paddle/framework/scope.h
@@ -19,7 +19,6 @@ limitations under the License. */
 #include <vector>
 
 #include "paddle/framework/variable.h"
-#include "paddle/platform/assert.h"
 
 namespace paddle {
 namespace framework {
@@ -44,9 +43,13 @@ class Scope {
   /// Create Variable in this Scope. Failed if Variable already been
   /// created.
   Variable* CreateVariable(const std::string& name) {
-    PADDLE_ASSERT(!HasVariable(name));
-    vars_[name] = std::unique_ptr<Variable>(new Variable());
-    return GetVariable(name);
+    auto var = GetVariable(name);
+    if (var) {
+      return var;
+    } else {
+      vars_[name] = std::unique_ptr<Variable>(new Variable());
+      return GetVariable(name);
+    }
   }
 
   /// Get Variable from this Scope, this function will recursive find Variable
@@ -62,16 +65,6 @@ class Scope {
     }
   }
 
-  /// Get Variable from scope, if Variable is not exist, creat one and return.
-  Variable* GetOrCreateVariable(const std::string& name) {
-    auto var = GetVariable(name);
-    if (var) {
-      return var;
-    } else {
-      return CreateVariable(name);
-    }
-  }
-
   /// Find if there is a Variable in this scope and it's parent scope
   bool HasVariable(const std::string& name) const {
     return (vars_.find(name) != vars_.end() ||
diff --git a/paddle/framework/scope_test.cc b/paddle/framework/scope_test.cc
index d73391d977..ec6236ec62 100644
--- a/paddle/framework/scope_test.cc
+++ b/paddle/framework/scope_test.cc
@@ -24,18 +24,22 @@ TEST(Scope, Create) {
   Variable* var0 = scope->CreateVariable("");
   EXPECT_NE(var0, nullptr);
 
+  /// GetVariable will return nullptr if the variable does not exist.
   Variable* var1 = scope->GetVariable("a");
   EXPECT_EQ(var1, nullptr);
 
+  /// CreateVariable will return one.
   Variable* var2 = scope->CreateVariable("a");
+  EXPECT_NE(var2, nullptr);
 
-  ASSERT_DEATH({ scope->CreateVariable("a"); }, "");
-
+  /// Get the created variable.
   Variable* var3 = scope->GetVariable("a");
   EXPECT_EQ(var2, var3);
 
-  Variable* var4 = scope->GetOrCreateVariable("a");
-  EXPECT_EQ(var2, var4);
+  /// CreateVariable will just return the variable if it
+  /// already exists.
+  Variable* var4 = scope->CreateVariable("a");
+  EXPECT_EQ(var4, var2);
 }
 
 TEST(Scope, Parent) {
@@ -48,6 +52,7 @@ TEST(Scope, Parent) {
   Variable* var0 = parent_scope->CreateVariable("a");
   EXPECT_NE(var0, nullptr);
 
+  /// GetVariable will get Variable from parent scope if exist.
   Variable* var1 = scope->GetVariable("a");
   EXPECT_EQ(var0, var1);
 }

From 456f9cc89f6ec5a80e08522ff1fafb8f20a21fa6 Mon Sep 17 00:00:00 2001
From: Yu Yang <yuyang18@baidu.com>
Date: Thu, 29 Jun 2017 12:11:03 +0800
Subject: [PATCH 55/86] Remove Python protobuf function

---
 cmake/generic.cmake  | 23 -----------------------
 proto/CMakeLists.txt | 19 ++++++++++++++++++-
 2 files changed, 18 insertions(+), 24 deletions(-)

diff --git a/cmake/generic.cmake b/cmake/generic.cmake
index 24a07c0a24..8736d30059 100644
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -357,26 +357,3 @@ function(pb_cc_library TARGET_NAME)
   include_directories(${CMAKE_CURRENT_BINARY_DIR})
   cc_library(${TARGET_NAME} SRCS ${proto_srcs})
 endfunction()
-
-function(pb_py_library TARGET_NAME)
-  set(oneValueArgs TARGET_DIR)
-  set(multiValueArgs SRCS)
-  cmake_parse_arguments(pb_py_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
-  if (NOT pb_py_library_TARGET_DIR)
-    set(pb_py_library_TARGET_DIR ${CMAKE_CURRENT_BINARY_DIR})
-  endif()
-  set(py_srcs)
-  foreach(FIL ${pb_py_library_SRCS})
-    get_filename_component(ABS_FIL ${FIL} ABSOLUTE)
-    get_filename_component(FIL_WE ${FIL} NAME_WE)
-    set(cur_py_src ${pb_py_library_TARGET_DIR}/${FIL_WE}_pb2.py)
-    list(APPEND py_srcs "${cur_py_src}")
-    add_custom_command(OUTPUT ${cur_py_src}
-            COMMAND ${PROTOBUF_PROTOC_EXECUTABLE}
-            ARGS "--python_out=${pb_py_library_TARGET_DIR}" "-I" ${CMAKE_CURRENT_SOURCE_DIR} ${ABS_FIL}
-            DEPENDS ${ABS_FIL} protoc
-            COMMENT "Running Python protocol buffer compiler on ${FIL}")
-  endforeach()
-
-  add_custom_target(${TARGET_NAME} ALL DEPENDS ${py_srcs})
-endfunction()
\ No newline at end of file
diff --git a/proto/CMakeLists.txt b/proto/CMakeLists.txt
index 4402f2c899..1cf39d6944 100644
--- a/proto/CMakeLists.txt
+++ b/proto/CMakeLists.txt
@@ -1,3 +1,20 @@
 file(GLOB proto_filenames . *.proto)
 pb_cc_library(paddle_proto SRCS ${proto_filenames})
-pb_py_library(gen_proto_py SRCS ${proto_filenames} TARGET_DIR ${PROJ_ROOT}/python/paddle/proto)
+
+set(PROTO_GEN)
+set(PROTO_GEN_PY)
+
+foreach(filename ${proto_filenames})
+    get_filename_component(base_filename ${filename} NAME_WE)
+    set(CUR_PROTO_GEN_PY
+            ${PROJ_ROOT}/paddle/python/paddle/proto/${base_filename}_pb2.py)
+    set(PROTO_GEN_PY
+            ${CUR_PROTO_GEN_PY}
+            ${PROTO_GEN_PY})
+    add_custom_command(OUTPUT ${CUR_PROTO_GEN_PY}
+            COMMAND env ${py_env} ${PROTOBUF_PROTOC_EXECUTABLE} --python_out ${PROJ_ROOT}/python/paddle/proto
+            --proto_path ${PROJ_ROOT}/proto ${PROJ_ROOT}/proto/${filename}
+            DEPENDS ${filename} ${external_project_dependencies})
+endforeach()
+
+add_custom_target(gen_proto_py ALL DEPENDS ${PROTO_GEN_PY})

From 32d6587242e5a2e97fe5b9e675273fa96fd99c5a Mon Sep 17 00:00:00 2001
From: Yu Yang <yuyang18@baidu.com>
Date: Thu, 29 Jun 2017 12:28:48 +0800
Subject: [PATCH 56/86] Use protobuf_generate_cpp

---
 cmake/external/protobuf.cmake |  7 +++++++
 cmake/generic.cmake           | 17 +----------------
 2 files changed, 8 insertions(+), 16 deletions(-)

diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake
index 891fb29118..2f267adc20 100644
--- a/cmake/external/protobuf.cmake
+++ b/cmake/external/protobuf.cmake
@@ -13,6 +13,10 @@
 # limitations under the License.
 
 INCLUDE(ExternalProject)
+# Always invoke `FIND_PACKAGE(Protobuf)` for importing function protobuf_generate_cpp
+FIND_PACKAGE(Protobuf QUIET)
+SET(PROTOBUF_FOUND "OFF")
+
 
 # Print and set the protobuf library information,
 # finish this cmake process and exit from this file.
@@ -44,6 +48,9 @@ macro(PROMPT_PROTOBUF_LIB)
 
     ADD_EXECUTABLE(protoc IMPORTED GLOBAL)
     SET_PROPERTY(TARGET protoc PROPERTY IMPORTED_LOCATION ${PROTOBUF_PROTOC_EXECUTABLE})
+    # FIND_Protobuf.cmake uses `Protobuf_PROTOC_EXECUTABLE`.
+    # make `protobuf_generate_cpp` happy.
+    SET(Protobuf_PROTOC_EXECUTABLE ${PROTOBUF_PROTOC_EXECUTABLE})
 
     FOREACH(dep ${protobuf_DEPS})
         ADD_DEPENDENCIES(protobuf ${dep})
diff --git a/cmake/generic.cmake b/cmake/generic.cmake
index 8736d30059..cdf917a1e9 100644
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -338,22 +338,7 @@ function(pb_cc_library TARGET_NAME)
   cmake_parse_arguments(pb_cc_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
   set(proto_srcs)
   set(proto_hdrs)
-  foreach(FIL ${pb_cc_library_SRCS})
-    get_filename_component(ABS_FIL ${FIL} ABSOLUTE)
-    get_filename_component(FIL_WE ${FIL} NAME_WE)
-    list(APPEND proto_srcs "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}.pb.cc")
-    list(APPEND proto_hdrs "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}.pb.h")
-
-    add_custom_command(
-            OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}.pb.cc"
-            "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}.pb.h"
-            COMMAND  ${PROTOBUF_PROTOC_EXECUTABLE}
-            ARGS "--cpp_out=${DLL_EXPORT_DECL}${CMAKE_CURRENT_BINARY_DIR}" "-I" ${CMAKE_CURRENT_SOURCE_DIR} ${ABS_FIL}
-            DEPENDS ${ABS_FIL} protoc
-            COMMENT "Running C++ protocol buffer compiler on ${FIL}"
-            VERBATIM )
-  endforeach()
-  set_source_files_properties(${proto_srcs} ${proto_hdrs} PROPERTIES GENERATED TRUE)
+  protobuf_generate_cpp(proto_srcs proto_hdrs ${pb_cc_library_SRCS})
   include_directories(${CMAKE_CURRENT_BINARY_DIR})
   cc_library(${TARGET_NAME} SRCS ${proto_srcs})
 endfunction()

From 23d6c594eca369820b5f4dfcd0a38a9f4cd6122e Mon Sep 17 00:00:00 2001
From: dangqingqing <dangqingqing@baidu.com>
Date: Thu, 29 Jun 2017 12:33:07 +0800
Subject: [PATCH 57/86] add comments

---
 python/paddle/v2/parameters.py | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/python/paddle/v2/parameters.py b/python/paddle/v2/parameters.py
index 4c4ff4c7c2..bbaf8bfa97 100644
--- a/python/paddle/v2/parameters.py
+++ b/python/paddle/v2/parameters.py
@@ -284,6 +284,18 @@ class Parameters(object):
 
     @staticmethod
     def from_tar(f):
+        """
+        Create a `Parameters` object from the given file. The
+        returned `Parameters` only contains the parameters in this
+        file. It is suitable when the parameters in the defined
+        network and the given file are the same. For example, it
+        can be used for inference.
+
+        :param f: the initialized model file.
+        :type f: tar file
+        :return: A Parameters object.
+        :rtype: Parameters.
+        """
         params = Parameters()
         tar = tarfile.TarFile(fileobj=f, mode='r')
         for finfo in tar:
@@ -300,6 +312,15 @@ class Parameters(object):
         return params
 
     def init_from_tar(self, f):
+        """
+        Different from `from_tar`, this interface can be used to
+        init partial network parameters from another saved model.
+
+        :param f: the initialized model file.
+        :type f: tar file
+        :return: Nothing.
+        """
+
         tar_param = Parameters.from_tar(f)
         for pname in tar_param.names():
             if pname in self.names():

From b88ca542baa4bb5e5822912dceeb4ceb2080c660 Mon Sep 17 00:00:00 2001
From: Yu Yang <yuyang18@baidu.com>
Date: Thu, 29 Jun 2017 12:43:51 +0800
Subject: [PATCH 58/86] Rename pb_cc_library -> proto_library

---
 cmake/generic.cmake  | 6 +++---
 proto/CMakeLists.txt | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/cmake/generic.cmake b/cmake/generic.cmake
index cdf917a1e9..779f627115 100644
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -332,13 +332,13 @@ function(go_test TARGET_NAME)
   add_test(${TARGET_NAME} ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME})
 endfunction(go_test)
 
-function(pb_cc_library TARGET_NAME)
+function(proto_library TARGET_NAME)
   set(oneValueArgs "")
   set(multiValueArgs SRCS)
-  cmake_parse_arguments(pb_cc_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+  cmake_parse_arguments(proto_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
   set(proto_srcs)
   set(proto_hdrs)
-  protobuf_generate_cpp(proto_srcs proto_hdrs ${pb_cc_library_SRCS})
+  protobuf_generate_cpp(proto_srcs proto_hdrs ${proto_library_SRCS})
   include_directories(${CMAKE_CURRENT_BINARY_DIR})
   cc_library(${TARGET_NAME} SRCS ${proto_srcs})
 endfunction()
diff --git a/proto/CMakeLists.txt b/proto/CMakeLists.txt
index 1cf39d6944..436bea53e5 100644
--- a/proto/CMakeLists.txt
+++ b/proto/CMakeLists.txt
@@ -1,5 +1,5 @@
 file(GLOB proto_filenames . *.proto)
-pb_cc_library(paddle_proto SRCS ${proto_filenames})
+proto_library(paddle_proto SRCS ${proto_filenames})
 
 set(PROTO_GEN)
 set(PROTO_GEN_PY)

From 4a4ec31e0d2f07b5e29acfd1b5b0b62d40f7ab91 Mon Sep 17 00:00:00 2001
From: Yu Yang <yuyang18@baidu.com>
Date: Thu, 29 Jun 2017 12:53:10 +0800
Subject: [PATCH 59/86] Fix TravisCI

---
 paddle/cuda/CMakeLists.txt |  2 +-
 proto/CMakeLists.txt       | 12 +++++++-----
 2 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/paddle/cuda/CMakeLists.txt b/paddle/cuda/CMakeLists.txt
index f9061e96de..73ffa690d9 100755
--- a/paddle/cuda/CMakeLists.txt
+++ b/paddle/cuda/CMakeLists.txt
@@ -83,7 +83,7 @@ else()
                 ${CUDA_CXX_SOURCES})
 endif()
 
-add_dependencies(paddle_cuda ${external_project_dependencies})
+add_dependencies(paddle_cuda paddle_proto ${external_project_dependencies})
 
 add_style_check_target(paddle_cuda
                        ${CUDA_SOURCES}
diff --git a/proto/CMakeLists.txt b/proto/CMakeLists.txt
index 436bea53e5..70dd4d674c 100644
--- a/proto/CMakeLists.txt
+++ b/proto/CMakeLists.txt
@@ -5,16 +5,18 @@ set(PROTO_GEN)
 set(PROTO_GEN_PY)
 
 foreach(filename ${proto_filenames})
-    get_filename_component(base_filename ${filename} NAME_WE)
+    get_filename_component(ABS_FIL ${filename} ABSOLUTE)
+    get_filename_component(FIL_WE ${filename} NAME_WE)
     set(CUR_PROTO_GEN_PY
-            ${PROJ_ROOT}/paddle/python/paddle/proto/${base_filename}_pb2.py)
+            ${PROJ_ROOT}/paddle/python/paddle/proto/${FIL_WE}_pb2.py)
     set(PROTO_GEN_PY
             ${CUR_PROTO_GEN_PY}
             ${PROTO_GEN_PY})
     add_custom_command(OUTPUT ${CUR_PROTO_GEN_PY}
-            COMMAND env ${py_env} ${PROTOBUF_PROTOC_EXECUTABLE} --python_out ${PROJ_ROOT}/python/paddle/proto
-            --proto_path ${PROJ_ROOT}/proto ${PROJ_ROOT}/proto/${filename}
-            DEPENDS ${filename} ${external_project_dependencies})
+            COMMAND ${PROTOBUF_PROTOC_EXECUTABLE}
+            ARGS "--python_out=${PROJ_ROOT}/python/paddle/proto"
+            "-I" ${CMAKE_CURRENT_SOURCE_DIR} ${ABS_FIL}
+            DEPENDS ${ABS_FIL} ${external_project_dependencies})
 endforeach()
 
 add_custom_target(gen_proto_py ALL DEPENDS ${PROTO_GEN_PY})

From 9af8d86b7ceedbc244873ee5207392231bab540a Mon Sep 17 00:00:00 2001
From: Yancey <yancey1989@gmail.com>
Date: Thu, 29 Jun 2017 13:20:13 +0800
Subject: [PATCH 60/86] Trainer library discover master by etcd (#2551)

* add trainer library

* modify file name

* move trainer to master client

* update

* update

* modify monitor master to receive a chan

* update

* use etcd client from etcd_client.go

* update

* update

* remove etcd client without lock

* update

* update the comment

* update comments
---
 go/master/c/client.go             | 30 +++++++++++++++++++++++++-----
 go/master/client.go               | 24 ++++--------------------
 go/master/client_internal_test.go | 13 ++++---------
 go/master/client_test.go          |  8 +++-----
 go/master/etcd_client.go          | 28 ++++++++++++++++++++++++++++
 5 files changed, 64 insertions(+), 39 deletions(-)

diff --git a/go/master/c/client.go b/go/master/c/client.go
index b186474dc3..9e35e98600 100644
--- a/go/master/c/client.go
+++ b/go/master/c/client.go
@@ -13,10 +13,13 @@ typedef int paddle_master_client;
 import "C"
 
 import (
+	"strings"
 	"sync"
+	"time"
 	"unsafe"
 
 	"github.com/PaddlePaddle/Paddle/go/master"
+	"github.com/coreos/etcd/clientv3"
 	log "github.com/sirupsen/logrus"
 )
 
@@ -48,16 +51,33 @@ func remove(client C.paddle_master_client) *master.Client {
 	return h
 }
 
-type addresser string
-
-func (a addresser) Address() string {
-	return string(a)
+//export paddle_new_etcd_master_client
+func paddle_new_etcd_master_client(etcdEndpoints *C.char, timeout int, bufSize int) C.paddle_master_client {
+	p := C.GoString(etcdEndpoints)
+	cli, err := clientv3.New(clientv3.Config{
+		Endpoints:   strings.Split(p, ","),
+		DialTimeout: time.Second * time.Duration(timeout),
+	})
+	if err != nil {
+		panic(err)
+	}
+	ch := make(chan string, 1)
+	a, err := master.GetKey(cli, master.DefaultAddrPath, timeout)
+	if err != nil {
+		panic(err)
+	}
+	ch <- a
+	go master.WatchKey(cli, master.DefaultAddrPath, ch)
+	c := master.NewClient(ch, bufSize)
+	return add(c)
 }
 
 //export paddle_new_master_client
 func paddle_new_master_client(addr *C.char, bufSize int) C.paddle_master_client {
 	a := C.GoString(addr)
-	c := master.NewClient(addresser(a), bufSize)
+	ch := make(chan string, 1)
+	ch <- a
+	c := master.NewClient(ch, bufSize)
 	return add(c)
 }
 
diff --git a/go/master/client.go b/go/master/client.go
index 8451820c19..d3bea49d0a 100644
--- a/go/master/client.go
+++ b/go/master/client.go
@@ -2,18 +2,12 @@ package master
 
 import (
 	"os"
-	"time"
 
 	"github.com/PaddlePaddle/Paddle/go/connection"
 	"github.com/PaddlePaddle/recordio"
 	log "github.com/sirupsen/logrus"
 )
 
-// Addresser provide the address of the master server.
-type Addresser interface {
-	Address() string
-}
-
 // Client is the client of the master server.
 type Client struct {
 	conn *connection.Conn
@@ -24,11 +18,11 @@ type Client struct {
 //
 // bufSize is the record buffer size. NextRecord will read from this
 // buffer.
-func NewClient(addr Addresser, bufSize int) *Client {
+func NewClient(addrCh <-chan string, bufSize int) *Client {
 	c := &Client{}
 	c.conn = connection.New()
 	c.ch = make(chan []byte, bufSize)
-	go c.monitorMaster(addr)
+	go c.monitorMaster(addrCh)
 	go c.getRecords()
 	return c
 }
@@ -72,12 +66,10 @@ func (c *Client) getRecords() {
 	}
 }
 
-func (c *Client) monitorMaster(addr Addresser) {
+func (c *Client) monitorMaster(addrCh <-chan string) {
 	lastMaster := ""
-	monitor := func() {
-		// get the lastest address of the master server,
+	for curMaster := range addrCh {
 		// connect to the new address once address changed.
-		curMaster := addr.Address()
 		if curMaster != lastMaster {
 			if curMaster == "" {
 				err := c.conn.Close()
@@ -94,18 +86,10 @@ func (c *Client) monitorMaster(addr Addresser) {
 					// to retry next time.
 					curMaster = lastMaster
 				}
-
 			}
 		}
-
 		lastMaster = curMaster
 	}
-
-	monitor()
-	ticker := time.NewTicker(10 * time.Second)
-	for _ = range ticker.C {
-		monitor()
-	}
 }
 
 // SetDataset set dataset for the master server to dispatch.
diff --git a/go/master/client_internal_test.go b/go/master/client_internal_test.go
index 251225780a..364dce7b58 100644
--- a/go/master/client_internal_test.go
+++ b/go/master/client_internal_test.go
@@ -26,12 +26,6 @@ func init() {
 	log.SetLevel(log.ErrorLevel)
 }
 
-type TestAddresser string
-
-func (a TestAddresser) Address() string {
-	return string(a)
-}
-
 func TestGetFinishTask(t *testing.T) {
 	const path = "/tmp/master_client_test_0"
 
@@ -45,7 +39,6 @@ func TestGetFinishTask(t *testing.T) {
 	if err != nil {
 		panic(err)
 	}
-
 	go func(l net.Listener) {
 		s, err := NewService(&InMemStore{}, chunkPerTask, time.Second, 1)
 		if err != nil {
@@ -82,9 +75,11 @@ func TestGetFinishTask(t *testing.T) {
 	// Manually intialize client to avoid calling c.getRecords()
 	c := &Client{}
 	c.conn = connection.New()
-	go c.monitorMaster(TestAddresser(fmt.Sprintf(":%d", p)))
+	addr := fmt.Sprintf(":%d", p)
+	ch := make(chan string, 1)
+	ch <- addr
+	go c.monitorMaster(ch)
 	c.SetDataset([]string{path})
-
 	checkOnePass := func(i int) {
 		var tasks []Task
 		for idx := 0; idx < totalTask; idx++ {
diff --git a/go/master/client_test.go b/go/master/client_test.go
index 85a86761c2..c00aeebfd5 100644
--- a/go/master/client_test.go
+++ b/go/master/client_test.go
@@ -20,7 +20,6 @@ func TestNextRecord(t *testing.T) {
 		path  = "/tmp/master_client_TestFull"
 		total = 50
 	)
-
 	l, err := net.Listen("tcp", ":0")
 	if err != nil {
 		panic(err)
@@ -31,7 +30,6 @@ func TestNextRecord(t *testing.T) {
 	if err != nil {
 		panic(err)
 	}
-
 	go func(l net.Listener) {
 		s, err := master.NewService(&master.InMemStore{}, 10, time.Second, 1)
 		if err != nil {
@@ -63,10 +61,10 @@ func TestNextRecord(t *testing.T) {
 	}
 	w.Close()
 	f.Close()
-
-	c := master.NewClient(master.TestAddresser(fmt.Sprintf(":%d", p)), 10)
+	curAddr := make(chan string, 1)
+	curAddr <- fmt.Sprintf(":%d", p)
+	c := master.NewClient(curAddr, 10)
 	c.SetDataset([]string{path})
-
 	for pass := 0; pass < 50; pass++ {
 		received := make(map[byte]bool)
 		for i := 0; i < total; i++ {
diff --git a/go/master/etcd_client.go b/go/master/etcd_client.go
index f7b4638577..e27c014792 100644
--- a/go/master/etcd_client.go
+++ b/go/master/etcd_client.go
@@ -142,3 +142,31 @@ func (e *EtcdClient) Load() ([]byte, error) {
 	state := kvs[0].Value
 	return state, nil
 }
+
+// GetKey gets the value of the specified key.
+func GetKey(c *clientv3.Client, key string, timeout int) (string, error) {
+	ctx, cancel := context.WithTimeout(context.Background(), time.Second*time.Duration(timeout))
+	resp, err := c.Get(ctx, key)
+	cancel()
+	if err != nil {
+		return "", err
+	}
+	kvs := resp.Kvs
+	if len(kvs) == 0 {
+		return "", nil
+	}
+	v := kvs[0].Value
+	return string(v), nil
+}
+
+// WatchKey watches the specified key and sends each event's value to valChan.
+func WatchKey(c *clientv3.Client, key string, valChan chan<- string) {
+	rch := c.Watch(context.Background(), key)
+	for wresp := range rch {
+		for _, ev := range wresp.Events {
+			// if received event is DELETE, the value will be an empty string
+			log.Infof("received event %s, %q : %q\n", ev.Type, ev.Kv.Key, ev.Kv.Value)
+			valChan <- string(ev.Kv.Value)
+		}
+	}
+}

From 2d3c186a96178132784ecb9ba81403b67c3b7f67 Mon Sep 17 00:00:00 2001
From: Yu Yang <yuyang18@baidu.com>
Date: Thu, 29 Jun 2017 14:14:55 +0800
Subject: [PATCH 61/86] Follow comments

---
 cmake/generic.cmake  | 6 ++++--
 proto/CMakeLists.txt | 1 +
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/cmake/generic.cmake b/cmake/generic.cmake
index 779f627115..8117dbc53e 100644
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -87,6 +87,9 @@
 #   go_library(example SHARED)
 #
 
+# including binary directory for generated headers.
+include_directories(${CMAKE_CURRENT_BINARY_DIR})
+
 if(NOT APPLE)
     find_package(Threads REQUIRED)
     link_libraries(${CMAKE_THREAD_LIBS_INIT})
@@ -339,6 +342,5 @@ function(proto_library TARGET_NAME)
   set(proto_srcs)
   set(proto_hdrs)
   protobuf_generate_cpp(proto_srcs proto_hdrs ${proto_library_SRCS})
-  include_directories(${CMAKE_CURRENT_BINARY_DIR})
-  cc_library(${TARGET_NAME} SRCS ${proto_srcs})
+  cc_library(${TARGET_NAME} SRCS ${proto_srcs} DEPS protobuf)
 endfunction()
diff --git a/proto/CMakeLists.txt b/proto/CMakeLists.txt
index 70dd4d674c..18584cafe7 100644
--- a/proto/CMakeLists.txt
+++ b/proto/CMakeLists.txt
@@ -1,4 +1,5 @@
 file(GLOB proto_filenames . *.proto)
+include_directories(${CMAKE_CURRENT_BINARY_DIR})
 proto_library(paddle_proto SRCS ${proto_filenames})
 
 set(PROTO_GEN)

From 9af54c9ee98cabb9e70a10261a1c19cd8e8b3297 Mon Sep 17 00:00:00 2001
From: qiaolongfei <qiaolongfei@baidu.com>
Date: Thu, 29 Jun 2017 14:39:37 +0800
Subject: [PATCH 62/86] do not use default argument in Scope

---
 paddle/framework/scope.h       | 41 +++++++++++++++++++++++-----------
 paddle/framework/scope_test.cc |  6 ++---
 2 files changed, 31 insertions(+), 16 deletions(-)

diff --git a/paddle/framework/scope.h b/paddle/framework/scope.h
index 88a13145ca..a4470f726f 100644
--- a/paddle/framework/scope.h
+++ b/paddle/framework/scope.h
@@ -24,24 +24,31 @@ namespace paddle {
 namespace framework {
 
 /**
+ * @brief Scope that manages all variables.
+ *
  * Scope is an association of a name to Variable. All variables belong to
  * Scope. You need to specify a scope to run a Net, i.e., `net.Run(&scope)`.
  * One net can run in different scopes and update different variable in the
  * scope.
  */
 class Scope {
- private:
-  explicit Scope(const std::shared_ptr<Scope>& parent = nullptr)
-      : parent_(parent) {}
-
  public:
-  static std::shared_ptr<Scope> Create(
-      const std::shared_ptr<Scope>& parent = nullptr) {
-    return std::make_shared<Scope>(Scope(parent));
-  }
+  /**
+   * @brief Initialize a Scope without parent.
+   */
+  Scope() {}
+
+  /**
+   * @brief Initialize a Scope with parent.
+   */
+  explicit Scope(const std::shared_ptr<Scope>& parent) : parent_(parent) {}
 
-  /// Create Variable in this Scope. Failed if Variable already been
-  /// created.
+  /**
+   * @brief Create Variable
+   *
+   * Create Variable in this Scope. Return the existing one if the Variable
+   * has already been created.
+   */
   Variable* CreateVariable(const std::string& name) {
     auto var = GetVariable(name);
     if (var) {
@@ -52,8 +59,12 @@ class Scope {
     }
   }
 
-  /// Get Variable from this Scope, this function will recursive find Variable
-  /// from it's parent scope. Return nullptr if not found.
+  /**
+   * @brief Get Variable.
+   *
+   * Get Variable from this Scope; this function recursively searches for the
+   * Variable in its parent scope. Return nullptr if not found.
+   */
   Variable* GetVariable(const std::string& name) const {
     auto it = vars_.find(name);
     if (it != vars_.end()) {
@@ -65,7 +76,11 @@ class Scope {
     }
   }
 
-  /// Find if there is a Variable in this scope and it's parent scope
+  /**
+   * @brief If this scope has a Var named name.
+   *
+   * Find if there is a Variable in this scope or its parent scope
+   */
   bool HasVariable(const std::string& name) const {
     return (vars_.find(name) != vars_.end() ||
             (parent_ && parent_->HasVariable(name)));
diff --git a/paddle/framework/scope_test.cc b/paddle/framework/scope_test.cc
index ec6236ec62..df1afb200c 100644
--- a/paddle/framework/scope_test.cc
+++ b/paddle/framework/scope_test.cc
@@ -19,7 +19,7 @@ TEST(Scope, Create) {
   using paddle::framework::Scope;
   using paddle::framework::Variable;
 
-  auto scope = Scope::Create();
+  auto scope = std::make_shared<Scope>();
 
   Variable* var0 = scope->CreateVariable("");
   EXPECT_NE(var0, nullptr);
@@ -46,8 +46,8 @@ TEST(Scope, Parent) {
   using paddle::framework::Scope;
   using paddle::framework::Variable;
 
-  auto parent_scope = Scope::Create();
-  auto scope = Scope::Create(parent_scope);
+  auto parent_scope = std::make_shared<Scope>();
+  auto scope = std::make_shared<Scope>(parent_scope);
 
   Variable* var0 = parent_scope->CreateVariable("a");
   EXPECT_NE(var0, nullptr);

From 3c925feb71a9b8c40cad60cf4c453ba083ed69a9 Mon Sep 17 00:00:00 2001
From: qiaolongfei <qiaolongfei@baidu.com>
Date: Thu, 29 Jun 2017 14:52:50 +0800
Subject: [PATCH 63/86] update design doc

---
 doc/design/scope.md | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/doc/design/scope.md b/doc/design/scope.md
index 4d14a64977..afe6bc028c 100644
--- a/doc/design/scope.md
+++ b/doc/design/scope.md
@@ -59,9 +59,9 @@ class Scope {
   Scope(const std::shared_ptr<Scope>& scope): parent_(scope) {}
 
   Variable* GetVariable(const std::string& name) const {
-    Variable* var = GetVarLocally(name);
-    if (var != nullptr) {
-      return var;
+    auto it = vars_.find(name);
+    if (it != vars_.end()) {
+      return it->second.get();
     } else if (parent_ != nullptr) {
       return parent_->GetVariable(name);
     } else {
@@ -97,8 +97,8 @@ class Scope {
   // return nullptr if not found.
   Variable* GetVariable(const std::string& name) const;
 
-  // return Error if already contains same name variable.
-  Error CreateVariable(const std::string& name);
+  // return the existing variable if one with the same name already exists.
+  Variable* CreateVariable(const std::string& name);
 
  private:
   std::shared_ptr<Scope> parent_;

From 3d44fd5bf38b8cd74fccc17081972b5a9a0eaa2e Mon Sep 17 00:00:00 2001
From: Yu Yang <yuyang18@baidu.com>
Date: Thu, 29 Jun 2017 14:53:29 +0800
Subject: [PATCH 64/86] Follow yiqun's comments

---
 cmake/generic.cmake | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cmake/generic.cmake b/cmake/generic.cmake
index 8117dbc53e..61353a4a26 100644
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -88,7 +88,7 @@
 #
 
 # including binary directory for generated headers.
-include_directories(${CMAKE_CURRENT_BINARY_DIR})
+include_directories(${CMAKE_BINARY_DIR})
 
 if(NOT APPLE)
     find_package(Threads REQUIRED)

From c18275ffb3d80047cb77eedcd88d1ffb11d72ea5 Mon Sep 17 00:00:00 2001
From: qijun <qijun1994@hotmail.com>
Date: Thu, 29 Jun 2017 14:56:16 +0800
Subject: [PATCH 65/86] add more choice for eigen downloading

---
 cmake/external/eigen.cmake | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/cmake/external/eigen.cmake b/cmake/external/eigen.cmake
index 253d436bcc..45f44f617d 100644
--- a/cmake/external/eigen.cmake
+++ b/cmake/external/eigen.cmake
@@ -7,8 +7,17 @@ INCLUDE_DIRECTORIES(${EIGEN_SOURCE_DIR}/src/eigen3)
 ExternalProject_Add(
     eigen3
     ${EXTERNAL_PROJECT_LOG_ARGS}
-    URL            "https://bitbucket.org/eigen/eigen/get/3.3.4.tar.gz"
-    URL_MD5        "1a47e78efe365a97de0c022d127607c3"
+    # for latest version, please get from official website
+    # URL            "https://bitbucket.org/eigen/eigen/get/3.3.4.tar.gz"
+    # URL_MD5        "1a47e78efe365a97de0c022d127607c3"
+
+    # for no-ssl http support, please get from bazel's mirror
+    # URL           "http://mirror.bazel.build/bitbucket.org/eigen/eigen/get/f3a22f35b044.tar.gz"
+    # URL_MD5       "4645c66075982da6fa0bcf6b20f3e8f7"
+
+    # get from github mirror
+    GIT_REPOSITORY  "https://github.com/RLovelett/eigen.git"
+    GIT_TAG         "a46d2e7337c4656f00abe54a8115f6d76153a048"
     PREFIX          ${EIGEN_SOURCE_DIR}
     UPDATE_COMMAND  ""
     CONFIGURE_COMMAND ""

From 52efb243b82b7b68868cfe5391eafb0376ddb839 Mon Sep 17 00:00:00 2001
From: Yu Yang <yuyang18@baidu.com>
Date: Thu, 29 Jun 2017 15:06:46 +0800
Subject: [PATCH 66/86] Fix CI tests

---
 paddle/testing/CMakeLists.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/paddle/testing/CMakeLists.txt b/paddle/testing/CMakeLists.txt
index 4aa6eae681..4245df5ab7 100644
--- a/paddle/testing/CMakeLists.txt
+++ b/paddle/testing/CMakeLists.txt
@@ -2,7 +2,7 @@
 
 if(WITH_TESTING)
   add_library(paddle_test_main STATIC TestMain.cpp)
-  add_dependencies(paddle_test_main paddle_proto)
+  add_dependencies(paddle_test_main paddle_proto ${external_project_dependencies})
   add_library(paddle_test_util STATIC TestUtil.cpp)
-  add_dependencies(paddle_test_util paddle_proto)
+  add_dependencies(paddle_test_util paddle_proto ${external_project_dependencies})
 endif()

From a1e7284ad139e6578036b7d872563aa1a2642351 Mon Sep 17 00:00:00 2001
From: Yu Yang <yuyang18@baidu.com>
Date: Thu, 29 Jun 2017 15:24:12 +0800
Subject: [PATCH 67/86] Fix TravisCI

---
 paddle/pserver/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddle/pserver/CMakeLists.txt b/paddle/pserver/CMakeLists.txt
index f2e0b4b76b..2245c7d88c 100644
--- a/paddle/pserver/CMakeLists.txt
+++ b/paddle/pserver/CMakeLists.txt
@@ -17,7 +17,7 @@ add_library(paddle_network STATIC
 add_style_check_target(paddle_network ${NETWORK_SOURCES})
 add_style_check_target(paddle_network ${NETWORK_HEADERS})
 
-add_dependencies(paddle_network paddle_proto)
+add_dependencies(paddle_network paddle_proto ${external_project_dependencies})
 
 ################### paddle_pserver ######################
 set(PSERVER_SOURCES

From b0ad9c907422e1256bc5ae6881913f71cd9d4aed Mon Sep 17 00:00:00 2001
From: caoying03 <caoying03@baidu.com>
Date: Wed, 28 Jun 2017 10:12:23 +0800
Subject: [PATCH 68/86] enable initializing memory state for lstmemory_group.

---
 .../paddle/trainer_config_helpers/layers.py   | 51 ++++++++++---------
 .../paddle/trainer_config_helpers/networks.py | 48 ++++++++++-------
 2 files changed, 57 insertions(+), 42 deletions(-)

diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py
index 84ed160773..a601d5c84a 100755
--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -1149,10 +1149,10 @@ def pooling_layer(input,
 @layer_support(DROPOUT)
 def lstmemory(input,
               name=None,
+              size=None,
               reverse=False,
               act=None,
               gate_act=None,
-              size=None,
               state_act=None,
               bias_attr=None,
               param_attr=None,
@@ -1194,6 +1194,8 @@ def lstmemory(input,
 
     :param name: The lstmemory layer name.
     :type name: basestring
+    :param size: DEPRECATED. size of the lstm cell
+    :type size: int
     :param input: input layer name.
     :type input: LayerOutput
     :param reverse: is sequence process reversed or not.
@@ -1220,15 +1222,15 @@ def lstmemory(input,
     assert state_act.support_hppl
     assert act.support_hppl
     assert input.size is not None and input.size % 4 == 0
+
     if size is not None:
         if input.size / 4 == size:
             plog = logger.warning
         else:
             plog = logger.fatal
-
-        plog("NOTE: The lstmemory layer[%s]'s size is set by previous input "
-             "layer. The lstm size should be equal with input layer size/4. The"
-             " size which is set explicitly will be ignored." % name)
+        plog("size of lstmemory layer: %s is automatically set to "
+             "size of input layer / 4. The parameter size passing to "
+             "this layer is ignored." % (name))
 
     Layer(
         name=name,
@@ -1255,11 +1257,11 @@ def lstmemory(input,
 @wrap_name_default("gru")
 @layer_support(DROPOUT)
 def grumemory(input,
+              size=None,
               name=None,
               reverse=False,
               act=None,
               gate_act=None,
-              size=None,
               bias_attr=None,
               param_attr=None,
               layer_attr=None):
@@ -1318,6 +1320,8 @@ def grumemory(input,
     :type name: None|basestring
     :param input: input layer.
     :type input: LayerOutput.
+    :param size: DEPRECATED. size of the gru cell
+    :type size: int
     :param reverse: Whether sequence process is reversed or not.
     :type reverse: bool
     :param act: activation type, TanhActivation by default. This activation
@@ -1334,9 +1338,6 @@ def grumemory(input,
     :type param_attr: ParameterAttribute|None|False
     :param layer_attr: Extra Layer attribute
     :type layer_attr: ExtraLayerAttribute|None
-    :param size: Stub parameter of size, but actually not used. If set this size
-                 will get a warning.
-    :type size: None
     :return: LayerOutput object.
     :rtype: LayerOutput
     """
@@ -1348,9 +1349,9 @@ def grumemory(input,
             plog = logger.warning
         else:
             plog = logger.fatal
-        plog("NOTE: the gru memory layer's size is set by previous input layer,"
-             " and should be input size / 3. Set size explicitly will be "
-             "ignored.")
+        plog("size of grumemory layer: %s is automatically set to "
+             "size of input layer / 3. The parameter size passing to this "
+             "layer is ignored." % (name))
 
     Layer(
         name=name,
@@ -2524,8 +2525,8 @@ def img_cmrnorm_layer(input,
 
 
 @wrap_bias_attr_default()
-@wrap_param_attr_default(default_factory=lambda _: ParamAttr(initial_mean=1.0,
-                                                             initial_std=0.))
+@wrap_param_attr_default(
+    default_factory=lambda _: ParamAttr(initial_mean=1.0, initial_std=0.))
 @wrap_act_default(act=ReluActivation())
 @wrap_name_default("batch_norm")
 @layer_support(DROPOUT)
@@ -3013,25 +3014,25 @@ def lstm_step_layer(input,
                     bias_attr=None,
                     layer_attr=None):
     """
-    LSTM Step Layer. It used in recurrent_group. The lstm equations are shown
-    as follow.
+    LSTM Step Layer. This function is used only in recurrent_group.
+    The lstm equations are shown as follows.
 
     ..  math::
 
-        i_t & = \\sigma(W_{xi}x_{t} + W_{hi}h_{t-1} + W_{ci}c_{t-1} + b_i)
+        i_t & = \\sigma(W_{x_i}x_{t} + W_{h_i}h_{t-1} + W_{c_i}c_{t-1} + b_i)
 
-        f_t & = \\sigma(W_{xf}x_{t} + W_{hf}h_{t-1} + W_{cf}c_{t-1} + b_f)
+        f_t & = \\sigma(W_{x_f}x_{t} + W_{h_f}h_{t-1} + W_{c_f}c_{t-1} + b_f)
 
-        c_t & = f_tc_{t-1} + i_t tanh (W_{xc}x_t+W_{hc}h_{t-1} + b_c)
+        c_t & = f_tc_{t-1} + i_t tanh (W_{x_c}x_t+W_{h_c}h_{t-1} + b_c)
 
-        o_t & = \\sigma(W_{xo}x_{t} + W_{ho}h_{t-1} + W_{co}c_t + b_o)
+        o_t & = \\sigma(W_{x_o}x_{t} + W_{h_o}h_{t-1} + W_{c_o}c_t + b_o)
 
         h_t & = o_t tanh(c_t)
 
 
     The input of lstm step is :math:`Wx_t + Wh_{t-1}`, and user should use
     :code:`mixed_layer` and :code:`full_matrix_projection` to calculate these
-    input vector.
+    input vectors.
 
     The state of lstm step is :math:`c_{t-1}`. And lstm step layer will do
 
@@ -3042,14 +3043,14 @@ def lstm_step_layer(input,
         ...
 
 
-    This layer contains two outputs. Default output is :math:`h_t`. The other
-    output is :math:`o_t`, which name is 'state' and can use
+    This layer has two outputs. Default output is :math:`h_t`. The other
+    output is :math:`o_t`, whose name is 'state' and can use
     :code:`get_output_layer` to extract this output.
 
     :param name: Layer's name.
     :type name: basestring
-    :param size: Layer's size. NOTE: lstm layer's size, should be equal as
-                 :code:`input.size/4`, and should be equal as
+    :param size: Layer's size. NOTE: lstm layer's size, should be equal to
+                 :code:`input.size/4`, and should be equal to
                  :code:`state.size`.
     :type size: int
     :param input: input layer. :math:`Wx_t + Wh_{t-1}`
diff --git a/python/paddle/trainer_config_helpers/networks.py b/python/paddle/trainer_config_helpers/networks.py
index 67154a8d7d..0d730e0995 100755
--- a/python/paddle/trainer_config_helpers/networks.py
+++ b/python/paddle/trainer_config_helpers/networks.py
@@ -614,6 +614,7 @@ def simple_lstm(input,
 
 @wrap_name_default('lstm_unit')
 def lstmemory_unit(input,
+                   memory_boot=None,
                    name=None,
                    size=None,
                    param_attr=None,
@@ -626,9 +627,9 @@ def lstmemory_unit(input,
                    lstm_layer_attr=None,
                    get_output_layer_attr=None):
     """
-    Define calculations that a LSTM unit performs in a single time step.
-    This function itself is not a recurrent layer, so that it can not be
-    directly applied to sequence input. This function is always used in
+    Define calculations that a LSTM unit performs during a single time step.
+    This function itself is not a recurrent layer, so it can not be
+    directly used to process sequence inputs. This function is always used in
     recurrent_group (see layers.py for more details) to implement attention
     mechanism.
 
@@ -638,13 +639,13 @@ def lstmemory_unit(input,
 
     ..  math::
 
-        i_t & = \\sigma(W_{xi}x_{t} + W_{hi}h_{t-1} + W_{ci}c_{t-1} + b_i)
+        i_t & = \\sigma(W_{x_i}x_{t} + W_{h_i}h_{t-1} + W_{c_i}c_{t-1} + b_i)
 
-        f_t & = \\sigma(W_{xf}x_{t} + W_{hf}h_{t-1} + W_{cf}c_{t-1} + b_f)
+        f_t & = \\sigma(W_{x_f}x_{t} + W_{h_f}h_{t-1} + W_{c_f}c_{t-1} + b_f)
 
-        c_t & = f_tc_{t-1} + i_t tanh (W_{xc}x_t+W_{hc}h_{t-1} + b_c)
+        c_t & = f_tc_{t-1} + i_t tanh (W_{x_c}x_t+W_{h_c}h_{t-1} + b_c)
 
-        o_t & = \\sigma(W_{xo}x_{t} + W_{ho}h_{t-1} + W_{co}c_t + b_o)
+        o_t & = \\sigma(W_{x_o}x_{t} + W_{h_o}h_{t-1} + W_{c_o}c_t + b_o)
 
         h_t & = o_t tanh(c_t)
 
@@ -661,6 +662,8 @@ def lstmemory_unit(input,
 
     :param input: input layer name.
     :type input: LayerOutput
+    :param memory_boot: the initialization state of the LSTM cell.
+    :type memory_boot: LayerOutput | None
     :param name: lstmemory unit name.
     :type name: basestring
     :param size: lstmemory unit size.
@@ -692,7 +695,8 @@ def lstmemory_unit(input,
         assert input.size % 4 == 0
         size = input.size / 4
     out_mem = memory(name=name, size=size)
-    state_mem = memory(name="%s_state" % name, size=size)
+    state_mem = memory(
+        name="%s_state" % name, size=size, boot_layer=memory_boot)
 
     with mixed_layer(
             name="%s_input_recurrent" % name,
@@ -726,6 +730,7 @@ def lstmemory_unit(input,
 def lstmemory_group(input,
                     size=None,
                     name=None,
+                    memory_boot=None,
                     reverse=False,
                     param_attr=None,
                     act=None,
@@ -737,7 +742,7 @@ def lstmemory_group(input,
                     lstm_layer_attr=None,
                     get_output_layer_attr=None):
     """
-    lstm_group is a recurrent layer group version of Long Short Term Memory. It
+    lstm_group is a recurrent_group version of Long Short Term Memory. It
     does exactly the same calculation as the lstmemory layer (see lstmemory in
     layers.py for the maths) does. A promising benefit is that LSTM memory
     cell states, or hidden states in every time step are accessible to the
@@ -748,8 +753,8 @@ def lstmemory_group(input,
 
     NOTE: In PaddlePaddle's implementation, the following input-to-hidden
     multiplications:
-    :math:`W_{xi}x_{t}` , :math:`W_{xf}x_{t}`,
-    :math:`W_{xc}x_t`, :math:`W_{xo}x_{t}` are not done in lstmemory_unit to
+    :math:`W_{x_i}x_{t}` , :math:`W_{x_f}x_{t}`,
+    :math:`W_{x_c}x_t`, :math:`W_{x_o}x_{t}` are not done in lstmemory_unit to
     speed up the calculations. Consequently, an additional mixed_layer with
     full_matrix_projection must be included before lstmemory_unit is called.
 
@@ -765,8 +770,10 @@ def lstmemory_group(input,
 
     :param input: input layer name.
     :type input: LayerOutput
-    :param name: lstmemory group name.
+    :param name: name of the lstmemory group.
     :type name: basestring
+    :param memory_boot: the initialization state of LSTM cell.
+    :type memory_boot: LayerOutput | None
     :param size: lstmemory group size.
     :type size: int
     :param reverse: is lstm reversed
@@ -798,6 +805,7 @@ def lstmemory_group(input,
     def __lstm_step__(ipt):
         return lstmemory_unit(
             input=ipt,
+            memory_boot=memory_boot,
             name=name,
             size=size,
             mixed_bias_attr=mixed_bias_attr,
@@ -819,6 +827,7 @@ def lstmemory_group(input,
 
 @wrap_name_default('gru_unit')
 def gru_unit(input,
+             memory_boot=None,
              size=None,
              name=None,
              gru_bias_attr=None,
@@ -829,8 +838,8 @@ def gru_unit(input,
              naive=False):
     """
     Define calculations that a gated recurrent unit performs in a single time
-    step. This function itself is not a recurrent layer, so that it can not be
-    directly applied to sequence input. This function is almost always used in
+    step. This function itself is not a recurrent layer, so it can not be
+    directly used to process sequence inputs. This function is always used in
     the recurrent_group (see layers.py for more details) to implement attention
     mechanism.
 
@@ -838,6 +847,8 @@ def gru_unit(input,
 
     :param input: input layer name.
     :type input: LayerOutput
+    :param memory_boot: the initialization state of the GRU cell.
+    :type memory_boot: LayerOutput | None
     :param name: name of the gru group.
     :type name: basestring
     :param size: hidden size of the gru.
@@ -856,7 +867,7 @@ def gru_unit(input,
     if size is None:
         size = input.size / 3
 
-    out_mem = memory(name=name, size=size)
+    out_mem = memory(name=name, size=size, boot_layer=memory_boot)
 
     if naive:
         __step__ = gru_step_naive_layer
@@ -878,6 +889,7 @@ def gru_unit(input,
 
 @wrap_name_default('gru_group')
 def gru_group(input,
+              memory_boot=None,
               size=None,
               name=None,
               reverse=False,
@@ -888,7 +900,7 @@ def gru_group(input,
               gru_layer_attr=None,
               naive=False):
     """
-    gru_group is a recurrent layer group version of Gated Recurrent Unit. It
+    gru_group is a recurrent_group version of Gated Recurrent Unit. It
     does exactly the same calculation as the grumemory layer does. A promising
     benefit is that gru hidden states are accessible to the user. This is
     especially useful in attention model. If you do not need to access
@@ -908,6 +920,8 @@ def gru_group(input,
 
     :param input: input layer name.
     :type input: LayerOutput
+    :param memory_boot: the initialization state of the GRU cell.
+    :type memory_boot: LayerOutput | None
     :param name: name of the gru group.
     :type name: basestring
     :param size: hidden size of the gru.
@@ -929,6 +943,7 @@ def gru_group(input,
     def __gru_step__(ipt):
         return gru_unit(
             input=ipt,
+            memory_boot=memory_boot,
             name=name,
             size=size,
             gru_bias_attr=gru_bias_attr,
@@ -1083,7 +1098,6 @@ def simple_gru2(input,
 
     return grumemory(
         name=name,
-        size=size,
         input=m,
         reverse=reverse,
         bias_attr=gru_bias_attr,

From 5c68aacad1abe9eefc4f1039aca4962b6c6d601f Mon Sep 17 00:00:00 2001
From: caoying03 <caoying03@baidu.com>
Date: Thu, 29 Jun 2017 16:48:33 +0800
Subject: [PATCH 69/86] follow comments.

---
 python/paddle/trainer_config_helpers/networks.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/paddle/trainer_config_helpers/networks.py b/python/paddle/trainer_config_helpers/networks.py
index 0d730e0995..b77932ce5f 100755
--- a/python/paddle/trainer_config_helpers/networks.py
+++ b/python/paddle/trainer_config_helpers/networks.py
@@ -770,12 +770,12 @@ def lstmemory_group(input,
 
     :param input: input layer name.
     :type input: LayerOutput
+    :param size: lstmemory group size.
+    :type size: int
     :param name: name of the lstmemory group.
     :type name: basestring
     :param memory_boot: the initialization state of LSTM cell.
     :type memory_boot: LayerOutput | None
-    :param size: lstmemory group size.
-    :type size: int
     :param reverse: is lstm reversed
     :type reverse: bool
     :param param_attr: Parameter config, None if use default.

From cc0704227024b58d3721bdd305b9814b57a9d139 Mon Sep 17 00:00:00 2001
From: wuyi05 <wuyi05@baidu.com>
Date: Thu, 29 Jun 2017 18:50:11 +0800
Subject: [PATCH 70/86] update design doc

---
 doc/design/build_system/README.md | 28 +++++++++++++++++++---------
 1 file changed, 19 insertions(+), 9 deletions(-)

diff --git a/doc/design/build_system/README.md b/doc/design/build_system/README.md
index 96af6566d0..3e540e072d 100644
--- a/doc/design/build_system/README.md
+++ b/doc/design/build_system/README.md
@@ -108,14 +108,14 @@ As above example CMakeLists.txt executes, each function invocation adds "nodes"
 
 ### Using Package Manager For Go
 
-Building go binaries and libraries need to satisfy their dependencies, generally
+Building Go binaries and libraries requires satisfying their dependencies; generally
 we can do `go get ./...` to download and compile all external dependencies. The
 problems are:
 
-1. `go get` will always get the latest code from master branch, so when an external
-    project updated and deprecates something or made changes to their APIs, builds
-    may not pass. This is very different with what we already have in `cmake/external`
-    which download a specific version or commit id of the dependency.
+1. `go get` will always get the latest code from the default branch of the
+    remote repo, so changes in dependencies might break the build. This is very
+    different from what we already have in `cmake/external`, which downloads a
+    specific version or commit id of the dependency.
 1. Some locations can not access external dependencies through the internet, as mentioned
    in https://github.com/PaddlePaddle/Paddle/issues/2605. Using package management
    tools can package the dependencies as a "vendor" package, which can be mirrored
@@ -124,10 +124,20 @@ problems are:
 
 #### Godep vs. Glide
 
-Here's a brief comparison for current Go ecosystem: https://github.com/Masterminds/glide/wiki/Go-Package-Manager-Comparison. There are
-also many complaints about `Godep`. A new "official" pakcage management tool has been
-started: https://github.com/golang/dep to resolve such problems, but it's currently
-at Alpha stage. So the best choice now is glide obviously.
+As mentioned by @wangkuiyi, [Here](https://github.com/golang/go/wiki/PackageManagementTools)
+list dozens of Go package managers. We choose the tool using following principles:
+
+- Most "active" projects with more stars, more pull requests or commits
+- Commonly used project
+
+Then we shall choose between the most popular tools: Godep and Glide.
+
+Here's a brief comparison between Godep and Glide
+: https://github.com/Masterminds/glide/wiki/Go-Package-Manager-Comparison. There are
+also many complaints about using `Godep`. A new "official" package
+management tool has been started at: https://github.com/golang/dep to resolve
+such problems, but it's currently at Alpha stage. So the best choice now is
+glide obviously.
 
 #### Manage Go Packages
 

From 7993ff794d23f07804cbe1308e6d8c143b9cdc1f Mon Sep 17 00:00:00 2001
From: wuyi05 <wuyi05@baidu.com>
Date: Thu, 29 Jun 2017 19:04:03 +0800
Subject: [PATCH 71/86] update design doc

---
 doc/design/build_system/README.md | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/doc/design/build_system/README.md b/doc/design/build_system/README.md
index 3e540e072d..bf0e4dddc1 100644
--- a/doc/design/build_system/README.md
+++ b/doc/design/build_system/README.md
@@ -122,15 +122,16 @@ problems are:
    at many cloud file hosting, so users what to compile paddle by themselves can
    download this "vendor" package from a mirror site.
 
-#### Godep vs. Glide
+#### Choose A Suitable Tool
 
 As mentioned by @wangkuiyi, [Here](https://github.com/golang/go/wiki/PackageManagementTools)
 list dozens of Go package managers. We choose the tool using following principles:
 
 - Most "active" projects with more stars, more pull requests or commits
-- Commonly used project
+- Widely used project
 
-Then we shall choose between the most popular tools: Godep and Glide.
+After comparing all these projects, we shall choose between the most popular
+tools: Godep and Glide.
 
 Here's a brief comparison between Godep and Glide
 : https://github.com/Masterminds/glide/wiki/Go-Package-Manager-Comparison. There are

From 7c066f6e3e43cfc2b43d46f5e860a291b125b3d4 Mon Sep 17 00:00:00 2001
From: Helin Wang <ustc.harry@gmail.com>
Date: Fri, 30 Jun 2017 00:45:07 +0000
Subject: [PATCH 72/86] fix according to comments

---
 doc/design/cluster_train/save_model.md | 52 +++++++++++++++-----------
 1 file changed, 31 insertions(+), 21 deletions(-)

diff --git a/doc/design/cluster_train/save_model.md b/doc/design/cluster_train/save_model.md
index 3a9a24fb9c..76ac8d8387 100644
--- a/doc/design/cluster_train/save_model.md
+++ b/doc/design/cluster_train/save_model.md
@@ -7,24 +7,34 @@ ways from which user can obtain a model:
 
 - Save model triggered by user code: user code asks PaddlePaddle to
   save a model.
-- Convert model from the snapshot: model being converted from
-  pservers' periodic snapshot. In this way, the user can cancel a job
-  at any time, and still have a relatively fresh model (we snapshot
-  around every 5 minutes).
+- Convert model from the checkpoint: model being converted from
+  pservers' periodic checkpoint. In this way, the user can cancel a
+  job at any time, and still have a relatively fresh model (we
+  checkpoint around every 5 minutes).
 
-### Save Model Triggered by User Code
+### Trainer Saving Model vs. Pservers Saving Model
 
 Both trainers and pservers have access to the model. So the model can
 be saved from a trainer or pservers. We need to decide on where the
 model is saved from.
 
-#### Dense Model vs. Sparse Model
+#### Dense Update vs. Sparse Update
+
+There are two types of model update methods: dense update and sparse
+update (when the parameter is configured to be sparse).
+
+- Dense update
+
+  Every trainer has its own full copy of the model. Every model
+  update will update the entire model.
+
+- Sparse update
+
+  The training input is sparse, and the trainer does not have the
+  entire model. It will only download the necessary sub-model related
+  to the input. When updating the model, only the sub-model related to
+  the training input is updated.
 
-There are two types of model: dense and sparse model (when the
-parameter is configured to be sparse). Pservers always jointly have
-the entire model at any given time. Trainers only have the entire
-dense model, but only have a fraction of the sparse model at any given
-time.
 
 #### Pservers Saving Model
 
@@ -32,15 +42,15 @@ The benefit of letting pservers save model is they have the entire
 model all the time. However, since pservers are on different nodes, it
 requires a merging process to merge model shards into the same
 model. Thus requires the pservers to write models to a distributed
-filesystem, making the snapshot shards visible to the merge program.
+filesystem, making the checkpoint shards visible to the merge program.
 
 #### Trainer Saving Model
 
 The benefit of letting one trainer to save the model is it does not
 require a distributed filesystem. And it's reusing the same save model
-logic when the trainer is training locally - except when training
-sparse model, the trainer needs to download the entire sparse model
-during the saving process.
+logic when the trainer is training locally - except when doing sparse
+update, the trainer needs to download the entire model during the
+saving process.
 
 #### Conclusion
 
@@ -49,7 +59,7 @@ and is an intuitive extension to training locally, we decide to let
 the trainer save the model.
 
 
-### Convert Model from Snapshot
+### Convert Model from Checkpoint
 
 TODO
 
@@ -86,15 +96,15 @@ when save model is taking place.
 When saving a dense model, the trainer uses the local model. Pservers
 does not need to pause model update.
 
-When saving a sparse model. The trainer needs to download the entire
-sparse model while saving. To get the most accurate model, the model
-update needs to be paused before the download starts and resumed after
-the download finishes. Otherwise, the trainer gets a model that is
+When doing sparse update, the trainer needs to download the entire
+model while saving. To get the most accurate model, the model update
+needs to be paused before the download starts and resumed after the
+download finishes. Otherwise, the trainer gets a model that is
 "polluted": some part of the model is old, some part of the model is
 new.
 
 It's unclear that the "polluted" model will be inferiod due to the
 stochastic nature of deep learning, and pausing the model update will
-add more complexity to the system. Since supporting sparse model is a
+add more complexity to the system. Since supporting sparse update is a
 TODO item. We defer the evaluation of pause the model update or not
 during saving model to the future.

From 59cf5e77962d743b7535e3ffd72e7ebe00c29502 Mon Sep 17 00:00:00 2001
From: Helin Wang <ustc.harry@gmail.com>
Date: Wed, 28 Jun 2017 22:32:33 +0000
Subject: [PATCH 73/86] Fix Go cmake

---
 CMakeLists.txt                         | 31 ++++++++++++++++++++++----
 cmake/generic.cmake                    | 26 ++++++---------------
 go/master/c/CMakeLists.txt             | 20 +----------------
 go/pserver/cclient/test/CMakeLists.txt |  2 +-
 paddle/trainer/CMakeLists.txt          |  4 ++--
 python/CMakeLists.txt                  | 18 ++++++++++-----
 python/paddle/v2/__init__.py           |  1 +
 python/setup.py.in                     |  2 ++
 8 files changed, 54 insertions(+), 50 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 24a7066adc..edea8279df 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -130,14 +130,37 @@ if(WITH_GPU)
 endif(WITH_GPU)
 
 add_subdirectory(proto)
-add_subdirectory(paddle)
-add_subdirectory(python)
 
+# "add_subdirectory(paddle)" and "add_subdirectory(python)" should be
+# placed after this block, because they depends on it.
 if(WITH_GOLANG)
-    #TODO (add go/master/c back when fixed)
-    add_subdirectory(go/pserver/cclient)
+  # we need to symlink Paddle directory into GOPATH. If we
+  # don't do it and we have code that depends on Paddle, go
+  # get ./... will download a new Paddle repo from Github,
+  # without the changes in our current Paddle repo that we
+  # want to build.
+  set(GOPATH "${CMAKE_CURRENT_BINARY_DIR}/go")
+  file(MAKE_DIRECTORY ${GOPATH})
+  set(PADDLE_IN_GOPATH "${GOPATH}/src/github.com/PaddlePaddle/Paddle")
+  add_custom_target(go_path)
+  add_custom_command(TARGET go_path
+    # Symlink Paddle directory into GOPATH
+    COMMAND mkdir -p ${PADDLE_IN_GOPATH}
+    COMMAND rm -rf ${PADDLE_IN_GOPATH}
+    COMMAND ln -sf ${CMAKE_SOURCE_DIR} ${PADDLE_IN_GOPATH}
+    # Automatically get all dependencies specified in the source code
+    # We can't run `go get -d ./...` for every target, because
+    # multiple `go get` can not run concurrently, but make need to be
+    # able to run with multiple jobs.
+    COMMAND env GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} get -d ./go/...
+    WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+  )
+  add_subdirectory(go/master/c)
+  add_subdirectory(go/pserver/cclient)
 endif(WITH_GOLANG)
 
+add_subdirectory(paddle)
+add_subdirectory(python)
 if(WITH_DOC)
     add_subdirectory(doc)
 endif()
diff --git a/cmake/generic.cmake b/cmake/generic.cmake
index 11c1f677ae..8a9bf12ccc 100644
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -253,10 +253,6 @@ function(nv_test TARGET_NAME)
   endif()
 endfunction(nv_test)
 
-set(GOPATH "${CMAKE_CURRENT_BINARY_DIR}/go")
-file(MAKE_DIRECTORY ${GOPATH})
-set(PADDLE_IN_GOPATH "${GOPATH}/src/github.com/PaddlePaddle/Paddle")
-
 function(go_library TARGET_NAME)
   set(options STATIC static SHARED shared)
   set(oneValueArgs "")
@@ -265,10 +261,10 @@ function(go_library TARGET_NAME)
 
   if (go_library_SHARED OR go_library_shared)
     set(BUILD_MODE "-buildmode=c-shared")
-    set(LIB_NAME "${CMAKE_SHARED_LIBRARY_PREFIX}${TARGET_NAME}${CMAKE_SHARED_LIBRARY_SUFFIX}")
+    set(${TARGET_NAME}_LIB_NAME "${CMAKE_SHARED_LIBRARY_PREFIX}${TARGET_NAME}${CMAKE_SHARED_LIBRARY_SUFFIX}" CACHE STRING "output library name for target ${TARGET_NAME}")
   else()
     set(BUILD_MODE "-buildmode=c-archive")
-    set(LIB_NAME "${CMAKE_STATIC_LIBRARY_PREFIX}${TARGET_NAME}${CMAKE_STATIC_LIBRARY_SUFFIX}")
+    set(${TARGET_NAME}_LIB_NAME "${CMAKE_STATIC_LIBRARY_PREFIX}${TARGET_NAME}${CMAKE_STATIC_LIBRARY_SUFFIX}" CACHE STRING "output library name for target ${TARGET_NAME}")
   endif()
 
   # Add dummy code to support `make target_name` under Terminal Command
@@ -283,25 +279,17 @@ function(go_library TARGET_NAME)
     add_dependencies(${TARGET_NAME} ${go_library_DEPS})
   endif(go_library_DEPS)
 
-  # we need to symlink Paddle directory into GOPATH. If we
-  # don't do it and we have code that depends on Paddle, go
-  # get ./... will download a new Paddle repo from Github,
-  # without the changes in our current Paddle repo that we
-  # want to build.
+  set(${TARGET_NAME}_LIB_PATH "${CMAKE_CURRENT_BINARY_DIR}/${${TARGET_NAME}_LIB_NAME}" CACHE STRING "output library path for target ${TARGET_NAME}")
+
   file(GLOB GO_SOURCE RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.go")
   add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
-    COMMAND rm "${CMAKE_CURRENT_BINARY_DIR}/${LIB_NAME}"
-    # Symlink Paddle directory into GOPATH
-    COMMAND mkdir -p ${PADDLE_IN_GOPATH}
-    COMMAND rm -rf ${PADDLE_IN_GOPATH}                                                                                                                                         
-    COMMAND ln -sf ${CMAKE_SOURCE_DIR} ${PADDLE_IN_GOPATH}
-    # Automatically get all dependencies specified in the source code                                                                                                                                 
-    COMMAND env GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} get -d ./...
+    COMMAND rm "${${TARGET_NAME}_LIB_PATH}"
     # Golang build source code
     COMMAND env GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} build ${BUILD_MODE}
-    -o "${CMAKE_CURRENT_BINARY_DIR}/${LIB_NAME}"
+    -o "${${TARGET_NAME}_LIB_PATH}"
     ${GO_SOURCE}
     WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
+  add_dependencies(${TARGET_NAME} go_path)
 endfunction(go_library)
 
 function(go_binary TARGET_NAME)
diff --git a/go/master/c/CMakeLists.txt b/go/master/c/CMakeLists.txt
index acce698051..a4e92635ba 100644
--- a/go/master/c/CMakeLists.txt
+++ b/go/master/c/CMakeLists.txt
@@ -1,21 +1,3 @@
 cmake_minimum_required(VERSION 3.0)
 
-get_filename_component(PARENT_DIR ${CMAKE_CURRENT_SOURCE_DIR} DIRECTORY)
-get_filename_component(PARENT_DIR ${PARENT_DIR} DIRECTORY)
-set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${PARENT_DIR}/cmake")
-
-project(cxx_go C Go)
-
-include(golang)
-include(flags)
-
-set(MASTER_LIB_NAME "paddle_master")
-go_library(${MASTER_LIB_NAME} SHARED)
-
-if(PROJ_ROOT)
-  add_custom_command(OUTPUT ${PROJ_ROOT}/python/paddle/v2/master/lib${MASTER_LIB_NAME}.so
-    COMMAND rm ${CMAKE_CURRENT_BINARY_DIR}/lib${MASTER_LIB_NAME}.h
-    COMMAND cp ${CMAKE_CURRENT_BINARY_DIR}/lib${MASTER_LIB_NAME}.so ${PROJ_ROOT}/python/paddle/v2/master/
-    DEPENDS ${MASTER_LIB_NAME})
-  add_custom_target(paddle_master_shared ALL DEPENDS ${PROJ_ROOT}/python/paddle/v2/master/lib${MASTER_LIB_NAME}.so)
-endif(PROJ_ROOT)
+go_library(paddle_master SHARED)
diff --git a/go/pserver/cclient/test/CMakeLists.txt b/go/pserver/cclient/test/CMakeLists.txt
index 916e4e99a2..170730cceb 100644
--- a/go/pserver/cclient/test/CMakeLists.txt
+++ b/go/pserver/cclient/test/CMakeLists.txt
@@ -1,3 +1,3 @@
 
-cc_library(main SRCS main.c DEPS paddle_pserver_cclient)
+cc_binary(main SRCS main.c DEPS paddle_pserver_cclient)
 cc_test(test_cclient SRCS test_cclient.c DEPS paddle_pserver_cclient)
diff --git a/paddle/trainer/CMakeLists.txt b/paddle/trainer/CMakeLists.txt
index f34d53ae99..54e74248e7 100644
--- a/paddle/trainer/CMakeLists.txt
+++ b/paddle/trainer/CMakeLists.txt
@@ -72,6 +72,6 @@ endif()
 
 if(WITH_GOLANG)
   add_dependencies(paddle_trainer_lib paddle_pserver_cclient)
-  target_link_libraries(paddle_trainer ${CMAKE_BINARY_DIR}/go/pserver/cclient/libpaddle_pserver_cclient.a)
-  target_link_libraries(paddle_trainer_lib ${CMAKE_BINARY_DIR}/go/pserver/cclient/libpaddle_pserver_cclient.a)
+  target_link_libraries(paddle_trainer paddle_pserver_cclient)
+  target_link_libraries(paddle_trainer_lib paddle_pserver_cclient)
 endif(WITH_GOLANG)
diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt
index 3640dd3a75..a9842152c8 100644
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -7,10 +7,18 @@ file(GLOB UTILS_PY_FILES . ./paddle/utils/*.py)
 file(GLOB_RECURSE V2_PY_FILES ./paddle/v2/ *.py)
 
 set(PY_FILES paddle/__init__.py
-             ${TRAINER_PY_FILES}
-             ${HELPERS_PY_FILES}
-             ${UTILS_PY_FILES}
-             ${V2_PY_FILES})
+  ${TRAINER_PY_FILES}
+  ${HELPERS_PY_FILES}
+  ${UTILS_PY_FILES}
+  ${V2_PY_FILES})
+
+add_custom_target(copy_paddle_master)
+if(WITH_GOLANG)
+  add_custom_command(TARGET copy_paddle_master
+    COMMAND cp ${paddle_master_LIB_PATH} ${PROJ_ROOT}/python/paddle/v2/master/
+    )
+  add_dependencies(copy_paddle_master paddle_master)
+endif(WITH_GOLANG)
 
 configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in
     ${CMAKE_CURRENT_BINARY_DIR}/setup.py)
@@ -18,7 +26,7 @@ configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in
 add_custom_command(OUTPUT ${OUTPUT_DIR}/.timestamp
     COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel
     COMMAND ${CMAKE_COMMAND} -E touch ${OUTPUT_DIR}/.timestamp
-    DEPENDS gen_proto_py ${PY_FILES} ${external_project_dependencies})
+    DEPENDS gen_proto_py ${PY_FILES} ${external_project_dependencies} copy_paddle_master)
 
 add_custom_target(paddle_python ALL DEPENDS
     ${OUTPUT_DIR}/.timestamp)
diff --git a/python/paddle/v2/__init__.py b/python/paddle/v2/__init__.py
index 6a1e23a343..3ba5c31871 100644
--- a/python/paddle/v2/__init__.py
+++ b/python/paddle/v2/__init__.py
@@ -56,6 +56,7 @@ __all__ = [
     'plot',
     'evaluator',
     'image',
+    'master',
 ]
 
 
diff --git a/python/setup.py.in b/python/setup.py.in
index 86fc0fc5c0..e507acaf21 100644
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -8,6 +8,7 @@ packages=['paddle',
           'paddle.v2',
           'paddle.v2.dataset',
           'paddle.v2.reader',
+          'paddle.v2.master',
           'paddle.v2.plot']
 
 setup_requires=["requests",
@@ -25,6 +26,7 @@ setup(name='paddle',
       description='Parallel Distributed Deep Learning',
       install_requires=setup_requires,
       packages=packages,
+      package_data={'paddle.v2.master': ['${paddle_master_LIB_NAME}'], },
       package_dir={
           '': '${CMAKE_CURRENT_SOURCE_DIR}'
       },

From a53952165bbaebb34d2ce91ca361b470a2a3238a Mon Sep 17 00:00:00 2001
From: liaogang <liaogang@baidu.com>
Date: Fri, 30 Jun 2017 11:22:49 +0800
Subject: [PATCH 74/86] FIX: merge objects to static lib

---
 cmake/generic.cmake | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/cmake/generic.cmake b/cmake/generic.cmake
index 61353a4a26..f2e3934c27 100644
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -146,9 +146,9 @@ function(merge_static_libs TARGET_NAME)
     set(outlibfile "$<TARGET_FILE:${TARGET_NAME}>")
 
     foreach(lib ${libs})
-    add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
-      COMMAND ${CMAKE_AR} ru ${outlibfile} @"../${objlistfile}"
-      WORKING_DIRECTORY ${objdir})
+      add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
+      COMMAND ${CMAKE_AR} ru ${outlibfile} @"../${lib}.objlist"
+      WORKING_DIRECTORY ${lib}.objdir)
     endforeach()
 
     add_custom_command(TARGET ${TARGET_NAME} POST_BUILD

From 38790c1c210c57b9cab5f1abe14203e053831ec5 Mon Sep 17 00:00:00 2001
From: Helin Wang <ustc.harry@gmail.com>
Date: Thu, 29 Jun 2017 21:30:43 -0700
Subject: [PATCH 75/86] fix according to comment

---
 CMakeLists.txt        | 21 ---------------------
 cmake/configure.cmake | 24 ++++++++++++++++++++++++
 2 files changed, 24 insertions(+), 21 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index edea8279df..b248191223 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -134,27 +134,6 @@ add_subdirectory(proto)
 # "add_subdirectory(paddle)" and "add_subdirectory(python)" should be
 # placed after this block, because they depends on it.
 if(WITH_GOLANG)
-  # we need to symlink Paddle directory into GOPATH. If we
-  # don't do it and we have code that depends on Paddle, go
-  # get ./... will download a new Paddle repo from Github,
-  # without the changes in our current Paddle repo that we
-  # want to build.
-  set(GOPATH "${CMAKE_CURRENT_BINARY_DIR}/go")
-  file(MAKE_DIRECTORY ${GOPATH})
-  set(PADDLE_IN_GOPATH "${GOPATH}/src/github.com/PaddlePaddle/Paddle")
-  add_custom_target(go_path)
-  add_custom_command(TARGET go_path
-    # Symlink Paddle directory into GOPATH
-    COMMAND mkdir -p ${PADDLE_IN_GOPATH}
-    COMMAND rm -rf ${PADDLE_IN_GOPATH}
-    COMMAND ln -sf ${CMAKE_SOURCE_DIR} ${PADDLE_IN_GOPATH}
-    # Automatically get all dependencies specified in the source code
-    # We can't run `go get -d ./...` for every target, because
-    # multiple `go get` can not run concurrently, but make need to be
-    # able to run with multiple jobs.
-    COMMAND env GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} get -d ./go/...
-    WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
-  )
   add_subdirectory(go/master/c)
   add_subdirectory(go/pserver/cclient)
 endif(WITH_GOLANG)
diff --git a/cmake/configure.cmake b/cmake/configure.cmake
index e8425aedbd..f6dca6d575 100644
--- a/cmake/configure.cmake
+++ b/cmake/configure.cmake
@@ -69,3 +69,27 @@ endif(NOT WITH_GPU)
 
 set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${SIMD_FLAG}")
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SIMD_FLAG}")
+
+if(WITH_GOLANG)
+  # we need to symlink Paddle directory into GOPATH. If we
+  # don't do it and we have code that depends on Paddle, go
+  # get ./... will download a new Paddle repo from Github,
+  # without the changes in our current Paddle repo that we
+  # want to build.
+  set(GOPATH "${CMAKE_CURRENT_BINARY_DIR}/go")
+  file(MAKE_DIRECTORY ${GOPATH})
+  set(PADDLE_IN_GOPATH "${GOPATH}/src/github.com/PaddlePaddle/Paddle")
+  add_custom_target(go_path)
+  add_custom_command(TARGET go_path
+    # Symlink Paddle directory into GOPATH
+    COMMAND mkdir -p ${PADDLE_IN_GOPATH}
+    COMMAND rm -rf ${PADDLE_IN_GOPATH}
+    COMMAND ln -sf ${CMAKE_SOURCE_DIR} ${PADDLE_IN_GOPATH}
+    # Automatically get all dependencies specified in the source code
+    # We can't run `go get -d ./...` for every target, because
+    # multiple `go get` cannot run concurrently, but make needs to be
+    # able to run with multiple jobs.
+    COMMAND env GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} get -d ./go/...
+    WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+  )
+endif(WITH_GOLANG)

From d09bbb559d67848b6a17ff87432580f2c32db9e8 Mon Sep 17 00:00:00 2001
From: liaogang <liaogang@baidu.com>
Date: Fri, 30 Jun 2017 12:44:18 +0800
Subject: [PATCH 76/86] FIX: Always build protobuf from source

---
 cmake/external/protobuf.cmake | 13 +------------
 1 file changed, 1 insertion(+), 12 deletions(-)

diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake
index 2f267adc20..3c74944bc2 100644
--- a/cmake/external/protobuf.cmake
+++ b/cmake/external/protobuf.cmake
@@ -144,18 +144,7 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST)
 ENDFUNCTION()
 
 SET(PROTOBUF_VERSION 3.1)
-IF(NOT CMAKE_CROSSCOMPILING)
-    FIND_PACKAGE(Protobuf ${PROTOBUF_VERSION})
-
-    IF(PROTOBUF_FOUND)
-        SET_PROTOBUF_VERSION()
-        IF("${PROTOBUF_VERSION}" VERSION_LESS "3.1.0")
-            SET(PROTOBUF_FOUND OFF)
-        ELSE()
-            PROMPT_PROTOBUF_LIB()
-        ENDIF()
-    ENDIF(PROTOBUF_FOUND)
-ELSE()
+IF(CMAKE_CROSSCOMPILING)
     build_protobuf(protobuf_host TRUE)
     LIST(APPEND external_project_dependencies protobuf_host)
 

From 5fc9b116d5d5c8346e2c36e5dc75046ea5456086 Mon Sep 17 00:00:00 2001
From: liaogang <liaogang@baidu.com>
Date: Fri, 30 Jun 2017 12:58:02 +0800
Subject: [PATCH 77/86] ENH: Remove deprecated python build

---
 cmake/external/python.cmake | 187 ------------------------------------
 1 file changed, 187 deletions(-)

diff --git a/cmake/external/python.cmake b/cmake/external/python.cmake
index f4d0daab06..6546b2c83b 100644
--- a/cmake/external/python.cmake
+++ b/cmake/external/python.cmake
@@ -32,193 +32,6 @@ IF(PYTHONINTERP_FOUND)
         MESSAGE(FATAL_ERROR "Found Python Protobuf ${PY_GOOGLE.PROTOBUF_VERSION} < 3.0.0, "
         "please use pip to upgrade protobuf. pip install -U protobuf")
     ENDIF()
-ELSE(PYTHONINTERP_FOUND)
-    MESSAGE(FATAL_ERROR "Please install python 2.7 before building PaddlePaddle.")
-    ##################################### PYTHON ########################################
-    SET(PYTHON_SOURCES_DIR ${THIRD_PARTY_PATH}/python)
-    SET(PYTHON_INSTALL_DIR ${THIRD_PARTY_PATH}/install/python)
-    SET(_python_DIR ${PYTHON_INSTALL_DIR})
-
-    IF(UNIX)
-        SET(PYTHON_FOUND ON)
-        SET(PYTHON_INCLUDE_DIR "${PYTHON_INSTALL_DIR}/include/python2.7" CACHE PATH "Python include dir" FORCE)
-        SET(PYTHON_LIBRARIES "${PYTHON_INSTALL_DIR}/lib/libpython2.7.a" CACHE FILEPATH "Python library" FORCE)
-        SET(PYTHON_EXECUTABLE ${PYTHON_INSTALL_DIR}/bin/python CACHE FILEPATH "Python executable" FORCE)
-        SET(PY_SITE_PACKAGES_PATH "${PYTHON_INSTALL_DIR}/lib/python2.7/site-packages" CACHE PATH "Python site-packages path" FORCE)
-    ELSEIF(WIN32)
-        SET(PYTHON_FOUND ON)
-        SET(PYTHON_INCLUDE_DIR "${PYTHON_INSTALL_DIR}/include" CACHE PATH "Python include dir" FORCE)
-        SET(PYTHON_LIBRARIES "${PYTHON_INSTALL_DIR}/libs/python27.lib" CACHE FILEPATH "Python library" FORCE)
-        SET(PYTHON_EXECUTABLE "${PYTHON_INSTALL_DIR}/bin/python.exe" CACHE FILEPATH "Python executable" FORCE)
-        SET(PY_SITE_PACKAGES_PATH "${PYTHON_INSTALL_DIR}/Lib/site-packages" CACHE PATH "Python site-packages path" FORCE)
-    ELSE()
-        MESSAGE(FATAL_ERROR "Unknown system !")
-    ENDIF()
-
-    IF(APPLE)
-        LIST(APPEND EXTERNAL_PROJECT_OPTIONAL_CMAKE_ARGS
-            -DCMAKE_BUILD_WITH_INSTALL_RPATH:BOOL=ON
-            )
-    ENDIF()
-
-    SET(EXTERNAL_PROJECT_OPTIONAL_CMAKE_CACHE_ARGS)
-
-    # Force Python build to "Release".
-    IF(CMAKE_CONFIGURATION_TYPES)
-        SET(SAVED_CMAKE_CFG_INTDIR ${CMAKE_CFG_INTDIR})
-        SET(CMAKE_CFG_INTDIR "Release")
-    ELSE()
-        LIST(APPEND EXTERNAL_PROJECT_OPTIONAL_CMAKE_CACHE_ARGS
-            -DCMAKE_BUILD_TYPE:STRING=Release
-            )
-    ENDIF()
-
-    ExternalProject_Add(python
-        ${EXTERNAL_PROJECT_LOG_ARGS}
-        GIT_REPOSITORY    "https://github.com/python-cmake-buildsystem/python-cmake-buildsystem.git"
-        PREFIX            ${PYTHON_SOURCES_DIR}
-        UPDATE_COMMAND    ""
-        CMAKE_ARGS        -DPYTHON_VERSION=2.7.12
-        CMAKE_ARGS        -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-        CMAKE_ARGS        -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-        CMAKE_CACHE_ARGS
-            -DCMAKE_INSTALL_PREFIX:PATH=${PYTHON_INSTALL_DIR}
-            -DBUILD_LIBPYTHON_SHARED:BOOL=OFF
-            -DUSE_SYSTEM_LIBRARIES:BOOL=OFF
-            -DZLIB_ROOT:FILEPATH=${ZLIB_ROOT}
-            -DZLIB_INCLUDE_DIR:PATH=${ZLIB_INCLUDE_DIR}
-            -DZLIB_LIBRARY:FILEPATH=${ZLIB_LIBRARIES}
-            -DDOWNLOAD_SOURCES:BOOL=ON
-            -DINSTALL_WINDOWS_TRADITIONAL:BOOL=OFF
-            ${EXTERNAL_PROJECT_OPTIONAL_CMAKE_CACHE_ARGS}
-            ${EXTERNAL_PROJECT_OPTIONAL_CMAKE_ARGS}
-        DEPENDS zlib
-    )
-
-    SET(py_env
-        PATH=${PYTHON_INSTALL_DIR}/bin
-        PYTHONHOME=${PYTHON_INSTALL_DIR}
-        PYTHONPATH=${PYTHON_INSTALL_DIR}/lib:${PYTHON_INSTALL_DIR}/lib/python2.7:${PY_SITE_PACKAGES_PATH})
-    ####################################################################################
-
-    ##################################### SETUPTOOLS ###################################
-    SET(SETUPTOOLS_SOURCES_DIR ${PYTHON_SOURCES_DIR}/setuptools)
-    ExternalProject_Add(setuptools
-        ${EXTERNAL_PROJECT_LOG_ARGS}
-        PREFIX              ${SETUPTOOLS_SOURCES_DIR}
-        URL                 "https://pypi.python.org/packages/source/s/setuptools/setuptools-18.3.2.tar.gz"
-        BUILD_IN_SOURCE     1
-        PATCH_COMMAND       ""
-        UPDATE_COMMAND      ""
-        CONFIGURE_COMMAND   ""
-        INSTALL_COMMAND     ""
-        BUILD_COMMAND       env ${py_env} ${PYTHON_EXECUTABLE} setup.py install
-        DEPENDS             python zlib
-    )
-    #####################################################################################
-
-    ##################################### SIX ###########################################
-    SET(SIX_SOURCES_DIR ${PYTHON_SOURCES_DIR}/six)
-    ExternalProject_Add(six
-        ${EXTERNAL_PROJECT_LOG_ARGS}
-        PREFIX              ${SIX_SOURCES_DIR}
-        URL                 https://pypi.python.org/packages/source/s/six/six-1.10.0.tar.gz
-        BUILD_IN_SOURCE     1
-        PATCH_COMMAND       ""
-        UPDATE_COMMAND      ""
-        CONFIGURE_COMMAND   ""
-        INSTALL_COMMAND     ""
-        BUILD_COMMAND       env ${py_env} ${PYTHON_EXECUTABLE} setup.py install
-        DEPENDS             python setuptools
-    )
-    #####################################################################################
-
-    ##################################### CYTHON ########################################
-    SET(CYTHON_SOURCES_DIR ${PYTHON_SOURCES_DIR}/cython)
-    ExternalProject_Add(cython
-        ${EXTERNAL_PROJECT_LOG_ARGS}
-        PREFIX                ${CYTHON_SOURCES_DIR}
-        URL                   https://github.com/cython/cython/archive/0.25.2.tar.gz
-        GIT_TAG               0.25.2
-        BUILD_IN_SOURCE       1
-        CONFIGURE_COMMAND     ""
-        PATCH_COMMAND         ""
-        UPDATE_COMMAND        ""
-        INSTALL_COMMAND       ""
-        BUILD_COMMAND         env ${py_env} ${PYTHON_EXECUTABLE} setup.py install
-        DEPENDS               python
-    )
-    ####################################################################################
-
-    ##################################### NUMPY ########################################
-    SET(NUMPY_SOURCES_DIR ${PYTHON_SOURCES_DIR}/numpy)
-    SET(NUMPY_TAG_VERSION "v1.11.3")
-    SET(NUMPY_VERSION "1.11.3")
-
-    SET(EGG_NAME "")
-    SET(PYTHON_NUMPY_INCLUDE_DIR "")
-    IF(WIN32)
-        SET(EGG_NAME "numpy-${NUMPY_VERSION}-py2.7-${HOST_SYSTEM}.egg")
-    ELSE(WIN32)
-        IF(APPLE)
-            SET(EGG_NAME "numpy-${NUMPY_VERSION}-py2.7-${HOST_SYSTEM}-${MACOS_VERSION}")
-        ELSE(APPLE)
-            SET(EGG_NAME "numpy-${NUMPY_VERSION}-py2.7-linux")
-            SET(EGG_NAME "numpy-${NUMPY_VERSION}-py2.7-linux")
-        ENDIF(APPLE)
-
-        FOREACH(suffix x86_64 intel fat64 fat32 universal)
-            LIST(APPEND PYTHON_NUMPY_INCLUDE_DIR ${PY_SITE_PACKAGES_PATH}/${EGG_NAME}-${suffix}.egg/numpy/core/include)
-        ENDFOREACH()
-    ENDIF(WIN32)
-
-    ExternalProject_Add(numpy
-        ${EXTERNAL_PROJECT_LOG_ARGS}
-        GIT_REPOSITORY      https://github.com/numpy/numpy.git
-        GIT_TAG             ${NUMPY_TAG_VERSION}
-        CONFIGURE_COMMAND   ""
-        UPDATE_COMMAND      ""
-        PREFIX              ${NUMPY_SOURCES_DIR}
-        BUILD_COMMAND       env ${py_env} ${PYTHON_EXECUTABLE} setup.py build
-        INSTALL_COMMAND     env ${py_env} ${PYTHON_EXECUTABLE} setup.py install
-        BUILD_IN_SOURCE     1
-        DEPENDS             python setuptools cython
-    )
-    ####################################################################################
-
-    ##################################### WHEEL ########################################
-    SET(WHEEL_SOURCES_DIR ${PYTHON_SOURCES_DIR}/wheel)
-    ExternalProject_Add(wheel
-        ${EXTERNAL_PROJECT_LOG_ARGS}
-        URL                 https://pypi.python.org/packages/source/w/wheel/wheel-0.29.0.tar.gz
-        PREFIX              ${WHEEL_SOURCES_DIR}
-        CONFIGURE_COMMAND   ""
-        UPDATE_COMMAND      ""
-        BUILD_COMMAND       ""
-        INSTALL_COMMAND     env ${py_env} ${PYTHON_EXECUTABLE} setup.py install
-        BUILD_IN_SOURCE     1
-        DEPENDS             python setuptools
-    )
-    ####################################################################################
-
-    ################################### PROTOBUF #######################################
-    SET(PY_PROTOBUF_SOURCES_DIR ${PYTHON_SOURCES_DIR}/protobuf)
-    ExternalProject_Add(python-protobuf
-        ${EXTERNAL_PROJECT_LOG_ARGS}
-        URL                   https://pypi.python.org/packages/e0/b0/0a1b364fe8a7d177b4b7d4dca5b798500dc57a7273b93cca73931b305a6a/protobuf-3.1.0.post1.tar.gz
-        URL_MD5               38b5fb160c768d2f8444d0c6d637ff91
-        PREFIX                ${PY_PROTOBUF_SOURCES_DIR}
-        BUILD_IN_SOURCE       1
-        PATCH_COMMAND         ""
-        CONFIGURE_COMMAND     ""
-        BUILD_COMMAND         env ${py_env} ${PYTHON_EXECUTABLE} setup.py build
-        INSTALL_COMMAND       env ${py_env} ${PYTHON_EXECUTABLE} setup.py install
-        DEPENDS               python setuptools six
-    )
-    ####################################################################################
-
-    LIST(APPEND external_project_dependencies python setuptools six cython wheel python-protobuf numpy)
-
 ENDIF(PYTHONINTERP_FOUND)
 
 IF(WITH_PYTHON)

From 0c70f34c60845f08563f031ce815c1d565dfab6b Mon Sep 17 00:00:00 2001
From: dangqingqing <dangqingqing@baidu.com>
Date: Fri, 30 Jun 2017 16:59:52 +0800
Subject: [PATCH 78/86] Fix bug for flowers dataset and row_conv.

---
 python/paddle/trainer/config_parser.py          |  4 ++--
 .../configs/protostr/test_row_conv.protostr     |  2 +-
 python/paddle/v2/dataset/flowers.py             | 17 +++++++++++------
 3 files changed, 14 insertions(+), 9 deletions(-)

diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py
index 58e4902f57..b7418101d8 100644
--- a/python/paddle/trainer/config_parser.py
+++ b/python/paddle/trainer/config_parser.py
@@ -2082,10 +2082,10 @@ class MaxOutLayer(LayerBase):
 class RowConvLayer(LayerBase):
     def __init__(self, name, inputs, context_length, **xargs):
         super(RowConvLayer, self).__init__(
-            name, 'maxout', 0, inputs=inputs, **xargs)
+            name, 'row_conv', 0, inputs=inputs, **xargs)
         config_assert(
             len(self.inputs) == 1,
-            'TransLayer must have one and only one input')
+            'row convolution layer must have one and only one input.')
         input_layer = self.get_input_layer(0)
         row_conv_conf = self.config.inputs[0].row_conv_conf
         row_conv_conf.context_length = context_length
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_row_conv.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_row_conv.protostr
index 9ec15d2a19..19c9f16574 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_row_conv.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_row_conv.protostr
@@ -7,7 +7,7 @@ layers {
 }
 layers {
   name: "__row_conv_layer_0__"
-  type: "maxout"
+  type: "row_conv"
   size: 2560
   active_type: "relu"
   inputs {
diff --git a/python/paddle/v2/dataset/flowers.py b/python/paddle/v2/dataset/flowers.py
index 158cfe158c..ef92fec75f 100644
--- a/python/paddle/v2/dataset/flowers.py
+++ b/python/paddle/v2/dataset/flowers.py
@@ -30,6 +30,7 @@ http://www.robots.ox.ac.uk/~vgg/publications/papers/nilsback08.{pdf,ps.gz}.
 """
 import cPickle
 import itertools
+import functools
 from common import download
 import tarfile
 import scipy.io as scio
@@ -54,21 +55,25 @@ TEST_FLAG = 'trnid'
 VALID_FLAG = 'valid'
 
 
-def default_mapper(sample):
+def default_mapper(is_train, sample):
     '''
     map image bytes data to type needed by model input layer
     '''
     img, label = sample
     img = load_image_bytes(img)
-    img = simple_transform(img, 256, 224, True)
+    img = simple_transform(img, 256, 224, is_train)
     return img.flatten().astype('float32'), label
 
 
+train_mapper = functools.partial(default_mapper, True)
+test_mapper = functools.partial(default_mapper, False)
+
+
 def reader_creator(data_file,
                    label_file,
                    setid_file,
                    dataset_name,
-                   mapper=default_mapper,
+                   mapper,
                    buffered_size=1024,
                    use_xmap=True):
     '''
@@ -118,7 +123,7 @@ def reader_creator(data_file,
         return map_readers(mapper, reader)
 
 
-def train(mapper=default_mapper, buffered_size=1024, use_xmap=True):
+def train(mapper=train_mapper, buffered_size=1024, use_xmap=True):
     '''
     Create flowers training set reader.
     It returns a reader, each sample in the reader is
@@ -141,7 +146,7 @@ def train(mapper=default_mapper, buffered_size=1024, use_xmap=True):
         buffered_size, use_xmap)
 
 
-def test(mapper=default_mapper, buffered_size=1024, use_xmap=True):
+def test(mapper=test_mapper, buffered_size=1024, use_xmap=True):
     '''
     Create flowers test set reader.
     It returns a reader, each sample in the reader is
@@ -164,7 +169,7 @@ def test(mapper=default_mapper, buffered_size=1024, use_xmap=True):
         buffered_size, use_xmap)
 
 
-def valid(mapper=default_mapper, buffered_size=1024, use_xmap=True):
+def valid(mapper=test_mapper, buffered_size=1024, use_xmap=True):
     '''
     Create flowers validation set reader.
     It returns a reader, each sample in the reader is

From 9bb33f27f8313c2515b6dbfcfe8352b4a2c3bef6 Mon Sep 17 00:00:00 2001
From: qijun <qijun1994@hotmail.com>
Date: Fri, 30 Jun 2017 17:10:40 +0800
Subject: [PATCH 79/86] fix input shape of train_y

---
 doc/getstarted/concepts/src/train.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/getstarted/concepts/src/train.py b/doc/getstarted/concepts/src/train.py
index 679d0a931a..7e604f23de 100644
--- a/doc/getstarted/concepts/src/train.py
+++ b/doc/getstarted/concepts/src/train.py
@@ -31,7 +31,7 @@ def event_handler(event):
 # define training dataset reader
 def train_reader():
     train_x = np.array([[1, 1], [1, 2], [3, 4], [5, 2]])
-    train_y = np.array([-2, -3, -7, -7])
+    train_y = np.array([[-2], [-3], [-7], [-7]])
 
     def reader():
         for i in xrange(train_y.shape[0]):

From 9e445eca89ae936ec82034c21b8311ccecdfc0ef Mon Sep 17 00:00:00 2001
From: liaogang <liaogang@baidu.com>
Date: Fri, 30 Jun 2017 17:16:59 +0800
Subject: [PATCH 80/86] FIX: Replace static libs check via system warning

---
 cmake/generic.cmake | 19 ++++++-------------
 1 file changed, 6 insertions(+), 13 deletions(-)

diff --git a/cmake/generic.cmake b/cmake/generic.cmake
index 03dabe7283..88be13b2ac 100644
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -101,23 +101,16 @@ function(merge_static_libs TARGET_NAME)
 
   # First get the file names of the libraries to be merged
   foreach(lib ${libs})
-    get_target_property(libtype ${lib} TYPE)
-    if(NOT libtype STREQUAL "STATIC_LIBRARY")
-      message(FATAL_ERROR "merge_static_libs can only process static libraries")
-    endif()
     set(libfiles ${libfiles} $<TARGET_FILE:${lib}>)
   endforeach()
 
   if(APPLE) # Use OSX's libtool to merge archives
-    add_custom_target(${TARGET_NAME}_archive
-      COMMAND libtool -static -o "${CMAKE_CURRENT_BINARY_DIR}/lib${TARGET_NAME}.a" ${libfiles}
-      WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
-      DEPENDS ${libs}
-      )
-    add_library(${TARGET_NAME} STATIC IMPORTED GLOBAL)
-    set_property(TARGET ${TARGET_NAME} PROPERTY
-      IMPORTED_LOCATION "${CMAKE_CURRENT_BINARY_DIR}/lib${TARGET_NAME}.a")
-    add_dependencies(${TARGET_NAME} ${TARGET_NAME}_archive)
+    set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}_dummy.c)
+    file(WRITE ${dummyfile} "const char * dummy = \"${dummyfile}\";")
+    add_library(${TARGET_NAME} STATIC ${dummyfile})
+		add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
+      COMMAND rm "${CMAKE_CURRENT_BINARY_DIR}/lib${TARGET_NAME}.a"
+      COMMAND /usr/bin/libtool -static -o "${CMAKE_CURRENT_BINARY_DIR}/lib${TARGET_NAME}.a" ${libfiles})
 	else() # general UNIX: use "ar" to extract objects and re-add to a common lib
     foreach(lib ${libs})
       set(objlistfile ${lib}.objlist) # list of objects in the input library

From e287034d73109a652a47b4f5132b5366f251711f Mon Sep 17 00:00:00 2001
From: dangqingqing <dangqingqing@baidu.com>
Date: Fri, 30 Jun 2017 17:29:07 +0800
Subject: [PATCH 81/86] minus mean in flowers dataset.

---
 python/paddle/v2/dataset/flowers.py |  3 ++-
 python/paddle/v2/image.py           | 26 ++++++++++++++++++++++----
 2 files changed, 24 insertions(+), 5 deletions(-)

diff --git a/python/paddle/v2/dataset/flowers.py b/python/paddle/v2/dataset/flowers.py
index ef92fec75f..fb9062fbb4 100644
--- a/python/paddle/v2/dataset/flowers.py
+++ b/python/paddle/v2/dataset/flowers.py
@@ -61,7 +61,8 @@ def default_mapper(is_train, sample):
     '''
     img, label = sample
     img = load_image_bytes(img)
-    img = simple_transform(img, 256, 224, is_train)
+    img = simple_transform(
+        img, 256, 224, is_train, mean=[103.94, 116.78, 123, 68])
     return img.flatten().astype('float32'), label
 
 
diff --git a/python/paddle/v2/image.py b/python/paddle/v2/image.py
index 0d648e9ae6..965d965335 100644
--- a/python/paddle/v2/image.py
+++ b/python/paddle/v2/image.py
@@ -262,7 +262,12 @@ def left_right_flip(im):
         return im[:, ::-1, :]
 
 
-def simple_transform(im, resize_size, crop_size, is_train, is_color=True):
+def simple_transform(im,
+                     resize_size,
+                     crop_size,
+                     is_train,
+                     is_color=True,
+                     mean=None):
     """
     Simply data argumentation for training. These operations include
     resizing, croping and flipping.
@@ -288,7 +293,19 @@ def simple_transform(im, resize_size, crop_size, is_train, is_color=True):
             im = left_right_flip(im)
     else:
         im = center_crop(im, crop_size)
-    im = to_chw(im)
+    if len(im.shape) == 3:
+        im = to_chw(im)
+
+    im = im.astype('float32')
+    if mean is not None:
+        mean = np.array(mean, dtype=np.float32)
+        # mean may be given as one value per channel
+        if mean.ndim == 1:
+            mean = mean[:, np.newaxis, np.newaxis]
+        else:
+            # elementwise mean
+            assert len(mean.shape) == len(im)
+        im -= mean
 
     return im
 
@@ -297,7 +314,8 @@ def load_and_transform(filename,
                        resize_size,
                        crop_size,
                        is_train,
-                       is_color=True):
+                       is_color=True,
+                       mean=None):
     """
     Load image from the input file `filename` and transform image for
     data argumentation. Please refer to the `simple_transform` interface
@@ -318,5 +336,5 @@ def load_and_transform(filename,
     :type is_train: bool
     """
     im = load_image(filename)
-    im = simple_transform(im, resize_size, crop_size, is_train, is_color)
+    im = simple_transform(im, resize_size, crop_size, is_train, is_color, mean)
     return im

From 0925681543ed8d2b50a67bd6695614a17fea9006 Mon Sep 17 00:00:00 2001
From: dangqingqing <dangqingqing@baidu.com>
Date: Fri, 30 Jun 2017 17:45:28 +0800
Subject: [PATCH 82/86] fix typo.

---
 python/paddle/v2/dataset/flowers.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/paddle/v2/dataset/flowers.py b/python/paddle/v2/dataset/flowers.py
index fb9062fbb4..e2a21e6e3e 100644
--- a/python/paddle/v2/dataset/flowers.py
+++ b/python/paddle/v2/dataset/flowers.py
@@ -62,7 +62,7 @@ def default_mapper(is_train, sample):
     img, label = sample
     img = load_image_bytes(img)
     img = simple_transform(
-        img, 256, 224, is_train, mean=[103.94, 116.78, 123, 68])
+        img, 256, 224, is_train, mean=[103.94, 116.78, 123.68])
     return img.flatten().astype('float32'), label
 
 

From b5514602b6019a4b30515079e4be17bf4276cb19 Mon Sep 17 00:00:00 2001
From: hedaoyuan <hedaoyuan@github.com>
Date: Fri, 30 Jun 2017 17:46:48 +0800
Subject: [PATCH 83/86] Add the use_nnpack parameter in ExpandConvLayer, so
 that the convolution calculation can be switched to the NNPACK function.

---
 paddle/function/nnpack/NNPACKConvOp.cpp   |  5 +-
 paddle/gserver/layers/ExpandConvLayer.cpp | 56 +++++++++++++++--------
 2 files changed, 40 insertions(+), 21 deletions(-)

diff --git a/paddle/function/nnpack/NNPACKConvOp.cpp b/paddle/function/nnpack/NNPACKConvOp.cpp
index d75fab0403..e8080c3d71 100644
--- a/paddle/function/nnpack/NNPACKConvOp.cpp
+++ b/paddle/function/nnpack/NNPACKConvOp.cpp
@@ -70,6 +70,9 @@ public:
     if (threadpool_) {
       pthreadpool_destroy(threadpool_);
     }
+    if (workspaceBuffer_) {
+      free(workspaceBuffer_);
+    }
   }
 
   virtual void check(const BufferArgs& inputs,
@@ -160,7 +163,7 @@ public:
         CHECK_EQ(status, nnp_status_success);
       }
 
-      LOG(INFO) << "workspace size is " << needSize;
+      VLOG(3) << "workspace size is " << needSize;
       if (needSize > workspaceSize_) {
         workspaceSize_ = needSize;
         if (workspaceBuffer_) {
diff --git a/paddle/gserver/layers/ExpandConvLayer.cpp b/paddle/gserver/layers/ExpandConvLayer.cpp
index 914689e66c..29e2113aff 100644
--- a/paddle/gserver/layers/ExpandConvLayer.cpp
+++ b/paddle/gserver/layers/ExpandConvLayer.cpp
@@ -16,6 +16,10 @@ limitations under the License. */
 #include "paddle/utils/Logging.h"
 #include "paddle/utils/Stat.h"
 
+DEFINE_bool(use_nnpack,
+            false,
+            "Whether to use nnpack for convolution calculation.");
+
 namespace paddle {
 
 /*
@@ -37,26 +41,38 @@ bool ExpandConvLayer::init(const LayerMap &layerMap,
   for (int i = 0; i < config_.inputs_size(); i++) {
     std::vector<size_t> paddings = {(size_t)paddingY_[i], (size_t)padding_[i]};
     std::vector<size_t> strides = {(size_t)strideY_[i], (size_t)stride_[i]};
-    createFunction(forward_,
-                   !isDeconv_ ? "GemmConv" : "GemmConvGradInput",
-                   FuncConfig()
-                       .set("paddings", paddings)
-                       .set("strides", strides)
-                       .set("groups", (size_t)groups_[i]));
-
-    createFunction(backward_,
-                   !isDeconv_ ? "GemmConvGradInput" : "GemmConv",
-                   FuncConfig()
-                       .set("paddings", paddings)
-                       .set("strides", strides)
-                       .set("groups", (size_t)groups_[i]));
-
-    createFunction(backward_,
-                   "GemmConvGradFilter",
-                   FuncConfig()
-                       .set("paddings", paddings)
-                       .set("strides", strides)
-                       .set("groups", (size_t)groups_[i]));
+
+    if (FLAGS_use_nnpack) {
+      CHECK_EQ(isDeconv_, false);
+      createFunction(forward_,
+                     "NNPACKConv",
+                     FuncConfig()
+                         .set("paddings", paddings)
+                         .set("strides", strides)
+                         .set("groups", (size_t)groups_[i])
+                         .set("algo", "auto"));
+    } else {
+      createFunction(forward_,
+                     !isDeconv_ ? "GemmConv" : "GemmConvGradInput",
+                     FuncConfig()
+                         .set("paddings", paddings)
+                         .set("strides", strides)
+                         .set("groups", (size_t)groups_[i]));
+
+      createFunction(backward_,
+                     !isDeconv_ ? "GemmConvGradInput" : "GemmConv",
+                     FuncConfig()
+                         .set("paddings", paddings)
+                         .set("strides", strides)
+                         .set("groups", (size_t)groups_[i]));
+
+      createFunction(backward_,
+                     "GemmConvGradFilter",
+                     FuncConfig()
+                         .set("paddings", paddings)
+                         .set("strides", strides)
+                         .set("groups", (size_t)groups_[i]));
+    }
   }
   return true;
 }

From 47f1031fb7e0644ab2797343f818d32f1c45fa38 Mon Sep 17 00:00:00 2001
From: hedaoyuan <hedaoyuan@github.com>
Date: Fri, 30 Jun 2017 18:06:49 +0800
Subject: [PATCH 84/86] Modify the type of algo parameter.

---
 paddle/gserver/layers/ExpandConvLayer.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddle/gserver/layers/ExpandConvLayer.cpp b/paddle/gserver/layers/ExpandConvLayer.cpp
index 29e2113aff..af79e65a7c 100644
--- a/paddle/gserver/layers/ExpandConvLayer.cpp
+++ b/paddle/gserver/layers/ExpandConvLayer.cpp
@@ -50,7 +50,7 @@ bool ExpandConvLayer::init(const LayerMap &layerMap,
                          .set("paddings", paddings)
                          .set("strides", strides)
                          .set("groups", (size_t)groups_[i])
-                         .set("algo", "auto"));
+                         .set("algo", std::string("auto")));
     } else {
       createFunction(forward_,
                      !isDeconv_ ? "GemmConv" : "GemmConvGradInput",

From 260427d2df5398ab3dac0ea3b8d6c54e2aa087fb Mon Sep 17 00:00:00 2001
From: dongzhihong <dzhwinter@gmail.com>
Date: Fri, 30 Jun 2017 18:20:21 +0800
Subject: [PATCH 85/86] "fix copy go master lib2python"

---
 python/CMakeLists.txt | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt
index a9842152c8..361e764e25 100644
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -13,8 +13,11 @@ set(PY_FILES paddle/__init__.py
   ${V2_PY_FILES})
 
 add_custom_target(copy_paddle_master)
+
+SET(COPY_PADDLE_MASTER "")
 if(WITH_GOLANG)
-  add_custom_command(TARGET copy_paddle_master
+  SET(COPY_PADDLE_MASTER "copy_paddle_master")
+  add_custom_command(TARGET ${COPY_PADDLE_MASTER}
     COMMAND cp ${paddle_master_LIB_PATH} ${PROJ_ROOT}/python/paddle/v2/master/
     )
   add_dependencies(copy_paddle_master paddle_master)
@@ -26,7 +29,7 @@ configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in
 add_custom_command(OUTPUT ${OUTPUT_DIR}/.timestamp
     COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel
     COMMAND ${CMAKE_COMMAND} -E touch ${OUTPUT_DIR}/.timestamp
-    DEPENDS gen_proto_py ${PY_FILES} ${external_project_dependencies} copy_paddle_master)
+    DEPENDS gen_proto_py ${PY_FILES} ${external_project_dependencies} ${COPY_PADDLE_MASTER})
 
 add_custom_target(paddle_python ALL DEPENDS
     ${OUTPUT_DIR}/.timestamp)

From 62e582e8109ff08089f72e88511162fe51ae031f Mon Sep 17 00:00:00 2001
From: Helin Wang <ustc.harry@gmail.com>
Date: Fri, 30 Jun 2017 18:23:46 +0000
Subject: [PATCH 86/86] polish wording and grammar.

---
 doc/design/cluster_train/save_model.md | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/doc/design/cluster_train/save_model.md b/doc/design/cluster_train/save_model.md
index 76ac8d8387..b70f00176b 100644
--- a/doc/design/cluster_train/save_model.md
+++ b/doc/design/cluster_train/save_model.md
@@ -15,13 +15,13 @@ ways from which user can obtain a model:
 ### Trainer Saving Model vs. Pservers Saving Model
 
 Both trainers and pservers have access to the model. So the model can
-be saved from a trainer or pservers. We need to decide on where the
-model is saved from.
+be saved from a trainer or pservers. We need to decide where the model
+is saved from.
 
 #### Dense Update vs. Sparse Update
 
 There are two types of model update methods: dense update and sparse
-update (when the parameter is configured to be sparse).
+update (when the model parameter is configured to be sparse).
 
 - Dense update
 
@@ -48,15 +48,15 @@ filesystem, making the checkpoint shards visible to the merge program.
 
 The benefit of letting one trainer to save the model is it does not
 require a distributed filesystem. And it's reusing the same save model
-logic when the trainer is training locally - except when doing sparse
-update, the trainer needs to download the entire model during the
-saving process.
+logic when training locally - except when doing sparse update, the
+trainer needs to download the entire model during the saving process.
 
 #### Conclusion
 
 Given trainer saving model does not require a distributed filesystem,
-and is an intuitive extension to training locally, we decide to let
-the trainer save the model.
+and is an intuitive extension to trainer saving model when training
+locally, we decide to let the trainer save the model when doing
+distributed training.
 
 
 ### Convert Model from Checkpoint
@@ -84,16 +84,16 @@ save the model.
 
 Each trainer will be given the directory to save the model. The
 elected trainer will save the model to
-`given-directory/trainerID`. Since the tainerID is unique, this would
-prevent concurrent save to the same file when multiple trainers are
-elected to save the model when split-brain problem happens.
+`given-directory/trainerID`. Since the trainer ID is unique, this
+prevents concurrent saves to the same file if multiple trainers are
+elected to save the model when a split-brain problem happens.
 
 ### What Happens When Model Is Saving
 
 It takes some time to save model, we need to define what will happen
 when save model is taking place.
 
-When saving a dense model, the trainer uses the local model. Pservers
+When doing dense update, the trainer uses the local model. Pservers
 does not need to pause model update.
 
 When doing sparse update. The trainer needs to download the entire
@@ -103,7 +103,7 @@ download finishes. Otherwise, the trainer gets a model that is
 "polluted": some part of the model is old, some part of the model is
 new.
 
-It's unclear that the "polluted" model will be inferiod due to the
+It's unclear that the "polluted" model will be inferior due to the
 stochastic nature of deep learning, and pausing the model update will
 add more complexity to the system. Since supporting sparse update is a
 TODO item. We defer the evaluation of pause the model update or not