From de8a8feeb48f981e5d75603eadc491b7eb139064 Mon Sep 17 00:00:00 2001 From: liaogang Date: Tue, 25 Jul 2017 17:34:59 +0800 Subject: [PATCH 01/15] ENH: Refine Tensor and And CopyFrom --- paddle/framework/CMakeLists.txt | 2 +- paddle/framework/detail/tensor-inl.h | 158 ++++++++++++++++++++++ paddle/framework/tensor.cc | 2 +- paddle/framework/tensor.h | 190 +++++++++++++-------------- paddle/framework/tensor_test.cc | 49 ++++--- paddle/platform/device_context.h | 2 +- 6 files changed, 277 insertions(+), 126 deletions(-) create mode 100644 paddle/framework/detail/tensor-inl.h diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt index 760d84e51e..dec4ab80d8 100644 --- a/paddle/framework/CMakeLists.txt +++ b/paddle/framework/CMakeLists.txt @@ -3,7 +3,7 @@ cc_library(ddim SRCS ddim.cc DEPS eigen3) cc_test(ddim_test SRCS ddim_test.cc DEPS ddim) nv_test(dim_test SRCS dim_test.cu DEPS ddim) -cc_library(tensor SRCS tensor.cc DEPS ddim place paddle_memory) +cc_library(tensor SRCS tensor.cc DEPS ddim place paddle_memory device_context) cc_test(tensor_test SRCS tensor_test.cc DEPS tensor) cc_test(eigen_test SRCS eigen_test.cc DEPS tensor) diff --git a/paddle/framework/detail/tensor-inl.h b/paddle/framework/detail/tensor-inl.h new file mode 100644 index 0000000000..4fc328d231 --- /dev/null +++ b/paddle/framework/detail/tensor-inl.h @@ -0,0 +1,158 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/memory/memcpy.h" + +namespace paddle { +namespace framework { + +template +inline void Tensor::check_memory_size() const { + PADDLE_ENFORCE(holder_ != nullptr, + "Tenosr holds no memory. Call Tensor::mutable_data first."); + PADDLE_ENFORCE(holder_->size() >= product(dims_) * sizeof(T) + offset_, + "Tensor's dims_ is out of bound. Call Tensor::mutable_data " + "first to re-allocate memory."); +} + +template +inline const T* Tensor::data() const { + check_memory_size(); + return reinterpret_cast( + reinterpret_cast(holder_->ptr()) + offset_); +} + +template +inline T* Tensor::data() { + check_memory_size(); + return reinterpret_cast(reinterpret_cast(holder_->ptr()) + + offset_); +} + +template +inline T* Tensor::mutable_data(DDim dims, platform::Place place) { + Resize(dims); + return mutable_data(place); +} + +template +inline T* Tensor::mutable_data(platform::Place place) { + PADDLE_ENFORCE(product(dims_) > 0, + "Tensor's numel must be larger than zero to call " + "Tensor::mutable_data. 
Call Tensor::set_dim first."); + /* some versions of boost::variant don't have operator!= */ + size_t size = product(dims_) * sizeof(T); + if (holder_ == nullptr || !(holder_->place() == place) || + holder_->size() < size + offset_) { + if (platform::is_cpu_place(place)) { + holder_.reset(new PlaceholderImpl( + boost::get(place), size)); + } +#ifndef PADDLE_ONLY_CPU + else if (platform::is_gpu_place(place)) { + holder_.reset(new PlaceholderImpl( + boost::get(place), size)); + } +#endif + offset_ = 0; + } + return reinterpret_cast(reinterpret_cast(holder_->ptr()) + + offset_); +} + +template +inline void Tensor::ShareDataWith(const Tensor& src) { + src.check_memory_size(); + *this = src; +} + +template +inline void Tensor::CopyFrom(const Tensor& src, + const platform::CPUDeviceContext& ctx) { + src.check_memory_size(); + Resize(src.dims()); + + auto src_place = src.holder_->place(); + auto src_ptr = static_cast(src.data()); + + auto dst_place = ctx.GetPlace(); + auto dst_ptr = static_cast(mutable_data(dst_place)); + + auto size = product(src.dims_) * sizeof(T); + + if (platform::is_cpu_place(src_place)) { + memory::Copy(boost::get(dst_place), dst_ptr, + boost::get(src_place), src_ptr, size); + } +#ifndef PADDLE_ONLY_CPU + else if (platform::is_gpu_place(src_place)) { + memory::Copy(boost::get(dst_place), dst_ptr, + boost::get(src_place), src_ptr, size, 0); + } +#endif +} + +#ifndef PADDLE_ONLY_CPU +template +inline void Tensor::CopyFrom(const Tensor& src, + const platform::CUDADeviceContext& ctx) { + src.check_memory_size(); + Resize(src.dims()); + + auto src_place = src.holder_->place(); + auto src_ptr = static_cast(src.data()); + + auto dst_place = ctx.GetPlace(); + auto dst_ptr = static_cast(mutable_data(dst_place)); + + auto size = product(src.dims_) * sizeof(T); + + if (platform::is_cpu_place(src_place)) { + memory::Copy(boost::get(dst_place), dst_ptr, + boost::get(src_place), src_ptr, size, + ctx.stream()); + } else if (platform::is_gpu_place(src_place)) { + memory::Copy(boost::get(dst_place), dst_ptr, + boost::get(src_place), src_ptr, size, + ctx.stream()); + } +} +#endif + +template +inline Tensor Tensor::Slice(const int& begin_idx, const int& end_idx) const { + check_memory_size(); + PADDLE_ENFORCE(begin_idx >= 0, "Slice begin index is less than zero."); + PADDLE_ENFORCE(end_idx <= dims_[0], "Slice end index is out of bound."); + PADDLE_ENFORCE(begin_idx < end_idx, + "Begin index must be less than end index."); + PADDLE_ENFORCE(dims_[0] != 1, "Can not slice a tensor with dims_[0] = 1."); + int base = product(dims_) / dims_[0]; + Tensor dst; + dst.holder_ = holder_; + DDim dst_dims = dims_; + dst_dims[0] = end_idx - begin_idx; + dst.Resize(dst_dims); + dst.offset_ = offset_ + begin_idx * base * sizeof(T); + return dst; +} + +inline void Tensor::Resize(const DDim& dims) { dims_ = dims; } + +inline const DDim& Tensor::dims() const { return dims_; } + +} // namespace framework +} // namespace paddle diff --git a/paddle/framework/tensor.cc b/paddle/framework/tensor.cc index 964f15ab66..ea7b2a1f7b 100644 --- a/paddle/framework/tensor.cc +++ b/paddle/framework/tensor.cc @@ -12,7 +12,7 @@ See the License for the specific language governing permissions and limitations under the License. */ -#include +#include "paddle/framework/tensor.h" namespace paddle { namespace framework {} diff --git a/paddle/framework/tensor.h b/paddle/framework/tensor.h index a36f375d2e..d3f56b31cd 100644 --- a/paddle/framework/tensor.h +++ b/paddle/framework/tensor.h @@ -20,6 +20,7 @@ limitations under the License. 
*/ #include #include "paddle/framework/ddim.h" #include "paddle/memory/memory.h" +#include "paddle/platform/device_context.h" #include "paddle/platform/enforce.h" #include "paddle/platform/place.h" #include "unsupported/Eigen/CXX11/Tensor" @@ -31,9 +32,11 @@ template struct CastToPyBufferImpl; } // namespace details } // namespace pybind + namespace framework { class Tensor { + public: template friend struct paddle::pybind::details::CastToPyBufferImpl; @@ -46,106 +49,84 @@ class Tensor { public: Tensor() : offset_(0) {} + /*! Return a pointer to mutable memory block. */ template - const T* data() const { - EnforceSufficientMemory(); - return reinterpret_cast( - reinterpret_cast(holder_->ptr()) + offset_); - } + inline T* data(); + /*! Return a pointer to constant memory block. */ template - T* data() { - EnforceSufficientMemory(); - return reinterpret_cast(reinterpret_cast(holder_->ptr()) + - offset_); - } - - template ::value>::type* = nullptr> - T* mutable_data(DDim dims, platform::Place place) { - Resize(dims); - return mutable_data(place); - } - - template ::value>::type* = nullptr> - T* mutable_data(platform::Place place) { - PADDLE_ENFORCE(product(dims_) > 0, - "Tensor's numel must be larger than zero to call " - "Tensor::mutable_data. Call Tensor::set_dim first."); - if (holder_ == nullptr || - !(holder_->place() == - place) /* some versions of boost::variant don't have operator!= */ - || holder_->size() < product(dims_) * sizeof(T) + offset_) { - if (platform::is_cpu_place(place)) { - holder_.reset(new PlaceholderImpl( - boost::get(place), product(dims_) * sizeof(T))); - } else if (platform::is_gpu_place(place)) { -#ifdef PADDLE_ONLY_CPU - PADDLE_THROW("'GPUPlace' is not supported in CPU only device."); -#else - holder_.reset(new PlaceholderImpl( - boost::get(place), product(dims_) * sizeof(T))); -#endif - } else { - PADDLE_THROW("Unknown 'place'."); - } - offset_ = 0; - } - return reinterpret_cast(reinterpret_cast(holder_->ptr()) + - offset_); - } + inline const T* data() const; + + /** + * @brief Return a pointer to mutable memory block. + * @note If not exist, then allocation. + */ + template + inline T* mutable_data(platform::Place place); + + /** + * @brief Return a pointer to mutable memory block. + * + * @param[in] dims The dimensions of the memory block. + * @param[in] place The place of the memory block. + * + * @note If not exist, then allocation. + */ + template + inline T* mutable_data(DDim dims, platform::Place place); + /*! Return the dimensions of the memory block. */ + inline const DDim& dims() const; + + /*! Resize the dimensions of the memory block. */ + inline void Resize(const DDim& dims); + + /*! The internal of two tensors share the same memory block. */ + template + inline void ShareDataWith(const Tensor& src); + + /** + * @brief Copy the content of external tensor to a new place. + * + * @param[in] src The external tensor. + * @param[in] ctx The device context contains place where to store. + * + * @note CopyFrom supports CPU <-> GPU, GPU <-> GPU. 
+ */ template - void ShareDataWith(const Tensor& src) { - src.EnforceSufficientMemory(); - *this = src; - } + inline void CopyFrom(const Tensor& src, + const platform::CPUDeviceContext& ctx); +#ifndef PADDLE_ONLY_CPU template - void CopyFrom(const Tensor& src, platform::Place dst_place) { - PADDLE_ENFORCE(platform::is_cpu_place(src.holder_->place()) && - platform::is_cpu_place(dst_place), - "Tensor::CopyFrom only support CPU now."); - src.EnforceSufficientMemory(); - size_t size = product(src.dims_) * sizeof(T); - Resize(src.dims()); - const void* src_ptr = static_cast(src.data()); - void* dst_ptr = static_cast(mutable_data(dst_place)); - memcpy(dst_ptr, src_ptr, size); - } + inline void CopyFrom(const Tensor& src, + const platform::CUDADeviceContext& ctx); +#endif + /** + * @brief Return the slice of the tensor. + * + * @param[in] begin_idx The begin index of the slice. + * @param[in] end_idx The end index of the slice. + */ template - Tensor Slice(const int& begin_idx, const int& end_idx) const { - EnforceSufficientMemory(); - PADDLE_ENFORCE(begin_idx >= 0, "Slice begin index is less than zero."); - PADDLE_ENFORCE(end_idx <= dims_[0], "Slice end index is out of bound."); - PADDLE_ENFORCE(begin_idx < end_idx, - "Begin index must be less than end index."); - PADDLE_ENFORCE(dims_[0] != 1, "Can not slice a tensor with dims_[0] = 1."); - int base = product(dims_) / dims_[0]; - Tensor dst; - dst.holder_ = holder_; - DDim dst_dims = dims_; - dst_dims[0] = end_idx - begin_idx; - dst.Resize(dst_dims); - dst.offset_ = offset_ + begin_idx * base * sizeof(T); - return dst; - } - - void Resize(const DDim& dims) { dims_ = dims; } - - const DDim& dims() const { return dims_; } + inline Tensor Slice(const int& begin_idx, const int& end_idx) const; private: - // Placeholder hides type T, so it doesn't appear as a template - // parameter of Variable. + template + inline void check_memory_size() const; + + private: + /** + * @note Placeholder hides type T, so it doesn't appear as a template + * parameter of Variable. + */ struct Placeholder { virtual ~Placeholder() {} virtual void* ptr() const = 0; - virtual platform::Place place() const = 0; virtual size_t size() const = 0; virtual std::type_index type() const = 0; + virtual platform::Place place() const = 0; }; template @@ -156,33 +137,38 @@ class Tensor { place_(place), size_(size) {} - virtual void* ptr() const { return static_cast(ptr_.get()); } virtual size_t size() const { return size_; } - virtual paddle::platform::Place place() const { return place_; } + virtual platform::Place place() const { return place_; } + virtual void* ptr() const { return static_cast(ptr_.get()); } virtual std::type_index type() const { return std::type_index(typeid(T)); } + /*! the pointer of memory block. */ std::unique_ptr> ptr_; - platform::Place place_; // record the place of ptr_. - size_t size_; // size of the memory block. + + /*! the place of memory block. */ + platform::Place place_; + + /*! the size of memory block. */ + size_t size_; }; - template - inline void EnforceSufficientMemory() const { - PADDLE_ENFORCE(holder_ != nullptr, - "Tenosr holds no memory. Call Tensor::mutable_data first."); - PADDLE_ENFORCE(holder_->size() >= product(dims_) * sizeof(T) + offset_, - "Tensor's dims_ is out of bound. Call Tensor::mutable_data " - "first to re-allocate memory."); - } - - std::shared_ptr holder_; // holds the memory block if allocated. + /*! holds the memory block if allocated. */ + std::shared_ptr holder_; + + /*! points to dimensions of memory block. 
*/ DDim dims_; - // A PlaceHolder may be shared by more than one tensor. Some of them may be - // slices of the others. So the offset_ is introduced here to indicate the - // byte offset between PlaceHolder::ptr_ and where tensor's data really - // begins. + + /** + * @brief A PlaceHolder may be shared by more than one tensor. + * + * @note Some of them may be slices of the others. So the offset_ + * is introduced here to indicate the byte offset between + * PlaceHolder::ptr_ and where the tensor data really begins. + */ size_t offset_; }; } // namespace framework } // namespace paddle + +#include "paddle/framework/detail/tensor-inl.h" diff --git a/paddle/framework/tensor_test.cc b/paddle/framework/tensor_test.cc index 089844dc01..7987d335ac 100644 --- a/paddle/framework/tensor_test.cc +++ b/paddle/framework/tensor_test.cc @@ -72,7 +72,8 @@ TEST(Tensor, MutableData) { p2 = src_tensor.mutable_data(make_ddim({2, 2}), CPUPlace()); EXPECT_EQ(p1, p2); } -#ifdef __CUDACC__ + +#ifndef PADDLE_ONLY_CPU { Tensor src_tensor; float* p1 = nullptr; @@ -123,7 +124,7 @@ TEST(Tensor, ShareDataWith) { ASSERT_EQ(src_tensor.data(), dst_tensor.data()); } -#ifdef __CUDACC__ +#ifndef PADDLE_ONLY_CPU { Tensor src_tensor; Tensor dst_tensor; @@ -160,7 +161,7 @@ TEST(Tensor, Slice) { EXPECT_EQ(src_data_address + 3 * 4 * 1 * sizeof(int), slice_data_address); } -#ifdef __CUDACC__ +#ifndef PADDLE_ONLY_CPU { Tensor src_tensor; src_tensor.mutable_data(make_ddim({6, 9}), GPUPlace()); @@ -188,25 +189,31 @@ TEST(Tensor, Slice) { TEST(Tensor, CopyFrom) { using namespace paddle::framework; using namespace paddle::platform; + { + Tensor src_tensor; + Tensor dst_tensor; - Tensor src_tensor; - int* src_ptr = src_tensor.mutable_data(make_ddim({3, 3}), CPUPlace()); - int arr[9] = {1, 2, 3, 4, 5, 6, 7, 8, 9}; - memcpy(src_ptr, arr, 9 * sizeof(int)); - Tensor dst_tensor; - dst_tensor.CopyFrom(src_tensor, CPUPlace()); - const int* dst_ptr = dst_tensor.data(); - ASSERT_NE(src_ptr, dst_ptr); - for (size_t i = 0; i < 9; ++i) { - EXPECT_EQ(src_ptr[i], dst_ptr[i]); - } + int* src_ptr = src_tensor.mutable_data(make_ddim({3, 3}), CPUPlace()); - Tensor slice_tensor = src_tensor.Slice(1, 2); - dst_tensor.CopyFrom(slice_tensor, CPUPlace()); - const int* slice_ptr = slice_tensor.data(); - dst_ptr = dst_tensor.data(); - ASSERT_NE(dst_ptr, slice_ptr); - for (size_t i = 0; i < 3; ++i) { - EXPECT_EQ(dst_ptr[i], slice_ptr[i]); + int arr[9] = {1, 2, 3, 4, 5, 6, 7, 8, 9}; + memcpy(src_ptr, arr, 9 * sizeof(int)); + + auto cpu_ctx = paddle::platform::CPUDeviceContext(); + dst_tensor.CopyFrom(src_tensor, cpu_ctx); + + const int* dst_ptr = dst_tensor.data(); + ASSERT_NE(src_ptr, dst_ptr); + for (size_t i = 0; i < 9; ++i) { + EXPECT_EQ(src_ptr[i], dst_ptr[i]); + } + + Tensor slice_tensor = src_tensor.Slice(1, 2); + dst_tensor.CopyFrom(slice_tensor, cpu_ctx); + const int* slice_ptr = slice_tensor.data(); + dst_ptr = dst_tensor.data(); + ASSERT_NE(dst_ptr, slice_ptr); + for (size_t i = 0; i < 3; ++i) { + EXPECT_EQ(dst_ptr[i], slice_ptr[i]); + } } } diff --git a/paddle/platform/device_context.h b/paddle/platform/device_context.h index fe6f13e399..5a366dccdc 100644 --- a/paddle/platform/device_context.h +++ b/paddle/platform/device_context.h @@ -87,7 +87,7 @@ class CUDADeviceContext : public DeviceContext { "cudaStreamSynchronize failed"); } - cudaStream_t stream() { return stream_; } + cudaStream_t stream() const { return stream_; } Eigen::GpuDevice* eigen_device() const { return eigen_device_.get(); } From 4e94cd757c824613b6b74275c4f4c31cc80bfa0b Mon Sep 17 
00:00:00 2001 From: liaogang Date: Tue, 25 Jul 2017 17:46:20 +0800 Subject: [PATCH 02/15] FIX: restricting c++ template usage to POD types --- paddle/framework/detail/tensor-inl.h | 2 ++ paddle/memory/memory.h | 6 +++--- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/paddle/framework/detail/tensor-inl.h b/paddle/framework/detail/tensor-inl.h index 4fc328d231..2acae1b0e2 100644 --- a/paddle/framework/detail/tensor-inl.h +++ b/paddle/framework/detail/tensor-inl.h @@ -44,12 +44,14 @@ inline T* Tensor::data() { template inline T* Tensor::mutable_data(DDim dims, platform::Place place) { + static_assert(std::is_pod::value, "T must be POD"); Resize(dims); return mutable_data(place); } template inline T* Tensor::mutable_data(platform::Place place) { + static_assert(std::is_pod::value, "T must be POD"); PADDLE_ENFORCE(product(dims_) > 0, "Tensor's numel must be larger than zero to call " "Tensor::mutable_data. Call Tensor::set_dim first."); diff --git a/paddle/memory/memory.h b/paddle/memory/memory.h index 5e0d647072..fd4d5e7082 100644 --- a/paddle/memory/memory.h +++ b/paddle/memory/memory.h @@ -29,10 +29,10 @@ void Free(Place, void*); template size_t Used(Place); -template ::value>::type* = nullptr> +template class PODDeleter { + static_assert(std::is_pod::value, "T must be POD"); + public: PODDeleter(Place place) : place_(place) {} void operator()(T* ptr) { Free(place_, static_cast(ptr)); } From de8c344bda73cb0a6b1fa04037399bedb4c7d349 Mon Sep 17 00:00:00 2001 From: liaogang Date: Tue, 25 Jul 2017 18:03:14 +0800 Subject: [PATCH 03/15] ENH: for clarity, change DLOG to VLOG --- paddle/memory/detail/buddy_allocator.cc | 56 ++++++++++++------------- 1 file changed, 27 insertions(+), 29 deletions(-) diff --git a/paddle/memory/detail/buddy_allocator.cc b/paddle/memory/detail/buddy_allocator.cc index 27c1b4033b..bb44970109 100644 --- a/paddle/memory/detail/buddy_allocator.cc +++ b/paddle/memory/detail/buddy_allocator.cc @@ -27,12 +27,11 @@ BuddyAllocator::BuddyAllocator(SystemAllocator* system_allocator, system_allocator_(std::move(system_allocator)) {} BuddyAllocator::~BuddyAllocator() { - DLOG(INFO) << "BuddyAllocator Disconstructor makes sure that all of these " - "have actually been freed"; + VLOG(3) << "BuddyAllocator Disconstructor makes sure that all of these " + "have actually been freed"; while (!pool_.empty()) { auto block = static_cast(std::get<2>(*pool_.begin())); - DLOG(INFO) << "Free from block (" << block << ", " << max_chunk_size_ - << ")"; + VLOG(3) << "Free from block (" << block << ", " << max_chunk_size_ << ")"; system_allocator_->Free(block, max_chunk_size_, block->index(cache_)); cache_.invalidate(block); @@ -52,12 +51,11 @@ void* BuddyAllocator::Alloc(size_t unaligned_size) { // acquire the allocator lock std::lock_guard lock(mutex_); - DLOG(INFO) << "Allocate " << unaligned_size << " bytes from chunk size " - << size; + VLOG(3) << "Allocate " << unaligned_size << " bytes from chunk size " << size; // if the allocation is huge, send directly to the system allocator if (size > max_chunk_size_) { - DLOG(INFO) << "Allocate from system allocator."; + VLOG(3) << "Allocate from system allocator."; return SystemAlloc(size); } @@ -72,9 +70,9 @@ void* BuddyAllocator::Alloc(size_t unaligned_size) { return nullptr; } } else { - DLOG(INFO) << "Allocation from existing memory block " << std::get<2>(*it) - << " at address " - << reinterpret_cast(std::get<2>(*it))->data(); + VLOG(3) << "Allocation from existing memory block " << std::get<2>(*it) + << " at address " + << 
reinterpret_cast(std::get<2>(*it))->data(); } total_used_ += size; @@ -91,10 +89,10 @@ void BuddyAllocator::Free(void* p) { // Acquire the allocator lock std::lock_guard lock(mutex_); - DLOG(INFO) << "Free from address " << block; + VLOG(3) << "Free from address " << block; if (block->type(cache_) == MemoryBlock::HUGE_CHUNK) { - DLOG(INFO) << "Free directly from system allocator"; + VLOG(3) << "Free directly from system allocator"; system_allocator_->Free(block, block->total_size(cache_), block->index(cache_)); @@ -111,8 +109,8 @@ void BuddyAllocator::Free(void* p) { // Trying to merge the right buddy if (block->has_right_buddy(cache_)) { - DLOG(INFO) << "Merging this block " << block << " with its right buddy " - << block->right_buddy(cache_); + VLOG(3) << "Merging this block " << block << " with its right buddy " + << block->right_buddy(cache_); auto right_buddy = block->right_buddy(cache_); @@ -129,8 +127,8 @@ void BuddyAllocator::Free(void* p) { // Trying to merge the left buddy if (block->has_left_buddy(cache_)) { - DLOG(INFO) << "Merging this block " << block << " with its left buddy " - << block->left_buddy(cache_); + VLOG(3) << "Merging this block " << block << " with its left buddy " + << block->left_buddy(cache_); auto left_buddy = block->left_buddy(cache_); @@ -146,8 +144,8 @@ void BuddyAllocator::Free(void* p) { } // Dumping this block into pool - DLOG(INFO) << "Inserting free block (" << block << ", " - << block->total_size(cache_) << ")"; + VLOG(3) << "Inserting free block (" << block << ", " + << block->total_size(cache_) << ")"; pool_.insert( IndexSizeAddress(block->index(cache_), block->total_size(cache_), block)); @@ -166,7 +164,7 @@ void* BuddyAllocator::SystemAlloc(size_t size) { size_t index = 0; void* p = system_allocator_->Alloc(index, size); - DLOG(INFO) << "Allocated " << p << " from system allocator."; + VLOG(3) << "Allocated " << p << " from system allocator."; if (p == nullptr) return nullptr; @@ -192,8 +190,8 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool() { if (p == nullptr) return pool_.end(); - DLOG(INFO) << "Creating and inserting new block " << p - << " from system allocator"; + VLOG(3) << "Creating and inserting new block " << p + << " from system allocator"; static_cast(p)->init(cache_, MemoryBlock::FREE_CHUNK, index, max_chunk_size_, nullptr, nullptr); @@ -237,19 +235,19 @@ void* BuddyAllocator::SplitToAlloc(BuddyAllocator::PoolSet::iterator it, auto block = static_cast(std::get<2>(*it)); pool_.erase(it); - DLOG(INFO) << "Split block (" << block << ", " << block->total_size(cache_) - << ") into"; + VLOG(3) << "Split block (" << block << ", " << block->total_size(cache_) + << ") into"; block->split(cache_, size); - DLOG(INFO) << "Left block (" << block << ", " << block->total_size(cache_) - << ")"; + VLOG(3) << "Left block (" << block << ", " << block->total_size(cache_) + << ")"; block->set_type(cache_, MemoryBlock::ARENA_CHUNK); // the rest of memory if exist if (block->has_right_buddy(cache_)) { if (block->right_buddy(cache_)->type(cache_) == MemoryBlock::FREE_CHUNK) { - DLOG(INFO) << "Insert right block (" << block->right_buddy(cache_) << ", " - << block->right_buddy(cache_)->total_size(cache_) << ")"; + VLOG(3) << "Insert right block (" << block->right_buddy(cache_) << ", " + << block->right_buddy(cache_)->total_size(cache_) << ")"; pool_.insert( IndexSizeAddress(block->right_buddy(cache_)->index(cache_), @@ -276,7 +274,7 @@ void BuddyAllocator::CleanIdleFallBackAlloc() { return; } - DLOG(INFO) << "Return block " << block << " to 
fallback allocator."; + VLOG(3) << "Return block " << block << " to fallback allocator."; system_allocator_->Free(block, max_chunk_size_, block->index(cache_)); cache_.invalidate(block); @@ -312,7 +310,7 @@ void BuddyAllocator::CleanIdleNormalAlloc() { MemoryBlock* block = static_cast(std::get<2>(*pool)); - DLOG(INFO) << "Return block " << block << " to base allocator."; + VLOG(3) << "Return block " << block << " to base allocator."; system_allocator_->Free(block, max_chunk_size_, block->index(cache_)); cache_.invalidate(block); From 6a5793baba87c42dc1dd8cf7499855242ecc950d Mon Sep 17 00:00:00 2001 From: qijun Date: Wed, 26 Jul 2017 00:02:35 +0800 Subject: [PATCH 04/15] fix bug in OpKernel register macro --- paddle/framework/op_registry.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/paddle/framework/op_registry.h b/paddle/framework/op_registry.h index f16deae028..384f0f631d 100644 --- a/paddle/framework/op_registry.h +++ b/paddle/framework/op_registry.h @@ -403,15 +403,16 @@ class GradOpRegisterHelper { STATIC_ASSERT_GLOBAL_NAMESPACE( \ __reg_op_kernel_##type##_##DEVICE_TYPE##__, \ "REGISTER_OP_KERNEL must be in global namespace"); \ - struct __op_kernel_register__##type##__ { \ - __op_kernel_register__##type##__() { \ + struct __op_kernel_register__##type##__##DEVICE_TYPE##__ { \ + __op_kernel_register__##type##__##DEVICE_TYPE##__() { \ ::paddle::framework::OperatorWithKernel::OpKernelKey key; \ key.place_ = PlaceType(); \ ::paddle::framework::OperatorWithKernel::AllOpKernels()[#type][key] \ .reset(new __VA_ARGS__()); \ } \ }; \ - static __op_kernel_register__##type##__ __reg_kernel_##type##__; \ + static __op_kernel_register__##type##__##DEVICE_TYPE##__ \ + __reg_kernel_##type##__##DEVICE_TYPE##__; \ int __op_kernel_register_##type##_handle_##DEVICE_TYPE##__() { return 0; } // (type, KernelType) From 953eaadbf6b799272ae316fe3b87febf3c2dc185 Mon Sep 17 00:00:00 2001 From: qijun Date: Wed, 26 Jul 2017 00:07:20 +0800 Subject: [PATCH 05/15] fix bug in OpKernelKey equal method --- paddle/framework/operator.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/paddle/framework/operator.h b/paddle/framework/operator.h index f59314f828..0997983a03 100644 --- a/paddle/framework/operator.h +++ b/paddle/framework/operator.h @@ -199,7 +199,9 @@ class OperatorWithKernel : public OperatorBase { place_ = dev_ctx.GetPlace(); } - bool operator==(const OpKernelKey& o) const { return place_ == o.place_; } + bool operator==(const OpKernelKey& o) const { + return platform::places_are_same_class(place_, o.place_); + } }; struct OpKernelHash { From 1c68f119d54844bb2c0972570d6f9b9c835e2cfc Mon Sep 17 00:00:00 2001 From: liaogang Date: Wed, 26 Jul 2017 16:17:01 +0800 Subject: [PATCH 06/15] ENH: Add GPU CopyFrom Unit Test --- paddle/framework/tensor_test.cc | 49 +++++++++++++++++++++++++++++++-- 1 file changed, 46 insertions(+), 3 deletions(-) diff --git a/paddle/framework/tensor_test.cc b/paddle/framework/tensor_test.cc index 7987d335ac..27fc08e934 100644 --- a/paddle/framework/tensor_test.cc +++ b/paddle/framework/tensor_test.cc @@ -198,8 +198,8 @@ TEST(Tensor, CopyFrom) { int arr[9] = {1, 2, 3, 4, 5, 6, 7, 8, 9}; memcpy(src_ptr, arr, 9 * sizeof(int)); - auto cpu_ctx = paddle::platform::CPUDeviceContext(); - dst_tensor.CopyFrom(src_tensor, cpu_ctx); + auto* cpu_ctx = new paddle::platform::CPUDeviceContext(); + dst_tensor.CopyFrom(src_tensor, *cpu_ctx); const int* dst_ptr = dst_tensor.data(); ASSERT_NE(src_ptr, dst_ptr); @@ -208,7 +208,7 @@ TEST(Tensor, CopyFrom) { } 
Tensor slice_tensor = src_tensor.Slice(1, 2); - dst_tensor.CopyFrom(slice_tensor, cpu_ctx); + dst_tensor.CopyFrom(slice_tensor, *cpu_ctx); const int* slice_ptr = slice_tensor.data(); dst_ptr = dst_tensor.data(); ASSERT_NE(dst_ptr, slice_ptr); @@ -216,4 +216,47 @@ TEST(Tensor, CopyFrom) { EXPECT_EQ(dst_ptr[i], slice_ptr[i]); } } +#ifndef PADDLE_ONLY_CPU + { + Tensor src_tensor; + Tensor gpu_tensor; + Tensor dst_tensor; + + int* src_ptr = src_tensor.mutable_data(make_ddim({3, 3}), CPUPlace()); + + int arr[9] = {1, 2, 3, 4, 5, 6, 7, 8, 9}; + memcpy(src_ptr, arr, 9 * sizeof(int)); + + // CPU Tensor to GPU Tensor + auto gpu_ctx = new paddle::platform::CUDADeviceContext(0); + gpu_tensor.CopyFrom(src_tensor, *gpu_ctx); + + // GPU Tensor to CPU Tensor + auto cpu_ctx = new paddle::platform::CPUDeviceContext(); + dst_tensor.CopyFrom(gpu_tensor, *cpu_ctx); + + // Compare Tensors + const int* dst_ptr = dst_tensor.data(); + ASSERT_NE(src_ptr, dst_ptr); + for (size_t i = 0; i < 9; ++i) { + EXPECT_EQ(src_ptr[i], dst_ptr[i]); + } + + Tensor slice_tensor = src_tensor.Slice(1, 2); + + // CPU Slice Tensor to GPU Tensor + gpu_tensor.CopyFrom(slice_tensor, *gpu_ctx); + + // GPU Tensor to CPU Tensor + dst_tensor.CopyFrom(gpu_tensor, *cpu_ctx); + + // Compare Slice Tensors + const int* slice_ptr = slice_tensor.data(); + dst_ptr = dst_tensor.data(); + ASSERT_NE(dst_ptr, slice_ptr); + for (size_t i = 0; i < 3; ++i) { + EXPECT_EQ(dst_ptr[i], slice_ptr[i]); + } + } +#endif } From 39af25595935876614c8ea938510b302ac8b4547 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=AD=A6=E6=AF=85?= Date: Thu, 27 Jul 2017 09:25:11 +0800 Subject: [PATCH 07/15] Fix new optimizer lr (#3074) * default learning rate, temperary fix * update --- go/pserver/client/c/test/test_train.py | 2 +- paddle/trainer/NewRemoteParameterUpdater.cpp | 6 +++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/go/pserver/client/c/test/test_train.py b/go/pserver/client/c/test/test_train.py index e9264592b4..17082cf892 100644 --- a/go/pserver/client/c/test/test_train.py +++ b/go/pserver/client/c/test/test_train.py @@ -38,7 +38,7 @@ def main(): parameters = paddle.parameters.create(cost) # create optimizer of new remote updater to pserver - optimizer = paddle.optimizer.Momentum(momentum=0) + optimizer = paddle.optimizer.Momentum(momentum=0, learning_rate=1e-3) print "etcd endoint: ", etcd_endpoint trainer = paddle.trainer.SGD(cost=cost, diff --git a/paddle/trainer/NewRemoteParameterUpdater.cpp b/paddle/trainer/NewRemoteParameterUpdater.cpp index a830ceba57..e1558e3fdf 100644 --- a/paddle/trainer/NewRemoteParameterUpdater.cpp +++ b/paddle/trainer/NewRemoteParameterUpdater.cpp @@ -76,7 +76,11 @@ void NewRemoteParameterUpdater::init( sgdConfigV2->set_decay(paramConfig.decay_rate()); optimizeConfigV2.set_lr_policy(paddle::OptimizerConfig::Const); auto constlr = optimizeConfigV2.mutable_const_lr(); - constlr->set_learning_rate(paramConfig.learning_rate()); + if (paramConfig.has_learning_rate()) { + constlr->set_learning_rate(paramConfig.learning_rate()); + } else { + constlr->set_learning_rate(trainerConfig_.learning_rate()); + } if (trainerConfig_.algorithm() == "sgd") { optimizeConfigV2.set_optimizer(paddle::OptimizerConfig::SGD); // FIXME: config all algorithms From f4e57b4b28089cc17b909c9725d3f0d3b98da8ea Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Thu, 27 Jul 2017 10:07:36 +0800 Subject: [PATCH 08/15] Fix bug in SequenceSoftmax Also remove operator bool in Error. The Error should be removed later because it is not necessary for Paddle. 
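A sketch of the resulting call-site change (illustrative only, not part of this patch; doSomething() is a placeholder): the implicit boolean conversion on Error is replaced by an explicit isOK() check, as the Evaluator.cpp and ActivationFunction.cpp hunks below show.

    paddle::Error err = doSomething();
    // before this patch: if (err) { throw std::runtime_error(err.msg()); }
    if (!err.isOK()) {
      throw std::runtime_error(err.msg());
    }
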
We are now using Enforce to handle error. --- paddle/api/Evaluator.cpp | 2 +- paddle/gserver/activations/ActivationFunction.cpp | 4 ++-- paddle/utils/Error.h | 13 ++++--------- paddle/utils/tests/test_Error.cpp | 8 ++++---- 4 files changed, 11 insertions(+), 16 deletions(-) diff --git a/paddle/api/Evaluator.cpp b/paddle/api/Evaluator.cpp index 681e3a3809..fcda6eaf03 100644 --- a/paddle/api/Evaluator.cpp +++ b/paddle/api/Evaluator.cpp @@ -37,7 +37,7 @@ std::vector Evaluator::getNames() const { double Evaluator::getValue(const std::string name) const { paddle::Error err; double v = m->rawPtr->getValue(name, &err); - if (err) { + if (!err.isOK()) { throw std::runtime_error(err.msg()); } return v; diff --git a/paddle/gserver/activations/ActivationFunction.cpp b/paddle/gserver/activations/ActivationFunction.cpp index a40530f413..81cc3c890b 100644 --- a/paddle/gserver/activations/ActivationFunction.cpp +++ b/paddle/gserver/activations/ActivationFunction.cpp @@ -207,8 +207,8 @@ Error __must_check backward(Argument& act) { argument_.value->setData(act.value->getData() + offset, 1UL, size); argument_.grad->setData(act.grad->getData() + offset, 1UL, size); - Error status = softmax_.backward(argument_); - if (!status) return status; + Error err = softmax_.backward(argument_); + if (!err.isOK()) return err; } return Error(); } diff --git a/paddle/utils/Error.h b/paddle/utils/Error.h index 27ddaab3f0..7cde983060 100644 --- a/paddle/utils/Error.h +++ b/paddle/utils/Error.h @@ -126,9 +126,11 @@ public: } /** - * @brief operator bool, return True if there is something error. + * @brief check this status by glog. + * @note It is a temp method used during cleaning Paddle code. It will be + * removed later. */ - operator bool() const { return !this->isOK(); } + void check() const { CHECK(this->isOK()) << msg(); } /** * @brief isOK return True if there is no error. @@ -136,13 +138,6 @@ public: */ bool isOK() const { return msg_ == nullptr; } - /** - * @brief check this status by glog. - * @note It is a temp method used during cleaning Paddle code. It will be - * removed later. - */ - void check() const { CHECK(this->isOK()) << msg(); } - private: std::shared_ptr msg_; }; diff --git a/paddle/utils/tests/test_Error.cpp b/paddle/utils/tests/test_Error.cpp index fdf326b17a..6f311fa6b8 100644 --- a/paddle/utils/tests/test_Error.cpp +++ b/paddle/utils/tests/test_Error.cpp @@ -18,17 +18,17 @@ limitations under the License. 
*/ TEST(Error, testAll) { paddle::Error error; - ASSERT_FALSE(error); + ASSERT_TRUE(error.isOK()); error = paddle::Error("I'm the error"); - ASSERT_TRUE(error); + ASSERT_FALSE(error.isOK()); ASSERT_STREQ("I'm the error", error.msg()); error = paddle::Error("error2"); - ASSERT_TRUE(error); + ASSERT_FALSE(error.isOK()); ASSERT_STREQ("error2", error.msg()); int i = 3; auto error3 = paddle::Error("error%d", i); - ASSERT_TRUE(error3); + ASSERT_FALSE(error3.isOK()); ASSERT_STREQ("error3", error3.msg()); } From e0463acfc3c0bc76de27060b32514fb67acd5805 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Thu, 27 Jul 2017 10:09:41 +0800 Subject: [PATCH 09/15] Rename PlainNet --> NetOp --- paddle/framework/CMakeLists.txt | 4 +--- paddle/framework/net.cc | 16 ++++---------- paddle/framework/net.h | 24 +++++---------------- paddle/framework/net_op_test.cc | 37 +++++++++++++------------------- paddle/framework/net_proto.proto | 15 ------------- paddle/framework/operator.h | 14 ++++++------ paddle/operators/fc_op.cc | 2 +- paddle/operators/type_alias.h | 2 +- paddle/pybind/pybind.cc | 18 ++++++++-------- 9 files changed, 44 insertions(+), 88 deletions(-) delete mode 100644 paddle/framework/net_proto.proto diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt index 433edbfda7..a29a81c994 100644 --- a/paddle/framework/CMakeLists.txt +++ b/paddle/framework/CMakeLists.txt @@ -29,7 +29,5 @@ py_proto_compile(framework_py_proto SRCS attr_type.proto op_proto.proto op_desc. add_custom_target(framework_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py) add_dependencies(framework_py_proto framework_py_proto_init) -proto_library(net_proto SRCS net_proto.proto DEPS op_proto) -# cc_library(net SRCS net.cc DEPS operator net_proto op_registry fc_op) -cc_library(net SRCS net.cc DEPS operator net_proto op_registry) +cc_library(net SRCS net.cc DEPS op_registry) cc_test(net_op_test SRCS net_op_test.cc DEPS net add_op mul_op sigmoid_op softmax_op fc_op) diff --git a/paddle/framework/net.cc b/paddle/framework/net.cc index bc23b63b35..2cd378c6b2 100644 --- a/paddle/framework/net.cc +++ b/paddle/framework/net.cc @@ -20,17 +20,7 @@ namespace paddle { namespace framework { -std::shared_ptr AddBackwardOp(std::shared_ptr ForwardOps) { - auto grad_ops = std::make_shared(); - for (auto& op : ForwardOps->ops_) { - auto op_grad = OpRegistry::CreateGradOp(op); - grad_ops->AddOp(op_grad); - } - grad_ops->CompleteAddOp(); - return grad_ops; -} - -void PlainNet::CompleteAddOp(bool calc) { +void NetOp::CompleteAddOp(bool calc) { add_op_done_ = true; if (!calc) return; std::unordered_set input_set; @@ -70,7 +60,7 @@ void PlainNet::CompleteAddOp(bool calc) { attrs_["temporary_index"] = tmp_index; } -std::string PlainNet::DebugString() const { +std::string NetOp::DebugString() const { std::ostringstream os; os << OperatorBase::DebugString() << std::endl; for (auto& op : ops_) { @@ -82,5 +72,7 @@ std::string PlainNet::DebugString() const { return os.str(); } +bool NetOp::IsNetOp() const { return true; } + } // namespace framework } // namespace paddle diff --git a/paddle/framework/net.h b/paddle/framework/net.h index 3264f1f565..089c135595 100644 --- a/paddle/framework/net.h +++ b/paddle/framework/net.h @@ -37,21 +37,7 @@ namespace framework { * This is the base class of network, all the networks should implement the APIs * it defines. 
*/ -class Net : public OperatorBase { - public: - virtual void AddOp(const std::shared_ptr& op) = 0; - virtual void CompleteAddOp(bool calc) = 0; -}; - -using NetPtr = std::shared_ptr; - -/** - * @brief a basic implementation of Net. - * - * PlainNet is a very simple Net, it create a list of operators, and run them - * sequentially following the order they added. - */ -class PlainNet : public Net { +class NetOp : public OperatorBase { public: /** * Infer all the operators' input and output variables' shapes, will be called @@ -80,15 +66,17 @@ class PlainNet : public Net { /** * @brief Add an operator by ptr */ - void AddOp(const std::shared_ptr& op) override { + void AddOp(const std::shared_ptr& op) { PADDLE_ENFORCE(!add_op_done_, "Cannot AddOp when this network is sealed"); ops_.push_back(op); } - void CompleteAddOp(bool calculate = true) override; + void CompleteAddOp(bool calculate = true); std::string DebugString() const override; + bool IsNetOp() const override; + std::vector> ops_; private: @@ -100,7 +88,5 @@ class PlainNet : public Net { } }; -std::shared_ptr AddBackwardOp(std::shared_ptr ForwardOps); - } // namespace framework } // namespace paddle diff --git a/paddle/framework/net_op_test.cc b/paddle/framework/net_op_test.cc index d924058624..8048311fe5 100644 --- a/paddle/framework/net_op_test.cc +++ b/paddle/framework/net_op_test.cc @@ -40,7 +40,7 @@ void AssertSameVectorWithoutOrder(const std::vector& expected, } TEST(OpKernel, all) { - auto net = std::make_shared(); + auto net = std::make_shared(); ASSERT_NE(net, nullptr); auto op1 = std::make_shared(); @@ -71,28 +71,21 @@ TEST(OpKernel, all) { ASSERT_EQ(2, run_cnt); ASSERT_THROW(net->AddOp(op2), paddle::platform::EnforceNotMet); } -TEST(AddBackwardOp, TestGradOp) { - auto net = std::make_shared(); - ASSERT_NE(net, nullptr); - net->AddOp(framework::OpRegistry::CreateOp("mul", {"X", "Y"}, {"Out"}, {})); - net->AddOp( - framework::OpRegistry::CreateOp("add_two", {"X", "Y"}, {"Out"}, {})); - net->AddOp(framework::OpRegistry::CreateOp("add_two", {"X", "Y"}, {""}, {})); - auto grad_ops = AddBackwardOp(net); - for (auto& op : grad_ops->ops_) { - op->DebugString(); - } -} -// TODO(zhihong): add fc grad without registering. -// TEST(AddBackwardOp, TestNoGradOp) { -// auto net = std::make_shared(); -// ASSERT_NE(net, nullptr); -// net->AddOp(framework::OpRegistry::CreateOp("fc", {"X", "W", "b"}, {"Y"}, -// {})); auto grad_ops = AddBackwardOp(net); for (auto& op : grad_ops->ops_) { -// op->DebugString(); -// } -// } +//! TODO(yuyang18): Refine Backward Op. +// TEST(AddBackwardOp, TestGradOp) { +// auto net = std::make_shared(); +// ASSERT_NE(net, nullptr); +// net->AddOp(framework::OpRegistry::CreateOp("mul", {"X", "Y"}, {"Out"}, {})); +// net->AddOp( +// framework::OpRegistry::CreateOp("add_two", {"X", "Y"}, {"Out"}, {})); +// net->AddOp(framework::OpRegistry::CreateOp("add_two", {"X", "Y"}, {""}, +// {})); +// auto grad_ops = AddBackwardOp(net); +// for (auto& op : grad_ops->ops_) { +// op->DebugString(); +// } +//} } // namespace framework } // namespace paddle diff --git a/paddle/framework/net_proto.proto b/paddle/framework/net_proto.proto deleted file mode 100644 index 0779f49fe2..0000000000 --- a/paddle/framework/net_proto.proto +++ /dev/null @@ -1,15 +0,0 @@ -syntax="proto2"; -package paddle.framework; - -import "op_proto.proto"; - -message NetDesc { - // network identification - optional string name = 1; - // operator contains in network - repeated OpProto operators = 2; - // network type to run with. 
e.g "plainNet", "DAG" - optional string net_type = 3; - // num worker always - optional int32 num_workers = 4; -} diff --git a/paddle/framework/operator.h b/paddle/framework/operator.h index f59314f828..65fddb6811 100644 --- a/paddle/framework/operator.h +++ b/paddle/framework/operator.h @@ -90,15 +90,17 @@ class OperatorBase { virtual void Run(const std::shared_ptr& scope, const platform::DeviceContext& dev_ctx) const = 0; - // Get a input with argument's name described in `op_proto` + virtual bool IsNetOp() const { return false; } + + //! Get a input with argument's name described in `op_proto` const std::string& Input(const std::string& name) const; - // Get a input which has multiple variables. - // TODO add a vector_view to prevent memory copy. + //! Get a input which has multiple variables. + //! TODO add a vector_view to prevent memory copy. std::vector Inputs(const std::string& name) const; - // Get a output with argument's name described in `op_proto` + //! Get a output with argument's name described in `op_proto` const std::string& Output(const std::string& name) const; - // Get an output which has multiple variables. - // TODO add a vector_view to prevent memory copy. + //! Get an output which has multiple variables. + //! TODO add a vector_view to prevent memory copy. std::vector Outputs(const std::string& name) const; public: diff --git a/paddle/operators/fc_op.cc b/paddle/operators/fc_op.cc index 40ff2f41dd..c4a9f5937f 100644 --- a/paddle/operators/fc_op.cc +++ b/paddle/operators/fc_op.cc @@ -17,7 +17,7 @@ namespace paddle { namespace operators { -class FullyConnectedOp : public PlainNet { +class FullyConnectedOp : public NetOp { public: void Init() override { AddOp(OpRegistry::CreateOp("mul", diff --git a/paddle/operators/type_alias.h b/paddle/operators/type_alias.h index 44ffefb299..b712e457ff 100644 --- a/paddle/operators/type_alias.h +++ b/paddle/operators/type_alias.h @@ -43,7 +43,7 @@ using OpProto = framework::OpProto; using OpAttrChecker = framework::OpAttrChecker; using CPUPlace = platform::CPUPlace; using GPUPlace = platform::GPUPlace; -using PlainNet = framework::PlainNet; +using NetOp = framework::NetOp; using OpRegistry = framework::OpRegistry; } // namespace operators } // namespace paddle diff --git a/paddle/pybind/pybind.cc b/paddle/pybind/pybind.cc index 0b152d03c0..ccefcd2511 100644 --- a/paddle/pybind/pybind.cc +++ b/paddle/pybind/pybind.cc @@ -146,22 +146,22 @@ All parameter, weight, gradient are variables in Paddle. 
}); ExposeOperator(operator_base); - using PlainNetPtr = std::shared_ptr; - py::class_ net(m, "Net"); + py::class_> net(m, "Net"); net.def_static("create", - []() -> std::shared_ptr { - auto retv = std::make_shared(); + []() -> std::shared_ptr { + auto retv = std::make_shared(); retv->type_ = "plain_net"; return retv; }) - .def("add_op", &pd::PlainNet::AddOp) + .def("add_op", &pd::NetOp::AddOp) .def("add_op", - [](PlainNetPtr& self, const PlainNetPtr& net) -> void { - self->AddOp(std::static_pointer_cast(net)); + [](pd::NetOp& self, const std::shared_ptr& net) -> void { + self.AddOp(std::static_pointer_cast(net)); }) - .def("complete_add_op", &pd::PlainNet::CompleteAddOp) - .def("complete_add_op", [](PlainNetPtr& self) { self->CompleteAddOp(); }); + .def("complete_add_op", &pd::NetOp::CompleteAddOp) + .def("complete_add_op", + [](std::shared_ptr& self) { self->CompleteAddOp(); }); ExposeOperator(net); m.def("unique_integer", UniqueIntegerGenerator); From 7c13292cff454fdf0f12d965268bffeb5cad5aed Mon Sep 17 00:00:00 2001 From: dangqingqing Date: Thu, 27 Jul 2017 12:53:30 +0800 Subject: [PATCH 10/15] Fix bug for multi-GPU inference. --- python/paddle/v2/inference.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/python/paddle/v2/inference.py b/python/paddle/v2/inference.py index 40134a3270..4dcc3ab57e 100644 --- a/python/paddle/v2/inference.py +++ b/python/paddle/v2/inference.py @@ -35,6 +35,13 @@ class Inference(object): name = param.getName() assert isinstance(val, api.Vector) val.copyFromNumpyArray(parameters.get(name).flatten()) + # the setValueUpdated function is called in randomize, zeroMem, + # load function in paddle/parameter/Parameter.cpp. But in the + # inference mode, the setValueUpdated is never called, it will + # cause the parameter will not be dispatched + # in MultiGradientMachine for multi-GPU. So setValueUpdated is + # called here, but it's better to call this function in one place. + param.setValueUpdated() self.__gradient_machine__ = gm self.__data_types__ = topo.data_type() From c10121e13c2309e81d1842c3ceca733b05f25e91 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=AD=A6=E6=AF=85?= Date: Thu, 27 Jul 2017 13:40:25 +0800 Subject: [PATCH 11/15] [Done] Sync master client between passes and fix recordio split (#2948) * fix recordio split and task passes * update for pre commit * update * update, still need to sync client wait for pass end. * able to sync passes for task dispatching * update to comment * update * fix yapf check * why local pre-commit fails? 
version is the same * fix race condition * update * fix race condition * this still have duplicate problem in unit test * update * update * update by comment * update --- .pre-commit-config.yaml | 12 ++-- go/master/c/client.go | 17 +++-- go/master/client.go | 70 ++++++++++-------- go/master/client_internal_test.go | 60 ++++++++-------- go/master/client_test.go | 83 +++++++++++++++------- go/master/service.go | 98 ++++++++++++++++---------- go/master/service_internal_test.go | 3 +- go/pserver/client/c/test/test_train.py | 18 +++-- python/paddle/v2/dataset/common.py | 44 ++++-------- python/paddle/v2/master/client.py | 1 - 10 files changed, 235 insertions(+), 171 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index efb4dcb2df..980a97a07c 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -22,9 +22,11 @@ hooks: - id: clang-formater - repo: https://github.com/PaddlePaddle/pre-commit-golang - sha: 16398aeccf263adaf53b2495eed0406347d76281 + sha: 8337620115c25ff8333f1b1a493bd031049bd7c0 hooks: - - id: go-fmt - types: [go] - - id: gometalinter - types: [go] + - id: go-fmt + types: + - go + - id: gometalinter + types: + - go diff --git a/go/master/c/client.go b/go/master/c/client.go index a2b18e4b47..b5759c30b1 100644 --- a/go/master/c/client.go +++ b/go/master/c/client.go @@ -18,7 +18,6 @@ package main #include #include #include - #define PADDLE_MASTER_OK 0 #define PADDLE_MASTER_ERROR -1 @@ -101,6 +100,12 @@ func paddle_release_master_client(client C.paddle_master_client) { remove(client) } +//export paddle_start_get_records +func paddle_start_get_records(client C.paddle_master_client, pass C.int) { + c := get(client) + c.StartGetRecords(int(pass)) +} + //export paddle_set_dataset func paddle_set_dataset(client C.paddle_master_client, path **C.char, size C.int) C.int { c := get(client) @@ -121,15 +126,19 @@ func paddle_set_dataset(client C.paddle_master_client, path **C.char, size C.int // paddle_next_record gets the nexts training record. // -// returns number of bytes of the records if success, -1 if failed. +// returns number of bytes of the records if success, -1 if failed, -2 if pass end. // //export paddle_next_record func paddle_next_record(client C.paddle_master_client, record **C.uchar) C.int { c := get(client) r, err := c.NextRecord() if err != nil { - // Error - // TODO: return the type of error? + // NOTE: use errors to indicate pass ends + if err.Error() == master.ErrAllTaskFailed.Error() || + err.Error() == master.ErrNoMoreAvailable.Error() || + err.Error() == master.ErrPassBefore.Error() { + return -2 + } *record = (*C.uchar)(nil) return -1 } diff --git a/go/master/client.go b/go/master/client.go index bbf3768d96..62801b9b7f 100644 --- a/go/master/client.go +++ b/go/master/client.go @@ -16,7 +16,6 @@ package master import ( "os" - "sync" "time" "github.com/PaddlePaddle/Paddle/go/connection" @@ -27,9 +26,9 @@ import ( // Client is the client of the master server. 
type Client struct { - conn *connection.Conn - ch chan record - initChOnce sync.Once + conn *connection.Conn + ch chan record + bufSize int } type record struct { @@ -46,11 +45,7 @@ func WithBuffer(bufSize int) func(*Client) error { if bufSize <= 0 { return nil } - - c.initChOnce.Do(func() { - c.ch = make(chan record, bufSize) - go c.getRecords() - }) + c.bufSize = bufSize return nil } } @@ -104,25 +99,41 @@ func NewClient(opts ...func(*Client) error) (*Client, error) { if err != nil { return nil, err } - } - + c.ch = make(chan record, c.bufSize) + // FIXME: connection is created asyncrosly in monitorMaster go routine, + // ensure the connection is ready for use before calling c.addClient. + time.Sleep(time.Second) return c, nil } -func (c *Client) getRecords() { +// StartGetRecords must be called at beginning of each pass +func (c *Client) StartGetRecords(passID int) { + go c.getRecords(passID) +} + +func (c *Client) getRecords(passID int) { for { - t, err := c.getTask() + t, err := c.getTask(passID) if err != nil { - log.Errorf("Get task failed, sleep 3 seconds and continue, %s", err) - time.Sleep(3 * time.Second) - continue + if err.Error() == ErrPassBefore.Error() || + err.Error() == ErrNoMoreAvailable.Error() || + err.Error() == ErrAllTaskFailed.Error() { + c.ch <- record{nil, err} + break + } + if err.Error() == ErrPassAfter.Error() { + // wait util last pass finishes + time.Sleep(time.Second * 3) + continue + } + log.Errorf("getTask error: %s", err) } for _, chunk := range t.Chunks { - f, err := os.Open(chunk.Path) - if err != nil { - log.Errorln(err) + f, e := os.Open(chunk.Path) + if e != nil { + log.Errorln(e) continue } @@ -178,18 +189,21 @@ func (c *Client) monitorMaster(addrCh <-chan string) { } } -// SetDataset set dataset for the master server to dispatch. +// SetDataset sets dataset to dispatch for the master server. +// +// SetDataset can be call multiple times at one pass. But only the first call +// will be honored. // -// SetDataset can be call multiple times from different nodes. But -// only the first call will be honored. +// After all tasks are done, another call of SetDataset will start another pass. func (c *Client) SetDataset(globPaths []string) error { - return c.conn.Call("Service.SetDataset", globPaths, nil) + err := c.conn.Call("Service.SetDataset", globPaths, nil) + return err } // getTask gets a new task from the master server. -func (c *Client) getTask() (Task, error) { +func (c *Client) getTask(passID int) (Task, error) { var t Task - err := c.conn.Call("Service.GetTask", 0, &t) + err := c.conn.Call("Service.GetTask", passID, &t) return t, err } @@ -208,12 +222,6 @@ func (c *Client) taskFailed(meta TaskMeta) error { // NextRecord will block until the next record is available. It is // thread-safe. func (c *Client) NextRecord() ([]byte, error) { - c.initChOnce.Do(func() { - // initialize with in case WithBuffer is not used. 
- c.ch = make(chan record, 0) - go c.getRecords() - }) - r := <-c.ch return r.r, r.err } diff --git a/go/master/client_internal_test.go b/go/master/client_internal_test.go index ee305e2c80..d5f3d79464 100644 --- a/go/master/client_internal_test.go +++ b/go/master/client_internal_test.go @@ -54,22 +54,22 @@ func TestGetFinishTask(t *testing.T) { panic(err) } go func(l net.Listener) { - s, err := NewService(&InMemStore{}, chunkPerTask, time.Second, 1) - if err != nil { - panic(err) + s, sErr := NewService(&InMemStore{}, chunkPerTask, time.Second, 1) + if sErr != nil { + panic(sErr) } server := rpc.NewServer() - err = server.Register(s) - if err != nil { - panic(err) + sErr = server.Register(s) + if sErr != nil { + panic(sErr) } mux := http.NewServeMux() mux.Handle(rpc.DefaultRPCPath, server) - err = http.Serve(l, mux) - if err != nil { - panic(err) + sErr = http.Serve(l, mux) + if sErr != nil { + panic(sErr) } }(l) @@ -103,6 +103,7 @@ func TestGetFinishTask(t *testing.T) { ch := make(chan string, 1) ch <- addr go c.monitorMaster(ch) + err = c.SetDataset([]string{path}) if err != nil { panic(err) @@ -111,44 +112,47 @@ func TestGetFinishTask(t *testing.T) { checkOnePass := func(i int) { var tasks []Task for idx := 0; idx < totalTask; idx++ { - task, err := c.getTask() - if err != nil { - t.Fatalf("Error: %v, pass: %d\n", err, i) + task, cErr := c.getTask(i) + if cErr != nil && cErr.Error() != ErrNoMoreAvailable.Error() && cErr.Error() != ErrPassAfter.Error() { + t.Fatalf("error: %v, pass: %d\n", cErr, i) } tasks = append(tasks, task) } - _, err = c.getTask() - if err == nil { + // getting task before task finishes should return error + _, cErr := c.getTask(i) + if cErr == nil { t.Fatalf("Should get error, pass: %d\n", i) } - err = c.taskFinished(tasks[0].Meta.ID) - if err != nil { - t.Fatalf("Error: %v, pass: %d\n", err, i) + cErr = c.taskFinished(tasks[0].Meta.ID) + if cErr != nil { + t.Fatalf("Error: %v, pass: %d\n", cErr, i) } - - err = c.taskFailed(tasks[0].Meta) - if err != nil { - t.Fatalf("Error: %v, pass: %d\n", err, i) + // call taskFailed once won't put the task to failed queue, just ensure + // the call + cErr = c.taskFailed(tasks[0].Meta) + if cErr != nil { + t.Fatalf("Error: %v, pass: %d\n", cErr, i) } tasks = tasks[1:] - task, err := c.getTask() - if err != nil { - t.Fatal(err) + _, cErr = c.getTask(i) + if cErr != nil && cErr.Error() != ErrNoMoreAvailable.Error() && cErr.Error() != ErrPassAfter.Error() { + t.Fatalf("Should be ErrNoMoreAvailable or ErrPassAfter: %s", cErr) } - tasks = append(tasks, task) for _, task := range tasks { - err = c.taskFinished(task.Meta.ID) - if err != nil { - t.Fatalf("Error: %v, pass: %d\n", err, i) + cErr = c.taskFinished(task.Meta.ID) + if cErr != nil { + t.Fatal(cErr) } } } for i := 0; i < 10; i++ { + // init pass data + c.StartGetRecords(i) checkOnePass(i) } } diff --git a/go/master/client_test.go b/go/master/client_test.go index a3a434ae7e..79b9cc844d 100644 --- a/go/master/client_test.go +++ b/go/master/client_test.go @@ -20,8 +20,10 @@ import ( "net/http" "net/rpc" "os" + "runtime" "strconv" "strings" + "sync" "testing" "time" @@ -29,6 +31,18 @@ import ( "github.com/PaddlePaddle/recordio" ) +// tool function for testing output goroutine ids +func goid() int { + var buf [64]byte + n := runtime.Stack(buf[:], false) + idField := strings.Fields(strings.TrimPrefix(string(buf[:n]), "goroutine "))[0] + id, err := strconv.Atoi(idField) + if err != nil { + panic(fmt.Sprintf("cannot get goroutine id: %v", err)) + } + return id +} + func 
TestNextRecord(t *testing.T) { const ( path = "/tmp/master_client_TestFull" @@ -45,7 +59,7 @@ func TestNextRecord(t *testing.T) { panic(err) } go func(l net.Listener) { - s, err := master.NewService(&master.InMemStore{}, 10, time.Second, 1) + s, err := master.NewService(&master.InMemStore{}, 1, time.Second*60, 1) if err != nil { panic(err) } @@ -69,7 +83,7 @@ func TestNextRecord(t *testing.T) { panic(err) } - w := recordio.NewWriter(f, -1, -1) + w := recordio.NewWriter(f, 1, -1) for i := 0; i < total; i++ { _, err = w.Write([]byte{byte(i)}) if err != nil { @@ -87,32 +101,49 @@ func TestNextRecord(t *testing.T) { panic(err) } - c, err := master.NewClient(master.WithAddr(fmt.Sprintf(":%d", p)), master.WithBuffer(10)) - if err != nil { - panic(err) - } - - err = c.SetDataset([]string{path}) - if err != nil { - panic(err) - } - - for pass := 0; pass < 50; pass++ { - received := make(map[byte]bool) - for i := 0; i < total; i++ { - r, err := c.NextRecord() - if err != nil { - t.Fatal(pass, i, "Read error:", err) + // start several client to test task fetching + var wg sync.WaitGroup + for i := 0; i < 4; i++ { + wg.Add(1) + // test for multiple concurrent clients + go func() { + defer wg.Done() + // each go-routine needs a single client connection instance + c, e := master.NewClient(master.WithAddr(fmt.Sprintf(":%d", p)), master.WithBuffer(1)) + if e != nil { + t.Fatal(e) } - - if len(r) != 1 { - t.Fatal(pass, i, "Length should be 1.", r) + e = c.SetDataset([]string{path}) + if e != nil { + panic(e) } - - if received[r[0]] { - t.Fatal(pass, i, "Received duplicate.", received, r) + // test for n passes + for pass := 0; pass < 10; pass++ { + c.StartGetRecords(pass) + + received := make(map[byte]bool) + taskid := 0 + for { + r, e := c.NextRecord() + if e != nil { + // ErrorPassAfter will wait, else break for next pass + if e.Error() == master.ErrPassBefore.Error() || + e.Error() == master.ErrNoMoreAvailable.Error() { + break + } + t.Fatal(pass, taskid, "Read error:", e) + } + if len(r) != 1 { + t.Fatal(pass, taskid, "Length should be 1.", r) + } + if received[r[0]] { + t.Fatal(pass, taskid, "Received duplicate.", received, r) + } + taskid++ + received[r[0]] = true + } } - received[r[0]] = true - } + }() } + wg.Wait() } diff --git a/go/master/service.go b/go/master/service.go index d1ec8939e1..1f2112ecfb 100644 --- a/go/master/service.go +++ b/go/master/service.go @@ -19,6 +19,7 @@ import ( "compress/gzip" "encoding/gob" "errors" + "math/rand" "os" "path/filepath" "sync" @@ -33,6 +34,18 @@ const ( dialTimeout = 5 * time.Second ) +// ErrAllTaskFailed occur when tasks are in done or failed state. +var ErrAllTaskFailed = errors.New("all task finished") + +// ErrNoMoreAvailable occur when no task in todo and yet not all done or fail. +var ErrNoMoreAvailable = errors.New("no more available task") + +// ErrPassBefore client side pass number does not match with master counter. +var ErrPassBefore = errors.New("pass number smaller than master") + +// ErrPassAfter client side pass number does not match with master counter. +var ErrPassAfter = errors.New("pass number larger than master") + // Store is the interface for save and load the master state. 
type Store interface { Save([]byte) error @@ -75,17 +88,26 @@ type Service struct { chunksPerTask int timeoutDur time.Duration failureMax int - ready chan struct{} store Store - mu sync.Mutex - initDone bool - taskQueues taskQueues + ready chan struct{} + initDone bool + + mu sync.Mutex + taskQueues taskQueues + currPass int + jobTasks []taskEntry + savingTrainer string } func partition(chunks []Chunk, chunksPerTask int) []taskEntry { - id := 0 + // generate uniq id across job using nanosecond + randint + counter + // FIXME(typhoonzero): this is a workaround, use uuid + randStart := rand.Int() + counter := 0 + timestamp := time.Now().Nanosecond() + id := timestamp + randStart + counter if chunksPerTask <= 0 { chunksPerTask = 1 } @@ -95,7 +117,8 @@ func partition(chunks []Chunk, chunksPerTask int) []taskEntry { for i, c := range chunks { if i%chunksPerTask == 0 && len(cur.Task.Chunks) > 0 { cur.Task.Meta.ID = id - id++ + counter++ + id = timestamp + randStart + counter result = append(result, cur) cur.Task.Chunks = nil } @@ -266,19 +289,21 @@ func (s *Service) SetDataset(globPaths []string, _ *int) error { return err } - s.taskQueues.Todo = partition(chunks, s.chunksPerTask) + s.jobTasks = partition(chunks, s.chunksPerTask) + s.taskQueues.Todo = s.jobTasks err = s.snapshot() if err != nil { log.Errorln(err) return err } - close(s.ready) s.initDone = true return nil } +// processFailedTask retry s.failureMax times for failed task. +// return true if all task are done or failed. func (s *Service) processFailedTask(t taskEntry, epoch int) { if t.Task.Meta.Epoch != epoch { // new epoch, task launched after the @@ -302,8 +327,9 @@ func (s *Service) processFailedTask(t taskEntry, epoch int) { return } - log.Warningf("Task %v failed %d times, discard.", t.Task, t.NumFailure) + log.Warningf("Task %v failed %d times, re-dispatch.", t.Task, t.NumFailure) s.taskQueues.Todo = append(s.taskQueues.Todo, t) + return } func (s *Service) checkTimeoutFunc(taskID int, epoch int) func() { @@ -331,37 +357,30 @@ func (s *Service) logFields() log.Fields { } // GetTask gets a new task from the service. -func (s *Service) GetTask(_ int, task *Task) error { +// passID is the client side pass count +func (s *Service) GetTask(passID int, task *Task) error { select { case <-s.ready: } s.mu.Lock() defer s.mu.Unlock() + if passID < s.currPass { + return ErrPassBefore + } + if passID > s.currPass { + // Client may get run to pass after master when one client faster than the + // other + return ErrPassAfter + } if len(s.taskQueues.Todo) == 0 { - if len(s.taskQueues.Done) == 0 { - if len(s.taskQueues.Pending) == 0 { - err := errors.New("all task failed") - log.WithFields(s.logFields()).Warningln("All tasks failed.") - return err - } - - // TODO(helin): client need to retry in this - // error case. Gotcha: RPC client can't - // compare returned error with predefined - // errors like io.EOF, because the error - // instance deserialized from RPC is a - // different instance than the error defined - // in package. So we need to figure out a way - // for client to check this error correctly. 
- err := errors.New("no more available task") - log.WithFields(s.logFields()).Warningln("No more available task.") - return err + if len(s.taskQueues.Done) == 0 && len(s.taskQueues.Pending) == 0 { + log.WithFields(s.logFields()).Warningln("All tasks failed, may start next pass") + return ErrAllTaskFailed } - s.taskQueues.Todo = s.taskQueues.Done - s.taskQueues.Done = nil - log.WithFields(s.logFields()).Infoln("No more todo task, but trainer is requesting task to do. Move all done task to todo.") + log.WithFields(s.logFields()).Warningln("No more available task.") + return ErrNoMoreAvailable } t := s.taskQueues.Todo[0] @@ -381,7 +400,7 @@ func (s *Service) GetTask(_ int, task *Task) error { } // TaskFinished tell the service that a task is finished. -func (s *Service) TaskFinished(taskID int, _ *int) error { +func (s *Service) TaskFinished(taskID int, dummy *int) error { select { case <-s.ready: } @@ -401,11 +420,14 @@ func (s *Service) TaskFinished(taskID int, _ *int) error { delete(s.taskQueues.Pending, taskID) log.WithFields(s.logFields()).Infof("Task #%d finished.", taskID) - - if len(s.taskQueues.Pending) == 0 && len(s.taskQueues.Todo) == 0 { - log.WithFields(s.logFields()).Infoln("No more todo and pending task, start a new pass.") - s.taskQueues.Todo = append(s.taskQueues.Todo, s.taskQueues.Done...) - s.taskQueues.Done = nil + if len(s.taskQueues.Todo) == 0 && len(s.taskQueues.Pending) == 0 { + // increase master side pass count if all tasks finished + s.currPass++ + s.taskQueues.Todo = s.jobTasks + s.taskQueues.Done = []taskEntry{} + // TODO(typhoonzero): deal with failed tasks + s.taskQueues.Failed = []taskEntry{} + log.WithFields(s.logFields()).Warningf("all task finished, add new pass data, newpass: %d.", s.currPass) } err := s.snapshot() @@ -416,7 +438,7 @@ func (s *Service) TaskFinished(taskID int, _ *int) error { } // TaskFailed tells the service that a task is failed. 
-func (s *Service) TaskFailed(meta TaskMeta, _ *int) error { +func (s *Service) TaskFailed(meta TaskMeta, dummy *int) error { select { case <-s.ready: } diff --git a/go/master/service_internal_test.go b/go/master/service_internal_test.go index 69a882fc33..bd1a939a55 100644 --- a/go/master/service_internal_test.go +++ b/go/master/service_internal_test.go @@ -44,7 +44,8 @@ func TestPartionIndex(t *testing.T) { cs := make([]Chunk, 100) ts := partition(cs, 20) for i := range ts { - if ts[i].Task.Meta.ID != i { + // test auto increament ids + if i > 0 && ts[i].Task.Meta.ID != ts[i-1].Task.Meta.ID+1 { t.Error(ts[i], i) } } diff --git a/go/pserver/client/c/test/test_train.py b/go/pserver/client/c/test/test_train.py index 17082cf892..85cb399590 100644 --- a/go/pserver/client/c/test/test_train.py +++ b/go/pserver/client/c/test/test_train.py @@ -6,16 +6,19 @@ import cPickle as pickle etcd_ip = os.getenv("MASTER_IP", "127.0.0.1") etcd_endpoint = "http://" + etcd_ip + ":2379" +print "connecting to master, etcd endpoints: ", etcd_endpoint +master_client = master.client(etcd_endpoint, 5, 64) def cloud_reader(): - print "connecting to master, etcd endpoints: ", etcd_endpoint - master_client = master.client(etcd_endpoint, 5, 64) + global master_client master_client.set_dataset( - ["/pfs/dlnel/public/dataset/uci_housing/uci_housing-*-of-*"]) + ["/pfs/dlnel/public/dataset/uci_housing/uci_housing-*"], passes=30) while 1: r, e = master_client.next_record() if not r: + if e != -2: # other errors + print "get record error:", e break yield pickle.loads(r) @@ -27,10 +30,12 @@ def main(): # network config x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(13)) y_predict = paddle.layer.fc(input=x, - param_attr=paddle.attr.Param(name='w'), + param_attr=paddle.attr.Param( + name='w', learning_rate=1e-3), size=1, act=paddle.activation.Linear(), - bias_attr=paddle.attr.Param(name='b')) + bias_attr=paddle.attr.Param( + name='b', learning_rate=1e-3)) y = paddle.layer.data(name='y', type=paddle.data_type.dense_vector(1)) cost = paddle.layer.mse_cost(input=y_predict, label=y) @@ -40,7 +45,6 @@ def main(): # create optimizer of new remote updater to pserver optimizer = paddle.optimizer.Momentum(momentum=0, learning_rate=1e-3) - print "etcd endoint: ", etcd_endpoint trainer = paddle.trainer.SGD(cost=cost, parameters=parameters, update_equation=optimizer, @@ -51,6 +55,8 @@ def main(): # event_handler to print training and testing info def event_handler(event): if isinstance(event, paddle.event.EndIteration): + # FIXME: for cloud data reader, pass number is managed by master + # should print the server side pass number if event.batch_id % 100 == 0: print "Pass %d, Batch %d, Cost %f" % ( event.pass_id, event.batch_id, event.cost) diff --git a/python/paddle/v2/dataset/common.py b/python/paddle/v2/dataset/common.py index 645f3cc0dc..111496618d 100644 --- a/python/paddle/v2/dataset/common.py +++ b/python/paddle/v2/dataset/common.py @@ -166,55 +166,37 @@ def cluster_files_reader(files_pattern, return reader -def convert(output_path, - reader, - num_shards, - name_prefix, - max_lines_to_shuffle=1000): +def convert(output_path, reader, line_count, name_prefix): import recordio """ Convert data from reader to recordio format files. :param output_path: directory in which output files will be saved. :param reader: a data reader, from which the convert program will read data instances. - :param num_shards: the number of shards that the dataset will be partitioned into. 
:param name_prefix: the name prefix of generated files. :param max_lines_to_shuffle: the max lines numbers to shuffle before writing. """ - assert num_shards >= 1 - assert max_lines_to_shuffle >= 1 - - def open_writers(): - w = [] - for i in range(0, num_shards): - n = "%s/%s-%05d-of-%05d" % (output_path, name_prefix, i, - num_shards - 1) - w.append(recordio.writer(n)) - - return w - - def close_writers(w): - for i in range(0, num_shards): - w[i].close() + assert line_count >= 1 + indx_f = 0 - def write_data(w, lines): + def write_data(indx_f, lines): random.shuffle(lines) - for i, d in enumerate(lines): + filename = "%s/%s-%05d" % (output_path, name_prefix, indx_f) + writer = recordio.writer(filename) + for l in lines: # FIXME(Yancey1989): # dumps with protocol: pickle.HIGHEST_PROTOCOL - o = pickle.dumps(d) - w[i % num_shards].write(o) + writer.write(cPickle.dumps(l)) + writer.close() - w = open_writers() lines = [] - for i, d in enumerate(reader()): lines.append(d) - if i % max_lines_to_shuffle == 0 and i >= max_lines_to_shuffle: - write_data(w, lines) + if i % line_count == 0 and i >= line_count: + write_data(indx_f, lines) lines = [] + indx_f += 1 continue - write_data(w, lines) - close_writers(w) + write_data(indx_f, lines) diff --git a/python/paddle/v2/master/client.py b/python/paddle/v2/master/client.py index 3ac62d116b..b658a81630 100644 --- a/python/paddle/v2/master/client.py +++ b/python/paddle/v2/master/client.py @@ -49,7 +49,6 @@ class client(object): def set_dataset(self, paths): holder_type = ctypes.c_char_p * len(paths) holder = holder_type() - print paths for idx, path in enumerate(paths): c_ptr = ctypes.c_char_p(path) holder[idx] = c_ptr From 8b9e678ddfa9b001a824850cecc6ca248b34c71a Mon Sep 17 00:00:00 2001 From: caoying03 Date: Wed, 26 Jul 2017 17:51:50 +0800 Subject: [PATCH 12/15] fix dropout and clipping setttings in layer helpers. --- python/paddle/trainer_config_helpers/attrs.py | 2 +- .../paddle/trainer_config_helpers/layers.py | 31 +++++++------------ 2 files changed, 12 insertions(+), 21 deletions(-) diff --git a/python/paddle/trainer_config_helpers/attrs.py b/python/paddle/trainer_config_helpers/attrs.py index 9b9f979bb6..ecba871910 100644 --- a/python/paddle/trainer_config_helpers/attrs.py +++ b/python/paddle/trainer_config_helpers/attrs.py @@ -272,7 +272,7 @@ class ExtraLayerAttribute(object): for key in self.attr: if not hasattr(self, 'can_%s' % key) or \ not getattr(self, 'can_%s' % key): - raise NotImplementedError("Layer %s cannot support %s" % + raise NotImplementedError("Layer %s does not support %s" % (layer_name, key)) @staticmethod diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py index 21eba71527..14f072fc55 100755 --- a/python/paddle/trainer_config_helpers/layers.py +++ b/python/paddle/trainer_config_helpers/layers.py @@ -865,7 +865,7 @@ def data_layer(name, size, height=None, width=None, layer_attr=None): @wrap_name_default("embedding") @wrap_param_attr_default() -@layer_support(ERROR_CLIPPING) +@layer_support(ERROR_CLIPPING, DROPOUT) def embedding_layer(input, size, name=None, param_attr=None, layer_attr=None): """ Define a embedding Layer. 
@@ -1320,7 +1320,7 @@ def pooling_layer(input, @wrap_act_default(param_names=['gate_act'], act=SigmoidActivation()) @wrap_act_default(param_names=["act", 'state_act'], act=TanhActivation()) @wrap_name_default("lstmemory") -@layer_support(DROPOUT) +@layer_support() def lstmemory(input, name=None, size=None, @@ -1429,7 +1429,7 @@ def lstmemory(input, @wrap_act_default(param_names=['gate_act'], act=SigmoidActivation()) @wrap_act_default(param_names=["act"], act=TanhActivation()) @wrap_name_default("gru") -@layer_support(DROPOUT) +@layer_support() def grumemory(input, size=None, name=None, @@ -1793,7 +1793,7 @@ def repeat_layer(input, @wrap_name_default("seqreshape") @wrap_act_default(act=IdentityActivation()) @wrap_bias_attr_default(has_bias=False) -@layer_support() +@layer_support(ERROR_CLIPPING, DROPOUT) def seq_reshape_layer(input, reshape_size, act=None, @@ -2703,7 +2703,7 @@ def img_cmrnorm_layer(input, default_factory=lambda _: ParamAttr(initial_mean=1.0, initial_std=0.)) @wrap_act_default(act=ReluActivation()) @wrap_name_default("batch_norm") -@layer_support(DROPOUT) +@layer_support(DROPOUT, ERROR_CLIPPING) def batch_norm_layer(input, act=None, name=None, @@ -2783,15 +2783,6 @@ def batch_norm_layer(input, :return: LayerOutput object. :rtype: LayerOutput """ - if not isinstance(act, ReluActivation): - logger.log(logging.WARN, - "%s is not recommend for batch normalization's activation, " - "maybe the relu is better" % act.name) - - if not isinstance(input.activation, LinearActivation): - logger.log(logging.WARN, - "The activation should be inside batch normalization, the " - "previous layer's activation may be Linear") if num_channels is None: if input.num_filters is not None: @@ -2861,7 +2852,7 @@ def sum_to_one_norm_layer(input, name=None, layer_attr=None): @wrap_name_default("addto") @wrap_act_default(act=LinearActivation()) @wrap_bias_attr_default(has_bias=False) -@layer_support(DROPOUT) +@layer_support(DROPOUT, ERROR_CLIPPING) def addto_layer(input, act=None, name=None, bias_attr=None, layer_attr=None): """ AddtoLayer. @@ -2940,7 +2931,7 @@ def addto_layer(input, act=None, name=None, bias_attr=None, layer_attr=None): @wrap_act_default(act=IdentityActivation()) @wrap_name_default("concat") -@layer_support() +@layer_support(DROPOUT, ERROR_CLIPPING) def concat_layer(input, act=None, name=None, layer_attr=None, bias_attr=None): """ Concat all input vector into one huge vector. @@ -3024,7 +3015,7 @@ def concat_layer(input, act=None, name=None, layer_attr=None, bias_attr=None): @wrap_name_default("seqconcat") @wrap_act_default(act=IdentityActivation()) @wrap_bias_attr_default(has_bias=False) -@layer_support() +@layer_support(DROPOUT, ERROR_CLIPPING) def seq_concat_layer(a, b, act=None, name=None, layer_attr=None, bias_attr=None): """ @@ -3177,7 +3168,7 @@ def memory(name, @wrap_act_default(param_names=['state_act'], act=TanhActivation()) @wrap_act_default(act=TanhActivation()) @wrap_name_default('lstm_step') -@layer_support(ERROR_CLIPPING, DROPOUT) +@layer_support() def lstm_step_layer(input, state, size=None, @@ -4480,7 +4471,7 @@ def tensor_layer(a, @wrap_param_attr_default() @wrap_bias_attr_default() @wrap_act_default() -@layer_support() +@layer_support(DROPOUT, ERROR_CLIPPING) def selective_fc_layer(input, size, select=None, @@ -5974,7 +5965,7 @@ def crop_layer(input, offset, axis=2, shape=None, name=None, layer_attr=None): """ The crop layer crops images by offset and shape. User can set crop shape by args 'shape' explicitly or by reference input layer. - + The example usage is: .. 
code-block:: python From 5f70e14a5c27f69a2bfcf44d0b1ea28c2160f283 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Fri, 28 Jul 2017 10:27:46 +0800 Subject: [PATCH 13/15] Memory error --- paddle/framework/grad_op_builder.cc | 2 +- paddle/operators/CMakeLists.txt | 1 + paddle/operators/fc_op_test.cc | 26 ++++++++++++++++++++++++++ 3 files changed, 28 insertions(+), 1 deletion(-) create mode 100644 paddle/operators/fc_op_test.cc diff --git a/paddle/framework/grad_op_builder.cc b/paddle/framework/grad_op_builder.cc index 6235be75f2..6d9d174eea 100644 --- a/paddle/framework/grad_op_builder.cc +++ b/paddle/framework/grad_op_builder.cc @@ -109,7 +109,7 @@ void GradOpBuilder::CompleteGradOp(OperatorBase* grad_op) const { } grad_op->attrs_["input_format"] = in_format; grad_op->attrs_["output_format"] = out_format; - grad_op->in_out_idxs_.reset(grad_varmap); + grad_op->in_out_idxs_.resect(grad_varmap); } } // namespace framework diff --git a/paddle/operators/CMakeLists.txt b/paddle/operators/CMakeLists.txt index 0a14dc2114..6d4f804613 100644 --- a/paddle/operators/CMakeLists.txt +++ b/paddle/operators/CMakeLists.txt @@ -54,3 +54,4 @@ op_library(fc_op SRCS fc_op.cc DEPS mul_op rowwise_add_op sigmoid_op softmax_op net) op_library(sgd_op SRCS sgd_op.cc sgd_op.cu) +cc_test(fc_op_test SRCS fc_op_test.cc DEPS fc_op) diff --git a/paddle/operators/fc_op_test.cc b/paddle/operators/fc_op_test.cc new file mode 100644 index 0000000000..796b149afe --- /dev/null +++ b/paddle/operators/fc_op_test.cc @@ -0,0 +1,26 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + +#include +#include "paddle/framework/op_registry.h" +namespace f = paddle::framework; + +USE_OP_WITHOUT_KERNEL(fc); + +TEST(FC, create) { + for (size_t i = 0; i < 1000000; ++i) { + auto tmp = f::OpRegistry::CreateOp("fc", {"X", "W", "B"}, {"O"}, {}); + ASSERT_NE(tmp, nullptr); + } +} \ No newline at end of file From 9b9449fb7474a1577c1de9583299cd17c80cd662 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Fri, 28 Jul 2017 10:28:43 +0800 Subject: [PATCH 14/15] Typo --- paddle/framework/grad_op_builder.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/framework/grad_op_builder.cc b/paddle/framework/grad_op_builder.cc index 6d9d174eea..6235be75f2 100644 --- a/paddle/framework/grad_op_builder.cc +++ b/paddle/framework/grad_op_builder.cc @@ -109,7 +109,7 @@ void GradOpBuilder::CompleteGradOp(OperatorBase* grad_op) const { } grad_op->attrs_["input_format"] = in_format; grad_op->attrs_["output_format"] = out_format; - grad_op->in_out_idxs_.resect(grad_varmap); + grad_op->in_out_idxs_.reset(grad_varmap); } } // namespace framework From 8e7c3253f8baaae0b1fdf7e606a4b49f324bd42e Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Fri, 28 Jul 2017 12:41:17 +0800 Subject: [PATCH 15/15] Add Some Checker in Input/Output --- paddle/framework/operator.cc | 20 +++++++++++++------- paddle/operators/CMakeLists.txt | 1 - paddle/operators/fc_op_test.cc | 26 -------------------------- 3 files changed, 13 insertions(+), 34 deletions(-) delete mode 100644 paddle/operators/fc_op_test.cc diff --git a/paddle/framework/operator.cc b/paddle/framework/operator.cc index 1e57e9a20f..3a1ffc0215 100644 --- a/paddle/framework/operator.cc +++ b/paddle/framework/operator.cc @@ -34,22 +34,26 @@ KernelContext::GetEigenDevice() const { #endif const std::string& OperatorBase::Input(const std::string& name) const { + PADDLE_ENFORCE(in_out_idxs_ != nullptr, + "Input Output Indices could not be nullptr"); auto it = in_out_idxs_->find(name); PADDLE_ENFORCE(it != in_out_idxs_->end(), "no key [%s] in in_out_idxs_", name); - if (attrs_.count("input_format") == 0) { - return inputs_[it->second]; + return inputs_.at((size_t)it->second); } else { const auto& input_format = GetAttr>("input_format"); int idx = input_format[it->second]; - return inputs_.at(idx); + return inputs_.at((size_t)idx); } } std::vector OperatorBase::Inputs(const std::string& name) const { + PADDLE_ENFORCE(in_out_idxs_ != nullptr, "IO Idx could not be nullptr"); auto input_format = GetAttr>("input_format"); auto offset = in_out_idxs_->at(name); + PADDLE_ENFORCE(input_format.at((size_t)offset + 1) <= inputs_.size(), + "Input Out Of Range"); return std::vector{ inputs_.begin() + input_format.at(offset), @@ -57,23 +61,25 @@ std::vector OperatorBase::Inputs(const std::string& name) const { } const std::string& OperatorBase::Output(const std::string& name) const { + PADDLE_ENFORCE(in_out_idxs_ != nullptr, "InOut Indice could not be nullptr"); auto it = in_out_idxs_->find(name); PADDLE_ENFORCE(it != in_out_idxs_->end(), "no key [%s] in in_out_idxs_", name); - if (attrs_.count("output_format") == 0) { - return outputs_[it->second]; + return outputs_.at((size_t)it->second); } else { const auto& output_format = GetAttr>("output_format"); int idx = output_format[it->second]; - return outputs_.at(idx); + return outputs_.at((size_t)idx); } } std::vector OperatorBase::Outputs(const std::string& name) const { + PADDLE_ENFORCE(in_out_idxs_ != nullptr, "InOut Indice could not be nullptr"); auto output_format = GetAttr>("output_format"); auto offset = 
in_out_idxs_->at(name); - + PADDLE_ENFORCE(output_format.at((size_t)offset + 1) <= outputs_.size(), + "Output Out of Range"); return std::vector{ outputs_.begin() + output_format.at(offset), outputs_.begin() + output_format.at(offset + 1)}; diff --git a/paddle/operators/CMakeLists.txt b/paddle/operators/CMakeLists.txt index 6d4f804613..0a14dc2114 100644 --- a/paddle/operators/CMakeLists.txt +++ b/paddle/operators/CMakeLists.txt @@ -54,4 +54,3 @@ op_library(fc_op SRCS fc_op.cc DEPS mul_op rowwise_add_op sigmoid_op softmax_op net) op_library(sgd_op SRCS sgd_op.cc sgd_op.cu) -cc_test(fc_op_test SRCS fc_op_test.cc DEPS fc_op) diff --git a/paddle/operators/fc_op_test.cc b/paddle/operators/fc_op_test.cc deleted file mode 100644 index 796b149afe..0000000000 --- a/paddle/operators/fc_op_test.cc +++ /dev/null @@ -1,26 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ - -#include -#include "paddle/framework/op_registry.h" -namespace f = paddle::framework; - -USE_OP_WITHOUT_KERNEL(fc); - -TEST(FC, create) { - for (size_t i = 0; i < 1000000; ++i) { - auto tmp = f::OpRegistry::CreateOp("fc", {"X", "W", "B"}, {"O"}, {}); - ASSERT_NE(tmp, nullptr); - } -} \ No newline at end of file
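
---

Reviewer note (not part of the patch series): the master-service changes above replace the old "recycle Done back into Todo" behavior with an explicit pass counter, and the client tests drive it through `StartGetRecords`, `NextRecord`, and the new `ErrPassBefore` / `ErrPassAfter` / `ErrNoMoreAvailable` sentinels. Below is a minimal, hedged sketch of how a trainer-side consumer would use that API, modeled on `go/master/client_test.go`. The import path, address, dataset glob, and pass count are assumptions for illustration; errors are compared by message because, as the removed comment in `GetTask` pointed out, errors deserialized from `net/rpc` are not the same instances as the package-level variables.

```go
// Illustrative sketch only — not part of this patch series.
// Assumed import path and master address; adjust to the real deployment.
package main

import (
	"fmt"
	"log"

	"github.com/PaddlePaddle/Paddle/go/master" // assumption: package path
)

func main() {
	// One client instance per consumer goroutine, as in client_test.go.
	c, err := master.NewClient(master.WithAddr(":8080"), master.WithBuffer(1))
	if err != nil {
		log.Fatal(err)
	}
	if err = c.SetDataset([]string{"/path/to/data-*"}); err != nil {
		log.Fatal(err)
	}

	for pass := 0; pass < 10; pass++ {
		// Tell the master which pass this client is about to consume.
		c.StartGetRecords(pass)
		for {
			r, err := c.NextRecord()
			if err != nil {
				// Compare by message: sentinels round-tripped through
				// net/rpc are different error instances.
				if err.Error() == master.ErrPassBefore.Error() ||
					err.Error() == master.ErrNoMoreAvailable.Error() {
					break // this pass is exhausted, move on to the next one
				}
				log.Fatal(err)
			}
			fmt.Printf("pass %d: got %d bytes\n", pass, len(r))
		}
	}
}
```

As the `TaskFinished` change suggests, the master only advances `currPass` once both the todo and pending queues drain, so a client that runs ahead should expect `ErrPassAfter` until slower clients catch up; how long the client library waits in that case is internal to the client and not shown here.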