From 5e87e27c757efc1b6f0cea06a39a5ebc6dea5ec7 Mon Sep 17 00:00:00 2001
From: lianxiaochen
Date: Fri, 23 Jun 2017 10:53:26 -0700
Subject: [PATCH 001/205] fix error clipping

---
 paddle/gserver/layers/Layer.cpp        |  9 ++++-----
 python/paddle/trainer/config_parser.py | 10 +++++++++-
 2 files changed, 13 insertions(+), 6 deletions(-)

diff --git a/paddle/gserver/layers/Layer.cpp b/paddle/gserver/layers/Layer.cpp
index 125aaf947f..b8a1c8d0fc 100644
--- a/paddle/gserver/layers/Layer.cpp
+++ b/paddle/gserver/layers/Layer.cpp
@@ -354,12 +354,11 @@ void Layer::backwardActivation() {
   /* Do error clipping */
   if (config_.error_clipping_threshold() > 0.0f) {
     if (FLAGS_log_error_clipping) {
-      CpuVector outGradVec(0, nullptr);
-      outGradVec.subVecFrom(
-          output_.grad->getData(), 0, output_.grad->getElementCnt());
-      real maxAbsGrad = outGradVec.getAbsMax();
+      VectorPtr outGradVec = Vector::create(
+          output_.grad->getData(), output_.grad->getElementCnt(), useGpu_);
+      real maxAbsGrad = outGradVec->getAbsMax();
       if (maxAbsGrad > config_.error_clipping_threshold()) {
-        real avgAbsGrad = outGradVec.getAbsSum() / outGradVec.getSize();
+        real avgAbsGrad = outGradVec->getAbsSum() / outGradVec->getSize();
         LOG(INFO) << " layer=" << config_.name() << " need clipping,"
                   << " max error=" << maxAbsGrad << " avg error=" << avgAbsGrad;
       }
diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py
index 58e4902f57..8dec50221f 100644
--- a/python/paddle/trainer/config_parser.py
+++ b/python/paddle/trainer/config_parser.py
@@ -1571,7 +1571,13 @@ class MultiClassCrossEntropySelfNormCostLayer(LayerBase):
 
 @config_layer('fc')
 class FCLayer(LayerBase):
-    def __init__(self, name, size, inputs, bias=True, **xargs):
+    def __init__(self,
+                 name,
+                 size,
+                 inputs,
+                 bias=True,
+                 error_clipping_threshold=None,
+                 **xargs):
         super(FCLayer, self).__init__(name, 'fc', size, inputs=inputs, **xargs)
         for input_index in xrange(len(self.inputs)):
             input_layer = self.get_input_layer(input_index)
@@ -1588,6 +1594,8 @@ class FCLayer(LayerBase):
             self.create_input_parameter(input_index, psize, dims, sparse, format)
         self.create_bias_parameter(bias, self.config.size)
+        if error_clipping_threshold is not None:
+            self.config.error_clipping_threshold = error_clipping_threshold
 
 
 @config_layer('selective_fc')

From 0e6ddcc7bc63eb6ddfe5f12f4d9060625befe41a Mon Sep 17 00:00:00 2001
From: liaogang
Date: Thu, 29 Jun 2017 10:01:10 +0800
Subject: [PATCH 002/205] ENH: Add GPU throw error

---
 paddle/platform/error.h | 87 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 87 insertions(+)
 create mode 100644 paddle/platform/error.h

diff --git a/paddle/platform/error.h b/paddle/platform/error.h
new file mode 100644
index 0000000000..93424bb610
--- /dev/null
+++ b/paddle/platform/error.h
@@ -0,0 +1,87 @@
+#pragma once
+
+#include <sstream>
+#include <stdexcept>
+#include <string>
+
+#ifndef PADDLE_ONLY_CPU
+
+#include <cublas_v2.h>
+#include <cudnn.h>
+#include <curand.h>
+#include <thrust/system/cuda/error.h>
+#include <thrust/system_error.h>
+
+#endif  // PADDLE_ONLY_CPU
+
+namespace paddle {
+namespace platform {
+
+#ifndef PADDLE_ONLY_CPU
+
+inline void throw_on_error(cudaError_t e, const char* message) {
+  if (e) {
+    throw thrust::system_error(e, thrust::cuda_category(), message);
+  }
+}
+
+inline void throw_on_error(curandStatus_t stat, const char* message) {
+  if (stat != CURAND_STATUS_SUCCESS) {
+    throw thrust::system_error(cudaErrorLaunchFailure, thrust::cuda_category(),
+                               message);
+  }
+}
+
+inline void throw_on_error(cudnnStatus_t stat, const char* message) {
+  std::stringstream ss;
+  if (stat == CUDNN_STATUS_SUCCESS) {
+    return;
+  } else {
+    ss << cudnnGetErrorString(stat);
+    ss << ", " << message;
+    throw std::runtime_error(ss.str());
+  }
+}
+
+inline void throw_on_error(cublasStatus_t stat, const char* message) {
+  std::stringstream ss;
+  if (stat == CUBLAS_STATUS_SUCCESS) {
+    return;
+  } else if (stat == CUBLAS_STATUS_NOT_INITIALIZED) {
+    ss << "CUBLAS: not initialized";
+  } else if (stat == CUBLAS_STATUS_ALLOC_FAILED) {
+    ss << "CUBLAS: alloc failed";
+  } else if (stat == CUBLAS_STATUS_INVALID_VALUE) {
+    ss << "CUBLAS: invalid value";
+  } else if (stat == CUBLAS_STATUS_ARCH_MISMATCH) {
+    ss << "CUBLAS: arch mismatch";
+  } else if (stat == CUBLAS_STATUS_MAPPING_ERROR) {
+    ss << "CUBLAS: mapping error";
+  } else if (stat == CUBLAS_STATUS_EXECUTION_FAILED) {
+    ss << "CUBLAS: execution failed";
+  } else if (stat == CUBLAS_STATUS_INTERNAL_ERROR) {
+    ss << "CUBLAS: internal error";
+  } else if (stat == CUBLAS_STATUS_NOT_SUPPORTED) {
+    ss << "CUBLAS: not supported";
+  } else if (stat == CUBLAS_STATUS_LICENSE_ERROR) {
+    ss << "CUBLAS: license error";
+  }
+  ss << ", " << message;
+  throw std::runtime_error(ss.str());
+}
+
+inline void throw_on_error(cublasStatus_t stat) {
+  const char* message = "";
+  throw_on_error(stat, message);
+}
+
+#endif  // PADDLE_ONLY_CPU
+
+inline void throw_on_error(int stat, const char* message) {
+  if (stat) {
+    throw std::runtime_error(message + (", stat = " + std::to_string(stat)));
+  }
+}
+
+}  // namespace platform
+}  // namespace paddle
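A minimal usage sketch for the overloads above (illustrative only; the wrapper function and message below are not part of the patch): any CUDA runtime call returning a status code can be checked in one line, with failures surfacing as C++ exceptions.

    #include <cuda_runtime.h>
    #include "paddle/platform/error.h"

    // Hypothetical helper: copy a host buffer to the device, throwing on failure.
    void CopyHostToDevice(void* dst, const void* src, size_t bytes) {
      paddle::platform::throw_on_error(
          cudaMemcpy(dst, src, bytes, cudaMemcpyHostToDevice),
          "cudaMemcpy failed in CopyHostToDevice");
    }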
From d3b77a5bc053b77309ecc094450e755604217674 Mon Sep 17 00:00:00 2001
From: liaogang
Date: Thu, 29 Jun 2017 13:56:38 +0800
Subject: [PATCH 003/205] ENH: Add Gpu info

---
 paddle/platform/gpu_info.cc | 49 +++++++++++++++++++++++++++++++
 paddle/platform/gpu_info.h  | 36 +++++++++++++++++++++++
 2 files changed, 85 insertions(+)
 create mode 100644 paddle/platform/gpu_info.cc
 create mode 100644 paddle/platform/gpu_info.h

diff --git a/paddle/platform/gpu_info.cc b/paddle/platform/gpu_info.cc
new file mode 100644
index 0000000000..4208d83078
--- /dev/null
+++ b/paddle/platform/gpu_info.cc
@@ -0,0 +1,49 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/platform/gpu_info.h"
+#include "gflags/gflags.h"
+#include "paddle/platform/error.h"
+
+DEFINE_double(fraction_of_gpu_memory_to_use, 0.95,
+              "Default use 95% of GPU memory for PaddlePaddle,"
+              "reserve the rest for page tables, etc");
+
+namespace paddle {
+namespace platform {
+
+int GpuDeviceCount() {
+  int count;
+  throw_on_error(
+      cudaGetDeviceCount(&count),
+      "cudaGetDeviceCount failed in paddle::platform::GpuDeviceCount");
+  return count;
+}
+
+void GpuMemoryUsage(size_t& available, size_t& total) {
+  throw_on_error(cudaMemGetInfo(&available, &total),
+                 "cudaMemGetInfo failed in paddle::platform::GetMemoryUsage");
+}
+
+size_t GpuMaxAllocSize() {
+  size_t total = 0;
+  size_t available = 0;
+
+  GpuMemoryUsage(available, total);
+
+  return total * FLAGS_fraction_of_gpu_memory_to_use;
+}
+
+}  // namespace platform
+}  // namespace paddle

diff --git a/paddle/platform/gpu_info.h b/paddle/platform/gpu_info.h
new file mode 100644
index 0000000000..174f093b43
--- /dev/null
+++ b/paddle/platform/gpu_info.h
@@ -0,0 +1,36 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#ifndef PADDLE_ONLY_CPU
+
+#include <stddef.h>
+
+namespace paddle {
+namespace platform {
+
+//! Get the total number of GPU devices in the system.
+int GpuDeviceCount();
+
+//! Get the memory usage of the current GPU device.
+void GpuMemoryUsage(size_t& available, size_t& total);
+
+//! Get the maximum allocation size of the current GPU device.
+size_t GpuMaxAllocSize();
+
+}  // namespace platform
+}  // namespace paddle
+
+#endif  // PADDLE_ONLY_CPU

From b29923f902dc6da1416a94bc153448f1546e62b2 Mon Sep 17 00:00:00 2001
From: liaogang
Date: Thu, 29 Jun 2017 13:56:57 +0800
Subject: [PATCH 004/205] ENH: Add CPU info

---
 paddle/platform/cpu_info.cc | 55 +++++++++++++++++++++++++++++++
 paddle/platform/cpu_info.h  | 26 ++++++++++++++++++
 2 files changed, 81 insertions(+)
 create mode 100644 paddle/platform/cpu_info.cc
 create mode 100644 paddle/platform/cpu_info.h

diff --git a/paddle/platform/cpu_info.cc b/paddle/platform/cpu_info.cc
new file mode 100644
index 0000000000..deff76502e
--- /dev/null
+++ b/paddle/platform/cpu_info.cc
@@ -0,0 +1,55 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/platform/cpu_info.h"
+
+#ifdef __APPLE__
+#include <sys/sysctl.h>
+#include <sys/types.h>
+#else
+#include <unistd.h>
+#endif
+
+#include "gflags/gflags.h"
+#include "paddle/platform/error.h"
+
+DEFINE_double(fraction_of_cpu_memory_to_use, 1,
+              "Default use 100% of CPU memory for PaddlePaddle,"
+              "reserve the rest for page tables, etc");
+
+namespace paddle {
+namespace platform {
+
+inline size_t CpuTotalPhysicalMemory() {
+#ifdef __APPLE__
+  int mib[2];
+  mib[0] = CTL_HW;
+  mib[1] = HW_MEMSIZE;
+  int64_t size = 0;
+  size_t len = sizeof(size);
+  if (sysctl(mib, 2, &size, &len, NULL, 0) == 0) return (size_t)size;
+  return 0L;
+#else
+  long pages = sysconf(_SC_PHYS_PAGES);
+  long page_size = sysconf(_SC_PAGE_SIZE);
+  return pages * page_size;
+#endif
+}
+
+size_t CpuTotalMemory() {
+  return FLAGS_fraction_of_cpu_memory_to_use * CpuTotalPhysicalMemory();
+}
+
+}  // namespace platform
+}  // namespace paddle

diff --git a/paddle/platform/cpu_info.h b/paddle/platform/cpu_info.h
new file mode 100644
index 0000000000..3b768589e1
--- /dev/null
+++ b/paddle/platform/cpu_info.h
@@ -0,0 +1,26 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <stddef.h>
+
+namespace paddle {
+namespace platform {
+
+//! Get the total memory on the host machine.
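+//! (The returned value is already scaled by FLAGS_fraction_of_cpu_memory_to_use;
+//! see CpuTotalPhysicalMemory() in cpu_info.cc for the raw physical amount.)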
+size_t CpuTotalMemory();
+
+}  // namespace platform
+}  // namespace paddle

From 169022d0148a77cd10f16a82e841a75750e7e173 Mon Sep 17 00:00:00 2001
From: liaogang
Date: Thu, 29 Jun 2017 14:04:47 +0800
Subject: [PATCH 005/205] FIX: Improve fallback gpu allocator

---
 paddle/memory/detail/CMakeLists.txt           |  4 +-
 paddle/memory/detail/system_allocator.cc      | 64 ++++++++++++++-----
 paddle/memory/detail/system_allocator.h       | 15 +++--
 paddle/memory/detail/system_allocator_test.cc | 14 ++--
 paddle/platform/CMakeLists.txt                |  4 ++
 paddle/platform/cpu_info_test.cc              | 18 ++++++
 paddle/platform/cuda.h                        | 40 ------------
 7 files changed, 85 insertions(+), 74 deletions(-)
 create mode 100644 paddle/platform/cpu_info_test.cc
 delete mode 100644 paddle/platform/cuda.h

diff --git a/paddle/memory/detail/CMakeLists.txt b/paddle/memory/detail/CMakeLists.txt
index 72d3749ad7..6caa97a76b 100644
--- a/paddle/memory/detail/CMakeLists.txt
+++ b/paddle/memory/detail/CMakeLists.txt
@@ -1,6 +1,8 @@
 if(${WITH_GPU})
   nv_library(system_allocator SRCS system_allocator.cc DEPS gflags)
-  nv_test(system_allocator_test SRCS system_allocator_test.cc DEPS system_allocator gflags)
+  nv_test(system_allocator_test
+          SRCS system_allocator_test.cc
+          DEPS system_allocator gpu_info gflags)
 else(${WITH_GPU})
   cc_library(system_allocator SRCS system_allocator.cc DEPS gflags)
   cc_test(system_allocator_test SRCS system_allocator_test.cc DEPS system_allocator gflags)
diff --git a/paddle/memory/detail/system_allocator.cc b/paddle/memory/detail/system_allocator.cc
index 50bec926f8..332ff062d4 100644
--- a/paddle/memory/detail/system_allocator.cc
+++ b/paddle/memory/detail/system_allocator.cc
@@ -13,32 +13,39 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/memory/detail/system_allocator.h"
+#include "paddle/platform/assert.h"
+#include "paddle/platform/error.h"
+#include "paddle/platform/gpu_info.h"
 
 #include <stdlib.h>    // for malloc and free
 #include <sys/mman.h>  // for mlock and munlock
 
 #include "gflags/gflags.h"
-#include "paddle/platform/assert.h"
-#include "paddle/platform/cuda.h"
 
 // If use_pinned_memory is true, CPUAllocator calls mlock, which
 // returns pinned and locked memory as staging areas for data exchange
 // between host and device. Allocating too much would reduce the amount
 // of memory available to the system for paging. So, by default, we
 // should set false to use_pinned_memory.
-DEFINE_bool(use_pinned_memory, false,
-            "If set, allocate cpu/gpu pinned memory.");
+DEFINE_bool(use_pinned_memory, false, "If set, allocate cpu pinned memory.");
 
 namespace paddle {
 namespace memory {
 namespace detail {
 
-void* CPUAllocator::Alloc(size_t size) {
+void* CPUAllocator::Alloc(size_t& index, size_t size) {
   // According to http://www.cplusplus.com/reference/cstdlib/malloc/,
   // malloc might not return nullptr if size is zero, but the returned
   // pointer shall not be dereferenced -- so we make it nullptr.
   if (size <= 0) return nullptr;
 
+  if (FLAGS_use_pinned_memory) {
+    void* p = malloc(size);
+    if (p != nullptr) {
+      mlock(p, size);
+    }
+  }
+
   void* p = malloc(size);
   if (p != nullptr && FLAGS_use_pinned_memory) {
     mlock(p, size);
@@ -46,7 +53,7 @@ void* CPUAllocator::Alloc(size_t size) {
   return p;
 }
 
-void CPUAllocator::Free(void* p, size_t size) {
+void CPUAllocator::Free(void* p, size_t size, size_t index) {
   if (p != nullptr && FLAGS_use_pinned_memory) {
     munlock(p, size);
   }
@@ -55,29 +62,52 @@ void CPUAllocator::Free(void* p, size_t size) {
 
 #ifndef PADDLE_ONLY_CPU
 
-void* GPUAllocator::Alloc(size_t size) {
+void* GPUAllocator::Alloc(size_t& index, size_t size) {
   // CUDA documentation doesn't explain if cudaMalloc returns nullptr
   // if size is 0. We just make sure it does.
-  if (size <= 0) {
-    return nullptr;
-  }
+  if (size <= 0) return nullptr;
 
+  size_t available = 0;
+  size_t capacity = 0;
+  paddle::platform::GpuMemoryUsage(available, capacity);
+
+  // Reserve memory for page tables, etc.
+  size_t reserving = capacity - paddle::platform::GpuMaxAllocSize();
+  size_t remaining = available > reserving ? available - reserving : 0;
+
+  // If the remaining size is no less than the requested size, use plain
+  // cudaMalloc to allocate GPU memory.
   void* p = 0;
-  cudaError_t result =
-      FLAGS_use_pinned_memory ? cudaMallocHost(&p, size) : cudaMalloc(&p, size);
-  if (result != cudaSuccess) {
-    cudaGetLastError();  // clear error if there is any.
+  if (size <= remaining) {
+    cudaError_t result = cudaMalloc(&p, size);
+    if (result == cudaSuccess) {
+      index = 0;
+      total_alloc_size_ += size;
+      return p;
+    }
   }
-  return result == cudaSuccess ? p : nullptr;
+
+  // If the remaining size is smaller than the requested size, or cudaMalloc
+  // failed, cudaMallocHost is used as a fallback allocator.
+  cudaError_t result = cudaMallocHost(&p, size);
+  if (result == cudaSuccess) {
+    index = 1;
+    total_alloc_size_ += size;
+    return p;
+  }
+
+  return nullptr;
 }
 
-void GPUAllocator::Free(void* p, size_t size) {
+void GPUAllocator::Free(void* p, size_t size, size_t index) {
   // Purposefully allow cudaErrorCudartUnloading, because
   // that is returned if you ever call cudaFree after the
   // driver has already shutdown. This happens only if the
   // process is terminating, in which case we don't care if
   // cudaFree succeeds.
-  cudaError_t err = FLAGS_use_pinned_memory ? cudaFreeHost(p) : cudaFree(p);
+  PADDLE_ASSERT(total_alloc_size_ >= size);
+  total_alloc_size_ -= size;
+  cudaError_t err = index == 1 ? cudaFreeHost(p) : cudaFree(p);
   if (err != cudaErrorCudartUnloading) {
     platform::throw_on_error(err, "cudaFree{Host} failed");
   }
diff --git a/paddle/memory/detail/system_allocator.h b/paddle/memory/detail/system_allocator.h
index 184b383f7f..e15302ce4f 100644
--- a/paddle/memory/detail/system_allocator.h
+++ b/paddle/memory/detail/system_allocator.h
@@ -30,21 +30,24 @@ namespace detail {
 class SystemAllocator {
  public:
   virtual ~SystemAllocator() {}
-  virtual void* Alloc(size_t size) = 0;
-  virtual void Free(void* p, size_t size) = 0;
+  virtual void* Alloc(size_t& index, size_t size) = 0;
+  virtual void Free(void* p, size_t size, size_t index) = 0;
 };
 
 class CPUAllocator : public SystemAllocator {
  public:
-  virtual void* Alloc(size_t size);
-  virtual void Free(void* p, size_t size);
+  virtual void* Alloc(size_t& index, size_t size);
+  virtual void Free(void* p, size_t size, size_t index);
 };
 
 #ifndef PADDLE_ONLY_CPU
 class GPUAllocator : public SystemAllocator {
  public:
-  virtual void* Alloc(size_t size);
-  virtual void Free(void* p, size_t size);
+  virtual void* Alloc(size_t& index, size_t size);
+  virtual void Free(void* p, size_t size, size_t index);
+
+ private:
+  size_t total_alloc_size_ = 0;
 };
 #endif  // PADDLE_ONLY_CPU
 
diff --git a/paddle/memory/detail/system_allocator_test.cc b/paddle/memory/detail/system_allocator_test.cc
index 9bd5706a4e..ba44e06ddb 100644
--- a/paddle/memory/detail/system_allocator_test.cc
+++ b/paddle/memory/detail/system_allocator_test.cc
@@ -25,7 +25,8 @@ DECLARE_bool(use_pinned_memory);
 void TestAllocator(paddle::memory::detail::SystemAllocator& a, size_t size) {
   bool freed = false;
   {
-    void* p = a.Alloc(size);
+    size_t index;
+    void* p = a.Alloc(index, size);
     if (size > 0) {
       EXPECT_NE(p, nullptr);
     } else {
@@ -35,7 +36,7 @@ void TestAllocator(paddle::memory::detail::SystemAllocator& a, size_t size) {
     int* i = static_cast<int*>(p);
     std::shared_ptr<int> ptr(i, [&](void* p) {
       freed = true;
-      a.Free(p, size);
+      a.Free(p, size, index);
     });
   }
   EXPECT_TRUE(freed);
@@ -56,14 +57,7 @@ TEST(CPUAllocator, LockMem) {
 }
 
 #ifndef PADDLE_ONLY_CPU
-TEST(GPUAllocator, NoStaging) {
-  FLAGS_use_pinned_memory = false;
-  paddle::memory::detail::GPUAllocator a;
-  TestAllocator(a, 2048);
-  TestAllocator(a, 0);
-}
-TEST(GPUAllocator, Staging) {
-  FLAGS_use_pinned_memory = true;
+TEST(GPUAllocator, Alloc) {
   paddle::memory::detail::GPUAllocator a;
   TestAllocator(a, 2048);
   TestAllocator(a, 0);
diff --git a/paddle/platform/CMakeLists.txt b/paddle/platform/CMakeLists.txt
index 7abe2ab89e..17342356d6 100644
--- a/paddle/platform/CMakeLists.txt
+++ b/paddle/platform/CMakeLists.txt
@@ -1,3 +1,7 @@
+cc_library(cpu_info SRCS cpu_info.cc)
+cc_test(cpu_info_test SRCS cpu_info_test.cc DEPS cpu_info gflags)
+
+nv_library(gpu_info SRCS gpu_info.cc)
 nv_test(cuda_test SRCS cuda_test.cu)
 
 cc_library(place SRCS place.cc)
diff --git a/paddle/platform/cpu_info_test.cc b/paddle/platform/cpu_info_test.cc
new file mode 100644
index 0000000000..5b7ce7c763
--- /dev/null
+++ b/paddle/platform/cpu_info_test.cc
@@ -0,0 +1,18 @@
+#include "paddle/platform/cpu_info.h"
+
+#include <iostream>
+#include <sstream>
+
+#include "gflags/gflags.h"
+#include "gtest/gtest.h"
+
+DECLARE_double(fraction_of_cpu_memory_to_use);
+
+TEST(CpuMemoryUsage, Print) {
+  std::stringstream ss;
+  size_t mem_size = paddle::platform::CpuTotalMemory() / 1024 / 1024 / 1024;
+  ss << std::to_string(
+            static_cast<size_t>(FLAGS_fraction_of_cpu_memory_to_use * 100))
+     << "% of CPU Memory Usage: " << mem_size << " GB";
+  std::cout << ss.str();
+}
diff --git a/paddle/platform/cuda.h b/paddle/platform/cuda.h
deleted file mode 100644
index 8fe891f9ce..0000000000
--- a/paddle/platform/cuda.h
+++ /dev/null
@@ -1,40 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#ifndef PADDLE_ONLY_CPU
-
-#include <thrust/system/cuda/error.h>
-#include <thrust/system_error.h>
-
-namespace paddle {
-namespace platform {
-
-inline void throw_on_error(cudaError_t e, const char* message) {
-  if (e) {
-    throw thrust::system_error(e, thrust::cuda_category(), message);
-  }
-}
-
-int GetDeviceCount(void) {
-  int count;
-  throw_on_error(cudaGetDeviceCount(&count), "cudaGetDeviceCount failed");
-  return count;
-}
-
-}  // namespace platform
-}  // namespace paddle
-
-#endif  // PADDLE_ONLY_CPU

From e6c14f7e000d047cf3d3a1e18e2a13e3349b1ff9 Mon Sep 17 00:00:00 2001
From: liaogang
Date: Thu, 29 Jun 2017 16:30:03 +0800
Subject: [PATCH 006/205] ENH: Polish cpu info interface

---
 paddle/platform/CMakeLists.txt   |  3 +-
 paddle/platform/cpu_info.cc      | 14 +++++++-
 paddle/platform/cpu_info.h       | 10 ++++--
 paddle/platform/cpu_info_test.cc | 13 ++++---
 paddle/platform/cuda_test.cu     | 59 --------------------------------
 5 files changed, 30 insertions(+), 69 deletions(-)
 delete mode 100644 paddle/platform/cuda_test.cu

diff --git a/paddle/platform/CMakeLists.txt b/paddle/platform/CMakeLists.txt
index d0bedf6ba9..969c91985d 100644
--- a/paddle/platform/CMakeLists.txt
+++ b/paddle/platform/CMakeLists.txt
@@ -1,8 +1,7 @@
 cc_library(cpu_info SRCS cpu_info.cc)
-cc_test(cpu_info_test SRCS cpu_info_test.cc DEPS cpu_info gflags)
+cc_test(cpu_info_test SRCS cpu_info_test.cc DEPS cpu_info gflags glog)
 
 nv_library(gpu_info SRCS gpu_info.cc)
-nv_test(cuda_test SRCS cuda_test.cu)
 
 cc_library(place SRCS place.cc)
 cc_test(place_test SRCS place_test.cc DEPS place glog gflags)
diff --git a/paddle/platform/cpu_info.cc b/paddle/platform/cpu_info.cc
index deff76502e..3da04420e5 100644
--- a/paddle/platform/cpu_info.cc
+++ b/paddle/platform/cpu_info.cc
@@ -47,9 +47,21 @@ inline size_t CpuTotalPhysicalMemory() {
 #endif
 }
 
-size_t CpuTotalMemory() {
+size_t CpuMaxAllocSize() {
+  // For distributed systems, it is necessary to configure and limit
+  // the fraction of memory to use.
   return FLAGS_fraction_of_cpu_memory_to_use * CpuTotalPhysicalMemory();
 }
 
+size_t CpuMinChunkSize() {
+  // The minimum chunk size allowed to be allocated is 256 bytes.
+  return 1 << 8;
+}
+
+size_t CpuMaxChunkSize() {
+  // The maximum chunk size allowed is roughly 3% of the usable CPU memory.
+  return CpuMaxAllocSize() / 32;
+}
+
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/platform/cpu_info.h b/paddle/platform/cpu_info.h
index 3b768589e1..8df7c7b4bc 100644
--- a/paddle/platform/cpu_info.h
+++ b/paddle/platform/cpu_info.h
@@ -19,8 +19,14 @@ limitations under the License. */
 namespace paddle {
 namespace platform {
 
-//! Get the total memory on the host machine.
-size_t CpuTotalMemory();
+//! Get the maximum allocation size for a machine.
+size_t CpuMaxAllocSize();
+
+//! Get the minimum chunk size for the buddy allocator.
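+//! (Currently fixed at 256 bytes; see CpuMinChunkSize() in cpu_info.cc.)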
+size_t CpuMinChunkSize(); + +//! Get the maximum chunk size for buddy allocator. +size_t CpuMaxChunkSize(); } // namespace platform } // namespace paddle diff --git a/paddle/platform/cpu_info_test.cc b/paddle/platform/cpu_info_test.cc index 5b7ce7c763..8fb195aa7c 100644 --- a/paddle/platform/cpu_info_test.cc +++ b/paddle/platform/cpu_info_test.cc @@ -1,18 +1,21 @@ #include "paddle/platform/cpu_info.h" +#include "paddle/string/printf.h" #include #include #include "gflags/gflags.h" +#include "glog/logging.h" #include "gtest/gtest.h" DECLARE_double(fraction_of_cpu_memory_to_use); TEST(CpuMemoryUsage, Print) { std::stringstream ss; - size_t mem_size = paddle::platform::CpuTotalMemory() / 1024 / 1024 / 1024; - ss << std::to_string( - static_cast(FLAGS_fraction_of_cpu_memory_to_use * 100)) - << "% of CPU Memory Usage: " << mem_size << " GB"; - std::cout << ss.str(); + size_t memory_size = paddle::platform::CpuMaxAllocSize() / 1024 / 1024 / 1024; + float use_percent = FLAGS_fraction_of_cpu_memory_to_use * 100; + + std::cout << paddle::string::Sprintf("\n%.2f %% of CPU Memory Usage: %d GB\n", + use_percent, memory_size) + << std::endl; } diff --git a/paddle/platform/cuda_test.cu b/paddle/platform/cuda_test.cu deleted file mode 100644 index 4067dda2f1..0000000000 --- a/paddle/platform/cuda_test.cu +++ /dev/null @@ -1,59 +0,0 @@ -#include -#include -#include "gtest/gtest.h" - -#define CHECK_ERR(x) \ - if (x != cudaSuccess) { \ - fprintf(stderr, \ - "%s in %s at line %d\n", \ - cudaGetErrorString(err), \ - __FILE__, \ - __LINE__); \ - exit(-1); \ - } - -__global__ void vecAdd(float *d_A, float *d_B, float *d_C, int n) { - int i = blockDim.x * blockIdx.x + threadIdx.x; - if (i < n) { - d_C[i] = d_A[i] + d_B[i]; - } -} - -TEST(Cuda, Equality) { - int n = 10; - // Memory allocation for h_A, h_B and h_C (in the host) - float h_A[10] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 0.0}; - float h_B[10] = {0.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0}; - float h_C[10]; - float *d_A, *d_B, *d_C; - cudaError_t err; - // Memory allocation for d_A, d_B and d_C (in the device) - err = cudaMalloc((void **)&d_A, sizeof(float) * n); - CHECK_ERR(err); - - err = cudaMalloc((void **)&d_B, sizeof(float) * n); - CHECK_ERR(err); - - err = cudaMalloc((void **)&d_C, sizeof(float) * n); - CHECK_ERR(err); - - // Copying memory to device - err = cudaMemcpy(d_A, h_A, sizeof(float) * n, cudaMemcpyHostToDevice); - CHECK_ERR(err); - - err = cudaMemcpy(d_B, h_B, sizeof(float) * n, cudaMemcpyHostToDevice); - CHECK_ERR(err); - - // Calling the kernel - vecAdd<<>>(d_A, d_B, d_C, n); - - // Copying results back to host - err = cudaMemcpy(h_C, d_C, sizeof(float) * n, cudaMemcpyDeviceToHost); - CHECK_ERR(err); - - EXPECT_EQ(h_C[0], 1.0); - for (int i = 1; i < n - 1; ++i) { - EXPECT_EQ(h_C[i], 11.0); - } - EXPECT_EQ(h_C[9], 1.0); -} From 6e7209f0584f73eb22313d98c676333379736d1e Mon Sep 17 00:00:00 2001 From: liaogang Date: Thu, 29 Jun 2017 16:30:32 +0800 Subject: [PATCH 007/205] ENH: Add gpu info interface --- paddle/platform/gpu_info.cc | 25 +++++++++++++++++++++++++ paddle/platform/gpu_info.h | 6 ++++++ 2 files changed, 31 insertions(+) diff --git a/paddle/platform/gpu_info.cc b/paddle/platform/gpu_info.cc index 4208d83078..d6c6fe34ef 100644 --- a/paddle/platform/gpu_info.cc +++ b/paddle/platform/gpu_info.cc @@ -42,8 +42,33 @@ size_t GpuMaxAllocSize() { GpuMemoryUsage(available, total); + // Reserve the rest for page tables, etc. 
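+  // (With the default flag value, this permits up to 95% of total device memory.)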
   return total * FLAGS_fraction_of_gpu_memory_to_use;
 }
 
+size_t GpuMinChunkSize() {
+  // The minimum chunk size allowed to be allocated is 256 bytes.
+  return 1 << 8;
+}
+
+size_t GpuMaxChunkSize() {
+  // The maximum chunk size is the usable GPU memory: total memory minus
+  // the reservation for page tables, etc.
+  size_t total = 0;
+  size_t available = 0;
+
+  GpuMemoryUsage(available, total);
+
+  // Reserve the rest of the memory for page tables, etc.
+  size_t reserving = (1 - FLAGS_fraction_of_gpu_memory_to_use) * total;
+
+  // If less than the minimum chunk size is available, no usable memory exists.
+  available = std::max(available, GpuMinChunkSize()) - GpuMinChunkSize();
+
+  // If less than the reservation is available, no usable memory exists.
+  size_t usable = std::max(available, reserving) - reserving;
+
+  return usable;
+}
+
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/platform/gpu_info.h b/paddle/platform/gpu_info.h
index 174f093b43..d7bf0f4093 100644
--- a/paddle/platform/gpu_info.h
+++ b/paddle/platform/gpu_info.h
@@ -30,6 +30,12 @@ void GpuMemoryUsage(size_t& available, size_t& total);
 //! Get the maximum allocation size of the current GPU device.
 size_t GpuMaxAllocSize();
 
+//! Get the minimum chunk size for the GPU buddy allocator.
+size_t GpuMinChunkSize();
+
+//! Get the maximum chunk size for the GPU buddy allocator.
+size_t GpuMaxChunkSize();
+
 }  // namespace platform
 }  // namespace paddle
 

From 464886bf56cd91ffcd6617390d62dbd13c90a093 Mon Sep 17 00:00:00 2001
From: liaogang
Date: Thu, 29 Jun 2017 16:31:05 +0800
Subject: [PATCH 008/205] FIX: fix typo in piece.h

---
 paddle/string/piece.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddle/string/piece.h b/paddle/string/piece.h
index db7c3e6980..0272529d1c 100644
--- a/paddle/string/piece.h
+++ b/paddle/string/piece.h
@@ -35,7 +35,7 @@ public:
 
   // We provide non-explicit singleton constructors so users can
   // pass in a "const char*" or a "string" wherever a "Piece"
-  // is expected. These contructors ensure that if data_ is NULL,
+  // is expected. These constructors ensure that if data_ is NULL,
   // size_ is 0.
   Piece();
   Piece(const char* d, size_t n);

From 26cd0bb5a59d913f8c216ceee0c6abb46317e31e Mon Sep 17 00:00:00 2001
From: liaogang
Date: Thu, 29 Jun 2017 19:13:24 +0800
Subject: [PATCH 009/205] ENH: count allocated fallback size for performance

---
 paddle/memory/detail/system_allocator.cc      | 52 +++++++++++++------
 paddle/memory/detail/system_allocator.h       |  3 +-
 .../paddle/trainer_config_helpers/networks.py |  4 +-
 3 files changed, 39 insertions(+), 20 deletions(-)

diff --git a/paddle/memory/detail/system_allocator.cc b/paddle/memory/detail/system_allocator.cc
index 332ff062d4..2b0fbfa87e 100644
--- a/paddle/memory/detail/system_allocator.cc
+++ b/paddle/memory/detail/system_allocator.cc
@@ -39,22 +39,22 @@ void* CPUAllocator::Alloc(size_t& index, size_t size) {
   // pointer shall not be dereferenced -- so we make it nullptr.
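+  // `index` reports how the block was obtained (0 = pageable malloc,
+  // 1 = mlock'ed pinned memory), so Free() can release it with matching calls.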
   if (size <= 0) return nullptr;
 
-  if (FLAGS_use_pinned_memory) {
-    void* p = malloc(size);
-    if (p != nullptr) {
-      mlock(p, size);
-    }
-  }
+  index = 0;  // unlocked memory
 
   void* p = malloc(size);
-  if (p != nullptr && FLAGS_use_pinned_memory) {
-    mlock(p, size);
+
+  if (p != nullptr) {
+    if (FLAGS_use_pinned_memory) {
+      index = 1;
+      mlock(p, size);  // lock memory
+    }
   }
+
   return p;
 }
 
 void CPUAllocator::Free(void* p, size_t size, size_t index) {
-  if (p != nullptr && FLAGS_use_pinned_memory) {
+  if (p != nullptr && index == 1) {
     munlock(p, size);
   }
   free(p);
@@ -73,26 +73,34 @@ void* GPUAllocator::Alloc(size_t& index, size_t size) {
 
   // Reserve memory for page tables, etc.
   size_t reserving = capacity - paddle::platform::GpuMaxAllocSize();
-  size_t remaining = available > reserving ? available - reserving : 0;
+  size_t usable = available > reserving ? available - reserving : 0;
 
   // If the remaining size is no less than the requested size, use plain
   // cudaMalloc to allocate GPU memory.
   void* p = 0;
-  if (size <= remaining) {
+  if (size <= usable) {
     cudaError_t result = cudaMalloc(&p, size);
     if (result == cudaSuccess) {
       index = 0;
-      total_alloc_size_ += size;
+      gpu_alloc_size_ += size;
       return p;
     }
   }
 
   // If the remaining size is smaller than the requested size, or cudaMalloc
   // failed, cudaMallocHost is used as a fallback allocator.
+  //
+  // NOTE: here, we use GpuMaxAllocSize() as the maximum memory size
+  // of host fallback allocation. Allocating too much would reduce
+  // the amount of memory available to the underlying system for paging.
+  usable = paddle::platform::GpuMaxAllocSize() - fallback_alloc_size_;
+
+  if (size > usable) return nullptr;
+
   cudaError_t result = cudaMallocHost(&p, size);
   if (result == cudaSuccess) {
     index = 1;
-    total_alloc_size_ += size;
+    fallback_alloc_size_ += size;
     return p;
   }
 
   return nullptr;
 }
 
 void GPUAllocator::Free(void* p, size_t size, size_t index) {
+  cudaError_t err;
+
+  if (index == 0) {
+    PADDLE_ASSERT(gpu_alloc_size_ >= size);
+    gpu_alloc_size_ -= size;
+    err = cudaFree(p);
+  } else {
+    PADDLE_ASSERT(fallback_alloc_size_ >= size);
+    fallback_alloc_size_ -= size;
+    err = cudaFreeHost(p);
+  }
+
   // Purposefully allow cudaErrorCudartUnloading, because
   // that is returned if you ever call cudaFree after the
   // driver has already shutdown. This happens only if the
   // process is terminating, in which case we don't care if
   // cudaFree succeeds.
-  PADDLE_ASSERT(total_alloc_size_ >= size);
-  total_alloc_size_ -= size;
-  cudaError_t err = index == 1 ?
cudaFreeHost(p) : cudaFree(p); if (err != cudaErrorCudartUnloading) { - platform::throw_on_error(err, "cudaFree{Host} failed"); + platform::throw_on_error(err, + "cudaFree{Host} failed in GPUAllocator::Free."); } } diff --git a/paddle/memory/detail/system_allocator.h b/paddle/memory/detail/system_allocator.h index e15302ce4f..7093c42967 100644 --- a/paddle/memory/detail/system_allocator.h +++ b/paddle/memory/detail/system_allocator.h @@ -47,7 +47,8 @@ class GPUAllocator : public SystemAllocator { virtual void Free(void* p, size_t size, size_t index); private: - size_t total_alloc_size_ = 0; + size_t gpu_alloc_size_ = 0; + size_t fallback_alloc_size_ = 0; }; #endif // PADDLE_ONLY_CPU diff --git a/python/paddle/trainer_config_helpers/networks.py b/python/paddle/trainer_config_helpers/networks.py index 67154a8d7d..1bf59ed484 100755 --- a/python/paddle/trainer_config_helpers/networks.py +++ b/python/paddle/trainer_config_helpers/networks.py @@ -1381,7 +1381,7 @@ def inputs(layers, *args): if len(args) != 0: layers.extend(args) - Inputs(* [l.name for l in layers]) + Inputs(*[l.name for l in layers]) def outputs(layers, *args): @@ -1424,7 +1424,7 @@ def outputs(layers, *args): assert len(layers) > 0 if HasInputsSet(): # input already set - Outputs(* [l.name for l in layers]) + Outputs(*[l.name for l in layers]) return # just return outputs. if len(layers) != 1: From fb51c3dc895b78df966dd0d9713657289b1986b3 Mon Sep 17 00:00:00 2001 From: liaogang Date: Thu, 29 Jun 2017 19:57:10 +0800 Subject: [PATCH 010/205] FIX: add compile dependency gflags --- paddle/platform/CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/platform/CMakeLists.txt b/paddle/platform/CMakeLists.txt index 969c91985d..5cbe491b2b 100644 --- a/paddle/platform/CMakeLists.txt +++ b/paddle/platform/CMakeLists.txt @@ -1,7 +1,7 @@ -cc_library(cpu_info SRCS cpu_info.cc) +cc_library(cpu_info SRCS cpu_info.cc DEPS gflags) cc_test(cpu_info_test SRCS cpu_info_test.cc DEPS cpu_info gflags glog) -nv_library(gpu_info SRCS gpu_info.cc) +nv_library(gpu_info SRCS gpu_info.cc DEPS gflags) cc_library(place SRCS place.cc) cc_test(place_test SRCS place_test.cc DEPS place glog gflags) From 275e5b7d42903ea3c9bf4e4fed3f9eab45c727bf Mon Sep 17 00:00:00 2001 From: liaogang Date: Mon, 3 Jul 2017 11:12:18 +0800 Subject: [PATCH 011/205] FIX: yapf format version --- python/paddle/trainer_config_helpers/networks.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/paddle/trainer_config_helpers/networks.py b/python/paddle/trainer_config_helpers/networks.py index f0b6625dc3..b77932ce5f 100755 --- a/python/paddle/trainer_config_helpers/networks.py +++ b/python/paddle/trainer_config_helpers/networks.py @@ -1395,7 +1395,7 @@ def inputs(layers, *args): if len(args) != 0: layers.extend(args) - Inputs(*[l.name for l in layers]) + Inputs(* [l.name for l in layers]) def outputs(layers, *args): @@ -1438,7 +1438,7 @@ def outputs(layers, *args): assert len(layers) > 0 if HasInputsSet(): # input already set - Outputs(*[l.name for l in layers]) + Outputs(* [l.name for l in layers]) return # just return outputs. 
     if len(layers) != 1:

From 40573cd56f723ebde6328ccd5dabe4a363c9f3db Mon Sep 17 00:00:00 2001
From: Superjom
Date: Mon, 3 Jul 2017 14:41:43 +0800
Subject: [PATCH 012/205] add net headers

---
 paddle/framework/net.cc |  23 +++++
 paddle/framework/net.h  | 182 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 205 insertions(+)
 create mode 100644 paddle/framework/net.cc
 create mode 100644 paddle/framework/net.h

diff --git a/paddle/framework/net.cc b/paddle/framework/net.cc
new file mode 100644
index 0000000000..0ce9296820
--- /dev/null
+++ b/paddle/framework/net.cc
@@ -0,0 +1,23 @@
+#include "paddle/framework/net.h"
+
+namespace paddle {
+namespace framework {
+
+PlainNet::PlainNet(const NetDesc& def) {}
+
+Error PlainNet::InferShape(Scope* scope) {
+  for (auto& op : ops_) {
+    // stop at the first operator that reports a wrong shape
+    auto err = op.InferShape();
+    if (!err) return err;
+  }
+  // ok
+  return Error();
+}
+
+Error PlainNet::Run(Scope* scope, OpContext* context, OpIndex begin,
+                    OpIndex end) const {
+  return Error();
+}
+
+}  // namespace framework
+}  // namespace paddle

diff --git a/paddle/framework/net.h b/paddle/framework/net.h
new file mode 100644
index 0000000000..88bdf0bb68
--- /dev/null
+++ b/paddle/framework/net.h
@@ -0,0 +1,182 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+
+#include "paddle/framework/scope.h"
+
+namespace paddle {
+namespace framework {
+
+// operator's index stored in a network.
+typedef int OpIndex;
+/**
+ * NOTE the following are definitions of some as-yet unimplemented concepts.
+ * We write a basic implementation here to make Net compilable. These APIs will
+ * keep changing as the related concepts are implemented.
+ */
+
+// Operator's runtime context.
+struct OpContext {
+  enum DevType { kCPU, kGPU };
+  int dev_id;
+  DevType dev_type{kCPU};
+};
+
+// Proto definitions; use `struct`s for simplicity.
+struct VarDesc {
+  std::string type;
+  std::vector<int> dims;
+};
+struct OpDesc {
+  std::string type;
+  std::vector<std::string> inputs;
+  std::vector<std::string> outputs;
+};
+struct NetDesc {
+  std::vector<OpDesc> ops;
+};
+class Operator {
+ public:
+  Operator(const OpDesc &def) {}
+  Error InferShape() { return Error(); }
+  Error Run() { return Error(); }
+};
+
+/**
+ * @brief Network that manages the operators it holds.
+ *
+ * A Network is the container and controller of a set of operators; users can
+ * build a real network from a NetDesc, which is a protobuf message, and use
+ * Network.Run() to run all the operators in the network.
+
+ * A network object knows all Operators belonging to this network. Variables,
+ * which are inputs and outputs of these operators, are created and managed by a
+ * hierarchy of Scope objects.
+ *
+ * This is the base class of networks; all networks should implement the APIs
+ * it defines.
+ */
+class Net {
+ public:
+  /**
+   * @brief Infer shapes of all inputs and outputs of operators.
+   */
+  virtual Error InferShape(Scope *scope) = 0;
+  /**
+   * @brief Run the network.
+   *
+   * Run all the operators and return success(true) or not, with all the
+   * variables located in `scope`. `context` describes the detailed execution
+   * environment for ops. `begin` and `end` specify the range of `ops_` to run;
+   * if no positive indexes are provided, all operators in `ops_` will run.
+   */
+  virtual Error Run(Scope *scope, OpContext *context, OpIndex begin = -1,
+                    OpIndex end = -1) const = 0;
+
+  /**
+   * @brief Add an Operator according to `def`.
+   */
+  virtual OpIndex AddOp(const proto::OpDef &def) = 0;
+
+  /**
+   * @brief Add optimizer operators according to `attrs`.
+   */
+  virtual Error AddOptimizerOps(const OptAttrs &attrs) = 0;
+
+  /**
+   * @brief Add backward operators.
+   */
+  virtual Error AddBackwardOps() = 0;
+
+  /**
+   * @brief Create a network.
+   */
+  static std::unique_ptr<Net> Create(const NetDesc &def = NetDesc());
+};
+
+/**
+ * @brief A basic implementation of Net.
+ *
+ * PlainNet is a very simple Net: it creates a list of operators and runs them
+ * sequentially, in the order they were added.
+ */
+class PlainNet : public Net {
+ public:
+  /**
+   * @brief Initialize a PlainNet.
+   *
+   * Initialize from a network described by `def`. NetDesc is the definition
+   * of a network.
+   */
+  PlainNet(const NetDesc &def);
+
+  /**
+   * Infer the shapes of all the operators' input and output variables;
+   * this will be called before every mini-batch.
+   */
+  virtual Error InferShape(Scope *scope) override;
+
+  /**
+   * @brief Run the network.
+   *
+   * Run all the operators with the `scope`; if no scope is provided, the
+   * default scope will be used instead. If no OpContext is provided, the
+   * default context will be used.
+   */
+  virtual Error Run(Scope *scope = nullptr, OpContext *context = nullptr,
+                    OpIndex begin = -1, OpIndex end = -1) const override;
+
+  /**
+   * @brief Add an operator to this network.
+   */
+  virtual OpIndex AddOp(const proto::OpDef &def) override;
+
+  /**
+   * @brief Add all related optimizer operators into the network.
+   */
+  virtual Error AddOptimizerOps(const OptAttrs &attrs) override;
+
+  /**
+   * @brief Add all related backward operators into the network.
+   */
+  virtual Error AddBackwardOps() override;
+
+ protected:
+  /**
+   * @brief Build the network.
+   *
+   * Create operators according to `def`; will be called by the constructor.
+   */
+  Error BuildNet(const NetDesc &def);
+
+  /**
+   * @brief Add an operator into this network.
+   *
+   * Add an operator which is identified as `type` and has attributes described
+   * in `attrs`; the `inputs` are the keys of readonly input variables,
+   * `outputs` are keys of mutable output variables. An `OpIndex` will be
+   * returned to indicate the offset of the new operator in `ops_`.
+   */
+  OpIndex AddOp(const std::string &type, const std::vector<std::string> &inputs,
+                const std::vector<std::string> &outputs,
+                const OprAttr &attrs = OprAttr());
+
+ private:
+  // the operators owned by `Network`.
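+  // (Run() executes them sequentially, in the order they were added.)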
+  std::vector<Operator> ops_;
+};
+
+}  // namespace framework
+}  // namespace paddle

From e25c155f3954ee8cde673f39e8f82c5baebd99c6 Mon Sep 17 00:00:00 2001
From: liaogang
Date: Mon, 3 Jul 2017 19:51:32 +0800
Subject: [PATCH 013/205] ENH: Add useGpu in system allocator

---
 paddle/memory/detail/system_allocator.cc | 4 ++++
 paddle/memory/detail/system_allocator.h  | 4 +++-
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/paddle/memory/detail/system_allocator.cc b/paddle/memory/detail/system_allocator.cc
index 2b0fbfa87e..75a2c91ef9 100644
--- a/paddle/memory/detail/system_allocator.cc
+++ b/paddle/memory/detail/system_allocator.cc
@@ -60,6 +60,8 @@ void CPUAllocator::Free(void* p, size_t size, size_t index) {
   free(p);
 }
 
+bool CPUAllocator::UseGpu() { return false; }
+
 #ifndef PADDLE_ONLY_CPU
 
 void* GPUAllocator::Alloc(size_t& index, size_t size) {
@@ -131,6 +133,8 @@ void GPUAllocator::Free(void* p, size_t size, size_t index) {
   }
 }
 
+bool GPUAllocator::UseGpu() { return true; }
+
 #endif  // PADDLE_ONLY_CPU
 
 }  // namespace detail
diff --git a/paddle/memory/detail/system_allocator.h b/paddle/memory/detail/system_allocator.h
index 7093c42967..f3bbfef843 100644
--- a/paddle/memory/detail/system_allocator.h
+++ b/paddle/memory/detail/system_allocator.h
@@ -32,12 +32,14 @@ class SystemAllocator {
   virtual ~SystemAllocator() {}
   virtual void* Alloc(size_t& index, size_t size) = 0;
   virtual void Free(void* p, size_t size, size_t index) = 0;
+  virtual bool UseGpu() = 0;
 };
 
 class CPUAllocator : public SystemAllocator {
  public:
   virtual void* Alloc(size_t& index, size_t size);
   virtual void Free(void* p, size_t size, size_t index);
+  virtual bool UseGpu();
 };
 
 #ifndef PADDLE_ONLY_CPU
 class GPUAllocator : public SystemAllocator {
  public:
   virtual void* Alloc(size_t& index, size_t size);
   virtual void Free(void* p, size_t size, size_t index);
-
+  virtual bool UseGpu();
+
  private:
   size_t gpu_alloc_size_ = 0;
   size_t fallback_alloc_size_ = 0;
 };
 #endif  // PADDLE_ONLY_CPU

From 929f9cbdff08090a222495db7db601f164cebb8c Mon Sep 17 00:00:00 2001
From: liaogang
Date: Mon, 3 Jul 2017 19:52:04 +0800
Subject: [PATCH 014/205] ENH: Add Metadata for memory block

---
 paddle/memory/detail/metadata.cc | 62 ++++++++++++++++++++++++++++++++
 paddle/memory/detail/metadata.h  | 53 +++++++++++++++++++++++++++
 2 files changed, 115 insertions(+)
 create mode 100644 paddle/memory/detail/metadata.cc
 create mode 100644 paddle/memory/detail/metadata.h

diff --git a/paddle/memory/detail/metadata.cc b/paddle/memory/detail/metadata.cc
new file mode 100644
index 0000000000..4607cd8512
--- /dev/null
+++ b/paddle/memory/detail/metadata.cc
@@ -0,0 +1,62 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/memory/detail/metadata.h"
+
+#include <functional>
+
+namespace paddle {
+namespace memory {
+namespace detail {
+
+Metadata::Metadata(MemoryBlock::Type t, size_t i, size_t s, size_t ts,
+                   MemoryBlock* l, MemoryBlock* r)
+    : type(t),
+      index(i),
+      size(s),
+      total_size(ts),
+      left_buddy(l),
+      right_buddy(r) {}
+
+template <class T>
+inline void hash_combine(std::size_t& seed, const T& v) {
+  std::hash<T> hasher;
+  seed ^= hasher(v) + 0x9e3779b9 + (seed << 6) + (seed >> 2);
+}
+
+inline size_t hash(const Metadata* metadata, size_t initial_seed) {
+  size_t seed = initial_seed;
+
+  hash_combine(seed, (size_t)metadata->type);
+  hash_combine(seed, metadata->index);
+  hash_combine(seed, metadata->size);
+  hash_combine(seed, metadata->total_size);
+  hash_combine(seed, metadata->left_buddy);
+  hash_combine(seed, metadata->right_buddy);
+
+  return seed;
+}
+
+void Metadata::update_guards() {
+  guard_begin = hash(this, 1);
+  guard_end = hash(this, 2);
+}
+
+bool Metadata::check_guards() const {
+  return guard_begin == hash(this, 1) && guard_end == hash(this, 2);
+}
+
+}  // namespace detail
+}  // namespace memory
+}  // namespace paddle

diff --git a/paddle/memory/detail/metadata.h b/paddle/memory/detail/metadata.h
new file mode 100644
index 0000000000..ddb826571b
--- /dev/null
+++ b/paddle/memory/detail/metadata.h
@@ -0,0 +1,53 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+
+#include "paddle/memory/detail/memory_block.h"
+
+#include <stddef.h>
+
+namespace paddle {
+namespace memory {
+namespace detail {
+
+class Metadata {
+ public:
+  Metadata(MemoryBlock::Type t, size_t i, size_t s, size_t ts, MemoryBlock* l,
+           MemoryBlock* r);
+
+ public:
+  /*! \brief Update the guards when metadata is changed */
+  void update_guards();
+
+  /*! \brief Check consistency to previous modification */
+  bool check_guards() const;
+
+ public:
+  // TODO(gangliao): compress this
+  // clang-format off
+  size_t            guard_begin = 0;
+  MemoryBlock::Type type        = MemoryBlock::INVALID_CHUNK;
+  size_t            index       = 0;
+  size_t            size        = 0;
+  size_t            total_size  = 0;
+  MemoryBlock*      left_buddy  = nullptr;
+  MemoryBlock*      right_buddy = nullptr;
+  size_t            guard_end   = 0;
+  // clang-format on
+};
+
+}  // namespace detail
+}  // namespace memory
+}  // namespace paddle

From bbd3eab7ee88f02131edb41738a966aa0f1a0e88 Mon Sep 17 00:00:00 2001
From: liaogang
Date: Mon, 3 Jul 2017 19:54:32 +0800
Subject: [PATCH 015/205] ENH: Add Alloc for buddy Allocator

* Free will be added soon

---
 paddle/memory/detail/buddy_allocator.cc | 157 ++++++++++++++++++++++--
 paddle/memory/detail/buddy_allocator.h  |  88 +++++++++----
 2 files changed, 209 insertions(+), 36 deletions(-)

diff --git a/paddle/memory/detail/buddy_allocator.cc b/paddle/memory/detail/buddy_allocator.cc
index ebe680f5ee..2462ba084b 100644
--- a/paddle/memory/detail/buddy_allocator.cc
+++ b/paddle/memory/detail/buddy_allocator.cc
@@ -12,22 +12,161 @@ See the License for the specific language governing permissions and
 limitations under the License. */
-#pragma once
-
 #include "paddle/memory/detail/buddy_allocator.h"
+#include "glog/logging.h"
 
 namespace paddle {
 namespace memory {
 namespace detail {
 
-BuddyAllocator::BuddyAllocator(size_t pool_size, size_t max_pools,
-                               SystemAllocator* system_allocator)
-    : pool_size_(pool_size),
-      max_pools_(max_pools),
-      system_allocator_(system_allocator) {
-  PADDLE_ASSERT(pool_size > 0);
-  PADDLE_ASSERT(max_pools > 0);
+BuddyAllocator::BuddyAllocator(SystemAllocator* system_allocator,
+                               size_t min_chunk_size, size_t max_chunk_size) {
+  PADDLE_ASSERT(min_chunk_size > 0);
+  PADDLE_ASSERT(max_chunk_size > 0);
   PADDLE_ASSERT(system_allocator != nullptr);
+
+  system_allocator_ = std::move(system_allocator);
+  min_chunk_size_ = min_chunk_size;
+  max_chunk_size_ = max_chunk_size;
+}
+
+inline size_t align(size_t size, size_t alignment) {
+  size_t remaining = size % alignment;
+  return remaining == 0 ? size : size + (alignment - remaining);
+}
+
+void* BuddyAllocator::Alloc(size_t unaligned_size) {
+  // adjust allocation alignment
+  size_t size = align(unaligned_size + sizeof(Metadata), min_chunk_size_);
+
+  // acquire the allocator lock
+  std::lock_guard<std::mutex> lock(mutex_);
+
+  DLOG(INFO) << "Allocate " << unaligned_size << " bytes from chunk size "
+             << size;
+
+  // if the allocation is huge, send directly to the system allocator
+  if (size > max_chunk_size_) {
+    DLOG(INFO) << "Allocate from system allocator.";
+
+    return SystemAlloc(size);
+  }
+
+  // query and allocate from the existing chunk
+  auto it = FindExistChunk(size);
+
+  // refill the pool if failure
+  if (it == pool_.end()) {
+    it = RefillPool();
+  } else {
+    DLOG(INFO) << " Allocation from existing memory block " << std::get<2>(*it)
+               << " at address "
+               << reinterpret_cast<MemoryBlock*>(std::get<2>(*it))->data();
+  }
+
+  // if it is still failing, give up and return nullptr
+  if (it == pool_.end()) {
+    return nullptr;
+  }
+
+  total_used_ += size;
+  total_free_ -= size;
+
+  // split the allocation and return data for use
+  return reinterpret_cast<MemoryBlock*>(SplitToAlloc(it, size))->data();
+}
+
+void* BuddyAllocator::SystemAlloc(size_t size) {
+  size_t index = 0;
+  void* p = system_allocator_->Alloc(index, size);
+
+  DLOG(INFO) << "Allocated " << p << " from system allocator.";
+
+  if (p == nullptr) return nullptr;
+
+  static_cast<MemoryBlock*>(p)->init(cache_, MemoryBlock::HUGE_CHUNK, index,
+                                     size, nullptr, nullptr);
+
+  return static_cast<MemoryBlock*>(p)->data();
+}
+
+BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool() {
+#ifndef PADDLE_ONLY_CPU
+  if (system_allocator_->UseGpu()) {
+    if ((total_used_ + total_free_) == 0) {
+      // Compute the maximum allocation size for the first allocation.
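+      // (GpuMaxChunkSize() queries the memory currently free on the device,
+      // so it is deferred to first use rather than fixed in the constructor.)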
+      max_chunk_size_ = platform::GpuMaxChunkSize();
+    }
+  }
+#endif  // PADDLE_ONLY_CPU
+
+  // Allocate a new maximum sized block
+  size_t index = 0;
+  void* p = system_allocator_->Alloc(index, max_chunk_size_);
+
+  if (p == nullptr) return pool_.end();
+
+  DLOG(INFO) << " Creating and inserting new block " << p
+             << " from system allocator";
+
+  static_cast<MemoryBlock*>(p)->init(cache_, MemoryBlock::FREE_CHUNK, index,
+                                     max_chunk_size_, nullptr, nullptr);
+
+  total_free_ += max_chunk_size_;
+
+  // dump the block into pool
+  return pool_.insert({index, max_chunk_size_, p}).first;
+}
+
+BuddyAllocator::PoolSet::iterator BuddyAllocator::FindExistChunk(size_t size) {
+  size_t index = 0;
+
+  while (1) {
+    auto it = pool_.lower_bound({index, size, nullptr});
+    if (it == pool_.end()) return it;
+
+    if (std::get<0>(*it) > index) {
+      if (std::get<1>(*it) >= size) {
+        return it;
+      }
+
+      index = std::get<0>(*it);
+      continue;
+    }
+    return it;
+  }
+}
+
+void* BuddyAllocator::SplitToAlloc(BuddyAllocator::PoolSet::iterator it,
+                                   size_t size) {
+  auto block = static_cast<MemoryBlock*>(std::get<2>(*it));
+
+  pool_.erase(it);
+
+  DLOG(INFO) << " Split block (" << block << ", " << block->total_size(cache_)
+             << ") into";
+
+  block->split(cache_, size);
+
+  DLOG(INFO) << " Left block (" << block << ", " << block->total_size(cache_)
+             << ")";
+
+  block->set_type(cache_, MemoryBlock::ARENA_CHUNK);
+
+  // the rest of memory if exist
+  if (block->has_right_buddy(cache_)) {
+    if (block->right_buddy(cache_)->type(cache_) == MemoryBlock::FREE_CHUNK) {
+      DLOG(INFO) << " Insert right block (" << block->right_buddy(cache_)
+                 << ", " << block->right_buddy(cache_)->total_size(cache_)
+                 << ")";
+
+      pool_.insert({block->right_buddy(cache_)->index(cache_),
+                    block->right_buddy(cache_)->total_size(cache_),
+                    block->right_buddy(cache_)});
+    }
+  }
+
+  return block;
 }
 
 }  // namespace detail
diff --git a/paddle/memory/detail/buddy_allocator.h b/paddle/memory/detail/buddy_allocator.h
index 82e6aaedc7..38bedc9a18 100644
--- a/paddle/memory/detail/buddy_allocator.h
+++ b/paddle/memory/detail/buddy_allocator.h
@@ -15,9 +15,15 @@
 #pragma once
 
 #include "paddle/memory/detail/system_allocator.h"
+#include "paddle/memory/detail/metadata.h"
+#include "paddle/platform/assert.h"
+#include "paddle/platform/cpu_info.h"
+#include "paddle/platform/gpu_info.h"
 
 #include <mutex>
+#include <set>
+#include <unordered_map>
 #include <vector>
 
 namespace paddle {
 namespace memory {
 namespace detail {
 
 class BuddyAllocator {
  public:
-  BuddyAllocator(size_t pool_size, size_t max_pools,
-                 SystemAllocator* system_allocator);
+  BuddyAllocator(SystemAllocator* system_allocator, size_t min_chunk_size,
+                 size_t max_chunk_size);
+  ~BuddyAllocator();
 
+ public:
   void* Alloc(size_t unaligned_size);
   void Free(void*);
   size_t Used();
 
+ public:
+  // Disable copy and assignment.
+  BuddyAllocator(const BuddyAllocator&) = delete;
+  BuddyAllocator& operator=(const BuddyAllocator&) = delete;
+
  private:
-  struct Block {
-    size_t size_;
-    Block* left_;   // left buddy
-    Block* right_;  // right buddy
-  };
+  // Tuple type: allocator index, memory size, memory address
+  using IndexSizeAddress = std::tuple<size_t, size_t, void*>;
+  using PoolSet = std::set<IndexSizeAddress>;
 
+  /*!
\brief Allocate fixed-size memory from system */ + void* SystemAlloc(size_t size); - size_t pool_size_; // the size of each pool; - size_t max_num_pools_; // the size of all pools; + /*! \brief If existing chunks are not suitable, refill pool */ + PoolSet::iterator RefillPool(); - SystemAllocator* system_allocator_; + /** + * \brief Find the suitable chunk from existing pool + * + * \param it pool iterator which contains suitable block. + * \param size the size of allocation. + */ + void* SplitToAlloc(PoolSet::iterator it, size_t size); - std::mutex mutex_; + /*! \brief Find the existing chunk which used to allocation */ + PoolSet::iterator FindExistChunk(size_t size); - // Disable copy and assignment. - BuddyAllocator(const BuddyAllocator&) = delete; - BuddyAllocator& operator=(const BuddyAllocator&) = delete; + private: + size_t total_used_ = 0; // the total size of used memory + size_t total_free_ = 0; // the total size of free memory + + size_t min_chunk_size_; // the minimum size of each chunk + size_t max_chunk_size_; // the maximum size of each chunk + + private: + PoolSet pool_; + + private: + // Unify the metadata format between GPU and CPU allocations + using MetadataCache = std::unordered_map; + MetadataCache cache_; + + private: + SystemAllocator* system_allocator_; + std::mutex mutex_; }; -BuddyAllocator* GetCPUBuddyAllocator() { - static BuddyAllocator* a = nullptr; +BuddyAllocator* GetCPUBuddyAllocator() { + static BuddyAllocator* a = nullptr; if (a == nullptr) { - a = new BuddyAllocator(); + a = new BuddyAllocator(new CPUAllocator, platform::CpuMinChunkSize(), + platform::CpuMaxChunkSize()); } return a; } #ifndef PADDLE_ONLY_CPU // The following code are for CUDA. -BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) { - static BuddyAllocator** as = NULL; +BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) { + static BuddyAllocator** as = NULL; if (as == NULL) { - int gpu_num = platform::GetDeviceCount(); - as = new BuddyAllocator*[gpu_num]; + int gpu_num = platform::GpuDeviceCount(); + as = new BuddyAllocator*[gpu_num]; for (int gpu = 0; gpu < gpu_num; gpu++) { - as[gpu] = new BuddyAllocator(); + as[gpu] = + new BuddyAllocator(new GPUAllocator, platform::GpuMinChunkSize(), + platform::GpuMaxChunkSize()); } } return as[gpu_id]; From e25c155f3954ee8cde673f39e8f82c5baebd99c6 Mon Sep 17 00:00:00 2001 From: gongweibao Date: Tue, 4 Jul 2017 02:37:31 +0000 Subject: [PATCH 016/205] add taskfail interface --- go/master/client.go | 5 +++ go/master/service.go | 99 ++++++++++++++++++++++++++++---------------- 2 files changed, 68 insertions(+), 36 deletions(-) diff --git a/go/master/client.go b/go/master/client.go index d3bea49d0a..b6ca8cad15 100644 --- a/go/master/client.go +++ b/go/master/client.go @@ -112,6 +112,11 @@ func (c *Client) taskFinished(taskID int) error { return c.conn.Call("Service.TaskFinished", taskID, nil) } +// TaskFailed tell the master server as task is failed. +func (c *Client) taskFailed(taskID int, epoch int) error { + return c.conn.Call("Service.TaskFinished", taskID, epoch) +} + // NextRecord returns next record in the dataset. // // NextRecord will block until the next record is available. It is diff --git a/go/master/service.go b/go/master/service.go index 58e68e7448..b078f318f5 100644 --- a/go/master/service.go +++ b/go/master/service.go @@ -34,29 +34,30 @@ type Chunk struct { // Task is the basic unit of data instances assigned to trainers. 
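+// Each dispatch of a task bumps Task.Epoch; checkTaskStatus ignores timeout or
+// failure reports that carry a stale epoch, so only the latest dispatch counts.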
@@ -267,30 +296,7 @@ func (s *Service) checkTimeoutFunc(taskID int, epoch int) func() {
 			return
 		}
 
-		if t.Epoch != epoch {
-			// new epoch, task launched after the
-			// schedule of this timeout check.
-			return
-		}
-
-		defer func() {
-			err := s.snapshot()
-			if err != nil {
-				log.Errorln(err)
-			}
-		}()
-
-		delete(s.taskQueues.Pending, t.Task.ID)
-
-		t.NumTimeout++
-		if t.NumTimeout > s.timeoutMax {
-			log.Warningf("Task %v timed out %d times, discard.", t.Task, t.NumTimeout)
-			s.taskQueues.Failed = append(s.taskQueues.Failed, t.Task)
-			return
-		}
-
-		log.Warningf("Task %v timed out %d times, retry.", t.Task, t.NumTimeout)
-		s.taskQueues.Todo = append(s.taskQueues.Todo, t)
+		s.checkTaskStatus(t, epoch)
 	}
 }
 
@@ -339,7 +345,7 @@ func (s *Service) GetTask(dummy int, task *Task) error {
 	}
 
 	t := s.taskQueues.Todo[0]
-	t.Epoch++
+	t.Task.Epoch++
 	s.taskQueues.Todo = s.taskQueues.Todo[1:]
 	s.taskQueues.Pending[t.Task.ID] = t
 	err := s.snapshot()
@@ -348,9 +354,9 @@ func (s *Service) GetTask(dummy int, task *Task) error {
 	}
 
 	*task = t.Task
-	log.WithFields(s.logFields()).Infof("Task #%d dispatched.", task.ID)
+	log.WithFields(s.logFields()).Infof("Task #%v dispatched.", t)
 
-	time.AfterFunc(s.timeoutDur, s.checkTimeoutFunc(t.Task.ID, t.Epoch))
+	time.AfterFunc(s.timeoutDur, s.checkTimeoutFunc(t.Task.ID, t.Task.Epoch))
 	return nil
 }
 
@@ -372,6 +378,7 @@ func (s *Service) TaskFinished(taskID int, dummy *int) error {
 
 	// task finished, reset timeout
 	t.NumTimeout = 0
+	t.FailedNum = 0
 	s.taskQueues.Done = append(s.taskQueues.Done, t)
 	delete(s.taskQueues.Pending, taskID)
 
@@ -389,3 +396,23 @@ func (s *Service) TaskFinished(taskID int, dummy *int) error {
 	}
 	return err
 }
+
+// TaskFailed tells the service that a task has failed.
+func (s *Service) TaskFailed(taskID int, epoch int) error {
+	select {
+	case <-s.ready:
+	}
+
+	s.mu.Lock()
+	defer s.mu.Unlock()
+
+	t, ok := s.taskQueues.Pending[taskID]
+	if !ok {
+		err := errors.New("pending task not found")
+		log.WithFields(s.logFields()).Warningf("TaskFailed: pending task #%d not found.", taskID)
+		return err
+	}
+
+	s.checkTaskStatus(t, epoch)
+	return nil
+}

From 52cc601b48f6f5e179efa79bb2ba5442d42eac75 Mon Sep 17 00:00:00 2001
From: gongweibao
Date: Tue, 4 Jul 2017 02:41:47 +0000
Subject: [PATCH 017/205] fix bugs

---
 go/master/service.go | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/go/master/service.go b/go/master/service.go
index b078f318f5..c47319317a 100644
--- a/go/master/service.go
+++ b/go/master/service.go
@@ -41,7 +41,7 @@ type Task struct {
 type taskEntry struct {
 	NumTimeout int
 	Task       Task
-	FailedNum  int
+	NumFailed  int
 }
 
@@ -275,13 +275,13 @@ func (s *Service) checkTaskStatus(t taskEntry, epoch int) {
 	delete(s.taskQueues.Pending, t.Task.ID)
 
 	t.NumTimeout++
-	if t.NumTimeout+t.FailedNum > s.failortimeoutMax {
-		log.Warningf("Task %v timed out %d times and failed %d times, discard.", t.Task, t.NumTimeout, t.FailedNum)
+	if t.NumTimeout+t.NumFailed > s.failortimeoutMax {
+		log.Warningf("Task %v timed out %d times and failed %d times, discard.", t.Task, t.NumTimeout, t.NumFailed)
 		s.taskQueues.Failed = append(s.taskQueues.Failed, t)
 		return
 	}
 
-	log.Warningf("Task %v timed out %d times and failed %d times, retry.", t.Task, t.NumTimeout, t.FailedNum)
+	log.Warningf("Task %v timed out %d times and failed %d times, retry.", t.Task, t.NumTimeout, t.NumFailed)
 	s.taskQueues.Todo = append(s.taskQueues.Todo, t)
 	return
 }
@@ -378,7 +378,7 @@ func (s *Service) TaskFinished(taskID int, dummy *int) error {
 
 	// task finished, reset timeout
 	t.NumTimeout = 0
-	t.FailedNum = 0
+	t.NumFailed = 0
 	s.taskQueues.Done = append(s.taskQueues.Done, t)
 	delete(s.taskQueues.Pending, taskID)
 
From 
9f365d36364d34f2cf186d5bc0569189145c612d Mon Sep 17 00:00:00 2001 From: dongzhihong Date: Tue, 4 Jul 2017 11:23:49 +0800 Subject: [PATCH 018/205] "add net proto" --- paddle/framework/CMakeLists.txt | 4 +++ paddle/framework/net.h | 48 ++++++++++---------------------- paddle/framework/net_proto.proto | 16 +++++++++++ 3 files changed, 35 insertions(+), 33 deletions(-) create mode 100644 paddle/framework/net_proto.proto diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt index f7e5753ac2..8c34a77c20 100644 --- a/paddle/framework/CMakeLists.txt +++ b/paddle/framework/CMakeLists.txt @@ -7,4 +7,8 @@ cc_test(scope_test SRCS scope_test.cc) cc_test(enforce_test SRCS enforce_test.cc) proto_library(attr_type SRCS attr_type.proto) proto_library(op_proto SRCS op_proto.proto) + cc_test(op_proto_test SRCS op_proto_test.cc DEPS op_proto attr_type protobuf) + +proto_library(net_proto SRCS net_proto.proto) +cc_library(net SRCS net.cc DEPS net_proto attr_type op_proto) diff --git a/paddle/framework/net.h b/paddle/framework/net.h index 88bdf0bb68..b3064e4f90 100644 --- a/paddle/framework/net.h +++ b/paddle/framework/net.h @@ -14,6 +14,8 @@ #pragma once +#include "paddle/framework/net_proto.pb.h" +#include "paddle/framework/op_proto.pb.h" #include "paddle/framework/scope.h" namespace paddle { @@ -27,31 +29,11 @@ typedef int OpIndex; * keep updating if the concepts related are implemented. */ -// Operator's runtime context. -struct OpContext { - int dev_id; - DevType dev_type{kCPU}; - enum DevType { kCPU, kGPU }; -}; - -// Proto definitions, use `struct`s for simpility. -struct VarDesc { - std::string type; - std::vector dims; -}; -struct OpDesc { - std::string type; - std::vector inputs; - std::vector outputs; -}; -struct struct NetDesc { - std::vector ops; -}; class Operator { public: Operator(const OpDesc &def) {} - Error InferShape() {} - Error Run() {} + bool InferShape() {} + bool Run() {} }; /** @@ -73,7 +55,7 @@ class Net { /** * @brief Infer shapes of all inputs and outputs of operators. */ - virtual Error InferShape(Scope *scope) override; + virtual bool InferShape(Scope *scope) override; /** * @brief Run the network. * @@ -82,8 +64,8 @@ class Net { * environment for ops. `begin` and `end` specify the scope of `ops_` to run, * If no positive indexes are provided, all operators in `ops_` will run. */ - virtual Error Run(Scope *scope, OpContext *context, OpIndex begin = -1, - OpIndex end = -1) const = 0; + virtual bool Run(Scope *scope, OpContext *context, OpIndex begin = -1, + OpIndex end = -1) const = 0; /** * @brief Add an Operator according to `def`. @@ -93,12 +75,12 @@ class Net { /** * @brief Add optimizer operators acctording to `attrs`. */ - virtual Error AddOptimizerOps(const OptAttrs &attrs) = 0; + virtual bool AddOptimizerOps(const OptAttrs &attrs) = 0; /** * @brief Add backward operators. */ - virtual Error AddBackwardOps() = 0; + virtual bool AddBackwardOps() = 0; /** * @brief Create a network. @@ -126,7 +108,7 @@ class PlainNet : public Net { * Infer all the operators' input and output varialbes' shapes, will be called * before every mini-batch */ - virtual Error InferShape(Scope *scope) override; + virtual bool InferShape(Scope *scope) override; /** * @brief Run the network. @@ -135,8 +117,8 @@ class PlainNet : public Net { * scope will be used instead. If no OpContext is provicded, default context * will be used. 
*/ - virtual Error Run(Scope *scope = nullptr, OpContext *context = nullptr, - OpIndex begin = -1, OpIndex end = -1) const override; + virtual bool Run(Scope *scope = nullptr, OpContext *context = nullptr, + OpIndex begin = -1, OpIndex end = -1) const override; /** * @brief Add an operator to this network. @@ -146,12 +128,12 @@ class PlainNet : public Net { /** * @brief Add all optimizer operators related into the network. */ - virtual Error AddOptimizerOps(const OptAttrs &attrs) override; + virtual bool AddOptimizerOps(const OptAttrs &attrs) override; /** * @brief Add all backward operators related into the network. */ - virtual Error AddBackwardOps() override; + virtual bool AddBackwardOps() override; protected: /** @@ -159,7 +141,7 @@ class PlainNet : public Net { * * Create operators accordding to `def`, will be called by the constructor. */ - Error BuildNet(const NetDesc &def); + bool BuildNet(const NetDesc &def); /** * @brief Add an operator into this network. diff --git a/paddle/framework/net_proto.proto b/paddle/framework/net_proto.proto new file mode 100644 index 0000000000..e9aed8f349 --- /dev/null +++ b/paddle/framework/net_proto.proto @@ -0,0 +1,16 @@ +syntax="proto2"; +package paddle.framework; + +import "op_proto.proto" + +message NetDesc { + // network identification + optional string name = 1; + // operator contains in network + repeated OpProto operators = 2; + // network type to run with. e.g "plainNet", "DAG" + optional string type = 3; + // num worker always + optional int32 num_workers = 4; +} + From 4e1617d05994bda1a9eb0e0b5b563249cc12f271 Mon Sep 17 00:00:00 2001 From: liaogang Date: Tue, 4 Jul 2017 13:15:00 +0800 Subject: [PATCH 019/205] ENH: add buddy alloctor Free --- paddle/memory/detail/CMakeLists.txt | 3 + paddle/memory/detail/buddy_allocator.cc | 19 ++-- paddle/memory/detail/buddy_allocator.h | 10 +- paddle/memory/detail/memory_block.cc | 145 ++++++++++++++++++++++++ paddle/memory/detail/memory_block.h | 97 ++++++++++++++++ paddle/memory/detail/system_allocator.h | 1 + 6 files changed, 262 insertions(+), 13 deletions(-) create mode 100644 paddle/memory/detail/memory_block.cc create mode 100644 paddle/memory/detail/memory_block.h diff --git a/paddle/memory/detail/CMakeLists.txt b/paddle/memory/detail/CMakeLists.txt index 6caa97a76b..dbc98a8a62 100644 --- a/paddle/memory/detail/CMakeLists.txt +++ b/paddle/memory/detail/CMakeLists.txt @@ -7,3 +7,6 @@ else(${WITH_GPU}) cc_library(system_allocator SRCS system_allocator.cc DEPS gflags) cc_test(system_allocator_test SRCS system_allocator_test.cc DEPS system_allocator gflags) endif(${WITH_GPU}) + +cc_library(metadata SRCS metadata.cc) +cc_library(buddy_allocator SRCS buddy_allocator.cc) diff --git a/paddle/memory/detail/buddy_allocator.cc b/paddle/memory/detail/buddy_allocator.cc index 2462ba084b..e8d694327d 100644 --- a/paddle/memory/detail/buddy_allocator.cc +++ b/paddle/memory/detail/buddy_allocator.cc @@ -58,17 +58,16 @@ void* BuddyAllocator::Alloc(size_t unaligned_size) { // refill the pool if failure if (it == pool_.end()) { it = RefillPool(); + // if still failure, fail fatally + if (it == pool_.end()) { + return nullptr; + } } else { DLOG(INFO) << " Allocation from existing memory block " << std::get<2>(*it) << " at address " << reinterpret_cast(std::get<2>(*it))->data(); } - // if still failure, fail fatally - if (it == pool_.end()) { - return nullptr; - } - total_used_ += size; total_free_ -= size; @@ -76,6 +75,13 @@ void* BuddyAllocator::Alloc(size_t unaligned_size) { return 
      reinterpret_cast<MemoryBlock*>(SplitToAlloc(it, size))->data();
 }
 
+void BuddyAllocator::Free(void* p) {
+  auto block = static_cast<MemoryBlock*>(p)->metadata();
+
+  // acquire the allocator lock
+  std::lock_guard<std::mutex> lock(mutex_);
+}
+
 void* BuddyAllocator::SystemAlloc(size_t size) {
   size_t index = 0;
   void* p = system_allocator_->Alloc(index, size);
@@ -140,17 +146,14 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::FindExistChunk(size_t size) {
 void* BuddyAllocator::SplitToAlloc(BuddyAllocator::PoolSet::iterator it,
                                    size_t size) {
   auto block = static_cast<MemoryBlock*>(std::get<2>(*it));
-
   pool_.erase(it);
 
   DLOG(INFO) << " Split block (" << block << ", " << block->total_size(cache_)
              << ") into";
-
   block->split(cache_, size);
 
   DLOG(INFO) << " Left block (" << block << ", " << block->total_size(cache_)
              << ")";
-
   block->set_type(cache_, MemoryBlock::ARENA_CHUNK);
 
   // the rest of memory if exist
diff --git a/paddle/memory/detail/buddy_allocator.h b/paddle/memory/detail/buddy_allocator.h
index 38bedc9a18..4006bdcce8 100644
--- a/paddle/memory/detail/buddy_allocator.h
+++ b/paddle/memory/detail/buddy_allocator.h
@@ -14,16 +14,16 @@
 
 #pragma once
 
-#include "paddle/memory/detail/system_allocator.h"
 #include "paddle/memory/detail/metadata.h"
+#include "paddle/memory/detail/system_allocator.h"
 #include "paddle/platform/assert.h"
 #include "paddle/platform/cpu_info.h"
 #include "paddle/platform/gpu_info.h"
 
-#include <set>
 #include <mutex>
-#include <vector>
+#include <set>
 #include <unordered_map>
+#include <vector>
 
 namespace paddle {
 namespace memory {
@@ -57,9 +57,9 @@ class BuddyAllocator {
   /*! \brief If existing chunks are not suitable, refill pool */
   PoolSet::iterator RefillPool();
-
-  /** 
+  /**
    * \brief Find the suitable chunk from existing pool
-   * 
+   *
    * \param it pool iterator which contains suitable block.
    * \param size the size of allocation.
    */
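
[Editor's note, not part of the patch series] The new Free() above recovers a block from a payload pointer via static_cast<MemoryBlock*>(p)->metadata(), relying on the layout that memory_block.cc below implements: every chunk starts with a metadata record, and data() points one record past it. A self-contained sketch of that pointer arithmetic, with a hypothetical Header type standing in for the patch's metadata record:

    #include <cassert>
    #include <cstddef>
    #include <cstdlib>

    struct Header {  // stand-in for the metadata record
      size_t size;
      size_t total_size;
    };

    // Given the start of a chunk, the payload lives right past the header.
    void* payload_of(void* chunk) { return static_cast<Header*>(chunk) + 1; }

    // Given a payload pointer, step back one header to recover the chunk.
    Header* header_of(void* payload) { return static_cast<Header*>(payload) - 1; }

    int main() {
      void* chunk = std::malloc(sizeof(Header) + 64);
      Header* h = static_cast<Header*>(chunk);
      h->size = 64;
      h->total_size = sizeof(Header) + 64;

      void* p = payload_of(chunk);    // what Alloc() would hand out
      assert(header_of(p) == h);      // what Free() does to get back
      assert(header_of(p)->size == 64);

      std::free(chunk);
      return 0;
    }
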
diff --git a/paddle/memory/detail/memory_block.cc b/paddle/memory/detail/memory_block.cc
new file mode 100644
index 0000000000..1c9e87df49
--- /dev/null
+++ b/paddle/memory/detail/memory_block.cc
@@ -0,0 +1,145 @@
+#include "paddle/memory/detail/memory_block.h"
+#include "paddle/platform/assert.h"
+
+namespace paddle {
+namespace memory {
+namespace detail {
+
+void MemoryBlock::init(MetadataCache& cache, Type t, size_t index, size_t size,
+                       void* left_buddy, void* right_buddy) {
+  cache.store(this,
+              MemoryBlockMetadata(t, index, size - overhead(), size,
+                                  static_cast<MemoryBlock*>(left_buddy),
+                                  static_cast<MemoryBlock*>(right_buddy)));
+}
+
+MemoryBlock::Type MemoryBlock::type(MetadataCache& cache) const {
+  return cache.load(this).type;
+}
+
+size_t MemoryBlock::size(MetadataCache& cache) const {
+  return cache.load(this).size;
+}
+
+size_t MemoryBlock::total_size(MetadataCache& cache) const {
+  return cache.load(this).total_size;
+}
+
+MemoryBlock* MemoryBlock::left_buddy(MetadataCache& cache) const {
+  return cache.load(this).left_buddy;
+}
+
+MemoryBlock* MemoryBlock::right_buddy(MetadataCache& cache) const {
+  return cache.load(this).right_buddy;
+}
+
+void MemoryBlock::split(MetadataCache& cache, size_t size) {
+  // make sure the split fits
+  assert(total_size(cache) >= size);
+
+  // bail out if there is no room for another partition
+  if (total_size(cache) - size <= overhead()) {
+    return;
+  }
+
+  // find the position of the split
+  void* right_partition = reinterpret_cast<uint8_t*>(this) + size;
+
+  size_t remaining_size = total_size(cache) - size;
+
+  // Add the new block as a buddy
+  auto metadata = cache.load(this);
+
+  // Write the metadata for the new block
+  auto new_block_right_buddy = metadata.right_buddy;
+
+  cache.store(static_cast<MemoryBlock*>(right_partition),
+              MemoryBlockMetadata(FREE_MEMORY, index(cache),
+                                  remaining_size - overhead(), remaining_size,
+                                  this, new_block_right_buddy));
+
+  metadata.right_buddy = static_cast<MemoryBlock*>(right_partition);
+  metadata.size = size - overhead();
+  metadata.total_size = size;
+
+  cache.store(this, metadata);
+
+  // Write metadata for the new block's right buddy
+  if (new_block_right_buddy != nullptr) {
+    auto buddy_metadata = cache.load(new_block_right_buddy);
+
+    buddy_metadata.left_buddy = static_cast<MemoryBlock*>(right_partition);
+
+    cache.store(new_block_right_buddy, buddy_metadata);
+  }
+}
+
+void MemoryBlock::merge(MetadataCache& cache, MemoryBlock* right_buddy) {
+  // only free blocks can be merged
+  assert(type(cache) == FREE_MEMORY);
+  assert(right_buddy->type(cache) == FREE_MEMORY);
+
+  auto metadata = cache.load(this);
+
+  // link this->buddy's buddy
+  metadata.right_buddy = right_buddy->right_buddy(cache);
+
+  // link buddy's buddy -> this
+  if (metadata.right_buddy != nullptr) {
+    auto buddy_metadata = cache.load(metadata.right_buddy);
+
+    buddy_metadata.left_buddy = this;
+
+    cache.store(metadata.right_buddy, buddy_metadata);
+  }
+
+  metadata.size += right_buddy->total_size(cache);
+  metadata.total_size += right_buddy->total_size(cache);
+
+  cache.store(this, metadata);
+  cache.store(right_buddy,
+              MemoryBlockMetadata(INVALID_MEMORY, 0, 0, 0, nullptr, nullptr));
+}
+
+void MemoryBlock::mark_as_free(MetadataCache& cache) {
+  // check for double free or corruption
+  assert(type(cache) != FREE_MEMORY);
+  assert(type(cache) != INVALID_MEMORY);
+
+  set_type(cache, FREE_MEMORY);
+}
+
+void MemoryBlock::set_type(MetadataCache& cache, Type t) {
+  auto metadata = cache.load(this);
+
+  metadata.type = t;
+
+  cache.store(this, metadata);
+}
+
+bool MemoryBlock::has_left_buddy(MetadataCache& cache) const {
+  return left_buddy(cache) != nullptr;
+}
+
+bool MemoryBlock::has_right_buddy(MetadataCache& cache) const {
+  return right_buddy(cache) != nullptr;
+}
+
+size_t MemoryBlock::index(MetadataCache& cache) const {
+  return cache.load(this).index;
+}
+
+void* MemoryBlock::data() const {
+  return const_cast<MemoryBlockMetadata*>(
+             reinterpret_cast<const MemoryBlockMetadata*>(this)) +
+         1;
+}
+
+MemoryBlock* MemoryBlock::metadata() const {
+  return const_cast<MemoryBlock*>(reinterpret_cast<const MemoryBlock*>(
+      reinterpret_cast<const MemoryBlockMetadata*>(this) - 1));
+}
+
+} // detail
+} // memory
+} // paddle
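
[Editor's note, not part of the patch series] In split() above, both resulting blocks pay one metadata record ("overhead") out of their total_size, which is why the left block keeps size - overhead() usable bytes and the right block remaining_size - overhead(). A small self-check of that bookkeeping, using invented numbers:

    #include <cassert>
    #include <cstddef>

    int main() {
      const size_t overhead = 32;          // stand-in for the metadata size
      const size_t total = 1024;           // total_size of the chunk
      const size_t request = 256;          // total bytes kept by the left block

      assert(total - request > overhead);  // otherwise split() bails out

      const size_t left_usable = request - overhead;
      const size_t right_total = total - request;
      const size_t right_usable = right_total - overhead;

      // No byte is lost or double counted by the split:
      assert(request + right_total == total);
      assert(left_usable + right_usable + 2 * overhead == total);
      return 0;
    }
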
diff --git a/paddle/memory/detail/memory_block.h b/paddle/memory/detail/memory_block.h
new file mode 100644
index 0000000000..e2d39c31cf
--- /dev/null
+++ b/paddle/memory/detail/memory_block.h
@@ -0,0 +1,97 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+
+#include "paddle/memory/detail/metadata.h"
+
+#include <cstddef>
+#include <unordered_map>
+
+namespace paddle {
+namespace memory {
+namespace detail {
+
+// Forward Declaration
+class Metadata;
+
+/*! \brief A class used to interpret the contents of a memory block */
+class MemoryBlock {
+ public:
+  // Unify the metadata format between GPU and CPU allocations
+  using MetadataCache = std::unordered_map<const MemoryBlock*, Metadata>;
+
+  enum Type {
+    FREE_CHUNK,    // memory is free and idle
+    ARENA_CHUNK,   // memory is being occupied
+    HUGE_CHUNK,    // memory is out of management
+    INVALID_CHUNK  // memory is invalid
+  };
+
+ public:
+  void init(MetadataCache& cache, Type t, size_t index, size_t size,
+            void* left_buddy, void* right_buddy);
+
+ public:
+  /*! \brief The type of the allocation */
+  Type type(MetadataCache& cache) const;
+
+  /*! \brief The size of the data region */
+  size_t size(MetadataCache& cache) const;
+
+  /*! \brief An index to track the allocator */
+  size_t index(MetadataCache& cache) const;
+
+  /*! \brief The total size of the block */
+  size_t total_size(MetadataCache& cache) const;
+
+  /*! \brief Check the left buddy of the block */
+  bool has_left_buddy(MetadataCache& cache) const;
+
+  /*! \brief Check the right buddy of the block */
+  bool has_right_buddy(MetadataCache& cache) const;
+
+  /*! \brief Get the left buddy */
+  MemoryBlock* left_buddy(MetadataCache& cache) const;
+
+  /*! \brief Get the right buddy */
+  MemoryBlock* right_buddy(MetadataCache& cache) const;
+
+ public:
+  /*! \brief Split the allocation into left/right blocks */
+  void split(MetadataCache& cache, size_t size);
+
+  /*! \brief Merge left and right blocks together */
+  void merge(MetadataCache& cache, MemoryBlock* right_buddy);
+
+  /*! \brief Mark the allocation as free */
+  void mark_as_free(MetadataCache& cache);
+
+  /*! \brief Change the type of the allocation */
+  void set_type(MetadataCache& cache, Type t);
+
+ public:
+  /*! \brief Get a pointer to the memory block's data */
+  void* data() const;
+
+  /*! 
\brief Get a pointer to the memory block's metadata */ + MemoryBlock* metadata() const; + + public: + static size_t overhead(); +}; + +} // namespace detail +} // namespace memory +} // namespace paddle diff --git a/paddle/memory/detail/system_allocator.h b/paddle/memory/detail/system_allocator.h index f3bbfef843..555061a533 100644 --- a/paddle/memory/detail/system_allocator.h +++ b/paddle/memory/detail/system_allocator.h @@ -48,6 +48,7 @@ class GPUAllocator : public SystemAllocator { virtual void* Alloc(size_t& index, size_t size); virtual void Free(void* p, size_t size, size_t index); virtual bool UseGpu(); + private: size_t gpu_alloc_size_ = 0; size_t fallback_alloc_size_ = 0; From c602e046132b7e4e38c34f348b2a7fa290d67361 Mon Sep 17 00:00:00 2001 From: Superjom Date: Tue, 4 Jul 2017 13:35:21 +0800 Subject: [PATCH 020/205] add fake interfaces to make compilable --- paddle/framework/net.cc | 10 +++++--- paddle/framework/net.h | 44 +++++++++++++++++++------------- paddle/framework/net_proto.proto | 3 +-- 3 files changed, 33 insertions(+), 24 deletions(-) diff --git a/paddle/framework/net.cc b/paddle/framework/net.cc index 0ce9296820..2d9e099dc0 100644 --- a/paddle/framework/net.cc +++ b/paddle/framework/net.cc @@ -5,7 +5,7 @@ namespace framework { PlainNet::PlainNet(const NetDesc& def) {} -virtual Error PlainNet::InferShape() { +Error PlainNet::InferShape(Scope* scope) { for (auto& op : ops_) { // wrong shape auto err = op.InferShape(); @@ -15,9 +15,11 @@ virtual Error PlainNet::InferShape() { return Error(); } -virtual Error PlainNet::Run(Scope* scope = nullptr, - OpContext* context = nullptr, OpIndex begin = -1, - OpIndex end = -1) const {} +Error PlainNet::Run(Scope* scope, OpContext* context, OpIndex begin, + OpIndex end) const { + // TODO Add implementation here. + return Error(); +} } // namespace framework } // namespace paddle diff --git a/paddle/framework/net.h b/paddle/framework/net.h index b3064e4f90..76e0ed9330 100644 --- a/paddle/framework/net.h +++ b/paddle/framework/net.h @@ -17,6 +17,7 @@ #include "paddle/framework/net_proto.pb.h" #include "paddle/framework/op_proto.pb.h" #include "paddle/framework/scope.h" +#include "paddle/utils/Error.h" namespace paddle { namespace framework { @@ -29,11 +30,16 @@ typedef int OpIndex; * keep updating if the concepts related are implemented. */ +struct OpDesc; +struct OpDef; +struct OpContext; +struct OpAttrs {}; + class Operator { public: Operator(const OpDesc &def) {} - bool InferShape() {} - bool Run() {} + Error InferShape() { return Error(); } + Error Run() { return Error(); } }; /** @@ -55,7 +61,7 @@ class Net { /** * @brief Infer shapes of all inputs and outputs of operators. */ - virtual bool InferShape(Scope *scope) override; + virtual Error InferShape(Scope *scope) = 0; /** * @brief Run the network. * @@ -64,28 +70,30 @@ class Net { * environment for ops. `begin` and `end` specify the scope of `ops_` to run, * If no positive indexes are provided, all operators in `ops_` will run. */ - virtual bool Run(Scope *scope, OpContext *context, OpIndex begin = -1, - OpIndex end = -1) const = 0; + virtual Error Run(Scope *scope, OpContext *context, OpIndex begin = -1, + OpIndex end = -1) const = 0; /** * @brief Add an Operator according to `def`. */ - virtual OpIndex AddOp(const proto::OpDef &def) = 0; + virtual OpIndex AddOp(const OpDef &def) = 0; /** * @brief Add optimizer operators acctording to `attrs`. 
*/ - virtual bool AddOptimizerOps(const OptAttrs &attrs) = 0; + virtual Error AddOptimizerOps(const OpAttrs &attrs) = 0; /** * @brief Add backward operators. */ - virtual bool AddBackwardOps() = 0; + virtual Error AddBackwardOps() = 0; /** * @brief Create a network. */ static std::unique_ptr Create(const NetDesc &def = NetDesc()); + + virtual ~Net() = 0; }; /** @@ -108,7 +116,7 @@ class PlainNet : public Net { * Infer all the operators' input and output varialbes' shapes, will be called * before every mini-batch */ - virtual bool InferShape(Scope *scope) override; + virtual Error InferShape(Scope *scope) override; /** * @brief Run the network. @@ -117,23 +125,23 @@ class PlainNet : public Net { * scope will be used instead. If no OpContext is provicded, default context * will be used. */ - virtual bool Run(Scope *scope = nullptr, OpContext *context = nullptr, - OpIndex begin = -1, OpIndex end = -1) const override; + virtual Error Run(Scope *scope = nullptr, OpContext *context = nullptr, + OpIndex begin = -1, OpIndex end = -1) const override; /** * @brief Add an operator to this network. */ - virtual OpIndex AddOp(const proto::OpDef &def) override; + virtual OpIndex AddOp(const OpDef &def) override; /** * @brief Add all optimizer operators related into the network. */ - virtual bool AddOptimizerOps(const OptAttrs &attrs) override; + virtual Error AddOptimizerOps(const OpAttrs &attrs) override; /** * @brief Add all backward operators related into the network. */ - virtual bool AddBackwardOps() override; + virtual Error AddBackwardOps() override; protected: /** @@ -141,7 +149,7 @@ class PlainNet : public Net { * * Create operators accordding to `def`, will be called by the constructor. */ - bool BuildNet(const NetDesc &def); + Error BuildNet(const NetDesc &def); /** * @brief Add an operator into this network. @@ -151,9 +159,9 @@ class PlainNet : public Net { * `outputs` are keys of mutable output variables. An `OpIndex` will be * returned to indicate the offset of the new operator in `ops_`. */ - OpIndex AddOp(const std::string &type, const std::vector &inputs, - const std::vector &outputs, - const OprAttr &attrs = OprAttr()); + OpIndex AddOp(const std::string &type, const std::vector &inputs, + const std::vector &outputs, + const OpAttrs &attrs = OpAttrs()); private: // the operators owned by `Network`. 
diff --git a/paddle/framework/net_proto.proto b/paddle/framework/net_proto.proto index e9aed8f349..2d042457e3 100644 --- a/paddle/framework/net_proto.proto +++ b/paddle/framework/net_proto.proto @@ -1,7 +1,7 @@ syntax="proto2"; package paddle.framework; -import "op_proto.proto" +import "op_proto.proto"; message NetDesc { // network identification @@ -13,4 +13,3 @@ message NetDesc { // num worker always optional int32 num_workers = 4; } - From 04e20034dfcbb0ceb1de30ddd5b1f8b8ee811d4f Mon Sep 17 00:00:00 2001 From: Superjom Date: Tue, 4 Jul 2017 13:44:01 +0800 Subject: [PATCH 021/205] replace Error with void --- paddle/framework/net.cc | 11 +++-------- paddle/framework/net.h | 23 +++++++++++------------ 2 files changed, 14 insertions(+), 20 deletions(-) diff --git a/paddle/framework/net.cc b/paddle/framework/net.cc index 2d9e099dc0..d49861c343 100644 --- a/paddle/framework/net.cc +++ b/paddle/framework/net.cc @@ -5,20 +5,15 @@ namespace framework { PlainNet::PlainNet(const NetDesc& def) {} -Error PlainNet::InferShape(Scope* scope) { +void PlainNet::InferShape(Scope* scope) { for (auto& op : ops_) { - // wrong shape - auto err = op.InferShape(); - if (!err) return err; + op.InferShape(); } - // ok - return Error(); } -Error PlainNet::Run(Scope* scope, OpContext* context, OpIndex begin, +void PlainNet::Run(Scope* scope, OpContext* context, OpIndex begin, OpIndex end) const { // TODO Add implementation here. - return Error(); } } // namespace framework diff --git a/paddle/framework/net.h b/paddle/framework/net.h index 76e0ed9330..55dcf147e1 100644 --- a/paddle/framework/net.h +++ b/paddle/framework/net.h @@ -17,7 +17,6 @@ #include "paddle/framework/net_proto.pb.h" #include "paddle/framework/op_proto.pb.h" #include "paddle/framework/scope.h" -#include "paddle/utils/Error.h" namespace paddle { namespace framework { @@ -38,8 +37,8 @@ struct OpAttrs {}; class Operator { public: Operator(const OpDesc &def) {} - Error InferShape() { return Error(); } - Error Run() { return Error(); } + void InferShape() {} + void Run() {} }; /** @@ -61,7 +60,7 @@ class Net { /** * @brief Infer shapes of all inputs and outputs of operators. */ - virtual Error InferShape(Scope *scope) = 0; + virtual void InferShape(Scope *scope) = 0; /** * @brief Run the network. * @@ -70,7 +69,7 @@ class Net { * environment for ops. `begin` and `end` specify the scope of `ops_` to run, * If no positive indexes are provided, all operators in `ops_` will run. */ - virtual Error Run(Scope *scope, OpContext *context, OpIndex begin = -1, + virtual void Run(Scope *scope, OpContext *context, OpIndex begin = -1, OpIndex end = -1) const = 0; /** @@ -81,12 +80,12 @@ class Net { /** * @brief Add optimizer operators acctording to `attrs`. */ - virtual Error AddOptimizerOps(const OpAttrs &attrs) = 0; + virtual void AddOptimizerOps(const OpAttrs &attrs) = 0; /** * @brief Add backward operators. */ - virtual Error AddBackwardOps() = 0; + virtual void AddBackwardOps() = 0; /** * @brief Create a network. @@ -116,7 +115,7 @@ class PlainNet : public Net { * Infer all the operators' input and output varialbes' shapes, will be called * before every mini-batch */ - virtual Error InferShape(Scope *scope) override; + virtual void InferShape(Scope *scope) override; /** * @brief Run the network. @@ -125,7 +124,7 @@ class PlainNet : public Net { * scope will be used instead. If no OpContext is provicded, default context * will be used. 
*/ - virtual Error Run(Scope *scope = nullptr, OpContext *context = nullptr, + virtual void Run(Scope *scope = nullptr, OpContext *context = nullptr, OpIndex begin = -1, OpIndex end = -1) const override; /** @@ -136,12 +135,12 @@ class PlainNet : public Net { /** * @brief Add all optimizer operators related into the network. */ - virtual Error AddOptimizerOps(const OpAttrs &attrs) override; + virtual void AddOptimizerOps(const OpAttrs &attrs) override; /** * @brief Add all backward operators related into the network. */ - virtual Error AddBackwardOps() override; + virtual void AddBackwardOps() override; protected: /** @@ -149,7 +148,7 @@ class PlainNet : public Net { * * Create operators accordding to `def`, will be called by the constructor. */ - Error BuildNet(const NetDesc &def); + void BuildNet(const NetDesc &def); /** * @brief Add an operator into this network. From 109937b8d512904d04a1773bdf19ddb756ecd087 Mon Sep 17 00:00:00 2001 From: Superjom Date: Tue, 4 Jul 2017 15:18:20 +0800 Subject: [PATCH 022/205] fix ci error --- paddle/framework/CMakeLists.txt | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt index 42600277f6..ceff1d3581 100644 --- a/paddle/framework/CMakeLists.txt +++ b/paddle/framework/CMakeLists.txt @@ -7,8 +7,6 @@ cc_test(scope_test SRCS scope_test.cc) cc_test(enforce_test SRCS enforce_test.cc) proto_library(attr_type SRCS attr_type.proto) proto_library(op_proto SRCS op_proto.proto DEPS attr_type) - +proto_library(net_proto SRCS net_proto.proto DEPS op_proto) cc_test(op_proto_test SRCS op_proto_test.cc DEPS op_proto attr_type protobuf) - -proto_library(net_proto SRCS net_proto.proto) -cc_library(net SRCS net.cc DEPS net_proto attr_type op_proto) \ No newline at end of file +cc_library(net SRCS net.cc DEPS net_proto attr_type op_proto) From ff36389452c1af6cc6a5f03b5ca52404ab20f108 Mon Sep 17 00:00:00 2001 From: liaogang Date: Tue, 4 Jul 2017 15:21:24 +0800 Subject: [PATCH 023/205] ENH: code style --- paddle/memory/detail/buddy_allocator.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/paddle/memory/detail/buddy_allocator.cc b/paddle/memory/detail/buddy_allocator.cc index e8d694327d..eddfd9d13c 100644 --- a/paddle/memory/detail/buddy_allocator.cc +++ b/paddle/memory/detail/buddy_allocator.cc @@ -48,7 +48,6 @@ void* BuddyAllocator::Alloc(size_t unaligned_size) { // if the allocation is huge, send directly to the system allocator if (size > max_chunk_size_) { DLOG(INFO) << "Allocate from system allocator."; - return SystemAlloc(size); } From e95299b58300afda0d61e868998dfceb28e999da Mon Sep 17 00:00:00 2001 From: Superjom Date: Tue, 4 Jul 2017 16:28:21 +0800 Subject: [PATCH 024/205] fix ci error --- paddle/framework/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt index ceff1d3581..0abc63a831 100644 --- a/paddle/framework/CMakeLists.txt +++ b/paddle/framework/CMakeLists.txt @@ -7,6 +7,6 @@ cc_test(scope_test SRCS scope_test.cc) cc_test(enforce_test SRCS enforce_test.cc) proto_library(attr_type SRCS attr_type.proto) proto_library(op_proto SRCS op_proto.proto DEPS attr_type) +cc_test(op_proto_test SRCS op_proto_test.cc DEPS op_proto protobuf) proto_library(net_proto SRCS net_proto.proto DEPS op_proto) -cc_test(op_proto_test SRCS op_proto_test.cc DEPS op_proto attr_type protobuf) cc_library(net SRCS net.cc DEPS net_proto attr_type op_proto) From 379434b243faeaf9fd4d38cf9f95dfe45cc563d5 Mon 
Sep 17 00:00:00 2001 From: liaogang Date: Tue, 4 Jul 2017 17:21:09 +0800 Subject: [PATCH 025/205] Delete cmake in dynload --- paddle/platform/dynload/CMakeLists.txt | 1 - 1 file changed, 1 deletion(-) delete mode 100644 paddle/platform/dynload/CMakeLists.txt diff --git a/paddle/platform/dynload/CMakeLists.txt b/paddle/platform/dynload/CMakeLists.txt deleted file mode 100644 index 9f829b7012..0000000000 --- a/paddle/platform/dynload/CMakeLists.txt +++ /dev/null @@ -1 +0,0 @@ -cc_library(dynamic_loader SRCS dynamic_loader.cc DEPS glog gflags) From 0ba63475659822bd146f1f1dcfc7eabca8b7047d Mon Sep 17 00:00:00 2001 From: liaogang Date: Tue, 4 Jul 2017 21:23:23 +0800 Subject: [PATCH 026/205] ENH: Add buddy allocator Free --- paddle/memory/detail/CMakeLists.txt | 9 ++-- paddle/memory/detail/buddy_allocator.cc | 41 ++++++++++++++++++- paddle/memory/detail/buddy_allocator.h | 2 +- paddle/memory/detail/memory_block.h | 2 +- .../detail/{metadata.cc => meta_data.cc} | 2 +- .../memory/detail/{metadata.h => meta_data.h} | 0 paddle/platform/cpu_info.h | 10 ----- paddle/platform/gpu_info.cc | 13 ++++++ paddle/platform/gpu_info.h | 6 +++ 9 files changed, 65 insertions(+), 20 deletions(-) rename paddle/memory/detail/{metadata.cc => meta_data.cc} (97%) rename paddle/memory/detail/{metadata.h => meta_data.h} (100%) diff --git a/paddle/memory/detail/CMakeLists.txt b/paddle/memory/detail/CMakeLists.txt index dbc98a8a62..c3167cd30a 100644 --- a/paddle/memory/detail/CMakeLists.txt +++ b/paddle/memory/detail/CMakeLists.txt @@ -1,12 +1,9 @@ if(${WITH_GPU}) - nv_library(system_allocator SRCS system_allocator.cc DEPS gflags) - nv_test(system_allocator_test - SRCS system_allocator_test.cc - DEPS system_allocator gpu_info gflags) + nv_library(system_allocator SRCS system_allocator.cc DEPS gflags gpu_info) else(${WITH_GPU}) cc_library(system_allocator SRCS system_allocator.cc DEPS gflags) - cc_test(system_allocator_test SRCS system_allocator_test.cc DEPS system_allocator gflags) endif(${WITH_GPU}) +cc_test(system_allocator_test SRCS system_allocator_test.cc DEPS system_allocator) -cc_library(metadata SRCS metadata.cc) +cc_library(meta_data SRCS meta_data.cc) cc_library(buddy_allocator SRCS buddy_allocator.cc) diff --git a/paddle/memory/detail/buddy_allocator.cc b/paddle/memory/detail/buddy_allocator.cc index eddfd9d13c..f677feda0d 100644 --- a/paddle/memory/detail/buddy_allocator.cc +++ b/paddle/memory/detail/buddy_allocator.cc @@ -75,10 +75,49 @@ void* BuddyAllocator::Alloc(size_t unaligned_size) { } void BuddyAllocator::Free(void* p) { + // Point back to metadata auto block = static_cast(p)->metadata(); - // acquire the allocator lock + // Acquire the allocator lock std::lock_guard lock(mutex_); + + DLOG(INFO) << "Free from address " << block; + + if (block->type(cache_) == MemoryBlock::HUGE_CHUNK) { + DLOG(INFO) << "Free directly from system allocator"; + system_allocator_->Free(block, block->total_size(cache_), + block->index(cache_)); + + // Invalidate GPU allocation from cache + if (system_allocator_->UseGpu()) { + cache_.erase(block); + } + return; + } + + block->mark_as_free(cache_); + + total_used_ -= block->total_size(cache_); + total_free_ += block->total_size(cache_); + + // Trying to merge the right buddy + if (block->has_right_buddy(cache_)) { + DLOG(INFO) << "Merging this block " << block << " with its right buddy " + << block->right_buddy(cache_); + } + + // Trying to merge the left buddy + if (block->has_left_buddy(cache_)) { + DLOG(INFO) << "Merging this block " << block << " with its left buddy " + << 
block->left_buddy(cache_); + } + + // Dumping this block into pool + DLOG(INFO) << "Inserting free block (" << block << ", " + << block->total_size(cache_) << ")"; + pool_.insert({block->index(cache_), block->total_size(cache_), block}); + + // TODO(gangliao): Clean up if existing too much free memory } void* BuddyAllocator::SystemAlloc(size_t size) { diff --git a/paddle/memory/detail/buddy_allocator.h b/paddle/memory/detail/buddy_allocator.h index 4006bdcce8..49bd6cf901 100644 --- a/paddle/memory/detail/buddy_allocator.h +++ b/paddle/memory/detail/buddy_allocator.h @@ -14,7 +14,7 @@ #pragma once -#include "paddle/memory/detail/metadata.h" +#include "paddle/memory/detail/meta_data.h" #include "paddle/memory/detail/system_allocator.h" #include "paddle/platform/assert.h" #include "paddle/platform/cpu_info.h" diff --git a/paddle/memory/detail/memory_block.h b/paddle/memory/detail/memory_block.h index e2d39c31cf..2945520113 100644 --- a/paddle/memory/detail/memory_block.h +++ b/paddle/memory/detail/memory_block.h @@ -14,7 +14,7 @@ #pragma once -#include "paddle/memory/detail/metadata.h" +#include "paddle/memory/detail/meta_data.h" #include #include diff --git a/paddle/memory/detail/metadata.cc b/paddle/memory/detail/meta_data.cc similarity index 97% rename from paddle/memory/detail/metadata.cc rename to paddle/memory/detail/meta_data.cc index 4607cd8512..a3b7a9b4fe 100644 --- a/paddle/memory/detail/metadata.cc +++ b/paddle/memory/detail/meta_data.cc @@ -12,7 +12,7 @@ See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/memory/detail/metadata.h" +#include "paddle/memory/detail/meta_data.h" #include diff --git a/paddle/memory/detail/metadata.h b/paddle/memory/detail/meta_data.h similarity index 100% rename from paddle/memory/detail/metadata.h rename to paddle/memory/detail/meta_data.h diff --git a/paddle/platform/cpu_info.h b/paddle/platform/cpu_info.h index edd76517a6..8df7c7b4bc 100644 --- a/paddle/platform/cpu_info.h +++ b/paddle/platform/cpu_info.h @@ -28,15 +28,5 @@ size_t CpuMinChunkSize(); //! Get the maximum chunk size for buddy allocator. size_t CpuMaxChunkSize(); -int GetCurrentDeviceId(void) { - int device_id; - throw_on_error(cudaGetDevice(&device_id), "cudaGetDevice failed"); - return device_id; -} - -void SetDeviceId(int device_id) { - throw_on_error(cudaSetDevice(device_id), "cudaSetDevice failed"); -} - } // namespace platform } // namespace paddle diff --git a/paddle/platform/gpu_info.cc b/paddle/platform/gpu_info.cc index d6c6fe34ef..05a243c506 100644 --- a/paddle/platform/gpu_info.cc +++ b/paddle/platform/gpu_info.cc @@ -31,6 +31,19 @@ int GpuDeviceCount() { return count; } +int GetCurrentDeviceId() { + int device_id; + throw_on_error( + cudaGetDevice(&device_id), + "cudaGetDevice failed in paddle::platform::GetCurrentDeviceId"); + return device_id; +} + +void SetDeviceId(int id) { + throw_on_error(cudaSetDevice(id), + "cudaSetDevice failed in paddle::platform::SetDeviceId"); +} + void GpuMemoryUsage(size_t& available, size_t& total) { throw_on_error(cudaMemGetInfo(&available, &total), "cudaMemGetInfo failed in paddle::platform::GetMemoryUsage"); diff --git a/paddle/platform/gpu_info.h b/paddle/platform/gpu_info.h index d7bf0f4093..81ee5f6e0a 100644 --- a/paddle/platform/gpu_info.h +++ b/paddle/platform/gpu_info.h @@ -24,6 +24,12 @@ namespace platform { //! Get the total number of GPU devices in system. int GpuDeviceCount(); +//! Get the current GPU device id in system. +int GetCurrentDeviceId(); + +//! 
Set the GPU device id for next execution. +void SetDeviceId(int device_id); + //!Get the memory usage of current GPU device. void GpuMemoryUsage(size_t& available, size_t& total); From 4dc3c9e0cc1b6ec5dbc324f4804974247ca6506f Mon Sep 17 00:00:00 2001 From: liaogang Date: Tue, 4 Jul 2017 23:28:15 +0800 Subject: [PATCH 027/205] ENH: Add paddle_memory for external usage --- paddle/memory/CMakeLists.txt | 10 ++++ paddle/memory/detail/CMakeLists.txt | 6 +++ paddle/memory/detail/buddy_allocator.cc | 12 ++--- paddle/memory/detail/buddy_allocator.h | 2 +- paddle/memory/detail/memory_block.cc | 56 +++++++++++-------- paddle/memory/detail/memory_block.h | 10 +--- paddle/memory/detail/meta_cache.cc | 57 ++++++++++++++++++++ paddle/memory/detail/meta_cache.h | 71 +++++++++++++++++++++++++ paddle/memory/detail/meta_data.cc | 8 +++ paddle/memory/detail/meta_data.h | 1 + 10 files changed, 196 insertions(+), 37 deletions(-) create mode 100644 paddle/memory/detail/meta_cache.cc create mode 100644 paddle/memory/detail/meta_cache.h diff --git a/paddle/memory/CMakeLists.txt b/paddle/memory/CMakeLists.txt index 3943c3cfad..8c290712fc 100644 --- a/paddle/memory/CMakeLists.txt +++ b/paddle/memory/CMakeLists.txt @@ -1 +1,11 @@ add_subdirectory(detail) + +cc_library(memory + SRCS + memory.cc) + +cc_library(paddle_memory + DEPS + memory meta_data + meta_cache memory_block + buddy_allocator system_allocator) diff --git a/paddle/memory/detail/CMakeLists.txt b/paddle/memory/detail/CMakeLists.txt index c3167cd30a..4fdabc8eeb 100644 --- a/paddle/memory/detail/CMakeLists.txt +++ b/paddle/memory/detail/CMakeLists.txt @@ -3,7 +3,13 @@ if(${WITH_GPU}) else(${WITH_GPU}) cc_library(system_allocator SRCS system_allocator.cc DEPS gflags) endif(${WITH_GPU}) + cc_test(system_allocator_test SRCS system_allocator_test.cc DEPS system_allocator) cc_library(meta_data SRCS meta_data.cc) + +cc_library(meta_cache SRCS meta_cache.cc) + +cc_library(memory_block SRCS memory_block.cc) + cc_library(buddy_allocator SRCS buddy_allocator.cc) diff --git a/paddle/memory/detail/buddy_allocator.cc b/paddle/memory/detail/buddy_allocator.cc index f677feda0d..aa5b6b557c 100644 --- a/paddle/memory/detail/buddy_allocator.cc +++ b/paddle/memory/detail/buddy_allocator.cc @@ -20,14 +20,14 @@ namespace memory { namespace detail { BuddyAllocator::BuddyAllocator(SystemAllocator* system_allocator, - size_t min_chunk_size, size_t max_chunk_size) { + size_t min_chunk_size, size_t max_chunk_size) + : min_chunk_size_(min_chunk_size), + max_chunk_size_(max_chunk_size), + cache_(system_allocator->UseGpu()), + system_allocator_(std::move(system_allocator)) { PADDLE_ASSERT(min_chunk_size > 0); PADDLE_ASSERT(max_chunk_size > 0); PADDLE_ASSERT(system_allocator != nullptr); - - system_allocator_ = std::move(system_allocator); - min_chunk_size_ = min_chunk_size; - max_chunk_size_ = max_chunk_size; } inline size_t align(size_t size, size_t alignment) { @@ -90,7 +90,7 @@ void BuddyAllocator::Free(void* p) { // Invalidate GPU allocation from cache if (system_allocator_->UseGpu()) { - cache_.erase(block); + cache_.invalidate(block); } return; } diff --git a/paddle/memory/detail/buddy_allocator.h b/paddle/memory/detail/buddy_allocator.h index 49bd6cf901..ecf23b77ae 100644 --- a/paddle/memory/detail/buddy_allocator.h +++ b/paddle/memory/detail/buddy_allocator.h @@ -14,6 +14,7 @@ #pragma once +#include "paddle/memory/detail/meta_cache.h" #include "paddle/memory/detail/meta_data.h" #include "paddle/memory/detail/system_allocator.h" #include "paddle/platform/assert.h" @@ -80,7 
+81,6 @@ class BuddyAllocator {
 
  private:
   // Unify the metadata format between GPU and CPU allocations
-  using MetadataCache = std::unordered_map<const MemoryBlock*, Metadata>;
   MetadataCache cache_;
 
  private:
diff --git a/paddle/memory/detail/memory_block.cc b/paddle/memory/detail/memory_block.cc
index 1c9e87df49..eaa97e7b4a 100644
--- a/paddle/memory/detail/memory_block.cc
+++ b/paddle/memory/detail/memory_block.cc
@@ -1,4 +1,20 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
 #include "paddle/memory/detail/memory_block.h"
+#include "paddle/memory/detail/meta_cache.h"
+#include "paddle/memory/detail/meta_data.h"
 #include "paddle/platform/assert.h"
 
 namespace paddle {
@@ -7,10 +23,9 @@ namespace detail {
 
 void MemoryBlock::init(MetadataCache& cache, Type t, size_t index, size_t size,
                        void* left_buddy, void* right_buddy) {
-  cache.store(this,
-              MemoryBlockMetadata(t, index, size - overhead(), size,
-                                  static_cast<MemoryBlock*>(left_buddy),
-                                  static_cast<MemoryBlock*>(right_buddy)));
+  cache.store(this, Metadata(t, index, size - sizeof(Metadata), size,
+                             static_cast<MemoryBlock*>(left_buddy),
+                             static_cast<MemoryBlock*>(right_buddy)));
 }
 
 MemoryBlock::Type MemoryBlock::type(MetadataCache& cache) const {
@@ -35,10 +50,10 @@ MemoryBlock* MemoryBlock::right_buddy(MetadataCache& cache) const {
 
 void MemoryBlock::split(MetadataCache& cache, size_t size) {
   // make sure the split fits
-  assert(total_size(cache) >= size);
+  PADDLE_ASSERT(total_size(cache) >= size);
 
   // bail out if there is no room for another partition
-  if (total_size(cache) - size <= overhead()) {
+  if (total_size(cache) - size <= sizeof(Metadata)) {
     return;
   }
 
@@ -53,13 +68,13 @@ void MemoryBlock::split(MetadataCache& cache, size_t size) {
   // Write the metadata for the new block
   auto new_block_right_buddy = metadata.right_buddy;
 
-  cache.store(static_cast<MemoryBlock*>(right_partition),
-              MemoryBlockMetadata(FREE_MEMORY, index(cache),
-                                  remaining_size - overhead(), remaining_size,
-                                  this, new_block_right_buddy));
+  cache.store(
+      static_cast<MemoryBlock*>(right_partition),
+      Metadata(FREE_CHUNK, index(cache), remaining_size - sizeof(Metadata),
+               remaining_size, this, new_block_right_buddy));
 
   metadata.right_buddy = static_cast<MemoryBlock*>(right_partition);
-  metadata.size = size - overhead();
+  metadata.size = size - sizeof(Metadata);
   metadata.total_size = size;
 
   cache.store(this, metadata);
@@ -76,8 +91,8 @@ void MemoryBlock::split(MetadataCache& cache, size_t size) {
 
 void MemoryBlock::merge(MetadataCache& cache, MemoryBlock* right_buddy) {
   // only free blocks can be merged
-  assert(type(cache) == FREE_MEMORY);
-  assert(right_buddy->type(cache) == FREE_MEMORY);
+  PADDLE_ASSERT(type(cache) == FREE_MEMORY);
+  PADDLE_ASSERT(right_buddy->type(cache) == FREE_MEMORY);
 
   auto metadata = cache.load(this);
 
@@ -97,16 +112,15 @@ void MemoryBlock::merge(MetadataCache& cache, MemoryBlock* right_buddy) {
   metadata.total_size += right_buddy->total_size(cache);
 
   cache.store(this, metadata);
-  cache.store(right_buddy,
-              MemoryBlockMetadata(INVALID_MEMORY, 0, 0, 0, nullptr, nullptr));
+  cache.store(right_buddy, Metadata(INVALID_CHUNK, 0, 0, 0, nullptr, nullptr));
 }
 
 void MemoryBlock::mark_as_free(MetadataCache& cache) {
   // check for double free or corruption
-  assert(type(cache) != FREE_MEMORY);
-  assert(type(cache) != INVALID_MEMORY);
+  PADDLE_ASSERT(type(cache) != FREE_CHUNK);
+  PADDLE_ASSERT(type(cache) != INVALID_CHUNK);
 
-  set_type(cache, FREE_MEMORY);
+  set_type(cache, FREE_CHUNK);
 }
 
 void MemoryBlock::set_type(MetadataCache& cache, Type t) {
@@ -130,14 +144,12 @@ size_t MemoryBlock::index(MetadataCache& cache) const {
 }
 
 void* MemoryBlock::data() const {
-  return const_cast<MemoryBlockMetadata*>(
-             reinterpret_cast<const MemoryBlockMetadata*>(this)) +
-         1;
+  return const_cast<Metadata*>(reinterpret_cast<const Metadata*>(this)) + 1;
 }
 
 MemoryBlock* MemoryBlock::metadata() const {
   return const_cast<MemoryBlock*>(reinterpret_cast<const MemoryBlock*>(
-      reinterpret_cast<const MemoryBlockMetadata*>(this) - 1));
+      reinterpret_cast<const Metadata*>(this) - 1));
 }
 
diff --git a/paddle/memory/detail/memory_block.h b/paddle/memory/detail/memory_block.h
index 2945520113..a5168b519f 100644
--- a/paddle/memory/detail/memory_block.h
+++ b/paddle/memory/detail/memory_block.h
@@ -14,24 +14,18 @@
 
 #pragma once
 
-#include "paddle/memory/detail/meta_data.h"
-
 #include <cstddef>
-#include <unordered_map>
 
 namespace paddle {
 namespace memory {
 namespace detail {
 
-// Forward Declaration
-class Metadata;
+// Forward Declarations
+class MetadataCache;
 
 /*! \brief A class used to interpret the contents of a memory block */
 class MemoryBlock {
  public:
-  // Unify the metadata format between GPU and CPU allocations
-  using MetadataCache = std::unordered_map<const MemoryBlock*, Metadata>;
-
   enum Type {
     FREE_CHUNK,    // memory is free and idle
     ARENA_CHUNK,   // memory is being occupied
diff --git a/paddle/memory/detail/meta_cache.cc b/paddle/memory/detail/meta_cache.cc
new file mode 100644
index 0000000000..189ab4fc7b
--- /dev/null
+++ b/paddle/memory/detail/meta_cache.cc
@@ -0,0 +1,57 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. 
*/ + +#include "paddle/memory/detail/meta_cache.h" +#include "paddle/memory/detail/memory_block.h" +#include "paddle/platform/assert.h" + +namespace paddle { +namespace memory { +namespace detail { + +MetadataCache::MetadataCache(bool uses_gpu) : uses_gpu_(uses_gpu) {} + +Metadata MetadataCache::load(const MemoryBlock* block) { + if (uses_gpu_) { + auto existing_metadata = cache_.find(block); + assert(existing_metadata->second.check_guards()); + return existing_metadata->second; + } else { + PADDLE_ASSERT(reinterpret_cast(block)->check_guards()); + return *reinterpret_cast(block); + } +} + +void MetadataCache::store(MemoryBlock* block, + const Metadata& original_metadata) { + auto metadata = original_metadata; + + metadata.update_guards(); + + if (uses_gpu_) { + cache_[block] = metadata; + } else { + *reinterpret_cast(block) = metadata; + } +} + +void MetadataCache::invalidate(MemoryBlock* block) { + if (uses_gpu_) { + cache_.erase(block); + } +} + +} // namespace detail +} // namespace memory +} // namespace paddle diff --git a/paddle/memory/detail/meta_cache.h b/paddle/memory/detail/meta_cache.h new file mode 100644 index 0000000000..3ca1020d22 --- /dev/null +++ b/paddle/memory/detail/meta_cache.h @@ -0,0 +1,71 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#pragma once + +#include "paddle/memory/detail/memory_block.h" +#include "paddle/memory/detail/meta_data.h" + +#include + +namespace paddle { +namespace memory { +namespace detail { + +/*! A cache for accessing memory block meta-data that may be expensive to access + directly. + + Note: this class exists to unify the metadata format between GPU and CPU + allocations. + It should be removed when the CPU can access all GPU allocations directly + via UVM. +*/ +class MetadataCache { + public: + MetadataCache(bool uses_gpu); + + public: + /*! \brief Load the associated metadata for the specified memory block. */ + Metadata load(const MemoryBlock*); + + /*! \brief Store the associated metadata for the specified memory block. */ + void store(MemoryBlock*, const Metadata&); + + public: + /*! \brief Acquire any external metadata updates. */ + void acquire(MemoryBlock*); + + /*! \brief Publish any local updates externally. */ + void release(MemoryBlock*); + + /*! 
\brief Indicate that the specified metadata will no longer be used */ + void invalidate(MemoryBlock*); + + public: + MetadataCache(const MetadataCache&) = delete; + MetadataCache& operator=(const MetadataCache&) = delete; + + private: + bool uses_gpu_; + + private: + typedef std::unordered_map MetadataMap; + + private: + MetadataMap cache_; +}; + +} // namespace detail +} // namespace memory +} // namespace paddle diff --git a/paddle/memory/detail/meta_data.cc b/paddle/memory/detail/meta_data.cc index a3b7a9b4fe..70c5c1f439 100644 --- a/paddle/memory/detail/meta_data.cc +++ b/paddle/memory/detail/meta_data.cc @@ -29,6 +29,14 @@ Metadata::Metadata(MemoryBlock::Type t, size_t i, size_t s, size_t ts, left_buddy(l), right_buddy(r) {} +Metadata::Metadata() + : type(MemoryBlock::INVALID_CHUNK), + index(0), + size(0), + total_size(0), + left_buddy(nullptr), + right_buddy(nullptr) {} + template inline void hash_combine(std::size_t& seed, const T& v) { std::hash hasher; diff --git a/paddle/memory/detail/meta_data.h b/paddle/memory/detail/meta_data.h index ddb826571b..628cf1f2e3 100644 --- a/paddle/memory/detail/meta_data.h +++ b/paddle/memory/detail/meta_data.h @@ -26,6 +26,7 @@ class Metadata { public: Metadata(MemoryBlock::Type t, size_t i, size_t s, size_t ts, MemoryBlock* l, MemoryBlock* r); + Metadata(); public: /*! \brief Update the guards when metadata is changed */ From 9f2357561d939bdeae2a7bc0bd41be43d9ab0fe5 Mon Sep 17 00:00:00 2001 From: Superjom Date: Wed, 5 Jul 2017 10:08:23 +0800 Subject: [PATCH 028/205] fix ci error --- paddle/framework/net.cc | 2 +- paddle/framework/net.h | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/paddle/framework/net.cc b/paddle/framework/net.cc index d49861c343..8c565c28cb 100644 --- a/paddle/framework/net.cc +++ b/paddle/framework/net.cc @@ -12,7 +12,7 @@ void PlainNet::InferShape(Scope* scope) { } void PlainNet::Run(Scope* scope, OpContext* context, OpIndex begin, - OpIndex end) const { + OpIndex end) const { // TODO Add implementation here. } diff --git a/paddle/framework/net.h b/paddle/framework/net.h index 55dcf147e1..9564c831ee 100644 --- a/paddle/framework/net.h +++ b/paddle/framework/net.h @@ -70,7 +70,7 @@ class Net { * If no positive indexes are provided, all operators in `ops_` will run. */ virtual void Run(Scope *scope, OpContext *context, OpIndex begin = -1, - OpIndex end = -1) const = 0; + OpIndex end = -1) const = 0; /** * @brief Add an Operator according to `def`. @@ -125,7 +125,7 @@ class PlainNet : public Net { * will be used. */ virtual void Run(Scope *scope = nullptr, OpContext *context = nullptr, - OpIndex begin = -1, OpIndex end = -1) const override; + OpIndex begin = -1, OpIndex end = -1) const override; /** * @brief Add an operator to this network. @@ -142,6 +142,8 @@ class PlainNet : public Net { */ virtual void AddBackwardOps() override; + virtual ~PlainNet() override {} + protected: /** * @brief Build the network. From 5c10a5ad555d834dac4785d8cd2feac18da9b67b Mon Sep 17 00:00:00 2001 From: Superjom Date: Wed, 5 Jul 2017 10:34:49 +0800 Subject: [PATCH 029/205] remove virtual --- paddle/framework/net.h | 4 ---- 1 file changed, 4 deletions(-) diff --git a/paddle/framework/net.h b/paddle/framework/net.h index 9564c831ee..e60356dc17 100644 --- a/paddle/framework/net.h +++ b/paddle/framework/net.h @@ -91,8 +91,6 @@ class Net { * @brief Create a network. 
*/ static std::unique_ptr Create(const NetDesc &def = NetDesc()); - - virtual ~Net() = 0; }; /** @@ -142,8 +140,6 @@ class PlainNet : public Net { */ virtual void AddBackwardOps() override; - virtual ~PlainNet() override {} - protected: /** * @brief Build the network. From 568c03ba1d311ac2af2cb9242cefb00537174e50 Mon Sep 17 00:00:00 2001 From: Superjom Date: Wed, 5 Jul 2017 10:51:47 +0800 Subject: [PATCH 030/205] add virtual implementation --- paddle/framework/net.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/paddle/framework/net.h b/paddle/framework/net.h index e60356dc17..2025bfa4b2 100644 --- a/paddle/framework/net.h +++ b/paddle/framework/net.h @@ -91,6 +91,8 @@ class Net { * @brief Create a network. */ static std::unique_ptr Create(const NetDesc &def = NetDesc()); + + virtual ~Net() {} }; /** @@ -140,6 +142,8 @@ class PlainNet : public Net { */ virtual void AddBackwardOps() override; + virtual ~PlainNet() override {} + protected: /** * @brief Build the network. From d0ad0314bb868b9e0c1aa77f74ca0d2d3e8b8ef0 Mon Sep 17 00:00:00 2001 From: liaogang Date: Wed, 5 Jul 2017 16:33:18 +0800 Subject: [PATCH 031/205] FIX: glog dependency --- paddle/memory/detail/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/memory/detail/CMakeLists.txt b/paddle/memory/detail/CMakeLists.txt index 4fdabc8eeb..6cb6422e47 100644 --- a/paddle/memory/detail/CMakeLists.txt +++ b/paddle/memory/detail/CMakeLists.txt @@ -12,4 +12,4 @@ cc_library(meta_cache SRCS meta_cache.cc) cc_library(memory_block SRCS memory_block.cc) -cc_library(buddy_allocator SRCS buddy_allocator.cc) +cc_library(buddy_allocator SRCS buddy_allocator.cc DEPS glog) From ada1c20bbc2520d566b7d2bd2a56cf94cbcddd27 Mon Sep 17 00:00:00 2001 From: liaogang Date: Wed, 5 Jul 2017 19:16:02 +0800 Subject: [PATCH 032/205] FIX: Buddy Allocator Free with Merge feature --- paddle/memory/detail/buddy_allocator.cc | 33 ++++++++++++++++++++++--- paddle/memory/detail/buddy_allocator.h | 15 +++++++---- paddle/memory/detail/memory_block.cc | 4 +-- paddle/platform/CMakeLists.txt | 2 +- 4 files changed, 42 insertions(+), 12 deletions(-) diff --git a/paddle/memory/detail/buddy_allocator.cc b/paddle/memory/detail/buddy_allocator.cc index aa5b6b557c..9f334a7048 100644 --- a/paddle/memory/detail/buddy_allocator.cc +++ b/paddle/memory/detail/buddy_allocator.cc @@ -89,9 +89,8 @@ void BuddyAllocator::Free(void* p) { block->index(cache_)); // Invalidate GPU allocation from cache - if (system_allocator_->UseGpu()) { - cache_.invalidate(block); - } + cache_.invalidate(block); + return; } @@ -104,12 +103,35 @@ void BuddyAllocator::Free(void* p) { if (block->has_right_buddy(cache_)) { DLOG(INFO) << "Merging this block " << block << " with its right buddy " << block->right_buddy(cache_); + + auto right_buddy = block->right_buddy(cache_); + + if (right_buddy->type(cache_) == MemoryBlock::FREE_CHUNK) { + // Take away right buddy from pool + pool_.erase({right_buddy->index(cache_), right_buddy->total_size(cache_), + right_buddy}); + + // merge its right buddy to the block + block->merge(cache_, right_buddy); + } } // Trying to merge the left buddy if (block->has_left_buddy(cache_)) { DLOG(INFO) << "Merging this block " << block << " with its left buddy " << block->left_buddy(cache_); + + auto left_buddy = block->left_buddy(cache_); + + if (left_buddy->type(cache_) == MemoryBlock::FREE_CHUNK) { + // Take away right buddy from pool + pool_.erase({left_buddy->index(cache_), left_buddy->total_size(cache_), + left_buddy}); + + // merge the 
block to its left buddy + left_buddy->merge(cache_, block); + block = left_buddy; + } } // Dumping this block into pool @@ -167,13 +189,16 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::FindExistChunk(size_t size) { while (1) { auto it = pool_.lower_bound({index, size, nullptr}); + + // no match chunk memory if (it == pool_.end()) return it; if (std::get<0>(*it) > index) { + // find suitable one if (std::get<1>(*it) >= size) { return it; } - + // update and continue index = std::get<0>(*it); continue; } diff --git a/paddle/memory/detail/buddy_allocator.h b/paddle/memory/detail/buddy_allocator.h index ecf23b77ae..2fd9c8162a 100644 --- a/paddle/memory/detail/buddy_allocator.h +++ b/paddle/memory/detail/buddy_allocator.h @@ -42,14 +42,14 @@ class BuddyAllocator { void Free(void*); size_t Used(); - public: + private: // Disable copy and assignment. BuddyAllocator(const BuddyAllocator&) = delete; BuddyAllocator& operator=(const BuddyAllocator&) = delete; - private: - // Tuple type: allocator index, memory size, memory address + // Tuple (allocator index, memory size, memory address) using IndexSizeAddress = std::tuple; + // Each element in PoolSet is a free allocation using PoolSet = std::set; /*! \brief Allocate fixed-size memory from system */ @@ -57,7 +57,6 @@ class BuddyAllocator { /*! \brief If existing chunks are not suitable, refill pool */ PoolSet::iterator RefillPool(); - /** * \brief Find the suitable chunk from existing pool * @@ -77,13 +76,19 @@ class BuddyAllocator { size_t max_chunk_size_; // the maximum size of each chunk private: + /** + * \brief A list of free allocation + * + * \note Only store free chunk memory in pool + */ PoolSet pool_; private: - // Unify the metadata format between GPU and CPU allocations + /*! Unify the metadata format between GPU and CPU allocations */ MetadataCache cache_; private: + /*! 
Allocate CPU/GPU memory from system */ SystemAllocator* system_allocator_; std::mutex mutex_; }; diff --git a/paddle/memory/detail/memory_block.cc b/paddle/memory/detail/memory_block.cc index eaa97e7b4a..bc67bcef0f 100644 --- a/paddle/memory/detail/memory_block.cc +++ b/paddle/memory/detail/memory_block.cc @@ -91,8 +91,8 @@ void MemoryBlock::split(MetadataCache& cache, size_t size) { void MemoryBlock::merge(MetadataCache& cache, MemoryBlock* right_buddy) { // only free blocks can be merged - PADDLE_ASSERT(type(cache) == FREE_MEMORY); - PADDLE_ASSERT(right_buddy->type(cache) == FREE_MEMORY); + PADDLE_ASSERT(type(cache) == FREE_CHUNK); + PADDLE_ASSERT(right_buddy->type(cache) == FREE_CHUNK); auto metadata = cache.load(this); diff --git a/paddle/platform/CMakeLists.txt b/paddle/platform/CMakeLists.txt index 2f3d1c061e..0ad11f1b10 100644 --- a/paddle/platform/CMakeLists.txt +++ b/paddle/platform/CMakeLists.txt @@ -6,4 +6,4 @@ nv_library(gpu_info SRCS gpu_info.cc DEPS gflags) cc_library(place SRCS place.cc) cc_test(place_test SRCS place_test.cc DEPS place glog gflags) -cc_library(dynamic_loader SRCS dynload/dynamic_loader.cc) +cc_library(dynamic_loader SRCS dynload/dynamic_loader.cc DEPS gflags) From 74691789e9e5ee782adb003642f66699603b20e2 Mon Sep 17 00:00:00 2001 From: liaogang Date: Thu, 6 Jul 2017 01:16:03 +0800 Subject: [PATCH 033/205] ENH: add memory unit test --- paddle/memory/CMakeLists.txt | 6 ++-- paddle/memory/detail/CMakeLists.txt | 4 +-- paddle/memory/detail/buddy_allocator.cc | 33 +++++++++++------ paddle/memory/detail/buddy_allocator.h | 27 -------------- paddle/memory/memory.cc | 42 ++++++++++++++++++---- paddle/memory/memory_test.cc | 48 +++++++++++++++++++++++++ paddle/platform/gpu_info.cc | 2 +- 7 files changed, 112 insertions(+), 50 deletions(-) create mode 100644 paddle/memory/memory_test.cc diff --git a/paddle/memory/CMakeLists.txt b/paddle/memory/CMakeLists.txt index 8c290712fc..fac442cca5 100644 --- a/paddle/memory/CMakeLists.txt +++ b/paddle/memory/CMakeLists.txt @@ -1,11 +1,11 @@ add_subdirectory(detail) -cc_library(memory - SRCS - memory.cc) +cc_library(memory SRCS memory.cc) cc_library(paddle_memory DEPS memory meta_data meta_cache memory_block buddy_allocator system_allocator) + +cc_test(memory_test SRCS memory_test.cc DEPS place paddle_memory) diff --git a/paddle/memory/detail/CMakeLists.txt b/paddle/memory/detail/CMakeLists.txt index 6cb6422e47..b9c3fc31c1 100644 --- a/paddle/memory/detail/CMakeLists.txt +++ b/paddle/memory/detail/CMakeLists.txt @@ -1,7 +1,7 @@ if(${WITH_GPU}) - nv_library(system_allocator SRCS system_allocator.cc DEPS gflags gpu_info) + nv_library(system_allocator SRCS system_allocator.cc DEPS gflags cpu_info gpu_info) else(${WITH_GPU}) - cc_library(system_allocator SRCS system_allocator.cc DEPS gflags) + cc_library(system_allocator SRCS system_allocator.cc DEPS gflags cpu_info) endif(${WITH_GPU}) cc_test(system_allocator_test SRCS system_allocator_test.cc DEPS system_allocator) diff --git a/paddle/memory/detail/buddy_allocator.cc b/paddle/memory/detail/buddy_allocator.cc index 9f334a7048..ed2eedf9af 100644 --- a/paddle/memory/detail/buddy_allocator.cc +++ b/paddle/memory/detail/buddy_allocator.cc @@ -24,10 +24,20 @@ BuddyAllocator::BuddyAllocator(SystemAllocator* system_allocator, : min_chunk_size_(min_chunk_size), max_chunk_size_(max_chunk_size), cache_(system_allocator->UseGpu()), - system_allocator_(std::move(system_allocator)) { - PADDLE_ASSERT(min_chunk_size > 0); - PADDLE_ASSERT(max_chunk_size > 0); - PADDLE_ASSERT(system_allocator != 
nullptr); +  system_allocator_(std::move(system_allocator)) {}
+
+BuddyAllocator::~BuddyAllocator() {
+  DLOG(INFO) << "BuddyAllocator destructor makes sure that all of these "
+                "have actually been freed";
+  while (!pool_.empty()) {
+    auto block = static_cast<MemoryBlock*>(std::get<2>(*pool_.begin()));
+    DLOG(INFO) << "Free from block (" << block << ", " << max_chunk_size_
+               << ")";
+
+    system_allocator_->Free(block, max_chunk_size_, block->index(cache_));
+    cache_.invalidate(block);
+    pool_.erase(pool_.begin());
+  }
 }
 
 inline size_t align(size_t size, size_t alignment) {
@@ -62,7 +72,7 @@ void* BuddyAllocator::Alloc(size_t unaligned_size) {
       return nullptr;
     }
   } else {
-    DLOG(INFO) << " Allocation from existing memory block " << std::get<2>(*it)
+    DLOG(INFO) << "Allocation from existing memory block " << std::get<2>(*it)
                << " at address "
               << reinterpret_cast<MemoryBlock*>(std::get<2>(*it))->data();
   }
@@ -142,6 +152,8 @@ void BuddyAllocator::Free(void* p) {
   // TODO(gangliao): Clean up if existing too much free memory
 }
 
+size_t BuddyAllocator::Used() { return total_used_; }
+
 void* BuddyAllocator::SystemAlloc(size_t size) {
   size_t index = 0;
   void* p = system_allocator_->Alloc(index, size);
@@ -172,7 +184,7 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool() {
 
   if (p == nullptr) return pool_.end();
 
-  DLOG(INFO) << " Creating and inserting new block " << p
+  DLOG(INFO) << "Creating and inserting new block " << p
              << " from system allocator";
 
   static_cast<MemoryBlock*>(p)->init(cache_, MemoryBlock::FREE_CHUNK, index,
@@ -211,20 +223,19 @@ void* BuddyAllocator::SplitToAlloc(BuddyAllocator::PoolSet::iterator it,
   auto block = static_cast<MemoryBlock*>(std::get<2>(*it));
   pool_.erase(it);
 
-  DLOG(INFO) << " Split block (" << block << ", " << block->total_size(cache_)
+  DLOG(INFO) << "Split block (" << block << ", " << block->total_size(cache_)
              << ") into";
   block->split(cache_, size);
 
-  DLOG(INFO) << " Left block (" << block << ", " << block->total_size(cache_)
+  DLOG(INFO) << "Left block (" << block << ", " << block->total_size(cache_)
              << ")";
   block->set_type(cache_, MemoryBlock::ARENA_CHUNK);
 
   // insert the rest of the memory, if any exists
   if (block->has_right_buddy(cache_)) {
     if (block->right_buddy(cache_)->type(cache_) == MemoryBlock::FREE_CHUNK) {
-      DLOG(INFO) << " Insert right block (" << block->right_buddy(cache_)
-                 << ", " << block->right_buddy(cache_)->total_size(cache_)
-                 << ")";
+      DLOG(INFO) << "Insert right block (" << block->right_buddy(cache_) << ", "
+                 << block->right_buddy(cache_)->total_size(cache_) << ")";
 
       pool_.insert({block->right_buddy(cache_)->index(cache_),
                     block->right_buddy(cache_)->total_size(cache_),
                     block->right_buddy(cache_)});
     }
   }
 
diff --git a/paddle/memory/detail/buddy_allocator.h b/paddle/memory/detail/buddy_allocator.h
index 2fd9c8162a..eeb2dc8836 100644
--- a/paddle/memory/detail/buddy_allocator.h
+++ b/paddle/memory/detail/buddy_allocator.h
@@ -93,33 +93,6 @@ class BuddyAllocator {
   std::mutex mutex_;
 };
 
-BuddyAllocator* GetCPUBuddyAllocator() {
-  static BuddyAllocator* a = nullptr;
-  if (a == nullptr) {
-    a = new BuddyAllocator(new CPUAllocator, platform::CpuMinChunkSize(),
-                           platform::CpuMaxChunkSize());
-  }
-  return a;
-}
-
-#ifndef PADDLE_ONLY_CPU  // The following code are for CUDA.
- -BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) { - static BuddyAllocator** as = NULL; - if (as == NULL) { - int gpu_num = platform::GpuDeviceCount(); - as = new BuddyAllocator*[gpu_num]; - for (int gpu = 0; gpu < gpu_num; gpu++) { - as[gpu] = - new BuddyAllocator(new GPUAllocator, platform::GpuMinChunkSize(), - platform::GpuMaxChunkSize()); - } - } - return as[gpu_id]; -} - -#endif // PADDLE_ONLY_CPU - } // namespace detail } // namespace memory } // namespace paddle diff --git a/paddle/memory/memory.cc b/paddle/memory/memory.cc index 0d123d99e2..dde6ff0ef3 100644 --- a/paddle/memory/memory.cc +++ b/paddle/memory/memory.cc @@ -22,37 +22,67 @@ limitations under the License. */ namespace paddle { namespace memory { +detail::BuddyAllocator* GetCPUBuddyAllocator() { + static detail::BuddyAllocator* a = nullptr; + if (a == nullptr) { + a = new detail::BuddyAllocator(new detail::CPUAllocator, + platform::CpuMinChunkSize(), + platform::CpuMaxChunkSize()); + } + return a; +} + +#ifndef PADDLE_ONLY_CPU // The following code are for CUDA. + +detail::BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) { + static detail::BuddyAllocator** as = NULL; + if (as == NULL) { + int gpu_num = platform::GpuDeviceCount(); + as = new detail::BuddyAllocator*[gpu_num]; + for (int gpu = 0; gpu < gpu_num; gpu++) { + platform::SetDeviceId(gpu); + as[gpu] = new detail::BuddyAllocator(new detail::GPUAllocator, + platform::GpuMinChunkSize(), + platform::GpuMaxChunkSize()); + } + } + return as[gpu_id]; +} + +#endif // PADDLE_ONLY_CPU + void* Alloc(platform::Place pl, size_t size) { #ifndef PADDLE_ONLY_CPU if (paddle::platform::is_gpu_place(pl)) { size_t gpu_id = boost::get(pl).device; - return detail::GetGPUBuddyAllocator(gpu_id)->Alloc(size); + return GetGPUBuddyAllocator(gpu_id)->Alloc(size); } #endif // PADDLE_ONLY_CPU PADDLE_ASSERT(paddle::platform::is_cpu_place(pl)); - return detail::GetCPUBuddyAllocator()->Alloc(size); + return GetCPUBuddyAllocator()->Alloc(size); } void Free(paddle::platform::Place pl, void* p) { #ifndef PADDLE_ONLY_CPU if (paddle::platform::is_gpu_place(pl)) { size_t gpu_id = boost::get(pl).device; - detail::GetGPUBuddyAllocator(gpu_id)->Free(p); + GetGPUBuddyAllocator(gpu_id)->Free(p); + return; } #endif // PADDLE_ONLY_CPU PADDLE_ASSERT(paddle::platform::is_cpu_place(pl)); - detail::GetCPUBuddyAllocator()->Free(p); + GetCPUBuddyAllocator()->Free(p); } size_t Used(paddle::platform::Place pl) { #ifndef PADDLE_ONLY_CPU if (paddle::platform::is_gpu_place(pl)) { size_t gpu_id = boost::get(pl).device; - return detail::GetGPUBuddyAllocator(gpu_id)->Used(); + return GetGPUBuddyAllocator(gpu_id)->Used(); } #endif // PADDLE_ONLY_CPU PADDLE_ASSERT(paddle::platform::is_cpu_place(pl)); - return detail::GetCPUBuddyAllocator()->Used(); + return GetCPUBuddyAllocator()->Used(); } } // namespace memory diff --git a/paddle/memory/memory_test.cc b/paddle/memory/memory_test.cc new file mode 100644 index 0000000000..a98ccbb806 --- /dev/null +++ b/paddle/memory/memory_test.cc @@ -0,0 +1,48 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/memory/memory.h" +#include "paddle/platform/place.h" + +#include "gtest/gtest.h" + +TEST(BuddyAllocator, CPUAllocation) { + void *p = nullptr; + + EXPECT_EQ(p, nullptr); + + paddle::platform::CPUPlace cpu; + p = paddle::memory::Alloc(cpu, 4096); + + EXPECT_NE(p, nullptr); + + paddle::memory::Free(cpu, p); +} + +#ifndef PADDLE_ONLY_CPU + +TEST(BuddyAllocator, GPUAllocation) { + void *p = nullptr; + + EXPECT_EQ(p, nullptr); + + paddle::platform::GPUPlace gpu(0); + p = paddle::memory::Alloc(gpu, 4096); + + EXPECT_NE(p, nullptr); + + paddle::memory::Free(gpu, p); +} + +#endif // PADDLE_ONLY_CPU diff --git a/paddle/platform/gpu_info.cc b/paddle/platform/gpu_info.cc index 05a243c506..fe475d23ce 100644 --- a/paddle/platform/gpu_info.cc +++ b/paddle/platform/gpu_info.cc @@ -56,7 +56,7 @@ size_t GpuMaxAllocSize() { GpuMemoryUsage(available, total); // Reserve the rest for page tables, etc. - return total * FLAGS_fraction_of_gpu_memory_to_use; + return static_cast(total * FLAGS_fraction_of_gpu_memory_to_use); } size_t GpuMinChunkSize() { From 936cd1e11885daf35ae34b9fbf11a798e9e51022 Mon Sep 17 00:00:00 2001 From: liaogang Date: Thu, 6 Jul 2017 01:34:51 +0800 Subject: [PATCH 034/205] FIX: code format --- paddle/memory/memory.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/memory/memory.cc b/paddle/memory/memory.cc index dde6ff0ef3..43f2084e8d 100644 --- a/paddle/memory/memory.cc +++ b/paddle/memory/memory.cc @@ -29,7 +29,7 @@ detail::BuddyAllocator* GetCPUBuddyAllocator() { platform::CpuMinChunkSize(), platform::CpuMaxChunkSize()); } - return a; + return a; } #ifndef PADDLE_ONLY_CPU // The following code are for CUDA. 
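A note on the free-chunk lookup that the buddy-allocator patches above rely on: pool_ is an ordered std::set of (allocator index, size, address) tuples, so a lower_bound on (index, size, nullptr) lands either on the first chunk of that index whose size is at least the request, or on the first chunk of a higher index, whose size may still be too small. The self-contained sketch below illustrates why the search loop re-probes when it jumps to a higher index; the type and function names are illustrative and the code is a minimal reading of the patched FindExistChunk, not code taken from the patches themselves.

#include <cstddef>
#include <set>
#include <tuple>

using IndexSizeAddress = std::tuple<std::size_t, std::size_t, void*>;
using PoolSet = std::set<IndexSizeAddress>;

// Return the first free chunk that can hold `size` bytes, preferring the
// lowest allocator index; returns pool.end() when nothing fits.
PoolSet::iterator FindFreeChunk(PoolSet& pool, std::size_t size) {
  std::size_t index = 0;
  while (true) {
    auto it = pool.lower_bound(IndexSizeAddress(index, size, nullptr));
    if (it == pool.end()) return it;  // nothing at this index or any above
    if (std::get<0>(*it) > index) {
      // The search skipped to a higher index, where the first chunk found
      // may be smaller than requested; accept it only if it is big enough,
      // otherwise retry the search anchored at that index.
      if (std::get<1>(*it) >= size) return it;
      index = std::get<0>(*it);
      continue;
    }
    return it;  // same index: the tuple ordering guarantees the size fits
  }
}
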
From 5d2e8edb3ec86e13b7684cbe372650d21fe7954d Mon Sep 17 00:00:00 2001
From: liaogang 
Date: Thu, 6 Jul 2017 01:37:20 +0800
Subject: [PATCH 035/205] FIX: dynamic loader deps

---
 paddle/platform/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddle/platform/CMakeLists.txt b/paddle/platform/CMakeLists.txt
index 0ad11f1b10..bf3e8f31ab 100644
--- a/paddle/platform/CMakeLists.txt
+++ b/paddle/platform/CMakeLists.txt
@@ -6,4 +6,4 @@ nv_library(gpu_info SRCS gpu_info.cc DEPS gflags)
 
 cc_library(place SRCS place.cc)
 cc_test(place_test SRCS place_test.cc DEPS place glog gflags)
-cc_library(dynamic_loader SRCS dynload/dynamic_loader.cc DEPS gflags)
+cc_library(dynamic_loader SRCS dynload/dynamic_loader.cc DEPS gflags glog)

From 1264480b048cf68e29f3dffa91e228425df55908 Mon Sep 17 00:00:00 2001
From: Superjom 
Date: Thu, 6 Jul 2017 10:48:00 +0800
Subject: [PATCH 036/205] fix ci

---
 paddle/framework/CMakeLists.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt
index b33014210f..fc2fbf88f1 100644
--- a/paddle/framework/CMakeLists.txt
+++ b/paddle/framework/CMakeLists.txt
@@ -11,5 +11,6 @@ proto_library(op_proto SRCS op_proto.proto DEPS attr_type)
 cc_test(op_proto_test SRCS op_proto_test.cc DEPS op_proto protobuf)
 proto_library(net_proto SRCS net_proto.proto DEPS op_proto)
 #cc_library(net SRCS net.cc DEPS net_proto attr_type op_proto)
+
 proto_library(op_desc SRCS op_desc.proto DEPS attr_type)
 cc_test(op_desc_test SRCS op_desc_test.cc DEPS op_desc protobuf)

From 3ad8e364715915fba5909c137834e34f38b6e9ac Mon Sep 17 00:00:00 2001
From: liaogang 
Date: Thu, 6 Jul 2017 11:24:01 +0800
Subject: [PATCH 037/205] FIX: merge static libs with propagation dependencies

---
 cmake/generic.cmake | 51 ++++++++++++++++++++++++++-------------------
 1 file changed, 30 insertions(+), 21 deletions(-)

diff --git a/cmake/generic.cmake b/cmake/generic.cmake
index cae9524b2f..87d8caaec4 100644
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -99,15 +99,37 @@ function(merge_static_libs TARGET_NAME)
   set(libs ${ARGN})
   list(REMOVE_DUPLICATES libs)
 
-  # First get the file names of the libraries to be merged
+  # Get all propagation dependencies from the merged libraries
   foreach(lib ${libs})
+    list(APPEND libs_deps ${${lib}_LIB_DEPENDS})
+  endforeach()
+
+  # To produce a library we need at least one source file.
+  # It is created by the add_custom_command below, which also helps
+  # to track dependencies.
+  set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}_dummy.c)
+
+  # Make the generated dummy source file dependent on all static input
+  # libs. If an input lib changes, the source file is touched,
+  # which causes the desired effect (relink). 
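+  # (DEPENDS ${libs} in the command below is what ties the touch step to the
+  # input libraries' timestamps.)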
+  add_custom_command(OUTPUT ${dummyfile}
+    COMMAND ${CMAKE_COMMAND} -E touch ${dummyfile}
+    DEPENDS ${libs})
+
+  # Generate dummy static lib
+  file(WRITE ${dummyfile} "const char * dummy = \"${dummyfile}\";")
+  add_library(${TARGET_NAME} STATIC ${dummyfile})
+  target_link_libraries(${TARGET_NAME} ${libs_deps})
+
+  foreach(lib ${libs})
+    # Get the file names of the libraries to be merged
     set(libfiles ${libfiles} $<TARGET_FILE:${lib}>)
   endforeach()
 
+  # Get the file name of the generated library
+  set(outlibfile "$<TARGET_FILE:${TARGET_NAME}>")
+
   if(APPLE) # Use OSX's libtool to merge archives
-    set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}_dummy.c)
-    file(WRITE ${dummyfile} "const char * dummy = \"${dummyfile}\";")
-    add_library(${TARGET_NAME} STATIC ${dummyfile})
     add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
       COMMAND rm "${CMAKE_CURRENT_BINARY_DIR}/lib${TARGET_NAME}.a"
       COMMAND /usr/bin/libtool -static -o "${CMAKE_CURRENT_BINARY_DIR}/lib${TARGET_NAME}.a" ${libfiles})
-  else() # general UNIX: use "ar" to extract objects and re-add to a common lib
+  else() # general UNIX: use "ar" to extract objects and re-add to a common lib
     foreach(lib ${libs})
       set(objlistfile ${lib}.objlist) # list of objects in the input library
       set(objdir ${lib}.objdir)
 
       add_custom_command(OUTPUT ${objdir}
-        COMMAND ${CMAKE_COMMAND} -E make_directory ${objdir})
+        COMMAND ${CMAKE_COMMAND} -E make_directory ${objdir}
+        DEPENDS ${lib})
 
       add_custom_command(OUTPUT ${objlistfile}
               COMMAND ${CMAKE_AR} -x "$<TARGET_FILE:${lib}>"
@@ -125,23 +148,9 @@ function(merge_static_libs TARGET_NAME)
               DEPENDS ${lib} ${objdir}
               WORKING_DIRECTORY ${objdir})
 
-      # Empty dummy source file that goes into merged library
-      set(mergebase ${lib}.mergebase.c)
-      add_custom_command(OUTPUT ${mergebase}
-        COMMAND ${CMAKE_COMMAND} -E touch ${mergebase}
-        DEPENDS ${objlistfile})
-
-      list(APPEND mergebases "${mergebase}")
-    endforeach()
-
-    # We need a target for the output merged library
-    add_library(${TARGET_NAME} STATIC ${mergebases})
-    set(outlibfile "$<TARGET_FILE:${TARGET_NAME}>")
-
-    foreach(lib ${libs})
       add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
-        COMMAND ${CMAKE_AR} ru ${outlibfile} @"../${lib}.objlist"
-        WORKING_DIRECTORY ${lib}.objdir)
+        COMMAND ${CMAKE_AR} ru ${outlibfile} *.o
+        WORKING_DIRECTORY ${objdir})
     endforeach()
 
     add_custom_command(TARGET ${TARGET_NAME} POST_BUILD

From a669bf48d966a92206c57d72258bb625b5ff2fbc Mon Sep 17 00:00:00 2001
From: liaogang 
Date: Thu, 6 Jul 2017 13:38:11 +0800
Subject: [PATCH 038/205] FIX: explicit construct pool element

---
 paddle/memory/detail/buddy_allocator.cc | 23 +++++++++++++----------
 1 file changed, 13 insertions(+), 10 deletions(-)

diff --git a/paddle/memory/detail/buddy_allocator.cc b/paddle/memory/detail/buddy_allocator.cc
index ed2eedf9af..2cfacec46c 100644
--- a/paddle/memory/detail/buddy_allocator.cc
+++ b/paddle/memory/detail/buddy_allocator.cc
@@ -118,8 +118,9 @@ void BuddyAllocator::Free(void* p) {
 
     if (right_buddy->type(cache_) == MemoryBlock::FREE_CHUNK) {
       // Take away right buddy from pool
-      pool_.erase({right_buddy->index(cache_), right_buddy->total_size(cache_),
-                   right_buddy});
+      pool_.erase(IndexSizeAddress(right_buddy->index(cache_),
+                                   right_buddy->total_size(cache_),
+                                   right_buddy));
 
       // merge its right buddy to the block
       block->merge(cache_, right_buddy);
@@ -135,8 +136,8 @@ void BuddyAllocator::Free(void* p) {
 
     if (left_buddy->type(cache_) == MemoryBlock::FREE_CHUNK) {
       // Take away left buddy from pool
-      pool_.erase({left_buddy->index(cache_), left_buddy->total_size(cache_),
-                   left_buddy});
+      pool_.erase(IndexSizeAddress(left_buddy->index(cache_),
+                                   left_buddy->total_size(cache_), left_buddy));
 
       // merge the block to its left buddy
       left_buddy->merge(cache_, block);
@@ -147,7 +148,8 @@ void BuddyAllocator::Free(void* p) {
 
   // Dumping this block into pool
DLOG(INFO) << "Inserting free block (" << block << ", " << block->total_size(cache_) << ")"; - pool_.insert({block->index(cache_), block->total_size(cache_), block}); + pool_.insert( + IndexSizeAddress(block->index(cache_), block->total_size(cache_), block)); // TODO(gangliao): Clean up if existing too much free memory } @@ -193,14 +195,14 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool() { total_free_ += max_chunk_size_; // dump the block into pool - return pool_.insert({index, max_chunk_size_, p}).first; + return pool_.insert(IndexSizeAddress(index, max_chunk_size_, p)).first; } BuddyAllocator::PoolSet::iterator BuddyAllocator::FindExistChunk(size_t size) { size_t index = 0; while (1) { - auto it = pool_.lower_bound({index, size, nullptr}); + auto it = pool_.lower_bound(IndexSizeAddress(index, size, nullptr)); // no match chunk memory if (it == pool_.end()) return it; @@ -237,9 +239,10 @@ void* BuddyAllocator::SplitToAlloc(BuddyAllocator::PoolSet::iterator it, DLOG(INFO) << "Insert right block (" << block->right_buddy(cache_) << ", " << block->right_buddy(cache_)->total_size(cache_) << ")"; - pool_.insert({block->right_buddy(cache_)->index(cache_), - block->right_buddy(cache_)->total_size(cache_), - block->right_buddy(cache_)}); + pool_.insert( + IndexSizeAddress(block->right_buddy(cache_)->index(cache_), + block->right_buddy(cache_)->total_size(cache_), + block->right_buddy(cache_))); } } From 108b0fad2ffdf8faf281e34ea64437abe7a3eca3 Mon Sep 17 00:00:00 2001 From: gongweibao Date: Thu, 6 Jul 2017 06:40:58 +0000 Subject: [PATCH 039/205] fix by helin and wuyi's comments --- go/master/service.go | 37 ++++++++++++++++++------------------- 1 file changed, 18 insertions(+), 19 deletions(-) diff --git a/go/master/service.go b/go/master/service.go index c47319317a..29ff63bcc9 100644 --- a/go/master/service.go +++ b/go/master/service.go @@ -39,9 +39,9 @@ type Task struct { } type taskEntry struct { - NumTimeout int - Task Task - NumFailed int + Task Task + // A task fails if it's timeout or trainer reports it exits unnormally. + NumFailure int } type taskQueues struct { @@ -53,11 +53,11 @@ type taskQueues struct { // Service is the master server service. type Service struct { - chunksPerTask int - timeoutDur time.Duration - failortimeoutMax int - ready chan struct{} - store Store + chunksPerTask int + timeoutDur time.Duration + failureMax int + ready chan struct{} + store Store mu sync.Mutex initDone bool @@ -92,11 +92,11 @@ func partition(chunks []Chunk, chunksPerTask int) []taskEntry { } // NewService creates a new service. -func NewService(store Store, chunksPerTask int, timeoutDur time.Duration, failortimeoutMax int) (*Service, error) { +func NewService(store Store, chunksPerTask int, timeoutDur time.Duration, failureMax int) (*Service, error) { s := &Service{} s.chunksPerTask = chunksPerTask s.timeoutDur = timeoutDur - s.failortimeoutMax = failortimeoutMax + s.failureMax = failureMax s.taskQueues = taskQueues{} s.taskQueues.Pending = make(map[int]taskEntry) s.ready = make(chan struct{}) @@ -258,7 +258,7 @@ func (s *Service) SetDataset(globPaths []string, dummy *int) error { return nil } -func (s *Service) checkTaskStatus(t taskEntry, epoch int) { +func (s *Service) procFailedTask(t taskEntry, epoch int) { if t.Task.Epoch != epoch { // new epoch, task launched after the // schedule of this timeout check or failed status report. 
@@ -274,14 +274,14 @@ func (s *Service) checkTaskStatus(t taskEntry, epoch int) {
 
 	delete(s.taskQueues.Pending, t.Task.ID)
 
-	t.NumTimeout++
-	if t.NumTimeout+t.NumFailed > s.failortimeoutMax {
-		log.Warningf("Task %v timed out %d times and failed %d times, discard.", t.Task, t.NumTimeout, t.NumFailed)
+	t.NumFailure++
+	if t.NumFailure > s.failureMax {
+		log.Warningf("Task %v failed %d times, discard.", t.Task, t.NumFailure)
 		s.taskQueues.Failed = append(s.taskQueues.Failed, t)
 		return
 	}
 
-	log.Warningf("Task %v timed out %d times and failed %d times, discard.", t.Task, t.NumTimeout, t.NumFailed)
+	log.Warningf("Task %v failed %d times, re-dispatch.", t.Task, t.NumFailure)
 	s.taskQueues.Todo = append(s.taskQueues.Todo, t)
 	return
 }
@@ -296,7 +296,7 @@ func (s *Service) checkTimeoutFunc(taskID int, epoch int) func() {
 			return
 		}
 
-		s.checkTaskStatus(t, epoch)
+		s.procFailedTask(t, epoch)
 	}
 }
 
@@ -377,8 +377,7 @@ func (s *Service) TaskFinished(taskID int, dummy *int) error {
 	}
 
 	// task finished, reset failure count
-	t.NumTimeout = 0
-	t.NumFailed = 0
+	t.NumFailure = 0
 	s.taskQueues.Done = append(s.taskQueues.Done, t)
 	delete(s.taskQueues.Pending, taskID)
 
@@ -413,6 +412,6 @@ func (s *Service) TaskFailed(taskID int, epoch int) error {
 		return err
 	}
 
-	s.checkTaskStatus(t, epoch)
+	s.procFailedTask(t, epoch)
 	return nil
 }

From a94d217487a222526e303c443aaa3370321447ae Mon Sep 17 00:00:00 2001
From: gongweibao 
Date: Thu, 6 Jul 2017 07:09:55 +0000
Subject: [PATCH 040/205] add TaskID

---
 go/master/client.go  |  4 ++--
 go/master/service.go | 14 ++++++++++----
 2 files changed, 12 insertions(+), 6 deletions(-)

diff --git a/go/master/client.go b/go/master/client.go
index b6ca8cad15..bf2612d91b 100644
--- a/go/master/client.go
+++ b/go/master/client.go
@@ -113,8 +113,8 @@ func (c *Client) taskFinished(taskID int) error {
 }
 
 // taskFailed tells the master server that the task failed.
-func (c *Client) taskFailed(taskID int, epoch int) error {
-	return c.conn.Call("Service.TaskFinished", taskID, epoch)
+func (c *Client) taskFailed(taskID TaskID) error {
+	return c.conn.Call("Service.TaskFailed", taskID, nil)
 }
 
 // NextRecord returns next record in the dataset.
diff --git a/go/master/service.go b/go/master/service.go
index 29ff63bcc9..b1334a2d8e 100644
--- a/go/master/service.go
+++ b/go/master/service.go
@@ -396,8 +396,14 @@ func (s *Service) TaskFinished(taskID int, dummy *int) error {
 	return err
 }
 
-// TaskFailed tell the service that a task is failed.
-func (s *Service) TaskFailed(taskID int, epoch int) error {
+// TaskID is the struct the client uses to report a task failure.
+type TaskID struct {
+	ID    int
+	Epoch int
+}
+
+// TaskFailed tells the service that a task has failed. 
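+// The Epoch carried in TaskID guards against stale reports: procFailedTask
+// drops a report whose epoch no longer matches the task, i.e. the task has
+// already been re-dispatched since the failure was observed.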
+func (s *Service) TaskFailed(taskID TaskID, dummy *int) error {
 	select {
 	case <-s.ready:
 	}
 
@@ -405,13 +411,13 @@ func (s *Service) TaskFailed(taskID int, epoch int) error {
 	s.mu.Lock()
 	defer s.mu.Unlock()
 
-	t, ok := s.taskQueues.Pending[taskID]
+	t, ok := s.taskQueues.Pending[taskID.ID]
 	if !ok {
 		err := errors.New("pending task not found")
 		log.WithFields(s.logFields()).Warningln("TaskFailed:Pending task #%d not found.", taskID)
 		return err
 	}
 
-	s.procFailedTask(t, epoch)
+	s.procFailedTask(t, taskID.Epoch)
 	return nil
 }

From 8f7088590c7031dedd554f62762a559a4efe6b9c Mon Sep 17 00:00:00 2001
From: gongweibao 
Date: Thu, 6 Jul 2017 07:14:10 +0000
Subject: [PATCH 041/205] fix bugs

---
 go/master/service.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/go/master/service.go b/go/master/service.go
index b1334a2d8e..daf3928230 100644
--- a/go/master/service.go
+++ b/go/master/service.go
@@ -414,7 +414,7 @@ func (s *Service) TaskFailed(taskID TaskID, dummy *int) error {
 	t, ok := s.taskQueues.Pending[taskID.ID]
 	if !ok {
 		err := errors.New("pending task not found")
-		log.WithFields(s.logFields()).Warningln("TaskFailed:Pending task #%d not found.", taskID)
+		log.WithFields(s.logFields()).Warningf("TaskFailed: pending task #%v not found.", taskID)
 		return err
 	}
 
From adf8c95b62fc5ef1f608bc06dce32bb4b396828c Mon Sep 17 00:00:00 2001
From: liaogang 
Date: Thu, 6 Jul 2017 15:40:22 +0800
Subject: [PATCH 042/205] FIX: propagation dependencies under linux

---
 cmake/generic.cmake | 68 ++++++++++++++++++++++++++------------------
 1 file changed, 39 insertions(+), 29 deletions(-)

diff --git a/cmake/generic.cmake b/cmake/generic.cmake
index 87d8caaec4..3900ea2604 100644
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -104,36 +104,32 @@ function(merge_static_libs TARGET_NAME)
     list(APPEND libs_deps ${${lib}_LIB_DEPENDS})
   endforeach()
 
-  # To produce a library we need at least one source file.
-  # It is created by the add_custom_command below, which also helps
-  # to track dependencies.
-  set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}_dummy.c)
+  if(APPLE) # Use OSX's libtool to merge archives
+    # To produce a library we need at least one source file.
+    # It is created by the add_custom_command below, which also helps
+    # to track dependencies.
+    set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}_dummy.c)
 
-  # Make the generated dummy source file dependent on all static input
-  # libs. If an input lib changes, the source file is touched,
-  # which causes the desired effect (relink).
+    # Make the generated dummy source file dependent on all static input
+    # libs. If an input lib changes, the source file is touched,
+    # which causes the desired effect (relink). 
+    add_custom_command(OUTPUT ${dummyfile}
+      COMMAND ${CMAKE_COMMAND} -E touch ${dummyfile}
+      DEPENDS ${libs})
 
-  # Generate dummy static lib
-  file(WRITE ${dummyfile} "const char * dummy = \"${dummyfile}\";")
-  add_library(${TARGET_NAME} STATIC ${dummyfile})
-  target_link_libraries(${TARGET_NAME} ${libs_deps})
+    # Generate dummy static lib
+    file(WRITE ${dummyfile} "const char * dummy = \"${dummyfile}\";")
+    add_library(${TARGET_NAME} STATIC ${dummyfile})
+    target_link_libraries(${TARGET_NAME} ${libs_deps})
 
-  foreach(lib ${libs})
-    # Get the file names of the libraries to be merged
-    set(libfiles ${libfiles} $<TARGET_FILE:${lib}>)
-  endforeach()
+    foreach(lib ${libs})
+      # Get the file names of the libraries to be merged
+      set(libfiles ${libfiles} $<TARGET_FILE:${lib}>)
+    endforeach()
     add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
       COMMAND rm "${CMAKE_CURRENT_BINARY_DIR}/lib${TARGET_NAME}.a"
       COMMAND /usr/bin/libtool -static -o "${CMAKE_CURRENT_BINARY_DIR}/lib${TARGET_NAME}.a" ${libfiles})
-  else() # general UNIX: use "ar" to extract objects and re-add to a common lib
+  else() # general UNIX: use "ar" to extract objects and re-add to a common lib
     foreach(lib ${libs})
       set(objlistfile ${lib}.objlist) # list of objects in the input library
       set(objdir ${lib}.objdir)
 
       add_custom_command(OUTPUT ${objdir}
-        COMMAND ${CMAKE_COMMAND} -E make_directory ${objdir}
+        COMMAND ${CMAKE_COMMAND} -E make_directory ${objdir}
+        DEPENDS ${lib})
 
       add_custom_command(OUTPUT ${objlistfile}
               COMMAND ${CMAKE_AR} -x "$<TARGET_FILE:${lib}>"
              DEPENDS ${lib} ${objdir}
              WORKING_DIRECTORY ${objdir})
 
-      add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
-        COMMAND ${CMAKE_AR} ru ${outlibfile} *.o
-        WORKING_DIRECTORY ${objdir})
+      # Empty dummy source file that goes into merged library
+      set(mergebase ${lib}.mergebase.c)
+      add_custom_command(OUTPUT ${mergebase}
+        COMMAND ${CMAKE_COMMAND} -E touch ${mergebase}
+        DEPENDS ${objlistfile})
+
+      list(APPEND mergebases "${mergebase}")
     endforeach()
 
-    add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
-      COMMAND ${CMAKE_RANLIB} ${outlibfile})
+    add_library(${TARGET_NAME} STATIC ${mergebases})
+    target_link_libraries(${TARGET_NAME} ${libs_deps})
+
+    # Get the file name of the generated library
+    set(outlibfile "$<TARGET_FILE:${TARGET_NAME}>")
+
+    foreach(lib ${libs})
+      add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
+        COMMAND ${CMAKE_AR} cr ${outlibfile} *.o
+        COMMAND ${CMAKE_RANLIB} ${outlibfile}
+        WORKING_DIRECTORY ${lib}.objdir)
+    endforeach()
   endif()
 endfunction(merge_static_libs)

From ddfa6cf0d1fe91f8bf2e1d55841afee9e30d1859 Mon Sep 17 00:00:00 2001
From: liaogang 
Date: Thu, 6 Jul 2017 17:07:04 +0800
Subject: [PATCH 043/205] FIX: remove boost from memory folder

---
 paddle/memory/memory.cc | 56 +++++++++++++++++++----------------------
 paddle/memory/memory.h  | 11 +++++---
 2 files changed, 34 insertions(+), 33 deletions(-)

diff --git a/paddle/memory/memory.cc b/paddle/memory/memory.cc
index 43f2084e8d..def580f7a4 100644
--- a/paddle/memory/memory.cc
+++ b/paddle/memory/memory.cc
@@ -32,7 +32,22 @@ detail::BuddyAllocator* GetCPUBuddyAllocator() {
   return a;
 }
 
-#ifndef PADDLE_ONLY_CPU  // The following code are for CUDA.
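+// NOTE: Alloc/Free/Used are specialized per Place type from here on; the
+// CPU specializations come first, and the GPU specializations live in the
+// CUDA-only block below.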
+template <> +void* Alloc(platform::CPUPlace place, size_t size) { + return GetCPUBuddyAllocator()->Alloc(size); +} + +template <> +void Free(platform::CPUPlace place, void* p) { + GetCPUBuddyAllocator()->Free(p); +} + +template <> +size_t Used(platform::CPUPlace place) { + return GetCPUBuddyAllocator()->Used(); +} + +#ifndef PADDLE_ONLY_CPU detail::BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) { static detail::BuddyAllocator** as = NULL; @@ -49,41 +64,22 @@ detail::BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) { return as[gpu_id]; } -#endif // PADDLE_ONLY_CPU +template <> +void* Alloc(platform::GPUPlace place, size_t size) { + return GetGPUBuddyAllocator(place.device)->Alloc(size); +} -void* Alloc(platform::Place pl, size_t size) { -#ifndef PADDLE_ONLY_CPU - if (paddle::platform::is_gpu_place(pl)) { - size_t gpu_id = boost::get(pl).device; - return GetGPUBuddyAllocator(gpu_id)->Alloc(size); - } -#endif // PADDLE_ONLY_CPU - PADDLE_ASSERT(paddle::platform::is_cpu_place(pl)); - return GetCPUBuddyAllocator()->Alloc(size); +template <> +void Free(platform::GPUPlace place, void* p) { + GetGPUBuddyAllocator(place.device)->Free(p); } -void Free(paddle::platform::Place pl, void* p) { -#ifndef PADDLE_ONLY_CPU - if (paddle::platform::is_gpu_place(pl)) { - size_t gpu_id = boost::get(pl).device; - GetGPUBuddyAllocator(gpu_id)->Free(p); - return; - } -#endif // PADDLE_ONLY_CPU - PADDLE_ASSERT(paddle::platform::is_cpu_place(pl)); - GetCPUBuddyAllocator()->Free(p); +template <> +size_t Used(platform::GPUPlace place) { + return GetGPUBuddyAllocator(place.device)->Used(); } -size_t Used(paddle::platform::Place pl) { -#ifndef PADDLE_ONLY_CPU - if (paddle::platform::is_gpu_place(pl)) { - size_t gpu_id = boost::get(pl).device; - return GetGPUBuddyAllocator(gpu_id)->Used(); - } #endif // PADDLE_ONLY_CPU - PADDLE_ASSERT(paddle::platform::is_cpu_place(pl)); - return GetCPUBuddyAllocator()->Used(); -} } // namespace memory } // namespace paddle diff --git a/paddle/memory/memory.h b/paddle/memory/memory.h index a33092bade..2d6f4fd2a0 100644 --- a/paddle/memory/memory.h +++ b/paddle/memory/memory.h @@ -19,9 +19,14 @@ limitations under the License. 
*/ namespace paddle { namespace memory { -void* Alloc(paddle::platform::Place, size_t); -void Free(paddle::platform::Place, void*); -size_t Used(paddle::platform::Place); +template +void* Alloc(Place, size_t); + +template +void Free(Place, void*); + +template +size_t Used(Place); } // namespace memory } // namespace paddle From 26d95a6bbfea68c1f79c14fd0db0afedaf27e01d Mon Sep 17 00:00:00 2001 From: wuyi05 Date: Fri, 7 Jul 2017 18:06:10 +0800 Subject: [PATCH 044/205] fix new remote updater for go pserver --- go/pserver/client/c/test/test_train.py | 2 +- go/pserver/optimizer.go | 4 ++-- paddle/trainer/NewRemoteParameterUpdater.cpp | 22 +++++++++++++++++--- paddle/trainer/NewRemoteParameterUpdater.h | 2 ++ python/paddle/v2/optimizer.py | 2 ++ python/paddle/v2/trainer.py | 1 + 6 files changed, 27 insertions(+), 6 deletions(-) diff --git a/go/pserver/client/c/test/test_train.py b/go/pserver/client/c/test/test_train.py index 68e1d9b269..d6922672f4 100644 --- a/go/pserver/client/c/test/test_train.py +++ b/go/pserver/client/c/test/test_train.py @@ -19,7 +19,7 @@ def main(): # create parameters parameters = paddle.parameters.create(cost) - # create optimizer + # create optimizer of new remote updater to pserver optimizer = paddle.optimizer.Momentum(momentum=0) #TODO(zhihong) : replace optimizer with new OptimizerConfig diff --git a/go/pserver/optimizer.go b/go/pserver/optimizer.go index 54d1082094..ee5fe6205b 100644 --- a/go/pserver/optimizer.go +++ b/go/pserver/optimizer.go @@ -42,12 +42,12 @@ func newOptimizer(paramWithConfigs ParameterWithConfig) *optimizer { c := paramWithConfigs.Config log.WithFields(log.Fields{ "ElementType": p.ElementType, - "ParamSize": len(p.Content), + "ParamSize": len(p.Content) / C.sizeof_float, "ConfigSize": len(c), }).Info("New Optimizer Created with config:") var cbuffer unsafe.Pointer cbuffer = C.malloc(C.size_t(len(p.Content))) - C.memcpy(cbuffer, unsafe.Pointer(&p.Content[0]), C.size_t(len(p.Content))) + C.memcpy(cbuffer, unsafe.Pointer(&p.Content[0]), C.size_t(len(p.Content)/C.sizeof_float)) o.opt = C.paddle_create_optimizer((*C.uchar)(&c[0]), C.int(len(c)), C.paddle_element_type(p.ElementType), cbuffer, C.int(len(p.Content)/C.sizeof_float), (*C.char)(nullPtr), 0) diff --git a/paddle/trainer/NewRemoteParameterUpdater.cpp b/paddle/trainer/NewRemoteParameterUpdater.cpp index f25ce2f7f0..b359d9da21 100644 --- a/paddle/trainer/NewRemoteParameterUpdater.cpp +++ b/paddle/trainer/NewRemoteParameterUpdater.cpp @@ -22,7 +22,8 @@ DECLARE_string(save_dir); namespace paddle { NewRemoteParameterUpdater::NewRemoteParameterUpdater( const OptimizationConfig &config, const std::string pserverSpec) - : parameterClient_(-1), + : trainerConfig_(config), + parameterClient_(-1), newParameters_(nullptr), newGradients_(nullptr), pserverSpec_(pserverSpec) {} @@ -51,7 +52,22 @@ void NewRemoteParameterUpdater::init( LOG(INFO) << "paddle_begin_init_params start"; for (int i = 0; i < parameterSize(); ++i) { auto paramConfig = parameters_[i]->getConfig(); - std::string bytes = paramConfig.SerializeAsString(); + LOG(INFO) << "old param config: " << paramConfig.DebugString(); + // FIXME(typhoonzero): convert old paramConfig to optimizerConfig + OptimizerConfig optimizeConfigV2; + auto sgdConfigV2 = optimizeConfigV2.mutable_sgd(); + sgdConfigV2->set_momentum(paramConfig.momentum()); + sgdConfigV2->set_decay(paramConfig.decay_rate()); + optimizeConfigV2.set_lr_policy(paddle::OptimizerConfig::Const); + auto constlr = optimizeConfigV2.mutable_const_lr(); + 
constlr->set_learning_rate(paramConfig.learning_rate()); + if (trainerConfig_.algorithm() == "sgd") { + optimizeConfigV2.set_optimizer(paddle::OptimizerConfig::SGD); + // FIXME: config all algorithms + } else { + optimizeConfigV2.set_optimizer(paddle::OptimizerConfig::SGD); + } + std::string bytes = optimizeConfigV2.SerializeAsString(); const char *array = bytes.data(); int size = (int)bytes.size(); paddle_init_param( @@ -83,4 +99,4 @@ void NewRemoteParameterUpdater::finishBatch(real cost) { void NewRemoteParameterUpdater::startPass() {} bool NewRemoteParameterUpdater::finishPass() { return true; } -} +} // namespace paddle diff --git a/paddle/trainer/NewRemoteParameterUpdater.h b/paddle/trainer/NewRemoteParameterUpdater.h index f735185f62..dfed00bc21 100644 --- a/paddle/trainer/NewRemoteParameterUpdater.h +++ b/paddle/trainer/NewRemoteParameterUpdater.h @@ -16,6 +16,7 @@ limitations under the License. */ #include #include +#include "OptimizerConfig.pb.h" #include "ParameterUpdater.h" #include "libpaddle_pserver_cclient.h" #include "paddle/pserver/ParameterClient2.h" @@ -101,6 +102,7 @@ private: } protected: + const OptimizationConfig& trainerConfig_; /// internal parameter client object for exchanging data with pserver paddle_pserver_client parameterClient_; /// the parameters for new pserver client diff --git a/python/paddle/v2/optimizer.py b/python/paddle/v2/optimizer.py index 8124e219ba..390c22ee55 100644 --- a/python/paddle/v2/optimizer.py +++ b/python/paddle/v2/optimizer.py @@ -66,6 +66,8 @@ class Optimizer(object): if use_sparse_remote_updater: gradient_machine.prefetch(in_args) parameter_updater.getParametersRemote() + + :param pserver_spec: pserver location, eg: localhost:3000 :return: parameter_updater """ if is_local: diff --git a/python/paddle/v2/trainer.py b/python/paddle/v2/trainer.py index f9658a8c5d..96c6c4b89a 100644 --- a/python/paddle/v2/trainer.py +++ b/python/paddle/v2/trainer.py @@ -41,6 +41,7 @@ class SGD(object): :type parameters: paddle.v2.parameters.Parameters :param extra_layers: Some layers in the neural network graph are not in the path of cost layer. + :param pserver_spec: pserver location, eg: localhost:3000 :type extra_layers: paddle.v2.config_base.Layer """ From 199b5fcb45c69560de1b24b3147f5e7db309abe3 Mon Sep 17 00:00:00 2001 From: liaogang Date: Mon, 10 Jul 2017 11:22:17 +0800 Subject: [PATCH 045/205] ENH: refine code comments --- paddle/memory/detail/buddy_allocator.h | 3 ++- paddle/memory/detail/meta_cache.h | 25 +++++++++--------------- paddle/memory/detail/system_allocator.cc | 4 ++-- paddle/memory/detail/system_allocator.h | 6 +++--- 4 files changed, 16 insertions(+), 22 deletions(-) diff --git a/paddle/memory/detail/buddy_allocator.h b/paddle/memory/detail/buddy_allocator.h index eeb2dc8836..a89dd8eb7c 100644 --- a/paddle/memory/detail/buddy_allocator.h +++ b/paddle/memory/detail/buddy_allocator.h @@ -42,7 +42,7 @@ class BuddyAllocator { void Free(void*); size_t Used(); - private: + public: // Disable copy and assignment. BuddyAllocator(const BuddyAllocator&) = delete; BuddyAllocator& operator=(const BuddyAllocator&) = delete; @@ -57,6 +57,7 @@ class BuddyAllocator { /*! 
\brief If existing chunks are not suitable, refill pool */ PoolSet::iterator RefillPool(); + /** * \brief Find the suitable chunk from existing pool * diff --git a/paddle/memory/detail/meta_cache.h b/paddle/memory/detail/meta_cache.h index 3ca1020d22..ca0789779e 100644 --- a/paddle/memory/detail/meta_cache.h +++ b/paddle/memory/detail/meta_cache.h @@ -23,14 +23,14 @@ namespace paddle { namespace memory { namespace detail { -/*! A cache for accessing memory block meta-data that may be expensive to access - directly. - - Note: this class exists to unify the metadata format between GPU and CPU - allocations. - It should be removed when the CPU can access all GPU allocations directly - via UVM. -*/ +/** + * \brief A cache for accessing memory block meta-data that may be expensive + * to access directly. + * + * \note This class exists to unify the metadata format between GPU and CPU + * allocations. It should be removed when the CPU can access all GPU + * allocations directly via UVM. + */ class MetadataCache { public: MetadataCache(bool uses_gpu); @@ -42,14 +42,7 @@ class MetadataCache { /*! \brief Store the associated metadata for the specified memory block. */ void store(MemoryBlock*, const Metadata&); - public: - /*! \brief Acquire any external metadata updates. */ - void acquire(MemoryBlock*); - - /*! \brief Publish any local updates externally. */ - void release(MemoryBlock*); - - /*! \brief Indicate that the specified metadata will no longer be used */ + /*! \brief Indicate that the specified metadata will no longer be used. */ void invalidate(MemoryBlock*); public: diff --git a/paddle/memory/detail/system_allocator.cc b/paddle/memory/detail/system_allocator.cc index 75a2c91ef9..1579174b1a 100644 --- a/paddle/memory/detail/system_allocator.cc +++ b/paddle/memory/detail/system_allocator.cc @@ -60,7 +60,7 @@ void CPUAllocator::Free(void* p, size_t size, size_t index) { free(p); } -bool CPUAllocator::UseGpu() { return false; } +bool CPUAllocator::UseGpu() const { return false; } #ifndef PADDLE_ONLY_CPU @@ -133,7 +133,7 @@ void GPUAllocator::Free(void* p, size_t size, size_t index) { } } -bool GPUAllocator::UseGpu() { return true; } +bool GPUAllocator::UseGpu() const { return true; } #endif // PADDLE_ONLY_CPU diff --git a/paddle/memory/detail/system_allocator.h b/paddle/memory/detail/system_allocator.h index 555061a533..04efcd9709 100644 --- a/paddle/memory/detail/system_allocator.h +++ b/paddle/memory/detail/system_allocator.h @@ -32,14 +32,14 @@ class SystemAllocator { virtual ~SystemAllocator() {} virtual void* Alloc(size_t& index, size_t size) = 0; virtual void Free(void* p, size_t size, size_t index) = 0; - virtual bool UseGpu() = 0; + virtual bool UseGpu() const = 0; }; class CPUAllocator : public SystemAllocator { public: virtual void* Alloc(size_t& index, size_t size); virtual void Free(void* p, size_t size, size_t index); - virtual bool UseGpu(); + virtual bool UseGpu() const; }; #ifndef PADDLE_ONLY_CPU @@ -47,7 +47,7 @@ class GPUAllocator : public SystemAllocator { public: virtual void* Alloc(size_t& index, size_t size); virtual void Free(void* p, size_t size, size_t index); - virtual bool UseGpu(); + virtual bool UseGpu() const; private: size_t gpu_alloc_size_ = 0; From 338dd13542387387028b0f3adbfc296756734d5a Mon Sep 17 00:00:00 2001 From: wanghaoshuang Date: Mon, 10 Jul 2017 11:56:57 +0800 Subject: [PATCH 046/205] Add voc2012 dataset for image segment --- python/paddle/v2/dataset/__init__.py | 5 +- python/paddle/v2/dataset/tests/vocseg_test.py | 42 +++++++++++ 
python/paddle/v2/dataset/voc_seg.py | 74 +++++++++++++++++++ 3 files changed, 120 insertions(+), 1 deletion(-) create mode 100644 python/paddle/v2/dataset/tests/vocseg_test.py create mode 100644 python/paddle/v2/dataset/voc_seg.py diff --git a/python/paddle/v2/dataset/__init__.py b/python/paddle/v2/dataset/__init__.py index 80ff6295c3..cdd85cce37 100644 --- a/python/paddle/v2/dataset/__init__.py +++ b/python/paddle/v2/dataset/__init__.py @@ -24,8 +24,11 @@ import conll05 import uci_housing import sentiment import wmt14 +import mq2007 +import flowers +import voc_seg __all__ = [ 'mnist', 'imikolov', 'imdb', 'cifar', 'movielens', 'conll05', 'sentiment' - 'uci_housing', 'wmt14' + 'uci_housing', 'wmt14', 'mq2007', 'flowers', 'voc_seg' ] diff --git a/python/paddle/v2/dataset/tests/vocseg_test.py b/python/paddle/v2/dataset/tests/vocseg_test.py new file mode 100644 index 0000000000..1a773fa18b --- /dev/null +++ b/python/paddle/v2/dataset/tests/vocseg_test.py @@ -0,0 +1,42 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle.v2.dataset.voc_seg +import unittest + + +class TestVOC(unittest.TestCase): + def check_reader(self, reader): + sum = 0 + label = 0 + for l in reader(): + self.assertEqual(l[0].size, l[1].size) + sum += 1 + return sum + + def test_train(self): + count = self.check_reader(paddle.v2.dataset.voc_seg.train()) + self.assertEqual(count, 2913) + + def test_test(self): + count = self.check_reader(paddle.v2.dataset.voc_seg.test()) + self.assertEqual(count, 1464) + + def test_val(self): + count = self.check_reader(paddle.v2.dataset.voc_seg.val()) + self.assertEqual(count, 1449) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/dataset/voc_seg.py b/python/paddle/v2/dataset/voc_seg.py new file mode 100644 index 0000000000..9b79f726d2 --- /dev/null +++ b/python/paddle/v2/dataset/voc_seg.py @@ -0,0 +1,74 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Image dataset for segmentation. +The 2012 dataset contains images from 2008-2011 for which additional segmentations have been prepared. As in previous years the assignment to training/test sets has been maintained. The total number of images with segmentation has been increased from 7,062 to 9,993. 
+""" + +import tarfile +import numpy as np +from common import download +from paddle.v2.image import * + +__all__ = ['train', 'test', 'val'] + +VOC_URL = 'http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCtrainval_11-May-2012.tar' +VOC_MD5 = '6cd6e144f989b92b3379bac3b3de84fd' +SET_FILE = 'VOCdevkit/VOC2012/ImageSets/Segmentation/{}.txt' +DATA_FILE = 'VOCdevkit/VOC2012/JPEGImages/{}.jpg' +LABEL_FILE = 'VOCdevkit/VOC2012/SegmentationClass/{}.png' + + +def reader_creator(filename, sub_name): + + tarobject = tarfile.open(filename) + name2mem = {} + for ele in tarobject.getmembers(): + name2mem[ele.name] = ele + + def reader(): + set_file = SET_FILE.format(sub_name) + sets = tarobject.extractfile(name2mem[set_file]) + for line in sets: + line = line.strip() + data_file = DATA_FILE.format(line) + label_file = LABEL_FILE.format(line) + data = tarobject.extractfile(name2mem[data_file]).read() + label = tarobject.extractfile(name2mem[label_file]).read() + data = load_image_bytes(data) + label = load_image_bytes(label) + yield data, label + + return reader + + +def train(): + """ + Create a train dataset reader containing 2913 images. + """ + return reader_creator(download(VOC_URL, 'voc_seg', VOC_MD5), 'trainval') + + +def test(): + """ + Create a test dataset reader containing 1464 images. + """ + return reader_creator(download(VOC_URL, 'voc_seg', VOC_MD5), 'train') + + +def val(): + """ + Create a val dataset reader containing 1449 images. + """ + return reader_creator(download(VOC_URL, 'voc_seg', VOC_MD5), 'val') From 9fde3959c39a59950bb6a6721d26dea5445fce7b Mon Sep 17 00:00:00 2001 From: LiuYongFeng Date: Mon, 10 Jul 2017 12:26:12 +0800 Subject: [PATCH 047/205] update docs url --- README.md | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index fa16cc3cf2..947c360078 100644 --- a/README.md +++ b/README.md @@ -2,8 +2,8 @@ [![Build Status](https://travis-ci.org/PaddlePaddle/Paddle.svg?branch=develop)](https://travis-ci.org/PaddlePaddle/Paddle) -[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://www.paddlepaddle.org/develop/doc/) -[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://www.paddlepaddle.org/doc_cn/) +[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://doc.paddlepaddle.org/develop/doc/) +[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://doc.paddlepaddle.org/doc_cn/) [![Coverage Status](https://coveralls.io/repos/github/PaddlePaddle/Paddle/badge.svg?branch=develop)](https://coveralls.io/github/PaddlePaddle/Paddle?branch=develop) [![Release](https://img.shields.io/github/release/PaddlePaddle/Paddle.svg)](https://github.com/PaddlePaddle/Paddle/releases) [![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE) @@ -61,32 +61,32 @@ Please refer to our [release announcement](https://github.com/PaddlePaddle/Paddl ## Installation It is recommended to check out the -[Docker installation guide](http://www.paddlepaddle.org/develop/doc/getstarted/build_and_install/docker_install_en.html) +[Docker installation guide](http://doc.paddlepaddle.org/develop/doc/getstarted/build_and_install/docker_install_en.html) before looking into the -[build from source guide](http://www.paddlepaddle.org/develop/doc/getstarted/build_and_install/build_from_source_en.html) +[build from source 
guide](http://doc.paddlepaddle.org/develop/doc/getstarted/build_and_install/build_from_source_en.html) ## Documentation -We provide [English](http://www.paddlepaddle.org/develop/doc/) and -[Chinese](http://www.paddlepaddle.org/doc_cn/) documentation. +We provide [English](http://doc.paddlepaddle.org/develop/doc/) and +[Chinese](http://doc.paddlepaddle.org/doc_cn/) documentation. - [Deep Learning 101](http://book.paddlepaddle.org/index.html) You might want to start from the this online interactive book that can run in Jupyter Notebook. -- [Distributed Training](http://www.paddlepaddle.org/develop/doc/howto/usage/cluster/cluster_train_en.html) +- [Distributed Training](http://doc.paddlepaddle.org/develop/doc/howto/usage/cluster/cluster_train_en.html) You can run distributed training jobs on MPI clusters. -- [Distributed Training on Kubernetes](http://www.paddlepaddle.org/develop/doc/howto/usage/k8s/k8s_en.html) +- [Distributed Training on Kubernetes](http://doc.paddlepaddle.org/develop/doc/howto/usage/k8s/k8s_en.html) You can also run distributed training jobs on Kubernetes clusters. -- [Python API](http://www.paddlepaddle.org/develop/doc/api/index_en.html) +- [Python API](http://doc.paddlepaddle.org/develop/doc/api/index_en.html) Our new API enables much shorter programs. -- [How to Contribute](http://www.paddlepaddle.org/develop/doc/howto/dev/contribute_to_paddle_en.html) +- [How to Contribute](http://doc.paddlepaddle.org/develop/doc/howto/dev/contribute_to_paddle_en.html) We appreciate your contributions! From b707ed89f7b84cfd97778564a33586a2fcc279a3 Mon Sep 17 00:00:00 2001 From: LiuYongFeng Date: Mon, 10 Jul 2017 12:26:48 +0800 Subject: [PATCH 048/205] Update write_docs_cn.rst --- doc/howto/dev/write_docs_cn.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/doc/howto/dev/write_docs_cn.rst b/doc/howto/dev/write_docs_cn.rst index d536f53abc..36e5d420c9 100644 --- a/doc/howto/dev/write_docs_cn.rst +++ b/doc/howto/dev/write_docs_cn.rst @@ -41,7 +41,7 @@ PaddlePaddle文档需要准备的环境相对较复杂,所以我们推荐使 python -c "import py_paddle" -如果提示错误,那么用户需要在本地编译安装PaddlePaddle,请参考 `源码编译文档 `_ 。 +如果提示错误,那么用户需要在本地编译安装PaddlePaddle,请参考 `源码编译文档 `_ 。 注意,用户在首次编译安装PaddlePaddle时,请将WITH_DOC选项关闭。在编译安装正确之后,请再次确认py_paddle包已经安装,即可进行下一步操作。 如果提示正确,可以执行以下命令编译生成文档,即 @@ -68,9 +68,9 @@ PaddlePaddle文档使用 `sphinx`_ 自动生成,用户可以参考sphinx教程 如何更新www.paddlepaddle.org文档 ================================ -开发者给PaddlePaddle代码增加的注释以PR的形式提交到github中,提交方式可参见 `贡献文档 `_ 。 -目前PaddlePaddle的develop分支的文档是自动触发更新的,用户可以分别查看最新的 `中文文档 `_ 和 -`英文文档 `_ 。 +开发者给PaddlePaddle代码增加的注释以PR的形式提交到github中,提交方式可参见 `贡献文档 `_ 。 +目前PaddlePaddle的develop分支的文档是自动触发更新的,用户可以分别查看最新的 `中文文档 `_ 和 +`英文文档 `_ 。 From 8718dc5ed559c992c814d549ce44df3ebd9d2c5f Mon Sep 17 00:00:00 2001 From: LiuYongFeng Date: Mon, 10 Jul 2017 12:49:18 +0800 Subject: [PATCH 049/205] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 947c360078..6c8bd0e135 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,7 @@ [![Build Status](https://travis-ci.org/PaddlePaddle/Paddle.svg?branch=develop)](https://travis-ci.org/PaddlePaddle/Paddle) [![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://doc.paddlepaddle.org/develop/doc/) -[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://doc.paddlepaddle.org/doc_cn/) +[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://doc.paddlepaddle.org/develop/doc_cn/) [![Coverage 
Status](https://coveralls.io/repos/github/PaddlePaddle/Paddle/badge.svg?branch=develop)](https://coveralls.io/github/PaddlePaddle/Paddle?branch=develop) [![Release](https://img.shields.io/github/release/PaddlePaddle/Paddle.svg)](https://github.com/PaddlePaddle/Paddle/releases) [![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE) From d93abc8a6c4720271eb3fd708f012f8bd1ed71ac Mon Sep 17 00:00:00 2001 From: LiuYongFeng Date: Mon, 10 Jul 2017 12:50:12 +0800 Subject: [PATCH 050/205] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 6c8bd0e135..e36330b1fd 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,7 @@ [![Build Status](https://travis-ci.org/PaddlePaddle/Paddle.svg?branch=develop)](https://travis-ci.org/PaddlePaddle/Paddle) [![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://doc.paddlepaddle.org/develop/doc/) -[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://doc.paddlepaddle.org/develop/doc_cn/) +[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://doc.paddlepaddle.org/develop/develop/doc_cn/) [![Coverage Status](https://coveralls.io/repos/github/PaddlePaddle/Paddle/badge.svg?branch=develop)](https://coveralls.io/github/PaddlePaddle/Paddle?branch=develop) [![Release](https://img.shields.io/github/release/PaddlePaddle/Paddle.svg)](https://github.com/PaddlePaddle/Paddle/releases) [![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE) From fb6161aad6104e2503299330747839f21fba49b0 Mon Sep 17 00:00:00 2001 From: LiuYongFeng Date: Mon, 10 Jul 2017 12:50:44 +0800 Subject: [PATCH 051/205] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index e36330b1fd..6c8bd0e135 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,7 @@ [![Build Status](https://travis-ci.org/PaddlePaddle/Paddle.svg?branch=develop)](https://travis-ci.org/PaddlePaddle/Paddle) [![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://doc.paddlepaddle.org/develop/doc/) -[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://doc.paddlepaddle.org/develop/develop/doc_cn/) +[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://doc.paddlepaddle.org/develop/doc_cn/) [![Coverage Status](https://coveralls.io/repos/github/PaddlePaddle/Paddle/badge.svg?branch=develop)](https://coveralls.io/github/PaddlePaddle/Paddle?branch=develop) [![Release](https://img.shields.io/github/release/PaddlePaddle/Paddle.svg)](https://github.com/PaddlePaddle/Paddle/releases) [![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE) From 1a968b4f64567d1281dd278a6b412cd823663e43 Mon Sep 17 00:00:00 2001 From: qijun Date: Mon, 10 Jul 2017 20:39:48 +0800 Subject: [PATCH 052/205] init --- paddle/framework/ddim.h | 10 ++++ paddle/framework/tensor.h | 27 ++++++++-- paddle/framework/tensor_types.h | 91 +++++++++++++++++++++++++++++++++ 3 files changed, 124 insertions(+), 4 deletions(-) create mode 100644 paddle/framework/tensor_types.h diff --git a/paddle/framework/ddim.h b/paddle/framework/ddim.h index 223c4180be..053a09d63a 100644 --- a/paddle/framework/ddim.h +++ b/paddle/framework/ddim.h @@ -6,6 +6,7 @@ #include #include "paddle/framework/dim.h" +#include "unsupported/Eigen/CXX11/Tensor" namespace paddle { namespace framework { 
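The next hunk introduces a ToEigenDSizes helper that copies a runtime-ranked DDim into the fixed-rank Eigen::DSizes object that Eigen's tensor types expect. A minimal usage sketch, assuming the make_ddim factory already declared in ddim.h (illustrative only, not part of the patch):

    #include "paddle/framework/ddim.h"

    void ExampleToEigenDSizes() {
      // NDIMS is a compile-time rank; it must equal arity(dims), the
      // runtime rank of the DDim, for the conversion to be meaningful.
      paddle::framework::DDim dims = paddle::framework::make_ddim({2, 3});
      auto sizes = paddle::framework::ToEigenDSizes<2>(dims);
      // sizes[0] == 2 and sizes[1] == 3; `sizes` can now parameterize an
      // Eigen::TensorMap of static rank 2.
    }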
@@ -91,6 +92,15 @@ int arity(const DDim& ddim); std::ostream& operator<<(std::ostream&, const DDim&); +template +Eigen::DSizes ToEigenDSizes(DDim dims) const { + Eigen::DSizes dsizes; + for (int d = 0; d < paddle::framework::arity(dims); d++) { + dsizes[d] = dims[d]; + } + return dsizes; +} + } // namespace framework } // namespace paddle diff --git a/paddle/framework/tensor.h b/paddle/framework/tensor.h index ce5d98b04e..81af430611 100644 --- a/paddle/framework/tensor.h +++ b/paddle/framework/tensor.h @@ -18,8 +18,10 @@ limitations under the License. */ #include #include "paddle/framework/ddim.h" #include "paddle/framework/enforce.h" +#include "paddle/framework/tensor_types.h" #include "paddle/memory/memory.h" #include "paddle/platform/place.h" +#include "unsupported/Eigen/CXX11/Tensor" namespace paddle { namespace framework { @@ -33,6 +35,13 @@ class Tensor { return static_cast(holder_->Ptr()); } + template + T* data() const { + PADDLE_ENFORCE(holder_ != nullptr, + "Tensor::data must be called after Tensor::mutable_data."); + return static_cast(holder_->Ptr()); + } + template ::value>::type* = nullptr> T* mutable_data(DDim dims, paddle::platform::Place place) { @@ -41,14 +50,23 @@ class Tensor { place) /* some versions of boost::variant don't have operator!= */ || holder_->Size() < product(dims) * sizeof(T)) { holder_.reset(new PlaceholderImpl(place, product(dims) * sizeof(T))); + dims_ = dims; } return static_cast(holder_->Ptr()); } - template ::value>::type* = nullptr> - T* mutable_data(DDim dims) { - return mutable_data(dims, paddle::platform::get_place()); + DDim dim() const { return dims_; } + + template + typename TTypes::ConstantTensor Tensor::tensor() { + return typename TTypes::Tensor( + data(), paddle::framework::ToEigenDSizes(dims_)); + } + + template + typename TTypes::Tensor Tensor::tensor() { + return typename TTypes::Tensor( + data(), paddle::framework::ToEigenDSizes(dims_)); } private: @@ -92,6 +110,7 @@ class Tensor { }; std::shared_ptr holder_; // holds the memory block if allocated. + DDim dims_; }; } // namespace framework diff --git a/paddle/framework/tensor_types.h b/paddle/framework/tensor_types.h new file mode 100644 index 0000000000..b68697108c --- /dev/null +++ b/paddle/framework/tensor_types.h @@ -0,0 +1,91 @@ +#pragma once + +#include "unsupported/Eigen/CXX11/Tensor" + +namespace paddle { +namespace framework { + +// Helper to define Tensor types given that the scalar is of type T. +template +struct TTypes { + // Rank- tensor of scalar type T. + typedef Eigen::TensorMap, + Eigen::Aligned> + Tensor; + typedef Eigen::TensorMap< + Eigen::Tensor, Eigen::Aligned> + ConstTensor; + + // Unaligned Rank- tensor of scalar type T. + typedef Eigen::TensorMap> + UnalignedTensor; + typedef Eigen::TensorMap< + Eigen::Tensor> + UnalignedConstTensor; + + typedef Eigen::TensorMap, + Eigen::Aligned> + Tensor32Bit; + + // Scalar tensor (implemented as a rank-0 tensor) of scalar type T. + typedef Eigen::TensorMap< + Eigen::TensorFixedSize, Eigen::RowMajor, IndexType>, + Eigen::Aligned> + Scalar; + typedef Eigen::TensorMap, + Eigen::RowMajor, IndexType>, + Eigen::Aligned> + ConstScalar; + + // Unaligned Scalar tensor of scalar type T. + typedef Eigen::TensorMap< + Eigen::TensorFixedSize, Eigen::RowMajor, IndexType>> + UnalignedScalar; + typedef Eigen::TensorMap, + Eigen::RowMajor, IndexType>> + UnalignedConstScalar; + + // Rank-1 tensor (vector) of scalar type T. 
+ typedef Eigen::TensorMap, + Eigen::Aligned> + Flat; + typedef Eigen::TensorMap< + Eigen::Tensor, Eigen::Aligned> + ConstFlat; + typedef Eigen::TensorMap, + Eigen::Aligned> + Vec; + typedef Eigen::TensorMap< + Eigen::Tensor, Eigen::Aligned> + ConstVec; + + // Unaligned Rank-1 tensor (vector) of scalar type T. + typedef Eigen::TensorMap> + UnalignedFlat; + typedef Eigen::TensorMap< + Eigen::Tensor> + UnalignedConstFlat; + typedef Eigen::TensorMap> + UnalignedVec; + typedef Eigen::TensorMap< + Eigen::Tensor> + UnalignedConstVec; + + // Rank-2 tensor (matrix) of scalar type T. + typedef Eigen::TensorMap, + Eigen::Aligned> + Matrix; + typedef Eigen::TensorMap< + Eigen::Tensor, Eigen::Aligned> + ConstMatrix; + + // Unaligned Rank-2 tensor (matrix) of scalar type T. + typedef Eigen::TensorMap> + UnalignedMatrix; + typedef Eigen::TensorMap< + Eigen::Tensor> + UnalignedConstMatrix; +}; + +} // namespace framework +} // namespace paddle From bc021d775ed333dc9dca217203ee0d2999700813 Mon Sep 17 00:00:00 2001 From: dongzhihong Date: Tue, 11 Jul 2017 09:42:07 +0800 Subject: [PATCH 053/205] "move opContext to DeviceContext" --- paddle/framework/net.cc | 5 +---- paddle/framework/net.h | 6 ++---- paddle/framework/net_proto.proto | 2 +- paddle/framework/net_test.cc | 24 ++++++++++++++++++++++++ paddle/framework/op_registry_test.cc | 2 +- 5 files changed, 29 insertions(+), 10 deletions(-) create mode 100644 paddle/framework/net_test.cc diff --git a/paddle/framework/net.cc b/paddle/framework/net.cc index 8c565c28cb..20c0aef049 100644 --- a/paddle/framework/net.cc +++ b/paddle/framework/net.cc @@ -11,10 +11,7 @@ void PlainNet::InferShape(Scope* scope) { } } -void PlainNet::Run(Scope* scope, OpContext* context, OpIndex begin, - OpIndex end) const { - // TODO Add implementation here. -} +void PlainNet::Run(Scope* scope) const {} } // namespace framework } // namespace paddle diff --git a/paddle/framework/net.h b/paddle/framework/net.h index 2025bfa4b2..ef50133491 100644 --- a/paddle/framework/net.h +++ b/paddle/framework/net.h @@ -69,8 +69,7 @@ class Net { * environment for ops. `begin` and `end` specify the scope of `ops_` to run, * If no positive indexes are provided, all operators in `ops_` will run. */ - virtual void Run(Scope *scope, OpContext *context, OpIndex begin = -1, - OpIndex end = -1) const = 0; + virtual void Run(Scope *scope) const = 0; /** * @brief Add an Operator according to `def`. @@ -124,8 +123,7 @@ class PlainNet : public Net { * scope will be used instead. If no OpContext is provicded, default context * will be used. */ - virtual void Run(Scope *scope = nullptr, OpContext *context = nullptr, - OpIndex begin = -1, OpIndex end = -1) const override; + virtual void Run(Scope *scope) const override; /** * @brief Add an operator to this network. diff --git a/paddle/framework/net_proto.proto b/paddle/framework/net_proto.proto index 2d042457e3..0779f49fe2 100644 --- a/paddle/framework/net_proto.proto +++ b/paddle/framework/net_proto.proto @@ -9,7 +9,7 @@ message NetDesc { // operator contains in network repeated OpProto operators = 2; // network type to run with. e.g "plainNet", "DAG" - optional string type = 3; + optional string net_type = 3; // num worker always optional int32 num_workers = 4; } diff --git a/paddle/framework/net_test.cc b/paddle/framework/net_test.cc new file mode 100644 index 0000000000..04f5efdf79 --- /dev/null +++ b/paddle/framework/net_test.cc @@ -0,0 +1,24 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
+ + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include "paddle/framework/net.h" +#include "paddle/framework/op_registry.h" + +#include + +namespace paddle { +namespace framework { +class FakeFC : public OpBase {} +} // namespace framework +} // namespace paddle diff --git a/paddle/framework/op_registry_test.cc b/paddle/framework/op_registry_test.cc index 17849ca019..ae6b738712 100644 --- a/paddle/framework/op_registry_test.cc +++ b/paddle/framework/op_registry_test.cc @@ -119,4 +119,4 @@ TEST(OpRegistry, CustomChecker) { for (size_t i = 0; i < debug_str.length(); ++i) { ASSERT_EQ(debug_str[i], str[i]); } -} \ No newline at end of file +} From a40a7a5cb1a80f5489800dd6cda329667ac47c4d Mon Sep 17 00:00:00 2001 From: gongweibao Date: Tue, 11 Jul 2017 10:25:30 +0800 Subject: [PATCH 054/205] fix by helin's comments --- go/master/client.go | 6 ++-- go/master/client_internal_test.go | 4 +-- go/master/service.go | 47 +++++++++++++++--------------- go/master/service_internal_test.go | 2 +- 4 files changed, 29 insertions(+), 30 deletions(-) diff --git a/go/master/client.go b/go/master/client.go index bf2612d91b..6f06fd0421 100644 --- a/go/master/client.go +++ b/go/master/client.go @@ -62,7 +62,7 @@ func (c *Client) getRecords() { // We treat a task as finished whenever the last data // instance of the task is read. This is not exactly // correct, but a reasonable approximation. - c.taskFinished(t.ID) + c.taskFinished(t.Meta.ID) } } @@ -113,8 +113,8 @@ func (c *Client) taskFinished(taskID int) error { } // TaskFailed tell the master server as task is failed. -func (c *Client) taskFailed(taskID TaskID) error { - return c.conn.Call("Service.TaskFinished", taskID, nil) +func (c *Client) taskFailed(meta TaskMeta) error { + return c.conn.Call("Service.TaskFinished", meta, nil) } // NextRecord returns next record in the dataset. diff --git a/go/master/client_internal_test.go b/go/master/client_internal_test.go index 364dce7b58..dc4d9eab14 100644 --- a/go/master/client_internal_test.go +++ b/go/master/client_internal_test.go @@ -95,7 +95,7 @@ func TestGetFinishTask(t *testing.T) { t.Fatalf("Should get error, pass: %d\n", i) } - err = c.taskFinished(tasks[0].ID) + err = c.taskFinished(tasks[0].Meta.ID) if err != nil { t.Fatalf("Error: %v, pass: %d\n", err, i) } @@ -107,7 +107,7 @@ func TestGetFinishTask(t *testing.T) { tasks = append(tasks, task) for _, task := range tasks { - err = c.taskFinished(task.ID) + err = c.taskFinished(task.Meta.ID) if err != nil { t.Fatalf("Error: %v, pass: %d\n", err, i) } diff --git a/go/master/service.go b/go/master/service.go index daf3928230..1291ac48f1 100644 --- a/go/master/service.go +++ b/go/master/service.go @@ -31,10 +31,15 @@ type Chunk struct { Index recordio.Index // chunk index } +// TaskMeta is a struct which stores task's meta info. +type TaskMeta struct { + ID int + Epoch int +} + // Task is the basic unit of data instances assigned to trainers. 
type Task struct { - ID int - Epoch int + Meta TaskMeta Chunks []Chunk } @@ -74,7 +79,7 @@ func partition(chunks []Chunk, chunksPerTask int) []taskEntry { var cur taskEntry for i, c := range chunks { if i%chunksPerTask == 0 && len(cur.Task.Chunks) > 0 { - cur.Task.ID = id + cur.Task.Meta.ID = id id++ result = append(result, cur) cur.Task.Chunks = nil @@ -84,7 +89,7 @@ func partition(chunks []Chunk, chunksPerTask int) []taskEntry { } if len(cur.Task.Chunks) > 0 { - cur.Task.ID = id + cur.Task.Meta.ID = id result = append(result, cur) } @@ -258,8 +263,8 @@ func (s *Service) SetDataset(globPaths []string, dummy *int) error { return nil } -func (s *Service) procFailedTask(t taskEntry, epoch int) { - if t.Task.Epoch != epoch { +func (s *Service) processFailedTask(t taskEntry, epoch int) { + if t.Task.Meta.Epoch != epoch { // new epoch, task launched after the // schedule of this timeout check or failed status report. return @@ -272,7 +277,7 @@ func (s *Service) procFailedTask(t taskEntry, epoch int) { } }() - delete(s.taskQueues.Pending, t.Task.ID) + delete(s.taskQueues.Pending, t.Task.Meta.ID) t.NumFailure++ if t.NumFailure > s.failureMax { @@ -296,7 +301,7 @@ func (s *Service) checkTimeoutFunc(taskID int, epoch int) func() { return } - s.procFailedTask(t, epoch) + s.processFailedTask(t, epoch) } } @@ -345,18 +350,18 @@ func (s *Service) GetTask(dummy int, task *Task) error { } t := s.taskQueues.Todo[0] - t.Task.Epoch++ + t.Task.Meta.Epoch++ s.taskQueues.Todo = s.taskQueues.Todo[1:] - s.taskQueues.Pending[t.Task.ID] = t + s.taskQueues.Pending[t.Task.Meta.ID] = t err := s.snapshot() if err != nil { return err } *task = t.Task - log.WithFields(s.logFields()).Infof("Task #%v dispatched.", t) + log.WithFields(s.logFields()).Infof("Task #%v dispatched.", t.Meta) - time.AfterFunc(s.timeoutDur, s.checkTimeoutFunc(t.Task.ID, t.Task.Epoch)) + time.AfterFunc(s.timeoutDur, s.checkTimeoutFunc(t.Task.Meta.ID, t.Task.Meta.Epoch)) return nil } @@ -373,7 +378,7 @@ func (s *Service) TaskFinished(taskID int, dummy *int) error { if !ok { err := errors.New("pending task not found") log.WithFields(s.logFields()).Warningln("Pending task #%d not found.", taskID) - return err + return nil } // task finished, reset timeout @@ -396,14 +401,8 @@ func (s *Service) TaskFinished(taskID int, dummy *int) error { return err } -// TaskID is a struct which client uses for reports failure. -type TaskID struct { - ID int - Epoch int -} - // TaskFailed tells the service that a task is failed. 
-func (s *Service) TaskFailed(taskID TaskID, dummy *int) error { +func (s *Service) TaskFailed(meta TaskMeta, dummy *int) error { select { case <-s.ready: } @@ -411,13 +410,13 @@ func (s *Service) TaskFailed(taskID TaskID, dummy *int) error { s.mu.Lock() defer s.mu.Unlock() - t, ok := s.taskQueues.Pending[taskID.ID] + t, ok := s.taskQueues.Pending[meta.ID] if !ok { err := errors.New("pending task not found") - log.WithFields(s.logFields()).Warningln("TaskFailed:Pending task #%v not found.", taskID) - return err + log.WithFields(s.logFields()).Warningln("TaskFailed:Pending task #%v not found.", t.Meta) + return nil } - s.procFailedTask(t, taskID.Epoch) + s.processFailedTask(t, meta.Epoch) return nil } diff --git a/go/master/service_internal_test.go b/go/master/service_internal_test.go index bc435b505c..9c0d1d0a39 100644 --- a/go/master/service_internal_test.go +++ b/go/master/service_internal_test.go @@ -30,7 +30,7 @@ func TestPartionIndex(t *testing.T) { cs := make([]Chunk, 100) ts := partition(cs, 20) for i := range ts { - if ts[i].Task.ID != i { + if ts[i].Task.Meta.ID != i { t.Error(ts[i], i) } } From 18e65b0c084ef482492b528985173341a24284cc Mon Sep 17 00:00:00 2001 From: dongzhihong Date: Tue, 11 Jul 2017 10:37:41 +0800 Subject: [PATCH 055/205] "support net_proto header" --- paddle/framework/CMakeLists.txt | 2 +- paddle/framework/net.cc | 7 +++++-- paddle/framework/net.h | 14 +++++++------- paddle/framework/net_test.cc | 2 +- paddle/platform/device_context.h | 1 + 5 files changed, 15 insertions(+), 11 deletions(-) diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt index 39cfb46237..e6e3b79d7b 100644 --- a/paddle/framework/CMakeLists.txt +++ b/paddle/framework/CMakeLists.txt @@ -18,4 +18,4 @@ add_custom_target(framework_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch add_dependencies(framework_py_proto framework_py_proto_init) proto_library(net_proto SRCS net_proto.proto DEPS op_proto) -#cc_library(net SRCS net.cc DEPS net_proto attr_type op_proto) +cc_library(net SRCS net.cc DEPS net_proto attr_type op_proto) diff --git a/paddle/framework/net.cc b/paddle/framework/net.cc index 20c0aef049..f0c128d554 100644 --- a/paddle/framework/net.cc +++ b/paddle/framework/net.cc @@ -11,7 +11,10 @@ void PlainNet::InferShape(Scope* scope) { } } -void PlainNet::Run(Scope* scope) const {} - +void PlainNet::Run(Scope* scope, DeviceContext* ctx) { + for (auto& op : ops_) { + op.Run(ctx); + } +} } // namespace framework } // namespace paddle diff --git a/paddle/framework/net.h b/paddle/framework/net.h index ef50133491..b2894320da 100644 --- a/paddle/framework/net.h +++ b/paddle/framework/net.h @@ -17,9 +17,11 @@ #include "paddle/framework/net_proto.pb.h" #include "paddle/framework/op_proto.pb.h" #include "paddle/framework/scope.h" +#include "paddle/platform/device_context.h" namespace paddle { namespace framework { +using namespace paddle::platform; // operator's index stored in a network. typedef int OpIndex; @@ -30,15 +32,13 @@ typedef int OpIndex; */ struct OpDesc; -struct OpDef; -struct OpContext; struct OpAttrs {}; class Operator { public: Operator(const OpDesc &def) {} void InferShape() {} - void Run() {} + void Run(DeviceContext *ctx) {} }; /** @@ -69,12 +69,12 @@ class Net { * environment for ops. `begin` and `end` specify the scope of `ops_` to run, * If no positive indexes are provided, all operators in `ops_` will run. 
   */
-  virtual void Run(Scope *scope) const = 0;
+  virtual void Run(Scope *scope, DeviceContext *ctx) = 0;
 
   /**
    * @brief Add an Operator according to `def`.
    */
-  virtual OpIndex AddOp(const OpDef &def) = 0;
+  virtual OpIndex AddOp(const OpProto &def) = 0;
 
   /**
    * @brief Add optimizer operators according to `attrs`.
@@ -123,12 +123,12 @@ class PlainNet : public Net {
    * scope will be used instead. If no OpContext is provided, default context
    * will be used.
    */
-  virtual void Run(Scope *scope) const override;
+  virtual void Run(Scope *scope, DeviceContext *ctx) override;
 
   /**
    * @brief Add an operator to this network.
    */
-  virtual OpIndex AddOp(const OpDef &def) override;
+  virtual OpIndex AddOp(const OpProto &def) override;
 
   /**
    * @brief Add all optimizer operators related into the network.
diff --git a/paddle/framework/net_test.cc b/paddle/framework/net_test.cc
index 04f5efdf79..a8e31c1497 100644
--- a/paddle/framework/net_test.cc
+++ b/paddle/framework/net_test.cc
@@ -19,6 +19,6 @@
 
 namespace paddle {
 namespace framework {
-class FakeFC : public OpBase {}
+class FakeFC : public Operator {}
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/platform/device_context.h b/paddle/platform/device_context.h
index fcef0a5e30..160eb4e120 100644
--- a/paddle/platform/device_context.h
+++ b/paddle/platform/device_context.h
@@ -36,6 +36,7 @@ class DeviceContext {
 class CPUDeviceContext : public DeviceContext {};
 
 #ifndef PADDLE_ONLY_CPU
+
 class GPUPlaceGuard {
  public:
  explicit GPUPlaceGuard(GPUPlace new_place) : previous_(GetCurrentDeviceId()) {

From dbe16a2122e6c50996907c4e3e3b0aede02c5363 Mon Sep 17 00:00:00 2001
From: LiuYongFeng 
Date: Tue, 11 Jul 2017 10:39:52 +0800
Subject: [PATCH 056/205] Update README.md

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index 6c8bd0e135..2a6beeb342 100644
--- a/README.md
+++ b/README.md
@@ -90,6 +90,7 @@ We provide [English](http://doc.paddlepaddle.org/develop/doc/) and
 
   We appreciate your contributions!
 
+
 ## Ask Questions
 
 You are welcome to submit questions and bug reports as [Github Issues](https://github.com/PaddlePaddle/Paddle/issues).

From b64c7a635dd0898777f27e0b4aac9495bb9c28f0 Mon Sep 17 00:00:00 2001
From: gongweibao 
Date: Tue, 11 Jul 2017 02:41:34 +0000
Subject: [PATCH 057/205] fix by helin's comments

---
 go/master/client.go               | 2 +-
 go/master/client_internal_test.go | 6 ++++++
 go/master/service.go              | 6 ++----
 3 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/go/master/client.go b/go/master/client.go
index 6f06fd0421..59bac6874c 100644
--- a/go/master/client.go
+++ b/go/master/client.go
@@ -114,7 +114,7 @@ func (c *Client) taskFinished(taskID int) error {
 
 // TaskFailed tells the master server that a task failed.
 func (c *Client) taskFailed(meta TaskMeta) error {
-	return c.conn.Call("Service.TaskFinished", meta, nil)
+	return c.conn.Call("Service.TaskFailed", meta, nil)
 }
 
 // NextRecord returns next record in the dataset.
diff --git a/go/master/client_internal_test.go b/go/master/client_internal_test.go index dc4d9eab14..49263474c8 100644 --- a/go/master/client_internal_test.go +++ b/go/master/client_internal_test.go @@ -99,6 +99,12 @@ func TestGetFinishTask(t *testing.T) { if err != nil { t.Fatalf("Error: %v, pass: %d\n", err, i) } + + err = c.taskFailed(tasks[0].Meta) + if err != nil { + t.Fatalf("Error: %v, pass: %d\n", err, i) + } + tasks = tasks[1:] task, err := c.getTask() if err != nil { diff --git a/go/master/service.go b/go/master/service.go index 1291ac48f1..a6050ab994 100644 --- a/go/master/service.go +++ b/go/master/service.go @@ -359,7 +359,7 @@ func (s *Service) GetTask(dummy int, task *Task) error { } *task = t.Task - log.WithFields(s.logFields()).Infof("Task #%v dispatched.", t.Meta) + log.WithFields(s.logFields()).Infof("Task #%v dispatched.", t.Task.Meta) time.AfterFunc(s.timeoutDur, s.checkTimeoutFunc(t.Task.Meta.ID, t.Task.Meta.Epoch)) return nil @@ -376,7 +376,6 @@ func (s *Service) TaskFinished(taskID int, dummy *int) error { t, ok := s.taskQueues.Pending[taskID] if !ok { - err := errors.New("pending task not found") log.WithFields(s.logFields()).Warningln("Pending task #%d not found.", taskID) return nil } @@ -412,8 +411,7 @@ func (s *Service) TaskFailed(meta TaskMeta, dummy *int) error { t, ok := s.taskQueues.Pending[meta.ID] if !ok { - err := errors.New("pending task not found") - log.WithFields(s.logFields()).Warningln("TaskFailed:Pending task #%v not found.", t.Meta) + log.WithFields(s.logFields()).Warningln("TaskFailed:Pending task #%v not found.", t.Task.Meta) return nil } From 8e4d04376dbb5e996ad0948c5f8c6b8705b17e4a Mon Sep 17 00:00:00 2001 From: qijun Date: Tue, 11 Jul 2017 10:42:35 +0800 Subject: [PATCH 058/205] fix dyload link undefined reference dlsym --- cmake/generic.cmake | 1 + 1 file changed, 1 insertion(+) diff --git a/cmake/generic.cmake b/cmake/generic.cmake index 725cf28037..83e3d155d0 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -93,6 +93,7 @@ include_directories(${CMAKE_CURRENT_BINARY_DIR}) if(NOT APPLE) find_package(Threads REQUIRED) link_libraries(${CMAKE_THREAD_LIBS_INIT}) + set(CMAKE_CXX_LINK_EXECUTABLE "${CMAKE_CXX_LINK_EXECUTABLE} -ldl") endif(NOT APPLE) function(merge_static_libs TARGET_NAME) From dd8685ff1c4d0890e1c0ea58126a666b85ef4f92 Mon Sep 17 00:00:00 2001 From: gongweibao Date: Tue, 11 Jul 2017 02:47:00 +0000 Subject: [PATCH 059/205] fix bug --- go/pserver/client/client_test.go | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/go/pserver/client/client_test.go b/go/pserver/client/client_test.go index 29b400812c..2b72a202b5 100644 --- a/go/pserver/client/client_test.go +++ b/go/pserver/client/client_test.go @@ -42,7 +42,8 @@ func initClient() [numPserver]int { ports[i] = p go func(l net.Listener) { - s, err := pserver.NewService(0) + var cp pserver.Checkpoint + s, err := pserver.NewService(0, 1, "", nil, cp) if err != nil { panic(err) } @@ -174,7 +175,7 @@ func TestNativeClient(t *testing.T) { // TODO: tmperary disable etcdClient test for dependency of etcd) func EtcdClient(t *testing.T) { initEtcdClient() - etcd_client := client.NewEtcd(etcdEndpoints) - c2 := client.NewClient(etcd_client, etcd_client.Desired(), selector(true)) + etcdClient := client.NewEtcd(etcdEndpoints) + c2 := client.NewClient(etcdClient, etcdClient.Desired(), selector(true)) ClientTest(t, c2) } From b871641a5315b10bfb1d0776e288dd25ef2969d2 Mon Sep 17 00:00:00 2001 From: dongzhihong Date: Tue, 11 Jul 2017 10:53:48 +0800 Subject: [PATCH 
060/205] "switch to shared_ptr" --- paddle/framework/net.cc | 2 +- paddle/framework/net.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/paddle/framework/net.cc b/paddle/framework/net.cc index f0c128d554..73b3051235 100644 --- a/paddle/framework/net.cc +++ b/paddle/framework/net.cc @@ -11,7 +11,7 @@ void PlainNet::InferShape(Scope* scope) { } } -void PlainNet::Run(Scope* scope, DeviceContext* ctx) { +void PlainNet::Run(std::shared_ptr scope, DeviceContext* ctx) { for (auto& op : ops_) { op.Run(ctx); } diff --git a/paddle/framework/net.h b/paddle/framework/net.h index b2894320da..76992e0728 100644 --- a/paddle/framework/net.h +++ b/paddle/framework/net.h @@ -69,7 +69,7 @@ class Net { * environment for ops. `begin` and `end` specify the scope of `ops_` to run, * If no positive indexes are provided, all operators in `ops_` will run. */ - virtual void Run(Scope *scope, DeviceContext *ctx) = 0; + virtual void Run(std::shared_ptr scope, DeviceContext *ctx) = 0; /** * @brief Add an Operator according to `def`. @@ -123,7 +123,7 @@ class PlainNet : public Net { * scope will be used instead. If no OpContext is provicded, default context * will be used. */ - virtual void Run(Scope *scope, DeviceContext *ctx) override; + virtual void Run(std::shared_ptr scope, DeviceContext *ctx) override; /** * @brief Add an operator to this network. From d6f7c3535d0907af4e2d955451e9a872d6b857a3 Mon Sep 17 00:00:00 2001 From: qijun Date: Tue, 11 Jul 2017 12:52:07 +0800 Subject: [PATCH 061/205] move unaligned tensor types --- paddle/framework/tensor_types.h | 38 --------------------------------- 1 file changed, 38 deletions(-) diff --git a/paddle/framework/tensor_types.h b/paddle/framework/tensor_types.h index b68697108c..26de25b7c2 100644 --- a/paddle/framework/tensor_types.h +++ b/paddle/framework/tensor_types.h @@ -16,17 +16,6 @@ struct TTypes { Eigen::Tensor, Eigen::Aligned> ConstTensor; - // Unaligned Rank- tensor of scalar type T. - typedef Eigen::TensorMap> - UnalignedTensor; - typedef Eigen::TensorMap< - Eigen::Tensor> - UnalignedConstTensor; - - typedef Eigen::TensorMap, - Eigen::Aligned> - Tensor32Bit; - // Scalar tensor (implemented as a rank-0 tensor) of scalar type T. typedef Eigen::TensorMap< Eigen::TensorFixedSize, Eigen::RowMajor, IndexType>, @@ -37,14 +26,6 @@ struct TTypes { Eigen::Aligned> ConstScalar; - // Unaligned Scalar tensor of scalar type T. - typedef Eigen::TensorMap< - Eigen::TensorFixedSize, Eigen::RowMajor, IndexType>> - UnalignedScalar; - typedef Eigen::TensorMap, - Eigen::RowMajor, IndexType>> - UnalignedConstScalar; - // Rank-1 tensor (vector) of scalar type T. typedef Eigen::TensorMap, Eigen::Aligned> @@ -59,18 +40,6 @@ struct TTypes { Eigen::Tensor, Eigen::Aligned> ConstVec; - // Unaligned Rank-1 tensor (vector) of scalar type T. - typedef Eigen::TensorMap> - UnalignedFlat; - typedef Eigen::TensorMap< - Eigen::Tensor> - UnalignedConstFlat; - typedef Eigen::TensorMap> - UnalignedVec; - typedef Eigen::TensorMap< - Eigen::Tensor> - UnalignedConstVec; - // Rank-2 tensor (matrix) of scalar type T. typedef Eigen::TensorMap, Eigen::Aligned> @@ -78,13 +47,6 @@ struct TTypes { typedef Eigen::TensorMap< Eigen::Tensor, Eigen::Aligned> ConstMatrix; - - // Unaligned Rank-2 tensor (matrix) of scalar type T. 
- typedef Eigen::TensorMap> - UnalignedMatrix; - typedef Eigen::TensorMap< - Eigen::Tensor> - UnalignedConstMatrix; }; } // namespace framework From 958511160bc42fee48c9ad775dfb08e5198bf3e9 Mon Sep 17 00:00:00 2001 From: qijun Date: Tue, 11 Jul 2017 13:40:44 +0800 Subject: [PATCH 062/205] add simple add_op_functor --- paddle/framework/ddim.cc | 12 ++++++++ paddle/framework/ddim.h | 8 +----- paddle/framework/tensor.h | 47 +++++++++++++++++++++++++++++-- paddle/framework/tensor_types.h | 14 +++++++++ paddle/operators/add_op_functor.h | 35 +++++++++++++++++++++++ 5 files changed, 107 insertions(+), 9 deletions(-) create mode 100644 paddle/operators/add_op_functor.h diff --git a/paddle/framework/ddim.cc b/paddle/framework/ddim.cc index 3f949a6595..9431645cf5 100644 --- a/paddle/framework/ddim.cc +++ b/paddle/framework/ddim.cc @@ -1,4 +1,5 @@ #include "paddle/framework/ddim.h" +#include "paddle/framework/enforce.h" namespace paddle { namespace framework { @@ -220,5 +221,16 @@ std::ostream& operator<<(std::ostream& os, const DDim& ddim) { return os; } +template +Eigen::DSizes ToEigenDSizes(DDim dims) const { + int rank = paddle::framework::arity(dims); + PADDLE_ENFORCE(rank == NDIMS, "DDim and NDIMS must be same") + Eigen::DSizes dsizes; + for (int d = 0; d < paddle::framework::arity(dims); d++) { + dsizes[d] = dims[d]; + } + return dsizes; +} + } // namespace framework } // namespace paddle diff --git a/paddle/framework/ddim.h b/paddle/framework/ddim.h index 053a09d63a..a83a367196 100644 --- a/paddle/framework/ddim.h +++ b/paddle/framework/ddim.h @@ -93,13 +93,7 @@ int arity(const DDim& ddim); std::ostream& operator<<(std::ostream&, const DDim&); template -Eigen::DSizes ToEigenDSizes(DDim dims) const { - Eigen::DSizes dsizes; - for (int d = 0; d < paddle::framework::arity(dims); d++) { - dsizes[d] = dims[d]; - } - return dsizes; -} +Eigen::DSizes ToEigenDSizes(DDim dims) const; } // namespace framework } // namespace paddle diff --git a/paddle/framework/tensor.h b/paddle/framework/tensor.h index 81af430611..0fa74e7ab1 100644 --- a/paddle/framework/tensor.h +++ b/paddle/framework/tensor.h @@ -57,18 +57,61 @@ class Tensor { DDim dim() const { return dims_; } + size_t NumElements() const { return product(dims_); } + + template + typename TTypes::Tensor Tensor::shaped(DDim new_dims) { + Eigen::array dims = + paddle::framework::ToEigenDSizes(new_dims); + return typename TTypes::Tensor(data(), dims); + } + template - typename TTypes::ConstantTensor Tensor::tensor() { + typename TTypes::Tensor Tensor::tensor() { return typename TTypes::Tensor( data(), paddle::framework::ToEigenDSizes(dims_)); } + // flat to rank = 1 + template + typename TTypes::Flat flat() { + return shaped({NumElements()}); + } + + // to TensorType Vec + template + typename TTypes::Vec vec() { + return tensor(); + } + + // to TensorType Matrix + template + typename TTypes::Matrix matrix() { + return tensor(); + } + + // const versions of all the methods above. template - typename TTypes::Tensor Tensor::tensor() { + typename TTypes::ConstantTensor Tensor::tensor() const { return typename TTypes::Tensor( data(), paddle::framework::ToEigenDSizes(dims_)); } + template + typename TTypes::ConstFlat flat() const { + return shaped({NumElements()}); + } + + template + typename TTypes::ConstVec vec() const { + return tensor(); + } + + template + typename TTypes::ConstMatrix matrix() const { + return tensor(); + } + private: // Placeholder hides type T, so it doesn't appear as a template // parameter of Variable. 
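The Tensor accessors added above all bottom out in Eigen::TensorMap, which wraps an existing buffer without copying; flat(), vec() and matrix() differ only in the static rank they hand to Eigen. A self-contained sketch of the same idea in plain Eigen (the buffer and variable names are local to this example, not part of the patch):

    #include "unsupported/Eigen/CXX11/Tensor"

    int main() {
      float buf[6] = {0, 1, 2, 3, 4, 5};
      // Rank-2 (matrix) view over existing storage -- no copy is made.
      Eigen::TensorMap<Eigen::Tensor<float, 2, Eigen::RowMajor>> mat(buf, 2, 3);
      // Rank-1 ("flat") view over the very same storage.
      Eigen::TensorMap<Eigen::Tensor<float, 1, Eigen::RowMajor>> flat(buf, 6);
      mat(1, 2) = 42.0f;                // writes through to buf[5]
      return flat(5) == 42.0f ? 0 : 1;  // returns 0: both views alias buf
    }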
diff --git a/paddle/framework/tensor_types.h b/paddle/framework/tensor_types.h
index 26de25b7c2..4bf27a377e 100644
--- a/paddle/framework/tensor_types.h
+++ b/paddle/framework/tensor_types.h
@@ -1,3 +1,17 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
 #pragma once
 
 #include "unsupported/Eigen/CXX11/Tensor"
diff --git a/paddle/operators/add_op_functor.h b/paddle/operators/add_op_functor.h
new file mode 100644
index 0000000000..904f24b030
--- /dev/null
+++ b/paddle/operators/add_op_functor.h
@@ -0,0 +1,35 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/framework/tensor_types.h"
+#include "unsupported/Eigen/CXX11/Tensor"
+
+namespace paddle {
+namespace operators {
+namespace functor {
+
+template <typename Device, typename T>
+struct Add {
+  void Operator()(const Device& d,
+                  typename TTypes<T>::ConstTensor input1,
+                  typename TTypes<T>::ConstTensor input2,
+                  typename TTypes<T>::Tensor output) {
+    output.device(d) = input1 + input2;
+  }
+};
+}  // namespace functor
+}  // namespace operators
+}  // namespace paddle

From 313e9f551fe0db22cbf5ccbeee5a744eab5892ed Mon Sep 17 00:00:00 2001
From: Yu Yang 
Date: Tue, 11 Jul 2017 13:57:03 +0800
Subject: [PATCH 063/205] Fix slow parsing of topologies with recursive
 dependencies

* Fix #2797
* It happened because trainer_config_helpers' __dfs_travel__ did not
  record which nodes it had already travelled, so if the topology has a
  recursive dependency, some nodes will be travelled multiple times
  (see the sketch after this list).
* Add a `traveled` set to record which nodes have been travelled.
* Also add a unittest for this situation.
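The core of the fix is memoizing the traversal: each node is expanded once, so a deep chain of self-referencing addto layers costs linear rather than exponential time. A hedged sketch of the idea in C++ (Node is a hypothetical stand-in for LayerOutput; the actual change is the Python `traveled` set in networks.py below):

    #include <set>
    #include <vector>

    // Hypothetical node type standing in for a LayerOutput.
    struct Node {
      std::vector<const Node*> parents;
    };

    // Visits every node reachable through `parents` exactly once. Without
    // `traveled`, a layer built as addto_layer([enc, enc]) lists the same
    // parent twice, doubling the work at each level of the topology.
    void DfsTravel(const Node* node, std::set<const Node*>* traveled) {
      if (!traveled->insert(node).second) return;  // already visited
      for (const Node* parent : node->parents) DfsTravel(parent, traveled);
    }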
--- .../paddle/trainer_config_helpers/networks.py | 7 + .../tests/configs/file_list.sh | 3 +- .../test_cost_layers_with_weight.protostr | 2 + .../protostr/test_recursive_topology.protostr | 497 ++++++++++++++++++ .../tests/configs/test_recursive_topology.py | 16 + 5 files changed, 524 insertions(+), 1 deletion(-) create mode 100644 python/paddle/trainer_config_helpers/tests/configs/protostr/test_recursive_topology.protostr create mode 100644 python/paddle/trainer_config_helpers/tests/configs/test_recursive_topology.py diff --git a/python/paddle/trainer_config_helpers/networks.py b/python/paddle/trainer_config_helpers/networks.py index b77932ce5f..5cbfe600e4 100755 --- a/python/paddle/trainer_config_helpers/networks.py +++ b/python/paddle/trainer_config_helpers/networks.py @@ -1408,6 +1408,8 @@ def outputs(layers, *args): :return: """ + traveled = set() + def __dfs_travel__(layer, predicate=lambda x: x.layer_type == LayerType.DATA): """ @@ -1419,6 +1421,11 @@ def outputs(layers, *args): :type layer: LayerOutput :return: """ + if layer in traveled: + return [] + else: + traveled.add(layer) + assert isinstance(layer, LayerOutput), "layer is %s" % (layer) retv = [] if layer.parents is not None: diff --git a/python/paddle/trainer_config_helpers/tests/configs/file_list.sh b/python/paddle/trainer_config_helpers/tests/configs/file_list.sh index a939c41ad0..70e342fb79 100755 --- a/python/paddle/trainer_config_helpers/tests/configs/file_list.sh +++ b/python/paddle/trainer_config_helpers/tests/configs/file_list.sh @@ -6,6 +6,7 @@ img_layers img_trans_layers util_layers simple_rnn_layers unused_layers test_cos test_rnn_group shared_fc shared_lstm shared_gru test_cost_layers_with_weight test_spp_layer test_bilinear_interp test_maxout test_bi_grumemory math_ops test_seq_concat_reshape test_pad test_smooth_l1 test_multiplex_layer -test_prelu_layer test_row_conv test_detection_output_layer test_multibox_loss_layer) +test_prelu_layer test_row_conv test_detection_output_layer test_multibox_loss_layer +test_recursive_topology) export whole_configs=(test_split_datasource) diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers_with_weight.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers_with_weight.protostr index b7d74f85ab..96fb1d4ebd 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers_with_weight.protostr +++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers_with_weight.protostr @@ -131,6 +131,7 @@ input_layer_names: "weight" input_layer_names: "multi_class_label" output_layer_names: "__cost_0__" output_layer_names: "__mse_cost_0__" +output_layer_names: "__nce_layer_0__" evaluators { name: "classification_error_evaluator" type: "classification_error" @@ -154,6 +155,7 @@ sub_models { input_layer_names: "multi_class_label" output_layer_names: "__cost_0__" output_layer_names: "__mse_cost_0__" + output_layer_names: "__nce_layer_0__" evaluator_names: "classification_error_evaluator" is_recurrent_layer_group: false } diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_recursive_topology.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_recursive_topology.protostr new file mode 100644 index 0000000000..8133aa9c8d --- /dev/null +++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_recursive_topology.protostr @@ -0,0 +1,497 @@ +type: "nn" +layers { + name: "data" + type: "data" + size: 100 + active_type: "" +} 
+layers { + name: "__addto_0__" + type: "addto" + size: 100 + active_type: "" + inputs { + input_layer_name: "data" + } + inputs { + input_layer_name: "data" + } +} +layers { + name: "__addto_1__" + type: "addto" + size: 100 + active_type: "" + inputs { + input_layer_name: "__addto_0__" + } + inputs { + input_layer_name: "__addto_0__" + } +} +layers { + name: "__addto_2__" + type: "addto" + size: 100 + active_type: "" + inputs { + input_layer_name: "__addto_1__" + } + inputs { + input_layer_name: "__addto_1__" + } +} +layers { + name: "__addto_3__" + type: "addto" + size: 100 + active_type: "" + inputs { + input_layer_name: "__addto_2__" + } + inputs { + input_layer_name: "__addto_2__" + } +} +layers { + name: "__addto_4__" + type: "addto" + size: 100 + active_type: "" + inputs { + input_layer_name: "__addto_3__" + } + inputs { + input_layer_name: "__addto_3__" + } +} +layers { + name: "__addto_5__" + type: "addto" + size: 100 + active_type: "" + inputs { + input_layer_name: "__addto_4__" + } + inputs { + input_layer_name: "__addto_4__" + } +} +layers { + name: "__addto_6__" + type: "addto" + size: 100 + active_type: "" + inputs { + input_layer_name: "__addto_5__" + } + inputs { + input_layer_name: "__addto_5__" + } +} +layers { + name: "__addto_7__" + type: "addto" + size: 100 + active_type: "" + inputs { + input_layer_name: "__addto_6__" + } + inputs { + input_layer_name: "__addto_6__" + } +} +layers { + name: "__addto_8__" + type: "addto" + size: 100 + active_type: "" + inputs { + input_layer_name: "__addto_7__" + } + inputs { + input_layer_name: "__addto_7__" + } +} +layers { + name: "__addto_9__" + type: "addto" + size: 100 + active_type: "" + inputs { + input_layer_name: "__addto_8__" + } + inputs { + input_layer_name: "__addto_8__" + } +} +layers { + name: "__addto_10__" + type: "addto" + size: 100 + active_type: "" + inputs { + input_layer_name: "__addto_9__" + } + inputs { + input_layer_name: "__addto_9__" + } +} +layers { + name: "__addto_11__" + type: "addto" + size: 100 + active_type: "" + inputs { + input_layer_name: "__addto_10__" + } + inputs { + input_layer_name: "__addto_10__" + } +} +layers { + name: "__addto_12__" + type: "addto" + size: 100 + active_type: "" + inputs { + input_layer_name: "__addto_11__" + } + inputs { + input_layer_name: "__addto_11__" + } +} +layers { + name: "__addto_13__" + type: "addto" + size: 100 + active_type: "" + inputs { + input_layer_name: "__addto_12__" + } + inputs { + input_layer_name: "__addto_12__" + } +} +layers { + name: "__addto_14__" + type: "addto" + size: 100 + active_type: "" + inputs { + input_layer_name: "__addto_13__" + } + inputs { + input_layer_name: "__addto_13__" + } +} +layers { + name: "__addto_15__" + type: "addto" + size: 100 + active_type: "" + inputs { + input_layer_name: "__addto_14__" + } + inputs { + input_layer_name: "__addto_14__" + } +} +layers { + name: "__addto_16__" + type: "addto" + size: 100 + active_type: "" + inputs { + input_layer_name: "__addto_15__" + } + inputs { + input_layer_name: "__addto_15__" + } +} +layers { + name: "__addto_17__" + type: "addto" + size: 100 + active_type: "" + inputs { + input_layer_name: "__addto_16__" + } + inputs { + input_layer_name: "__addto_16__" + } +} +layers { + name: "__addto_18__" + type: "addto" + size: 100 + active_type: "" + inputs { + input_layer_name: "__addto_17__" + } + inputs { + input_layer_name: "__addto_17__" + } +} +layers { + name: "__addto_19__" + type: "addto" + size: 100 + active_type: "" + inputs { + input_layer_name: "__addto_18__" + } + inputs { + 
input_layer_name: "__addto_18__" + } +} +layers { + name: "__addto_20__" + type: "addto" + size: 100 + active_type: "" + inputs { + input_layer_name: "__addto_19__" + } + inputs { + input_layer_name: "__addto_19__" + } +} +layers { + name: "__addto_21__" + type: "addto" + size: 100 + active_type: "" + inputs { + input_layer_name: "__addto_20__" + } + inputs { + input_layer_name: "__addto_20__" + } +} +layers { + name: "__addto_22__" + type: "addto" + size: 100 + active_type: "" + inputs { + input_layer_name: "__addto_21__" + } + inputs { + input_layer_name: "__addto_21__" + } +} +layers { + name: "__addto_23__" + type: "addto" + size: 100 + active_type: "" + inputs { + input_layer_name: "__addto_22__" + } + inputs { + input_layer_name: "__addto_22__" + } +} +layers { + name: "__addto_24__" + type: "addto" + size: 100 + active_type: "" + inputs { + input_layer_name: "__addto_23__" + } + inputs { + input_layer_name: "__addto_23__" + } +} +layers { + name: "__addto_25__" + type: "addto" + size: 100 + active_type: "" + inputs { + input_layer_name: "__addto_24__" + } + inputs { + input_layer_name: "__addto_24__" + } +} +layers { + name: "__addto_26__" + type: "addto" + size: 100 + active_type: "" + inputs { + input_layer_name: "__addto_25__" + } + inputs { + input_layer_name: "__addto_25__" + } +} +layers { + name: "__addto_27__" + type: "addto" + size: 100 + active_type: "" + inputs { + input_layer_name: "__addto_26__" + } + inputs { + input_layer_name: "__addto_26__" + } +} +layers { + name: "__addto_28__" + type: "addto" + size: 100 + active_type: "" + inputs { + input_layer_name: "__addto_27__" + } + inputs { + input_layer_name: "__addto_27__" + } +} +layers { + name: "__addto_29__" + type: "addto" + size: 100 + active_type: "" + inputs { + input_layer_name: "__addto_28__" + } + inputs { + input_layer_name: "__addto_28__" + } +} +layers { + name: "__addto_30__" + type: "addto" + size: 100 + active_type: "" + inputs { + input_layer_name: "__addto_29__" + } + inputs { + input_layer_name: "__addto_29__" + } +} +layers { + name: "__addto_31__" + type: "addto" + size: 100 + active_type: "" + inputs { + input_layer_name: "__addto_30__" + } + inputs { + input_layer_name: "__addto_30__" + } +} +layers { + name: "__fc_layer_0__" + type: "fc" + size: 32 + active_type: "relu" + inputs { + input_layer_name: "__addto_31__" + input_parameter_name: "___fc_layer_0__.w0" + } + bias_parameter_name: "___fc_layer_0__.wbias" +} +layers { + name: "__fc_layer_1__" + type: "fc" + size: 10 + active_type: "softmax" + inputs { + input_layer_name: "__fc_layer_0__" + input_parameter_name: "___fc_layer_1__.w0" + } + bias_parameter_name: "___fc_layer_1__.wbias" +} +parameters { + name: "___fc_layer_0__.w0" + size: 3200 + initial_mean: 0.0 + initial_std: 0.1 + dims: 100 + dims: 32 + initial_strategy: 0 + initial_smart: true +} +parameters { + name: "___fc_layer_0__.wbias" + size: 32 + initial_mean: 0.0 + initial_std: 0.0 + dims: 1 + dims: 32 + initial_strategy: 0 + initial_smart: false +} +parameters { + name: "___fc_layer_1__.w0" + size: 320 + initial_mean: 0.0 + initial_std: 0.176776695297 + dims: 32 + dims: 10 + initial_strategy: 0 + initial_smart: true +} +parameters { + name: "___fc_layer_1__.wbias" + size: 10 + initial_mean: 0.0 + initial_std: 0.0 + dims: 1 + dims: 10 + initial_strategy: 0 + initial_smart: false +} +input_layer_names: "data" +output_layer_names: "__fc_layer_1__" +sub_models { + name: "root" + layer_names: "data" + layer_names: "__addto_0__" + layer_names: "__addto_1__" + layer_names: "__addto_2__" + 
layer_names: "__addto_3__" + layer_names: "__addto_4__" + layer_names: "__addto_5__" + layer_names: "__addto_6__" + layer_names: "__addto_7__" + layer_names: "__addto_8__" + layer_names: "__addto_9__" + layer_names: "__addto_10__" + layer_names: "__addto_11__" + layer_names: "__addto_12__" + layer_names: "__addto_13__" + layer_names: "__addto_14__" + layer_names: "__addto_15__" + layer_names: "__addto_16__" + layer_names: "__addto_17__" + layer_names: "__addto_18__" + layer_names: "__addto_19__" + layer_names: "__addto_20__" + layer_names: "__addto_21__" + layer_names: "__addto_22__" + layer_names: "__addto_23__" + layer_names: "__addto_24__" + layer_names: "__addto_25__" + layer_names: "__addto_26__" + layer_names: "__addto_27__" + layer_names: "__addto_28__" + layer_names: "__addto_29__" + layer_names: "__addto_30__" + layer_names: "__addto_31__" + layer_names: "__fc_layer_0__" + layer_names: "__fc_layer_1__" + input_layer_names: "data" + output_layer_names: "__fc_layer_1__" + is_recurrent_layer_group: false +} + diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_recursive_topology.py b/python/paddle/trainer_config_helpers/tests/configs/test_recursive_topology.py new file mode 100644 index 0000000000..1a693f8dff --- /dev/null +++ b/python/paddle/trainer_config_helpers/tests/configs/test_recursive_topology.py @@ -0,0 +1,16 @@ +from paddle.trainer_config_helpers import * + +settings(batch_size=1000, learning_rate=1e-5) + +din = data_layer(name='data', size=100) + +enc = din +for i in range(32): + enc = addto_layer([enc, enc]) + +pred = fc_layer( + input=fc_layer( + input=enc, size=32, act=ReluActivation()), + size=10, + act=SoftmaxActivation()) +outputs(pred) From abff52abcbb6f2f3d015b1955228089007bfeb30 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Tue, 11 Jul 2017 01:10:48 -0500 Subject: [PATCH 064/205] Simplelize framework/CMakeLists.txt (#2803) * generic.cmake can propogate dependencies through libraries. It is no need to specific all dependencies. 
---
 paddle/framework/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt
index e6e3b79d7b..aecc97d4a8 100644
--- a/paddle/framework/CMakeLists.txt
+++ b/paddle/framework/CMakeLists.txt
@@ -18,4 +18,4 @@ add_custom_target(framework_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch
 add_dependencies(framework_py_proto framework_py_proto_init)
 
 proto_library(net_proto SRCS net_proto.proto DEPS op_proto)
-cc_library(net SRCS net.cc DEPS net_proto attr_type op_proto)
+cc_library(net SRCS net.cc DEPS net_proto)

From d607f0b70308c61e5399773a475b8e8c640e63c1 Mon Sep 17 00:00:00 2001
From: qijun 
Date: Tue, 11 Jul 2017 14:15:45 +0800
Subject: [PATCH 065/205] use cached rank

---
 paddle/framework/ddim.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddle/framework/ddim.cc b/paddle/framework/ddim.cc
index 9431645cf5..3fd3e538e8 100644
--- a/paddle/framework/ddim.cc
+++ b/paddle/framework/ddim.cc
@@ -226,7 +226,7 @@ Eigen::DSizes ToEigenDSizes(DDim dims) const {
   int rank = paddle::framework::arity(dims);
   PADDLE_ENFORCE(rank == NDIMS, "DDim and NDIMS must be same")
   Eigen::DSizes dsizes;
-  for (int d = 0; d < paddle::framework::arity(dims); d++) {
+  for (int d = 0; d < rank; d++) {
     dsizes[d] = dims[d];
   }
   return dsizes;

From d4017cadcd0fa07d8874e052ffa91700ebb32a05 Mon Sep 17 00:00:00 2001
From: liaogang 
Date: Tue, 11 Jul 2017 15:18:38 +0800
Subject: [PATCH 066/205] ENH: Add auto-free if allocating too much

---
 paddle/memory/detail/buddy_allocator.cc | 69 ++++++++++++++++++++++++-
 paddle/memory/detail/buddy_allocator.h  |  3 ++
 2 files changed, 70 insertions(+), 2 deletions(-)

diff --git a/paddle/memory/detail/buddy_allocator.cc b/paddle/memory/detail/buddy_allocator.cc
index 3f630973e9..27c1b4033b 100644
--- a/paddle/memory/detail/buddy_allocator.cc
+++ b/paddle/memory/detail/buddy_allocator.cc
@@ -152,7 +152,7 @@ void BuddyAllocator::Free(void* p) {
       IndexSizeAddress(block->index(cache_), block->total_size(cache_), block));
 
   // Clean up if there is too much free memory
-
+
   // Prefer freeing fallback allocation first
   CleanIdleFallBackAlloc();
 
@@ -198,6 +198,12 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool() {
   static_cast<MemoryBlock*>(p)->init(cache_, MemoryBlock::FREE_CHUNK, index,
                                      max_chunk_size_, nullptr, nullptr);
 
+  // gpu fallback allocation
+  if (system_allocator_->UseGpu() &&
+      static_cast<MemoryBlock*>(p)->index(cache_) == 1) {
+    fallback_alloc_count_++;
+  }
+
   total_free_ += max_chunk_size_;
 
   // dump the block into pool
@@ -256,9 +262,68 @@ void* BuddyAllocator::SplitToAlloc(BuddyAllocator::PoolSet::iterator it,
 }
 
 void BuddyAllocator::CleanIdleFallBackAlloc() {
-
+  // If no fallback allocation exists, return directly
+  if (!fallback_alloc_count_) return;
+
+  for (auto pool = pool_.rbegin(); pool != pool_.rend();) {
+    // If the free memory block is less than max_chunk_size_, return directly
+    if (std::get<1>(*pool) < max_chunk_size_) return;
+
+    MemoryBlock* block = static_cast<MemoryBlock*>(std::get<2>(*pool));
+
+    // If there is no GPU fallback allocator, return
+    if (!system_allocator_->UseGpu() || block->index(cache_) == 0) {
+      return;
+    }
+
+    DLOG(INFO) << "Return block " << block << " to fallback allocator.";
+
+    system_allocator_->Free(block, max_chunk_size_, block->index(cache_));
+    cache_.invalidate(block);
+
+    pool = PoolSet::reverse_iterator(pool_.erase(std::next(pool).base()));
+
+    total_free_ -= max_chunk_size_;
+    fallback_alloc_count_--;
+
+    // If no fallback allocation exists, return directly
+    if (!fallback_alloc_count_) return;
+  }
+}
+
+void BuddyAllocator::CleanIdleNormalAlloc() {
+  auto shall_free_alloc = [&]() -> bool {
+    // free all fallback allocations
+    if (fallback_alloc_count_ > 0) {
+      return true;
+    }
+    // keep 2x overhead if we haven't fallen back
+    if ((total_used_ + max_chunk_size_) * 2 < total_free_) {
+      return true;
+    }
+    return false;
+  };
+
+  if (!shall_free_alloc()) return;
+
+  for (auto pool = pool_.rbegin(); pool != pool_.rend();) {
+    // If the free memory block is less than max_chunk_size_, return directly
+    if (std::get<1>(*pool) < max_chunk_size_) return;
+
+    MemoryBlock* block = static_cast<MemoryBlock*>(std::get<2>(*pool));
+
+    DLOG(INFO) << "Return block " << block << " to base allocator.";
+
+    system_allocator_->Free(block, max_chunk_size_, block->index(cache_));
+    cache_.invalidate(block);
+
+    pool = PoolSet::reverse_iterator(pool_.erase(std::next(pool).base()));
+
+    total_free_ -= max_chunk_size_;
+
+    if (!shall_free_alloc()) return;
+  }
+}
 }  // namespace detail
 }  // namespace memory
diff --git a/paddle/memory/detail/buddy_allocator.h b/paddle/memory/detail/buddy_allocator.h
index 14ee1fa07c..4fa3fb0ee5 100644
--- a/paddle/memory/detail/buddy_allocator.h
+++ b/paddle/memory/detail/buddy_allocator.h
@@ -94,6 +94,9 @@ class BuddyAllocator {
    */
   PoolSet pool_;
 
+  /*! Record fallback allocation count for auto-scaling */
+  size_t fallback_alloc_count_ = 0;
+
  private:
   /*! Unify the metadata format between GPU and CPU allocations */
   MetadataCache cache_;

From 6a3b8416df124153d4a1fd1f8f559107578ed58e Mon Sep 17 00:00:00 2001
From: liaogang 
Date: Tue, 11 Jul 2017 15:20:43 +0800
Subject: [PATCH 067/205] FIX: clang-format

---
 paddle/memory/memory_test.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddle/memory/memory_test.cc b/paddle/memory/memory_test.cc
index fed7444798..9fdcd03b1a 100644
--- a/paddle/memory/memory_test.cc
+++ b/paddle/memory/memory_test.cc
@@ -33,7 +33,7 @@ TEST(BuddyAllocator, CPUAllocation) {
 TEST(BuddyAllocator, CPUMultAlloc) {
   paddle::platform::CPUPlace cpu;
 
-  std::vector<void *> ps;
+  std::vector<void*> ps;
   ps.reserve(8);
 
   for (auto size : {256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304}) {

From a0466053073eae411175e19de610dbe7575ad1d7 Mon Sep 17 00:00:00 2001
From: Yu Yang 
Date: Tue, 11 Jul 2017 15:47:42 +0800
Subject: [PATCH 068/205] Refine CUDA Related libraries

---
 paddle/platform/CMakeLists.txt         |  10 +-
 paddle/platform/cuda.h                 |   6 +-
 paddle/platform/device_context.cc      |  13 +++
 paddle/platform/dynload/CMakeLists.txt |   1 +
 paddle/platform/dynload/cublas.cc      |  15 +++
 paddle/platform/dynload/cublas.h       |  89 +++++++----------
 paddle/platform/dynload/cudnn.cc       |  28 ++++++
 paddle/platform/dynload/cudnn.h        | 129 ++++++++++++------------
 paddle/platform/dynload/curand.cc      |  15 +++
 paddle/platform/dynload/curand.h       |  45 ++++-----
 10 files changed, 201 insertions(+), 150 deletions(-)
 create mode 100644 paddle/platform/device_context.cc
 create mode 100644 paddle/platform/dynload/cublas.cc
 create mode 100644 paddle/platform/dynload/cudnn.cc
 create mode 100644 paddle/platform/dynload/curand.cc

diff --git a/paddle/platform/CMakeLists.txt b/paddle/platform/CMakeLists.txt
index ebacd5d6dc..7a198aec6c 100644
--- a/paddle/platform/CMakeLists.txt
+++ b/paddle/platform/CMakeLists.txt
@@ -1,8 +1,14 @@
 add_subdirectory(dynload)
 
-nv_test(cuda_test SRCS cuda_test.cu)
+nv_test(cuda_test SRCS cuda_test.cu DEPS dyload_cuda)
 
 cc_library(place SRCS place.cc)
 cc_test(place_test SRCS place_test.cc DEPS place glog gflags)
 
+IF(WITH_GPU)
+  set(GPU_CTX_DEPS dyload_cuda dynamic_loader )
+ELSE() + set(GPU_CTX_DEPS) +ENDIF() -nv_test(device_context_test SRCS device_context_test.cc DEPS dynamic_loader place eigen3 glog gflags) +cc_library(device_context SRCS device_context.cc DEPS place eigen3 ${GPU_CTX_DEPS}) +nv_test(device_context_test SRCS device_context_test.cc DEPS device_context glog gflags) diff --git a/paddle/platform/cuda.h b/paddle/platform/cuda.h index 5ed36c0f02..96889abf9e 100644 --- a/paddle/platform/cuda.h +++ b/paddle/platform/cuda.h @@ -28,19 +28,19 @@ inline void throw_on_error(cudaError_t e, const char* message) { } } -int GetDeviceCount(void) { +inline int GetDeviceCount(void) { int count; throw_on_error(cudaGetDeviceCount(&count), "cudaGetDeviceCount failed"); return count; } -int GetCurrentDeviceId(void) { +inline int GetCurrentDeviceId(void) { int device_id; throw_on_error(cudaGetDevice(&device_id), "cudaGetDevice failed"); return device_id; } -void SetDeviceId(int device_id) { +inline void SetDeviceId(int device_id) { throw_on_error(cudaSetDevice(device_id), "cudaSetDevice failed"); } diff --git a/paddle/platform/device_context.cc b/paddle/platform/device_context.cc new file mode 100644 index 0000000000..a2dea2ed1e --- /dev/null +++ b/paddle/platform/device_context.cc @@ -0,0 +1,13 @@ +#include + +namespace paddle { +namespace platform { +namespace dynload { +namespace dummy { +// Make DeviceContext A library. +int DUMMY_VAR_FOR_DEV_CTX = 0; + +} // namespace dummy +} // namespace dynload +} // namespace platform +} // namespace paddle \ No newline at end of file diff --git a/paddle/platform/dynload/CMakeLists.txt b/paddle/platform/dynload/CMakeLists.txt index 9f829b7012..4a8866b3d3 100644 --- a/paddle/platform/dynload/CMakeLists.txt +++ b/paddle/platform/dynload/CMakeLists.txt @@ -1 +1,2 @@ cc_library(dynamic_loader SRCS dynamic_loader.cc DEPS glog gflags) +nv_library(dyload_cuda SRCS cublas.cc cudnn.cc curand.cc) diff --git a/paddle/platform/dynload/cublas.cc b/paddle/platform/dynload/cublas.cc new file mode 100644 index 0000000000..f83fcf34d7 --- /dev/null +++ b/paddle/platform/dynload/cublas.cc @@ -0,0 +1,15 @@ +#include + +namespace paddle { +namespace platform { +namespace dynload { +std::once_flag cublas_dso_flag; +void *cublas_dso_handle = nullptr; + +#define DEFINE_WRAP(__name) DynLoad__##__name __name; + +CUBLAS_BLAS_ROUTINE_EACH(DEFINE_WRAP); + +} // namespace dynload +} // namespace platform +} // namespace paddle \ No newline at end of file diff --git a/paddle/platform/dynload/cublas.h b/paddle/platform/dynload/cublas.h index 258cc88031..1332be31b1 100644 --- a/paddle/platform/dynload/cublas.h +++ b/paddle/platform/dynload/cublas.h @@ -23,8 +23,8 @@ namespace paddle { namespace platform { namespace dynload { -std::once_flag cublas_dso_flag; -void *cublas_dso_handle = nullptr; +extern std::once_flag cublas_dso_flag; +extern void *cublas_dso_handle; /** * The following macro definition can generate structs @@ -34,10 +34,10 @@ void *cublas_dso_handle = nullptr; * note: default dynamic linked libs */ #ifdef PADDLE_USE_DSO -#define DYNAMIC_LOAD_CUBLAS_WRAP(__name) \ +#define DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(__name) \ struct DynLoad__##__name { \ template \ - cublasStatus_t operator()(Args... args) { \ + inline cublasStatus_t operator()(Args... 
args) { \ typedef cublasStatus_t (*cublasFunc)(Args...); \ std::call_once(cublas_dso_flag, \ paddle::platform::dynload::GetCublasDsoHandle, \ @@ -45,62 +45,43 @@ void *cublas_dso_handle = nullptr; void *p_##__name = dlsym(cublas_dso_handle, #__name); \ return reinterpret_cast(p_##__name)(args...); \ } \ - } __name; // struct DynLoad__##__name + }; \ + extern DynLoad__##__name __name #else -#define DYNAMIC_LOAD_CUBLAS_WRAP(__name) \ - struct DynLoad__##__name { \ - template \ - cublasStatus_t operator()(Args... args) { \ - return __name(args...); \ - } \ - } __name; // struct DynLoad__##__name +#define DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(__name) \ + struct DynLoad__##__name { \ + inline template \ + cublasStatus_t operator()(Args... args) { \ + return __name(args...); \ + } \ + }; \ + extern DynLoad__##__name __name #endif -#define DYNAMIC_LOAD_CUBLAS_V2_WRAP(__name) DYNAMIC_LOAD_CUBLAS_WRAP(__name) - -// include all needed cublas functions in HPPL -// clang-format off #define CUBLAS_BLAS_ROUTINE_EACH(__macro) \ - __macro(cublasSgemv) \ - __macro(cublasDgemv) \ - __macro(cublasSgemm) \ - __macro(cublasDgemm) \ - __macro(cublasSgeam) \ - __macro(cublasDgeam) \ - -DYNAMIC_LOAD_CUBLAS_V2_WRAP(cublasCreate) -DYNAMIC_LOAD_CUBLAS_V2_WRAP(cublasDestroy) -DYNAMIC_LOAD_CUBLAS_V2_WRAP(cublasSetStream) -DYNAMIC_LOAD_CUBLAS_V2_WRAP(cublasSetPointerMode) -DYNAMIC_LOAD_CUBLAS_V2_WRAP(cublasGetPointerMode) -DYNAMIC_LOAD_CUBLAS_WRAP(cublasSgemmBatched) -DYNAMIC_LOAD_CUBLAS_WRAP(cublasDgemmBatched) -DYNAMIC_LOAD_CUBLAS_WRAP(cublasCgemmBatched) -DYNAMIC_LOAD_CUBLAS_WRAP(cublasZgemmBatched) -DYNAMIC_LOAD_CUBLAS_WRAP(cublasSgetrfBatched) -DYNAMIC_LOAD_CUBLAS_WRAP(cublasSgetriBatched) -DYNAMIC_LOAD_CUBLAS_WRAP(cublasDgetrfBatched) -DYNAMIC_LOAD_CUBLAS_WRAP(cublasDgetriBatched) -CUBLAS_BLAS_ROUTINE_EACH(DYNAMIC_LOAD_CUBLAS_V2_WRAP) + __macro(cublasSgemv); \ + __macro(cublasDgemv); \ + __macro(cublasSgemm); \ + __macro(cublasDgemm); \ + __macro(cublasSgeam); \ + __macro(cublasDgeam); \ + __macro(cublasCreate); \ + __macro(cublasDestroy); \ + __macro(cublasSetStream); \ + __macro(cublasSetPointerMode); \ + __macro(cublasGetPointerMode); \ + __macro(cublasSgemmBatched); \ + __macro(cublasDgemmBatched); \ + __macro(cublasCgemmBatched); \ + __macro(cublasZgemmBatched); \ + __macro(cublasSgetrfBatched); \ + __macro(cublasSgetriBatched); \ + __macro(cublasDgetrfBatched); \ + __macro(cublasDgetriBatched) -#undef DYNAMIC_LOAD_CUBLAS_WRAP -#undef DYNAMIC_LOAD_CUBLAS_V2_WRAP -#undef CUBLAS_BLAS_ROUTINE_EACH +CUBLAS_BLAS_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP); -// clang-format on -#ifndef PADDLE_TYPE_DOUBLE -#define CUBLAS_GEAM paddle::platform::dynload::cublasSgeam -#define CUBLAS_GEMV paddle::platform::dynload::cublasSgemv -#define CUBLAS_GEMM paddle::platform::dynload::cublasSgemm -#define CUBLAS_GETRF paddle::platform::dynload::cublasSgetrfBatched -#define CUBLAS_GETRI paddle::platform::dynload::cublasSgetriBatched -#else -#define CUBLAS_GEAM paddle::platform::dynload::cublasDgeam -#define CUBLAS_GEMV paddle::platform::dynload::cublasDgemv -#define CUBLAS_GEMM paddle::platform::dynload::cublasDgemm -#define CUBLAS_GETRF paddle::platform::dynload::cublasDgetrfBatched -#define CUBLAS_GETRI paddle::platform::dynload::cublasDgetriBatched -#endif +#undef DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP } // namespace dynload } // namespace platform } // namespace paddle diff --git a/paddle/platform/dynload/cudnn.cc b/paddle/platform/dynload/cudnn.cc new file mode 100644 index 0000000000..8b5e15b5ef --- /dev/null +++ 
b/paddle/platform/dynload/cudnn.cc @@ -0,0 +1,28 @@ +#include + +namespace paddle { +namespace platform { +namespace dynload { +std::once_flag cudnn_dso_flag; +void* cudnn_dso_handle = nullptr; + +#define DEFINE_WRAP(__name) DynLoad__##__name __name + +CUDNN_DNN_ROUTINE_EACH(DEFINE_WRAP); +CUDNN_DNN_ROUTINE_EACH_R2(DEFINE_WRAP); + +#ifdef CUDNN_DNN_ROUTINE_EACH_AFTER_R3 +CUDNN_DNN_ROUTINE_EACH_AFTER_R3(DEFINE_WRAP); +#endif + +#ifdef CUDNN_DNN_ROUTINE_EACH_AFTER_R4 +CUDNN_DNN_ROUTINE_EACH_AFTER_R4(DEFINE_WRAP); +#endif + +#ifdef CUDNN_DNN_ROUTINE_EACH_R5 +CUDNN_DNN_ROUTINE_EACH_R5(DEFINE_WRAP); +#endif + +} // namespace dynload +} // namespace platform +} // namespace paddle \ No newline at end of file diff --git a/paddle/platform/dynload/cudnn.h b/paddle/platform/dynload/cudnn.h index 0a9562c573..ef0dd85b08 100644 --- a/paddle/platform/dynload/cudnn.h +++ b/paddle/platform/dynload/cudnn.h @@ -23,12 +23,12 @@ namespace paddle { namespace platform { namespace dynload { -std::once_flag cudnn_dso_flag; -void* cudnn_dso_handle = nullptr; +extern std::once_flag cudnn_dso_flag; +extern void* cudnn_dso_handle; #ifdef PADDLE_USE_DSO -#define DYNAMIC_LOAD_CUDNN_WRAP(__name) \ +#define DECLARE_DYNAMIC_LOAD_CUDNN_WRAP(__name) \ struct DynLoad__##__name { \ template \ auto operator()(Args... args) -> decltype(__name(args...)) { \ @@ -39,17 +39,19 @@ void* cudnn_dso_handle = nullptr; void* p_##__name = dlsym(cudnn_dso_handle, #__name); \ return reinterpret_cast(p_##__name)(args...); \ } \ - } __name; /* struct DynLoad__##__name */ + }; \ + extern struct DynLoad__##__name __name #else -#define DYNAMIC_LOAD_CUDNN_WRAP(__name) \ +#define DECLARE_DYNAMIC_LOAD_CUDNN_WRAP(__name) \ struct DynLoad__##__name { \ template \ auto operator()(Args... args) -> decltype(__name(args...)) { \ return __name(args...); \ } \ - } __name; /* struct DynLoad__##__name */ + }; \ + extern DynLoad__##__name __name #endif @@ -57,80 +59,73 @@ void* cudnn_dso_handle = nullptr; * include all needed cudnn functions in HPPL * different cudnn version has different interfaces **/ -// clang-format off -#define CUDNN_DNN_ROUTINE_EACH(__macro) \ - __macro(cudnnSetTensor4dDescriptor) \ - __macro(cudnnSetTensor4dDescriptorEx) \ - __macro(cudnnGetConvolutionNdForwardOutputDim) \ - __macro(cudnnGetConvolutionForwardAlgorithm) \ - __macro(cudnnCreateTensorDescriptor) \ - __macro(cudnnDestroyTensorDescriptor) \ - __macro(cudnnCreateFilterDescriptor) \ - __macro(cudnnSetFilter4dDescriptor) \ - __macro(cudnnSetPooling2dDescriptor) \ - __macro(cudnnDestroyFilterDescriptor) \ - __macro(cudnnCreateConvolutionDescriptor) \ - __macro(cudnnCreatePoolingDescriptor) \ - __macro(cudnnDestroyPoolingDescriptor) \ - __macro(cudnnSetConvolution2dDescriptor) \ - __macro(cudnnDestroyConvolutionDescriptor) \ - __macro(cudnnCreate) \ - __macro(cudnnDestroy) \ - __macro(cudnnSetStream) \ - __macro(cudnnActivationForward) \ - __macro(cudnnConvolutionForward) \ - __macro(cudnnConvolutionBackwardBias) \ - __macro(cudnnGetConvolutionForwardWorkspaceSize) \ - __macro(cudnnTransformTensor) \ - __macro(cudnnPoolingForward) \ - __macro(cudnnPoolingBackward) \ - __macro(cudnnSoftmaxBackward) \ - __macro(cudnnSoftmaxForward) \ - __macro(cudnnGetVersion) \ - __macro(cudnnGetErrorString) -CUDNN_DNN_ROUTINE_EACH(DYNAMIC_LOAD_CUDNN_WRAP) - -#define CUDNN_DNN_ROUTINE_EACH_R2(__macro) \ - __macro(cudnnAddTensor) \ - __macro(cudnnConvolutionBackwardData) \ - __macro(cudnnConvolutionBackwardFilter) -CUDNN_DNN_ROUTINE_EACH_R2(DYNAMIC_LOAD_CUDNN_WRAP) +#define 
CUDNN_DNN_ROUTINE_EACH(__macro) \ + __macro(cudnnSetTensor4dDescriptor); \ + __macro(cudnnSetTensor4dDescriptorEx); \ + __macro(cudnnGetConvolutionNdForwardOutputDim); \ + __macro(cudnnGetConvolutionForwardAlgorithm); \ + __macro(cudnnCreateTensorDescriptor); \ + __macro(cudnnDestroyTensorDescriptor); \ + __macro(cudnnCreateFilterDescriptor); \ + __macro(cudnnSetFilter4dDescriptor); \ + __macro(cudnnSetPooling2dDescriptor); \ + __macro(cudnnDestroyFilterDescriptor); \ + __macro(cudnnCreateConvolutionDescriptor); \ + __macro(cudnnCreatePoolingDescriptor); \ + __macro(cudnnDestroyPoolingDescriptor); \ + __macro(cudnnSetConvolution2dDescriptor); \ + __macro(cudnnDestroyConvolutionDescriptor); \ + __macro(cudnnCreate); \ + __macro(cudnnDestroy); \ + __macro(cudnnSetStream); \ + __macro(cudnnActivationForward); \ + __macro(cudnnConvolutionForward); \ + __macro(cudnnConvolutionBackwardBias); \ + __macro(cudnnGetConvolutionForwardWorkspaceSize); \ + __macro(cudnnTransformTensor); \ + __macro(cudnnPoolingForward); \ + __macro(cudnnPoolingBackward); \ + __macro(cudnnSoftmaxBackward); \ + __macro(cudnnSoftmaxForward); \ + __macro(cudnnGetVersion); \ + __macro(cudnnGetErrorString); +CUDNN_DNN_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) + +#define CUDNN_DNN_ROUTINE_EACH_R2(__macro) \ + __macro(cudnnAddTensor); \ + __macro(cudnnConvolutionBackwardData); \ + __macro(cudnnConvolutionBackwardFilter); +CUDNN_DNN_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) // APIs available after R3: #if CUDNN_VERSION >= 3000 -#define CUDNN_DNN_ROUTINE_EACH_AFTER_R3(__macro) \ - __macro(cudnnGetConvolutionBackwardFilterWorkspaceSize) \ - __macro(cudnnGetConvolutionBackwardDataAlgorithm) \ - __macro(cudnnGetConvolutionBackwardFilterAlgorithm) \ - __macro(cudnnGetConvolutionBackwardDataWorkspaceSize) -CUDNN_DNN_ROUTINE_EACH_AFTER_R3(DYNAMIC_LOAD_CUDNN_WRAP) -#undef CUDNN_DNN_ROUTINE_EACH_AFTER_R3 +#define CUDNN_DNN_ROUTINE_EACH_AFTER_R3(__macro) \ + __macro(cudnnGetConvolutionBackwardFilterWorkspaceSize); \ + __macro(cudnnGetConvolutionBackwardDataAlgorithm); \ + __macro(cudnnGetConvolutionBackwardFilterAlgorithm); \ + __macro(cudnnGetConvolutionBackwardDataWorkspaceSize); +CUDNN_DNN_ROUTINE_EACH_AFTER_R3(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) #endif - // APIs available after R4: #if CUDNN_VERSION >= 4007 -#define CUDNN_DNN_ROUTINE_EACH_AFTER_R4(__macro) \ - __macro(cudnnBatchNormalizationForwardTraining) \ - __macro(cudnnBatchNormalizationForwardInference) \ - __macro(cudnnBatchNormalizationBackward) -CUDNN_DNN_ROUTINE_EACH_AFTER_R4(DYNAMIC_LOAD_CUDNN_WRAP) -#undef CUDNN_DNN_ROUTINE_EACH_AFTER_R4 +#define CUDNN_DNN_ROUTINE_EACH_AFTER_R4(__macro) \ + __macro(cudnnBatchNormalizationForwardTraining); \ + __macro(cudnnBatchNormalizationForwardInference); \ + __macro(cudnnBatchNormalizationBackward); +CUDNN_DNN_ROUTINE_EACH_AFTER_R4(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) #endif // APIs in R5 #if CUDNN_VERSION >= 5000 -#define CUDNN_DNN_ROUTINE_EACH_R5(__macro) \ - __macro(cudnnCreateActivationDescriptor) \ - __macro(cudnnSetActivationDescriptor) \ - __macro(cudnnGetActivationDescriptor) \ - __macro(cudnnDestroyActivationDescriptor) -CUDNN_DNN_ROUTINE_EACH_R5(DYNAMIC_LOAD_CUDNN_WRAP) -#undef CUDNN_DNN_ROUTINE_EACH_R5 +#define CUDNN_DNN_ROUTINE_EACH_R5(__macro) \ + __macro(cudnnCreateActivationDescriptor); \ + __macro(cudnnSetActivationDescriptor); \ + __macro(cudnnGetActivationDescriptor); \ + __macro(cudnnDestroyActivationDescriptor); +CUDNN_DNN_ROUTINE_EACH_R5(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) #endif -#undef CUDNN_DNN_ROUTINE_EACH -// 
clang-format on } // namespace dynload } // namespace platform } // namespace paddle diff --git a/paddle/platform/dynload/curand.cc b/paddle/platform/dynload/curand.cc new file mode 100644 index 0000000000..5c1fab992c --- /dev/null +++ b/paddle/platform/dynload/curand.cc @@ -0,0 +1,15 @@ +#include + +namespace paddle { +namespace platform { +namespace dynload { + +std::once_flag curand_dso_flag; +void *curand_dso_handle; + +#define DEFINE_WRAP(__name) DynLoad__##__name __name + +CURAND_RAND_ROUTINE_EACH(DEFINE_WRAP); +} +} +} \ No newline at end of file diff --git a/paddle/platform/dynload/curand.h b/paddle/platform/dynload/curand.h index 9dc0a25c0f..d8c46bc41e 100644 --- a/paddle/platform/dynload/curand.h +++ b/paddle/platform/dynload/curand.h @@ -22,10 +22,10 @@ limitations under the License. */ namespace paddle { namespace platform { namespace dynload { -std::once_flag curand_dso_flag; -void *curand_dso_handle = nullptr; +extern std::once_flag curand_dso_flag; +extern void *curand_dso_handle; #ifdef PADDLE_USE_DSO -#define DYNAMIC_LOAD_CURAND_WRAP(__name) \ +#define DECLARE_DYNAMIC_LOAD_CURAND_WRAP(__name) \ struct DynLoad__##__name { \ template \ curandStatus_t operator()(Args... args) { \ @@ -36,32 +36,29 @@ void *curand_dso_handle = nullptr; void *p_##__name = dlsym(curand_dso_handle, #__name); \ return reinterpret_cast(p_##__name)(args...); \ } \ - } __name; /* struct DynLoad__##__name */ + }; \ + extern DynLoad__##__name __name #else -#define DYNAMIC_LOAD_CURAND_WRAP(__name) \ - struct DynLoad__##__name { \ - template \ - curandStatus_t operator()(Args... args) { \ - return __name(args...); \ - } \ - } __name; /* struct DynLoad__##__name */ +#define DECLARE_DYNAMIC_LOAD_CURAND_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + curandStatus_t operator()(Args... 
args) { \ + return __name(args...); \ + } \ + }; \ + extern DynLoad__##__name __name #endif -/* include all needed curand functions in HPPL */ -// clang-format off -#define CURAND_RAND_ROUTINE_EACH(__macro) \ - __macro(curandCreateGenerator) \ - __macro(curandSetStream) \ - __macro(curandSetPseudoRandomGeneratorSeed)\ - __macro(curandGenerateUniform) \ - __macro(curandGenerateUniformDouble) \ - __macro(curandDestroyGenerator) -// clang-format on +#define CURAND_RAND_ROUTINE_EACH(__macro) \ + __macro(curandCreateGenerator); \ + __macro(curandSetStream); \ + __macro(curandSetPseudoRandomGeneratorSeed); \ + __macro(curandGenerateUniform); \ + __macro(curandGenerateUniformDouble); \ + __macro(curandDestroyGenerator); -CURAND_RAND_ROUTINE_EACH(DYNAMIC_LOAD_CURAND_WRAP) +CURAND_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CURAND_WRAP); -#undef CURAND_RAND_ROUTINE_EACH -#undef DYNAMIC_LOAD_CURAND_WRAP } // namespace dynload } // namespace platform } // namespace paddle From a5eb1d8fabe15b1e59e09fb0ed18de76de8843be Mon Sep 17 00:00:00 2001 From: qijun Date: Tue, 11 Jul 2017 16:21:19 +0800 Subject: [PATCH 069/205] fix build error --- paddle/framework/CMakeLists.txt | 2 +- paddle/framework/ddim.cc | 6 +++--- paddle/framework/ddim.h | 2 +- paddle/framework/tensor.h | 15 ++++----------- 4 files changed, 9 insertions(+), 16 deletions(-) diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt index 4409c6feae..2d26a62d0f 100644 --- a/paddle/framework/CMakeLists.txt +++ b/paddle/framework/CMakeLists.txt @@ -1,5 +1,5 @@ # ddim lib -cc_library(ddim SRCS ddim.cc) +cc_library(ddim SRCS ddim.cc DEPS eigen3) cc_test(ddim_test SRCS ddim_test.cc DEPS ddim) nv_test(dim_test SRCS dim_test.cu DEPS ddim) cc_test(tensor_test SRCS tensor_test.cc DEPS ddim) diff --git a/paddle/framework/ddim.cc b/paddle/framework/ddim.cc index 3fd3e538e8..fe8f79abd4 100644 --- a/paddle/framework/ddim.cc +++ b/paddle/framework/ddim.cc @@ -222,9 +222,9 @@ std::ostream& operator<<(std::ostream& os, const DDim& ddim) { } template -Eigen::DSizes ToEigenDSizes(DDim dims) const { - int rank = paddle::framework::arity(dims); - PADDLE_ENFORCE(rank == NDIMS, "DDim and NDIMS must be same") +Eigen::DSizes ToEigenDSizes(const DDim& dims) { + int rank = arity(dims); + PADDLE_ENFORCE(rank == NDIMS, "DDim and NDIMS must be same"); Eigen::DSizes dsizes; for (int d = 0; d < rank; d++) { dsizes[d] = dims[d]; diff --git a/paddle/framework/ddim.h b/paddle/framework/ddim.h index a83a367196..18395c3636 100644 --- a/paddle/framework/ddim.h +++ b/paddle/framework/ddim.h @@ -93,7 +93,7 @@ int arity(const DDim& ddim); std::ostream& operator<<(std::ostream&, const DDim&); template -Eigen::DSizes ToEigenDSizes(DDim dims) const; +Eigen::DSizes ToEigenDSizes(const DDim& dims); } // namespace framework } // namespace paddle diff --git a/paddle/framework/tensor.h b/paddle/framework/tensor.h index 0fa74e7ab1..21818937e8 100644 --- a/paddle/framework/tensor.h +++ b/paddle/framework/tensor.h @@ -28,13 +28,6 @@ namespace framework { class Tensor { public: - template - const T* data() const { - PADDLE_ENFORCE(holder_ != nullptr, - "Tensor::data must be called after Tensor::mutable_data."); - return static_cast(holder_->Ptr()); - } - template T* data() const { PADDLE_ENFORCE(holder_ != nullptr, @@ -60,14 +53,14 @@ class Tensor { size_t NumElements() const { return product(dims_); } template - typename TTypes::Tensor Tensor::shaped(DDim new_dims) { + typename TTypes::Tensor shaped(DDim new_dims) { Eigen::array dims = - 
paddle::framework::ToEigenDSizes(new_dims); + paddle::framework::ToEigenDSizes(new_dims); return typename TTypes::Tensor(data(), dims); } template - typename TTypes::Tensor Tensor::tensor() { + typename TTypes::Tensor tensor() { return typename TTypes::Tensor( data(), paddle::framework::ToEigenDSizes(dims_)); } @@ -92,7 +85,7 @@ class Tensor { // const versions of all the methods above. template - typename TTypes::ConstantTensor Tensor::tensor() const { + typename TTypes::ConstantTensor tensor() const { return typename TTypes::Tensor( data(), paddle::framework::ToEigenDSizes(dims_)); } From 383b96f32c60ec542819c62b4e09009cae9afc9d Mon Sep 17 00:00:00 2001 From: liaogang Date: Tue, 11 Jul 2017 16:26:58 +0800 Subject: [PATCH 070/205] FIX: merge conflicts --- paddle/memory/detail/meta_cache.cc | 2 +- paddle/memory/memory.cc | 2 +- paddle/platform/CMakeLists.txt | 2 +- paddle/platform/device_context.h | 3 ++- paddle/platform/gpu_info.cc | 4 ++-- paddle/platform/gpu_info.h | 2 +- 6 files changed, 8 insertions(+), 7 deletions(-) diff --git a/paddle/memory/detail/meta_cache.cc b/paddle/memory/detail/meta_cache.cc index 189ab4fc7b..30ff80e7ba 100644 --- a/paddle/memory/detail/meta_cache.cc +++ b/paddle/memory/detail/meta_cache.cc @@ -25,7 +25,7 @@ MetadataCache::MetadataCache(bool uses_gpu) : uses_gpu_(uses_gpu) {} Metadata MetadataCache::load(const MemoryBlock* block) { if (uses_gpu_) { auto existing_metadata = cache_.find(block); - assert(existing_metadata->second.check_guards()); + PADDLE_ASSERT(existing_metadata->second.check_guards()); return existing_metadata->second; } else { PADDLE_ASSERT(reinterpret_cast(block)->check_guards()); diff --git a/paddle/memory/memory.cc b/paddle/memory/memory.cc index def580f7a4..430ce98bfc 100644 --- a/paddle/memory/memory.cc +++ b/paddle/memory/memory.cc @@ -52,7 +52,7 @@ size_t Used(platform::CPUPlace place) { detail::BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) { static detail::BuddyAllocator** as = NULL; if (as == NULL) { - int gpu_num = platform::GpuDeviceCount(); + int gpu_num = platform::GetDeviceCount(); as = new detail::BuddyAllocator*[gpu_num]; for (int gpu = 0; gpu < gpu_num; gpu++) { platform::SetDeviceId(gpu); diff --git a/paddle/platform/CMakeLists.txt b/paddle/platform/CMakeLists.txt index 4b3f55b3c7..d16c747aee 100644 --- a/paddle/platform/CMakeLists.txt +++ b/paddle/platform/CMakeLists.txt @@ -8,4 +8,4 @@ cc_test(place_test SRCS place_test.cc DEPS place glog gflags) cc_library(dynamic_loader SRCS dynload/dynamic_loader.cc DEPS gflags glog) -nv_test(device_context_test SRCS device_context_test.cc DEPS dynamic_loader place eigen3) +nv_test(device_context_test SRCS device_context_test.cc DEPS dynamic_loader place eigen3 gpu_info) diff --git a/paddle/platform/device_context.h b/paddle/platform/device_context.h index 160eb4e120..02194581d1 100644 --- a/paddle/platform/device_context.h +++ b/paddle/platform/device_context.h @@ -16,10 +16,11 @@ limitations under the License. 
*/ #include "paddle/framework/enforce.h" #ifndef PADDLE_ONLY_CPU -#include "paddle/platform/cuda.h" #include "paddle/platform/dynload/cublas.h" #include "paddle/platform/dynload/cudnn.h" #include "paddle/platform/dynload/curand.h" +#include "paddle/platform/error.h" +#include "paddle/platform/gpu_info.h" #define EIGEN_USE_GPU #endif #include "paddle/platform/place.h" diff --git a/paddle/platform/gpu_info.cc b/paddle/platform/gpu_info.cc index fe475d23ce..9b917f9d35 100644 --- a/paddle/platform/gpu_info.cc +++ b/paddle/platform/gpu_info.cc @@ -23,11 +23,11 @@ DEFINE_double(fraction_of_gpu_memory_to_use, 0.95, namespace paddle { namespace platform { -int GpuDeviceCount() { +int GetDeviceCount() { int count; throw_on_error( cudaGetDeviceCount(&count), - "cudaGetDeviceCount failed in paddle::platform::GpuDeviceCount"); + "cudaGetDeviceCount failed in paddle::platform::GetDeviceCount"); return count; } diff --git a/paddle/platform/gpu_info.h b/paddle/platform/gpu_info.h index 81ee5f6e0a..79e71956bd 100644 --- a/paddle/platform/gpu_info.h +++ b/paddle/platform/gpu_info.h @@ -22,7 +22,7 @@ namespace paddle { namespace platform { //! Get the total number of GPU devices in system. -int GpuDeviceCount(); +int GetDeviceCount(); //! Get the current GPU device id in system. int GetCurrentDeviceId(); From 27b196ba6dbf8e12389cb27e8451c4ea284b61e2 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Tue, 11 Jul 2017 03:45:46 -0500 Subject: [PATCH 071/205] Expose paddle.framework by pybind11 (#2793) * Expose paddle.framework by pybind11 * Export paddle.framework.{Scope, Variable} to paddle.v2.framework.core. * See python/paddle/v2/framework/tests/test_scope.py for Python usage * See paddle/pybind/pybind.cc for C++ bind code. * add copyright --- .gitignore | 3 ++ CMakeLists.txt | 1 + cmake/external/pybind11.cmake | 30 ++++++++++++ cmake/external/python.cmake | 3 ++ cmake/flags.cmake | 4 +- paddle/CMakeLists.txt | 1 + paddle/pybind/CMakeLists.txt | 1 + paddle/pybind/pybind.cc | 46 +++++++++++++++++++ python/CMakeLists.txt | 9 +++- .../paddle/v2/framework/tests/CMakeLists.txt | 2 +- .../v2/framework/tests/test_protobuf.py | 4 ++ .../paddle/v2/framework/tests/test_scope.py | 37 +++++++++++++++ python/setup.py.in | 4 +- 13 files changed, 141 insertions(+), 4 deletions(-) create mode 100644 cmake/external/pybind11.cmake create mode 100644 paddle/pybind/CMakeLists.txt create mode 100644 paddle/pybind/pybind.cc create mode 100644 python/paddle/v2/framework/tests/test_scope.py diff --git a/.gitignore b/.gitignore index 275173b967..5c2fb134ae 100644 --- a/.gitignore +++ b/.gitignore @@ -19,3 +19,6 @@ third_party/ # clion workspace. 
cmake-build-* + +# generated while compiling +python/paddle/v2/framework/core.so diff --git a/CMakeLists.txt b/CMakeLists.txt index 15a7c6b074..2c713db3e3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -97,6 +97,7 @@ include(external/swig) # download, build, install swig include(external/warpctc) # download, build, install warpctc include(external/any) # download libn::any include(external/eigen) # download eigen3 +include(external/pybind11) # download pybind11 include(cudnn) # set cudnn libraries, must before configure include(configure) # add paddle env configuration diff --git a/cmake/external/pybind11.cmake b/cmake/external/pybind11.cmake new file mode 100644 index 0000000000..9391c285c7 --- /dev/null +++ b/cmake/external/pybind11.cmake @@ -0,0 +1,30 @@ +INCLUDE(ExternalProject) + +SET(PYBIND_SOURCE_DIR ${THIRD_PARTY_PATH}/pybind) + +INCLUDE_DIRECTORIES(${PYBIND_SOURCE_DIR}/src/extern_pybind/include) + +ExternalProject_Add( + extern_pybind + ${EXTERNAL_PROJECT_LOG_ARGS} + GIT_REPOSITORY "https://github.com/pybind/pybind11.git" + GIT_TAG "v2.1.1" + PREFIX ${PYBIND_SOURCE_DIR} + UPDATE_COMMAND "" + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + INSTALL_COMMAND "" + TEST_COMMAND "" +) + +if (${CMAKE_VERSION} VERSION_LESS "3.3.0") + set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/pybind_dummy.c) + file(WRITE ${dummyfile} "const char * dummy_any = \"${dummyfile}\";") + add_library(pybind STATIC ${dummyfile}) +else() + add_library(pybind INTERFACE) +endif() + +add_dependencies(pybind extern_pybind) + +LIST(APPEND external_project_dependencies pybind) diff --git a/cmake/external/python.cmake b/cmake/external/python.cmake index 6546b2c83b..67a359d4b5 100644 --- a/cmake/external/python.cmake +++ b/cmake/external/python.cmake @@ -18,6 +18,9 @@ INCLUDE(python_module) FIND_PACKAGE(PythonInterp 2.7) IF(WITH_PYTHON) FIND_PACKAGE(PythonLibs 2.7) + # Fixme: Maybe find a static library. Get SHARED/STATIC by FIND_PACKAGE. + ADD_LIBRARY(python SHARED IMPORTED GLOBAL) + SET_PROPERTY(TARGET python PROPERTY IMPORTED_LOCATION ${PYTHON_LIBRARIES}) ENDIF(WITH_PYTHON) SET(py_env "") diff --git a/cmake/flags.cmake b/cmake/flags.cmake index 7a996dea92..c31e62fc08 100644 --- a/cmake/flags.cmake +++ b/cmake/flags.cmake @@ -109,7 +109,9 @@ set(COMMON_FLAGS -Wno-unused-function -Wno-error=literal-suffix -Wno-error=sign-compare - -Wno-error=unused-local-typedefs) + -Wno-error=unused-local-typedefs + -Wno-error=parentheses-equality # Warnings in Pybind11 +) set(GPU_COMMON_FLAGS -fPIC diff --git a/paddle/CMakeLists.txt b/paddle/CMakeLists.txt index 307e99bbe3..58a35564f8 100644 --- a/paddle/CMakeLists.txt +++ b/paddle/CMakeLists.txt @@ -15,6 +15,7 @@ if(Boost_FOUND) add_subdirectory(memory) add_subdirectory(platform) add_subdirectory(framework) + add_subdirectory(pybind) endif() if(WITH_C_API) diff --git a/paddle/pybind/CMakeLists.txt b/paddle/pybind/CMakeLists.txt new file mode 100644 index 0000000000..af85fdeecb --- /dev/null +++ b/paddle/pybind/CMakeLists.txt @@ -0,0 +1 @@ +cc_library(paddle_pybind SHARED SRCS pybind.cc DEPS pybind python) diff --git a/paddle/pybind/pybind.cc b/paddle/pybind/pybind.cc new file mode 100644 index 0000000000..55aebc59ec --- /dev/null +++ b/paddle/pybind/pybind.cc @@ -0,0 +1,46 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include + +namespace py = pybind11; +namespace pd = paddle::framework; + +PYBIND11_PLUGIN(core) { + py::module m("core", "C++ core of Paddle Paddle"); + + py::class_(m, "Variable", R"DOC(Variable Class. + +All parameter, weight, gradient are variables in Paddle. +)DOC") + .def("is_int", [](const pd::Variable& var) { return var.IsType(); }) + .def("set_int", + [](pd::Variable& var, int val) -> void { + *var.GetMutable() = val; + }) + .def("get_int", + [](const pd::Variable& var) -> int { return var.Get(); }); + + py::class_>(m, "Scope") + .def(py::init&>()) + .def("get_var", + &pd::Scope::GetVariable, + py::return_value_policy::reference) + .def("create_var", + &pd::Scope::CreateVariable, + py::return_value_policy::reference); + + return m.ptr(); +} \ No newline at end of file diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 13a1802ee3..0171f9d8cc 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -26,10 +26,17 @@ endif(WITH_GOLANG) configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in ${CMAKE_CURRENT_BINARY_DIR}/setup.py) + +add_custom_command(OUTPUT ${PROJ_ROOT}/python/paddle/v2/framework/core.so + COMMAND cmake -E copy $ ${PROJ_ROOT}/python/paddle/v2/framework/core.so + DEPENDS paddle_pybind) +add_custom_target(copy_paddle_pybind ALL DEPENDS ${PROJ_ROOT}/python/paddle/v2/framework/core.so) + + add_custom_command(OUTPUT ${OUTPUT_DIR}/.timestamp COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel COMMAND ${CMAKE_COMMAND} -E touch ${OUTPUT_DIR}/.timestamp - DEPENDS gen_proto_py framework_py_proto ${PY_FILES} ${external_project_dependencies} ${COPY_PADDLE_MASTER}) + DEPENDS gen_proto_py copy_paddle_pybind framework_py_proto ${PY_FILES} ${external_project_dependencies} ${COPY_PADDLE_MASTER}) add_custom_target(paddle_python ALL DEPENDS ${OUTPUT_DIR}/.timestamp) diff --git a/python/paddle/v2/framework/tests/CMakeLists.txt b/python/paddle/v2/framework/tests/CMakeLists.txt index 8cb0c5c376..d809917af1 100644 --- a/python/paddle/v2/framework/tests/CMakeLists.txt +++ b/python/paddle/v2/framework/tests/CMakeLists.txt @@ -1 +1 @@ -add_python_test(test_framework test_protobuf.py) +add_python_test(test_framework test_protobuf.py test_scope.py) diff --git a/python/paddle/v2/framework/tests/test_protobuf.py b/python/paddle/v2/framework/tests/test_protobuf.py index f0e6019199..b8702477e6 100644 --- a/python/paddle/v2/framework/tests/test_protobuf.py +++ b/python/paddle/v2/framework/tests/test_protobuf.py @@ -24,3 +24,7 @@ class TestFrameworkProto(unittest.TestCase): attr.type = attr_type_lib.FLOAT op_proto.type = "cos" self.assertTrue(op_proto.IsInitialized()) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/v2/framework/tests/test_scope.py b/python/paddle/v2/framework/tests/test_scope.py new file mode 100644 index 0000000000..f0ee45cfc7 --- /dev/null +++ b/python/paddle/v2/framework/tests/test_scope.py @@ -0,0 +1,37 @@ +import paddle.v2.framework.core +import unittest + + +class TestScope(unittest.TestCase): + def test_create_destroy(self): + paddle_c = paddle.v2.framework.core + scope = paddle_c.Scope(None) + 
self.assertIsNotNone(scope)
+        scope_with_parent = paddle_c.Scope(scope)
+        self.assertIsNotNone(scope_with_parent)
+
+    def test_none_variable(self):
+        paddle_c = paddle.v2.framework.core
+        scope = paddle_c.Scope(None)
+        self.assertIsNone(scope.get_var("test"))
+
+    def test_create_var_get_var(self):
+        paddle_c = paddle.v2.framework.core
+        scope = paddle_c.Scope(None)
+        var_a = scope.create_var("var_a")
+        self.assertIsNotNone(var_a)
+        self.assertIsNotNone(scope.get_var('var_a'))
+        scope2 = paddle_c.Scope(scope)
+        self.assertIsNotNone(scope2.get_var('var_a'))
+
+    def test_var_get_int(self):
+        paddle_c = paddle.v2.framework.core
+        scope = paddle_c.Scope(None)
+        var = scope.create_var("test_int")
+        var.set_int(10)
+        self.assertTrue(var.is_int())
+        self.assertEqual(10, var.get_int())
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/setup.py.in b/python/setup.py.in
index a422b3832f..271ee6e552 100644
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -29,7 +29,9 @@ setup(name='paddle',
       description='Parallel Distributed Deep Learning',
       install_requires=setup_requires,
       packages=packages,
-      package_data={'paddle.v2.master': ['libpaddle_master.so'], },
+      package_data={'paddle.v2.master': ['libpaddle_master.so'],
+                    'paddle.v2.framework': ['core.so']
+      },
       package_dir={
           '': '${CMAKE_CURRENT_SOURCE_DIR}',
           # The paddle.v2.framework.proto will be generated while compiling.

From 267f9a2cdfad6b627eb6094a28cf5db41bc4f1a4 Mon Sep 17 00:00:00 2001
From: Yu Yang
Date: Tue, 11 Jul 2017 04:21:37 -0500
Subject: [PATCH 072/205] Move static variable defined in .cc (#2782)

* Move static variable defined in .cc

We cannot define static variables in .h, because that causes
multiple-definition errors. Also fix some C++ syntax, like:

* Prefer algorithms to manual for-loops, to make code more
  readable.
* Remove unused `()`.
* Enforce takes a bool; there is no need for `xxx == true`.
* Use a range-based for-loop to iterate over op_desc.attrs.

* Fix a potential static variable init order error
---
 paddle/framework/CMakeLists.txt      |   3 +-
 paddle/framework/op_registry.cc      |  36 +++++++
 paddle/framework/op_registry.h       | 154 +++++++--------------------
 paddle/framework/op_registry_test.cc |  62 +++++++++++
 4 files changed, 136 insertions(+), 119 deletions(-)
 create mode 100644 paddle/framework/op_registry.cc

diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt
index aecc97d4a8..0a5edba6ef 100644
--- a/paddle/framework/CMakeLists.txt
+++ b/paddle/framework/CMakeLists.txt
@@ -11,7 +11,8 @@ proto_library(op_proto SRCS op_proto.proto DEPS attr_type)
 cc_test(op_proto_test SRCS op_proto_test.cc DEPS op_proto protobuf)
 proto_library(op_desc SRCS op_desc.proto DEPS attr_type)
 cc_test(op_desc_test SRCS op_desc_test.cc DEPS op_desc protobuf)
-cc_test(op_registry_test SRCS op_registry_test.cc DEPS op_proto op_desc)
+cc_library(op_registry SRCS op_registry.cc DEPS op_proto op_desc)
+cc_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry)
 py_proto_compile(framework_py_proto SRCS attr_type.proto op_proto.proto op_desc.proto)
 # Generate an empty __init__.py to make framework_py_proto as a valid python module.
add_custom_target(framework_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py) diff --git a/paddle/framework/op_registry.cc b/paddle/framework/op_registry.cc new file mode 100644 index 0000000000..bc6a0dda57 --- /dev/null +++ b/paddle/framework/op_registry.cc @@ -0,0 +1,36 @@ +#include + +namespace paddle { +namespace framework { + +template <> +void AttrTypeHelper::SetAttrType(AttrProto* attr) { + attr->set_type(paddle::framework::AttrType::INT); +} + +template <> +void AttrTypeHelper::SetAttrType(AttrProto* attr) { + attr->set_type(paddle::framework::AttrType::FLOAT); +} + +template <> +void AttrTypeHelper::SetAttrType(AttrProto* attr) { + attr->set_type(paddle::framework::AttrType::STRING); +} + +template <> +void AttrTypeHelper::SetAttrType>(AttrProto* attr) { + attr->set_type(paddle::framework::AttrType::INTS); +} + +template <> +void AttrTypeHelper::SetAttrType>(AttrProto* attr) { + attr->set_type(paddle::framework::AttrType::FLOATS); +} + +template <> +void AttrTypeHelper::SetAttrType>(AttrProto* attr) { + attr->set_type(paddle::framework::AttrType::STRINGS); +} +} +} \ No newline at end of file diff --git a/paddle/framework/op_registry.h b/paddle/framework/op_registry.h index 81241b5342..a782834693 100644 --- a/paddle/framework/op_registry.h +++ b/paddle/framework/op_registry.h @@ -3,6 +3,7 @@ #include "paddle/framework/attr_checker.h" //#include "paddle/framework/op_base.h" +#include #include "paddle/framework/op_desc.pb.h" #include "paddle/framework/op_proto.pb.h" @@ -64,36 +65,6 @@ struct AttrTypeHelper { } }; -template <> -void AttrTypeHelper::SetAttrType(AttrProto* attr) { - attr->set_type(paddle::framework::AttrType::INT); -} - -template <> -void AttrTypeHelper::SetAttrType(AttrProto* attr) { - attr->set_type(paddle::framework::AttrType::FLOAT); -} - -template <> -void AttrTypeHelper::SetAttrType(AttrProto* attr) { - attr->set_type(paddle::framework::AttrType::STRING); -} - -template <> -void AttrTypeHelper::SetAttrType>(AttrProto* attr) { - attr->set_type(paddle::framework::AttrType::INTS); -} - -template <> -void AttrTypeHelper::SetAttrType>(AttrProto* attr) { - attr->set_type(paddle::framework::AttrType::FLOATS); -} - -template <> -void AttrTypeHelper::SetAttrType>(AttrProto* attr) { - attr->set_type(paddle::framework::AttrType::STRINGS); -} - // this class not only make proto but also init attribute checkers. 
class OpProtoAndCheckerMaker { public: @@ -103,22 +74,22 @@ class OpProtoAndCheckerMaker { protected: void AddInput(const std::string& name, const std::string& comment) { auto input = proto_->mutable_inputs()->Add(); - *(input->mutable_name()) = name; - *(input->mutable_comment()) = comment; + *input->mutable_name() = name; + *input->mutable_comment() = comment; } void AddOutput(const std::string& name, const std::string& comment) { auto output = proto_->mutable_outputs()->Add(); - *(output->mutable_name()) = name; - *(output->mutable_comment()) = comment; + *output->mutable_name() = name; + *output->mutable_comment() = comment; } template TypedAttrChecker& AddAttr(const std::string& name, const std::string& comment) { auto attr = proto_->mutable_attrs()->Add(); - *(attr->mutable_name()) = name; - *(attr->mutable_comment()) = comment; + *attr->mutable_name() = name; + *attr->mutable_comment() = comment; AttrTypeHelper::SetAttrType(attr); return op_checker_->AddAttrChecker(name); } @@ -134,49 +105,51 @@ class OpProtoAndCheckerMaker { }; class OpRegistry { - typedef std::function OpCreator; + using OpCreator = std::function; public: template static void RegisterOp(const std::string& op_type) { - creators_[op_type] = []() { return new OpType; }; - OpProto& op_proto = protos_[op_type]; - OpAttrChecker& op_checker = op_checkers_[op_type]; + creators()[op_type] = [] { return new OpType; }; + OpProto& op_proto = protos()[op_type]; + OpAttrChecker& op_checker = op_checkers()[op_type]; ProtoMakerType(&op_proto, &op_checker); - PADDLE_ENFORCE(op_proto.IsInitialized() == true, + PADDLE_ENFORCE(op_proto.IsInitialized(), "Fail to initialize %s's OpProto !", op_type); } static OpBase* CreateOp(const OpDesc& op_desc) { std::string op_type = op_desc.type(); - OpBase* op = (creators_.at(op_type))(); - (op->inputs_).resize(op_desc.inputs_size()); - for (int i = 0; i < op_desc.inputs_size(); ++i) { - (op->inputs_)[i] = op_desc.inputs(i); - } - (op->outputs_).resize(op_desc.outputs_size()); - for (int i = 0; i < op_desc.outputs_size(); ++i) { - (op->outputs_)[i] = op_desc.outputs(i); - } - for (int i = 0; i < op_desc.attrs_size(); ++i) { - const AttrDesc& ith_attr = op_desc.attrs(i); - std::string name = ith_attr.name(); - (op->attr_map_)[name] = AttrTypeHelper::GetAttrValue(ith_attr); + OpBase* op = creators().at(op_type)(); + op->inputs_.reserve((size_t)op_desc.inputs_size()); + std::copy(op_desc.inputs().begin(), op_desc.inputs().end(), + std::back_inserter(op->inputs_)); + op->outputs_.reserve((size_t)op_desc.outputs_size()); + std::copy(op_desc.outputs().begin(), op_desc.outputs().end(), + std::back_inserter(op->outputs_)); + for (auto& attr : op_desc.attrs()) { + op->attr_map_[attr.name()] = AttrTypeHelper::GetAttrValue(attr); } - const OpAttrChecker& op_checker = op_checkers_.at(op_type); - op_checker.Check(op->attr_map_); + op_checkers().at(op_type).Check(op->attr_map_); return op; } private: - static std::unordered_map creators_; - static std::unordered_map protos_; - static std::unordered_map op_checkers_; -}; + static std::unordered_map& creators() { + static std::unordered_map creators_; + return creators_; + } -std::unordered_map> OpRegistry::creators_; -std::unordered_map OpRegistry::protos_; -std::unordered_map OpRegistry::op_checkers_; + static std::unordered_map& protos() { + static std::unordered_map protos_; + return protos_; + }; + + static std::unordered_map& op_checkers() { + static std::unordered_map op_checkers_; + return op_checkers_; + }; +}; template class OpRegisterHelper { @@ 
-194,60 +167,5 @@ class OpRegisterHelper { const OpRegisterHelper<__op_class, __op_maker_class> \ __op_class##Register::reg(#__op_type); -// Demos - -class CosineOp : public OpBase { - public: - virtual std::string Run() const { - std::string msg = "CosineOp runs! scale = " + - std::to_string(boost::get(attr_map_.at("scale"))); - return msg; - } -}; - -class CosineOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker { - public: - CosineOpProtoAndCheckerMaker(OpProto* proto, OpAttrChecker* op_checker) - : OpProtoAndCheckerMaker(proto, op_checker) { - AddInput("input", "input of cosine op"); - AddOutput("output", "output of cosine op"); - AddAttr("scale", "scale of cosine op") - .SetDefault(1.0) - .LargerThan(0.0); - AddType("cos"); - AddComment("This is cos op"); - } -}; - -REGISTER_OP(CosineOp, CosineOpProtoAndCheckerMaker, cos_sim) - -class MyTestOp : public OpBase { - public: - virtual std::string Run() const { - std::string msg = - "MyTestOp runs! test_attr = " + - std::to_string(boost::get(attr_map_.at("test_attr"))); - return msg; - } -}; - -class MyTestOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker { - public: - MyTestOpProtoAndCheckerMaker(OpProto* proto, OpAttrChecker* op_checker) - : OpProtoAndCheckerMaker(proto, op_checker) { - AddInput("input", "input of cosine op"); - AddOutput("output", "output of cosine op"); - auto my_checker = [](int i) { - PADDLE_ENFORCE(i % 2 == 0, "'test_attr' must be even!"); - }; - AddAttr("test_attr", "a simple test attribute") - .AddCustomChecker(my_checker); - AddType("my_test_op"); - AddComment("This is my_test op"); - } -}; - -REGISTER_OP(MyTestOp, MyTestOpProtoAndCheckerMaker, my_test_op) - } // namespace framework } // namespace paddle diff --git a/paddle/framework/op_registry_test.cc b/paddle/framework/op_registry_test.cc index ae6b738712..a92f1feb47 100644 --- a/paddle/framework/op_registry_test.cc +++ b/paddle/framework/op_registry_test.cc @@ -1,6 +1,63 @@ #include "paddle/framework/op_registry.h" #include +namespace paddle { +namespace framework { +class CosineOp : public OpBase { + public: + virtual std::string Run() const { + std::string msg = "CosineOp runs! scale = " + + std::to_string(boost::get(attr_map_.at("scale"))); + return msg; + } +}; + +class CosineOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker { + public: + CosineOpProtoAndCheckerMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("input", "input of cosine op"); + AddOutput("output", "output of cosine op"); + AddAttr("scale", "scale of cosine op") + .SetDefault(1.0) + .LargerThan(0.0); + AddType("cos"); + AddComment("This is cos op"); + } +}; + +REGISTER_OP(CosineOp, CosineOpProtoAndCheckerMaker, cos_sim) + +class MyTestOp : public OpBase { + public: + virtual std::string Run() const { + std::string msg = + "MyTestOp runs! 
test_attr = " + + std::to_string(boost::get(attr_map_.at("test_attr"))); + return msg; + } +}; + +class MyTestOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker { + public: + MyTestOpProtoAndCheckerMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("input", "input of cosine op"); + AddOutput("output", "output of cosine op"); + auto my_checker = [](int i) { + PADDLE_ENFORCE(i % 2 == 0, "'test_attr' must be even!"); + }; + AddAttr("test_attr", "a simple test attribute") + .AddCustomChecker(my_checker); + AddType("my_test_op"); + AddComment("This is my_test op"); + } +}; + +REGISTER_OP(MyTestOp, MyTestOpProtoAndCheckerMaker, my_test_op) +} // namespace framework +} // namespace paddle + TEST(OpRegistry, CreateOp) { paddle::framework::OpDesc op_desc; op_desc.set_type("cos_sim"); @@ -120,3 +177,8 @@ TEST(OpRegistry, CustomChecker) { ASSERT_EQ(debug_str[i], str[i]); } } + +int main(int argc, char** argv) { + testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} \ No newline at end of file From 69d76812ae3c9e43f46f7a24175c2795ae9034d4 Mon Sep 17 00:00:00 2001 From: qijun Date: Tue, 11 Jul 2017 19:15:48 +0800 Subject: [PATCH 073/205] fix cublas dynload bug --- paddle/platform/dynload/cublas.cc | 4 ++-- paddle/platform/dynload/cublas.h | 31 +++++++++++++++++-------------- 2 files changed, 19 insertions(+), 16 deletions(-) diff --git a/paddle/platform/dynload/cublas.cc b/paddle/platform/dynload/cublas.cc index f83fcf34d7..4e3dfdaefb 100644 --- a/paddle/platform/dynload/cublas.cc +++ b/paddle/platform/dynload/cublas.cc @@ -6,10 +6,10 @@ namespace dynload { std::once_flag cublas_dso_flag; void *cublas_dso_handle = nullptr; -#define DEFINE_WRAP(__name) DynLoad__##__name __name; +#define DEFINE_WRAP(__name) DynLoad__##__name __name CUBLAS_BLAS_ROUTINE_EACH(DEFINE_WRAP); } // namespace dynload } // namespace platform -} // namespace paddle \ No newline at end of file +} // namespace paddle diff --git a/paddle/platform/dynload/cublas.h b/paddle/platform/dynload/cublas.h index 1332be31b1..47c7a8ec21 100644 --- a/paddle/platform/dynload/cublas.h +++ b/paddle/platform/dynload/cublas.h @@ -58,26 +58,29 @@ extern void *cublas_dso_handle; extern DynLoad__##__name __name #endif +#define DECLARE_DYNAMIC_LOAD_CUBLAS_V2_WRAP(__name) \ + DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(__name) + #define CUBLAS_BLAS_ROUTINE_EACH(__macro) \ __macro(cublasSgemv); \ __macro(cublasDgemv); \ __macro(cublasSgemm); \ __macro(cublasDgemm); \ __macro(cublasSgeam); \ - __macro(cublasDgeam); \ - __macro(cublasCreate); \ - __macro(cublasDestroy); \ - __macro(cublasSetStream); \ - __macro(cublasSetPointerMode); \ - __macro(cublasGetPointerMode); \ - __macro(cublasSgemmBatched); \ - __macro(cublasDgemmBatched); \ - __macro(cublasCgemmBatched); \ - __macro(cublasZgemmBatched); \ - __macro(cublasSgetrfBatched); \ - __macro(cublasSgetriBatched); \ - __macro(cublasDgetrfBatched); \ - __macro(cublasDgetriBatched) + __macro(cublasDgeam); + +DECLARE_DYNAMIC_LOAD_CUBLAS_V2_WRAP(cublasCreate); +DECLARE_DYNAMIC_LOAD_CUBLAS_V2_WRAP(cublasDestroy); +DECLARE_DYNAMIC_LOAD_CUBLAS_V2_WRAP(cublasSetStream); +DECLARE_DYNAMIC_LOAD_CUBLAS_V2_WRAP(cublasSetPointerMode); +DECLARE_DYNAMIC_LOAD_CUBLAS_V2_WRAP(cublasGetPointerMode); +DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasSgemmBatched); +DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasDgemmBatched); +DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasCgemmBatched); +DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasZgemmBatched); 
+DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasSgetrfBatched);
+DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasSgetriBatched);
+DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasDgetrfBatched);

CUBLAS_BLAS_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP);

From ca39515e24be50931000be632134dce2e4a23d3f Mon Sep 17 00:00:00 2001
From: fengjiayi
Date: Tue, 11 Jul 2017 18:52:09 +0800
Subject: [PATCH 074/205] Add several interfaces for Tensor class

1. Add member variable 'DDim dims_' and a getter function 'dims()'.
'dims_' is supposed to hold the tensor's shape during Op::InferShape.
2. Remove the 'mutable_data' overload that uses the default Place. Users
must specify an explicit Place when calling 'mutable_data'.
3. A PlaceHolder may be shared by more than one tensor, and some of them
may be slices of the others. So we add a new member variable 'offset_' to
Tensor, which records the byte offset between PlaceHolder::ptr_
and where the tensor's data really begins.
4. Add functions 'ShareDataFrom' and 'Slice' for Tensor.
TODO: Tensor needs a 'CopyFrom' function.
---
 paddle/framework/tensor.h | 57 ++++++++++++++++++++++++++++++++-------
 1 file changed, 48 insertions(+), 9 deletions(-)

diff --git a/paddle/framework/tensor.h b/paddle/framework/tensor.h
index ce5d98b04e..d40edb190c 100644
--- a/paddle/framework/tensor.h
+++ b/paddle/framework/tensor.h
@@ -14,6 +14,7 @@ limitations under the License. */

 #pragma once

+#include 
 #include 
 #include 
 #include "paddle/framework/ddim.h"
@@ -26,31 +27,65 @@ namespace framework {

 class Tensor {
  public:
+  Tensor() : offset_(0) {}
+
+  Tensor(const DDim& dims) : dims_(dims), offset_(0) {}
+
   template 
   const T* data() const {
-    PADDLE_ENFORCE(holder_ != nullptr,
-                   "Tensor::data must be called after Tensor::mutable_data.");
-    return static_cast(holder_->Ptr());
+    PADDLE_ENFORCE(
+        holder_ != nullptr,
+        "Tenosr has not been initialized. 
Call Tensor::mutable_data first.");
+    return reinterpret_cast(
+        reinterpret_cast(holder_->Ptr()) + offset_);
   }

   template ::value>::type* = nullptr>
   T* mutable_data(DDim dims, paddle::platform::Place place) {
+    dims_ = dims;
     if (holder_ == nullptr ||
         !(holder_->Place() ==
           place) /* some versions of boost::variant don't have operator!= */
-        || holder_->Size() < product(dims) * sizeof(T)) {
+        || holder_->Size() < product(dims) * sizeof(T) + offset_) {
       holder_.reset(new PlaceholderImpl(place, product(dims) * sizeof(T)));
+      offset_ = 0;
     }
-    return static_cast(holder_->Ptr());
+    return reinterpret_cast(reinterpret_cast(holder_->Ptr()) +
+                               offset_);
   }

-  template ::value>::type* = nullptr>
-  T* mutable_data(DDim dims) {
-    return mutable_data(dims, paddle::platform::get_place());
+  void ShareDataFrom(const Tensor& src) {
+    PADDLE_ENFORCE(src.holder_ != nullptr,
+                   "Tenosr 'src' has not been initialized.");
+    holder_ = src.holder_;
+    dims_ = src.dims_;
+    offset_ = src.offset_;
   }

+  Tensor Slice(const int& begin_idx, const int& end_idx) {
+    PADDLE_ENFORCE(holder_ != nullptr,
+                   "The sliced tenosr has not been initialized.");
+    PADDLE_ENFORCE(begin_idx >= 0 && end_idx <= dims_[0],
+                   "Slice index is less than zero or out of bound.");
+    PADDLE_ENFORCE(begin_idx < end_idx,
+                   "Begin index must be less than end index.");
+    PADDLE_ENFORCE(dims_[0] != 1, "Can not slice a tensor with dims_[0] = 1.");
+    std::vector d = vectorize(dims_);
+    int base = 1;
+    for (size_t i = 1; i < d.size(); ++i) {
+      base *= d[i];
+    }
+    Tensor dst;
+    dst.holder_ = holder_;
+    dst.dims_ = dims_;
+    dst.dims_[0] = end_idx - begin_idx;
+    dst.offset_ = offset_ + begin_idx * base * holder_->TypeSize();
+    return dst;
+  }
+
+  DDim dims() const { return dims_; }
+
  private:
   // Placeholder hides type T, so it doesn't appear as a template
   // parameter of Variable.
@@ -59,6 +94,7 @@ class Tensor {
     virtual void* Ptr() const = 0;
     virtual paddle::platform::Place Place() const = 0;
     virtual size_t Size() const = 0;
+    virtual size_t TypeSize() const = 0;
   };

   template 
@@ -85,6 +121,7 @@ class Tensor {
     virtual void* Ptr() const { return static_cast(ptr_.get()); }
     virtual size_t Size() const { return size_; }
     virtual paddle::platform::Place Place() const { return place_; }
+    virtual size_t TypeSize() const { return sizeof(T); }

     std::unique_ptr ptr_;
     paddle::platform::Place place_;  // record the place of ptr_.
@@ -92,6 +129,8 @@ class Tensor {
   };

   std::shared_ptr holder_;  // holds the memory block if allocated.
+  DDim dims_;
+  size_t offset_;  // marks the begin of tensor data area.
 };

 }  // namespace framework

From d027f47d7d93b1bdbf7b91090f362fdd879c7120 Mon Sep 17 00:00:00 2001
From: Yu Yang
Date: Tue, 11 Jul 2017 20:22:18 +0800
Subject: [PATCH 075/205] Default scope function

`Paddle` manages Scope as a programming language's scope. It is just a
thread-local stack of Scopes. The top of that stack is the current scope,
and the bottom of that stack is the parent of all scopes.

Invoking `create_var`/`get_var` can create/get a variable in the current
scope. Invoking `enter_local_scope`/`leave_local_scope` can create or
destroy a local scope.

A `scoped_function` takes a `function` as input. That function will be
invoked in a new local scope.
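A quick usage sketch (not part of this patch's diff; it mirrors the new
unit tests, the names `w`, `tmp`, and `body` are made up for the example,
and it assumes the compiled `paddle.v2.framework.core` module is on the
Python path):

    from paddle.v2.framework.default_scope_funcs import (
        create_var, get_var, scoped_function)

    create_var("w")  # created in the current (outermost) scope

    def body():
        create_var("tmp")                # exists only in the new local scope
        assert get_var("w") is not None  # outer variables remain visible

    scoped_function(body)          # runs `body` inside a fresh local scope
    assert get_var("tmp") is None  # the local scope was destroyed on exit
    assert get_var("w") is not None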
--- .../v2/framework/default_scope_funcs.py | 83 +++++++++++++++++++ .../paddle/v2/framework/tests/CMakeLists.txt | 3 +- .../tests/test_default_scope_funcs.py | 33 ++++++++ 3 files changed, 118 insertions(+), 1 deletion(-) create mode 100644 python/paddle/v2/framework/default_scope_funcs.py create mode 100644 python/paddle/v2/framework/tests/test_default_scope_funcs.py diff --git a/python/paddle/v2/framework/default_scope_funcs.py b/python/paddle/v2/framework/default_scope_funcs.py new file mode 100644 index 0000000000..4e772326c9 --- /dev/null +++ b/python/paddle/v2/framework/default_scope_funcs.py @@ -0,0 +1,83 @@ +""" +Default scope function. + +`Paddle` manages Scope as programming language's scope. It just a +thread-local stack of Scope. Top of that stack is current scope, the bottom +of that stack is all scopes' parent. + +Invoking `create_var/get_var` can `create/get` variable in current scope. +Invoking `enter_local_scope/leave_local_scope` can create or destroy local +scope. + +A `scoped_function` will take a `function` as input. That function will be +invoked in a new local scope. +""" + +import paddle.v2.framework.core +import threading + +__tl_scope__ = threading.local() + +__all__ = [ + 'get_cur_scope', 'enter_local_scope', 'leave_local_scope', 'create_var', + 'get_var', 'scoped_function' +] + + +def get_cur_scope(): + """ + Get current scope. + :rtype: paddle.v2.framework.core.Scope + """ + cur_scope_stack = getattr(__tl_scope__, 'cur_scope', None) + if cur_scope_stack is None: + __tl_scope__.cur_scope = list() + if len(__tl_scope__.cur_scope) == 0: + __tl_scope__.cur_scope.append(paddle.v2.framework.core.Scope(None)) + return __tl_scope__.cur_scope[-1] + + +def enter_local_scope(): + """ + Enter a new local scope + """ + cur_scope = get_cur_scope() + new_scope = paddle.v2.framework.core.Scope(cur_scope) + __tl_scope__.cur_scope.append(new_scope) + + +def leave_local_scope(): + """ + Leave local scope + """ + __tl_scope__.cur_scope.pop() + + +def create_var(name): + """ + create variable in current scope. + """ + return get_cur_scope().create_var(name) + + +def get_var(name): + """ + get variable in current scope. + """ + return get_cur_scope().get_var(name) + + +def scoped_function(func): + """ + invoke `func` in new scope. + + :param func: a callable function that will be run in new scope. 
+ :type func: callable + """ + enter_local_scope() + try: + func() + except: + raise + finally: + leave_local_scope() diff --git a/python/paddle/v2/framework/tests/CMakeLists.txt b/python/paddle/v2/framework/tests/CMakeLists.txt index d809917af1..7023e82b5f 100644 --- a/python/paddle/v2/framework/tests/CMakeLists.txt +++ b/python/paddle/v2/framework/tests/CMakeLists.txt @@ -1 +1,2 @@ -add_python_test(test_framework test_protobuf.py test_scope.py) +add_python_test(test_framework test_protobuf.py test_scope.py + test_default_scope_funcs.py) diff --git a/python/paddle/v2/framework/tests/test_default_scope_funcs.py b/python/paddle/v2/framework/tests/test_default_scope_funcs.py new file mode 100644 index 0000000000..81033deb15 --- /dev/null +++ b/python/paddle/v2/framework/tests/test_default_scope_funcs.py @@ -0,0 +1,33 @@ +from paddle.v2.framework.default_scope_funcs import * +import unittest + + +class TestDefaultScopeFuncs(unittest.TestCase): + def test_cur_scope(self): + self.assertIsNotNone(get_cur_scope()) + + def test_none_variable(self): + self.assertIsNone(get_var("test")) + + def test_create_var_get_var(self): + var_a = create_var("var_a") + self.assertIsNotNone(var_a) + self.assertIsNotNone(get_cur_scope().get_var('var_a')) + enter_local_scope() + self.assertIsNotNone(get_cur_scope().get_var('var_a')) + leave_local_scope() + + def test_var_get_int(self): + def __new_scope__(): + i = create_var("var_i") + self.assertFalse(i.is_int()) + i.set_int(10) + self.assertTrue(i.is_int()) + self.assertEqual(10, i.get_int()) + + for _ in xrange(10): + scoped_function(__new_scope__) + + +if __name__ == '__main__': + unittest.main() From 0665dc9755bdae807ac4e970edc774952236ab98 Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Tue, 11 Jul 2017 20:31:11 +0800 Subject: [PATCH 076/205] add more test --- paddle/framework/tensor.h | 2 +- paddle/framework/tensor_test.cc | 140 +++++++++++++++++++++++++++----- 2 files changed, 119 insertions(+), 23 deletions(-) diff --git a/paddle/framework/tensor.h b/paddle/framework/tensor.h index d40edb190c..1b98e17379 100644 --- a/paddle/framework/tensor.h +++ b/paddle/framework/tensor.h @@ -57,7 +57,7 @@ class Tensor { void ShareDataFrom(const Tensor& src) { PADDLE_ENFORCE(src.holder_ != nullptr, - "Tenosr 'src' has not been initialized."); + "Can not share data from an uninitialized tensor."); holder_ = src.holder_; dims_ = src.dims_; offset_ = src.offset_; diff --git a/paddle/framework/tensor_test.cc b/paddle/framework/tensor_test.cc index 727d81f8d7..f4822838cf 100644 --- a/paddle/framework/tensor_test.cc +++ b/paddle/framework/tensor_test.cc @@ -15,15 +15,27 @@ #include #include -TEST(Tensor, ASSERT) { - paddle::framework::Tensor cpu_tensor; +TEST(Tensor, Dims) { + using namespace paddle::framework; + using namespace paddle::platform; + Tensor tt(make_ddim({2, 3, 4})); + DDim dims = tt.dims(); + ASSERT_EQ(arity(dims), 3); + for (int i = 0; i < 3; ++i) { + EXPECT_EQ(i + 2, dims[i]); + } +} + +TEST(Tensor, DataAssert) { + paddle::framework::Tensor src_tensor; bool caught = false; try { - const double* p __attribute__((unused)) = cpu_tensor.data(); + src_tensor.data(); } catch (paddle::framework::EnforceNotMet err) { caught = true; - std::string msg = "Tensor::data must be called after Tensor::mutable_data."; + std::string msg = + "Tenosr has not been initialized. 
Call Tensor::mutable_data first."; const char* what = err.what(); for (size_t i = 0; i < msg.length(); ++i) { ASSERT_EQ(what[i], msg[i]); @@ -32,54 +44,138 @@ TEST(Tensor, ASSERT) { ASSERT_TRUE(caught); } -/* mutable_data() is not tested at present +/* following tests are not available at present because Memory::Alloc() and Memory::Free() have not been ready. TEST(Tensor, MutableData) { using namespace paddle::framework; using namespace paddle::platform; { - Tensor cpu_tensor; + Tensor src_tensor; float* p1 = nullptr; float* p2 = nullptr; // initialization - p1 = cpu_tensor.mutable_data(make_ddim({1, 2, 3}), CPUPlace()); + p1 = src_tensor.mutable_data(make_ddim({1, 2, 3}), CPUPlace()); EXPECT_NE(p1, nullptr); - // set cpu_tensor a new dim with large size + // set src_tensor a new dim with large size // momery is supposed to be re-allocated - p2 = cpu_tensor.mutable_data(make_ddim({3, 4})); + p2 = src_tensor.mutable_data(make_ddim({3, 4}), CPUPlace()); EXPECT_NE(p2, nullptr); EXPECT_NE(p1, p2); - // set cpu_tensor a new dim with same size + // set src_tensor a new dim with same size // momery block is supposed to be unchanged - p1 = cpu_tensor.mutable_data(make_ddim({2, 2, 3})); + p1 = src_tensor.mutable_data(make_ddim({2, 2, 3}), CPUPlace()); EXPECT_EQ(p1, p2); - // set cpu_tensor a new dim with smaller size + // set src_tensor a new dim with smaller size // momery block is supposed to be unchanged - p2 = cpu_tensor.mutable_data(make_ddim({2, 2})); + p2 = src_tensor.mutable_data(make_ddim({2, 2}), CPUPlace()); EXPECT_EQ(p1, p2); } { - Tensor gpu_tensor; + Tensor src_tensor; float* p1 = nullptr; float* p2 = nullptr; // initialization - p1 = gpu_tensor.mutable_data(make_ddim({1, 2, 3}), GPUPlace()); + p1 = src_tensor.mutable_data(make_ddim({1, 2, 3}), GPUPlace()); EXPECT_NE(p1, nullptr); - // set gpu_tensor a new dim with large size + // set src_tensor a new dim with large size // momery is supposed to be re-allocated - p2 = gpu_tensor.mutable_data(make_ddim({3, 4})); + p2 = src_tensor.mutable_data(make_ddim({3, 4}), GPUPlace()); EXPECT_NE(p2, nullptr); EXPECT_NE(p1, p2); - // set gpu_tensor a new dim with same size + // set src_tensor a new dim with same size // momery block is supposed to be unchanged - p1 = gpu_tensor.mutable_data(make_ddim({2, 2, 3})); + p1 = src_tensor.mutable_data(make_ddim({2, 2, 3}), GPUPlace()); EXPECT_EQ(p1, p2); - // set gpu_tensor a new dim with smaller size + // set src_tensor a new dim with smaller size // momery block is supposed to be unchanged - p2 = gpu_tensor.mutable_data(make_ddim({2, 2})); + p2 = src_tensor.mutable_data(make_ddim({2, 2}), GPUPlace()); EXPECT_EQ(p1, p2); } } -*/ + +TEST(Tensor, ShareDataFrom) { + using namespace paddle::framework; + using namespace paddle::platform; + { + Tensor src_tensor; + Tensor dst_tensor; + // Try to share data form uninitialized tensor + bool caught = false; + try { + dst_tensor.ShareDataFrom(src_tensor); + } catch (EnforceNotMet err) { + caught = true; + std::string msg = "Can not share data from an uninitialized tensor."; + const char* what = err.what(); + for (size_t i = 0; i < msg.length(); ++i) { + ASSERT_EQ(what[i], msg[i]); + } + } + ASSERT_TRUE(caught); + + src_tensor.mutable_data(make_ddim({2, 3, 4}), CPUPlace()); + dst_tensor.ShareDataFrom(src_tensor); + ASSERT_EQ(src_tensor.data(), dst_tensor.data()); + } + + { + Tensor src_tensor; + Tensor dst_tensor; + src_tensor.mutable_data(make_ddim({2, 3, 4}), GPUPlace()); + dst_tensor.ShareDataFrom(src_tensor); + ASSERT_EQ(src_tensor.data(), dst_tensor.data()); + 
} +} + +TEST(Tensor, Slice) { + using namespace paddle::framework; + using namespace paddle::platform; + { + Tensor src_tensor; + src_tensor.mutable_data(make_ddim({5, 3, 4}), CPUPlace()); + Tensor slice_tensor = src_tensor.Slice(1, 3); + DDim slice_dims = slice_tensor.dims(); + ASSERT_EQ(arity(slice_dims), 3); + EXPECT_EQ(slice_dims[0], 2); + EXPECT_EQ(slice_dims[1], 3); + EXPECT_EQ(slice_dims[2], 4); + + uintptr_t src_data_address = + reinterpret_cast(src_tensor.data()); + uintptr_t src_mutable_data_address = reinterpret_cast( + src_tensor.mutable_data(src_tensor.dims(), CPUPlace())); + uintptr_t slice_data_address = + reinterpret_cast(slice_tensor.data()); + uintptr_t slice_mutable_data_address = reinterpret_cast( + slice_tensor.mutable_data(slice_tensor.dims(), CPUPlace())); + EXPECT_EQ(src_data_address, src_mutable_data_address); + EXPECT_EQ(slice_data_address, slice_mutable_data_address); + EXPECT_EQ(src_data_address + 3 * 4 * 1 * sizeof(int), slice_data_address); + } + + { + Tensor src_tensor; + src_tensor.mutable_data(make_ddim({6, 9}), GPUPlace()); + Tensor slice_tensor = src_tensor.Slice(2, 6); + DDim slice_dims = slice_tensor.dims(); + ASSERT_EQ(arity(slice_dims), 2); + EXPECT_EQ(slice_dims[0], 4); + EXPECT_EQ(slice_dims[1], 9); + + uintptr_t src_data_address = + reinterpret_cast(src_tensor.data()); + uintptr_t src_mutable_data_address = reinterpret_cast( + src_tensor.mutable_data(src_tensor.dims(), GPUPlace())); + uintptr_t slice_data_address = + reinterpret_cast(slice_tensor.data()); + uintptr_t slice_mutable_data_address = reinterpret_cast( + slice_tensor.mutable_data(slice_tensor.dims(), GPUPlace())); + EXPECT_EQ(src_data_address, src_mutable_data_address); + EXPECT_EQ(slice_data_address, slice_mutable_data_address); + EXPECT_EQ(src_data_address + 9 * 2 * sizeof(double), slice_data_address); + } +} + +*/ \ No newline at end of file From 043764e7701fbc603a36d1695d2bde39222b82fc Mon Sep 17 00:00:00 2001 From: zhanghaichao Date: Tue, 11 Jul 2017 12:28:26 -0700 Subject: [PATCH 077/205] Error fix in doc on Write New Layers --- doc/howto/dev/new_layer_cn.rst | 2 +- doc/howto/dev/new_layer_en.rst | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/doc/howto/dev/new_layer_cn.rst b/doc/howto/dev/new_layer_cn.rst index 9489a921c7..75037e693b 100644 --- a/doc/howto/dev/new_layer_cn.rst +++ b/doc/howto/dev/new_layer_cn.rst @@ -37,7 +37,7 @@ \frac{\partial c(y)}{\partial x} = \frac{\partial c(y)}{\partial y} \frac{\partial y}{\partial x} -假设 :math:`z = f(W^T x + b)` ,那么 +假设 :math:`z = W^T x + b` ,那么 .. math:: diff --git a/doc/howto/dev/new_layer_en.rst b/doc/howto/dev/new_layer_en.rst index 46481f5ead..110a9fb38f 100644 --- a/doc/howto/dev/new_layer_en.rst +++ b/doc/howto/dev/new_layer_en.rst @@ -29,7 +29,7 @@ Fully connected layer takes a dense input vector with dimension :math:`D_i`. It where :math:`f(.)` is an nonlinear *activation* function, such as sigmoid, tanh, and Relu. -The transformation matrix :math:`W` and bias vector :math:`b` are the *parameters* of the layer. The *parameters* of a layer are learned during training in the *backward pass*. The backward pass computes the gradients of the output function with respect to all parameters and inputs. The optimizer can use chain rule to compute the gradients of the loss function with respect to each parameter. +The transformation matrix :math:`W` and bias vector :math:`b` are the *parameters* of the layer. The *parameters* of a layer are learned during training in the *backward pass*. 
The backward pass computes the gradients of the output function with respect to all parameters and inputs. The optimizer can use chain rule to compute the gradients of the loss function with respect to each parameter.

Suppose our loss function is :math:`c(y)`, then

@@ -37,7 +37,7 @@ Suppose our loss function is :math:`c(y)`, then

     \frac{\partial c(y)}{\partial x} = \frac{\partial c(y)}{\partial y} \frac{\partial y}{\partial x}

-Suppose :math:`z = f(W^T x + b)`, then
+Suppose :math:`z = W^T x + b`, then

 .. math::

@@ -48,7 +48,7 @@ This derivative can be automatically computed by our base layer class.

 Then, for fully connected layer, we need to compute:

 .. math::
-
+
     \frac{\partial z}{\partial x} = W, \frac{\partial z_j}{\partial W_{ij}} = x_i, \frac{\partial z}{\partial b} = \mathbf 1

 where :math:`\mathbf 1` is an all one vector, :math:`W_{ij}` is the number at the i-th row and j-th column of the matrix :math:`W`, :math:`z_j` is the j-th component of the vector :math:`z`, and :math:`x_i` is the i-th component of the vector :math:`x`.

@@ -322,7 +322,7 @@ All the gradient check unit tests are located in :code:`paddle/gserver/tests/tes
               /* weight */ true);
       }
 }
-
+
 If you are creating a new file for the test, such as :code:`paddle/gserver/tests/testFCGrad.cpp`, you need to add the file to :code:`paddle/gserver/tests/CMakeLists.txt`. An example is given below. All the unit tests will run when you execute the command :code:`make tests`. Notice that some layers might need high accuracy for the gradient check unit tests to work well. You need to configure :code:`WITH_DOUBLE` to `ON` when configuring cmake.

 .. code-block:: bash

From a2e5f652d3fa558d580cac410c4c8aba03d6e188 Mon Sep 17 00:00:00 2001
From: Qiao Longfei
Date: Wed, 12 Jul 2017 06:36:20 +0800
Subject: [PATCH 078/205] add operator base (#2725)

Add OperatorBase.

issue: https://github.com/PaddlePaddle/Paddle/issues/2790

Paddle designs the Operator with Kernel. OperatorBase has no type or
device information at creation time; one operator can have multiple
kernels, and the operator will choose a kernel to run according to the
context. The kernel should be bound to the Operator before or while the
Operator runs.
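The create-by-type-name pattern this message describes is easier to see in isolation. Below is a minimal, self-contained C++ sketch of what OpRegistry::CreateOp does in the diff that follows; every name in it (the registry map, OperatorBase, CosOp) is a simplified stand-in for illustration, not Paddle's real class:

```cpp
#include <functional>
#include <iostream>
#include <map>
#include <memory>
#include <string>

// Simplified stand-in for the real OperatorBase: no type or device
// information at construction, just a Run() entry point.
struct OperatorBase {
  virtual void Run() const = 0;
  virtual ~OperatorBase() = default;
};

using Creator = std::function<std::unique_ptr<OperatorBase>()>;

// Stand-in for OpRegistry's creator map, keyed by the op type name.
std::map<std::string, Creator>& Registry() {
  static std::map<std::string, Creator> registry;
  return registry;
}

struct CosOp : OperatorBase {
  void Run() const override { std::cout << "cos_sim runs\n"; }
};

int main() {
  Registry()["cos_sim"] = [] { return std::make_unique<CosOp>(); };
  auto op = Registry().at("cos_sim")();  // like OpRegistry::CreateOp(op_desc)
  op->Run();
  return 0;
}
```

The point of the indirection is that callers never `new` a concrete operator; they hand a type name (here a plain string, in Paddle an OpDesc proto) to the registry and get back an OperatorBase pointer whose kernel can still be chosen later.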
--- paddle/CMakeLists.txt | 1 + paddle/framework/CMakeLists.txt | 4 +- paddle/framework/op_registry.cc | 4 +- paddle/framework/op_registry.h | 28 ++----- paddle/framework/op_registry_test.cc | 72 +++++++++--------- paddle/framework/operator.cc | 51 +++++++++++++ paddle/framework/operator.h | 107 +++++++++++++++++++++++++++ paddle/framework/operator_test.cc | 80 ++++++++++++++++++++ paddle/operators/.clang-format | 5 ++ paddle/operators/CMakeLists.txt | 0 paddle/operators/demo_op.h | 59 +++++++++++++++ 11 files changed, 351 insertions(+), 60 deletions(-) create mode 100644 paddle/framework/operator.cc create mode 100644 paddle/framework/operator.h create mode 100644 paddle/framework/operator_test.cc create mode 100644 paddle/operators/.clang-format create mode 100644 paddle/operators/CMakeLists.txt create mode 100644 paddle/operators/demo_op.h diff --git a/paddle/CMakeLists.txt b/paddle/CMakeLists.txt index 58a35564f8..2c1eb7521d 100644 --- a/paddle/CMakeLists.txt +++ b/paddle/CMakeLists.txt @@ -15,6 +15,7 @@ if(Boost_FOUND) add_subdirectory(memory) add_subdirectory(platform) add_subdirectory(framework) + add_subdirectory(operators) add_subdirectory(pybind) endif() diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt index 0a5edba6ef..aac49fdb7a 100644 --- a/paddle/framework/CMakeLists.txt +++ b/paddle/framework/CMakeLists.txt @@ -11,8 +11,10 @@ proto_library(op_proto SRCS op_proto.proto DEPS attr_type) cc_test(op_proto_test SRCS op_proto_test.cc DEPS op_proto protobuf) proto_library(op_desc SRCS op_desc.proto DEPS attr_type) cc_test(op_desc_test SRCS op_desc_test.cc DEPS op_desc protobuf) +cc_library(operator SRCS operator.cc DEPS op_desc protobuf) +cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry) cc_library(op_registry SRCS op_registry.cc DEPS op_proto op_desc) -cc_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry) +cc_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry operator) py_proto_compile(framework_py_proto SRCS attr_type.proto op_proto.proto op_desc.proto) # Generate an empty __init__.py to make framework_py_proto as a valid python module. 
add_custom_target(framework_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py) diff --git a/paddle/framework/op_registry.cc b/paddle/framework/op_registry.cc index bc6a0dda57..4b35e04e68 100644 --- a/paddle/framework/op_registry.cc +++ b/paddle/framework/op_registry.cc @@ -32,5 +32,5 @@ template <> void AttrTypeHelper::SetAttrType>(AttrProto* attr) { attr->set_type(paddle::framework::AttrType::STRINGS); } -} -} \ No newline at end of file +} // namespace framework +} // namespace paddle \ No newline at end of file diff --git a/paddle/framework/op_registry.h b/paddle/framework/op_registry.h index a782834693..02c99d50bb 100644 --- a/paddle/framework/op_registry.h +++ b/paddle/framework/op_registry.h @@ -1,27 +1,14 @@ #pragma once -#include "paddle/framework/attr_checker.h" - -//#include "paddle/framework/op_base.h" #include +#include "paddle/framework/attr_checker.h" #include "paddle/framework/op_desc.pb.h" #include "paddle/framework/op_proto.pb.h" +#include "paddle/framework/operator.h" namespace paddle { namespace framework { -//==================For test================// -class OpBase { - public: - std::vector inputs_; - std::vector outputs_; - AttributeMap attr_map_; - - virtual std::string Run() const = 0; - virtual ~OpBase() {} -}; -//=========================================// - // helper class to set attribute type struct AttrTypeHelper { template @@ -105,7 +92,7 @@ class OpProtoAndCheckerMaker { }; class OpRegistry { - using OpCreator = std::function; + using OpCreator = std::function; public: template @@ -118,9 +105,10 @@ class OpRegistry { "Fail to initialize %s's OpProto !", op_type); } - static OpBase* CreateOp(const OpDesc& op_desc) { + static OperatorBase* CreateOp(const OpDesc& op_desc) { std::string op_type = op_desc.type(); - OpBase* op = creators().at(op_type)(); + OperatorBase* op = creators().at(op_type)(); + op->desc_ = op_desc; op->inputs_.reserve((size_t)op_desc.inputs_size()); std::copy(op_desc.inputs().begin(), op_desc.inputs().end(), std::back_inserter(op->inputs_)); @@ -128,9 +116,9 @@ class OpRegistry { std::copy(op_desc.outputs().begin(), op_desc.outputs().end(), std::back_inserter(op->outputs_)); for (auto& attr : op_desc.attrs()) { - op->attr_map_[attr.name()] = AttrTypeHelper::GetAttrValue(attr); + op->attrs_[attr.name()] = AttrTypeHelper::GetAttrValue(attr); } - op_checkers().at(op_type).Check(op->attr_map_); + op_checkers().at(op_type).Check(op->attrs_); return op; } diff --git a/paddle/framework/op_registry_test.cc b/paddle/framework/op_registry_test.cc index a92f1feb47..c4baafc2ae 100644 --- a/paddle/framework/op_registry_test.cc +++ b/paddle/framework/op_registry_test.cc @@ -1,14 +1,16 @@ #include "paddle/framework/op_registry.h" #include +#include "paddle/framework/operator.h" +#include "paddle/operators/demo_op.h" + +using namespace paddle::framework; namespace paddle { namespace framework { -class CosineOp : public OpBase { +class CosineOp : public OperatorWithKernel { public: - virtual std::string Run() const { - std::string msg = "CosineOp runs! scale = " + - std::to_string(boost::get(attr_map_.at("scale"))); - return msg; + void Run(const OpRunContext* context) const override { + printf("%s\n", DebugString().c_str()); } }; @@ -28,13 +30,11 @@ class CosineOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker { REGISTER_OP(CosineOp, CosineOpProtoAndCheckerMaker, cos_sim) -class MyTestOp : public OpBase { +class MyTestOp : public OperatorWithKernel { public: - virtual std::string Run() const { - std::string msg = - "MyTestOp runs! 
test_attr = " + - std::to_string(boost::get(attr_map_.at("test_attr"))); - return msg; + void Run(const OpRunContext* ctx) const override { + printf("%s\n", DebugString().c_str()); + printf("test_attr = %d\n", ctx->op_->GetAttr("test_attr")); } }; @@ -64,19 +64,19 @@ TEST(OpRegistry, CreateOp) { op_desc.add_inputs("aa"); op_desc.add_outputs("bb"); + float scale = 3.3; auto attr = op_desc.mutable_attrs()->Add(); attr->set_name("scale"); attr->set_type(paddle::framework::AttrType::FLOAT); - attr->set_f(3.3); + attr->set_f(scale); - paddle::framework::OpBase* op = + paddle::framework::OperatorBase* op = paddle::framework::OpRegistry::CreateOp(op_desc); - std::string debug_str = op->Run(); - std::string str = "CosineOp runs! scale = " + std::to_string(3.3); - ASSERT_EQ(str.size(), debug_str.size()); - for (size_t i = 0; i < debug_str.length(); ++i) { - ASSERT_EQ(debug_str[i], str[i]); - } + auto scope = std::make_shared(); + auto dev_ctx = DeviceContext(); + op->Run(scope, &dev_ctx); + float scale_get = op->GetAttr("scale"); + ASSERT_EQ(scale_get, scale); } TEST(OpRegistry, IllegalAttr) { @@ -92,7 +92,7 @@ TEST(OpRegistry, IllegalAttr) { bool caught = false; try { - paddle::framework::OpBase* op __attribute__((unused)) = + paddle::framework::OperatorBase* op __attribute__((unused)) = paddle::framework::OpRegistry::CreateOp(op_desc); } catch (paddle::framework::EnforceNotMet err) { caught = true; @@ -111,15 +111,14 @@ TEST(OpRegistry, DefaultValue) { op_desc.add_inputs("aa"); op_desc.add_outputs("bb"); - paddle::framework::OpBase* op = + ASSERT_TRUE(op_desc.IsInitialized()); + + paddle::framework::OperatorBase* op = paddle::framework::OpRegistry::CreateOp(op_desc); - std::string debug_str = op->Run(); - float default_value = 1.0; - std::string str = "CosineOp runs! scale = " + std::to_string(default_value); - ASSERT_EQ(str.size(), debug_str.size()); - for (size_t i = 0; i < debug_str.length(); ++i) { - ASSERT_EQ(debug_str[i], str[i]); - } + auto scope = std::make_shared(); + auto dev_ctx = DeviceContext(); + op->Run(scope, &dev_ctx); + ASSERT_EQ(op->GetAttr("scale"), 1.0); } TEST(OpRegistry, CustomChecker) { @@ -131,7 +130,7 @@ TEST(OpRegistry, CustomChecker) { // attr 'test_attr' is not set bool caught = false; try { - paddle::framework::OpBase* op __attribute__((unused)) = + paddle::framework::OperatorBase* op __attribute__((unused)) = paddle::framework::OpRegistry::CreateOp(op_desc); } catch (paddle::framework::EnforceNotMet err) { caught = true; @@ -150,7 +149,7 @@ TEST(OpRegistry, CustomChecker) { attr->set_i(3); caught = false; try { - paddle::framework::OpBase* op __attribute__((unused)) = + paddle::framework::OperatorBase* op __attribute__((unused)) = paddle::framework::OpRegistry::CreateOp(op_desc); } catch (paddle::framework::EnforceNotMet err) { caught = true; @@ -168,14 +167,13 @@ TEST(OpRegistry, CustomChecker) { attr->set_name("test_attr"); attr->set_type(paddle::framework::AttrType::INT); attr->set_i(4); - paddle::framework::OpBase* op = + paddle::framework::OperatorBase* op = paddle::framework::OpRegistry::CreateOp(op_desc); - std::string debug_str = op->Run(); - std::string str = "MyTestOp runs! 
test_attr = " + std::to_string(4); - ASSERT_EQ(str.size(), debug_str.size()); - for (size_t i = 0; i < debug_str.length(); ++i) { - ASSERT_EQ(debug_str[i], str[i]); - } + auto dev_ctx = DeviceContext(); + auto scope = std::make_shared(); + op->Run(scope, &dev_ctx); + int test_attr = op->GetAttr("test_attr"); + ASSERT_EQ(test_attr, 4); } int main(int argc, char** argv) { diff --git a/paddle/framework/operator.cc b/paddle/framework/operator.cc new file mode 100644 index 0000000000..3db3706e47 --- /dev/null +++ b/paddle/framework/operator.cc @@ -0,0 +1,51 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/framework/operator.h" + +namespace paddle { +namespace framework { + +std::string OperatorBase::DebugString() const { + std::stringstream ss; + ss << "=================\n"; + ss << "type = " << desc_.type() << "\n"; + ss << "inputs = ["; + for (auto& ipt : inputs_) { + ss << ipt << ", "; + } + ss << "]\n"; + ss << "outputs = ["; + for (auto& opt : outputs_) { + ss << opt << ", "; + } + ss << "]\n"; + ss << "attr_keys = ["; + for (auto& attr : attrs_) { + ss << attr.first << ", "; + } + ss << "]\n"; + return ss.str(); +} + +const Variable* OpRunContext::Input(int index) const { + return scope_->GetVariable(op_->inputs_[index]); +} + +Variable* OpRunContext::Output(int index) const { + return scope_->GetVariable(op_->outputs_[index]); +} + +} // namespace framework +} // namespace paddle \ No newline at end of file diff --git a/paddle/framework/operator.h b/paddle/framework/operator.h new file mode 100644 index 0000000000..6570d58698 --- /dev/null +++ b/paddle/framework/operator.h @@ -0,0 +1,107 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include + +#include "paddle/framework/attr_checker.h" +#include "paddle/framework/op_desc.pb.h" +#include "paddle/framework/scope.h" +#include "paddle/utils/Error.h" + +namespace paddle { +namespace framework { + +class OperatorBase; + +class DeviceContext {}; + +/** + * OpRunContext is the only parameter of Operator's Run function. + * Run will get input/output variables, state such as momentum and + * device resource such as CUDA stream, cublas handle, etc. from + * OpRunContext. User should construct it before run the Operator. 
+ */ +class OpRunContext { + public: + OpRunContext(const OperatorBase* op, const std::shared_ptr scope, + const DeviceContext* device_context) + : op_(op), scope_(scope), device_context_(device_context) {} + + const Variable* Input(int index) const; + Variable* Output(int index) const; + + public: + const OperatorBase* op_; + const std::shared_ptr scope_; + const DeviceContext* device_context_; +}; + +/** + * OperatorBase has the basic element that Net will call to do computation. + * Only CreateOperator from OpRegistry will new Operator directly. User + * should always construct a proto message OpDesc and call + * OpRegistry::CreateOp(op_desc) to get an Operator instance. + */ +class OperatorBase { + public: + virtual ~OperatorBase() {} + + template + inline const T& GetAttr(const std::string& name) const { + PADDLE_ENFORCE(attrs_.count(name) != 0, "%s should be in AttributeMap", + name); + return boost::get(attrs_.at(name)); + } + + std::string DebugString() const; + + /// InferShape infer the size of Variables used by this Operator with + /// information inside scope + virtual void InferShape(const std::shared_ptr& scope) const = 0; + + /// Net will call this function to Run an op. + virtual void Run(const std::shared_ptr& scope, + const DeviceContext* dev_ctx) const = 0; + + public: + OpDesc desc_; + std::vector inputs_; + std::vector outputs_; + AttributeMap attrs_; +}; + +class OperatorWithKernel : public OperatorBase { + public: + virtual ~OperatorWithKernel() {} + + virtual void InferShape(const std::shared_ptr& scope) const {} + + void Run(const std::shared_ptr& scope, + const DeviceContext* dev_ctx) const { + OpRunContext op_ctx(this, scope, dev_ctx); + Run(&op_ctx); + } + + /// when implement an Op, your should implement this function. + /// this function should be moved to OpKernel later + virtual void Run(const OpRunContext* context) const = 0; +}; + +} // namespace framework +} // namespace paddle diff --git a/paddle/framework/operator_test.cc b/paddle/framework/operator_test.cc new file mode 100644 index 0000000000..48808dabb2 --- /dev/null +++ b/paddle/framework/operator_test.cc @@ -0,0 +1,80 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/framework/operator.h" +#include "gtest/gtest.h" +#include "paddle/framework/op_registry.h" + +namespace paddle { +namespace framework { + +class OperatorTest : public OperatorWithKernel { + public: + void Run(const OpRunContext* ctx) const override { + float scale = ctx->op_->GetAttr("scale"); + PADDLE_ENFORCE(ctx->Input(0) == nullptr, "Input(0) should not initialized"); + PADDLE_ENFORCE(ctx->Output(0) == nullptr, + "Output(1) should not initialized"); + auto output1 = ctx->scope_->CreateVariable("output1"); + PADDLE_ENFORCE(output1 != nullptr, "should create output1 from scope"); + printf("get attr %s = %f\n", "scale", scale); + printf("%s\n", DebugString().c_str()); + } +}; + +class OperatorTestProtoAndCheckerMaker : public OpProtoAndCheckerMaker { + public: + OperatorTestProtoAndCheckerMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("input", "input of test op"); + AddOutput("output", "output of test op"); + AddAttr("scale", "scale of cosine op") + .SetDefault(1.0) + .LargerThan(0.0); + AddType("test_operator"); + AddComment("This is test op"); + } +}; + +REGISTER_OP(OperatorTest, OperatorTestProtoAndCheckerMaker, test_operator) + +TEST(OperatorBase, DebugString) { + OpDesc op_desc; + op_desc.set_type("test_operator"); + std::vector inputs = {"IN1", "IN2"}; + for (auto& input : inputs) { + op_desc.add_inputs(input); + } + std::vector outputs = {"OUT1", "OUT2"}; + for (auto& output : outputs) { + op_desc.add_outputs(output); + } + auto attr = op_desc.mutable_attrs()->Add(); + attr->set_name("scale"); + attr->set_type(paddle::framework::AttrType::FLOAT); + float scale = 3.14; + attr->set_f(scale); + + DeviceContext device_context; + auto scope = std::make_shared(); + + OperatorBase* op = paddle::framework::OpRegistry::CreateOp(op_desc); + ASSERT_EQ(op->inputs_, inputs); + ASSERT_EQ(op->outputs_, outputs); + ASSERT_EQ(op->GetAttr("scale"), scale); + op->Run(scope, &device_context); +} + +} // namespace framework +} // namespace paddle \ No newline at end of file diff --git a/paddle/operators/.clang-format b/paddle/operators/.clang-format new file mode 100644 index 0000000000..29282dc87e --- /dev/null +++ b/paddle/operators/.clang-format @@ -0,0 +1,5 @@ +--- +Language: Cpp +BasedOnStyle: Google +Standard: Cpp11 +... 
diff --git a/paddle/operators/CMakeLists.txt b/paddle/operators/CMakeLists.txt new file mode 100644 index 0000000000..e69de29bb2 diff --git a/paddle/operators/demo_op.h b/paddle/operators/demo_op.h new file mode 100644 index 0000000000..d0b7420b4e --- /dev/null +++ b/paddle/operators/demo_op.h @@ -0,0 +1,59 @@ +#pragma once + +#include "paddle/framework/op_registry.h" + +using namespace paddle::framework; + +namespace paddle { +namespace operators { + +class CosineOp : public OperatorWithKernel { + public: + void Run(const OpRunContext *context) const override { + printf("%s\n", DebugString().c_str()); + } +}; + +class CosineOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker { + public: + CosineOpProtoAndCheckerMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("input", "input of cosine op"); + AddOutput("output", "output of cosine op"); + AddAttr("scale", "scale of cosine op") + .SetDefault(1.0) + .LargerThan(0.0); + AddType("cos"); + AddComment("This is cos op"); + } +}; + +REGISTER_OP(CosineOp, CosineOpProtoAndCheckerMaker, cos_sim) + +class MyTestOp : public OperatorWithKernel { + public: + void Run(const OpRunContext *context) const override { + printf("%s\n", DebugString().c_str()); + } +}; + +class MyTestOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker { + public: + MyTestOpProtoAndCheckerMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("input", "input of cosine op"); + AddOutput("output", "output of cosine op"); + auto my_checker = [](int i) { + PADDLE_ENFORCE(i % 2 == 0, "'test_attr' must be even!"); + }; + AddAttr("test_attr", "a simple test attribute") + .AddCustomChecker(my_checker); + AddType("my_test_op"); + AddComment("This is my_test op"); + } +}; + +REGISTER_OP(MyTestOp, MyTestOpProtoAndCheckerMaker, my_test_op) + +} // namespace operators +} // namespace operators From c4f301ded74b1f4c0dd1526a76ece9e8e26d2048 Mon Sep 17 00:00:00 2001 From: wanghaoshuang Date: Wed, 12 Jul 2017 10:34:59 +0800 Subject: [PATCH 079/205] Modify comments and fix code format. --- python/paddle/v2/dataset/voc_seg.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/python/paddle/v2/dataset/voc_seg.py b/python/paddle/v2/dataset/voc_seg.py index 9b79f726d2..595ab41cd8 100644 --- a/python/paddle/v2/dataset/voc_seg.py +++ b/python/paddle/v2/dataset/voc_seg.py @@ -13,7 +13,10 @@ # limitations under the License. """ Image dataset for segmentation. -The 2012 dataset contains images from 2008-2011 for which additional segmentations have been prepared. As in previous years the assignment to training/test sets has been maintained. The total number of images with segmentation has been increased from 7,062 to 9,993. +The 2012 dataset contains images from 2008-2011 for which additional +segmentations have been prepared. As in previous years the assignment +to training/test sets has been maintained. The total number of images +with segmentation has been increased from 7,062 to 9,993. 
""" import tarfile @@ -23,7 +26,9 @@ from paddle.v2.image import * __all__ = ['train', 'test', 'val'] -VOC_URL = 'http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCtrainval_11-May-2012.tar' +VOC_URL = 'http://host.robots.ox.ac.uk/pascal/VOC/voc2012/\ + VOCtrainval_11-May-2012.tar' + VOC_MD5 = '6cd6e144f989b92b3379bac3b3de84fd' SET_FILE = 'VOCdevkit/VOC2012/ImageSets/Segmentation/{}.txt' DATA_FILE = 'VOCdevkit/VOC2012/JPEGImages/{}.jpg' @@ -55,20 +60,20 @@ def reader_creator(filename, sub_name): def train(): """ - Create a train dataset reader containing 2913 images. + Create a train dataset reader containing 2913 images in HWC order. """ return reader_creator(download(VOC_URL, 'voc_seg', VOC_MD5), 'trainval') def test(): """ - Create a test dataset reader containing 1464 images. + Create a test dataset reader containing 1464 images in HWC order. """ return reader_creator(download(VOC_URL, 'voc_seg', VOC_MD5), 'train') def val(): """ - Create a val dataset reader containing 1449 images. + Create a val dataset reader containing 1449 images in HWC order. """ return reader_creator(download(VOC_URL, 'voc_seg', VOC_MD5), 'val') From 14d2c3990fe74e063d30f21540019802e1b36194 Mon Sep 17 00:00:00 2001 From: qijun Date: Wed, 12 Jul 2017 10:41:45 +0800 Subject: [PATCH 080/205] split device_context --- paddle/platform/CMakeLists.txt | 10 +- paddle/platform/cuda_device_context.h | 148 +++++++++++++++++++++++++ paddle/platform/device_context.cc | 13 --- paddle/platform/device_context.h | 131 +--------------------- paddle/platform/device_context_test.cc | 2 +- paddle/platform/dynload/CMakeLists.txt | 2 +- 6 files changed, 158 insertions(+), 148 deletions(-) create mode 100644 paddle/platform/cuda_device_context.h delete mode 100644 paddle/platform/device_context.cc diff --git a/paddle/platform/CMakeLists.txt b/paddle/platform/CMakeLists.txt index 7a198aec6c..4e34e8d02c 100644 --- a/paddle/platform/CMakeLists.txt +++ b/paddle/platform/CMakeLists.txt @@ -1,14 +1,8 @@ add_subdirectory(dynload) -nv_test(cuda_test SRCS cuda_test.cu DEPS dyload_cuda) +nv_test(cuda_test SRCS cuda_test.cu DEPS dynload_cuda) cc_library(place SRCS place.cc) cc_test(place_test SRCS place_test.cc DEPS place glog gflags) -IF(WITH_GPU) - set(GPU_CTX_DEPS dyload_cuda dynamic_loader ) -ELSE() - set(GPU_CTX_DEPS) -ENDIF() -cc_library(device_context SRCS device_context.cc DEPS place eigen3 ${GPU_CTX_DEPS}) -nv_test(device_context_test SRCS device_context_test.cc DEPS device_context glog gflags) +nv_test(device_context_test SRCS device_context_test.cc DEPS place eigen3 dynload_cuda) diff --git a/paddle/platform/cuda_device_context.h b/paddle/platform/cuda_device_context.h new file mode 100644 index 0000000000..e0d79631c5 --- /dev/null +++ b/paddle/platform/cuda_device_context.h @@ -0,0 +1,148 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/framework/enforce.h" +#include "paddle/platform/cuda.h" +#include "paddle/platform/dynload/cublas.h" +#include "paddle/platform/dynload/cudnn.h" +#include "paddle/platform/dynload/curand.h" +#define EIGEN_USE_GPU +#include "paddle/platform/place.h" +#include "unsupported/Eigen/CXX11/Tensor" + +namespace paddle { +namespace platform { + +class GPUPlaceGuard { + public: + explicit GPUPlaceGuard(GPUPlace new_place) : previous_(GetCurrentDeviceId()) { + if (previous_ != new_place) { + paddle::platform::SetDeviceId(new_place.device); + } + } + + ~GPUPlaceGuard() { paddle::platform::SetDeviceId(previous_.device); } + + private: + GPUPlace previous_; +}; + +class CUDADeviceContext : public DeviceContext { + public: + explicit CUDADeviceContext(const GPUPlace gpu_place) : gpu_place_(gpu_place) { + GPUPlaceGuard guard(gpu_place_); + paddle::platform::throw_on_error(cudaStreamCreate(&stream_), + "cudaStreamCreate failed"); + eigen_stream_ = new Eigen::CudaStreamDevice(&stream_); + eigen_device_ = new Eigen::GpuDevice(eigen_stream_); + } + + void Wait() { + paddle::platform::throw_on_error(cudaStreamSynchronize(stream_), + "cudaStreamSynchronize failed"); + } + + cudaStream_t stream() { return stream_; } + + Eigen::GpuDevice eigen_device() { return *eigen_device_; } + + cublasHandle_t cublas_handle() { + if (!blas_handle_) { + GPUPlaceGuard guard(gpu_place_); + PADDLE_ENFORCE(paddle::platform::dynload::cublasCreate(&blas_handle_) == + CUBLAS_STATUS_SUCCESS, + "cublasCreate failed"); + PADDLE_ENFORCE(paddle::platform::dynload::cublasSetStream( + blas_handle_, stream_) == CUBLAS_STATUS_SUCCESS, + "cublasSetStream failed"); + } + return blas_handle_; + } + + cudnnHandle_t cudnn_handle() { + if (!dnn_handle_) { + GPUPlaceGuard guard(gpu_place_); + PADDLE_ENFORCE(paddle::platform::dynload::cudnnCreate(&dnn_handle_) == + CUDNN_STATUS_SUCCESS, + "cudnnCreate failed"); + PADDLE_ENFORCE(paddle::platform::dynload::cudnnSetStream( + dnn_handle_, stream_) == CUDNN_STATUS_SUCCESS, + "cudnnSetStream failed"); + } + return dnn_handle_; + } + + curandGenerator_t curand_generator() { + if (!rand_generator_) { + GPUPlaceGuard guard(gpu_place_); + PADDLE_ENFORCE(paddle::platform::dynload::curandCreateGenerator( + &rand_generator_, CURAND_RNG_PSEUDO_DEFAULT) == + CURAND_STATUS_SUCCESS, + "curandCreateGenerator failed"); + PADDLE_ENFORCE( + paddle::platform::dynload::curandSetPseudoRandomGeneratorSeed( + rand_generator_, random_seed_) == CURAND_STATUS_SUCCESS, + "curandSetPseudoRandomGeneratorSeed failed"); + PADDLE_ENFORCE(paddle::platform::dynload::curandSetStream( + rand_generator_, stream_) == CURAND_STATUS_SUCCESS, + "curandSetStream failed"); + } + return rand_generator_; + } + + ~CUDADeviceContext() { + Wait(); + if (blas_handle_) { + PADDLE_ENFORCE(paddle::platform::dynload::cublasDestroy(blas_handle_) == + CUBLAS_STATUS_SUCCESS, + "cublasDestroy failed"); + } + + if (dnn_handle_) { + PADDLE_ENFORCE(paddle::platform::dynload::cudnnDestroy(dnn_handle_) == + CUDNN_STATUS_SUCCESS, + "cudnnDestroy failed"); + } + + if (rand_generator_) { + PADDLE_ENFORCE(paddle::platform::dynload::curandDestroyGenerator( + rand_generator_) == CURAND_STATUS_SUCCESS, + "curandDestroyGenerator failed"); + } + + delete eigen_stream_; + delete eigen_device_; + + paddle::platform::throw_on_error(cudaStreamDestroy(stream_), + "cudaStreamDestroy failed"); + } + + private: + GPUPlace gpu_place_; + cudaStream_t stream_; + + Eigen::CudaStreamDevice* eigen_stream_; + Eigen::GpuDevice* eigen_device_; + + 
cublasHandle_t blas_handle_{nullptr}; + + cudnnHandle_t dnn_handle_{nullptr}; + + int random_seed_; + curandGenerator_t rand_generator_{nullptr}; +}; +} // namespace platform +} // namespace paddle diff --git a/paddle/platform/device_context.cc b/paddle/platform/device_context.cc deleted file mode 100644 index a2dea2ed1e..0000000000 --- a/paddle/platform/device_context.cc +++ /dev/null @@ -1,13 +0,0 @@ -#include - -namespace paddle { -namespace platform { -namespace dynload { -namespace dummy { -// Make DeviceContext A library. -int DUMMY_VAR_FOR_DEV_CTX = 0; - -} // namespace dummy -} // namespace dynload -} // namespace platform -} // namespace paddle \ No newline at end of file diff --git a/paddle/platform/device_context.h b/paddle/platform/device_context.h index 160eb4e120..f30c147126 100644 --- a/paddle/platform/device_context.h +++ b/paddle/platform/device_context.h @@ -13,16 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once - #include "paddle/framework/enforce.h" -#ifndef PADDLE_ONLY_CPU -#include "paddle/platform/cuda.h" -#include "paddle/platform/dynload/cublas.h" -#include "paddle/platform/dynload/cudnn.h" -#include "paddle/platform/dynload/curand.h" -#define EIGEN_USE_GPU -#endif -#include "paddle/platform/place.h" #include "unsupported/Eigen/CXX11/Tensor" namespace paddle { @@ -33,128 +24,18 @@ class DeviceContext { virtual ~DeviceContext() {} }; -class CPUDeviceContext : public DeviceContext {}; - -#ifndef PADDLE_ONLY_CPU - -class GPUPlaceGuard { +class CPUDeviceContext : public DeviceContext { public: - explicit GPUPlaceGuard(GPUPlace new_place) : previous_(GetCurrentDeviceId()) { - if (previous_ != new_place) { - paddle::platform::SetDeviceId(new_place.device); + Eigen::DefaultDevice eigen_handle() { + if (!eigen_handle_) { + eigen_handle_ = new Eigen::DefaultDevice(); } + return *eigen_handle_; } - ~GPUPlaceGuard() { paddle::platform::SetDeviceId(previous_.device); } - private: - GPUPlace previous_; + Eigen::DefaultDevice* eigen_handle_{nullptr}; }; -class CUDADeviceContext : public DeviceContext { - public: - explicit CUDADeviceContext(const GPUPlace gpu_place) : gpu_place_(gpu_place) { - GPUPlaceGuard guard(gpu_place_); - paddle::platform::throw_on_error(cudaStreamCreate(&stream_), - "cudaStreamCreate failed"); - eigen_stream_ = new Eigen::CudaStreamDevice(&stream_); - eigen_device_ = new Eigen::GpuDevice(eigen_stream_); - } - - void Wait() { - paddle::platform::throw_on_error(cudaStreamSynchronize(stream_), - "cudaStreamSynchronize failed"); - } - - cudaStream_t stream() { return stream_; } - - Eigen::GpuDevice eigen_device() { return *eigen_device_; } - - cublasHandle_t cublas_handle() { - if (!blas_handle_) { - GPUPlaceGuard guard(gpu_place_); - PADDLE_ENFORCE(paddle::platform::dynload::cublasCreate(&blas_handle_) == - CUBLAS_STATUS_SUCCESS, - "cublasCreate failed"); - PADDLE_ENFORCE(paddle::platform::dynload::cublasSetStream( - blas_handle_, stream_) == CUBLAS_STATUS_SUCCESS, - "cublasSetStream failed"); - } - return blas_handle_; - } - - cudnnHandle_t cudnn_handle() { - if (!dnn_handle_) { - GPUPlaceGuard guard(gpu_place_); - PADDLE_ENFORCE(paddle::platform::dynload::cudnnCreate(&dnn_handle_) == - CUDNN_STATUS_SUCCESS, - "cudnnCreate failed"); - PADDLE_ENFORCE(paddle::platform::dynload::cudnnSetStream( - dnn_handle_, stream_) == CUDNN_STATUS_SUCCESS, - "cudnnSetStream failed"); - } - return dnn_handle_; - } - - curandGenerator_t curand_generator() { - if (!rand_generator_) { - GPUPlaceGuard 
guard(gpu_place_); - PADDLE_ENFORCE(paddle::platform::dynload::curandCreateGenerator( - &rand_generator_, CURAND_RNG_PSEUDO_DEFAULT) == - CURAND_STATUS_SUCCESS, - "curandCreateGenerator failed"); - PADDLE_ENFORCE( - paddle::platform::dynload::curandSetPseudoRandomGeneratorSeed( - rand_generator_, random_seed_) == CURAND_STATUS_SUCCESS, - "curandSetPseudoRandomGeneratorSeed failed"); - PADDLE_ENFORCE(paddle::platform::dynload::curandSetStream( - rand_generator_, stream_) == CURAND_STATUS_SUCCESS, - "curandSetStream failed"); - } - return rand_generator_; - } - - ~CUDADeviceContext() { - Wait(); - if (blas_handle_) { - PADDLE_ENFORCE(paddle::platform::dynload::cublasDestroy(blas_handle_) == - CUBLAS_STATUS_SUCCESS, - "cublasDestroy failed"); - } - - if (dnn_handle_) { - PADDLE_ENFORCE(paddle::platform::dynload::cudnnDestroy(dnn_handle_) == - CUDNN_STATUS_SUCCESS, - "cudnnDestroy failed"); - } - - if (rand_generator_) { - PADDLE_ENFORCE(paddle::platform::dynload::curandDestroyGenerator( - rand_generator_) == CURAND_STATUS_SUCCESS, - "curandDestroyGenerator failed"); - } - - delete eigen_stream_; - delete eigen_device_; - - paddle::platform::throw_on_error(cudaStreamDestroy(stream_), - "cudaStreamDestroy failed"); - } - - private: - GPUPlace gpu_place_; - cudaStream_t stream_; - - Eigen::CudaStreamDevice* eigen_stream_; - Eigen::GpuDevice* eigen_device_; - - cublasHandle_t blas_handle_{nullptr}; - - cudnnHandle_t dnn_handle_{nullptr}; - - int random_seed_; - curandGenerator_t rand_generator_{nullptr}; -}; -#endif } // namespace platform } // namespace paddle diff --git a/paddle/platform/device_context_test.cc b/paddle/platform/device_context_test.cc index 61be4a307d..cc81e9e789 100644 --- a/paddle/platform/device_context_test.cc +++ b/paddle/platform/device_context_test.cc @@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/

-#include "paddle/platform/device_context.h"
 #include "gtest/gtest.h"
+#include "paddle/platform/cuda_device_context.h"

 TEST(CUDADeviceContext, Init) {
   int count = paddle::platform::GetDeviceCount();
diff --git a/paddle/platform/dynload/CMakeLists.txt b/paddle/platform/dynload/CMakeLists.txt
index 4a8866b3d3..d205ead845 100644
--- a/paddle/platform/dynload/CMakeLists.txt
+++ b/paddle/platform/dynload/CMakeLists.txt
@@ -1,2 +1,2 @@
 cc_library(dynamic_loader SRCS dynamic_loader.cc DEPS glog gflags)
-nv_library(dyload_cuda SRCS cublas.cc cudnn.cc curand.cc)
+nv_library(dynload_cuda SRCS cublas.cc cudnn.cc curand.cc)

From 2dccab87d471791ec76e6de3973343932b82bbea Mon Sep 17 00:00:00 2001
From: fengjiayi
Date: Wed, 12 Jul 2017 10:55:35 +0800
Subject: [PATCH 081/205] fix some syntax problems

---
 paddle/framework/tensor.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/paddle/framework/tensor.h b/paddle/framework/tensor.h
index 1b98e17379..a0945e8055 100644
--- a/paddle/framework/tensor.h
+++ b/paddle/framework/tensor.h
@@ -29,7 +29,7 @@ class Tensor {
  public:
   Tensor() : offset_(0) {}

-  Tensor(const DDim& dims) : dims_(dims), offset_(0) {}
+  explicit Tensor(const DDim& dims) : dims_(dims), offset_(0) {}

   template
   const T* data() const {
@@ -63,7 +63,7 @@ class Tensor {
     offset_ = src.offset_;
   }

-  Tensor Slice(const int& begin_idx, const int& end_idx) {
+  Tensor Slice(const int& begin_idx, const int& end_idx) const {
     PADDLE_ENFORCE(holder_ != nullptr,
                    "The sliced tenosr has not been initialized.");
     PADDLE_ENFORCE(begin_idx >= 0 && end_idx <= dims_[0],

From 2dbe60e489221e9883bf08e48efb10cffaabe62b Mon Sep 17 00:00:00 2001
From: fengjiayi
Date: Wed, 12 Jul 2017 11:08:26 +0800
Subject: [PATCH 082/205] Remove Dim::contiguous and Dim::contiguous_strides

Paddle's data blocks are laid out in row-major order, while
Dim::contiguous and Dim::contiguous_strides assume column-major order,
so remove them to prevent misuse.

---
 paddle/framework/dim.h | 48 ------------------------------
 paddle/framework/dim_test.cu | 28 ---------------------
 2 files changed, 76 deletions(-)

diff --git a/paddle/framework/dim.h b/paddle/framework/dim.h
index bcde291d12..883fdc55eb 100644
--- a/paddle/framework/dim.h
+++ b/paddle/framework/dim.h
@@ -266,29 +266,6 @@ HOSTDEVICE inline bool contained(const Dim<1>& idx, const Dim<1>& size) {
   return ((0 <= idx.head) && (idx.head < size.head));
 }

-/**
- * \brief Check if a size and a stride create a Fortran order contiguous
- * block of memory.
- */
-template
-HOST bool contiguous(const Dim& size, const Dim& stride, int mul = 1) {
-  if (product(size) == 0) return true;
-  int contiguous_stride = get<0>(size) == 1 ? 0 : mul;
-  return (get<0>(stride) == contiguous_stride &&
-          contiguous(size.tail, stride.tail, mul * get<0>(size)));
-}
-
-///\cond HIDDEN
-// Base case of contiguous, check the nth stride is the size of
-// the prefix multiply of n-1 dims.
-template <>
-inline bool contiguous(const Dim<1>& size, const Dim<1>& stride, int mul) {
-  if (get<0>(size) == 0) return true;
-  int contiguous_stride = get<0>(size) == 1 ? 0 : mul;
-  return get<0>(stride) == contiguous_stride;
-}
-///\endcond
-
 /**
  * \brief Compute exclusive prefix-multiply of a Dim.
 */
@@ -306,31 +283,6 @@ HOSTDEVICE inline Dim<1> ex_prefix_mul(const Dim<1>& src, int mul) {
 }
 ///\endcond

-/**
- * \brief Calculate strides of a contiguous array of the given size
- *
- * Sets the stride for any dimension with an extent of 1 to 0.
- * \param size Dim object containing the size of the array.
- * \param base The base stride to use. - * \return Dim object the same size as \p size with the strides. - */ -template -HOSTDEVICE Dim contiguous_strides(const Dim& size, int base = 1) { - int stride = size.head == 1 ? 0 : base; - return Dim(stride, contiguous_strides(size.tail, base * size.head)); -} - -///\cond HIDDEN - -// Base case of contiguous_strides -template <> -HOSTDEVICE inline Dim<1> contiguous_strides(const Dim<1>& size, int base) { - int stride = size.head == 1 ? 0 : base; - return Dim<1>(stride); -} - -///\endcond - /** * Add two dimensions together */ diff --git a/paddle/framework/dim_test.cu b/paddle/framework/dim_test.cu index 809bf04826..0521741519 100644 --- a/paddle/framework/dim_test.cu +++ b/paddle/framework/dim_test.cu @@ -58,24 +58,6 @@ TEST(Dim, Equality) { EXPECT_EQ(paddle::framework::get<1>(c), 3); EXPECT_EQ(paddle::framework::get<2>(c), 12); - // contiguous_strides - c = paddle::framework::contiguous_strides(paddle::framework::Dim<3>(10, 1, 10)); - EXPECT_EQ(paddle::framework::get<0>(c), 1); - EXPECT_EQ(paddle::framework::get<1>(c), 0); - EXPECT_EQ(paddle::framework::get<2>(c), 10); - c = paddle::framework::contiguous_strides(paddle::framework::Dim<3>(10, 10, 1)); - EXPECT_EQ(paddle::framework::get<0>(c), 1); - EXPECT_EQ(paddle::framework::get<1>(c), 10); - EXPECT_EQ(paddle::framework::get<2>(c), 0); - c = paddle::framework::contiguous_strides(paddle::framework::Dim<3>(1, 10, 10)); - EXPECT_EQ(paddle::framework::get<0>(c), 0); - EXPECT_EQ(paddle::framework::get<1>(c), 1); - EXPECT_EQ(paddle::framework::get<2>(c), 10); - c = paddle::framework::contiguous_strides(paddle::framework::Dim<3>(2, 3, 4)); - EXPECT_EQ(paddle::framework::get<0>(c), 1); - EXPECT_EQ(paddle::framework::get<1>(c), 2); - EXPECT_EQ(paddle::framework::get<2>(c), 6); - // generate from an index auto size = paddle::framework::make_dim(4, 5, 2); c = paddle::framework::Dim<3>(14, size); @@ -101,16 +83,6 @@ TEST(Dim, Bool) { EXPECT_TRUE(a == a); EXPECT_FALSE(a == b); EXPECT_TRUE(a == c); - - // contiguous check - int x = 4, y = 5, z = 2; - paddle::framework::Dim<3> sizef(x, y, z); - paddle::framework::Dim<3> stridea(1, x, x*y); - paddle::framework::Dim<3> strideb(2, 2*x, 2*x*y); - paddle::framework::Dim<3> stridec(1, x, 2*x*y); - EXPECT_TRUE(paddle::framework::contiguous(sizef, stridea)); - EXPECT_FALSE(paddle::framework::contiguous(sizef, strideb)); - EXPECT_FALSE(paddle::framework::contiguous(sizef, stridec)); } TEST(Dim, Print) { From 8f5a9fd9a7297007dc259114c23d986c8ee4e06a Mon Sep 17 00:00:00 2001 From: qijun Date: Wed, 12 Jul 2017 11:20:14 +0800 Subject: [PATCH 083/205] fix gpu build error --- paddle/platform/CMakeLists.txt | 2 +- paddle/platform/cuda_device_context.h | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/paddle/platform/CMakeLists.txt b/paddle/platform/CMakeLists.txt index 4e34e8d02c..5e2f203555 100644 --- a/paddle/platform/CMakeLists.txt +++ b/paddle/platform/CMakeLists.txt @@ -5,4 +5,4 @@ nv_test(cuda_test SRCS cuda_test.cu DEPS dynload_cuda) cc_library(place SRCS place.cc) cc_test(place_test SRCS place_test.cc DEPS place glog gflags) -nv_test(device_context_test SRCS device_context_test.cc DEPS place eigen3 dynload_cuda) +nv_test(device_context_test SRCS device_context_test.cc DEPS dynload_cuda dynamic_loader eigen3 place) diff --git a/paddle/platform/cuda_device_context.h b/paddle/platform/cuda_device_context.h index e0d79631c5..0ba1f802a6 100644 --- a/paddle/platform/cuda_device_context.h +++ b/paddle/platform/cuda_device_context.h @@ -21,6 +21,7 @@ 
limitations under the License. */ #include "paddle/platform/dynload/curand.h" #define EIGEN_USE_GPU #include "paddle/platform/place.h" +#include "paddle/platform/device_context.h" #include "unsupported/Eigen/CXX11/Tensor" namespace paddle { From b5a8d5b4b46c36dadc1a17b66c72984930e76305 Mon Sep 17 00:00:00 2001 From: qijun Date: Wed, 12 Jul 2017 11:25:32 +0800 Subject: [PATCH 084/205] remove unused deps --- paddle/platform/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/platform/CMakeLists.txt b/paddle/platform/CMakeLists.txt index 5e2f203555..e93592cc4c 100644 --- a/paddle/platform/CMakeLists.txt +++ b/paddle/platform/CMakeLists.txt @@ -1,6 +1,6 @@ add_subdirectory(dynload) -nv_test(cuda_test SRCS cuda_test.cu DEPS dynload_cuda) +nv_test(cuda_test SRCS cuda_test.cu) cc_library(place SRCS place.cc) cc_test(place_test SRCS place_test.cc DEPS place glog gflags) From 6bbc2944aec25e11921bb98c93440a4e29bc3967 Mon Sep 17 00:00:00 2001 From: qijun Date: Wed, 12 Jul 2017 12:10:27 +0800 Subject: [PATCH 085/205] fix code style --- paddle/platform/cuda_device_context.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/platform/cuda_device_context.h b/paddle/platform/cuda_device_context.h index 0ba1f802a6..69415fe615 100644 --- a/paddle/platform/cuda_device_context.h +++ b/paddle/platform/cuda_device_context.h @@ -20,8 +20,8 @@ limitations under the License. */ #include "paddle/platform/dynload/cudnn.h" #include "paddle/platform/dynload/curand.h" #define EIGEN_USE_GPU -#include "paddle/platform/place.h" #include "paddle/platform/device_context.h" +#include "paddle/platform/place.h" #include "unsupported/Eigen/CXX11/Tensor" namespace paddle { From 51690f104597da7b519d7209de4f2a76175a689c Mon Sep 17 00:00:00 2001 From: hedaoyuan Date: Wed, 12 Jul 2017 13:33:55 +0800 Subject: [PATCH 086/205] Compile for armv8. 
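Because this commit selects per-ABI compiler flags purely in CMake, a misconfigured toolchain can silently target the wrong architecture. A small, self-contained C++ probe, assuming nothing beyond the compiler's standard predefined macros, confirms which ARM target the configured cross-compiler actually emits code for:

```cpp
// arch_probe.cc: build with the cross toolchain, then run under an
// emulator or on the device. Uses only compiler-predefined macros.
#include <cstdio>

int main() {
#if defined(__aarch64__)
  std::puts("targeting aarch64 (arm64-v8a / ARMV8)");
#elif defined(__ARM_ARCH_7A__)
  std::puts("targeting armv7-a (armeabi-v7a / ARMV7)");
#elif defined(__arm__)
  std::puts("targeting 32-bit ARM");
#else
  std::puts("not an ARM target");
#endif
  return 0;
}
```

Compiling this file with the configured `${ANDROID_TOOLCHAIN_PREFIX}` compiler (or simply dumping its macros with `-dM -E`) shows whether the arm64-v8a branch below was really taken.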
--- cmake/cross_compiling/android.cmake | 7 +++++++ cmake/external/openblas.cmake | 7 ++++++- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/cmake/cross_compiling/android.cmake b/cmake/cross_compiling/android.cmake index 9724c16122..dcfbc5d012 100644 --- a/cmake/cross_compiling/android.cmake +++ b/cmake/cross_compiling/android.cmake @@ -106,6 +106,9 @@ IF("${CMAKE_VERSION}" VERSION_LESS "3.7.0") SET(CMAKE_SYSTEM_PROCESSOR armv7-a) ENDIF() ENDIF() + IF(ANDROID_ABI STREQUAL "arm64-v8a") + SET(ANDROID_TOOLCHAIN_NAME aarch64-linux-android) + ENDIF() SET(ANDROID_TOOLCHAIN_PREFIX "${ANDROID_TOOLCHAIN_ROOT}/bin/${ANDROID_TOOLCHAIN_NAME}-") ENDIF() @@ -162,6 +165,10 @@ IF("${CMAKE_VERSION}" VERSION_LESS "3.7.0") ENDIF() ENDIF() + IF(ANDROID_ABI STREQUAL "arm64-v8a") + LIST(APPEND ANDROID_COMPILER_FLAGS -march=armv8-a) + ENDIF() + STRING(REPLACE ";" " " ANDROID_COMPILER_FLAGS "${ANDROID_COMPILER_FLAGS}") STRING(REPLACE ";" " " ANDROID_LINKER_FLAGS "${ANDROID_LINKER_FLAGS}") diff --git a/cmake/external/openblas.cmake b/cmake/external/openblas.cmake index 5b9d9844ed..60a1041936 100644 --- a/cmake/external/openblas.cmake +++ b/cmake/external/openblas.cmake @@ -32,7 +32,12 @@ IF(NOT ${CBLAS_FOUND}) # arm_soft_fp_abi branch of OpenBLAS to support softfp # https://github.com/xianyi/OpenBLAS/tree/arm_soft_fp_abi SET(OPENBLAS_COMMIT "b5c96fcfcdc82945502a2303116a64d89985daf5") - SET(OPTIONAL_ARGS HOSTCC=${HOST_C_COMPILER} TARGET=ARMV7 ARM_SOFTFP_ABI=1 USE_THREAD=0) + IF(ANDROID_ABI MATCHES "^armeabi(-v7a)?$") + SET(TARGET "ARMV7") + ELSEIF(ANDROID_ABI STREQUAL "arm64-v8a") + SET(TARGET "ARMV8") + ENDIF() + SET(OPTIONAL_ARGS HOSTCC=${HOST_C_COMPILER} TARGET=${TARGET} ARM_SOFTFP_ABI=1 USE_THREAD=0) ELSEIF(RPI) # use hardfp SET(OPENBLAS_COMMIT "v0.2.19") From ef5f9debc61ce4f6b3142fedbf85a118a34731eb Mon Sep 17 00:00:00 2001 From: qijun Date: Wed, 12 Jul 2017 13:51:04 +0800 Subject: [PATCH 087/205] refine device_context --- paddle/platform/CMakeLists.txt | 1 + .../{cuda_device_context.h => cuda_device.h} | 13 +++--- paddle/platform/cuda_device_test.cc | 33 +++++++++++++++ paddle/platform/device.h | 41 +++++++++++++++++++ paddle/platform/device_context.h | 23 ++++------- paddle/platform/device_context_test.cc | 23 +++++------ 6 files changed, 102 insertions(+), 32 deletions(-) rename paddle/platform/{cuda_device_context.h => cuda_device.h} (94%) create mode 100644 paddle/platform/cuda_device_test.cc create mode 100644 paddle/platform/device.h diff --git a/paddle/platform/CMakeLists.txt b/paddle/platform/CMakeLists.txt index e93592cc4c..d40e49b546 100644 --- a/paddle/platform/CMakeLists.txt +++ b/paddle/platform/CMakeLists.txt @@ -5,4 +5,5 @@ nv_test(cuda_test SRCS cuda_test.cu) cc_library(place SRCS place.cc) cc_test(place_test SRCS place_test.cc DEPS place glog gflags) +nv_test(cuda_device_test SRCS cuda_device_test.cc DEPS dynload_cuda dynamic_loader eigen3 place) nv_test(device_context_test SRCS device_context_test.cc DEPS dynload_cuda dynamic_loader eigen3 place) diff --git a/paddle/platform/cuda_device_context.h b/paddle/platform/cuda_device.h similarity index 94% rename from paddle/platform/cuda_device_context.h rename to paddle/platform/cuda_device.h index 69415fe615..cbb69d1cc5 100644 --- a/paddle/platform/cuda_device_context.h +++ b/paddle/platform/cuda_device.h @@ -20,10 +20,12 @@ limitations under the License. 
*/ #include "paddle/platform/dynload/cudnn.h" #include "paddle/platform/dynload/curand.h" #define EIGEN_USE_GPU -#include "paddle/platform/device_context.h" +#include "paddle/platform/device.h" #include "paddle/platform/place.h" #include "unsupported/Eigen/CXX11/Tensor" +using DEVICE_GPU = Eigen::GpuDevice; + namespace paddle { namespace platform { @@ -41,9 +43,10 @@ class GPUPlaceGuard { GPUPlace previous_; }; -class CUDADeviceContext : public DeviceContext { +template <> +class Device { public: - explicit CUDADeviceContext(const GPUPlace gpu_place) : gpu_place_(gpu_place) { + explicit Device(const GPUPlace gpu_place) : gpu_place_(gpu_place) { GPUPlaceGuard guard(gpu_place_); paddle::platform::throw_on_error(cudaStreamCreate(&stream_), "cudaStreamCreate failed"); @@ -58,7 +61,7 @@ class CUDADeviceContext : public DeviceContext { cudaStream_t stream() { return stream_; } - Eigen::GpuDevice eigen_device() { return *eigen_device_; } + DEVICE_GPU eigen_device() { return *eigen_device_; } cublasHandle_t cublas_handle() { if (!blas_handle_) { @@ -136,7 +139,7 @@ class CUDADeviceContext : public DeviceContext { cudaStream_t stream_; Eigen::CudaStreamDevice* eigen_stream_; - Eigen::GpuDevice* eigen_device_; + DEVICE_GPU* eigen_device_; cublasHandle_t blas_handle_{nullptr}; diff --git a/paddle/platform/cuda_device_test.cc b/paddle/platform/cuda_device_test.cc new file mode 100644 index 0000000000..ea647be876 --- /dev/null +++ b/paddle/platform/cuda_device_test.cc @@ -0,0 +1,33 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/platform/cuda_device.h" +#include "gtest/gtest.h" + +TEST(Device, Init) { + int count = paddle::platform::GetDeviceCount(); + for (int i = 0; i < count; i++) { + paddle::platform::Device* device = + new paddle::platform::Device(i); + Eigen::GpuDevice gpu_device = device->eigen_device(); + ASSERT_NE(nullptr, gpu_device.stream()); + cudnnHandle_t cudnn_handle = device->cudnn_handle(); + ASSERT_NE(nullptr, cudnn_handle); + cublasHandle_t cublas_handle = device->cublas_handle(); + ASSERT_NE(nullptr, cublas_handle); + curandGenerator_t curand_handle = device->curand_generator(); + ASSERT_NE(nullptr, curand_handle); + delete device; + } +} diff --git a/paddle/platform/device.h b/paddle/platform/device.h new file mode 100644 index 0000000000..9ae41cbcb0 --- /dev/null +++ b/paddle/platform/device.h @@ -0,0 +1,41 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/
+
+#pragma once
+#include "unsupported/Eigen/CXX11/Tensor"
+
+using DEVICE_CPU = Eigen::DefaultDevice;
+
+namespace paddle {
+namespace platform {
+
+template
+class Device;
+
+template <>
+class Device {
+ public:
+  DEVICE_CPU eigen_handle() {
+    if (!eigen_handle_) {
+      eigen_handle_ = new Eigen::DefaultDevice();
+    }
+    return *eigen_handle_;
+  }
+
+ private:
+  DEVICE_CPU* eigen_handle_{nullptr};
+};
+
+}  // namespace platform
+}  // namespace paddle
diff --git a/paddle/platform/device_context.h b/paddle/platform/device_context.h
index f30c147126..8b0bac6280 100644
--- a/paddle/platform/device_context.h
+++ b/paddle/platform/device_context.h
@@ -14,27 +14,22 @@ limitations under the License. */

 #pragma once
 #include "paddle/framework/enforce.h"
+#include "paddle/platform/device.h"
 #include "unsupported/Eigen/CXX11/Tensor"
+#ifndef PADDLE_ONLY_CPU
+#include "paddle/platform/cuda_device.h"
+#endif

 namespace paddle {
 namespace platform {

-class DeviceContext {
- public:
-  virtual ~DeviceContext() {}
-};
+struct DeviceContext {
+  void* device_context{nullptr};

-class CPUDeviceContext : public DeviceContext {
- public:
-  Eigen::DefaultDevice eigen_handle() {
-    if (!eigen_handle_) {
-      eigen_handle_ = new Eigen::DefaultDevice();
+  template
+  inline paddle::platform::Device* device_context() {
+    return static_cast*>(device_context);
   }
-
- private:
-  Eigen::DefaultDevice* eigen_handle_{nullptr};
 };

 }  // namespace platform
diff --git a/paddle/platform/device_context_test.cc b/paddle/platform/device_context_test.cc
index cc81e9e789..ab8a6d8195 100644
--- a/paddle/platform/device_context_test.cc
+++ b/paddle/platform/device_context_test.cc
@@ -12,22 +12,19 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

+#include "paddle/platform/device_context.h"
 #include "gtest/gtest.h"

-TEST(CUDADeviceContext, Init) {
+TEST(DeviceContext, Init) {
   int count = paddle::platform::GetDeviceCount();
   for (int i = 0; i < count; i++) {
-    paddle::platform::CUDADeviceContext* device_context =
-        new paddle::platform::CUDADeviceContext(i);
-    Eigen::GpuDevice gpu_device = device_context->eigen_device();
+    paddle::platform::Device* device =
+        new paddle::platform::Device(i);
+    paddle::platform::DeviceContext context;
+    context.device_context = device;
+    Eigen::GpuDevice gpu_device =
+        context.device_context->eigen_device();
     ASSERT_NE(nullptr, gpu_device.stream());
-    cudnnHandle_t cudnn_handle = device_context->cudnn_handle();
-    ASSERT_NE(nullptr, cudnn_handle);
-    cublasHandle_t cublas_handle = device_context->cublas_handle();
-    ASSERT_NE(nullptr, cublas_handle);
-    curandGenerator_t curand_handle = device_context->curand_generator();
-    ASSERT_NE(nullptr, curand_handle);
-    delete device_context;
+    delete device;
   }
-}
+} \ No newline at end of file
From 0ff819207230ac345efefc0a37a3883e81d43c02 Mon Sep 17 00:00:00 2001
From: Yu Yang
Date: Wed, 12 Jul 2017 14:02:57 +0800
Subject: [PATCH 088/205] Add OperatorWithKernel class

* Users can register OpKernels for their Ops. The OpKernelMap is saved
  in OperatorWithKernel. Each Op that inherits OperatorWithKernel will
  use `OpKernel::Compute` instead of Run.
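A stripped-down sketch of the dispatch this message describes; all names below are simplified stand-ins for the Paddle types in the diff, not the real implementation:

```cpp
#include <iostream>
#include <map>
#include <memory>
#include <string>

// "place" stands in for the device key the real OpKernelMap is indexed by.
struct KernelContext {
  std::string place;  // e.g. "CPU" or "GPU"
};

struct OpKernel {
  virtual void Compute(const KernelContext& ctx) const = 0;
  virtual ~OpKernel() = default;
};

// Stand-in for OperatorWithKernel: Run() only dispatches; concrete ops
// contribute OpKernel::Compute implementations instead of overriding Run.
struct OperatorWithKernel {
  std::map<std::string, std::unique_ptr<OpKernel>> kernels;  // OpKernelMap

  void Run(const KernelContext& ctx) const {
    kernels.at(ctx.place)->Compute(ctx);
  }
};

struct MulCPUKernel : OpKernel {
  void Compute(const KernelContext&) const override {
    std::cout << "mul computed by the CPU kernel\n";
  }
};

int main() {
  OperatorWithKernel mul;
  mul.kernels["CPU"] = std::make_unique<MulCPUKernel>();
  mul.Run(KernelContext{"CPU"});  // one op, kernel chosen by context
  return 0;
}
```

The design choice mirrored here is that Run lives once in the base class and only dispatches, so the same op definition can gain new device kernels without changing its Run logic.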
--- paddle/CMakeLists.txt | 1 - paddle/framework/op_registry_test.cc | 33 ++++---- paddle/framework/operator.cc | 8 -- paddle/framework/operator.h | 117 ++++++++++++++++++--------- paddle/framework/operator_test.cc | 39 ++++----- paddle/operators/.clang-format | 5 -- paddle/operators/CMakeLists.txt | 0 paddle/operators/demo_op.h | 59 -------------- paddle/platform/device_context.h | 18 ++++- 9 files changed, 127 insertions(+), 153 deletions(-) delete mode 100644 paddle/operators/.clang-format delete mode 100644 paddle/operators/CMakeLists.txt delete mode 100644 paddle/operators/demo_op.h diff --git a/paddle/CMakeLists.txt b/paddle/CMakeLists.txt index 2c1eb7521d..58a35564f8 100644 --- a/paddle/CMakeLists.txt +++ b/paddle/CMakeLists.txt @@ -15,7 +15,6 @@ if(Boost_FOUND) add_subdirectory(memory) add_subdirectory(platform) add_subdirectory(framework) - add_subdirectory(operators) add_subdirectory(pybind) endif() diff --git a/paddle/framework/op_registry_test.cc b/paddle/framework/op_registry_test.cc index c4baafc2ae..f5d45a80bb 100644 --- a/paddle/framework/op_registry_test.cc +++ b/paddle/framework/op_registry_test.cc @@ -1,17 +1,15 @@ #include "paddle/framework/op_registry.h" #include -#include "paddle/framework/operator.h" -#include "paddle/operators/demo_op.h" using namespace paddle::framework; namespace paddle { namespace framework { -class CosineOp : public OperatorWithKernel { +class CosineOp : public OperatorBase { public: - void Run(const OpRunContext* context) const override { - printf("%s\n", DebugString().c_str()); - } + void Run(const std::shared_ptr& scope, + const platform::DeviceContext& dev_ctx) const override {} + void InferShape(const std::shared_ptr& scope) const override {} }; class CosineOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker { @@ -30,12 +28,13 @@ class CosineOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker { REGISTER_OP(CosineOp, CosineOpProtoAndCheckerMaker, cos_sim) -class MyTestOp : public OperatorWithKernel { +class MyTestOp : public OperatorBase { + public: + void InferShape(const std::shared_ptr& scope) const override {} + void Run(const std::shared_ptr& scope, + const platform::DeviceContext& dev_ctx) const override {} + public: - void Run(const OpRunContext* ctx) const override { - printf("%s\n", DebugString().c_str()); - printf("test_attr = %d\n", ctx->op_->GetAttr("test_attr")); - } }; class MyTestOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker { @@ -73,8 +72,8 @@ TEST(OpRegistry, CreateOp) { paddle::framework::OperatorBase* op = paddle::framework::OpRegistry::CreateOp(op_desc); auto scope = std::make_shared(); - auto dev_ctx = DeviceContext(); - op->Run(scope, &dev_ctx); + paddle::platform::CPUDeviceContext dev_ctx; + op->Run(scope, dev_ctx); float scale_get = op->GetAttr("scale"); ASSERT_EQ(scale_get, scale); } @@ -116,8 +115,8 @@ TEST(OpRegistry, DefaultValue) { paddle::framework::OperatorBase* op = paddle::framework::OpRegistry::CreateOp(op_desc); auto scope = std::make_shared(); - auto dev_ctx = DeviceContext(); - op->Run(scope, &dev_ctx); + paddle::platform::CPUDeviceContext dev_ctx; + op->Run(scope, dev_ctx); ASSERT_EQ(op->GetAttr("scale"), 1.0); } @@ -169,9 +168,9 @@ TEST(OpRegistry, CustomChecker) { attr->set_i(4); paddle::framework::OperatorBase* op = paddle::framework::OpRegistry::CreateOp(op_desc); - auto dev_ctx = DeviceContext(); + paddle::platform::CPUDeviceContext dev_ctx; auto scope = std::make_shared(); - op->Run(scope, &dev_ctx); + op->Run(scope, dev_ctx); int test_attr = op->GetAttr("test_attr"); 
ASSERT_EQ(test_attr, 4); } diff --git a/paddle/framework/operator.cc b/paddle/framework/operator.cc index 3db3706e47..8f7adff8b3 100644 --- a/paddle/framework/operator.cc +++ b/paddle/framework/operator.cc @@ -39,13 +39,5 @@ std::string OperatorBase::DebugString() const { return ss.str(); } -const Variable* OpRunContext::Input(int index) const { - return scope_->GetVariable(op_->inputs_[index]); -} - -Variable* OpRunContext::Output(int index) const { - return scope_->GetVariable(op_->outputs_[index]); -} - } // namespace framework } // namespace paddle \ No newline at end of file diff --git a/paddle/framework/operator.h b/paddle/framework/operator.h index 6570d58698..0ce422e007 100644 --- a/paddle/framework/operator.h +++ b/paddle/framework/operator.h @@ -14,44 +14,22 @@ limitations under the License. */ #pragma once +#include +#include +#include +#include +#include +#include #include #include #include #include -#include "paddle/framework/attr_checker.h" -#include "paddle/framework/op_desc.pb.h" -#include "paddle/framework/scope.h" -#include "paddle/utils/Error.h" - namespace paddle { namespace framework { class OperatorBase; -class DeviceContext {}; - -/** - * OpRunContext is the only parameter of Operator's Run function. - * Run will get input/output variables, state such as momentum and - * device resource such as CUDA stream, cublas handle, etc. from - * OpRunContext. User should construct it before run the Operator. - */ -class OpRunContext { - public: - OpRunContext(const OperatorBase* op, const std::shared_ptr scope, - const DeviceContext* device_context) - : op_(op), scope_(scope), device_context_(device_context) {} - - const Variable* Input(int index) const; - Variable* Output(int index) const; - - public: - const OperatorBase* op_; - const std::shared_ptr scope_; - const DeviceContext* device_context_; -}; - /** * OperatorBase has the basic element that Net will call to do computation. * Only CreateOperator from OpRegistry will new Operator directly. User @@ -77,7 +55,10 @@ class OperatorBase { /// Net will call this function to Run an op. virtual void Run(const std::shared_ptr& scope, - const DeviceContext* dev_ctx) const = 0; + const platform::DeviceContext& dev_ctx) const = 0; + + protected: + std::string Type() const { return desc_.type(); } public: OpDesc desc_; @@ -86,22 +67,84 @@ class OperatorBase { AttributeMap attrs_; }; +class OpKernel { + public: + /** + * KernelContext is the only parameter of Kernel Run function. + * Run will get input/output variables, state such as momentum and + * device resource such as CUDA stream, cublas handle, etc. from + * KernelContext. User should construct it before run the Operator. 
+   */
+  class KernelContext {
+   public:
+    KernelContext(const OperatorBase* op, const std::shared_ptr<Scope>& scope,
+                  const platform::DeviceContext& device_context)
+        : op_(*op), scope_(scope), device_context_(device_context) {}
+
+    const Variable* Input(int index) const {
+      return scope_->GetVariable(op_.inputs_[index]);
+    }
+
+    Variable* Output(int index) const {
+      return scope_->GetVariable(op_.outputs_[index]);
+    }
+
+    const OperatorBase& op_;
+    const std::shared_ptr<Scope>& scope_;
+    const platform::DeviceContext& device_context_;
+  };
+
+  virtual void Compute(const KernelContext& context) const = 0;
+
+  virtual ~OpKernel() {}
+};
+
 class OperatorWithKernel : public OperatorBase {
  public:
-  virtual ~OperatorWithKernel() {}
+  struct OpKernelKey {
+    platform::Place place_;
 
-  virtual void InferShape(const std::shared_ptr<Scope>& scope) const {}
+    OpKernelKey() = default;
+    OpKernelKey(const platform::DeviceContext& dev_ctx) {
+      place_ = dev_ctx.GetPlace();
+    }
+
+    bool operator==(const OpKernelKey& o) const { return place_ == o.place_; }
+  };
+
+  struct OpKernelHash {
+    std::hash<bool> hash_;
+    size_t operator()(const OpKernelKey& key) const {
+      return hash_(platform::is_gpu_place(key.place_));
+    }
+  };
+
+  using OpKernelMap =
+      std::unordered_map<OpKernelKey, std::unique_ptr<OpKernel>, OpKernelHash>;
 
   void Run(const std::shared_ptr<Scope>& scope,
-           const DeviceContext* dev_ctx) const {
-    OpRunContext op_ctx(this, scope, dev_ctx);
-    Run(&op_ctx);
+           const platform::DeviceContext& dev_ctx) const final {
+    auto& opKernel = AllOpKernels().at(Type()).at(OpKernelKey(dev_ctx));
+    opKernel->Compute(OpKernel::KernelContext(this, scope, dev_ctx));
   }
 
-  /// when implement an Op, your should implement this function.
-  /// this function should be moved to OpKernel later
-  virtual void Run(const OpRunContext* context) const = 0;
+  static std::unordered_map<std::string, OpKernelMap>&
+  AllOpKernels() {
+    static std::unordered_map<std::string, OpKernelMap> g_all_op_kernels;
+    return g_all_op_kernels;
+  };
 };
 
 }  // namespace framework
 }  // namespace paddle
+
+#define REGISTER_OP_KERNEL(type, PlaceType, KernelType)                    \
+  struct __op_kernel_register__##type##__ {                                \
+    __op_kernel_register__##type##__() {                                   \
+      ::paddle::framework::OperatorWithKernel::OpKernelKey key;            \
+      key.place_ = PlaceType();                                            \
+      ::paddle::framework::OperatorWithKernel::AllOpKernels()[#type][key]  \
+          .reset(new KernelType());                                        \
+    }                                                                      \
+  };                                                                       \
+  static __op_kernel_register__##type##__ __reg_kernel_##type##__
diff --git a/paddle/framework/operator_test.cc b/paddle/framework/operator_test.cc
index 48808dabb2..86f45f108a 100644
--- a/paddle/framework/operator_test.cc
+++ b/paddle/framework/operator_test.cc
@@ -19,17 +19,15 @@ limitations under the License.
*/ namespace paddle { namespace framework { -class OperatorTest : public OperatorWithKernel { +class OperatorTest : public OperatorBase { public: - void Run(const OpRunContext* ctx) const override { - float scale = ctx->op_->GetAttr("scale"); - PADDLE_ENFORCE(ctx->Input(0) == nullptr, "Input(0) should not initialized"); - PADDLE_ENFORCE(ctx->Output(0) == nullptr, - "Output(1) should not initialized"); - auto output1 = ctx->scope_->CreateVariable("output1"); - PADDLE_ENFORCE(output1 != nullptr, "should create output1 from scope"); - printf("get attr %s = %f\n", "scale", scale); - printf("%s\n", DebugString().c_str()); + void InferShape(const std::shared_ptr& scope) const override {} + void Run(const std::shared_ptr& scope, + const platform::DeviceContext& dev_ctx) const override { + float scale = GetAttr("scale"); + ASSERT_NEAR(scale, 3.14, 1e-5); + ASSERT_EQ(scope->GetVariable(inputs_[0]), nullptr); + ASSERT_NE(scope->GetVariable(outputs_[0]), nullptr); } }; @@ -49,31 +47,26 @@ class OperatorTestProtoAndCheckerMaker : public OpProtoAndCheckerMaker { REGISTER_OP(OperatorTest, OperatorTestProtoAndCheckerMaker, test_operator) -TEST(OperatorBase, DebugString) { +TEST(OperatorBase, all) { OpDesc op_desc; op_desc.set_type("test_operator"); - std::vector inputs = {"IN1", "IN2"}; - for (auto& input : inputs) { - op_desc.add_inputs(input); - } - std::vector outputs = {"OUT1", "OUT2"}; - for (auto& output : outputs) { - op_desc.add_outputs(output); - } + *op_desc.mutable_inputs()->Add() = "IN1"; + *op_desc.mutable_outputs()->Add() = "OUT1"; auto attr = op_desc.mutable_attrs()->Add(); attr->set_name("scale"); attr->set_type(paddle::framework::AttrType::FLOAT); float scale = 3.14; attr->set_f(scale); - DeviceContext device_context; + platform::CPUDeviceContext device_context; auto scope = std::make_shared(); OperatorBase* op = paddle::framework::OpRegistry::CreateOp(op_desc); - ASSERT_EQ(op->inputs_, inputs); - ASSERT_EQ(op->outputs_, outputs); ASSERT_EQ(op->GetAttr("scale"), scale); - op->Run(scope, &device_context); + scope->CreateVariable("OUT1"); + op->Run(scope, device_context); + std::cout << op->DebugString() << std::endl; + delete op; } } // namespace framework diff --git a/paddle/operators/.clang-format b/paddle/operators/.clang-format deleted file mode 100644 index 29282dc87e..0000000000 --- a/paddle/operators/.clang-format +++ /dev/null @@ -1,5 +0,0 @@ ---- -Language: Cpp -BasedOnStyle: Google -Standard: Cpp11 -... 
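The rewritten tests above drive OperatorBase directly; for kernel-backed ops,
dispatch goes through the kernel map added in operator.h. A rough sketch of
that lookup, assuming a hypothetical op type "my_op" with a registered kernel
and existing `op` and `scope` variables:

    // Sketch of what OperatorWithKernel::Run does in this patch: select the
    // kernel registered for (op type, place) and call Compute on it.
    paddle::platform::CPUDeviceContext dev_ctx;
    auto& kernel =
        paddle::framework::OperatorWithKernel::AllOpKernels()
            .at("my_op")
            .at(paddle::framework::OperatorWithKernel::OpKernelKey(dev_ctx));
    kernel->Compute(
        paddle::framework::OpKernel::KernelContext(op, scope, dev_ctx));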
diff --git a/paddle/operators/CMakeLists.txt b/paddle/operators/CMakeLists.txt deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/paddle/operators/demo_op.h b/paddle/operators/demo_op.h deleted file mode 100644 index d0b7420b4e..0000000000 --- a/paddle/operators/demo_op.h +++ /dev/null @@ -1,59 +0,0 @@ -#pragma once - -#include "paddle/framework/op_registry.h" - -using namespace paddle::framework; - -namespace paddle { -namespace operators { - -class CosineOp : public OperatorWithKernel { - public: - void Run(const OpRunContext *context) const override { - printf("%s\n", DebugString().c_str()); - } -}; - -class CosineOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker { - public: - CosineOpProtoAndCheckerMaker(OpProto *proto, OpAttrChecker *op_checker) - : OpProtoAndCheckerMaker(proto, op_checker) { - AddInput("input", "input of cosine op"); - AddOutput("output", "output of cosine op"); - AddAttr("scale", "scale of cosine op") - .SetDefault(1.0) - .LargerThan(0.0); - AddType("cos"); - AddComment("This is cos op"); - } -}; - -REGISTER_OP(CosineOp, CosineOpProtoAndCheckerMaker, cos_sim) - -class MyTestOp : public OperatorWithKernel { - public: - void Run(const OpRunContext *context) const override { - printf("%s\n", DebugString().c_str()); - } -}; - -class MyTestOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker { - public: - MyTestOpProtoAndCheckerMaker(OpProto *proto, OpAttrChecker *op_checker) - : OpProtoAndCheckerMaker(proto, op_checker) { - AddInput("input", "input of cosine op"); - AddOutput("output", "output of cosine op"); - auto my_checker = [](int i) { - PADDLE_ENFORCE(i % 2 == 0, "'test_attr' must be even!"); - }; - AddAttr("test_attr", "a simple test attribute") - .AddCustomChecker(my_checker); - AddType("my_test_op"); - AddComment("This is my_test op"); - } -}; - -REGISTER_OP(MyTestOp, MyTestOpProtoAndCheckerMaker, my_test_op) - -} // namespace operators -} // namespace operators diff --git a/paddle/platform/device_context.h b/paddle/platform/device_context.h index 160eb4e120..e3c2cd2647 100644 --- a/paddle/platform/device_context.h +++ b/paddle/platform/device_context.h @@ -22,8 +22,8 @@ limitations under the License. 
*/ #include "paddle/platform/dynload/curand.h" #define EIGEN_USE_GPU #endif -#include "paddle/platform/place.h" -#include "unsupported/Eigen/CXX11/Tensor" +#include +#include namespace paddle { namespace platform { @@ -31,9 +31,16 @@ namespace platform { class DeviceContext { public: virtual ~DeviceContext() {} + virtual Place GetPlace() const = 0; }; -class CPUDeviceContext : public DeviceContext {}; +class CPUDeviceContext : public DeviceContext { + public: + Place GetPlace() const override { + Place retv = CPUPlace(); + return retv; + } +}; #ifndef PADDLE_ONLY_CPU @@ -61,6 +68,11 @@ class CUDADeviceContext : public DeviceContext { eigen_device_ = new Eigen::GpuDevice(eigen_stream_); } + Place GetPlace() const override { + Place retv = GPUPlace(); + return retv; + } + void Wait() { paddle::platform::throw_on_error(cudaStreamSynchronize(stream_), "cudaStreamSynchronize failed"); From 1ba879beadc33e84ff3d5c62ee5ac188027d7638 Mon Sep 17 00:00:00 2001 From: wanghaoshuang Date: Wed, 12 Jul 2017 14:38:03 +0800 Subject: [PATCH 089/205] Use PIL to read image in palette mode --- python/paddle/v2/dataset/tests/vocseg_test.py | 2 +- python/paddle/v2/dataset/voc_seg.py | 10 +++++++--- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/python/paddle/v2/dataset/tests/vocseg_test.py b/python/paddle/v2/dataset/tests/vocseg_test.py index 1a773fa18b..8217ff45b1 100644 --- a/python/paddle/v2/dataset/tests/vocseg_test.py +++ b/python/paddle/v2/dataset/tests/vocseg_test.py @@ -21,7 +21,7 @@ class TestVOC(unittest.TestCase): sum = 0 label = 0 for l in reader(): - self.assertEqual(l[0].size, l[1].size) + self.assertEqual(l[0].size, 3 * l[1].size) sum += 1 return sum diff --git a/python/paddle/v2/dataset/voc_seg.py b/python/paddle/v2/dataset/voc_seg.py index 595ab41cd8..0df4423ff0 100644 --- a/python/paddle/v2/dataset/voc_seg.py +++ b/python/paddle/v2/dataset/voc_seg.py @@ -20,14 +20,16 @@ with segmentation has been increased from 7,062 to 9,993. 
""" import tarfile +import io import numpy as np from common import download from paddle.v2.image import * +from PIL import Image __all__ = ['train', 'test', 'val'] VOC_URL = 'http://host.robots.ox.ac.uk/pascal/VOC/voc2012/\ - VOCtrainval_11-May-2012.tar' +VOCtrainval_11-May-2012.tar' VOC_MD5 = '6cd6e144f989b92b3379bac3b3de84fd' SET_FILE = 'VOCdevkit/VOC2012/ImageSets/Segmentation/{}.txt' @@ -51,8 +53,10 @@ def reader_creator(filename, sub_name): label_file = LABEL_FILE.format(line) data = tarobject.extractfile(name2mem[data_file]).read() label = tarobject.extractfile(name2mem[label_file]).read() - data = load_image_bytes(data) - label = load_image_bytes(label) + data = Image.open(io.BytesIO(data)) + label = Image.open(io.BytesIO(label)) + data = np.array(data) + label = np.array(label) yield data, label return reader From 4d336d9063451a7568863b249ac53fe7de8bbaa8 Mon Sep 17 00:00:00 2001 From: qijun Date: Wed, 12 Jul 2017 15:03:44 +0800 Subject: [PATCH 090/205] follow comments --- .../{cuda_device.h => cuda_device_context.h} | 15 ++++--- paddle/platform/cuda_device_test.cc | 33 --------------- paddle/platform/device.h | 41 ------------------- paddle/platform/device_context.h | 34 +++++++++++---- paddle/platform/device_context_test.cc | 33 +++++++++++---- 5 files changed, 59 insertions(+), 97 deletions(-) rename paddle/platform/{cuda_device.h => cuda_device_context.h} (94%) delete mode 100644 paddle/platform/cuda_device_test.cc delete mode 100644 paddle/platform/device.h diff --git a/paddle/platform/cuda_device.h b/paddle/platform/cuda_device_context.h similarity index 94% rename from paddle/platform/cuda_device.h rename to paddle/platform/cuda_device_context.h index cbb69d1cc5..420159fb2c 100644 --- a/paddle/platform/cuda_device.h +++ b/paddle/platform/cuda_device_context.h @@ -20,7 +20,6 @@ limitations under the License. */ #include "paddle/platform/dynload/cudnn.h" #include "paddle/platform/dynload/curand.h" #define EIGEN_USE_GPU -#include "paddle/platform/device.h" #include "paddle/platform/place.h" #include "unsupported/Eigen/CXX11/Tensor" @@ -29,6 +28,13 @@ using DEVICE_GPU = Eigen::GpuDevice; namespace paddle { namespace platform { +class CUDADeviceContext; + +template <> +DEVICE_GPU DeviceContext::get_eigen_device() { + return static_cast(this)->eigen_handle(); +} + class GPUPlaceGuard { public: explicit GPUPlaceGuard(GPUPlace new_place) : previous_(GetCurrentDeviceId()) { @@ -43,8 +49,7 @@ class GPUPlaceGuard { GPUPlace previous_; }; -template <> -class Device { +class CUDADeviceContext : public DeviceContext { public: explicit Device(const GPUPlace gpu_place) : gpu_place_(gpu_place) { GPUPlaceGuard guard(gpu_place_); @@ -61,7 +66,7 @@ class Device { cudaStream_t stream() { return stream_; } - DEVICE_GPU eigen_device() { return *eigen_device_; } + Eigen::GpuDevice eigen_device() { return *eigen_device_; } cublasHandle_t cublas_handle() { if (!blas_handle_) { @@ -139,7 +144,7 @@ class Device { cudaStream_t stream_; Eigen::CudaStreamDevice* eigen_stream_; - DEVICE_GPU* eigen_device_; + Eigen::GpuDevice* eigen_device_; cublasHandle_t blas_handle_{nullptr}; diff --git a/paddle/platform/cuda_device_test.cc b/paddle/platform/cuda_device_test.cc deleted file mode 100644 index ea647be876..0000000000 --- a/paddle/platform/cuda_device_test.cc +++ /dev/null @@ -1,33 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/platform/cuda_device.h" -#include "gtest/gtest.h" - -TEST(Device, Init) { - int count = paddle::platform::GetDeviceCount(); - for (int i = 0; i < count; i++) { - paddle::platform::Device* device = - new paddle::platform::Device(i); - Eigen::GpuDevice gpu_device = device->eigen_device(); - ASSERT_NE(nullptr, gpu_device.stream()); - cudnnHandle_t cudnn_handle = device->cudnn_handle(); - ASSERT_NE(nullptr, cudnn_handle); - cublasHandle_t cublas_handle = device->cublas_handle(); - ASSERT_NE(nullptr, cublas_handle); - curandGenerator_t curand_handle = device->curand_generator(); - ASSERT_NE(nullptr, curand_handle); - delete device; - } -} diff --git a/paddle/platform/device.h b/paddle/platform/device.h deleted file mode 100644 index 9ae41cbcb0..0000000000 --- a/paddle/platform/device.h +++ /dev/null @@ -1,41 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include "unsupported/Eigen/CXX11/Tensor" - -using DEVICE_CPU = Eigen::DefaultDevice; - -namespace paddle { -namespace platform { - -template -class Device; - -template <> -class Device { - public: - DEVICE_CPU eigen_handle() { - if (!eigen_handle_) { - eigen_handle_ = new Eigen::DefaultDevice(); - } - return *eigen_handle_; - } - - private: - DEVICE_CPU* eigen_handle_{nullptr}; -}; - -} // namespace platform -} // namespace paddle diff --git a/paddle/platform/device_context.h b/paddle/platform/device_context.h index 8b0bac6280..11a05702cd 100644 --- a/paddle/platform/device_context.h +++ b/paddle/platform/device_context.h @@ -13,23 +13,39 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #pragma once -#include "paddle/framework/enforce.h" -#include "paddle/platform/device.h" #include "unsupported/Eigen/CXX11/Tensor" -#ifndef PADDLE_ONLY_CPU -#include "paddle/platform/cuda_device.h" -#endif + +using DEVICE_CPU = Eigen::DefaultDevice; namespace paddle { namespace platform { -struct DeviceContext { - void* device_context{nullptr}; +class CPUDeviceContext; + +class DeviceContext { + public: + virtual ~DeviceContext() {} template - inline paddle::platform::Device* device_context() { - return static_cast*>(device_context); + DeviceType get_eigen_device(); +}; + +template <> +DEVICE_CPU DeviceContext::get_eigen_device() { + return static_cast(this)->eigen_handle(); +} + +class CPUDeviceContext : public DeviceContext { + public: + Eigen::DefaultDevice eigen_handle() { + if (!eigen_handle_) { + eigen_handle_ = new Eigen::DefaultDevice(); + } + return *eigen_handle_; } + + private: + Eigen::DefaultDevice* eigen_handle_{nullptr}; }; } // namespace platform diff --git a/paddle/platform/device_context_test.cc b/paddle/platform/device_context_test.cc index ab8a6d8195..8390e97b15 100644 --- a/paddle/platform/device_context_test.cc +++ b/paddle/platform/device_context_test.cc @@ -12,19 +12,34 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/platform/device_context.h" #include "gtest/gtest.h" +#include "paddle/platform/cuda_device.h" -TEST(DeviceContext, Init) { +TEST(Device, Init) { int count = paddle::platform::GetDeviceCount(); for (int i = 0; i < count; i++) { - paddle::platform::Device* device = - new paddle::platform::Device(i); - paddle::platform::DeviceContext context; - context.device_context = device; + paddle::platform::DeviceContext* device_context = + new paddle::platform::CUDADeviceContext(i); Eigen::GpuDevice gpu_device = - context.device_context->eigen_device(); + device_context->get_eigen_device(); ASSERT_NE(nullptr, gpu_device.stream()); - delete device; + delete device_context; } -} \ No newline at end of file +} + +TEST(Device, CUDADeviceContext) { + int count = paddle::platform::GetDeviceCount(); + for (int i = 0; i < count; i++) { + paddle::platform::CUDADeviceContext* device_context = + new paddle::platform::CUDADeviceContext(i); + Eigen::GpuDevice gpu_device = device_context->eigen_device(); + ASSERT_NE(nullptr, gpu_device.stream()); + cudnnHandle_t cudnn_handle = device_context->cudnn_handle(); + ASSERT_NE(nullptr, cudnn_handle); + cublasHandle_t cublas_handle = device_context->cublas_handle(); + ASSERT_NE(nullptr, cublas_handle); + curandGenerator_t curand_handle = device_context->curand_generator(); + ASSERT_NE(nullptr, curand_handle); + delete device_context; + } +} From e0ea87c99d242ea19f23301bd97492e47cacf231 Mon Sep 17 00:00:00 2001 From: Luo Tao Date: Wed, 12 Jul 2017 15:38:14 +0800 Subject: [PATCH 091/205] fix pybind compile question --- paddle/pybind/pybind.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/paddle/pybind/pybind.cc b/paddle/pybind/pybind.cc index 55aebc59ec..f9f87acf15 100644 --- a/paddle/pybind/pybind.cc +++ b/paddle/pybind/pybind.cc @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include #include #include @@ -43,4 +44,4 @@ All parameter, weight, gradient are variables in Paddle. 
py::return_value_policy::reference); return m.ptr(); -} \ No newline at end of file +} From 8ee50a35d408634c817d3da849a15217e57dcba1 Mon Sep 17 00:00:00 2001 From: qijun Date: Wed, 12 Jul 2017 07:50:08 +0000 Subject: [PATCH 092/205] fix gpu build error --- paddle/platform/CMakeLists.txt | 1 - paddle/platform/cuda_device_context.h | 15 +++++++-------- paddle/platform/device_context.h | 24 +++++++++++------------- paddle/platform/device_context_test.cc | 5 +++-- 4 files changed, 21 insertions(+), 24 deletions(-) diff --git a/paddle/platform/CMakeLists.txt b/paddle/platform/CMakeLists.txt index d40e49b546..e93592cc4c 100644 --- a/paddle/platform/CMakeLists.txt +++ b/paddle/platform/CMakeLists.txt @@ -5,5 +5,4 @@ nv_test(cuda_test SRCS cuda_test.cu) cc_library(place SRCS place.cc) cc_test(place_test SRCS place_test.cc DEPS place glog gflags) -nv_test(cuda_device_test SRCS cuda_device_test.cc DEPS dynload_cuda dynamic_loader eigen3 place) nv_test(device_context_test SRCS device_context_test.cc DEPS dynload_cuda dynamic_loader eigen3 place) diff --git a/paddle/platform/cuda_device_context.h b/paddle/platform/cuda_device_context.h index c38dcd5a61..8a9d15e8a8 100644 --- a/paddle/platform/cuda_device_context.h +++ b/paddle/platform/cuda_device_context.h @@ -20,19 +20,13 @@ limitations under the License. */ #include "paddle/platform/dynload/cudnn.h" #include "paddle/platform/dynload/curand.h" #define EIGEN_USE_GPU +#include "paddle/platform/device_context.h" #include "paddle/platform/place.h" #include "unsupported/Eigen/CXX11/Tensor" namespace paddle { namespace platform { -class CUDADeviceContext; - -template <> -Eigen::GpuDevice DeviceContext::get_eigen_device() { - return static_cast(this)->eigen_handle(); -} - class GPUPlaceGuard { public: explicit GPUPlaceGuard(GPUPlace new_place) : previous_(GetCurrentDeviceId()) { @@ -49,7 +43,7 @@ class GPUPlaceGuard { class CUDADeviceContext : public DeviceContext { public: - explicit Device(const GPUPlace gpu_place) : gpu_place_(gpu_place) { + explicit CUDADeviceContext(const GPUPlace gpu_place) : gpu_place_(gpu_place) { GPUPlaceGuard guard(gpu_place_); paddle::platform::throw_on_error(cudaStreamCreate(&stream_), "cudaStreamCreate failed"); @@ -156,5 +150,10 @@ class CUDADeviceContext : public DeviceContext { int random_seed_; curandGenerator_t rand_generator_{nullptr}; }; + +template <> +Eigen::GpuDevice DeviceContext::get_eigen_device() { + return dynamic_cast(this)->eigen_device(); +} } // namespace platform } // namespace paddle diff --git a/paddle/platform/device_context.h b/paddle/platform/device_context.h index d2a5169991..d2f7cf6216 100644 --- a/paddle/platform/device_context.h +++ b/paddle/platform/device_context.h @@ -20,30 +20,23 @@ limitations under the License. 
*/ namespace paddle { namespace platform { -class CPUDeviceContext; - class DeviceContext { public: virtual ~DeviceContext() {} template - DeviceType get_eigen_device(); + inline DeviceType get_eigen_device(); virtual Place GetPlace() const = 0; }; -template <> -Eigen::DefaultDevice DeviceContext::get_eigen_device() { - return static_cast(this)->eigen_handle(); -} - class CPUDeviceContext : public DeviceContext { public: - Eigen::DefaultDevice eigen_handle() { - if (!eigen_handle_) { - eigen_handle_ = new Eigen::DefaultDevice(); + Eigen::DefaultDevice eigen_device() { + if (!eigen_device_) { + eigen_device_ = new Eigen::DefaultDevice(); } - return *eigen_handle_; + return *eigen_device_; } Place GetPlace() const override { @@ -52,7 +45,12 @@ class CPUDeviceContext : public DeviceContext { } private: - Eigen::DefaultDevice* eigen_handle_{nullptr}; + Eigen::DefaultDevice* eigen_device_{nullptr}; }; + +template <> +Eigen::DefaultDevice DeviceContext::get_eigen_device() { + return dynamic_cast(this)->eigen_device(); +} } // namespace platform } // namespace paddle diff --git a/paddle/platform/device_context_test.cc b/paddle/platform/device_context_test.cc index 8390e97b15..abaaaececf 100644 --- a/paddle/platform/device_context_test.cc +++ b/paddle/platform/device_context_test.cc @@ -13,15 +13,16 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "gtest/gtest.h" -#include "paddle/platform/cuda_device.h" +#include "paddle/platform/cuda_device_context.h" +using DEVICE_GPU = Eigen::GpuDevice; TEST(Device, Init) { int count = paddle::platform::GetDeviceCount(); for (int i = 0; i < count; i++) { paddle::platform::DeviceContext* device_context = new paddle::platform::CUDADeviceContext(i); Eigen::GpuDevice gpu_device = - device_context->get_eigen_device(); + device_context->template get_eigen_device(); ASSERT_NE(nullptr, gpu_device.stream()); delete device_context; } From 85806e75850aa6284afa4456daab7990186a0493 Mon Sep 17 00:00:00 2001 From: qijun Date: Wed, 12 Jul 2017 16:23:10 +0800 Subject: [PATCH 093/205] follow comments --- paddle/platform/CMakeLists.txt | 9 +- paddle/platform/cuda_device_context.h | 159 ------------------------- paddle/platform/device_context.cc | 24 ++++ paddle/platform/device_context.h | 151 +++++++++++++++++++++-- paddle/platform/device_context_test.cc | 2 +- 5 files changed, 177 insertions(+), 168 deletions(-) delete mode 100644 paddle/platform/cuda_device_context.h create mode 100644 paddle/platform/device_context.cc diff --git a/paddle/platform/CMakeLists.txt b/paddle/platform/CMakeLists.txt index e93592cc4c..358d14f455 100644 --- a/paddle/platform/CMakeLists.txt +++ b/paddle/platform/CMakeLists.txt @@ -5,4 +5,11 @@ nv_test(cuda_test SRCS cuda_test.cu) cc_library(place SRCS place.cc) cc_test(place_test SRCS place_test.cc DEPS place glog gflags) -nv_test(device_context_test SRCS device_context_test.cc DEPS dynload_cuda dynamic_loader eigen3 place) +IF(WITH_GPU) + set(GPU_CTX_DEPS dynload_cuda dynamic_loader) +ELSE() + set(GPU_CTX_DEPS) +ENDIF() + +cc_library(device_context SRCS device_context.cc DEPS place eigen3 ${GPU_CTX_DEPS}) +nv_test(device_context_test SRCS device_context_test.cc DEPS device_context glog gflags) diff --git a/paddle/platform/cuda_device_context.h b/paddle/platform/cuda_device_context.h deleted file mode 100644 index 8a9d15e8a8..0000000000 --- a/paddle/platform/cuda_device_context.h +++ /dev/null @@ -1,159 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "paddle/framework/enforce.h" -#include "paddle/platform/cuda.h" -#include "paddle/platform/dynload/cublas.h" -#include "paddle/platform/dynload/cudnn.h" -#include "paddle/platform/dynload/curand.h" -#define EIGEN_USE_GPU -#include "paddle/platform/device_context.h" -#include "paddle/platform/place.h" -#include "unsupported/Eigen/CXX11/Tensor" - -namespace paddle { -namespace platform { - -class GPUPlaceGuard { - public: - explicit GPUPlaceGuard(GPUPlace new_place) : previous_(GetCurrentDeviceId()) { - if (previous_ != new_place) { - paddle::platform::SetDeviceId(new_place.device); - } - } - - ~GPUPlaceGuard() { paddle::platform::SetDeviceId(previous_.device); } - - private: - GPUPlace previous_; -}; - -class CUDADeviceContext : public DeviceContext { - public: - explicit CUDADeviceContext(const GPUPlace gpu_place) : gpu_place_(gpu_place) { - GPUPlaceGuard guard(gpu_place_); - paddle::platform::throw_on_error(cudaStreamCreate(&stream_), - "cudaStreamCreate failed"); - eigen_stream_ = new Eigen::CudaStreamDevice(&stream_); - eigen_device_ = new Eigen::GpuDevice(eigen_stream_); - } - - Place GetPlace() const override { - Place retv = GPUPlace(); - return retv; - } - - void Wait() { - paddle::platform::throw_on_error(cudaStreamSynchronize(stream_), - "cudaStreamSynchronize failed"); - } - - cudaStream_t stream() { return stream_; } - - Eigen::GpuDevice eigen_device() { return *eigen_device_; } - - cublasHandle_t cublas_handle() { - if (!blas_handle_) { - GPUPlaceGuard guard(gpu_place_); - PADDLE_ENFORCE(paddle::platform::dynload::cublasCreate(&blas_handle_) == - CUBLAS_STATUS_SUCCESS, - "cublasCreate failed"); - PADDLE_ENFORCE(paddle::platform::dynload::cublasSetStream( - blas_handle_, stream_) == CUBLAS_STATUS_SUCCESS, - "cublasSetStream failed"); - } - return blas_handle_; - } - - cudnnHandle_t cudnn_handle() { - if (!dnn_handle_) { - GPUPlaceGuard guard(gpu_place_); - PADDLE_ENFORCE(paddle::platform::dynload::cudnnCreate(&dnn_handle_) == - CUDNN_STATUS_SUCCESS, - "cudnnCreate failed"); - PADDLE_ENFORCE(paddle::platform::dynload::cudnnSetStream( - dnn_handle_, stream_) == CUDNN_STATUS_SUCCESS, - "cudnnSetStream failed"); - } - return dnn_handle_; - } - - curandGenerator_t curand_generator() { - if (!rand_generator_) { - GPUPlaceGuard guard(gpu_place_); - PADDLE_ENFORCE(paddle::platform::dynload::curandCreateGenerator( - &rand_generator_, CURAND_RNG_PSEUDO_DEFAULT) == - CURAND_STATUS_SUCCESS, - "curandCreateGenerator failed"); - PADDLE_ENFORCE( - paddle::platform::dynload::curandSetPseudoRandomGeneratorSeed( - rand_generator_, random_seed_) == CURAND_STATUS_SUCCESS, - "curandSetPseudoRandomGeneratorSeed failed"); - PADDLE_ENFORCE(paddle::platform::dynload::curandSetStream( - rand_generator_, stream_) == CURAND_STATUS_SUCCESS, - "curandSetStream failed"); - } - return rand_generator_; - } - - ~CUDADeviceContext() { - Wait(); - if (blas_handle_) { - PADDLE_ENFORCE(paddle::platform::dynload::cublasDestroy(blas_handle_) == - 
CUBLAS_STATUS_SUCCESS, - "cublasDestroy failed"); - } - - if (dnn_handle_) { - PADDLE_ENFORCE(paddle::platform::dynload::cudnnDestroy(dnn_handle_) == - CUDNN_STATUS_SUCCESS, - "cudnnDestroy failed"); - } - - if (rand_generator_) { - PADDLE_ENFORCE(paddle::platform::dynload::curandDestroyGenerator( - rand_generator_) == CURAND_STATUS_SUCCESS, - "curandDestroyGenerator failed"); - } - - delete eigen_stream_; - delete eigen_device_; - - paddle::platform::throw_on_error(cudaStreamDestroy(stream_), - "cudaStreamDestroy failed"); - } - - private: - GPUPlace gpu_place_; - cudaStream_t stream_; - - Eigen::CudaStreamDevice* eigen_stream_; - Eigen::GpuDevice* eigen_device_; - - cublasHandle_t blas_handle_{nullptr}; - - cudnnHandle_t dnn_handle_{nullptr}; - - int random_seed_; - curandGenerator_t rand_generator_{nullptr}; -}; - -template <> -Eigen::GpuDevice DeviceContext::get_eigen_device() { - return dynamic_cast(this)->eigen_device(); -} -} // namespace platform -} // namespace paddle diff --git a/paddle/platform/device_context.cc b/paddle/platform/device_context.cc new file mode 100644 index 0000000000..8d800ec499 --- /dev/null +++ b/paddle/platform/device_context.cc @@ -0,0 +1,24 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/platform/device_context.h" + +namespace paddle { +namespace platform { +namespace dynload { +namespace dummy { +// Make DeviceContext A library. +int DUMMY_VAR_FOR_DEV_CTX = 0; + +} // namespace dummy +} // namespace dynload +} // namespace platform +} // namespace paddle \ No newline at end of file diff --git a/paddle/platform/device_context.h b/paddle/platform/device_context.h index d2f7cf6216..5b4b5e2999 100644 --- a/paddle/platform/device_context.h +++ b/paddle/platform/device_context.h @@ -1,11 +1,8 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -13,9 +10,17 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #pragma once + #include "paddle/framework/enforce.h" -#include "paddle/platform/place.h" -#include "unsupported/Eigen/CXX11/Tensor" +#ifndef PADDLE_ONLY_CPU +#include "paddle/platform/cuda.h" +#include "paddle/platform/dynload/cublas.h" +#include "paddle/platform/dynload/cudnn.h" +#include "paddle/platform/dynload/curand.h" +#define EIGEN_USE_GPU +#endif +#include +#include namespace paddle { namespace platform { @@ -23,11 +28,10 @@ namespace platform { class DeviceContext { public: virtual ~DeviceContext() {} + virtual Place GetPlace() const = 0; template inline DeviceType get_eigen_device(); - - virtual Place GetPlace() const = 0; }; class CPUDeviceContext : public DeviceContext { @@ -52,5 +56,138 @@ template <> Eigen::DefaultDevice DeviceContext::get_eigen_device() { return dynamic_cast(this)->eigen_device(); } + +#ifndef PADDLE_ONLY_CPU + +class GPUPlaceGuard { + public: + explicit GPUPlaceGuard(GPUPlace new_place) : previous_(GetCurrentDeviceId()) { + if (previous_ != new_place) { + paddle::platform::SetDeviceId(new_place.device); + } + } + + ~GPUPlaceGuard() { paddle::platform::SetDeviceId(previous_.device); } + + private: + GPUPlace previous_; +}; + +class CUDADeviceContext : public DeviceContext { + public: + explicit CUDADeviceContext(const GPUPlace gpu_place) : gpu_place_(gpu_place) { + GPUPlaceGuard guard(gpu_place_); + paddle::platform::throw_on_error(cudaStreamCreate(&stream_), + "cudaStreamCreate failed"); + eigen_stream_ = new Eigen::CudaStreamDevice(&stream_); + eigen_device_ = new Eigen::GpuDevice(eigen_stream_); + } + + Place GetPlace() const override { + Place retv = GPUPlace(); + return retv; + } + + void Wait() { + paddle::platform::throw_on_error(cudaStreamSynchronize(stream_), + "cudaStreamSynchronize failed"); + } + + cudaStream_t stream() { return stream_; } + + Eigen::GpuDevice eigen_device() { return *eigen_device_; } + + cublasHandle_t cublas_handle() { + if (!blas_handle_) { + GPUPlaceGuard guard(gpu_place_); + PADDLE_ENFORCE(paddle::platform::dynload::cublasCreate(&blas_handle_) == + CUBLAS_STATUS_SUCCESS, + "cublasCreate failed"); + PADDLE_ENFORCE(paddle::platform::dynload::cublasSetStream( + blas_handle_, stream_) == CUBLAS_STATUS_SUCCESS, + "cublasSetStream failed"); + } + return blas_handle_; + } + + cudnnHandle_t cudnn_handle() { + if (!dnn_handle_) { + GPUPlaceGuard guard(gpu_place_); + PADDLE_ENFORCE(paddle::platform::dynload::cudnnCreate(&dnn_handle_) == + CUDNN_STATUS_SUCCESS, + "cudnnCreate failed"); + PADDLE_ENFORCE(paddle::platform::dynload::cudnnSetStream( + dnn_handle_, stream_) == CUDNN_STATUS_SUCCESS, + "cudnnSetStream failed"); + } + return dnn_handle_; + } + + curandGenerator_t curand_generator() { + if (!rand_generator_) { + GPUPlaceGuard guard(gpu_place_); + PADDLE_ENFORCE(paddle::platform::dynload::curandCreateGenerator( + &rand_generator_, CURAND_RNG_PSEUDO_DEFAULT) == + CURAND_STATUS_SUCCESS, + "curandCreateGenerator failed"); + PADDLE_ENFORCE( + paddle::platform::dynload::curandSetPseudoRandomGeneratorSeed( + rand_generator_, random_seed_) == CURAND_STATUS_SUCCESS, + "curandSetPseudoRandomGeneratorSeed failed"); + PADDLE_ENFORCE(paddle::platform::dynload::curandSetStream( + rand_generator_, stream_) == CURAND_STATUS_SUCCESS, + "curandSetStream failed"); + } + return rand_generator_; + } + + ~CUDADeviceContext() { + Wait(); + if (blas_handle_) { + PADDLE_ENFORCE(paddle::platform::dynload::cublasDestroy(blas_handle_) == + CUBLAS_STATUS_SUCCESS, + "cublasDestroy failed"); + } + + if (dnn_handle_) { + 
PADDLE_ENFORCE(paddle::platform::dynload::cudnnDestroy(dnn_handle_) == + CUDNN_STATUS_SUCCESS, + "cudnnDestroy failed"); + } + + if (rand_generator_) { + PADDLE_ENFORCE(paddle::platform::dynload::curandDestroyGenerator( + rand_generator_) == CURAND_STATUS_SUCCESS, + "curandDestroyGenerator failed"); + } + + delete eigen_stream_; + delete eigen_device_; + + paddle::platform::throw_on_error(cudaStreamDestroy(stream_), + "cudaStreamDestroy failed"); + } + + private: + GPUPlace gpu_place_; + cudaStream_t stream_; + + Eigen::CudaStreamDevice* eigen_stream_; + Eigen::GpuDevice* eigen_device_; + + cublasHandle_t blas_handle_{nullptr}; + + cudnnHandle_t dnn_handle_{nullptr}; + + int random_seed_; + curandGenerator_t rand_generator_{nullptr}; +}; + +template <> +Eigen::GpuDevice DeviceContext::get_eigen_device() { + return dynamic_cast(this)->eigen_device(); +} +#endif + } // namespace platform } // namespace paddle diff --git a/paddle/platform/device_context_test.cc b/paddle/platform/device_context_test.cc index abaaaececf..913e3c0aa9 100644 --- a/paddle/platform/device_context_test.cc +++ b/paddle/platform/device_context_test.cc @@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/platform/device_context.h" #include "gtest/gtest.h" -#include "paddle/platform/cuda_device_context.h" using DEVICE_GPU = Eigen::GpuDevice; TEST(Device, Init) { From 6986a89331673df8c449a2894747c027cc52cc34 Mon Sep 17 00:00:00 2001 From: gangliao Date: Wed, 12 Jul 2017 17:02:53 +0800 Subject: [PATCH 094/205] FIX: add -lrt for link --- cmake/generic.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/generic.cmake b/cmake/generic.cmake index 83e3d155d0..a30cdeff62 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -93,7 +93,7 @@ include_directories(${CMAKE_CURRENT_BINARY_DIR}) if(NOT APPLE) find_package(Threads REQUIRED) link_libraries(${CMAKE_THREAD_LIBS_INIT}) - set(CMAKE_CXX_LINK_EXECUTABLE "${CMAKE_CXX_LINK_EXECUTABLE} -ldl") + set(CMAKE_CXX_LINK_EXECUTABLE "${CMAKE_CXX_LINK_EXECUTABLE} -ldl -lrt") endif(NOT APPLE) function(merge_static_libs TARGET_NAME) From a07deac9efb1dc2ff7cea2a9534847512533a8b1 Mon Sep 17 00:00:00 2001 From: qijun Date: Wed, 12 Jul 2017 09:09:12 +0000 Subject: [PATCH 095/205] follow comments --- paddle/platform/device_context.cc | 20 +++++++++++++------- paddle/platform/device_context.h | 11 +---------- 2 files changed, 14 insertions(+), 17 deletions(-) diff --git a/paddle/platform/device_context.cc b/paddle/platform/device_context.cc index 8d800ec499..25ff352e8c 100644 --- a/paddle/platform/device_context.cc +++ b/paddle/platform/device_context.cc @@ -13,12 +13,18 @@ limitations under the License. */ namespace paddle { namespace platform { -namespace dynload { -namespace dummy { -// Make DeviceContext A library. 
-int DUMMY_VAR_FOR_DEV_CTX = 0; -} // namespace dummy -} // namespace dynload +template <> +Eigen::DefaultDevice DeviceContext::get_eigen_device() { + return reinterpret_cast(this)->eigen_device(); +} + +#ifndef PADDLE_ONLY_CPU +template <> +Eigen::GpuDevice DeviceContext::get_eigen_device() { + return reinterpret_cast(this)->eigen_device(); +} +#endif + } // namespace platform -} // namespace paddle \ No newline at end of file +} // namespace paddle diff --git a/paddle/platform/device_context.h b/paddle/platform/device_context.h index 5b4b5e2999..d6cf114216 100644 --- a/paddle/platform/device_context.h +++ b/paddle/platform/device_context.h @@ -31,7 +31,7 @@ class DeviceContext { virtual Place GetPlace() const = 0; template - inline DeviceType get_eigen_device(); + DeviceType get_eigen_device(); }; class CPUDeviceContext : public DeviceContext { @@ -52,11 +52,6 @@ class CPUDeviceContext : public DeviceContext { Eigen::DefaultDevice* eigen_device_{nullptr}; }; -template <> -Eigen::DefaultDevice DeviceContext::get_eigen_device() { - return dynamic_cast(this)->eigen_device(); -} - #ifndef PADDLE_ONLY_CPU class GPUPlaceGuard { @@ -183,10 +178,6 @@ class CUDADeviceContext : public DeviceContext { curandGenerator_t rand_generator_{nullptr}; }; -template <> -Eigen::GpuDevice DeviceContext::get_eigen_device() { - return dynamic_cast(this)->eigen_device(); -} #endif } // namespace platform From be2c1a3b99947ace0717ac79ebbf1b25ecb1055d Mon Sep 17 00:00:00 2001 From: qijun Date: Wed, 12 Jul 2017 09:41:33 +0000 Subject: [PATCH 096/205] follow comments --- paddle/platform/device_context.cc | 4 ++-- paddle/platform/device_context.h | 26 ++++++++++++-------------- paddle/platform/device_context_test.cc | 8 ++++---- 3 files changed, 18 insertions(+), 20 deletions(-) diff --git a/paddle/platform/device_context.cc b/paddle/platform/device_context.cc index 25ff352e8c..960ef0a595 100644 --- a/paddle/platform/device_context.cc +++ b/paddle/platform/device_context.cc @@ -15,13 +15,13 @@ namespace paddle { namespace platform { template <> -Eigen::DefaultDevice DeviceContext::get_eigen_device() { +Eigen::DefaultDevice* DeviceContext::get_eigen_device() { return reinterpret_cast(this)->eigen_device(); } #ifndef PADDLE_ONLY_CPU template <> -Eigen::GpuDevice DeviceContext::get_eigen_device() { +Eigen::GpuDevice* DeviceContext::get_eigen_device() { return reinterpret_cast(this)->eigen_device(); } #endif diff --git a/paddle/platform/device_context.h b/paddle/platform/device_context.h index d6cf114216..94f54d705d 100644 --- a/paddle/platform/device_context.h +++ b/paddle/platform/device_context.h @@ -31,16 +31,16 @@ class DeviceContext { virtual Place GetPlace() const = 0; template - DeviceType get_eigen_device(); + DeviceType* get_eigen_device(); }; class CPUDeviceContext : public DeviceContext { public: - Eigen::DefaultDevice eigen_device() { + Eigen::DefaultDevice* eigen_device() { if (!eigen_device_) { - eigen_device_ = new Eigen::DefaultDevice(); + eigen_device_.reset(new Eigen::DefaultDevice()); } - return *eigen_device_; + return eigen_device_.get(); } Place GetPlace() const override { @@ -49,7 +49,7 @@ class CPUDeviceContext : public DeviceContext { } private: - Eigen::DefaultDevice* eigen_device_{nullptr}; + std::unique_ptr eigen_device_; }; #ifndef PADDLE_ONLY_CPU @@ -74,8 +74,8 @@ class CUDADeviceContext : public DeviceContext { GPUPlaceGuard guard(gpu_place_); paddle::platform::throw_on_error(cudaStreamCreate(&stream_), "cudaStreamCreate failed"); - eigen_stream_ = new Eigen::CudaStreamDevice(&stream_); - 
eigen_device_ = new Eigen::GpuDevice(eigen_stream_); + eigen_stream_.reset(new Eigen::CudaStreamDevice(&stream_)); + eigen_device_.reset(new Eigen::GpuDevice(eigen_stream_.get())); } Place GetPlace() const override { @@ -90,7 +90,7 @@ class CUDADeviceContext : public DeviceContext { cudaStream_t stream() { return stream_; } - Eigen::GpuDevice eigen_device() { return *eigen_device_; } + Eigen::GpuDevice* eigen_device() { return eigen_device_.get(); } cublasHandle_t cublas_handle() { if (!blas_handle_) { @@ -155,10 +155,8 @@ class CUDADeviceContext : public DeviceContext { rand_generator_) == CURAND_STATUS_SUCCESS, "curandDestroyGenerator failed"); } - - delete eigen_stream_; - delete eigen_device_; - + eigen_stream_.reset(); + eigen_device_.reset(); paddle::platform::throw_on_error(cudaStreamDestroy(stream_), "cudaStreamDestroy failed"); } @@ -167,8 +165,8 @@ class CUDADeviceContext : public DeviceContext { GPUPlace gpu_place_; cudaStream_t stream_; - Eigen::CudaStreamDevice* eigen_stream_; - Eigen::GpuDevice* eigen_device_; + std::unique_ptr eigen_stream_; + std::unique_ptr eigen_device_; cublasHandle_t blas_handle_{nullptr}; diff --git a/paddle/platform/device_context_test.cc b/paddle/platform/device_context_test.cc index 913e3c0aa9..af2ce17fc2 100644 --- a/paddle/platform/device_context_test.cc +++ b/paddle/platform/device_context_test.cc @@ -21,9 +21,9 @@ TEST(Device, Init) { for (int i = 0; i < count; i++) { paddle::platform::DeviceContext* device_context = new paddle::platform::CUDADeviceContext(i); - Eigen::GpuDevice gpu_device = + Eigen::GpuDevice* gpu_device = device_context->template get_eigen_device(); - ASSERT_NE(nullptr, gpu_device.stream()); + ASSERT_NE(nullptr, gpu_device); delete device_context; } } @@ -33,8 +33,8 @@ TEST(Device, CUDADeviceContext) { for (int i = 0; i < count; i++) { paddle::platform::CUDADeviceContext* device_context = new paddle::platform::CUDADeviceContext(i); - Eigen::GpuDevice gpu_device = device_context->eigen_device(); - ASSERT_NE(nullptr, gpu_device.stream()); + Eigen::GpuDevice* gpu_device = device_context->eigen_device(); + ASSERT_NE(nullptr, gpu_device); cudnnHandle_t cudnn_handle = device_context->cudnn_handle(); ASSERT_NE(nullptr, cudnn_handle); cublasHandle_t cublas_handle = device_context->cublas_handle(); From 69d99d481dc553c2f26d967d365b7ebc7e228e07 Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Wed, 12 Jul 2017 17:58:35 +0800 Subject: [PATCH 097/205] Add Tensor::CopyFrom and Tensor::mutable_data(Place place) 1. Add `Tensor::CopyFrom`. Current version can only support CPU memory copy. The support of GPU will be provided later by `paddle::memory`. The current implementation of `Tensor::CopyFrom` is a little inefficient: Every time `CopyFrom` is called, tensor will re-allocate its memory. However, if we try to check and reuse `placeholder_`, we have to provide a template parameter for `CopyFrom` to indicate the data type. It seems strange for a simple copy function. 2. Add `Tensor::mutable_data(Place place)`, which directly use member variable `dims_` as its dim parameter. This interface is required by `Op::InferShape`. --- paddle/framework/tensor.h | 34 +++++++++++++++++++++++++++++++-- paddle/framework/tensor_test.cc | 25 ++++++++++++++++++++++++ 2 files changed, 57 insertions(+), 2 deletions(-) diff --git a/paddle/framework/tensor.h b/paddle/framework/tensor.h index a0945e8055..7f3894bb3c 100644 --- a/paddle/framework/tensor.h +++ b/paddle/framework/tensor.h @@ -15,6 +15,7 @@ limitations under the License. 
*/
 #pragma once
 
 #include <memory>
+#include <cstring>
 #include <type_traits>
 #include <typeinfo>
 #include "paddle/framework/ddim.h"
@@ -44,11 +45,17 @@ class Tensor {
             typename std::enable_if<std::is_pod<T>::value>::type* = nullptr>
   T* mutable_data(DDim dims, paddle::platform::Place place) {
     dims_ = dims;
+    return mutable_data<T>(place);
+  }
+
+  template <typename T,
+            typename std::enable_if<std::is_pod<T>::value>::type* = nullptr>
+  T* mutable_data(paddle::platform::Place place) {
     if (holder_ == nullptr ||
         !(holder_->Place() ==
           place) /* some versions of boost::variant don't have operator!= */
-        || holder_->Size() < product(dims) * sizeof(T) + offset_) {
-      holder_.reset(new PlaceholderImpl<T>(place, product(dims) * sizeof(T)));
+        || holder_->Size() < product(dims_) * sizeof(T) + offset_) {
+      holder_.reset(new PlaceholderImpl<T>(place, product(dims_) * sizeof(T)));
       offset_ = 0;
     }
     return reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(holder_->Ptr()) +
@@ -63,6 +70,15 @@ class Tensor {
     offset_ = src.offset_;
   }
 
+  void CopyFrom(const Tensor& src, paddle::platform::Place dst_place) {
+    PADDLE_ENFORCE(src.holder_ != nullptr,
+                   "Can not copy from an uninitialized tensor.");
+    size_t size = product(src.dims()) * src.holder_->TypeSize();
+    holder_.reset(src.holder_->Clone(src.offset_, size, dst_place));
+    dims_ = src.dims();
+    offset_ = 0;
+  }
+
   Tensor Slice(const int& begin_idx, const int& end_idx) const {
     PADDLE_ENFORCE(holder_ != nullptr,
                    "The sliced tensor has not been initialized.");
@@ -95,6 +111,8 @@ class Tensor {
     virtual paddle::platform::Place Place() const = 0;
     virtual size_t Size() const = 0;
     virtual size_t TypeSize() const = 0;
+    virtual Placeholder* Clone(size_t begin, size_t size,
+                               paddle::platform::Place place) const = 0;
   };
 
   template <typename T>
@@ -122,6 +140,18 @@ class Tensor {
     virtual size_t Size() const { return size_; }
     virtual paddle::platform::Place Place() const { return place_; }
     virtual size_t TypeSize() const { return sizeof(T); }
+    // TODO: Clone only supports CPU now. GPU support is needed.
+    virtual Placeholder* Clone(size_t begin, size_t size,
+                               paddle::platform::Place place) const {
+      PADDLE_ENFORCE(paddle::platform::is_cpu_place(place_) &&
+                         paddle::platform::is_cpu_place(place),
+                     "PlaceholderImpl::Clone only supports CPU now.");
+      PlaceholderImpl<T>* dst = new PlaceholderImpl<T>(place, size);
+      void* begin_ptr =
+          reinterpret_cast<void*>(reinterpret_cast<uintptr_t>(Ptr()) + begin);
+      memcpy(dst->Ptr(), begin_ptr, size);
+      return dst;
+    }
 
     std::unique_ptr<T, Deleter> ptr_;
     paddle::platform::Place place_;  // record the place of ptr_.
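To make the trade-off in the commit message concrete, a holder-reusing variant
would have to be a template, roughly like the hypothetical sketch below (not
part of this patch), because the element type is needed to size and cast the
buffer:

    // Hypothetical alternative: reuse the existing allocation via
    // mutable_data<T>, at the cost of a template parameter on CopyFrom.
    template <typename T>
    void CopyFrom(const Tensor& src, paddle::platform::Place dst_place) {
      T* dst = mutable_data<T>(src.dims(), dst_place);  // reuses holder_
      memcpy(dst, src.data<T>(), product(src.dims()) * sizeof(T));
    }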
diff --git a/paddle/framework/tensor_test.cc b/paddle/framework/tensor_test.cc
index f4822838cf..6db0ba8c79 100644
--- a/paddle/framework/tensor_test.cc
+++ b/paddle/framework/tensor_test.cc
@@ -178,4 +178,29 @@ TEST(Tensor, Slice) {
   }
 }
 
+TEST(Tensor, CopyFrom) {
+  using namespace paddle::framework;
+  using namespace paddle::platform;
+
+  Tensor src_tensor;
+  int* src_ptr = src_tensor.mutable_data<int>(make_ddim({3, 3}), CPUPlace());
+  int arr[9] = {1, 2, 3, 4, 5, 6, 7, 8, 9};
+  memcpy(src_ptr, arr, 9 * sizeof(int));
+  Tensor dst_tensor;
+  dst_tensor.CopyFrom(src_tensor, CPUPlace());
+  const int* dst_ptr = dst_tensor.data<int>();
+  ASSERT_NE(src_ptr, dst_ptr);
+  for (size_t i = 0; i < 9; ++i) {
+    EXPECT_EQ(src_ptr[i], dst_ptr[i]);
+  }
+
+  Tensor slice_tensor = src_tensor.Slice(1, 2);
+  dst_tensor.CopyFrom(slice_tensor, CPUPlace());
+  const int* slice_ptr = slice_tensor.data<int>();
+  dst_ptr = dst_tensor.data<int>();
+  ASSERT_NE(dst_ptr, slice_ptr);
+  for (size_t i = 0; i < 3; ++i) {
+    EXPECT_EQ(dst_ptr[i], slice_ptr[i]);
+  }
+}
\ No newline at end of file

From 06748210d4771b37bd964e25513102cd2e0fccbf Mon Sep 17 00:00:00 2001
From: hedaoyuan
Date: Wed, 12 Jul 2017 18:05:41 +0800
Subject: [PATCH 098/205] Fix some link errors about NNPACK.

---
 CMakeLists.txt                              |  3 ++-
 .../nnpack => cmake/external}/nnpack.cmake  | 14 +++++++++++
 paddle/function/CMakeLists.txt              |  1 -
 paddle/function/nnpack/NNPACKConvOp.cpp     | 23 +++++++++++--------
 4 files changed, 29 insertions(+), 12 deletions(-)
 rename {paddle/function/nnpack => cmake/external}/nnpack.cmake (54%)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 2c713db3e3..af58957ea8 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -135,7 +135,8 @@ if(WITH_GPU)
 endif(WITH_GPU)
 
 if(USE_NNPACK)
-  list(APPEND EXTERNAL_LIBS ${NNPACK_LIB} ${PTHREADPOOL_LIB} "rt")
+  include(external/nnpack)
+  list(APPEND EXTERNAL_LIBS ${NNPACK_LIBS})
 endif(USE_NNPACK)
 
 add_subdirectory(proto)
diff --git a/paddle/function/nnpack/nnpack.cmake b/cmake/external/nnpack.cmake
similarity index 54%
rename from paddle/function/nnpack/nnpack.cmake
rename to cmake/external/nnpack.cmake
index 7182730ae8..d42bcb0f32 100644
--- a/paddle/function/nnpack/nnpack.cmake
+++ b/cmake/external/nnpack.cmake
@@ -7,10 +7,24 @@ set(NNPACK_ROOT $ENV{NNPACK_ROOT} CACHE PATH "Folder contains NNPACK")
 find_path(NNPACK_INC_DIR nnpack.h PATHS ${NNPACK_ROOT}/include)
 find_library(NNPACK_LIB NAMES nnpack PATHS ${NNPACK_ROOT}/lib)
 find_library(PTHREADPOOL_LIB NAMES pthreadpool PATHS ${NNPACK_ROOT}/lib)
nnpack/NNPACKConvOpTest.cpp) diff --git a/paddle/function/nnpack/NNPACKConvOp.cpp b/paddle/function/nnpack/NNPACKConvOp.cpp index e8080c3d71..e83bca5d9f 100644 --- a/paddle/function/nnpack/NNPACKConvOp.cpp +++ b/paddle/function/nnpack/NNPACKConvOp.cpp @@ -58,18 +58,10 @@ public: workspaceBuffer_ = nullptr; workspaceSize_ = 0; - threadpool_ = nullptr; - if (FLAGS_nnpack_num_threads) { - threadpool_ = pthreadpool_create(FLAGS_nnpack_num_threads); - VLOG(3) << "Number of threads " - << pthreadpool_get_threads_count(threadpool_); - } + create_nnpack_threadpool(); } ~NNPACKConvFunction() { - if (threadpool_) { - pthreadpool_destroy(threadpool_); - } if (workspaceBuffer_) { free(workspaceBuffer_); } @@ -225,14 +217,25 @@ public: } } + static void create_nnpack_threadpool() { + if (FLAGS_nnpack_num_threads && threadpool_ == nullptr) { + threadpool_ = pthreadpool_create(FLAGS_nnpack_num_threads); + VLOG(3) << "Number of threads " + << pthreadpool_get_threads_count(threadpool_); + } + } + private: nnp_convolution_algorithm algorithm_; nnp_convolution_transform_strategy transform_strategy_; void* workspaceBuffer_; size_t workspaceSize_; - pthreadpool_t threadpool_; + static pthreadpool_t threadpool_; }; +template <DeviceType Device> +pthreadpool_t NNPACKConvFunction<Device>::threadpool_ = nullptr; + REGISTER_TYPED_FUNC(NNPACKConv, CPU, NNPACKConvFunction); } // namespace paddle From 70d937c595fb7f945bfae21d7d2a81f2a7ccc45a Mon Sep 17 00:00:00 2001 From: qijun Date: Wed, 12 Jul 2017 10:17:26 +0000 Subject: [PATCH 099/205] add memory header file --- paddle/platform/device_context.h | 1 + 1 file changed, 1 insertion(+) diff --git a/paddle/platform/device_context.h b/paddle/platform/device_context.h index 94f54d705d..7de07d06be 100644 --- a/paddle/platform/device_context.h +++ b/paddle/platform/device_context.h @@ -20,6 +20,7 @@ limitations under the License. */ #define EIGEN_USE_GPU #endif #include +#include <memory> #include namespace paddle { From 891e5dcc48590375d37364634838b6da260fd41e Mon Sep 17 00:00:00 2001 From: hedaoyuan Date: Wed, 12 Jul 2017 20:13:07 +0800 Subject: [PATCH 100/205] Modify the default value of nnpack_allocate_outside. --- paddle/function/nnpack/NNPACKConvOp.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/function/nnpack/NNPACKConvOp.cpp b/paddle/function/nnpack/NNPACKConvOp.cpp index e83bca5d9f..f0ec77a5d0 100644 --- a/paddle/function/nnpack/NNPACKConvOp.cpp +++ b/paddle/function/nnpack/NNPACKConvOp.cpp @@ -16,7 +16,7 @@ limitations under the License.
*/ #include "paddle/function/ConvOp.h" DEFINE_bool(nnpack_allocate_outside, - false, + true, "Allocate and free workspace memory outside the NNPACK interface."); DEFINE_int32(nnpack_num_threads, 0, From be441f7d162bd9638e07a6558cf2de9dd3c8b412 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Wed, 12 Jul 2017 20:36:40 +0800 Subject: [PATCH 101/205] test OpKernel (#2820) Add unit test for OpKernel --- paddle/framework/CMakeLists.txt | 2 +- paddle/framework/op_registry.h | 14 ++++---- paddle/framework/op_registry_test.cc | 4 +-- paddle/framework/operator_test.cc | 52 +++++++++++++++++++++++++++- 4 files changed, 61 insertions(+), 11 deletions(-) diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt index aac49fdb7a..b8642ca22a 100644 --- a/paddle/framework/CMakeLists.txt +++ b/paddle/framework/CMakeLists.txt @@ -12,7 +12,7 @@ cc_test(op_proto_test SRCS op_proto_test.cc DEPS op_proto protobuf) proto_library(op_desc SRCS op_desc.proto DEPS attr_type) cc_test(op_desc_test SRCS op_desc_test.cc DEPS op_desc protobuf) cc_library(operator SRCS operator.cc DEPS op_desc protobuf) -cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry) +cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry place) cc_library(op_registry SRCS op_registry.cc DEPS op_proto op_desc) cc_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry operator) py_proto_compile(framework_py_proto SRCS attr_type.proto op_proto.proto op_desc.proto) diff --git a/paddle/framework/op_registry.h b/paddle/framework/op_registry.h index 02c99d50bb..248c7a1a3b 100644 --- a/paddle/framework/op_registry.h +++ b/paddle/framework/op_registry.h @@ -147,13 +147,13 @@ class OpRegisterHelper { } }; -#define REGISTER_OP(__op_class, __op_maker_class, __op_type) \ - class __op_class##Register { \ - private: \ - const static OpRegisterHelper<__op_class, __op_maker_class> reg; \ - }; \ - const OpRegisterHelper<__op_class, __op_maker_class> \ - __op_class##Register::reg(#__op_type); +#define REGISTER_OP(type, op_class, op_maker_class) \ + class op_class##Register { \ + private: \ + const static OpRegisterHelper reg; \ + }; \ + const OpRegisterHelper op_class##Register::reg( \ + #type) } // namespace framework } // namespace paddle diff --git a/paddle/framework/op_registry_test.cc b/paddle/framework/op_registry_test.cc index f5d45a80bb..f5162fb870 100644 --- a/paddle/framework/op_registry_test.cc +++ b/paddle/framework/op_registry_test.cc @@ -26,7 +26,7 @@ class CosineOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker { } }; -REGISTER_OP(CosineOp, CosineOpProtoAndCheckerMaker, cos_sim) +REGISTER_OP(cos_sim, CosineOp, CosineOpProtoAndCheckerMaker); class MyTestOp : public OperatorBase { public: @@ -53,7 +53,7 @@ class MyTestOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker { } }; -REGISTER_OP(MyTestOp, MyTestOpProtoAndCheckerMaker, my_test_op) +REGISTER_OP(my_test_op, MyTestOp, MyTestOpProtoAndCheckerMaker); } // namespace framework } // namespace paddle diff --git a/paddle/framework/operator_test.cc b/paddle/framework/operator_test.cc index 86f45f108a..be8c4be2d4 100644 --- a/paddle/framework/operator_test.cc +++ b/paddle/framework/operator_test.cc @@ -45,7 +45,7 @@ class OperatorTestProtoAndCheckerMaker : public OpProtoAndCheckerMaker { } }; -REGISTER_OP(OperatorTest, OperatorTestProtoAndCheckerMaker, test_operator) +REGISTER_OP(test_operator, OperatorTest, OperatorTestProtoAndCheckerMaker); TEST(OperatorBase, all) { OpDesc op_desc; @@ -69,5 +69,55 @@ TEST(OperatorBase, all) { delete 
op; } +class OpKernelTestProtoAndCheckerMaker : public OpProtoAndCheckerMaker { + public: + OpKernelTestProtoAndCheckerMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("input", "input of test op"); + AddOutput("output", "output of test op"); + AddAttr<float>("scale", "scale of cosine op") + .SetDefault(1.0) + .LargerThan(0.0); + AddType("test_operator"); + AddComment("This is test op"); + } +}; + +class OpWithKernelTest : public OperatorWithKernel { + public: + void InferShape(const std::shared_ptr<Scope>& scope) const override {} +}; + +class CPUKernelTest : public OpKernel { + public: + void Compute(const KernelContext& context) const { + float scale = context.op_.GetAttr<float>("scale"); + ASSERT_NEAR(scale, 3.14, 1e-5); + std::cout << "this is cpu kernel" << std::endl; + std::cout << context.op_.DebugString() << std::endl; + } +}; + +REGISTER_OP(op_with_kernel, OpWithKernelTest, OpKernelTestProtoAndCheckerMaker); +REGISTER_OP_KERNEL(op_with_kernel, platform::CPUPlace, CPUKernelTest); + +TEST(OpKernel, all) { + OpDesc op_desc; + op_desc.set_type("op_with_kernel"); + *op_desc.mutable_inputs()->Add() = "IN1"; + *op_desc.mutable_outputs()->Add() = "OUT1"; + auto attr = op_desc.mutable_attrs()->Add(); + attr->set_name("scale"); + attr->set_type(paddle::framework::AttrType::FLOAT); + attr->set_f(3.14); + + platform::CPUDeviceContext cpu_device_context; + auto scope = std::make_shared<Scope>(); + + OperatorBase* op = paddle::framework::OpRegistry::CreateOp(op_desc); + op->Run(scope, cpu_device_context); + + delete op; +} } // namespace framework } // namespace paddle \ No newline at end of file From ea3a1df18bad2dd7e3ab12d776d7a3e9734ad9c6 Mon Sep 17 00:00:00 2001 From: hedaoyuan Date: Wed, 12 Jul 2017 21:15:34 +0800 Subject: [PATCH 102/205] Fix some compilation errors in some Android environments. --- CMakeLists.txt | 8 ++++++-- cmake/generic.cmake | 2 +- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 2c713db3e3..a3ab046289 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -28,7 +28,9 @@ if(NOT CMAKE_CROSSCOMPILING) endif(NOT CMAKE_CROSSCOMPILING) find_package(Git REQUIRED) find_package(Threads REQUIRED) -find_package(Boost QUIET) +if(NOT ANDROID) + find_package(Boost QUIET) +endif() include(simd) @@ -147,7 +149,9 @@ if(WITH_GOLANG) endif(WITH_GOLANG) add_subdirectory(paddle) -add_subdirectory(python) +if(WITH_PYTHON) + add_subdirectory(python) +endif() if(WITH_DOC) add_subdirectory(doc) endif() diff --git a/cmake/generic.cmake b/cmake/generic.cmake index 83e3d155d0..6e648cb53a 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -90,7 +90,7 @@ # including binary directory for generated headers.
include_directories(${CMAKE_CURRENT_BINARY_DIR}) -if(NOT APPLE) +if(NOT APPLE AND NOT ANDROID) find_package(Threads REQUIRED) link_libraries(${CMAKE_THREAD_LIBS_INIT}) set(CMAKE_CXX_LINK_EXECUTABLE "${CMAKE_CXX_LINK_EXECUTABLE} -ldl") From e4be077ffa44465fe19f47c892164452fdecfdfb Mon Sep 17 00:00:00 2001 From: Helin Wang Date: Tue, 11 Jul 2017 17:47:28 -0400 Subject: [PATCH 103/205] Add go testing into cmake and fix libpaddle_go_optimizer.a link path --- CMakeLists.txt | 4 ++++ cmake/generic.cmake | 21 ++++++++++++--------- go/CMakeLists.txt | 3 +++ go/master/CMakeLists.txt | 3 +++ go/pserver/CMakeLists.txt | 3 +++ go/pserver/client/c/CMakeLists.txt | 8 ++++++++ go/pserver/optimizer.go | 3 +-- go/utils/networkhelper/CMakeLists.txt | 3 +++ paddle/CMakeLists.txt | 1 - 9 files changed, 37 insertions(+), 12 deletions(-) create mode 100644 go/master/CMakeLists.txt create mode 100644 go/pserver/CMakeLists.txt create mode 100644 go/utils/networkhelper/CMakeLists.txt diff --git a/CMakeLists.txt b/CMakeLists.txt index 2c713db3e3..6bc6a8077c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -140,6 +140,10 @@ endif(USE_NNPACK) add_subdirectory(proto) +# "add_subdirectory(go)" should be placed after the following line, +# because it depends on paddle/optimizer. +add_subdirectory(paddle/optimizer) + # "add_subdirectory(paddle)" and "add_subdirectory(python)" should be # placed after this block, because they depend on it. if(WITH_GOLANG) diff --git a/cmake/generic.cmake b/cmake/generic.cmake index 83e3d155d0..f88b9dff2b 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -301,7 +301,7 @@ function(go_library TARGET_NAME) file(GLOB GO_SOURCE RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.go") string(REPLACE "${PADDLE_GO_PATH}/" "" CMAKE_CURRENT_SOURCE_REL_DIR ${CMAKE_CURRENT_SOURCE_DIR}) - # FIXME: link path + add_custom_command(TARGET ${TARGET_NAME} POST_BUILD COMMAND rm "${${TARGET_NAME}_LIB_PATH}" # Golang build source code @@ -309,7 +309,7 @@ -o "${${TARGET_NAME}_LIB_PATH}" "./${CMAKE_CURRENT_SOURCE_REL_DIR}/${GO_SOURCE}" # must run under GOPATH - WORKING_DIRECTORY "${PADDLE_IN_GOPATH}/go") + WORKING_DIRECTORY "${PADDLE_IN_GOPATH}/go") add_dependencies(${TARGET_NAME} go_vendor) endfunction(go_library) @@ -322,8 +322,8 @@ function(go_binary TARGET_NAME) # FIXME: link path add_custom_command(OUTPUT ${TARGET_NAME}_timestamp - COMMAND env LIBRARY_PATH=${CMAKE_BINARY_DIR}/go/pserver/client/c/:$ENV{LIBRARY_PATH} - GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} build + COMMAND env LIBRARY_PATH=${CMAKE_BINARY_DIR}/go/pserver/client/c/:$ENV{LIBRARY_PATH} + GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} build -o "${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}" "./${CMAKE_CURRENT_SOURCE_REL_DIR}/${go_binary_SRCS}" WORKING_DIRECTORY "${PADDLE_IN_GOPATH}/go") @@ -335,15 +335,18 @@ endfunction(go_binary) function(go_test TARGET_NAME) set(options OPTIONAL) set(oneValueArgs "") - set(multiValueArgs SRCS DEPS) + set(multiValueArgs DEPS) cmake_parse_arguments(go_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) - add_custom_command(OUTPUT ${TARGET_NAME}_timestamp + string(REPLACE "${PADDLE_GO_PATH}" "" CMAKE_CURRENT_SOURCE_REL_DIR ${CMAKE_CURRENT_SOURCE_DIR}) + add_custom_target(${TARGET_NAME} ALL DEPENDS go_vendor ${go_test_DEPS}) + add_custom_command(TARGET ${TARGET_NAME} POST_BUILD COMMAND env GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} test -c -o "${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}" - ${go_test_SRCS} + ".${CMAKE_CURRENT_SOURCE_REL_DIR}" + WORKING_DIRECTORY
"${PADDLE_IN_GOPATH}/go") + add_test(NAME ${TARGET_NAME} + COMMAND ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME} WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) - add_custom_target(${TARGET_NAME} ALL DEPENDS ${TARGET_NAME}_timestamp ${go_test_DEPS}) - add_test(${TARGET_NAME} ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}) endfunction(go_test) function(proto_library TARGET_NAME) diff --git a/go/CMakeLists.txt b/go/CMakeLists.txt index f00c70a058..18fee46d19 100644 --- a/go/CMakeLists.txt +++ b/go/CMakeLists.txt @@ -17,3 +17,6 @@ add_subdirectory(pserver/client/c) add_subdirectory(cmd/pserver) add_subdirectory(cmd/master) add_subdirectory(master/c) +add_subdirectory(master) +add_subdirectory(pserver) +add_subdirectory(utils/networkhelper) diff --git a/go/master/CMakeLists.txt b/go/master/CMakeLists.txt new file mode 100644 index 0000000000..30531e6469 --- /dev/null +++ b/go/master/CMakeLists.txt @@ -0,0 +1,3 @@ +if(WITH_TESTING) + go_test(master_test) +endif() diff --git a/go/pserver/CMakeLists.txt b/go/pserver/CMakeLists.txt new file mode 100644 index 0000000000..6267040a6e --- /dev/null +++ b/go/pserver/CMakeLists.txt @@ -0,0 +1,3 @@ +if(WITH_TESTING) + go_test(pserver_test DEPS paddle_go_optimizer) +endif() diff --git a/go/pserver/client/c/CMakeLists.txt b/go/pserver/client/c/CMakeLists.txt index 93a0a27f85..c6333eab55 100644 --- a/go/pserver/client/c/CMakeLists.txt +++ b/go/pserver/client/c/CMakeLists.txt @@ -1,5 +1,13 @@ cc_library(paddle_go_optimizer DEPS paddle_optimizer paddle_proto glog gflags protobuf) target_link_libraries(paddle_go_optimizer stdc++ m) + +# Copy library to the required place. +# See: go/pserver/optimizer.go: +# // #cgo LDFLAGS: ${SRCDIR}/client/c/libpaddle_go_optimizer.a -lstdc++ -lm +add_custom_command(TARGET paddle_go_optimizer POST_BUILD + COMMAND cp "${CMAKE_CURRENT_BINARY_DIR}/libpaddle_go_optimizer.a" "${CMAKE_CURRENT_SOURCE_DIR}" + ) + go_library(paddle_pserver_cclient STATIC DEPS paddle_go_optimizer) if(WITH_TESTING) # FIXME: this test requires pserver which is not managed by the test diff --git a/go/pserver/optimizer.go b/go/pserver/optimizer.go index 2d7882d1a7..0ebf4a26fa 100644 --- a/go/pserver/optimizer.go +++ b/go/pserver/optimizer.go @@ -1,8 +1,7 @@ package pserver // #cgo CFLAGS: -I ../../ -// //FIXME: ldflags contain "build" path -// #cgo LDFLAGS: ${SRCDIR}/../../build/go/pserver/client/c/libpaddle_go_optimizer.a -lstdc++ -lm +// #cgo LDFLAGS: ${SRCDIR}/client/c/libpaddle_go_optimizer.a -lstdc++ -lm // #include "paddle/optimizer/optimizer.h" // #include // #include diff --git a/go/utils/networkhelper/CMakeLists.txt b/go/utils/networkhelper/CMakeLists.txt new file mode 100644 index 0000000000..db6cf211d8 --- /dev/null +++ b/go/utils/networkhelper/CMakeLists.txt @@ -0,0 +1,3 @@ +if(WITH_TESTING) + go_test(network_helper_test) +endif() diff --git a/paddle/CMakeLists.txt b/paddle/CMakeLists.txt index 58a35564f8..0b5e9a2599 100644 --- a/paddle/CMakeLists.txt +++ b/paddle/CMakeLists.txt @@ -8,7 +8,6 @@ add_subdirectory(gserver) add_subdirectory(pserver) add_subdirectory(trainer) add_subdirectory(scripts) -add_subdirectory(optimizer) add_subdirectory(string) if(Boost_FOUND) From 59287cd1cad1e2d6006eff68d8f025af3dd0c310 Mon Sep 17 00:00:00 2001 From: Helin Wang Date: Wed, 12 Jul 2017 22:30:44 +0000 Subject: [PATCH 104/205] add .gitignore --- go/pserver/client/c/.gitignore | 1 + 1 file changed, 1 insertion(+) create mode 100644 go/pserver/client/c/.gitignore diff --git a/go/pserver/client/c/.gitignore b/go/pserver/client/c/.gitignore new file mode 100644 index 
0000000000..4bf05c8538 --- /dev/null +++ b/go/pserver/client/c/.gitignore @@ -0,0 +1 @@ +libpaddle_go_optimizer.a From 2231b92a89ea560934be92987c27068be398c6fd Mon Sep 17 00:00:00 2001 From: Helin Wang Date: Wed, 12 Jul 2017 23:20:06 +0000 Subject: [PATCH 105/205] go_binary: remove hardcoded library link path --- cmake/generic.cmake | 5 +---- go/cmd/master/CMakeLists.txt | 2 +- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/cmake/generic.cmake b/cmake/generic.cmake index b13400d125..71ee266611 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -320,14 +320,11 @@ function(go_binary TARGET_NAME) cmake_parse_arguments(go_binary "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) string(REPLACE "${PADDLE_GO_PATH}/" "" CMAKE_CURRENT_SOURCE_REL_DIR ${CMAKE_CURRENT_SOURCE_DIR}) - # FIXME: link path add_custom_command(OUTPUT ${TARGET_NAME}_timestamp - COMMAND env LIBRARY_PATH=${CMAKE_BINARY_DIR}/go/pserver/client/c/:$ENV{LIBRARY_PATH} - GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} build + COMMAND env GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} build -o "${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}" "./${CMAKE_CURRENT_SOURCE_REL_DIR}/${go_binary_SRCS}" WORKING_DIRECTORY "${PADDLE_IN_GOPATH}/go") - # TODO: don't know what ${TARGET_NAME}_link does add_custom_target(${TARGET_NAME} ALL DEPENDS go_vendor ${TARGET_NAME}_timestamp ${go_binary_DEPS}) install(PROGRAMS ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME} DESTINATION bin) endfunction(go_binary) diff --git a/go/cmd/master/CMakeLists.txt b/go/cmd/master/CMakeLists.txt index 1058ffa86b..9e149967e7 100644 --- a/go/cmd/master/CMakeLists.txt +++ b/go/cmd/master/CMakeLists.txt @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -go_binary(master SRC master.go DEPS paddle_go_optimizer) +go_binary(master SRC master.go) From b04986da9f57cfba0657194c7e35b7e9229a6676 Mon Sep 17 00:00:00 2001 From: Helin Wang Date: Wed, 12 Jul 2017 23:48:06 +0000 Subject: [PATCH 106/205] add pserver client test --- go/CMakeLists.txt | 1 + go/pserver/client/CMakeLists.txt | 3 +++ 2 files changed, 4 insertions(+) create mode 100644 go/pserver/client/CMakeLists.txt diff --git a/go/CMakeLists.txt b/go/CMakeLists.txt index 18fee46d19..29ce909c64 100644 --- a/go/CMakeLists.txt +++ b/go/CMakeLists.txt @@ -19,4 +19,5 @@ add_subdirectory(cmd/master) add_subdirectory(master/c) add_subdirectory(master) add_subdirectory(pserver) +add_subdirectory(pserver/client) add_subdirectory(utils/networkhelper) diff --git a/go/pserver/client/CMakeLists.txt b/go/pserver/client/CMakeLists.txt new file mode 100644 index 0000000000..0052bb460b --- /dev/null +++ b/go/pserver/client/CMakeLists.txt @@ -0,0 +1,3 @@ +if(WITH_TESTING) + go_test(pserver_client_test DEPS paddle_go_optimizer) +endif() From 19bfb8a1f2153f0d5368808bc09580c7e4c7b07c Mon Sep 17 00:00:00 2001 From: Yancey Date: Thu, 13 Jul 2017 09:52:26 +0800 Subject: [PATCH 107/205] PServer recovery from checkpoint (#2741) * Server recovery from checkpoint --- .gitignore | 3 ++ go/cmd/pserver/pserver.go | 39 +++++++------- go/glide.lock | 14 ++--- go/glide.yaml | 1 + go/pserver/etcd_client.go | 22 ++++++-- go/pserver/service.go | 104 ++++++++++++++++++++++++++------------ 6 files changed, 121 insertions(+), 62 deletions(-) diff --git a/.gitignore b/.gitignore index 5c2fb134ae..c84b2fc8c7 100644 --- a/.gitignore +++ b/.gitignore @@ -22,3 +22,6 @@ cmake-build-* # generated while compiling python/paddle/v2/framework/core.so +CMakeFiles +cmake_install.cmake + diff --git 
a/go/cmd/pserver/pserver.go b/go/cmd/pserver/pserver.go index 0ecb1242c3..48351ab6d0 100644 --- a/go/cmd/pserver/pserver.go +++ b/go/cmd/pserver/pserver.go @@ -8,6 +8,7 @@ import ( "time" "github.com/namsral/flag" + "github.com/topicai/candy" "github.com/PaddlePaddle/Paddle/go/pserver" log "github.com/sirupsen/logrus" @@ -18,53 +19,47 @@ func main() { index := flag.Int("index", -1, "index of this pserver, should be larger or equal than 0") etcdEndpoint := flag.String("etcd-endpoint", "http://127.0.0.1:2379", "comma separated endpoint string for pserver to connect to etcd") - etcdTimeout := flag.Int("etcd-timeout", 5, "timeout for etcd calls") + etcdTimeout := flag.Duration("etcd-timeout", 5*time.Second, "timeout for etcd calls") numPservers := flag.Int("num-pservers", 1, "total pserver count in a training job") checkpointPath := flag.String("checkpoint-path", "/checkpoints/", "save checkpoint path") - checkpointInterval := flag.Int("checkpoint-interval", 600, "save checkpoint per interval seconds") + checkpointInterval := flag.Duration("checkpoint-interval", 600*time.Second, "save checkpoint per interval seconds") logLevel := flag.String("log-level", "info", "log level, possible values: debug, info, warning, error, fatal, panic") flag.Parse() level, err := log.ParseLevel(*logLevel) - if err != nil { - panic(err) - } + candy.Must(err) + log.SetLevel(level) var idx int - var cp pserver.Checkpoint + + var cp *pserver.Checkpoint var e *pserver.EtcdClient if *index >= 0 { idx = *index } else { - timeout := time.Second * time.Duration((*etcdTimeout)) - e = pserver.NewEtcdClient(*etcdEndpoint, *numPservers, timeout) + e = pserver.NewEtcdClient(*etcdEndpoint, *numPservers, *etcdTimeout) idx, err = e.Register() + candy.Must(err) + + cp, err = pserver.NewCheckpointFromFile(*checkpointPath, idx, e) if err != nil { - panic(err) + log.Errorf("Fetch checkpoint failed, %s", err) } } s, err := pserver.NewService(idx, *checkpointInterval, *checkpointPath, e, cp) - if err != nil { - panic(err) - } + candy.Must(err) + err = rpc.Register(s) - if err != nil { - panic(err) - } + candy.Must(err) rpc.HandleHTTP() l, err := net.Listen("tcp", ":"+strconv.Itoa(*port)) - if err != nil { - panic(err) - } + candy.Must(err) log.Infof("start pserver at port %d", *port) err = http.Serve(l, nil) - - if err != nil { - panic(err) - } + candy.Must(err) } diff --git a/go/glide.lock b/go/glide.lock index 190a222338..f71ae643d6 100644 --- a/go/glide.lock +++ b/go/glide.lock @@ -1,8 +1,8 @@ -hash: b8f18ce6784bd3fadd9fed0b8443e7b658234ea785ae1f220723ae2c1f652aa7 -updated: 2017-06-27T14:05:48.925262819+08:00 +hash: a8faea3a363468a88917ddeb3b1c9ea36886fb2c622acbad42604fa9cb4d3855 +updated: 2017-07-11T10:04:40.786745417+08:00 imports: - name: github.com/coreos/etcd - version: 61fc123e7a8b14a0a258aa3f5c4159861b1ec2e7 + version: cb2a496c4ddd1c87a9f280e116649b599999ec79 subpackages: - auth/authpb - clientv3 @@ -22,7 +22,9 @@ imports: - name: github.com/PaddlePaddle/recordio version: edfb82af0739c84f241c87390ec5649c7b28c129 - name: github.com/sirupsen/logrus - version: 202f25545ea4cf9b191ff7f846df5d87c9382c2b + version: 7f976d3a76720c4c27af2ba716b85d2e0a7e38b1 +- name: github.com/topicai/candy + version: 1b9030d056fa9f8c4b1f9c91b52fe4b8ab4cd8cc - name: golang.org/x/net version: c8c74377599bd978aee1cf3b9b63a8634051cec2 subpackages: @@ -34,11 +36,11 @@ imports: - lex/httplex - trace - name: golang.org/x/sys - version: f7928cfef4d09d1b080aa2b6fd3ca9ba1567c733 + version: abf9c25f54453410d0c6668e519582a9e1115027 subpackages: - unix - name: 
golang.org/x/text - version: 4e9ab9ee170f2a39bd66c92b3e0a47ff47a4bc77 + version: cfdf022e86b4ecfb646e1efbd7db175dd623a8fa subpackages: - secure/bidirule - transform diff --git a/go/glide.yaml b/go/glide.yaml index 05c5d15ca2..ab472c7cda 100644 --- a/go/glide.yaml +++ b/go/glide.yaml @@ -10,3 +10,4 @@ import: version: ^1.7.4-pre - package: github.com/sirupsen/logrus version: ^1.0.0 +- package: github.com/topicai/candy diff --git a/go/pserver/etcd_client.go b/go/pserver/etcd_client.go index 1f77787150..4a694b97f4 100644 --- a/go/pserver/etcd_client.go +++ b/go/pserver/etcd_client.go @@ -16,7 +16,7 @@ import ( const ( // PsDesired is etcd path for store desired pserver count PsDesired = "/ps_desired" - // PsAddr is the base dir for pserver to store their addr + // PsPath is the base dir for pserver to store their addr PsPath = "/ps/" // PsCheckpoint is the etcd path for store checkpoints information PsCheckpoint = "/checkpoints/" @@ -189,9 +189,25 @@ func (e *EtcdClient) registerPserverEtcd(ctx context.Context) (int, error) { return idx, nil } +// GetKey gets the value by the specified key +func (e *EtcdClient) GetKey(key string, timeout time.Duration) ([]byte, error) { + ctx, cancel := context.WithTimeout(context.Background(), timeout) + resp, err := e.etcdClient.Get(ctx, key) + cancel() + if err != nil { + return []byte{}, err + } + kvs := resp.Kvs + if len(kvs) == 0 { + return []byte{}, nil + } + v := kvs[0].Value + return v, nil +} + // PutKey put into etcd with value by key specified -func (e *EtcdClient) PutKey(key string, value []byte, timeout int) error { - ctx, cancel := context.WithTimeout(context.Background(), time.Second*time.Duration(timeout)) +func (e *EtcdClient) PutKey(key string, value []byte, timeout time.Duration) error { + ctx, cancel := context.WithTimeout(context.Background(), timeout) _, err := e.etcdClient.Put(ctx, key, string(value)) cancel() if err != nil { diff --git a/go/pserver/service.go b/go/pserver/service.go index 6b52d0d896..65db6970a7 100644 --- a/go/pserver/service.go +++ b/go/pserver/service.go @@ -9,6 +9,7 @@ import ( "encoding/json" "errors" "fmt" + "io/ioutil" "os" "path/filepath" "strconv" @@ -21,14 +22,14 @@ import ( // ElementType is the type of elements of a Parameter. type ElementType int +// RPC error message. const ( - // AlreadyInitialized is true if pserver is initialized - AlreadyInitialized = "pserver already initialized" - // Uninitialized is true if pserver not fully initialized - Uninitialized = "pserver not fully initialized" + AlreadyInitialized = "pserver already initialized" + Uninitialized = "pserver not fully initialized" + CheckpointMD5Failed = "checkpoint file MD5 validation failed" ) -// Supported element types +// Supported element types. const ( Int32 ElementType = iota UInt32 @@ -51,21 +52,15 @@ type ParameterWithConfig struct { Config []byte // parameter configuration in Proto Buffer format } -// ParameterCheckpoint is Parameter and State checkpoint -type ParameterCheckpoint struct { - ParamConfig ParameterWithConfig - State []byte -} - -// checkpoint signature +// checkpointMeta saves checkpoint metadata type checkpointMeta struct { UUID string `json:"uuid"` - Md5sum string `json:"md5sum"` - Timestamp string `json:"timestamp"` + MD5 string `json:"md5"` + Timestamp int64 `json:"timestamp"` } // Checkpoint is the pserver shard persist in file -type Checkpoint []ParameterCheckpoint +type Checkpoint []parameterCheckpoint // Gradient is the gradient of the parameter. 
type Gradient Parameter @@ -81,12 +76,53 @@ type Service struct { optMap map[string]*optimizer } +// parameterCheckpoint saves parameter checkpoint +type parameterCheckpoint struct { + ParameterWithConfig + State []byte +} + +// NewCheckpointFromFile loads parameters and state from checkpoint file +func NewCheckpointFromFile(cpPath string, idx int, e *EtcdClient) (*Checkpoint, error) { + v, err := e.GetKey(PsPath+string(idx), 3*time.Second) + if err != nil { + return nil, err + } + + var cpMeta checkpointMeta + if err = json.Unmarshal(v, &cpMeta); err != nil { + return nil, err + } + + fn := filepath.Join(cpPath, cpMeta.UUID) + if _, err = os.Stat(fn); os.IsNotExist(err) { + return nil, err + } + content, err := ioutil.ReadFile(fn) + if err != nil { + return nil, err + } + + h := md5.New() + md5 := hex.EncodeToString(h.Sum(content)) + if md5 != cpMeta.MD5 { + return nil, errors.New(CheckpointMD5Failed) + } + + dec := gob.NewDecoder(bytes.NewReader(content)) + cp := &Checkpoint{} + if err = dec.Decode(cp); err != nil { + return nil, err + } + return cp, nil +} + // NewService creates a new service, will bypass etcd registration if no -// endpoints specified. -func NewService(idx int, seconds int, path string, client *EtcdClient, cp Checkpoint) (*Service, error) { +// endpoints specified. It will recover from a checkpoint file if a specified checkpoint exists. +func NewService(idx int, interval time.Duration, path string, client *EtcdClient, cp *Checkpoint) (*Service, error) { s := &Service{ idx: idx, - checkpointInterval: time.Second * time.Duration(seconds), + checkpointInterval: interval, checkpointPath: path, client: client, } @@ -94,10 +130,12 @@ func NewService(idx int, seconds int, path string, client *EtcdClient, cp Checkp s.initialized = make(chan struct{}) if cp != nil { - for _, item := range cp { - p := item.ParamConfig - st := item.State - s.optMap[p.Param.Name] = newOptimizer(p, st) + for _, item := range *cp { + p := ParameterWithConfig{ + Param: item.Param, + Config: item.Config, + } + s.optMap[p.Param.Name] = newOptimizer(p, item.State) } } return s, nil @@ -186,13 +224,13 @@ func (s *Service) doCheckpoint() error { s.mu.Lock() defer s.mu.Unlock() - cp := make([]ParameterCheckpoint, 0, len(s.optMap)) + cp := make([]parameterCheckpoint, len(s.optMap)) index := 0 for name, opt := range s.optMap { - var pc ParameterCheckpoint - pc.ParamConfig.Param.Name = name - pc.ParamConfig.Param.ElementType = opt.elementType - pc.ParamConfig.Param.Content = opt.GetWeights() + var pc parameterCheckpoint + pc.Param.Name = name + pc.Param.ElementType = opt.elementType + pc.Param.Content = opt.GetWeights() pc.State = opt.GetStates() cp[index] = pc index++ @@ -206,12 +244,12 @@ cpMeta := checkpointMeta{} cpMeta.UUID = s.checkpointPath + strconv.Itoa(s.idx) - cpMeta.Timestamp = time.Now().String() + cpMeta.Timestamp = time.Now().UnixNano() h := md5.New() - cpMeta.Md5sum = hex.EncodeToString(h.Sum(buf.Bytes())) + cpMeta.MD5 = hex.EncodeToString(h.Sum(buf.Bytes())) cpMetajson, _ := json.Marshal(cpMeta) - err = s.client.PutKey(filepath.Join(PsCheckpoint, strconv.Itoa(s.idx)), cpMetajson, 3) + err = s.client.PutKey(filepath.Join(PsCheckpoint, strconv.Itoa(s.idx)), cpMetajson, 3*time.Second) if err != nil { return err } @@ -219,7 +257,11 @@ log.Info("checkpoint does not exists.") } else { err = os.Remove(cpMeta.UUID) - log.Infof("checkpoint %s already exsits, removing ", cpMeta.UUID) + if err != nil {
log.Infof("Removing checkpoint %s failed", cpMeta.UUID) + } else { + log.Infof("checkpoint %s already exsits, removing ", cpMeta.UUID) + } } f, err := os.Create(cpMeta.UUID) defer f.Close() From 4e918377d0a37bd4216062ee4d9c19778bd284bd Mon Sep 17 00:00:00 2001 From: qijun Date: Thu, 13 Jul 2017 11:10:03 +0800 Subject: [PATCH 108/205] fix bug in dynload --- paddle/framework/CMakeLists.txt | 2 +- paddle/platform/dynload/cublas.h | 28 ++++++++++++++-------------- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt index b8642ca22a..de31952e79 100644 --- a/paddle/framework/CMakeLists.txt +++ b/paddle/framework/CMakeLists.txt @@ -11,7 +11,7 @@ proto_library(op_proto SRCS op_proto.proto DEPS attr_type) cc_test(op_proto_test SRCS op_proto_test.cc DEPS op_proto protobuf) proto_library(op_desc SRCS op_desc.proto DEPS attr_type) cc_test(op_desc_test SRCS op_desc_test.cc DEPS op_desc protobuf) -cc_library(operator SRCS operator.cc DEPS op_desc protobuf) +cc_library(operator SRCS operator.cc DEPS op_desc protobuf device_context) cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry place) cc_library(op_registry SRCS op_registry.cc DEPS op_proto op_desc) cc_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry operator) diff --git a/paddle/platform/dynload/cublas.h b/paddle/platform/dynload/cublas.h index 47c7a8ec21..c44b7240a8 100644 --- a/paddle/platform/dynload/cublas.h +++ b/paddle/platform/dynload/cublas.h @@ -67,20 +67,20 @@ extern void *cublas_dso_handle; __macro(cublasSgemm); \ __macro(cublasDgemm); \ __macro(cublasSgeam); \ - __macro(cublasDgeam); - -DECLARE_DYNAMIC_LOAD_CUBLAS_V2_WRAP(cublasCreate); -DECLARE_DYNAMIC_LOAD_CUBLAS_V2_WRAP(cublasDestroy); -DECLARE_DYNAMIC_LOAD_CUBLAS_V2_WRAP(cublasSetStream); -DECLARE_DYNAMIC_LOAD_CUBLAS_V2_WRAP(cublasSetPointerMode); -DECLARE_DYNAMIC_LOAD_CUBLAS_V2_WRAP(cublasGetPointerMode); -DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasSgemmBatched); -DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasDgemmBatched); -DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasCgemmBatched); -DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasZgemmBatched); -DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasSgetrfBatched); -DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasSgetriBatched); -DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasDgetrfBatched); + __macro(cublasDgeam); \ + __macro(cublasCreate_v2); \ + __macro(cublasDestroy_v2); \ + __macro(cublasSetStream_v2); \ + __macro(cublasSetPointerMode_v2); \ + __macro(cublasGetPointerMode_v2); \ + __macro(cublasSgemmBatched); \ + __macro(cublasDgemmBatched); \ + __macro(cublasCgemmBatched); \ + __macro(cublasZgemmBatched); \ + __macro(cublasSgetrfBatched); \ + __macro(cublasSgetriBatched); \ + __macro(cublasDgetrfBatched); \ + __macro(cublasDgetriBatched) CUBLAS_BLAS_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP); From 46c704ecf0eb259187681b4d2f184e6b1e0ba2ca Mon Sep 17 00:00:00 2001 From: dongzhihong Date: Thu, 13 Jul 2017 11:39:08 +0800 Subject: [PATCH 109/205] "fix init error" --- go/cmd/pserver/pserver.go | 2 +- go/pserver/service.go | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/go/cmd/pserver/pserver.go b/go/cmd/pserver/pserver.go index 48351ab6d0..b331b8126c 100644 --- a/go/cmd/pserver/pserver.go +++ b/go/cmd/pserver/pserver.go @@ -34,7 +34,7 @@ func main() { var idx int - var cp *pserver.Checkpoint + var cp pserver.Checkpoint var e *pserver.EtcdClient if *index >= 0 { idx = *index diff --git a/go/pserver/service.go b/go/pserver/service.go index 
65db6970a7..a0319000ee 100644 --- a/go/pserver/service.go +++ b/go/pserver/service.go @@ -119,7 +119,7 @@ func NewCheckpointFromFile(cpPath string, idx int, e *EtcdClient) (*Checkpoint, // NewService creates a new service, will bypass etcd registration if no // endpoints specified. It will recover from a checkpoint file if a specified checkpoint exists. -func NewService(idx int, interval time.Duration, path string, client *EtcdClient, cp *Checkpoint) (*Service, error) { +func NewService(idx int, interval time.Duration, path string, client *EtcdClient, cp Checkpoint) (*Service, error) { s := &Service{ idx: idx, checkpointInterval: interval, @@ -130,7 +130,7 @@ func NewService(idx int, interval time.Duration, path string, client *EtcdClient s.initialized = make(chan struct{}) if cp != nil { - for _, item := range *cp { + for _, item := range cp { p := ParameterWithConfig{ Param: item.Param, Config: item.Config, From 7cfcda5f4f184ee5958139d023cc72474ec2e81a Mon Sep 17 00:00:00 2001 From: dongzhihong Date: Thu, 13 Jul 2017 12:46:11 +0800 Subject: [PATCH 110/205] "fix checkpoint pointer" --- go/pserver/service.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/go/pserver/service.go b/go/pserver/service.go index a0319000ee..fec2ec61dc 100644 --- a/go/pserver/service.go +++ b/go/pserver/service.go @@ -83,7 +83,7 @@ type parameterCheckpoint struct { } // NewCheckpointFromFile loads parameters and state from checkpoint file -func NewCheckpointFromFile(cpPath string, idx int, e *EtcdClient) (*Checkpoint, error) { +func NewCheckpointFromFile(cpPath string, idx int, e *EtcdClient) (Checkpoint, error) { v, err := e.GetKey(PsPath+string(idx), 3*time.Second) if err != nil { return nil, err @@ -110,7 +110,7 @@ func NewCheckpointFromFile(cpPath string, idx int, e *EtcdClient) (*Checkpoint, } dec := gob.NewDecoder(bytes.NewReader(content)) - cp := &Checkpoint{} + cp := Checkpoint{} if err = dec.Decode(cp); err != nil { return nil, err } From a7c7a82626df16a4a9244499afd63f96d10adf94 Mon Sep 17 00:00:00 2001 From: hedaoyuan Date: Thu, 13 Jul 2017 13:03:38 +0800 Subject: [PATCH 111/205] Fix a small mistake. --- cmake/generic.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/generic.cmake b/cmake/generic.cmake index 2c1335d205..fcbdc64c63 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -94,7 +94,7 @@ if(NOT APPLE AND NOT ANDROID) find_package(Threads REQUIRED) link_libraries(${CMAKE_THREAD_LIBS_INIT}) set(CMAKE_CXX_LINK_EXECUTABLE "${CMAKE_CXX_LINK_EXECUTABLE} -ldl -lrt") -endif(NOT APPLE) +endif(NOT APPLE AND NOT ANDROID) From ff98e3c1ece983403ebdfa57f07d3bdf58f85647 Mon Sep 17 00:00:00 2001 From: liaogang Date: Thu, 13 Jul 2017 14:26:48 +0800 Subject: [PATCH 112/205] ENH: Remove comments --- paddle/memory/detail/system_allocator.h | 12 +++++------- paddle/platform/gpu_info.cc | 1 - 2 files changed, 5 insertions(+), 8 deletions(-) diff --git a/paddle/memory/detail/system_allocator.h b/paddle/memory/detail/system_allocator.h index 04efcd9709..82ba322e05 100644 --- a/paddle/memory/detail/system_allocator.h +++ b/paddle/memory/detail/system_allocator.h @@ -20,13 +20,11 @@ namespace paddle { namespace memory { namespace detail { -// SystemAllocator is the parent class of CPUAllocator and -// GPUAllocator. A BuddyAllocator object uses a SystemAllocator* -// pointing to the underlying system allocator.
An alternative to -// this class hierarchy is to pass a system allocator class to -// BuddyAllocator as a template parameter. This approach makes -// BuddyAllocator a class template, and it's very complicated -// algorithm would make the buddy_allocator.h messy. +/** + * \brief SystemAllocator is the parent class of CPUAllocator and GPUAllocator. + * A BuddyAllocator object uses a SystemAllocator* pointing to the + * underlying system allocator. + */ class SystemAllocator { public: virtual ~SystemAllocator() {} diff --git a/paddle/platform/gpu_info.cc b/paddle/platform/gpu_info.cc index 9b917f9d35..a1383d3524 100644 --- a/paddle/platform/gpu_info.cc +++ b/paddle/platform/gpu_info.cc @@ -65,7 +65,6 @@ size_t GpuMinChunkSize() { } size_t GpuMaxChunkSize() { - // Allow to allocate the maximum chunk size is roughly 3% of CPU memory. size_t total = 0; size_t available = 0; From 4a5c3714eaec33628259dd3c481f3d36597e0c58 Mon Sep 17 00:00:00 2001 From: wanghaoshuang Date: Thu, 13 Jul 2017 15:10:25 +0800 Subject: [PATCH 113/205] fix python dependency for voc2012 dataset --- python/paddle/v2/dataset/voc_seg.py | 10 ++++++---- python/setup.py.in | 3 ++- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/python/paddle/v2/dataset/voc_seg.py b/python/paddle/v2/dataset/voc_seg.py index 0df4423ff0..617e212d67 100644 --- a/python/paddle/v2/dataset/voc_seg.py +++ b/python/paddle/v2/dataset/voc_seg.py @@ -22,7 +22,7 @@ with segmentation has been increased from 7,062 to 9,993. import tarfile import io import numpy as np -from common import download +from paddle.v2.dataset.common import download from paddle.v2.image import * from PIL import Image @@ -36,6 +36,8 @@ SET_FILE = 'VOCdevkit/VOC2012/ImageSets/Segmentation/{}.txt' DATA_FILE = 'VOCdevkit/VOC2012/JPEGImages/{}.jpg' LABEL_FILE = 'VOCdevkit/VOC2012/SegmentationClass/{}.png' +CACHE_DIR = 'voc2012' + def reader_creator(filename, sub_name): @@ -66,18 +68,18 @@ def train(): """ Create a train dataset reader containing 2913 images in HWC order. """ - return reader_creator(download(VOC_URL, 'voc_seg', VOC_MD5), 'trainval') + return reader_creator(download(VOC_URL, CACHE_DIR, VOC_MD5), 'trainval') def test(): """ Create a test dataset reader containing 1464 images in HWC order. """ - return reader_creator(download(VOC_URL, 'voc_seg', VOC_MD5), 'train') + return reader_creator(download(VOC_URL, CACHE_DIR, VOC_MD5), 'train') def val(): """ Create a val dataset reader containing 1449 images in HWC order. 
""" - return reader_creator(download(VOC_URL, 'voc_seg', VOC_MD5), 'val') + return reader_creator(download(VOC_URL, CACHE_DIR, VOC_MD5), 'val') diff --git a/python/setup.py.in b/python/setup.py.in index 271ee6e552..310ac403a9 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -19,7 +19,8 @@ setup_requires=["requests", "recordio", "matplotlib", "rarfile", - "scipy>=0.19.0"] + "scipy>=0.19.0", + "Pillow"] if '${CMAKE_SYSTEM_PROCESSOR}' not in ['arm', 'armv7-a', 'aarch64']: setup_requires+=["opencv-python"] From 728665d709811162ac1e2e136e44f88d6e68cb7f Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Thu, 13 Jul 2017 15:57:22 +0800 Subject: [PATCH 114/205] Add Init to OperatorBase (#2838) --- paddle/framework/op_registry.h | 1 + paddle/framework/operator.h | 4 ++++ paddle/framework/operator_test.cc | 5 +++++ 3 files changed, 10 insertions(+) diff --git a/paddle/framework/op_registry.h b/paddle/framework/op_registry.h index 248c7a1a3b..e46da822c6 100644 --- a/paddle/framework/op_registry.h +++ b/paddle/framework/op_registry.h @@ -119,6 +119,7 @@ class OpRegistry { op->attrs_[attr.name()] = AttrTypeHelper::GetAttrValue(attr); } op_checkers().at(op_type).Check(op->attrs_); + op->Init(); return op; } diff --git a/paddle/framework/operator.h b/paddle/framework/operator.h index 0ce422e007..4336115670 100644 --- a/paddle/framework/operator.h +++ b/paddle/framework/operator.h @@ -49,6 +49,10 @@ class OperatorBase { std::string DebugString() const; + /// Init will be called after CreateOperator, you can put some initialization + /// logic here. + virtual void Init() {} + /// InferShape infer the size of Variables used by this Operator with /// information inside scope virtual void InferShape(const std::shared_ptr& scope) const = 0; diff --git a/paddle/framework/operator_test.cc b/paddle/framework/operator_test.cc index be8c4be2d4..01b87bb50e 100644 --- a/paddle/framework/operator_test.cc +++ b/paddle/framework/operator_test.cc @@ -21,14 +21,19 @@ namespace framework { class OperatorTest : public OperatorBase { public: + void Init() override { x = 1; } void InferShape(const std::shared_ptr& scope) const override {} void Run(const std::shared_ptr& scope, const platform::DeviceContext& dev_ctx) const override { float scale = GetAttr("scale"); ASSERT_NEAR(scale, 3.14, 1e-5); ASSERT_EQ(scope->GetVariable(inputs_[0]), nullptr); + ASSERT_EQ(x, 1); ASSERT_NE(scope->GetVariable(outputs_[0]), nullptr); } + + public: + float x = 0; }; class OperatorTestProtoAndCheckerMaker : public OpProtoAndCheckerMaker { From 95897fd1e153843ee52b99c1b58e5835eaf831ae Mon Sep 17 00:00:00 2001 From: Liu Yiqun Date: Thu, 13 Jul 2017 08:39:44 +0000 Subject: [PATCH 115/205] Add build_android task on Travis CI. 
--- .travis.yml | 8 ++++++++ paddle/scripts/travis/build_android.sh | 19 +++++++++++++++++++ 2 files changed, 27 insertions(+) create mode 100755 paddle/scripts/travis/build_android.sh diff --git a/.travis.yml b/.travis.yml index 498674469b..8ac67e5720 100644 --- a/.travis.yml +++ b/.travis.yml @@ -4,6 +4,7 @@ cache: - $HOME/.ccache - $HOME/.cache/pip - $TRAVIS_BUILD_DIR/build/third_party + - $HOME/android-toolchain-gcc sudo: required dist: trusty os: @@ -11,6 +12,7 @@ os: env: - JOB=build_doc - JOB=check_style + - JOB=build_android addons: apt: packages: @@ -33,6 +35,12 @@ addons: - ccache before_install: - if [[ "$JOB" == "check_style" ]]; then sudo ln -s /usr/bin/clang-format-3.8 /usr/bin/clang-format; fi + - if [[ "JOB" == "build_android" ]]; then + mkdir -p $HOME/tmp/$JOB; + cd $HOME/tmp/$JOB; wget -q https://dl.google.com/android/repository/android-ndk-r14b-linux-x86_64.zip; unzip -q android-ndk-r14b-linux-x86_64.zip; + sh $HOME/tmp/$JOB/android-ndk-r14b/build/tools/make-standalone-toolchain.sh --arch=arm --platform=android-21 --install-dir=$HOME/android-toolchain-gcc; + cd $HOME; rm -rf $HOME/tmp/$JOB; + fi # Paddle is using protobuf 3.1 currently. Protobuf 3.2 breaks the compatibility. So we specify the python # protobuf version. - pip install numpy wheel 'protobuf==3.1' sphinx==1.5.6 recommonmark sphinx-rtd-theme==0.1.9 virtualenv pre-commit requests==2.9.2 LinkChecker diff --git a/paddle/scripts/travis/build_android.sh b/paddle/scripts/travis/build_android.sh new file mode 100755 index 0000000000..dd4ad29b18 --- /dev/null +++ b/paddle/scripts/travis/build_android.sh @@ -0,0 +1,19 @@ +#!/bin/bash +set -e + +# Create the build directory for CMake. +mkdir -p $TRAVIS_BUILD_DIR/build_android +cd $TRAVIS_BUILD_DIR/build_android + +# Compile paddle binaries +cmake -DCMAKE_SYSTEM_NAME=Android \ + -DANDROID_STANDALONE_TOOLCHAIN=$HOME/android-toolchain-gcc \ + -DANDROID_ABI=armeabi-v7a \ + -DANDROID_ARM_NEON=ON \ + -DANDROID_ARM_MODE=ON \ + -DWITH_C_API=ON \ + -DWITH_SWIG_PY=OFF \ + -DWITH_STYLE_CHECK=OFF \ + .. + +make -j `nproc` From a0aaafe9de7008db91f32e50d36ee7d623bf1fa4 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Thu, 13 Jul 2017 17:29:33 +0800 Subject: [PATCH 116/205] Add a sample op, `add_op` * Refine register methods, make Op can get rid of whole-archieve * `USE_OP` before a op is used. * Add unittest for add_op. 
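The registration trick is in the op_registry.h hunk below: a static object's constructor registers the op, and USE_OP expands to a reference to an extern handle function, which forces the linker to pull in the op's object file without --whole-archive. A self-contained C++ sketch of the same pattern, with illustrative names only (Registrar, use_my_op_handle, and force_link_my_op are not Paddle symbols):

    #include <iostream>

    // Library side: a static registrar whose constructor runs before main(),
    // plus an anchor function living in the same translation unit.
    struct Registrar {
      explicit Registrar(const char* name) { std::cout << "register " << name << "\n"; }
    };
    static Registrar my_op_registrar("my_op");
    int use_my_op_handle() { return 0; }

    // User side: referencing the anchor makes the linker keep the object file,
    // which in turn runs my_op_registrar's constructor.
    extern int use_my_op_handle();
    static int force_link_my_op __attribute__((unused)) = use_my_op_handle();

In one translation unit this compiles as-is; split across a static library and a client, the otherwise-unused force_link_my_op is what keeps the registrar's object file from being dropped by the linker.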
--- cmake/external/glog.cmake | 3 +- paddle/CMakeLists.txt | 1 + paddle/framework/CMakeLists.txt | 4 +- paddle/framework/op_registry.h | 78 ++++++++++++++++++++++++---- paddle/framework/op_registry_test.cc | 22 +++----- paddle/framework/operator.h | 53 +++++++++++++++---- paddle/framework/operator_test.cc | 44 +++++----------- paddle/op/CMakeLists.txt | 6 +++ paddle/op/add_op.cc | 44 ++++++++++++++++ paddle/op/add_op.cu | 5 ++ paddle/op/add_op.h | 17 ++++++ paddle/op/add_op_test.cc | 9 ++++ 12 files changed, 216 insertions(+), 70 deletions(-) create mode 100644 paddle/op/CMakeLists.txt create mode 100644 paddle/op/add_op.cc create mode 100644 paddle/op/add_op.cu create mode 100644 paddle/op/add_op.h create mode 100644 paddle/op/add_op_test.cc diff --git a/cmake/external/glog.cmake b/cmake/external/glog.cmake index bd401faa6e..8a594a825a 100644 --- a/cmake/external/glog.cmake +++ b/cmake/external/glog.cmake @@ -52,6 +52,7 @@ ExternalProject_Add( ADD_LIBRARY(glog STATIC IMPORTED GLOBAL) SET_PROPERTY(TARGET glog PROPERTY IMPORTED_LOCATION ${GLOG_LIBRARIES}) -ADD_DEPENDENCIES(glog extern_glog) +ADD_DEPENDENCIES(glog extern_glog gflags) +LINK_LIBRARIES(glog gflags) LIST(APPEND external_project_dependencies glog) diff --git a/paddle/CMakeLists.txt b/paddle/CMakeLists.txt index 0b5e9a2599..61d0aac602 100644 --- a/paddle/CMakeLists.txt +++ b/paddle/CMakeLists.txt @@ -14,6 +14,7 @@ if(Boost_FOUND) add_subdirectory(memory) add_subdirectory(platform) add_subdirectory(framework) + add_subdirectory(op) # because `operator` is a reserved word for CPP, so short to `op` add_subdirectory(pybind) endif() diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt index de31952e79..8415ce67e9 100644 --- a/paddle/framework/CMakeLists.txt +++ b/paddle/framework/CMakeLists.txt @@ -11,8 +11,8 @@ proto_library(op_proto SRCS op_proto.proto DEPS attr_type) cc_test(op_proto_test SRCS op_proto_test.cc DEPS op_proto protobuf) proto_library(op_desc SRCS op_desc.proto DEPS attr_type) cc_test(op_desc_test SRCS op_desc_test.cc DEPS op_desc protobuf) -cc_library(operator SRCS operator.cc DEPS op_desc protobuf device_context) -cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry place) +cc_library(operator SRCS operator.cc DEPS op_desc device_context) +cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry) cc_library(op_registry SRCS op_registry.cc DEPS op_proto op_desc) cc_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry operator) py_proto_compile(framework_py_proto SRCS attr_type.proto op_proto.proto op_desc.proto) diff --git a/paddle/framework/op_registry.h b/paddle/framework/op_registry.h index e46da822c6..e9e150224e 100644 --- a/paddle/framework/op_registry.h +++ b/paddle/framework/op_registry.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include "paddle/framework/attr_checker.h" #include "paddle/framework/op_desc.pb.h" #include "paddle/framework/op_proto.pb.h" @@ -101,8 +102,11 @@ class OpRegistry { OpProto& op_proto = protos()[op_type]; OpAttrChecker& op_checker = op_checkers()[op_type]; ProtoMakerType(&op_proto, &op_checker); - PADDLE_ENFORCE(op_proto.IsInitialized(), - "Fail to initialize %s's OpProto !", op_type); + *op_proto.mutable_type() = op_type; + PADDLE_ENFORCE( + op_proto.IsInitialized(), + "Fail to initialize %s's OpProto, because %s is not initialized", + op_type, op_proto.InitializationErrorString()); } static OperatorBase* CreateOp(const OpDesc& op_desc) { @@ -143,18 +147,72 @@ class OpRegistry { template class OpRegisterHelper { 
public: - OpRegisterHelper(std::string op_type) { + OpRegisterHelper(const char* op_type) { OpRegistry::RegisterOp(op_type); } }; -#define REGISTER_OP(type, op_class, op_maker_class) \ - class op_class##Register { \ - private: \ - const static OpRegisterHelper reg; \ - }; \ - const OpRegisterHelper op_class##Register::reg( \ - #type) +#define STATIC_ASSERT_GLOBAL_NAMESPACE(uniq_name, msg) \ + struct __test_global_namespace_##uniq_name##__ {}; \ + static_assert(std::is_same<::__test_global_namespace_##uniq_name##__, \ + __test_global_namespace_##uniq_name##__>::value, \ + msg) + +#define REGISTER_OP(__op_type, __op_class, __op_maker_class) \ + STATIC_ASSERT_GLOBAL_NAMESPACE(__reg_op__##__op_type, \ + "REGISTER_OP must be in global namespace"); \ + static ::paddle::framework::OpRegisterHelper<__op_class, __op_maker_class> \ + __op_register_##__op_type##__(#__op_type); \ + int __op_register_##__op_type##_handle__() { return 0; } + +#define REGISTER_OP_KERNEL(type, GPU_OR_CPU, PlaceType, KernelType) \ + STATIC_ASSERT_GLOBAL_NAMESPACE( \ + __reg_op_kernel_##type##_##GPU_OR_CPU##__, \ + "REGISTER_OP_KERNEL must be in global namespace"); \ + struct __op_kernel_register__##type##__ { \ + __op_kernel_register__##type##__() { \ + ::paddle::framework::OperatorWithKernel::OpKernelKey key; \ + key.place_ = PlaceType(); \ + ::paddle::framework::OperatorWithKernel::AllOpKernels()[#type][key] \ + .reset(new KernelType()); \ + } \ + }; \ + static __op_kernel_register__##type##__ __reg_kernel_##type##__; \ + int __op_kernel_register_##type##_handle_##GPU_OR_CPU##__() { return 0; } + +#define REGISTER_OP_GPU_KERNEL(type, KernelType) \ + REGISTER_OP_KERNEL(type, GPU, ::paddle::platform::GPUPlace, KernelType) + +#define REGISTER_OP_CPU_KERNEL(type, KernelType) \ + REGISTER_OP_KERNEL(type, CPU, ::paddle::platform::CPUPlace, KernelType) + +#define USE_OP_WITHOUT_KERNEL(op_type) \ + STATIC_ASSERT_GLOBAL_NAMESPACE( \ + __use_op_without_kernel_##op_type, \ + "USE_OP_WITHOUT_KERNEL must be in global namespace"); \ + extern int __op_register_##op_type##_handle__(); \ + static int __use_op_ptr_##op_type##_without_kernel__ \ + __attribute__((unused)) = __op_register_##op_type##_handle__() + +#define USE_OP_KERNEL(op_type, CPU_OR_GPU) \ + STATIC_ASSERT_GLOBAL_NAMESPACE(__use_op_kernel_##op_type##_##CPU_OR_GPU##__, \ + "USE_OP_KERNEL must be in global namespace"); \ + extern int __op_kernel_register_##op_type##_handle_##CPU_OR_GPU##__(); \ + static int __use_op_ptr_##op_type##_##CPU_OR_GPU##_kernel__ \ + __attribute__((unused)) = \ + __op_kernel_register_##op_type##_handle_##CPU_OR_GPU##__() + +#ifdef PADDLE_ONLY_CPU +#define USE_OP(op_type) \ + USE_OP_WITHOUT_KERNEL(op_type); \ + USE_OP_KERNEL(op_type, CPU); + +#else +#define USE_OP(op_type) \ + USE_OP_WITHOUT_KERNEL(op_type); \ + USE_OP_KERNEL(op_type, CPU); \ + USE_OP_KERNEL(op_type, GPU) +#endif } // namespace framework } // namespace paddle diff --git a/paddle/framework/op_registry_test.cc b/paddle/framework/op_registry_test.cc index f5162fb870..b3460838f9 100644 --- a/paddle/framework/op_registry_test.cc +++ b/paddle/framework/op_registry_test.cc @@ -1,8 +1,6 @@ #include "paddle/framework/op_registry.h" #include -using namespace paddle::framework; - namespace paddle { namespace framework { class CosineOp : public OperatorBase { @@ -26,8 +24,6 @@ class CosineOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker { } }; -REGISTER_OP(cos_sim, CosineOp, CosineOpProtoAndCheckerMaker); - class MyTestOp : public OperatorBase { public: void InferShape(const 
std::shared_ptr<Scope>& scope) const override {} @@ -52,11 +48,14 @@ class MyTestOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker { AddComment("This is my_test op"); } }; - -REGISTER_OP(my_test_op, MyTestOp, MyTestOpProtoAndCheckerMaker); } // namespace framework } // namespace paddle +REGISTER_OP(cos_sim, paddle::framework::CosineOp, + paddle::framework::CosineOpProtoAndCheckerMaker); +REGISTER_OP(my_test_op, paddle::framework::MyTestOp, + paddle::framework::MyTestOpProtoAndCheckerMaker); + TEST(OpRegistry, CreateOp) { paddle::framework::OpDesc op_desc; op_desc.set_type("cos_sim"); @@ -71,7 +70,7 @@ TEST(OpRegistry, CreateOp) { paddle::framework::OperatorBase* op = paddle::framework::OpRegistry::CreateOp(op_desc); - auto scope = std::make_shared<Scope>(); + auto scope = std::make_shared<paddle::framework::Scope>(); paddle::platform::CPUDeviceContext dev_ctx; op->Run(scope, dev_ctx); float scale_get = op->GetAttr<float>("scale"); @@ -114,7 +113,7 @@ TEST(OpRegistry, DefaultValue) { paddle::framework::OperatorBase* op = paddle::framework::OpRegistry::CreateOp(op_desc); - auto scope = std::make_shared<Scope>(); + auto scope = std::make_shared<paddle::framework::Scope>(); paddle::platform::CPUDeviceContext dev_ctx; op->Run(scope, dev_ctx); ASSERT_EQ(op->GetAttr<float>("scale"), 1.0); @@ -169,13 +168,8 @@ TEST(OpRegistry, CustomChecker) { paddle::framework::OperatorBase* op = paddle::framework::OpRegistry::CreateOp(op_desc); paddle::platform::CPUDeviceContext dev_ctx; - auto scope = std::make_shared<Scope>(); + auto scope = std::make_shared<paddle::framework::Scope>(); op->Run(scope, dev_ctx); int test_attr = op->GetAttr<int>("test_attr"); ASSERT_EQ(test_attr, 4); } - -int main(int argc, char** argv) { - testing::InitGoogleTest(&argc, argv); - return RUN_ALL_TESTS(); -} \ No newline at end of file diff --git a/paddle/framework/operator.h b/paddle/framework/operator.h index 4336115670..d3c55e0ceb 100644 --- a/paddle/framework/operator.h +++ b/paddle/framework/operator.h @@ -17,6 +17,7 @@ limitations under the License.
*/ #include #include #include +#include <paddle/framework/tensor.h> #include #include #include namespace paddle { @@ -103,6 +104,19 @@ class OpKernel { virtual ~OpKernel() {} }; +template <typename T> +struct VarToTensor {}; + +template <> +struct VarToTensor<Tensor*> { + Tensor* operator()(Variable* var) { return var->GetMutable<Tensor>(); } +}; + +template <> +struct VarToTensor<const Tensor*> { + const Tensor* operator()(Variable* var) { return &var->Get<Tensor>(); } +}; + class OperatorWithKernel : public OperatorBase { public: struct OpKernelKey { @@ -136,19 +150,36 @@ class OperatorWithKernel : public OperatorBase { AllOpKernels() { static std::unordered_map<std::string, OpKernelMap> g_all_op_kernels; return g_all_op_kernels; + } + void InferShape(const std::shared_ptr<Scope>& scope) const final { + std::vector<const Tensor*> ins; + VarNamesToTensors(scope, inputs_, &ins); + std::vector<Tensor*> outs; + VarNamesToTensors(scope, outputs_, &outs); + InferShape(ins, outs); }; + + private: + template <typename T> + void VarNamesToTensors(const std::shared_ptr<Scope>& scope, + const std::vector<std::string>& var_names, + std::vector<T>* container) const { + container->reserve(var_names.size()); + VarToTensor<T> convert; + for (auto& name : var_names) { + auto var = scope->GetVariable(name); + if (var != nullptr) { + container->push_back(convert(var)); + } else { + container->push_back(nullptr); + } + } + } + + protected: + virtual void InferShape(const std::vector<const Tensor*>& inputs, + const std::vector<Tensor*>& outputs) const = 0; }; } // namespace framework } // namespace paddle - -#define REGISTER_OP_KERNEL(type, PlaceType, KernelType) \ - struct __op_kernel_register__##type##__ { \ - __op_kernel_register__##type##__() { \ - ::paddle::framework::OperatorWithKernel::OpKernelKey key; \ - key.place_ = PlaceType(); \ - ::paddle::framework::OperatorWithKernel::AllOpKernels()[#type][key] \ - .reset(new KernelType()); \ - } \ - }; \ - static __op_kernel_register__##type##__ __reg_kernel_##type##__ diff --git a/paddle/framework/operator_test.cc b/paddle/framework/operator_test.cc index 01b87bb50e..a033ee1661 100644 --- a/paddle/framework/operator_test.cc +++ b/paddle/framework/operator_test.cc @@ -50,30 +50,6 @@ class OperatorTestProtoAndCheckerMaker : public OpProtoAndCheckerMaker { } }; -REGISTER_OP(test_operator, OperatorTest, OperatorTestProtoAndCheckerMaker); - -TEST(OperatorBase, all) { - OpDesc op_desc; - op_desc.set_type("test_operator"); - *op_desc.mutable_inputs()->Add() = "IN1"; - *op_desc.mutable_outputs()->Add() = "OUT1"; - auto attr = op_desc.mutable_attrs()->Add(); - attr->set_name("scale"); - attr->set_type(paddle::framework::AttrType::FLOAT); - float scale = 3.14; - attr->set_f(scale); - - platform::CPUDeviceContext device_context; - auto scope = std::make_shared<Scope>(); - - OperatorBase* op = paddle::framework::OpRegistry::CreateOp(op_desc); - ASSERT_EQ(op->GetAttr<float>("scale"), scale); - scope->CreateVariable("OUT1"); - op->Run(scope, device_context); - std::cout << op->DebugString() << std::endl; - delete op; -} - class OpKernelTestProtoAndCheckerMaker : public OpProtoAndCheckerMaker { public: OpKernelTestProtoAndCheckerMaker(OpProto* proto, OpAttrChecker* op_checker) @@ -83,14 +59,14 @@ class OpKernelTestProtoAndCheckerMaker : public OpProtoAndCheckerMaker { AddAttr<float>("scale", "scale of cosine op") .SetDefault(1.0) .LargerThan(0.0); - AddType("test_operator"); AddComment("This is test op"); } }; class OpWithKernelTest : public OperatorWithKernel { - public: - void InferShape(const std::shared_ptr<Scope>& scope) const override {} + protected: + void InferShape(const std::vector<const Tensor*>& inputs, + const std::vector<Tensor*>& outputs) const override {} }; class CPUKernelTest : public OpKernel { @@ -103,10 +79,16 @@ class
CPUKernelTest : public OpKernel { } }; -REGISTER_OP(op_with_kernel, OpWithKernelTest, OpKernelTestProtoAndCheckerMaker); -REGISTER_OP_KERNEL(op_with_kernel, platform::CPUPlace, CPUKernelTest); +} // namespace framework +} // namespace paddle + +REGISTER_OP(op_with_kernel, paddle::framework::OpWithKernelTest, + paddle::framework::OpKernelTestProtoAndCheckerMaker); +REGISTER_OP_CPU_KERNEL(op_with_kernel, paddle::framework::CPUKernelTest); TEST(OpKernel, all) { + using namespace paddle::framework; + OpDesc op_desc; op_desc.set_type("op_with_kernel"); *op_desc.mutable_inputs()->Add() = "IN1"; @@ -116,7 +98,7 @@ TEST(OpKernel, all) { attr->set_type(paddle::framework::AttrType::FLOAT); attr->set_f(3.14); - platform::CPUDeviceContext cpu_device_context; + paddle::platform::CPUDeviceContext cpu_device_context; auto scope = std::make_shared(); OperatorBase* op = paddle::framework::OpRegistry::CreateOp(op_desc); @@ -124,5 +106,3 @@ TEST(OpKernel, all) { delete op; } -} // namespace framework -} // namespace paddle \ No newline at end of file diff --git a/paddle/op/CMakeLists.txt b/paddle/op/CMakeLists.txt new file mode 100644 index 0000000000..40bb326512 --- /dev/null +++ b/paddle/op/CMakeLists.txt @@ -0,0 +1,6 @@ +if(WITH_GPU) + nv_library(add_op SRCS add_op.cc add_op.cu DEPS operator op_registry glog ddim) +else() + cc_library(add_op SRCS add_op.cc DEPS operator op_registry glog ddim) +endif() +cc_test(add_op_test SRCS add_op_test.cc DEPS add_op) diff --git a/paddle/op/add_op.cc b/paddle/op/add_op.cc new file mode 100644 index 0000000000..71fbe30289 --- /dev/null +++ b/paddle/op/add_op.cc @@ -0,0 +1,44 @@ +#include +#include +#include + +namespace paddle { +namespace op { + +class AddOp : public framework::OperatorWithKernel { +protected: + void InferShape( + const std::vector &inputs, + const std::vector &outputs) const override { + PADDLE_ENFORCE(inputs.size() == 2, "Input size of AddOp must be two"); + PADDLE_ENFORCE(outputs.size() == 1, "Output size of AddOp must be one"); + PADDLE_ENFORCE( + inputs[0] != nullptr && inputs[1] != nullptr && outputs[0] != nullptr, + "Inputs/Outputs of AddOp must all be set"); + PADDLE_ENFORCE(inputs[0]->dims() == inputs[1]->dims(), + "Two input of Add Op's dimension must be same."); + // Need set dims in Tensor + // outputs[0]->set_dims(inputs[0]->dims()) + } +}; + +class AddOpMaker : public framework::OpProtoAndCheckerMaker { +public: + AddOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "The first input of add op"); + AddInput("Y", "The second input of add op"); + AddOutput("Out", "The output of add op"); + AddComment(R"DOC( +Two Element Add Operator. 
+
+The equation is: Out = X + Y
+)DOC");
+  }
+};
+}  // namespace op
+}  // namespace paddle
+
+REGISTER_OP(add_two, paddle::op::AddOp, paddle::op::AddOpMaker);
+REGISTER_OP_CPU_KERNEL(add_two,
+                       ::paddle::op::AddKernel<::paddle::platform::CPUPlace>);
\ No newline at end of file
diff --git a/paddle/op/add_op.cu b/paddle/op/add_op.cu
new file mode 100644
index 0000000000..d3d73d868b
--- /dev/null
+++ b/paddle/op/add_op.cu
@@ -0,0 +1,5 @@
+#include <paddle/op/add_op.h>
+#include <paddle/framework/op_registry.h>
+
+REGISTER_OP_GPU_KERNEL(add_two,
+                       paddle::op::AddKernel<paddle::platform::GPUPlace>);
\ No newline at end of file
diff --git a/paddle/op/add_op.h b/paddle/op/add_op.h
new file mode 100644
index 0000000000..3a5a4fb00e
--- /dev/null
+++ b/paddle/op/add_op.h
@@ -0,0 +1,17 @@
+#pragma once
+#include <glog/logging.h>
+#include <paddle/framework/operator.h>
+
+namespace paddle {
+namespace op {
+
+template <typename Place>
+class AddKernel : public framework::OpKernel {
+public:
+  void Compute(const KernelContext &context) const override {
+    LOG(INFO) << "Add kernel in " << typeid(Place).name();
+  }
+};
+
+}  // namespace op
+}  // namespace paddle
diff --git a/paddle/op/add_op_test.cc b/paddle/op/add_op_test.cc
new file mode 100644
index 0000000000..f554ac1bef
--- /dev/null
+++ b/paddle/op/add_op_test.cc
@@ -0,0 +1,9 @@
+#include <gtest/gtest.h>
+#define private public
+#include <paddle/framework/op_registry.h>
+USE_OP(add_two);
+TEST(AddOp, GetOpProto) {
+  auto& protos = paddle::framework::OpRegistry::protos();
+  auto it = protos.find("add_two");
+  ASSERT_NE(it, protos.end());
+}
\ No newline at end of file
From 00572aa451d44ccb32b1c59a59241d7000c68fda Mon Sep 17 00:00:00 2001
From: liaogang
Date: Thu, 13 Jul 2017 19:14:09 +0800
Subject: [PATCH 117/205] Add memory alignment test

---
 paddle/memory/memory_test.cc | 27 +++++++++++++++++++++++++--
 1 file changed, 25 insertions(+), 2 deletions(-)

diff --git a/paddle/memory/memory_test.cc b/paddle/memory/memory_test.cc
index 9fdcd03b1a..4c9b3311bb 100644
--- a/paddle/memory/memory_test.cc
+++ b/paddle/memory/memory_test.cc
@@ -13,9 +13,13 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/memory/memory.h"
+#include "gtest/gtest.h"
 #include "paddle/platform/place.h"
 
-#include "gtest/gtest.h"
+template <typename T>
+inline bool is_aligned(T *p, size_t n = alignof(T)) {
+  return 0 == (reinterpret_cast<uintptr_t>(p) % n);
+}
 
 TEST(BuddyAllocator, CPUAllocation) {
   void *p = nullptr;
@@ -36,11 +40,13 @@ TEST(BuddyAllocator, CPUMultAlloc) {
   std::vector<void *> ps;
   ps.reserve(8);
 
-  for (auto size : {256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304}) {
+  for (auto size :
+       {128, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304}) {
     ps.emplace_back(paddle::memory::Alloc(cpu, size));
   }
 
   for (auto p : ps) {
+    EXPECT_EQ(is_aligned(p, 32), true);
     paddle::memory::Free(cpu, p);
   }
 }
@@ -60,4 +66,21 @@ TEST(BuddyAllocator, GPUAllocation) {
   paddle::memory::Free(gpu, p);
 }
 
+TEST(BuddyAllocator, GPUMultAlloc) {
+  paddle::platform::GPUPlace gpu;
+
+  std::vector<void *> ps;
+  ps.reserve(8);
+
+  for (auto size :
+       {128, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304}) {
+    ps.emplace_back(paddle::memory::Alloc(gpu, size));
+  }
+
+  for (auto p : ps) {
+    EXPECT_EQ(is_aligned(p, 32), true);
+    paddle::memory::Free(gpu, p);
+  }
+}
+
 #endif  // PADDLE_ONLY_CPU
From 79b70c2d697b6538eb55a0d6f092886630fbb13b Mon Sep 17 00:00:00 2001
From: Yu Yang
Date: Thu, 13 Jul 2017 20:14:03 +0800
Subject: [PATCH 118/205] Follow comments

* Convert `op` --> `operators`
* Remove AddType in OpProtoMaker, because type is part of registry.
* Rename CPU_OR_GPU --> DEVICE_TYPE in registry macro.
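For readers following along: the renamed USE_OP_KERNEL(op_type, DEVICE_TYPE) macro is consumed from ordinary C++ code, as in the minimal sketch below. This sketch is not part of the patch; it assumes only the USE_OP/USE_OP_KERNEL macros from op_registry.h in the diff that follows, and the add_two operator registered above.

    // In any translation unit that links against the op library, e.g. a test:
    #include <paddle/framework/op_registry.h>

    // References the registration symbols so the linker keeps the operator;
    // on a CPU-only build, USE_OP(add_two) expands to the CPU kernel only.
    USE_OP(add_two);

    // A single device's kernel can also be referenced explicitly; this is
    // the parameter that was renamed from CPU_OR_GPU to DEVICE_TYPE:
    // USE_OP_KERNEL(add_two, GPU);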
--- paddle/CMakeLists.txt | 2 +- paddle/framework/op_registry.h | 17 ++++++++--------- paddle/framework/op_registry_test.cc | 2 -- paddle/framework/operator_test.cc | 14 -------------- paddle/{op => operators}/CMakeLists.txt | 0 paddle/{op => operators}/add_op.cc | 10 +++++----- paddle/{op => operators}/add_op.cu | 0 paddle/{op => operators}/add_op.h | 2 +- paddle/{op => operators}/add_op_test.cc | 0 9 files changed, 15 insertions(+), 32 deletions(-) rename paddle/{op => operators}/CMakeLists.txt (100%) rename paddle/{op => operators}/add_op.cc (84%) rename paddle/{op => operators}/add_op.cu (100%) rename paddle/{op => operators}/add_op.h (94%) rename paddle/{op => operators}/add_op_test.cc (100%) diff --git a/paddle/CMakeLists.txt b/paddle/CMakeLists.txt index 61d0aac602..4b06966fba 100644 --- a/paddle/CMakeLists.txt +++ b/paddle/CMakeLists.txt @@ -14,7 +14,7 @@ if(Boost_FOUND) add_subdirectory(memory) add_subdirectory(platform) add_subdirectory(framework) - add_subdirectory(op) # because `operator` is a reserved word for CPP, so short to `op` + add_subdirectory(operators) add_subdirectory(pybind) endif() diff --git a/paddle/framework/op_registry.h b/paddle/framework/op_registry.h index e9e150224e..61dfcb7049 100644 --- a/paddle/framework/op_registry.h +++ b/paddle/framework/op_registry.h @@ -82,8 +82,6 @@ class OpProtoAndCheckerMaker { return op_checker_->AddAttrChecker(name); } - void AddType(const std::string& op_type) { proto_->set_type(op_type); } - void AddComment(const std::string& comment) { *(proto_->mutable_comment()) = comment; } @@ -194,13 +192,14 @@ class OpRegisterHelper { static int __use_op_ptr_##op_type##_without_kernel__ \ __attribute__((unused)) = __op_register_##op_type##_handle__() -#define USE_OP_KERNEL(op_type, CPU_OR_GPU) \ - STATIC_ASSERT_GLOBAL_NAMESPACE(__use_op_kernel_##op_type##_##CPU_OR_GPU##__, \ - "USE_OP_KERNEL must be in global namespace"); \ - extern int __op_kernel_register_##op_type##_handle_##CPU_OR_GPU##__(); \ - static int __use_op_ptr_##op_type##_##CPU_OR_GPU##_kernel__ \ - __attribute__((unused)) = \ - __op_kernel_register_##op_type##_handle_##CPU_OR_GPU##__() +#define USE_OP_KERNEL(op_type, DEVICE_TYPE) \ + STATIC_ASSERT_GLOBAL_NAMESPACE( \ + __use_op_kernel_##op_type##_##DEVICE_TYPE##__, \ + "USE_OP_KERNEL must be in global namespace"); \ + extern int __op_kernel_register_##op_type##_handle_##DEVICE_TYPE##__(); \ + static int __use_op_ptr_##op_type##_##DEVICE_TYPE##_kernel__ \ + __attribute__((unused)) = \ + __op_kernel_register_##op_type##_handle_##DEVICE_TYPE##__() #ifdef PADDLE_ONLY_CPU #define USE_OP(op_type) \ diff --git a/paddle/framework/op_registry_test.cc b/paddle/framework/op_registry_test.cc index b3460838f9..9bcc0407ad 100644 --- a/paddle/framework/op_registry_test.cc +++ b/paddle/framework/op_registry_test.cc @@ -19,7 +19,6 @@ class CosineOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker { AddAttr("scale", "scale of cosine op") .SetDefault(1.0) .LargerThan(0.0); - AddType("cos"); AddComment("This is cos op"); } }; @@ -44,7 +43,6 @@ class MyTestOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker { }; AddAttr("test_attr", "a simple test attribute") .AddCustomChecker(my_checker); - AddType("my_test_op"); AddComment("This is my_test op"); } }; diff --git a/paddle/framework/operator_test.cc b/paddle/framework/operator_test.cc index a033ee1661..204b601a4a 100644 --- a/paddle/framework/operator_test.cc +++ b/paddle/framework/operator_test.cc @@ -36,20 +36,6 @@ class OperatorTest : public OperatorBase { float x = 0; }; -class 
OperatorTestProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
- public:
-  OperatorTestProtoAndCheckerMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("input", "input of test op");
-    AddOutput("output", "output of test op");
-    AddAttr<float>("scale", "scale of cosine op")
-        .SetDefault(1.0)
-        .LargerThan(0.0);
-    AddType("test_operator");
-    AddComment("This is test op");
-  }
-};
-
 class OpKernelTestProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
  public:
   OpKernelTestProtoAndCheckerMaker(OpProto* proto, OpAttrChecker* op_checker)
diff --git a/paddle/op/CMakeLists.txt b/paddle/operators/CMakeLists.txt
similarity index 100%
rename from paddle/op/CMakeLists.txt
rename to paddle/operators/CMakeLists.txt
diff --git a/paddle/op/add_op.cc b/paddle/operators/add_op.cc
similarity index 84%
rename from paddle/op/add_op.cc
rename to paddle/operators/add_op.cc
index 71fbe30289..2766f0bf25 100644
--- a/paddle/op/add_op.cc
+++ b/paddle/operators/add_op.cc
@@ -1,9 +1,9 @@
 #include <glog/logging.h>
 #include <paddle/framework/op_registry.h>
-#include <paddle/op/add_op.h>
+#include <paddle/operators/add_op.h>
 
 namespace paddle {
-namespace op {
+namespace operators {
 
 class AddOp : public framework::OperatorWithKernel {
 protected:
@@ -39,6 +39,6 @@ The equation is: Out = X + Y
 }  // namespace op
 }  // namespace paddle
 
-REGISTER_OP(add_two, paddle::op::AddOp, paddle::op::AddOpMaker);
-REGISTER_OP_CPU_KERNEL(add_two,
-                       ::paddle::op::AddKernel<::paddle::platform::CPUPlace>);
\ No newline at end of file
+REGISTER_OP(add_two, paddle::operators::AddOp, paddle::operators::AddOpMaker);
+REGISTER_OP_CPU_KERNEL(
+    add_two, ::paddle::operators::AddKernel<::paddle::platform::CPUPlace>);
\ No newline at end of file
diff --git a/paddle/op/add_op.cu b/paddle/operators/add_op.cu
similarity index 100%
rename from paddle/op/add_op.cu
rename to paddle/operators/add_op.cu
diff --git a/paddle/op/add_op.h b/paddle/operators/add_op.h
similarity index 94%
rename from paddle/op/add_op.h
rename to paddle/operators/add_op.h
index 3a5a4fb00e..17d459dbc8 100644
--- a/paddle/op/add_op.h
+++ b/paddle/operators/add_op.h
@@ -3,7 +3,7 @@
 #include <paddle/framework/operator.h>
 
 namespace paddle {
-namespace op {
+namespace operators {
 
 template <typename Place>
 class AddKernel : public framework::OpKernel {
diff --git a/paddle/op/add_op_test.cc b/paddle/operators/add_op_test.cc
similarity index 100%
rename from paddle/op/add_op_test.cc
rename to paddle/operators/add_op_test.cc
From 62908dcc24414e0d2f69a39f04c39f9f2e1c77fe Mon Sep 17 00:00:00 2001
From: Liu Yiqun
Date: Thu, 13 Jul 2017 09:55:06 +0000
Subject: [PATCH 119/205] Move the download of the NDK into the
 build_android.sh script.
--- .travis.yml | 8 +------- paddle/scripts/travis/build_android.sh | 13 ++++++++++++- 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/.travis.yml b/.travis.yml index 8ac67e5720..2cf7666fb5 100644 --- a/.travis.yml +++ b/.travis.yml @@ -4,7 +4,7 @@ cache: - $HOME/.ccache - $HOME/.cache/pip - $TRAVIS_BUILD_DIR/build/third_party - - $HOME/android-toolchain-gcc + - $TRAVIS_BUILD_DIR/build_android/third_party sudo: required dist: trusty os: @@ -35,12 +35,6 @@ addons: - ccache before_install: - if [[ "$JOB" == "check_style" ]]; then sudo ln -s /usr/bin/clang-format-3.8 /usr/bin/clang-format; fi - - if [[ "JOB" == "build_android" ]]; then - mkdir -p $HOME/tmp/$JOB; - cd $HOME/tmp/$JOB; wget -q https://dl.google.com/android/repository/android-ndk-r14b-linux-x86_64.zip; unzip -q android-ndk-r14b-linux-x86_64.zip; - sh $HOME/tmp/$JOB/android-ndk-r14b/build/tools/make-standalone-toolchain.sh --arch=arm --platform=android-21 --install-dir=$HOME/android-toolchain-gcc; - cd $HOME; rm -rf $HOME/tmp/$JOB; - fi # Paddle is using protobuf 3.1 currently. Protobuf 3.2 breaks the compatibility. So we specify the python # protobuf version. - pip install numpy wheel 'protobuf==3.1' sphinx==1.5.6 recommonmark sphinx-rtd-theme==0.1.9 virtualenv pre-commit requests==2.9.2 LinkChecker diff --git a/paddle/scripts/travis/build_android.sh b/paddle/scripts/travis/build_android.sh index dd4ad29b18..004067a8f5 100755 --- a/paddle/scripts/travis/build_android.sh +++ b/paddle/scripts/travis/build_android.sh @@ -1,13 +1,24 @@ #!/bin/bash set -e +ANDROID_STANDALONE_TOOLCHAIN=$HOME/android-toolchain-gcc +TMP_DIR=$HOME/$JOB/tmp +mkdir -p $TMP_DIR +cd $TMP_DIR +wget -q https://dl.google.com/android/repository/android-ndk-r14b-linux-x86_64.zip +unzip -q android-ndk-r14b-linux-x86_64.zip +chmod +x $TMP_DIR/android-ndk-r14b/build/tools/make-standalone-toolchain.sh +$TMP_DIR/android-ndk-r14b/build/tools/make-standalone-toolchain.sh --force --arch=arm --platform=android-21 --install-dir=$ANDROID_STANDALONE_TOOLCHAIN +cd $HOME +rm -rf $TMP_DIR + # Create the build directory for CMake. mkdir -p $TRAVIS_BUILD_DIR/build_android cd $TRAVIS_BUILD_DIR/build_android # Compile paddle binaries cmake -DCMAKE_SYSTEM_NAME=Android \ - -DANDROID_STANDALONE_TOOLCHAIN=$HOME/android-toolchain-gcc \ + -DANDROID_STANDALONE_TOOLCHAIN=$ANDROID_STANDALONE_TOOLCHAIN \ -DANDROID_ABI=armeabi-v7a \ -DANDROID_ARM_NEON=ON \ -DANDROID_ARM_MODE=ON \ From 11660eab0e78d27304a8b7579537912c6e06f564 Mon Sep 17 00:00:00 2001 From: Helin Wang Date: Thu, 13 Jul 2017 19:47:24 +0000 Subject: [PATCH 120/205] Fix optimizer parameter buffer allocation size. The buffer allocation size should be number of bytes, not number of floats. 
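To make the fixed convention concrete, here is a minimal C++ sketch of the size contract this patch establishes; the TensorView type is a hypothetical stand-in for the optimizer's Tensor, not the real class. Sizes cross the Go/C boundary in bytes, and the C++ side converts to an element count exactly once:

    #include <cstddef>

    // Hypothetical float-tensor view over a caller-owned buffer.
    struct TensorView {
      float* data;
      size_t size;  // element count (number of floats), never bytes
    };

    TensorView MakeParamView(void* param_buffer, size_t num_bytes) {
      // num_bytes arrives from Go as len(p.Content); divide by sizeof(float)
      // here instead of pre-dividing on the Go side.
      return TensorView{reinterpret_cast<float*>(param_buffer),
                        num_bytes / sizeof(float)};
    }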
--- go/pserver/client/client_test.go | 13 +++++++------ go/pserver/optimizer.go | 10 ++++++++-- go/pserver/service_test.go | 30 +++++++++++++++--------------- paddle/optimizer/optimizer.cc | 7 ++++--- 4 files changed, 34 insertions(+), 26 deletions(-) diff --git a/go/pserver/client/client_test.go b/go/pserver/client/client_test.go index 2b72a202b5..21dac92417 100644 --- a/go/pserver/client/client_test.go +++ b/go/pserver/client/client_test.go @@ -100,13 +100,13 @@ func (l lister) List() []client.Server { return l } -func ClientTest(t *testing.T, c *client.Client) { +func testClient(t *testing.T, c *client.Client) { selected := c.BeginInitParams() if !selected { t.Fatal("should be selected.") } - const numParameter = 100 + const numParameter = 1000 config, err := ioutil.ReadFile("./c/test/testdata/optimizer.pb") if err != nil { t.Fatalf("read optimizer proto failed") @@ -128,7 +128,7 @@ func ClientTest(t *testing.T, c *client.Client) { } var grads []pserver.Gradient - for i := 0; i < numParameter/2; i++ { + for i := 0; i < numParameter; i++ { var g pserver.Gradient g.Name = "p_" + strconv.Itoa(i) g.ElementType = pserver.Float32 @@ -169,13 +169,14 @@ func TestNativeClient(t *testing.T) { servers[i] = client.Server{Index: i, Addr: ":" + strconv.Itoa(pserverClientPorts[i])} } c1 := client.NewClient(lister(servers), len(servers), selector(true)) - ClientTest(t, c1) + testClient(t, c1) } -// TODO: tmperary disable etcdClient test for dependency of etcd) +// EtcdClient is a disabled test, since we have not embedded etcd into +// our test. func EtcdClient(t *testing.T) { initEtcdClient() etcdClient := client.NewEtcd(etcdEndpoints) c2 := client.NewClient(etcdClient, etcdClient.Desired(), selector(true)) - ClientTest(t, c2) + testClient(t, c2) } diff --git a/go/pserver/optimizer.go b/go/pserver/optimizer.go index a6b73dd5a1..d6b7fafd59 100644 --- a/go/pserver/optimizer.go +++ b/go/pserver/optimizer.go @@ -19,6 +19,7 @@ var nullPtr = unsafe.Pointer(uintptr(0)) type optimizer struct { opt *C.struct_paddle_optimizer elementType ElementType + contentLen int } func cArrayToSlice(p unsafe.Pointer, len int) []byte { @@ -37,10 +38,11 @@ func cArrayToSlice(p unsafe.Pointer, len int) []byte { func newOptimizer(paramWithConfigs ParameterWithConfig, State []byte) *optimizer { o := &optimizer{} o.elementType = paramWithConfigs.Param.ElementType + o.contentLen = len(paramWithConfigs.Param.Content) p := paramWithConfigs.Param c := paramWithConfigs.Config s := State - paramBufferSize := C.size_t(len(p.Content) / C.sizeof_float) + paramBufferSize := C.size_t(len(p.Content)) log.WithFields(log.Fields{ "ElementType": p.ElementType, "ParamSize": paramBufferSize, @@ -78,7 +80,11 @@ func (o *optimizer) UpdateParameter(g Gradient) error { return fmt.Errorf("Name: %s, parameter and gradient element type not match, parameter: %v, gradient: %v", g.Name, o.elementType, g.ElementType) } - r := C.paddle_update_parameter(o.opt, C.paddle_element_type(g.ElementType), unsafe.Pointer(&g.Content[0]), C.int(len(g.Content))/C.sizeof_float) + if o.contentLen != len(g.Content) { + return fmt.Errorf("Name: %s, parameter and gradient does not have same content len, parameter: %d, gradient: %d", g.Name, o.contentLen, len(g.Content)) + } + + r := C.paddle_update_parameter(o.opt, C.paddle_element_type(g.ElementType), unsafe.Pointer(&g.Content[0]), C.int(len(g.Content))) if r != 0 { return fmt.Errorf("optimizer update returned error code: %d", r) } diff --git a/go/pserver/service_test.go b/go/pserver/service_test.go index 
9bf1a48a59..a191f689fe 100644
--- a/go/pserver/service_test.go
+++ b/go/pserver/service_test.go
@@ -31,7 +31,7 @@ func TestServiceFull(t *testing.T) {
 	err = s.InitParam(pserver.ParameterWithConfig{Param: p, Config: config}, nil)
 	if err != nil {
-		t.FailNow()
+		t.Fatal(err)
 	}
 
 	var p1 pserver.Parameter
@@ -40,40 +40,40 @@ func TestServiceFull(t *testing.T) {
 	p1.ElementType = pserver.Float32
 	err = s.InitParam(pserver.ParameterWithConfig{Param: p1, Config: config}, nil)
 	if err != nil {
-		t.FailNow()
+		t.Fatal(err)
 	}
 
 	err = s.FinishInitParams(0, nil)
 	if err != nil {
-		t.FailNow()
+		t.Fatal(err)
 	}
 
 	var param pserver.Parameter
 	err = s.GetParam("param_b", &param)
 	if err != nil {
-		t.FailNow()
+		t.Fatal(err)
 	}
 
 	if !reflect.DeepEqual(param, p1) {
-		t.FailNow()
+		t.Fatal("not equal:", param, p1)
 	}
 
 	g1, g2 := pserver.Gradient(p1), pserver.Gradient(p)
 	err = s.SendGrad(g1, nil)
 	if err != nil {
-		t.FailNow()
+		t.Fatal(err)
 	}
 	err = s.SendGrad(g2, nil)
 	if err != nil {
-		t.FailNow()
+		t.Fatal(err)
 	}
 
 	var param1 pserver.Parameter
 	err = s.GetParam("param_a", &param1)
 	if err != nil {
-		t.FailNow()
+		t.Fatal(err)
 	}
 
 	// don't compare content, since it's already changed by
@@ -82,7 +82,7 @@ func TestServiceFull(t *testing.T) {
 	p.Content = nil
 
 	if !reflect.DeepEqual(param1, p) {
-		t.FailNow()
+		t.Fatal("not equal:", param1, p)
 	}
 }
 
@@ -90,16 +90,16 @@ func TestMultipleInit(t *testing.T) {
 	var cp pserver.Checkpoint
 	s, err := pserver.NewService(0, 1, "", nil, cp)
 	if err != nil {
-		t.Error(err)
+		t.Fatal(err)
 	}
 	err = s.FinishInitParams(0, nil)
 	if err != nil {
-		t.FailNow()
+		t.Fatal(err)
 	}
 
 	err = s.FinishInitParams(0, nil)
 	if err.Error() != pserver.AlreadyInitialized {
-		t.FailNow()
+		t.Fatal(err)
 	}
 }
 
@@ -108,7 +108,7 @@ func TestUninitialized(t *testing.T) {
 	s, err := pserver.NewService(0, 1, "", nil, cp)
 	err = s.SendGrad(pserver.Gradient{}, nil)
 	if err.Error() != pserver.Uninitialized {
-		t.FailNow()
+		t.Fatal(err)
 	}
 }
 
@@ -154,12 +154,12 @@ func TestBlockUntilInitialized(t *testing.T) {
 
 	err = s.InitParam(pserver.ParameterWithConfig{Param: p, Config: config}, nil)
 	if err != nil {
-		t.FailNow()
+		t.Fatal(err)
 	}
 
 	err = s.FinishInitParams(0, nil)
 	if err != nil {
-		t.FailNow()
+		t.Fatal(err)
 	}
 
 	wg.Wait()
diff --git a/paddle/optimizer/optimizer.cc b/paddle/optimizer/optimizer.cc
index 54662dc378..eb7125adee 100644
--- a/paddle/optimizer/optimizer.cc
+++ b/paddle/optimizer/optimizer.cc
@@ -44,8 +44,8 @@ paddle_optimizer* paddle_create_optimizer(const unsigned char* config_proto,
                                           const int state_len) {
   paddle_optimizer* optimizer = new paddle_optimizer;
   std::string config(config_proto, config_proto + config_proto_len);
-  Tensor* parameter =
-      new Tensor(reinterpret_cast<float*>(param_buffer), num_bytes);
+  Tensor* parameter = new Tensor(reinterpret_cast<float*>(param_buffer),
+                                 num_bytes / sizeof(float));
   optimizer->impl = ParameterOptimizer::Create(config, parameter);
   if (state != nullptr) {
     std::string s(state, state + state_len);
@@ -65,7 +65,8 @@ int paddle_update_parameter(paddle_optimizer* o,
                             int num_bytes) {
   // TODO(zhihong): datatype not work. need to add the runtime datatype
   auto grad_type = reinterpret_cast<const float*>(grad_buffer);
-  Tensor* gradient = new Tensor(const_cast<float*>(grad_type), num_bytes);
+  Tensor* gradient =
+      new Tensor(const_cast<float*>(grad_type), num_bytes / sizeof(float));
   o->impl->Update(gradient);
   return PADDLE_SUCCESS;
 }
From 777a5cca91dcc9617e85be4be037534040f3fbc7 Mon Sep 17 00:00:00 2001
From: Helin Wang
Date: Thu, 13 Jul 2017 20:07:26 +0000
Subject: [PATCH 121/205] Client test: concurrently init param.
Concurrently send grad and get param --- go/pserver/client/client_test.go | 90 ++++++++++++++++++++++++-------- 1 file changed, 68 insertions(+), 22 deletions(-) diff --git a/go/pserver/client/client_test.go b/go/pserver/client/client_test.go index 21dac92417..27f4ff2380 100644 --- a/go/pserver/client/client_test.go +++ b/go/pserver/client/client_test.go @@ -3,11 +3,13 @@ package client_test import ( "context" "io/ioutil" + "math/rand" "net" "net/http" "net/rpc" "strconv" "strings" + "sync" "testing" "time" @@ -111,16 +113,23 @@ func testClient(t *testing.T, c *client.Client) { if err != nil { t.Fatalf("read optimizer proto failed") } + + var wg sync.WaitGroup for i := 0; i < numParameter; i++ { - var p pserver.Parameter - p.Name = "p_" + strconv.Itoa(i) - p.ElementType = pserver.Float32 - p.Content = make([]byte, (i+1)*100) - err := c.InitParam(pserver.ParameterWithConfig{Param: p, Config: config}) - if err != nil { - t.Fatal(err) - } + wg.Add(1) + go func(i int) { + var p pserver.Parameter + p.Name = "p_" + strconv.Itoa(i) + p.ElementType = pserver.Float32 + p.Content = make([]byte, (i+1)*100) + err := c.InitParam(pserver.ParameterWithConfig{Param: p, Config: config}) + if err != nil { + t.Fatal(err) + } + wg.Done() + }(i) } + wg.Wait() err = c.FinishInitParams() if err != nil { @@ -136,9 +145,31 @@ func testClient(t *testing.T, c *client.Client) { grads = append(grads, g) } - err = c.SendGrads(grads) - if err != nil { - t.Fatal(err) + const paramPerGroup = 10 + const numGroups = numParameter / paramPerGroup + + // shuffle send grads order + for i := range grads { + j := rand.Intn(i + 1) + grads[i], grads[j] = grads[j], grads[i] + } + + for i := 0; i < numGroups; i++ { + var gs []pserver.Gradient + if i == numGroups-1 { + gs = grads[i*paramPerGroup:] + } else { + gs = grads[i*paramPerGroup : (i+1)*paramPerGroup] + } + + wg.Add(1) + go func(gs []pserver.Gradient) { + err = c.SendGrads(gs) + if err != nil { + t.Fatal(err) + } + wg.Done() + }(gs) } names := make([]string, numParameter) @@ -146,20 +177,35 @@ func testClient(t *testing.T, c *client.Client) { names[i] = "p_" + strconv.Itoa(i) } - params, err := c.GetParams(names) - if err != nil { - t.Fatal(err) - } + for i := 0; i < numGroups; i++ { + var ns []string + if i == numGroups-1 { + ns = names[i*paramPerGroup:] + } else { + ns = names[i*paramPerGroup : (i+1)*paramPerGroup] + } - if len(names) != len(params) { - t.Fatalf("parameter size not match, need: %d, have: %d", len(names), len(params)) - } + wg.Add(1) + go func(ns []string) { + params, err := c.GetParams(ns) + if err != nil { + t.Fatal(err) + } - for i := range params { - if names[i] != params[i].Name { - t.Fatalf("order of returned parameter does not required: parameter name: %s, required name: %s", names[i], params[i].Name) - } + if len(ns) != len(params) { + t.Fatalf("parameter size not match, need: %d, have: %d", len(names), len(params)) + } + + for i := range params { + if ns[i] != params[i].Name { + t.Fatalf("order of returned parameter does not required: parameter name: %s, required name: %s", ns[i], params[i].Name) + } + } + wg.Done() + }(ns) } + + wg.Wait() } func TestNativeClient(t *testing.T) { From e588730147069e967e4c9108d24d663a5a5b55f6 Mon Sep 17 00:00:00 2001 From: qiaolongfei Date: Fri, 14 Jul 2017 09:56:27 +0800 Subject: [PATCH 122/205] change op to operators --- paddle/operators/add_op.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/operators/add_op.cu b/paddle/operators/add_op.cu index d3d73d868b..5979345fff 100644 --- 
a/paddle/operators/add_op.cu
+++ b/paddle/operators/add_op.cu
@@ -1,5 +1,5 @@
-#include <paddle/op/add_op.h>
+#include <paddle/operators/add_op.h>
 #include <paddle/framework/op_registry.h>
 
 REGISTER_OP_GPU_KERNEL(add_two,
-                       paddle::op::AddKernel<paddle::platform::GPUPlace>);
\ No newline at end of file
+                       paddle::operators::AddKernel<paddle::platform::GPUPlace>);
\ No newline at end of file
From a751c79331b7cc6066bf6da403dc72c9367aae27 Mon Sep 17 00:00:00 2001
From: Helin Wang
Date: Thu, 13 Jul 2017 22:39:07 +0000
Subject: [PATCH 123/205] turn on race detector for all go tests

---
 cmake/generic.cmake | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cmake/generic.cmake b/cmake/generic.cmake
index 716955c7b4..25946f7a7b 100644
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -337,7 +337,7 @@ function(go_test TARGET_NAME)
   string(REPLACE "${PADDLE_GO_PATH}" "" CMAKE_CURRENT_SOURCE_REL_DIR ${CMAKE_CURRENT_SOURCE_DIR})
   add_custom_target(${TARGET_NAME} ALL DEPENDS go_vendor ${go_test_DEPS})
   add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
-    COMMAND env GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} test
+    COMMAND env GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} test -race
     -c -o "${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}"
     ".${CMAKE_CURRENT_SOURCE_REL_DIR}"
     WORKING_DIRECTORY "${PADDLE_IN_GOPATH}/go")
From ab5fe1e9071ef67850683442035f27c6c602e126 Mon Sep 17 00:00:00 2001
From: liaogang
Date: Fri, 14 Jul 2017 11:52:03 +0800
Subject: [PATCH 124/205] ENH: memory test: check alignment and memory size

---
 paddle/memory/memory_test.cc | 80 ++++++++++++++++++++++++++++------
 1 file changed, 67 insertions(+), 13 deletions(-)

diff --git a/paddle/memory/memory_test.cc b/paddle/memory/memory_test.cc
index 4c9b3311bb..458c8b2e24 100644
--- a/paddle/memory/memory_test.cc
+++ b/paddle/memory/memory_test.cc
@@ -13,14 +13,36 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/memory/memory.h"
-#include "gtest/gtest.h"
+#include "paddle/memory/detail/memory_block.h"
+#include "paddle/memory/detail/meta_data.h"
+
+#include "paddle/platform/cpu_info.h"
+#include "paddle/platform/gpu_info.h"
 #include "paddle/platform/place.h"
 
+#include <gtest/gtest.h>
+#include <unordered_map>
+
-template <typename T>
-inline bool is_aligned(T *p, size_t n = alignof(T)) {
+inline bool is_aligned(void const *p, const size_t n) {
   return 0 == (reinterpret_cast<uintptr_t>(p) % n);
 }
 
+size_t align(size_t size, paddle::platform::CPUPlace place) {
+  size += sizeof(paddle::memory::detail::Metadata);
+  size_t alignment = paddle::platform::CpuMinChunkSize();
+  size_t remaining = size % alignment;
+  return remaining == 0 ?
size : size + (alignment - remaining); +} + +void update_size(size_t &total_size, const size_t size) {} + TEST(BuddyAllocator, CPUAllocation) { void *p = nullptr; @@ -37,17 +59,33 @@ TEST(BuddyAllocator, CPUAllocation) { TEST(BuddyAllocator, CPUMultAlloc) { paddle::platform::CPUPlace cpu; - std::vector ps; - ps.reserve(8); + std::unordered_map ps; + + size_t total_size = paddle::memory::Used(cpu); + EXPECT_EQ(total_size, 0UL); for (auto size : {128, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304}) { - ps.emplace_back(paddle::memory::Alloc(cpu, size)); + ps[paddle::memory::Alloc(cpu, size)] = size; + + // Buddy Allocator doesn't manage too large memory chunk + if (paddle::memory::Used(cpu) == total_size) continue; + + size_t aligned_size = align(size, cpu); + total_size += aligned_size; + EXPECT_EQ(total_size, paddle::memory::Used(cpu)); } for (auto p : ps) { - EXPECT_EQ(is_aligned(p, 32), true); - paddle::memory::Free(cpu, p); + EXPECT_EQ(is_aligned(p.first, 32), true); + paddle::memory::Free(cpu, p.first); + + // Buddy Allocator doesn't manage too large memory chunk + if (paddle::memory::Used(cpu) == total_size) continue; + + size_t aligned_size = align(p.second, cpu); + total_size -= aligned_size; + EXPECT_EQ(total_size, paddle::memory::Used(cpu)); } } @@ -69,17 +107,33 @@ TEST(BuddyAllocator, GPUAllocation) { TEST(BuddyAllocator, GPUMultAlloc) { paddle::platform::GPUPlace gpu; - std::vector ps; - ps.reserve(8); + std::unordered_map ps; + + size_t total_size = paddle::memory::Used(gpu); + EXPECT_EQ(total_size, 0UL); for (auto size : {128, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304}) { - ps.emplace_back(paddle::memory::Alloc(gpu, size)); + ps[paddle::memory::Alloc(gpu, size)] = size; + + // Buddy Allocator doesn't manage too large memory chunk + if (paddle::memory::Used(gpu) == total_size) continue; + + size_t aligned_size = align(size, gpu); + total_size += aligned_size; + EXPECT_EQ(total_size, paddle::memory::Used(gpu)); } for (auto p : ps) { - EXPECT_EQ(is_aligned(p, 32), true); - paddle::memory::Free(gpu, p); + EXPECT_EQ(is_aligned(p.first, 32), true); + paddle::memory::Free(gpu, p.first); + + // Buddy Allocator doesn't manage too large memory chunk + if (paddle::memory::Used(gpu) == total_size) continue; + + size_t aligned_size = align(p.second, gpu); + total_size -= aligned_size; + EXPECT_EQ(total_size, paddle::memory::Used(gpu)); } } From 21b7915d9122d29bdb7506ab2e30049653ccf52a Mon Sep 17 00:00:00 2001 From: liaogang Date: Fri, 14 Jul 2017 12:03:09 +0800 Subject: [PATCH 125/205] Fix condition compile --- paddle/memory/memory_test.cc | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/paddle/memory/memory_test.cc b/paddle/memory/memory_test.cc index 458c8b2e24..e13cbabb26 100644 --- a/paddle/memory/memory_test.cc +++ b/paddle/memory/memory_test.cc @@ -34,13 +34,6 @@ size_t align(size_t size, paddle::platform::CPUPlace place) { return remaining == 0 ? size : size + (alignment - remaining); } -size_t align(size_t size, paddle::platform::GPUPlace place) { - size += sizeof(paddle::memory::detail::Metadata); - size_t alignment = paddle::platform::GpuMinChunkSize(); - size_t remaining = size % alignment; - return remaining == 0 ? 
size : size + (alignment - remaining); -} - void update_size(size_t &total_size, const size_t size) {} TEST(BuddyAllocator, CPUAllocation) { @@ -91,6 +84,13 @@ TEST(BuddyAllocator, CPUMultAlloc) { #ifndef PADDLE_ONLY_CPU +size_t align(size_t size, paddle::platform::GPUPlace place) { + size += sizeof(paddle::memory::detail::Metadata); + size_t alignment = paddle::platform::GpuMinChunkSize(); + size_t remaining = size % alignment; + return remaining == 0 ? size : size + (alignment - remaining); +} + TEST(BuddyAllocator, GPUAllocation) { void *p = nullptr; From cd5113c19766c4ae16b2298272263e6e78317225 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Thu, 13 Jul 2017 19:57:06 +0800 Subject: [PATCH 126/205] Init commit --- paddle/framework/op_registry.h | 10 +++++----- paddle/pybind/CMakeLists.txt | 2 +- paddle/pybind/pybind.cc | 16 ++++++++++++++++ 3 files changed, 22 insertions(+), 6 deletions(-) diff --git a/paddle/framework/op_registry.h b/paddle/framework/op_registry.h index 61dfcb7049..bf91b577c7 100644 --- a/paddle/framework/op_registry.h +++ b/paddle/framework/op_registry.h @@ -125,17 +125,17 @@ class OpRegistry { return op; } + static std::unordered_map& protos() { + static std::unordered_map protos_; + return protos_; + }; + private: static std::unordered_map& creators() { static std::unordered_map creators_; return creators_; } - static std::unordered_map& protos() { - static std::unordered_map protos_; - return protos_; - }; - static std::unordered_map& op_checkers() { static std::unordered_map op_checkers_; return op_checkers_; diff --git a/paddle/pybind/CMakeLists.txt b/paddle/pybind/CMakeLists.txt index af85fdeecb..8564a5f5fe 100644 --- a/paddle/pybind/CMakeLists.txt +++ b/paddle/pybind/CMakeLists.txt @@ -1 +1 @@ -cc_library(paddle_pybind SHARED SRCS pybind.cc DEPS pybind python) +cc_library(paddle_pybind SHARED SRCS pybind.cc DEPS pybind python add_op) diff --git a/paddle/pybind/pybind.cc b/paddle/pybind/pybind.cc index f9f87acf15..6a1e9291cb 100644 --- a/paddle/pybind/pybind.cc +++ b/paddle/pybind/pybind.cc @@ -13,12 +13,16 @@ See the License for the specific language governing permissions and limitations under the License. */ #include +#include #include #include +#include namespace py = pybind11; namespace pd = paddle::framework; +USE_OP(add_two); + PYBIND11_PLUGIN(core) { py::module m("core", "C++ core of Paddle Paddle"); @@ -43,5 +47,17 @@ All parameter, weight, gradient are variables in Paddle. &pd::Scope::CreateVariable, py::return_value_policy::reference); + m.def("get_all_op_protos", []() -> std::vector { + auto& protos = pd::OpRegistry::protos(); + std::vector ret_values; + ret_values.reserve(protos.size()); + for (auto it = protos.begin(); it != protos.end(); ++it) { + ret_values.emplace_back(); + PADDLE_ENFORCE(it->second.SerializeToString(&ret_values.back()), + "Serialize OpProto Error. This could be a bug of Paddle."); + } + return ret_values; + }); + return m.ptr(); } From ea916c84874e4a036650bccb2b2eae142a6c36ae Mon Sep 17 00:00:00 2001 From: liaogang Date: Fri, 14 Jul 2017 13:00:03 +0800 Subject: [PATCH 127/205] Fix: alignment metric --- paddle/memory/memory_test.cc | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/paddle/memory/memory_test.cc b/paddle/memory/memory_test.cc index e13cbabb26..2a2cb98a31 100644 --- a/paddle/memory/memory_test.cc +++ b/paddle/memory/memory_test.cc @@ -24,7 +24,7 @@ limitations under the License. 
*/ #include inline bool is_aligned(void const *p, const size_t n) { - return 0 == (reinterpret_cast(p) % n); + return 0 == (reinterpret_cast(p) & 0x3); } size_t align(size_t size, paddle::platform::CPUPlace place) { @@ -34,8 +34,6 @@ size_t align(size_t size, paddle::platform::CPUPlace place) { return remaining == 0 ? size : size + (alignment - remaining); } -void update_size(size_t &total_size, const size_t size) {} - TEST(BuddyAllocator, CPUAllocation) { void *p = nullptr; From 033523ea9d01bb0e860dbbfd6c5bab72d2c5b149 Mon Sep 17 00:00:00 2001 From: liaogang Date: Fri, 14 Jul 2017 13:02:17 +0800 Subject: [PATCH 128/205] update --- paddle/memory/memory_test.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/paddle/memory/memory_test.cc b/paddle/memory/memory_test.cc index 2a2cb98a31..53cc63a098 100644 --- a/paddle/memory/memory_test.cc +++ b/paddle/memory/memory_test.cc @@ -23,7 +23,7 @@ limitations under the License. */ #include #include -inline bool is_aligned(void const *p, const size_t n) { +inline bool is_aligned(void const *p) { return 0 == (reinterpret_cast(p) & 0x3); } @@ -68,7 +68,7 @@ TEST(BuddyAllocator, CPUMultAlloc) { } for (auto p : ps) { - EXPECT_EQ(is_aligned(p.first, 32), true); + EXPECT_EQ(is_aligned(p.first), true); paddle::memory::Free(cpu, p.first); // Buddy Allocator doesn't manage too large memory chunk @@ -123,7 +123,7 @@ TEST(BuddyAllocator, GPUMultAlloc) { } for (auto p : ps) { - EXPECT_EQ(is_aligned(p.first, 32), true); + EXPECT_EQ(is_aligned(p.first), true); paddle::memory::Free(gpu, p.first); // Buddy Allocator doesn't manage too large memory chunk From 8da5587205a0f613ed32273226739df3e82e8d8d Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Fri, 14 Jul 2017 13:28:49 +0800 Subject: [PATCH 129/205] Init commit --- .../v2/framework/create_op_creation_methods.py | 12 ++++++++++++ python/paddle/v2/framework/tests/CMakeLists.txt | 2 +- .../framework/tests/test_op_creation_methods.py | 15 +++++++++++++++ 3 files changed, 28 insertions(+), 1 deletion(-) create mode 100644 python/paddle/v2/framework/create_op_creation_methods.py create mode 100644 python/paddle/v2/framework/tests/test_op_creation_methods.py diff --git a/python/paddle/v2/framework/create_op_creation_methods.py b/python/paddle/v2/framework/create_op_creation_methods.py new file mode 100644 index 0000000000..14beaadc9a --- /dev/null +++ b/python/paddle/v2/framework/create_op_creation_methods.py @@ -0,0 +1,12 @@ +import paddle.v2.framework.core as core +import paddle.v2.framework.proto.op_proto_pb2 as op_proto_pb2 + + +def get_all_op_protos(): + protostrs = core.get_all_op_protos() + ret_values = [] + for pbstr in protostrs: + op_proto = op_proto_pb2.OpProto() + op_proto.ParseFromString(pbstr) + ret_values.append(op_proto) + return ret_values diff --git a/python/paddle/v2/framework/tests/CMakeLists.txt b/python/paddle/v2/framework/tests/CMakeLists.txt index 7023e82b5f..86fc60f26a 100644 --- a/python/paddle/v2/framework/tests/CMakeLists.txt +++ b/python/paddle/v2/framework/tests/CMakeLists.txt @@ -1,2 +1,2 @@ add_python_test(test_framework test_protobuf.py test_scope.py - test_default_scope_funcs.py) + test_default_scope_funcs.py test_op_creation_methods.py) diff --git a/python/paddle/v2/framework/tests/test_op_creation_methods.py b/python/paddle/v2/framework/tests/test_op_creation_methods.py new file mode 100644 index 0000000000..b205e2cabb --- /dev/null +++ b/python/paddle/v2/framework/tests/test_op_creation_methods.py @@ -0,0 +1,15 @@ +import unittest +import 
paddle.v2.framework.create_op_creation_methods as creation + + +class TestOpCreationsMethods(unittest.TestCase): + def test_all_protos(self): + all_protos = creation.get_all_op_protos() + self.assertNotEqual(0, len(all_protos)) + + for each in all_protos: + self.assertTrue(each.IsInitialized()) + + +if __name__ == "__main__": + unittest.main() From 010adb99b4d79d0babea132ca8ffb2b9dc048017 Mon Sep 17 00:00:00 2001 From: hedaoyuan Date: Fri, 14 Jul 2017 13:40:49 +0800 Subject: [PATCH 130/205] Remove useless empty pointer check. --- paddle/api/ConfigParser.cpp | 12 ++---------- paddle/api/ParameterOptimizer.cpp | 12 ++---------- paddle/api/Vector.cpp | 6 +----- paddle/gserver/dataproviders/DataProvider.h | 6 ++---- paddle/math/Storage.cpp | 4 +--- paddle/trainer/TrainerConfigHelper.cpp | 6 +----- 6 files changed, 9 insertions(+), 37 deletions(-) diff --git a/paddle/api/ConfigParser.cpp b/paddle/api/ConfigParser.cpp index 2f45173bfd..b6ff6ec789 100644 --- a/paddle/api/ConfigParser.cpp +++ b/paddle/api/ConfigParser.cpp @@ -64,11 +64,7 @@ ModelConfig* TrainerConfig::getModelConfig() const { ParameterConfig::ParameterConfig() : m(new ParameterConfigPrivate()) {} -ParameterConfig::~ParameterConfig() { - if (m) { - delete m; - } -} +ParameterConfig::~ParameterConfig() { delete m; } ParameterConfig* ParameterConfig::createParameterConfigFromParameterSharedPtr( void* ptr) { @@ -98,11 +94,7 @@ void* ParameterConfig::getRawPtr() { return m->getConfigPtr(); } OptimizationConfig::OptimizationConfig() : m(new OptimizationConfigPrivate()) {} -OptimizationConfig::~OptimizationConfig() { - if (m) { - delete m; - } -} +OptimizationConfig::~OptimizationConfig() { delete m; } std::string OptimizationConfig::toProtoString() { return m->getConfig().SerializeAsString(); diff --git a/paddle/api/ParameterOptimizer.cpp b/paddle/api/ParameterOptimizer.cpp index 21b851dd5e..120eea3f70 100644 --- a/paddle/api/ParameterOptimizer.cpp +++ b/paddle/api/ParameterOptimizer.cpp @@ -53,11 +53,7 @@ struct ParameterTraverseCallbackPrivate { ParameterOptimizer::ParameterOptimizer() : m(new ParameterOptimizerPrivate()) {} -ParameterOptimizer::~ParameterOptimizer() { - if (m) { - delete m; - } -} +ParameterOptimizer::~ParameterOptimizer() { delete m; } ParameterOptimizer* ParameterOptimizer::create(OptimizationConfig* config) { CHECK(config != nullptr); @@ -104,11 +100,7 @@ std::vector ParameterOptimizer::getParameterTypes() const { ParameterTraverseCallback::ParameterTraverseCallback() : m(new ParameterTraverseCallbackPrivate()) {} -ParameterTraverseCallback::~ParameterTraverseCallback() { - if (m) { - delete m; - } -} +ParameterTraverseCallback::~ParameterTraverseCallback() { delete m; } void ParameterTraverseCallback::apply(const std::vector& vecs, const ParameterConfig& conf, diff --git a/paddle/api/Vector.cpp b/paddle/api/Vector.cpp index db8f005929..500bc448c9 100644 --- a/paddle/api/Vector.cpp +++ b/paddle/api/Vector.cpp @@ -171,11 +171,7 @@ struct VectorPrivate { Vector::Vector() : m(new VectorPrivate()) {} -Vector::~Vector() { - if (m) { - delete m; - } -} +Vector::~Vector() { delete m; } Vector* Vector::createZero(size_t sz, bool useGpu) { auto retVec = new Vector(); diff --git a/paddle/gserver/dataproviders/DataProvider.h b/paddle/gserver/dataproviders/DataProvider.h index 4003676217..265dbb5493 100644 --- a/paddle/gserver/dataproviders/DataProvider.h +++ b/paddle/gserver/dataproviders/DataProvider.h @@ -205,10 +205,8 @@ public: hl_destroy_event(hlEvent_); hlEvent_ = NULL; } - if (batchData_) { - delete batchData_; - 
batchData_ = NULL; - } + delete batchData_; + batchData_ = NULL; } void setDataBatch(DataBatch* batchData) { batchData_ = batchData; } diff --git a/paddle/math/Storage.cpp b/paddle/math/Storage.cpp index 7ce17a3207..4adaaef983 100644 --- a/paddle/math/Storage.cpp +++ b/paddle/math/Storage.cpp @@ -32,9 +32,7 @@ static InitFunction __init_storage_engine([]() { StorageEngine::singleton(); }, StorageEngine::StorageEngine() : cpuAllocator_(nullptr) {} StorageEngine::~StorageEngine() { - if (cpuAllocator_) { - delete cpuAllocator_; - } + delete cpuAllocator_; for (auto it : gpuAllocator_) { delete it; } diff --git a/paddle/trainer/TrainerConfigHelper.cpp b/paddle/trainer/TrainerConfigHelper.cpp index 60ac8459a1..133e2be104 100644 --- a/paddle/trainer/TrainerConfigHelper.cpp +++ b/paddle/trainer/TrainerConfigHelper.cpp @@ -62,11 +62,7 @@ TrainerConfigHelper::TrainerConfigHelper(const TrainerConfig &config) m->conf = config; } -TrainerConfigHelper::~TrainerConfigHelper() { - if (m) { - delete m; - } -} +TrainerConfigHelper::~TrainerConfigHelper() { delete m; } const TrainerConfig &TrainerConfigHelper::getConfig() const { return m->conf; } From 2462d0c5fedb783a322170ff15f828e63b612ead Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Fri, 14 Jul 2017 00:50:46 -0500 Subject: [PATCH 131/205] Let OpProto support multiple and temporary (#2860) * Let OpProto support multiple and temporary * Each input/output of Paddle's Op could be a list. Add multiple mark to OpProto. Also add a `input_format`/`output_format` attribute if that Op has multiple input or output. The format of that attribute please reference the comments in `op_proto.proto` * Add temporary mark, because some output of an Op is not used by user but used by other op for faster computation. Explicitly mark which output is temporary could let future memory/computation optimization. * Add generated field to AttrProto. * Add `AddInputs`/`AddOutputs` function * It is more readable to invoke `AddInputs` not `AddInput(multiple=true)`. --- paddle/framework/op_proto.proto | 39 +++++++++++ paddle/framework/op_registry.h | 97 +++++++++++++++++++++++++++- paddle/framework/op_registry_test.cc | 15 ++++- 3 files changed, 146 insertions(+), 5 deletions(-) diff --git a/paddle/framework/op_proto.proto b/paddle/framework/op_proto.proto index 22df6f9c6b..596b8588e7 100644 --- a/paddle/framework/op_proto.proto +++ b/paddle/framework/op_proto.proto @@ -34,6 +34,11 @@ message AttrProto { // Supported attribute comments. It helps 3rd-party language generate doc-string. required string comment = 3; + + // If that attribute is generated, it means the Paddle third language + // binding has responsibility to fill that attribute. End-User should + // not set that attribute. + optional bool generated = 4 [default=false]; } // Input or output message for 3rd-party language binding. @@ -45,6 +50,40 @@ message VarProto { // The comment for that input. It helps 3rd-party language generate doc-string. required string comment = 2; + + // Is that input/output could be a list or not. + // If so, that Op should write a attributed named `input_format` or + // `output_format`. + // + // e.g. + // If the op is a fc op, the inputs are `X`, `W`, `b`. The `X` and `W` + // could be multiple, so the multiple of `X` and `W` is True, and OpDesc + // will hold a attribute of them. 
+ // + // The Op desc of same fc could be + // { + // "type": "fc", + // "input": ["X1", "X2", "W1", "W2", "b"], + // "output": "fc.out", + // "attrs" : { + // "input_format": [0, 2, 4, 5] + // } + // } + // + optional bool multiple = 3 [default=false]; + + // It marks that output is a temporary output. That output is not used by + // user, but used by other op internally as input. If other op is not use + // that output, it could be optimized early. + // + // Attribute temporary_index will be set in OpDesc if there is some + // outputs are temporary. + // + // output = [ "xxx.out1", "xxx.tmp", "xxx.out2"], + // attrs = { + // "temporary_index": [1] + // } + optional bool temporary = 4 [default=false]; } // Op protocol message for 3rd-party language binding. diff --git a/paddle/framework/op_registry.h b/paddle/framework/op_registry.h index 61dfcb7049..d049599a2f 100644 --- a/paddle/framework/op_registry.h +++ b/paddle/framework/op_registry.h @@ -2,6 +2,8 @@ #include #include +#include +#include #include "paddle/framework/attr_checker.h" #include "paddle/framework/op_desc.pb.h" #include "paddle/framework/op_proto.pb.h" @@ -59,25 +61,52 @@ class OpProtoAndCheckerMaker { OpProtoAndCheckerMaker(OpProto* proto, OpAttrChecker* op_checker) : proto_(proto), op_checker_(op_checker) {} + ~OpProtoAndCheckerMaker() { CheckNoDuplicatedAttrs(); } + protected: - void AddInput(const std::string& name, const std::string& comment) { + void AddInput(const std::string& name, const std::string& comment, + bool multiple = false) { auto input = proto_->mutable_inputs()->Add(); *input->mutable_name() = name; *input->mutable_comment() = comment; + input->set_multiple(multiple); + if (multiple) { + SetHasMultipleInput(); + } + } + + void AddInputs(const std::string& name, const std::string& comment) { + AddInput(name, comment, true); } - void AddOutput(const std::string& name, const std::string& comment) { + void AddOutput(const std::string& name, const std::string& comment, + bool temporary = false, bool multiple = false) { auto output = proto_->mutable_outputs()->Add(); *output->mutable_name() = name; *output->mutable_comment() = comment; + output->set_multiple(multiple); + if (multiple) { + SetHasMultipleOutput(); + } + output->set_temporary(temporary); + if (temporary) { + SetHasTemporaryOutput(); + } + } + + void AddOutputs(const std::string& name, const std::string& comment, + bool temporary = false) { + AddOutput(name, comment, temporary, true); } template TypedAttrChecker& AddAttr(const std::string& name, - const std::string& comment) { + const std::string& comment, + bool generated = false) { auto attr = proto_->mutable_attrs()->Add(); *attr->mutable_name() = name; *attr->mutable_comment() = comment; + attr->set_generated(generated); AttrTypeHelper::SetAttrType(attr); return op_checker_->AddAttrChecker(name); } @@ -86,8 +115,70 @@ class OpProtoAndCheckerMaker { *(proto_->mutable_comment()) = comment; } + private: + void SetHasMultiple(const std::string& in_out, bool* flag) { + if (!*flag) { + AddAttr>(in_out + "_format", + "The multiple index of " + in_out + + "\n" + R"DOC( +This attribute is used by Paddle core framework. Paddle's Op support each input +or output could be a list of variable. This attribute is used to show how that +list organized. + +e.g. + input = ["a", "b", "c", "d", "e", "f"] + input_format = [0, 4, 5, 6] + +means + The number of all input variables this op is six, and they are segmented into + three inputs. + + The first input is input[0:4], second is input[4:5], third is input[5:6]. 
+)DOC", + /*generated*/ true); + *flag = true; + } + } + + void SetHasMultipleInput() { SetHasMultiple("input", &has_multiple_input_); } + void SetHasMultipleOutput() { + SetHasMultiple("output", &has_multiple_output_); + } + + void SetHasTemporaryOutput() { + if (!has_temporary_output_) { + AddAttr>("temporary_index", + R"DOC(The temporary index of output. + +Not all output of Paddle Op is used by user. For faster computation, each op +could output some its internal state to other op, other op could take that +output to make compute faster. + +Add a mark to which output is temporary is helpful for future optimization. +)DOC", + /*generated*/ true) + .SetDefault(std::vector()); + has_temporary_output_ = true; + } + } + + void CheckNoDuplicatedAttrs() { + std::unordered_set names; + size_t cnt = 0; + for (auto& attr : proto_->attrs()) { + names.insert(attr.name()); + ++cnt; + } + PADDLE_ENFORCE(names.size() == cnt, + "Cannot register two attribute in same name!"); + } + OpProto* proto_; OpAttrChecker* op_checker_; + bool has_multiple_input_{false}; + bool has_multiple_output_{false}; + bool has_temporary_output_{false}; }; class OpRegistry { diff --git a/paddle/framework/op_registry_test.cc b/paddle/framework/op_registry_test.cc index 9bcc0407ad..1adafa3714 100644 --- a/paddle/framework/op_registry_test.cc +++ b/paddle/framework/op_registry_test.cc @@ -36,8 +36,9 @@ class MyTestOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker { public: MyTestOpProtoAndCheckerMaker(OpProto* proto, OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { - AddInput("input", "input of cosine op"); - AddOutput("output", "output of cosine op"); + AddInputs("input", "input of cosine op"); + AddOutput("output", "output of cosine op", + /*temporary*/ true); auto my_checker = [](int i) { PADDLE_ENFORCE(i % 2 == 0, "'test_attr' must be even!"); }; @@ -117,11 +118,20 @@ TEST(OpRegistry, DefaultValue) { ASSERT_EQ(op->GetAttr("scale"), 1.0); } +static void SetInputFormat(paddle::framework::OpDesc* desc) { + auto attr = desc->add_attrs(); + attr->set_name("input_format"); + attr->set_type(paddle::framework::INTS); + attr->mutable_ints()->Add(0); + attr->mutable_ints()->Add(1); +} + TEST(OpRegistry, CustomChecker) { paddle::framework::OpDesc op_desc; op_desc.set_type("my_test_op"); op_desc.add_inputs("ii"); op_desc.add_outputs("oo"); + SetInputFormat(&op_desc); // attr 'test_attr' is not set bool caught = false; @@ -163,6 +173,7 @@ TEST(OpRegistry, CustomChecker) { attr->set_name("test_attr"); attr->set_type(paddle::framework::AttrType::INT); attr->set_i(4); + SetInputFormat(&op_desc); paddle::framework::OperatorBase* op = paddle::framework::OpRegistry::CreateOp(op_desc); paddle::platform::CPUDeviceContext dev_ctx; From 58f3de95cf34d8c826221781e8a8dbea954e7069 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Fri, 14 Jul 2017 14:56:49 +0800 Subject: [PATCH 132/205] Optimize ptr (#2851) * use OperatorPtr = std::shared_ptr; * use ScopePtr = std::share_ptr; --- paddle/framework/net.cc | 4 +- paddle/framework/net.h | 13 +++--- paddle/framework/op_registry.h | 4 +- paddle/framework/op_registry_test.cc | 20 +++++----- paddle/framework/operator.h | 12 +++--- paddle/framework/operator_test.cc | 59 +++++++++++++++++++++++----- paddle/framework/scope.h | 7 +++- 7 files changed, 82 insertions(+), 37 deletions(-) diff --git a/paddle/framework/net.cc b/paddle/framework/net.cc index 73b3051235..854ad8e33e 100644 --- a/paddle/framework/net.cc +++ b/paddle/framework/net.cc @@ -5,13 +5,13 @@ namespace framework { 
PlainNet::PlainNet(const NetDesc& def) {} -void PlainNet::InferShape(Scope* scope) { +void PlainNet::InferShape(const ScopePtr& scope) const { for (auto& op : ops_) { op.InferShape(); } } -void PlainNet::Run(std::shared_ptr scope, DeviceContext* ctx) { +void PlainNet::Run(const ScopePtr& scope, const DeviceContext& ctx) const { for (auto& op : ops_) { op.Run(ctx); } diff --git a/paddle/framework/net.h b/paddle/framework/net.h index 76992e0728..0481d8f47c 100644 --- a/paddle/framework/net.h +++ b/paddle/framework/net.h @@ -37,8 +37,8 @@ struct OpAttrs {}; class Operator { public: Operator(const OpDesc &def) {} - void InferShape() {} - void Run(DeviceContext *ctx) {} + void InferShape() const {} + void Run(const DeviceContext &ctx) const {} }; /** @@ -60,7 +60,7 @@ class Net { /** * @brief Infer shapes of all inputs and outputs of operators. */ - virtual void InferShape(Scope *scope) = 0; + virtual void InferShape(const ScopePtr &scope) const = 0; /** * @brief Run the network. * @@ -69,7 +69,7 @@ class Net { * environment for ops. `begin` and `end` specify the scope of `ops_` to run, * If no positive indexes are provided, all operators in `ops_` will run. */ - virtual void Run(std::shared_ptr scope, DeviceContext *ctx) = 0; + virtual void Run(const ScopePtr &scope, const DeviceContext &ctx) const = 0; /** * @brief Add an Operator according to `def`. @@ -114,7 +114,7 @@ class PlainNet : public Net { * Infer all the operators' input and output varialbes' shapes, will be called * before every mini-batch */ - virtual void InferShape(Scope *scope) override; + virtual void InferShape(const ScopePtr &scope) const override; /** * @brief Run the network. @@ -123,7 +123,8 @@ class PlainNet : public Net { * scope will be used instead. If no OpContext is provicded, default context * will be used. */ - virtual void Run(std::shared_ptr scope, DeviceContext *ctx) override; + virtual void Run(const ScopePtr &scope, + const DeviceContext &ctx) const override; /** * @brief Add an operator to this network. 
diff --git a/paddle/framework/op_registry.h b/paddle/framework/op_registry.h index d049599a2f..6be6ae15c2 100644 --- a/paddle/framework/op_registry.h +++ b/paddle/framework/op_registry.h @@ -198,9 +198,9 @@ class OpRegistry { op_type, op_proto.InitializationErrorString()); } - static OperatorBase* CreateOp(const OpDesc& op_desc) { + static OperatorPtr CreateOp(const OpDesc& op_desc) { std::string op_type = op_desc.type(); - OperatorBase* op = creators().at(op_type)(); + OperatorPtr op(creators().at(op_type)()); op->desc_ = op_desc; op->inputs_.reserve((size_t)op_desc.inputs_size()); std::copy(op_desc.inputs().begin(), op_desc.inputs().end(), diff --git a/paddle/framework/op_registry_test.cc b/paddle/framework/op_registry_test.cc index 1adafa3714..4791d4aaab 100644 --- a/paddle/framework/op_registry_test.cc +++ b/paddle/framework/op_registry_test.cc @@ -5,9 +5,9 @@ namespace paddle { namespace framework { class CosineOp : public OperatorBase { public: - void Run(const std::shared_ptr& scope, + void Run(const ScopePtr& scope, const platform::DeviceContext& dev_ctx) const override {} - void InferShape(const std::shared_ptr& scope) const override {} + void InferShape(const ScopePtr& scope) const override {} }; class CosineOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker { @@ -25,8 +25,8 @@ class CosineOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker { class MyTestOp : public OperatorBase { public: - void InferShape(const std::shared_ptr& scope) const override {} - void Run(const std::shared_ptr& scope, + void InferShape(const ScopePtr& scope) const override {} + void Run(const ScopePtr& scope, const platform::DeviceContext& dev_ctx) const override {} public: @@ -67,7 +67,7 @@ TEST(OpRegistry, CreateOp) { attr->set_type(paddle::framework::AttrType::FLOAT); attr->set_f(scale); - paddle::framework::OperatorBase* op = + paddle::framework::OperatorPtr op = paddle::framework::OpRegistry::CreateOp(op_desc); auto scope = std::make_shared(); paddle::platform::CPUDeviceContext dev_ctx; @@ -89,7 +89,7 @@ TEST(OpRegistry, IllegalAttr) { bool caught = false; try { - paddle::framework::OperatorBase* op __attribute__((unused)) = + paddle::framework::OperatorPtr op __attribute__((unused)) = paddle::framework::OpRegistry::CreateOp(op_desc); } catch (paddle::framework::EnforceNotMet err) { caught = true; @@ -110,7 +110,7 @@ TEST(OpRegistry, DefaultValue) { ASSERT_TRUE(op_desc.IsInitialized()); - paddle::framework::OperatorBase* op = + paddle::framework::OperatorPtr op = paddle::framework::OpRegistry::CreateOp(op_desc); auto scope = std::make_shared(); paddle::platform::CPUDeviceContext dev_ctx; @@ -136,7 +136,7 @@ TEST(OpRegistry, CustomChecker) { // attr 'test_attr' is not set bool caught = false; try { - paddle::framework::OperatorBase* op __attribute__((unused)) = + paddle::framework::OperatorPtr op __attribute__((unused)) = paddle::framework::OpRegistry::CreateOp(op_desc); } catch (paddle::framework::EnforceNotMet err) { caught = true; @@ -155,7 +155,7 @@ TEST(OpRegistry, CustomChecker) { attr->set_i(3); caught = false; try { - paddle::framework::OperatorBase* op __attribute__((unused)) = + paddle::framework::OperatorPtr op __attribute__((unused)) = paddle::framework::OpRegistry::CreateOp(op_desc); } catch (paddle::framework::EnforceNotMet err) { caught = true; @@ -174,7 +174,7 @@ TEST(OpRegistry, CustomChecker) { attr->set_type(paddle::framework::AttrType::INT); attr->set_i(4); SetInputFormat(&op_desc); - paddle::framework::OperatorBase* op = + paddle::framework::OperatorPtr op = 
paddle::framework::OpRegistry::CreateOp(op_desc); paddle::platform::CPUDeviceContext dev_ctx; auto scope = std::make_shared(); diff --git a/paddle/framework/operator.h b/paddle/framework/operator.h index d3c55e0ceb..cf79f379fa 100644 --- a/paddle/framework/operator.h +++ b/paddle/framework/operator.h @@ -30,7 +30,7 @@ namespace paddle { namespace framework { class OperatorBase; - +using OperatorPtr = std::shared_ptr; /** * OperatorBase has the basic element that Net will call to do computation. * Only CreateOperator from OpRegistry will new Operator directly. User @@ -56,10 +56,10 @@ class OperatorBase { /// InferShape infer the size of Variables used by this Operator with /// information inside scope - virtual void InferShape(const std::shared_ptr& scope) const = 0; + virtual void InferShape(const ScopePtr& scope) const = 0; /// Net will call this function to Run an op. - virtual void Run(const std::shared_ptr& scope, + virtual void Run(const ScopePtr& scope, const platform::DeviceContext& dev_ctx) const = 0; protected: @@ -82,7 +82,7 @@ class OpKernel { */ class KernelContext { public: - KernelContext(const OperatorBase* op, const std::shared_ptr& scope, + KernelContext(const OperatorBase* op, const ScopePtr& scope, const platform::DeviceContext& device_context) : op_(*op), scope_(scope), device_context_(device_context) {} @@ -95,7 +95,7 @@ class OpKernel { } const OperatorBase& op_; - const std::shared_ptr& scope_; + const ScopePtr& scope_; const platform::DeviceContext& device_context_; }; @@ -140,7 +140,7 @@ class OperatorWithKernel : public OperatorBase { using OpKernelMap = std::unordered_map, OpKernelHash>; - void Run(const std::shared_ptr& scope, + void Run(const ScopePtr& scope, const platform::DeviceContext& dev_ctx) const final { auto& opKernel = AllOpKernels().at(Type()).at(OpKernelKey(dev_ctx)); opKernel->Compute(OpKernel::KernelContext(this, scope, dev_ctx)); diff --git a/paddle/framework/operator_test.cc b/paddle/framework/operator_test.cc index 204b601a4a..d0c3153fae 100644 --- a/paddle/framework/operator_test.cc +++ b/paddle/framework/operator_test.cc @@ -22,8 +22,8 @@ namespace framework { class OperatorTest : public OperatorBase { public: void Init() override { x = 1; } - void InferShape(const std::shared_ptr& scope) const override {} - void Run(const std::shared_ptr& scope, + void InferShape(const ScopePtr& scope) const override {} + void Run(const ScopePtr& scope, const platform::DeviceContext& dev_ctx) const override { float scale = GetAttr("scale"); ASSERT_NEAR(scale, 3.14, 1e-5); @@ -36,6 +36,50 @@ class OperatorTest : public OperatorBase { float x = 0; }; +class OperatorTestProtoAndCheckerMaker : public OpProtoAndCheckerMaker { + public: + OperatorTestProtoAndCheckerMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("input", "input of test op"); + AddOutput("output", "output of test op"); + AddAttr("scale", "scale of cosine op") + .SetDefault(1.0) + .LargerThan(0.0); + AddComment("This is test op"); + } +}; + +} // namespace framework +} // namespace paddle + +REGISTER_OP(test_operator, paddle::framework::OperatorTest, + paddle::framework::OperatorTestProtoAndCheckerMaker); + +TEST(OperatorBase, all) { + paddle::framework::OpDesc op_desc; + op_desc.set_type("test_operator"); + *op_desc.mutable_inputs()->Add() = "IN1"; + *op_desc.mutable_outputs()->Add() = "OUT1"; + auto attr = op_desc.mutable_attrs()->Add(); + attr->set_name("scale"); + attr->set_type(paddle::framework::AttrType::FLOAT); + float scale = 
3.14; + attr->set_f(scale); + + paddle::platform::CPUDeviceContext device_context; + auto scope = std::make_shared(); + + paddle::framework::OperatorPtr op = + paddle::framework::OpRegistry::CreateOp(op_desc); + ASSERT_EQ(op->GetAttr("scale"), scale); + scope->CreateVariable("OUT1"); + op->Run(scope, device_context); + std::cout << op->DebugString() << std::endl; +} + +namespace paddle { +namespace framework { + class OpKernelTestProtoAndCheckerMaker : public OpProtoAndCheckerMaker { public: OpKernelTestProtoAndCheckerMaker(OpProto* proto, OpAttrChecker* op_checker) @@ -73,9 +117,7 @@ REGISTER_OP(op_with_kernel, paddle::framework::OpWithKernelTest, REGISTER_OP_CPU_KERNEL(op_with_kernel, paddle::framework::CPUKernelTest); TEST(OpKernel, all) { - using namespace paddle::framework; - - OpDesc op_desc; + paddle::framework::OpDesc op_desc; op_desc.set_type("op_with_kernel"); *op_desc.mutable_inputs()->Add() = "IN1"; *op_desc.mutable_outputs()->Add() = "OUT1"; @@ -85,10 +127,9 @@ TEST(OpKernel, all) { attr->set_f(3.14); paddle::platform::CPUDeviceContext cpu_device_context; - auto scope = std::make_shared(); + auto scope = std::make_shared(); - OperatorBase* op = paddle::framework::OpRegistry::CreateOp(op_desc); + paddle::framework::OperatorPtr op = + paddle::framework::OpRegistry::CreateOp(op_desc); op->Run(scope, cpu_device_context); - - delete op; } diff --git a/paddle/framework/scope.h b/paddle/framework/scope.h index a4470f726f..ec62c9189f 100644 --- a/paddle/framework/scope.h +++ b/paddle/framework/scope.h @@ -23,6 +23,9 @@ limitations under the License. */ namespace paddle { namespace framework { +class Scope; +using ScopePtr = std::shared_ptr; + /** * @brief Scope that manage all variables. * @@ -41,7 +44,7 @@ class Scope { /** * @brief Initialize a Scope with parent. */ - explicit Scope(const std::shared_ptr& parent) : parent_(parent) {} + explicit Scope(const ScopePtr& parent) : parent_(parent) {} /** * @brief Create Variable @@ -88,7 +91,7 @@ class Scope { private: std::unordered_map> vars_; - std::shared_ptr parent_{nullptr}; + ScopePtr parent_{nullptr}; }; } // namespace framework From a76f7ed2eb02c9beb5e7b16ff7fede0f13477df8 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Fri, 14 Jul 2017 15:04:44 +0800 Subject: [PATCH 133/205] Get OpProtos in Python * PyBind and SWIG of paddle cannot be loaded in a single Python process, so lazily import all SWIG libraries of Paddle. Otherwise, glog and gflags are imported twice in the same Python process. * Note that PyBind11 returns every C++ std::string as a unicode object. For protobuf, they need to be cast to `str` before use. * Add a unit test for getting `OpProtos` --- paddle/pybind/pybind.cc | 7 ++++++- python/paddle/v2/__init__.py | 4 +--- python/paddle/v2/data_feeder.py | 1 - python/paddle/v2/event.py | 3 +-- .../paddle/v2/framework/create_op_creation_methods.py | 3 +-- python/paddle/v2/inference.py | 4 ++-- python/paddle/v2/optimizer.py | 5 +++-- python/paddle/v2/parameters.py | 5 +++-- python/paddle/v2/trainer.py | 11 +++++------ python/setup.py.in | 3 ++- 10 files changed, 24 insertions(+), 22 deletions(-) diff --git a/paddle/pybind/pybind.cc b/paddle/pybind/pybind.cc index 6a1e9291cb..c1a025ed04 100644 --- a/paddle/pybind/pybind.cc +++ b/paddle/pybind/pybind.cc @@ -16,6 +16,8 @@ limitations under the License. */ #include #include #include +#include +#include #include namespace py = pybind11; @@ -47,11 +49,14 @@ All parameter, weight, gradient are variables in Paddle. &pd::Scope::CreateVariable, py::return_value_policy::reference); + //! @note: Be careful!
PyBind will return std::string as an unicode, not + //! Python str. If you want a str object, you should cast them in Python. m.def("get_all_op_protos", []() -> std::vector { auto& protos = pd::OpRegistry::protos(); std::vector ret_values; - ret_values.reserve(protos.size()); for (auto it = protos.begin(); it != protos.end(); ++it) { + PADDLE_ENFORCE(it->second.IsInitialized(), + "OpProto must all be initialized"); ret_values.emplace_back(); PADDLE_ENFORCE(it->second.SerializeToString(&ret_values.back()), "Serialize OpProto Error. This could be a bug of Paddle."); diff --git a/python/paddle/v2/__init__.py b/python/paddle/v2/__init__.py index 3ba5c31871..3c75ca4c3a 100644 --- a/python/paddle/v2/__init__.py +++ b/python/paddle/v2/__init__.py @@ -20,7 +20,6 @@ import trainer import event import data_type import topology -import data_feeder import networks import evaluator from . import dataset @@ -31,7 +30,6 @@ import op import pooling import inference import networks -import py_paddle.swig_paddle as api import minibatch import plot import image @@ -47,7 +45,6 @@ __all__ = [ 'data_type', 'attr', 'pooling', - 'data_feeder', 'dataset', 'reader', 'topology', @@ -61,6 +58,7 @@ __all__ = [ def init(**kwargs): + import py_paddle.swig_paddle as api args = [] args_dict = {} # NOTE: append arguments if they are in ENV diff --git a/python/paddle/v2/data_feeder.py b/python/paddle/v2/data_feeder.py index 2698251b9e..98dfb85a0e 100644 --- a/python/paddle/v2/data_feeder.py +++ b/python/paddle/v2/data_feeder.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - from py_paddle import DataProviderConverter import collections import paddle.trainer.PyDataProvider2 as pydp2 diff --git a/python/paddle/v2/event.py b/python/paddle/v2/event.py index fd6050fa33..7589cc9917 100644 --- a/python/paddle/v2/event.py +++ b/python/paddle/v2/event.py @@ -9,8 +9,6 @@ There are: * BeginPass * EndPass """ -import py_paddle.swig_paddle as api - __all__ = [ 'EndIteration', 'BeginIteration', 'BeginPass', 'EndPass', 'TestResult' ] @@ -18,6 +16,7 @@ __all__ = [ class WithMetric(object): def __init__(self, evaluator): + import py_paddle.swig_paddle as api if not isinstance(evaluator, api.Evaluator): raise TypeError("Evaluator should be api.Evaluator type") self.__evaluator__ = evaluator diff --git a/python/paddle/v2/framework/create_op_creation_methods.py b/python/paddle/v2/framework/create_op_creation_methods.py index 14beaadc9a..2fcdfead25 100644 --- a/python/paddle/v2/framework/create_op_creation_methods.py +++ b/python/paddle/v2/framework/create_op_creation_methods.py @@ -6,7 +6,6 @@ def get_all_op_protos(): protostrs = core.get_all_op_protos() ret_values = [] for pbstr in protostrs: - op_proto = op_proto_pb2.OpProto() - op_proto.ParseFromString(pbstr) + op_proto = op_proto_pb2.OpProto.FromString(str(pbstr)) ret_values.append(op_proto) return ret_values diff --git a/python/paddle/v2/inference.py b/python/paddle/v2/inference.py index 34b7308601..40134a3270 100644 --- a/python/paddle/v2/inference.py +++ b/python/paddle/v2/inference.py @@ -1,9 +1,7 @@ import numpy -import py_paddle.swig_paddle as api import collections import topology import minibatch -from data_feeder import DataFeeder __all__ = ['infer', 'Inference'] @@ -28,6 +26,7 @@ class Inference(object): """ def __init__(self, output_layer, parameters): + import py_paddle.swig_paddle as api topo = topology.Topology(output_layer) gm 
= api.GradientMachine.createFromConfigProto( topo.proto(), api.CREATE_MODE_TESTING, [api.PARAMETER_VALUE]) @@ -40,6 +39,7 @@ class Inference(object): self.__data_types__ = topo.data_type() def iter_infer(self, input, feeding=None): + from data_feeder import DataFeeder feeder = DataFeeder(self.__data_types__, feeding) batch_size = len(input) diff --git a/python/paddle/v2/optimizer.py b/python/paddle/v2/optimizer.py index 390c22ee55..3dec340cfb 100644 --- a/python/paddle/v2/optimizer.py +++ b/python/paddle/v2/optimizer.py @@ -1,5 +1,3 @@ -import py_paddle.swig_paddle as swig_api - import paddle.trainer_config_helpers.config_parser_utils as config_parser_utils import paddle.trainer_config_helpers.optimizers as v1_optimizers """ @@ -26,6 +24,8 @@ class Optimizer(object): self.__opt_conf_proto__ = config_parser_utils.parse_optimizer_config( __impl__) + if swig_api is None: + raise RuntimeError("paddle.v2 currently need swig_paddle") self.__opt_conf__ = swig_api.OptimizationConfig.createFromProto( self.__opt_conf_proto__) @@ -268,6 +268,7 @@ ModelAverage = v1_optimizers.ModelAverage L2Regularization = v1_optimizers.L2Regularization if __name__ == '__main__': + import py_paddle.swig_paddle as swig_api swig_api.initPaddle('--use_gpu=false') for opt in [ Momentum(), Adam(), Adamax(), AdaGrad(), DecayedAdaGrad(), diff --git a/python/paddle/v2/parameters.py b/python/paddle/v2/parameters.py index bbaf8bfa97..a9cba8ca0b 100644 --- a/python/paddle/v2/parameters.py +++ b/python/paddle/v2/parameters.py @@ -1,5 +1,4 @@ import numpy as np -import py_paddle.swig_paddle as api from paddle.proto.ParameterConfig_pb2 import ParameterConfig import paddle.trainer.config_parser as cp import struct @@ -124,6 +123,7 @@ class Parameters(object): :return: parameter value :rtype: np.ndarray """ + import py_paddle.swig_paddle as api shape = self.get_shape(key) if len(self.__gradient_machines__) == 0: @@ -223,7 +223,7 @@ class Parameters(object): :type gradient_machine: api.GradientMachine :return: """ - + import py_paddle.swig_paddle as api if not isinstance(gradient_machine, api.GradientMachine): raise ValueError("gradient_machine should be api.GradientMachine") @@ -359,6 +359,7 @@ def __copy_parameter_to_gradient_machine__(gradient_machine, name, arr): :return: :rtype: api.Parameter """ + import py_paddle.swig_paddle as api param = __get_parameter_in_gradient_machine__(gradient_machine, name) vec = param.getBuf(api.PARAMETER_VALUE) assert isinstance(vec, api.Vector) diff --git a/python/paddle/v2/trainer.py b/python/paddle/v2/trainer.py index 96c6c4b89a..92fdf98e90 100644 --- a/python/paddle/v2/trainer.py +++ b/python/paddle/v2/trainer.py @@ -2,12 +2,6 @@ Module Trainer """ import collections -import gzip -import os - -import py_paddle.swig_paddle as api - -from data_feeder import DataFeeder from topology import Topology from . import event as v2_event from . 
import optimizer as v2_optimizer @@ -59,6 +53,7 @@ class SGD(object): if not isinstance(update_equation, v2_optimizer.Optimizer): raise TypeError("update equation parameter must be " "paddle.v2.optimizer.Optimizer") + import py_paddle.swig_paddle as api topology = Topology(cost, extra_layers=extra_layers) self.__optimizer__ = update_equation self.__topology__ = topology @@ -124,6 +119,8 @@ class SGD(object): :type feeding: dict|list :return: """ + import py_paddle.swig_paddle as api + from data_feeder import DataFeeder if event_handler is None: event_handler = default_event_handler __check_train_args__(**locals()) @@ -187,6 +184,8 @@ class SGD(object): :type feeding: dict :return: """ + import py_paddle.swig_paddle as api + from data_feeder import DataFeeder feeder = DataFeeder(self.__data_types__, feeding) evaluator = self.__gradient_machine__.makeEvaluator() out_args = api.Arguments.createArguments(0) diff --git a/python/setup.py.in b/python/setup.py.in index 271ee6e552..b1041f6102 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -19,7 +19,8 @@ setup_requires=["requests", "recordio", "matplotlib", "rarfile", - "scipy>=0.19.0"] + "scipy>=0.19.0", + "nltk"] if '${CMAKE_SYSTEM_PROCESSOR}' not in ['arm', 'armv7-a', 'aarch64']: setup_requires+=["opencv-python"] From ac69f7730a1013842836b534036aa5a953db0001 Mon Sep 17 00:00:00 2001 From: caoying03 Date: Fri, 14 Jul 2017 11:05:56 +0800 Subject: [PATCH 134/205] add gated linear unit into config helper. --- doc/api/v2/config/layer.rst | 5 + .../paddle/trainer_config_helpers/layers.py | 96 +++++++++++++++- .../tests/configs/file_list.sh | 2 +- .../protostr/test_gated_unit_layer.protostr | 106 ++++++++++++++++++ .../tests/configs/test_gated_unit_layer.py | 16 +++ 5 files changed, 223 insertions(+), 2 deletions(-) create mode 100644 python/paddle/trainer_config_helpers/tests/configs/protostr/test_gated_unit_layer.protostr create mode 100644 python/paddle/trainer_config_helpers/tests/configs/test_gated_unit_layer.py diff --git a/doc/api/v2/config/layer.rst b/doc/api/v2/config/layer.rst index 4f4a9187bc..daee55b7f9 100644 --- a/doc/api/v2/config/layer.rst +++ b/doc/api/v2/config/layer.rst @@ -474,6 +474,11 @@ prelu .. autoclass:: paddle.v2.layer.prelu :noindex: +gated_unit +----------- +.. autoclass:: paddle.v2.layer.gated_unit + :noindex: + Detection output Layer ====================== diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py index b0524a507b..f0ee46262d 100755 --- a/python/paddle/trainer_config_helpers/layers.py +++ b/python/paddle/trainer_config_helpers/layers.py @@ -126,6 +126,7 @@ __all__ = [ 'row_conv_layer', 'dropout_layer', 'prelu_layer', + 'gated_unit_layer', ] @@ -5862,7 +5863,7 @@ def prelu_layer(input, :rtype: LayerOutput """ - assert isinstance(input, LayerOutput), 'prelu_layer only accepts one input' + assert isinstance(input, LayerOutput), 'prelu_layer accepts only one input.' assert isinstance(param_attr, ParameterAttribute) l = Layer( @@ -5876,3 +5877,96 @@ def prelu_layer(input, layer_type=LayerType.PRELU, parents=input, size=l.config.size) + + +@layer_support(ERROR_CLIPPING, DROPOUT) +@wrap_name_default() +@wrap_act_default(act=LinearActivation()) +def gated_unit_layer(input, + size, + act=None, + name=None, + gate_attr=None, + gate_bias_attr=True, + gate_param_attr=None, + inproj_param_attr=None, + inproj_bias_attr=True, + inproj_layer_attr=None, + layer_attr=None): + """ + The gated unit layer implements a simple gating mechanism over the input. 
+ The input :math:`X` is first projected into a new space :math:`X'`, and + it is also used to produce a gate weight :math:`\sigma`. Element-wise + product between :math:`X'` and :math:`\sigma` is finally returned. + + Reference: + Language Modeling with Gated Convolutional Networks + https://arxiv.org/abs/1612.08083 + + .. math:: + y=\\text{act}(X \cdot W + b)\otimes \sigma(X \cdot V + c) + + The example usage is: + + .. code-block:: python + gated_unit = gated_unit_layer(size=128, input=input_layer) + + :param input: input for this layer. + :type input: LayerOutput + :param size: output size of the gated unit. + :type size: int + :param act: activation type of the projected input. + :type act: BaseActivation + :param name: name of this layer. + :type name: basestring + :param gate_attr: Attributes to tune the gate output, for example, error + clipping threshold, dropout and so on. See ExtraLayerAttribute for + more details. + :type gate_attr: ExtraLayerAttribute|None + :param gate_bias_attr: Attributes to tune the learnable bias of the gate. + :type gate_bias_attr: ParameterAttribute|None + :param gate_param_attr: Attributes to tune the learnable projected matrix + parameter of the gate. + :type gate_param_attr: ParameterAttribute|None + :param inproj_param_attr: Attributes to tune the learnable parameter of + the projection of input. + :type inproj_param_attr: ParameterAttribute|None + :param inproj_layer_attr: Attributes to the tune the projected input, for + example, error clipping threshold, dropout and so on. See + ExtraLayerAttribute for more details. + :type inproj_layer_attr: ExtraLayerAttribute|None + :param inproj_bias_attr: Attributes to tune the learnable bias of + projection of the input. + :type inproj_bias_attr: ParameterAttribute|None + :param layer_attr: Attributes to tune the final output of the gated unit, + for example, error clipping threshold, dropout and so on. See + ExtraLayerAttribute for more details. + :type layer_attr: ExtraLayerAttribute|None + :return: LayerOutput object. + :rtype: LayerOutput + """ + + assert isinstance( + input, LayerOutput), 'The gated linear unit accepts only one input.'
+ + input_proj = fc_layer( + input=input, + name="%s_input_proj" % name, + size=size, + act=act, + param_attr=inproj_param_attr, + layer_attr=inproj_layer_attr, + bias_attr=inproj_bias_attr) + + gate = fc_layer( + size=size, + name="%s_gate" % name, + act=SigmoidActivation(), + input=input, + param_attr=gate_param_attr, + layer_attr=gate_attr, + bias_attr=gate_bias_attr) + return mixed_layer( + name="%s_gated_act" % name, + input=dotmul_operator(input_proj, gate), + layer_attr=layer_attr) diff --git a/python/paddle/trainer_config_helpers/tests/configs/file_list.sh b/python/paddle/trainer_config_helpers/tests/configs/file_list.sh index 70e342fb79..cdf9b2eab7 100755 --- a/python/paddle/trainer_config_helpers/tests/configs/file_list.sh +++ b/python/paddle/trainer_config_helpers/tests/configs/file_list.sh @@ -7,6 +7,6 @@ test_rnn_group shared_fc shared_lstm shared_gru test_cost_layers_with_weight test_spp_layer test_bilinear_interp test_maxout test_bi_grumemory math_ops test_seq_concat_reshape test_pad test_smooth_l1 test_multiplex_layer test_prelu_layer test_row_conv test_detection_output_layer test_multibox_loss_layer -test_recursive_topology) +test_recursive_topology test_gated_unit_layer) export whole_configs=(test_split_datasource) diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_gated_unit_layer.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_gated_unit_layer.protostr new file mode 100644 index 0000000000..f1e4d894a5 --- /dev/null +++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_gated_unit_layer.protostr @@ -0,0 +1,106 @@ +type: "nn" +layers { + name: "input" + type: "data" + size: 256 + active_type: "" +} +layers { + name: "__gated_unit_layer_0___input_proj" + type: "fc" + size: 512 + active_type: "tanh" + inputs { + input_layer_name: "input" + input_parameter_name: "___gated_unit_layer_0___input_proj.w0" + } + bias_parameter_name: "___gated_unit_layer_0___input_proj.wbias" + error_clipping_threshold: 100.0 +} +layers { + name: "__gated_unit_layer_0___gate" + type: "fc" + size: 512 + active_type: "sigmoid" + inputs { + input_layer_name: "input" + input_parameter_name: "___gated_unit_layer_0___gate.w0" + } + bias_parameter_name: "___gated_unit_layer_0___gate.wbias" + error_clipping_threshold: 100.0 +} +layers { + name: "__gated_unit_layer_0___gated_act" + type: "mixed" + size: 512 + active_type: "" + inputs { + input_layer_name: "__gated_unit_layer_0___input_proj" + } + inputs { + input_layer_name: "__gated_unit_layer_0___gate" + } + error_clipping_threshold: 100.0 + operator_confs { + type: "dot_mul" + input_indices: 0 + input_indices: 1 + input_sizes: 512 + input_sizes: 512 + output_size: 512 + dotmul_scale: 1 + } +} +parameters { + name: "___gated_unit_layer_0___input_proj.w0" + size: 131072 + initial_mean: 0.0 + initial_std: 0.0001 + dims: 256 + dims: 512 + initial_strategy: 0 + initial_smart: false +} +parameters { + name: "___gated_unit_layer_0___input_proj.wbias" + size: 512 + initial_mean: 0.0 + initial_std: 1 + dims: 1 + dims: 512 + initial_strategy: 0 + initial_smart: false +} +parameters { + name: "___gated_unit_layer_0___gate.w0" + size: 131072 + initial_mean: 0.0 + initial_std: 0.0001 + dims: 256 + dims: 512 + initial_strategy: 0 + initial_smart: false +} +parameters { + name: "___gated_unit_layer_0___gate.wbias" + size: 512 + initial_mean: 0.0 + initial_std: 1 + dims: 1 + dims: 512 + initial_strategy: 0 + initial_smart: false +} +input_layer_names: "input" +output_layer_names: 
"__gated_unit_layer_0___gated_act" +sub_models { + name: "root" + layer_names: "input" + layer_names: "__gated_unit_layer_0___input_proj" + layer_names: "__gated_unit_layer_0___gate" + layer_names: "__gated_unit_layer_0___gated_act" + input_layer_names: "input" + output_layer_names: "__gated_unit_layer_0___gated_act" + is_recurrent_layer_group: false +} + diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_gated_unit_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_gated_unit_layer.py new file mode 100644 index 0000000000..83aa51bf28 --- /dev/null +++ b/python/paddle/trainer_config_helpers/tests/configs/test_gated_unit_layer.py @@ -0,0 +1,16 @@ +from paddle.trainer_config_helpers import * + +data = data_layer(name='input', size=256) +glu = gated_unit_layer( + size=512, + input=data, + act=TanhActivation(), + gate_param_attr=ParamAttr(initial_std=1e-4), + gate_attr=ExtraLayerAttribute(error_clipping_threshold=100.0), + gate_bias_attr=ParamAttr(initial_std=1), + inproj_param_attr=ParamAttr(initial_std=1e-4), + inproj_layer_attr=ExtraLayerAttribute(error_clipping_threshold=100.0), + inproj_bias_attr=ParamAttr(initial_std=1), + layer_attr=ExtraLayerAttribute(error_clipping_threshold=100.0)) + +outputs(glu) From e2fd06c386107d518ebfe315d89d5ed70e5ee780 Mon Sep 17 00:00:00 2001 From: caoying03 Date: Fri, 14 Jul 2017 16:02:44 +0800 Subject: [PATCH 135/205] refine name of the input parameter. --- .../paddle/trainer_config_helpers/layers.py | 22 +++++++++---------- .../tests/configs/test_gated_unit_layer.py | 4 ++-- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py index f0ee46262d..78aa0778f8 100755 --- a/python/paddle/trainer_config_helpers/layers.py +++ b/python/paddle/trainer_config_helpers/layers.py @@ -5879,19 +5879,19 @@ def prelu_layer(input, size=l.config.size) -@layer_support(ERROR_CLIPPING, DROPOUT) @wrap_name_default() +@layer_support(ERROR_CLIPPING, DROPOUT) @wrap_act_default(act=LinearActivation()) def gated_unit_layer(input, size, act=None, name=None, gate_attr=None, - gate_bias_attr=True, gate_param_attr=None, + gate_bias_attr=True, + inproj_attr=None, inproj_param_attr=None, inproj_bias_attr=True, - inproj_layer_attr=None, layer_attr=None): """ The gated unit layer implements a simple gating mechanism over the input. @@ -5923,18 +5923,18 @@ def gated_unit_layer(input, clipping threshold, dropout and so on. See ExtraLayerAttribute for more details. :type gate_attr: ExtraLayerAttribute|None - :param gate_bias_attr: Attributes to tune the learnable bias of the gate. - :type gate_bias_attr: ParameterAttribute|None :param gate_param_attr: Attributes to tune the learnable projected matrix parameter of the gate. :type gate_param_attr: ParameterAttribute|None + :param gate_bias_attr: Attributes to tune the learnable bias of the gate. + :type gate_bias_attr: ParameterAttribute|None + :param inproj_attr: Attributes to the tune the projected input, for + example, error clipping threshold, dropout and so on. See + ExtraLayerAttribute for more details. + :type inproj_attr: ExtraLayerAttribute|None :param inproj_param_attr: Attributes to tune the learnable parameter of the projection of input. :type inproj_param_attr: ParameterAttribute|None - :param inproj_layer_attr: Attributes to the tune the projected input, for - example, error clipping threshold, dropout and so on. See - ExtraLayerAttribute for more details. 
- :type inproj_layer_attr: ExtraLayerAttribute|None :param inproj_bias_attr: Attributes to tune the learnable bias of projection of the input. :type inproj_bias_attr: ParameterAttribute|None @@ -5954,8 +5954,8 @@ def gated_unit_layer(input, name="%s_input_proj" % name, size=size, act=act, + layer_attr=inproj_attr, param_attr=inproj_param_attr, - layer_attr=inproj_layer_attr, bias_attr=inproj_bias_attr) gate = fc_layer( @@ -5963,8 +5963,8 @@ def gated_unit_layer(input, name="%s_gate" % name, act=SigmoidActivation(), input=input, - param_attr=gate_param_attr, layer_attr=gate_attr, + param_attr=gate_param_attr, bias_attr=gate_bias_attr) return mixed_layer( name="%s_gated_act" % name, diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_gated_unit_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_gated_unit_layer.py index 83aa51bf28..9dab45519c 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/test_gated_unit_layer.py +++ b/python/paddle/trainer_config_helpers/tests/configs/test_gated_unit_layer.py @@ -5,11 +5,11 @@ glu = gated_unit_layer( size=512, input=data, act=TanhActivation(), - gate_param_attr=ParamAttr(initial_std=1e-4), gate_attr=ExtraLayerAttribute(error_clipping_threshold=100.0), + gate_param_attr=ParamAttr(initial_std=1e-4), gate_bias_attr=ParamAttr(initial_std=1), + inproj_attr=ExtraLayerAttribute(error_clipping_threshold=100.0), inproj_param_attr=ParamAttr(initial_std=1e-4), - inproj_layer_attr=ExtraLayerAttribute(error_clipping_threshold=100.0), inproj_bias_attr=ParamAttr(initial_std=1), layer_attr=ExtraLayerAttribute(error_clipping_threshold=100.0)) From bac1426d47727a9ea101dd42135a0800c2c5e023 Mon Sep 17 00:00:00 2001 From: qijun Date: Fri, 14 Jul 2017 16:57:03 +0800 Subject: [PATCH 136/205] add_op kernel implementation --- paddle/framework/operator.cc | 12 +++++++ paddle/framework/operator.h | 67 +++++++++++++++++++++++------------- paddle/framework/tensor.h | 16 ++++++++- paddle/operators/add_op.cc | 11 +++--- paddle/operators/add_op.cu | 8 +++-- paddle/operators/add_op.h | 21 +++++++---- 6 files changed, 97 insertions(+), 38 deletions(-) diff --git a/paddle/framework/operator.cc b/paddle/framework/operator.cc index 8f7adff8b3..25d120c9a9 100644 --- a/paddle/framework/operator.cc +++ b/paddle/framework/operator.cc @@ -17,6 +17,18 @@ limitations under the License. */ namespace paddle { namespace framework { +template <> +DeviceType* KernelContext::get_eigen_device() { + return device_context_.get_eigen_device(); +} + +#ifndef PADDLE_ONLY_CPU +template <> +DeviceType* KernelContext::get_eigen_device() { + return device_context_.get_eigen_device(); +} +#endif + std::string OperatorBase::DebugString() const { std::stringstream ss; ss << "=================\n"; diff --git a/paddle/framework/operator.h b/paddle/framework/operator.h index d3c55e0ceb..48cfeeb731 100644 --- a/paddle/framework/operator.h +++ b/paddle/framework/operator.h @@ -29,6 +29,21 @@ limitations under the License. */ namespace paddle { namespace framework { +template +struct EigenDeviceConverter; + +template <> +struct EigenDeviceConverter { + using EigenDeviceType = Eigen::DefaultDevice; +}; + +#ifndef PADDLE_ONLY_CPU +template <> +struct EigenDeviceConverter { + using EigenDeviceType = Eigen::GpuDevice; +}; +#endif + class OperatorBase; /** @@ -72,33 +87,39 @@ class OperatorBase { AttributeMap attrs_; }; -class OpKernel { +/** + * KernelContext is the only parameter of Kernel Run function. 
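+ * For example, a Compute body might read (an illustrative sketch; only
+ * Input, Output, and get_eigen_device are declared by this patch):
+ *   auto* in0 = context.Input(0);    // Variable holding the first input
+ *   auto* out0 = context.Output(0);  // Variable holding the first output
+ *   auto* dev = context.get_eigen_device<platform::CPUPlace>();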
+ * Run will get input/output variables, state such as momentum and + * device resource such as CUDA stream, cublas handle, etc. from + * KernelContext. User should construct it before run the Operator. + */ +class KernelContext { public: - /** - * KernelContext is the only parameter of Kernel Run function. - * Run will get input/output variables, state such as momentum and - * device resource such as CUDA stream, cublas handle, etc. from - * KernelContext. User should construct it before run the Operator. - */ - class KernelContext { - public: - KernelContext(const OperatorBase* op, const std::shared_ptr& scope, - const platform::DeviceContext& device_context) - : op_(*op), scope_(scope), device_context_(device_context) {} - - const Variable* Input(int index) const { - return scope_->GetVariable(op_.inputs_[index]); - } + KernelContext(const OperatorBase* op, const std::shared_ptr& scope, + const platform::DeviceContext& device_context) + : op_(*op), scope_(scope), device_context_(device_context) {} - Variable* Output(int index) const { - return scope_->GetVariable(op_.outputs_[index]); - } + const Variable* Input(int index) const { + return scope_->GetVariable(op_.inputs_[index]); + } - const OperatorBase& op_; - const std::shared_ptr& scope_; - const platform::DeviceContext& device_context_; - }; + Variable* Output(int index) const { + return scope_->GetVariable(op_.outputs_[index]); + } + + platform::DeviceContext& device_context() const { return device_context_; } + template ::EigenDeviceType> + DeviceType* get_eigen_device(); + + const OperatorBase& op_; + const std::shared_ptr& scope_; + const platform::DeviceContext& device_context_; +}; + +class OpKernel { + public: virtual void Compute(const KernelContext& context) const = 0; virtual ~OpKernel() {} diff --git a/paddle/framework/tensor.h b/paddle/framework/tensor.h index e14b75d0e0..01244f617c 100644 --- a/paddle/framework/tensor.h +++ b/paddle/framework/tensor.h @@ -35,7 +35,7 @@ class Tensor { template - T* data() const { + const T* data() const { PADDLE_ENFORCE( holder_ != nullptr, "Tenosr has not been initialized. 
Call Tensor::mutable_data first."); @@ -58,6 +58,20 @@ class Tensor { offset_); } + template ::value>::type* = nullptr> + T* mutable_data(paddle::platform::Place place) { + if (holder_ == nullptr || + !(holder_->Place() == + place) /* some versions of boost::variant don't have operator!= */ + || holder_->Size() < product(dims_) * sizeof(T) + offset_) { + holder_.reset(new PlaceholderImpl(place, product(dims_) * sizeof(T))); + offset_ = 0; + } + return reinterpret_cast(reinterpret_cast(holder_->Ptr()) + + offset_); + } + size_t NumElements() const { return product(dims_); } template diff --git a/paddle/operators/add_op.cc b/paddle/operators/add_op.cc index 2766f0bf25..ef39e426fd 100644 --- a/paddle/operators/add_op.cc +++ b/paddle/operators/add_op.cc @@ -1,6 +1,6 @@ -#include -#include -#include +#include "paddle/operators/add_op.h" +#include "paddle/framework/op_registry.h" +#include "paddle/framework/tensor.h" namespace paddle { namespace operators { @@ -36,9 +36,10 @@ The equation is: Out = X + Y )DOC"); } }; -} // namespace op +} // namespace operators } // namespace paddle REGISTER_OP(add_two, paddle::operators::AddOp, paddle::operators::AddOpMaker); REGISTER_OP_CPU_KERNEL( - add_two, ::paddle::operators::AddKernel<::paddle::platform::CPUPlace>); \ No newline at end of file + add_two, + ::paddle::operators::AddKernel<::paddle::platform::CPUPlace, float>); \ No newline at end of file diff --git a/paddle/operators/add_op.cu b/paddle/operators/add_op.cu index 5979345fff..f4a4fb16a6 100644 --- a/paddle/operators/add_op.cu +++ b/paddle/operators/add_op.cu @@ -1,5 +1,7 @@ -#include -#include +#define EIGEN_USE_GPU + +#include "paddle/operators/add_op.h" +#include "paddle/framework/op_registry.h" REGISTER_OP_GPU_KERNEL(add_two, - paddle::operators::AddKernel); \ No newline at end of file + paddle::operators::AddKernel); \ No newline at end of file diff --git a/paddle/operators/add_op.h b/paddle/operators/add_op.h index 17d459dbc8..27a477a3ac 100644 --- a/paddle/operators/add_op.h +++ b/paddle/operators/add_op.h @@ -1,17 +1,26 @@ #pragma once -#include -#include +#include "glog/logging.h" +#include "paddle/framework/operator.h" +//#include "paddle/operators/add_op_functor.h" namespace paddle { namespace operators { -template +// Place can be CPUPlace or GPUPlace +template class AddKernel : public framework::OpKernel { public: - void Compute(const KernelContext &context) const override { - LOG(INFO) << "Add kernel in " << typeid(Place).name(); + void Compute(const KernelContext& context) const override { + auto* input0 = context.Input(0); + auto* input1 = context.Input(1); + + auto* output = context.Output(0); + output->mutable_data(Place()); + + output->flat().device(*(context.get_eigen_device())) = + input0->flat() + input1->flat(); } }; -} // namespace op +} // namespace operators } // namespace paddle From a1dc4311a30cbdd5af2c66deedf1162668d9001f Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Fri, 14 Jul 2017 17:02:03 +0800 Subject: [PATCH 137/205] Refactor `Tensor::CopyFrom()` --- paddle/framework/tensor.h | 61 +++++++++++++++++++++++++-------------- 1 file changed, 39 insertions(+), 22 deletions(-) diff --git a/paddle/framework/tensor.h b/paddle/framework/tensor.h index 7f3894bb3c..e164f57abc 100644 --- a/paddle/framework/tensor.h +++ b/paddle/framework/tensor.h @@ -28,34 +28,33 @@ namespace framework { class Tensor { public: - Tensor() : offset_(0) {} + Tensor() : offset_(0) { numel_ = product(dims_); } - explicit Tensor(const DDim& dims) : dims_(dims), offset_(0) {} + Tensor& 
operator=(const Tensor& src) = delete; template const T* data() const { - PADDLE_ENFORCE( - holder_ != nullptr, - "Tenosr has not been initialized. Call Tensor::mutable_data first."); + CheckDimsValidity(); return reinterpret_cast( reinterpret_cast(holder_->Ptr()) + offset_); } - template ::value>::type* = nullptr> + template T* mutable_data(DDim dims, paddle::platform::Place place) { - dims_ = dims; + set_dims(dims); return mutable_data(place); } - template ::value>::type* = nullptr> + template T* mutable_data(paddle::platform::Place place) { + PADDLE_ENFORCE(numel_ > 0, + "Tensor::numel_ must be larger than zero to call " + "Tensor::mutable_data."); if (holder_ == nullptr || !(holder_->Place() == place) /* some versions of boost::variant don't have operator!= */ - || holder_->Size() < product(dims_) * sizeof(T) + offset_) { - holder_.reset(new PlaceholderImpl(place, product(dims_) * sizeof(T))); + || holder_->Size() < numel_ * sizeof(T) + offset_) { + holder_.reset(new PlaceholderImpl(place, numel_ * sizeof(T))); offset_ = 0; } return reinterpret_cast(reinterpret_cast(holder_->Ptr()) + @@ -63,25 +62,24 @@ class Tensor { } void ShareDataFrom(const Tensor& src) { - PADDLE_ENFORCE(src.holder_ != nullptr, - "Can not share data from an uninitialized tensor."); + src.CheckDimsValidity(); holder_ = src.holder_; - dims_ = src.dims_; + dims_ = src.dims(); + numel_ = src.numel_; offset_ = src.offset_; } void CopyFrom(const Tensor& src, paddle::platform::Place dst_place) { - PADDLE_ENFORCE(src.holder_ != nullptr, - "Can not copy from an uninitialized tensor."); - size_t size = product(src.dims()) * src.holder_->TypeSize(); + src.CheckDimsValidity(); + size_t size = src.numel_ * src.holder_->TypeSize(); holder_.reset(src.holder_->Clone(src.offset_, size, dst_place)); dims_ = src.dims(); + numel_ = src.numel_; offset_ = 0; } Tensor Slice(const int& begin_idx, const int& end_idx) const { - PADDLE_ENFORCE(holder_ != nullptr, - "The sliced tenosr has not been initialized."); + CheckDimsValidity(); PADDLE_ENFORCE(begin_idx >= 0 && end_idx <= dims_[0], "Slice index is less than zero or out of bound."); PADDLE_ENFORCE(begin_idx < end_idx, @@ -94,12 +92,22 @@ class Tensor { } Tensor dst; dst.holder_ = holder_; - dst.dims_ = dims_; - dst.dims_[0] = end_idx - begin_idx; + DDim dst_dims = dims_; + dst_dims[0] = end_idx - begin_idx; + dst.set_dims(dst_dims); dst.offset_ = offset_ + begin_idx * base * holder_->TypeSize(); return dst; } + void set_dims(const DDim& dims) { + if (dims == dims_) { + return; + } + dims_ = dims; + numel_ = product(dims_); + return; + } + DDim dims() const { return dims_; } private: @@ -158,8 +166,17 @@ class Tensor { size_t size_; // size of the memory block. }; + inline void CheckDimsValidity() { + PADDLE_ENFORCE(holder_ != nullptr, + "Tenosr holds no memory. Call Tensor::mutable_data first."); + PADDLE_ENFORCE(holder_->Size() > numel_ * sizeof(T) + offset_, + "Tensor's dims_ is out of bound. Call Tensor::mutable_data " + "first to re-allocate memory."); + } + std::shared_ptr holder_; // holds the memory block if allocated. DDim dims_; + int numel_; // cache of `product(dims_)` size_t offset_; // marks the begin of tensor data area. }; From dcfcf6872a6a7b5c9d58eec2e30e08e7f7897cf4 Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Fri, 14 Jul 2017 17:03:54 +0800 Subject: [PATCH 138/205] Refactor Tensor::CopyFrom() 1. Add template T which indicates data type to `CopyFrom()`, `Slice()` and `ShareData()` functions. This makes `CopyData()` code much clearer. 2. Add `set_dim()`. 3. 
`product(DDim)` transforms `DDim` to `vector` first and then calculates its product. That might be quite slow. Since `product(dims_)` is frequently used in Tensor, we add a member variable `numel_` as a cache of the product result. TODO: refactor `product()` to make it more efficient. 4. Disable Tensor::operator= 5. Remove the limit of POD type, because `float16` and `int8` are not POD type. --- paddle/framework/tensor.h | 73 ++++++++++++++------------------- paddle/framework/tensor_test.cc | 5 ++- 2 files changed, 34 insertions(+), 44 deletions(-) diff --git a/paddle/framework/tensor.h b/paddle/framework/tensor.h index e164f57abc..8cb4d1793c 100644 --- a/paddle/framework/tensor.h +++ b/paddle/framework/tensor.h @@ -17,7 +17,6 @@ limitations under the License. */ #include #include #include -#include #include "paddle/framework/ddim.h" #include "paddle/framework/enforce.h" #include "paddle/memory/memory.h" @@ -28,15 +27,15 @@ namespace framework { class Tensor { public: - Tensor() : offset_(0) { numel_ = product(dims_); } + Tensor() : numel_(0), offset_(0) {} Tensor& operator=(const Tensor& src) = delete; template const T* data() const { - CheckDimsValidity(); + CheckDimsValidity(); return reinterpret_cast( - reinterpret_cast(holder_->Ptr()) + offset_); + reinterpret_cast(holder_->ptr()) + offset_); } template @@ -51,35 +50,40 @@ class Tensor { "Tensor::numel_ must be larger than zero to call " "Tensor::mutable_data."); if (holder_ == nullptr || - !(holder_->Place() == + !(holder_->place() == place) /* some versions of boost::variant don't have operator!= */ - || holder_->Size() < numel_ * sizeof(T) + offset_) { + || holder_->size() < numel_ * sizeof(T) + offset_) { holder_.reset(new PlaceholderImpl(place, numel_ * sizeof(T))); offset_ = 0; } - return reinterpret_cast(reinterpret_cast(holder_->Ptr()) + + return reinterpret_cast(reinterpret_cast(holder_->ptr()) + offset_); } + template void ShareDataFrom(const Tensor& src) { - src.CheckDimsValidity(); + src.CheckDimsValidity(); holder_ = src.holder_; set_dims(src.dims()); offset_ = src.offset_; } + template void CopyFrom(const Tensor& src, paddle::platform::Place dst_place) { - src.CheckDimsValidity(); - size_t size = src.numel_ * src.holder_->TypeSize(); - holder_.reset(src.holder_->Clone(src.offset_, size, dst_place)); - dims_ = src.dims(); - numel_ = src.numel_; - offset_ = 0; + PADDLE_ENFORCE(platform::is_cpu_place(src.holder_->place()) && + platform::is_cpu_place(dst_place), + "Tensor::CopyFrom only support CPU now."); + src.CheckDimsValidity(); + size_t size = src.numel_ * sizeof(T); + set_dims(src.dims()); + void* src_ptr = static_cast(src.data()); + void* dst_ptr = static_cast(mutable_data(dst_place)); + memcpy(dst_ptr, src_ptr, size); } + template Tensor Slice(const int& begin_idx, const int& end_idx) const { - CheckDimsValidity(); + CheckDimsValidity(); PADDLE_ENFORCE(begin_idx >= 0 && end_idx <= dims_[0], "Slice index is less than zero or out of bound."); PADDLE_ENFORCE(begin_idx < end_idx, @@ -95,7 +99,7 @@ class Tensor { DDim dst_dims = dims_; dst_dims[0] = end_idx - begin_idx; dst.set_dims(dst_dims); - dst.offset_ = offset_ + begin_idx * base * holder_->TypeSize(); + dst.offset_ = offset_ + begin_idx * base * sizeof(T); return dst; } @@ -115,12 +119,9 @@ class Tensor { // parameter of Variable.
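  // For illustration (hypothetical values, not lines from this patch): a
  // Tensor holding floats keeps a PlaceholderImpl<float> behind this
  // interface, e.g.
  //   holder_.reset(new PlaceholderImpl<float>(place, 12 * sizeof(float)));
  //   void* raw = holder_->ptr();  // element type is known only to the impl
  // so neither Tensor nor Variable needs a template parameter for T.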
struct Placeholder { virtual ~Placeholder() {} - virtual void* Ptr() const = 0; - virtual paddle::platform::Place Place() const = 0; - virtual size_t Size() const = 0; - virtual size_t TypeSize() const = 0; - virtual Placeholder* Clone(size_t begin, size_t size, - paddle::platform::Place place) const = 0; + virtual void* ptr() const = 0; + virtual paddle::platform::Place place() const = 0; + virtual size_t size() const = 0; }; template @@ -144,32 +145,20 @@ class Tensor { place_(place), size_(size) {} - virtual void* Ptr() const { return static_cast(ptr_.get()); } - virtual size_t Size() const { return size_; } - virtual paddle::platform::Place Place() const { return place_; } - virtual size_t TypeSize() const { return sizeof(T); } - // TODO: Clone only support CPU now. GPU support is needed. - virtual Placeholder* Clone(size_t begin, size_t size, - paddle::platform::Place place) const { - PADDLE_ENFORCE(paddle::platform::is_cpu_place(place_) && - paddle::platform::is_cpu_place(place), - "PlaceholderImpl::Clone only support CPU now."); - PlaceholderImpl* dst = new PlaceholderImpl(place, size); - void* begin_ptr = - reinterpret_cast(reinterpret_cast(Ptr()) + begin); - memcpy(dst->Ptr(), begin_ptr, size); - return dst; - } + virtual void* ptr() const { return static_cast(ptr_.get()); } + virtual size_t size() const { return size_; } + virtual paddle::platform::Place place() const { return place_; } std::unique_ptr ptr_; paddle::platform::Place place_; // record the place of ptr_. size_t size_; // size of the memory block. }; - inline void CheckDimsValidity() { + template + inline void CheckDimsValidity() const { PADDLE_ENFORCE(holder_ != nullptr, "Tenosr holds no memory. Call Tensor::mutable_data first."); - PADDLE_ENFORCE(holder_->Size() > numel_ * sizeof(T) + offset_, + PADDLE_ENFORCE(holder_->size() > numel_ * sizeof(T) + offset_, "Tensor's dims_ is out of bound. Call Tensor::mutable_data " "first to re-allocate memory."); } diff --git a/paddle/framework/tensor_test.cc b/paddle/framework/tensor_test.cc index 6db0ba8c79..eef9cfcd9e 100644 --- a/paddle/framework/tensor_test.cc +++ b/paddle/framework/tensor_test.cc @@ -18,7 +18,8 @@ TEST(Tensor, Dims) { using namespace paddle::framework; using namespace paddle::platform; - Tensor tt(make_ddim({2, 3, 4})); + Tensor tt; + tt.set_dims(make_ddim({2, 3, 4})); DDim dims = tt.dims(); ASSERT_EQ(arity(dims), 3); for (int i = 0; i < 3; ++i) { @@ -35,7 +36,7 @@ TEST(Tensor, DataAssert) { } catch (paddle::framework::EnforceNotMet err) { caught = true; std::string msg = - "Tenosr has not been initialized. Call Tensor::mutable_data first."; + "Tenosr holds no memory. 
Call Tensor::mutable_data first."; const char* what = err.what(); for (size_t i = 0; i < msg.length(); ++i) { ASSERT_EQ(what[i], msg[i]); From 1f97388a3410275e663483f2b7d4de20561c2e66 Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Fri, 14 Jul 2017 17:22:06 +0800 Subject: [PATCH 139/205] fix several compile error --- paddle/framework/tensor.h | 2 +- paddle/framework/tensor_test.cc | 23 +++++++++++------------ 2 files changed, 12 insertions(+), 13 deletions(-) diff --git a/paddle/framework/tensor.h b/paddle/framework/tensor.h index 8cb4d1793c..7f731813ef 100644 --- a/paddle/framework/tensor.h +++ b/paddle/framework/tensor.h @@ -76,7 +76,7 @@ class Tensor { src.CheckDimsValidity(); size_t size = src.numel_ * sizeof(T); set_dims(src.dims()); - void* src_ptr = static_cast(src.data()); + const void* src_ptr = static_cast(src.data()); void* dst_ptr = static_cast(mutable_data(dst_place)); memcpy(dst_ptr, src_ptr, size); } diff --git a/paddle/framework/tensor_test.cc b/paddle/framework/tensor_test.cc index eef9cfcd9e..255f69372f 100644 --- a/paddle/framework/tensor_test.cc +++ b/paddle/framework/tensor_test.cc @@ -105,19 +105,18 @@ TEST(Tensor, ShareDataFrom) { // Try to share data form uninitialized tensor bool caught = false; try { - dst_tensor.ShareDataFrom(src_tensor); + dst_tensor.ShareDataFrom(src_tensor); } catch (EnforceNotMet err) { caught = true; - std::string msg = "Can not share data from an uninitialized tensor."; - const char* what = err.what(); - for (size_t i = 0; i < msg.length(); ++i) { - ASSERT_EQ(what[i], msg[i]); + std::string msg = "Tenosr holds no memory. Call Tensor::mutable_data +first."; const char* what = err.what(); for (size_t i = 0; i < msg.length(); +++i) { ASSERT_EQ(what[i], msg[i]); } } ASSERT_TRUE(caught); src_tensor.mutable_data(make_ddim({2, 3, 4}), CPUPlace()); - dst_tensor.ShareDataFrom(src_tensor); + dst_tensor.ShareDataFrom(src_tensor); ASSERT_EQ(src_tensor.data(), dst_tensor.data()); } @@ -125,7 +124,7 @@ TEST(Tensor, ShareDataFrom) { Tensor src_tensor; Tensor dst_tensor; src_tensor.mutable_data(make_ddim({2, 3, 4}), GPUPlace()); - dst_tensor.ShareDataFrom(src_tensor); + dst_tensor.ShareDataFrom(src_tensor); ASSERT_EQ(src_tensor.data(), dst_tensor.data()); } } @@ -136,7 +135,7 @@ TEST(Tensor, Slice) { { Tensor src_tensor; src_tensor.mutable_data(make_ddim({5, 3, 4}), CPUPlace()); - Tensor slice_tensor = src_tensor.Slice(1, 3); + Tensor slice_tensor = src_tensor.Slice(1, 3); DDim slice_dims = slice_tensor.dims(); ASSERT_EQ(arity(slice_dims), 3); EXPECT_EQ(slice_dims[0], 2); @@ -159,7 +158,7 @@ TEST(Tensor, Slice) { { Tensor src_tensor; src_tensor.mutable_data(make_ddim({6, 9}), GPUPlace()); - Tensor slice_tensor = src_tensor.Slice(2, 6); + Tensor slice_tensor = src_tensor.Slice(2, 6); DDim slice_dims = slice_tensor.dims(); ASSERT_EQ(arity(slice_dims), 2); EXPECT_EQ(slice_dims[0], 4); @@ -188,15 +187,15 @@ TEST(Tensor, CopyFrom) { int arr[9] = {1, 2, 3, 4, 5, 6, 7, 8, 9}; memcpy(src_ptr, arr, 9 * sizeof(int)); Tensor dst_tensor; - dst_tensor.CopyFrom(src_tensor, CPUPlace()); + dst_tensor.CopyFrom(src_tensor, CPUPlace()); const int* dst_ptr = dst_tensor.data(); ASSERT_NE(src_ptr, dst_ptr); for (size_t i = 0; i < 9; ++i) { EXPECT_EQ(src_ptr[i], dst_ptr[i]); } - Tensor slice_tensor = src_tensor.Slice(1, 2); - dst_tensor.CopyFrom(slice_tensor, CPUPlace()); + Tensor slice_tensor = src_tensor.Slice(1, 2); + dst_tensor.CopyFrom(slice_tensor, CPUPlace()); const int* slice_ptr = slice_tensor.data(); dst_ptr = dst_tensor.data(); ASSERT_NE(dst_ptr, slice_ptr); From 
d7405e1eaa48c546323a086c170093673fdf435a Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Fri, 14 Jul 2017 17:53:42 +0800 Subject: [PATCH 140/205] Fix a typo before --- python/paddle/v2/optimizer.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/python/paddle/v2/optimizer.py b/python/paddle/v2/optimizer.py index 3dec340cfb..b6ee51cfe8 100644 --- a/python/paddle/v2/optimizer.py +++ b/python/paddle/v2/optimizer.py @@ -16,6 +16,7 @@ __all__ = [ class Optimizer(object): def __init__(self, **kwargs): + import py_paddle.swig_paddle as swig_api if 'batch_size' in kwargs: del kwargs['batch_size'] # not important for python library. @@ -24,8 +25,6 @@ class Optimizer(object): self.__opt_conf_proto__ = config_parser_utils.parse_optimizer_config( __impl__) - if swig_api is None: - raise RuntimeError("paddle.v2 currently need swig_paddle") self.__opt_conf__ = swig_api.OptimizationConfig.createFromProto( self.__opt_conf_proto__) From 450cf18b531f104387fad516f8879590ec75dd16 Mon Sep 17 00:00:00 2001 From: Liu Yiqun Date: Fri, 14 Jul 2017 09:58:38 +0000 Subject: [PATCH 141/205] Add Go compiler to Dockerfile.android and rename the build directory to build_android. The newest development image was pushed to dockerhub, named xreki/paddle-android:dev. --- Dockerfile.android | 11 +++++++++++ paddle/scripts/docker/build_android.sh | 8 +++----- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/Dockerfile.android b/Dockerfile.android index fa24f6f06c..c0fa58c384 100644 --- a/Dockerfile.android +++ b/Dockerfile.android @@ -14,6 +14,17 @@ RUN apt-get update && \ wget curl tar unzip gcc g++ locales clang-format-3.8 swig cmake && \ apt-get clean -y +# Install Go and glide +RUN wget -O go.tgz https://storage.googleapis.com/golang/go1.8.1.linux-amd64.tar.gz && \ + tar -C /usr/local -xzf go.tgz && \ + mkdir /root/gopath && \ + mkdir /root/gopath/bin && \ + mkdir /root/gopath/src && \ + rm go.tgz +ENV GOROOT=/usr/local/go GOPATH=/root/gopath +# should not be in the same line with GOROOT definition, otherwise docker build could not find GOROOT. +ENV PATH=${PATH}:${GOROOT}/bin:${GOPATH}/bin + # git credential to skip password typing RUN git config --global credential.helper store diff --git a/paddle/scripts/docker/build_android.sh b/paddle/scripts/docker/build_android.sh index bfa10c9155..53e1b818cb 100644 --- a/paddle/scripts/docker/build_android.sh +++ b/paddle/scripts/docker/build_android.sh @@ -2,9 +2,9 @@ set -xe -mkdir -p /paddle/build -cd /paddle/build -rm -f /paddle/install 2>/dev/null || true +mkdir -p /paddle/build_android +cd /paddle/build_android +rm -rf /paddle/install 2>/dev/null || true cmake -DCMAKE_SYSTEM_NAME=Android \ -DANDROID_STANDALONE_TOOLCHAIN=$ANDROID_STANDALONE_TOOLCHAIN \ -DANDROID_ABI=armeabi-v7a \ -DANDROID_ARM_NEON=ON \ @@ -22,5 +22,3 @@ cmake -DCMAKE_SYSTEM_NAME=Android \ make -j `nproc` make install -export PATH=/paddle/install/bin:/paddle/install/opt/paddle/bin:$PATH -paddle version From 6c3027571a5d7f6e54776aa44daf950f0c3134e9 Mon Sep 17 00:00:00 2001 From: Liu Yiqun Date: Fri, 14 Jul 2017 10:08:51 +0000 Subject: [PATCH 142/205] Disable invalid cross-compiling variables for cmake of higher version.
--- cmake/cross_compiling/android.cmake | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/cmake/cross_compiling/android.cmake b/cmake/cross_compiling/android.cmake index dcfbc5d012..5e3e437a8d 100644 --- a/cmake/cross_compiling/android.cmake +++ b/cmake/cross_compiling/android.cmake @@ -108,6 +108,7 @@ IF("${CMAKE_VERSION}" VERSION_LESS "3.7.0") ENDIF() IF(ANDROID_ABI STREQUAL "arm64-v8a") SET(ANDROID_TOOLCHAIN_NAME aarch64-linux-android) + SET(CMAKE_SYSTEM_PROCESSOR aarch64) ENDIF() SET(ANDROID_TOOLCHAIN_PREFIX "${ANDROID_TOOLCHAIN_ROOT}/bin/${ANDROID_TOOLCHAIN_NAME}-") ENDIF() @@ -166,7 +167,7 @@ IF("${CMAKE_VERSION}" VERSION_LESS "3.7.0") ENDIF() IF(ANDROID_ABI STREQUAL "arm64-v8a") - LIST(APPEND ANDROID_COMPILER_FLAGS -march=armv8-a) + LIST(APPEND ANDROID_COMPILER_FLAGS -march=armv8-a) ENDIF() STRING(REPLACE ";" " " ANDROID_COMPILER_FLAGS "${ANDROID_COMPILER_FLAGS}") @@ -193,6 +194,10 @@ ELSE() SET(CMAKE_ANDROID_STANDALONE_TOOLCHAIN ${ANDROID_STANDALONE_TOOLCHAIN}) ENDIF() SET(CMAKE_ANDROID_ARCH_ABI ${ANDROID_ABI}) - SET(CMAKE_ANDROID_ARM_MODE ${ANDROID_ARM_MODE}) - SET(CMAKE_ANDROID_ARM_NEON ${ANDROID_ARM_NEON}) + IF(ANDROID_ABI MATCHES "^armeabi(-v7a)?$") + SET(CMAKE_ANDROID_ARM_MODE ${ANDROID_ARM_MODE}) + IF(ANDROID_ABI STREQUAL "armeabi-v7a") + SET(CMAKE_ANDROID_ARM_NEON ${ANDROID_ARM_NEON}) + ENDIF() + ENDIF() ENDIF() From 8bcd1faffcbe17f1879a18b04bab1bbf5a0eadd2 Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Fri, 14 Jul 2017 18:12:14 +0800 Subject: [PATCH 143/205] refactor product(DDim ddim) --- paddle/framework/ddim.cc | 15 +++++++++------ paddle/framework/ddim_test.cc | 3 +++ 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/paddle/framework/ddim.cc b/paddle/framework/ddim.cc index 3f949a6595..a1ae079f4a 100644 --- a/paddle/framework/ddim.cc +++ b/paddle/framework/ddim.cc @@ -178,13 +178,16 @@ std::vector vectorize(const DDim& ddim) { return result; } -ssize_t product(const DDim& ddim) { - ssize_t result = 1; - std::vector v = vectorize(ddim); - for (auto i : v) { - result *= i; +struct ProductVisitor : public boost::static_visitor { + template + ssize_t operator()(const Dim& dim) { + return product(dim); } - return result; +}; + +ssize_t product(const DDim& ddim) { + ProductVisitor visitor; + return boost::apply_visitor(visitor, ddim); } ///\cond HIDDEN diff --git a/paddle/framework/ddim_test.cc b/paddle/framework/ddim_test.cc index 36eef02370..8ce7886f8a 100644 --- a/paddle/framework/ddim_test.cc +++ b/paddle/framework/ddim_test.cc @@ -52,6 +52,9 @@ TEST(DDim, Equality) { // product of a DDim EXPECT_EQ(paddle::framework::product(vddim), 45); + EXPECT_EQ( + paddle::framework::product(paddle::framework::make_ddim({3, 2, 5, 3})), + 90); } TEST(DDim, Print) { From 8594d5c31424e346fa2b8cdbada5188de1c13264 Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Fri, 14 Jul 2017 18:13:32 +0800 Subject: [PATCH 144/205] change int numel_ to size_t numel --- paddle/framework/tensor.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/framework/tensor.h b/paddle/framework/tensor.h index 7f731813ef..8b2a65dca9 100644 --- a/paddle/framework/tensor.h +++ b/paddle/framework/tensor.h @@ -165,7 +165,7 @@ class Tensor { std::shared_ptr holder_; // holds the memory block if allocated. DDim dims_; - int numel_; // cache of `product(dims_)` + size_t numel_; // cache of `product(dims_)` size_t offset_; // marks the begin of tensor data area. 
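  // e.g. offset_ is what lets Slice<T>() alias holder_ instead of copying:
  // for a float Tensor with dims {5, 3, 4}, Slice<float>(1, 3) shares holder_
  // and sets offset_ = 1 * (3 * 4) * sizeof(float) = 48 bytes into the block.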
}; From 34beec0f7a645ee4c5c0ce47592619c5ed05facc Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Fri, 14 Jul 2017 19:32:01 +0800 Subject: [PATCH 145/205] update tensor.h --- paddle/framework/tensor.h | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/paddle/framework/tensor.h b/paddle/framework/tensor.h index 8b2a65dca9..d98706e6ed 100644 --- a/paddle/framework/tensor.h +++ b/paddle/framework/tensor.h @@ -33,7 +33,7 @@ class Tensor { template const T* data() const { - CheckDimsValidity(); + CheckDims(); return reinterpret_cast( reinterpret_cast(holder_->ptr()) + offset_); } @@ -62,7 +62,7 @@ class Tensor { template void ShareDataFrom(const Tensor& src) { - src.CheckDimsValidity(); + src.CheckDims(); holder_ = src.holder_; set_dims(src.dims()); offset_ = src.offset_; @@ -73,7 +73,7 @@ class Tensor { PADDLE_ENFORCE(platform::is_cpu_place(src.holder_->place()) && platform::is_cpu_place(dst_place), "Tensor::CopyFrom only support CPU now."); - src.CheckDimsValidity(); + src.CheckDims(); size_t size = src.numel_ * sizeof(T); set_dims(src.dims()); const void* src_ptr = static_cast(src.data()); @@ -83,7 +83,7 @@ class Tensor { template Tensor Slice(const int& begin_idx, const int& end_idx) const { - CheckDimsValidity(); + CheckDims(); PADDLE_ENFORCE(begin_idx >= 0 && end_idx <= dims_[0], "Slice index is less than zero or out of bound."); PADDLE_ENFORCE(begin_idx < end_idx, @@ -109,7 +109,6 @@ class Tensor { } dims_ = dims; numel_ = product(dims_); - return; } DDim dims() const { return dims_; } @@ -155,10 +154,10 @@ class Tensor { }; template - inline void CheckDimsValidity() const { + inline void CheckDims() const { PADDLE_ENFORCE(holder_ != nullptr, "Tenosr holds no memory. Call Tensor::mutable_data first."); - PADDLE_ENFORCE(holder_->size() > numel_ * sizeof(T) + offset_, + PADDLE_ENFORCE(holder_->size() >= numel_ * sizeof(T) + offset_, "Tensor's dims_ is out of bound. Call Tensor::mutable_data " "first to re-allocate memory."); } From 57a22db3fad1251a50d3d3dd2f241ad7f1949d77 Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Fri, 14 Jul 2017 19:43:59 +0800 Subject: [PATCH 146/205] update PADDLE_ENFORCE message --- paddle/framework/tensor.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/framework/tensor.h b/paddle/framework/tensor.h index d98706e6ed..62e0710a82 100644 --- a/paddle/framework/tensor.h +++ b/paddle/framework/tensor.h @@ -48,7 +48,7 @@ class Tensor { T* mutable_data(paddle::platform::Place place) { PADDLE_ENFORCE(numel_ > 0, "Tensor::numel_ must be larger than zero to call " - "Tensor::mutable_data."); + "Tensor::mutable_data. Call Tensor::set_dim first."); if (holder_ == nullptr || !(holder_->place() == place) /* some versions of boost::variant don't have operator!= */ From 03b3d0d8a8b8e90f997e1a2cec49bb04486adc8a Mon Sep 17 00:00:00 2001 From: liaogang Date: Fri, 14 Jul 2017 20:12:35 +0800 Subject: [PATCH 147/205] Follow comments --- paddle/platform/cpu_info.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/platform/cpu_info.cc b/paddle/platform/cpu_info.cc index 3da04420e5..1905cfeee6 100644 --- a/paddle/platform/cpu_info.cc +++ b/paddle/platform/cpu_info.cc @@ -54,8 +54,8 @@ size_t CpuMaxAllocSize() { } size_t CpuMinChunkSize() { - // Allow to allocate the minimum chunk size is 256 bytes. - return 1 << 8; + // Allow to allocate the minimum chunk size is 4 KB. 
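+ // 1 << 12 == 4096 bytes == 4 KB; the previous value, 1 << 8, was 256 bytes.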
+ return 1 << 12; } size_t CpuMaxChunkSize() { From fbfbe93a78e9cc411dab6c2b54a5516b16dae430 Mon Sep 17 00:00:00 2001 From: Helin Wang Date: Fri, 14 Jul 2017 18:59:05 +0000 Subject: [PATCH 148/205] cmake: do not run glide install every time. --- cmake/configure.cmake | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/cmake/configure.cmake b/cmake/configure.cmake index a4f98ec7d4..7afab5d534 100644 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -102,12 +102,19 @@ if(WITH_GOLANG) message(FATAL_ERROR "no glide executeble found: $ENV{GOPATH}/bin/glide") endif() - add_custom_target(go_vendor) - add_custom_command(TARGET go_vendor + # this command will only run when the file it depends is missing + # or has changed, or the output is missing. + add_custom_command(OUTPUT ${CMAKE_BINARY_DIR}/glide COMMAND env GOPATH=${GOPATH} ${GLIDE} install + COMMAND touch ${CMAKE_BINARY_DIR}/glide + DEPENDS ${PROJ_ROOT}/go/glide.lock WORKING_DIRECTORY "${PADDLE_IN_GOPATH}/go" - ) - add_dependencies(go_vendor go_path) + ) + + # depends on the custom command which outputs + # ${CMAKE_BINARY_DIR}/glide, the custom command does not need to + # run every time this target is built. + add_custom_target(go_vendor DEPENDS ${CMAKE_BINARY_DIR}/glide go_path) endif() endif(WITH_GOLANG) From 9eb9b2c29c97c63e4f0ca32e5d69e5dd5b26d89d Mon Sep 17 00:00:00 2001 From: Helin Wang Date: Fri, 14 Jul 2017 20:20:50 +0000 Subject: [PATCH 149/205] fix race condition in test --- go/pserver/client/client_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/go/pserver/client/client_test.go b/go/pserver/client/client_test.go index 27f4ff2380..aab91556b4 100644 --- a/go/pserver/client/client_test.go +++ b/go/pserver/client/client_test.go @@ -164,7 +164,7 @@ func testClient(t *testing.T, c *client.Client) { wg.Add(1) go func(gs []pserver.Gradient) { - err = c.SendGrads(gs) + err := c.SendGrads(gs) if err != nil { t.Fatal(err) } From 13b0dcd295d49fa49f65de2219462999e34294b0 Mon Sep 17 00:00:00 2001 From: liaogang Date: Sat, 15 Jul 2017 10:01:37 +0800 Subject: [PATCH 150/205] ENH: add cpplint --- cmake/cpplint.cmake | 14 +++++++------- cmake/generic.cmake | 4 ++++ 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/cmake/cpplint.cmake b/cmake/cpplint.cmake index 6bbcd730e1..656e1a0803 100644 --- a/cmake/cpplint.cmake +++ b/cmake/cpplint.cmake @@ -27,7 +27,8 @@ set(IGNORE_PATTERN .*cblas\\.h.* .*\\.pb\\.txt .*LtrDataProvider.* - .*MultiDataProvider.*) + .*MultiDataProvider.* + .*pb.*) # add_style_check_target # @@ -52,14 +53,13 @@ macro(add_style_check_target TARGET_NAME) endif() endforeach() if(LINT MATCHES ON) + # cpplint code style get_filename_component(base_filename ${filename} NAME) set(CUR_GEN ${CMAKE_CURRENT_BINARY_DIR}/${base_filename}.cpplint) - add_custom_command(OUTPUT ${CUR_GEN} - PRE_BUILD - COMMAND env ${py_env} "${PYTHON_EXECUTABLE}" "${PROJ_ROOT}/paddle/scripts/cpplint.py" - "--filter=${STYLE_FILTER}" - "--write-success=${CUR_GEN}" ${filename} - DEPENDS ${filename} + add_custom_command(TARGET ${TARGET_NAME} PRE_BUILD + COMMAND "${PYTHON_EXECUTABLE}" "${PROJ_ROOT}/paddle/scripts/cpplint.py" + "--filter=${STYLE_FILTER}" + "--write-success=${CUR_GEN}" ${filename} WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) endif() endforeach() diff --git a/cmake/generic.cmake b/cmake/generic.cmake index 48c054d17f..b3204e863a 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -185,6 +185,10 @@ function(cc_library TARGET_NAME) add_dependencies(${TARGET_NAME} 
${cc_library_DEPS}) target_link_libraries(${TARGET_NAME} ${cc_library_DEPS}) endif() + + # cpplint code style + add_style_check_target(${TARGET_NAME} ${cc_library_SRCS}) + else(cc_library_SRCS) if (cc_library_DEPS) merge_static_libs(${TARGET_NAME} ${cc_library_DEPS}) From 620575b6eb7f59abc0b1fd2052159590d6a29113 Mon Sep 17 00:00:00 2001 From: liaogang Date: Sat, 15 Jul 2017 10:02:03 +0800 Subject: [PATCH 151/205] FIX: cpplint code style --- paddle/framework/ddim.cc | 42 ++++++++++++++++++---------- paddle/framework/ddim.h | 2 +- paddle/framework/net.cc | 16 +++++++++++ paddle/framework/op_registry.cc | 16 ++++++++++- paddle/framework/operator.cc | 2 +- paddle/memory/detail/memory_block.cc | 6 ++-- paddle/memory/memory.cc | 2 -- paddle/platform/cpu_info.cc | 4 +-- paddle/platform/place.cc | 16 ++++++++++- 9 files changed, 81 insertions(+), 25 deletions(-) diff --git a/paddle/framework/ddim.cc b/paddle/framework/ddim.cc index 3f949a6595..87a3618e09 100644 --- a/paddle/framework/ddim.cc +++ b/paddle/framework/ddim.cc @@ -1,9 +1,23 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + #include "paddle/framework/ddim.h" namespace paddle { namespace framework { -///@cond HIDDEN +/// @cond HIDDEN template Dim make_dim(const int* d) { @@ -50,7 +64,7 @@ void make_ddim(DDim& ddim, const int* dims, int n) { } } -///@endcond +/// @endcond DDim make_ddim(std::initializer_list dims) { DDim result(make_dim(0)); @@ -64,11 +78,11 @@ DDim make_ddim(const std::vector& dims) { return result; } -///@cond HIDDEN +/// @cond HIDDEN // XXX For some reason, putting this in an anonymous namespace causes errors class DynamicMutableIndexer : public boost::static_visitor { public: - DynamicMutableIndexer(int idx) : idx_(idx) {} + explicit DynamicMutableIndexer(int idx) : idx_(idx) {} template int& operator()(Dim& dim) const { @@ -81,7 +95,7 @@ class DynamicMutableIndexer : public boost::static_visitor { class DynamicConstIndexer : public boost::static_visitor { public: - DynamicConstIndexer(int idx) : idx_(idx) {} + explicit DynamicConstIndexer(int idx) : idx_(idx) {} template int operator()(const Dim& dim) const { @@ -92,7 +106,7 @@ class DynamicConstIndexer : public boost::static_visitor { int idx_; }; -///@endcond +/// @endcond int& DDim::operator[](int idx) { return boost::apply_visitor(DynamicMutableIndexer(idx), var); @@ -155,11 +169,11 @@ int get(const DDim& ddim, int idx) { return ddim[idx]; } void set(DDim& ddim, int idx, int value) { ddim[idx] = value; } -///@cond HIDDEN +/// @cond HIDDEN struct VectorizeVisitor : public boost::static_visitor<> { std::vector& vector; - VectorizeVisitor(std::vector& v) : vector(v) {} + explicit VectorizeVisitor(std::vector& v) : vector(v) {} template void operator()(const T& t) { @@ -169,7 +183,7 @@ struct VectorizeVisitor : public boost::static_visitor<> { void operator()(const Dim<1>& t) { vector.push_back(t.head); } }; -///@endcond +/// @endcond std::vector vectorize(const DDim& ddim) { std::vector result; @@ -187,7 +201,7 @@ 
ssize_t product(const DDim& ddim) { return result; } -///\cond HIDDEN +/// \cond HIDDEN struct ArityVisitor : boost::static_visitor { template @@ -196,15 +210,15 @@ struct ArityVisitor : boost::static_visitor { } }; -///\endcond +/// \endcond int arity(const DDim& d) { return boost::apply_visitor(ArityVisitor(), d); } -///\cond HIDDEN +/// \cond HIDDEN struct DDimPrinter : boost::static_visitor { std::ostream& os; - DDimPrinter(std::ostream& os_) : os(os_) {} + explicit DDimPrinter(std::ostream& os_) : os(os_) {} template void operator()(const T& t) { @@ -212,7 +226,7 @@ struct DDimPrinter : boost::static_visitor { } }; -///\endcond +/// \endcond std::ostream& operator<<(std::ostream& os, const DDim& ddim) { DDimPrinter printer(os); diff --git a/paddle/framework/ddim.h b/paddle/framework/ddim.h index 223c4180be..f8714acf32 100644 --- a/paddle/framework/ddim.h +++ b/paddle/framework/ddim.h @@ -27,7 +27,7 @@ struct DDim { DDim() : var(Dim<1>()) {} template - DDim(const Dim& in) : var(in) {} + explicit DDim(const Dim& in) : var(in) {} template DDim& operator=(const Dim& in) { diff --git a/paddle/framework/net.cc b/paddle/framework/net.cc index 854ad8e33e..a0e8788846 100644 --- a/paddle/framework/net.cc +++ b/paddle/framework/net.cc @@ -1,3 +1,19 @@ +/* + Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + #include "paddle/framework/net.h" namespace paddle { diff --git a/paddle/framework/op_registry.cc b/paddle/framework/op_registry.cc index 4b35e04e68..1d14535c50 100644 --- a/paddle/framework/op_registry.cc +++ b/paddle/framework/op_registry.cc @@ -1,3 +1,17 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + #include namespace paddle { @@ -33,4 +47,4 @@ void AttrTypeHelper::SetAttrType>(AttrProto* attr) { attr->set_type(paddle::framework::AttrType::STRINGS); } } // namespace framework -} // namespace paddle \ No newline at end of file +} // namespace paddle diff --git a/paddle/framework/operator.cc b/paddle/framework/operator.cc index 8f7adff8b3..d065670829 100644 --- a/paddle/framework/operator.cc +++ b/paddle/framework/operator.cc @@ -40,4 +40,4 @@ std::string OperatorBase::DebugString() const { } } // namespace framework -} // namespace paddle \ No newline at end of file +} // namespace paddle diff --git a/paddle/memory/detail/memory_block.cc b/paddle/memory/detail/memory_block.cc index bc67bcef0f..fc40993208 100644 --- a/paddle/memory/detail/memory_block.cc +++ b/paddle/memory/detail/memory_block.cc @@ -152,6 +152,6 @@ MemoryBlock* MemoryBlock::metadata() const { reinterpret_cast(this) - 1)); } -} // detail -} // memory -} // paddle +} // namespace detail +} // namespace memory +} // namespace paddle diff --git a/paddle/memory/memory.cc b/paddle/memory/memory.cc index 430ce98bfc..df3d57d629 100644 --- a/paddle/memory/memory.cc +++ b/paddle/memory/memory.cc @@ -17,8 +17,6 @@ limitations under the License. */ #include "paddle/memory/detail/system_allocator.h" #include "paddle/platform/assert.h" -#include - namespace paddle { namespace memory { diff --git a/paddle/platform/cpu_info.cc b/paddle/platform/cpu_info.cc index 1905cfeee6..dfab391cfb 100644 --- a/paddle/platform/cpu_info.cc +++ b/paddle/platform/cpu_info.cc @@ -41,8 +41,8 @@ inline size_t CpuTotalPhysicalMemory() { if (sysctl(mib, 2, &size, &len, NULL, 0) == 0) return (size_t)size; return 0L; #else - long pages = sysconf(_SC_PHYS_PAGES); - long page_size = sysconf(_SC_PAGE_SIZE); + int64_t pages = sysconf(_SC_PHYS_PAGES); + int64_t page_size = sysconf(_SC_PAGE_SIZE); return pages * page_size; #endif } diff --git a/paddle/platform/place.cc b/paddle/platform/place.cc index 0704820aa0..b31515e1f0 100644 --- a/paddle/platform/place.cc +++ b/paddle/platform/place.cc @@ -1,3 +1,17 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + #include "paddle/platform/place.h" namespace paddle { @@ -7,7 +21,7 @@ namespace detail { class PlacePrinter : public boost::static_visitor<> { public: - PlacePrinter(std::ostream &os) : os_(os) {} + explicit PlacePrinter(std::ostream &os) : os_(os) {} void operator()(const CPUPlace &) { os_ << "CPUPlace"; } void operator()(const GPUPlace &p) { os_ << "GPUPlace(" << p.device << ")"; } From 569f7e83a6fa37a82e341cf4a588af7abfbcfca8 Mon Sep 17 00:00:00 2001 From: liaogang Date: Sat, 15 Jul 2017 10:02:27 +0800 Subject: [PATCH 152/205] FIX: cpplint code style --- paddle/function/GemmConvOp.cpp | 9 +-- paddle/function/NaiveConvOp.cpp | 3 +- .../gradientmachines/NeuralNetwork.cpp | 2 +- .../RecurrentGradientMachine.cpp | 2 +- paddle/gserver/layers/AgentLayer.cpp | 2 +- paddle/operators/add_op.cc | 18 +++++- paddle/optimizer/parameter_optimizer_test.cpp | 60 ++++++++++++------- paddle/optimizer/serialization_test.cpp | 27 ++++++--- paddle/utils/DynamicLoader.h | 5 +- paddle/utils/ThreadLocal.h | 12 ++-- 10 files changed, 88 insertions(+), 52 deletions(-) diff --git a/paddle/function/GemmConvOp.cpp b/paddle/function/GemmConvOp.cpp index a40e5d9d2e..00880effc5 100644 --- a/paddle/function/GemmConvOp.cpp +++ b/paddle/function/GemmConvOp.cpp @@ -117,8 +117,7 @@ public: ConvFunctionBase::init(config); } - virtual void check(const BufferArgs& inputs, - const BufferArgs& outputs) override { + void check(const BufferArgs& inputs, const BufferArgs& outputs) override { const TensorShape& input = inputs[0].shape(); const TensorShape& filter = inputs[1].shape(); const TensorShape& output = outputs[0].shape(); @@ -217,8 +216,7 @@ public: ConvFunctionBase::init(config); } - virtual void check(const BufferArgs& inputs, - const BufferArgs& outputs) override { + void check(const BufferArgs& inputs, const BufferArgs& outputs) override { const TensorShape& output = inputs[0].shape(); const TensorShape& filter = inputs[1].shape(); const TensorShape& input = outputs[0].shape(); @@ -311,8 +309,7 @@ public: ConvFunctionBase::init(config); } - virtual void check(const BufferArgs& inputs, - const BufferArgs& outputs) override { + void check(const BufferArgs& inputs, const BufferArgs& outputs) override { const TensorShape& output = inputs[0].shape(); const TensorShape& input = inputs[1].shape(); const TensorShape& filter = outputs[0].shape(); diff --git a/paddle/function/NaiveConvOp.cpp b/paddle/function/NaiveConvOp.cpp index 4348f0f775..e0692fa06d 100644 --- a/paddle/function/NaiveConvOp.cpp +++ b/paddle/function/NaiveConvOp.cpp @@ -90,8 +90,7 @@ public: ConvFunctionBase::init(config); } - virtual void check(const BufferArgs& inputs, - const BufferArgs& outputs) override { + void check(const BufferArgs& inputs, const BufferArgs& outputs) override { const TensorShape& input = inputs[0].shape(); const TensorShape& filter = inputs[1].shape(); const TensorShape& output = outputs[0].shape(); diff --git a/paddle/gserver/gradientmachines/NeuralNetwork.cpp b/paddle/gserver/gradientmachines/NeuralNetwork.cpp index 2e839f6405..cfa80a8936 100644 --- a/paddle/gserver/gradientmachines/NeuralNetwork.cpp +++ b/paddle/gserver/gradientmachines/NeuralNetwork.cpp @@ -403,7 +403,7 @@ public: : layerName_(layerName) { addEvaluator(std::move(evaluator)); } - virtual void eval(const NeuralNetwork& nn) override { + void eval(const NeuralNetwork& nn) override { const LayerPtr& layer = nn.getLayer(layerName_); CHECK(layer) << "Nonexisted layer: " << layerName_ << " in submodel " << nn.getName(); diff --git
a/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp b/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp index 9a972466d6..9ddd449de7 100644 --- a/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp +++ b/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp @@ -636,7 +636,7 @@ void lenToStarts(std::vector& starts) { } starts.back() = pos; } -} +} // namespace void RecurrentGradientMachine::calcSequenceStartPositions() { std::vector starts(commonSeqInfo_.size() + 1); diff --git a/paddle/gserver/layers/AgentLayer.cpp b/paddle/gserver/layers/AgentLayer.cpp index 15e7411b5f..bdae7e623a 100644 --- a/paddle/gserver/layers/AgentLayer.cpp +++ b/paddle/gserver/layers/AgentLayer.cpp @@ -124,7 +124,7 @@ void copyElements(const IVector& srcVec, dest[index[i]] = src[i]; } } -} +} // namespace void GatherAgentLayer::forwardIds(PassType passType) { IVectorPtr realId = realLayers_[0]->getOutputLabel(); diff --git a/paddle/operators/add_op.cc b/paddle/operators/add_op.cc index 2766f0bf25..522b23cbc4 100644 --- a/paddle/operators/add_op.cc +++ b/paddle/operators/add_op.cc @@ -1,3 +1,17 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + #include #include #include @@ -36,9 +50,9 @@ The equation is: Out = X + Y )DOC"); } }; -} // namespace op +} // namespace operators } // namespace paddle REGISTER_OP(add_two, paddle::operators::AddOp, paddle::operators::AddOpMaker); REGISTER_OP_CPU_KERNEL( - add_two, ::paddle::operators::AddKernel<::paddle::platform::CPUPlace>); \ No newline at end of file + add_two, ::paddle::operators::AddKernel<::paddle::platform::CPUPlace>); diff --git a/paddle/optimizer/parameter_optimizer_test.cpp b/paddle/optimizer/parameter_optimizer_test.cpp index 4e6254d9e4..60a3b32789 100644 --- a/paddle/optimizer/parameter_optimizer_test.cpp +++ b/paddle/optimizer/parameter_optimizer_test.cpp @@ -1,3 +1,19 @@ +/* + Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + #include "parameter_optimizer.h" #include #include @@ -5,21 +21,18 @@ #include "gtest/gtest.h" #include "lr_policy.h" -using namespace paddle; -using namespace paddle::optimizer; - -Tensor* FillTensor(size_t size) { - Tensor* param = new Tensor(size); - Tensor& p = *param; +paddle::optimizer::Tensor* FillTensor(size_t size) { + paddle::optimizer::Tensor* param = new paddle::optimizer::Tensor(size); + paddle::optimizer::Tensor& p = *param; for (size_t i = 0; i < p.size(); ++i) { p[i] = (float)rand() / (float)RAND_MAX; } return param; } -Tensor* FixedTensor(size_t size) { - Tensor* param = new Tensor(size); - Tensor& p = *param; +paddle::optimizer::Tensor* FixedTensor(size_t size) { + paddle::optimizer::Tensor* param = new paddle::optimizer::Tensor(size); + paddle::optimizer::Tensor& p = *param; for (size_t i = 0; i < p.size(); ++i) { p[i] = i; } @@ -28,7 +41,8 @@ Tensor* FixedTensor(size_t size) { class OptimizerTest : public testing::Test { public: - // init tensor shape + virtual ~OptimizerTest(); + // init paddle::optimizer::Tensor shape const size_t kSize = 5; virtual void SetUp() { @@ -38,34 +52,36 @@ public: virtual void TearDown() {} void CreateSGD() { - Tensor* parameter = FixedTensor(kSize); - config_.set_optimizer(OptimizerConfig::SGD); + paddle::optimizer::Tensor* parameter = FixedTensor(kSize); + config_.set_optimizer(paddle::OptimizerConfig::SGD); config_.mutable_sgd()->set_momentum(0.0); config_.mutable_sgd()->set_decay(0.0); config_.mutable_sgd()->set_nesterov(false); - config_.set_lr_policy(OptimizerConfig::Const); + config_.set_lr_policy(paddle::OptimizerConfig::Const); config_.mutable_const_lr()->set_learning_rate(0.1); std::string str = config_.SerializeAsString(); - ParameterOptimizer* opt = ParameterOptimizer::Create(str, parameter); + paddle::optimizer::ParameterOptimizer* opt = + paddle::optimizer::ParameterOptimizer::Create(str, parameter); opts_.push_back(opt); } void CreateAdam() { - Tensor* parameter = FixedTensor(kSize); - config_.set_optimizer(OptimizerConfig::Adam); + paddle::optimizer::Tensor* parameter = FixedTensor(kSize); + config_.set_optimizer(paddle::OptimizerConfig::Adam); config_.mutable_adam()->set_beta_1(0.9); config_.mutable_adam()->set_beta_2(0.1); config_.mutable_adam()->set_epsilon(1e-3); config_.mutable_adam()->set_decay(0.0); - config_.set_lr_policy(OptimizerConfig::Const); + config_.set_lr_policy(paddle::OptimizerConfig::Const); config_.mutable_const_lr()->set_learning_rate(0.1); std::string str = config_.SerializeAsString(); - ParameterOptimizer* opt = ParameterOptimizer::Create(str, parameter); + paddle::optimizer::ParameterOptimizer* opt = + paddle::optimizer::ParameterOptimizer::Create(str, parameter); opts_.push_back(opt); } void TestGetWeight() { - Tensor* p = FixedTensor(kSize); + paddle::optimizer::Tensor* p = FixedTensor(kSize); for (size_t i = 0; i < opts_.size(); ++i) { int s = 0; float* newp = (float*)opts_[i]->get_weight(&s); @@ -76,7 +92,7 @@ public: } void TestUpdate() { - Tensor* g = FixedTensor(kSize); + paddle::optimizer::Tensor* g = FixedTensor(kSize); for (size_t i = 0; i < opts_.size(); ++i) { opts_[i]->Update(g); } @@ -91,8 +107,8 @@ public: } private: - std::vector opts_; - OptimizerConfig config_; + std::vector opts_; + paddle::OptimizerConfig config_; }; TEST_F(OptimizerTest, TestGetWeight) { TestGetWeight(); } diff --git a/paddle/optimizer/serialization_test.cpp b/paddle/optimizer/serialization_test.cpp index d2454140dc..e4d97cbdba 100644 --- a/paddle/optimizer/serialization_test.cpp +++ 
b/paddle/optimizer/serialization_test.cpp @@ -1,19 +1,32 @@ +/* + Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + #include "serialization.h" #include "gtest/gtest.h" -using namespace paddle; -using namespace paddle::optimizer; - TEST(TensorToProto, Case1) { - Tensor t(3), t1(3); + paddle::optimizer::Tensor t(3), t1(3); for (size_t i = 0; i < t.size(); ++i) { t[i] = i; t1[i] = 0; } - TensorProto proto; - TensorToProto(t, &proto); - ProtoToTensor(proto, &t1); + paddle::TensorProto proto; + paddle::optimizer::TensorToProto(t, &proto); + paddle::optimizer::ProtoToTensor(proto, &t1); for (size_t i = 0; i < t1.size(); ++i) { EXPECT_EQ(t1[i], t[i]); } diff --git a/paddle/utils/DynamicLoader.h b/paddle/utils/DynamicLoader.h index 9b5ad21724..2e5ff76a06 100644 --- a/paddle/utils/DynamicLoader.h +++ b/paddle/utils/DynamicLoader.h @@ -12,8 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#ifndef DYNAMIC_LOAD_H_ -#define DYNAMIC_LOAD_H_ +#pragma once #include #include @@ -59,5 +58,3 @@ void GetWarpCTCDsoHandle(void** dso_handle); * */ void GetLapackDsoHandle(void** dso_handle); - -#endif // DYNAMIC_LOAD_H_ diff --git a/paddle/utils/ThreadLocal.h b/paddle/utils/ThreadLocal.h index b5e2862546..0a27b8b97b 100644 --- a/paddle/utils/ThreadLocal.h +++ b/paddle/utils/ThreadLocal.h @@ -51,7 +51,7 @@ template class ThreadLocal { public: ThreadLocal() { - CHECK(pthread_key_create(&threadSpecificKey_, dataDestructor) == 0); + CHECK_EQ(pthread_key_create(&threadSpecificKey_, dataDestructor), 0); } ~ThreadLocal() { pthread_key_delete(threadSpecificKey_); } @@ -65,7 +65,7 @@ public: if (!p && createLocal) { p = new T(); int ret = pthread_setspecific(threadSpecificKey_, p); - CHECK(ret == 0); + CHECK_EQ(ret, 0); } return p; } @@ -79,7 +79,7 @@ public: if (T* q = get(false)) { dataDestructor(q); } - CHECK(pthread_setspecific(threadSpecificKey_, p) == 0); + CHECK_EQ(pthread_setspecific(threadSpecificKey_, p), 0); } /** @@ -112,7 +112,7 @@ private: template class ThreadLocalD { public: - ThreadLocalD() { CHECK(pthread_key_create(&threadSpecificKey_, NULL) == 0); } + ThreadLocalD() { CHECK_EQ(pthread_key_create(&threadSpecificKey_, NULL), 0); } ~ThreadLocalD() { pthread_key_delete(threadSpecificKey_); for (auto t : threadMap_) { @@ -127,7 +127,7 @@ public: T* p = (T*)pthread_getspecific(threadSpecificKey_); if (!p) { p = new T(); - CHECK(pthread_setspecific(threadSpecificKey_, p) == 0); + CHECK_EQ(pthread_setspecific(threadSpecificKey_, p), 0); updateMap(p); } return p; @@ -141,7 +141,7 @@ public: if (T* q = (T*)pthread_getspecific(threadSpecificKey_)) { dataDestructor(q); } - CHECK(pthread_setspecific(threadSpecificKey_, p) == 0); + CHECK_EQ(pthread_setspecific(threadSpecificKey_, p), 0); updateMap(p); } From ab8c7f11f3b376153d97f00abaca6dab7f041cf9 Mon Sep 17 00:00:00 2001 From: liaogang Date: Sat, 15 Jul 2017 10:18:49 +0800 Subject: 
[PATCH 153/205] FIX: virtual OptimizerTest --- paddle/optimizer/parameter_optimizer_test.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/optimizer/parameter_optimizer_test.cpp b/paddle/optimizer/parameter_optimizer_test.cpp index 60a3b32789..edf4ae37a9 100644 --- a/paddle/optimizer/parameter_optimizer_test.cpp +++ b/paddle/optimizer/parameter_optimizer_test.cpp @@ -41,7 +41,7 @@ paddle::optimizer::Tensor* FixedTensor(size_t size) { class OptimizerTest : public testing::Test { public: - virtual ~OptimizerTest(); + virtual ~OptimizerTest() {} // init paddle::optimizer::Tensor shape const size_t kSize = 5; From ee90c2d22b2ff72fcc9983583bdb78962cb5ef72 Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Sat, 15 Jul 2017 12:44:01 +0800 Subject: [PATCH 154/205] add slice_dim draft --- paddle/framework/dim.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/paddle/framework/dim.h b/paddle/framework/dim.h index 883fdc55eb..8dc1bab06d 100644 --- a/paddle/framework/dim.h +++ b/paddle/framework/dim.h @@ -401,5 +401,20 @@ HOSTDEVICE Dim linear_to_dimension(int linear_index, Dim extents) { return result; } +template +Dim slice(const Dim& dim, int begin, int end) { + PADDLE_ENFORCE(begin < end, + "Begin index must be less than end index in Dim slice."); + PADDLE_ENFORCE(begin >= 0 && end <= S && end - begin == D, + "Index error occurs in Dim slice."); + if (begin > 0) { + return slice(dim.tail, begin - 1, end - 1); + } + if (D == 1) { + return Dim<1>(dim.head); + } + return Dim(dim.head, slice(dim.tail, 0, end - 1)); +} + } // namespace framework } // namespace paddle From 68adb9541d339ffd0df43a7a45a5a4adf16f2067 Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Sat, 15 Jul 2017 15:00:18 +0800 Subject: [PATCH 155/205] enable tensor memory test --- paddle/framework/CMakeLists.txt | 2 +- paddle/framework/tensor.h | 50 ++++++++------ paddle/framework/tensor_test.cc | 118 +++++++++++++++++--------------- 3 files changed, 92 insertions(+), 78 deletions(-) diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt index 8415ce67e9..f7f606e4b8 100644 --- a/paddle/framework/CMakeLists.txt +++ b/paddle/framework/CMakeLists.txt @@ -2,7 +2,7 @@ cc_library(ddim SRCS ddim.cc) cc_test(ddim_test SRCS ddim_test.cc DEPS ddim) nv_test(dim_test SRCS dim_test.cu DEPS ddim) -cc_test(tensor_test SRCS tensor_test.cc DEPS ddim) +cc_test(tensor_test SRCS tensor_test.cc DEPS ddim paddle_memory) cc_test(variable_test SRCS variable_test.cc) cc_test(scope_test SRCS scope_test.cc) cc_test(enforce_test SRCS enforce_test.cc) diff --git a/paddle/framework/tensor.h b/paddle/framework/tensor.h index 62e0710a82..81db722c99 100644 --- a/paddle/framework/tensor.h +++ b/paddle/framework/tensor.h @@ -29,8 +29,6 @@ class Tensor { public: Tensor() : numel_(0), offset_(0) {} - Tensor& operator=(const Tensor& src) = delete; - template const T* data() const { CheckDims(); return reinterpret_cast( reinterpret_cast(holder_->ptr()) + offset_); } template - T* mutable_data(DDim dims, paddle::platform::Place place) { + T* mutable_data(DDim dims, platform::Place place) { set_dims(dims); return mutable_data(place); } template - T* mutable_data(paddle::platform::Place place) { + T* mutable_data(platform::Place place) { PADDLE_ENFORCE(numel_ > 0, "Tensor::numel_ must be larger than zero to call " "Tensor::mutable_data.
Call Tensor::set_dim first."); @@ -53,7 +51,18 @@ class Tensor { !(holder_->place() == place) /* some versions of boost::variant don't have operator!= */ || holder_->size() < numel_ * sizeof(T) + offset_) { - holder_.reset(new PlaceholderImpl(place, numel_ * sizeof(T))); + switch (place.which()) { + case 0: + holder_.reset(new PlaceholderImpl( + boost::get(place), numel_ * sizeof(T))); + break; + + case 1: + holder_.reset(new PlaceholderImpl( + boost::get(place), numel_ * sizeof(T))); + break; + } + offset_ = 0; } return reinterpret_cast(reinterpret_cast(holder_->ptr()) + @@ -69,7 +78,7 @@ class Tensor { } template - void CopyFrom(const Tensor& src, paddle::platform::Place dst_place) { + void CopyFrom(const Tensor& src, platform::Place dst_place) { PADDLE_ENFORCE(platform::is_cpu_place(src.holder_->place()) && platform::is_cpu_place(dst_place), "Tensor::CopyFrom only support CPU now."); @@ -119,38 +128,37 @@ class Tensor { struct Placeholder { virtual ~Placeholder() {} virtual void* ptr() const = 0; - virtual paddle::platform::Place place() const = 0; + virtual platform::Place place() const = 0; virtual size_t size() const = 0; }; - template + template struct PlaceholderImpl : public Placeholder { private: + template class Deleter { public: - Deleter(platform::Place place) : place_(place) {} - void operator()(T* ptr) { - paddle::memory::Free(place_, static_cast(ptr)); - } + Deleter(PType place) : place_(place) {} + void operator()(T* ptr) { memory::Free(place_, static_cast(ptr)); } private: - paddle::platform::Place place_; + PType place_; }; public: - PlaceholderImpl(paddle::platform::Place place, size_t size) - : ptr_(static_cast(paddle::memory::Alloc(place, size)), - Deleter(place)), + PlaceholderImpl(PlaceType place, size_t size) + : ptr_(static_cast(memory::Alloc(place, size)), + Deleter(place)), place_(place), size_(size) {} virtual void* ptr() const { return static_cast(ptr_.get()); } virtual size_t size() const { return size_; } - virtual paddle::platform::Place place() const { return place_; } + virtual platform::Place place() const { return place_; } - std::unique_ptr ptr_; - paddle::platform::Place place_; // record the place of ptr_. - size_t size_; // size of the memory block. + std::unique_ptr> ptr_; + platform::Place place_; // record the place of ptr_. + size_t size_; // size of the memory block. }; template @@ -166,7 +174,7 @@ class Tensor { DDim dims_; size_t numel_; // cache of `product(dims_)` size_t offset_; // marks the begin of tensor data area. -}; +}; // namespace framework } // namespace framework } // namespace paddle diff --git a/paddle/framework/tensor_test.cc b/paddle/framework/tensor_test.cc index 255f69372f..79bd0cc607 100644 --- a/paddle/framework/tensor_test.cc +++ b/paddle/framework/tensor_test.cc @@ -47,7 +47,7 @@ TEST(Tensor, DataAssert) { /* following tests are not available at present because Memory::Alloc() and Memory::Free() have not been ready. 
- +*/ TEST(Tensor, MutableData) { using namespace paddle::framework; using namespace paddle::platform; @@ -72,28 +72,29 @@ TEST(Tensor, MutableData) { p2 = src_tensor.mutable_data(make_ddim({2, 2}), CPUPlace()); EXPECT_EQ(p1, p2); } - - { - Tensor src_tensor; - float* p1 = nullptr; - float* p2 = nullptr; - // initialization - p1 = src_tensor.mutable_data(make_ddim({1, 2, 3}), GPUPlace()); - EXPECT_NE(p1, nullptr); - // set src_tensor a new dim with large size - // momery is supposed to be re-allocated - p2 = src_tensor.mutable_data(make_ddim({3, 4}), GPUPlace()); - EXPECT_NE(p2, nullptr); - EXPECT_NE(p1, p2); - // set src_tensor a new dim with same size - // momery block is supposed to be unchanged - p1 = src_tensor.mutable_data(make_ddim({2, 2, 3}), GPUPlace()); - EXPECT_EQ(p1, p2); - // set src_tensor a new dim with smaller size - // momery block is supposed to be unchanged - p2 = src_tensor.mutable_data(make_ddim({2, 2}), GPUPlace()); - EXPECT_EQ(p1, p2); - } + /* + { + Tensor src_tensor; + float* p1 = nullptr; + float* p2 = nullptr; + // initialization + p1 = src_tensor.mutable_data(make_ddim({1, 2, 3}), GPUPlace()); + EXPECT_NE(p1, nullptr); + // set src_tensor a new dim with large size + // momery is supposed to be re-allocated + p2 = src_tensor.mutable_data(make_ddim({3, 4}), GPUPlace()); + EXPECT_NE(p2, nullptr); + EXPECT_NE(p1, p2); + // set src_tensor a new dim with same size + // momery block is supposed to be unchanged + p1 = src_tensor.mutable_data(make_ddim({2, 2, 3}), GPUPlace()); + EXPECT_EQ(p1, p2); + // set src_tensor a new dim with smaller size + // momery block is supposed to be unchanged + p2 = src_tensor.mutable_data(make_ddim({2, 2}), GPUPlace()); + EXPECT_EQ(p1, p2); + } + */ } TEST(Tensor, ShareDataFrom) { @@ -108,9 +109,11 @@ TEST(Tensor, ShareDataFrom) { dst_tensor.ShareDataFrom(src_tensor); } catch (EnforceNotMet err) { caught = true; - std::string msg = "Tenosr holds no memory. Call Tensor::mutable_data -first."; const char* what = err.what(); for (size_t i = 0; i < msg.length(); -++i) { ASSERT_EQ(what[i], msg[i]); + std::string msg = + "Tenosr holds no memory. 
Call Tensor::mutable_data first."; + const char* what = err.what(); + for (size_t i = 0; i < msg.length(); ++i) { + ASSERT_EQ(what[i], msg[i]); } } ASSERT_TRUE(caught); @@ -120,13 +123,15 @@ first."; const char* what = err.what(); for (size_t i = 0; i < msg.length(); ASSERT_EQ(src_tensor.data(), dst_tensor.data()); } - { - Tensor src_tensor; - Tensor dst_tensor; - src_tensor.mutable_data(make_ddim({2, 3, 4}), GPUPlace()); - dst_tensor.ShareDataFrom(src_tensor); - ASSERT_EQ(src_tensor.data(), dst_tensor.data()); - } + /* + { + Tensor src_tensor; + Tensor dst_tensor; + src_tensor.mutable_data(make_ddim({2, 3, 4}), GPUPlace()); + dst_tensor.ShareDataFrom(src_tensor); + ASSERT_EQ(src_tensor.data(), dst_tensor.data()); + } + */ } TEST(Tensor, Slice) { @@ -155,27 +160,29 @@ TEST(Tensor, Slice) { EXPECT_EQ(src_data_address + 3 * 4 * 1 * sizeof(int), slice_data_address); } - { - Tensor src_tensor; - src_tensor.mutable_data(make_ddim({6, 9}), GPUPlace()); - Tensor slice_tensor = src_tensor.Slice(2, 6); - DDim slice_dims = slice_tensor.dims(); - ASSERT_EQ(arity(slice_dims), 2); - EXPECT_EQ(slice_dims[0], 4); - EXPECT_EQ(slice_dims[1], 9); - - uintptr_t src_data_address = - reinterpret_cast(src_tensor.data()); - uintptr_t src_mutable_data_address = reinterpret_cast( - src_tensor.mutable_data(src_tensor.dims(), GPUPlace())); - uintptr_t slice_data_address = - reinterpret_cast(slice_tensor.data()); - uintptr_t slice_mutable_data_address = reinterpret_cast( - slice_tensor.mutable_data(slice_tensor.dims(), GPUPlace())); - EXPECT_EQ(src_data_address, src_mutable_data_address); - EXPECT_EQ(slice_data_address, slice_mutable_data_address); - EXPECT_EQ(src_data_address + 9 * 2 * sizeof(double), slice_data_address); - } + /* + { + Tensor src_tensor; + src_tensor.mutable_data(make_ddim({6, 9}), GPUPlace()); + Tensor slice_tensor = src_tensor.Slice(2, 6); + DDim slice_dims = slice_tensor.dims(); + ASSERT_EQ(arity(slice_dims), 2); + EXPECT_EQ(slice_dims[0], 4); + EXPECT_EQ(slice_dims[1], 9); + + uintptr_t src_data_address = + reinterpret_cast(src_tensor.data()); + uintptr_t src_mutable_data_address = reinterpret_cast( + src_tensor.mutable_data(src_tensor.dims(), GPUPlace())); + uintptr_t slice_data_address = + reinterpret_cast(slice_tensor.data()); + uintptr_t slice_mutable_data_address = reinterpret_cast( + slice_tensor.mutable_data(slice_tensor.dims(), GPUPlace())); + EXPECT_EQ(src_data_address, src_mutable_data_address); + EXPECT_EQ(slice_data_address, slice_mutable_data_address); + EXPECT_EQ(src_data_address + 9 * 2 * sizeof(double), slice_data_address); + } + */ } TEST(Tensor, CopyFrom) { @@ -202,5 +209,4 @@ TEST(Tensor, CopyFrom) { for (size_t i = 0; i < 3; ++i) { EXPECT_EQ(dst_ptr[i], slice_ptr[i]); } -} -*/ \ No newline at end of file +} \ No newline at end of file From 66cf21c880fba791910dc449dfc716d11c52803f Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Sat, 15 Jul 2017 07:16:11 +0000 Subject: [PATCH 156/205] fix compile error --- paddle/framework/CMakeLists.txt | 2 +- paddle/framework/tensor_test.cc | 14 +++++++------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt index f7f606e4b8..b8bfab5320 100644 --- a/paddle/framework/CMakeLists.txt +++ b/paddle/framework/CMakeLists.txt @@ -2,7 +2,7 @@ cc_library(ddim SRCS ddim.cc) cc_test(ddim_test SRCS ddim_test.cc DEPS ddim) nv_test(dim_test SRCS dim_test.cu DEPS ddim) -cc_test(tensor_test SRCS tensor_test.cc DEPS ddim paddle_memory) +cc_test(tensor_test SRCS tensor_test.cc DEPS 
ddim place paddle_memory) cc_test(variable_test SRCS variable_test.cc) cc_test(scope_test SRCS scope_test.cc) cc_test(enforce_test SRCS enforce_test.cc) diff --git a/paddle/framework/tensor_test.cc b/paddle/framework/tensor_test.cc index 79bd0cc607..30b1448a9b 100644 --- a/paddle/framework/tensor_test.cc +++ b/paddle/framework/tensor_test.cc @@ -72,7 +72,7 @@ TEST(Tensor, MutableData) { p2 = src_tensor.mutable_data(make_ddim({2, 2}), CPUPlace()); EXPECT_EQ(p1, p2); } - /* + #ifdef __CUDACC__ { Tensor src_tensor; float* p1 = nullptr; @@ -94,7 +94,7 @@ TEST(Tensor, MutableData) { p2 = src_tensor.mutable_data(make_ddim({2, 2}), GPUPlace()); EXPECT_EQ(p1, p2); } - */ + #endif } TEST(Tensor, ShareDataFrom) { @@ -123,7 +123,7 @@ TEST(Tensor, ShareDataFrom) { ASSERT_EQ(src_tensor.data(), dst_tensor.data()); } - /* + #ifdef __CUDACC__ { Tensor src_tensor; Tensor dst_tensor; @@ -131,7 +131,7 @@ TEST(Tensor, ShareDataFrom) { dst_tensor.ShareDataFrom(src_tensor); ASSERT_EQ(src_tensor.data(), dst_tensor.data()); } - */ + #endif } TEST(Tensor, Slice) { @@ -160,7 +160,7 @@ TEST(Tensor, Slice) { EXPECT_EQ(src_data_address + 3 * 4 * 1 * sizeof(int), slice_data_address); } - /* + #ifdef __CUDACC__ { Tensor src_tensor; src_tensor.mutable_data(make_ddim({6, 9}), GPUPlace()); @@ -182,7 +182,7 @@ TEST(Tensor, Slice) { EXPECT_EQ(slice_data_address, slice_mutable_data_address); EXPECT_EQ(src_data_address + 9 * 2 * sizeof(double), slice_data_address); } - */ + #endif } TEST(Tensor, CopyFrom) { @@ -209,4 +209,4 @@ TEST(Tensor, CopyFrom) { for (size_t i = 0; i < 3; ++i) { EXPECT_EQ(dst_ptr[i], slice_ptr[i]); } -} \ No newline at end of file +} From afa2a88d7896a03feb18b3cf6e6736c8ca79fcad Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Sat, 15 Jul 2017 15:25:06 +0800 Subject: [PATCH 157/205] add conditional compilation for tensor --- paddle/framework/tensor.h | 5 ++ paddle/framework/tensor_test.cc | 108 ++++++++++++++++---------------- 2 files changed, 59 insertions(+), 54 deletions(-) diff --git a/paddle/framework/tensor.h b/paddle/framework/tensor.h index 81db722c99..29bad7a00a 100644 --- a/paddle/framework/tensor.h +++ b/paddle/framework/tensor.h @@ -51,6 +51,7 @@ class Tensor { !(holder_->place() == place) /* some versions of boost::variant don't have operator!= */ || holder_->size() < numel_ * sizeof(T) + offset_) { +#ifdef __CUDACC__ switch (place.which()) { case 0: holder_.reset(new PlaceholderImpl( @@ -62,6 +63,10 @@ class Tensor { boost::get(place), numel_ * sizeof(T))); break; } +#else + holder_.reset(new PlaceholderImpl( + boost::get(place), numel_ * sizeof(T))); +#endif offset_ = 0; } diff --git a/paddle/framework/tensor_test.cc b/paddle/framework/tensor_test.cc index 30b1448a9b..84c6f0cf65 100644 --- a/paddle/framework/tensor_test.cc +++ b/paddle/framework/tensor_test.cc @@ -72,29 +72,29 @@ TEST(Tensor, MutableData) { p2 = src_tensor.mutable_data(make_ddim({2, 2}), CPUPlace()); EXPECT_EQ(p1, p2); } - #ifdef __CUDACC__ - { - Tensor src_tensor; - float* p1 = nullptr; - float* p2 = nullptr; - // initialization - p1 = src_tensor.mutable_data(make_ddim({1, 2, 3}), GPUPlace()); - EXPECT_NE(p1, nullptr); - // set src_tensor a new dim with large size - // momery is supposed to be re-allocated - p2 = src_tensor.mutable_data(make_ddim({3, 4}), GPUPlace()); - EXPECT_NE(p2, nullptr); - EXPECT_NE(p1, p2); - // set src_tensor a new dim with same size - // momery block is supposed to be unchanged - p1 = src_tensor.mutable_data(make_ddim({2, 2, 3}), GPUPlace()); - EXPECT_EQ(p1, p2); - // set src_tensor a new dim with 
smaller size - // momery block is supposed to be unchanged - p2 = src_tensor.mutable_data(make_ddim({2, 2}), GPUPlace()); - EXPECT_EQ(p1, p2); - } - #endif +#ifdef __CUDACC__ + { + Tensor src_tensor; + float* p1 = nullptr; + float* p2 = nullptr; + // initialization + p1 = src_tensor.mutable_data(make_ddim({1, 2, 3}), GPUPlace()); + EXPECT_NE(p1, nullptr); + // set src_tensor a new dim with large size + // momery is supposed to be re-allocated + p2 = src_tensor.mutable_data(make_ddim({3, 4}), GPUPlace()); + EXPECT_NE(p2, nullptr); + EXPECT_NE(p1, p2); + // set src_tensor a new dim with same size + // momery block is supposed to be unchanged + p1 = src_tensor.mutable_data(make_ddim({2, 2, 3}), GPUPlace()); + EXPECT_EQ(p1, p2); + // set src_tensor a new dim with smaller size + // momery block is supposed to be unchanged + p2 = src_tensor.mutable_data(make_ddim({2, 2}), GPUPlace()); + EXPECT_EQ(p1, p2); + } +#endif } TEST(Tensor, ShareDataFrom) { @@ -123,15 +123,15 @@ TEST(Tensor, ShareDataFrom) { ASSERT_EQ(src_tensor.data(), dst_tensor.data()); } - #ifdef __CUDACC__ - { - Tensor src_tensor; - Tensor dst_tensor; - src_tensor.mutable_data(make_ddim({2, 3, 4}), GPUPlace()); - dst_tensor.ShareDataFrom(src_tensor); - ASSERT_EQ(src_tensor.data(), dst_tensor.data()); - } - #endif +#ifdef __CUDACC__ + { + Tensor src_tensor; + Tensor dst_tensor; + src_tensor.mutable_data(make_ddim({2, 3, 4}), GPUPlace()); + dst_tensor.ShareDataFrom(src_tensor); + ASSERT_EQ(src_tensor.data(), dst_tensor.data()); + } +#endif } TEST(Tensor, Slice) { @@ -160,29 +160,29 @@ TEST(Tensor, Slice) { EXPECT_EQ(src_data_address + 3 * 4 * 1 * sizeof(int), slice_data_address); } - #ifdef __CUDACC__ - { - Tensor src_tensor; - src_tensor.mutable_data(make_ddim({6, 9}), GPUPlace()); - Tensor slice_tensor = src_tensor.Slice(2, 6); - DDim slice_dims = slice_tensor.dims(); - ASSERT_EQ(arity(slice_dims), 2); - EXPECT_EQ(slice_dims[0], 4); - EXPECT_EQ(slice_dims[1], 9); +#ifdef __CUDACC__ + { + Tensor src_tensor; + src_tensor.mutable_data(make_ddim({6, 9}), GPUPlace()); + Tensor slice_tensor = src_tensor.Slice(2, 6); + DDim slice_dims = slice_tensor.dims(); + ASSERT_EQ(arity(slice_dims), 2); + EXPECT_EQ(slice_dims[0], 4); + EXPECT_EQ(slice_dims[1], 9); - uintptr_t src_data_address = - reinterpret_cast(src_tensor.data()); - uintptr_t src_mutable_data_address = reinterpret_cast( - src_tensor.mutable_data(src_tensor.dims(), GPUPlace())); - uintptr_t slice_data_address = - reinterpret_cast(slice_tensor.data()); - uintptr_t slice_mutable_data_address = reinterpret_cast( - slice_tensor.mutable_data(slice_tensor.dims(), GPUPlace())); - EXPECT_EQ(src_data_address, src_mutable_data_address); - EXPECT_EQ(slice_data_address, slice_mutable_data_address); - EXPECT_EQ(src_data_address + 9 * 2 * sizeof(double), slice_data_address); - } - #endif + uintptr_t src_data_address = + reinterpret_cast(src_tensor.data()); + uintptr_t src_mutable_data_address = reinterpret_cast( + src_tensor.mutable_data(src_tensor.dims(), GPUPlace())); + uintptr_t slice_data_address = + reinterpret_cast(slice_tensor.data()); + uintptr_t slice_mutable_data_address = reinterpret_cast( + slice_tensor.mutable_data(slice_tensor.dims(), GPUPlace())); + EXPECT_EQ(src_data_address, src_mutable_data_address); + EXPECT_EQ(slice_data_address, slice_mutable_data_address); + EXPECT_EQ(src_data_address + 9 * 2 * sizeof(double), slice_data_address); + } +#endif } TEST(Tensor, CopyFrom) { From 9e0c6800c53701fc50dfb69a2c8b6de19c52c559 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Sat, 15 Jul 2017 
20:18:54 +0800 Subject: [PATCH 158/205] Python Generate OpCreation Methods by OpProto All op creation methods are generated by the `create_op_creation_methods::__bootstrap__` method and stored in the `op_creations` object as its attributes. There are three parts to implementing this feature. 1. Get all registered `OpProto`s from the C++ side. This is implemented in the `get_all_op_protos` method. 1. Create a function to convert `kwargs` to `OpDesc` based on each op's `OpProto`. This is the `OpDescCreationMethod` class. 1. Convert an `OpProto` to a `docstring` by the `get_docstring_from_op_proto` method. All three methods are unit tested. The `__bootstrap__` just combines them together and creates a method at runtime. For details, please refer to the docstring in `create_op_creation_methods.py` and the unit test `test_op_creation_methods.py`. --- paddle/framework/op_registry.h | 24 ++ paddle/framework/operator.cc | 28 +- paddle/framework/operator.h | 8 +- paddle/pybind/pybind.cc | 17 ++ .../framework/create_op_creation_methods.py | 235 +++++++++++++++++ .../tests/test_op_creation_methods.py | 243 +++++++++++++++++- python/paddle/v2/optimizer.py | 2 + 7 files changed, 539 insertions(+), 18 deletions(-) diff --git a/paddle/framework/op_registry.h b/paddle/framework/op_registry.h index de20e7af05..3d67541db2 100644 --- a/paddle/framework/op_registry.h +++ b/paddle/framework/op_registry.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include #include #include @@ -199,8 +200,12 @@ class OpRegistry { } static OperatorPtr CreateOp(const OpDesc& op_desc) { + //! Create a OpPtr by type. std::string op_type = op_desc.type(); OperatorPtr op(creators().at(op_type)()); + + //! Fill op's data member. Not use constructor because it will be noising + //! for Op developer. op->desc_ = op_desc; op->inputs_.reserve((size_t)op_desc.inputs_size()); std::copy(op_desc.inputs().begin(), op_desc.inputs().end(), std::back_inserter(op->inputs_)); op->outputs_.reserve((size_t)op_desc.outputs_size()); std::copy(op_desc.outputs().begin(), op_desc.outputs().end(), std::back_inserter(op->outputs_)); + + //! Fill attrs, and validate attrs. for (auto& attr : op_desc.attrs()) { op->attrs_[attr.name()] = AttrTypeHelper::GetAttrValue(attr); } op_checkers().at(op_type).Check(op->attrs_); + + //! Convert Temporary variable name to an unique variable name. AssignTempVariable(op.get()); + + //! Other op's custom Init for a complex Op. For simple Op, the Init //! method do nothing.
op->Init(); return op; } @@ -222,6 +235,17 @@ class OpRegistry { }; private: + static void AssignTempVariable(OperatorBase* op) { + static std::atomic gUniqId(0UL); + for (auto& outname : op->outputs_) { + if (outname == OperatorBase::TMP_VAR_NAME()) { + outname += op->Type(); + outname += "@"; + outname += std::to_string(gUniqId.fetch_add(1)); + } + } + } + static std::unordered_map& creators() { static std::unordered_map creators_; return creators_; diff --git a/paddle/framework/operator.cc b/paddle/framework/operator.cc index d065670829..a467d328e1 100644 --- a/paddle/framework/operator.cc +++ b/paddle/framework/operator.cc @@ -19,23 +19,21 @@ namespace framework { std::string OperatorBase::DebugString() const { std::stringstream ss; - ss << "=================\n"; - ss << "type = " << desc_.type() << "\n"; - ss << "inputs = ["; - for (auto& ipt : inputs_) { - ss << ipt << ", "; + ss << "Op(" << Type() << "), inputs:("; + for (size_t i = 0; i < inputs_.size(); ++i) { + ss << inputs_[i]; + if (i != inputs_.size() - 1) { + ss << ", "; + } } - ss << "]\n"; - ss << "outputs = ["; - for (auto& opt : outputs_) { - ss << opt << ", "; + ss << "), outputs:("; + for (size_t i = 0; i < outputs_.size(); ++i) { + ss << outputs_[i]; + if (i != outputs_.size() - 1) { + ss << ", "; + } } - ss << "]\n"; - ss << "attr_keys = ["; - for (auto& attr : attrs_) { - ss << attr.first << ", "; - } - ss << "]\n"; + ss << ")."; return ss.str(); } diff --git a/paddle/framework/operator.h b/paddle/framework/operator.h index cf79f379fa..cc166048b7 100644 --- a/paddle/framework/operator.h +++ b/paddle/framework/operator.h @@ -39,6 +39,13 @@ using OperatorPtr = std::shared_ptr; */ class OperatorBase { public: + /// If a variable is a empty variable, that name will be used. + static std::string EMPTY_VAR_NAME() { return "@EMPTY@"; } + + /// If a variable is a temporary variable, that name will be set in Python, + /// but it will be convert to a unique name in scope after OpCreator. + static std::string TMP_VAR_NAME() { return "@TEMP@"; } + virtual ~OperatorBase() {} template @@ -62,7 +69,6 @@ class OperatorBase { virtual void Run(const ScopePtr& scope, const platform::DeviceContext& dev_ctx) const = 0; - protected: std::string Type() const { return desc_.type(); } public: diff --git a/paddle/pybind/pybind.cc b/paddle/pybind/pybind.cc index c1a025ed04..b5ead21fd0 100644 --- a/paddle/pybind/pybind.cc +++ b/paddle/pybind/pybind.cc @@ -63,6 +63,23 @@ All parameter, weight, gradient are variables in Paddle. 
} return ret_values; }); + m.def_submodule( + "var_names", + "The module will return special predefined variable name in Paddle") + .def("empty", pd::OperatorBase::EMPTY_VAR_NAME) + .def("temp", pd::OperatorBase::TMP_VAR_NAME); + + py::class_(m, "Operator") + .def("__str__", &pd::OperatorBase::DebugString) + .def_static("create", [](const std::string& protobin) { + pd::OpDesc desc; + PADDLE_ENFORCE(desc.ParsePartialFromString(protobin), + "Cannot parse user input to OpDesc"); + PADDLE_ENFORCE(desc.IsInitialized(), + "User OpDesc is not initialized, reason %s", + desc.InitializationErrorString()); + return pd::OpRegistry::CreateOp(desc); + }); return m.ptr(); } diff --git a/python/paddle/v2/framework/create_op_creation_methods.py b/python/paddle/v2/framework/create_op_creation_methods.py index 2fcdfead25..c2a7ae7692 100644 --- a/python/paddle/v2/framework/create_op_creation_methods.py +++ b/python/paddle/v2/framework/create_op_creation_methods.py @@ -1,11 +1,246 @@ import paddle.v2.framework.core as core import paddle.v2.framework.proto.op_proto_pb2 as op_proto_pb2 +import paddle.v2.framework.proto.op_desc_pb2 as op_desc_pb2 +import paddle.v2.framework.proto.attr_type_pb2 as attr_type_pb2 +import cStringIO def get_all_op_protos(): + """ + Get all registered op proto from Paddle C++ + :return: list of OpProto + """ protostrs = core.get_all_op_protos() ret_values = [] for pbstr in protostrs: op_proto = op_proto_pb2.OpProto.FromString(str(pbstr)) ret_values.append(op_proto) return ret_values + + +class OpDescCreationMethod(object): + """ + A Functor object to convert user input(use key word args) to OpDesc based on + OpProto. + + :param op_proto: The OpProto object. + :type op_proto: op_proto_pb2.OpProto + """ + + def __init__(self, op_proto): + if not isinstance(op_proto, op_proto_pb2.OpProto): + raise TypeError("Argument should be OpProto") + self.__op_proto__ = op_proto + + def __call__(self, *args, **kwargs): + """ + Convert user input to OpDesc. Only key-word args are supported. 
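+
+        A minimal sketch of the intended call pattern (``some_op_proto`` and
+        the argument names here are illustrative placeholders mirroring the
+        cases in test_op_creation_methods.py, not part of this API):
+
+            # X and Y are inputs and Z is the output of the assumed proto.
+            method = OpDescCreationMethod(some_op_proto)
+            op_desc = method(X="a", Y="b", Z="c")
+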
+ :return: OpDesc based on user input + :rtype: op_desc_pb2.OpDesc + """ + if len(args) != 0: + raise ValueError("Only keyword arguments is supported by Paddle") + op_desc = op_desc_pb2.OpDesc() + + # Inputs + ipts, ipt_format, _ = OpDescCreationMethod.extract_input_or_output( + "input", kwargs, self.__op_proto__.inputs) + op_desc.inputs.extend(ipts) + if ipt_format is not None: + op_desc.attrs.extend([ipt_format]) + + # Outputs + outs, out_format, tmp_index = OpDescCreationMethod.extract_input_or_output( + "output", kwargs, self.__op_proto__.outputs) + op_desc.outputs.extend(outs) + if out_format is not None: + op_desc.attrs.extend([out_format]) + if len(tmp_index) != 0: + tmp_index_attr = op_desc.attrs.add() + tmp_index_attr.type = attr_type_pb2.INTS + tmp_index_attr.name = "temporary_index" + tmp_index_attr.ints.extend(tmp_index) + + # Types + op_desc.type = self.__op_proto__.type + + # Attrs + for attr in self.__op_proto__.attrs: + if attr.generated: + continue + user_defined_attr = kwargs.get(attr.name, None) + if user_defined_attr is not None: + new_attr = op_desc.attrs.add() + new_attr.name = attr.name + new_attr.type = attr.type + if attr.type == attr_type_pb2.INT: + new_attr.i = user_defined_attr + elif attr.type == attr_type_pb2.FLOAT: + new_attr.f = user_defined_attr + elif attr.type == attr_type_pb2.STRING: + new_attr.s = user_defined_attr + elif attr.type == attr_type_pb2.INTS: + new_attr.ints.extend(user_defined_attr) + elif attr.type == attr_type_pb2.FLOATS: + new_attr.floats.extend(user_defined_attr) + elif attr.type == attr_type_pb2.STRINGS: + new_attr.strings.extend(user_defined_attr) + else: + raise NotImplementedError("Not support attribute type " + + attr.type) + + return op_desc + + @staticmethod + def extract_input_or_output(in_out, kwargs, meta): + """ + Extract input variable names or output variable names from key-word + arguments, which base on VarProtos. + + :param in_out: "input" or "output" + :param kwargs: key-word arguments that user inputted. + :param meta: a list of VarProto + :return: The three object will be return. The variable names. The + input_format or output_format attribute(None if the input or output is + not multiple). The temporary variable index list. + """ + multiple = OpDescCreationMethod.any_is_true((m.multiple for m in meta)) + tmp_index = [] + retv = [] + if multiple: + var_format = op_desc_pb2.AttrDesc() + var_format.type = attr_type_pb2.INTS + var_format.name = "%s_format" % in_out + var_format.ints.append(0) + + for var in meta: + var_name = var.name + + if var.temporary: + var_name = [core.var_names.temp()] + tmp_index.append(len(retv)) + else: + var_name = kwargs.get(var_name, []) + if not isinstance(var_name, list): + var_name = [var_name] + retv.extend(var_name) + var_format.ints.append(len(var_name) + var_format.ints[-1]) + return retv, var_format, tmp_index + else: + for var in meta: + if var.temporary: + retv.append(kwargs.get(var.name, core.var_names.temp())) + tmp_index.append(len(retv)) + else: + retv.append(kwargs.get(var.name, core.var_names.empty())) + return retv, None, tmp_index + + @staticmethod + def any_is_true(generator): + """ + Reduce a bool array to one. If any of them is True, then return True. + """ + for flag in generator: + if flag: + return True + return False + + +def get_docstring_from_op_proto(op_proto): + """ + Generate docstring from a OpProto + :param op_proto: a OpProto instance. 
+ :type op_proto: op_proto_pb2.OpProto + :return: docstring + """ + if not isinstance(op_proto, op_proto_pb2.OpProto): + raise TypeError("Input must be OpProto") + f = cStringIO.StringIO() + f.write(op_proto.comment) + f.write("\n") + + def __append_param__(name, comment, type): + # Maybe replace the following line with template engine is better. + f.write(":param ") + f.write(name) + f.write(": ") + f.write(comment) + f.write("\n") + f.write(":type ") + f.write(name) + f.write(": ") + f.write(type) + f.write("\n") + + for ipt in op_proto.inputs: + __append_param__(ipt.name, ipt.comment, "list | basestr" + if ipt.multiple else "basestr") + + temp_var_prefix = \ + "This is a temporary variable. It does not have to set by user. " + for opt in op_proto.outputs: + __append_param__(opt.name, opt.comment if not opt.temporary else + temp_var_prefix + opt.comment, "list | basestr" + if opt.multiple else "basestr") + + for attr in op_proto.attrs: + attr_type = None + if attr.type == attr_type_pb2.INT: + attr_type = "int" + elif attr.type == attr_type_pb2.FLOAT: + attr_type = "float" + elif attr.type == attr_type_pb2.STRING: + attr_type = "basestr" + elif attr.type == attr_type_pb2.INTS: + attr_type = "list of int" + elif attr.type == attr_type_pb2.FLOATS: + attr_type = "list of float" + elif attr.type == attr_type_pb2.STRINGS: + attr_type = "list of basestr" + + if attr_type is None: + raise RuntimeError("Not supported attribute type " + attr.type) + + __append_param__(attr.name, attr.comment, attr_type) + + return f.getvalue() + + +def create_op_creation_method(op_proto): + """ + Generate op creation method for an OpProto + """ + method = OpDescCreationMethod(op_proto) + + def __impl__(*args, **kwargs): + opdesc = method(*args, **kwargs) + return core.Operator.create(opdesc.SerializeToString()) + + __impl__.__doc__ = get_docstring_from_op_proto(op_proto) + return __impl__ + + +class OpCreationsHolder(object): + """ + A object will holds all op creation methods. + + Use `op_creations.xxx_op` to access them. + """ + pass + + +op_creations = OpCreationsHolder() + + +def __bootstrap__(): + """ + Bootstrap function for this module. It will dynamic create all op creation + methods in runtime. 
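+
+    A hedged usage sketch (it assumes an op such as `add_two`, registered in
+    paddle/operators/add_op.cc, is available; the variable names are
+    illustrative only):
+
+        from paddle.v2.framework.create_op_creation_methods import op_creations
+        # Each generated method returns a C++ Operator wrapped by pybind.
+        op = op_creations.add_two(X="a", Y="b", Out="out")
+        print(op)  # Operator.__str__ is bound to OperatorBase::DebugString.
+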
diff --git a/python/paddle/v2/framework/tests/test_op_creation_methods.py b/python/paddle/v2/framework/tests/test_op_creation_methods.py
index b205e2cabb..41db7c0d53 100644
--- a/python/paddle/v2/framework/tests/test_op_creation_methods.py
+++ b/python/paddle/v2/framework/tests/test_op_creation_methods.py
@@ -1,9 +1,13 @@
 import unittest
 import paddle.v2.framework.create_op_creation_methods as creation
+import paddle.v2.framework.core as core
+import paddle.v2.framework.proto.op_proto_pb2 as op_proto_pb2
+import paddle.v2.framework.proto.op_desc_pb2 as op_desc_pb2
+import paddle.v2.framework.proto.attr_type_pb2 as attr_type_pb2
 
-class TestOpCreationsMethods(unittest.TestCase):
-    def test_all_protos(self):
+class TestGetAllProtos(unittest.TestCase):
+    def test_all(self):
         all_protos = creation.get_all_op_protos()
         self.assertNotEqual(0, len(all_protos))
@@ -11,5 +15,240 @@ class TestOpCreationsMethods(unittest.TestCase):
         self.assertTrue(each.IsInitialized())
 
 
+class TestOpDescCreationMethod(unittest.TestCase):
+    def test_plain_input_output(self):
+        op = op_proto_pb2.OpProto()
+        op.type = "test"
+        ipt = op.inputs.add()
+        ipt.name = "X"
+        ipt.comment = "not matter"
+
+        ipt = op.inputs.add()
+        ipt.name = "Y"
+        ipt.comment = "not matter"
+
+        opt = op.outputs.add()
+        opt.name = "Z"
+        opt.comment = "not matter"
+
+        op.comment = "not matter"
+
+        self.assertTrue(op.IsInitialized())
+
+        method = creation.OpDescCreationMethod(op)
+        output = method(X="a", Y="b", Z="c")
+
+        expected = op_desc_pb2.OpDesc()
+        expected.type = "test"
+        expected.inputs.extend(["a", "b"])
+        expected.outputs.append("c")
+        self.assertEqual(expected, output)
+
+    def test_multiple_input_plain_output(self):
+        op = op_proto_pb2.OpProto()
+        op.type = "fc"
+        ipt = op.inputs.add()
+        ipt.name = "X"
+        ipt.comment = ""
+        ipt.multiple = True
+
+        ipt = op.inputs.add()
+        ipt.name = "W"
+        ipt.comment = ""
+        ipt.multiple = True
+
+        ipt = op.inputs.add()
+        ipt.name = "b"
+        ipt.comment = ""
+
+        out = op.outputs.add()
+        out.name = "Y"
+        out.comment = ""
+
+        op.comment = ""
+        self.assertTrue(op.IsInitialized())
+        method = creation.OpDescCreationMethod(op)
+
+        generated1 = method(X="x", W="w", b="b", Y="y")
+        expected1 = op_desc_pb2.OpDesc()
+        expected1.inputs.extend(['x', 'w', 'b'])
+        expected1.outputs.extend(['y'])
+        expected1.type = 'fc'
+        attr = expected1.attrs.add()
+        attr.name = 'input_format'
+        attr.type = attr_type_pb2.INTS
+        attr.ints.extend([0, 1, 2, 3])
+        self.assertEqual(expected1, generated1)
+
+        generated2 = method(
+            X=['x1', 'x2', 'x3'], b='b', W=['w1', 'w2', 'w3'], Y='y')
+        expected2 = op_desc_pb2.OpDesc()
+        expected2.inputs.extend(['x1', 'x2', 'x3', 'w1', 'w2', 'w3', 'b'])
+        expected2.outputs.extend(['y'])
+        expected2.type = 'fc'
+        attr = expected2.attrs.add()
+        attr.name = 'input_format'
+        attr.type = attr_type_pb2.INTS
+        attr.ints.extend([0, 3, 6, 7])
+        self.assertEqual(expected2, generated2)
+
+    def test_attrs(self):
+        op = op_proto_pb2.OpProto()
+        op.type = "test"
+        ipt = op.inputs.add()
+        ipt.name = 'X'
+        ipt.comment = ""
+
+        def __add_attr__(name, type):
+            attr = op.attrs.add()
+            attr.name = name
+            attr.comment = ""
+            attr.type = type
+
+        __add_attr__("int_attr", attr_type_pb2.INT)
+        __add_attr__("float_attr", attr_type_pb2.FLOAT)
+        __add_attr__("string_attr", attr_type_pb2.STRING)
+        __add_attr__("ints_attr", attr_type_pb2.INTS)
+
__add_attr__("floats_attr", attr_type_pb2.FLOATS) + __add_attr__("strings_attr", attr_type_pb2.STRINGS) + + op.comment = "" + self.assertTrue(op.IsInitialized()) + + method = creation.OpDescCreationMethod(op) + + generated = method( + X="a", + int_attr=10, + float_attr=3.2, + string_attr="test_str", + ints_attr=[0, 1, 2, 3, 4], + floats_attr=[0.2, 3.2, 4.5], + strings_attr=["a", "b", "c"]) + + expected = op_desc_pb2.OpDesc() + expected.type = "test" + expected.inputs.extend(['a']) + attr = expected.attrs.add() + attr.name = "int_attr" + attr.type = attr_type_pb2.INT + attr.i = 10 + + attr = expected.attrs.add() + attr.name = "float_attr" + attr.type = attr_type_pb2.FLOAT + attr.f = 3.2 + + attr = expected.attrs.add() + attr.name = "string_attr" + attr.type = attr_type_pb2.STRING + attr.s = "test_str" + + attr = expected.attrs.add() + attr.name = "ints_attr" + attr.type = attr_type_pb2.INTS + attr.ints.extend([0, 1, 2, 3, 4]) + + attr = expected.attrs.add() + attr.name = 'floats_attr' + attr.type = attr_type_pb2.FLOATS + attr.floats.extend([0.2, 3.2, 4.5]) + + attr = expected.attrs.add() + attr.name = 'strings_attr' + attr.type = attr_type_pb2.STRINGS + attr.strings.extend(['a', 'b', 'c']) + + self.assertEqual(expected, generated) + + def test_input_temporary_output(self): + op = op_proto_pb2.OpProto() + op.type = "test" + out = op.outputs.add() + out.name = "OUT" + out.comment = "" + + out = op.outputs.add() + out.name = "TMP" + out.comment = "" + out.temporary = True + + out = op.outputs.add() + out.name = "OUT2" + out.comment = "" + op.comment = "" + + method = creation.OpDescCreationMethod(op) + generated = method(OUT="a", OUT2="b") + desc = op_desc_pb2.OpDesc() + desc.outputs.extend(["a", core.var_names.temp(), "b"]) + desc.type = "test" + attr = desc.attrs.add() + attr.name = "temporary_index" + attr.type = attr_type_pb2.INTS + attr.ints.append(2) + self.assertEqual(generated, desc) + + +class TestOpCreationDocStr(unittest.TestCase): + def test_all(self): + op = op_proto_pb2.OpProto() + op.type = "test" + op.comment = """Test Op. + +This op is used for unit test, not a real op. +""" + a = op.inputs.add() + a.name = "a" + a.comment = "Input a for test op" + a.multiple = True + + b = op.inputs.add() + b.name = "b" + b.comment = "Input b for test op" + self.assertTrue(op.IsInitialized()) + + o1 = op.outputs.add() + o1.name = "output" + o1.comment = "The output of test op" + + o2 = op.outputs.add() + o2.name = "temp output" + o2.comment = "The temporary output of test op" + o2.temporary = True + + test_str = op.attrs.add() + test_str.name = "str_attr" + test_str.type = attr_type_pb2.STRING + test_str.comment = "A string attribute for test op" + + actual = creation.get_docstring_from_op_proto(op) + expected_docstring = '''Test Op. + +This op is used for unit test, not a real op. + +:param a: Input a for test op +:type a: list | basestr +:param b: Input b for test op +:type b: basestr +:param output: The output of test op +:type output: basestr +:param temp output: This is a temporary variable. It does not have to set by user. 
The temporary output of test op
+:type temp output: basestr
+:param str_attr: A string attribute for test op
+:type str_attr: basestr
+'''
+        self.assertEqual(expected_docstring, actual)
+
+
+class TestOpCreations(unittest.TestCase):
+    def test_all(self):
+        add_op = creation.op_creations.add_two(X="a", Y="b", Out="z")
+        self.assertIsNotNone(add_op)
+        # Invoke C++ DebugString()
+        self.assertEqual('Op(add_two), inputs:(a, b), outputs:(z).',
+                         str(add_op))
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/v2/optimizer.py b/python/paddle/v2/optimizer.py
index b6ee51cfe8..173a30a411 100644
--- a/python/paddle/v2/optimizer.py
+++ b/python/paddle/v2/optimizer.py
@@ -25,6 +25,8 @@ class Optimizer(object):
         self.__opt_conf_proto__ = config_parser_utils.parse_optimizer_config(
             __impl__)
+        if swig_api is None:
+            raise RuntimeError("paddle.v2 currently needs swig_paddle")
         self.__opt_conf__ = swig_api.OptimizationConfig.createFromProto(
             self.__opt_conf_proto__)
 
From d3a749a5bfb32c61b9faa24424d36bb0fa471edb Mon Sep 17 00:00:00 2001
From: Yu Yang
Date: Sun, 16 Jul 2017 11:13:39 +0800
Subject: [PATCH 159/205] CMake `op_library` function

* It is used to create an operator library. It handles splitting the CPU and
  GPU sources and linking the common operator libraries.
* It also gives a reasonable warning or error when an operator developer does
  not implement an operator correctly.
  * Warning for lack of a GPU kernel.
  * Same interface as `cc_library`, to keep the code style consistent.
---
 paddle/operators/CMakeLists.txt | 48 +++++++++++++++++++++++++++++----
 1 file changed, 43 insertions(+), 5 deletions(-)

diff --git a/paddle/operators/CMakeLists.txt b/paddle/operators/CMakeLists.txt
index 40bb326512..b2ea8eb344 100644
--- a/paddle/operators/CMakeLists.txt
+++ b/paddle/operators/CMakeLists.txt
@@ -1,6 +1,44 @@
-if(WITH_GPU)
-    nv_library(add_op SRCS add_op.cc add_op.cu DEPS operator op_registry glog ddim)
-else()
-    cc_library(add_op SRCS add_op.cc DEPS operator op_registry glog ddim)
-endif()
+function(op_library TARGET)
+    # op_library is a function to create an op library. The interface is the
+    # same as cc_library, but it handles splitting GPU/CPU code and links the
+    # common libraries for ops.
+    set(cc_srcs)
+    set(cu_srcs)
+    set(op_common_deps operator op_registry glog ddim)
+    set(options "")
+    set(oneValueArgs "")
+    set(multiValueArgs SRCS DEPS)
+    cmake_parse_arguments(op_library "${options}" "${oneValueArgs}"
+            "${multiValueArgs}" ${ARGN})
+
+    foreach(src ${op_library_SRCS})
+        if (${src} MATCHES ".*\\.cu$")
+            list(APPEND cu_srcs ${src})
+        elseif(${src} MATCHES ".*\\.cc$")
+            list(APPEND cc_srcs ${src})
+        else()
+            message(FATAL_ERROR "${TARGET} Source file ${src} should only be .cc or .cu")
+        endif()
+    endforeach()
+
+    list(LENGTH cc_srcs cc_srcs_len)
+    if (${cc_srcs_len} EQUAL 0)
+        message(FATAL_ERROR "The op library ${TARGET} should contain at least one .cc file")
+    endif()
+
+    list(LENGTH cu_srcs cu_srcs_len)
+    if (${cu_srcs_len} EQUAL 0)
+        message(WARNING "The op library ${TARGET} does not support GPU!")
+    endif()
+
+    if (WITH_GPU)
+        nv_library(${TARGET} SRCS ${cc_srcs} ${cu_srcs} DEPS ${op_library_DEPS}
+                ${op_common_deps})
+    else()
+        cc_library(${TARGET} SRCS ${cc_srcs} DEPS ${op_library_DEPS}
+                ${op_common_deps})
+    endif()
+endfunction()
+
+op_library(add_op SRCS add_op.cc add_op.cu)
 cc_test(add_op_test SRCS add_op_test.cc DEPS add_op)
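The classification rule that op_library applies to its SRCS list is simple enough to restate outside CMake. Here is a standalone Python sketch of that logic (editorial, not part of the patch; the function name is hypothetical), useful as a reference for what the function accepts and rejects.

    def split_op_sources(srcs):
        # Mirror of op_library's source handling: .cc files build the CPU
        # library, .cu files add the GPU kernels, anything else is an error.
        cc_srcs = [s for s in srcs if s.endswith(".cc")]
        cu_srcs = [s for s in srcs if s.endswith(".cu")]
        unknown = [s for s in srcs
                   if not (s.endswith(".cc") or s.endswith(".cu"))]
        if unknown:
            raise ValueError("source files should only be .cc or .cu: %r"
                             % unknown)
        if not cc_srcs:
            raise ValueError("an op library needs at least one .cc file")
        if not cu_srcs:
            print("warning: no .cu file, the op library will not support GPU")
        return cc_srcs, cu_srcs

    assert split_op_sources(["add_op.cc", "add_op.cu"]) == \
        (["add_op.cc"], ["add_op.cu"])
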
From afa99d9ae6f96fff62e46e57d8a110121c1e9c6d Mon Sep 17 00:00:00 2001
From: Qiao Longfei
Date: Sun, 16 Jul 2017 12:53:03 +0800
Subject: [PATCH 160/205] add ADD_OP_CPU to enable add op with only cpu kernel
 (#2896)

* add ADD_OP_CPU to enable add op with only cpu kernel
---
 paddle/framework/op_registry.h | 32 +++++++++++++++++++++++---------
 1 file changed, 23 insertions(+), 9 deletions(-)

diff --git a/paddle/framework/op_registry.h b/paddle/framework/op_registry.h
index de20e7af05..19cb4c7b3e 100644
--- a/paddle/framework/op_registry.h
+++ b/paddle/framework/op_registry.h
@@ -241,12 +241,18 @@ class OpRegisterHelper {
   }
 };
 
+/**
+ * Check if the MACRO is used in the GLOBAL NAMESPACE.
+ */
 #define STATIC_ASSERT_GLOBAL_NAMESPACE(uniq_name, msg)                        \
   struct __test_global_namespace_##uniq_name##__ {};                          \
   static_assert(std::is_same<::__test_global_namespace_##uniq_name##__,       \
                              __test_global_namespace_##uniq_name##__>::value, \
                 msg)
 
+/**
+ * Macro to Register an Operator.
+ */
 #define REGISTER_OP(__op_type, __op_class, __op_maker_class)                 \
   STATIC_ASSERT_GLOBAL_NAMESPACE(__reg_op__##__op_type,                      \
                                  "REGISTER_OP must be in global namespace"); \
   static ::paddle::framework::OpRegisterHelper<__op_class, __op_maker_class> \
       __op_register_##__op_type##__(#__op_type);                             \
   int __op_register_##__op_type##_handle__() { return 0; }
 
-#define REGISTER_OP_KERNEL(type, GPU_OR_CPU, PlaceType, KernelType)          \
+/**
+ * Macro to Register an OperatorKernel.
+ */
+#define REGISTER_OP_KERNEL(type, DEVICE_TYPE, PlaceType, KernelType)        \
   STATIC_ASSERT_GLOBAL_NAMESPACE(                                           \
-      __reg_op_kernel_##type##_##GPU_OR_CPU##__,                            \
+      __reg_op_kernel_##type##_##DEVICE_TYPE##__,                           \
       "REGISTER_OP_KERNEL must be in global namespace");                    \
   struct __op_kernel_register__##type##__ {                                 \
     __op_kernel_register__##type##__() {                                    \
@@ -267,7 +276,7 @@ class OpRegisterHelper {
     }                                                                       \
   };                                                                        \
   static __op_kernel_register__##type##__ __reg_kernel_##type##__;          \
-  int __op_kernel_register_##type##_handle_##GPU_OR_CPU##__() { return 0; }
+  int __op_kernel_register_##type##_handle_##DEVICE_TYPE##__() { return 0; }
 
 #define REGISTER_OP_GPU_KERNEL(type, KernelType) \
   REGISTER_OP_KERNEL(type, GPU, ::paddle::platform::GPUPlace, KernelType)
 
 #define REGISTER_OP_CPU_KERNEL(type, KernelType) \
   REGISTER_OP_KERNEL(type, CPU, ::paddle::platform::CPUPlace, KernelType)
 
+/**
+ * Macro to mark which Operator and Kernel we will use, and to tell the
+ * compiler to link them into the target.
+ */
 #define USE_OP_WITHOUT_KERNEL(op_type)     \
   STATIC_ASSERT_GLOBAL_NAMESPACE(          \
       __use_op_without_kernel_##op_type,   \
@@ -292,15 +305,16 @@ class OpRegisterHelper {
       __attribute__((unused)) =            \
       __op_kernel_register_##op_type##_handle_##DEVICE_TYPE##__()
 
-#ifdef PADDLE_ONLY_CPU
-#define USE_OP(op_type)           \
+// Use an Operator that has only a CPU kernel.
+#define USE_OP_CPU(op_type)       \
   USE_OP_WITHOUT_KERNEL(op_type); \
-  USE_OP_KERNEL(op_type, CPU);
+  USE_OP_KERNEL(op_type, CPU)
 
+#ifdef PADDLE_ONLY_CPU
+#define USE_OP(op_type) USE_OP_CPU(op_type)
 #else
-#define USE_OP(op_type)           \
-  USE_OP_WITHOUT_KERNEL(op_type); \
-  USE_OP_KERNEL(op_type, CPU);    \
+#define USE_OP(op_type) \
+  USE_OP_CPU(op_type);  \
   USE_OP_KERNEL(op_type, GPU)
 #endif
 
From 85c4f488f32d5ca0dfc420f4f7b29fb5a969d946 Mon Sep 17 00:00:00 2001
From: fengjiayi
Date: Sun, 16 Jul 2017 15:17:31 +0800
Subject: [PATCH 161/205] Refactor DDim's product() and add slice_ddim()

1. Refactor DDim's product() to make it more efficient.
2. Add slice_ddim().
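For reference, slice_ddim follows half-open [begin, end) semantics, like Python list slicing. A standalone Python sketch of the intended behavior (editorial, not part of this patch; the real implementation is the C++ visitor below):

    def slice_ddim(dims, begin, end):
        # Keep the dimensions in the half-open range [begin, end),
        # enforcing the same bounds as the C++ SliceVectorizeVisitor.
        assert 0 <= begin < end <= len(dims), "index error in ddim slice"
        return dims[begin:end]

    # Matches the expectations added to ddim_test.cc below.
    assert slice_ddim([1, 2, 3, 4, 5, 6], 2, 5) == [3, 4, 5]
    assert slice_ddim([1, 2, 3, 4, 5, 6], 0, 6) == [1, 2, 3, 4, 5, 6]
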
--- paddle/framework/ddim.cc | 41 +++++++++ paddle/framework/ddim.h | 2 + paddle/framework/ddim_test.cc | 17 ++++ paddle/framework/dim.h | 15 ---- paddle/framework/dim_test.cu | 163 +++++++++++++++++----------------- 5 files changed, 142 insertions(+), 96 deletions(-) diff --git a/paddle/framework/ddim.cc b/paddle/framework/ddim.cc index a1ae079f4a..c898b6e322 100644 --- a/paddle/framework/ddim.cc +++ b/paddle/framework/ddim.cc @@ -1,4 +1,5 @@ #include "paddle/framework/ddim.h" +#include "paddle/framework/enforce.h" namespace paddle { namespace framework { @@ -190,6 +191,46 @@ ssize_t product(const DDim& ddim) { return boost::apply_visitor(visitor, ddim); } +struct SliceVectorizeVisitor : public boost::static_visitor<> { + std::vector& vector; + int begin; + int end; + + SliceVectorizeVisitor(std::vector& v, int b, int e) + : vector(v), begin(b), end(e) { + PADDLE_ENFORCE(begin < end, + "Begin index must be less than end index in ddim slice."); + PADDLE_ENFORCE(begin >= 0, + "Begin index can't be less than zero in ddim slice."); + } + + template + void operator()(const Dim& dim) { + if (begin == 0) { + vector.push_back(dim.head); + } else { + --begin; + } + --end; + if (end > 0) { + this->operator()(dim.tail); + } + } + + void operator()(const Dim<1>& dim) { + PADDLE_ENFORCE(end == 1, "End index in ddim slice is out of bound."); + vector.push_back(dim.head); + } +}; + +DDim slice_ddim(const DDim& dim, int begin, int end) { + std::vector vec; + vec.reserve(end - begin); + SliceVectorizeVisitor visitor(vec, begin, end); + boost::apply_visitor(visitor, dim); + return make_ddim(vec); +} + ///\cond HIDDEN struct ArityVisitor : boost::static_visitor { diff --git a/paddle/framework/ddim.h b/paddle/framework/ddim.h index 223c4180be..675f8680f6 100644 --- a/paddle/framework/ddim.h +++ b/paddle/framework/ddim.h @@ -81,6 +81,8 @@ std::vector vectorize(const DDim& ddim); ssize_t product(const DDim& ddim); +DDim slice_ddim(const DDim& dim, int begin, int end); + /** * \brief What is the length of this dimension? 
* diff --git a/paddle/framework/ddim_test.cc b/paddle/framework/ddim_test.cc index 8ce7886f8a..408905b00b 100644 --- a/paddle/framework/ddim_test.cc +++ b/paddle/framework/ddim_test.cc @@ -55,6 +55,23 @@ TEST(DDim, Equality) { EXPECT_EQ( paddle::framework::product(paddle::framework::make_ddim({3, 2, 5, 3})), 90); + + // slice a DDim + paddle::framework::DDim ddim2 = + paddle::framework::make_ddim({1, 2, 3, 4, 5, 6}); + paddle::framework ::DDim ss = paddle::framework::slice_ddim(ddim2, 2, 5); + EXPECT_EQ(arity(ss), 3); + EXPECT_EQ(ss[0], 3); + EXPECT_EQ(ss[1], 4); + EXPECT_EQ(ss[2], 5); + paddle::framework ::DDim ss2 = paddle::framework::slice_ddim(ddim2, 0, 6); + EXPECT_EQ(arity(ss2), 6); + EXPECT_EQ(ss2[0], 1); + EXPECT_EQ(ss2[1], 2); + EXPECT_EQ(ss2[2], 3); + EXPECT_EQ(ss2[3], 4); + EXPECT_EQ(ss2[4], 5); + EXPECT_EQ(ss2[5], 6); } TEST(DDim, Print) { diff --git a/paddle/framework/dim.h b/paddle/framework/dim.h index 8dc1bab06d..883fdc55eb 100644 --- a/paddle/framework/dim.h +++ b/paddle/framework/dim.h @@ -401,20 +401,5 @@ HOSTDEVICE Dim linear_to_dimension(int linear_index, Dim extents) { return result; } -template -Dim slice(const Dim& dim, int begin, int end) { - PADDLE_ENFORCE(begin < end, - "Begin index must be less than end index in Dim slice."); - PADDLE_ENFORCE(begin >= 0 && end <= S && end - begin == D, - "Index error occurs in Dim slice."); - if (begin > 0) { - return slice(dim.tail, begin - 1, end - 1); - } - if (D == 1) { - return Dim<1>(dim.head); - } - return Dim(dim.head, slice(dim.tail, 0, end - 1)); -} - } // namespace framework } // namespace paddle diff --git a/paddle/framework/dim_test.cu b/paddle/framework/dim_test.cu index 0521741519..3898d0a447 100644 --- a/paddle/framework/dim_test.cu +++ b/paddle/framework/dim_test.cu @@ -1,100 +1,101 @@ #include #include -#include "paddle/framework/dim.h" #include "gtest/gtest.h" +#include "paddle/framework/dim.h" __global__ void test(paddle::framework::Dim<2>* o) { - o[0] = paddle::framework::make_dim(5, 6); + o[0] = paddle::framework::make_dim(5, 6); } __global__ void dyn_idx_gpu(int* o) { - auto d = paddle::framework::make_dim(5, 6); - o[0] = d[1]; + auto d = paddle::framework::make_dim(5, 6); + o[0] = d[1]; } TEST(Dim, Equality) { - // construct a Dim on the CPU - auto a = paddle::framework::make_dim(3, 4); - EXPECT_EQ(paddle::framework::get<0>(a), 3); - EXPECT_EQ(paddle::framework::get<1>(a), 4); - - // construct a Dim on the GPU - thrust::device_vector> t(2); - test<<<1,1>>>(thrust::raw_pointer_cast(t.data())); - a = t[0]; - EXPECT_EQ(paddle::framework::get<0>(a), 5); - EXPECT_EQ(paddle::framework::get<1>(a), 6); - - // linearization - auto b = paddle::framework::make_dim(7, 8); - EXPECT_EQ(paddle::framework::linearize(a, b), 83); - - // product - EXPECT_EQ(paddle::framework::product(a), 30); - - // mutate a Dim - paddle::framework::get<1>(b) = 10; - EXPECT_EQ(paddle::framework::get<0>(b), 7); - EXPECT_EQ(paddle::framework::get<1>(b), 10); - - // dynamic access - paddle::framework::get(b, 0) = 8; - b[1] = 11; - EXPECT_EQ(paddle::framework::get<0>(b), 8); - EXPECT_EQ(paddle::framework::get<1>(b), 11); - EXPECT_EQ(paddle::framework::get(b, 0), 8); - EXPECT_EQ(b[1], 11); - - // dynamic access on GPU - thrust::device_vector r(1); - dyn_idx_gpu<<<1,1>>>(thrust::raw_pointer_cast(r.data())); - int res = r[0]; - EXPECT_EQ(res, 6); - - // ex_prefix_mul - paddle::framework::Dim<3> c = paddle::framework::ex_prefix_mul(paddle::framework::Dim<3>(3, 4, 5)); - EXPECT_EQ(paddle::framework::get<0>(c), 1); - 
EXPECT_EQ(paddle::framework::get<1>(c), 3); - EXPECT_EQ(paddle::framework::get<2>(c), 12); - - // generate from an index - auto size = paddle::framework::make_dim(4, 5, 2); - c = paddle::framework::Dim<3>(14, size); - EXPECT_EQ(paddle::framework::get<0>(c), 2); - EXPECT_EQ(paddle::framework::get<1>(c), 3); - EXPECT_EQ(paddle::framework::get<2>(c), 0); - c = paddle::framework::Dim<3>(25, size); - EXPECT_EQ(paddle::framework::get<0>(c), 1); - EXPECT_EQ(paddle::framework::get<1>(c), 1); - EXPECT_EQ(paddle::framework::get<2>(c), 1); + // construct a Dim on the CPU + auto a = paddle::framework::make_dim(3, 4); + EXPECT_EQ(paddle::framework::get<0>(a), 3); + EXPECT_EQ(paddle::framework::get<1>(a), 4); + + // construct a Dim on the GPU + thrust::device_vector> t(2); + test<<<1, 1>>>(thrust::raw_pointer_cast(t.data())); + a = t[0]; + EXPECT_EQ(paddle::framework::get<0>(a), 5); + EXPECT_EQ(paddle::framework::get<1>(a), 6); + + // linearization + auto b = paddle::framework::make_dim(7, 8); + EXPECT_EQ(paddle::framework::linearize(a, b), 83); + + // product + EXPECT_EQ(paddle::framework::product(a), 30); + + // mutate a Dim + paddle::framework::get<1>(b) = 10; + EXPECT_EQ(paddle::framework::get<0>(b), 7); + EXPECT_EQ(paddle::framework::get<1>(b), 10); + + // dynamic access + paddle::framework::get(b, 0) = 8; + b[1] = 11; + EXPECT_EQ(paddle::framework::get<0>(b), 8); + EXPECT_EQ(paddle::framework::get<1>(b), 11); + EXPECT_EQ(paddle::framework::get(b, 0), 8); + EXPECT_EQ(b[1], 11); + + // dynamic access on GPU + thrust::device_vector r(1); + dyn_idx_gpu<<<1, 1>>>(thrust::raw_pointer_cast(r.data())); + int res = r[0]; + EXPECT_EQ(res, 6); + + // ex_prefix_mul + paddle::framework::Dim<3> c = + paddle::framework::ex_prefix_mul(paddle::framework::Dim<3>(3, 4, 5)); + EXPECT_EQ(paddle::framework::get<0>(c), 1); + EXPECT_EQ(paddle::framework::get<1>(c), 3); + EXPECT_EQ(paddle::framework::get<2>(c), 12); + + // generate from an index + auto size = paddle::framework::make_dim(4, 5, 2); + c = paddle::framework::Dim<3>(14, size); + EXPECT_EQ(paddle::framework::get<0>(c), 2); + EXPECT_EQ(paddle::framework::get<1>(c), 3); + EXPECT_EQ(paddle::framework::get<2>(c), 0); + c = paddle::framework::Dim<3>(25, size); + EXPECT_EQ(paddle::framework::get<0>(c), 1); + EXPECT_EQ(paddle::framework::get<1>(c), 1); + EXPECT_EQ(paddle::framework::get<2>(c), 1); } TEST(Dim, Bool) { - auto a = paddle::framework::make_dim(3, 4); - auto b = paddle::framework::make_dim(5, 6); - auto c = paddle::framework::make_dim(3, 4); - - // in_bounds check - EXPECT_TRUE(paddle::framework::contained(a, b)); - EXPECT_FALSE(paddle::framework::contained(b, a)); - - // comparison - EXPECT_TRUE(a == a); - EXPECT_FALSE(a == b); - EXPECT_TRUE(a == c); + auto a = paddle::framework::make_dim(3, 4); + auto b = paddle::framework::make_dim(5, 6); + auto c = paddle::framework::make_dim(3, 4); + + // in_bounds check + EXPECT_TRUE(paddle::framework::contained(a, b)); + EXPECT_FALSE(paddle::framework::contained(b, a)); + + // comparison + EXPECT_TRUE(a == a); + EXPECT_FALSE(a == b); + EXPECT_TRUE(a == c); } TEST(Dim, Print) { - { - std::stringstream ss; - auto a = paddle::framework::make_dim(2, 3); - ss << a; - EXPECT_EQ(ss.str(), "2, 3"); - } - { - std::stringstream ss; - ss << paddle::framework::make_dim(8); - EXPECT_EQ(ss.str(), "8"); - } + { + std::stringstream ss; + auto a = paddle::framework::make_dim(2, 3); + ss << a; + EXPECT_EQ(ss.str(), "2, 3"); + } + { + std::stringstream ss; + ss << paddle::framework::make_dim(8); + EXPECT_EQ(ss.str(), "8"); + } } From 
7de10ee7e2c9febbe81b22ad83e3e4584ba4a988 Mon Sep 17 00:00:00 2001
From: fengjiayi
Date: Sun, 16 Jul 2017 16:35:44 +0800
Subject: [PATCH 162/205] format code

---
 paddle/framework/ddim_test.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/paddle/framework/ddim_test.cc b/paddle/framework/ddim_test.cc
index 408905b00b..6a099f2aeb 100644
--- a/paddle/framework/ddim_test.cc
+++ b/paddle/framework/ddim_test.cc
@@ -59,12 +59,12 @@ TEST(DDim, Equality) {
   // slice a DDim
   paddle::framework::DDim ddim2 =
       paddle::framework::make_ddim({1, 2, 3, 4, 5, 6});
-  paddle::framework ::DDim ss = paddle::framework::slice_ddim(ddim2, 2, 5);
+  paddle::framework::DDim ss = paddle::framework::slice_ddim(ddim2, 2, 5);
   EXPECT_EQ(arity(ss), 3);
   EXPECT_EQ(ss[0], 3);
   EXPECT_EQ(ss[1], 4);
   EXPECT_EQ(ss[2], 5);
-  paddle::framework ::DDim ss2 = paddle::framework::slice_ddim(ddim2, 0, 6);
+  paddle::framework::DDim ss2 = paddle::framework::slice_ddim(ddim2, 0, 6);
   EXPECT_EQ(arity(ss2), 6);
   EXPECT_EQ(ss2[0], 1);
   EXPECT_EQ(ss2[1], 2);

From 45ce1649a13a730931bc911576caad2f61afb715 Mon Sep 17 00:00:00 2001
From: Qiao Longfei
Date: Sun, 16 Jul 2017 18:08:55 +0800
Subject: [PATCH 163/205] change net to operator (#2846)

* OperatorBase should not store an OpDesc, because not all ops contain an
  OpDesc and not all ops are created from an OpDesc.
* Networks do not contain an OpDesc and are not created from an OpDesc.
* Do not register Network to OpRegistry.
  * The network is directly created by the user in Python, not from the
    registry.
* Correctly handle the `inputs` and `outputs` of a Network.
  * Add a CompleteAddOp() method.
* Remove `AddOp(OpDesc&)` in net-op. All ops are added as OperatorPtr.
* Rewrite the unit test to truly test what networks do.
* Optimize operator_test.
---
 paddle/framework/CMakeLists.txt   |   5 +-
 paddle/framework/net.cc           |  41 ++++++--
 paddle/framework/net.h            | 163 +++++++++---------------------
 paddle/framework/net_op_test.cc   |  67 ++++++++++++
 paddle/framework/op_registry.h    |   2 +-
 paddle/framework/operator.cc      |   2 +-
 paddle/framework/operator.h       |   7 +-
 paddle/framework/operator_test.cc |  46 +++++----
 8 files changed, 179 insertions(+), 154 deletions(-)
 create mode 100644 paddle/framework/net_op_test.cc

diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt
index 8415ce67e9..cc5b05ff0d 100644
--- a/paddle/framework/CMakeLists.txt
+++ b/paddle/framework/CMakeLists.txt
@@ -11,8 +11,10 @@ proto_library(op_proto SRCS op_proto.proto DEPS attr_type)
 cc_test(op_proto_test SRCS op_proto_test.cc DEPS op_proto protobuf)
 proto_library(op_desc SRCS op_desc.proto DEPS attr_type)
 cc_test(op_desc_test SRCS op_desc_test.cc DEPS op_desc protobuf)
+
 cc_library(operator SRCS operator.cc DEPS op_desc device_context)
 cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry)
+
 cc_library(op_registry SRCS op_registry.cc DEPS op_proto op_desc)
 cc_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry operator)
 py_proto_compile(framework_py_proto SRCS attr_type.proto op_proto.proto op_desc.proto)
@@ -21,4 +23,5 @@ add_custom_target(framework_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch
 add_dependencies(framework_py_proto framework_py_proto_init)
 
 proto_library(net_proto SRCS net_proto.proto DEPS op_proto)
-cc_library(net SRCS net.cc DEPS net_proto)
+cc_library(net SRCS net.cc DEPS operator net_proto op_registry)
+cc_test(net_op_test SRCS net_op_test.cc DEPS net)
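The interesting piece of this patch is the input/output inference that PlainNet::CompleteAddOp() performs (see net.cc below): a name is a net input only if no earlier op produced it, and an op output consumed inside the net is recorded in the "temporary_index" attribute. A standalone Python sketch of that bookkeeping (editorial, not part of the patch):

    def complete_add_op(ops):
        # ops is a list of (inputs, outputs) name lists, in execution order.
        produced, net_inputs, temp = set(), [], set()
        for ins, outs in ops:
            for name in ins:
                if name in produced:
                    temp.add(name)       # another op's output, used internally
                elif name not in net_inputs:
                    net_inputs.append(name)
            produced.update(outs)
        net_outputs = list(produced)
        tmp_index = [i for i, name in enumerate(net_outputs) if name in temp]
        return net_inputs, net_outputs, tmp_index

    # Mirrors net_op_test.cc below: "y" is op1's output and op2's input,
    # so it lands in the temporary_index attribute.
    ins, outs, tmp = complete_add_op([(["x", "w1", "b1"], ["y"]),
                                      (["y", "w2", "b2"], ["z"])])
    assert set(ins) == {"x", "w1", "b1", "w2", "b2"}
    assert set(outs) == {"y", "z"} and outs[tmp[0]] == "y"
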
diff --git a/paddle/framework/net.cc b/paddle/framework/net.cc
index a0e8788846..7311cda9a9 100644
--- a/paddle/framework/net.cc
+++ b/paddle/framework/net.cc
@@ -19,18 +19,41 @@ namespace paddle {
 namespace framework {
 
-PlainNet::PlainNet(const NetDesc& def) {}
-
-void PlainNet::InferShape(const ScopePtr& scope) const {
+void PlainNet::CompleteAddOp() {
+  std::unordered_set<std::string> input_set;
+  std::unordered_set<std::string> output_set;
+  std::unordered_set<std::string> temp_output;
   for (auto& op : ops_) {
-    op.InferShape();
+    for (auto& ipt : op->inputs_) {
+      if (!Contains(output_set, ipt)) {  // Not other op's output
+        input_set.insert(ipt);
+      } else {
+        temp_output.insert(ipt);
+      }
+    }
+
+    for (auto& opt : op->outputs_) {
+      output_set.insert(opt);
+    }
   }
-}
-
-void PlainNet::Run(const ScopePtr& scope, const DeviceContext& ctx) const {
-  for (auto& op : ops_) {
-    op.Run(ctx);
+  inputs_.reserve(input_set.size());
+  std::copy(input_set.begin(), input_set.end(), std::back_inserter(inputs_));
+
+  outputs_.reserve(output_set.size());
+  std::vector<int> tmp_index;
+  tmp_index.reserve(temp_output.size());
+  int idx = 0;
+  for (auto& opt : output_set) {
+    if (Contains(temp_output, opt)) {
+      tmp_index.push_back(idx);
+    }
+    outputs_.push_back(opt);
+    ++idx;
   }
+
+  attrs_["temporary_index"] = tmp_index;
+  add_op_done_ = true;
 }
+
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/framework/net.h b/paddle/framework/net.h
index 0481d8f47c..19a1620e29 100644
--- a/paddle/framework/net.h
+++ b/paddle/framework/net.h
@@ -1,99 +1,51 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+   http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #pragma once
 
+#include
+#include
 #include "paddle/framework/net_proto.pb.h"
 #include "paddle/framework/op_proto.pb.h"
+#include "paddle/framework/op_registry.h"
 #include "paddle/framework/scope.h"
 #include "paddle/platform/device_context.h"
 
 namespace paddle {
 namespace framework {
-using namespace paddle::platform;
-
-// operator's index stored in a network.
-typedef int OpIndex;
-/**
- * NOTE following codes are some definitions of unimplemented concepts.
- * We write some basic implementation to make Net compilable. These APIs will
- * keep updating if the concepts related are implemented.
- */
-
-struct OpDesc;
-struct OpAttrs {};
-
-class Operator {
- public:
-  Operator(const OpDesc &def) {}
-  void InferShape() const {}
-  void Run(const DeviceContext &ctx) const {}
-};
-
 /**
- * @brief Network that manage the operators it has.
+ * @brief Network is also a type of Operator
+ *
+ * It will manage the operators it has.
* - * Network is the container and controller of a set of operators, user can build - * a real network from a NetDesc which is a protobuf message and use - * Network.Run() * to run all the operators in the network. + * Network is the container and controller of a set of operators. * A network object knows all Operators belonging to this network. Variables, * which are inputs and outputs of these operators, are created and managed by a * hierarchy of Scope objects. * - * This is the base class of network, all the networks should implement the apis + * This is the base class of network, all the networks should implement the APIs * it defines. */ -class Net { +class Net : public OperatorBase { public: - /** - * @brief Infer shapes of all inputs and outputs of operators. - */ - virtual void InferShape(const ScopePtr &scope) const = 0; - /** - * @brief Run the network. - * - * Run all the operators and return success(true) or not, with all the - * variables are located in `scope`. `context` describes the detail execution - * environment for ops. `begin` and `end` specify the scope of `ops_` to run, - * If no positive indexes are provided, all operators in `ops_` will run. - */ - virtual void Run(const ScopePtr &scope, const DeviceContext &ctx) const = 0; - - /** - * @brief Add an Operator according to `def`. - */ - virtual OpIndex AddOp(const OpProto &def) = 0; - - /** - * @brief Add optimizer operators acctording to `attrs`. - */ - virtual void AddOptimizerOps(const OpAttrs &attrs) = 0; - - /** - * @brief Add backward operators. - */ - virtual void AddBackwardOps() = 0; - - /** - * @brief Create a network. - */ - static std::unique_ptr Create(const NetDesc &def = NetDesc()); - - virtual ~Net() {} + virtual void AddOp(const OperatorPtr& op) = 0; + virtual void CompleteAddOp() = 0; }; +using NetPtr = std::shared_ptr; + /** * @brief a basic implementation of Net. * @@ -103,18 +55,14 @@ class Net { class PlainNet : public Net { public: /** - * @brief Initialize a PlainNet. - * - * Initialize from a network describe by `def`. NetDesc is the definition of - * a network. - */ - PlainNet(const NetDesc &def); - - /** - * Infer all the operators' input and output varialbes' shapes, will be called + * Infer all the operators' input and output variables' shapes, will be called * before every mini-batch */ - virtual void InferShape(const ScopePtr &scope) const override; + void InferShape(const ScopePtr& scope) const override { + for (auto& op : ops_) { + op->InferShape(scope); + } + } /** * @brief Run the network. @@ -123,49 +71,32 @@ class PlainNet : public Net { * scope will be used instead. If no OpContext is provicded, default context * will be used. */ - virtual void Run(const ScopePtr &scope, - const DeviceContext &ctx) const override; + void Run(const ScopePtr& scope, + const platform::DeviceContext& dev_ctx) const override { + for (auto& op : ops_) { + op->Run(scope, dev_ctx); + } + } /** - * @brief Add an operator to this network. + * @brief Add an operator by ptr */ - virtual OpIndex AddOp(const OpProto &def) override; + void AddOp(const OperatorPtr& op) override { + PADDLE_ENFORCE(!add_op_done_, "Cannot AddOp when this network is sealed"); + ops_.push_back(op); + } - /** - * @brief Add all optimizer operators related into the network. - */ - virtual void AddOptimizerOps(const OpAttrs &attrs) override; + void CompleteAddOp() override; - /** - * @brief Add all backward operators related into the network. 
- */ - virtual void AddBackwardOps() override; - - virtual ~PlainNet() override {} - - protected: - /** - * @brief Build the network. - * - * Create operators accordding to `def`, will be called by the constructor. - */ - void BuildNet(const NetDesc &def); - - /** - * @brief Add an operator into this network. - * - * Add a operator which is identified as `type` and has attributes described - * in `attrs`, the `inputs` are the keys of readonly input variables, - * `outputs` are keys of mutable output variables. An `OpIndex` will be - * returned to indicate the offset of the new operator in `ops_`. - */ - OpIndex AddOp(const std::string &type, const std::vector &inputs, - const std::vector &outputs, - const OpAttrs &attrs = OpAttrs()); + std::vector ops_; private: - // the operators owned by `Network`. - std::vector ops_; + bool add_op_done_{false}; + + template + static bool Contains(T container, KeyType key) { + return container.find(key) != container.end(); + } }; } // namespace framework diff --git a/paddle/framework/net_op_test.cc b/paddle/framework/net_op_test.cc new file mode 100644 index 0000000000..f5e1c22400 --- /dev/null +++ b/paddle/framework/net_op_test.cc @@ -0,0 +1,67 @@ +#include +#include +#include +#include + +namespace pd = paddle::framework; + +static int infer_shape_cnt = 0; +static int run_cnt = 0; + +class TestOp : public pd::OperatorBase { + public: + void InferShape(const paddle::framework::ScopePtr& scope) const override { + ++infer_shape_cnt; + } + void Run(const paddle::framework::ScopePtr& scope, + const paddle::platform::DeviceContext& dev_ctx) const override { + ++run_cnt; + } +}; + +template +void AssertSameVectorWithoutOrder(const std::vector& expected, + const std::vector& actual) { + ASSERT_EQ(expected.size(), actual.size()); + std::unordered_set expected_set; + for (auto& tmp : expected) { + expected_set.insert(tmp); + } + for (auto& act : actual) { + ASSERT_NE(expected_set.end(), expected_set.find(act)); + } +} + +TEST(OpKernel, all) { + auto net = std::make_shared(); + ASSERT_NE(net, nullptr); + + auto op1 = std::make_shared(); + op1->inputs_ = {"x", "w1", "b1"}; + op1->outputs_ = {"y"}; + net->AddOp(op1); + + auto op2 = std::make_shared(); + op2->inputs_ = {"y", "w2", "b2"}; + op2->outputs_ = {"z"}; + net->AddOp(op2); + + net->CompleteAddOp(); + AssertSameVectorWithoutOrder({"x", "w1", "b1", "w2", "b2"}, net->inputs_); + AssertSameVectorWithoutOrder({"y", "z"}, net->outputs_); + auto tmp_idx_iter = net->attrs_.find("temporary_index"); + ASSERT_NE(net->attrs_.end(), tmp_idx_iter); + auto& tmp_idx = boost::get>(tmp_idx_iter->second); + ASSERT_EQ(1UL, tmp_idx.size()); + ASSERT_EQ("y", net->outputs_[tmp_idx[0]]); + + auto scope = std::make_shared(); + paddle::platform::CPUDeviceContext dev_ctx; + + net->InferShape(scope); + net->Run(scope, dev_ctx); + ASSERT_EQ(2, infer_shape_cnt); + ASSERT_EQ(2, run_cnt); + + ASSERT_THROW(net->AddOp(op2), paddle::framework::EnforceNotMet); +} diff --git a/paddle/framework/op_registry.h b/paddle/framework/op_registry.h index 19cb4c7b3e..24f56b2812 100644 --- a/paddle/framework/op_registry.h +++ b/paddle/framework/op_registry.h @@ -201,7 +201,7 @@ class OpRegistry { static OperatorPtr CreateOp(const OpDesc& op_desc) { std::string op_type = op_desc.type(); OperatorPtr op(creators().at(op_type)()); - op->desc_ = op_desc; + op->type_ = op_desc.type(); op->inputs_.reserve((size_t)op_desc.inputs_size()); std::copy(op_desc.inputs().begin(), op_desc.inputs().end(), std::back_inserter(op->inputs_)); diff --git 
a/paddle/framework/operator.cc b/paddle/framework/operator.cc index d065670829..7756162a87 100644 --- a/paddle/framework/operator.cc +++ b/paddle/framework/operator.cc @@ -20,7 +20,7 @@ namespace framework { std::string OperatorBase::DebugString() const { std::stringstream ss; ss << "=================\n"; - ss << "type = " << desc_.type() << "\n"; + ss << "type = " << type_ << "\n"; ss << "inputs = ["; for (auto& ipt : inputs_) { ss << ipt << ", "; diff --git a/paddle/framework/operator.h b/paddle/framework/operator.h index cf79f379fa..f7ed6e9f3d 100644 --- a/paddle/framework/operator.h +++ b/paddle/framework/operator.h @@ -62,11 +62,8 @@ class OperatorBase { virtual void Run(const ScopePtr& scope, const platform::DeviceContext& dev_ctx) const = 0; - protected: - std::string Type() const { return desc_.type(); } - public: - OpDesc desc_; + std::string type_; std::vector inputs_; std::vector outputs_; AttributeMap attrs_; @@ -142,7 +139,7 @@ class OperatorWithKernel : public OperatorBase { void Run(const ScopePtr& scope, const platform::DeviceContext& dev_ctx) const final { - auto& opKernel = AllOpKernels().at(Type()).at(OpKernelKey(dev_ctx)); + auto& opKernel = AllOpKernels().at(type_).at(OpKernelKey(dev_ctx)); opKernel->Compute(OpKernel::KernelContext(this, scope, dev_ctx)); } diff --git a/paddle/framework/operator_test.cc b/paddle/framework/operator_test.cc index d0c3153fae..19ac4ecafa 100644 --- a/paddle/framework/operator_test.cc +++ b/paddle/framework/operator_test.cc @@ -19,14 +19,18 @@ limitations under the License. */ namespace paddle { namespace framework { -class OperatorTest : public OperatorBase { +static int op_run_num = 0; + +class OpWithoutKernelTest : public OperatorBase { public: void Init() override { x = 1; } void InferShape(const ScopePtr& scope) const override {} void Run(const ScopePtr& scope, const platform::DeviceContext& dev_ctx) const override { - float scale = GetAttr("scale"); - ASSERT_NEAR(scale, 3.14, 1e-5); + op_run_num++; + ASSERT_EQ((int)inputs_.size(), 1); + ASSERT_EQ((int)outputs_.size(), 1); + ASSERT_NEAR(GetAttr("scale"), 3.14, 1e-5); ASSERT_EQ(scope->GetVariable(inputs_[0]), nullptr); ASSERT_EQ(x, 1); ASSERT_NE(scope->GetVariable(outputs_[0]), nullptr); @@ -36,15 +40,14 @@ class OperatorTest : public OperatorBase { float x = 0; }; -class OperatorTestProtoAndCheckerMaker : public OpProtoAndCheckerMaker { +class OpeWithoutKernelTestProtoAndCheckerMaker : public OpProtoAndCheckerMaker { public: - OperatorTestProtoAndCheckerMaker(OpProto* proto, OpAttrChecker* op_checker) + OpeWithoutKernelTestProtoAndCheckerMaker(OpProto* proto, + OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("input", "input of test op"); AddOutput("output", "output of test op"); - AddAttr("scale", "scale of cosine op") - .SetDefault(1.0) - .LargerThan(0.0); + AddAttr("scale", "scale of cosine op"); AddComment("This is test op"); } }; @@ -52,8 +55,8 @@ class OperatorTestProtoAndCheckerMaker : public OpProtoAndCheckerMaker { } // namespace framework } // namespace paddle -REGISTER_OP(test_operator, paddle::framework::OperatorTest, - paddle::framework::OperatorTestProtoAndCheckerMaker); +REGISTER_OP(test_operator, paddle::framework::OpWithoutKernelTest, + paddle::framework::OpeWithoutKernelTestProtoAndCheckerMaker); TEST(OperatorBase, all) { paddle::framework::OpDesc op_desc; @@ -63,18 +66,17 @@ TEST(OperatorBase, all) { auto attr = op_desc.mutable_attrs()->Add(); attr->set_name("scale"); attr->set_type(paddle::framework::AttrType::FLOAT); - float scale = 
3.14; - attr->set_f(scale); + attr->set_f(3.14); paddle::platform::CPUDeviceContext device_context; auto scope = std::make_shared(); paddle::framework::OperatorPtr op = paddle::framework::OpRegistry::CreateOp(op_desc); - ASSERT_EQ(op->GetAttr("scale"), scale); scope->CreateVariable("OUT1"); + ASSERT_EQ(paddle::framework::op_run_num, 0); op->Run(scope, device_context); - std::cout << op->DebugString() << std::endl; + ASSERT_EQ(paddle::framework::op_run_num, 1); } namespace paddle { @@ -86,13 +88,13 @@ class OpKernelTestProtoAndCheckerMaker : public OpProtoAndCheckerMaker { : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("input", "input of test op"); AddOutput("output", "output of test op"); - AddAttr("scale", "scale of cosine op") - .SetDefault(1.0) - .LargerThan(0.0); + AddAttr("scale", "scale of cosine op"); AddComment("This is test op"); } }; +static int cpu_kernel_run_num = 0; + class OpWithKernelTest : public OperatorWithKernel { protected: void InferShape(const std::vector& inputs, @@ -102,10 +104,10 @@ class OpWithKernelTest : public OperatorWithKernel { class CPUKernelTest : public OpKernel { public: void Compute(const KernelContext& context) const { - float scale = context.op_.GetAttr("scale"); - ASSERT_NEAR(scale, 3.14, 1e-5); - std::cout << "this is cpu kernel" << std::endl; - std::cout << context.op_.DebugString() << std::endl; + cpu_kernel_run_num++; + ASSERT_EQ((int)context.op_.inputs_.size(), 1); + ASSERT_EQ((int)context.op_.outputs_.size(), 1); + ASSERT_NEAR(context.op_.GetAttr("scale"), 3.14, 1e-5); } }; @@ -131,5 +133,7 @@ TEST(OpKernel, all) { paddle::framework::OperatorPtr op = paddle::framework::OpRegistry::CreateOp(op_desc); + ASSERT_EQ(paddle::framework::cpu_kernel_run_num, 0); op->Run(scope, cpu_device_context); + ASSERT_EQ(paddle::framework::cpu_kernel_run_num, 1); } From 12fe514dd35d2ea16caecde559c6b192debe378f Mon Sep 17 00:00:00 2001 From: dongzhihong Date: Sun, 16 Jul 2017 19:20:28 +0800 Subject: [PATCH 164/205] "fix unrar in docker" --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index ed5910d93b..8cfb16928c 100644 --- a/Dockerfile +++ b/Dockerfile @@ -25,7 +25,7 @@ COPY ./paddle/scripts/docker/root/ /root/ RUN apt-get update && \ apt-get install -y \ git python-pip python-dev openssh-server bison \ - wget unzip tar xz-utils bzip2 gzip coreutils ntp \ + wget unzip unrar tar xz-utils bzip2 gzip coreutils ntp \ curl sed grep graphviz libjpeg-dev zlib1g-dev \ python-numpy python-matplotlib gcc g++ \ automake locales clang-format-3.8 swig doxygen cmake \ From 56fbed9a42978518af2fd5a00e4271bc5629fdf2 Mon Sep 17 00:00:00 2001 From: dongzhihong Date: Mon, 17 Jul 2017 00:42:45 +0800 Subject: [PATCH 165/205] "fix dense vector shape in mq2007" --- python/paddle/v2/dataset/mq2007.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/paddle/v2/dataset/mq2007.py b/python/paddle/v2/dataset/mq2007.py index fd71b34166..cffb319ad8 100644 --- a/python/paddle/v2/dataset/mq2007.py +++ b/python/paddle/v2/dataset/mq2007.py @@ -212,19 +212,19 @@ def gen_pair(querylist, partial_order="full"): for j in range(i + 1, len(querylist)): query_right = querylist[j] if query_left.relevance_score > query_right.relevance_score: - labels.append(1) + labels.append([1]) docpairs.append([ np.array(query_left.feature_vector), np.array(query_right.feature_vector) ]) elif query_left.relevance_score < query_right.relevance_score: - labels.append(1) + labels.append([1]) docpairs.append([ 
np.array(query_right.feature_vector), np.array(query_left.feature_vector) ]) for label, pair in zip(labels, docpairs): - yield label, pair[0], pair[1] + yield np.array(label), pair[0], pair[1] def gen_list(querylist): From a8c427e0d44d543f85ea680dea95480184bc1411 Mon Sep 17 00:00:00 2001 From: jc Date: Sun, 16 Jul 2017 12:52:11 -0700 Subject: [PATCH 166/205] Change to meet cpplint style --- paddle/function/RowConvOpGpu.cu | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/paddle/function/RowConvOpGpu.cu b/paddle/function/RowConvOpGpu.cu index c0b947e224..d9dcc7d59d 100644 --- a/paddle/function/RowConvOpGpu.cu +++ b/paddle/function/RowConvOpGpu.cu @@ -32,7 +32,7 @@ __global__ void KeRowConv(real* y, const real* x, const real* w, for (int i = tidy; i < context; i += blky) { sw[i][tidx] = gidx + tidx < width ? w[i*width + gidx + tidx] : 0.0; } - + __syncthreads(); for (int i = 0; i < numSeq; ++i) { @@ -144,12 +144,15 @@ __global__ void KeRowConvBwWeight(real* dw, const real* x, const real* dy, int yoff = start + j; // transpose - sh_x[tidx][tidy] = (xoff < width && yoff < end) ? x[yoff * width + xoff] : 0.0; - sh_dy[tidx][tidy + context - 1] = (xoff < width && yoff < end) ? dy[yoff * width + xoff] : 0.0; + sh_x[tidx][tidy] = (xoff < width && yoff < end) ? + x[yoff * width + xoff] : 0.0; + sh_dy[tidx][tidy + context - 1] = (xoff < width && yoff < end) ? + dy[yoff * width + xoff] : 0.0; __syncthreads(); if (tidy < (context - 1)) { yoff = yoff - context + 1; - sh_dy[tidx][tidy] = (xoff < width && yoff >= start) ? dy[yoff * width + xoff] : 0.0; + sh_dy[tidx][tidy] = (xoff < width && yoff >= start) ? + dy[yoff * width + xoff] : 0.0; } __syncthreads(); @@ -199,11 +202,13 @@ __global__ void KeRowConvBwWeight2(real* dw, const real* x, const real* dy, int yoff = start + j; // transpose - sh_x[tidx][tidy] = (xoff < width && yoff < end) ? x[yoff * width + xoff] : 0.0; + sh_x[tidx][tidy] = (xoff < width && yoff < end) ? + x[yoff * width + xoff] : 0.0; __syncthreads(); for (int t = 0; t < context; t++) { - sh_dy[tidx][tidy] = (xoff < width && (yoff - t) >= start && yoff - t < end) ? dy[(yoff - t) * width + xoff] : 0.0; + sh_dy[tidx][tidy] = (xoff < width && (yoff - t) >= start && + yoff - t < end) ? dy[(yoff - t) * width + xoff] : 0.0; __syncthreads(); real val = sh_x[tidy][tidx] * sh_dy[tidy][tidx]; @@ -239,7 +244,7 @@ __global__ void KeRowConvBwData(real* dx, const real* w, const real* dy, for (int i = tidy; i < context; i += blky) { sw[i][tidx] = gidx + tidx < width ? w[i*width + gidx + tidx] : 0.0; } - + __syncthreads(); for (int i = 0; i < numSeq; ++i) { @@ -312,7 +317,7 @@ void RowConvGrad(const GpuMatrix& outG, dim3 dimBlock(32, 32); dim3 dimGrid(DIVUP(width, dimBlock.x), 1); real* dw = filterG.getData(); - if (contextLength <= 32) { + if (contextLength <= 32) { KeRowConvBwWeight<32, 32, 32> <<>> (dw, x, dy, starts, height, width, numSeq, contextLength); From 02e56304c1f6ce07f374f285f666d6e46dd777ac Mon Sep 17 00:00:00 2001 From: jc Date: Sun, 16 Jul 2017 15:16:56 -0700 Subject: [PATCH 167/205] fix calling swig_api before import --- python/paddle/v2/optimizer.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/paddle/v2/optimizer.py b/python/paddle/v2/optimizer.py index b6ee51cfe8..a399799406 100644 --- a/python/paddle/v2/optimizer.py +++ b/python/paddle/v2/optimizer.py @@ -35,6 +35,7 @@ class Optimizer(object): For each optimizer(SGD, Adam), GradientMachine should enable different buffers. 
""" + import py_paddle.swig_paddle as swig_api tmp = swig_api.ParameterOptimizer.create(self.__opt_conf__) assert isinstance(tmp, swig_api.ParameterOptimizer) return tmp.getParameterTypes() From 278f1a8499be2e45641c4a4f515a7bafcfc71e5c Mon Sep 17 00:00:00 2001 From: jc Date: Sun, 16 Jul 2017 15:50:50 -0700 Subject: [PATCH 168/205] fix calling swig_api before import --- python/paddle/v2/optimizer.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/python/paddle/v2/optimizer.py b/python/paddle/v2/optimizer.py index a399799406..39b99a8b06 100644 --- a/python/paddle/v2/optimizer.py +++ b/python/paddle/v2/optimizer.py @@ -1,3 +1,4 @@ +import py_paddle.swig_paddle as swig_api import paddle.trainer_config_helpers.config_parser_utils as config_parser_utils import paddle.trainer_config_helpers.optimizers as v1_optimizers """ @@ -16,7 +17,6 @@ __all__ = [ class Optimizer(object): def __init__(self, **kwargs): - import py_paddle.swig_paddle as swig_api if 'batch_size' in kwargs: del kwargs['batch_size'] # not important for python library. @@ -35,7 +35,6 @@ class Optimizer(object): For each optimizer(SGD, Adam), GradientMachine should enable different buffers. """ - import py_paddle.swig_paddle as swig_api tmp = swig_api.ParameterOptimizer.create(self.__opt_conf__) assert isinstance(tmp, swig_api.ParameterOptimizer) return tmp.getParameterTypes() From d649dbf442bd7ba4ce63a2a4e479a27c8d40ca8d Mon Sep 17 00:00:00 2001 From: qijun Date: Mon, 17 Jul 2017 09:40:06 +0800 Subject: [PATCH 169/205] implement add_op kernel --- paddle/framework/operator.cc | 8 +++-- paddle/framework/operator.h | 59 +++++++++++++++---------------- paddle/framework/tensor.h | 6 ++-- paddle/operators/add_op.cc | 6 ++-- paddle/operators/add_op.cu | 5 ++- paddle/operators/add_op.h | 13 ++++--- paddle/platform/device_context.cc | 9 ++--- paddle/platform/device_context.h | 13 +++---- 8 files changed, 58 insertions(+), 61 deletions(-) diff --git a/paddle/framework/operator.cc b/paddle/framework/operator.cc index 25d120c9a9..3c6376c150 100644 --- a/paddle/framework/operator.cc +++ b/paddle/framework/operator.cc @@ -18,13 +18,15 @@ namespace paddle { namespace framework { template <> -DeviceType* KernelContext::get_eigen_device() { - return device_context_.get_eigen_device(); +Eigen::DefaultDevice* OpKernel::KernelContext::get_eigen_device< + platform::CPUPlace, Eigen::DefaultDevice>() const { + return device_context_.get_eigen_device(); } #ifndef PADDLE_ONLY_CPU template <> -DeviceType* KernelContext::get_eigen_device() { +DeviceType* OpKernel::KernelContext::get_eigen_device() + const { return device_context_.get_eigen_device(); } #endif diff --git a/paddle/framework/operator.h b/paddle/framework/operator.h index 48cfeeb731..558d4a0b67 100644 --- a/paddle/framework/operator.h +++ b/paddle/framework/operator.h @@ -33,13 +33,13 @@ template struct EigenDeviceConverter; template <> -struct EigenDeviceConverter { +struct EigenDeviceConverter { using EigenDeviceType = Eigen::DefaultDevice; }; #ifndef PADDLE_ONLY_CPU template <> -struct EigenDeviceConverter { +struct EigenDeviceConverter { using EigenDeviceType = Eigen::GpuDevice; }; #endif @@ -87,39 +87,38 @@ class OperatorBase { AttributeMap attrs_; }; -/** - * KernelContext is the only parameter of Kernel Run function. - * Run will get input/output variables, state such as momentum and - * device resource such as CUDA stream, cublas handle, etc. from - * KernelContext. User should construct it before run the Operator. 
- */ -class KernelContext { +class OpKernel { public: - KernelContext(const OperatorBase* op, const std::shared_ptr& scope, - const platform::DeviceContext& device_context) - : op_(*op), scope_(scope), device_context_(device_context) {} - - const Variable* Input(int index) const { - return scope_->GetVariable(op_.inputs_[index]); - } - - Variable* Output(int index) const { - return scope_->GetVariable(op_.outputs_[index]); - } + /** + * KernelContext is the only parameter of Kernel Run function. + * Run will get input/output variables, state such as momentum and + * device resource such as CUDA stream, cublas handle, etc. from + * KernelContext. User should construct it before run the Operator. + */ + class KernelContext { + public: + KernelContext(const OperatorBase* op, const std::shared_ptr& scope, + const platform::DeviceContext& device_context) + : op_(*op), scope_(scope), device_context_(device_context) {} + + const Variable* Input(int index) const { + return scope_->GetVariable(op_.inputs_[index]); + } - platform::DeviceContext& device_context() const { return device_context_; } + Variable* Output(int index) const { + return scope_->GetVariable(op_.outputs_[index]); + } - template ::EigenDeviceType> - DeviceType* get_eigen_device(); + template ::EigenDeviceType> + DeviceType* get_eigen_device() const; - const OperatorBase& op_; - const std::shared_ptr& scope_; - const platform::DeviceContext& device_context_; -}; + const OperatorBase& op_; + const std::shared_ptr& scope_; + const platform::DeviceContext& device_context_; + }; -class OpKernel { - public: virtual void Compute(const KernelContext& context) const = 0; virtual ~OpKernel() {} diff --git a/paddle/framework/tensor.h b/paddle/framework/tensor.h index 01244f617c..784d52cc42 100644 --- a/paddle/framework/tensor.h +++ b/paddle/framework/tensor.h @@ -35,7 +35,7 @@ class Tensor { template - const T* data() const { + T* data() const { PADDLE_ENFORCE( holder_ != nullptr, "Tenosr has not been initialized. 
Call Tensor::mutable_data first."); @@ -90,7 +90,7 @@ class Tensor { // flat to rank = 1 template typename TTypes::Flat flat() { - return shaped({NumElements()}); + return shaped(make_ddim({static_cast(NumElements())})); } // to TensorType Vec @@ -114,7 +114,7 @@ class Tensor { template typename TTypes::ConstFlat flat() const { - return shaped({NumElements()}); + return shaped(make_ddim({static_cast(NumElements())})); } template diff --git a/paddle/operators/add_op.cc b/paddle/operators/add_op.cc index ef39e426fd..7dc6414af2 100644 --- a/paddle/operators/add_op.cc +++ b/paddle/operators/add_op.cc @@ -40,6 +40,6 @@ The equation is: Out = X + Y } // namespace paddle REGISTER_OP(add_two, paddle::operators::AddOp, paddle::operators::AddOpMaker); -REGISTER_OP_CPU_KERNEL( - add_two, - ::paddle::operators::AddKernel<::paddle::platform::CPUPlace, float>); \ No newline at end of file +typedef paddle::operators::AddKernel<::paddle::platform::CPUPlace, float> + AddKernel_CPU_float; +REGISTER_OP_CPU_KERNEL(add_two, AddKernel_CPU_float); \ No newline at end of file diff --git a/paddle/operators/add_op.cu b/paddle/operators/add_op.cu index f4a4fb16a6..0edf142ee4 100644 --- a/paddle/operators/add_op.cu +++ b/paddle/operators/add_op.cu @@ -1,7 +1,6 @@ -#define EIGEN_USE_GPU - #include "paddle/operators/add_op.h" #include "paddle/framework/op_registry.h" +typedef paddle::operators::AddKernel<::paddle::platform::GPUPlace, float> AddKernel_GPU_float; REGISTER_OP_GPU_KERNEL(add_two, - paddle::operators::AddKernel); \ No newline at end of file + AddKernel_GPU_float); \ No newline at end of file diff --git a/paddle/operators/add_op.h b/paddle/operators/add_op.h index 27a477a3ac..568cb19742 100644 --- a/paddle/operators/add_op.h +++ b/paddle/operators/add_op.h @@ -6,19 +6,18 @@ namespace paddle { namespace operators { -// Place can be CPUPlace or GPUPlace -template +template class AddKernel : public framework::OpKernel { public: void Compute(const KernelContext& context) const override { - auto* input0 = context.Input(0); - auto* input1 = context.Input(1); + auto input0 = context.Input(0)->Get(); + auto input1 = context.Input(1)->Get(); + auto* output = context.Output(0)->GetMutable(); - auto* output = context.Output(0); - output->mutable_data(Place()); + output->mutable_data(Place()); output->flat().device(*(context.get_eigen_device())) = - input0->flat() + input1->flat(); + input0.flat() + input1.flat(); } }; diff --git a/paddle/platform/device_context.cc b/paddle/platform/device_context.cc index 960ef0a595..9c1d94e9e7 100644 --- a/paddle/platform/device_context.cc +++ b/paddle/platform/device_context.cc @@ -15,14 +15,15 @@ namespace paddle { namespace platform { template <> -Eigen::DefaultDevice* DeviceContext::get_eigen_device() { - return reinterpret_cast(this)->eigen_device(); +Eigen::DefaultDevice* DeviceContext::get_eigen_device() + const { + return reinterpret_cast(this)->eigen_device(); } #ifndef PADDLE_ONLY_CPU template <> -Eigen::GpuDevice* DeviceContext::get_eigen_device() { - return reinterpret_cast(this)->eigen_device(); +Eigen::GpuDevice* DeviceContext::get_eigen_device() const { + return reinterpret_cast(this)->eigen_device(); } #endif diff --git a/paddle/platform/device_context.h b/paddle/platform/device_context.h index 7de07d06be..2ec7b05599 100644 --- a/paddle/platform/device_context.h +++ b/paddle/platform/device_context.h @@ -32,17 +32,14 @@ class DeviceContext { virtual Place GetPlace() const = 0; template - DeviceType* get_eigen_device(); + DeviceType* get_eigen_device() const; }; class 
CPUDeviceContext : public DeviceContext { public: - Eigen::DefaultDevice* eigen_device() { - if (!eigen_device_) { - eigen_device_.reset(new Eigen::DefaultDevice()); - } - return eigen_device_.get(); - } + CPUDeviceContext() { eigen_device_.reset(new Eigen::DefaultDevice()); } + + Eigen::DefaultDevice* eigen_device() const { return eigen_device_.get(); } Place GetPlace() const override { Place retv = CPUPlace(); @@ -91,7 +88,7 @@ class CUDADeviceContext : public DeviceContext { cudaStream_t stream() { return stream_; } - Eigen::GpuDevice* eigen_device() { return eigen_device_.get(); } + Eigen::GpuDevice* eigen_device() const { return eigen_device_.get(); } cublasHandle_t cublas_handle() { if (!blas_handle_) { From 23b8346072f4bc88fd88cfac82933de501f9f739 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=AD=A6=E6=AF=85?= Date: Mon, 17 Jul 2017 10:15:42 +0800 Subject: [PATCH 170/205] Fault tolerant distributed training, just work version, with etcd (#2849) * using etcd as fault tolerant training * update * workable version, ft not tested * small fix * update * remove TODO --- go/cmd/pserver/pserver.go | 2 +- go/master/client.go | 5 ++-- go/master/service.go | 1 + go/pserver/client/c/test/test_train.py | 28 ++++++++++++++++---- go/pserver/client/etcd_client.go | 5 ++-- go/pserver/etcd_client.go | 11 ++++---- paddle/api/PaddleAPI.h | 3 ++- paddle/api/ParameterUpdater.cpp | 5 ++-- paddle/scripts/docker/build.sh | 3 ++- paddle/trainer/NewRemoteParameterUpdater.cpp | 20 ++++++++++++-- paddle/trainer/NewRemoteParameterUpdater.h | 5 ++++ python/paddle/v2/dataset/common.py | 6 ++--- python/paddle/v2/master/client.py | 5 ++-- python/paddle/v2/optimizer.py | 8 +++--- python/paddle/v2/trainer.py | 6 +++-- 15 files changed, 81 insertions(+), 32 deletions(-) diff --git a/go/cmd/pserver/pserver.go b/go/cmd/pserver/pserver.go index b331b8126c..652d7ba315 100644 --- a/go/cmd/pserver/pserver.go +++ b/go/cmd/pserver/pserver.go @@ -40,7 +40,7 @@ func main() { idx = *index } else { e = pserver.NewEtcdClient(*etcdEndpoint, *numPservers, *etcdTimeout) - idx, err = e.Register() + idx, err = e.Register(*port) candy.Must(err) cp, err = pserver.NewCheckpointFromFile(*checkpointPath, idx, e) diff --git a/go/master/client.go b/go/master/client.go index a2ca3f3ef8..de883bf4b9 100644 --- a/go/master/client.go +++ b/go/master/client.go @@ -2,6 +2,7 @@ package master import ( "os" + "time" "github.com/PaddlePaddle/Paddle/go/connection" "github.com/PaddlePaddle/recordio" @@ -36,9 +37,9 @@ func (c *Client) getRecords() { for { t, err := c.getTask() if err != nil { - // TODO(helin): wait before move on with next // getTask call. 
-			log.Errorln(err)
+			log.Errorf("Get task failed, sleep 3 seconds and continue, %s", err)
+			time.Sleep(3 * time.Second)
 			continue
 		}

diff --git a/go/master/service.go b/go/master/service.go
index a6050ab994..9cef2270ce 100644
--- a/go/master/service.go
+++ b/go/master/service.go
@@ -215,6 +215,7 @@ func readChunks(globPaths []string) ([]Chunk, error) {
 		}
 
 		count := index.NumChunks()
+		log.Infof("readChunks: file %s has %d chunks", path, count)
 		for i := 0; i < count; i++ {
 			chunk := Chunk{
 				Path: path,
diff --git a/go/pserver/client/c/test/test_train.py b/go/pserver/client/c/test/test_train.py
index d6922672f4..e9264592b4 100644
--- a/go/pserver/client/c/test/test_train.py
+++ b/go/pserver/client/c/test/test_train.py
@@ -1,5 +1,23 @@
 import paddle.v2 as paddle
 import paddle.v2.dataset.uci_housing as uci_housing
+import paddle.v2.master as master
+import os
+import cPickle as pickle
+
+etcd_ip = os.getenv("MASTER_IP", "127.0.0.1")
+etcd_endpoint = "http://" + etcd_ip + ":2379"
+
+
+def cloud_reader():
+    print "connecting to master, etcd endpoints: ", etcd_endpoint
+    master_client = master.client(etcd_endpoint, 5, 64)
+    master_client.set_dataset(
+        ["/pfs/dlnel/public/dataset/uci_housing/uci_housing-*-of-*"])
+    while 1:
+        r, e = master_client.next_record()
+        if not r:
+            break
+        yield pickle.loads(r)
 
 
 def main():
@@ -22,13 +40,13 @@ def main():
     # create optimizer of new remote updater to pserver
     optimizer = paddle.optimizer.Momentum(momentum=0)
 
-    #TODO(zhihong) : replace optimizer with new OptimizerConfig
-
+    print "etcd endpoint: ", etcd_endpoint
    trainer = paddle.trainer.SGD(cost=cost,
                                  parameters=parameters,
                                  update_equation=optimizer,
                                  is_local=False,
-                                 pserver_spec="localhost:3000")
+                                 pserver_spec=etcd_endpoint,
+                                 use_etcd=True)
 
     # event_handler to print training and testing info
     def event_handler(event):
@@ -47,11 +65,11 @@ def main():
             print "Test %d, %.2f" % (event.pass_id, result.cost)
 
     # training
+    # NOTE: use uci_housing.train() as reader for non-paddlecloud training
     trainer.train(
         reader=paddle.batch(
             paddle.reader.shuffle(
-                uci_housing.train(), buf_size=500),
-            batch_size=2),
+                cloud_reader, buf_size=500), batch_size=2),
         feeding={'x': 0,
                  'y': 1},
         event_handler=event_handler,
diff --git a/go/pserver/client/etcd_client.go b/go/pserver/client/etcd_client.go
index 1fd3479aa8..8eb2a4f451 100644
--- a/go/pserver/client/etcd_client.go
+++ b/go/pserver/client/etcd_client.go
@@ -12,6 +12,7 @@ import (
 )
 
 const (
+	// DefaultEtcdTimeout is the default etcd timeout
 	DefaultEtcdTimeout time.Duration = 5 * time.Second
 )
 
@@ -66,12 +67,12 @@ func (p *EtcdClient) List() []Server {
 	for {
 		for i := 0; i < psDesired; i++ {
 			ctx, cancel := context.WithTimeout(context.Background(), p.timeout)
-			cancel()
 			psKey := pserver.PsPath + strconv.Itoa(i)
 			log.Debugf("checking %s", psKey)
 			resp, err := p.client.Get(ctx, psKey)
+			cancel()
 			if err != nil {
-				log.Infof("Get psKey= %s error, %v", psKey, err)
+				log.Infof("Get psKey=%s error, %v", psKey, err)
 				time.Sleep(p.timeout)
 				continue
 			}
diff --git a/go/pserver/etcd_client.go b/go/pserver/etcd_client.go
index 4a694b97f4..66af4fa0b4 100644
--- a/go/pserver/etcd_client.go
+++ b/go/pserver/etcd_client.go
@@ -49,7 +49,7 @@ func NewEtcdClient(endpoints string, numPservers int, timeout time.Duration) *Et
 // Register registers the pserver on etcd
 //
 // Register returns the index of the current pserver.
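// Editor's note, a hedged sketch: the change below threads the pserver's
// listen port into registration, so the etcd value becomes "ip:port" rather
// than a bare IP. The lease-plus-keepalive pattern this file relies on,
// assuming the etcd clientv3 API already imported here, is roughly:
//
//	resp, err := cli.Grant(ctx, ttlSec)                   // lease with a TTL
//	// the key disappears automatically if the lease expires
//	_, err = cli.Put(ctx, psKey, addr, clientv3.WithLease(resp.ID))
//	ch, err := cli.KeepAlive(context.TODO(), resp.ID)     // heartbeat the lease
//	go func() { for range ch { } }()                      // drain responses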
-func (e *EtcdClient) Register() (int, error) { +func (e *EtcdClient) Register(port int) (int, error) { var err error e.externalIP, err = networkhelper.GetExternalIP() @@ -116,7 +116,7 @@ func (e *EtcdClient) Register() (int, error) { for { ctx, cancel := context.WithTimeout(context.Background(), time.Second) var err error - pserverIdx, err = e.registerPserverEtcd(ctx) + pserverIdx, err = e.registerPserverEtcd(ctx, port) cancel() if err != nil { log.Warn(err) @@ -140,7 +140,7 @@ func (e *EtcdClient) initDesiredPservers(ctx context.Context, numPservers int) ( } // registerPserverEtcd registers pserver node on etcd using transaction. -func (e *EtcdClient) registerPserverEtcd(ctx context.Context) (int, error) { +func (e *EtcdClient) registerPserverEtcd(ctx context.Context, port int) (int, error) { var idx int _, err := concurrency.NewSTM(e.etcdClient, func(c concurrency.STM) error { registered := false @@ -156,8 +156,9 @@ func (e *EtcdClient) registerPserverEtcd(ctx context.Context) (int, error) { log.Fatal(err) } // find the first id and write info - c.Put(psKey, e.externalIP, clientv3.WithLease(resp.ID)) - log.Debugf("set pserver node %s with value %s", psKey, e.externalIP) + pserverAddr := e.externalIP + ":" + strconv.Itoa(port) + c.Put(psKey, pserverAddr, clientv3.WithLease(resp.ID)) + log.Debugf("set pserver node %s with value %s", psKey, pserverAddr) ch, kaerr := e.etcdClient.KeepAlive(context.TODO(), resp.ID) if kaerr != nil { log.Errorf("keepalive etcd node error: %v", kaerr) diff --git a/paddle/api/PaddleAPI.h b/paddle/api/PaddleAPI.h index 5fb3d1c73b..0b9b83d429 100644 --- a/paddle/api/PaddleAPI.h +++ b/paddle/api/PaddleAPI.h @@ -843,7 +843,8 @@ public: bool useSparseUpdater); static ParameterUpdater* createNewRemoteUpdater( OptimizationConfig* config, - const std::string pserverSpec) throw(UnsupportError); + const std::string pserverSpec, + const bool useEtcd) throw(UnsupportError); ~ParameterUpdater(); /** diff --git a/paddle/api/ParameterUpdater.cpp b/paddle/api/ParameterUpdater.cpp index 1aaefdfb81..5934cb898b 100644 --- a/paddle/api/ParameterUpdater.cpp +++ b/paddle/api/ParameterUpdater.cpp @@ -33,11 +33,12 @@ ParameterUpdater *ParameterUpdater::createLocalUpdater( ParameterUpdater *ParameterUpdater::createNewRemoteUpdater( OptimizationConfig *config, - const std::string pserverSpec) throw(UnsupportError) { + const std::string pserverSpec, + const bool useEtcd) throw(UnsupportError) { #ifndef PADDLE_WITHOUT_GOLANG auto updater = new ParameterUpdater(); updater->m->updater.reset(new paddle::NewRemoteParameterUpdater( - config->m->getConfig(), pserverSpec)); + config->m->getConfig(), pserverSpec, useEtcd)); return updater; #else throw UnsupportError(); diff --git a/paddle/scripts/docker/build.sh b/paddle/scripts/docker/build.sh index ab60f1a38d..3860facb09 100644 --- a/paddle/scripts/docker/build.sh +++ b/paddle/scripts/docker/build.sh @@ -155,7 +155,8 @@ RUN apt-get update &&\ paddle version ${DOCKERFILE_CUDNN_DSO} ${DOCKERFILE_GPU_ENV} - +ADD go/cmd/pserver/pserver /usr/bin/ +ADD go/cmd/master/master /usr/bin/ # default command shows the paddle version and exit CMD ["paddle", "version"] EOF diff --git a/paddle/trainer/NewRemoteParameterUpdater.cpp b/paddle/trainer/NewRemoteParameterUpdater.cpp index b359d9da21..a830ceba57 100644 --- a/paddle/trainer/NewRemoteParameterUpdater.cpp +++ b/paddle/trainer/NewRemoteParameterUpdater.cpp @@ -28,6 +28,17 @@ NewRemoteParameterUpdater::NewRemoteParameterUpdater( newGradients_(nullptr), pserverSpec_(pserverSpec) {} 
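// Editor's note, a hedged sketch (the call-site values are invented for
// illustration): the overload added below selects service discovery at
// construction time, e.g.
//
//   // etcd-based discovery; pserverSpec is an etcd endpoint:
//   new NewRemoteParameterUpdater(conf, "http://127.0.0.1:2379", true);
//   // static addressing; pserverSpec is "host1:port,host2:port":
//   new NewRemoteParameterUpdater(conf, "host1:3000,host2:3000", false);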
+NewRemoteParameterUpdater::NewRemoteParameterUpdater( + const OptimizationConfig &config, + const std::string pserverSpec, + const bool useEtcd) + : trainerConfig_(config), + parameterClient_(-1), + newParameters_(nullptr), + newGradients_(nullptr), + pserverSpec_(pserverSpec), + useEtcd_(useEtcd) {} + void NewRemoteParameterUpdater::init( const std::vector ¶meters) { ParameterUpdater::init(parameters); @@ -38,8 +49,13 @@ void NewRemoteParameterUpdater::init( } // create parameter server client. - parameterClient_ = paddle_new_pserver_client((char *)pserverSpec_.c_str(), - FLAGS_trainer_id == 0); + if (useEtcd_) { + parameterClient_ = paddle_new_etcd_pserver_client( + (char *)pserverSpec_.c_str(), FLAGS_trainer_id == 0); + } else { + parameterClient_ = paddle_new_pserver_client((char *)pserverSpec_.c_str(), + FLAGS_trainer_id == 0); + } // init new parameter and gradient. newParameters_ = initNewParameter(PARAMETER_VALUE); diff --git a/paddle/trainer/NewRemoteParameterUpdater.h b/paddle/trainer/NewRemoteParameterUpdater.h index dfed00bc21..6223ba427c 100644 --- a/paddle/trainer/NewRemoteParameterUpdater.h +++ b/paddle/trainer/NewRemoteParameterUpdater.h @@ -32,6 +32,9 @@ class NewRemoteParameterUpdater : public ParameterUpdater { public: NewRemoteParameterUpdater(const OptimizationConfig& config, const std::string pserverSpec); + NewRemoteParameterUpdater(const OptimizationConfig& config, + const std::string pserverSpec, + const bool useEtcd); ~NewRemoteParameterUpdater() { releaseNewParameter(newParameters_); releaseNewParameter(newGradients_); @@ -111,6 +114,8 @@ protected: paddle_parameter** newGradients_; /// the specification of parameter server "host1:port,host1:port" std::string pserverSpec_; + /// true if pserverSpec_ is etcd endpoint, else pserverSpec_ is pserver addr + bool useEtcd_; }; } // namespace paddle diff --git a/python/paddle/v2/dataset/common.py b/python/paddle/v2/dataset/common.py index 4a2eb59c34..a799022274 100644 --- a/python/paddle/v2/dataset/common.py +++ b/python/paddle/v2/dataset/common.py @@ -22,6 +22,8 @@ import importlib import paddle.v2.dataset import cPickle import glob +import cPickle as pickle +import random __all__ = [ 'DATA_HOME', 'download', 'md5file', 'split', 'cluster_files_reader', @@ -170,8 +172,6 @@ def convert(output_path, name_prefix, max_lines_to_shuffle=1000): import recordio - import cPickle as pickle - import random """ Convert data from reader to recordio format files. @@ -201,7 +201,7 @@ def convert(output_path, def write_data(w, lines): random.shuffle(lines) for i, d in enumerate(lines): - d = pickle.dumps(d, pickle.HIGHEST_PROTOCOL) + d = cPickle.dumps(d) w[i % num_shards].write(d) w = open_writers() diff --git a/python/paddle/v2/master/client.py b/python/paddle/v2/master/client.py index 70f9e43c96..4c041fb509 100644 --- a/python/paddle/v2/master/client.py +++ b/python/paddle/v2/master/client.py @@ -10,8 +10,9 @@ class client(object): client is a client to the master server. 
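
    A hedged usage sketch (the endpoint, timeout, buffer size, and dataset
    path below are illustrative values, not defaults):

        import paddle.v2.master as master
        c = master.client("http://127.0.0.1:2379", 5, 64)
        c.set_dataset(["/pfs/path/to/data-*-of-*"])
        r, e = c.next_record()  # a falsy r signals the end of the pass
        c.close()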
""" - def __init__(self, addr, buf_size): - self.c = lib.paddle_new_master_client(addr, buf_size) + def __init__(self, etcd_endpoints, timeout, buf_size): + self.c = lib.paddle_new_etcd_master_client(etcd_endpoints, timeout, + buf_size) def close(self): lib.paddle_release_master_client(self.c) diff --git a/python/paddle/v2/optimizer.py b/python/paddle/v2/optimizer.py index b6ee51cfe8..755b1e09d7 100644 --- a/python/paddle/v2/optimizer.py +++ b/python/paddle/v2/optimizer.py @@ -46,12 +46,12 @@ class Optimizer(object): return swig_api.ParameterUpdater.createRemoteUpdater( self.__opt_conf__, pass_num, use_sparse_updater) - def __create_new_remote_updater__(self, pserver_spec): + def __create_new_remote_updater__(self, pserver_spec, use_etcd): return swig_api.ParameterUpdater.createNewRemoteUpdater( - self.__opt_conf__, pserver_spec) + self.__opt_conf__, pserver_spec, use_etcd) def create_updater(self, is_local, num_passes, use_sparse_updater, - pserver_spec): + pserver_spec, use_etcd): """ create proper parameter_updater by configuration. :param is_local: create local or remote parameter updater @@ -77,7 +77,7 @@ class Optimizer(object): num_passes, use_sparse_updater) else: parameter_updater = self.__create_new_remote_updater__( - pserver_spec) + pserver_spec, use_etcd) return parameter_updater diff --git a/python/paddle/v2/trainer.py b/python/paddle/v2/trainer.py index 92fdf98e90..76bae0bb12 100644 --- a/python/paddle/v2/trainer.py +++ b/python/paddle/v2/trainer.py @@ -45,7 +45,8 @@ class SGD(object): update_equation, extra_layers=None, is_local=True, - pserver_spec=None): + pserver_spec=None, + use_etcd=True): if not isinstance(parameters, v2_parameters.Parameters): raise TypeError('parameters should be parameters') @@ -61,6 +62,7 @@ class SGD(object): self.__topology_in_proto__ = topology.proto() self.__is_local__ = is_local self.__pserver_spec__ = pserver_spec + self.__use_etcd__ = use_etcd self.__use_sparse_updater__ = self.__topology__.use_sparse_updater() # # In local mode, disable sparse_remote_update. 
@@ -127,7 +129,7 @@ class SGD(object): self.__parameter_updater__ = self.__optimizer__.create_updater( self.__is_local__, num_passes, self.__use_sparse_updater__, - self.__pserver_spec__) + self.__pserver_spec__, self.__use_etcd__) self.__parameter_updater__.init(self.__gradient_machine__) self.__gradient_machine__.start() From 65dbeb6a24a0362fb696e9f67b3effc1691d4d9e Mon Sep 17 00:00:00 2001 From: qijun Date: Mon, 17 Jul 2017 03:01:33 +0000 Subject: [PATCH 171/205] fix gpu build error --- paddle/framework/operator.cc | 6 +++--- paddle/function/RowConvOpGpu.cu | 21 +++++++++++++-------- 2 files changed, 16 insertions(+), 11 deletions(-) diff --git a/paddle/framework/operator.cc b/paddle/framework/operator.cc index aa859591f0..946bde5734 100644 --- a/paddle/framework/operator.cc +++ b/paddle/framework/operator.cc @@ -25,9 +25,9 @@ Eigen::DefaultDevice* OpKernel::KernelContext::get_eigen_device< #ifndef PADDLE_ONLY_CPU template <> -DeviceType* OpKernel::KernelContext::get_eigen_device() - const { - return device_context_.get_eigen_device(); +Eigen::GpuDevice* OpKernel::KernelContext::get_eigen_device< + platform::GPUPlace, Eigen::GpuDevice>() const { + return device_context_.get_eigen_device(); } #endif diff --git a/paddle/function/RowConvOpGpu.cu b/paddle/function/RowConvOpGpu.cu index c0b947e224..d9dcc7d59d 100644 --- a/paddle/function/RowConvOpGpu.cu +++ b/paddle/function/RowConvOpGpu.cu @@ -32,7 +32,7 @@ __global__ void KeRowConv(real* y, const real* x, const real* w, for (int i = tidy; i < context; i += blky) { sw[i][tidx] = gidx + tidx < width ? w[i*width + gidx + tidx] : 0.0; } - + __syncthreads(); for (int i = 0; i < numSeq; ++i) { @@ -144,12 +144,15 @@ __global__ void KeRowConvBwWeight(real* dw, const real* x, const real* dy, int yoff = start + j; // transpose - sh_x[tidx][tidy] = (xoff < width && yoff < end) ? x[yoff * width + xoff] : 0.0; - sh_dy[tidx][tidy + context - 1] = (xoff < width && yoff < end) ? dy[yoff * width + xoff] : 0.0; + sh_x[tidx][tidy] = (xoff < width && yoff < end) ? + x[yoff * width + xoff] : 0.0; + sh_dy[tidx][tidy + context - 1] = (xoff < width && yoff < end) ? + dy[yoff * width + xoff] : 0.0; __syncthreads(); if (tidy < (context - 1)) { yoff = yoff - context + 1; - sh_dy[tidx][tidy] = (xoff < width && yoff >= start) ? dy[yoff * width + xoff] : 0.0; + sh_dy[tidx][tidy] = (xoff < width && yoff >= start) ? + dy[yoff * width + xoff] : 0.0; } __syncthreads(); @@ -199,11 +202,13 @@ __global__ void KeRowConvBwWeight2(real* dw, const real* x, const real* dy, int yoff = start + j; // transpose - sh_x[tidx][tidy] = (xoff < width && yoff < end) ? x[yoff * width + xoff] : 0.0; + sh_x[tidx][tidy] = (xoff < width && yoff < end) ? + x[yoff * width + xoff] : 0.0; __syncthreads(); for (int t = 0; t < context; t++) { - sh_dy[tidx][tidy] = (xoff < width && (yoff - t) >= start && yoff - t < end) ? dy[(yoff - t) * width + xoff] : 0.0; + sh_dy[tidx][tidy] = (xoff < width && (yoff - t) >= start && + yoff - t < end) ? dy[(yoff - t) * width + xoff] : 0.0; __syncthreads(); real val = sh_x[tidy][tidx] * sh_dy[tidy][tidx]; @@ -239,7 +244,7 @@ __global__ void KeRowConvBwData(real* dx, const real* w, const real* dy, for (int i = tidy; i < context; i += blky) { sw[i][tidx] = gidx + tidx < width ? 
w[i*width + gidx + tidx] : 0.0;
   }
-  
+
   __syncthreads();
 
   for (int i = 0; i < numSeq; ++i) {
@@ -312,7 +317,7 @@ void RowConvGrad(const GpuMatrix& outG,
     dim3 dimBlock(32, 32);
     dim3 dimGrid(DIVUP(width, dimBlock.x), 1);
     real* dw = filterG.getData();
-    if (contextLength <= 32) { 
+    if (contextLength <= 32) {
       KeRowConvBwWeight<32, 32, 32>
         <<>>
         (dw, x, dy, starts, height, width, numSeq, contextLength);

From 83f263e6ec0b7b32d665a493ca184b9e6bf4bb19 Mon Sep 17 00:00:00 2001
From: Yancey 
Date: Mon, 17 Jul 2017 11:56:32 +0800
Subject: [PATCH 172/205] Fix fetch record from master failed (#2848)

Fix fetch record from master

---
 go/cmd/master/master.go            | 14 +++++++++++---
 python/paddle/v2/dataset/common.py |  6 ++++--
 2 files changed, 15 insertions(+), 5 deletions(-)

diff --git a/go/cmd/master/master.go b/go/cmd/master/master.go
index 54fa254863..9eaf8c04ae 100644
--- a/go/cmd/master/master.go
+++ b/go/cmd/master/master.go
@@ -11,6 +11,7 @@ import (
 
 	"github.com/namsral/flag"
 	log "github.com/sirupsen/logrus"
+	"github.com/topicai/candy"
 
 	"github.com/PaddlePaddle/Paddle/go/master"
 	"github.com/PaddlePaddle/Paddle/go/utils/networkhelper"
@@ -20,11 +21,18 @@ func main() {
 	port := flag.Int("port", 8080, "port of the master server.")
 	ttlSec := flag.Int("ttl", 60, "etcd lease TTL in seconds.")
 	endpoints := flag.String("endpoints", "http://127.0.0.1:2379", "comma separated etcd endpoints. If empty, fault tolerance will not be enabled.")
-	taskTimeoutDur := flag.Duration("task_timout_dur", 20*time.Minute, "task timeout duration.")
-	taskTimeoutMax := flag.Int("task_timeout_max", 3, "max timeout count for each task before it is declared a failed task.")
-	chunkPerTask := flag.Int("chunk_per_task", 10, "chunk per task.")
+	taskTimeoutDur := flag.Duration("task-timout-dur", 20*time.Minute, "task timeout duration.")
+	taskTimeoutMax := flag.Int("task-timeout-max", 3, "max timeout count for each task before it is declared a failed task.")
+	chunkPerTask := flag.Int("chunk-per-task", 10, "chunk per task.")
+	logLevel := flag.String("log-level", "info",
+		"log level, possible values: debug, info, warning, error, fatal, panic")
 	flag.Parse()
 
+	level, e := log.ParseLevel(*logLevel)
+	candy.Must(e)
+
+	log.SetLevel(level)
+
 	if *endpoints == "" {
 		log.Warningln("-endpoints not set, fault tolerance will not be enabled.")
 	}
diff --git a/python/paddle/v2/dataset/common.py b/python/paddle/v2/dataset/common.py
index a799022274..645f3cc0dc 100644
--- a/python/paddle/v2/dataset/common.py
+++ b/python/paddle/v2/dataset/common.py
@@ -201,8 +201,10 @@ def convert(output_path,
     def write_data(w, lines):
         random.shuffle(lines)
         for i, d in enumerate(lines):
-            d = cPickle.dumps(d)
-            w[i % num_shards].write(d)
+            # FIXME(Yancey1989):
+            # dumps with protocol: pickle.HIGHEST_PROTOCOL
+            o = pickle.dumps(d)
+            w[i % num_shards].write(o)
 
     w = open_writers()
     lines = []

From 778a1a9f8780405a32814d3f95accccb4304dc87 Mon Sep 17 00:00:00 2001
From: fengjiayi 
Date: Mon, 17 Jul 2017 12:03:46 +0800
Subject: [PATCH 173/205] add a comment for

---
 paddle/framework/ddim.h | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/paddle/framework/ddim.h b/paddle/framework/ddim.h
index 675f8680f6..df26d73d48 100644
--- a/paddle/framework/ddim.h
+++ b/paddle/framework/ddim.h
@@ -81,6 +81,13 @@ std::vector<int> vectorize(const DDim& ddim);
 
 ssize_t product(const DDim& ddim);
 
+/**
+ * \brief Slice a ddim
+ *
+ * Slice dim with [begin, end).
+ * e.g.
DDim d = make_ddim({1,2,3,4,5}); + * slice_ddim(d, 1, 3); ====> {2,3} + */ DDim slice_ddim(const DDim& dim, int begin, int end); /** From c5bc126762031231eb8a144d3318c9dcbaea68ed Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Mon, 17 Jul 2017 12:42:04 +0800 Subject: [PATCH 174/205] Follow comment, rename to `GenerateTempVariableName` --- paddle/framework/op_registry.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/framework/op_registry.h b/paddle/framework/op_registry.h index b627b4a60a..ec237950df 100644 --- a/paddle/framework/op_registry.h +++ b/paddle/framework/op_registry.h @@ -220,7 +220,7 @@ class OpRegistry { op_checkers().at(op_type).Check(op->attrs_); //! Convert Temporary variable name to an unique variable name. - AssignTempVariable(op.get()); + GenerateTempVariableName(op.get()); //! Other op's custom Init for a complex Op. For simple Op, the Init //! method do nothing. @@ -234,7 +234,7 @@ class OpRegistry { }; private: - static void AssignTempVariable(OperatorBase* op) { + static void GenerateTempVariableName(OperatorBase* op) { static std::atomic gUniqId(0UL); for (auto& outname : op->outputs_) { if (outname == OperatorBase::TMP_VAR_NAME()) { From cdec5634492ed088e8c0792aafbbc43de91f6692 Mon Sep 17 00:00:00 2001 From: Yan Chunwei Date: Mon, 17 Jul 2017 13:11:15 +0800 Subject: [PATCH 175/205] Add enforce switch for convient develop (#2850) * add NDEBUG switch to PADDLE_ENFORCE --- paddle/framework/CMakeLists.txt | 10 ++++++---- paddle/framework/enforce.cc | 15 +++++++++++++++ paddle/framework/enforce.h | 6 ++++++ 3 files changed, 27 insertions(+), 4 deletions(-) create mode 100644 paddle/framework/enforce.cc diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt index cc5b05ff0d..824d34d016 100644 --- a/paddle/framework/CMakeLists.txt +++ b/paddle/framework/CMakeLists.txt @@ -2,21 +2,23 @@ cc_library(ddim SRCS ddim.cc) cc_test(ddim_test SRCS ddim_test.cc DEPS ddim) nv_test(dim_test SRCS dim_test.cu DEPS ddim) -cc_test(tensor_test SRCS tensor_test.cc DEPS ddim) +cc_test(tensor_test SRCS tensor_test.cc DEPS ddim glog gflags) cc_test(variable_test SRCS variable_test.cc) cc_test(scope_test SRCS scope_test.cc) -cc_test(enforce_test SRCS enforce_test.cc) +cc_library(enforce SRCS enforce.cc DEPS glog gflags) +cc_test(enforce_test SRCS enforce_test.cc DEPS enforce) proto_library(attr_type SRCS attr_type.proto) proto_library(op_proto SRCS op_proto.proto DEPS attr_type) cc_test(op_proto_test SRCS op_proto_test.cc DEPS op_proto protobuf) proto_library(op_desc SRCS op_desc.proto DEPS attr_type) cc_test(op_desc_test SRCS op_desc_test.cc DEPS op_desc protobuf) -cc_library(operator SRCS operator.cc DEPS op_desc device_context) +cc_library(operator SRCS operator.cc DEPS op_desc device_context enforce) cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry) -cc_library(op_registry SRCS op_registry.cc DEPS op_proto op_desc) +cc_library(op_registry SRCS op_registry.cc DEPS op_proto op_desc enforce) cc_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry operator) + py_proto_compile(framework_py_proto SRCS attr_type.proto op_proto.proto op_desc.proto) # Generate an empty __init__.py to make framework_py_proto as a valid python module. 
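# Editor's note, a hedged sketch with an invented target name: the pattern
# used throughout this file pairs a library target with the unit test that
# links it, e.g.
#
#   cc_library(my_util SRCS my_util.cc DEPS glog)
#   cc_test(my_util_test SRCS my_util_test.cc DEPS my_util)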
add_custom_target(framework_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py) diff --git a/paddle/framework/enforce.cc b/paddle/framework/enforce.cc new file mode 100644 index 0000000000..644930ff98 --- /dev/null +++ b/paddle/framework/enforce.cc @@ -0,0 +1,15 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include "paddle/framework/enforce.h" diff --git a/paddle/framework/enforce.h b/paddle/framework/enforce.h index 56cb7f9564..ffce8148e9 100644 --- a/paddle/framework/enforce.h +++ b/paddle/framework/enforce.h @@ -10,6 +10,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once +#include #include #include #include @@ -58,12 +59,17 @@ class EnforceNotMet : public std::exception { /** * @brief Enforce a condition, otherwise throw an EnforceNotMet */ +#ifdef NDEBUG #define PADDLE_ENFORCE(condition, ...) \ do { \ if (UNLIKELY(!(condition))) { \ PADDLE_THROW(__VA_ARGS__); \ } \ } while (0) +#else +#define PADDLE_ENFORCE(condition, ...) \ + CHECK(condition) << ::paddle::string::Sprintf(__VA_ARGS__); +#endif } // namespace framework } // namespace paddle From c78a5e5da24e7e7edc7d5cfd92b349f3913773ac Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Mon, 17 Jul 2017 13:11:47 +0800 Subject: [PATCH 176/205] Fix merge error before --- python/paddle/v2/optimizer.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/python/paddle/v2/optimizer.py b/python/paddle/v2/optimizer.py index 260a509469..ba58198033 100644 --- a/python/paddle/v2/optimizer.py +++ b/python/paddle/v2/optimizer.py @@ -1,4 +1,3 @@ -import py_paddle.swig_paddle as swig_api import paddle.trainer_config_helpers.config_parser_utils as config_parser_utils import paddle.trainer_config_helpers.optimizers as v1_optimizers """ @@ -17,6 +16,7 @@ __all__ = [ class Optimizer(object): def __init__(self, **kwargs): + import py_paddle.swig_paddle as swig_api if 'batch_size' in kwargs: del kwargs['batch_size'] # not important for python library. @@ -25,8 +25,6 @@ class Optimizer(object): self.__opt_conf_proto__ = config_parser_utils.parse_optimizer_config( __impl__) - if swig_api is None: - raise RuntimeError("paddle.v2 currently need swig_paddle") self.__opt_conf__ = swig_api.OptimizationConfig.createFromProto( self.__opt_conf_proto__) @@ -37,18 +35,22 @@ class Optimizer(object): For each optimizer(SGD, Adam), GradientMachine should enable different buffers. 
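
        Editorial note, hedged: py_paddle.swig_paddle is imported inside the
        methods below rather than at module scope, so merely importing
        paddle.v2.optimizer no longer requires the compiled bindings; they
        are loaded on first use, e.g. when an Optimizer is constructed or an
        updater is created.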
""" + import py_paddle.swig_paddle as swig_api tmp = swig_api.ParameterOptimizer.create(self.__opt_conf__) assert isinstance(tmp, swig_api.ParameterOptimizer) return tmp.getParameterTypes() def __create_local_updater__(self): + import py_paddle.swig_paddle as swig_api return swig_api.ParameterUpdater.createLocalUpdater(self.__opt_conf__) def __create_remote_updater__(self, pass_num, use_sparse_updater): + import py_paddle.swig_paddle as swig_api return swig_api.ParameterUpdater.createRemoteUpdater( self.__opt_conf__, pass_num, use_sparse_updater) def __create_new_remote_updater__(self, pserver_spec, use_etcd): + import py_paddle.swig_paddle as swig_api return swig_api.ParameterUpdater.createNewRemoteUpdater( self.__opt_conf__, pserver_spec, use_etcd) From 8a3e7353078b01d2d1ba133b6eb1e24ea0d20314 Mon Sep 17 00:00:00 2001 From: Liu Yiqun Date: Mon, 17 Jul 2017 05:57:03 +0000 Subject: [PATCH 177/205] Delete the blank line at the end of script file build_android.sh. --- paddle/scripts/docker/build_android.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/paddle/scripts/docker/build_android.sh b/paddle/scripts/docker/build_android.sh index 53e1b818cb..56d290be4a 100644 --- a/paddle/scripts/docker/build_android.sh +++ b/paddle/scripts/docker/build_android.sh @@ -21,4 +21,3 @@ cmake -DCMAKE_SYSTEM_NAME=Android \ .. make -j `nproc` make install - From 80a26a63083cf002567cd2363d9d722ae94d17d2 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Mon, 17 Jul 2017 14:16:47 +0800 Subject: [PATCH 178/205] check duplicate of ProtoAndCheckerMaker (#2903) --- paddle/framework/op_registry.h | 31 +++++++++++++++++------- paddle/framework/op_registry_test.cc | 36 ++++++++++++++++++++++++++-- 2 files changed, 57 insertions(+), 10 deletions(-) diff --git a/paddle/framework/op_registry.h b/paddle/framework/op_registry.h index 24f56b2812..41bdb65f8e 100644 --- a/paddle/framework/op_registry.h +++ b/paddle/framework/op_registry.h @@ -61,7 +61,14 @@ class OpProtoAndCheckerMaker { OpProtoAndCheckerMaker(OpProto* proto, OpAttrChecker* op_checker) : proto_(proto), op_checker_(op_checker) {} - ~OpProtoAndCheckerMaker() { CheckNoDuplicatedAttrs(); } + ~OpProtoAndCheckerMaker() { + PADDLE_ENFORCE(validated_, "should call Validate after build"); + } + + void Validate() { + validated_ = true; + CheckNoDuplicatedInOutAttrs(); + } protected: void AddInput(const std::string& name, const std::string& comment, @@ -163,19 +170,26 @@ Add a mark to which output is temporary is helpful for future optimization. 
} } - void CheckNoDuplicatedAttrs() { + void CheckNoDuplicatedInOutAttrs() { std::unordered_set names; - size_t cnt = 0; + auto checker = [&](const std::string& name) { + PADDLE_ENFORCE(!names.count(name), "[%s] is duplicated", name); + names.insert(name); + }; for (auto& attr : proto_->attrs()) { - names.insert(attr.name()); - ++cnt; + checker(attr.name()); + } + for (auto& input : proto_->inputs()) { + checker(input.name()); + } + for (auto& output : proto_->outputs()) { + checker(output.name()); } - PADDLE_ENFORCE(names.size() == cnt, - "Cannot register two attribute in same name!"); } OpProto* proto_; OpAttrChecker* op_checker_; + bool validated_{false}; bool has_multiple_input_{false}; bool has_multiple_output_{false}; bool has_temporary_output_{false}; @@ -190,7 +204,8 @@ class OpRegistry { creators()[op_type] = [] { return new OpType; }; OpProto& op_proto = protos()[op_type]; OpAttrChecker& op_checker = op_checkers()[op_type]; - ProtoMakerType(&op_proto, &op_checker); + auto maker = ProtoMakerType(&op_proto, &op_checker); + maker.Validate(); *op_proto.mutable_type() = op_type; PADDLE_ENFORCE( op_proto.IsInitialized(), diff --git a/paddle/framework/op_registry_test.cc b/paddle/framework/op_registry_test.cc index 4791d4aaab..d3a51a361a 100644 --- a/paddle/framework/op_registry_test.cc +++ b/paddle/framework/op_registry_test.cc @@ -1,6 +1,8 @@ #include "paddle/framework/op_registry.h" #include +namespace pd = paddle::framework; + namespace paddle { namespace framework { class CosineOp : public OperatorBase { @@ -28,8 +30,6 @@ class MyTestOp : public OperatorBase { void InferShape(const ScopePtr& scope) const override {} void Run(const ScopePtr& scope, const platform::DeviceContext& dev_ctx) const override {} - - public: }; class MyTestOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker { @@ -182,3 +182,35 @@ TEST(OpRegistry, CustomChecker) { int test_attr = op->GetAttr("test_attr"); ASSERT_EQ(test_attr, 4); } + +class TestAttrProtoMaker : public pd::OpProtoAndCheckerMaker { + public: + TestAttrProtoMaker(pd::OpProto* proto, pd::OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddAttr("scale", "scale of test op"); + AddAttr("scale", "scale of test op"); + } +}; + +TEST(ProtoMaker, DuplicatedAttr) { + pd::OpProto op_proto; + pd::OpAttrChecker op_checker; + auto proto_maker = TestAttrProtoMaker(&op_proto, &op_checker); + ASSERT_THROW(proto_maker.Validate(), paddle::framework::EnforceNotMet); +} + +class TestInOutProtoMaker : public pd::OpProtoAndCheckerMaker { + public: + TestInOutProtoMaker(pd::OpProto* proto, pd::OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("input", "input of test op"); + AddInput("input", "input of test op"); + } +}; + +TEST(ProtoMaker, DuplicatedInOut) { + pd::OpProto op_proto; + pd::OpAttrChecker op_checker; + auto proto_maker = TestInOutProtoMaker(&op_proto, &op_checker); + ASSERT_THROW(proto_maker.Validate(), paddle::framework::EnforceNotMet); +} From 38310f9349fedfeaac054eb6283f6c1a54ff5327 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Mon, 17 Jul 2017 14:30:35 +0800 Subject: [PATCH 179/205] Refine CMake dependencies graph --- paddle/framework/CMakeLists.txt | 10 +++++----- paddle/framework/tensor.cc | 19 +++++++++++++++++++ paddle/operators/CMakeLists.txt | 2 +- 3 files changed, 25 insertions(+), 6 deletions(-) create mode 100644 paddle/framework/tensor.cc diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt index 824d34d016..e7d1c7203a 100644 --- 
a/paddle/framework/CMakeLists.txt +++ b/paddle/framework/CMakeLists.txt @@ -1,19 +1,19 @@ -# ddim lib +cc_library(enforce SRCS enforce.cc DEPS glog) +cc_test(enforce_test SRCS enforce_test.cc DEPS enforce) cc_library(ddim SRCS ddim.cc) cc_test(ddim_test SRCS ddim_test.cc DEPS ddim) nv_test(dim_test SRCS dim_test.cu DEPS ddim) -cc_test(tensor_test SRCS tensor_test.cc DEPS ddim glog gflags) +cc_library(tensor SRCS tensor.cc DEPS ddim place enforce paddle_memory) +cc_test(tensor_test SRCS tensor_test.cc DEPS tensor) cc_test(variable_test SRCS variable_test.cc) cc_test(scope_test SRCS scope_test.cc) -cc_library(enforce SRCS enforce.cc DEPS glog gflags) -cc_test(enforce_test SRCS enforce_test.cc DEPS enforce) proto_library(attr_type SRCS attr_type.proto) proto_library(op_proto SRCS op_proto.proto DEPS attr_type) cc_test(op_proto_test SRCS op_proto_test.cc DEPS op_proto protobuf) proto_library(op_desc SRCS op_desc.proto DEPS attr_type) cc_test(op_desc_test SRCS op_desc_test.cc DEPS op_desc protobuf) -cc_library(operator SRCS operator.cc DEPS op_desc device_context enforce) +cc_library(operator SRCS operator.cc DEPS op_desc device_context tensor) cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry) cc_library(op_registry SRCS op_registry.cc DEPS op_proto op_desc enforce) diff --git a/paddle/framework/tensor.cc b/paddle/framework/tensor.cc new file mode 100644 index 0000000000..964f15ab66 --- /dev/null +++ b/paddle/framework/tensor.cc @@ -0,0 +1,19 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include + +namespace paddle { +namespace framework {} +} // namespace paddle diff --git a/paddle/operators/CMakeLists.txt b/paddle/operators/CMakeLists.txt index b2ea8eb344..441b9e30c4 100644 --- a/paddle/operators/CMakeLists.txt +++ b/paddle/operators/CMakeLists.txt @@ -4,7 +4,7 @@ function(op_library TARGET) # for ops. set(cc_srcs) set(cu_srcs) - set(op_common_deps operator op_registry glog ddim) + set(op_common_deps operator op_registry) set(options "") set(oneValueArgs "") set(multiValueArgs SRCS DEPS) From 5017b154689bd8cb595c1d37a54cb2fd072488bc Mon Sep 17 00:00:00 2001 From: qijun Date: Mon, 17 Jul 2017 15:37:42 +0800 Subject: [PATCH 180/205] refactor tensor mutable_data --- paddle/framework/operator.h | 14 +++++++------- paddle/framework/tensor.h | 22 ++++++++++------------ paddle/platform/device_context.h | 4 ++-- 3 files changed, 19 insertions(+), 21 deletions(-) diff --git a/paddle/framework/operator.h b/paddle/framework/operator.h index c48d990eb2..e6cae9c32b 100644 --- a/paddle/framework/operator.h +++ b/paddle/framework/operator.h @@ -14,17 +14,17 @@ limitations under the License. 
*/ #pragma once -#include -#include -#include -#include -#include -#include -#include #include #include #include #include +#include "paddle/framework/attr_checker.h" +#include "paddle/framework/op_desc.pb.h" +#include "paddle/framework/scope.h" +#include "paddle/framework/tensor.h" +#include "paddle/platform/device_context.h" +#include "paddle/platform/place.h" +#include "paddle/utils/Error.h" namespace paddle { namespace framework { diff --git a/paddle/framework/tensor.h b/paddle/framework/tensor.h index 30e00d0e0f..7ba4b29e7c 100644 --- a/paddle/framework/tensor.h +++ b/paddle/framework/tensor.h @@ -62,21 +62,19 @@ class Tensor { !(holder_->place() == place) /* some versions of boost::variant don't have operator!= */ || holder_->size() < numel_ * sizeof(T) + offset_) { + if (platform::is_cpu_place(place)) { + holder_.reset(new PlaceholderImpl( + boost::get(place), numel_ * sizeof(T))); + } #ifdef __CUDACC__ - switch (place.which()) { - case 0: - holder_.reset(new PlaceholderImpl( - boost::get(place), numel_ * sizeof(T))); - break; - - case 1: - holder_.reset(new PlaceholderImpl( - boost::get(place), numel_ * sizeof(T))); - break; + else if (platform::is_gpu_place(place)) { + holder_.reset(new PlaceholderImpl( + boost::get(place), numel_ * sizeof(T))); } #else - holder_.reset(new PlaceholderImpl( - boost::get(place), numel_ * sizeof(T))); + else if (platform::is_gpu_place(place)) { + PADDLE_ENFORCE(true, "GPU not support!"); + } #endif offset_ = 0; } diff --git a/paddle/platform/device_context.h b/paddle/platform/device_context.h index 5f8ad15951..f226a75c20 100644 --- a/paddle/platform/device_context.h +++ b/paddle/platform/device_context.h @@ -20,9 +20,9 @@ limitations under the License. */ #include "paddle/platform/gpu_info.h" #define EIGEN_USE_GPU #endif -#include #include -#include +#include "paddle/platform/place.h" +#include "unsupported/Eigen/CXX11/Tensor" namespace paddle { namespace platform { From 1ed237c11ea1f9f0f68467dcf0f284bc54f9129c Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Mon, 17 Jul 2017 15:55:06 +0800 Subject: [PATCH 181/205] Add skeletons of `mul`, `rowwise_add`, `sigmoid`, `softmax` ops * Implement InferShape and register them, give a stub Kernel method by LOG(INFO) --- paddle/framework/ddim.cc | 6 +++ paddle/framework/ddim.h | 4 ++ paddle/operators/CMakeLists.txt | 5 +++ paddle/operators/add_op.cc | 3 +- paddle/operators/mul_op.cc | 60 +++++++++++++++++++++++++++++ paddle/operators/mul_op.cu | 20 ++++++++++ paddle/operators/mul_op.h | 30 +++++++++++++++ paddle/operators/rowwise_add_op.cc | 61 ++++++++++++++++++++++++++++++ paddle/operators/rowwise_add_op.cu | 5 +++ paddle/operators/rowwise_add_op.h | 31 +++++++++++++++ paddle/operators/sigmoid_op.cc | 49 ++++++++++++++++++++++++ paddle/operators/sigmoid_op.cu | 5 +++ paddle/operators/sigmoid_op.h | 31 +++++++++++++++ paddle/operators/softmax_op.cc | 49 ++++++++++++++++++++++++ paddle/operators/softmax_op.cu | 5 +++ paddle/operators/softmax_op.h | 31 +++++++++++++++ paddle/pybind/CMakeLists.txt | 3 +- paddle/pybind/pybind.cc | 4 ++ 18 files changed, 399 insertions(+), 3 deletions(-) create mode 100644 paddle/operators/mul_op.cc create mode 100644 paddle/operators/mul_op.cu create mode 100644 paddle/operators/mul_op.h create mode 100644 paddle/operators/rowwise_add_op.cc create mode 100644 paddle/operators/rowwise_add_op.cu create mode 100644 paddle/operators/rowwise_add_op.h create mode 100644 paddle/operators/sigmoid_op.cc create mode 100644 paddle/operators/sigmoid_op.cu create mode 100644 
paddle/operators/sigmoid_op.h create mode 100644 paddle/operators/softmax_op.cc create mode 100644 paddle/operators/softmax_op.cu create mode 100644 paddle/operators/softmax_op.h diff --git a/paddle/framework/ddim.cc b/paddle/framework/ddim.cc index 73f5499ad1..f3dd396613 100644 --- a/paddle/framework/ddim.cc +++ b/paddle/framework/ddim.cc @@ -278,5 +278,11 @@ std::ostream& operator<<(std::ostream& os, const DDim& ddim) { return os; } +ssize_t DDim::size() const { return product(*this); } + +DDim::DDim(std::initializer_list init_list) { + *this = make_ddim(init_list); +} + } // namespace framework } // namespace paddle diff --git a/paddle/framework/ddim.h b/paddle/framework/ddim.h index a0c2a8a74a..3976c6c029 100644 --- a/paddle/framework/ddim.h +++ b/paddle/framework/ddim.h @@ -29,6 +29,8 @@ struct DDim { template explicit DDim(const Dim& in) : var(in) {} + /*implicit*/ DDim(std::initializer_list init_list); + template DDim& operator=(const Dim& in) { var = in; @@ -57,6 +59,8 @@ struct DDim { DDim operator+(DDim d) const; DDim operator*(DDim d) const; + + ssize_t size() const; }; /** diff --git a/paddle/operators/CMakeLists.txt b/paddle/operators/CMakeLists.txt index 441b9e30c4..f47c3a4208 100644 --- a/paddle/operators/CMakeLists.txt +++ b/paddle/operators/CMakeLists.txt @@ -42,3 +42,8 @@ endfunction() op_library(add_op SRCS add_op.cc add_op.cu) cc_test(add_op_test SRCS add_op_test.cc DEPS add_op) + +op_library(mul_op SRCS mul_op.cc mul_op.cu) +op_library(rowwise_add_op SRCS rowwise_add_op.cu rowwise_add_op.cc) +op_library(sigmoid_op SRCS sigmoid_op.cu sigmoid_op.cc) +op_library(softmax_op SRCS softmax_op.cc softmax_op.cu) diff --git a/paddle/operators/add_op.cc b/paddle/operators/add_op.cc index 522b23cbc4..355c92a504 100644 --- a/paddle/operators/add_op.cc +++ b/paddle/operators/add_op.cc @@ -31,8 +31,7 @@ protected: "Inputs/Outputs of AddOp must all be set"); PADDLE_ENFORCE(inputs[0]->dims() == inputs[1]->dims(), "Two input of Add Op's dimension must be same."); - // Need set dims in Tensor - // outputs[0]->set_dims(inputs[0]->dims()) + outputs[0]->set_dims(inputs[0]->dims()); } }; diff --git a/paddle/operators/mul_op.cc b/paddle/operators/mul_op.cc new file mode 100644 index 0000000000..713b2a5dc8 --- /dev/null +++ b/paddle/operators/mul_op.cc @@ -0,0 +1,60 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + +#include +#include +#include + +namespace paddle { +namespace operators { + +class MulOp : public framework::OperatorWithKernel { +protected: + void InferShape( + const std::vector &inputs, + const std::vector &outputs) const override { + PADDLE_ENFORCE(inputs.size() == 2, "The mul op must take two inputs"); + auto dim0 = inputs[0]->dims(); + auto dim1 = inputs[1]->dims(); + PADDLE_ENFORCE(dim0.size() == 2 && dim1.size() == 2, + "The input of mul op must be matrix"); + PADDLE_ENFORCE( + dim0[1] == dim1[0], + "First matrix's width must be equal with second matrix's height."); + PADDLE_ENFORCE(outputs.size() == 1, "The mul op must take one output"); + outputs[0]->set_dims({dim0[0], dim1[1]}); + } +}; + +class MulOpMaker : public framework::OpProtoAndCheckerMaker { +public: + MulOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "The first input of mul op"); + AddInput("Y", "The second input of mul op"); + AddOutput("Out", "The output of mul op"); + AddComment(R"DOC( +Two Element Mul Operator. + +The equation is: Out = X * Y +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OP(mul, paddle::operators::MulOp, paddle::operators::MulOpMaker); +REGISTER_OP_CPU_KERNEL( + mul, paddle::operators::MulKernel); diff --git a/paddle/operators/mul_op.cu b/paddle/operators/mul_op.cu new file mode 100644 index 0000000000..201723df24 --- /dev/null +++ b/paddle/operators/mul_op.cu @@ -0,0 +1,20 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include +#include + +REGISTER_OP_GPU_KERNEL(mul, + paddle::operators::MulKernel); \ No newline at end of file diff --git a/paddle/operators/mul_op.h b/paddle/operators/mul_op.h new file mode 100644 index 0000000000..ed8d26e136 --- /dev/null +++ b/paddle/operators/mul_op.h @@ -0,0 +1,30 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + +#pragma once +#include +#include + +namespace paddle { +namespace operators { + +template +class MulKernel : public framework::OpKernel { +public: + void Compute(const KernelContext &context) const override { + LOG(INFO) << "Mul kernel in " << typeid(Place).name(); + } +}; +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/rowwise_add_op.cc b/paddle/operators/rowwise_add_op.cc new file mode 100644 index 0000000000..414bafd046 --- /dev/null +++ b/paddle/operators/rowwise_add_op.cc @@ -0,0 +1,61 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include +#include +namespace paddle { +namespace operators { + +class RowWiseAddOp : public framework::OperatorWithKernel { +protected: + void InferShape( + const std::vector &inputs, + const std::vector &outputs) const override { + PADDLE_ENFORCE(inputs.size() == 2UL, "Two inputs is needed by rowwise add"); + auto dim0 = inputs[0]->dims(); + auto dim1 = inputs[1]->dims(); + + PADDLE_ENFORCE(dim0.size() == 2, "Input 0 must be matrix"); + PADDLE_ENFORCE(dim1.size() == 1, "The second input must be vector"); + PADDLE_ENFORCE(dim0[1] == dim1[0], "The width of two input must be same"); + PADDLE_ENFORCE(outputs.size() == 1, "The output size must be 1"); + outputs[0]->set_dims(inputs[0]->dims()); + } +}; + +class RowWiseAddOpMaker : public framework::OpProtoAndCheckerMaker { +public: + RowWiseAddOpMaker(framework::OpProto *proto, + framework::OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "The left input of row-wise add op, must be matrix"); + AddInput("b", "The right input of row-wise add op, must be vector"); + AddOutput("Out", "The output of row-wise add op"); + AddComment(R"DOC(Row-wise Add operator + +for i in xrange(X.shape[0]): + Out = X[i] + b +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OP(rowwise_add, + paddle::operators::RowWiseAddOp, + paddle::operators::RowWiseAddOpMaker); +REGISTER_OP_CPU_KERNEL( + rowwise_add, + paddle::operators::RowWiseAddKernel); diff --git a/paddle/operators/rowwise_add_op.cu b/paddle/operators/rowwise_add_op.cu new file mode 100644 index 0000000000..95e29d1fa3 --- /dev/null +++ b/paddle/operators/rowwise_add_op.cu @@ -0,0 +1,5 @@ +#include +#include + +REGISTER_OP_GPU_KERNEL( + mul, paddle::operators::RowWiseAddKernel); diff --git a/paddle/operators/rowwise_add_op.h b/paddle/operators/rowwise_add_op.h new file mode 100644 index 0000000000..3dfde93ba2 --- /dev/null +++ b/paddle/operators/rowwise_add_op.h @@ -0,0 +1,31 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#pragma once +#include +#include + +namespace paddle { +namespace operators { + +template +class RowWiseAddKernel : public framework::OpKernel { +public: + void Compute(const KernelContext &context) const override { + LOG(INFO) << "RowWiseAdd kernel in " << typeid(Place).name(); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/sigmoid_op.cc b/paddle/operators/sigmoid_op.cc new file mode 100644 index 0000000000..45ae277c53 --- /dev/null +++ b/paddle/operators/sigmoid_op.cc @@ -0,0 +1,49 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include +#include +namespace paddle { +namespace operators { + +class SigmoidOp : public framework::OperatorWithKernel { +protected: + void InferShape( + const std::vector &inputs, + const std::vector &outputs) const override { + PADDLE_ENFORCE(inputs.size() == 1, "Sigmoid Op only have one input"); + PADDLE_ENFORCE(outputs.size() == 1, "Sigmoid Op only have one output"); + outputs[0]->set_dims(inputs[0]->dims()); + } +}; + +class SigmoidOpMaker : public framework::OpProtoAndCheckerMaker { +public: + SigmoidOpMaker(framework::OpProto *proto, + framework::OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "sigmoid input"); + AddInput("Y", "sigmoid output"); + AddComment("Sigmoid function"); + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OP(sigmoid, + paddle::operators::SigmoidOp, + paddle::operators::SigmoidOpMaker); +REGISTER_OP_CPU_KERNEL( + sigmoid, paddle::operators::SigmoidKernel); diff --git a/paddle/operators/sigmoid_op.cu b/paddle/operators/sigmoid_op.cu new file mode 100644 index 0000000000..79d5222348 --- /dev/null +++ b/paddle/operators/sigmoid_op.cu @@ -0,0 +1,5 @@ +#include +#include + +REGISTER_OP_GPU_KERNEL( + sigmoid, paddle::operators::SigmoidKernel); diff --git a/paddle/operators/sigmoid_op.h b/paddle/operators/sigmoid_op.h new file mode 100644 index 0000000000..191aa42e4a --- /dev/null +++ b/paddle/operators/sigmoid_op.h @@ -0,0 +1,31 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ See the License for the specific language governing permissions and + limitations under the License. */ + +#pragma once + +#include +#include + +namespace paddle { +namespace operators { + +template +class SigmoidKernel : public framework::OpKernel { +public: + void Compute(const KernelContext &context) const override { + LOG(INFO) << "Sigmoid kernel in " << typeid(Place).name(); + } +}; +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/softmax_op.cc b/paddle/operators/softmax_op.cc new file mode 100644 index 0000000000..4ca7be359e --- /dev/null +++ b/paddle/operators/softmax_op.cc @@ -0,0 +1,49 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ +#include +#include + +namespace paddle { +namespace operators { + +class SoftmaxOp : public framework::OperatorWithKernel { +protected: + void InferShape( + const std::vector &inputs, + const std::vector &outputs) const override { + PADDLE_ENFORCE(inputs.size() == 1, "Only one input is need for softmax"); + PADDLE_ENFORCE(outputs.size() == 1, "Only one output is need for softmax"); + + outputs[0]->set_dims(inputs[0]->dims()); + } +}; + +class SoftmaxOpMaker : public framework::OpProtoAndCheckerMaker { +public: + SoftmaxOpMaker(framework::OpProto *proto, + framework::OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "input of softmax"); + AddOutput("Y", "output of softmax"); + AddComment("Softmax Op"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP(softmax, ops::SoftmaxOp, ops::SoftmaxOpMaker); +REGISTER_OP_CPU_KERNEL(softmax, ops::SoftmaxKernel); diff --git a/paddle/operators/softmax_op.cu b/paddle/operators/softmax_op.cu new file mode 100644 index 0000000000..59f32b35cf --- /dev/null +++ b/paddle/operators/softmax_op.cu @@ -0,0 +1,5 @@ +#include +#include + +REGISTER_OP_GPU_KERNEL( + softmax, paddle::operators::SoftmaxKernel); diff --git a/paddle/operators/softmax_op.h b/paddle/operators/softmax_op.h new file mode 100644 index 0000000000..fe97c9aafe --- /dev/null +++ b/paddle/operators/softmax_op.h @@ -0,0 +1,31 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + +#pragma once + +#include +#include + +namespace paddle { +namespace operators { + +template +class SoftmaxKernel : public framework::OpKernel { +public: + void Compute(const KernelContext &context) const override { + LOG(INFO) << "Softmax kernel in " << typeid(Place).name(); + } +}; +} // namespace operators +} // namespace paddle diff --git a/paddle/pybind/CMakeLists.txt b/paddle/pybind/CMakeLists.txt index 8564a5f5fe..00b14a9432 100644 --- a/paddle/pybind/CMakeLists.txt +++ b/paddle/pybind/CMakeLists.txt @@ -1 +1,2 @@ -cc_library(paddle_pybind SHARED SRCS pybind.cc DEPS pybind python add_op) +cc_library(paddle_pybind SHARED SRCS pybind.cc DEPS pybind python + add_op mul_op rowwise_add_op sigmoid_op softmax_op) diff --git a/paddle/pybind/pybind.cc b/paddle/pybind/pybind.cc index c1a025ed04..aa2b84799c 100644 --- a/paddle/pybind/pybind.cc +++ b/paddle/pybind/pybind.cc @@ -24,6 +24,10 @@ namespace py = pybind11; namespace pd = paddle::framework; USE_OP(add_two); +USE_OP(softmax); +USE_OP(mul); +USE_OP(rowwise_add); +USE_OP(sigmoid); PYBIND11_PLUGIN(core) { py::module m("core", "C++ core of Paddle Paddle"); From a0caf23430545c12b4f714891d5437559a67ac07 Mon Sep 17 00:00:00 2001 From: Yan Chunwei Date: Mon, 17 Jul 2017 16:03:12 +0800 Subject: [PATCH 182/205] Op varient inputs (#2901) * add inputs * add ut for multiple inputs * fix AddToLayer * op_desc -> op_proto * CreateArgumentOffsetMap -> CreateInOutOffsetMap * move CreateInOutOffsetMap from OperatorBase to op registry * arg_idxs_ -> in_out_idxs_ --- paddle/framework/op_registry.h | 11 +++ paddle/framework/operator.cc | 58 +++++++++++++++ paddle/framework/operator.h | 99 ++++++++++++++++++------- paddle/framework/operator_test.cc | 116 +++++++++++++++++++++++++++--- paddle/operators/add_op.h | 4 +- 5 files changed, 251 insertions(+), 37 deletions(-) diff --git a/paddle/framework/op_registry.h b/paddle/framework/op_registry.h index 41bdb65f8e..a84364301a 100644 --- a/paddle/framework/op_registry.h +++ b/paddle/framework/op_registry.h @@ -216,21 +216,32 @@ class OpRegistry { static OperatorPtr CreateOp(const OpDesc& op_desc) { std::string op_type = op_desc.type(); OperatorPtr op(creators().at(op_type)()); + const OpProto& op_proto = protos().at(op_type); + // set op's inputs_ from desc. op->type_ = op_desc.type(); op->inputs_.reserve((size_t)op_desc.inputs_size()); std::copy(op_desc.inputs().begin(), op_desc.inputs().end(), std::back_inserter(op->inputs_)); + // set op's outputs_ from desc. op->outputs_.reserve((size_t)op_desc.outputs_size()); std::copy(op_desc.outputs().begin(), op_desc.outputs().end(), std::back_inserter(op->outputs_)); + // set op's attr; for (auto& attr : op_desc.attrs()) { op->attrs_[attr.name()] = AttrTypeHelper::GetAttrValue(attr); } op_checkers().at(op_type).Check(op->attrs_); + // set argument offsets stored in op. + CreateInOutOffsetMap(op, op_proto); op->Init(); return op; } + // init op.in_out_idxs_ to accelerate argument's offset lookup. + static void CreateInOutOffsetMap(OperatorPtr op, const OpProto& proto) { + op->CreateInOutOffsetMap(proto); + } + static std::unordered_map& protos() { static std::unordered_map protos_; return protos_; diff --git a/paddle/framework/operator.cc b/paddle/framework/operator.cc index 7756162a87..58a34fca0f 100644 --- a/paddle/framework/operator.cc +++ b/paddle/framework/operator.cc @@ -12,11 +12,69 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#include + #include "paddle/framework/operator.h" namespace paddle { namespace framework { +void OperatorBase::CreateInOutOffsetMap(const OpProto& proto) { + PADDLE_ENFORCE(in_out_idxs_.empty(), "duplicate call CreateInOutOffsetMap"); + for (int i = 0; i < proto.inputs_size(); i++) { + const auto& name = proto.inputs()[i].name(); + in_out_idxs_[name] = i; + } + for (int i = 0; i < proto.outputs_size(); i++) { + const auto& name = proto.outputs()[i].name(); + in_out_idxs_[name] = i; + } +} + +const std::string& OperatorBase::Input(const std::string& name) const { + auto it = in_out_idxs_.find(name); + PADDLE_ENFORCE(it != in_out_idxs_.end(), "no key [%s] in in_out_idxs_", name); + + if (attrs_.count("input_format") == 0) { + return inputs_[it->second]; + } else { + const auto& input_format = GetAttr>("input_format"); + int idx = input_format[it->second]; + return inputs_.at(idx); + } +} + +std::vector OperatorBase::Inputs(const std::string& name) const { + auto input_format = GetAttr>("input_format"); + auto offset = in_out_idxs_.at(name); + + return std::vector{ + inputs_.begin() + input_format.at(offset), + inputs_.begin() + input_format.at(offset + 1)}; +} + +const std::string& OperatorBase::Output(const std::string& name) const { + auto it = in_out_idxs_.find(name); + PADDLE_ENFORCE(it != in_out_idxs_.end(), "no key [%s] in in_out_idxs_", name); + + if (attrs_.count("output_format") == 0) { + return outputs_[it->second]; + } else { + const auto& output_format = GetAttr>("output_format"); + int idx = output_format[it->second]; + return outputs_.at(idx); + } +} + +std::vector OperatorBase::Outputs(const std::string& name) const { + auto output_format = GetAttr>("output_format"); + auto offset = in_out_idxs_.at(name); + + return std::vector{ + outputs_.begin() + output_format.at(offset), + outputs_.begin() + output_format.at(offset + 1)}; +} + std::string OperatorBase::DebugString() const { std::stringstream ss; ss << "=================\n"; diff --git a/paddle/framework/operator.h b/paddle/framework/operator.h index f7ed6e9f3d..6567950ce5 100644 --- a/paddle/framework/operator.h +++ b/paddle/framework/operator.h @@ -14,18 +14,20 @@ limitations under the License. */ #pragma once -#include -#include -#include -#include -#include -#include -#include #include #include #include #include +#include "paddle/framework/attr_checker.h" +#include "paddle/framework/op_desc.pb.h" +#include "paddle/framework/op_proto.pb.h" +#include "paddle/framework/scope.h" +#include "paddle/framework/tensor.h" +#include "paddle/platform/device_context.h" +#include "paddle/platform/place.h" +#include "paddle/utils/Error.h" + namespace paddle { namespace framework { @@ -62,11 +64,72 @@ class OperatorBase { virtual void Run(const ScopePtr& scope, const platform::DeviceContext& dev_ctx) const = 0; + // Get a input with argument's name described in `op_proto` + const std::string& Input(const std::string& name) const; + // Get a input which has multiple variables. + // TODO add a vector_view to prevent memory copy. + std::vector Inputs(const std::string& name) const; + // Get a output with argument's name described in `op_proto` + const std::string& Output(const std::string& name) const; + // Get an output which has multiple variables. + // TODO add a vector_view to prevent memory copy. + std::vector Outputs(const std::string& name) const; + + // init in_out_idxs_ to accelerate argument's offset lookup. 
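+  // The map is built once from the OpProto's declared inputs and
+  // outputs, so a later Input("x") or Output("y") call resolves a
+  // name with one hash lookup instead of rescanning the proto.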
+ void CreateInOutOffsetMap(const OpProto& proto); + public: std::string type_; std::vector inputs_; std::vector outputs_; AttributeMap attrs_; + // store the arguments' offset described in op_desc. + std::unordered_map in_out_idxs_; +}; + +class KernelContext { + public: + KernelContext(const OperatorBase* op, const std::shared_ptr& scope, + const platform::DeviceContext& device_context) + : op_(*op), scope_(scope), device_context_(device_context) {} + + const Variable* Input(int index) const { + return scope_->GetVariable(op_.inputs_[index]); + } + + Variable* Output(int index) const { + return scope_->GetVariable(op_.outputs_[index]); + } + + const Variable* Input(const std::string& name) const { + return scope_->GetVariable(op_.Input(name)); + } + + const Variable* Output(const std::string& name) const { + return scope_->GetVariable(op_.Output(name)); + } + + const std::vector Inputs(const std::string& name) const { + auto names = op_.Inputs(name); + std::vector res; + std::transform( + names.begin(), names.end(), res.begin(), + [this](const std::string& name) { return scope_->GetVariable(name); }); + return res; + } + + const std::vector Outputs(const std::string& name) const { + auto names = op_.Outputs(name); + std::vector res; + std::transform( + names.begin(), names.end(), res.begin(), + [this](const std::string& name) { return scope_->GetVariable(name); }); + return res; + } + + const OperatorBase& op_; + const std::shared_ptr& scope_; + const platform::DeviceContext& device_context_; }; class OpKernel { @@ -77,25 +140,6 @@ class OpKernel { * device resource such as CUDA stream, cublas handle, etc. from * KernelContext. User should construct it before run the Operator. */ - class KernelContext { - public: - KernelContext(const OperatorBase* op, const ScopePtr& scope, - const platform::DeviceContext& device_context) - : op_(*op), scope_(scope), device_context_(device_context) {} - - const Variable* Input(int index) const { - return scope_->GetVariable(op_.inputs_[index]); - } - - Variable* Output(int index) const { - return scope_->GetVariable(op_.outputs_[index]); - } - - const OperatorBase& op_; - const ScopePtr& scope_; - const platform::DeviceContext& device_context_; - }; - virtual void Compute(const KernelContext& context) const = 0; virtual ~OpKernel() {} @@ -140,7 +184,7 @@ class OperatorWithKernel : public OperatorBase { void Run(const ScopePtr& scope, const platform::DeviceContext& dev_ctx) const final { auto& opKernel = AllOpKernels().at(type_).at(OpKernelKey(dev_ctx)); - opKernel->Compute(OpKernel::KernelContext(this, scope, dev_ctx)); + opKernel->Compute(KernelContext(this, scope, dev_ctx)); } static std::unordered_map& @@ -148,6 +192,7 @@ class OperatorWithKernel : public OperatorBase { static std::unordered_map g_all_op_kernels; return g_all_op_kernels; } + void InferShape(const std::shared_ptr& scope) const final { std::vector ins; VarNamesToTensors(scope, inputs_, &ins); diff --git a/paddle/framework/operator_test.cc b/paddle/framework/operator_test.cc index 19ac4ecafa..6fa110f94c 100644 --- a/paddle/framework/operator_test.cc +++ b/paddle/framework/operator_test.cc @@ -30,7 +30,6 @@ class OpWithoutKernelTest : public OperatorBase { op_run_num++; ASSERT_EQ((int)inputs_.size(), 1); ASSERT_EQ((int)outputs_.size(), 1); - ASSERT_NEAR(GetAttr("scale"), 3.14, 1e-5); ASSERT_EQ(scope->GetVariable(inputs_[0]), nullptr); ASSERT_EQ(x, 1); ASSERT_NE(scope->GetVariable(outputs_[0]), nullptr); @@ -86,9 +85,11 @@ class OpKernelTestProtoAndCheckerMaker : public 
OpProtoAndCheckerMaker { public: OpKernelTestProtoAndCheckerMaker(OpProto* proto, OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { - AddInput("input", "input of test op"); - AddOutput("output", "output of test op"); - AddAttr("scale", "scale of cosine op"); + AddInput("x", "input of test op"); + AddOutput("y", "output of test op"); + AddAttr("scale", "scale of cosine op") + .SetDefault(1.0) + .LargerThan(0.0); AddComment("This is test op"); } }; @@ -103,11 +104,65 @@ class OpWithKernelTest : public OperatorWithKernel { class CPUKernelTest : public OpKernel { public: - void Compute(const KernelContext& context) const { + void Compute(const KernelContext& ctx) const { + std::cout << "this is cpu kernel" << std::endl; + std::cout << ctx.op_.DebugString() << std::endl; cpu_kernel_run_num++; - ASSERT_EQ((int)context.op_.inputs_.size(), 1); - ASSERT_EQ((int)context.op_.outputs_.size(), 1); - ASSERT_NEAR(context.op_.GetAttr("scale"), 3.14, 1e-5); + ASSERT_EQ(ctx.op_.Input("x"), "IN1"); + ASSERT_EQ(ctx.op_.Output("y"), "OUT1"); + } +}; + +// multiple inputs test +class OperatorMultiInputsTest : public OperatorBase { + public: + void Init() override { x = 1; } + void InferShape(const std::shared_ptr& scope) const override {} + void Run(const std::shared_ptr& scope, + const platform::DeviceContext& dev_ctx) const override { + ASSERT_EQ(scope->GetVariable(inputs_[0]), nullptr); + ASSERT_EQ(x, 1); + ASSERT_NE(scope->GetVariable(outputs_[0]), nullptr); + ASSERT_EQ(Input("x"), "IN1"); + ASSERT_EQ(Input("y"), "OUT1"); + } + + public: + float x = 0; +}; + +class OpKernelTestMultiInputsProtoAndCheckerMaker + : public OpProtoAndCheckerMaker { + public: + OpKernelTestMultiInputsProtoAndCheckerMaker(OpProto* proto, + OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInputs("xs", "inputs of test op"); + AddInput("k", "input of test op"); + AddOutputs("ys", "outputs of test op"); + AddAttr("scale", "scale of cosine op") + .SetDefault(1.0) + .LargerThan(0.0); + AddComment("This is test op"); + } +}; + +class CPUKernalMultiInputsTest : public OpKernel { + public: + void Compute(const KernelContext& ctx) const { + auto xs = ctx.op_.Inputs("xs"); + ASSERT_EQ(xs.size(), 3UL); + ASSERT_EQ(xs[0], "x0"); + ASSERT_EQ(xs[1], "x1"); + ASSERT_EQ(xs[2], "x2"); + + auto k = ctx.op_.Input("k"); + ASSERT_EQ(k, "k0"); + + auto ys = ctx.op_.Outputs("ys"); + ASSERT_EQ(ys.size(), 2UL); + ASSERT_EQ(ys[0], "y0"); + ASSERT_EQ(ys[1], "y1"); } }; @@ -118,6 +173,7 @@ REGISTER_OP(op_with_kernel, paddle::framework::OpWithKernelTest, paddle::framework::OpKernelTestProtoAndCheckerMaker); REGISTER_OP_CPU_KERNEL(op_with_kernel, paddle::framework::CPUKernelTest); +// test with single input TEST(OpKernel, all) { paddle::framework::OpDesc op_desc; op_desc.set_type("op_with_kernel"); @@ -137,3 +193,47 @@ TEST(OpKernel, all) { op->Run(scope, cpu_device_context); ASSERT_EQ(paddle::framework::cpu_kernel_run_num, 1); } + +REGISTER_OP(op_multi_inputs_with_kernel, paddle::framework::OpWithKernelTest, + paddle::framework::OpKernelTestMultiInputsProtoAndCheckerMaker); +REGISTER_OP_CPU_KERNEL(op_multi_inputs_with_kernel, + paddle::framework::CPUKernalMultiInputsTest); + +// test with multi inputs +TEST(OpKernel, multi_inputs) { + using namespace paddle::framework; + + OpDesc op_desc; + op_desc.set_type("op_multi_inputs_with_kernel"); + *op_desc.mutable_inputs()->Add() = "x0"; + *op_desc.mutable_inputs()->Add() = "x1"; + *op_desc.mutable_inputs()->Add() = "x2"; + *op_desc.mutable_inputs()->Add() = "k0"; 
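+  // The four names above are the three variables of the repeated
+  // input "xs" plus the single input "k"; the "input_format"
+  // attribute added below records the offset at which each declared
+  // argument starts in this flat list (0 and 3), plus the end (4).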
+ *op_desc.mutable_outputs()->Add() = "y0"; + *op_desc.mutable_outputs()->Add() = "y1"; + auto attr = op_desc.mutable_attrs()->Add(); + attr->set_name("scale"); + attr->set_type(paddle::framework::AttrType::FLOAT); + attr->set_f(3.14); + + auto attr0 = op_desc.mutable_attrs()->Add(); + attr0->set_name("input_format"); + attr0->set_type(paddle::framework::AttrType::INTS); + auto input_format = attr0->mutable_ints(); + input_format->Add(0); // x0 + input_format->Add(3); // k + input_format->Add(4); // end + + auto attr1 = op_desc.mutable_attrs()->Add(); + attr1->set_name("output_format"); + attr1->set_type(paddle::framework::AttrType::INTS); + auto output_format = attr1->mutable_ints(); + output_format->Add(0); // y0 + output_format->Add(2); // y1 + + paddle::platform::CPUDeviceContext cpu_device_context; + auto scope = std::make_shared(); + + OperatorPtr op(paddle::framework::OpRegistry::CreateOp(op_desc)); + op->Run(scope, cpu_device_context); +} diff --git a/paddle/operators/add_op.h b/paddle/operators/add_op.h index 17d459dbc8..000564f66d 100644 --- a/paddle/operators/add_op.h +++ b/paddle/operators/add_op.h @@ -8,10 +8,10 @@ namespace operators { template class AddKernel : public framework::OpKernel { public: - void Compute(const KernelContext &context) const override { + void Compute(const framework::KernelContext &context) const override { LOG(INFO) << "Add kernel in " << typeid(Place).name(); } }; -} // namespace op +} // namespace operators } // namespace paddle From 2a03e3808d48257a71366f5802aeec052914e1cc Mon Sep 17 00:00:00 2001 From: qijun Date: Mon, 17 Jul 2017 16:45:42 +0800 Subject: [PATCH 183/205] set correct place for output tensor --- paddle/framework/operator.cc | 4 ++-- paddle/framework/operator.h | 4 +++- paddle/operators/add_op.h | 4 ++-- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/paddle/framework/operator.cc b/paddle/framework/operator.cc index 946bde5734..1a7e332227 100644 --- a/paddle/framework/operator.cc +++ b/paddle/framework/operator.cc @@ -18,14 +18,14 @@ namespace paddle { namespace framework { template <> -Eigen::DefaultDevice* OpKernel::KernelContext::get_eigen_device< +Eigen::DefaultDevice* OpKernel::KernelContext::GetEigenDevice< platform::CPUPlace, Eigen::DefaultDevice>() const { return device_context_.get_eigen_device(); } #ifndef PADDLE_ONLY_CPU template <> -Eigen::GpuDevice* OpKernel::KernelContext::get_eigen_device< +Eigen::GpuDevice* OpKernel::KernelContext::GetEigenDevice< platform::GPUPlace, Eigen::GpuDevice>() const { return device_context_.get_eigen_device(); } diff --git a/paddle/framework/operator.h b/paddle/framework/operator.h index e6cae9c32b..b8c5098e49 100644 --- a/paddle/framework/operator.h +++ b/paddle/framework/operator.h @@ -109,7 +109,9 @@ class OpKernel { template ::EigenDeviceType> - DeviceType* get_eigen_device() const; + DeviceType* GetEigenDevice() const; + + platform::Place GetPlace() const { return device_context_.GetPlace(); } const OperatorBase& op_; const ScopePtr& scope_; diff --git a/paddle/operators/add_op.h b/paddle/operators/add_op.h index e8c718669a..e9a793d23b 100644 --- a/paddle/operators/add_op.h +++ b/paddle/operators/add_op.h @@ -27,9 +27,9 @@ public: auto input1 = context.Input(1)->Get(); auto* output = context.Output(0)->GetMutable(); - output->mutable_data(Place()); + output->mutable_data(context.GetPlace()); - output->flat().device(*(context.get_eigen_device())) = + output->flat().device(*(context.GetEigenDevice())) = input0.flat() + input1.flat(); } }; From 
5847b96a61fec031555f245702f604e928f8ad4e Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Mon, 17 Jul 2017 18:30:40 +0800 Subject: [PATCH 184/205] Follow comments, correct implement `DDim::size()` * Also fix unit test --- paddle/framework/ddim.cc | 2 +- paddle/operators/softmax_op.cu | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/framework/ddim.cc b/paddle/framework/ddim.cc index f3dd396613..eb1a18ee40 100644 --- a/paddle/framework/ddim.cc +++ b/paddle/framework/ddim.cc @@ -278,7 +278,7 @@ std::ostream& operator<<(std::ostream& os, const DDim& ddim) { return os; } -ssize_t DDim::size() const { return product(*this); } +ssize_t DDim::size() const { return arity(*this); } DDim::DDim(std::initializer_list init_list) { *this = make_ddim(init_list); diff --git a/paddle/operators/softmax_op.cu b/paddle/operators/softmax_op.cu index 59f32b35cf..903eef1b62 100644 --- a/paddle/operators/softmax_op.cu +++ b/paddle/operators/softmax_op.cu @@ -1,5 +1,5 @@ #include -#include +#include REGISTER_OP_GPU_KERNEL( softmax, paddle::operators::SoftmaxKernel); From 73a9f0f25d86b46fa74fc574e2f443d644bcfb88 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Mon, 17 Jul 2017 19:44:12 +0800 Subject: [PATCH 185/205] Merge develop --- paddle/operators/mul_op.h | 3 ++- paddle/operators/rowwise_add_op.h | 2 +- paddle/operators/sigmoid_op.h | 2 +- paddle/operators/softmax_op.h | 2 +- 4 files changed, 5 insertions(+), 4 deletions(-) diff --git a/paddle/operators/mul_op.h b/paddle/operators/mul_op.h index ed8d26e136..ce8a0169e0 100644 --- a/paddle/operators/mul_op.h +++ b/paddle/operators/mul_op.h @@ -13,6 +13,7 @@ limitations under the License. */ #pragma once + #include #include @@ -22,7 +23,7 @@ namespace operators { template class MulKernel : public framework::OpKernel { public: - void Compute(const KernelContext &context) const override { + void Compute(const framework::KernelContext &context) const override { LOG(INFO) << "Mul kernel in " << typeid(Place).name(); } }; diff --git a/paddle/operators/rowwise_add_op.h b/paddle/operators/rowwise_add_op.h index 3dfde93ba2..35f43e6376 100644 --- a/paddle/operators/rowwise_add_op.h +++ b/paddle/operators/rowwise_add_op.h @@ -22,7 +22,7 @@ namespace operators { template class RowWiseAddKernel : public framework::OpKernel { public: - void Compute(const KernelContext &context) const override { + void Compute(const framework::KernelContext &context) const override { LOG(INFO) << "RowWiseAdd kernel in " << typeid(Place).name(); } }; diff --git a/paddle/operators/sigmoid_op.h b/paddle/operators/sigmoid_op.h index 191aa42e4a..42173343f3 100644 --- a/paddle/operators/sigmoid_op.h +++ b/paddle/operators/sigmoid_op.h @@ -23,7 +23,7 @@ namespace operators { template class SigmoidKernel : public framework::OpKernel { public: - void Compute(const KernelContext &context) const override { + void Compute(const framework::KernelContext &context) const override { LOG(INFO) << "Sigmoid kernel in " << typeid(Place).name(); } }; diff --git a/paddle/operators/softmax_op.h b/paddle/operators/softmax_op.h index fe97c9aafe..74e9e2786b 100644 --- a/paddle/operators/softmax_op.h +++ b/paddle/operators/softmax_op.h @@ -23,7 +23,7 @@ namespace operators { template class SoftmaxKernel : public framework::OpKernel { public: - void Compute(const KernelContext &context) const override { + void Compute(const framework::KernelContext &context) const override { LOG(INFO) << "Softmax kernel in " << typeid(Place).name(); } }; From 78bd815e8504496ccae388bb799cc8026427084c Mon Sep 17 00:00:00 
2001 From: fengjiayi Date: Mon, 17 Jul 2017 19:48:33 +0800 Subject: [PATCH 186/205] refine conditional compilation and remove `numel_` --- paddle/framework/tensor.h | 40 +++++++++++++++++---------------------- 1 file changed, 17 insertions(+), 23 deletions(-) diff --git a/paddle/framework/tensor.h b/paddle/framework/tensor.h index 29bad7a00a..b405e3877c 100644 --- a/paddle/framework/tensor.h +++ b/paddle/framework/tensor.h @@ -27,7 +27,7 @@ namespace framework { class Tensor { public: - Tensor() : numel_(0), offset_(0) {} + Tensor() : offset_(0) {} template const T* data() const { @@ -44,30 +44,26 @@ class Tensor { template T* mutable_data(platform::Place place) { - PADDLE_ENFORCE(numel_ > 0, - "Tensor::numel_ must be larger than zero to call " + PADDLE_ENFORCE(product(dims_) > 0, + "Tensor's numel must be larger than zero to call " "Tensor::mutable_data. Call Tensor::set_dim first."); if (holder_ == nullptr || !(holder_->place() == place) /* some versions of boost::variant don't have operator!= */ - || holder_->size() < numel_ * sizeof(T) + offset_) { + || holder_->size() < product(dims_) * sizeof(T) + offset_) { + if (platform::is_cpu_place(place)) { + holder_.reset(new PlaceholderImpl( + boost::get(place), product(dims_) * sizeof(T))); + } else if (platform::is_gpu_place(place)) { #ifdef __CUDACC__ - switch (place.which()) { - case 0: - holder_.reset(new PlaceholderImpl( - boost::get(place), numel_ * sizeof(T))); - break; - - case 1: - holder_.reset(new PlaceholderImpl( - boost::get(place), numel_ * sizeof(T))); - break; - } + holder_.reset(new PlaceholderImpl( + boost::get(place), product(dims_) * sizeof(T))); #else - holder_.reset(new PlaceholderImpl( - boost::get(place), numel_ * sizeof(T))); + PADDLE_ENFORCE(true, "'GPUPlace' is not supported in CPU only device."); #endif - + } else { + PADDLE_ENFORCE(true, "Unknown 'place'."); + } offset_ = 0; } return reinterpret_cast(reinterpret_cast(holder_->ptr()) + @@ -88,7 +84,7 @@ class Tensor { platform::is_cpu_place(dst_place), "Tensor::CopyFrom only support CPU now."); src.CheckDims(); - size_t size = src.numel_ * sizeof(T); + size_t size = product(src.dims_) * sizeof(T); set_dims(src.dims()); const void* src_ptr = static_cast(src.data()); void* dst_ptr = static_cast(mutable_data(dst_place)); @@ -122,7 +118,6 @@ class Tensor { return; } dims_ = dims; - numel_ = product(dims_); } DDim dims() const { return dims_; } @@ -170,16 +165,15 @@ class Tensor { inline void CheckDims() const { PADDLE_ENFORCE(holder_ != nullptr, "Tenosr holds no memory. Call Tensor::mutable_data first."); - PADDLE_ENFORCE(holder_->size() >= numel_ * sizeof(T) + offset_, + PADDLE_ENFORCE(holder_->size() >= product(dims_) * sizeof(T) + offset_, "Tensor's dims_ is out of bound. Call Tensor::mutable_data " "first to re-allocate memory."); } std::shared_ptr holder_; // holds the memory block if allocated. DDim dims_; - size_t numel_; // cache of `product(dims_)` size_t offset_; // marks the begin of tensor data area. 
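   // With numel_ gone, the element count is always recomputed as
   // product(dims_), so it can never fall out of sync when dims_
   // changes; the cost is one extra multiply chain per query.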
-}; // namespace framework +}; } // namespace framework } // namespace paddle From 78fa5e307da3cb32706f396346d3db7a875b4178 Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Mon, 17 Jul 2017 20:00:58 +0800 Subject: [PATCH 187/205] Add DDim::size() --- paddle/framework/ddim.cc | 2 ++ paddle/framework/ddim.h | 2 ++ paddle/framework/ddim_test.cc | 1 + 3 files changed, 5 insertions(+) diff --git a/paddle/framework/ddim.cc b/paddle/framework/ddim.cc index 73f5499ad1..b6ad8b60aa 100644 --- a/paddle/framework/ddim.cc +++ b/paddle/framework/ddim.cc @@ -117,6 +117,8 @@ int DDim::operator[](int idx) const { return boost::apply_visitor(DynamicConstIndexer(idx), var); } +ssize_t DDim::size() const { return arity(*this); } + bool DDim::operator==(DDim d) const { if (var.which() != d.getVar().which()) { return false; diff --git a/paddle/framework/ddim.h b/paddle/framework/ddim.h index a0c2a8a74a..7bc21a1e34 100644 --- a/paddle/framework/ddim.h +++ b/paddle/framework/ddim.h @@ -50,6 +50,8 @@ struct DDim { DDimVar getVar() { return var; } + ssize_t size() const; + bool operator==(DDim d) const; bool operator!=(DDim d) const; diff --git a/paddle/framework/ddim_test.cc b/paddle/framework/ddim_test.cc index 6a099f2aeb..9d18a2972c 100644 --- a/paddle/framework/ddim_test.cc +++ b/paddle/framework/ddim_test.cc @@ -49,6 +49,7 @@ TEST(DDim, Equality) { // arity of a DDim EXPECT_EQ(paddle::framework::arity(ddim), 3); + EXPECT_EQ(ddim.size(), 3); // product of a DDim EXPECT_EQ(paddle::framework::product(vddim), 45); From 122e83e36cee629cf3e8c5b0e6222b2160437769 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Mon, 17 Jul 2017 21:09:57 +0800 Subject: [PATCH 188/205] Fix unittest --- paddle/operators/rowwise_add_op.cu | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/paddle/operators/rowwise_add_op.cu b/paddle/operators/rowwise_add_op.cu index 95e29d1fa3..2c4bfbf93a 100644 --- a/paddle/operators/rowwise_add_op.cu +++ b/paddle/operators/rowwise_add_op.cu @@ -2,4 +2,5 @@ #include REGISTER_OP_GPU_KERNEL( - mul, paddle::operators::RowWiseAddKernel); + rowwise_add, + paddle::operators::RowWiseAddKernel); From bde90be71bc2758b464960c8e2631ee177c1d9a7 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Mon, 17 Jul 2017 18:10:18 +0800 Subject: [PATCH 189/205] Read/Write a Tensor Python Basically following http://pybind11.readthedocs.io/en/stable/advanced/pycpp/numpy.html * Use buffer protocol to return a view of Tensor. It can be cast to numpy array in Python. * Set a numpy array to a tensor. --- paddle/framework/tensor.h | 9 +- paddle/pybind/pybind.cc | 142 +++++++++++++++++- .../paddle/v2/framework/tests/test_tensor.py | 45 ++++++ 3 files changed, 194 insertions(+), 2 deletions(-) create mode 100644 python/paddle/v2/framework/tests/test_tensor.py diff --git a/paddle/framework/tensor.h b/paddle/framework/tensor.h index 29bad7a00a..891cf73641 100644 --- a/paddle/framework/tensor.h +++ b/paddle/framework/tensor.h @@ -17,6 +17,7 @@ limitations under the License. */ #include #include #include +#include #include "paddle/framework/ddim.h" #include "paddle/framework/enforce.h" #include "paddle/memory/memory.h" @@ -127,6 +128,10 @@ class Tensor { DDim dims() const { return dims_; } + platform::Place place() const { return holder_->place(); } + + std::type_index type() const { return holder_->type(); } + private: // Placeholder hides type T, so it doesn't appear as a template // parameter of Variable. 
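The place() and type() accessors added above are what allow the Python
binding later in this patch to pick a buffer format at runtime. A
minimal sketch of that dispatch, purely illustrative and not part of
the diff:

    // Decide how to describe the erased element type to Python.
    if (tensor.type() == std::type_index(typeid(float))) {
      // build a buffer view over 32-bit floats
    } else if (tensor.type() == std::type_index(typeid(int))) {
      // build a buffer view over 32-bit ints
    }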
@@ -135,6 +140,7 @@ class Tensor { virtual void* ptr() const = 0; virtual platform::Place place() const = 0; virtual size_t size() const = 0; + virtual std::type_index type() const = 0; }; template @@ -159,7 +165,8 @@ class Tensor { virtual void* ptr() const { return static_cast(ptr_.get()); } virtual size_t size() const { return size_; } - virtual platform::Place place() const { return place_; } + virtual paddle::platform::Place place() const { return place_; } + virtual std::type_index type() const { return std::type_index(typeid(T)); } std::unique_ptr> ptr_; platform::Place place_; // record the place of ptr_. diff --git a/paddle/pybind/pybind.cc b/paddle/pybind/pybind.cc index b5ead21fd0..8222323e36 100644 --- a/paddle/pybind/pybind.cc +++ b/paddle/pybind/pybind.cc @@ -15,6 +15,7 @@ limitations under the License. */ #include #include #include +#include #include #include #include @@ -25,9 +26,143 @@ namespace pd = paddle::framework; USE_OP(add_two); +struct PlaceDebugString : public boost::static_visitor { + std::string operator()(const paddle::platform::GPUPlace& place) const { + return "GPU(" + std::to_string(place.device) + ")"; + } + + std::string operator()(const paddle::platform::CPUPlace& place) const { + return "CPU"; + } +}; + +template +struct TensorToPyBuffer { + pd::Tensor& self_; + explicit TensorToPyBuffer(pd::Tensor& self) : self_(self) {} + + bool CanCast() const { return std::type_index(typeid(T)) == self_.type(); } + + py::buffer_info Cast() const { + auto dim_vec = pd::vectorize(self_.dims()); + std::vector dims_outside; + std::vector strides; + dims_outside.resize(dim_vec.size()); + strides.resize(dim_vec.size()); + + size_t prod = 1; + for (size_t i = dim_vec.size(); i != 0; --i) { + dims_outside[i - 1] = (size_t)dim_vec[i - 1]; + strides[i - 1] = sizeof(float) * prod; + prod *= dims_outside[i - 1]; + } + + return py::buffer_info(self_.mutable_data(self_.place()), + sizeof(T), + py::format_descriptor::format(), + (size_t)pd::arity(self_.dims()), + dims_outside, + strides); + } +}; + +template +struct CastToPyBufferImpl; + +template +struct CastToPyBufferImpl { + py::buffer_info operator()(pd::Tensor& tensor) { + PADDLE_THROW("This type of tensor cannot be expose to Python"); + return py::buffer_info(); + } +}; + +template +struct CastToPyBufferImpl { + using CUR_TYPE = typename std::tuple_element>::type; + py::buffer_info operator()(pd::Tensor& tensor) { + TensorToPyBuffer cast_object(tensor); + if (cast_object.CanCast()) { + return cast_object.Cast(); + } else { + constexpr bool less = I + 1 < std::tuple_size>::value; + return CastToPyBufferImpl()(tensor); + } + } +}; + +template +std::ostream& operator<<(std::ostream& os, const std::vector& vec) { + for (size_t i = 0; i < vec.size(); ++i) { + os << vec[i]; + if (i + 1 != vec.size()) { + os << ", "; + } + } + return os; +} + +py::buffer_info CastToPyBuffer(pd::Tensor& tensor) { + auto buffer_info = CastToPyBufferImpl()(tensor); + return buffer_info; +} + +template +void PyTensorSet( + pd::Tensor& self, + py::array_t array) { + std::vector dims; + dims.reserve(array.ndim()); + for (size_t i = 0; i < array.ndim(); ++i) { + dims.push_back((int)array.shape()[i]); + } + + self.set_dims(pd::make_ddim(dims)); + auto* dst = self.mutable_data(paddle::platform::CPUPlace()); + std::memcpy(dst, array.data(), sizeof(T) * array.size()); +} + PYBIND11_PLUGIN(core) { py::module m("core", "C++ core of Paddle Paddle"); + py::class_( + m, "Place", R"DOC(Device Place Class.)DOC") + .def("__str__", + [](const paddle::platform::Place& 
self) { + return boost::apply_visitor(PlaceDebugString(), self); + }) + .def("is_gpu", + [](const paddle::platform::Place& self) { + return paddle::platform::is_gpu_place(self); + }) + .def("is_cpu", [](const paddle::platform::Place& self) { + return paddle::platform::is_cpu_place(self); + }); + + py::class_(m, "Tensor", py::buffer_protocol()) + .def("get_place", &pd::Tensor::place) + .def_buffer([](pd::Tensor& self) -> py::buffer_info { + PADDLE_ENFORCE(paddle::platform::is_cpu_place(self.place()), + "Only CPU tensor can cast to numpy array"); + return CastToPyBuffer(self); + }) + .def("get_dims", + [](const pd::Tensor& self) { return pd::vectorize(self.dims()); }) + .def("set_dims", + [](pd::Tensor& self, const std::vector& dim) { + self.set_dims(pd::make_ddim(dim)); + }) + .def("alloc_float", + [](pd::Tensor& self) { + self.mutable_data(paddle::platform::CPUPlace()); + }) + .def("alloc_int", + [](pd::Tensor& self) { + self.mutable_data(paddle::platform::CPUPlace()); + }) + .def("set", PyTensorSet) + .def("set", PyTensorSet); + py::class_(m, "Variable", R"DOC(Variable Class. All parameter, weight, gradient are variables in Paddle. @@ -38,7 +173,12 @@ All parameter, weight, gradient are variables in Paddle. *var.GetMutable() = val; }) .def("get_int", - [](const pd::Variable& var) -> int { return var.Get(); }); + [](const pd::Variable& var) -> int { return var.Get(); }) + .def("get_tensor", + [](pd::Variable& self) -> pd::Tensor* { + return self.GetMutable(); + }, + py::return_value_policy::reference); py::class_>(m, "Scope") .def(py::init&>()) diff --git a/python/paddle/v2/framework/tests/test_tensor.py b/python/paddle/v2/framework/tests/test_tensor.py new file mode 100644 index 0000000000..b72aff3b9c --- /dev/null +++ b/python/paddle/v2/framework/tests/test_tensor.py @@ -0,0 +1,45 @@ +import paddle.v2.framework.core as core +import unittest +import numpy + + +class TestScope(unittest.TestCase): + def test_int_tensor(self): + scope = core.Scope(None) + var = scope.create_var("test_tensor") + tensor = var.get_tensor() + + tensor.set_dims([1000, 784]) + tensor.alloc_int() + + tensor_array = numpy.array(tensor) + self.assertEqual((1000, 784), tensor_array.shape) + tensor_array[3, 9] = 1 + tensor_array[19, 11] = 2 + tensor.set(tensor_array) + + tensor_array_2 = numpy.array(tensor) + self.assertEqual(1.0, tensor_array_2[3, 9]) + self.assertEqual(2.0, tensor_array_2[19, 11]) + + def test_float_tensor(self): + scope = core.Scope(None) + var = scope.create_var("test_tensor") + tensor = var.get_tensor() + + tensor.set_dims([1000, 784]) + tensor.alloc_float() + + tensor_array = numpy.array(tensor) + self.assertEqual((1000, 784), tensor_array.shape) + tensor_array[3, 9] = 1.0 + tensor_array[19, 11] = 2.0 + tensor.set(tensor_array) + + tensor_array_2 = numpy.array(tensor) + self.assertAlmostEqual(1.0, tensor_array_2[3, 9]) + self.assertAlmostEqual(2.0, tensor_array_2[19, 11]) + + +if __name__ == '__main__': + unittest.main() From 2b1cac4113690f4090cdde2a57afb905b2804843 Mon Sep 17 00:00:00 2001 From: Helin Wang Date: Fri, 14 Jul 2017 21:49:30 +0000 Subject: [PATCH 190/205] Handle all unchecked errors Unchecked errors could be handled by: cd go; gometalinter --vendor --disable-all --enable errcheck $(glide nv) --- go/master/client.go | 5 +++- go/master/client_internal_test.go | 22 ++++++++++++++--- go/master/client_test.go | 24 +++++++++++++++--- go/pserver/client/client.go | 2 +- go/pserver/client/client_test.go | 28 +++++++++++++++++---- go/pserver/service.go | 41 +++++++++++++++++++++++-------- 6 
files changed, 97 insertions(+), 25 deletions(-) diff --git a/go/master/client.go b/go/master/client.go index de883bf4b9..90b9947097 100644 --- a/go/master/client.go +++ b/go/master/client.go @@ -69,7 +69,10 @@ func (c *Client) getRecords() { // We treat a task as finished whenever the last data // instance of the task is read. This is not exactly // correct, but a reasonable approximation. - c.taskFinished(t.Meta.ID) + err = c.taskFinished(t.Meta.ID) + if err != nil { + log.Errorln(err) + } } } diff --git a/go/master/client_internal_test.go b/go/master/client_internal_test.go index 49263474c8..70dc09bf94 100644 --- a/go/master/client_internal_test.go +++ b/go/master/client_internal_test.go @@ -66,11 +66,21 @@ func TestGetFinishTask(t *testing.T) { for i := 0; i < totalTask*chunkPerTask; i++ { w := recordio.NewWriter(f, -1, -1) - w.Write(nil) + _, err = w.Write(nil) + if err != nil { + panic(err) + } + // call Close to force RecordIO writing a chunk. - w.Close() + err = w.Close() + if err != nil { + panic(err) + } + } + err = f.Close() + if err != nil { + panic(err) } - f.Close() // Manually intialize client to avoid calling c.getRecords() c := &Client{} @@ -79,7 +89,11 @@ func TestGetFinishTask(t *testing.T) { ch := make(chan string, 1) ch <- addr go c.monitorMaster(ch) - c.SetDataset([]string{path}) + err = c.SetDataset([]string{path}) + if err != nil { + panic(err) + } + checkOnePass := func(i int) { var tasks []Task for idx := 0; idx < totalTask; idx++ { diff --git a/go/master/client_test.go b/go/master/client_test.go index 6666d3860c..bc92dc5ac9 100644 --- a/go/master/client_test.go +++ b/go/master/client_test.go @@ -57,14 +57,30 @@ func TestNextRecord(t *testing.T) { w := recordio.NewWriter(f, -1, -1) for i := 0; i < total; i++ { - w.Write([]byte{byte(i)}) + _, err = w.Write([]byte{byte(i)}) + if err != nil { + panic(err) + } + } + + err = w.Close() + if err != nil { + panic(err) + } + + err = f.Close() + if err != nil { + panic(err) } - w.Close() - f.Close() + curAddr := make(chan string, 1) curAddr <- fmt.Sprintf(":%d", p) c := master.NewClient(curAddr, 10) - c.SetDataset([]string{path}) + err = c.SetDataset([]string{path}) + if err != nil { + panic(err) + } + for pass := 0; pass < 50; pass++ { received := make(map[byte]bool) for i := 0; i < total; i++ { diff --git a/go/pserver/client/client.go b/go/pserver/client/client.go index aa8bfe30c2..b4a45e1c21 100644 --- a/go/pserver/client/client.go +++ b/go/pserver/client/client.go @@ -233,7 +233,7 @@ func (c *Client) Save(path string) error { func strHash(s string) uint32 { h := fnv.New32a() - h.Write([]byte(s)) + _, _ = h.Write([]byte(s)) return h.Sum32() } diff --git a/go/pserver/client/client_test.go b/go/pserver/client/client_test.go index aab91556b4..5c89882a29 100644 --- a/go/pserver/client/client_test.go +++ b/go/pserver/client/client_test.go @@ -79,15 +79,33 @@ func initEtcdClient() { log.Errorf("err %v", err) } ctx, cancel := context.WithTimeout(context.Background(), timeout) - client.Delete(ctx, pserver.PsDesired) - client.Delete(ctx, pserver.PsPath) - client.Put(ctx, pserver.PsDesired, strconv.Itoa(numPserver)) + _, err = client.Delete(ctx, pserver.PsDesired) + if err != nil { + panic(err) + } + + _, err = client.Delete(ctx, pserver.PsPath) + if err != nil { + panic(err) + } + + _, err = client.Put(ctx, pserver.PsDesired, strconv.Itoa(numPserver)) + if err != nil { + panic(err) + } + ports := initClient() for i := 0; i < numPserver; i++ { - client.Put(ctx, pserver.PsPath+strconv.Itoa(i), ":"+strconv.Itoa(ports[i])) + _, err = 
client.Put(ctx, pserver.PsPath+strconv.Itoa(i), ":"+strconv.Itoa(ports[i])) + if err != nil { + panic(err) + } } cancel() - client.Close() + err = client.Close() + if err != nil { + panic(err) + } } type selector bool diff --git a/go/pserver/service.go b/go/pserver/service.go index fec2ec61dc..5cb0293b97 100644 --- a/go/pserver/service.go +++ b/go/pserver/service.go @@ -219,7 +219,7 @@ func (s *Service) GetParam(name string, parameter *Parameter) error { } // pserver save checkpoint -func (s *Service) doCheckpoint() error { +func (s *Service) doCheckpoint() (err error) { <-s.initialized s.mu.Lock() defer s.mu.Unlock() @@ -237,9 +237,9 @@ func (s *Service) doCheckpoint() error { } var buf bytes.Buffer encoder := gob.NewEncoder(&buf) - err := encoder.Encode(cp) + err = encoder.Encode(cp) if err != nil { - return err + return } cpMeta := checkpointMeta{} @@ -248,10 +248,14 @@ func (s *Service) doCheckpoint() error { h := md5.New() cpMeta.MD5 = hex.EncodeToString(h.Sum(buf.Bytes())) - cpMetajson, _ := json.Marshal(cpMeta) + cpMetajson, err := json.Marshal(cpMeta) + if err != nil { + return + } + err = s.client.PutKey(filepath.Join(PsCheckpoint, strconv.Itoa(s.idx)), cpMetajson, 3*time.Second) if err != nil { - return err + return } if _, err = os.Stat(cpMeta.UUID); os.IsNotExist(err) { log.Info("checkpoint does not exists.") @@ -264,15 +268,32 @@ func (s *Service) doCheckpoint() error { } } f, err := os.Create(cpMeta.UUID) - defer f.Close() if err != nil { - return err + return } + + defer func() { + closeErr := f.Close() + if closeErr != nil { + if err != nil { + log.Errorln(closeErr) + } else { + // Set closeErr as return value. + err = closeErr + } + } + }() + writer := bufio.NewWriter(f) _, err = writer.Write(buf.Bytes()) - writer.Flush() if err != nil { - return err + return } - return nil + + err = writer.Flush() + if err != nil { + return + } + + return } From 065e5666ed6d87e7736c26d795daf0bc2b6efb2a Mon Sep 17 00:00:00 2001 From: Helin Wang Date: Fri, 14 Jul 2017 22:32:55 +0000 Subject: [PATCH 191/205] add gometalinter/errcheck into pre-commit --- .pre-commit-config.yaml | 10 ++++++---- .travis.yml | 2 ++ 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 61b989dc69..44174d3558 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -21,10 +21,12 @@ sha: 28c0ea8a67a3e2dbbf4822ef44e85b63a0080a29 hooks: - id: clang-formater -- repo: https://github.com/dnephin/pre-commit-golang - sha: e4693a4c282b4fc878eda172a929f7a6508e7d16 +- repo: https://github.com/PaddlePaddle/pre-commit-golang + sha: 6bce8cc8a6ce601bcf6feccf6bfbd43fe04ccbeb hooks: - id: go-fmt - files: (.*\.go) + types: [go] - id: go-lint - files: (.*\.go) + types: [go] + - id: gometalinter + types: [go] diff --git a/.travis.yml b/.travis.yml index 2cf7666fb5..376c693602 100644 --- a/.travis.yml +++ b/.travis.yml @@ -41,6 +41,8 @@ before_install: - pip install rarfile - curl https://glide.sh/get | bash - eval "$(GIMME_GO_VERSION=1.8.3 gimme)" + - go get -u github.com/alecthomas/gometalinter + - gometalinter --install - | function timeout() { perl -e 'alarm shift; exec @ARGV' "$@"; } script: From 5d7bccb2a38cb09a2cb90781084cfbd58839cf63 Mon Sep 17 00:00:00 2001 From: Helin Wang Date: Fri, 14 Jul 2017 23:09:53 +0000 Subject: [PATCH 192/205] fix golint errors --- go/pserver/client/c/cclient.go | 12 ++++++------ go/pserver/client/etcd_client.go | 15 +++++++-------- 2 files changed, 13 insertions(+), 14 deletions(-) diff --git 
a/go/pserver/client/c/cclient.go b/go/pserver/client/c/cclient.go index 7ddaceb7ed..d307c92983 100644 --- a/go/pserver/client/c/cclient.go +++ b/go/pserver/client/c/cclient.go @@ -101,11 +101,11 @@ func paddle_new_pserver_client(addrs *C.char, selected int) C.paddle_pserver_cli } //export paddle_new_etcd_pserver_client -func paddle_new_etcd_pserver_client(etcd_endpoints *C.char, selected int) C.paddle_pserver_client { +func paddle_new_etcd_pserver_client(etcdEndpoints *C.char, selected int) C.paddle_pserver_client { // TODO(Longfei: use etcd lock to decide which trainer to initialize the parameters) - addr := C.GoString(etcd_endpoints) - etcd_client := client.NewEtcd(addr) - c := client.NewClient(etcd_client, etcd_client.Desired(), selector(selected != 0)) + addr := C.GoString(etcdEndpoints) + etcdClient := client.NewEtcd(addr) + c := client.NewClient(etcdClient, etcdClient.Desired(), selector(selected != 0)) return add(c) } @@ -124,13 +124,13 @@ func paddle_begin_init_params(client C.paddle_pserver_client) C.int { } //export paddle_init_param -func paddle_init_param(client C.paddle_pserver_client, param C.paddle_parameter, param_config unsafe.Pointer, config_len C.int) C.int { +func paddle_init_param(client C.paddle_pserver_client, param C.paddle_parameter, paramConfig unsafe.Pointer, configLen C.int) C.int { et := pserver.ElementType(param.element_type) name := C.GoString(param.name) content := cArrayToSlice(unsafe.Pointer(param.content), int(param.content_len)) pc := pserver.ParameterWithConfig{ Param: pserver.Parameter{Name: name, ElementType: et, Content: content}, - Config: cArrayToSlice(param_config, int(config_len)), + Config: cArrayToSlice(paramConfig, int(configLen)), } c := get(client) err := c.InitParam(pc) diff --git a/go/pserver/client/etcd_client.go b/go/pserver/client/etcd_client.go index 8eb2a4f451..953065b427 100644 --- a/go/pserver/client/etcd_client.go +++ b/go/pserver/client/etcd_client.go @@ -12,8 +12,7 @@ import ( ) const ( - // DefaultEtcdTimeout is the default etcd timeout - DefaultEtcdTimeout time.Duration = 5 * time.Second + defaultEtcdTimeout time.Duration = 5 * time.Second ) // EtcdClient is used by pserver client that is a part of trainer process. 
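// golint requires a doc comment on every exported identifier, which is
// why the timeout constant above becomes the unexported
// defaultEtcdTimeout: it is an implementation detail, not API.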
@@ -48,7 +47,7 @@ func (p *EtcdClient) Desired() int { psDesired, err = strconv.Atoi(string(resp.Kvs[0].Value)) if err != nil { - log.Errorf("psDesired %s invalid %v", psDesired, err) + log.Errorf("psDesired %d invalid %v", psDesired, err) time.Sleep(p.timeout) continue } @@ -67,12 +66,12 @@ func (p *EtcdClient) List() []Server { for { for i := 0; i < psDesired; i++ { ctx, cancel := context.WithTimeout(context.Background(), p.timeout) + cancel() psKey := pserver.PsPath + strconv.Itoa(i) log.Debugf("checking %s", psKey) resp, err := p.client.Get(ctx, psKey) - cancel() if err != nil { - log.Infof("Get psKey=%s error, %v", psKey, err) + log.Infof("Get psKey= %s error, %v", psKey, err) time.Sleep(p.timeout) continue } @@ -107,11 +106,11 @@ func NewEtcd(endpoints string) *EtcdClient { for { cli, err = clientv3.New(clientv3.Config{ Endpoints: ep, - DialTimeout: DefaultEtcdTimeout, + DialTimeout: defaultEtcdTimeout, }) if err != nil { log.Errorf("Init etcd connection failed: %v", err) - time.Sleep(DefaultEtcdTimeout) + time.Sleep(defaultEtcdTimeout) continue } break @@ -119,7 +118,7 @@ func NewEtcd(endpoints string) *EtcdClient { log.Infof("Connected to etcd: %s\n", endpoints) client := &EtcdClient{ client: cli, - timeout: DefaultEtcdTimeout, + timeout: defaultEtcdTimeout, endpoints: ep, } return client From 37624b30ff3b769fdd768c77d2cdd8b55f09481c Mon Sep 17 00:00:00 2001 From: Helin Wang Date: Sat, 15 Jul 2017 00:01:12 +0000 Subject: [PATCH 193/205] Fix Go pre-commit --- .pre-commit-config.yaml | 4 +--- paddle/scripts/travis/check_style.sh | 5 +++++ 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 44174d3558..b7179c26fe 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -22,11 +22,9 @@ hooks: - id: clang-formater - repo: https://github.com/PaddlePaddle/pre-commit-golang - sha: 6bce8cc8a6ce601bcf6feccf6bfbd43fe04ccbeb + sha: fb3ba0e9e38a516543925e96cef76740b61321ab hooks: - id: go-fmt types: [go] - - id: go-lint - types: [go] - id: gometalinter types: [go] diff --git a/paddle/scripts/travis/check_style.sh b/paddle/scripts/travis/check_style.sh index 4754bdd4c8..8049aeb7b0 100755 --- a/paddle/scripts/travis/check_style.sh +++ b/paddle/scripts/travis/check_style.sh @@ -13,6 +13,11 @@ export PATH=/usr/bin:$PATH pre-commit install clang-format --version +# set up go environment for running gometalinter +mkdir -p $GOPATH/src/github.com/PaddlePaddle/ +ln -sf $TRAVIS_BUILD_DIR $GOPATH/src/github.com/PaddlePaddle/Paddle +cd $GOPATH/src/github.com/PaddlePaddle/Paddle/go; glide install; cd - + if ! pre-commit run -a ; then git diff --exit-code fi From 25e57949cce1dd42ed8532a86712374af1bf8ea8 Mon Sep 17 00:00:00 2001 From: Helin Wang Date: Mon, 17 Jul 2017 21:04:12 +0000 Subject: [PATCH 194/205] add more linters, fix errors found by them. 
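
Most of the changes are mechanical: the nullPtr sentinels are replaced
by plain nil comparisons, misspellings in comments are corrected
(Becuase, casues, TOOD), and redundant "if err != nil { return err };
return nil" tails are collapsed into a single "return err".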
--- .pre-commit-config.yaml | 2 +- go/master/c/client.go | 5 ++--- go/master/etcd_client.go | 4 ++-- go/master/inmem_store.go | 2 +- go/master/service.go | 3 +-- go/pserver/client/c/cclient.go | 11 +++++------ go/pserver/etcd_client.go | 9 +++------ go/pserver/optimizer.go | 8 +++----- go/pserver/service.go | 2 +- 9 files changed, 19 insertions(+), 27 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index b7179c26fe..efb4dcb2df 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -22,7 +22,7 @@ hooks: - id: clang-formater - repo: https://github.com/PaddlePaddle/pre-commit-golang - sha: fb3ba0e9e38a516543925e96cef76740b61321ab + sha: 16398aeccf263adaf53b2495eed0406347d76281 hooks: - id: go-fmt types: [go] diff --git a/go/master/c/client.go b/go/master/c/client.go index 31f4311974..2cbe164c7b 100644 --- a/go/master/c/client.go +++ b/go/master/c/client.go @@ -23,7 +23,6 @@ import ( log "github.com/sirupsen/logrus" ) -var nullPtr = unsafe.Pointer(uintptr(0)) var mu sync.Mutex var handleMap = make(map[C.paddle_master_client]*master.Client) var curHandle C.paddle_master_client @@ -114,13 +113,13 @@ func paddle_next_record(client C.paddle_master_client, record **C.uchar) C.int { if err != nil { // Error // TODO: return the type of error? - *record = (*C.uchar)(nullPtr) + *record = (*C.uchar)(nil) return -1 } if len(r) == 0 { // Empty record - *record = (*C.uchar)(nullPtr) + *record = (*C.uchar)(nil) return 0 } diff --git a/go/master/etcd_client.go b/go/master/etcd_client.go index 04c1394e96..69dc6a8268 100644 --- a/go/master/etcd_client.go +++ b/go/master/etcd_client.go @@ -30,7 +30,7 @@ type EtcdClient struct { // NewEtcdClient creates a new EtcdClient. func NewEtcdClient(endpoints []string, addr string, lockPath, addrPath, statePath string, ttlSec int) (*EtcdClient, error) { log.Debugf("Connecting to etcd at %v", endpoints) - // TODO(helin): gracefully shutdown etcd store. Becuase etcd + // TODO(helin): gracefully shutdown etcd store. Because etcd // store holds a etcd lock, even though the lock will expire // when the lease timeout, we need to implement graceful // shutdown to release the lock. @@ -60,7 +60,7 @@ func NewEtcdClient(endpoints []string, addr string, lockPath, addrPath, statePat } log.Debugf("Successfully acquired lock at %s.", lockPath) - put := clientv3.OpPut(addrPath, string(addr)) + put := clientv3.OpPut(addrPath, addr) resp, err := cli.Txn(context.Background()).If(lock.IsOwner()).Then(put).Commit() if err != nil { return nil, err diff --git a/go/master/inmem_store.go b/go/master/inmem_store.go index bcd549b20e..57e75dc4e0 100644 --- a/go/master/inmem_store.go +++ b/go/master/inmem_store.go @@ -4,7 +4,7 @@ import "sync" // InMemStore is an in memory implementation of Store interface. // -// It does not tolerate the fault that casues the program to crash. +// It does not tolerate the fault that causes the program to crash. type InMemStore struct { mu sync.Mutex buf []byte diff --git a/go/master/service.go b/go/master/service.go index 9cef2270ce..262735f421 100644 --- a/go/master/service.go +++ b/go/master/service.go @@ -160,7 +160,7 @@ func (s *Service) recover() (bool, error) { // snapshot *must* be called with s.mu being held. func (s *Service) snapshot() error { - // TOOD(helin): etcd request has a size limit, so the snapshot + // TODO(helin): etcd request has a size limit, so the snapshot // size is limited by the max request size. 
We should either // divide the snapshot into smaller chunks and save under // different keys, or configure the request size to be big @@ -289,7 +289,6 @@ func (s *Service) processFailedTask(t taskEntry, epoch int) { log.Warningf("Task %v failed %d times, discard.", t.Task, t.NumFailure) s.taskQueues.Todo = append(s.taskQueues.Todo, t) - return } func (s *Service) checkTimeoutFunc(taskID int, epoch int) func() { diff --git a/go/pserver/client/c/cclient.go b/go/pserver/client/c/cclient.go index d307c92983..718b4304c8 100644 --- a/go/pserver/client/c/cclient.go +++ b/go/pserver/client/c/cclient.go @@ -34,7 +34,6 @@ import ( log "github.com/sirupsen/logrus" ) -var nullPtr = unsafe.Pointer(uintptr(0)) var mu sync.Mutex var handleMap = make(map[C.paddle_pserver_client]*client.Client) var curHandle C.paddle_pserver_client @@ -63,7 +62,7 @@ func remove(client C.paddle_pserver_client) *client.Client { } func cArrayToSlice(p unsafe.Pointer, len int) []byte { - if p == nullPtr { + if p == nil { return nil } @@ -137,7 +136,7 @@ func paddle_init_param(client C.paddle_pserver_client, param C.paddle_parameter, if err != nil { if err.Error() == pserver.AlreadyInitialized { - log.Warningf("parameter %s already initialized, treat paddle_init_param as sucessful.", name) + log.Warningf("parameter %s already initialized, treat paddle_init_param as successful.", name) return C.PSERVER_OK } log.Errorln(err) @@ -153,7 +152,7 @@ func paddle_finish_init_params(client C.paddle_pserver_client) C.int { err := c.FinishInitParams() if err != nil { if err.Error() == pserver.AlreadyInitialized { - log.Warningln("parameters already initialized, treat paddle_finish_init_params as sucessful.") + log.Warningln("parameters already initialized, treat paddle_finish_init_params as successful.") return C.PSERVER_OK } @@ -223,12 +222,12 @@ func paddle_get_params(client C.paddle_pserver_client, dst **C.paddle_parameter, p := ps[i] param := *(**C.paddle_parameter)(unsafe.Pointer((uintptr(unsafe.Pointer(dst)) + uintptr(i)*unsafe.Sizeof(*dst)))) - if unsafe.Pointer(param) == nullPtr { + if unsafe.Pointer(param) == nil { log.Errorln("must pre-allocate parameter.") return C.PSERVER_ERROR } - if unsafe.Pointer(param.content) != nullPtr { + if unsafe.Pointer(param.content) != nil { if int(param.content_len) != len(p.Content) { log.Errorf("the pre-allocated content len does not match parameter content len. 
Pre-allocated len: %d, returned len: %d", param.content_len, len(p.Content)) return C.PSERVER_ERROR diff --git a/go/pserver/etcd_client.go b/go/pserver/etcd_client.go index 66af4fa0b4..e70e826975 100644 --- a/go/pserver/etcd_client.go +++ b/go/pserver/etcd_client.go @@ -177,10 +177,10 @@ func (e *EtcdClient) registerPserverEtcd(ctx context.Context, port int) (int, er break } } - if registered == true { + if registered { return nil } - return errors.New("not registerd, may due to already have enough pservers") + return errors.New("not registered, may due to already have enough pservers") }, concurrency.WithAbortContext(ctx), concurrency.WithIsolation(concurrency.RepeatableReads)) if err != nil { @@ -211,8 +211,5 @@ func (e *EtcdClient) PutKey(key string, value []byte, timeout time.Duration) err ctx, cancel := context.WithTimeout(context.Background(), timeout) _, err := e.etcdClient.Put(ctx, key, string(value)) cancel() - if err != nil { - return err - } - return nil + return err } diff --git a/go/pserver/optimizer.go b/go/pserver/optimizer.go index d6b7fafd59..151a3f8033 100644 --- a/go/pserver/optimizer.go +++ b/go/pserver/optimizer.go @@ -14,8 +14,6 @@ import ( log "github.com/sirupsen/logrus" ) -var nullPtr = unsafe.Pointer(uintptr(0)) - type optimizer struct { opt *C.struct_paddle_optimizer elementType ElementType @@ -23,7 +21,7 @@ type optimizer struct { } func cArrayToSlice(p unsafe.Pointer, len int) []byte { - if p == nullPtr { + if p == nil { return nil } @@ -92,8 +90,8 @@ func (o *optimizer) UpdateParameter(g Gradient) error { } func (o *optimizer) Cleanup() { - if unsafe.Pointer(o.opt) != nullPtr { + if unsafe.Pointer(o.opt) != nil { C.paddle_release_optimizer(o.opt) - o.opt = (*C.struct_paddle_optimizer)(nullPtr) + o.opt = (*C.struct_paddle_optimizer)(nil) } } diff --git a/go/pserver/service.go b/go/pserver/service.go index 5cb0293b97..c723959d6b 100644 --- a/go/pserver/service.go +++ b/go/pserver/service.go @@ -211,7 +211,7 @@ func (s *Service) GetParam(name string, parameter *Parameter) error { // learning optimization methods are stochastic in // nature. This race condition is allowed deliberately // to save the program from making a copy of the - // paramter content. + // parameter content. 
parameter.Name = name parameter.ElementType = opt.elementType parameter.Content = opt.GetWeights() From 302c4f11d164311d6352d39e162d4b79bac6459e Mon Sep 17 00:00:00 2001 From: wanghaoshuang Date: Tue, 18 Jul 2017 10:48:23 +0800 Subject: [PATCH 195/205] rename voc_seg to voc2012 --- python/paddle/v2/dataset/__init__.py | 2 +- .../paddle/v2/dataset/tests/{vocseg_test.py => voc2012_test.py} | 2 +- python/paddle/v2/dataset/{voc_seg.py => voc2012.py} | 0 3 files changed, 2 insertions(+), 2 deletions(-) rename python/paddle/v2/dataset/tests/{vocseg_test.py => voc2012_test.py} (97%) rename python/paddle/v2/dataset/{voc_seg.py => voc2012.py} (100%) diff --git a/python/paddle/v2/dataset/__init__.py b/python/paddle/v2/dataset/__init__.py index cdd85cce37..f99116b012 100644 --- a/python/paddle/v2/dataset/__init__.py +++ b/python/paddle/v2/dataset/__init__.py @@ -30,5 +30,5 @@ import voc_seg __all__ = [ 'mnist', 'imikolov', 'imdb', 'cifar', 'movielens', 'conll05', 'sentiment' - 'uci_housing', 'wmt14', 'mq2007', 'flowers', 'voc_seg' + 'uci_housing', 'wmt14', 'mq2007', 'flowers', 'voc2012' ] diff --git a/python/paddle/v2/dataset/tests/vocseg_test.py b/python/paddle/v2/dataset/tests/voc2012_test.py similarity index 97% rename from python/paddle/v2/dataset/tests/vocseg_test.py rename to python/paddle/v2/dataset/tests/voc2012_test.py index 8217ff45b1..31e72ebf5e 100644 --- a/python/paddle/v2/dataset/tests/vocseg_test.py +++ b/python/paddle/v2/dataset/tests/voc2012_test.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -import paddle.v2.dataset.voc_seg +import paddle.v2.dataset.voc2012 import unittest diff --git a/python/paddle/v2/dataset/voc_seg.py b/python/paddle/v2/dataset/voc2012.py similarity index 100% rename from python/paddle/v2/dataset/voc_seg.py rename to python/paddle/v2/dataset/voc2012.py From ceb9a73aaad48f063ae4dcccf4aafb0ce0a3f709 Mon Sep 17 00:00:00 2001 From: wanghaoshuang Date: Tue, 18 Jul 2017 11:02:53 +0800 Subject: [PATCH 196/205] fix import err --- python/paddle/v2/dataset/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/v2/dataset/__init__.py b/python/paddle/v2/dataset/__init__.py index f99116b012..90830515c1 100644 --- a/python/paddle/v2/dataset/__init__.py +++ b/python/paddle/v2/dataset/__init__.py @@ -26,7 +26,7 @@ import sentiment import wmt14 import mq2007 import flowers -import voc_seg +import voc2012 __all__ = [ 'mnist', 'imikolov', 'imdb', 'cifar', 'movielens', 'conll05', 'sentiment' From a89c7ffa94bc26a879b8978273219980648aaec4 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Tue, 18 Jul 2017 11:57:31 +0800 Subject: [PATCH 197/205] Make Tensor <--> Numpy interactive in tensor.h * Follow review comments to seperate Tensor Numpy interactive methods in tensor.h. * Simplify logic for `CastToPyBufferImpl`, make it as one struct and in details namespace. * Remove `Scope` expose in Python, since it currently is useless. * Remove some debug functions. --- paddle/pybind/pybind.cc | 118 +----------------- paddle/pybind/tensor.h | 91 ++++++++++++++ .../paddle/v2/framework/tests/CMakeLists.txt | 3 +- 3 files changed, 97 insertions(+), 115 deletions(-) create mode 100644 paddle/pybind/tensor.h diff --git a/paddle/pybind/pybind.cc b/paddle/pybind/pybind.cc index 8222323e36..e3dc3e718c 100644 --- a/paddle/pybind/pybind.cc +++ b/paddle/pybind/pybind.cc @@ -15,6 +15,7 @@ limitations under the License. 
*/ #include #include #include +#include #include #include #include @@ -26,125 +27,14 @@ namespace pd = paddle::framework; USE_OP(add_two); -struct PlaceDebugString : public boost::static_visitor { - std::string operator()(const paddle::platform::GPUPlace& place) const { - return "GPU(" + std::to_string(place.device) + ")"; - } - - std::string operator()(const paddle::platform::CPUPlace& place) const { - return "CPU"; - } -}; - -template -struct TensorToPyBuffer { - pd::Tensor& self_; - explicit TensorToPyBuffer(pd::Tensor& self) : self_(self) {} - - bool CanCast() const { return std::type_index(typeid(T)) == self_.type(); } - - py::buffer_info Cast() const { - auto dim_vec = pd::vectorize(self_.dims()); - std::vector dims_outside; - std::vector strides; - dims_outside.resize(dim_vec.size()); - strides.resize(dim_vec.size()); - - size_t prod = 1; - for (size_t i = dim_vec.size(); i != 0; --i) { - dims_outside[i - 1] = (size_t)dim_vec[i - 1]; - strides[i - 1] = sizeof(float) * prod; - prod *= dims_outside[i - 1]; - } - - return py::buffer_info(self_.mutable_data(self_.place()), - sizeof(T), - py::format_descriptor::format(), - (size_t)pd::arity(self_.dims()), - dims_outside, - strides); - } -}; - -template -struct CastToPyBufferImpl; - -template -struct CastToPyBufferImpl { - py::buffer_info operator()(pd::Tensor& tensor) { - PADDLE_THROW("This type of tensor cannot be expose to Python"); - return py::buffer_info(); - } -}; - -template -struct CastToPyBufferImpl { - using CUR_TYPE = typename std::tuple_element>::type; - py::buffer_info operator()(pd::Tensor& tensor) { - TensorToPyBuffer cast_object(tensor); - if (cast_object.CanCast()) { - return cast_object.Cast(); - } else { - constexpr bool less = I + 1 < std::tuple_size>::value; - return CastToPyBufferImpl()(tensor); - } - } -}; - -template -std::ostream& operator<<(std::ostream& os, const std::vector& vec) { - for (size_t i = 0; i < vec.size(); ++i) { - os << vec[i]; - if (i + 1 != vec.size()) { - os << ", "; - } - } - return os; -} - -py::buffer_info CastToPyBuffer(pd::Tensor& tensor) { - auto buffer_info = CastToPyBufferImpl()(tensor); - return buffer_info; -} - -template -void PyTensorSet( - pd::Tensor& self, - py::array_t array) { - std::vector dims; - dims.reserve(array.ndim()); - for (size_t i = 0; i < array.ndim(); ++i) { - dims.push_back((int)array.shape()[i]); - } - - self.set_dims(pd::make_ddim(dims)); - auto* dst = self.mutable_data(paddle::platform::CPUPlace()); - std::memcpy(dst, array.data(), sizeof(T) * array.size()); -} - PYBIND11_PLUGIN(core) { py::module m("core", "C++ core of Paddle Paddle"); - py::class_( - m, "Place", R"DOC(Device Place Class.)DOC") - .def("__str__", - [](const paddle::platform::Place& self) { - return boost::apply_visitor(PlaceDebugString(), self); - }) - .def("is_gpu", - [](const paddle::platform::Place& self) { - return paddle::platform::is_gpu_place(self); - }) - .def("is_cpu", [](const paddle::platform::Place& self) { - return paddle::platform::is_cpu_place(self); - }); - py::class_(m, "Tensor", py::buffer_protocol()) - .def("get_place", &pd::Tensor::place) .def_buffer([](pd::Tensor& self) -> py::buffer_info { PADDLE_ENFORCE(paddle::platform::is_cpu_place(self.place()), "Only CPU tensor can cast to numpy array"); - return CastToPyBuffer(self); + return paddle::pybind::CastToPyBuffer(self); }) .def("get_dims", [](const pd::Tensor& self) { return pd::vectorize(self.dims()); }) @@ -160,8 +50,8 @@ PYBIND11_PLUGIN(core) { [](pd::Tensor& self) { self.mutable_data(paddle::platform::CPUPlace()); }) - 
.def("set", PyTensorSet) - .def("set", PyTensorSet); + .def("set", paddle::pybind::PyTensorSetFromArray) + .def("set", paddle::pybind::PyTensorSetFromArray); py::class_(m, "Variable", R"DOC(Variable Class. diff --git a/paddle/pybind/tensor.h b/paddle/pybind/tensor.h new file mode 100644 index 0000000000..ef07144ad4 --- /dev/null +++ b/paddle/pybind/tensor.h @@ -0,0 +1,91 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#pragma once +#include +#include +#include + +namespace py = pybind11; + +namespace paddle { + +namespace pybind { + +namespace details { + +template +struct CastToPyBufferImpl; + +template +struct CastToPyBufferImpl { + py::buffer_info operator()(framework::Tensor &tensor) { + PADDLE_THROW("This type of tensor cannot be expose to Python"); + return py::buffer_info(); + } +}; + +template +struct CastToPyBufferImpl { + using CUR_TYPE = typename std::tuple_element>::type; + py::buffer_info operator()(framework::Tensor &tensor) { + if (std::type_index(typeid(CUR_TYPE)) == tensor.type()) { + auto dim_vec = framework::vectorize(tensor.dims()); + std::vector dims_outside; + std::vector strides; + dims_outside.resize(dim_vec.size()); + strides.resize(dim_vec.size()); + + size_t prod = 1; + for (size_t i = dim_vec.size(); i != 0; --i) { + dims_outside[i - 1] = (size_t)dim_vec[i - 1]; + strides[i - 1] = sizeof(CUR_TYPE) * prod; + prod *= dims_outside[i - 1]; + } + + return py::buffer_info(tensor.mutable_data(tensor.place()), + sizeof(CUR_TYPE), + py::format_descriptor::format(), + (size_t)framework::arity(tensor.dims()), + dims_outside, + strides); + } else { + constexpr bool less = I + 1 < std::tuple_size>::value; + return CastToPyBufferImpl()(tensor); + } + } +}; +} // namespace details +inline py::buffer_info CastToPyBuffer(framework::Tensor &tensor) { + auto buffer_info = details::CastToPyBufferImpl()(tensor); + return buffer_info; +} + +template +void PyTensorSetFromArray( + framework::Tensor &self, + py::array_t array) { + std::vector dims; + dims.reserve(array.ndim()); + for (size_t i = 0; i < array.ndim(); ++i) { + dims.push_back((int)array.shape()[i]); + } + + self.set_dims(framework::make_ddim(dims)); + auto *dst = self.mutable_data(paddle::platform::CPUPlace()); + std::memcpy(dst, array.data(), sizeof(T) * array.size()); +} + +} // namespace pybind +} // namespace paddle diff --git a/python/paddle/v2/framework/tests/CMakeLists.txt b/python/paddle/v2/framework/tests/CMakeLists.txt index 86fc60f26a..4ce2bef6fc 100644 --- a/python/paddle/v2/framework/tests/CMakeLists.txt +++ b/python/paddle/v2/framework/tests/CMakeLists.txt @@ -1,2 +1,3 @@ add_python_test(test_framework test_protobuf.py test_scope.py - test_default_scope_funcs.py test_op_creation_methods.py) + test_default_scope_funcs.py test_op_creation_methods.py + test_tensor.py) From 051676a7e483b59583d92cd49aff6bdace916dc4 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Tue, 18 Jul 2017 12:57:38 +0800 Subject: [PATCH 198/205] support multiple template parameter 
From e00aae5386e4c2e9bef0e942f3be4405874c4f1c Mon Sep 17 00:00:00 2001
From: Yu Yang
Date: Tue, 18 Jul 2017 13:02:22 +0800
Subject: [PATCH 199/205] Fix Merge Bugs

* There was a merge conflict when merging PR #2914.
* Develop and PR #2914 both added a `DDim::size` method, but this did not
  trigger a git merge conflict during the merge.
---
 paddle/framework/ddim.cc | 2 --
 paddle/framework/ddim.h  | 2 --
 2 files changed, 4 deletions(-)

diff --git a/paddle/framework/ddim.cc b/paddle/framework/ddim.cc
index d9d5e9f66e..d2ef85afe5 100644
--- a/paddle/framework/ddim.cc
+++ b/paddle/framework/ddim.cc
@@ -280,8 +280,6 @@ std::ostream& operator<<(std::ostream& os, const DDim& ddim) {
   return os;
 }
 
-ssize_t DDim::size() const { return arity(*this); }
-
 DDim::DDim(std::initializer_list<int> init_list) {
   *this = make_ddim(init_list);
 }
diff --git a/paddle/framework/ddim.h b/paddle/framework/ddim.h
index 0d10745a9e..3976c6c029 100644
--- a/paddle/framework/ddim.h
+++ b/paddle/framework/ddim.h
@@ -52,8 +52,6 @@ struct DDim {
 
   DDimVar getVar() { return var; }
 
-  ssize_t size() const;
-
   bool operator==(DDim d) const;
 
   bool operator!=(DDim d) const;
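
Worth spelling out why this counted as a merge bug: both branches added an
identical `DDim::size()` in different parts of each file, so git merged them
textually without reporting a conflict and the build ended up with a duplicate
definition. After the fix, callers use the free function `arity()` that
`size()` merely wrapped. A hedged usage sketch (assuming the `make_ddim` and
`arity` helpers shown elsewhere in this series):

    // Sketch only: DDim's real definition lives in paddle/framework/ddim.h.
    #include "paddle/framework/ddim.h"

    void example() {
      auto ddim = paddle::framework::make_ddim({2, 3, 4});
      // Instead of the removed ddim.size(), ask for the rank directly:
      int rank = paddle::framework::arity(ddim);  // rank == 3
      (void)rank;
    }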
From 1dc53a289fe724cd3772618de374aacbf72a87f6 Mon Sep 17 00:00:00 2001
From: Yu Yang
Date: Tue, 18 Jul 2017 13:11:06 +0800
Subject: [PATCH 200/205] Override Net::DebugString to invoke inner ops_

Net::DebugString() prints what the net is and invokes each inner operator's
DebugString() with a '\t' prefix.
---
 paddle/framework/net.cc      | 9 +++++++++
 paddle/framework/net.h       | 2 ++
 paddle/framework/operator.h  | 2 +-
 3 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/paddle/framework/net.cc b/paddle/framework/net.cc
index 7311cda9a9..b9cd732d40 100644
--- a/paddle/framework/net.cc
+++ b/paddle/framework/net.cc
@@ -55,5 +55,14 @@ void PlainNet::CompleteAddOp() {
   add_op_done_ = true;
 }
 
+std::string PlainNet::DebugString() const {
+  std::ostringstream os;
+  os << this->type_ << ":" << std::endl;
+  for (auto& op : ops_) {
+    os << "\t" << op->DebugString() << std::endl;
+  }
+  return os.str();
+}
+
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/framework/net.h b/paddle/framework/net.h
index 19a1620e29..33bb30ea07 100644
--- a/paddle/framework/net.h
+++ b/paddle/framework/net.h
@@ -88,6 +88,8 @@ class PlainNet : public Net {
 
   void CompleteAddOp() override;
 
+  std::string DebugString() const override;
+
   std::vector<OperatorPtr> ops_;
 
  private:
diff --git a/paddle/framework/operator.h b/paddle/framework/operator.h
index f7ed6e9f3d..b62cac6d27 100644
--- a/paddle/framework/operator.h
+++ b/paddle/framework/operator.h
@@ -48,7 +48,7 @@ class OperatorBase {
     return boost::get<T>(attrs_.at(name));
   }
 
-  std::string DebugString() const;
+  virtual std::string DebugString() const;
 
   /// Init will be called after CreateOperator, you can put some initialization
   /// logic here.
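
This is the composite pattern: a net is itself an operator, so its
`DebugString()` recursively prints the contained operators, one per
tab-indented line. A self-contained sketch of the same idea (types are
hypothetical stand-ins for OperatorBase/PlainNet):

    #include <iostream>
    #include <memory>
    #include <sstream>
    #include <string>
    #include <vector>

    struct Op {
      virtual ~Op() = default;
      virtual std::string DebugString() const { return "op"; }
    };

    struct Net : Op {
      std::vector<std::unique_ptr<Op>> ops_;
      std::string DebugString() const override {
        std::ostringstream os;
        os << "plain_net:" << std::endl;
        for (auto& op : ops_) os << "\t" << op->DebugString() << std::endl;
        return os.str();
      }
    };

    int main() {
      Net net;
      net.ops_.emplace_back(new Op);
      net.ops_.emplace_back(new Op);
      std::cout << net.DebugString();  // "plain_net:" then two "\top" lines
      return 0;
    }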
From c1219a530c4641ec618e15c8f4e5a66ec0f637e8 Mon Sep 17 00:00:00 2001
From: Yu Yang
Date: Tue, 18 Jul 2017 13:54:24 +0800
Subject: [PATCH 201/205] Change `in_out_idxs_` to shared_ptr

* `in_out_idxs_` is shared between all operator instances of the same
  operator type.
---
 paddle/framework/op_registry.h | 33 +++++++++++++++++++++++++--------
 paddle/framework/operator.cc   | 26 ++++++++------------------
 paddle/framework/operator.h    |  5 +----
 3 files changed, 34 insertions(+), 30 deletions(-)

diff --git a/paddle/framework/op_registry.h b/paddle/framework/op_registry.h
index 48f77a6784..491ee21eec 100644
--- a/paddle/framework/op_registry.h
+++ b/paddle/framework/op_registry.h
@@ -198,6 +198,7 @@ Add a mark to which output is temporary is helpful for future optimization.
 
 class OpRegistry {
   using OpCreator = std::function<OperatorBase*()>;
+  using VarIndexMap = std::unordered_map<std::string, int>;
 
  public:
   template <typename OpType, typename ProtoMakerType>
@@ -212,6 +213,17 @@ class OpRegistry {
         op_proto.IsInitialized(),
         "Fail to initialize %s's OpProto, because %s is not initialized",
         op_type, op_proto.InitializationErrorString());
+
+    VarIndexMaps()[op_type].reset(new VarIndexMap());
+    auto& varmap = *VarIndexMaps()[op_type];
+    int idx = 0;
+    for (auto& var : op_proto.inputs()) {
+      varmap[var.name()] = idx++;
+    }
+    idx = 0;
+    for (auto& var : op_proto.outputs()) {
+      varmap[var.name()] = idx++;
+    }
   }
 
   static OperatorPtr CreateOp(const OpDesc& op_desc) {
@@ -220,7 +232,6 @@ class OpRegistry {
     OperatorPtr op(creators().at(op_type)());
     //! Fill op's data member. Not use constructor because it will be noising
     //! for Op developer.
-    const OpProto& op_proto = protos().at(op_type);
     op->type_ = op_desc.type();
     // set op's inputs_ from desc.
     op->inputs_.reserve((size_t)op_desc.inputs_size());
@@ -240,25 +251,31 @@ class OpRegistry {
     //! Convert Temporary variable name to an unique variable name.
     GenerateTempVariableName(op.get());
 
-    // set argument offsets stored in op.
-    CreateInOutOffsetMap(op, op_proto);
+    //! set argument offsets stored in op.
+    {
+      auto var_index_it = VarIndexMaps().find(op_type);
+      if (var_index_it != VarIndexMaps().end()) {
+        op->in_out_idxs_ = var_index_it->second;
+      }
+    }
     //! Other op's custom Init for a complex Op. For simple Op, the Init
     //! method do nothing.
     op->Init();
     return op;
   }
 
-  // init op.in_out_idxs_ to accelerate argument's offset lookup.
-  static void CreateInOutOffsetMap(OperatorPtr op, const OpProto& proto) {
-    op->CreateInOutOffsetMap(proto);
-  }
-
   static std::unordered_map<std::string, OpProto>& protos() {
     static std::unordered_map<std::string, OpProto> protos_;
     return protos_;
   };
 
  private:
+  static std::unordered_map<std::string, std::shared_ptr<VarIndexMap>>&
+  VarIndexMaps() {
+    static std::unordered_map<std::string, std::shared_ptr<VarIndexMap>> maps_;
+    return maps_;
+  }
+
   static void GenerateTempVariableName(OperatorBase* op) {
     static std::atomic<size_t> gUniqId(0UL);
     for (auto& outname : op->outputs_) {
diff --git a/paddle/framework/operator.cc b/paddle/framework/operator.cc
index 50cb2d9362..3647983053 100644
--- a/paddle/framework/operator.cc
+++ b/paddle/framework/operator.cc
@@ -19,21 +19,10 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
 
-void OperatorBase::CreateInOutOffsetMap(const OpProto& proto) {
-  PADDLE_ENFORCE(in_out_idxs_.empty(), "duplicate call CreateInOutOffsetMap");
-  for (int i = 0; i < proto.inputs_size(); i++) {
-    const auto& name = proto.inputs()[i].name();
-    in_out_idxs_[name] = i;
-  }
-  for (int i = 0; i < proto.outputs_size(); i++) {
-    const auto& name = proto.outputs()[i].name();
-    in_out_idxs_[name] = i;
-  }
-}
-
 const std::string& OperatorBase::Input(const std::string& name) const {
-  auto it = in_out_idxs_.find(name);
-  PADDLE_ENFORCE(it != in_out_idxs_.end(), "no key [%s] in in_out_idxs_", name);
+  auto it = in_out_idxs_->find(name);
+  PADDLE_ENFORCE(it != in_out_idxs_->end(), "no key [%s] in in_out_idxs_",
+                 name);
 
   if (attrs_.count("input_format") == 0) {
     return inputs_[it->second];
@@ -46,7 +35,7 @@ const std::string& OperatorBase::Input(const std::string& name) const {
 
 std::vector<std::string> OperatorBase::Inputs(const std::string& name) const {
   auto input_format = GetAttr<std::vector<int>>("input_format");
-  auto offset = in_out_idxs_.at(name);
+  auto offset = in_out_idxs_->at(name);
 
   return std::vector<std::string>{
       inputs_.begin() + input_format.at(offset),
@@ -54,8 +43,9 @@ std::vector<std::string> OperatorBase::Inputs(const std::string& name) const {
 }
 
 const std::string& OperatorBase::Output(const std::string& name) const {
-  auto it = in_out_idxs_.find(name);
-  PADDLE_ENFORCE(it != in_out_idxs_.end(), "no key [%s] in in_out_idxs_", name);
+  auto it = in_out_idxs_->find(name);
+  PADDLE_ENFORCE(it != in_out_idxs_->end(), "no key [%s] in in_out_idxs_",
+                 name);
 
   if (attrs_.count("output_format") == 0) {
     return outputs_[it->second];
@@ -68,7 +58,7 @@ const std::string& OperatorBase::Output(const std::string& name) const {
 
 std::vector<std::string> OperatorBase::Outputs(const std::string& name) const {
   auto output_format = GetAttr<std::vector<int>>("output_format");
-  auto offset = in_out_idxs_.at(name);
+  auto offset = in_out_idxs_->at(name);
 
   return std::vector<std::string>{
       outputs_.begin() + output_format.at(offset),
diff --git a/paddle/framework/operator.h b/paddle/framework/operator.h
index 2fe9670677..2081b8a05c 100644
--- a/paddle/framework/operator.h
+++ b/paddle/framework/operator.h
@@ -82,16 +82,13 @@ class OperatorBase {
   // TODO add a vector_view to prevent memory copy.
   std::vector<std::string> Outputs(const std::string& name) const;
 
-  // init in_out_idxs_ to accelerate argument's offset lookup.
-  void CreateInOutOffsetMap(const OpProto& proto);
-
  public:
   std::string type_;
   std::vector<std::string> inputs_;
   std::vector<std::string> outputs_;
   AttributeMap attrs_;
   // store the arguments' offset described in op_desc.
-  std::unordered_map<std::string, int> in_out_idxs_;
+  std::shared_ptr<std::unordered_map<std::string, int>> in_out_idxs_;
 };
 
 class KernelContext {
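
Since every instance of a given operator type has the same input/output
layout, the name-to-offset table is built once at registration time and then
shared through a `shared_ptr`; `CreateOp` costs one lookup instead of a table
rebuild per instance. A minimal sketch of the sharing scheme (names
hypothetical):

    #include <cassert>
    #include <memory>
    #include <string>
    #include <unordered_map>

    using VarIndexMap = std::unordered_map<std::string, int>;

    // One map per operator *type*, shared by every instance of that type.
    std::unordered_map<std::string, std::shared_ptr<VarIndexMap>>& Registry() {
      static std::unordered_map<std::string, std::shared_ptr<VarIndexMap>> r;
      return r;
    }

    int main() {
      Registry()["fc"] = std::make_shared<VarIndexMap>(
          VarIndexMap{{"X", 0}, {"W", 1}, {"b", 2}});
      auto op1_idxs = Registry()["fc"];  // instance 1 shares...
      auto op2_idxs = Registry()["fc"];  // ...the same table as instance 2
      assert(op1_idxs.get() == op2_idxs.get());
      assert(op1_idxs->at("W") == 1);
      return 0;
    }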
From 22be2b2bb8d02c214a633f1727a892370bd188d1 Mon Sep 17 00:00:00 2001
From: Helin Wang
Date: Sat, 15 Jul 2017 00:16:17 +0000
Subject: [PATCH 202/205] fix travis check style

---
 paddle/scripts/travis/check_style.sh | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/paddle/scripts/travis/check_style.sh b/paddle/scripts/travis/check_style.sh
index 8049aeb7b0..ec499a839a 100755
--- a/paddle/scripts/travis/check_style.sh
+++ b/paddle/scripts/travis/check_style.sh
@@ -1,7 +1,7 @@
 #!/bin/bash
 function abort(){
     echo "Your change doesn't follow PaddlePaddle's code style." 1>&2
-    echo "Please use pre-commit to reformat your code and git push again." 1>&2
+    echo "Please use pre-commit to check what is wrong." 1>&2
     exit 1
 }
 
@@ -19,7 +19,8 @@ ln -sf $TRAVIS_BUILD_DIR $GOPATH/src/github.com/PaddlePaddle/Paddle
 cd $GOPATH/src/github.com/PaddlePaddle/Paddle/go; glide install; cd -
 
 if ! pre-commit run -a ; then
-    git diff --exit-code
+    git diff
+    exit 1
 fi
 
 trap : 0
From f6a51d9b4ef850ec650861de85ca1f3b55bfb4c8 Mon Sep 17 00:00:00 2001
From: Yu Yang
Date: Tue, 18 Jul 2017 15:21:54 +0800
Subject: [PATCH 203/205] Make CreateOp in Plain C++ params

---
 paddle/framework/op_registry.h | 64 ++++++++++++++++++++--------------
 1 file changed, 37 insertions(+), 27 deletions(-)

diff --git a/paddle/framework/op_registry.h b/paddle/framework/op_registry.h
index 491ee21eec..c41fe10729 100644
--- a/paddle/framework/op_registry.h
+++ b/paddle/framework/op_registry.h
@@ -199,6 +199,7 @@ Add a mark to which output is temporary is helpful for future optimization.
 class OpRegistry {
   using OpCreator = std::function<OperatorBase*()>;
   using VarIndexMap = std::unordered_map<std::string, int>;
+  using VarNameList = std::vector<std::string>;
 
  public:
   template <typename OpType, typename ProtoMakerType>
@@ -226,42 +227,51 @@ class OpRegistry {
     }
   }
 
-  static OperatorPtr CreateOp(const OpDesc& op_desc) {
-    //! Create a OpPtr by type.
-    std::string op_type = op_desc.type();
-    OperatorPtr op(creators().at(op_type)());
-    //! Fill op's data member. Not use constructor because it will be noising
-    //! for Op developer.
-    op->type_ = op_desc.type();
-    // set op's inputs_ from desc.
-    op->inputs_.reserve((size_t)op_desc.inputs_size());
-    std::copy(op_desc.inputs().begin(), op_desc.inputs().end(),
-              std::back_inserter(op->inputs_));
-    // set op's outputs_ from desc.
-    op->outputs_.reserve((size_t)op_desc.outputs_size());
-    std::copy(op_desc.outputs().begin(), op_desc.outputs().end(),
-              std::back_inserter(op->outputs_));
+  static OperatorPtr CreateOp(const std::string& type,
+                              const VarNameList& inputs,
+                              const VarNameList& outputs,
+                              const AttributeMap& attrs) {
+    auto op_create_it = creators().find(type);
+    PADDLE_ENFORCE(op_create_it != creators().end(),
+                   "Operator %s cannot be found", type);
 
-    //! Fill attrs, and validate attrs.
-    for (auto& attr : op_desc.attrs()) {
-      op->attrs_[attr.name()] = AttrTypeHelper::GetAttrValue(attr);
-    }
-    op_checkers().at(op_type).Check(op->attrs_);
+    auto op = op_create_it->second();
+    op->type_ = type;
+    op->inputs_ = inputs;
+    op->outputs_ = outputs;
+    op->attrs_ = attrs;
+    op_checkers().at(type).Check(op->attrs_);
 
-    //! Convert Temporary variable name to an unique variable name.
-    GenerateTempVariableName(op.get());
+    GenerateTempVariableName(op);
 
     {
-      auto var_index_it = VarIndexMaps().find(op_type);
+      auto var_index_it = VarIndexMaps().find(type);
       if (var_index_it != VarIndexMaps().end()) {
        op->in_out_idxs_ = var_index_it->second;
      }
    }
-    //! Other op's custom Init for a complex Op. For simple Op, the Init
-    //! method do nothing.
+
     op->Init();
-    return op;
+    return OperatorPtr(op);
+  }
+
+  static OperatorPtr CreateOp(const OpDesc& op_desc) {
+    std::vector<std::string> inputs;
+    inputs.reserve((size_t)op_desc.inputs_size());
+    std::copy(op_desc.inputs().begin(), op_desc.inputs().end(),
+              std::back_inserter(inputs));
+
+    std::vector<std::string> outputs;
+    outputs.reserve((size_t)op_desc.outputs_size());
+    std::copy(op_desc.outputs().begin(), op_desc.outputs().end(),
+              std::back_inserter(outputs));
+
+    AttributeMap attrs;
+    for (auto& attr : op_desc.attrs()) {
+      attrs[attr.name()] = AttrTypeHelper::GetAttrValue(attr);
+    }
+
+    return CreateOp(op_desc.type(), inputs, outputs, attrs);
+  }
 
   static std::unordered_map<std::string, OpProto>& protos() {
     static std::unordered_map<std::string, OpProto> protos_;
     return protos_;
   };
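
With the overload above, an operator can be built straight from plain C++
values, without first packing an `OpDesc` protobuf. Usage would look roughly
like this (a sketch only, assuming the OpRegistry API introduced above;
"add_two" is the operator registered earlier in this series):

    #include "paddle/framework/op_registry.h"

    void example() {
      using paddle::framework::OpRegistry;
      // Plain C++ parameters: type, input names, output names, attributes.
      auto op = OpRegistry::CreateOp("add_two", {"X", "Y"}, {"Out"}, {});
      op->DebugString();  // e.g. for logging
    }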
From 1ac0bffaf7d52178b037f4506c3d60d5d49241f4 Mon Sep 17 00:00:00 2001
From: Yu Yang
Date: Tue, 18 Jul 2017 16:16:59 +0800
Subject: [PATCH 204/205] Use friend not to expose tensor's `type/place`

---
 paddle/framework/tensor.h                 | 14 +++++++++-----
 paddle/pybind/pybind.cc                   |  4 +---
 paddle/pybind/{tensor.h => tensor_bind.h} | 18 +++++++++++-------
 3 files changed, 21 insertions(+), 15 deletions(-)
 rename paddle/pybind/{tensor.h => tensor_bind.h} (84%)

diff --git a/paddle/framework/tensor.h b/paddle/framework/tensor.h
index 891cf73641..c495687dc4 100644
--- a/paddle/framework/tensor.h
+++ b/paddle/framework/tensor.h
@@ -24,6 +24,12 @@ limitations under the License. */
 #include "paddle/platform/place.h"
 
 namespace paddle {
+namespace pybind {
+namespace details {  // forward declare
+template <bool less, size_t i, typename... args>
+struct CastToPyBufferImpl;
+}  // namespace details
+}  // namespace pybind
 namespace framework {
 
 class Tensor {
@@ -128,10 +134,6 @@ class Tensor {
 
   DDim dims() const { return dims_; }
 
-  platform::Place place() const { return holder_->place(); }
-
-  std::type_index type() const { return holder_->type(); }
-
  private:
   // Placeholder hides type T, so it doesn't appear as a template
   // parameter of Variable.
@@ -186,7 +188,9 @@ class Tensor {
   DDim dims_;
   size_t numel_;   // cache of `product(dims_)`
   size_t offset_;  // marks the begin of tensor data area.
+  template <bool less, size_t i, typename... args>
+  friend struct paddle::pybind::details::CastToPyBufferImpl;
 };  // namespace framework
 
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/pybind/pybind.cc b/paddle/pybind/pybind.cc
index e3dc3e718c..0eef36f8ec 100644
--- a/paddle/pybind/pybind.cc
+++ b/paddle/pybind/pybind.cc
@@ -15,7 +15,7 @@ limitations under the License. */
 #include
 #include
 #include
-#include <paddle/pybind/tensor.h>
+#include <paddle/pybind/tensor_bind.h>
 #include
 #include
 #include
@@ -32,8 +32,6 @@ PYBIND11_PLUGIN(core) {
 
   py::class_<pd::Tensor>(m, "Tensor", py::buffer_protocol())
       .def_buffer([](pd::Tensor& self) -> py::buffer_info {
-        PADDLE_ENFORCE(paddle::platform::is_cpu_place(self.place()),
-                       "Only CPU tensor can cast to numpy array");
         return paddle::pybind::CastToPyBuffer(self);
       })
       .def("get_dims",
diff --git a/paddle/pybind/tensor.h b/paddle/pybind/tensor_bind.h
similarity index 84%
rename from paddle/pybind/tensor.h
rename to paddle/pybind/tensor_bind.h
index ef07144ad4..b96516643a 100644
--- a/paddle/pybind/tensor.h
+++ b/paddle/pybind/tensor_bind.h
@@ -40,7 +40,10 @@ template <size_t I, typename... ARGS>
 struct CastToPyBufferImpl<true, I, ARGS...> {
   using CUR_TYPE = typename std::tuple_element<I, std::tuple<ARGS...>>::type;
   py::buffer_info operator()(framework::Tensor &tensor) {
-    if (std::type_index(typeid(CUR_TYPE)) == tensor.type()) {
+    PADDLE_ENFORCE(paddle::platform::is_cpu_place(tensor.holder_->place()),
+                   "Only CPU tensor can cast to numpy array");
+
+    if (std::type_index(typeid(CUR_TYPE)) == tensor.holder_->type()) {
       auto dim_vec = framework::vectorize(tensor.dims());
       std::vector<size_t> dims_outside;
       std::vector<size_t> strides;
@@ -54,12 +57,13 @@ struct CastToPyBufferImpl<true, I, ARGS...> {
       prod *= dims_outside[i - 1];
     }
 
-    return py::buffer_info(tensor.mutable_data<CUR_TYPE>(tensor.place()),
-                           sizeof(CUR_TYPE),
-                           py::format_descriptor<CUR_TYPE>::format(),
-                           (size_t)framework::arity(tensor.dims()),
-                           dims_outside,
-                           strides);
+    return py::buffer_info(
+        tensor.mutable_data<CUR_TYPE>(tensor.holder_->place()),
+        sizeof(CUR_TYPE),
+        py::format_descriptor<CUR_TYPE>::format(),
+        (size_t)framework::arity(tensor.dims()),
+        dims_outside,
+        strides);
     } else {
       constexpr bool less = I + 1 < std::tuple_size<std::tuple<ARGS...>>::value;
       return CastToPyBufferImpl<less, I + 1, ARGS...>()(tensor);
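
The trade-off here: `place()` and `type()` disappear from Tensor's public
surface, and the one component that legitimately needs them, the pybind
buffer caster, regains access through a friend declaration on the template,
with a forward declaration so tensor.h need not pull in any pybind headers.
A minimal sketch of the pattern (types hypothetical):

    // Forward-declare the privileged template, then befriend it.
    namespace binding {
    template <typename T>
    struct Caster;
    }  // namespace binding

    class Holder {
      int secret_ = 42;  // deliberately not part of the public API
      template <typename T>
      friend struct binding::Caster;
    };

    namespace binding {
    template <typename T>
    struct Caster {
      int operator()(const Holder& h) const { return h.secret_; }
    };
    }  // namespace binding

    int main() { return binding::Caster<int>()(Holder{}) == 42 ? 0 : 1; }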
From 9b964b535bb060b584d1336c956f34c03fa9e3fe Mon Sep 17 00:00:00 2001
From: Yu Yang
Date: Tue, 18 Jul 2017 17:06:00 +0800
Subject: [PATCH 205/205] Tensor should use PADDLE_ONLY_CPU macro

* Do not use the __CUDACC__ macro, because allocating GPU memory does not
  have to happen in a .cu file.
* Also, the `PADDLE_ENFORCE` logic was wrong.
---
 paddle/framework/tensor.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/paddle/framework/tensor.h b/paddle/framework/tensor.h
index b405e3877c..802471b341 100644
--- a/paddle/framework/tensor.h
+++ b/paddle/framework/tensor.h
@@ -55,14 +55,14 @@ class Tensor {
       holder_.reset(new PlaceholderImpl<T, platform::CPUPlace>(
          boost::get<platform::CPUPlace>(place), product(dims_) * sizeof(T)));
     } else if (platform::is_gpu_place(place)) {
-#ifdef __CUDACC__
+#ifdef PADDLE_ONLY_CPU
+      PADDLE_THROW("'GPUPlace' is not supported in CPU only device.");
+#else
       holder_.reset(new PlaceholderImpl<T, platform::GPUPlace>(
          boost::get<platform::GPUPlace>(place), product(dims_) * sizeof(T)));
-#else
-      PADDLE_ENFORCE(true, "'GPUPlace' is not supported in CPU only device.");
 #endif
     } else {
-      PADDLE_ENFORCE(true, "Unknown 'place'.");
+      PADDLE_THROW("Unknown 'place'.");
     }
     offset_ = 0;
   }
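
The distinction matters because `__CUDACC__` says "this translation unit is
being compiled by nvcc", while `PADDLE_ONLY_CPU` describes the whole build
configuration: in a GPU build, plain host-side .cc files compiled by gcc or
clang may still allocate device memory through the CUDA runtime. A sketch of
the resulting idiom (assuming, as the macro name suggests, that the build
system defines PADDLE_ONLY_CPU for CPU-only builds):

    #include <cstddef>
    #include <stdexcept>
    #ifndef PADDLE_ONLY_CPU
    #include <cuda_runtime.h>
    #endif

    void* AllocDevice(size_t size) {
    #ifdef PADDLE_ONLY_CPU
      // CPU-only build: any GPU allocation is a hard error, regardless of
      // which translation unit the call comes from.
      throw std::runtime_error("'GPUPlace' is not supported in CPU only device.");
    #else
      // GPU build: host code without __CUDACC__ may still call the CUDA
      // runtime, which is why __CUDACC__ was the wrong guard.
      void* ptr = nullptr;
      cudaMalloc(&ptr, size);
      return ptr;
    #endif
    }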