From 0e6ddcc7bc63eb6ddfe5f12f4d9060625befe41a Mon Sep 17 00:00:00 2001
From: liaogang
Date: Thu, 29 Jun 2017 10:01:10 +0800
Subject: [PATCH 01/64] ENH: Add GPU throw error
---
 paddle/platform/error.h | 87 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 87 insertions(+)
 create mode 100644 paddle/platform/error.h

diff --git a/paddle/platform/error.h b/paddle/platform/error.h
new file mode 100644
index 0000000000..93424bb610
--- /dev/null
+++ b/paddle/platform/error.h
@@ -0,0 +1,87 @@
+#pragma once
+
+#include <sstream>
+#include <stdexcept>
+#include <string>
+
+#ifndef PADDLE_ONLY_CPU
+
+#include <cublas_v2.h>
+#include <cudnn.h>
+#include <curand.h>
+#include <thrust/system/cuda/error.h>
+#include <thrust/system_error.h>
+
+#endif  // PADDLE_ONLY_CPU
+
+namespace paddle {
+namespace platform {
+
+#ifndef PADDLE_ONLY_CPU
+
+inline void throw_on_error(cudaError_t e, const char* message) {
+  if (e) {
+    throw thrust::system_error(e, thrust::cuda_category(), message);
+  }
+}
+
+inline void throw_on_error(curandStatus_t stat, const char* message) {
+  if (stat != CURAND_STATUS_SUCCESS) {
+    throw thrust::system_error(cudaErrorLaunchFailure, thrust::cuda_category(),
+                               message);
+  }
+}
+
+inline void throw_on_error(cudnnStatus_t stat, const char* message) {
+  std::stringstream ss;
+  if (stat == CUDNN_STATUS_SUCCESS) {
+    return;
+  } else {
+    ss << cudnnGetErrorString(stat);
+    ss << ", " << message;
+    throw std::runtime_error(ss.str());
+  }
+}
+
+inline void throw_on_error(cublasStatus_t stat, const char* message) {
+  std::stringstream ss;
+  if (stat == CUBLAS_STATUS_SUCCESS) {
+    return;
+  } else if (stat == CUBLAS_STATUS_NOT_INITIALIZED) {
+    ss << "CUBLAS: not initialized";
+  } else if (stat == CUBLAS_STATUS_ALLOC_FAILED) {
+    ss << "CUBLAS: alloc failed";
+  } else if (stat == CUBLAS_STATUS_INVALID_VALUE) {
+    ss << "CUBLAS: invalid value";
+  } else if (stat == CUBLAS_STATUS_ARCH_MISMATCH) {
+    ss << "CUBLAS: arch mismatch";
+  } else if (stat == CUBLAS_STATUS_MAPPING_ERROR) {
+    ss << "CUBLAS: mapping error";
+  } else if (stat == CUBLAS_STATUS_EXECUTION_FAILED) {
+    ss << "CUBLAS: execution failed";
+  } else if (stat == CUBLAS_STATUS_INTERNAL_ERROR) {
+    ss << "CUBLAS: internal error";
+  } else if (stat == CUBLAS_STATUS_NOT_SUPPORTED) {
+    ss << "CUBLAS: not supported";
+  } else if (stat == CUBLAS_STATUS_LICENSE_ERROR) {
+    ss << "CUBLAS: license error";
+  }
+  ss << ", " << message;
+  throw std::runtime_error(ss.str());
+}
+
+inline void throw_on_error(cublasStatus_t stat) {
+  const char* message = "";
+  throw_on_error(stat, message);
+}
+
+#endif  // PADDLE_ONLY_CPU
+
+inline void throw_on_error(int stat, const char* message) {
+  if (stat) {
+    throw std::runtime_error(message + (", stat = " + std::to_string(stat)));
+  }
+}
+
+}  // namespace platform
+}  // namespace paddle
From d3b77a5bc053b77309ecc094450e755604217674 Mon Sep 17 00:00:00 2001
From: liaogang
Date: Thu, 29 Jun 2017 13:56:38 +0800
Subject: [PATCH 02/64] ENH: Add Gpu info
---
 paddle/platform/gpu_info.cc | 49 +++++++++++++++++++++++++++++++
 paddle/platform/gpu_info.h  | 36 +++++++++++++++++++++++
 2 files changed, 85 insertions(+)
 create mode 100644 paddle/platform/gpu_info.cc
 create mode 100644 paddle/platform/gpu_info.h

diff --git a/paddle/platform/gpu_info.cc b/paddle/platform/gpu_info.cc
new file mode 100644
index 0000000000..4208d83078
--- /dev/null
+++ b/paddle/platform/gpu_info.cc
@@ -0,0 +1,49 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/platform/gpu_info.h" +#include "gflags/gflags.h" +#include "paddle/platform/error.h" + +DEFINE_double(fraction_of_gpu_memory_to_use, 0.95, + "Default use 95% of GPU memory for PaddlePaddle," + "reserve the rest for page tables, etc"); + +namespace paddle { +namespace platform { + +int GpuDeviceCount() { + int count; + throw_on_error( + cudaGetDeviceCount(&count), + "cudaGetDeviceCount failed in paddle::platform::GpuDeviceCount"); + return count; +} + +void GpuMemoryUsage(size_t& available, size_t& total) { + throw_on_error(cudaMemGetInfo(&available, &total), + "cudaMemGetInfo failed in paddle::platform::GetMemoryUsage"); +} + +size_t GpuMaxAllocSize() { + size_t total = 0; + size_t available = 0; + + GpuMemoryUsage(available, total); + + return total * FLAGS_fraction_of_gpu_memory_to_use; +} + +} // namespace platform +} // namespace paddle diff --git a/paddle/platform/gpu_info.h b/paddle/platform/gpu_info.h new file mode 100644 index 0000000000..174f093b43 --- /dev/null +++ b/paddle/platform/gpu_info.h @@ -0,0 +1,36 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#ifndef PADDLE_ONLY_CPU + +#include + +namespace paddle { +namespace platform { + +//! Get the total number of GPU devices in system. +int GpuDeviceCount(); + +//!Get the memory usage of current GPU device. +void GpuMemoryUsage(size_t& available, size_t& total); + +//! Get the maximum allocation size of current GPU device. +size_t GpuMaxAllocSize(); + +} // namespace platform +} // namespace paddle + +#endif // PADDLE_ONLY_CPU From b29923f902dc6da1416a94bc153448f1546e62b2 Mon Sep 17 00:00:00 2001 From: liaogang Date: Thu, 29 Jun 2017 13:56:57 +0800 Subject: [PATCH 03/64] ENH: Add CPU info --- paddle/platform/cpu_info.cc | 55 +++++++++++++++++++++++++++++++++++++ paddle/platform/cpu_info.h | 26 ++++++++++++++++++ 2 files changed, 81 insertions(+) create mode 100644 paddle/platform/cpu_info.cc create mode 100644 paddle/platform/cpu_info.h diff --git a/paddle/platform/cpu_info.cc b/paddle/platform/cpu_info.cc new file mode 100644 index 0000000000..deff76502e --- /dev/null +++ b/paddle/platform/cpu_info.cc @@ -0,0 +1,55 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/platform/cpu_info.h"
+
+#ifdef __APPLE__
+#include <sys/sysctl.h>
+#include <sys/types.h>
+#else
+#include <unistd.h>
+#endif
+
+#include "gflags/gflags.h"
+#include "paddle/platform/error.h"
+
+DEFINE_double(fraction_of_cpu_memory_to_use, 1,
+              "Default use 100% of CPU memory for PaddlePaddle,"
+              "reserve the rest for page tables, etc");
+
+namespace paddle {
+namespace platform {
+
+inline size_t CpuTotalPhysicalMemory() {
+#ifdef __APPLE__
+  int mib[2];
+  mib[0] = CTL_HW;
+  mib[1] = HW_MEMSIZE;
+  int64_t size = 0;
+  size_t len = sizeof(size);
+  if (sysctl(mib, 2, &size, &len, NULL, 0) == 0) return (size_t)size;
+  return 0L;
+#else
+  long pages = sysconf(_SC_PHYS_PAGES);
+  long page_size = sysconf(_SC_PAGE_SIZE);
+  return pages * page_size;
+#endif
+}
+
+size_t CpuTotalMemory() {
+  return FLAGS_fraction_of_cpu_memory_to_use * CpuTotalPhysicalMemory();
+}
+
+}  // namespace platform
+}  // namespace paddle
diff --git a/paddle/platform/cpu_info.h b/paddle/platform/cpu_info.h
new file mode 100644
index 0000000000..3b768589e1
--- /dev/null
+++ b/paddle/platform/cpu_info.h
@@ -0,0 +1,26 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <stddef.h>
+
+namespace paddle {
+namespace platform {
+
+//! Get the total memory on host machine.
+size_t CpuTotalMemory();
+
+}  // namespace platform
+}  // namespace paddle
From 169022d0148a77cd10f16a82e841a75750e7e173 Mon Sep 17 00:00:00 2001
From: liaogang
Date: Thu, 29 Jun 2017 14:04:47 +0800
Subject: [PATCH 04/64] FIX: Improve fallback gpu allocator
---
 paddle/memory/detail/CMakeLists.txt           |  4 +-
 paddle/memory/detail/system_allocator.cc      | 64 ++++++++++++++-----
 paddle/memory/detail/system_allocator.h       | 15 +++--
 paddle/memory/detail/system_allocator_test.cc | 14 ++--
 paddle/platform/CMakeLists.txt                |  4 ++
 paddle/platform/cpu_info_test.cc              | 18 ++++++
 paddle/platform/cuda.h                        | 40 ------------
 7 files changed, 85 insertions(+), 74 deletions(-)
 create mode 100644 paddle/platform/cpu_info_test.cc
 delete mode 100644 paddle/platform/cuda.h

diff --git a/paddle/memory/detail/CMakeLists.txt b/paddle/memory/detail/CMakeLists.txt
index 72d3749ad7..6caa97a76b 100644
--- a/paddle/memory/detail/CMakeLists.txt
+++ b/paddle/memory/detail/CMakeLists.txt
@@ -1,6 +1,8 @@
 if(${WITH_GPU})
   nv_library(system_allocator SRCS system_allocator.cc DEPS gflags)
-  nv_test(system_allocator_test SRCS system_allocator_test.cc DEPS system_allocator gflags)
+  nv_test(system_allocator_test
+    SRCS system_allocator_test.cc
+    DEPS system_allocator gpu_info gflags)
 else(${WITH_GPU})
   cc_library(system_allocator SRCS system_allocator.cc DEPS gflags)
   cc_test(system_allocator_test SRCS system_allocator_test.cc DEPS system_allocator gflags)
diff --git a/paddle/memory/detail/system_allocator.cc b/paddle/memory/detail/system_allocator.cc
index 50bec926f8..332ff062d4 100644
--- a/paddle/memory/detail/system_allocator.cc
+++ b/paddle/memory/detail/system_allocator.cc
@@ -13,32 +13,39 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/memory/detail/system_allocator.h"
+#include "paddle/platform/assert.h"
+#include "paddle/platform/error.h"
+#include "paddle/platform/gpu_info.h"
 
 #include <stdlib.h>    // for malloc and free
 #include <sys/mman.h>  // for mlock and munlock
 
 #include "gflags/gflags.h"
-#include "paddle/platform/assert.h"
-#include "paddle/platform/cuda.h"
 
 // If use_pinned_memory is true, CPUAllocator calls mlock, which
 // returns pinned and locked memory as staging areas for data exchange
 // between host and device.  Allocates too much would reduce the amount
 // of memory available to the system for paging.  So, by default, we
 // should set false to use_pinned_memory.
-DEFINE_bool(use_pinned_memory, false,
-            "If set, allocate cpu/gpu pinned memory.");
+DEFINE_bool(use_pinned_memory, false, "If set, allocate cpu pinned memory.");
 
 namespace paddle {
 namespace memory {
 namespace detail {
 
-void* CPUAllocator::Alloc(size_t size) {
+void* CPUAllocator::Alloc(size_t& index, size_t size) {
   // According to http://www.cplusplus.com/reference/cstdlib/malloc/,
   // malloc might not return nullptr if size is zero, but the returned
   // pointer shall not be dereferenced -- so we make it nullptr.
if (size <= 0) return nullptr; + if (FLAGS_use_pinned_memory) { + void* p = malloc(size); + if (p != nullptr) { + mlock(p, size); + } + } + void* p = malloc(size); if (p != nullptr && FLAGS_use_pinned_memory) { mlock(p, size); @@ -46,7 +53,7 @@ void* CPUAllocator::Alloc(size_t size) { return p; } -void CPUAllocator::Free(void* p, size_t size) { +void CPUAllocator::Free(void* p, size_t size, size_t index) { if (p != nullptr && FLAGS_use_pinned_memory) { munlock(p, size); } @@ -55,29 +62,52 @@ void CPUAllocator::Free(void* p, size_t size) { #ifndef PADDLE_ONLY_CPU -void* GPUAllocator::Alloc(size_t size) { +void* GPUAllocator::Alloc(size_t& index, size_t size) { // CUDA documentation doesn't explain if cudaMalloc returns nullptr // if size is 0. We just make sure it does. - if (size <= 0) { - return nullptr; - } + if (size <= 0) return nullptr; + size_t available = 0; + size_t capacity = 0; + paddle::platform::GpuMemoryUsage(available, capacity); + + // Reserve memory for page tables, etc. + size_t reserving = capacity - paddle::platform::GpuMaxAllocSize(); + size_t remaining = available > reserving ? available - reserving : 0; + + // If remaining size no less than expected size, using general + // cudaMalloc to allocate GPU memory. void* p = 0; - cudaError_t result = - FLAGS_use_pinned_memory ? cudaMallocHost(&p, size) : cudaMalloc(&p, size); - if (result != cudaSuccess) { - cudaGetLastError(); // clear error if there is any. + if (size <= remaining) { + cudaError_t result = cudaMalloc(&p, size); + if (result == cudaSuccess) { + index = 0; + total_alloc_size_ += size; + return p; + } } - return result == cudaSuccess ? p : nullptr; + + // If remaining size less than expected size or cudaMalloc failed, + // cudaMallocHost will be considered as a fallback allocator. + cudaError_t result = cudaMallocHost(&p, size); + if (result == cudaSuccess) { + index = 1; + total_alloc_size_ += size; + return p; + } + + return nullptr; } -void GPUAllocator::Free(void* p, size_t size) { +void GPUAllocator::Free(void* p, size_t size, size_t index) { // Purposefully allow cudaErrorCudartUnloading, because // that is returned if you ever call cudaFree after the // driver has already shutdown. This happens only if the // process is terminating, in which case we don't care if // cudaFree succeeds. - cudaError_t err = FLAGS_use_pinned_memory ? cudaFreeHost(p) : cudaFree(p); + PADDLE_ASSERT(total_alloc_size_ >= size); + total_alloc_size_ -= size; + cudaError_t err = index == 1 ? 
cudaFreeHost(p) : cudaFree(p); if (err != cudaErrorCudartUnloading) { platform::throw_on_error(err, "cudaFree{Host} failed"); } diff --git a/paddle/memory/detail/system_allocator.h b/paddle/memory/detail/system_allocator.h index 184b383f7f..e15302ce4f 100644 --- a/paddle/memory/detail/system_allocator.h +++ b/paddle/memory/detail/system_allocator.h @@ -30,21 +30,24 @@ namespace detail { class SystemAllocator { public: virtual ~SystemAllocator() {} - virtual void* Alloc(size_t size) = 0; - virtual void Free(void* p, size_t size) = 0; + virtual void* Alloc(size_t& index, size_t size) = 0; + virtual void Free(void* p, size_t size, size_t index) = 0; }; class CPUAllocator : public SystemAllocator { public: - virtual void* Alloc(size_t size); - virtual void Free(void* p, size_t size); + virtual void* Alloc(size_t& index, size_t size); + virtual void Free(void* p, size_t size, size_t index); }; #ifndef PADDLE_ONLY_CPU class GPUAllocator : public SystemAllocator { public: - virtual void* Alloc(size_t size); - virtual void Free(void* p, size_t size); + virtual void* Alloc(size_t& index, size_t size); + virtual void Free(void* p, size_t size, size_t index); + + private: + size_t total_alloc_size_ = 0; }; #endif // PADDLE_ONLY_CPU diff --git a/paddle/memory/detail/system_allocator_test.cc b/paddle/memory/detail/system_allocator_test.cc index 9bd5706a4e..ba44e06ddb 100644 --- a/paddle/memory/detail/system_allocator_test.cc +++ b/paddle/memory/detail/system_allocator_test.cc @@ -25,7 +25,8 @@ DECLARE_bool(use_pinned_memory); void TestAllocator(paddle::memory::detail::SystemAllocator& a, size_t size) { bool freed = false; { - void* p = a.Alloc(size); + size_t index; + void* p = a.Alloc(index, size); if (size > 0) { EXPECT_NE(p, nullptr); } else { @@ -35,7 +36,7 @@ void TestAllocator(paddle::memory::detail::SystemAllocator& a, size_t size) { int* i = static_cast(p); std::shared_ptr ptr(i, [&](void* p) { freed = true; - a.Free(p, size); + a.Free(p, size, index); }); } EXPECT_TRUE(freed); @@ -56,14 +57,7 @@ TEST(CPUAllocator, LockMem) { } #ifndef PADDLE_ONLY_CPU -TEST(GPUAllocator, NoStaging) { - FLAGS_use_pinned_memory = false; - paddle::memory::detail::GPUAllocator a; - TestAllocator(a, 2048); - TestAllocator(a, 0); -} -TEST(GPUAllocator, Staging) { - FLAGS_use_pinned_memory = true; +TEST(GPUAllocator, Alloc) { paddle::memory::detail::GPUAllocator a; TestAllocator(a, 2048); TestAllocator(a, 0); diff --git a/paddle/platform/CMakeLists.txt b/paddle/platform/CMakeLists.txt index 7abe2ab89e..17342356d6 100644 --- a/paddle/platform/CMakeLists.txt +++ b/paddle/platform/CMakeLists.txt @@ -1,3 +1,7 @@ +cc_library(cpu_info SRCS cpu_info.cc) +cc_test(cpu_info_test SRCS cpu_info_test.cc DEPS cpu_info gflags) + +nv_library(gpu_info SRCS gpu_info.cc) nv_test(cuda_test SRCS cuda_test.cu) cc_library(place SRCS place.cc) diff --git a/paddle/platform/cpu_info_test.cc b/paddle/platform/cpu_info_test.cc new file mode 100644 index 0000000000..5b7ce7c763 --- /dev/null +++ b/paddle/platform/cpu_info_test.cc @@ -0,0 +1,18 @@ +#include "paddle/platform/cpu_info.h" + +#include +#include + +#include "gflags/gflags.h" +#include "gtest/gtest.h" + +DECLARE_double(fraction_of_cpu_memory_to_use); + +TEST(CpuMemoryUsage, Print) { + std::stringstream ss; + size_t mem_size = paddle::platform::CpuTotalMemory() / 1024 / 1024 / 1024; + ss << std::to_string( + static_cast(FLAGS_fraction_of_cpu_memory_to_use * 100)) + << "% of CPU Memory Usage: " << mem_size << " GB"; + std::cout << ss.str(); +} diff --git a/paddle/platform/cuda.h 
b/paddle/platform/cuda.h deleted file mode 100644 index 8fe891f9ce..0000000000 --- a/paddle/platform/cuda.h +++ /dev/null @@ -1,40 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#ifndef PADDLE_ONLY_CPU - -#include -#include - -namespace paddle { -namespace platform { - -inline void throw_on_error(cudaError_t e, const char* message) { - if (e) { - throw thrust::system_error(e, thrust::cuda_category(), message); - } -} - -int GetDeviceCount(void) { - int count; - throw_on_error(cudaGetDeviceCount(&count), "cudaGetDeviceCount failed"); - return count; -} - -} // namespace platform -} // namespace paddle - -#endif // PADDLE_ONLY_CPU From e6c14f7e000d047cf3d3a1e18e2a13e3349b1ff9 Mon Sep 17 00:00:00 2001 From: liaogang Date: Thu, 29 Jun 2017 16:30:03 +0800 Subject: [PATCH 05/64] ENH: Polish cpu info interface --- paddle/platform/CMakeLists.txt | 3 +- paddle/platform/cpu_info.cc | 14 +++++++- paddle/platform/cpu_info.h | 10 ++++-- paddle/platform/cpu_info_test.cc | 13 ++++--- paddle/platform/cuda_test.cu | 59 -------------------------------- 5 files changed, 30 insertions(+), 69 deletions(-) delete mode 100644 paddle/platform/cuda_test.cu diff --git a/paddle/platform/CMakeLists.txt b/paddle/platform/CMakeLists.txt index d0bedf6ba9..969c91985d 100644 --- a/paddle/platform/CMakeLists.txt +++ b/paddle/platform/CMakeLists.txt @@ -1,8 +1,7 @@ cc_library(cpu_info SRCS cpu_info.cc) -cc_test(cpu_info_test SRCS cpu_info_test.cc DEPS cpu_info gflags) +cc_test(cpu_info_test SRCS cpu_info_test.cc DEPS cpu_info gflags glog) nv_library(gpu_info SRCS gpu_info.cc) -nv_test(cuda_test SRCS cuda_test.cu) cc_library(place SRCS place.cc) cc_test(place_test SRCS place_test.cc DEPS place glog gflags) diff --git a/paddle/platform/cpu_info.cc b/paddle/platform/cpu_info.cc index deff76502e..3da04420e5 100644 --- a/paddle/platform/cpu_info.cc +++ b/paddle/platform/cpu_info.cc @@ -47,9 +47,21 @@ inline size_t CpuTotalPhysicalMemory() { #endif } -size_t CpuTotalMemory() { +size_t CpuMaxAllocSize() { + // For distributed systems, it requires configuring and limiting + // the fraction of memory to use. return FLAGS_fraction_of_cpu_memory_to_use * CpuTotalPhysicalMemory(); } +size_t CpuMinChunkSize() { + // Allow to allocate the minimum chunk size is 256 bytes. + return 1 << 8; +} + +size_t CpuMaxChunkSize() { + // Allow to allocate the maximum chunk size is roughly 3% of CPU memory. + return CpuMaxAllocSize() / 32; +} + } // namespace platform } // namespace paddle diff --git a/paddle/platform/cpu_info.h b/paddle/platform/cpu_info.h index 3b768589e1..8df7c7b4bc 100644 --- a/paddle/platform/cpu_info.h +++ b/paddle/platform/cpu_info.h @@ -19,8 +19,14 @@ limitations under the License. */ namespace paddle { namespace platform { -//! Get the total memory on host machine. -size_t CpuTotalMemory(); +//! Get the maximum allocation size for a machine. +size_t CpuMaxAllocSize(); + +//! Get the minimum chunk size for buddy allocator. +size_t CpuMinChunkSize(); + +//! 
Get the maximum chunk size for buddy allocator. +size_t CpuMaxChunkSize(); } // namespace platform } // namespace paddle diff --git a/paddle/platform/cpu_info_test.cc b/paddle/platform/cpu_info_test.cc index 5b7ce7c763..8fb195aa7c 100644 --- a/paddle/platform/cpu_info_test.cc +++ b/paddle/platform/cpu_info_test.cc @@ -1,18 +1,21 @@ #include "paddle/platform/cpu_info.h" +#include "paddle/string/printf.h" #include #include #include "gflags/gflags.h" +#include "glog/logging.h" #include "gtest/gtest.h" DECLARE_double(fraction_of_cpu_memory_to_use); TEST(CpuMemoryUsage, Print) { std::stringstream ss; - size_t mem_size = paddle::platform::CpuTotalMemory() / 1024 / 1024 / 1024; - ss << std::to_string( - static_cast(FLAGS_fraction_of_cpu_memory_to_use * 100)) - << "% of CPU Memory Usage: " << mem_size << " GB"; - std::cout << ss.str(); + size_t memory_size = paddle::platform::CpuMaxAllocSize() / 1024 / 1024 / 1024; + float use_percent = FLAGS_fraction_of_cpu_memory_to_use * 100; + + std::cout << paddle::string::Sprintf("\n%.2f %% of CPU Memory Usage: %d GB\n", + use_percent, memory_size) + << std::endl; } diff --git a/paddle/platform/cuda_test.cu b/paddle/platform/cuda_test.cu deleted file mode 100644 index 4067dda2f1..0000000000 --- a/paddle/platform/cuda_test.cu +++ /dev/null @@ -1,59 +0,0 @@ -#include -#include -#include "gtest/gtest.h" - -#define CHECK_ERR(x) \ - if (x != cudaSuccess) { \ - fprintf(stderr, \ - "%s in %s at line %d\n", \ - cudaGetErrorString(err), \ - __FILE__, \ - __LINE__); \ - exit(-1); \ - } - -__global__ void vecAdd(float *d_A, float *d_B, float *d_C, int n) { - int i = blockDim.x * blockIdx.x + threadIdx.x; - if (i < n) { - d_C[i] = d_A[i] + d_B[i]; - } -} - -TEST(Cuda, Equality) { - int n = 10; - // Memory allocation for h_A, h_B and h_C (in the host) - float h_A[10] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 0.0}; - float h_B[10] = {0.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0}; - float h_C[10]; - float *d_A, *d_B, *d_C; - cudaError_t err; - // Memory allocation for d_A, d_B and d_C (in the device) - err = cudaMalloc((void **)&d_A, sizeof(float) * n); - CHECK_ERR(err); - - err = cudaMalloc((void **)&d_B, sizeof(float) * n); - CHECK_ERR(err); - - err = cudaMalloc((void **)&d_C, sizeof(float) * n); - CHECK_ERR(err); - - // Copying memory to device - err = cudaMemcpy(d_A, h_A, sizeof(float) * n, cudaMemcpyHostToDevice); - CHECK_ERR(err); - - err = cudaMemcpy(d_B, h_B, sizeof(float) * n, cudaMemcpyHostToDevice); - CHECK_ERR(err); - - // Calling the kernel - vecAdd<<>>(d_A, d_B, d_C, n); - - // Copying results back to host - err = cudaMemcpy(h_C, d_C, sizeof(float) * n, cudaMemcpyDeviceToHost); - CHECK_ERR(err); - - EXPECT_EQ(h_C[0], 1.0); - for (int i = 1; i < n - 1; ++i) { - EXPECT_EQ(h_C[i], 11.0); - } - EXPECT_EQ(h_C[9], 1.0); -} From 6e7209f0584f73eb22313d98c676333379736d1e Mon Sep 17 00:00:00 2001 From: liaogang Date: Thu, 29 Jun 2017 16:30:32 +0800 Subject: [PATCH 06/64] ENH: Add gpu info interface --- paddle/platform/gpu_info.cc | 25 +++++++++++++++++++++++++ paddle/platform/gpu_info.h | 6 ++++++ 2 files changed, 31 insertions(+) diff --git a/paddle/platform/gpu_info.cc b/paddle/platform/gpu_info.cc index 4208d83078..d6c6fe34ef 100644 --- a/paddle/platform/gpu_info.cc +++ b/paddle/platform/gpu_info.cc @@ -42,8 +42,33 @@ size_t GpuMaxAllocSize() { GpuMemoryUsage(available, total); + // Reserve the rest for page tables, etc. 
return total * FLAGS_fraction_of_gpu_memory_to_use; } +size_t GpuMinChunkSize() { + // Allow to allocate the minimum chunk size is 256 bytes. + return 1 << 8; +} + +size_t GpuMaxChunkSize() { + // Allow to allocate the maximum chunk size is roughly 3% of CPU memory. + size_t total = 0; + size_t available = 0; + + GpuMemoryUsage(available, total); + + // Reserving the rest memory for page tables, etc. + size_t reserving = (1 - FLAGS_fraction_of_gpu_memory_to_use) * total; + + // If available less than minimum chunk size, no usable memory exists. + available = std::max(available, GpuMinChunkSize()) - GpuMinChunkSize(); + + // If available less than reserving, no usable memory exists. + size_t usable = std::max(available, reserving) - reserving; + + return usable; +} + } // namespace platform } // namespace paddle diff --git a/paddle/platform/gpu_info.h b/paddle/platform/gpu_info.h index 174f093b43..d7bf0f4093 100644 --- a/paddle/platform/gpu_info.h +++ b/paddle/platform/gpu_info.h @@ -30,6 +30,12 @@ void GpuMemoryUsage(size_t& available, size_t& total); //! Get the maximum allocation size of current GPU device. size_t GpuMaxAllocSize(); +//! Get the minimum chunk size for GPU buddy allocator. +size_t GpuMinChunkSize(); + +//! Get the maximum chunk size for GPU buddy allocator. +size_t GpuMaxChunkSize(); + } // namespace platform } // namespace paddle From 464886bf56cd91ffcd6617390d62dbd13c90a093 Mon Sep 17 00:00:00 2001 From: liaogang Date: Thu, 29 Jun 2017 16:31:05 +0800 Subject: [PATCH 07/64] FIX: fix typo in piece.h --- paddle/string/piece.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/string/piece.h b/paddle/string/piece.h index db7c3e6980..0272529d1c 100644 --- a/paddle/string/piece.h +++ b/paddle/string/piece.h @@ -35,7 +35,7 @@ public: // We provide non-explicit singleton constructors so users can // pass in a "const char*" or a "string" wherever a "Piece" - // is expected. These contructors ensure that if data_ is NULL, + // is expected. These constructors ensure that if data_ is NULL, // size_ is 0. Piece(); Piece(const char* d, size_t n); From 26cd0bb5a59d913f8c216ceee0c6abb46317e31e Mon Sep 17 00:00:00 2001 From: liaogang Date: Thu, 29 Jun 2017 19:13:24 +0800 Subject: [PATCH 08/64] ENH: count allocated fallback size for performance --- paddle/memory/detail/system_allocator.cc | 52 +++++++++++++------ paddle/memory/detail/system_allocator.h | 3 +- .../paddle/trainer_config_helpers/networks.py | 4 +- 3 files changed, 39 insertions(+), 20 deletions(-) diff --git a/paddle/memory/detail/system_allocator.cc b/paddle/memory/detail/system_allocator.cc index 332ff062d4..2b0fbfa87e 100644 --- a/paddle/memory/detail/system_allocator.cc +++ b/paddle/memory/detail/system_allocator.cc @@ -39,22 +39,22 @@ void* CPUAllocator::Alloc(size_t& index, size_t size) { // pointer shall not be dereferenced -- so we make it nullptr. 
   if (size <= 0) return nullptr;
 
-  if (FLAGS_use_pinned_memory) {
-    void* p = malloc(size);
-    if (p != nullptr) {
-      mlock(p, size);
-    }
-  }
+  index = 0;  // unlocked (pageable) memory
 
   void* p = malloc(size);
-  if (p != nullptr && FLAGS_use_pinned_memory) {
-    mlock(p, size);
+
+  if (p != nullptr) {
+    if (FLAGS_use_pinned_memory) {
+      index = 1;
+      mlock(p, size);  // lock memory
+    }
   }
+
   return p;
 }
 
 void CPUAllocator::Free(void* p, size_t size, size_t index) {
-  if (p != nullptr && FLAGS_use_pinned_memory) {
+  if (p != nullptr && index == 1) {
     munlock(p, size);
   }
   free(p);
@@ -73,26 +73,34 @@ void* GPUAllocator::Alloc(size_t& index, size_t size) {
 
   // Reserve memory for page tables, etc.
   size_t reserving = capacity - paddle::platform::GpuMaxAllocSize();
-  size_t remaining = available > reserving ? available - reserving : 0;
+  size_t usable = available > reserving ? available - reserving : 0;
 
   // If remaining size no less than expected size, using general
   // cudaMalloc to allocate GPU memory.
   void* p = 0;
-  if (size <= remaining) {
+  if (size <= usable) {
     cudaError_t result = cudaMalloc(&p, size);
     if (result == cudaSuccess) {
       index = 0;
-      total_alloc_size_ += size;
+      gpu_alloc_size_ += size;
      return p;
    }
  }
 
   // If remaining size less than expected size or cudaMalloc failed,
   // cudaMallocHost will be considered as a fallback allocator.
+  //
+  // NOTE: here, we use GpuMaxAllocSize() as the maximum memory size
+  // of host fallback allocation. Allocating too much would reduce
+  // the amount of memory available to the underlying system for paging.
+  usable = paddle::platform::GpuMaxAllocSize() - fallback_alloc_size_;
+
+  if (size > usable) return nullptr;
+
   cudaError_t result = cudaMallocHost(&p, size);
   if (result == cudaSuccess) {
     index = 1;
-    total_alloc_size_ += size;
+    fallback_alloc_size_ += size;
     return p;
   }
 
   return nullptr;
 }
 
 void GPUAllocator::Free(void* p, size_t size, size_t index) {
+  cudaError_t err;
+
+  if (index == 0) {
+    PADDLE_ASSERT(gpu_alloc_size_ >= size);
+    gpu_alloc_size_ -= size;
+    err = cudaFree(p);
+  } else {
+    PADDLE_ASSERT(fallback_alloc_size_ >= size);
+    fallback_alloc_size_ -= size;
+    err = cudaFreeHost(p);
+  }
+
   // Purposefully allow cudaErrorCudartUnloading, because
   // that is returned if you ever call cudaFree after the
   // driver has already shutdown. This happens only if the
   // process is terminating, in which case we don't care if
   // cudaFree succeeds.
-  PADDLE_ASSERT(total_alloc_size_ >= size);
-  total_alloc_size_ -= size;
-  cudaError_t err = index == 1 ? cudaFreeHost(p) : cudaFree(p);
   if (err != cudaErrorCudartUnloading) {
-    platform::throw_on_error(err, "cudaFree{Host} failed");
+    platform::throw_on_error(err,
+                             "cudaFree{Host} failed in GPUAllocator::Free.");
   }
 }
diff --git a/paddle/memory/detail/system_allocator.h b/paddle/memory/detail/system_allocator.h
index e15302ce4f..7093c42967 100644
--- a/paddle/memory/detail/system_allocator.h
+++ b/paddle/memory/detail/system_allocator.h
@@ -47,7 +47,8 @@ class GPUAllocator : public SystemAllocator {
   virtual void Free(void* p, size_t size, size_t index);
 
  private:
-  size_t total_alloc_size_ = 0;
+  size_t gpu_alloc_size_ = 0;
+  size_t fallback_alloc_size_ = 0;
 };
 
 #endif  // PADDLE_ONLY_CPU
diff --git a/python/paddle/trainer_config_helpers/networks.py b/python/paddle/trainer_config_helpers/networks.py
index 67154a8d7d..1bf59ed484 100755
--- a/python/paddle/trainer_config_helpers/networks.py
+++ b/python/paddle/trainer_config_helpers/networks.py
@@ -1381,7 +1381,7 @@ def inputs(layers, *args):
     if len(args) != 0:
         layers.extend(args)
 
-    Inputs(* [l.name for l in layers])
+    Inputs(*[l.name for l in layers])
 
 
 def outputs(layers, *args):
@@ -1424,7 +1424,7 @@ def outputs(layers, *args):
     assert len(layers) > 0
 
     if HasInputsSet():  # input already set
-        Outputs(* [l.name for l in layers])
+        Outputs(*[l.name for l in layers])
         return  # just return outputs.
From fb51c3dc895b78df966dd0d9713657289b1986b3 Mon Sep 17 00:00:00 2001
From: liaogang
Date: Thu, 29 Jun 2017 19:57:10 +0800
Subject: [PATCH 09/64] FIX: add compile dependency gflags
---
 paddle/platform/CMakeLists.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/paddle/platform/CMakeLists.txt b/paddle/platform/CMakeLists.txt
index 969c91985d..5cbe491b2b 100644
--- a/paddle/platform/CMakeLists.txt
+++ b/paddle/platform/CMakeLists.txt
@@ -1,7 +1,7 @@
-cc_library(cpu_info SRCS cpu_info.cc)
+cc_library(cpu_info SRCS cpu_info.cc DEPS gflags)
 cc_test(cpu_info_test SRCS cpu_info_test.cc DEPS cpu_info gflags glog)
 
-nv_library(gpu_info SRCS gpu_info.cc)
+nv_library(gpu_info SRCS gpu_info.cc DEPS gflags)
 
 cc_library(place SRCS place.cc)
 cc_test(place_test SRCS place_test.cc DEPS place glog gflags)
From 275e5b7d42903ea3c9bf4e4fed3f9eab45c727bf Mon Sep 17 00:00:00 2001
From: liaogang
Date: Mon, 3 Jul 2017 11:12:18 +0800
Subject: [PATCH 10/64] FIX: yapf format version
---
 python/paddle/trainer_config_helpers/networks.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/paddle/trainer_config_helpers/networks.py b/python/paddle/trainer_config_helpers/networks.py
index f0b6625dc3..b77932ce5f 100755
--- a/python/paddle/trainer_config_helpers/networks.py
+++ b/python/paddle/trainer_config_helpers/networks.py
@@ -1395,7 +1395,7 @@ def inputs(layers, *args):
     if len(args) != 0:
         layers.extend(args)
 
-    Inputs(*[l.name for l in layers])
+    Inputs(* [l.name for l in layers])
 
 
 def outputs(layers, *args):
@@ -1438,7 +1438,7 @@ def outputs(layers, *args):
     assert len(layers) > 0
 
     if HasInputsSet():  # input already set
-        Outputs(*[l.name for l in layers])
+        Outputs(* [l.name for l in layers])
         return  # just return outputs.
if len(layers) != 1: From 89110fd2660098bc949a1f13f7b53515e0c931a3 Mon Sep 17 00:00:00 2001 From: liaogang Date: Mon, 3 Jul 2017 19:51:32 +0800 Subject: [PATCH 11/64] ENH: Add useGpu in system allocator --- paddle/memory/detail/system_allocator.cc | 4 ++++ paddle/memory/detail/system_allocator.h | 4 +++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/paddle/memory/detail/system_allocator.cc b/paddle/memory/detail/system_allocator.cc index 2b0fbfa87e..75a2c91ef9 100644 --- a/paddle/memory/detail/system_allocator.cc +++ b/paddle/memory/detail/system_allocator.cc @@ -60,6 +60,8 @@ void CPUAllocator::Free(void* p, size_t size, size_t index) { free(p); } +bool CPUAllocator::UseGpu() { return false; } + #ifndef PADDLE_ONLY_CPU void* GPUAllocator::Alloc(size_t& index, size_t size) { @@ -131,6 +133,8 @@ void GPUAllocator::Free(void* p, size_t size, size_t index) { } } +bool GPUAllocator::UseGpu() { return true; } + #endif // PADDLE_ONLY_CPU } // namespace detail diff --git a/paddle/memory/detail/system_allocator.h b/paddle/memory/detail/system_allocator.h index 7093c42967..f3bbfef843 100644 --- a/paddle/memory/detail/system_allocator.h +++ b/paddle/memory/detail/system_allocator.h @@ -32,12 +32,14 @@ class SystemAllocator { virtual ~SystemAllocator() {} virtual void* Alloc(size_t& index, size_t size) = 0; virtual void Free(void* p, size_t size, size_t index) = 0; + virtual bool UseGpu() = 0; }; class CPUAllocator : public SystemAllocator { public: virtual void* Alloc(size_t& index, size_t size); virtual void Free(void* p, size_t size, size_t index); + virtual bool UseGpu(); }; #ifndef PADDLE_ONLY_CPU @@ -45,7 +47,7 @@ class GPUAllocator : public SystemAllocator { public: virtual void* Alloc(size_t& index, size_t size); virtual void Free(void* p, size_t size, size_t index); - + virtual bool UseGpu(); private: size_t gpu_alloc_size_ = 0; size_t fallback_alloc_size_ = 0; From 929f9cbdff08090a222495db7db601f164cebb8c Mon Sep 17 00:00:00 2001 From: liaogang Date: Mon, 3 Jul 2017 19:52:04 +0800 Subject: [PATCH 12/64] ENH: Add Metadata for memory block --- paddle/memory/detail/metadata.cc | 62 ++++++++++++++++++++++++++++++++ paddle/memory/detail/metadata.h | 53 +++++++++++++++++++++++++++ 2 files changed, 115 insertions(+) create mode 100644 paddle/memory/detail/metadata.cc create mode 100644 paddle/memory/detail/metadata.h diff --git a/paddle/memory/detail/metadata.cc b/paddle/memory/detail/metadata.cc new file mode 100644 index 0000000000..4607cd8512 --- /dev/null +++ b/paddle/memory/detail/metadata.cc @@ -0,0 +1,62 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + +#include "paddle/memory/detail/metadata.h" + +#include + +namespace paddle { +namespace memory { +namespace detail { + +Metadata::Metadata(MemoryBlock::Type t, size_t i, size_t s, size_t ts, + MemoryBlock* l, MemoryBlock* r) + : type(t), + index(i), + size(s), + total_size(ts), + left_buddy(l), + right_buddy(r) {} + +template +inline void hash_combine(std::size_t& seed, const T& v) { + std::hash hasher; + seed ^= hasher(v) + 0x9e3779b9 + (seed << 6) + (seed >> 2); +} + +inline size_t hash(const Metadata* metadata, size_t initial_seed) { + size_t seed = initial_seed; + + hash_combine(seed, (size_t)metadata->type); + hash_combine(seed, metadata->index); + hash_combine(seed, metadata->size); + hash_combine(seed, metadata->total_size); + hash_combine(seed, metadata->left_buddy); + hash_combine(seed, metadata->right_buddy); + + return seed; +} + +void Metadata::update_guards() { + guard_begin = hash(this, 1); + guard_end = hash(this, 2); +} + +bool Metadata::check_guards() const { + return guard_begin == hash(this, 1) && guard_end == hash(this, 2); +} + +} // namespace detail +} // namespace memory +} // namespace paddle diff --git a/paddle/memory/detail/metadata.h b/paddle/memory/detail/metadata.h new file mode 100644 index 0000000000..ddb826571b --- /dev/null +++ b/paddle/memory/detail/metadata.h @@ -0,0 +1,53 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#pragma once + +#include "paddle/memory/detail/memory_block.h" + +#include + +namespace paddle { +namespace memory { +namespace detail { + +class Metadata { + public: + Metadata(MemoryBlock::Type t, size_t i, size_t s, size_t ts, MemoryBlock* l, + MemoryBlock* r); + + public: + /*! \brief Update the guards when metadata is changed */ + void update_guards(); + + /*! \brief Check consistency to previous modification */ + bool check_guards() const; + + public: + // TODO(gangliao): compress this + // clang-format off + size_t guard_begin = 0; + MemoryBlock::Type type = MemoryBlock::INVALID_CHUNK; + size_t index = 0; + size_t size = 0; + size_t total_size = 0; + MemoryBlock* left_buddy = nullptr; + MemoryBlock* right_buddy = nullptr; + size_t guard_end = 0; + // clang-format on +}; + +} // namespace detail +} // namespace memory +} // namespace paddle From bbd3eab7ee88f02131edb41738a966aa0f1a0e88 Mon Sep 17 00:00:00 2001 From: liaogang Date: Mon, 3 Jul 2017 19:54:32 +0800 Subject: [PATCH 13/64] ENH: Add Alloc for buddy Allocator * Free will be added soon --- paddle/memory/detail/buddy_allocator.cc | 157 ++++++++++++++++++++++-- paddle/memory/detail/buddy_allocator.h | 88 +++++++++---- 2 files changed, 209 insertions(+), 36 deletions(-) diff --git a/paddle/memory/detail/buddy_allocator.cc b/paddle/memory/detail/buddy_allocator.cc index ebe680f5ee..2462ba084b 100644 --- a/paddle/memory/detail/buddy_allocator.cc +++ b/paddle/memory/detail/buddy_allocator.cc @@ -12,22 +12,161 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ -#pragma once - #include "paddle/memory/detail/buddy_allocator.h" +#include "glog/logging.h" namespace paddle { namespace memory { namespace detail { -BuddyAllocator::BuddyAllocator(size_t pool_size, size_t max_pools, - SystemAllocator* system_allocator) - : pool_size_(pool_size), - max_pools_(max_pools), - system_allocator_(system_allocator) { - PADDLE_ASSERT(pool_size > 0); - PADDLE_ASSERT(max_pools > 0); +BuddyAllocator::BuddyAllocator(SystemAllocator* system_allocator, + size_t min_chunk_size, size_t max_chunk_size) { + PADDLE_ASSERT(min_chunk_size > 0); + PADDLE_ASSERT(max_chunk_size > 0); PADDLE_ASSERT(system_allocator != nullptr); + + system_allocator_ = std::move(system_allocator); + min_chunk_size_ = min_chunk_size; + max_chunk_size_ = max_chunk_size; +} + +inline size_t align(size_t size, size_t alignment) { + size_t remaining = size % alignment; + return remaining == 0 ? size : size + (alignment - remaining); +} + +void* BuddyAllocator::Alloc(size_t unaligned_size) { + // adjust allocation alignment + size_t size = align(unaligned_size + sizeof(Metadata), min_chunk_size_); + + // acquire the allocator lock + std::lock_guard lock(mutex_); + + DLOG(INFO) << "Allocate " << unaligned_size << " bytes from chunk size " + << size; + + // if the allocation is huge, send directly to the system allocator + if (size > max_chunk_size_) { + DLOG(INFO) << "Allocate from system allocator."; + + return SystemAlloc(size); + } + + // query and allocate from the existing chunk + auto it = FindExistChunk(size); + + // refill the pool if failure + if (it == pool_.end()) { + it = RefillPool(); + } else { + DLOG(INFO) << " Allocation from existing memory block " << std::get<2>(*it) + << " at address " + << reinterpret_cast(std::get<2>(*it))->data(); + } + + // if still failure, fail fatally + if (it == pool_.end()) { + return nullptr; + } + + total_used_ += size; + total_free_ -= size; + + // split the allocation and return data for use + return reinterpret_cast(SplitToAlloc(it, size))->data(); +} + +void* BuddyAllocator::SystemAlloc(size_t size) { + size_t index = 0; + void* p = system_allocator_->Alloc(index, size); + + DLOG(INFO) << "Allocated " << p << " from system allocator."; + + if (p == nullptr) return nullptr; + + static_cast(p)->init(cache_, MemoryBlock::HUGE_CHUNK, index, + size, nullptr, nullptr); + + return static_cast(p)->data(); +} + +BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool() { +#ifndef PADDLE_ONLY_CPU + if (system_allocator_->UseGpu()) { + if ((total_used_ + total_free_) == 0) { + // Compute the maximum allocation size for the first allocation. 
+ max_chunk_size_ = platform::GpuMaxChunkSize(); + } + } +#endif // PADDLE_ONLY_CPU + + // Allocate a new maximum sized block + size_t index = 0; + void* p = system_allocator_->Alloc(index, max_chunk_size_); + + if (p == nullptr) return pool_.end(); + + DLOG(INFO) << " Creating and inserting new block " << p + << " from system allocator"; + + static_cast(p)->init(cache_, MemoryBlock::FREE_CHUNK, index, + max_chunk_size_, nullptr, nullptr); + + total_free_ += max_chunk_size_; + + // dump the block into pool + return pool_.insert({index, max_chunk_size_, p}).first; +} + +BuddyAllocator::PoolSet::iterator BuddyAllocator::FindExistChunk(size_t size) { + size_t index = 0; + + while (1) { + auto it = pool_.lower_bound({index, size, nullptr}); + if (it == pool_.end()) return it; + + if (std::get<0>(*it) > index) { + if (std::get<1>(*it) >= size) { + return it; + } + + index = std::get<0>(*it); + continue; + } + return it; + } +} + +void* BuddyAllocator::SplitToAlloc(BuddyAllocator::PoolSet::iterator it, + size_t size) { + auto block = static_cast(std::get<2>(*it)); + + pool_.erase(it); + + DLOG(INFO) << " Split block (" << block << ", " << block->total_size(cache_) + << ") into"; + + block->split(cache_, size); + + DLOG(INFO) << " Left block (" << block << ", " << block->total_size(cache_) + << ")"; + + block->set_type(cache_, MemoryBlock::ARENA_CHUNK); + + // the rest of memory if exist + if (block->has_right_buddy(cache_)) { + if (block->right_buddy(cache_)->type(cache_) == MemoryBlock::FREE_CHUNK) { + DLOG(INFO) << " Insert right block (" << block->right_buddy(cache_) + << ", " << block->right_buddy(cache_)->total_size(cache_) + << ")"; + + pool_.insert({block->right_buddy(cache_)->index(cache_), + block->right_buddy(cache_)->total_size(cache_), + block->right_buddy(cache_)}); + } + } + + return block; } } // namespace detail diff --git a/paddle/memory/detail/buddy_allocator.h b/paddle/memory/detail/buddy_allocator.h index 82e6aaedc7..38bedc9a18 100644 --- a/paddle/memory/detail/buddy_allocator.h +++ b/paddle/memory/detail/buddy_allocator.h @@ -15,9 +15,15 @@ #pragma once #include "paddle/memory/detail/system_allocator.h" +#include "paddle/memory/detail/metadata.h" +#include "paddle/platform/assert.h" +#include "paddle/platform/cpu_info.h" +#include "paddle/platform/gpu_info.h" +#include #include #include +#include namespace paddle { namespace memory { @@ -25,55 +31,83 @@ namespace detail { class BuddyAllocator { public: - BuddyAllocator(size_t pool_size, size_t max_pools, - SystemAllocator* system_allocator); + BuddyAllocator(SystemAllocator* system_allocator, size_t min_chunk_size, + size_t max_chunk_size); + ~BuddyAllocator(); - void* Alloc(size_t size); + public: + void* Alloc(size_t unaligned_size); void Free(void*); size_t Used(); + public: + // Disable copy and assignment. + BuddyAllocator(const BuddyAllocator&) = delete; + BuddyAllocator& operator=(const BuddyAllocator&) = delete; + private: - struct Block { - size_t size_; - Block* left_; // left buddy - Block* right_; // right buddy - }; + // Tuple type: allocator index, memory size, memory address + using IndexSizeAddress = std::tuple; + using PoolSet = std::set; - // Initially, there is only one pool. If a Alloc founds not enough - // memory from that pool, and there has not been max_num_pools_, - // create a new pool by calling system_allocator_.Alloc(pool_size_). - std::vector pools_; + /*! 
\brief Allocate fixed-size memory from system */ + void* SystemAlloc(size_t size); - size_t pool_size_; // the size of each pool; - size_t max_num_pools_; // the size of all pools; + /*! \brief If existing chunks are not suitable, refill pool */ + PoolSet::iterator RefillPool(); - SystemAllocator* system_allocator_; + /** + * \brief Find the suitable chunk from existing pool + * + * \param it pool iterator which contains suitable block. + * \param size the size of allocation. + */ + void* SplitToAlloc(PoolSet::iterator it, size_t size); - std::mutex mutex_; + /*! \brief Find the existing chunk which used to allocation */ + PoolSet::iterator FindExistChunk(size_t size); - // Disable copy and assignment. - BuddyAllocator(const BuddyAllocator&) = delete; - BuddyAllocator& operator=(const BuddyAllocator&) = delete; + private: + size_t total_used_ = 0; // the total size of used memory + size_t total_free_ = 0; // the total size of free memory + + size_t min_chunk_size_; // the minimum size of each chunk + size_t max_chunk_size_; // the maximum size of each chunk + + private: + PoolSet pool_; + + private: + // Unify the metadata format between GPU and CPU allocations + using MetadataCache = std::unordered_map; + MetadataCache cache_; + + private: + SystemAllocator* system_allocator_; + std::mutex mutex_; }; -BuddyAllocator* GetCPUBuddyAllocator() { - static BuddyAllocator* a = nullptr; +BuddyAllocator* GetCPUBuddyAllocator() { + static BuddyAllocator* a = nullptr; if (a == nullptr) { - a = new BuddyAllocator(); + a = new BuddyAllocator(new CPUAllocator, platform::CpuMinChunkSize(), + platform::CpuMaxChunkSize()); } return a; } #ifndef PADDLE_ONLY_CPU // The following code are for CUDA. -BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) { - static BuddyAllocator** as = NULL; +BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) { + static BuddyAllocator** as = NULL; if (as == NULL) { - int gpu_num = platform::GetDeviceCount(); - as = new BuddyAllocator*[gpu_num]; + int gpu_num = platform::GpuDeviceCount(); + as = new BuddyAllocator*[gpu_num]; for (int gpu = 0; gpu < gpu_num; gpu++) { - as[gpu] = new BuddyAllocator(); + as[gpu] = + new BuddyAllocator(new GPUAllocator, platform::GpuMinChunkSize(), + platform::GpuMaxChunkSize()); } } return as[gpu_id]; From 4e1617d05994bda1a9eb0e0b5b563249cc12f271 Mon Sep 17 00:00:00 2001 From: liaogang Date: Tue, 4 Jul 2017 13:15:00 +0800 Subject: [PATCH 14/64] ENH: add buddy alloctor Free --- paddle/memory/detail/CMakeLists.txt | 3 + paddle/memory/detail/buddy_allocator.cc | 19 ++-- paddle/memory/detail/buddy_allocator.h | 10 +- paddle/memory/detail/memory_block.cc | 145 ++++++++++++++++++++++++ paddle/memory/detail/memory_block.h | 97 ++++++++++++++++ paddle/memory/detail/system_allocator.h | 1 + 6 files changed, 262 insertions(+), 13 deletions(-) create mode 100644 paddle/memory/detail/memory_block.cc create mode 100644 paddle/memory/detail/memory_block.h diff --git a/paddle/memory/detail/CMakeLists.txt b/paddle/memory/detail/CMakeLists.txt index 6caa97a76b..dbc98a8a62 100644 --- a/paddle/memory/detail/CMakeLists.txt +++ b/paddle/memory/detail/CMakeLists.txt @@ -7,3 +7,6 @@ else(${WITH_GPU}) cc_library(system_allocator SRCS system_allocator.cc DEPS gflags) cc_test(system_allocator_test SRCS system_allocator_test.cc DEPS system_allocator gflags) endif(${WITH_GPU}) + +cc_library(metadata SRCS metadata.cc) +cc_library(buddy_allocator SRCS buddy_allocator.cc) diff --git a/paddle/memory/detail/buddy_allocator.cc b/paddle/memory/detail/buddy_allocator.cc index 
2462ba084b..e8d694327d 100644 --- a/paddle/memory/detail/buddy_allocator.cc +++ b/paddle/memory/detail/buddy_allocator.cc @@ -58,17 +58,16 @@ void* BuddyAllocator::Alloc(size_t unaligned_size) { // refill the pool if failure if (it == pool_.end()) { it = RefillPool(); + // if still failure, fail fatally + if (it == pool_.end()) { + return nullptr; + } } else { DLOG(INFO) << " Allocation from existing memory block " << std::get<2>(*it) << " at address " << reinterpret_cast(std::get<2>(*it))->data(); } - // if still failure, fail fatally - if (it == pool_.end()) { - return nullptr; - } - total_used_ += size; total_free_ -= size; @@ -76,6 +75,13 @@ void* BuddyAllocator::Alloc(size_t unaligned_size) { return reinterpret_cast(SplitToAlloc(it, size))->data(); } +void BuddyAllocator::Free(void* p) { + auto block = static_cast(p)->metadata(); + + // acquire the allocator lock + std::lock_guard lock(mutex_); +} + void* BuddyAllocator::SystemAlloc(size_t size) { size_t index = 0; void* p = system_allocator_->Alloc(index, size); @@ -140,17 +146,14 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::FindExistChunk(size_t size) { void* BuddyAllocator::SplitToAlloc(BuddyAllocator::PoolSet::iterator it, size_t size) { auto block = static_cast(std::get<2>(*it)); - pool_.erase(it); DLOG(INFO) << " Split block (" << block << ", " << block->total_size(cache_) << ") into"; - block->split(cache_, size); DLOG(INFO) << " Left block (" << block << ", " << block->total_size(cache_) << ")"; - block->set_type(cache_, MemoryBlock::ARENA_CHUNK); // the rest of memory if exist diff --git a/paddle/memory/detail/buddy_allocator.h b/paddle/memory/detail/buddy_allocator.h index 38bedc9a18..4006bdcce8 100644 --- a/paddle/memory/detail/buddy_allocator.h +++ b/paddle/memory/detail/buddy_allocator.h @@ -14,16 +14,16 @@ #pragma once -#include "paddle/memory/detail/system_allocator.h" #include "paddle/memory/detail/metadata.h" +#include "paddle/memory/detail/system_allocator.h" #include "paddle/platform/assert.h" #include "paddle/platform/cpu_info.h" #include "paddle/platform/gpu_info.h" -#include #include -#include +#include #include +#include namespace paddle { namespace memory { @@ -57,9 +57,9 @@ class BuddyAllocator { /*! \brief If existing chunks are not suitable, refill pool */ PoolSet::iterator RefillPool(); - /** + /** * \brief Find the suitable chunk from existing pool - * + * * \param it pool iterator which contains suitable block. * \param size the size of allocation. 
*/ diff --git a/paddle/memory/detail/memory_block.cc b/paddle/memory/detail/memory_block.cc new file mode 100644 index 0000000000..1c9e87df49 --- /dev/null +++ b/paddle/memory/detail/memory_block.cc @@ -0,0 +1,145 @@ +#include "paddle/memory/detail/memory_block.h" +#include "paddle/platform/assert.h" + +namespace paddle { +namespace memory { +namespace detail { + +void MemoryBlock::init(MetadataCache& cache, Type t, size_t index, size_t size, + void* left_buddy, void* right_buddy) { + cache.store(this, + MemoryBlockMetadata(t, index, size - overhead(), size, + static_cast(left_buddy), + static_cast(right_buddy))); +} + +MemoryBlock::Type MemoryBlock::type(MetadataCache& cache) const { + return cache.load(this).type; +} + +size_t MemoryBlock::size(MetadataCache& cache) const { + return cache.load(this).size; +} + +size_t MemoryBlock::total_size(MetadataCache& cache) const { + return cache.load(this).total_size; +} + +MemoryBlock* MemoryBlock::left_buddy(MetadataCache& cache) const { + return cache.load(this).left_buddy; +} + +MemoryBlock* MemoryBlock::right_buddy(MetadataCache& cache) const { + return cache.load(this).right_buddy; +} + +void MemoryBlock::split(MetadataCache& cache, size_t size) { + // make sure the split fits + assert(total_size(cache) >= size); + + // bail out if there is no room for another partition + if (total_size(cache) - size <= overhead()) { + return; + } + + // find the position of the split + void* right_partition = reinterpret_cast(this) + size; + + size_t remaining_size = total_size(cache) - size; + + // Add the new block as a buddy + auto metadata = cache.load(this); + + // Write the metadata for the new block + auto new_block_right_buddy = metadata.right_buddy; + + cache.store(static_cast(right_partition), + MemoryBlockMetadata(FREE_MEMORY, index(cache), + remaining_size - overhead(), remaining_size, + this, new_block_right_buddy)); + + metadata.right_buddy = static_cast(right_partition); + metadata.size = size - overhead(); + metadata.total_size = size; + + cache.store(this, metadata); + + // Write metadata for the new block's right buddy + if (new_block_right_buddy != nullptr) { + auto buddy_metadata = cache.load(new_block_right_buddy); + + buddy_metadata.left_buddy = static_cast(right_partition); + + cache.store(new_block_right_buddy, buddy_metadata); + } +} + +void MemoryBlock::merge(MetadataCache& cache, MemoryBlock* right_buddy) { + // only free blocks can be merged + assert(type(cache) == FREE_MEMORY); + assert(right_buddy->type(cache) == FREE_MEMORY); + + auto metadata = cache.load(this); + + // link this->buddy's buddy + metadata.right_buddy = right_buddy->right_buddy(cache); + + // link buddy's buddy -> this + if (metadata.right_buddy != nullptr) { + auto buddy_metadata = cache.load(metadata.right_buddy); + + buddy_metadata.left_buddy = this; + + cache.store(metadata.right_buddy, buddy_metadata); + } + + metadata.size += right_buddy->total_size(cache); + metadata.total_size += right_buddy->total_size(cache); + + cache.store(this, metadata); + cache.store(right_buddy, + MemoryBlockMetadata(INVALID_MEMORY, 0, 0, 0, nullptr, nullptr)); +} + +void MemoryBlock::mark_as_free(MetadataCache& cache) { + // check for double free or corruption + assert(type(cache) != FREE_MEMORY); + assert(type(cache) != INVALID_MEMORY); + + set_type(cache, FREE_MEMORY); +} + +void MemoryBlock::set_type(MetadataCache& cache, Type t) { + auto metadata = cache.load(this); + + metadata.type = t; + + cache.store(this, metadata); +} + +bool MemoryBlock::has_left_buddy(MetadataCache& 
cache) const { + return left_buddy(cache) != nullptr; +} + +bool MemoryBlock::has_right_buddy(MetadataCache& cache) const { + return right_buddy(cache) != nullptr; +} + +size_t MemoryBlock::index(MetadataCache& cache) const { + return cache.load(this).index; +} + +void* MemoryBlock::data() const { + return const_cast( + reinterpret_cast(this)) + + 1; +} + +MemoryBlock* MemoryBlock::metadata() const { + return const_cast(reinterpret_cast( + reinterpret_cast(this) - 1)); +} + +} // detail +} // memory +} // paddle diff --git a/paddle/memory/detail/memory_block.h b/paddle/memory/detail/memory_block.h new file mode 100644 index 0000000000..e2d39c31cf --- /dev/null +++ b/paddle/memory/detail/memory_block.h @@ -0,0 +1,97 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#pragma once + +#include "paddle/memory/detail/metadata.h" + +#include +#include + +namespace paddle { +namespace memory { +namespace detail { + +// Forward Declaration +class Metadata; + +/*! \brief A class used to interpret the contents of a memory block */ +class MemoryBlock { + public: + // Unify the metadata format between GPU and CPU allocations + using MetadataCache = std::unordered_map; + + enum Type { + FREE_CHUNK, // memory is free and idle + ARENA_CHUNK, // memory is being occupied + HUGE_CHUNK, // memory is out of management + INVALID_CHUNK // memory is invalid + }; + + public: + void init(MetadataCache& cache, Type t, size_t index, size_t size, + void* left_buddy, void* right_buddy); + + public: + /*! \brief The type of the allocation */ + Type type(MetadataCache& cache) const; + + /*! \brief The size of the data region */ + size_t size(MetadataCache& cache) const; + + /*! \brief An index to track the allocator */ + size_t index(MetadataCache& cache) const; + + /*! \brief The total size of the block */ + size_t total_size(MetadataCache& cache) const; + + /*! \brief Check the left buddy of the block */ + bool has_left_buddy(MetadataCache& cache) const; + + /*! \brief Check the right buddy of the block */ + bool has_right_buddy(MetadataCache& cache) const; + + /*! \brief Get the left buddy */ + MemoryBlock* left_buddy(MetadataCache& cache) const; + + /*! \brief Get the right buddy */ + MemoryBlock* right_buddy(MetadataCache& cache) const; + + public: + /*! \brief Split the allocation into left/right blocks */ + void split(MetadataCache& cache, size_t size); + + /*! \brief Merge left and right blocks together */ + void merge(MetadataCache& cache, MemoryBlock* right_buddy); + + /*! \brief Mark the allocation as free */ + void mark_as_free(MetadataCache& cache); + + /*! \brief Change the type of the allocation */ + void set_type(MetadataCache& cache, Type t); + + public: + /*! \brief Get a pointer to the memory block's data */ + void* data() const; + + /*! 
\brief Get a pointer to the memory block's metadata */ + MemoryBlock* metadata() const; + + public: + static size_t overhead(); +}; + +} // namespace detail +} // namespace memory +} // namespace paddle diff --git a/paddle/memory/detail/system_allocator.h b/paddle/memory/detail/system_allocator.h index f3bbfef843..555061a533 100644 --- a/paddle/memory/detail/system_allocator.h +++ b/paddle/memory/detail/system_allocator.h @@ -48,6 +48,7 @@ class GPUAllocator : public SystemAllocator { virtual void* Alloc(size_t& index, size_t size); virtual void Free(void* p, size_t size, size_t index); virtual bool UseGpu(); + private: size_t gpu_alloc_size_ = 0; size_t fallback_alloc_size_ = 0; From ff36389452c1af6cc6a5f03b5ca52404ab20f108 Mon Sep 17 00:00:00 2001 From: liaogang Date: Tue, 4 Jul 2017 15:21:24 +0800 Subject: [PATCH 15/64] ENH: code style --- paddle/memory/detail/buddy_allocator.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/paddle/memory/detail/buddy_allocator.cc b/paddle/memory/detail/buddy_allocator.cc index e8d694327d..eddfd9d13c 100644 --- a/paddle/memory/detail/buddy_allocator.cc +++ b/paddle/memory/detail/buddy_allocator.cc @@ -48,7 +48,6 @@ void* BuddyAllocator::Alloc(size_t unaligned_size) { // if the allocation is huge, send directly to the system allocator if (size > max_chunk_size_) { DLOG(INFO) << "Allocate from system allocator."; - return SystemAlloc(size); } From 379434b243faeaf9fd4d38cf9f95dfe45cc563d5 Mon Sep 17 00:00:00 2001 From: liaogang Date: Tue, 4 Jul 2017 17:21:09 +0800 Subject: [PATCH 16/64] Delete cmake in dynload --- paddle/platform/dynload/CMakeLists.txt | 1 - 1 file changed, 1 deletion(-) delete mode 100644 paddle/platform/dynload/CMakeLists.txt diff --git a/paddle/platform/dynload/CMakeLists.txt b/paddle/platform/dynload/CMakeLists.txt deleted file mode 100644 index 9f829b7012..0000000000 --- a/paddle/platform/dynload/CMakeLists.txt +++ /dev/null @@ -1 +0,0 @@ -cc_library(dynamic_loader SRCS dynamic_loader.cc DEPS glog gflags) From 0ba63475659822bd146f1f1dcfc7eabca8b7047d Mon Sep 17 00:00:00 2001 From: liaogang Date: Tue, 4 Jul 2017 21:23:23 +0800 Subject: [PATCH 17/64] ENH: Add buddy allocator Free --- paddle/memory/detail/CMakeLists.txt | 9 ++-- paddle/memory/detail/buddy_allocator.cc | 41 ++++++++++++++++++- paddle/memory/detail/buddy_allocator.h | 2 +- paddle/memory/detail/memory_block.h | 2 +- .../detail/{metadata.cc => meta_data.cc} | 2 +- .../memory/detail/{metadata.h => meta_data.h} | 0 paddle/platform/cpu_info.h | 10 ----- paddle/platform/gpu_info.cc | 13 ++++++ paddle/platform/gpu_info.h | 6 +++ 9 files changed, 65 insertions(+), 20 deletions(-) rename paddle/memory/detail/{metadata.cc => meta_data.cc} (97%) rename paddle/memory/detail/{metadata.h => meta_data.h} (100%) diff --git a/paddle/memory/detail/CMakeLists.txt b/paddle/memory/detail/CMakeLists.txt index dbc98a8a62..c3167cd30a 100644 --- a/paddle/memory/detail/CMakeLists.txt +++ b/paddle/memory/detail/CMakeLists.txt @@ -1,12 +1,9 @@ if(${WITH_GPU}) - nv_library(system_allocator SRCS system_allocator.cc DEPS gflags) - nv_test(system_allocator_test - SRCS system_allocator_test.cc - DEPS system_allocator gpu_info gflags) + nv_library(system_allocator SRCS system_allocator.cc DEPS gflags gpu_info) else(${WITH_GPU}) cc_library(system_allocator SRCS system_allocator.cc DEPS gflags) - cc_test(system_allocator_test SRCS system_allocator_test.cc DEPS system_allocator gflags) endif(${WITH_GPU}) +cc_test(system_allocator_test SRCS system_allocator_test.cc DEPS system_allocator) -cc_library(metadata 
SRCS metadata.cc) +cc_library(meta_data SRCS meta_data.cc) cc_library(buddy_allocator SRCS buddy_allocator.cc) diff --git a/paddle/memory/detail/buddy_allocator.cc b/paddle/memory/detail/buddy_allocator.cc index eddfd9d13c..f677feda0d 100644 --- a/paddle/memory/detail/buddy_allocator.cc +++ b/paddle/memory/detail/buddy_allocator.cc @@ -75,10 +75,49 @@ void* BuddyAllocator::Alloc(size_t unaligned_size) { } void BuddyAllocator::Free(void* p) { + // Point back to metadata auto block = static_cast(p)->metadata(); - // acquire the allocator lock + // Acquire the allocator lock std::lock_guard lock(mutex_); + + DLOG(INFO) << "Free from address " << block; + + if (block->type(cache_) == MemoryBlock::HUGE_CHUNK) { + DLOG(INFO) << "Free directly from system allocator"; + system_allocator_->Free(block, block->total_size(cache_), + block->index(cache_)); + + // Invalidate GPU allocation from cache + if (system_allocator_->UseGpu()) { + cache_.erase(block); + } + return; + } + + block->mark_as_free(cache_); + + total_used_ -= block->total_size(cache_); + total_free_ += block->total_size(cache_); + + // Trying to merge the right buddy + if (block->has_right_buddy(cache_)) { + DLOG(INFO) << "Merging this block " << block << " with its right buddy " + << block->right_buddy(cache_); + } + + // Trying to merge the left buddy + if (block->has_left_buddy(cache_)) { + DLOG(INFO) << "Merging this block " << block << " with its left buddy " + << block->left_buddy(cache_); + } + + // Dumping this block into pool + DLOG(INFO) << "Inserting free block (" << block << ", " + << block->total_size(cache_) << ")"; + pool_.insert({block->index(cache_), block->total_size(cache_), block}); + + // TODO(gangliao): Clean up if existing too much free memory } void* BuddyAllocator::SystemAlloc(size_t size) { diff --git a/paddle/memory/detail/buddy_allocator.h b/paddle/memory/detail/buddy_allocator.h index 4006bdcce8..49bd6cf901 100644 --- a/paddle/memory/detail/buddy_allocator.h +++ b/paddle/memory/detail/buddy_allocator.h @@ -14,7 +14,7 @@ #pragma once -#include "paddle/memory/detail/metadata.h" +#include "paddle/memory/detail/meta_data.h" #include "paddle/memory/detail/system_allocator.h" #include "paddle/platform/assert.h" #include "paddle/platform/cpu_info.h" diff --git a/paddle/memory/detail/memory_block.h b/paddle/memory/detail/memory_block.h index e2d39c31cf..2945520113 100644 --- a/paddle/memory/detail/memory_block.h +++ b/paddle/memory/detail/memory_block.h @@ -14,7 +14,7 @@ #pragma once -#include "paddle/memory/detail/metadata.h" +#include "paddle/memory/detail/meta_data.h" #include #include diff --git a/paddle/memory/detail/metadata.cc b/paddle/memory/detail/meta_data.cc similarity index 97% rename from paddle/memory/detail/metadata.cc rename to paddle/memory/detail/meta_data.cc index 4607cd8512..a3b7a9b4fe 100644 --- a/paddle/memory/detail/metadata.cc +++ b/paddle/memory/detail/meta_data.cc @@ -12,7 +12,7 @@ See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/memory/detail/metadata.h" +#include "paddle/memory/detail/meta_data.h" #include diff --git a/paddle/memory/detail/metadata.h b/paddle/memory/detail/meta_data.h similarity index 100% rename from paddle/memory/detail/metadata.h rename to paddle/memory/detail/meta_data.h diff --git a/paddle/platform/cpu_info.h b/paddle/platform/cpu_info.h index edd76517a6..8df7c7b4bc 100644 --- a/paddle/platform/cpu_info.h +++ b/paddle/platform/cpu_info.h @@ -28,15 +28,5 @@ size_t CpuMinChunkSize(); //! 
Get the maximum chunk size for buddy allocator. size_t CpuMaxChunkSize(); -int GetCurrentDeviceId(void) { - int device_id; - throw_on_error(cudaGetDevice(&device_id), "cudaGetDevice failed"); - return device_id; -} - -void SetDeviceId(int device_id) { - throw_on_error(cudaSetDevice(device_id), "cudaSetDevice failed"); -} - } // namespace platform } // namespace paddle diff --git a/paddle/platform/gpu_info.cc b/paddle/platform/gpu_info.cc index d6c6fe34ef..05a243c506 100644 --- a/paddle/platform/gpu_info.cc +++ b/paddle/platform/gpu_info.cc @@ -31,6 +31,19 @@ int GpuDeviceCount() { return count; } +int GetCurrentDeviceId() { + int device_id; + throw_on_error( + cudaGetDevice(&device_id), + "cudaGetDevice failed in paddle::platform::GetCurrentDeviceId"); + return device_id; +} + +void SetDeviceId(int id) { + throw_on_error(cudaSetDevice(id), + "cudaSetDevice failed in paddle::platform::SetDeviceId"); +} + void GpuMemoryUsage(size_t& available, size_t& total) { throw_on_error(cudaMemGetInfo(&available, &total), "cudaMemGetInfo failed in paddle::platform::GetMemoryUsage"); diff --git a/paddle/platform/gpu_info.h b/paddle/platform/gpu_info.h index d7bf0f4093..81ee5f6e0a 100644 --- a/paddle/platform/gpu_info.h +++ b/paddle/platform/gpu_info.h @@ -24,6 +24,12 @@ namespace platform { //! Get the total number of GPU devices in system. int GpuDeviceCount(); +//! Get the current GPU device id in system. +int GetCurrentDeviceId(); + +//! Set the GPU device id for next execution. +void SetDeviceId(int device_id); + //!Get the memory usage of current GPU device. void GpuMemoryUsage(size_t& available, size_t& total); From 4dc3c9e0cc1b6ec5dbc324f4804974247ca6506f Mon Sep 17 00:00:00 2001 From: liaogang Date: Tue, 4 Jul 2017 23:28:15 +0800 Subject: [PATCH 18/64] ENH: Add paddle_memory for external usage --- paddle/memory/CMakeLists.txt | 10 ++++ paddle/memory/detail/CMakeLists.txt | 6 +++ paddle/memory/detail/buddy_allocator.cc | 12 ++--- paddle/memory/detail/buddy_allocator.h | 2 +- paddle/memory/detail/memory_block.cc | 56 +++++++++++-------- paddle/memory/detail/memory_block.h | 10 +--- paddle/memory/detail/meta_cache.cc | 57 ++++++++++++++++++++ paddle/memory/detail/meta_cache.h | 71 +++++++++++++++++++++++++ paddle/memory/detail/meta_data.cc | 8 +++ paddle/memory/detail/meta_data.h | 1 + 10 files changed, 196 insertions(+), 37 deletions(-) create mode 100644 paddle/memory/detail/meta_cache.cc create mode 100644 paddle/memory/detail/meta_cache.h diff --git a/paddle/memory/CMakeLists.txt b/paddle/memory/CMakeLists.txt index 3943c3cfad..8c290712fc 100644 --- a/paddle/memory/CMakeLists.txt +++ b/paddle/memory/CMakeLists.txt @@ -1 +1,11 @@ add_subdirectory(detail) + +cc_library(memory + SRCS + memory.cc) + +cc_library(paddle_memory + DEPS + memory meta_data + meta_cache memory_block + buddy_allocator system_allocator) diff --git a/paddle/memory/detail/CMakeLists.txt b/paddle/memory/detail/CMakeLists.txt index c3167cd30a..4fdabc8eeb 100644 --- a/paddle/memory/detail/CMakeLists.txt +++ b/paddle/memory/detail/CMakeLists.txt @@ -3,7 +3,13 @@ if(${WITH_GPU}) else(${WITH_GPU}) cc_library(system_allocator SRCS system_allocator.cc DEPS gflags) endif(${WITH_GPU}) + cc_test(system_allocator_test SRCS system_allocator_test.cc DEPS system_allocator) cc_library(meta_data SRCS meta_data.cc) + +cc_library(meta_cache SRCS meta_cache.cc) + +cc_library(memory_block SRCS memory_block.cc) + cc_library(buddy_allocator SRCS buddy_allocator.cc) diff --git a/paddle/memory/detail/buddy_allocator.cc 
b/paddle/memory/detail/buddy_allocator.cc index f677feda0d..aa5b6b557c 100644 --- a/paddle/memory/detail/buddy_allocator.cc +++ b/paddle/memory/detail/buddy_allocator.cc @@ -20,14 +20,14 @@ namespace memory { namespace detail { BuddyAllocator::BuddyAllocator(SystemAllocator* system_allocator, - size_t min_chunk_size, size_t max_chunk_size) { + size_t min_chunk_size, size_t max_chunk_size) + : min_chunk_size_(min_chunk_size), + max_chunk_size_(max_chunk_size), + cache_(system_allocator->UseGpu()), + system_allocator_(std::move(system_allocator)) { PADDLE_ASSERT(min_chunk_size > 0); PADDLE_ASSERT(max_chunk_size > 0); PADDLE_ASSERT(system_allocator != nullptr); - - system_allocator_ = std::move(system_allocator); - min_chunk_size_ = min_chunk_size; - max_chunk_size_ = max_chunk_size; } inline size_t align(size_t size, size_t alignment) { @@ -90,7 +90,7 @@ void BuddyAllocator::Free(void* p) { // Invalidate GPU allocation from cache if (system_allocator_->UseGpu()) { - cache_.erase(block); + cache_.invalidate(block); } return; } diff --git a/paddle/memory/detail/buddy_allocator.h b/paddle/memory/detail/buddy_allocator.h index 49bd6cf901..ecf23b77ae 100644 --- a/paddle/memory/detail/buddy_allocator.h +++ b/paddle/memory/detail/buddy_allocator.h @@ -14,6 +14,7 @@ #pragma once +#include "paddle/memory/detail/meta_cache.h" #include "paddle/memory/detail/meta_data.h" #include "paddle/memory/detail/system_allocator.h" #include "paddle/platform/assert.h" @@ -80,7 +81,6 @@ class BuddyAllocator { private: // Unify the metadata format between GPU and CPU allocations - using MetadataCache = std::unordered_map; MetadataCache cache_; private: diff --git a/paddle/memory/detail/memory_block.cc b/paddle/memory/detail/memory_block.cc index 1c9e87df49..eaa97e7b4a 100644 --- a/paddle/memory/detail/memory_block.cc +++ b/paddle/memory/detail/memory_block.cc @@ -1,4 +1,20 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + #include "paddle/memory/detail/memory_block.h" +#include "paddle/memory/detail/meta_cache.h" +#include "paddle/memory/detail/meta_data.h" #include "paddle/platform/assert.h" namespace paddle { @@ -7,10 +23,9 @@ namespace detail { void MemoryBlock::init(MetadataCache& cache, Type t, size_t index, size_t size, void* left_buddy, void* right_buddy) { - cache.store(this, - MemoryBlockMetadata(t, index, size - overhead(), size, - static_cast(left_buddy), - static_cast(right_buddy))); + cache.store(this, Metadata(t, index, size - sizeof(Metadata), size, + static_cast(left_buddy), + static_cast(right_buddy))); } MemoryBlock::Type MemoryBlock::type(MetadataCache& cache) const { @@ -35,10 +50,10 @@ MemoryBlock* MemoryBlock::right_buddy(MetadataCache& cache) const { void MemoryBlock::split(MetadataCache& cache, size_t size) { // make sure the split fits - assert(total_size(cache) >= size); + PADDLE_ASSERT(total_size(cache) >= size); // bail out if there is no room for another partition - if (total_size(cache) - size <= overhead()) { + if (total_size(cache) - size <= sizeof(Metadata)) { return; } @@ -53,13 +68,13 @@ void MemoryBlock::split(MetadataCache& cache, size_t size) { // Write the metadata for the new block auto new_block_right_buddy = metadata.right_buddy; - cache.store(static_cast(right_partition), - MemoryBlockMetadata(FREE_MEMORY, index(cache), - remaining_size - overhead(), remaining_size, - this, new_block_right_buddy)); + cache.store( + static_cast(right_partition), + Metadata(FREE_CHUNK, index(cache), remaining_size - sizeof(Metadata), + remaining_size, this, new_block_right_buddy)); metadata.right_buddy = static_cast(right_partition); - metadata.size = size - overhead(); + metadata.size = size - sizeof(Metadata); metadata.total_size = size; cache.store(this, metadata); @@ -76,8 +91,8 @@ void MemoryBlock::split(MetadataCache& cache, size_t size) { void MemoryBlock::merge(MetadataCache& cache, MemoryBlock* right_buddy) { // only free blocks can be merged - assert(type(cache) == FREE_MEMORY); - assert(right_buddy->type(cache) == FREE_MEMORY); + PADDLE_ASSERT(type(cache) == FREE_MEMORY); + PADDLE_ASSERT(right_buddy->type(cache) == FREE_MEMORY); auto metadata = cache.load(this); @@ -97,16 +112,15 @@ void MemoryBlock::merge(MetadataCache& cache, MemoryBlock* right_buddy) { metadata.total_size += right_buddy->total_size(cache); cache.store(this, metadata); - cache.store(right_buddy, - MemoryBlockMetadata(INVALID_MEMORY, 0, 0, 0, nullptr, nullptr)); + cache.store(right_buddy, Metadata(INVALID_CHUNK, 0, 0, 0, nullptr, nullptr)); } void MemoryBlock::mark_as_free(MetadataCache& cache) { // check for double free or corruption - assert(type(cache) != FREE_MEMORY); - assert(type(cache) != INVALID_MEMORY); + PADDLE_ASSERT(type(cache) != FREE_CHUNK); + PADDLE_ASSERT(type(cache) != INVALID_CHUNK); - set_type(cache, FREE_MEMORY); + set_type(cache, FREE_CHUNK); } void MemoryBlock::set_type(MetadataCache& cache, Type t) { @@ -130,14 +144,12 @@ size_t MemoryBlock::index(MetadataCache& cache) const { } void* MemoryBlock::data() const { - return const_cast( - reinterpret_cast(this)) + - 1; + return const_cast(reinterpret_cast(this)) + 1; } MemoryBlock* MemoryBlock::metadata() const { return const_cast(reinterpret_cast( - reinterpret_cast(this) - 1)); + reinterpret_cast(this) - 1)); } } // detail diff --git a/paddle/memory/detail/memory_block.h b/paddle/memory/detail/memory_block.h index 2945520113..a5168b519f 100644 --- a/paddle/memory/detail/memory_block.h +++ b/paddle/memory/detail/memory_block.h @@ 
-14,24 +14,18 @@ #pragma once -#include "paddle/memory/detail/meta_data.h" - #include -#include namespace paddle { namespace memory { namespace detail { -// Forward Declaration -class Metadata; +// Forward Declarations +class MetadataCache; /*! \brief A class used to interpret the contents of a memory block */ class MemoryBlock { public: - // Unify the metadata format between GPU and CPU allocations - using MetadataCache = std::unordered_map; - enum Type { FREE_CHUNK, // memory is free and idle ARENA_CHUNK, // memory is being occupied diff --git a/paddle/memory/detail/meta_cache.cc b/paddle/memory/detail/meta_cache.cc new file mode 100644 index 0000000000..189ab4fc7b --- /dev/null +++ b/paddle/memory/detail/meta_cache.cc @@ -0,0 +1,57 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include "paddle/memory/detail/meta_cache.h" +#include "paddle/memory/detail/memory_block.h" +#include "paddle/platform/assert.h" + +namespace paddle { +namespace memory { +namespace detail { + +MetadataCache::MetadataCache(bool uses_gpu) : uses_gpu_(uses_gpu) {} + +Metadata MetadataCache::load(const MemoryBlock* block) { + if (uses_gpu_) { + auto existing_metadata = cache_.find(block); + assert(existing_metadata->second.check_guards()); + return existing_metadata->second; + } else { + PADDLE_ASSERT(reinterpret_cast(block)->check_guards()); + return *reinterpret_cast(block); + } +} + +void MetadataCache::store(MemoryBlock* block, + const Metadata& original_metadata) { + auto metadata = original_metadata; + + metadata.update_guards(); + + if (uses_gpu_) { + cache_[block] = metadata; + } else { + *reinterpret_cast(block) = metadata; + } +} + +void MetadataCache::invalidate(MemoryBlock* block) { + if (uses_gpu_) { + cache_.erase(block); + } +} + +} // namespace detail +} // namespace memory +} // namespace paddle diff --git a/paddle/memory/detail/meta_cache.h b/paddle/memory/detail/meta_cache.h new file mode 100644 index 0000000000..3ca1020d22 --- /dev/null +++ b/paddle/memory/detail/meta_cache.h @@ -0,0 +1,71 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#pragma once + +#include "paddle/memory/detail/memory_block.h" +#include "paddle/memory/detail/meta_data.h" + +#include + +namespace paddle { +namespace memory { +namespace detail { + +/*! A cache for accessing memory block meta-data that may be expensive to access + directly. 
+ + Note: this class exists to unify the metadata format between GPU and CPU + allocations. + It should be removed when the CPU can access all GPU allocations directly + via UVM. +*/ +class MetadataCache { + public: + MetadataCache(bool uses_gpu); + + public: + /*! \brief Load the associated metadata for the specified memory block. */ + Metadata load(const MemoryBlock*); + + /*! \brief Store the associated metadata for the specified memory block. */ + void store(MemoryBlock*, const Metadata&); + + public: + /*! \brief Acquire any external metadata updates. */ + void acquire(MemoryBlock*); + + /*! \brief Publish any local updates externally. */ + void release(MemoryBlock*); + + /*! \brief Indicate that the specified metadata will no longer be used */ + void invalidate(MemoryBlock*); + + public: + MetadataCache(const MetadataCache&) = delete; + MetadataCache& operator=(const MetadataCache&) = delete; + + private: + bool uses_gpu_; + + private: + typedef std::unordered_map MetadataMap; + + private: + MetadataMap cache_; +}; + +} // namespace detail +} // namespace memory +} // namespace paddle diff --git a/paddle/memory/detail/meta_data.cc b/paddle/memory/detail/meta_data.cc index a3b7a9b4fe..70c5c1f439 100644 --- a/paddle/memory/detail/meta_data.cc +++ b/paddle/memory/detail/meta_data.cc @@ -29,6 +29,14 @@ Metadata::Metadata(MemoryBlock::Type t, size_t i, size_t s, size_t ts, left_buddy(l), right_buddy(r) {} +Metadata::Metadata() + : type(MemoryBlock::INVALID_CHUNK), + index(0), + size(0), + total_size(0), + left_buddy(nullptr), + right_buddy(nullptr) {} + template inline void hash_combine(std::size_t& seed, const T& v) { std::hash hasher; diff --git a/paddle/memory/detail/meta_data.h b/paddle/memory/detail/meta_data.h index ddb826571b..628cf1f2e3 100644 --- a/paddle/memory/detail/meta_data.h +++ b/paddle/memory/detail/meta_data.h @@ -26,6 +26,7 @@ class Metadata { public: Metadata(MemoryBlock::Type t, size_t i, size_t s, size_t ts, MemoryBlock* l, MemoryBlock* r); + Metadata(); public: /*! 
\brief Update the guards when metadata is changed */ From d0ad0314bb868b9e0c1aa77f74ca0d2d3e8b8ef0 Mon Sep 17 00:00:00 2001 From: liaogang Date: Wed, 5 Jul 2017 16:33:18 +0800 Subject: [PATCH 19/64] FIX: glog dependency --- paddle/memory/detail/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/memory/detail/CMakeLists.txt b/paddle/memory/detail/CMakeLists.txt index 4fdabc8eeb..6cb6422e47 100644 --- a/paddle/memory/detail/CMakeLists.txt +++ b/paddle/memory/detail/CMakeLists.txt @@ -12,4 +12,4 @@ cc_library(meta_cache SRCS meta_cache.cc) cc_library(memory_block SRCS memory_block.cc) -cc_library(buddy_allocator SRCS buddy_allocator.cc) +cc_library(buddy_allocator SRCS buddy_allocator.cc DEPS glog) From ada1c20bbc2520d566b7d2bd2a56cf94cbcddd27 Mon Sep 17 00:00:00 2001 From: liaogang Date: Wed, 5 Jul 2017 19:16:02 +0800 Subject: [PATCH 20/64] FIX: Buddy Allocator Free with Merge feature --- paddle/memory/detail/buddy_allocator.cc | 33 ++++++++++++++++++++++--- paddle/memory/detail/buddy_allocator.h | 15 +++++++---- paddle/memory/detail/memory_block.cc | 4 +-- paddle/platform/CMakeLists.txt | 2 +- 4 files changed, 42 insertions(+), 12 deletions(-) diff --git a/paddle/memory/detail/buddy_allocator.cc b/paddle/memory/detail/buddy_allocator.cc index aa5b6b557c..9f334a7048 100644 --- a/paddle/memory/detail/buddy_allocator.cc +++ b/paddle/memory/detail/buddy_allocator.cc @@ -89,9 +89,8 @@ void BuddyAllocator::Free(void* p) { block->index(cache_)); // Invalidate GPU allocation from cache - if (system_allocator_->UseGpu()) { - cache_.invalidate(block); - } + cache_.invalidate(block); + return; } @@ -104,12 +103,35 @@ void BuddyAllocator::Free(void* p) { if (block->has_right_buddy(cache_)) { DLOG(INFO) << "Merging this block " << block << " with its right buddy " << block->right_buddy(cache_); + + auto right_buddy = block->right_buddy(cache_); + + if (right_buddy->type(cache_) == MemoryBlock::FREE_CHUNK) { + // Take away right buddy from pool + pool_.erase({right_buddy->index(cache_), right_buddy->total_size(cache_), + right_buddy}); + + // merge its right buddy to the block + block->merge(cache_, right_buddy); + } } // Trying to merge the left buddy if (block->has_left_buddy(cache_)) { DLOG(INFO) << "Merging this block " << block << " with its left buddy " << block->left_buddy(cache_); + + auto left_buddy = block->left_buddy(cache_); + + if (left_buddy->type(cache_) == MemoryBlock::FREE_CHUNK) { + // Take away right buddy from pool + pool_.erase({left_buddy->index(cache_), left_buddy->total_size(cache_), + left_buddy}); + + // merge the block to its left buddy + left_buddy->merge(cache_, block); + block = left_buddy; + } } // Dumping this block into pool @@ -167,13 +189,16 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::FindExistChunk(size_t size) { while (1) { auto it = pool_.lower_bound({index, size, nullptr}); + + // no match chunk memory if (it == pool_.end()) return it; if (std::get<0>(*it) > index) { + // find suitable one if (std::get<1>(*it) >= size) { return it; } - + // update and continue index = std::get<0>(*it); continue; } diff --git a/paddle/memory/detail/buddy_allocator.h b/paddle/memory/detail/buddy_allocator.h index ecf23b77ae..2fd9c8162a 100644 --- a/paddle/memory/detail/buddy_allocator.h +++ b/paddle/memory/detail/buddy_allocator.h @@ -42,14 +42,14 @@ class BuddyAllocator { void Free(void*); size_t Used(); - public: + private: // Disable copy and assignment. 
BuddyAllocator(const BuddyAllocator&) = delete; BuddyAllocator& operator=(const BuddyAllocator&) = delete; - private: - // Tuple type: allocator index, memory size, memory address + // Tuple (allocator index, memory size, memory address) using IndexSizeAddress = std::tuple; + // Each element in PoolSet is a free allocation using PoolSet = std::set; /*! \brief Allocate fixed-size memory from system */ @@ -57,7 +57,6 @@ class BuddyAllocator { /*! \brief If existing chunks are not suitable, refill pool */ PoolSet::iterator RefillPool(); - /** * \brief Find the suitable chunk from existing pool * @@ -77,13 +76,19 @@ class BuddyAllocator { size_t max_chunk_size_; // the maximum size of each chunk private: + /** + * \brief A list of free allocation + * + * \note Only store free chunk memory in pool + */ PoolSet pool_; private: - // Unify the metadata format between GPU and CPU allocations + /*! Unify the metadata format between GPU and CPU allocations */ MetadataCache cache_; private: + /*! Allocate CPU/GPU memory from system */ SystemAllocator* system_allocator_; std::mutex mutex_; }; diff --git a/paddle/memory/detail/memory_block.cc b/paddle/memory/detail/memory_block.cc index eaa97e7b4a..bc67bcef0f 100644 --- a/paddle/memory/detail/memory_block.cc +++ b/paddle/memory/detail/memory_block.cc @@ -91,8 +91,8 @@ void MemoryBlock::split(MetadataCache& cache, size_t size) { void MemoryBlock::merge(MetadataCache& cache, MemoryBlock* right_buddy) { // only free blocks can be merged - PADDLE_ASSERT(type(cache) == FREE_MEMORY); - PADDLE_ASSERT(right_buddy->type(cache) == FREE_MEMORY); + PADDLE_ASSERT(type(cache) == FREE_CHUNK); + PADDLE_ASSERT(right_buddy->type(cache) == FREE_CHUNK); auto metadata = cache.load(this); diff --git a/paddle/platform/CMakeLists.txt b/paddle/platform/CMakeLists.txt index 2f3d1c061e..0ad11f1b10 100644 --- a/paddle/platform/CMakeLists.txt +++ b/paddle/platform/CMakeLists.txt @@ -6,4 +6,4 @@ nv_library(gpu_info SRCS gpu_info.cc DEPS gflags) cc_library(place SRCS place.cc) cc_test(place_test SRCS place_test.cc DEPS place glog gflags) -cc_library(dynamic_loader SRCS dynload/dynamic_loader.cc) +cc_library(dynamic_loader SRCS dynload/dynamic_loader.cc DEPS gflags) From 74691789e9e5ee782adb003642f66699603b20e2 Mon Sep 17 00:00:00 2001 From: liaogang Date: Thu, 6 Jul 2017 01:16:03 +0800 Subject: [PATCH 21/64] ENH: add memory unit test --- paddle/memory/CMakeLists.txt | 6 ++-- paddle/memory/detail/CMakeLists.txt | 4 +-- paddle/memory/detail/buddy_allocator.cc | 33 +++++++++++------ paddle/memory/detail/buddy_allocator.h | 27 -------------- paddle/memory/memory.cc | 42 ++++++++++++++++++---- paddle/memory/memory_test.cc | 48 +++++++++++++++++++++++++ paddle/platform/gpu_info.cc | 2 +- 7 files changed, 112 insertions(+), 50 deletions(-) create mode 100644 paddle/memory/memory_test.cc diff --git a/paddle/memory/CMakeLists.txt b/paddle/memory/CMakeLists.txt index 8c290712fc..fac442cca5 100644 --- a/paddle/memory/CMakeLists.txt +++ b/paddle/memory/CMakeLists.txt @@ -1,11 +1,11 @@ add_subdirectory(detail) -cc_library(memory - SRCS - memory.cc) +cc_library(memory SRCS memory.cc) cc_library(paddle_memory DEPS memory meta_data meta_cache memory_block buddy_allocator system_allocator) + +cc_test(memory_test SRCS memory_test.cc DEPS place paddle_memory) diff --git a/paddle/memory/detail/CMakeLists.txt b/paddle/memory/detail/CMakeLists.txt index 6cb6422e47..b9c3fc31c1 100644 --- a/paddle/memory/detail/CMakeLists.txt +++ b/paddle/memory/detail/CMakeLists.txt @@ -1,7 +1,7 @@ if(${WITH_GPU}) - 
nv_library(system_allocator SRCS system_allocator.cc DEPS gflags gpu_info)
+  nv_library(system_allocator SRCS system_allocator.cc DEPS gflags cpu_info gpu_info)
 else(${WITH_GPU})
-  cc_library(system_allocator SRCS system_allocator.cc DEPS gflags)
+  cc_library(system_allocator SRCS system_allocator.cc DEPS gflags cpu_info)
 endif(${WITH_GPU})
 
 cc_test(system_allocator_test SRCS system_allocator_test.cc DEPS system_allocator)
diff --git a/paddle/memory/detail/buddy_allocator.cc b/paddle/memory/detail/buddy_allocator.cc
index 9f334a7048..ed2eedf9af 100644
--- a/paddle/memory/detail/buddy_allocator.cc
+++ b/paddle/memory/detail/buddy_allocator.cc
@@ -24,10 +24,20 @@ BuddyAllocator::BuddyAllocator(SystemAllocator* system_allocator,
     : min_chunk_size_(min_chunk_size),
       max_chunk_size_(max_chunk_size),
       cache_(system_allocator->UseGpu()),
-      system_allocator_(std::move(system_allocator)) {
-  PADDLE_ASSERT(min_chunk_size > 0);
-  PADDLE_ASSERT(max_chunk_size > 0);
-  PADDLE_ASSERT(system_allocator != nullptr);
+      system_allocator_(std::move(system_allocator)) {}
+
+BuddyAllocator::~BuddyAllocator() {
+  DLOG(INFO) << "BuddyAllocator Destructor makes sure that all of these "
+                "have actually been freed";
+  while (!pool_.empty()) {
+    auto block = static_cast<MemoryBlock*>(std::get<2>(*pool_.begin()));
+    DLOG(INFO) << "Free from block (" << block << ", " << max_chunk_size_
+               << ")";
+
+    system_allocator_->Free(block, max_chunk_size_, block->index(cache_));
+    cache_.invalidate(block);
+    pool_.erase(pool_.begin());
+  }
 }
 
 inline size_t align(size_t size, size_t alignment) {
@@ -62,7 +72,7 @@ void* BuddyAllocator::Alloc(size_t unaligned_size) {
       return nullptr;
     }
   } else {
-    DLOG(INFO) << " Allocation from existing memory block " << std::get<2>(*it)
+    DLOG(INFO) << "Allocation from existing memory block " << std::get<2>(*it)
               << " at address "
               << reinterpret_cast<MemoryBlock*>(std::get<2>(*it))->data();
   }
@@ -142,6 +152,8 @@ void BuddyAllocator::Free(void* p) {
   // TODO(gangliao): Clean up if existing too much free memory
 }
 
+size_t BuddyAllocator::Used() { return total_used_; }
+
 void* BuddyAllocator::SystemAlloc(size_t size) {
   size_t index = 0;
   void* p = system_allocator_->Alloc(index, size);
@@ -172,7 +184,7 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool() {
 
   if (p == nullptr) return pool_.end();
 
-  DLOG(INFO) << " Creating and inserting new block " << p
+  DLOG(INFO) << "Creating and inserting new block " << p
              << " from system allocator";
 
   static_cast<MemoryBlock*>(p)->init(cache_, MemoryBlock::FREE_CHUNK, index,
@@ -211,20 +223,19 @@ void* BuddyAllocator::SplitToAlloc(BuddyAllocator::PoolSet::iterator it,
   auto block = static_cast<MemoryBlock*>(std::get<2>(*it));
   pool_.erase(it);
 
-  DLOG(INFO) << " Split block (" << block << ", " << block->total_size(cache_)
+  DLOG(INFO) << "Split block (" << block << ", " << block->total_size(cache_)
              << ") into";
 
   block->split(cache_, size);
 
-  DLOG(INFO) << " Left block (" << block << ", " << block->total_size(cache_)
+  DLOG(INFO) << "Left block (" << block << ", " << block->total_size(cache_)
              << ")";
 
   block->set_type(cache_, MemoryBlock::ARENA_CHUNK);
 
   // the rest of memory if exist
   if (block->has_right_buddy(cache_)) {
     if (block->right_buddy(cache_)->type(cache_) == MemoryBlock::FREE_CHUNK) {
-      DLOG(INFO) << " Insert right block (" << block->right_buddy(cache_)
-                 << ", " << block->right_buddy(cache_)->total_size(cache_)
-                 << ")";
+      DLOG(INFO) << "Insert right block (" << block->right_buddy(cache_) << ", "
+                 << block->right_buddy(cache_)->total_size(cache_) << ")";
       pool_.insert({block->right_buddy(cache_)->index(cache_),
block->right_buddy(cache_)->total_size(cache_), diff --git a/paddle/memory/detail/buddy_allocator.h b/paddle/memory/detail/buddy_allocator.h index 2fd9c8162a..eeb2dc8836 100644 --- a/paddle/memory/detail/buddy_allocator.h +++ b/paddle/memory/detail/buddy_allocator.h @@ -93,33 +93,6 @@ class BuddyAllocator { std::mutex mutex_; }; -BuddyAllocator* GetCPUBuddyAllocator() { - static BuddyAllocator* a = nullptr; - if (a == nullptr) { - a = new BuddyAllocator(new CPUAllocator, platform::CpuMinChunkSize(), - platform::CpuMaxChunkSize()); - } - return a; -} - -#ifndef PADDLE_ONLY_CPU // The following code are for CUDA. - -BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) { - static BuddyAllocator** as = NULL; - if (as == NULL) { - int gpu_num = platform::GpuDeviceCount(); - as = new BuddyAllocator*[gpu_num]; - for (int gpu = 0; gpu < gpu_num; gpu++) { - as[gpu] = - new BuddyAllocator(new GPUAllocator, platform::GpuMinChunkSize(), - platform::GpuMaxChunkSize()); - } - } - return as[gpu_id]; -} - -#endif // PADDLE_ONLY_CPU - } // namespace detail } // namespace memory } // namespace paddle diff --git a/paddle/memory/memory.cc b/paddle/memory/memory.cc index 0d123d99e2..dde6ff0ef3 100644 --- a/paddle/memory/memory.cc +++ b/paddle/memory/memory.cc @@ -22,37 +22,67 @@ limitations under the License. */ namespace paddle { namespace memory { +detail::BuddyAllocator* GetCPUBuddyAllocator() { + static detail::BuddyAllocator* a = nullptr; + if (a == nullptr) { + a = new detail::BuddyAllocator(new detail::CPUAllocator, + platform::CpuMinChunkSize(), + platform::CpuMaxChunkSize()); + } + return a; +} + +#ifndef PADDLE_ONLY_CPU // The following code are for CUDA. + +detail::BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) { + static detail::BuddyAllocator** as = NULL; + if (as == NULL) { + int gpu_num = platform::GpuDeviceCount(); + as = new detail::BuddyAllocator*[gpu_num]; + for (int gpu = 0; gpu < gpu_num; gpu++) { + platform::SetDeviceId(gpu); + as[gpu] = new detail::BuddyAllocator(new detail::GPUAllocator, + platform::GpuMinChunkSize(), + platform::GpuMaxChunkSize()); + } + } + return as[gpu_id]; +} + +#endif // PADDLE_ONLY_CPU + void* Alloc(platform::Place pl, size_t size) { #ifndef PADDLE_ONLY_CPU if (paddle::platform::is_gpu_place(pl)) { size_t gpu_id = boost::get(pl).device; - return detail::GetGPUBuddyAllocator(gpu_id)->Alloc(size); + return GetGPUBuddyAllocator(gpu_id)->Alloc(size); } #endif // PADDLE_ONLY_CPU PADDLE_ASSERT(paddle::platform::is_cpu_place(pl)); - return detail::GetCPUBuddyAllocator()->Alloc(size); + return GetCPUBuddyAllocator()->Alloc(size); } void Free(paddle::platform::Place pl, void* p) { #ifndef PADDLE_ONLY_CPU if (paddle::platform::is_gpu_place(pl)) { size_t gpu_id = boost::get(pl).device; - detail::GetGPUBuddyAllocator(gpu_id)->Free(p); + GetGPUBuddyAllocator(gpu_id)->Free(p); + return; } #endif // PADDLE_ONLY_CPU PADDLE_ASSERT(paddle::platform::is_cpu_place(pl)); - detail::GetCPUBuddyAllocator()->Free(p); + GetCPUBuddyAllocator()->Free(p); } size_t Used(paddle::platform::Place pl) { #ifndef PADDLE_ONLY_CPU if (paddle::platform::is_gpu_place(pl)) { size_t gpu_id = boost::get(pl).device; - return detail::GetGPUBuddyAllocator(gpu_id)->Used(); + return GetGPUBuddyAllocator(gpu_id)->Used(); } #endif // PADDLE_ONLY_CPU PADDLE_ASSERT(paddle::platform::is_cpu_place(pl)); - return detail::GetCPUBuddyAllocator()->Used(); + return GetCPUBuddyAllocator()->Used(); } } // namespace memory diff --git a/paddle/memory/memory_test.cc b/paddle/memory/memory_test.cc new file mode 100644 index 
0000000000..a98ccbb806 --- /dev/null +++ b/paddle/memory/memory_test.cc @@ -0,0 +1,48 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/memory/memory.h" +#include "paddle/platform/place.h" + +#include "gtest/gtest.h" + +TEST(BuddyAllocator, CPUAllocation) { + void *p = nullptr; + + EXPECT_EQ(p, nullptr); + + paddle::platform::CPUPlace cpu; + p = paddle::memory::Alloc(cpu, 4096); + + EXPECT_NE(p, nullptr); + + paddle::memory::Free(cpu, p); +} + +#ifndef PADDLE_ONLY_CPU + +TEST(BuddyAllocator, GPUAllocation) { + void *p = nullptr; + + EXPECT_EQ(p, nullptr); + + paddle::platform::GPUPlace gpu(0); + p = paddle::memory::Alloc(gpu, 4096); + + EXPECT_NE(p, nullptr); + + paddle::memory::Free(gpu, p); +} + +#endif // PADDLE_ONLY_CPU diff --git a/paddle/platform/gpu_info.cc b/paddle/platform/gpu_info.cc index 05a243c506..fe475d23ce 100644 --- a/paddle/platform/gpu_info.cc +++ b/paddle/platform/gpu_info.cc @@ -56,7 +56,7 @@ size_t GpuMaxAllocSize() { GpuMemoryUsage(available, total); // Reserve the rest for page tables, etc. - return total * FLAGS_fraction_of_gpu_memory_to_use; + return static_cast(total * FLAGS_fraction_of_gpu_memory_to_use); } size_t GpuMinChunkSize() { From 936cd1e11885daf35ae34b9fbf11a798e9e51022 Mon Sep 17 00:00:00 2001 From: liaogang Date: Thu, 6 Jul 2017 01:34:51 +0800 Subject: [PATCH 22/64] FIX: code format --- paddle/memory/memory.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/memory/memory.cc b/paddle/memory/memory.cc index dde6ff0ef3..43f2084e8d 100644 --- a/paddle/memory/memory.cc +++ b/paddle/memory/memory.cc @@ -29,7 +29,7 @@ detail::BuddyAllocator* GetCPUBuddyAllocator() { platform::CpuMinChunkSize(), platform::CpuMaxChunkSize()); } - return a; + return a; } #ifndef PADDLE_ONLY_CPU // The following code are for CUDA. 
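A note for readers following the allocator patches above: the free-chunk pool is a std::set of (index, size, address) tuples, and FindExistChunk relies on the set's lexicographic ordering to run a best-fit search with lower_bound. The standalone sketch below is illustrative only, not part of the patch series; the plain ints stand in for real MemoryBlock addresses.

#include <cstddef>
#include <cstdio>
#include <set>
#include <tuple>

// Simplified stand-in for the pool type in buddy_allocator.h:
// (allocator index, chunk size, block address), compared lexicographically.
using IndexSizeAddress = std::tuple<std::size_t, std::size_t, void*>;

int main() {
  std::set<IndexSizeAddress> pool;

  int a = 0, b = 0, c = 0;  // dummies standing in for MemoryBlock addresses
  pool.insert(IndexSizeAddress(0, 4096, &a));
  pool.insert(IndexSizeAddress(0, 16384, &b));
  pool.insert(IndexSizeAddress(1, 65536, &c));  // fallback allocator index

  // As in FindExistChunk: lower_bound({index, size, nullptr}) returns the
  // first chunk with that index whose size is >= the requested size.
  std::size_t request = 8000;
  auto it = pool.lower_bound(IndexSizeAddress(0, request, nullptr));
  if (it != pool.end() && std::get<1>(*it) >= request) {
    std::printf("best fit: index=%zu, size=%zu\n", std::get<0>(*it),
                std::get<1>(*it));  // prints: index=0, size=16384
  }
  return 0;
}

Because tuples compare the index field first, chunks from the preferred allocator index are exhausted before any fallback-index chunk of the same size is considered.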
From 5d2e8edb3ec86e13b7684cbe372650d21fe7954d Mon Sep 17 00:00:00 2001 From: liaogang Date: Thu, 6 Jul 2017 01:37:20 +0800 Subject: [PATCH 23/64] FIX: dynamic loader deps --- paddle/platform/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/platform/CMakeLists.txt b/paddle/platform/CMakeLists.txt index 0ad11f1b10..bf3e8f31ab 100644 --- a/paddle/platform/CMakeLists.txt +++ b/paddle/platform/CMakeLists.txt @@ -6,4 +6,4 @@ nv_library(gpu_info SRCS gpu_info.cc DEPS gflags) cc_library(place SRCS place.cc) cc_test(place_test SRCS place_test.cc DEPS place glog gflags) -cc_library(dynamic_loader SRCS dynload/dynamic_loader.cc DEPS gflags) +cc_library(dynamic_loader SRCS dynload/dynamic_loader.cc DEPS gflags glog) From 3ad8e364715915fba5909c137834e34f38b6e9ac Mon Sep 17 00:00:00 2001 From: liaogang Date: Thu, 6 Jul 2017 11:24:01 +0800 Subject: [PATCH 24/64] FIX: merge static libs with propagation dependencies --- cmake/generic.cmake | 51 ++++++++++++++++++++++++++------------------- 1 file changed, 30 insertions(+), 21 deletions(-) diff --git a/cmake/generic.cmake b/cmake/generic.cmake index cae9524b2f..87d8caaec4 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -99,15 +99,37 @@ function(merge_static_libs TARGET_NAME) set(libs ${ARGN}) list(REMOVE_DUPLICATES libs) - # First get the file names of the libraries to be merged + # Get all propagation dependencies from the merged libraries foreach(lib ${libs}) + list(APPEND libs_deps ${${lib}_LIB_DEPENDS}) + endforeach() + + # To produce a library we need at least one source file. + # It is created by add_custom_command below and will helps + # also help to track dependencies. + set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}_dummy.c) + + # Make the generated dummy source file depended on all static input + # libs. If input lib changes,the source file is touched + # which causes the desired effect (relink). 
+ add_custom_command(OUTPUT ${dummyfile} + COMMAND ${CMAKE_COMMAND} -E touch ${dummyfile} + DEPENDS ${libs}) + + # Generate dummy staic lib + file(WRITE ${dummyfile} "const char * dummy = \"${dummyfile}\";") + add_library(${TARGET_NAME} STATIC ${dummyfile}) + target_link_libraries(${TARGET_NAME} ${libs_deps}) + + foreach(lib ${libs}) + # Get the file names of the libraries to be merged set(libfiles ${libfiles} $) endforeach() + # Get the file name of the generated library + set(outlibfile "$") + if(APPLE) # Use OSX's libtool to merge archives - set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}_dummy.c) - file(WRITE ${dummyfile} "const char * dummy = \"${dummyfile}\";") - add_library(${TARGET_NAME} STATIC ${dummyfile}) add_custom_command(TARGET ${TARGET_NAME} POST_BUILD COMMAND rm "${CMAKE_CURRENT_BINARY_DIR}/lib${TARGET_NAME}.a" COMMAND /usr/bin/libtool -static -o "${CMAKE_CURRENT_BINARY_DIR}/lib${TARGET_NAME}.a" ${libfiles}) @@ -117,7 +139,8 @@ function(merge_static_libs TARGET_NAME) set(objdir ${lib}.objdir) add_custom_command(OUTPUT ${objdir} - COMMAND ${CMAKE_COMMAND} -E make_directory ${objdir}) + COMMAND ${CMAKE_COMMAND} -E make_directory ${objdir} + DEPENDS ${lib}) add_custom_command(OUTPUT ${objlistfile} COMMAND ${CMAKE_AR} -x "$" @@ -125,23 +148,9 @@ function(merge_static_libs TARGET_NAME) DEPENDS ${lib} ${objdir} WORKING_DIRECTORY ${objdir}) - # Empty dummy source file that goes into merged library - set(mergebase ${lib}.mergebase.c) - add_custom_command(OUTPUT ${mergebase} - COMMAND ${CMAKE_COMMAND} -E touch ${mergebase} - DEPENDS ${objlistfile}) - - list(APPEND mergebases "${mergebase}") - endforeach() - - # We need a target for the output merged library - add_library(${TARGET_NAME} STATIC ${mergebases}) - set(outlibfile "$") - - foreach(lib ${libs}) add_custom_command(TARGET ${TARGET_NAME} POST_BUILD - COMMAND ${CMAKE_AR} ru ${outlibfile} @"../${lib}.objlist" - WORKING_DIRECTORY ${lib}.objdir) + COMMAND ${CMAKE_AR} ru ${outlibfile} *.o + WORKING_DIRECTORY ${objdir}) endforeach() add_custom_command(TARGET ${TARGET_NAME} POST_BUILD From a669bf48d966a92206c57d72258bb625b5ff2fbc Mon Sep 17 00:00:00 2001 From: liaogang Date: Thu, 6 Jul 2017 13:38:11 +0800 Subject: [PATCH 25/64] FIX: explicit construct pool element --- paddle/memory/detail/buddy_allocator.cc | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/paddle/memory/detail/buddy_allocator.cc b/paddle/memory/detail/buddy_allocator.cc index ed2eedf9af..2cfacec46c 100644 --- a/paddle/memory/detail/buddy_allocator.cc +++ b/paddle/memory/detail/buddy_allocator.cc @@ -118,8 +118,9 @@ void BuddyAllocator::Free(void* p) { if (right_buddy->type(cache_) == MemoryBlock::FREE_CHUNK) { // Take away right buddy from pool - pool_.erase({right_buddy->index(cache_), right_buddy->total_size(cache_), - right_buddy}); + pool_.erase(IndexSizeAddress(right_buddy->index(cache_), + right_buddy->total_size(cache_), + right_buddy)); // merge its right buddy to the block block->merge(cache_, right_buddy); @@ -135,8 +136,8 @@ void BuddyAllocator::Free(void* p) { if (left_buddy->type(cache_) == MemoryBlock::FREE_CHUNK) { // Take away right buddy from pool - pool_.erase({left_buddy->index(cache_), left_buddy->total_size(cache_), - left_buddy}); + pool_.erase(IndexSizeAddress(left_buddy->index(cache_), + left_buddy->total_size(cache_), left_buddy)); // merge the block to its left buddy left_buddy->merge(cache_, block); @@ -147,7 +148,8 @@ void BuddyAllocator::Free(void* p) { // Dumping this block into pool 
DLOG(INFO) << "Inserting free block (" << block << ", " << block->total_size(cache_) << ")"; - pool_.insert({block->index(cache_), block->total_size(cache_), block}); + pool_.insert( + IndexSizeAddress(block->index(cache_), block->total_size(cache_), block)); // TODO(gangliao): Clean up if existing too much free memory } @@ -193,14 +195,14 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool() { total_free_ += max_chunk_size_; // dump the block into pool - return pool_.insert({index, max_chunk_size_, p}).first; + return pool_.insert(IndexSizeAddress(index, max_chunk_size_, p)).first; } BuddyAllocator::PoolSet::iterator BuddyAllocator::FindExistChunk(size_t size) { size_t index = 0; while (1) { - auto it = pool_.lower_bound({index, size, nullptr}); + auto it = pool_.lower_bound(IndexSizeAddress(index, size, nullptr)); // no match chunk memory if (it == pool_.end()) return it; @@ -237,9 +239,10 @@ void* BuddyAllocator::SplitToAlloc(BuddyAllocator::PoolSet::iterator it, DLOG(INFO) << "Insert right block (" << block->right_buddy(cache_) << ", " << block->right_buddy(cache_)->total_size(cache_) << ")"; - pool_.insert({block->right_buddy(cache_)->index(cache_), - block->right_buddy(cache_)->total_size(cache_), - block->right_buddy(cache_)}); + pool_.insert( + IndexSizeAddress(block->right_buddy(cache_)->index(cache_), + block->right_buddy(cache_)->total_size(cache_), + block->right_buddy(cache_))); } } From adf8c95b62fc5ef1f608bc06dce32bb4b396828c Mon Sep 17 00:00:00 2001 From: liaogang Date: Thu, 6 Jul 2017 15:40:22 +0800 Subject: [PATCH 26/64] FIX: propagation dependencies under linux --- cmake/generic.cmake | 68 ++++++++++++++++++++++++++------------------- 1 file changed, 39 insertions(+), 29 deletions(-) diff --git a/cmake/generic.cmake b/cmake/generic.cmake index 87d8caaec4..3900ea2604 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -104,36 +104,32 @@ function(merge_static_libs TARGET_NAME) list(APPEND libs_deps ${${lib}_LIB_DEPENDS}) endforeach() - # To produce a library we need at least one source file. - # It is created by add_custom_command below and will helps - # also help to track dependencies. - set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}_dummy.c) - - # Make the generated dummy source file depended on all static input - # libs. If input lib changes,the source file is touched - # which causes the desired effect (relink). - add_custom_command(OUTPUT ${dummyfile} - COMMAND ${CMAKE_COMMAND} -E touch ${dummyfile} - DEPENDS ${libs}) - - # Generate dummy staic lib - file(WRITE ${dummyfile} "const char * dummy = \"${dummyfile}\";") - add_library(${TARGET_NAME} STATIC ${dummyfile}) - target_link_libraries(${TARGET_NAME} ${libs_deps}) + if(APPLE) # Use OSX's libtool to merge archives + # To produce a library we need at least one source file. + # It is created by add_custom_command below and will helps + # also help to track dependencies. + set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}_dummy.c) - foreach(lib ${libs}) - # Get the file names of the libraries to be merged - set(libfiles ${libfiles} $) - endforeach() + # Make the generated dummy source file depended on all static input + # libs. If input lib changes,the source file is touched + # which causes the desired effect (relink). 
+ add_custom_command(OUTPUT ${dummyfile} + COMMAND ${CMAKE_COMMAND} -E touch ${dummyfile} + DEPENDS ${libs}) - # Get the file name of the generated library - set(outlibfile "$") + # Generate dummy staic lib + file(WRITE ${dummyfile} "const char * dummy = \"${dummyfile}\";") + add_library(${TARGET_NAME} STATIC ${dummyfile}) + target_link_libraries(${TARGET_NAME} ${libs_deps}) - if(APPLE) # Use OSX's libtool to merge archives + foreach(lib ${libs}) + # Get the file names of the libraries to be merged + set(libfiles ${libfiles} $) + endforeach() add_custom_command(TARGET ${TARGET_NAME} POST_BUILD COMMAND rm "${CMAKE_CURRENT_BINARY_DIR}/lib${TARGET_NAME}.a" COMMAND /usr/bin/libtool -static -o "${CMAKE_CURRENT_BINARY_DIR}/lib${TARGET_NAME}.a" ${libfiles}) - else() # general UNIX: use "ar" to extract objects and re-add to a common lib + else() # general UNIX: use "ar" to extract objects and re-add to a common lib foreach(lib ${libs}) set(objlistfile ${lib}.objlist) # list of objects in the input library set(objdir ${lib}.objdir) @@ -148,13 +144,27 @@ function(merge_static_libs TARGET_NAME) DEPENDS ${lib} ${objdir} WORKING_DIRECTORY ${objdir}) - add_custom_command(TARGET ${TARGET_NAME} POST_BUILD - COMMAND ${CMAKE_AR} ru ${outlibfile} *.o - WORKING_DIRECTORY ${objdir}) + # Empty dummy source file that goes into merged library + set(mergebase ${lib}.mergebase.c) + add_custom_command(OUTPUT ${mergebase} + COMMAND ${CMAKE_COMMAND} -E touch ${mergebase} + DEPENDS ${objlistfile}) + + list(APPEND mergebases "${mergebase}") endforeach() - add_custom_command(TARGET ${TARGET_NAME} POST_BUILD - COMMAND ${CMAKE_RANLIB} ${outlibfile}) + add_library(${TARGET_NAME} STATIC ${mergebases}) + target_link_libraries(${TARGET_NAME} ${libs_deps}) + + # Get the file name of the generated library + set(outlibfile "$") + + foreach(lib ${libs}) + add_custom_command(TARGET ${TARGET_NAME} POST_BUILD + COMMAND ${CMAKE_AR} cr ${outlibfile} *.o + COMMAND ${CMAKE_RANLIB} ${outlibfile} + WORKING_DIRECTORY ${lib}.objdir) + endforeach() endif() endfunction(merge_static_libs) From ddfa6cf0d1fe91f8bf2e1d55841afee9e30d1859 Mon Sep 17 00:00:00 2001 From: liaogang Date: Thu, 6 Jul 2017 17:07:04 +0800 Subject: [PATCH 27/64] FIX: remove boost from memory folder --- paddle/memory/memory.cc | 56 +++++++++++++++++++---------------------- paddle/memory/memory.h | 11 +++++--- 2 files changed, 34 insertions(+), 33 deletions(-) diff --git a/paddle/memory/memory.cc b/paddle/memory/memory.cc index 43f2084e8d..def580f7a4 100644 --- a/paddle/memory/memory.cc +++ b/paddle/memory/memory.cc @@ -32,7 +32,22 @@ detail::BuddyAllocator* GetCPUBuddyAllocator() { return a; } -#ifndef PADDLE_ONLY_CPU // The following code are for CUDA. 
+template <> +void* Alloc(platform::CPUPlace place, size_t size) { + return GetCPUBuddyAllocator()->Alloc(size); +} + +template <> +void Free(platform::CPUPlace place, void* p) { + GetCPUBuddyAllocator()->Free(p); +} + +template <> +size_t Used(platform::CPUPlace place) { + return GetCPUBuddyAllocator()->Used(); +} + +#ifndef PADDLE_ONLY_CPU detail::BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) { static detail::BuddyAllocator** as = NULL; @@ -49,41 +64,22 @@ detail::BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) { return as[gpu_id]; } -#endif // PADDLE_ONLY_CPU +template <> +void* Alloc(platform::GPUPlace place, size_t size) { + return GetGPUBuddyAllocator(place.device)->Alloc(size); +} -void* Alloc(platform::Place pl, size_t size) { -#ifndef PADDLE_ONLY_CPU - if (paddle::platform::is_gpu_place(pl)) { - size_t gpu_id = boost::get(pl).device; - return GetGPUBuddyAllocator(gpu_id)->Alloc(size); - } -#endif // PADDLE_ONLY_CPU - PADDLE_ASSERT(paddle::platform::is_cpu_place(pl)); - return GetCPUBuddyAllocator()->Alloc(size); +template <> +void Free(platform::GPUPlace place, void* p) { + GetGPUBuddyAllocator(place.device)->Free(p); } -void Free(paddle::platform::Place pl, void* p) { -#ifndef PADDLE_ONLY_CPU - if (paddle::platform::is_gpu_place(pl)) { - size_t gpu_id = boost::get(pl).device; - GetGPUBuddyAllocator(gpu_id)->Free(p); - return; - } -#endif // PADDLE_ONLY_CPU - PADDLE_ASSERT(paddle::platform::is_cpu_place(pl)); - GetCPUBuddyAllocator()->Free(p); +template <> +size_t Used(platform::GPUPlace place) { + return GetGPUBuddyAllocator(place.device)->Used(); } -size_t Used(paddle::platform::Place pl) { -#ifndef PADDLE_ONLY_CPU - if (paddle::platform::is_gpu_place(pl)) { - size_t gpu_id = boost::get(pl).device; - return GetGPUBuddyAllocator(gpu_id)->Used(); - } #endif // PADDLE_ONLY_CPU - PADDLE_ASSERT(paddle::platform::is_cpu_place(pl)); - return GetCPUBuddyAllocator()->Used(); -} } // namespace memory } // namespace paddle diff --git a/paddle/memory/memory.h b/paddle/memory/memory.h index a33092bade..2d6f4fd2a0 100644 --- a/paddle/memory/memory.h +++ b/paddle/memory/memory.h @@ -19,9 +19,14 @@ limitations under the License. */ namespace paddle { namespace memory { -void* Alloc(paddle::platform::Place, size_t); -void Free(paddle::platform::Place, void*); -size_t Used(paddle::platform::Place); +template +void* Alloc(Place, size_t); + +template +void Free(Place, void*); + +template +size_t Used(Place); } // namespace memory } // namespace paddle From 199b5fcb45c69560de1b24b3147f5e7db309abe3 Mon Sep 17 00:00:00 2001 From: liaogang Date: Mon, 10 Jul 2017 11:22:17 +0800 Subject: [PATCH 28/64] ENH: refine code comments --- paddle/memory/detail/buddy_allocator.h | 3 ++- paddle/memory/detail/meta_cache.h | 25 +++++++++--------------- paddle/memory/detail/system_allocator.cc | 4 ++-- paddle/memory/detail/system_allocator.h | 6 +++--- 4 files changed, 16 insertions(+), 22 deletions(-) diff --git a/paddle/memory/detail/buddy_allocator.h b/paddle/memory/detail/buddy_allocator.h index eeb2dc8836..a89dd8eb7c 100644 --- a/paddle/memory/detail/buddy_allocator.h +++ b/paddle/memory/detail/buddy_allocator.h @@ -42,7 +42,7 @@ class BuddyAllocator { void Free(void*); size_t Used(); - private: + public: // Disable copy and assignment. BuddyAllocator(const BuddyAllocator&) = delete; BuddyAllocator& operator=(const BuddyAllocator&) = delete; @@ -57,6 +57,7 @@ class BuddyAllocator { /*! 
\brief If existing chunks are not suitable, refill pool */ PoolSet::iterator RefillPool(); + /** * \brief Find the suitable chunk from existing pool * diff --git a/paddle/memory/detail/meta_cache.h b/paddle/memory/detail/meta_cache.h index 3ca1020d22..ca0789779e 100644 --- a/paddle/memory/detail/meta_cache.h +++ b/paddle/memory/detail/meta_cache.h @@ -23,14 +23,14 @@ namespace paddle { namespace memory { namespace detail { -/*! A cache for accessing memory block meta-data that may be expensive to access - directly. - - Note: this class exists to unify the metadata format between GPU and CPU - allocations. - It should be removed when the CPU can access all GPU allocations directly - via UVM. -*/ +/** + * \brief A cache for accessing memory block meta-data that may be expensive + * to access directly. + * + * \note This class exists to unify the metadata format between GPU and CPU + * allocations. It should be removed when the CPU can access all GPU + * allocations directly via UVM. + */ class MetadataCache { public: MetadataCache(bool uses_gpu); @@ -42,14 +42,7 @@ class MetadataCache { /*! \brief Store the associated metadata for the specified memory block. */ void store(MemoryBlock*, const Metadata&); - public: - /*! \brief Acquire any external metadata updates. */ - void acquire(MemoryBlock*); - - /*! \brief Publish any local updates externally. */ - void release(MemoryBlock*); - - /*! \brief Indicate that the specified metadata will no longer be used */ + /*! \brief Indicate that the specified metadata will no longer be used. */ void invalidate(MemoryBlock*); public: diff --git a/paddle/memory/detail/system_allocator.cc b/paddle/memory/detail/system_allocator.cc index 75a2c91ef9..1579174b1a 100644 --- a/paddle/memory/detail/system_allocator.cc +++ b/paddle/memory/detail/system_allocator.cc @@ -60,7 +60,7 @@ void CPUAllocator::Free(void* p, size_t size, size_t index) { free(p); } -bool CPUAllocator::UseGpu() { return false; } +bool CPUAllocator::UseGpu() const { return false; } #ifndef PADDLE_ONLY_CPU @@ -133,7 +133,7 @@ void GPUAllocator::Free(void* p, size_t size, size_t index) { } } -bool GPUAllocator::UseGpu() { return true; } +bool GPUAllocator::UseGpu() const { return true; } #endif // PADDLE_ONLY_CPU diff --git a/paddle/memory/detail/system_allocator.h b/paddle/memory/detail/system_allocator.h index 555061a533..04efcd9709 100644 --- a/paddle/memory/detail/system_allocator.h +++ b/paddle/memory/detail/system_allocator.h @@ -32,14 +32,14 @@ class SystemAllocator { virtual ~SystemAllocator() {} virtual void* Alloc(size_t& index, size_t size) = 0; virtual void Free(void* p, size_t size, size_t index) = 0; - virtual bool UseGpu() = 0; + virtual bool UseGpu() const = 0; }; class CPUAllocator : public SystemAllocator { public: virtual void* Alloc(size_t& index, size_t size); virtual void Free(void* p, size_t size, size_t index); - virtual bool UseGpu(); + virtual bool UseGpu() const; }; #ifndef PADDLE_ONLY_CPU @@ -47,7 +47,7 @@ class GPUAllocator : public SystemAllocator { public: virtual void* Alloc(size_t& index, size_t size); virtual void Free(void* p, size_t size, size_t index); - virtual bool UseGpu(); + virtual bool UseGpu() const; private: size_t gpu_alloc_size_ = 0; From d4017cadcd0fa07d8874e052ffa91700ebb32a05 Mon Sep 17 00:00:00 2001 From: liaogang Date: Tue, 11 Jul 2017 15:18:38 +0800 Subject: [PATCH 29/64] ENH: Add auto-free if allocate too much --- paddle/memory/detail/buddy_allocator.cc | 69 ++++++++++++++++++++++++- paddle/memory/detail/buddy_allocator.h | 3 ++ 2 files 
changed, 70 insertions(+), 2 deletions(-) diff --git a/paddle/memory/detail/buddy_allocator.cc b/paddle/memory/detail/buddy_allocator.cc index 3f630973e9..27c1b4033b 100644 --- a/paddle/memory/detail/buddy_allocator.cc +++ b/paddle/memory/detail/buddy_allocator.cc @@ -152,7 +152,7 @@ void BuddyAllocator::Free(void* p) { IndexSizeAddress(block->index(cache_), block->total_size(cache_), block)); // Clean up if existing too much free memory - + // Prefer freeing fallback allocation first CleanIdleFallBackAlloc(); @@ -198,6 +198,12 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool() { static_cast(p)->init(cache_, MemoryBlock::FREE_CHUNK, index, max_chunk_size_, nullptr, nullptr); + // gpu fallback allocation + if (system_allocator_->UseGpu() && + static_cast(p)->index(cache_) == 1) { + fallback_alloc_count_++; + } + total_free_ += max_chunk_size_; // dump the block into pool @@ -256,9 +262,68 @@ void* BuddyAllocator::SplitToAlloc(BuddyAllocator::PoolSet::iterator it, } void BuddyAllocator::CleanIdleFallBackAlloc() { - + // If fallback allocation does not exist, return directly + if (!fallback_alloc_count_) return; + + for (auto pool = pool_.rbegin(); pool != pool_.rend();) { + // If free memory block less than max_chunk_size_, return directly + if (std::get<1>(*pool) < max_chunk_size_) return; + + MemoryBlock* block = static_cast(std::get<2>(*pool)); + + // If no GPU fallback allocator, return + if (!system_allocator_->UseGpu() || block->index(cache_) == 0) { + return; + } + + DLOG(INFO) << "Return block " << block << " to fallback allocator."; + + system_allocator_->Free(block, max_chunk_size_, block->index(cache_)); + cache_.invalidate(block); + + pool = PoolSet::reverse_iterator(pool_.erase(std::next(pool).base())); + + total_free_ -= max_chunk_size_; + fallback_alloc_count_--; + + // If no fall allocation exists, return directly + if (!fallback_alloc_count_) return; + } } +void BuddyAllocator::CleanIdleNormalAlloc() { + auto shall_free_alloc = [&]() -> bool { + // free all fallback allocations + if (fallback_alloc_count_ > 0) { + return true; + } + // keep 2x overhead if we haven't fallen back + if ((total_used_ + max_chunk_size_) * 2 < total_free_) { + return true; + } + return false; + }; + + if (!shall_free_alloc()) return; + + for (auto pool = pool_.rbegin(); pool != pool_.rend();) { + // If free memory block less than max_chunk_size_, return directly + if (std::get<1>(*pool) < max_chunk_size_) return; + + MemoryBlock* block = static_cast(std::get<2>(*pool)); + + DLOG(INFO) << "Return block " << block << " to base allocator."; + + system_allocator_->Free(block, max_chunk_size_, block->index(cache_)); + cache_.invalidate(block); + + pool = PoolSet::reverse_iterator(pool_.erase(std::next(pool).base())); + + total_free_ -= max_chunk_size_; + + if (!shall_free_alloc()) return; + } +} } // namespace detail } // namespace memory diff --git a/paddle/memory/detail/buddy_allocator.h b/paddle/memory/detail/buddy_allocator.h index 14ee1fa07c..4fa3fb0ee5 100644 --- a/paddle/memory/detail/buddy_allocator.h +++ b/paddle/memory/detail/buddy_allocator.h @@ -94,6 +94,9 @@ class BuddyAllocator { */ PoolSet pool_; + /*! Record fallback allocation count for auto-scaling */ + size_t fallback_alloc_count_ = 0; + private: /*! 
Unify the metadata format between GPU and CPU allocations */ MetadataCache cache_; From 6a3b8416df124153d4a1fd1f8f559107578ed58e Mon Sep 17 00:00:00 2001 From: liaogang Date: Tue, 11 Jul 2017 15:20:43 +0800 Subject: [PATCH 30/64] FIX: clang-format --- paddle/memory/memory_test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/memory/memory_test.cc b/paddle/memory/memory_test.cc index fed7444798..9fdcd03b1a 100644 --- a/paddle/memory/memory_test.cc +++ b/paddle/memory/memory_test.cc @@ -33,7 +33,7 @@ TEST(BuddyAllocator, CPUAllocation) { TEST(BuddyAllocator, CPUMultAlloc) { paddle::platform::CPUPlace cpu; - std::vector ps; + std::vector ps; ps.reserve(8); for (auto size : {256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304}) { From 383b96f32c60ec542819c62b4e09009cae9afc9d Mon Sep 17 00:00:00 2001 From: liaogang Date: Tue, 11 Jul 2017 16:26:58 +0800 Subject: [PATCH 31/64] FIX: merge conflicts --- paddle/memory/detail/meta_cache.cc | 2 +- paddle/memory/memory.cc | 2 +- paddle/platform/CMakeLists.txt | 2 +- paddle/platform/device_context.h | 3 ++- paddle/platform/gpu_info.cc | 4 ++-- paddle/platform/gpu_info.h | 2 +- 6 files changed, 8 insertions(+), 7 deletions(-) diff --git a/paddle/memory/detail/meta_cache.cc b/paddle/memory/detail/meta_cache.cc index 189ab4fc7b..30ff80e7ba 100644 --- a/paddle/memory/detail/meta_cache.cc +++ b/paddle/memory/detail/meta_cache.cc @@ -25,7 +25,7 @@ MetadataCache::MetadataCache(bool uses_gpu) : uses_gpu_(uses_gpu) {} Metadata MetadataCache::load(const MemoryBlock* block) { if (uses_gpu_) { auto existing_metadata = cache_.find(block); - assert(existing_metadata->second.check_guards()); + PADDLE_ASSERT(existing_metadata->second.check_guards()); return existing_metadata->second; } else { PADDLE_ASSERT(reinterpret_cast(block)->check_guards()); diff --git a/paddle/memory/memory.cc b/paddle/memory/memory.cc index def580f7a4..430ce98bfc 100644 --- a/paddle/memory/memory.cc +++ b/paddle/memory/memory.cc @@ -52,7 +52,7 @@ size_t Used(platform::CPUPlace place) { detail::BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) { static detail::BuddyAllocator** as = NULL; if (as == NULL) { - int gpu_num = platform::GpuDeviceCount(); + int gpu_num = platform::GetDeviceCount(); as = new detail::BuddyAllocator*[gpu_num]; for (int gpu = 0; gpu < gpu_num; gpu++) { platform::SetDeviceId(gpu); diff --git a/paddle/platform/CMakeLists.txt b/paddle/platform/CMakeLists.txt index 4b3f55b3c7..d16c747aee 100644 --- a/paddle/platform/CMakeLists.txt +++ b/paddle/platform/CMakeLists.txt @@ -8,4 +8,4 @@ cc_test(place_test SRCS place_test.cc DEPS place glog gflags) cc_library(dynamic_loader SRCS dynload/dynamic_loader.cc DEPS gflags glog) -nv_test(device_context_test SRCS device_context_test.cc DEPS dynamic_loader place eigen3) +nv_test(device_context_test SRCS device_context_test.cc DEPS dynamic_loader place eigen3 gpu_info) diff --git a/paddle/platform/device_context.h b/paddle/platform/device_context.h index 160eb4e120..02194581d1 100644 --- a/paddle/platform/device_context.h +++ b/paddle/platform/device_context.h @@ -16,10 +16,11 @@ limitations under the License. 
*/ #include "paddle/framework/enforce.h" #ifndef PADDLE_ONLY_CPU -#include "paddle/platform/cuda.h" #include "paddle/platform/dynload/cublas.h" #include "paddle/platform/dynload/cudnn.h" #include "paddle/platform/dynload/curand.h" +#include "paddle/platform/error.h" +#include "paddle/platform/gpu_info.h" #define EIGEN_USE_GPU #endif #include "paddle/platform/place.h" diff --git a/paddle/platform/gpu_info.cc b/paddle/platform/gpu_info.cc index fe475d23ce..9b917f9d35 100644 --- a/paddle/platform/gpu_info.cc +++ b/paddle/platform/gpu_info.cc @@ -23,11 +23,11 @@ DEFINE_double(fraction_of_gpu_memory_to_use, 0.95, namespace paddle { namespace platform { -int GpuDeviceCount() { +int GetDeviceCount() { int count; throw_on_error( cudaGetDeviceCount(&count), - "cudaGetDeviceCount failed in paddle::platform::GpuDeviceCount"); + "cudaGetDeviceCount failed in paddle::platform::GetDeviceCount"); return count; } diff --git a/paddle/platform/gpu_info.h b/paddle/platform/gpu_info.h index 81ee5f6e0a..79e71956bd 100644 --- a/paddle/platform/gpu_info.h +++ b/paddle/platform/gpu_info.h @@ -22,7 +22,7 @@ namespace paddle { namespace platform { //! Get the total number of GPU devices in system. -int GpuDeviceCount(); +int GetDeviceCount(); //! Get the current GPU device id in system. int GetCurrentDeviceId(); From 69d99d481dc553c2f26d967d365b7ebc7e228e07 Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Wed, 12 Jul 2017 17:58:35 +0800 Subject: [PATCH 32/64] Add Tensor::CopyFrom and Tensor::mutable_data(Place place) 1. Add `Tensor::CopyFrom`. Current version can only support CPU memory copy. The support of GPU will be provided later by `paddle::memory`. The current implementation of `Tensor::CopyFrom` is a little inefficient: Every time `CopyFrom` is called, tensor will re-allocate its memory. However, if we try to check and reuse `placeholder_`, we have to provide a template parameter for `CopyFrom` to indicate the data type. It seems strange for a simple copy function. 2. Add `Tensor::mutable_data(Place place)`, which directly use member variable `dims_` as its dim parameter. This interface is required by `Op::InferShape`. --- paddle/framework/tensor.h | 34 +++++++++++++++++++++++++++++++-- paddle/framework/tensor_test.cc | 25 ++++++++++++++++++++++++ 2 files changed, 57 insertions(+), 2 deletions(-) diff --git a/paddle/framework/tensor.h b/paddle/framework/tensor.h index a0945e8055..7f3894bb3c 100644 --- a/paddle/framework/tensor.h +++ b/paddle/framework/tensor.h @@ -15,6 +15,7 @@ limitations under the License. 
*/ #pragma once #include +#include #include #include #include "paddle/framework/ddim.h" @@ -44,11 +45,17 @@ class Tensor { typename std::enable_if::value>::type* = nullptr> T* mutable_data(DDim dims, paddle::platform::Place place) { dims_ = dims; + return mutable_data(place); + } + + template ::value>::type* = nullptr> + T* mutable_data(paddle::platform::Place place) { if (holder_ == nullptr || !(holder_->Place() == place) /* some versions of boost::variant don't have operator!= */ - || holder_->Size() < product(dims) * sizeof(T) + offset_) { - holder_.reset(new PlaceholderImpl(place, product(dims) * sizeof(T))); + || holder_->Size() < product(dims_) * sizeof(T) + offset_) { + holder_.reset(new PlaceholderImpl(place, product(dims_) * sizeof(T))); offset_ = 0; } return reinterpret_cast(reinterpret_cast(holder_->Ptr()) + @@ -63,6 +70,15 @@ class Tensor { offset_ = src.offset_; } + void CopyFrom(const Tensor& src, paddle::platform::Place dst_place) { + PADDLE_ENFORCE(src.holder_ != nullptr, + "Can not copy from an uninitialized tensor."); + size_t size = product(src.dims()) * src.holder_->TypeSize(); + holder_.reset(src.holder_->Clone(src.offset_, size, dst_place)); + dims_ = src.dims(); + offset_ = 0; + } + Tensor Slice(const int& begin_idx, const int& end_idx) const { PADDLE_ENFORCE(holder_ != nullptr, "The sliced tenosr has not been initialized."); @@ -95,6 +111,8 @@ class Tensor { virtual paddle::platform::Place Place() const = 0; virtual size_t Size() const = 0; virtual size_t TypeSize() const = 0; + virtual Placeholder* Clone(size_t begin, size_t size, + paddle::platform::Place place) const = 0; }; template @@ -122,6 +140,18 @@ class Tensor { virtual size_t Size() const { return size_; } virtual paddle::platform::Place Place() const { return place_; } virtual size_t TypeSize() const { return sizeof(T); } + // TODO: Clone only support CPU now. GPU support is needed. + virtual Placeholder* Clone(size_t begin, size_t size, + paddle::platform::Place place) const { + PADDLE_ENFORCE(paddle::platform::is_cpu_place(place_) && + paddle::platform::is_cpu_place(place), + "PlaceholderImpl::Clone only support CPU now."); + PlaceholderImpl* dst = new PlaceholderImpl(place, size); + void* begin_ptr = + reinterpret_cast(reinterpret_cast(Ptr()) + begin); + memcpy(dst->Ptr(), begin_ptr, size); + return dst; + } std::unique_ptr ptr_; paddle::platform::Place place_; // record the place of ptr_. 
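As a quick orientation before the test diff below: a minimal usage sketch of the two interfaces this patch adds (not part of the patch itself; make_ddim and CPUPlace already exist in this tree, and the float element type is only an illustration):

paddle::framework::Tensor src, dst;
float* src_ptr = src.mutable_data<float>(
    paddle::framework::make_ddim({2, 3}), paddle::platform::CPUPlace());
// ... fill src_ptr ...
// Deep copy; as the commit message notes, dst re-allocates its buffer on
// every call, and only CPU-to-CPU copy is supported so far.
dst.CopyFrom(src, paddle::platform::CPUPlace());
// The new overload without a DDim argument reuses the dims_ already set.
float* dst_ptr = dst.mutable_data<float>(paddle::platform::CPUPlace());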
diff --git a/paddle/framework/tensor_test.cc b/paddle/framework/tensor_test.cc index f4822838cf..6db0ba8c79 100644 --- a/paddle/framework/tensor_test.cc +++ b/paddle/framework/tensor_test.cc @@ -178,4 +178,29 @@ TEST(Tensor, Slice) { } } +TEST(Tensor, CopyFrom) { + using namespace paddle::framework; + using namespace paddle::platform; + + Tensor src_tensor; + int* src_ptr = src_tensor.mutable_data<int>(make_ddim({3, 3}), CPUPlace()); + int arr[9] = {1, 2, 3, 4, 5, 6, 7, 8, 9}; + memcpy(src_ptr, arr, 9 * sizeof(int)); + Tensor dst_tensor; + dst_tensor.CopyFrom(src_tensor, CPUPlace()); + const int* dst_ptr = dst_tensor.data<int>(); + ASSERT_NE(src_ptr, dst_ptr); + for (size_t i = 0; i < 9; ++i) { + EXPECT_EQ(src_ptr[i], dst_ptr[i]); + } + + Tensor slice_tensor = src_tensor.Slice(1, 2); + dst_tensor.CopyFrom(slice_tensor, CPUPlace()); + const int* slice_ptr = slice_tensor.data<int>(); + dst_ptr = dst_tensor.data<int>(); + ASSERT_NE(dst_ptr, slice_ptr); + for (size_t i = 0; i < 3; ++i) { + EXPECT_EQ(dst_ptr[i], slice_ptr[i]); + } +} \ No newline at end of file From ff98e3c1ece983403ebdfa57f07d3bdf58f85647 Mon Sep 17 00:00:00 2001 From: liaogang Date: Thu, 13 Jul 2017 14:26:48 +0800 Subject: [PATCH 33/64] ENH: Remove comments --- paddle/memory/detail/system_allocator.h | 12 +++++------- paddle/platform/gpu_info.cc | 1 - 2 files changed, 5 insertions(+), 8 deletions(-) diff --git a/paddle/memory/detail/system_allocator.h b/paddle/memory/detail/system_allocator.h index 04efcd9709..82ba322e05 100644 --- a/paddle/memory/detail/system_allocator.h +++ b/paddle/memory/detail/system_allocator.h @@ -20,13 +20,11 @@ namespace paddle { namespace memory { namespace detail { -// SystemAllocator is the parent class of CPUAllocator and -// GPUAllocator. A BuddyAllocator object uses a SystemAllocator* -// pointing to the underlying system allocator. An alternative to -// this class hierarchy is to pass a system allocator class to -// BuddyAllocator as a template parameter. This approach makes -// BuddyAllocator a class template, and it's very complicated -// algorithm would make the buddy_allocator.h messy. +/** + * \brief SystemAllocator is the parent class of CPUAllocator and GPUAllocator. + * A BuddyAllocator object uses a SystemAllocator* pointing to the + * underlying system allocator. + */ class SystemAllocator { public: virtual ~SystemAllocator() {} diff --git a/paddle/platform/gpu_info.cc b/paddle/platform/gpu_info.cc index 9b917f9d35..a1383d3524 100644 --- a/paddle/platform/gpu_info.cc +++ b/paddle/platform/gpu_info.cc @@ -65,7 +65,6 @@ size_t GpuMinChunkSize() { } size_t GpuMaxChunkSize() { - // Allow to allocate the maximum chunk size is roughly 3% of CPU memory. size_t total = 0; size_t available = 0; From 00572aa451d44ccb32b1c59a59241d7000c68fda Mon Sep 17 00:00:00 2001 From: liaogang Date: Thu, 13 Jul 2017 19:14:09 +0800 Subject: [PATCH 34/64] Add memory alignment test --- paddle/memory/memory_test.cc | 27 +++++++++++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) diff --git a/paddle/memory/memory_test.cc b/paddle/memory/memory_test.cc index fed7444798..9fdcd03b1a 100644 --- a/paddle/memory/memory_test.cc +++ b/paddle/memory/memory_test.cc @@ -13,9 +13,13 @@ See the License for the specific language governing permissions and limitations under the License.
*/ #include "paddle/memory/memory.h" +#include "gtest/gtest.h" #include "paddle/platform/place.h" -#include "gtest/gtest.h" +template +inline bool is_aligned(T *p, size_t n = alignof(T)) { + return 0 == (reinterpret_cast(p) % n); +} TEST(BuddyAllocator, CPUAllocation) { void *p = nullptr; @@ -36,11 +40,13 @@ TEST(BuddyAllocator, CPUMultAlloc) { std::vector ps; ps.reserve(8); - for (auto size : {256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304}) { + for (auto size : + {128, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304}) { ps.emplace_back(paddle::memory::Alloc(cpu, size)); } for (auto p : ps) { + EXPECT_EQ(is_aligned(p, 32), true); paddle::memory::Free(cpu, p); } } @@ -60,4 +66,21 @@ TEST(BuddyAllocator, GPUAllocation) { paddle::memory::Free(gpu, p); } +TEST(BuddyAllocator, GPUMultAlloc) { + paddle::platform::GPUPlace gpu; + + std::vector ps; + ps.reserve(8); + + for (auto size : + {128, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304}) { + ps.emplace_back(paddle::memory::Alloc(gpu, size)); + } + + for (auto p : ps) { + EXPECT_EQ(is_aligned(p, 32), true); + paddle::memory::Free(gpu, p); + } +} + #endif // PADDLE_ONLY_CPU From a751c79331b7cc6066bf6da403dc72c9367aae27 Mon Sep 17 00:00:00 2001 From: Helin Wang Date: Thu, 13 Jul 2017 22:39:07 +0000 Subject: [PATCH 35/64] turn on race detector for all go tests --- cmake/generic.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/generic.cmake b/cmake/generic.cmake index 716955c7b4..25946f7a7b 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -337,7 +337,7 @@ function(go_test TARGET_NAME) string(REPLACE "${PADDLE_GO_PATH}" "" CMAKE_CURRENT_SOURCE_REL_DIR ${CMAKE_CURRENT_SOURCE_DIR}) add_custom_target(${TARGET_NAME} ALL DEPENDS go_vendor ${go_test_DEPS}) add_custom_command(TARGET ${TARGET_NAME} POST_BUILD - COMMAND env GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} test + COMMAND env GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} test -race -c -o "${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}" ".${CMAKE_CURRENT_SOURCE_REL_DIR}" WORKING_DIRECTORY "${PADDLE_IN_GOPATH}/go") From ab5fe1e9071ef67850683442035f27c6c602e126 Mon Sep 17 00:00:00 2001 From: liaogang Date: Fri, 14 Jul 2017 11:52:03 +0800 Subject: [PATCH 36/64] ENH: memory test: check alignment and memory size --- paddle/memory/memory_test.cc | 80 ++++++++++++++++++++++++++++++------ 1 file changed, 67 insertions(+), 13 deletions(-) diff --git a/paddle/memory/memory_test.cc b/paddle/memory/memory_test.cc index 4c9b3311bb..458c8b2e24 100644 --- a/paddle/memory/memory_test.cc +++ b/paddle/memory/memory_test.cc @@ -13,14 +13,36 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/memory/memory.h" -#include "gtest/gtest.h" +#include "paddle/memory/detail/memory_block.h" +#include "paddle/memory/detail/meta_data.h" + +#include "paddle/platform/cpu_info.h" +#include "paddle/platform/gpu_info.h" #include "paddle/platform/place.h" -template -inline bool is_aligned(T *p, size_t n = alignof(T)) { +#include +#include + +inline bool is_aligned(void const *p, const size_t n) { return 0 == (reinterpret_cast(p) % n); } +size_t align(size_t size, paddle::platform::CPUPlace place) { + size += sizeof(paddle::memory::detail::Metadata); + size_t alignment = paddle::platform::CpuMinChunkSize(); + size_t remaining = size % alignment; + return remaining == 0 ? 
size : size + (alignment - remaining); +} + +size_t align(size_t size, paddle::platform::GPUPlace place) { + size += sizeof(paddle::memory::detail::Metadata); + size_t alignment = paddle::platform::GpuMinChunkSize(); + size_t remaining = size % alignment; + return remaining == 0 ? size : size + (alignment - remaining); +} + +void update_size(size_t &total_size, const size_t size) {} + TEST(BuddyAllocator, CPUAllocation) { void *p = nullptr; @@ -37,17 +59,33 @@ TEST(BuddyAllocator, CPUAllocation) { TEST(BuddyAllocator, CPUMultAlloc) { paddle::platform::CPUPlace cpu; - std::vector ps; - ps.reserve(8); + std::unordered_map ps; + + size_t total_size = paddle::memory::Used(cpu); + EXPECT_EQ(total_size, 0UL); for (auto size : {128, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304}) { - ps.emplace_back(paddle::memory::Alloc(cpu, size)); + ps[paddle::memory::Alloc(cpu, size)] = size; + + // Buddy Allocator doesn't manage too large memory chunk + if (paddle::memory::Used(cpu) == total_size) continue; + + size_t aligned_size = align(size, cpu); + total_size += aligned_size; + EXPECT_EQ(total_size, paddle::memory::Used(cpu)); } for (auto p : ps) { - EXPECT_EQ(is_aligned(p, 32), true); - paddle::memory::Free(cpu, p); + EXPECT_EQ(is_aligned(p.first, 32), true); + paddle::memory::Free(cpu, p.first); + + // Buddy Allocator doesn't manage too large memory chunk + if (paddle::memory::Used(cpu) == total_size) continue; + + size_t aligned_size = align(p.second, cpu); + total_size -= aligned_size; + EXPECT_EQ(total_size, paddle::memory::Used(cpu)); } } @@ -69,17 +107,33 @@ TEST(BuddyAllocator, GPUAllocation) { TEST(BuddyAllocator, GPUMultAlloc) { paddle::platform::GPUPlace gpu; - std::vector ps; - ps.reserve(8); + std::unordered_map ps; + + size_t total_size = paddle::memory::Used(gpu); + EXPECT_EQ(total_size, 0UL); for (auto size : {128, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304}) { - ps.emplace_back(paddle::memory::Alloc(gpu, size)); + ps[paddle::memory::Alloc(gpu, size)] = size; + + // Buddy Allocator doesn't manage too large memory chunk + if (paddle::memory::Used(gpu) == total_size) continue; + + size_t aligned_size = align(size, gpu); + total_size += aligned_size; + EXPECT_EQ(total_size, paddle::memory::Used(gpu)); } for (auto p : ps) { - EXPECT_EQ(is_aligned(p, 32), true); - paddle::memory::Free(gpu, p); + EXPECT_EQ(is_aligned(p.first, 32), true); + paddle::memory::Free(gpu, p.first); + + // Buddy Allocator doesn't manage too large memory chunk + if (paddle::memory::Used(gpu) == total_size) continue; + + size_t aligned_size = align(p.second, gpu); + total_size -= aligned_size; + EXPECT_EQ(total_size, paddle::memory::Used(gpu)); } } From 21b7915d9122d29bdb7506ab2e30049653ccf52a Mon Sep 17 00:00:00 2001 From: liaogang Date: Fri, 14 Jul 2017 12:03:09 +0800 Subject: [PATCH 37/64] Fix condition compile --- paddle/memory/memory_test.cc | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/paddle/memory/memory_test.cc b/paddle/memory/memory_test.cc index 458c8b2e24..e13cbabb26 100644 --- a/paddle/memory/memory_test.cc +++ b/paddle/memory/memory_test.cc @@ -34,13 +34,6 @@ size_t align(size_t size, paddle::platform::CPUPlace place) { return remaining == 0 ? size : size + (alignment - remaining); } -size_t align(size_t size, paddle::platform::GPUPlace place) { - size += sizeof(paddle::memory::detail::Metadata); - size_t alignment = paddle::platform::GpuMinChunkSize(); - size_t remaining = size % alignment; - return remaining == 0 ? 
size : size + (alignment - remaining); -} - void update_size(size_t &total_size, const size_t size) {} TEST(BuddyAllocator, CPUAllocation) { @@ -91,6 +84,13 @@ TEST(BuddyAllocator, CPUMultAlloc) { #ifndef PADDLE_ONLY_CPU +size_t align(size_t size, paddle::platform::GPUPlace place) { + size += sizeof(paddle::memory::detail::Metadata); + size_t alignment = paddle::platform::GpuMinChunkSize(); + size_t remaining = size % alignment; + return remaining == 0 ? size : size + (alignment - remaining); +} + TEST(BuddyAllocator, GPUAllocation) { void *p = nullptr; From cd5113c19766c4ae16b2298272263e6e78317225 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Thu, 13 Jul 2017 19:57:06 +0800 Subject: [PATCH 38/64] Init commit --- paddle/framework/op_registry.h | 10 +++++----- paddle/pybind/CMakeLists.txt | 2 +- paddle/pybind/pybind.cc | 16 ++++++++++++++++ 3 files changed, 22 insertions(+), 6 deletions(-) diff --git a/paddle/framework/op_registry.h b/paddle/framework/op_registry.h index 61dfcb7049..bf91b577c7 100644 --- a/paddle/framework/op_registry.h +++ b/paddle/framework/op_registry.h @@ -125,17 +125,17 @@ class OpRegistry { return op; } + static std::unordered_map& protos() { + static std::unordered_map protos_; + return protos_; + }; + private: static std::unordered_map& creators() { static std::unordered_map creators_; return creators_; } - static std::unordered_map& protos() { - static std::unordered_map protos_; - return protos_; - }; - static std::unordered_map& op_checkers() { static std::unordered_map op_checkers_; return op_checkers_; diff --git a/paddle/pybind/CMakeLists.txt b/paddle/pybind/CMakeLists.txt index af85fdeecb..8564a5f5fe 100644 --- a/paddle/pybind/CMakeLists.txt +++ b/paddle/pybind/CMakeLists.txt @@ -1 +1 @@ -cc_library(paddle_pybind SHARED SRCS pybind.cc DEPS pybind python) +cc_library(paddle_pybind SHARED SRCS pybind.cc DEPS pybind python add_op) diff --git a/paddle/pybind/pybind.cc b/paddle/pybind/pybind.cc index f9f87acf15..6a1e9291cb 100644 --- a/paddle/pybind/pybind.cc +++ b/paddle/pybind/pybind.cc @@ -13,12 +13,16 @@ See the License for the specific language governing permissions and limitations under the License. */ #include +#include #include #include +#include namespace py = pybind11; namespace pd = paddle::framework; +USE_OP(add_two); + PYBIND11_PLUGIN(core) { py::module m("core", "C++ core of Paddle Paddle"); @@ -43,5 +47,17 @@ All parameter, weight, gradient are variables in Paddle. &pd::Scope::CreateVariable, py::return_value_policy::reference); + m.def("get_all_op_protos", []() -> std::vector { + auto& protos = pd::OpRegistry::protos(); + std::vector ret_values; + ret_values.reserve(protos.size()); + for (auto it = protos.begin(); it != protos.end(); ++it) { + ret_values.emplace_back(); + PADDLE_ENFORCE(it->second.SerializeToString(&ret_values.back()), + "Serialize OpProto Error. This could be a bug of Paddle."); + } + return ret_values; + }); + return m.ptr(); } From ea916c84874e4a036650bccb2b2eae142a6c36ae Mon Sep 17 00:00:00 2001 From: liaogang Date: Fri, 14 Jul 2017 13:00:03 +0800 Subject: [PATCH 39/64] Fix: alignment metric --- paddle/memory/memory_test.cc | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/paddle/memory/memory_test.cc b/paddle/memory/memory_test.cc index e13cbabb26..2a2cb98a31 100644 --- a/paddle/memory/memory_test.cc +++ b/paddle/memory/memory_test.cc @@ -24,7 +24,7 @@ limitations under the License. 
*/ #include inline bool is_aligned(void const *p, const size_t n) { - return 0 == (reinterpret_cast<uintptr_t>(p) % n); + return 0 == (reinterpret_cast<uintptr_t>(p) & 0x3); } size_t align(size_t size, paddle::platform::CPUPlace place) { @@ -34,8 +34,6 @@ size_t align(size_t size, paddle::platform::CPUPlace place) { return remaining == 0 ? size : size + (alignment - remaining); } -void update_size(size_t &total_size, const size_t size) {} - TEST(BuddyAllocator, CPUAllocation) { void *p = nullptr; From 033523ea9d01bb0e860dbbfd6c5bab72d2c5b149 Mon Sep 17 00:00:00 2001 From: liaogang Date: Fri, 14 Jul 2017 13:02:17 +0800 Subject: [PATCH 40/64] update --- paddle/memory/memory_test.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/paddle/memory/memory_test.cc b/paddle/memory/memory_test.cc index 2a2cb98a31..53cc63a098 100644 --- a/paddle/memory/memory_test.cc +++ b/paddle/memory/memory_test.cc @@ -23,7 +23,7 @@ limitations under the License. */ #include #include -inline bool is_aligned(void const *p, const size_t n) { +inline bool is_aligned(void const *p) { return 0 == (reinterpret_cast<uintptr_t>(p) & 0x3); } @@ -68,7 +68,7 @@ TEST(BuddyAllocator, CPUMultAlloc) { } for (auto p : ps) { - EXPECT_EQ(is_aligned(p.first, 32), true); + EXPECT_EQ(is_aligned(p.first), true); paddle::memory::Free(cpu, p.first); @@ -123,7 +123,7 @@ TEST(BuddyAllocator, GPUMultAlloc) { } for (auto p : ps) { - EXPECT_EQ(is_aligned(p.first, 32), true); + EXPECT_EQ(is_aligned(p.first), true); paddle::memory::Free(gpu, p.first); From 8da5587205a0f613ed32273226739df3e82e8d8d Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Fri, 14 Jul 2017 13:28:49 +0800 Subject: [PATCH 41/64] Init commit --- .../v2/framework/create_op_creation_methods.py | 12 ++++++++++++ python/paddle/v2/framework/tests/CMakeLists.txt | 2 +- .../framework/tests/test_op_creation_methods.py | 15 +++++++++++++++ 3 files changed, 28 insertions(+), 1 deletion(-) create mode 100644 python/paddle/v2/framework/create_op_creation_methods.py create mode 100644 python/paddle/v2/framework/tests/test_op_creation_methods.py diff --git a/python/paddle/v2/framework/create_op_creation_methods.py b/python/paddle/v2/framework/create_op_creation_methods.py new file mode 100644 index 0000000000..14beaadc9a --- /dev/null +++ b/python/paddle/v2/framework/create_op_creation_methods.py @@ -0,0 +1,12 @@ +import paddle.v2.framework.core as core +import paddle.v2.framework.proto.op_proto_pb2 as op_proto_pb2 + + +def get_all_op_protos(): + protostrs = core.get_all_op_protos() + ret_values = [] + for pbstr in protostrs: + op_proto = op_proto_pb2.OpProto() + op_proto.ParseFromString(pbstr) + ret_values.append(op_proto) + return ret_values diff --git a/python/paddle/v2/framework/tests/CMakeLists.txt b/python/paddle/v2/framework/tests/CMakeLists.txt index 7023e82b5f..86fc60f26a 100644 --- a/python/paddle/v2/framework/tests/CMakeLists.txt +++ b/python/paddle/v2/framework/tests/CMakeLists.txt @@ -1,2 +1,2 @@ add_python_test(test_framework test_protobuf.py test_scope.py - test_default_scope_funcs.py) + test_default_scope_funcs.py test_op_creation_methods.py) diff --git a/python/paddle/v2/framework/tests/test_op_creation_methods.py b/python/paddle/v2/framework/tests/test_op_creation_methods.py new file mode 100644 index 0000000000..b205e2cabb --- /dev/null +++ b/python/paddle/v2/framework/tests/test_op_creation_methods.py @@ -0,0 +1,15 @@ +import unittest +import
paddle.v2.framework.create_op_creation_methods as creation + + +class TestOpCreationsMethods(unittest.TestCase): + def test_all_protos(self): + all_protos = creation.get_all_op_protos() + self.assertNotEqual(0, len(all_protos)) + + for each in all_protos: + self.assertTrue(each.IsInitialized()) + + +if __name__ == "__main__": + unittest.main() From 010adb99b4d79d0babea132ca8ffb2b9dc048017 Mon Sep 17 00:00:00 2001 From: hedaoyuan Date: Fri, 14 Jul 2017 13:40:49 +0800 Subject: [PATCH 42/64] Remove useless empty pointer check. --- paddle/api/ConfigParser.cpp | 12 ++---------- paddle/api/ParameterOptimizer.cpp | 12 ++---------- paddle/api/Vector.cpp | 6 +----- paddle/gserver/dataproviders/DataProvider.h | 6 ++---- paddle/math/Storage.cpp | 4 +--- paddle/trainer/TrainerConfigHelper.cpp | 6 +----- 6 files changed, 9 insertions(+), 37 deletions(-) diff --git a/paddle/api/ConfigParser.cpp b/paddle/api/ConfigParser.cpp index 2f45173bfd..b6ff6ec789 100644 --- a/paddle/api/ConfigParser.cpp +++ b/paddle/api/ConfigParser.cpp @@ -64,11 +64,7 @@ ModelConfig* TrainerConfig::getModelConfig() const { ParameterConfig::ParameterConfig() : m(new ParameterConfigPrivate()) {} -ParameterConfig::~ParameterConfig() { - if (m) { - delete m; - } -} +ParameterConfig::~ParameterConfig() { delete m; } ParameterConfig* ParameterConfig::createParameterConfigFromParameterSharedPtr( void* ptr) { @@ -98,11 +94,7 @@ void* ParameterConfig::getRawPtr() { return m->getConfigPtr(); } OptimizationConfig::OptimizationConfig() : m(new OptimizationConfigPrivate()) {} -OptimizationConfig::~OptimizationConfig() { - if (m) { - delete m; - } -} +OptimizationConfig::~OptimizationConfig() { delete m; } std::string OptimizationConfig::toProtoString() { return m->getConfig().SerializeAsString(); diff --git a/paddle/api/ParameterOptimizer.cpp b/paddle/api/ParameterOptimizer.cpp index 21b851dd5e..120eea3f70 100644 --- a/paddle/api/ParameterOptimizer.cpp +++ b/paddle/api/ParameterOptimizer.cpp @@ -53,11 +53,7 @@ struct ParameterTraverseCallbackPrivate { ParameterOptimizer::ParameterOptimizer() : m(new ParameterOptimizerPrivate()) {} -ParameterOptimizer::~ParameterOptimizer() { - if (m) { - delete m; - } -} +ParameterOptimizer::~ParameterOptimizer() { delete m; } ParameterOptimizer* ParameterOptimizer::create(OptimizationConfig* config) { CHECK(config != nullptr); @@ -104,11 +100,7 @@ std::vector ParameterOptimizer::getParameterTypes() const { ParameterTraverseCallback::ParameterTraverseCallback() : m(new ParameterTraverseCallbackPrivate()) {} -ParameterTraverseCallback::~ParameterTraverseCallback() { - if (m) { - delete m; - } -} +ParameterTraverseCallback::~ParameterTraverseCallback() { delete m; } void ParameterTraverseCallback::apply(const std::vector& vecs, const ParameterConfig& conf, diff --git a/paddle/api/Vector.cpp b/paddle/api/Vector.cpp index db8f005929..500bc448c9 100644 --- a/paddle/api/Vector.cpp +++ b/paddle/api/Vector.cpp @@ -171,11 +171,7 @@ struct VectorPrivate { Vector::Vector() : m(new VectorPrivate()) {} -Vector::~Vector() { - if (m) { - delete m; - } -} +Vector::~Vector() { delete m; } Vector* Vector::createZero(size_t sz, bool useGpu) { auto retVec = new Vector(); diff --git a/paddle/gserver/dataproviders/DataProvider.h b/paddle/gserver/dataproviders/DataProvider.h index 4003676217..265dbb5493 100644 --- a/paddle/gserver/dataproviders/DataProvider.h +++ b/paddle/gserver/dataproviders/DataProvider.h @@ -205,10 +205,8 @@ public: hl_destroy_event(hlEvent_); hlEvent_ = NULL; } - if (batchData_) { - delete batchData_; - 
batchData_ = NULL; - } + delete batchData_; + batchData_ = NULL; } void setDataBatch(DataBatch* batchData) { batchData_ = batchData; } diff --git a/paddle/math/Storage.cpp b/paddle/math/Storage.cpp index 7ce17a3207..4adaaef983 100644 --- a/paddle/math/Storage.cpp +++ b/paddle/math/Storage.cpp @@ -32,9 +32,7 @@ static InitFunction __init_storage_engine([]() { StorageEngine::singleton(); }, StorageEngine::StorageEngine() : cpuAllocator_(nullptr) {} StorageEngine::~StorageEngine() { - if (cpuAllocator_) { - delete cpuAllocator_; - } + delete cpuAllocator_; for (auto it : gpuAllocator_) { delete it; } diff --git a/paddle/trainer/TrainerConfigHelper.cpp b/paddle/trainer/TrainerConfigHelper.cpp index 60ac8459a1..133e2be104 100644 --- a/paddle/trainer/TrainerConfigHelper.cpp +++ b/paddle/trainer/TrainerConfigHelper.cpp @@ -62,11 +62,7 @@ TrainerConfigHelper::TrainerConfigHelper(const TrainerConfig &config) m->conf = config; } -TrainerConfigHelper::~TrainerConfigHelper() { - if (m) { - delete m; - } -} +TrainerConfigHelper::~TrainerConfigHelper() { delete m; } const TrainerConfig &TrainerConfigHelper::getConfig() const { return m->conf; } From 2462d0c5fedb783a322170ff15f828e63b612ead Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Fri, 14 Jul 2017 00:50:46 -0500 Subject: [PATCH 43/64] Let OpProto support multiple and temporary (#2860) * Let OpProto support multiple and temporary * Each input/output of Paddle's Op could be a list. Add multiple mark to OpProto. Also add a `input_format`/`output_format` attribute if that Op has multiple input or output. The format of that attribute please reference the comments in `op_proto.proto` * Add temporary mark, because some output of an Op is not used by user but used by other op for faster computation. Explicitly mark which output is temporary could let future memory/computation optimization. * Add generated field to AttrProto. * Add `AddInputs`/`AddOutputs` function * It is more readable to invoke `AddInputs` not `AddInput(multiple=true)`. --- paddle/framework/op_proto.proto | 39 +++++++++++ paddle/framework/op_registry.h | 97 +++++++++++++++++++++++++++- paddle/framework/op_registry_test.cc | 15 ++++- 3 files changed, 146 insertions(+), 5 deletions(-) diff --git a/paddle/framework/op_proto.proto b/paddle/framework/op_proto.proto index 22df6f9c6b..596b8588e7 100644 --- a/paddle/framework/op_proto.proto +++ b/paddle/framework/op_proto.proto @@ -34,6 +34,11 @@ message AttrProto { // Supported attribute comments. It helps 3rd-party language generate doc-string. required string comment = 3; + + // If that attribute is generated, it means the Paddle third language + // binding has responsibility to fill that attribute. End-User should + // not set that attribute. + optional bool generated = 4 [default=false]; } // Input or output message for 3rd-party language binding. @@ -45,6 +50,40 @@ message VarProto { // The comment for that input. It helps 3rd-party language generate doc-string. required string comment = 2; + + // Is that input/output could be a list or not. + // If so, that Op should write a attributed named `input_format` or + // `output_format`. + // + // e.g. + // If the op is a fc op, the inputs are `X`, `W`, `b`. The `X` and `W` + // could be multiple, so the multiple of `X` and `W` is True, and OpDesc + // will hold a attribute of them. 
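+ // (Put differently: input_format stores the start offset of each logical input within the flattened input list, plus one trailing end offset.)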
+ // + // The Op desc of same fc could be + // { + // "type": "fc", + // "input": ["X1", "X2", "W1", "W2", "b"], + // "output": "fc.out", + // "attrs" : { + // "input_format": [0, 2, 4, 5] + // } + // } + // + optional bool multiple = 3 [default=false]; + + // It marks that output is a temporary output. That output is not used by + // user, but used by other op internally as input. If other op is not use + // that output, it could be optimized early. + // + // Attribute temporary_index will be set in OpDesc if there is some + // outputs are temporary. + // + // output = [ "xxx.out1", "xxx.tmp", "xxx.out2"], + // attrs = { + // "temporary_index": [1] + // } + optional bool temporary = 4 [default=false]; } // Op protocol message for 3rd-party language binding. diff --git a/paddle/framework/op_registry.h b/paddle/framework/op_registry.h index 61dfcb7049..d049599a2f 100644 --- a/paddle/framework/op_registry.h +++ b/paddle/framework/op_registry.h @@ -2,6 +2,8 @@ #include #include +#include +#include #include "paddle/framework/attr_checker.h" #include "paddle/framework/op_desc.pb.h" #include "paddle/framework/op_proto.pb.h" @@ -59,25 +61,52 @@ class OpProtoAndCheckerMaker { OpProtoAndCheckerMaker(OpProto* proto, OpAttrChecker* op_checker) : proto_(proto), op_checker_(op_checker) {} + ~OpProtoAndCheckerMaker() { CheckNoDuplicatedAttrs(); } + protected: - void AddInput(const std::string& name, const std::string& comment) { + void AddInput(const std::string& name, const std::string& comment, + bool multiple = false) { auto input = proto_->mutable_inputs()->Add(); *input->mutable_name() = name; *input->mutable_comment() = comment; + input->set_multiple(multiple); + if (multiple) { + SetHasMultipleInput(); + } + } + + void AddInputs(const std::string& name, const std::string& comment) { + AddInput(name, comment, true); } - void AddOutput(const std::string& name, const std::string& comment) { + void AddOutput(const std::string& name, const std::string& comment, + bool temporary = false, bool multiple = false) { auto output = proto_->mutable_outputs()->Add(); *output->mutable_name() = name; *output->mutable_comment() = comment; + output->set_multiple(multiple); + if (multiple) { + SetHasMultipleOutput(); + } + output->set_temporary(temporary); + if (temporary) { + SetHasTemporaryOutput(); + } + } + + void AddOutputs(const std::string& name, const std::string& comment, + bool temporary = false) { + AddOutput(name, comment, temporary, true); } template TypedAttrChecker& AddAttr(const std::string& name, - const std::string& comment) { + const std::string& comment, + bool generated = false) { auto attr = proto_->mutable_attrs()->Add(); *attr->mutable_name() = name; *attr->mutable_comment() = comment; + attr->set_generated(generated); AttrTypeHelper::SetAttrType(attr); return op_checker_->AddAttrChecker(name); } @@ -86,8 +115,70 @@ class OpProtoAndCheckerMaker { *(proto_->mutable_comment()) = comment; } + private: + void SetHasMultiple(const std::string& in_out, bool* flag) { + if (!*flag) { + AddAttr>(in_out + "_format", + "The multiple index of " + in_out + + "\n" + R"DOC( +This attribute is used by Paddle core framework. Paddle's Op support each input +or output could be a list of variable. This attribute is used to show how that +list organized. + +e.g. + input = ["a", "b", "c", "d", "e", "f"] + input_format = [0, 4, 5, 6] + +means + The number of all input variables this op is six, and they are segmented into + three inputs. + + The first input is input[0:4], second is input[4:5], third is input[5:6]. 
+)DOC", + /*generated*/ true); + *flag = true; + } + } + + void SetHasMultipleInput() { SetHasMultiple("input", &has_multiple_input_); } + void SetHasMultipleOutput() { + SetHasMultiple("output", &has_multiple_output_); + } + + void SetHasTemporaryOutput() { + if (!has_temporary_output_) { + AddAttr>("temporary_index", + R"DOC(The temporary index of output. + +Not all output of Paddle Op is used by user. For faster computation, each op +could output some its internal state to other op, other op could take that +output to make compute faster. + +Add a mark to which output is temporary is helpful for future optimization. +)DOC", + /*generated*/ true) + .SetDefault(std::vector()); + has_temporary_output_ = true; + } + } + + void CheckNoDuplicatedAttrs() { + std::unordered_set names; + size_t cnt = 0; + for (auto& attr : proto_->attrs()) { + names.insert(attr.name()); + ++cnt; + } + PADDLE_ENFORCE(names.size() == cnt, + "Cannot register two attribute in same name!"); + } + OpProto* proto_; OpAttrChecker* op_checker_; + bool has_multiple_input_{false}; + bool has_multiple_output_{false}; + bool has_temporary_output_{false}; }; class OpRegistry { diff --git a/paddle/framework/op_registry_test.cc b/paddle/framework/op_registry_test.cc index 9bcc0407ad..1adafa3714 100644 --- a/paddle/framework/op_registry_test.cc +++ b/paddle/framework/op_registry_test.cc @@ -36,8 +36,9 @@ class MyTestOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker { public: MyTestOpProtoAndCheckerMaker(OpProto* proto, OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { - AddInput("input", "input of cosine op"); - AddOutput("output", "output of cosine op"); + AddInputs("input", "input of cosine op"); + AddOutput("output", "output of cosine op", + /*temporary*/ true); auto my_checker = [](int i) { PADDLE_ENFORCE(i % 2 == 0, "'test_attr' must be even!"); }; @@ -117,11 +118,20 @@ TEST(OpRegistry, DefaultValue) { ASSERT_EQ(op->GetAttr("scale"), 1.0); } +static void SetInputFormat(paddle::framework::OpDesc* desc) { + auto attr = desc->add_attrs(); + attr->set_name("input_format"); + attr->set_type(paddle::framework::INTS); + attr->mutable_ints()->Add(0); + attr->mutable_ints()->Add(1); +} + TEST(OpRegistry, CustomChecker) { paddle::framework::OpDesc op_desc; op_desc.set_type("my_test_op"); op_desc.add_inputs("ii"); op_desc.add_outputs("oo"); + SetInputFormat(&op_desc); // attr 'test_attr' is not set bool caught = false; @@ -163,6 +173,7 @@ TEST(OpRegistry, CustomChecker) { attr->set_name("test_attr"); attr->set_type(paddle::framework::AttrType::INT); attr->set_i(4); + SetInputFormat(&op_desc); paddle::framework::OperatorBase* op = paddle::framework::OpRegistry::CreateOp(op_desc); paddle::platform::CPUDeviceContext dev_ctx; From 58f3de95cf34d8c826221781e8a8dbea954e7069 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Fri, 14 Jul 2017 14:56:49 +0800 Subject: [PATCH 44/64] Optimize ptr (#2851) * use OperatorPtr = std::shared_ptr; * use ScopePtr = std::share_ptr; --- paddle/framework/net.cc | 4 +- paddle/framework/net.h | 13 +++--- paddle/framework/op_registry.h | 4 +- paddle/framework/op_registry_test.cc | 20 +++++----- paddle/framework/operator.h | 12 +++--- paddle/framework/operator_test.cc | 59 +++++++++++++++++++++++----- paddle/framework/scope.h | 7 +++- 7 files changed, 82 insertions(+), 37 deletions(-) diff --git a/paddle/framework/net.cc b/paddle/framework/net.cc index 73b3051235..854ad8e33e 100644 --- a/paddle/framework/net.cc +++ b/paddle/framework/net.cc @@ -5,13 +5,13 @@ namespace framework { 
PlainNet::PlainNet(const NetDesc& def) {} -void PlainNet::InferShape(Scope* scope) { +void PlainNet::InferShape(const ScopePtr& scope) const { for (auto& op : ops_) { op.InferShape(); } } -void PlainNet::Run(std::shared_ptr scope, DeviceContext* ctx) { +void PlainNet::Run(const ScopePtr& scope, const DeviceContext& ctx) const { for (auto& op : ops_) { op.Run(ctx); } diff --git a/paddle/framework/net.h b/paddle/framework/net.h index 76992e0728..0481d8f47c 100644 --- a/paddle/framework/net.h +++ b/paddle/framework/net.h @@ -37,8 +37,8 @@ struct OpAttrs {}; class Operator { public: Operator(const OpDesc &def) {} - void InferShape() {} - void Run(DeviceContext *ctx) {} + void InferShape() const {} + void Run(const DeviceContext &ctx) const {} }; /** @@ -60,7 +60,7 @@ class Net { /** * @brief Infer shapes of all inputs and outputs of operators. */ - virtual void InferShape(Scope *scope) = 0; + virtual void InferShape(const ScopePtr &scope) const = 0; /** * @brief Run the network. * @@ -69,7 +69,7 @@ class Net { * environment for ops. `begin` and `end` specify the scope of `ops_` to run, * If no positive indexes are provided, all operators in `ops_` will run. */ - virtual void Run(std::shared_ptr scope, DeviceContext *ctx) = 0; + virtual void Run(const ScopePtr &scope, const DeviceContext &ctx) const = 0; /** * @brief Add an Operator according to `def`. @@ -114,7 +114,7 @@ class PlainNet : public Net { * Infer all the operators' input and output varialbes' shapes, will be called * before every mini-batch */ - virtual void InferShape(Scope *scope) override; + virtual void InferShape(const ScopePtr &scope) const override; /** * @brief Run the network. @@ -123,7 +123,8 @@ class PlainNet : public Net { * scope will be used instead. If no OpContext is provicded, default context * will be used. */ - virtual void Run(std::shared_ptr scope, DeviceContext *ctx) override; + virtual void Run(const ScopePtr &scope, + const DeviceContext &ctx) const override; /** * @brief Add an operator to this network. 
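With those aliases, creating and running an operator no longer needs a manual delete; the following sketch mirrors the updated tests later in this patch:

paddle::framework::OperatorPtr op =
    paddle::framework::OpRegistry::CreateOp(op_desc);
op->Run(scope, dev_ctx);
// No `delete op;` needed: the shared_ptr releases the operator with its last reference.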
diff --git a/paddle/framework/op_registry.h b/paddle/framework/op_registry.h index d049599a2f..6be6ae15c2 100644 --- a/paddle/framework/op_registry.h +++ b/paddle/framework/op_registry.h @@ -198,9 +198,9 @@ class OpRegistry { op_type, op_proto.InitializationErrorString()); } - static OperatorBase* CreateOp(const OpDesc& op_desc) { + static OperatorPtr CreateOp(const OpDesc& op_desc) { std::string op_type = op_desc.type(); - OperatorBase* op = creators().at(op_type)(); + OperatorPtr op(creators().at(op_type)()); op->desc_ = op_desc; op->inputs_.reserve((size_t)op_desc.inputs_size()); std::copy(op_desc.inputs().begin(), op_desc.inputs().end(), diff --git a/paddle/framework/op_registry_test.cc b/paddle/framework/op_registry_test.cc index 1adafa3714..4791d4aaab 100644 --- a/paddle/framework/op_registry_test.cc +++ b/paddle/framework/op_registry_test.cc @@ -5,9 +5,9 @@ namespace paddle { namespace framework { class CosineOp : public OperatorBase { public: - void Run(const std::shared_ptr& scope, + void Run(const ScopePtr& scope, const platform::DeviceContext& dev_ctx) const override {} - void InferShape(const std::shared_ptr& scope) const override {} + void InferShape(const ScopePtr& scope) const override {} }; class CosineOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker { @@ -25,8 +25,8 @@ class CosineOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker { class MyTestOp : public OperatorBase { public: - void InferShape(const std::shared_ptr& scope) const override {} - void Run(const std::shared_ptr& scope, + void InferShape(const ScopePtr& scope) const override {} + void Run(const ScopePtr& scope, const platform::DeviceContext& dev_ctx) const override {} public: @@ -67,7 +67,7 @@ TEST(OpRegistry, CreateOp) { attr->set_type(paddle::framework::AttrType::FLOAT); attr->set_f(scale); - paddle::framework::OperatorBase* op = + paddle::framework::OperatorPtr op = paddle::framework::OpRegistry::CreateOp(op_desc); auto scope = std::make_shared(); paddle::platform::CPUDeviceContext dev_ctx; @@ -89,7 +89,7 @@ TEST(OpRegistry, IllegalAttr) { bool caught = false; try { - paddle::framework::OperatorBase* op __attribute__((unused)) = + paddle::framework::OperatorPtr op __attribute__((unused)) = paddle::framework::OpRegistry::CreateOp(op_desc); } catch (paddle::framework::EnforceNotMet err) { caught = true; @@ -110,7 +110,7 @@ TEST(OpRegistry, DefaultValue) { ASSERT_TRUE(op_desc.IsInitialized()); - paddle::framework::OperatorBase* op = + paddle::framework::OperatorPtr op = paddle::framework::OpRegistry::CreateOp(op_desc); auto scope = std::make_shared(); paddle::platform::CPUDeviceContext dev_ctx; @@ -136,7 +136,7 @@ TEST(OpRegistry, CustomChecker) { // attr 'test_attr' is not set bool caught = false; try { - paddle::framework::OperatorBase* op __attribute__((unused)) = + paddle::framework::OperatorPtr op __attribute__((unused)) = paddle::framework::OpRegistry::CreateOp(op_desc); } catch (paddle::framework::EnforceNotMet err) { caught = true; @@ -155,7 +155,7 @@ TEST(OpRegistry, CustomChecker) { attr->set_i(3); caught = false; try { - paddle::framework::OperatorBase* op __attribute__((unused)) = + paddle::framework::OperatorPtr op __attribute__((unused)) = paddle::framework::OpRegistry::CreateOp(op_desc); } catch (paddle::framework::EnforceNotMet err) { caught = true; @@ -174,7 +174,7 @@ TEST(OpRegistry, CustomChecker) { attr->set_type(paddle::framework::AttrType::INT); attr->set_i(4); SetInputFormat(&op_desc); - paddle::framework::OperatorBase* op = + paddle::framework::OperatorPtr op = 
paddle::framework::OpRegistry::CreateOp(op_desc); paddle::platform::CPUDeviceContext dev_ctx; auto scope = std::make_shared(); diff --git a/paddle/framework/operator.h b/paddle/framework/operator.h index d3c55e0ceb..cf79f379fa 100644 --- a/paddle/framework/operator.h +++ b/paddle/framework/operator.h @@ -30,7 +30,7 @@ namespace paddle { namespace framework { class OperatorBase; - +using OperatorPtr = std::shared_ptr; /** * OperatorBase has the basic element that Net will call to do computation. * Only CreateOperator from OpRegistry will new Operator directly. User @@ -56,10 +56,10 @@ class OperatorBase { /// InferShape infer the size of Variables used by this Operator with /// information inside scope - virtual void InferShape(const std::shared_ptr& scope) const = 0; + virtual void InferShape(const ScopePtr& scope) const = 0; /// Net will call this function to Run an op. - virtual void Run(const std::shared_ptr& scope, + virtual void Run(const ScopePtr& scope, const platform::DeviceContext& dev_ctx) const = 0; protected: @@ -82,7 +82,7 @@ class OpKernel { */ class KernelContext { public: - KernelContext(const OperatorBase* op, const std::shared_ptr& scope, + KernelContext(const OperatorBase* op, const ScopePtr& scope, const platform::DeviceContext& device_context) : op_(*op), scope_(scope), device_context_(device_context) {} @@ -95,7 +95,7 @@ class OpKernel { } const OperatorBase& op_; - const std::shared_ptr& scope_; + const ScopePtr& scope_; const platform::DeviceContext& device_context_; }; @@ -140,7 +140,7 @@ class OperatorWithKernel : public OperatorBase { using OpKernelMap = std::unordered_map, OpKernelHash>; - void Run(const std::shared_ptr& scope, + void Run(const ScopePtr& scope, const platform::DeviceContext& dev_ctx) const final { auto& opKernel = AllOpKernels().at(Type()).at(OpKernelKey(dev_ctx)); opKernel->Compute(OpKernel::KernelContext(this, scope, dev_ctx)); diff --git a/paddle/framework/operator_test.cc b/paddle/framework/operator_test.cc index 204b601a4a..d0c3153fae 100644 --- a/paddle/framework/operator_test.cc +++ b/paddle/framework/operator_test.cc @@ -22,8 +22,8 @@ namespace framework { class OperatorTest : public OperatorBase { public: void Init() override { x = 1; } - void InferShape(const std::shared_ptr& scope) const override {} - void Run(const std::shared_ptr& scope, + void InferShape(const ScopePtr& scope) const override {} + void Run(const ScopePtr& scope, const platform::DeviceContext& dev_ctx) const override { float scale = GetAttr("scale"); ASSERT_NEAR(scale, 3.14, 1e-5); @@ -36,6 +36,50 @@ class OperatorTest : public OperatorBase { float x = 0; }; +class OperatorTestProtoAndCheckerMaker : public OpProtoAndCheckerMaker { + public: + OperatorTestProtoAndCheckerMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("input", "input of test op"); + AddOutput("output", "output of test op"); + AddAttr("scale", "scale of cosine op") + .SetDefault(1.0) + .LargerThan(0.0); + AddComment("This is test op"); + } +}; + +} // namespace framework +} // namespace paddle + +REGISTER_OP(test_operator, paddle::framework::OperatorTest, + paddle::framework::OperatorTestProtoAndCheckerMaker); + +TEST(OperatorBase, all) { + paddle::framework::OpDesc op_desc; + op_desc.set_type("test_operator"); + *op_desc.mutable_inputs()->Add() = "IN1"; + *op_desc.mutable_outputs()->Add() = "OUT1"; + auto attr = op_desc.mutable_attrs()->Add(); + attr->set_name("scale"); + attr->set_type(paddle::framework::AttrType::FLOAT); + float scale = 
3.14; + attr->set_f(scale); + + paddle::platform::CPUDeviceContext device_context; + auto scope = std::make_shared(); + + paddle::framework::OperatorPtr op = + paddle::framework::OpRegistry::CreateOp(op_desc); + ASSERT_EQ(op->GetAttr("scale"), scale); + scope->CreateVariable("OUT1"); + op->Run(scope, device_context); + std::cout << op->DebugString() << std::endl; +} + +namespace paddle { +namespace framework { + class OpKernelTestProtoAndCheckerMaker : public OpProtoAndCheckerMaker { public: OpKernelTestProtoAndCheckerMaker(OpProto* proto, OpAttrChecker* op_checker) @@ -73,9 +117,7 @@ REGISTER_OP(op_with_kernel, paddle::framework::OpWithKernelTest, REGISTER_OP_CPU_KERNEL(op_with_kernel, paddle::framework::CPUKernelTest); TEST(OpKernel, all) { - using namespace paddle::framework; - - OpDesc op_desc; + paddle::framework::OpDesc op_desc; op_desc.set_type("op_with_kernel"); *op_desc.mutable_inputs()->Add() = "IN1"; *op_desc.mutable_outputs()->Add() = "OUT1"; @@ -85,10 +127,9 @@ TEST(OpKernel, all) { attr->set_f(3.14); paddle::platform::CPUDeviceContext cpu_device_context; - auto scope = std::make_shared(); + auto scope = std::make_shared(); - OperatorBase* op = paddle::framework::OpRegistry::CreateOp(op_desc); + paddle::framework::OperatorPtr op = + paddle::framework::OpRegistry::CreateOp(op_desc); op->Run(scope, cpu_device_context); - - delete op; } diff --git a/paddle/framework/scope.h b/paddle/framework/scope.h index a4470f726f..ec62c9189f 100644 --- a/paddle/framework/scope.h +++ b/paddle/framework/scope.h @@ -23,6 +23,9 @@ limitations under the License. */ namespace paddle { namespace framework { +class Scope; +using ScopePtr = std::shared_ptr; + /** * @brief Scope that manage all variables. * @@ -41,7 +44,7 @@ class Scope { /** * @brief Initialize a Scope with parent. */ - explicit Scope(const std::shared_ptr& parent) : parent_(parent) {} + explicit Scope(const ScopePtr& parent) : parent_(parent) {} /** * @brief Create Variable @@ -88,7 +91,7 @@ class Scope { private: std::unordered_map> vars_; - std::shared_ptr parent_{nullptr}; + ScopePtr parent_{nullptr}; }; } // namespace framework From a76f7ed2eb02c9beb5e7b16ff7fede0f13477df8 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Fri, 14 Jul 2017 15:04:44 +0800 Subject: [PATCH 45/64] Get OpProtos in Python * PyBind and SWIG of paddle cannot be load in a single Python process, lazy import all SWIG library of Paddle. Otherwise, the glog, gflags are imported twice in a same Python process. * Note that all PyBind11 return C++ std::string as an unicode. For protobuf, it is need be cast to `str` before use them. * Add unit test for Get `OpProtos` --- paddle/pybind/pybind.cc | 7 ++++++- python/paddle/v2/__init__.py | 4 +--- python/paddle/v2/data_feeder.py | 1 - python/paddle/v2/event.py | 3 +-- .../paddle/v2/framework/create_op_creation_methods.py | 3 +-- python/paddle/v2/inference.py | 4 ++-- python/paddle/v2/optimizer.py | 5 +++-- python/paddle/v2/parameters.py | 5 +++-- python/paddle/v2/trainer.py | 11 +++++------ python/setup.py.in | 3 ++- 10 files changed, 24 insertions(+), 22 deletions(-) diff --git a/paddle/pybind/pybind.cc b/paddle/pybind/pybind.cc index 6a1e9291cb..c1a025ed04 100644 --- a/paddle/pybind/pybind.cc +++ b/paddle/pybind/pybind.cc @@ -16,6 +16,8 @@ limitations under the License. */ #include #include #include +#include +#include #include namespace py = pybind11; @@ -47,11 +49,14 @@ All parameter, weight, gradient are variables in Paddle. &pd::Scope::CreateVariable, py::return_value_policy::reference); + //! @note: Be careful! 
PyBind will return std::string as an unicode, not + //! Python str. If you want a str object, you should cast them in Python. m.def("get_all_op_protos", []() -> std::vector { auto& protos = pd::OpRegistry::protos(); std::vector ret_values; - ret_values.reserve(protos.size()); for (auto it = protos.begin(); it != protos.end(); ++it) { + PADDLE_ENFORCE(it->second.IsInitialized(), + "OpProto must all be initialized"); ret_values.emplace_back(); PADDLE_ENFORCE(it->second.SerializeToString(&ret_values.back()), "Serialize OpProto Error. This could be a bug of Paddle."); diff --git a/python/paddle/v2/__init__.py b/python/paddle/v2/__init__.py index 3ba5c31871..3c75ca4c3a 100644 --- a/python/paddle/v2/__init__.py +++ b/python/paddle/v2/__init__.py @@ -20,7 +20,6 @@ import trainer import event import data_type import topology -import data_feeder import networks import evaluator from . import dataset @@ -31,7 +30,6 @@ import op import pooling import inference import networks -import py_paddle.swig_paddle as api import minibatch import plot import image @@ -47,7 +45,6 @@ __all__ = [ 'data_type', 'attr', 'pooling', - 'data_feeder', 'dataset', 'reader', 'topology', @@ -61,6 +58,7 @@ __all__ = [ def init(**kwargs): + import py_paddle.swig_paddle as api args = [] args_dict = {} # NOTE: append arguments if they are in ENV diff --git a/python/paddle/v2/data_feeder.py b/python/paddle/v2/data_feeder.py index 2698251b9e..98dfb85a0e 100644 --- a/python/paddle/v2/data_feeder.py +++ b/python/paddle/v2/data_feeder.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - from py_paddle import DataProviderConverter import collections import paddle.trainer.PyDataProvider2 as pydp2 diff --git a/python/paddle/v2/event.py b/python/paddle/v2/event.py index fd6050fa33..7589cc9917 100644 --- a/python/paddle/v2/event.py +++ b/python/paddle/v2/event.py @@ -9,8 +9,6 @@ There are: * BeginPass * EndPass """ -import py_paddle.swig_paddle as api - __all__ = [ 'EndIteration', 'BeginIteration', 'BeginPass', 'EndPass', 'TestResult' ] @@ -18,6 +16,7 @@ __all__ = [ class WithMetric(object): def __init__(self, evaluator): + import py_paddle.swig_paddle as api if not isinstance(evaluator, api.Evaluator): raise TypeError("Evaluator should be api.Evaluator type") self.__evaluator__ = evaluator diff --git a/python/paddle/v2/framework/create_op_creation_methods.py b/python/paddle/v2/framework/create_op_creation_methods.py index 14beaadc9a..2fcdfead25 100644 --- a/python/paddle/v2/framework/create_op_creation_methods.py +++ b/python/paddle/v2/framework/create_op_creation_methods.py @@ -6,7 +6,6 @@ def get_all_op_protos(): protostrs = core.get_all_op_protos() ret_values = [] for pbstr in protostrs: - op_proto = op_proto_pb2.OpProto() - op_proto.ParseFromString(pbstr) + op_proto = op_proto_pb2.OpProto.FromString(str(pbstr)) ret_values.append(op_proto) return ret_values diff --git a/python/paddle/v2/inference.py b/python/paddle/v2/inference.py index 34b7308601..40134a3270 100644 --- a/python/paddle/v2/inference.py +++ b/python/paddle/v2/inference.py @@ -1,9 +1,7 @@ import numpy -import py_paddle.swig_paddle as api import collections import topology import minibatch -from data_feeder import DataFeeder __all__ = ['infer', 'Inference'] @@ -28,6 +26,7 @@ class Inference(object): """ def __init__(self, output_layer, parameters): + import py_paddle.swig_paddle as api topo = topology.Topology(output_layer) gm 
= api.GradientMachine.createFromConfigProto( topo.proto(), api.CREATE_MODE_TESTING, [api.PARAMETER_VALUE]) @@ -40,6 +39,7 @@ class Inference(object): self.__data_types__ = topo.data_type() def iter_infer(self, input, feeding=None): + from data_feeder import DataFeeder feeder = DataFeeder(self.__data_types__, feeding) batch_size = len(input) diff --git a/python/paddle/v2/optimizer.py b/python/paddle/v2/optimizer.py index 390c22ee55..3dec340cfb 100644 --- a/python/paddle/v2/optimizer.py +++ b/python/paddle/v2/optimizer.py @@ -1,5 +1,3 @@ -import py_paddle.swig_paddle as swig_api - import paddle.trainer_config_helpers.config_parser_utils as config_parser_utils import paddle.trainer_config_helpers.optimizers as v1_optimizers """ @@ -26,6 +24,8 @@ class Optimizer(object): self.__opt_conf_proto__ = config_parser_utils.parse_optimizer_config( __impl__) + if swig_api is None: + raise RuntimeError("paddle.v2 currently need swig_paddle") self.__opt_conf__ = swig_api.OptimizationConfig.createFromProto( self.__opt_conf_proto__) @@ -268,6 +268,7 @@ ModelAverage = v1_optimizers.ModelAverage L2Regularization = v1_optimizers.L2Regularization if __name__ == '__main__': + import py_paddle.swig_paddle as swig_api swig_api.initPaddle('--use_gpu=false') for opt in [ Momentum(), Adam(), Adamax(), AdaGrad(), DecayedAdaGrad(), diff --git a/python/paddle/v2/parameters.py b/python/paddle/v2/parameters.py index bbaf8bfa97..a9cba8ca0b 100644 --- a/python/paddle/v2/parameters.py +++ b/python/paddle/v2/parameters.py @@ -1,5 +1,4 @@ import numpy as np -import py_paddle.swig_paddle as api from paddle.proto.ParameterConfig_pb2 import ParameterConfig import paddle.trainer.config_parser as cp import struct @@ -124,6 +123,7 @@ class Parameters(object): :return: parameter value :rtype: np.ndarray """ + import py_paddle.swig_paddle as api shape = self.get_shape(key) if len(self.__gradient_machines__) == 0: @@ -223,7 +223,7 @@ class Parameters(object): :type gradient_machine: api.GradientMachine :return: """ - + import py_paddle.swig_paddle as api if not isinstance(gradient_machine, api.GradientMachine): raise ValueError("gradient_machine should be api.GradientMachine") @@ -359,6 +359,7 @@ def __copy_parameter_to_gradient_machine__(gradient_machine, name, arr): :return: :rtype: api.Parameter """ + import py_paddle.swig_paddle as api param = __get_parameter_in_gradient_machine__(gradient_machine, name) vec = param.getBuf(api.PARAMETER_VALUE) assert isinstance(vec, api.Vector) diff --git a/python/paddle/v2/trainer.py b/python/paddle/v2/trainer.py index 96c6c4b89a..92fdf98e90 100644 --- a/python/paddle/v2/trainer.py +++ b/python/paddle/v2/trainer.py @@ -2,12 +2,6 @@ Module Trainer """ import collections -import gzip -import os - -import py_paddle.swig_paddle as api - -from data_feeder import DataFeeder from topology import Topology from . import event as v2_event from . 
import optimizer as v2_optimizer @@ -59,6 +53,7 @@ class SGD(object): if not isinstance(update_equation, v2_optimizer.Optimizer): raise TypeError("update equation parameter must be " "paddle.v2.optimizer.Optimizer") + import py_paddle.swig_paddle as api topology = Topology(cost, extra_layers=extra_layers) self.__optimizer__ = update_equation self.__topology__ = topology @@ -124,6 +119,8 @@ class SGD(object): :type feeding: dict|list :return: """ + import py_paddle.swig_paddle as api + from data_feeder import DataFeeder if event_handler is None: event_handler = default_event_handler __check_train_args__(**locals()) @@ -187,6 +184,8 @@ class SGD(object): :type feeding: dict :return: """ + import py_paddle.swig_paddle as api + from data_feeder import DataFeeder feeder = DataFeeder(self.__data_types__, feeding) evaluator = self.__gradient_machine__.makeEvaluator() out_args = api.Arguments.createArguments(0) diff --git a/python/setup.py.in b/python/setup.py.in index 271ee6e552..b1041f6102 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -19,7 +19,8 @@ setup_requires=["requests", "recordio", "matplotlib", "rarfile", - "scipy>=0.19.0"] + "scipy>=0.19.0", + "nltk"] if '${CMAKE_SYSTEM_PROCESSOR}' not in ['arm', 'armv7-a', 'aarch64']: setup_requires+=["opencv-python"] From ac69f7730a1013842836b534036aa5a953db0001 Mon Sep 17 00:00:00 2001 From: caoying03 Date: Fri, 14 Jul 2017 11:05:56 +0800 Subject: [PATCH 46/64] add gated linear unit into config helper. --- doc/api/v2/config/layer.rst | 5 + .../paddle/trainer_config_helpers/layers.py | 96 +++++++++++++++- .../tests/configs/file_list.sh | 2 +- .../protostr/test_gated_unit_layer.protostr | 106 ++++++++++++++++++ .../tests/configs/test_gated_unit_layer.py | 16 +++ 5 files changed, 223 insertions(+), 2 deletions(-) create mode 100644 python/paddle/trainer_config_helpers/tests/configs/protostr/test_gated_unit_layer.protostr create mode 100644 python/paddle/trainer_config_helpers/tests/configs/test_gated_unit_layer.py diff --git a/doc/api/v2/config/layer.rst b/doc/api/v2/config/layer.rst index 4f4a9187bc..daee55b7f9 100644 --- a/doc/api/v2/config/layer.rst +++ b/doc/api/v2/config/layer.rst @@ -474,6 +474,11 @@ prelu .. autoclass:: paddle.v2.layer.prelu :noindex: +gated_unit +----------- +.. autoclass:: paddle.v2.layer.gated_unit + :noindex: + Detection output Layer ====================== diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py index b0524a507b..f0ee46262d 100755 --- a/python/paddle/trainer_config_helpers/layers.py +++ b/python/paddle/trainer_config_helpers/layers.py @@ -126,6 +126,7 @@ __all__ = [ 'row_conv_layer', 'dropout_layer', 'prelu_layer', + 'gated_unit_layer', ] @@ -5862,7 +5863,7 @@ def prelu_layer(input, :rtype: LayerOutput """ - assert isinstance(input, LayerOutput), 'prelu_layer only accepts one input' + assert isinstance(input, LayerOutput), 'prelu_layer accepts only one input.' assert isinstance(param_attr, ParameterAttribute) l = Layer( @@ -5876,3 +5877,96 @@ def prelu_layer(input, layer_type=LayerType.PRELU, parents=input, size=l.config.size) + + +@layer_support(ERROR_CLIPPING, DROPOUT) +@wrap_name_default() +@wrap_act_default(act=LinearActivation()) +def gated_unit_layer(input, + size, + act=None, + name=None, + gate_attr=None, + gate_bias_attr=True, + gate_param_attr=None, + inproj_param_attr=None, + inproj_bias_attr=True, + inproj_layer_attr=None, + layer_attr=None): + """ + The gated unit layer implements a simple gating mechanism over the input. 
+ The input :math:`X` is first projected into a new space :math:`X'`, and + it is also used to produce a gate weight :math:`\sigma`. Element-wise + product between :math:`X'` and :math:`\sigma` is finally returned. + + Reference: + Language Modeling with Gated Convolutional Networks + https://arxiv.org/abs/1612.08083 + + .. math:: + y=\\text{act}(X \cdot W + b)\otimes \sigma(X \cdot V + c) + + The example usage is: + + .. code-block:: python + gated_unit = gated_unit_layer(size=128, input=input_layer) + + :param input: input for this layer. + :type input: LayerOutput + :param size: output size of the gated unit. + :type size: int + :param act: activation type of the projected input. + :type act: BaseActivation + :param name: name of this layer. + :type name: basestring + :param gate_attr: Attributes to tune the gate output, for example, error + clipping threshold, dropout and so on. See ExtraLayerAttribute for + more details. + :type gate_attr: ExtraLayerAttribute|None + :param gate_bias_attr: Attributes to tune the learnable bias of the gate. + :type gate_bias_attr: ParameterAttribute|None + :param gate_param_attr: Attributes to tune the learnable projected matrix + parameter of the gate. + :type gate_param_attr: ParameterAttribute|None + :param inproj_param_attr: Attributes to tune the learnable parameter of + the projection of input. + :type inproj_param_attr: ParameterAttribute|None + :param inproj_layer_attr: Attributes to tune the projected input, for + example, error clipping threshold, dropout and so on. See + ExtraLayerAttribute for more details. + :type inproj_layer_attr: ExtraLayerAttribute|None + :param inproj_bias_attr: Attributes to tune the learnable bias of + projection of the input. + :type inproj_bias_attr: ParameterAttribute|None + :param layer_attr: Attributes to tune the final output of the gated unit, + for example, error clipping threshold, dropout and so on. See + ExtraLayerAttribute for more details. + :type layer_attr: ExtraLayerAttribute|None + :return: LayerOutput object. + :rtype: LayerOutput + """ + + assert isinstance( + input, LayerOutput), 'The gated linear unit accepts only one input.'
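The gating computation described in the docstring above can be checked numerically with a few lines of NumPy. This is an illustrative sketch only, independent of Paddle; the names X, W, V, b and c mirror the formula, with tanh standing in for the configurable activation:

    import numpy as np

    def gated_unit(X, W, b, V, c):
        proj = np.tanh(X.dot(W) + b)                  # act(X.W + b)
        gate = 1.0 / (1.0 + np.exp(-(X.dot(V) + c)))  # sigma(X.V + c)
        return proj * gate                            # element-wise product

    X = np.random.randn(4, 256)                       # a batch of 4 inputs
    W = np.random.randn(256, 512)
    V = np.random.randn(256, 512)
    b = np.zeros(512)
    c = np.zeros(512)
    y = gated_unit(X, W, b, V, c)                     # shape (4, 512)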
+ + input_proj = fc_layer( + input=input, + name="%s_input_proj" % name, + size=size, + act=act, + param_attr=inproj_param_attr, + layer_attr=inproj_layer_attr, + bias_attr=inproj_bias_attr) + + gate = fc_layer( + size=size, + name="%s_gate" % name, + act=SigmoidActivation(), + input=input, + param_attr=gate_param_attr, + layer_attr=gate_attr, + bias_attr=gate_bias_attr) + return mixed_layer( + name="%s_gated_act" % name, + input=dotmul_operator(input_proj, gate), + layer_attr=layer_attr) diff --git a/python/paddle/trainer_config_helpers/tests/configs/file_list.sh b/python/paddle/trainer_config_helpers/tests/configs/file_list.sh index 70e342fb79..cdf9b2eab7 100755 --- a/python/paddle/trainer_config_helpers/tests/configs/file_list.sh +++ b/python/paddle/trainer_config_helpers/tests/configs/file_list.sh @@ -7,6 +7,6 @@ test_rnn_group shared_fc shared_lstm shared_gru test_cost_layers_with_weight test_spp_layer test_bilinear_interp test_maxout test_bi_grumemory math_ops test_seq_concat_reshape test_pad test_smooth_l1 test_multiplex_layer test_prelu_layer test_row_conv test_detection_output_layer test_multibox_loss_layer -test_recursive_topology) +test_recursive_topology test_gated_unit_layer) export whole_configs=(test_split_datasource) diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_gated_unit_layer.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_gated_unit_layer.protostr new file mode 100644 index 0000000000..f1e4d894a5 --- /dev/null +++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_gated_unit_layer.protostr @@ -0,0 +1,106 @@ +type: "nn" +layers { + name: "input" + type: "data" + size: 256 + active_type: "" +} +layers { + name: "__gated_unit_layer_0___input_proj" + type: "fc" + size: 512 + active_type: "tanh" + inputs { + input_layer_name: "input" + input_parameter_name: "___gated_unit_layer_0___input_proj.w0" + } + bias_parameter_name: "___gated_unit_layer_0___input_proj.wbias" + error_clipping_threshold: 100.0 +} +layers { + name: "__gated_unit_layer_0___gate" + type: "fc" + size: 512 + active_type: "sigmoid" + inputs { + input_layer_name: "input" + input_parameter_name: "___gated_unit_layer_0___gate.w0" + } + bias_parameter_name: "___gated_unit_layer_0___gate.wbias" + error_clipping_threshold: 100.0 +} +layers { + name: "__gated_unit_layer_0___gated_act" + type: "mixed" + size: 512 + active_type: "" + inputs { + input_layer_name: "__gated_unit_layer_0___input_proj" + } + inputs { + input_layer_name: "__gated_unit_layer_0___gate" + } + error_clipping_threshold: 100.0 + operator_confs { + type: "dot_mul" + input_indices: 0 + input_indices: 1 + input_sizes: 512 + input_sizes: 512 + output_size: 512 + dotmul_scale: 1 + } +} +parameters { + name: "___gated_unit_layer_0___input_proj.w0" + size: 131072 + initial_mean: 0.0 + initial_std: 0.0001 + dims: 256 + dims: 512 + initial_strategy: 0 + initial_smart: false +} +parameters { + name: "___gated_unit_layer_0___input_proj.wbias" + size: 512 + initial_mean: 0.0 + initial_std: 1 + dims: 1 + dims: 512 + initial_strategy: 0 + initial_smart: false +} +parameters { + name: "___gated_unit_layer_0___gate.w0" + size: 131072 + initial_mean: 0.0 + initial_std: 0.0001 + dims: 256 + dims: 512 + initial_strategy: 0 + initial_smart: false +} +parameters { + name: "___gated_unit_layer_0___gate.wbias" + size: 512 + initial_mean: 0.0 + initial_std: 1 + dims: 1 + dims: 512 + initial_strategy: 0 + initial_smart: false +} +input_layer_names: "input" +output_layer_names: 
"__gated_unit_layer_0___gated_act" +sub_models { + name: "root" + layer_names: "input" + layer_names: "__gated_unit_layer_0___input_proj" + layer_names: "__gated_unit_layer_0___gate" + layer_names: "__gated_unit_layer_0___gated_act" + input_layer_names: "input" + output_layer_names: "__gated_unit_layer_0___gated_act" + is_recurrent_layer_group: false +} + diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_gated_unit_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_gated_unit_layer.py new file mode 100644 index 0000000000..83aa51bf28 --- /dev/null +++ b/python/paddle/trainer_config_helpers/tests/configs/test_gated_unit_layer.py @@ -0,0 +1,16 @@ +from paddle.trainer_config_helpers import * + +data = data_layer(name='input', size=256) +glu = gated_unit_layer( + size=512, + input=data, + act=TanhActivation(), + gate_param_attr=ParamAttr(initial_std=1e-4), + gate_attr=ExtraLayerAttribute(error_clipping_threshold=100.0), + gate_bias_attr=ParamAttr(initial_std=1), + inproj_param_attr=ParamAttr(initial_std=1e-4), + inproj_layer_attr=ExtraLayerAttribute(error_clipping_threshold=100.0), + inproj_bias_attr=ParamAttr(initial_std=1), + layer_attr=ExtraLayerAttribute(error_clipping_threshold=100.0)) + +outputs(glu) From e2fd06c386107d518ebfe315d89d5ed70e5ee780 Mon Sep 17 00:00:00 2001 From: caoying03 Date: Fri, 14 Jul 2017 16:02:44 +0800 Subject: [PATCH 47/64] refine name of the input parameter. --- .../paddle/trainer_config_helpers/layers.py | 22 +++++++++---------- .../tests/configs/test_gated_unit_layer.py | 4 ++-- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py index f0ee46262d..78aa0778f8 100755 --- a/python/paddle/trainer_config_helpers/layers.py +++ b/python/paddle/trainer_config_helpers/layers.py @@ -5879,19 +5879,19 @@ def prelu_layer(input, size=l.config.size) -@layer_support(ERROR_CLIPPING, DROPOUT) @wrap_name_default() +@layer_support(ERROR_CLIPPING, DROPOUT) @wrap_act_default(act=LinearActivation()) def gated_unit_layer(input, size, act=None, name=None, gate_attr=None, - gate_bias_attr=True, gate_param_attr=None, + gate_bias_attr=True, + inproj_attr=None, inproj_param_attr=None, inproj_bias_attr=True, - inproj_layer_attr=None, layer_attr=None): """ The gated unit layer implements a simple gating mechanism over the input. @@ -5923,18 +5923,18 @@ def gated_unit_layer(input, clipping threshold, dropout and so on. See ExtraLayerAttribute for more details. :type gate_attr: ExtraLayerAttribute|None - :param gate_bias_attr: Attributes to tune the learnable bias of the gate. - :type gate_bias_attr: ParameterAttribute|None :param gate_param_attr: Attributes to tune the learnable projected matrix parameter of the gate. :type gate_param_attr: ParameterAttribute|None + :param gate_bias_attr: Attributes to tune the learnable bias of the gate. + :type gate_bias_attr: ParameterAttribute|None + :param inproj_attr: Attributes to the tune the projected input, for + example, error clipping threshold, dropout and so on. See + ExtraLayerAttribute for more details. + :type inproj_attr: ExtraLayerAttribute|None :param inproj_param_attr: Attributes to tune the learnable parameter of the projection of input. :type inproj_param_attr: ParameterAttribute|None - :param inproj_layer_attr: Attributes to the tune the projected input, for - example, error clipping threshold, dropout and so on. See - ExtraLayerAttribute for more details. 
- :type inproj_layer_attr: ExtraLayerAttribute|None :param inproj_bias_attr: Attributes to tune the learnable bias of projection of the input. :type inproj_bias_attr: ParameterAttribute|None @@ -5954,8 +5954,8 @@ def gated_unit_layer(input, name="%s_input_proj" % name, size=size, act=act, + layer_attr=inproj_attr, param_attr=inproj_param_attr, - layer_attr=inproj_layer_attr, bias_attr=inproj_bias_attr) gate = fc_layer( @@ -5963,8 +5963,8 @@ def gated_unit_layer(input, name="%s_gate" % name, act=SigmoidActivation(), input=input, - param_attr=gate_param_attr, layer_attr=gate_attr, + param_attr=gate_param_attr, bias_attr=gate_bias_attr) return mixed_layer( name="%s_gated_act" % name, diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_gated_unit_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_gated_unit_layer.py index 83aa51bf28..9dab45519c 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/test_gated_unit_layer.py +++ b/python/paddle/trainer_config_helpers/tests/configs/test_gated_unit_layer.py @@ -5,11 +5,11 @@ glu = gated_unit_layer( size=512, input=data, act=TanhActivation(), - gate_param_attr=ParamAttr(initial_std=1e-4), gate_attr=ExtraLayerAttribute(error_clipping_threshold=100.0), + gate_param_attr=ParamAttr(initial_std=1e-4), gate_bias_attr=ParamAttr(initial_std=1), + inproj_attr=ExtraLayerAttribute(error_clipping_threshold=100.0), inproj_param_attr=ParamAttr(initial_std=1e-4), - inproj_layer_attr=ExtraLayerAttribute(error_clipping_threshold=100.0), inproj_bias_attr=ParamAttr(initial_std=1), layer_attr=ExtraLayerAttribute(error_clipping_threshold=100.0)) From a1dc4311a30cbdd5af2c66deedf1162668d9001f Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Fri, 14 Jul 2017 17:02:03 +0800 Subject: [PATCH 48/64] Refactor `Tensor::CopyFrom()` --- paddle/framework/tensor.h | 61 +++++++++++++++++++++++++-------------- 1 file changed, 39 insertions(+), 22 deletions(-) diff --git a/paddle/framework/tensor.h b/paddle/framework/tensor.h index 7f3894bb3c..e164f57abc 100644 --- a/paddle/framework/tensor.h +++ b/paddle/framework/tensor.h @@ -28,34 +28,33 @@ namespace framework { class Tensor { public: - Tensor() : offset_(0) {} + Tensor() : offset_(0) { numel_ = product(dims_); } - explicit Tensor(const DDim& dims) : dims_(dims), offset_(0) {} + Tensor& operator=(const Tensor& src) = delete; template const T* data() const { - PADDLE_ENFORCE( - holder_ != nullptr, - "Tenosr has not been initialized. 
Call Tensor::mutable_data first."); + CheckDimsValidity(); return reinterpret_cast( reinterpret_cast(holder_->Ptr()) + offset_); } - template ::value>::type* = nullptr> + template T* mutable_data(DDim dims, paddle::platform::Place place) { - dims_ = dims; + set_dims(dims); return mutable_data(place); } - template ::value>::type* = nullptr> + template T* mutable_data(paddle::platform::Place place) { + PADDLE_ENFORCE(numel_ > 0, + "Tensor::numel_ must be larger than zero to call " + "Tensor::mutable_data."); if (holder_ == nullptr || !(holder_->Place() == place) /* some versions of boost::variant don't have operator!= */ - || holder_->Size() < product(dims_) * sizeof(T) + offset_) { - holder_.reset(new PlaceholderImpl(place, product(dims_) * sizeof(T))); + || holder_->Size() < numel_ * sizeof(T) + offset_) { + holder_.reset(new PlaceholderImpl(place, numel_ * sizeof(T))); offset_ = 0; } return reinterpret_cast(reinterpret_cast(holder_->Ptr()) + @@ -63,25 +62,24 @@ class Tensor { } void ShareDataFrom(const Tensor& src) { - PADDLE_ENFORCE(src.holder_ != nullptr, - "Can not share data from an uninitialized tensor."); + src.CheckDimsValidity(); holder_ = src.holder_; - dims_ = src.dims_; + dims_ = src.dims(); + numel_ = src.numel_; offset_ = src.offset_; } void CopyFrom(const Tensor& src, paddle::platform::Place dst_place) { - PADDLE_ENFORCE(src.holder_ != nullptr, - "Can not copy from an uninitialized tensor."); - size_t size = product(src.dims()) * src.holder_->TypeSize(); + src.CheckDimsValidity(); + size_t size = src.numel_ * src.holder_->TypeSize(); holder_.reset(src.holder_->Clone(src.offset_, size, dst_place)); dims_ = src.dims(); + numel_ = src.numel_; offset_ = 0; } Tensor Slice(const int& begin_idx, const int& end_idx) const { - PADDLE_ENFORCE(holder_ != nullptr, - "The sliced tenosr has not been initialized."); + CheckDimsValidity(); PADDLE_ENFORCE(begin_idx >= 0 && end_idx <= dims_[0], "Slice index is less than zero or out of bound."); PADDLE_ENFORCE(begin_idx < end_idx, @@ -94,12 +92,22 @@ class Tensor { } Tensor dst; dst.holder_ = holder_; - dst.dims_ = dims_; - dst.dims_[0] = end_idx - begin_idx; + DDim dst_dims = dims_; + dst_dims[0] = end_idx - begin_idx; + dst.set_dims(dst_dims); dst.offset_ = offset_ + begin_idx * base * holder_->TypeSize(); return dst; } + void set_dims(const DDim& dims) { + if (dims == dims_) { + return; + } + dims_ = dims; + numel_ = product(dims_); + return; + } + DDim dims() const { return dims_; } private: @@ -158,8 +166,17 @@ class Tensor { size_t size_; // size of the memory block. }; + inline void CheckDimsValidity() { + PADDLE_ENFORCE(holder_ != nullptr, + "Tenosr holds no memory. Call Tensor::mutable_data first."); + PADDLE_ENFORCE(holder_->Size() > numel_ * sizeof(T) + offset_, + "Tensor's dims_ is out of bound. Call Tensor::mutable_data " + "first to re-allocate memory."); + } + std::shared_ptr holder_; // holds the memory block if allocated. DDim dims_; + int numel_; // cache of `product(dims_)` size_t offset_; // marks the begin of tensor data area. }; From dcfcf6872a6a7b5c9d58eec2e30e08e7f7897cf4 Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Fri, 14 Jul 2017 17:03:54 +0800 Subject: [PATCH 49/64] Refactor Tensor::CopyFrom() 1. Add template T which indicates data type to `CopyFrom()`, `Slice()` and `ShareDataFrom()` functions. This makes the `CopyFrom()` code much clearer. 2. Add `set_dims()`. 3. `product(DDim)` transforms `DDim` to `vector` first and then calculates its product. That might be quite slow.
Since `product(dims_)` is frequently used in Tensor, we add a member variable `numel_` as a cache of the product result. TODO: refactor `product()` to make it more efficient. 4. Disable Tensor::operator= 5. Remove the limit of POD type, because `float16` and `int8` are not POD type. --- paddle/framework/tensor.h | 73 ++++++++++++++------------- paddle/framework/tensor_test.cc | 5 ++- 2 files changed, 34 insertions(+), 44 deletions(-) diff --git a/paddle/framework/tensor.h b/paddle/framework/tensor.h index e164f57abc..8cb4d1793c 100644 --- a/paddle/framework/tensor.h +++ b/paddle/framework/tensor.h @@ -17,7 +17,6 @@ limitations under the License. */ #include #include #include -#include #include "paddle/framework/ddim.h" #include "paddle/framework/enforce.h" #include "paddle/memory/memory.h" @@ -28,15 +27,15 @@ namespace framework { class Tensor { public: - Tensor() : offset_(0) { numel_ = product(dims_); } + Tensor() : numel_(0), offset_(0) {} Tensor& operator=(const Tensor& src) = delete; template const T* data() const { - CheckDimsValidity(); + CheckDimsValidity(); return reinterpret_cast( - reinterpret_cast(holder_->Ptr()) + offset_); + reinterpret_cast(holder_->ptr()) + offset_); } template @@ -51,35 +50,40 @@ class Tensor { "Tensor::numel_ must be larger than zero to call " "Tensor::mutable_data."); if (holder_ == nullptr || - !(holder_->Place() == + !(holder_->place() == place) /* some versions of boost::variant don't have operator!= */ - || holder_->Size() < numel_ * sizeof(T) + offset_) { + || holder_->size() < numel_ * sizeof(T) + offset_) { holder_.reset(new PlaceholderImpl(place, numel_ * sizeof(T))); offset_ = 0; } - return reinterpret_cast(reinterpret_cast(holder_->Ptr()) + + return reinterpret_cast(reinterpret_cast(holder_->ptr()) + offset_); } + template void ShareDataFrom(const Tensor& src) { - src.CheckDimsValidity(); + src.CheckDimsValidity(); holder_ = src.holder_; - dims_ = src.dims(); - numel_ = src.numel_; + set_dims(src.dims()); offset_ = src.offset_; } + template void CopyFrom(const Tensor& src, paddle::platform::Place dst_place) { - src.CheckDimsValidity(); - size_t size = src.numel_ * src.holder_->TypeSize(); - holder_.reset(src.holder_->Clone(src.offset_, size, dst_place)); - dims_ = src.dims(); - numel_ = src.numel_; - offset_ = 0; + PADDLE_ENFORCE(platform::is_cpu_place(src.holder_->place()) && + platform::is_cpu_place(dst_place), + "Tensor::CopyFrom only support CPU now."); + src.CheckDimsValidity(); + size_t size = src.numel_ * sizeof(T); + set_dims(src.dims()); + void* src_ptr = static_cast(src.data()); + void* dst_ptr = static_cast(mutable_data(dst_place)); + memcpy(dst_ptr, src_ptr, size); } + template Tensor Slice(const int& begin_idx, const int& end_idx) const { - CheckDimsValidity(); + CheckDimsValidity(); PADDLE_ENFORCE(begin_idx >= 0 && end_idx <= dims_[0], "Slice index is less than zero or out of bound."); PADDLE_ENFORCE(begin_idx < end_idx, @@ -95,7 +99,7 @@ class Tensor { DDim dst_dims = dims_; dst_dims[0] = end_idx - begin_idx; dst.set_dims(dst_dims); - dst.offset_ = offset_ + begin_idx * base * holder_->TypeSize(); + dst.offset_ = offset_ + begin_idx * base * sizeof(T); return dst; } @@ -115,12 +119,9 @@ // parameter of Variable.
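The slab arithmetic in Tensor::Slice above — keep the shared buffer and advance the offset by begin_idx slabs, where one slab holds product(dims)/dims[0] elements — is easy to model in a few lines of Python. An illustrative sketch only, not Paddle code; all names here are made up:

    import numpy as np

    def slice_outer(flat, dims, begin, end):
        base = int(np.prod(dims)) // dims[0]   # elements per outer-dim slab
        offset = begin * base                  # skip `begin` slabs
        view = flat[offset:offset + (end - begin) * base]  # shares storage
        return view, (end - begin,) + tuple(dims[1:])

    flat = np.arange(5 * 3 * 4)
    view, shape = slice_outer(flat, (5, 3, 4), 1, 3)   # rows 1..2
    assert shape == (2, 3, 4) and view[0] == 12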
struct Placeholder { virtual ~Placeholder() {} - virtual void* Ptr() const = 0; - virtual paddle::platform::Place Place() const = 0; - virtual size_t Size() const = 0; - virtual size_t TypeSize() const = 0; - virtual Placeholder* Clone(size_t begin, size_t size, - paddle::platform::Place place) const = 0; + virtual void* ptr() const = 0; + virtual paddle::platform::Place place() const = 0; + virtual size_t size() const = 0; }; template @@ -144,32 +145,20 @@ class Tensor { place_(place), size_(size) {} - virtual void* Ptr() const { return static_cast(ptr_.get()); } - virtual size_t Size() const { return size_; } - virtual paddle::platform::Place Place() const { return place_; } - virtual size_t TypeSize() const { return sizeof(T); } - // TODO: Clone only support CPU now. GPU support is needed. - virtual Placeholder* Clone(size_t begin, size_t size, - paddle::platform::Place place) const { - PADDLE_ENFORCE(paddle::platform::is_cpu_place(place_) && - paddle::platform::is_cpu_place(place), - "PlaceholderImpl::Clone only support CPU now."); - PlaceholderImpl* dst = new PlaceholderImpl(place, size); - void* begin_ptr = - reinterpret_cast(reinterpret_cast(Ptr()) + begin); - memcpy(dst->Ptr(), begin_ptr, size); - return dst; - } + virtual void* ptr() const { return static_cast(ptr_.get()); } + virtual size_t size() const { return size_; } + virtual paddle::platform::Place place() const { return place_; } std::unique_ptr ptr_; paddle::platform::Place place_; // record the place of ptr_. size_t size_; // size of the memory block. }; - inline void CheckDimsValidity() { + template + inline void CheckDimsValidity() const { PADDLE_ENFORCE(holder_ != nullptr, "Tenosr holds no memory. Call Tensor::mutable_data first."); - PADDLE_ENFORCE(holder_->Size() > numel_ * sizeof(T) + offset_, + PADDLE_ENFORCE(holder_->size() > numel_ * sizeof(T) + offset_, "Tensor's dims_ is out of bound. Call Tensor::mutable_data " "first to re-allocate memory."); } diff --git a/paddle/framework/tensor_test.cc b/paddle/framework/tensor_test.cc index 6db0ba8c79..eef9cfcd9e 100644 --- a/paddle/framework/tensor_test.cc +++ b/paddle/framework/tensor_test.cc @@ -18,7 +18,8 @@ TEST(Tensor, Dims) { using namespace paddle::framework; using namespace paddle::platform; - Tensor tt(make_ddim({2, 3, 4})); + Tensor tt; + tt.set_dims(make_ddim({2, 3, 4})); DDim dims = tt.dims(); ASSERT_EQ(arity(dims), 3); for (int i = 0; i < 3; ++i) { @@ -35,7 +36,7 @@ TEST(Tensor, DataAssert) { } catch (paddle::framework::EnforceNotMet err) { caught = true; std::string msg = - "Tenosr has not been initialized. Call Tensor::mutable_data first."; + "Tenosr holds no memory. 
Call Tensor::mutable_data first."; const char* what = err.what(); for (size_t i = 0; i < msg.length(); ++i) { ASSERT_EQ(what[i], msg[i]); From 1f97388a3410275e663483f2b7d4de20561c2e66 Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Fri, 14 Jul 2017 17:22:06 +0800 Subject: [PATCH 50/64] fix several compile error --- paddle/framework/tensor.h | 2 +- paddle/framework/tensor_test.cc | 23 +++++++++++------------ 2 files changed, 12 insertions(+), 13 deletions(-) diff --git a/paddle/framework/tensor.h b/paddle/framework/tensor.h index 8cb4d1793c..7f731813ef 100644 --- a/paddle/framework/tensor.h +++ b/paddle/framework/tensor.h @@ -76,7 +76,7 @@ class Tensor { src.CheckDimsValidity(); size_t size = src.numel_ * sizeof(T); set_dims(src.dims()); - void* src_ptr = static_cast(src.data()); + const void* src_ptr = static_cast(src.data()); void* dst_ptr = static_cast(mutable_data(dst_place)); memcpy(dst_ptr, src_ptr, size); } diff --git a/paddle/framework/tensor_test.cc b/paddle/framework/tensor_test.cc index eef9cfcd9e..255f69372f 100644 --- a/paddle/framework/tensor_test.cc +++ b/paddle/framework/tensor_test.cc @@ -105,19 +105,18 @@ TEST(Tensor, ShareDataFrom) { // Try to share data form uninitialized tensor bool caught = false; try { - dst_tensor.ShareDataFrom(src_tensor); + dst_tensor.ShareDataFrom(src_tensor); } catch (EnforceNotMet err) { caught = true; std::string msg = "Tenosr holds no memory. Call Tensor::mutable_data first."; const char* what = err.what(); for (size_t i = 0; i < msg.length(); ++i) { ASSERT_EQ(what[i], msg[i]); } } ASSERT_TRUE(caught); src_tensor.mutable_data(make_ddim({2, 3, 4}), CPUPlace()); - dst_tensor.ShareDataFrom(src_tensor); + dst_tensor.ShareDataFrom(src_tensor); ASSERT_EQ(src_tensor.data(), dst_tensor.data()); } { Tensor src_tensor; Tensor dst_tensor; src_tensor.mutable_data(make_ddim({2, 3, 4}), GPUPlace()); - dst_tensor.ShareDataFrom(src_tensor); + dst_tensor.ShareDataFrom(src_tensor); ASSERT_EQ(src_tensor.data(), dst_tensor.data()); } } @@ -136,7 +135,7 @@ TEST(Tensor, Slice) { { Tensor src_tensor; src_tensor.mutable_data(make_ddim({5, 3, 4}), CPUPlace()); - Tensor slice_tensor = src_tensor.Slice(1, 3); + Tensor slice_tensor = src_tensor.Slice(1, 3); DDim slice_dims = slice_tensor.dims(); ASSERT_EQ(arity(slice_dims), 3); EXPECT_EQ(slice_dims[0], 2); @@ -159,7 +158,7 @@ { Tensor src_tensor; src_tensor.mutable_data(make_ddim({6, 9}), GPUPlace()); - Tensor slice_tensor = src_tensor.Slice(2, 6); + Tensor slice_tensor = src_tensor.Slice(2, 6); DDim slice_dims = slice_tensor.dims(); ASSERT_EQ(arity(slice_dims), 2); EXPECT_EQ(slice_dims[0], 4); @@ -188,15 +187,15 @@ TEST(Tensor, CopyFrom) { int arr[9] = {1, 2, 3, 4, 5, 6, 7, 8, 9}; memcpy(src_ptr, arr, 9 * sizeof(int)); Tensor dst_tensor; - dst_tensor.CopyFrom(src_tensor, CPUPlace()); + dst_tensor.CopyFrom(src_tensor, CPUPlace()); const int* dst_ptr = dst_tensor.data(); ASSERT_NE(src_ptr, dst_ptr); for (size_t i = 0; i < 9; ++i) { EXPECT_EQ(src_ptr[i], dst_ptr[i]); } Tensor slice_tensor = src_tensor.Slice(1, 2); - dst_tensor.CopyFrom(slice_tensor, CPUPlace()); + dst_tensor.CopyFrom(slice_tensor, CPUPlace()); const int* slice_ptr = slice_tensor.data(); dst_ptr = dst_tensor.data(); ASSERT_NE(dst_ptr, slice_ptr); From 
d7405e1eaa48c546323a086c170093673fdf435a Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Fri, 14 Jul 2017 17:53:42 +0800 Subject: [PATCH 51/64] Fix a typo before --- python/paddle/v2/optimizer.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/python/paddle/v2/optimizer.py b/python/paddle/v2/optimizer.py index 3dec340cfb..b6ee51cfe8 100644 --- a/python/paddle/v2/optimizer.py +++ b/python/paddle/v2/optimizer.py @@ -16,6 +16,7 @@ __all__ = [ class Optimizer(object): def __init__(self, **kwargs): + import py_paddle.swig_paddle as swig_api if 'batch_size' in kwargs: del kwargs['batch_size'] # not important for python library. @@ -24,8 +25,6 @@ class Optimizer(object): self.__opt_conf_proto__ = config_parser_utils.parse_optimizer_config( __impl__) - if swig_api is None: - raise RuntimeError("paddle.v2 currently need swig_paddle") self.__opt_conf__ = swig_api.OptimizationConfig.createFromProto( self.__opt_conf_proto__) From 8594d5c31424e346fa2b8cdbada5188de1c13264 Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Fri, 14 Jul 2017 18:13:32 +0800 Subject: [PATCH 52/64] change int numel_ to size_t numel --- paddle/framework/tensor.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/framework/tensor.h b/paddle/framework/tensor.h index 7f731813ef..8b2a65dca9 100644 --- a/paddle/framework/tensor.h +++ b/paddle/framework/tensor.h @@ -165,7 +165,7 @@ class Tensor { std::shared_ptr holder_; // holds the memory block if allocated. DDim dims_; - int numel_; // cache of `product(dims_)` + size_t numel_; // cache of `product(dims_)` size_t offset_; // marks the begin of tensor data area. }; From 34beec0f7a645ee4c5c0ce47592619c5ed05facc Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Fri, 14 Jul 2017 19:32:01 +0800 Subject: [PATCH 53/64] update tensor.h --- paddle/framework/tensor.h | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/paddle/framework/tensor.h b/paddle/framework/tensor.h index 8b2a65dca9..d98706e6ed 100644 --- a/paddle/framework/tensor.h +++ b/paddle/framework/tensor.h @@ -33,7 +33,7 @@ class Tensor { template const T* data() const { - CheckDimsValidity(); + CheckDims(); return reinterpret_cast( reinterpret_cast(holder_->ptr()) + offset_); } @@ -62,7 +62,7 @@ class Tensor { template void ShareDataFrom(const Tensor& src) { - src.CheckDimsValidity(); + src.CheckDims(); holder_ = src.holder_; set_dims(src.dims()); offset_ = src.offset_; @@ -73,7 +73,7 @@ class Tensor { PADDLE_ENFORCE(platform::is_cpu_place(src.holder_->place()) && platform::is_cpu_place(dst_place), "Tensor::CopyFrom only support CPU now."); - src.CheckDimsValidity(); + src.CheckDims(); size_t size = src.numel_ * sizeof(T); set_dims(src.dims()); const void* src_ptr = static_cast(src.data()); @@ -83,7 +83,7 @@ class Tensor { template Tensor Slice(const int& begin_idx, const int& end_idx) const { - CheckDimsValidity(); + CheckDims(); PADDLE_ENFORCE(begin_idx >= 0 && end_idx <= dims_[0], "Slice index is less than zero or out of bound."); PADDLE_ENFORCE(begin_idx < end_idx, @@ -109,7 +109,6 @@ class Tensor { } dims_ = dims; numel_ = product(dims_); - return; } DDim dims() const { return dims_; } @@ -155,10 +154,10 @@ class Tensor { }; template - inline void CheckDimsValidity() const { + inline void CheckDims() const { PADDLE_ENFORCE(holder_ != nullptr, "Tenosr holds no memory. 
Call Tensor::mutable_data first."); - PADDLE_ENFORCE(holder_->size() > numel_ * sizeof(T) + offset_, + PADDLE_ENFORCE(holder_->size() >= numel_ * sizeof(T) + offset_, "Tensor's dims_ is out of bound. Call Tensor::mutable_data " "first to re-allocate memory."); } From 57a22db3fad1251a50d3d3dd2f241ad7f1949d77 Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Fri, 14 Jul 2017 19:43:59 +0800 Subject: [PATCH 54/64] update PADDLE_ENFORCE message --- paddle/framework/tensor.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/framework/tensor.h b/paddle/framework/tensor.h index d98706e6ed..62e0710a82 100644 --- a/paddle/framework/tensor.h +++ b/paddle/framework/tensor.h @@ -48,7 +48,7 @@ class Tensor { T* mutable_data(paddle::platform::Place place) { PADDLE_ENFORCE(numel_ > 0, "Tensor::numel_ must be larger than zero to call " - "Tensor::mutable_data."); + "Tensor::mutable_data. Call Tensor::set_dims first."); if (holder_ == nullptr || !(holder_->place() == place) /* some versions of boost::variant don't have operator!= */ From 03b3d0d8a8b8e90f997e1a2cec49bb04486adc8a Mon Sep 17 00:00:00 2001 From: liaogang Date: Fri, 14 Jul 2017 20:12:35 +0800 Subject: [PATCH 55/64] Follow comments --- paddle/platform/cpu_info.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/platform/cpu_info.cc b/paddle/platform/cpu_info.cc index 3da04420e5..1905cfeee6 100644 --- a/paddle/platform/cpu_info.cc +++ b/paddle/platform/cpu_info.cc @@ -54,8 +54,8 @@ size_t CpuMaxAllocSize() { } size_t CpuMinChunkSize() { - // Allow to allocate the minimum chunk size is 256 bytes. - return 1 << 8; + // The minimum chunk size that can be allocated is 4 KB. + return 1 << 12; } size_t CpuMaxChunkSize() { From fbfbe93a78e9cc411dab6c2b54a5516b16dae430 Mon Sep 17 00:00:00 2001 From: Helin Wang Date: Fri, 14 Jul 2017 18:59:05 +0000 Subject: [PATCH 56/64] cmake: do not run glide install every time. --- cmake/configure.cmake | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/cmake/configure.cmake b/cmake/configure.cmake index a4f98ec7d4..7afab5d534 100644 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -102,12 +102,19 @@ if(WITH_GOLANG) message(FATAL_ERROR "no glide executeble found: $ENV{GOPATH}/bin/glide") endif() - add_custom_target(go_vendor) - add_custom_command(TARGET go_vendor + # this command will only run when the file it depends on is missing + # or has changed, or the output is missing. + add_custom_command(OUTPUT ${CMAKE_BINARY_DIR}/glide COMMAND env GOPATH=${GOPATH} ${GLIDE} install + COMMAND touch ${CMAKE_BINARY_DIR}/glide + DEPENDS ${PROJ_ROOT}/go/glide.lock WORKING_DIRECTORY "${PADDLE_IN_GOPATH}/go" - ) + ) + + # go_vendor depends on the custom command which outputs + # ${CMAKE_BINARY_DIR}/glide, so the custom command does not need to + # run every time this target is built. 
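The stamp-file idiom used by this patch — re-run a command only when its tracked input is newer than the last recorded run — can be expressed compactly in Python. An illustrative sketch under assumptions: only the `glide install` command and the go/glide.lock input come from the patch; the stamp path and helper name are hypothetical:

    import os
    import subprocess

    def run_if_stale(stamp, dep, cmd):
        # re-run cmd only when dep is newer than the stamp, or no stamp yet
        if (not os.path.exists(stamp)
                or os.path.getmtime(dep) > os.path.getmtime(stamp)):
            subprocess.check_call(cmd)
            open(stamp, 'a').close()   # "touch" the stamp
            os.utime(stamp, None)      # record the time of this run

    run_if_stale('build/glide.stamp', 'go/glide.lock', ['glide', 'install'])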
+ add_custom_target(go_vendor DEPENDS ${CMAKE_BINARY_DIR}/glide go_path) endif() endif(WITH_GOLANG) From 9eb9b2c29c97c63e4f0ca32e5d69e5dd5b26d89d Mon Sep 17 00:00:00 2001 From: Helin Wang Date: Fri, 14 Jul 2017 20:20:50 +0000 Subject: [PATCH 57/64] fix race condition in test --- go/pserver/client/client_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/go/pserver/client/client_test.go b/go/pserver/client/client_test.go index 27f4ff2380..aab91556b4 100644 --- a/go/pserver/client/client_test.go +++ b/go/pserver/client/client_test.go @@ -164,7 +164,7 @@ func testClient(t *testing.T, c *client.Client) { wg.Add(1) go func(gs []pserver.Gradient) { - err = c.SendGrads(gs) + err := c.SendGrads(gs) if err != nil { t.Fatal(err) } From 13b0dcd295d49fa49f65de2219462999e34294b0 Mon Sep 17 00:00:00 2001 From: liaogang Date: Sat, 15 Jul 2017 10:01:37 +0800 Subject: [PATCH 58/64] ENH: add cpplint --- cmake/cpplint.cmake | 14 +++++++------- cmake/generic.cmake | 4 ++++ 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/cmake/cpplint.cmake b/cmake/cpplint.cmake index 6bbcd730e1..656e1a0803 100644 --- a/cmake/cpplint.cmake +++ b/cmake/cpplint.cmake @@ -27,7 +27,8 @@ set(IGNORE_PATTERN .*cblas\\.h.* .*\\.pb\\.txt .*LtrDataProvider.* - .*MultiDataProvider.*) + .*MultiDataProvider.* + .*pb.*) # add_style_check_target # @@ -52,14 +53,13 @@ macro(add_style_check_target TARGET_NAME) endif() endforeach() if(LINT MATCHES ON) + # cpplint code style get_filename_component(base_filename ${filename} NAME) set(CUR_GEN ${CMAKE_CURRENT_BINARY_DIR}/${base_filename}.cpplint) - add_custom_command(OUTPUT ${CUR_GEN} - PRE_BUILD - COMMAND env ${py_env} "${PYTHON_EXECUTABLE}" "${PROJ_ROOT}/paddle/scripts/cpplint.py" - "--filter=${STYLE_FILTER}" - "--write-success=${CUR_GEN}" ${filename} - DEPENDS ${filename} + add_custom_command(TARGET ${TARGET_NAME} PRE_BUILD + COMMAND "${PYTHON_EXECUTABLE}" "${PROJ_ROOT}/paddle/scripts/cpplint.py" + "--filter=${STYLE_FILTER}" + "--write-success=${CUR_GEN}" ${filename} WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) endif() endforeach() diff --git a/cmake/generic.cmake b/cmake/generic.cmake index 48c054d17f..b3204e863a 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -185,6 +185,10 @@ function(cc_library TARGET_NAME) add_dependencies(${TARGET_NAME} ${cc_library_DEPS}) target_link_libraries(${TARGET_NAME} ${cc_library_DEPS}) endif() + + # cpplint code style + add_style_check_target(${TARGET_NAME} ${cc_library_SRCS}) + else(cc_library_SRCS) if (cc_library_DEPS) merge_static_libs(${TARGET_NAME} ${cc_library_DEPS}) From 620575b6eb7f59abc0b1fd2052159590d6a29113 Mon Sep 17 00:00:00 2001 From: liaogang Date: Sat, 15 Jul 2017 10:02:03 +0800 Subject: [PATCH 59/64] FIX: cpplint code style --- paddle/framework/ddim.cc | 42 ++++++++++++++++++---------- paddle/framework/ddim.h | 2 +- paddle/framework/net.cc | 16 +++++++++++ paddle/framework/op_registry.cc | 16 ++++++++++- paddle/framework/operator.cc | 2 +- paddle/memory/detail/memory_block.cc | 6 ++-- paddle/memory/memory.cc | 2 -- paddle/platform/cpu_info.cc | 4 +-- paddle/platform/place.cc | 16 ++++++++++- 9 files changed, 81 insertions(+), 25 deletions(-) diff --git a/paddle/framework/ddim.cc b/paddle/framework/ddim.cc index 3f949a6595..87a3618e09 100644 --- a/paddle/framework/ddim.cc +++ b/paddle/framework/ddim.cc @@ -1,9 +1,23 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + #include "paddle/framework/ddim.h" namespace paddle { namespace framework { -///@cond HIDDEN +/// @cond HIDDEN template Dim make_dim(const int* d) { @@ -50,7 +64,7 @@ void make_ddim(DDim& ddim, const int* dims, int n) { } } -///@endcond +/// @endcond DDim make_ddim(std::initializer_list dims) { DDim result(make_dim(0)); @@ -64,11 +78,11 @@ DDim make_ddim(const std::vector& dims) { return result; } -///@cond HIDDEN +/// @cond HIDDEN // XXX For some reason, putting this in an anonymous namespace causes errors class DynamicMutableIndexer : public boost::static_visitor { public: - DynamicMutableIndexer(int idx) : idx_(idx) {} + explicit DynamicMutableIndexer(int idx) : idx_(idx) {} template int& operator()(Dim& dim) const { @@ -81,7 +95,7 @@ class DynamicMutableIndexer : public boost::static_visitor { class DynamicConstIndexer : public boost::static_visitor { public: - DynamicConstIndexer(int idx) : idx_(idx) {} + explicit DynamicConstIndexer(int idx) : idx_(idx) {} template int operator()(const Dim& dim) const { @@ -92,7 +106,7 @@ class DynamicConstIndexer : public boost::static_visitor { int idx_; }; -///@endcond +/// @endcond int& DDim::operator[](int idx) { return boost::apply_visitor(DynamicMutableIndexer(idx), var); @@ -155,11 +169,11 @@ int get(const DDim& ddim, int idx) { return ddim[idx]; } void set(DDim& ddim, int idx, int value) { ddim[idx] = value; } -///@cond HIDDEN +/// @cond HIDDEN struct VectorizeVisitor : public boost::static_visitor<> { std::vector& vector; - VectorizeVisitor(std::vector& v) : vector(v) {} + explicit VectorizeVisitor(std::vector& v) : vector(v) {} template void operator()(const T& t) { @@ -169,7 +183,7 @@ struct VectorizeVisitor : public boost::static_visitor<> { void operator()(const Dim<1>& t) { vector.push_back(t.head); } }; -///@endcond +/// @endcond std::vector vectorize(const DDim& ddim) { std::vector result; @@ -187,7 +201,7 @@ ssize_t product(const DDim& ddim) { return result; } -///\cond HIDDEN +/// \cond HIDDEN struct ArityVisitor : boost::static_visitor { template @@ -196,15 +210,15 @@ struct ArityVisitor : boost::static_visitor { } }; -///\endcond +/// \endcond int arity(const DDim& d) { return boost::apply_visitor(ArityVisitor(), d); } -///\cond HIDDEN +/// \cond HIDDEN struct DDimPrinter : boost::static_visitor { std::ostream& os; - DDimPrinter(std::ostream& os_) : os(os_) {} + explicit DDimPrinter(std::ostream& os_) : os(os_) {} template void operator()(const T& t) { @@ -212,7 +226,7 @@ struct DDimPrinter : boost::static_visitor { } }; -///\endcond +/// \endcond std::ostream& operator<<(std::ostream& os, const DDim& ddim) { DDimPrinter printer(os); diff --git a/paddle/framework/ddim.h b/paddle/framework/ddim.h index 223c4180be..f8714acf32 100644 --- a/paddle/framework/ddim.h +++ b/paddle/framework/ddim.h @@ -27,7 +27,7 @@ struct DDim { DDim() : var(Dim<1>()) {} template - DDim(const Dim& in) : var(in) {} + explicit DDim(const Dim& in) : var(in) {} template DDim& operator=(const Dim& in) { diff --git 
a/paddle/framework/net.cc b/paddle/framework/net.cc index 854ad8e33e..a0e8788846 100644 --- a/paddle/framework/net.cc +++ b/paddle/framework/net.cc @@ -1,3 +1,19 @@ +/* + Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + #include "paddle/framework/net.h" namespace paddle { diff --git a/paddle/framework/op_registry.cc b/paddle/framework/op_registry.cc index 4b35e04e68..1d14535c50 100644 --- a/paddle/framework/op_registry.cc +++ b/paddle/framework/op_registry.cc @@ -1,3 +1,17 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + #include namespace paddle { @@ -33,4 +47,4 @@ void AttrTypeHelper::SetAttrType>(AttrProto* attr) { attr->set_type(paddle::framework::AttrType::STRINGS); } } // namespace framework -} // namespace paddle \ No newline at end of file +} // namespace paddle diff --git a/paddle/framework/operator.cc b/paddle/framework/operator.cc index 8f7adff8b3..d065670829 100644 --- a/paddle/framework/operator.cc +++ b/paddle/framework/operator.cc @@ -40,4 +40,4 @@ std::string OperatorBase::DebugString() const { } } // namespace framework -} // namespace paddle \ No newline at end of file +} // namespace paddle diff --git a/paddle/memory/detail/memory_block.cc b/paddle/memory/detail/memory_block.cc index bc67bcef0f..fc40993208 100644 --- a/paddle/memory/detail/memory_block.cc +++ b/paddle/memory/detail/memory_block.cc @@ -152,6 +152,6 @@ MemoryBlock* MemoryBlock::metadata() const { reinterpret_cast(this) - 1)); } -} // detail -} // memory -} // paddle +} // namespace detail +} // namespace memory +} // namespace paddle diff --git a/paddle/memory/memory.cc b/paddle/memory/memory.cc index 430ce98bfc..df3d57d629 100644 --- a/paddle/memory/memory.cc +++ b/paddle/memory/memory.cc @@ -17,8 +17,6 @@ limitations under the License. 
*/ #include "paddle/memory/detail/system_allocator.h" #include "paddle/platform/assert.h" -#include - namespace paddle { namespace memory { diff --git a/paddle/platform/cpu_info.cc b/paddle/platform/cpu_info.cc index 1905cfeee6..dfab391cfb 100644 --- a/paddle/platform/cpu_info.cc +++ b/paddle/platform/cpu_info.cc @@ -41,8 +41,8 @@ inline size_t CpuTotalPhysicalMemory() { if (sysctl(mib, 2, &size, &len, NULL, 0) == 0) return (size_t)size; return 0L; #else - long pages = sysconf(_SC_PHYS_PAGES); - long page_size = sysconf(_SC_PAGE_SIZE); + int64_t pages = sysconf(_SC_PHYS_PAGES); + int64_t page_size = sysconf(_SC_PAGE_SIZE); return pages * page_size; #endif } diff --git a/paddle/platform/place.cc b/paddle/platform/place.cc index 0704820aa0..b31515e1f0 100644 --- a/paddle/platform/place.cc +++ b/paddle/platform/place.cc @@ -1,3 +1,17 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + #include "paddle/platform/place.h" namespace paddle { @@ -7,7 +21,7 @@ namespace detail { class PlacePrinter : public boost::static_visitor<> { public: - PlacePrinter(std::ostream &os) : os_(os) {} + explicit PlacePrinter(std::ostream &os) : os_(os) {} void operator()(const CPUPlace &) { os_ << "CPUPlace"; } void operator()(const GPUPlace &p) { os_ << "GPUPlace(" << p.device << ")"; } From 569f7e83a6fa37a82e341cf4a588af7abfbcfca8 Mon Sep 17 00:00:00 2001 From: liaogang Date: Sat, 15 Jul 2017 10:02:27 +0800 Subject: [PATCH 60/64] FIX: cpplint code style --- paddle/function/GemmConvOp.cpp | 9 +-- paddle/function/NaiveConvOp.cpp | 3 +- .../gradientmachines/NeuralNetwork.cpp | 2 +- .../RecurrentGradientMachine.cpp | 2 +- paddle/gserver/layers/AgentLayer.cpp | 2 +- paddle/operators/add_op.cc | 18 +++++- paddle/optimizer/parameter_optimizer_test.cpp | 60 ++++++++++++------- paddle/optimizer/serialization_test.cpp | 27 ++++++--- paddle/utils/DynamicLoader.h | 5 +- paddle/utils/ThreadLocal.h | 12 ++-- 10 files changed, 88 insertions(+), 52 deletions(-) diff --git a/paddle/function/GemmConvOp.cpp b/paddle/function/GemmConvOp.cpp index a40e5d9d2e..00880effc5 100644 --- a/paddle/function/GemmConvOp.cpp +++ b/paddle/function/GemmConvOp.cpp @@ -117,8 +117,7 @@ public: ConvFunctionBase::init(config); } - virtual void check(const BufferArgs& inputs, - const BufferArgs& outputs) override { + void check(const BufferArgs& inputs, const BufferArgs& outputs) override { const TensorShape& input = inputs[0].shape(); const TensorShape& filter = inputs[1].shape(); const TensorShape& output = outputs[0].shape(); @@ -217,8 +216,7 @@ public: ConvFunctionBase::init(config); } - virtual void check(const BufferArgs& inputs, - const BufferArgs& outputs) override { + void check(const BufferArgs& inputs, const BufferArgs& outputs) override { const TensorShape& output = inputs[0].shape(); const TensorShape& filter = inputs[1].shape(); const TensorShape& input = outputs[0].shape(); @@ -311,8 +309,7 @@ public: ConvFunctionBase::init(config); } - virtual void check(const BufferArgs& inputs, - 
const BufferArgs& outputs) override { + void check(const BufferArgs& inputs, const BufferArgs& outputs) override { const TensorShape& output = inputs[0].shape(); const TensorShape& input = inputs[1].shape(); const TensorShape& filter = outputs[0].shape(); diff --git a/paddle/function/NaiveConvOp.cpp b/paddle/function/NaiveConvOp.cpp index 4348f0f775..e0692fa06d 100644 --- a/paddle/function/NaiveConvOp.cpp +++ b/paddle/function/NaiveConvOp.cpp @@ -90,8 +90,7 @@ public: ConvFunctionBase::init(config); } - virtual void check(const BufferArgs& inputs, - const BufferArgs& outputs) override { + void check(const BufferArgs& inputs, const BufferArgs& outputs) override { const TensorShape& input = inputs[0].shape(); const TensorShape& filter = inputs[1].shape(); const TensorShape& output = outputs[0].shape(); diff --git a/paddle/gserver/gradientmachines/NeuralNetwork.cpp b/paddle/gserver/gradientmachines/NeuralNetwork.cpp index 2e839f6405..cfa80a8936 100644 --- a/paddle/gserver/gradientmachines/NeuralNetwork.cpp +++ b/paddle/gserver/gradientmachines/NeuralNetwork.cpp @@ -403,7 +403,7 @@ public: : layerName_(layerName) { addEvaluator(std::move(evaluator)); } - virtual void eval(const NeuralNetwork& nn) override { + void eval(const NeuralNetwork& nn) override { const LayerPtr& layer = nn.getLayer(layerName_); CHECK(layer) << "Nonexisted layer: " << layerName_ << " in submodel " << nn.getName(); diff --git a/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp b/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp index 9a972466d6..9ddd449de7 100644 --- a/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp +++ b/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp @@ -636,7 +636,7 @@ void lenToStarts(std::vector& starts) { } starts.back() = pos; } -} +} // namespace void RecurrentGradientMachine::calcSequenceStartPositions() { std::vector starts(commonSeqInfo_.size() + 1); diff --git a/paddle/gserver/layers/AgentLayer.cpp b/paddle/gserver/layers/AgentLayer.cpp index 15e7411b5f..bdae7e623a 100644 --- a/paddle/gserver/layers/AgentLayer.cpp +++ b/paddle/gserver/layers/AgentLayer.cpp @@ -124,7 +124,7 @@ void copyElements(const IVector& srcVec, dest[index[i]] = src[i]; } } -} +} // namespace void GatherAgentLayer::forwardIds(PassType passType) { IVectorPtr realId = realLayers_[0]->getOutputLabel(); diff --git a/paddle/operators/add_op.cc b/paddle/operators/add_op.cc index 2766f0bf25..522b23cbc4 100644 --- a/paddle/operators/add_op.cc +++ b/paddle/operators/add_op.cc @@ -1,3 +1,17 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + #include #include #include @@ -36,9 +50,9 @@ The equation is: Out = X + Y )DOC"); } }; -} // namespace op +} // namespace operators } // namespace paddle REGISTER_OP(add_two, paddle::operators::AddOp, paddle::operators::AddOpMaker); REGISTER_OP_CPU_KERNEL( - add_two, ::paddle::operators::AddKernel<::paddle::platform::CPUPlace>); \ No newline at end of file + add_two, ::paddle::operators::AddKernel<::paddle::platform::CPUPlace>); diff --git a/paddle/optimizer/parameter_optimizer_test.cpp b/paddle/optimizer/parameter_optimizer_test.cpp index 4e6254d9e4..60a3b32789 100644 --- a/paddle/optimizer/parameter_optimizer_test.cpp +++ b/paddle/optimizer/parameter_optimizer_test.cpp @@ -1,3 +1,19 @@ +/* + Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + #include "parameter_optimizer.h" #include #include @@ -5,21 +21,18 @@ #include "gtest/gtest.h" #include "lr_policy.h" -using namespace paddle; -using namespace paddle::optimizer; - -Tensor* FillTensor(size_t size) { - Tensor* param = new Tensor(size); - Tensor& p = *param; +paddle::optimizer::Tensor* FillTensor(size_t size) { + paddle::optimizer::Tensor* param = new paddle::optimizer::Tensor(size); + paddle::optimizer::Tensor& p = *param; for (size_t i = 0; i < p.size(); ++i) { p[i] = (float)rand() / (float)RAND_MAX; } return param; } -Tensor* FixedTensor(size_t size) { - Tensor* param = new Tensor(size); - Tensor& p = *param; +paddle::optimizer::Tensor* FixedTensor(size_t size) { + paddle::optimizer::Tensor* param = new paddle::optimizer::Tensor(size); + paddle::optimizer::Tensor& p = *param; for (size_t i = 0; i < p.size(); ++i) { p[i] = i; } @@ -28,7 +41,8 @@ Tensor* FixedTensor(size_t size) { class OptimizerTest : public testing::Test { public: - // init tensor shape + virtual ~OptimizerTest(); + // init paddle::optimizer::Tensor shape const size_t kSize = 5; virtual void SetUp() { @@ -38,34 +52,36 @@ public: virtual void TearDown() {} void CreateSGD() { - Tensor* parameter = FixedTensor(kSize); - config_.set_optimizer(OptimizerConfig::SGD); + paddle::optimizer::Tensor* parameter = FixedTensor(kSize); + config_.set_optimizer(paddle::OptimizerConfig::SGD); config_.mutable_sgd()->set_momentum(0.0); config_.mutable_sgd()->set_decay(0.0); config_.mutable_sgd()->set_nesterov(false); - config_.set_lr_policy(OptimizerConfig::Const); + config_.set_lr_policy(paddle::OptimizerConfig::Const); config_.mutable_const_lr()->set_learning_rate(0.1); std::string str = config_.SerializeAsString(); - ParameterOptimizer* opt = ParameterOptimizer::Create(str, parameter); + paddle::optimizer::ParameterOptimizer* opt = + paddle::optimizer::ParameterOptimizer::Create(str, parameter); opts_.push_back(opt); } void CreateAdam() { - Tensor* parameter = FixedTensor(kSize); - config_.set_optimizer(OptimizerConfig::Adam); + paddle::optimizer::Tensor* parameter = FixedTensor(kSize); + config_.set_optimizer(paddle::OptimizerConfig::Adam); config_.mutable_adam()->set_beta_1(0.9); config_.mutable_adam()->set_beta_2(0.1); 
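The beta_1, beta_2, epsilon, and learning-rate values configured by this test drive the standard Adam update rule; a compact NumPy sketch of a single step, reusing the same values as defaults, may help readers follow the setup. This is an illustration only, not Paddle's optimizer code, and the helper name and toy inputs are made up:

    import numpy as np

    def adam_step(p, g, m, v, t, lr=0.1, b1=0.9, b2=0.1, eps=1e-3):
        m = b1 * m + (1 - b1) * g          # first-moment moving average
        v = b2 * v + (1 - b2) * g * g      # second-moment moving average
        m_hat = m / (1 - b1 ** t)          # bias-corrected estimates
        v_hat = v / (1 - b2 ** t)
        p = p - lr * m_hat / (np.sqrt(v_hat) + eps)
        return p, m, v

    p, m, v = np.ones(5), np.zeros(5), np.zeros(5)
    g = np.full(5, 0.5)                    # a toy gradient
    p, m, v = adam_step(p, g, m, v, t=1)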
     config_.mutable_adam()->set_epsilon(1e-3);
     config_.mutable_adam()->set_decay(0.0);
-    config_.set_lr_policy(OptimizerConfig::Const);
+    config_.set_lr_policy(paddle::OptimizerConfig::Const);
     config_.mutable_const_lr()->set_learning_rate(0.1);
     std::string str = config_.SerializeAsString();
-    ParameterOptimizer* opt = ParameterOptimizer::Create(str, parameter);
+    paddle::optimizer::ParameterOptimizer* opt =
+        paddle::optimizer::ParameterOptimizer::Create(str, parameter);
     opts_.push_back(opt);
   }
 
   void TestGetWeight() {
-    Tensor* p = FixedTensor(kSize);
+    paddle::optimizer::Tensor* p = FixedTensor(kSize);
     for (size_t i = 0; i < opts_.size(); ++i) {
       int s = 0;
       float* newp = (float*)opts_[i]->get_weight(&s);
@@ -76,7 +92,7 @@ public:
   }
 
   void TestUpdate() {
-    Tensor* g = FixedTensor(kSize);
+    paddle::optimizer::Tensor* g = FixedTensor(kSize);
     for (size_t i = 0; i < opts_.size(); ++i) {
       opts_[i]->Update(g);
     }
@@ -91,8 +107,8 @@ public:
   }
 
 private:
-  std::vector<ParameterOptimizer*> opts_;
-  OptimizerConfig config_;
+  std::vector<paddle::optimizer::ParameterOptimizer*> opts_;
+  paddle::OptimizerConfig config_;
 };
 
 TEST_F(OptimizerTest, TestGetWeight) { TestGetWeight(); }
diff --git a/paddle/optimizer/serialization_test.cpp b/paddle/optimizer/serialization_test.cpp
index d2454140dc..e4d97cbdba 100644
--- a/paddle/optimizer/serialization_test.cpp
+++ b/paddle/optimizer/serialization_test.cpp
@@ -1,19 +1,32 @@
+/*
+  Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+  Licensed under the Apache License, Version 2.0 (the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+*/
+
 #include "serialization.h"
 #include "gtest/gtest.h"
 
-using namespace paddle;
-using namespace paddle::optimizer;
-
 TEST(TensorToProto, Case1) {
-  Tensor t(3), t1(3);
+  paddle::optimizer::Tensor t(3), t1(3);
   for (size_t i = 0; i < t.size(); ++i) {
     t[i] = i;
     t1[i] = 0;
   }
 
-  TensorProto proto;
-  TensorToProto(t, &proto);
-  ProtoToTensor(proto, &t1);
+  paddle::TensorProto proto;
+  paddle::optimizer::TensorToProto(t, &proto);
+  paddle::optimizer::ProtoToTensor(proto, &t1);
   for (size_t i = 0; i < t1.size(); ++i) {
     EXPECT_EQ(t1[i], t[i]);
   }
diff --git a/paddle/utils/DynamicLoader.h b/paddle/utils/DynamicLoader.h
index 9b5ad21724..2e5ff76a06 100644
--- a/paddle/utils/DynamicLoader.h
+++ b/paddle/utils/DynamicLoader.h
@@ -12,8 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#ifndef DYNAMIC_LOAD_H_
-#define DYNAMIC_LOAD_H_
+#pragma once
 
 #include <dlfcn.h>
 #include <memory>
@@ -59,5 +58,3 @@ void GetWarpCTCDsoHandle(void** dso_handle);
  *
  */
 void GetLapackDsoHandle(void** dso_handle);
-
-#endif  // DYNAMIC_LOAD_H_
diff --git a/paddle/utils/ThreadLocal.h b/paddle/utils/ThreadLocal.h
index b5e2862546..0a27b8b97b 100644
--- a/paddle/utils/ThreadLocal.h
+++ b/paddle/utils/ThreadLocal.h
@@ -51,7 +51,7 @@ template <class T>
 class ThreadLocal {
 public:
   ThreadLocal() {
-    CHECK(pthread_key_create(&threadSpecificKey_, dataDestructor) == 0);
+    CHECK_EQ(pthread_key_create(&threadSpecificKey_, dataDestructor), 0);
   }
   ~ThreadLocal() { pthread_key_delete(threadSpecificKey_); }
 
@@ -65,7 +65,7 @@ public:
     if (!p && createLocal) {
       p = new T();
       int ret = pthread_setspecific(threadSpecificKey_, p);
-      CHECK(ret == 0);
+      CHECK_EQ(ret, 0);
     }
     return p;
   }
@@ -79,7 +79,7 @@ public:
     if (T* q = get(false)) {
       dataDestructor(q);
     }
-    CHECK(pthread_setspecific(threadSpecificKey_, p) == 0);
+    CHECK_EQ(pthread_setspecific(threadSpecificKey_, p), 0);
   }
 
   /**
@@ -112,7 +112,7 @@ private:
 template <class T>
 class ThreadLocalD {
 public:
-  ThreadLocalD() { CHECK(pthread_key_create(&threadSpecificKey_, NULL) == 0); }
+  ThreadLocalD() { CHECK_EQ(pthread_key_create(&threadSpecificKey_, NULL), 0); }
   ~ThreadLocalD() {
     pthread_key_delete(threadSpecificKey_);
     for (auto t : threadMap_) {
@@ -127,7 +127,7 @@ public:
     T* p = (T*)pthread_getspecific(threadSpecificKey_);
     if (!p) {
       p = new T();
-      CHECK(pthread_setspecific(threadSpecificKey_, p) == 0);
+      CHECK_EQ(pthread_setspecific(threadSpecificKey_, p), 0);
       updateMap(p);
     }
     return p;
@@ -141,7 +141,7 @@ public:
     if (T* q = (T*)pthread_getspecific(threadSpecificKey_)) {
       dataDestructor(q);
     }
-    CHECK(pthread_setspecific(threadSpecificKey_, p) == 0);
+    CHECK_EQ(pthread_setspecific(threadSpecificKey_, p), 0);
     updateMap(p);
   }
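A minimal usage sketch of the ThreadLocal<T> template touched above; the Counter struct and bump() helper are hypothetical, not from the PaddlePaddle sources:

    #include "paddle/utils/ThreadLocal.h"

    struct Counter {
      int value = 0;
    };

    static ThreadLocal<Counter> gCounter;

    void bump() {
      // get() lazily allocates this thread's Counter via pthread_setspecific,
      // so every thread increments its own private copy.
      gCounter.get()->value += 1;
    }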
From ab8c7f11f3b376153d97f00abaca6dab7f041cf9 Mon Sep 17 00:00:00 2001
From: liaogang
Date: Sat, 15 Jul 2017 10:18:49 +0800
Subject: [PATCH 61/64] FIX: virtual OptimizerTest

---
 paddle/optimizer/parameter_optimizer_test.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddle/optimizer/parameter_optimizer_test.cpp b/paddle/optimizer/parameter_optimizer_test.cpp
index 60a3b32789..edf4ae37a9 100644
--- a/paddle/optimizer/parameter_optimizer_test.cpp
+++ b/paddle/optimizer/parameter_optimizer_test.cpp
@@ -41,7 +41,7 @@
 
 class OptimizerTest : public testing::Test {
 public:
-  virtual ~OptimizerTest();
+  virtual ~OptimizerTest() {}
   // init paddle::optimizer::Tensor shape
   const size_t kSize = 5;
 
From afa99d9ae6f96fff62e46e57d8a110121c1e9c6d Mon Sep 17 00:00:00 2001
From: Qiao Longfei
Date: Sun, 16 Jul 2017 12:53:03 +0800
Subject: [PATCH 62/64] add ADD_OP_CPU to enable add op with only cpu kernel
 (#2896)

* add ADD_OP_CPU to enable add op with only cpu kernel

---
 paddle/framework/op_registry.h | 32 +++++++++++++++++++++++---------
 1 file changed, 23 insertions(+), 9 deletions(-)

diff --git a/paddle/framework/op_registry.h b/paddle/framework/op_registry.h
index de20e7af05..19cb4c7b3e 100644
--- a/paddle/framework/op_registry.h
+++ b/paddle/framework/op_registry.h
@@ -241,12 +241,18 @@ class OpRegisterHelper {
   }
 };
 
+/**
+ * check if MACRO is used in GLOBAL NAMESPACE.
+ */
 #define STATIC_ASSERT_GLOBAL_NAMESPACE(uniq_name, msg)                  \
   struct __test_global_namespace_##uniq_name##__ {};                    \
   static_assert(std::is_same<::__test_global_namespace_##uniq_name##__, \
                              __test_global_namespace_##uniq_name##__>::value, \
                 msg)
 
+/**
+ * Macro to Register Operator.
+ */
 #define REGISTER_OP(__op_type, __op_class, __op_maker_class)                 \
   STATIC_ASSERT_GLOBAL_NAMESPACE(__reg_op__##__op_type,                      \
                                  "REGISTER_OP must be in global namespace"); \
   static ::paddle::framework::OpRegisterHelper<__op_class, __op_maker_class> \
       __op_register_##__op_type##__(#__op_type);                             \
   int __op_register_##__op_type##_handle__() { return 0; }
 
-#define REGISTER_OP_KERNEL(type, GPU_OR_CPU, PlaceType, KernelType)       \
+/**
+ * Macro to Register OperatorKernel.
+ */
+#define REGISTER_OP_KERNEL(type, DEVICE_TYPE, PlaceType, KernelType)      \
   STATIC_ASSERT_GLOBAL_NAMESPACE(                                         \
-      __reg_op_kernel_##type##_##GPU_OR_CPU##__,                          \
+      __reg_op_kernel_##type##_##DEVICE_TYPE##__,                         \
       "REGISTER_OP_KERNEL must be in global namespace");                  \
   struct __op_kernel_register__##type##__ {                               \
     __op_kernel_register__##type##__() {                                  \
     }                                                                     \
   };                                                                      \
   static __op_kernel_register__##type##__ __reg_kernel_##type##__;        \
-  int __op_kernel_register_##type##_handle_##GPU_OR_CPU##__() { return 0; }
+  int __op_kernel_register_##type##_handle_##DEVICE_TYPE##__() { return 0; }
 
 #define REGISTER_OP_GPU_KERNEL(type, KernelType) \
   REGISTER_OP_KERNEL(type, GPU, ::paddle::platform::GPUPlace, KernelType)
 
 #define REGISTER_OP_CPU_KERNEL(type, KernelType) \
   REGISTER_OP_KERNEL(type, CPU, ::paddle::platform::CPUPlace, KernelType)
 
+/**
+ * Macro to mark what Operator and Kernel we will use and tell the compiler to
+ * link them into target.
+ */
 #define USE_OP_WITHOUT_KERNEL(op_type)                      \
   STATIC_ASSERT_GLOBAL_NAMESPACE(                           \
       __use_op_without_kernel_##op_type,                    \
       "USE_OP_WITHOUT_KERNEL must be in global namespace"); \
@@ -292,15 +305,16 @@ class OpRegisterHelper {
       __attribute__((unused)) = \
       __op_kernel_register_##op_type##_handle_##DEVICE_TYPE##__()
 
-#ifdef PADDLE_ONLY_CPU
-#define USE_OP(op_type)           \
+// use Operator with only cpu kernel.
+#define USE_OP_CPU(op_type)       \
   USE_OP_WITHOUT_KERNEL(op_type); \
-  USE_OP_KERNEL(op_type, CPU);
+  USE_OP_KERNEL(op_type, CPU)
 
+#ifdef PADDLE_ONLY_CPU
+#define USE_OP(op_type) USE_OP_CPU(op_type)
 #else
-#define USE_OP(op_type)           \
-  USE_OP_WITHOUT_KERNEL(op_type); \
-  USE_OP_KERNEL(op_type, CPU);    \
+#define USE_OP(op_type) \
+  USE_OP_CPU(op_type);  \
   USE_OP_KERNEL(op_type, GPU)
 #endif
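A quick sketch of how the registration macros above compose; the op name and kernel types are hypothetical, not part of the patch:

    // In the .cc file that defines the operator:
    REGISTER_OP(my_op, MyOp, MyOpMaker);
    REGISTER_OP_CPU_KERNEL(my_op, MyOpKernel<::paddle::platform::CPUPlace>);

    // In any compilation unit that uses the op, so the linker keeps the
    // registration objects even without a direct symbol reference:
    USE_OP_CPU(my_op);  // CPU kernel only
    USE_OP(my_op);      // CPU kernel, plus GPU unless PADDLE_ONLY_CPU is set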
From 45ce1649a13a730931bc911576caad2f61afb715 Mon Sep 17 00:00:00 2001
From: Qiao Longfei
Date: Sun, 16 Jul 2017 18:08:55 +0800
Subject: [PATCH 63/64] change net to operator (#2846)

* OperatorBase should not store OpDesc, because not all ops contain an
  OpDesc and not all ops are created from an OpDesc.
* Networks do not contain OpDesc and are not created by OpDesc.
* Do not register Network to OpRegistry.
* The network is created directly by the user in Python, not from the registry.
* Correctly handle the `inputs` and `outputs` of a Network.
* Add CompleteAddOp() method.
* Remove `AddOp(OpDesc&)` in net-op; all ops are added by OperatorPtr.
* Rewrite the unit test so it truly tests what networks do.
* Optimise operator_test.

---
 paddle/framework/CMakeLists.txt   |   5 +-
 paddle/framework/net.cc           |  41 ++++++--
 paddle/framework/net.h            | 163 +++++++++--------------------
 paddle/framework/net_op_test.cc   |  67 ++++++++++++
 paddle/framework/op_registry.h    |   2 +-
 paddle/framework/operator.cc      |   2 +-
 paddle/framework/operator.h       |   7 +-
 paddle/framework/operator_test.cc |  46 +++++----
 8 files changed, 179 insertions(+), 154 deletions(-)
 create mode 100644 paddle/framework/net_op_test.cc

diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt
index 8415ce67e9..cc5b05ff0d 100644
--- a/paddle/framework/CMakeLists.txt
+++ b/paddle/framework/CMakeLists.txt
@@ -11,8 +11,10 @@ proto_library(op_proto SRCS op_proto.proto DEPS attr_type)
 cc_test(op_proto_test SRCS op_proto_test.cc DEPS op_proto protobuf)
 proto_library(op_desc SRCS op_desc.proto DEPS attr_type)
 cc_test(op_desc_test SRCS op_desc_test.cc DEPS op_desc protobuf)
+
 cc_library(operator SRCS operator.cc DEPS op_desc device_context)
 cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry)
+
 cc_library(op_registry SRCS op_registry.cc DEPS op_proto op_desc)
 cc_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry operator)
 py_proto_compile(framework_py_proto SRCS attr_type.proto op_proto.proto op_desc.proto)
@@ -21,4 +23,5 @@ add_custom_target(framework_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch
 add_dependencies(framework_py_proto framework_py_proto_init)
 
 proto_library(net_proto SRCS net_proto.proto DEPS op_proto)
-cc_library(net SRCS net.cc DEPS net_proto)
+cc_library(net SRCS net.cc DEPS operator net_proto op_registry)
+cc_test(net_op_test SRCS net_op_test.cc DEPS net)
diff --git a/paddle/framework/net.cc b/paddle/framework/net.cc
index a0e8788846..7311cda9a9 100644
--- a/paddle/framework/net.cc
+++ b/paddle/framework/net.cc
@@ -19,18 +19,41 @@
 namespace paddle {
 namespace framework {
 
-PlainNet::PlainNet(const NetDesc& def) {}
-
-void PlainNet::InferShape(const ScopePtr& scope) const {
+void PlainNet::CompleteAddOp() {
+  std::unordered_set<std::string> input_set;
+  std::unordered_set<std::string> output_set;
+  std::unordered_set<std::string> temp_output;
   for (auto& op : ops_) {
-    op.InferShape();
+    for (auto& ipt : op->inputs_) {
+      if (!Contains(output_set, ipt)) {  // Not other op's output
+        input_set.insert(ipt);
+      } else {
+        temp_output.insert(ipt);
+      }
+    }
+
+    for (auto& opt : op->outputs_) {
+      output_set.insert(opt);
+    }
   }
-}
-
-void PlainNet::Run(const ScopePtr& scope, const DeviceContext& ctx) const {
-  for (auto& op : ops_) {
-    op.Run(ctx);
+  inputs_.reserve(input_set.size());
+  std::copy(input_set.begin(), input_set.end(), std::back_inserter(inputs_));
+
+  outputs_.reserve(output_set.size());
+  std::vector<int> tmp_index;
+  tmp_index.reserve(temp_output.size());
+  int idx = 0;
+  for (auto& opt : output_set) {
+    if (Contains(temp_output, opt)) {
+      tmp_index.push_back(idx);
+    }
+    outputs_.push_back(opt);
+    ++idx;
   }
+
+  attrs_["temporary_index"] = tmp_index;
+  add_op_done_ = true;
 }
+
 }  // namespace framework
 }  // namespace paddle
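A worked sketch of the bookkeeping CompleteAddOp() performs, using two hypothetical ops; net_op_test.cc later in this patch exercises the same behavior:

    // op1: {x, w1, b1} -> {y},  op2: {y, w2, b2} -> {z}
    // After CompleteAddOp():
    //   inputs_  == {x, w1, b1, w2, b2}  // y is produced inside the net
    //   outputs_ == {y, z}
    //   attrs_["temporary_index"] == {index of y in outputs_}
    //     (y is both produced and consumed, so it is only a temporary)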
diff --git a/paddle/framework/net.h b/paddle/framework/net.h
index 0481d8f47c..19a1620e29 100644
--- a/paddle/framework/net.h
+++ b/paddle/framework/net.h
@@ -1,99 +1,51 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-
-       http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #pragma once
 
+#include <paddle/framework/op_desc.pb.h>
+#include <paddle/framework/operator.h>
 #include "paddle/framework/net_proto.pb.h"
 #include "paddle/framework/op_proto.pb.h"
+#include "paddle/framework/op_registry.h"
 #include "paddle/framework/scope.h"
 #include "paddle/platform/device_context.h"
 
 namespace paddle {
 namespace framework {
-using namespace paddle::platform;
-
-// operator's index stored in a network.
-typedef int OpIndex;
-/**
- * NOTE following codes are some definitions of unimplemented concepts.
- * We write some basic implementation to make Net compilable. These APIs will
- * keep updating if the concepts related are implemented.
- */
-
-struct OpDesc;
-struct OpAttrs {};
-
-class Operator {
- public:
-  Operator(const OpDesc &def) {}
-  void InferShape() const {}
-  void Run(const DeviceContext &ctx) const {}
-};
-
 /**
- * @brief Network that manage the operators it has.
+ * @brief Network is also a type of Operator
+ *
+ * It will manage the operators it has.
  *
- * Network is the container and controller of a set of operators, user can build
- * a real network from a NetDesc which is a protobuf message and use
- * Network.Run() to run all the operators in the network.
+ * Network is the container and controller of a set of operators.
+ *
  * A network object knows all Operators belonging to this network. Variables,
  * which are inputs and outputs of these operators, are created and managed by a
  * hierarchy of Scope objects.
  *
- * This is the base class of network, all the networks should implement the APIs
+ * This is the base class of network, all the networks should implement the APIs
  * it defines.
  */
-class Net {
+class Net : public OperatorBase {
  public:
-  /**
-   * @brief Infer shapes of all inputs and outputs of operators.
-   */
-  virtual void InferShape(const ScopePtr &scope) const = 0;
-  /**
-   * @brief Run the network.
-   *
-   * Run all the operators and return success(true) or not, with all the
-   * variables are located in `scope`. `context` describes the detail execution
-   * environment for ops. `begin` and `end` specify the scope of `ops_` to run,
-   * If no positive indexes are provided, all operators in `ops_` will run.
-   */
-  virtual void Run(const ScopePtr &scope, const DeviceContext &ctx) const = 0;
-
-  /**
-   * @brief Add an Operator according to `def`.
-   */
-  virtual OpIndex AddOp(const OpProto &def) = 0;
-
-  /**
-   * @brief Add optimizer operators acctording to `attrs`.
-   */
-  virtual void AddOptimizerOps(const OpAttrs &attrs) = 0;
-
-  /**
-   * @brief Add backward operators.
-   */
-  virtual void AddBackwardOps() = 0;
-
-  /**
-   * @brief Create a network.
-   */
-  static std::unique_ptr<Net> Create(const NetDesc &def = NetDesc());
-
-  virtual ~Net() {}
+  virtual void AddOp(const OperatorPtr& op) = 0;
+  virtual void CompleteAddOp() = 0;
 };
 
+using NetPtr = std::shared_ptr<Net>;
+
 /**
  * @brief a basic implementation of Net.
  *
 class PlainNet : public Net {
  public:
   /**
-   * @brief Initialize a PlainNet.
-   *
-   * Initialize from a network describe by `def`. NetDesc is the definition of
-   * a network.
-   */
-  PlainNet(const NetDesc &def);
-
-  /**
-   * Infer all the operators' input and output varialbes' shapes, will be called
+   * Infer all the operators' input and output variables' shapes, will be called
    * before every mini-batch
    */
-  virtual void InferShape(const ScopePtr &scope) const override;
+  void InferShape(const ScopePtr& scope) const override {
+    for (auto& op : ops_) {
+      op->InferShape(scope);
+    }
+  }
 
   /**
    * @brief Run the network.
    *
    * Run all the operators with the `scope`, if no scope is provided, default
    * scope will be used instead. If no OpContext is provicded, default context
    * will be used.
    */
-  virtual void Run(const ScopePtr &scope,
-                   const DeviceContext &ctx) const override;
+  void Run(const ScopePtr& scope,
+           const platform::DeviceContext& dev_ctx) const override {
+    for (auto& op : ops_) {
+      op->Run(scope, dev_ctx);
+    }
+  }
 
   /**
-   * @brief Add an operator to this network.
+   * @brief Add an operator by ptr
    */
-  virtual OpIndex AddOp(const OpProto &def) override;
+  void AddOp(const OperatorPtr& op) override {
+    PADDLE_ENFORCE(!add_op_done_, "Cannot AddOp when this network is sealed");
+    ops_.push_back(op);
+  }
 
-  /**
-   * @brief Add all optimizer operators related into the network.
-   */
-  virtual void AddOptimizerOps(const OpAttrs &attrs) override;
+  void CompleteAddOp() override;
 
-  /**
-   * @brief Add all backward operators related into the network.
-   */
-  virtual void AddBackwardOps() override;
-
-  virtual ~PlainNet() override {}
-
- protected:
-  /**
-   * @brief Build the network.
-   *
-   * Create operators accordding to `def`, will be called by the constructor.
-   */
-  void BuildNet(const NetDesc &def);
-
-  /**
-   * @brief Add an operator into this network.
-   *
-   * Add a operator which is identified as `type` and has attributes described
-   * in `attrs`, the `inputs` are the keys of readonly input variables,
-   * `outputs` are keys of mutable output variables. An `OpIndex` will be
-   * returned to indicate the offset of the new operator in `ops_`.
-   */
-  OpIndex AddOp(const std::string &type, const std::vector<std::string> &inputs,
-                const std::vector<std::string> &outputs,
-                const OpAttrs &attrs = OpAttrs());
+  std::vector<OperatorPtr> ops_;
 
  private:
-  // the operators owned by `Network`.
-  std::vector<Operator> ops_;
+  bool add_op_done_{false};
+
+  template <typename T, typename KeyType>
+  static bool Contains(T container, KeyType key) {
+    return container.find(key) != container.end();
+  }
 };
 
 }  // namespace framework
diff --git a/paddle/framework/net_op_test.cc b/paddle/framework/net_op_test.cc
new file mode 100644
index 0000000000..f5e1c22400
--- /dev/null
+++ b/paddle/framework/net_op_test.cc
@@ -0,0 +1,67 @@
+#include <gtest/gtest.h>
+#include <paddle/framework/net.h>
+#include <paddle/framework/op_registry.h>
+#include <paddle/framework/operator.h>
+
+namespace pd = paddle::framework;
+
+static int infer_shape_cnt = 0;
+static int run_cnt = 0;
+
+class TestOp : public pd::OperatorBase {
+ public:
+  void InferShape(const paddle::framework::ScopePtr& scope) const override {
+    ++infer_shape_cnt;
+  }
+  void Run(const paddle::framework::ScopePtr& scope,
+           const paddle::platform::DeviceContext& dev_ctx) const override {
+    ++run_cnt;
+  }
+};
+
+template <typename T>
+void AssertSameVectorWithoutOrder(const std::vector<T>& expected,
+                                  const std::vector<T>& actual) {
+  ASSERT_EQ(expected.size(), actual.size());
+  std::unordered_set<T> expected_set;
+  for (auto& tmp : expected) {
+    expected_set.insert(tmp);
+  }
+  for (auto& act : actual) {
+    ASSERT_NE(expected_set.end(), expected_set.find(act));
+  }
+}
+
+TEST(OpKernel, all) {
+  auto net = std::make_shared<pd::PlainNet>();
+  ASSERT_NE(net, nullptr);
+
+  auto op1 = std::make_shared<TestOp>();
+  op1->inputs_ = {"x", "w1", "b1"};
+  op1->outputs_ = {"y"};
+  net->AddOp(op1);
+
+  auto op2 = std::make_shared<TestOp>();
+  op2->inputs_ = {"y", "w2", "b2"};
+  op2->outputs_ = {"z"};
+  net->AddOp(op2);
+
+  net->CompleteAddOp();
+  AssertSameVectorWithoutOrder({"x", "w1", "b1", "w2", "b2"}, net->inputs_);
+  AssertSameVectorWithoutOrder({"y", "z"}, net->outputs_);
+  auto tmp_idx_iter = net->attrs_.find("temporary_index");
+  ASSERT_NE(net->attrs_.end(), tmp_idx_iter);
+  auto& tmp_idx = boost::get<std::vector<int>>(tmp_idx_iter->second);
+  ASSERT_EQ(1UL, tmp_idx.size());
+  ASSERT_EQ("y", net->outputs_[tmp_idx[0]]);
+
+  auto scope = std::make_shared<pd::Scope>();
+  paddle::platform::CPUDeviceContext dev_ctx;
+
+  net->InferShape(scope);
+  net->Run(scope, dev_ctx);
+  ASSERT_EQ(2, infer_shape_cnt);
+  ASSERT_EQ(2, run_cnt);
+
+  ASSERT_THROW(net->AddOp(op2), paddle::framework::EnforceNotMet);
+}
diff --git a/paddle/framework/op_registry.h b/paddle/framework/op_registry.h
index 19cb4c7b3e..24f56b2812 100644
--- a/paddle/framework/op_registry.h
+++ b/paddle/framework/op_registry.h
@@ -201,7 +201,7 @@ class OpRegistry {
   static OperatorPtr CreateOp(const OpDesc& op_desc) {
     std::string op_type = op_desc.type();
     OperatorPtr op(creators().at(op_type)());
-    op->desc_ = op_desc;
+    op->type_ = op_desc.type();
     op->inputs_.reserve((size_t)op_desc.inputs_size());
     std::copy(op_desc.inputs().begin(), op_desc.inputs().end(),
               std::back_inserter(op->inputs_));
diff --git a/paddle/framework/operator.cc b/paddle/framework/operator.cc
index d065670829..7756162a87 100644
--- a/paddle/framework/operator.cc
+++ b/paddle/framework/operator.cc
@@ -20,7 +20,7 @@ namespace framework {
 std::string OperatorBase::DebugString() const {
   std::stringstream ss;
   ss << "=================\n";
-  ss << "type = " << desc_.type() << "\n";
+  ss << "type = " << type_ << "\n";
   ss << "inputs = [";
   for (auto& ipt : inputs_) {
     ss << ipt << ", ";
diff --git a/paddle/framework/operator.h b/paddle/framework/operator.h
index cf79f379fa..f7ed6e9f3d 100644
--- a/paddle/framework/operator.h
+++ b/paddle/framework/operator.h
@@ -62,11 +62,8 @@ class OperatorBase {
   virtual void Run(const ScopePtr& scope,
                    const platform::DeviceContext& dev_ctx) const = 0;
 
- protected:
-  std::string Type() const { return desc_.type(); }
-
  public:
-  OpDesc desc_;
+  std::string type_;
   std::vector<std::string> inputs_;
   std::vector<std::string> outputs_;
   AttributeMap attrs_;
@@ -142,7 +139,7 @@ class OperatorWithKernel : public OperatorBase {
 
   void Run(const ScopePtr& scope,
            const platform::DeviceContext& dev_ctx) const final {
-    auto& opKernel = AllOpKernels().at(Type()).at(OpKernelKey(dev_ctx));
+    auto& opKernel = AllOpKernels().at(type_).at(OpKernelKey(dev_ctx));
     opKernel->Compute(OpKernel::KernelContext(this, scope, dev_ctx));
   }
 
diff --git a/paddle/framework/operator_test.cc b/paddle/framework/operator_test.cc
index d0c3153fae..19ac4ecafa 100644
--- a/paddle/framework/operator_test.cc
+++ b/paddle/framework/operator_test.cc
@@ -19,14 +19,18 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
 
-class OperatorTest : public OperatorBase {
+static int op_run_num = 0;
+
+class OpWithoutKernelTest : public OperatorBase {
  public:
   void Init() override { x = 1; }
   void InferShape(const ScopePtr& scope) const override {}
   void Run(const ScopePtr& scope,
            const platform::DeviceContext& dev_ctx) const override {
-    float scale = GetAttr<float>("scale");
-    ASSERT_NEAR(scale, 3.14, 1e-5);
+    op_run_num++;
+    ASSERT_EQ((int)inputs_.size(), 1);
+    ASSERT_EQ((int)outputs_.size(), 1);
+    ASSERT_NEAR(GetAttr<float>("scale"), 3.14, 1e-5);
     ASSERT_EQ(scope->GetVariable(inputs_[0]), nullptr);
     ASSERT_EQ(x, 1);
     ASSERT_NE(scope->GetVariable(outputs_[0]), nullptr);
@@ -36,15 +40,14 @@
   float x = 0;
 };
 
-class OperatorTestProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
+class OpeWithoutKernelTestProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
  public:
-  OperatorTestProtoAndCheckerMaker(OpProto* proto, OpAttrChecker* op_checker)
+  OpeWithoutKernelTestProtoAndCheckerMaker(OpProto* proto,
+                                           OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("input", "input of test op");
     AddOutput("output", "output of test op");
-    AddAttr<float>("scale", "scale of cosine op")
-        .SetDefault(1.0)
-        .LargerThan(0.0);
+    AddAttr<float>("scale", "scale of cosine op");
     AddComment("This is test op");
   }
 };
 
 }  // namespace framework
 }  // namespace paddle
 
-REGISTER_OP(test_operator, paddle::framework::OperatorTest,
-            paddle::framework::OperatorTestProtoAndCheckerMaker);
+REGISTER_OP(test_operator, paddle::framework::OpWithoutKernelTest,
+            paddle::framework::OpeWithoutKernelTestProtoAndCheckerMaker);
 
 TEST(OperatorBase, all) {
   paddle::framework::OpDesc op_desc;
@@ -63,18 +66,17 @@
   auto attr = op_desc.mutable_attrs()->Add();
   attr->set_name("scale");
   attr->set_type(paddle::framework::AttrType::FLOAT);
-  float scale = 3.14;
-  attr->set_f(scale);
+  attr->set_f(3.14);
 
   paddle::platform::CPUDeviceContext device_context;
   auto scope = std::make_shared<paddle::framework::Scope>();
 
   paddle::framework::OperatorPtr op =
       paddle::framework::OpRegistry::CreateOp(op_desc);
-  ASSERT_EQ(op->GetAttr<float>("scale"), scale);
   scope->CreateVariable("OUT1");
+  ASSERT_EQ(paddle::framework::op_run_num, 0);
   op->Run(scope, device_context);
-  std::cout << op->DebugString() << std::endl;
+  ASSERT_EQ(paddle::framework::op_run_num, 1);
 }
 
 namespace paddle {
 namespace framework {
 
 class OpKernelTestProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("input", "input of test op");
     AddOutput("output", "output of test op");
-    AddAttr<float>("scale", "scale of cosine op")
-        .SetDefault(1.0)
-        .LargerThan(0.0);
+    AddAttr<float>("scale", "scale of cosine op");
     AddComment("This is test op");
   }
 };
 
+static int cpu_kernel_run_num = 0;
+
 class OpWithKernelTest : public OperatorWithKernel {
  protected:
   void InferShape(const std::vector<const Tensor*>& inputs,
@@ -102,10 +104,10 @@
 class CPUKernelTest : public OpKernel {
  public:
   void Compute(const KernelContext& context) const {
-    float scale = context.op_.GetAttr<float>("scale");
-    ASSERT_NEAR(scale, 3.14, 1e-5);
-    std::cout << "this is cpu kernel" << std::endl;
-    std::cout << context.op_.DebugString() << std::endl;
+    cpu_kernel_run_num++;
+    ASSERT_EQ((int)context.op_.inputs_.size(), 1);
+    ASSERT_EQ((int)context.op_.outputs_.size(), 1);
+    ASSERT_NEAR(context.op_.GetAttr<float>("scale"), 3.14, 1e-5);
   }
 };
 
@@ -131,5 +133,7 @@
   paddle::framework::OperatorPtr op =
       paddle::framework::OpRegistry::CreateOp(op_desc);
 
+  ASSERT_EQ(paddle::framework::cpu_kernel_run_num, 0);
   op->Run(scope, cpu_device_context);
+  ASSERT_EQ(paddle::framework::cpu_kernel_run_num, 1);
 }
From 12fe514dd35d2ea16caecde559c6b192debe378f Mon Sep 17 00:00:00 2001
From: dongzhihong
Date: Sun, 16 Jul 2017 19:20:28 +0800
Subject: [PATCH 64/64] "fix unrar in docker"

---
 Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Dockerfile b/Dockerfile
index ed5910d93b..8cfb16928c 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -25,7 +25,7 @@ COPY ./paddle/scripts/docker/root/ /root/
 RUN apt-get update && \
     apt-get install -y \
     git python-pip python-dev openssh-server bison \
-    wget unzip tar xz-utils bzip2 gzip coreutils ntp \
+    wget unzip unrar tar xz-utils bzip2 gzip coreutils ntp \
     curl sed grep graphviz libjpeg-dev zlib1g-dev \
     python-numpy python-matplotlib gcc g++ \
     automake locales clang-format-3.8 swig doxygen cmake \