commit 48cf64e833
@@ -1 +1,11 @@
add_subdirectory(detail)

cc_library(memory SRCS memory.cc)

cc_library(paddle_memory
    DEPS
    memory meta_data
    meta_cache memory_block
    buddy_allocator system_allocator)

cc_test(memory_test SRCS memory_test.cc DEPS place paddle_memory)
@@ -1,7 +1,15 @@
 if(${WITH_GPU})
-  nv_library(system_allocator SRCS system_allocator.cc DEPS gflags)
-  nv_test(system_allocator_test SRCS system_allocator_test.cc DEPS system_allocator gflags)
+  nv_library(system_allocator SRCS system_allocator.cc DEPS gflags cpu_info gpu_info)
 else(${WITH_GPU})
-  cc_library(system_allocator SRCS system_allocator.cc DEPS gflags)
-  cc_test(system_allocator_test SRCS system_allocator_test.cc DEPS system_allocator gflags)
+  cc_library(system_allocator SRCS system_allocator.cc DEPS gflags cpu_info)
 endif(${WITH_GPU})
+
+cc_test(system_allocator_test SRCS system_allocator_test.cc DEPS system_allocator)
+
+cc_library(meta_data SRCS meta_data.cc)
+
+cc_library(meta_cache SRCS meta_cache.cc)
+
+cc_library(memory_block SRCS memory_block.cc)
+
+cc_library(buddy_allocator SRCS buddy_allocator.cc DEPS glog)
File diff suppressed because it is too large.
@@ -0,0 +1,157 @@
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/memory/detail/memory_block.h"
#include "paddle/memory/detail/meta_cache.h"
#include "paddle/memory/detail/meta_data.h"
#include "paddle/platform/assert.h"

namespace paddle {
namespace memory {
namespace detail {

void MemoryBlock::init(MetadataCache& cache, Type t, size_t index, size_t size,
                       void* left_buddy, void* right_buddy) {
  cache.store(this, Metadata(t, index, size - sizeof(Metadata), size,
                             static_cast<MemoryBlock*>(left_buddy),
                             static_cast<MemoryBlock*>(right_buddy)));
}

MemoryBlock::Type MemoryBlock::type(MetadataCache& cache) const {
  return cache.load(this).type;
}

size_t MemoryBlock::size(MetadataCache& cache) const {
  return cache.load(this).size;
}

size_t MemoryBlock::total_size(MetadataCache& cache) const {
  return cache.load(this).total_size;
}

MemoryBlock* MemoryBlock::left_buddy(MetadataCache& cache) const {
  return cache.load(this).left_buddy;
}

MemoryBlock* MemoryBlock::right_buddy(MetadataCache& cache) const {
  return cache.load(this).right_buddy;
}

void MemoryBlock::split(MetadataCache& cache, size_t size) {
  // make sure the split fits
  PADDLE_ASSERT(total_size(cache) >= size);

  // bail out if there is no room for another partition
  if (total_size(cache) - size <= sizeof(Metadata)) {
    return;
  }

  // find the position of the split
  void* right_partition = reinterpret_cast<uint8_t*>(this) + size;

  size_t remaining_size = total_size(cache) - size;

  // Add the new block as a buddy
  auto metadata = cache.load(this);

  // Write the metadata for the new block
  auto new_block_right_buddy = metadata.right_buddy;

  cache.store(
      static_cast<MemoryBlock*>(right_partition),
      Metadata(FREE_CHUNK, index(cache), remaining_size - sizeof(Metadata),
               remaining_size, this, new_block_right_buddy));

  metadata.right_buddy = static_cast<MemoryBlock*>(right_partition);
  metadata.size = size - sizeof(Metadata);
  metadata.total_size = size;

  cache.store(this, metadata);

  // Write metadata for the new block's right buddy
  if (new_block_right_buddy != nullptr) {
    auto buddy_metadata = cache.load(new_block_right_buddy);

    buddy_metadata.left_buddy = static_cast<MemoryBlock*>(right_partition);

    cache.store(new_block_right_buddy, buddy_metadata);
  }
}

void MemoryBlock::merge(MetadataCache& cache, MemoryBlock* right_buddy) {
  // only free blocks can be merged
  PADDLE_ASSERT(type(cache) == FREE_CHUNK);
  PADDLE_ASSERT(right_buddy->type(cache) == FREE_CHUNK);

  auto metadata = cache.load(this);

  // link this->buddy's buddy
  metadata.right_buddy = right_buddy->right_buddy(cache);

  // link buddy's buddy -> this
  if (metadata.right_buddy != nullptr) {
    auto buddy_metadata = cache.load(metadata.right_buddy);

    buddy_metadata.left_buddy = this;

    cache.store(metadata.right_buddy, buddy_metadata);
  }

  metadata.size += right_buddy->total_size(cache);
  metadata.total_size += right_buddy->total_size(cache);

  cache.store(this, metadata);
  cache.store(right_buddy, Metadata(INVALID_CHUNK, 0, 0, 0, nullptr, nullptr));
}

void MemoryBlock::mark_as_free(MetadataCache& cache) {
  // check for double free or corruption
  PADDLE_ASSERT(type(cache) != FREE_CHUNK);
  PADDLE_ASSERT(type(cache) != INVALID_CHUNK);

  set_type(cache, FREE_CHUNK);
}

void MemoryBlock::set_type(MetadataCache& cache, Type t) {
  auto metadata = cache.load(this);

  metadata.type = t;

  cache.store(this, metadata);
}

bool MemoryBlock::has_left_buddy(MetadataCache& cache) const {
  return left_buddy(cache) != nullptr;
}

bool MemoryBlock::has_right_buddy(MetadataCache& cache) const {
  return right_buddy(cache) != nullptr;
}

size_t MemoryBlock::index(MetadataCache& cache) const {
  return cache.load(this).index;
}

void* MemoryBlock::data() const {
  return const_cast<Metadata*>(reinterpret_cast<const Metadata*>(this)) + 1;
}

MemoryBlock* MemoryBlock::metadata() const {
  return const_cast<MemoryBlock*>(reinterpret_cast<const MemoryBlock*>(
      reinterpret_cast<const Metadata*>(this) - 1));
}

}  // detail
}  // memory
}  // paddle
@@ -0,0 +1,91 @@
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once

#include <cstddef>

namespace paddle {
namespace memory {
namespace detail {

// Forward Declarations
class MetadataCache;

/*! \brief A class used to interpret the contents of a memory block */
class MemoryBlock {
 public:
  enum Type {
    FREE_CHUNK,    // memory is free and idle
    ARENA_CHUNK,   // memory is being occupied
    HUGE_CHUNK,    // memory is out of management
    INVALID_CHUNK  // memory is invalid
  };

 public:
  void init(MetadataCache& cache, Type t, size_t index, size_t size,
            void* left_buddy, void* right_buddy);

 public:
  /*! \brief The type of the allocation */
  Type type(MetadataCache& cache) const;

  /*! \brief The size of the data region */
  size_t size(MetadataCache& cache) const;

  /*! \brief An index to track the allocator */
  size_t index(MetadataCache& cache) const;

  /*! \brief The total size of the block */
  size_t total_size(MetadataCache& cache) const;

  /*! \brief Check the left buddy of the block */
  bool has_left_buddy(MetadataCache& cache) const;

  /*! \brief Check the right buddy of the block */
  bool has_right_buddy(MetadataCache& cache) const;

  /*! \brief Get the left buddy */
  MemoryBlock* left_buddy(MetadataCache& cache) const;

  /*! \brief Get the right buddy */
  MemoryBlock* right_buddy(MetadataCache& cache) const;

 public:
  /*! \brief Split the allocation into left/right blocks */
  void split(MetadataCache& cache, size_t size);

  /*! \brief Merge left and right blocks together */
  void merge(MetadataCache& cache, MemoryBlock* right_buddy);

  /*! \brief Mark the allocation as free */
  void mark_as_free(MetadataCache& cache);

  /*! \brief Change the type of the allocation */
  void set_type(MetadataCache& cache, Type t);

 public:
  /*! \brief Get a pointer to the memory block's data */
  void* data() const;

  /*! \brief Get a pointer to the memory block's metadata */
  MemoryBlock* metadata() const;

 public:
  static size_t overhead();
};

}  // namespace detail
}  // namespace memory
}  // namespace paddle
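The header above only declares an interpreter over raw bytes; a buddy allocator drives it through a MetadataCache. The following usage sketch is illustrative only (it is not part of this commit) and assumes a Paddle source tree for the includes; the buffer size and index values are arbitrary.

#include "paddle/memory/detail/memory_block.h"
#include "paddle/memory/detail/meta_cache.h"
#include "paddle/memory/detail/meta_data.h"

#include <cstdlib>

using namespace paddle::memory::detail;

int main() {
  // CPU mode: metadata is stored in place at the head of each block.
  MetadataCache cache(/* uses_gpu = */ false);

  // Back one block with 1 MB of raw host memory.
  const size_t total = 1 << 20;
  void* raw = std::malloc(total);
  auto* block = static_cast<MemoryBlock*>(raw);

  // The whole buffer starts as a single free chunk owned by allocator 0.
  block->init(cache, MemoryBlock::FREE_CHUNK, /* index = */ 0, total,
              /* left_buddy = */ nullptr, /* right_buddy = */ nullptr);

  // Carve a 4 KB left partition; the remainder becomes its right buddy.
  block->split(cache, 1 << 12);
  MemoryBlock* right = block->right_buddy(cache);

  // Both halves are still free, so they can be coalesced again.
  block->merge(cache, right);

  std::free(raw);
  return 0;
}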
@@ -0,0 +1,57 @@
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/memory/detail/meta_cache.h"
#include "paddle/memory/detail/memory_block.h"
#include "paddle/platform/assert.h"

namespace paddle {
namespace memory {
namespace detail {

MetadataCache::MetadataCache(bool uses_gpu) : uses_gpu_(uses_gpu) {}

Metadata MetadataCache::load(const MemoryBlock* block) {
  if (uses_gpu_) {
    auto existing_metadata = cache_.find(block);
    PADDLE_ASSERT(existing_metadata->second.check_guards());
    return existing_metadata->second;
  } else {
    PADDLE_ASSERT(reinterpret_cast<const Metadata*>(block)->check_guards());
    return *reinterpret_cast<const Metadata*>(block);
  }
}

void MetadataCache::store(MemoryBlock* block,
                          const Metadata& original_metadata) {
  auto metadata = original_metadata;

  metadata.update_guards();

  if (uses_gpu_) {
    cache_[block] = metadata;
  } else {
    *reinterpret_cast<Metadata*>(block) = metadata;
  }
}

void MetadataCache::invalidate(MemoryBlock* block) {
  if (uses_gpu_) {
    cache_.erase(block);
  }
}

}  // namespace detail
}  // namespace memory
}  // namespace paddle
@@ -0,0 +1,64 @@
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once

#include "paddle/memory/detail/memory_block.h"
#include "paddle/memory/detail/meta_data.h"

#include <unordered_map>

namespace paddle {
namespace memory {
namespace detail {

/**
 * \brief A cache for accessing memory block meta-data that may be expensive
 *        to access directly.
 *
 * \note This class exists to unify the metadata format between GPU and CPU
 *       allocations. It should be removed when the CPU can access all GPU
 *       allocations directly via UVM.
 */
class MetadataCache {
 public:
  MetadataCache(bool uses_gpu);

 public:
  /*! \brief Load the associated metadata for the specified memory block. */
  Metadata load(const MemoryBlock*);

  /*! \brief Store the associated metadata for the specified memory block. */
  void store(MemoryBlock*, const Metadata&);

  /*! \brief Indicate that the specified metadata will no longer be used. */
  void invalidate(MemoryBlock*);

 public:
  MetadataCache(const MetadataCache&) = delete;
  MetadataCache& operator=(const MetadataCache&) = delete;

 private:
  bool uses_gpu_;

 private:
  typedef std::unordered_map<const MemoryBlock*, Metadata> MetadataMap;

 private:
  MetadataMap cache_;
};

}  // namespace detail
}  // namespace memory
}  // namespace paddle
@@ -0,0 +1,70 @@
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/memory/detail/meta_data.h"

#include <functional>

namespace paddle {
namespace memory {
namespace detail {

Metadata::Metadata(MemoryBlock::Type t, size_t i, size_t s, size_t ts,
                   MemoryBlock* l, MemoryBlock* r)
    : type(t),
      index(i),
      size(s),
      total_size(ts),
      left_buddy(l),
      right_buddy(r) {}

Metadata::Metadata()
    : type(MemoryBlock::INVALID_CHUNK),
      index(0),
      size(0),
      total_size(0),
      left_buddy(nullptr),
      right_buddy(nullptr) {}

template <class T>
inline void hash_combine(std::size_t& seed, const T& v) {
  std::hash<T> hasher;
  seed ^= hasher(v) + 0x9e3779b9 + (seed << 6) + (seed >> 2);
}

inline size_t hash(const Metadata* metadata, size_t initial_seed) {
  size_t seed = initial_seed;

  hash_combine(seed, (size_t)metadata->type);
  hash_combine(seed, metadata->index);
  hash_combine(seed, metadata->size);
  hash_combine(seed, metadata->total_size);
  hash_combine(seed, metadata->left_buddy);
  hash_combine(seed, metadata->right_buddy);

  return seed;
}

void Metadata::update_guards() {
  guard_begin = hash(this, 1);
  guard_end = hash(this, 2);
}

bool Metadata::check_guards() const {
  return guard_begin == hash(this, 1) && guard_end == hash(this, 2);
}

}  // namespace detail
}  // namespace memory
}  // namespace paddle
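The guard fields above are plain hash values recomputed over the metadata contents; if a stray write corrupts the header, the stored guards no longer match the recomputed hashes. A self-contained sketch of the same idea follows (illustrative only, not part of this commit; Meta, hash_meta, and guards_ok are hypothetical names).

#include <cstddef>
#include <functional>
#include <iostream>

struct Meta {
  std::size_t guard_begin = 0;
  std::size_t index = 0, size = 0, total_size = 0;
  std::size_t guard_end = 0;
};

template <class T>
void hash_combine(std::size_t& seed, const T& v) {
  seed ^= std::hash<T>()(v) + 0x9e3779b9 + (seed << 6) + (seed >> 2);
}

std::size_t hash_meta(const Meta& m, std::size_t seed) {
  hash_combine(seed, m.index);
  hash_combine(seed, m.size);
  hash_combine(seed, m.total_size);
  return seed;
}

bool guards_ok(const Meta& m) {
  return m.guard_begin == hash_meta(m, 1) && m.guard_end == hash_meta(m, 2);
}

int main() {
  Meta m;
  m.index = 0;
  m.size = 4096 - 64;
  m.total_size = 4096;

  // update_guards(): seal the current field values with two different seeds.
  m.guard_begin = hash_meta(m, 1);
  m.guard_end = hash_meta(m, 2);
  std::cout << guards_ok(m) << "\n";  // 1: guards match

  // A stray write over the header changes a field but not the guards.
  m.size = 0;
  std::cout << guards_ok(m) << "\n";  // 0: check_guards() would now fail
}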
@@ -0,0 +1,54 @@
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once

#include "paddle/memory/detail/memory_block.h"

#include <stddef.h>

namespace paddle {
namespace memory {
namespace detail {

class Metadata {
 public:
  Metadata(MemoryBlock::Type t, size_t i, size_t s, size_t ts, MemoryBlock* l,
           MemoryBlock* r);
  Metadata();

 public:
  /*! \brief Update the guards when metadata is changed */
  void update_guards();

  /*! \brief Check consistency to previous modification */
  bool check_guards() const;

 public:
  // TODO(gangliao): compress this
  // clang-format off
  size_t guard_begin = 0;
  MemoryBlock::Type type = MemoryBlock::INVALID_CHUNK;
  size_t index = 0;
  size_t size = 0;
  size_t total_size = 0;
  MemoryBlock* left_buddy = nullptr;
  MemoryBlock* right_buddy = nullptr;
  size_t guard_end = 0;
  // clang-format on
};

}  // namespace detail
}  // namespace memory
}  // namespace paddle
@@ -0,0 +1,138 @@
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/memory/memory.h"
#include "paddle/memory/detail/memory_block.h"
#include "paddle/memory/detail/meta_data.h"

#include "paddle/platform/cpu_info.h"
#include "paddle/platform/gpu_info.h"
#include "paddle/platform/place.h"

#include <gtest/gtest.h>
#include <unordered_map>

inline bool is_aligned(void const *p) {
  return 0 == (reinterpret_cast<uintptr_t>(p) & 0x3);
}

size_t align(size_t size, paddle::platform::CPUPlace place) {
  size += sizeof(paddle::memory::detail::Metadata);
  size_t alignment = paddle::platform::CpuMinChunkSize();
  size_t remaining = size % alignment;
  return remaining == 0 ? size : size + (alignment - remaining);
}

TEST(BuddyAllocator, CPUAllocation) {
  void *p = nullptr;

  EXPECT_EQ(p, nullptr);

  paddle::platform::CPUPlace cpu;
  p = paddle::memory::Alloc(cpu, 4096);

  EXPECT_NE(p, nullptr);

  paddle::memory::Free(cpu, p);
}

TEST(BuddyAllocator, CPUMultAlloc) {
  paddle::platform::CPUPlace cpu;

  std::unordered_map<void *, size_t> ps;

  size_t total_size = paddle::memory::Used(cpu);
  EXPECT_EQ(total_size, 0UL);

  for (auto size :
       {128, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304}) {
    ps[paddle::memory::Alloc(cpu, size)] = size;

    // Buddy Allocator doesn't manage too large memory chunk
    if (paddle::memory::Used(cpu) == total_size) continue;

    size_t aligned_size = align(size, cpu);
    total_size += aligned_size;
    EXPECT_EQ(total_size, paddle::memory::Used(cpu));
  }

  for (auto p : ps) {
    EXPECT_EQ(is_aligned(p.first), true);
    paddle::memory::Free(cpu, p.first);

    // Buddy Allocator doesn't manage too large memory chunk
    if (paddle::memory::Used(cpu) == total_size) continue;

    size_t aligned_size = align(p.second, cpu);
    total_size -= aligned_size;
    EXPECT_EQ(total_size, paddle::memory::Used(cpu));
  }
}

#ifndef PADDLE_ONLY_CPU

size_t align(size_t size, paddle::platform::GPUPlace place) {
  size += sizeof(paddle::memory::detail::Metadata);
  size_t alignment = paddle::platform::GpuMinChunkSize();
  size_t remaining = size % alignment;
  return remaining == 0 ? size : size + (alignment - remaining);
}

TEST(BuddyAllocator, GPUAllocation) {
  void *p = nullptr;

  EXPECT_EQ(p, nullptr);

  paddle::platform::GPUPlace gpu(0);
  p = paddle::memory::Alloc(gpu, 4096);

  EXPECT_NE(p, nullptr);

  paddle::memory::Free(gpu, p);
}

TEST(BuddyAllocator, GPUMultAlloc) {
  paddle::platform::GPUPlace gpu;

  std::unordered_map<void *, size_t> ps;

  size_t total_size = paddle::memory::Used(gpu);
  EXPECT_EQ(total_size, 0UL);

  for (auto size :
       {128, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304}) {
    ps[paddle::memory::Alloc(gpu, size)] = size;

    // Buddy Allocator doesn't manage too large memory chunk
    if (paddle::memory::Used(gpu) == total_size) continue;

    size_t aligned_size = align(size, gpu);
    total_size += aligned_size;
    EXPECT_EQ(total_size, paddle::memory::Used(gpu));
  }

  for (auto p : ps) {
    EXPECT_EQ(is_aligned(p.first), true);
    paddle::memory::Free(gpu, p.first);

    // Buddy Allocator doesn't manage too large memory chunk
    if (paddle::memory::Used(gpu) == total_size) continue;

    size_t aligned_size = align(p.second, gpu);
    total_size -= aligned_size;
    EXPECT_EQ(total_size, paddle::memory::Used(gpu));
  }
}

#endif  // PADDLE_ONLY_CPU
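For a sense of what the align() helpers in this test compute: assuming sizeof(paddle::memory::detail::Metadata) is 64 bytes on a typical 64-bit build (an assumption about padding, not something the test asserts), a 256-byte CPU request becomes 256 + 64 = 320 bytes, which then rounds up to the 4 KB CpuMinChunkSize(), i.e. 4096. Used() is therefore expected to grow in multiples of the minimum chunk size rather than by the raw request sizes.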
@@ -0,0 +1,67 @@
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/platform/cpu_info.h"

#ifdef __APPLE__
#include <sys/sysctl.h>
#include <sys/types.h>
#else
#include <unistd.h>
#endif

#include "gflags/gflags.h"
#include "paddle/platform/error.h"

DEFINE_double(fraction_of_cpu_memory_to_use, 1,
              "Default use 100% of CPU memory for PaddlePaddle,"
              "reserve the rest for page tables, etc");

namespace paddle {
namespace platform {

inline size_t CpuTotalPhysicalMemory() {
#ifdef __APPLE__
  int mib[2];
  mib[0] = CTL_HW;
  mib[1] = HW_MEMSIZE;
  int64_t size = 0;
  size_t len = sizeof(size);
  if (sysctl(mib, 2, &size, &len, NULL, 0) == 0) return (size_t)size;
  return 0L;
#else
  long pages = sysconf(_SC_PHYS_PAGES);
  long page_size = sysconf(_SC_PAGE_SIZE);
  return pages * page_size;
#endif
}

size_t CpuMaxAllocSize() {
  // For distributed systems, it requires configuring and limiting
  // the fraction of memory to use.
  return FLAGS_fraction_of_cpu_memory_to_use * CpuTotalPhysicalMemory();
}

size_t CpuMinChunkSize() {
  // Allow to allocate the minimum chunk size is 4 KB.
  return 1 << 12;
}

size_t CpuMaxChunkSize() {
  // Allow to allocate the maximum chunk size is roughly 3% of CPU memory.
  return CpuMaxAllocSize() / 32;
}

}  // namespace platform
}  // namespace paddle
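As a worked example of the sizing above (illustrative numbers only): on a host with 64 GB of physical memory and the default fraction_of_cpu_memory_to_use of 1, CpuMaxAllocSize() reports 64 GB, CpuMaxChunkSize() caps a single chunk at 64 GB / 32 = 2 GB, and CpuMinChunkSize() stays at the fixed 4 KB floor.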
@@ -0,0 +1,32 @@
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once

#include <stddef.h>

namespace paddle {
namespace platform {

//! Get the maximum allocation size for a machine.
size_t CpuMaxAllocSize();

//! Get the minimum chunk size for buddy allocator.
size_t CpuMinChunkSize();

//! Get the maximum chunk size for buddy allocator.
size_t CpuMaxChunkSize();

}  // namespace platform
}  // namespace paddle
@@ -0,0 +1,21 @@
#include "paddle/platform/cpu_info.h"
#include "paddle/string/printf.h"

#include <ostream>
#include <sstream>

#include "gflags/gflags.h"
#include "glog/logging.h"
#include "gtest/gtest.h"

DECLARE_double(fraction_of_cpu_memory_to_use);

TEST(CpuMemoryUsage, Print) {
  std::stringstream ss;
  size_t memory_size = paddle::platform::CpuMaxAllocSize() / 1024 / 1024 / 1024;
  float use_percent = FLAGS_fraction_of_cpu_memory_to_use * 100;

  std::cout << paddle::string::Sprintf("\n%.2f %% of CPU Memory Usage: %d GB\n",
                                       use_percent, memory_size)
            << std::endl;
}
@@ -1,59 +0,0 @@
#include <cuda_runtime.h>
#include <stdio.h>
#include "gtest/gtest.h"

#define CHECK_ERR(x)                  \
  if (x != cudaSuccess) {             \
    fprintf(stderr,                   \
            "%s in %s at line %d\n",  \
            cudaGetErrorString(err),  \
            __FILE__,                 \
            __LINE__);                \
    exit(-1);                         \
  }

__global__ void vecAdd(float *d_A, float *d_B, float *d_C, int n) {
  int i = blockDim.x * blockIdx.x + threadIdx.x;
  if (i < n) {
    d_C[i] = d_A[i] + d_B[i];
  }
}

TEST(Cuda, Equality) {
  int n = 10;
  // Memory allocation for h_A, h_B and h_C (in the host)
  float h_A[10] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 0.0};
  float h_B[10] = {0.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0};
  float h_C[10];
  float *d_A, *d_B, *d_C;
  cudaError_t err;
  // Memory allocation for d_A, d_B and d_C (in the device)
  err = cudaMalloc((void **)&d_A, sizeof(float) * n);
  CHECK_ERR(err);

  err = cudaMalloc((void **)&d_B, sizeof(float) * n);
  CHECK_ERR(err);

  err = cudaMalloc((void **)&d_C, sizeof(float) * n);
  CHECK_ERR(err);

  // Copying memory to device
  err = cudaMemcpy(d_A, h_A, sizeof(float) * n, cudaMemcpyHostToDevice);
  CHECK_ERR(err);

  err = cudaMemcpy(d_B, h_B, sizeof(float) * n, cudaMemcpyHostToDevice);
  CHECK_ERR(err);

  // Calling the kernel
  vecAdd<<<ceil(n / 256.0), 256>>>(d_A, d_B, d_C, n);

  // Copying results back to host
  err = cudaMemcpy(h_C, d_C, sizeof(float) * n, cudaMemcpyDeviceToHost);
  CHECK_ERR(err);

  EXPECT_EQ(h_C[0], 1.0);
  for (int i = 1; i < n - 1; ++i) {
    EXPECT_EQ(h_C[i], 11.0);
  }
  EXPECT_EQ(h_C[9], 1.0);
}
@@ -0,0 +1,87 @@
#pragma once

#include <sstream>
#include <stdexcept>
#include <string>

#ifndef PADDLE_ONLY_CPU

#include <cublas_v2.h>
#include <cudnn.h>
#include <curand.h>
#include <thrust/system/cuda/error.h>
#include <thrust/system_error.h>

#endif  // PADDLE_ONLY_CPU

namespace paddle {
namespace platform {

#ifndef PADDLE_ONLY_CPU

inline void throw_on_error(cudaError_t e, const char* message) {
  if (e) {
    throw thrust::system_error(e, thrust::cuda_category(), message);
  }
}

inline void throw_on_error(curandStatus_t stat, const char* message) {
  if (stat != CURAND_STATUS_SUCCESS) {
    throw thrust::system_error(cudaErrorLaunchFailure, thrust::cuda_category(),
                               message);
  }
}

inline void throw_on_error(cudnnStatus_t stat, const char* message) {
  std::stringstream ss;
  if (stat == CUDNN_STATUS_SUCCESS) {
    return;
  } else {
    ss << cudnnGetErrorString(stat);
    ss << ", " << message;
    throw std::runtime_error(ss.str());
  }
}

inline void throw_on_error(cublasStatus_t stat, const char* message) {
  std::stringstream ss;
  if (stat == CUBLAS_STATUS_SUCCESS) {
    return;
  } else if (stat == CUBLAS_STATUS_NOT_INITIALIZED) {
    ss << "CUBLAS: not initialized";
  } else if (stat == CUBLAS_STATUS_ALLOC_FAILED) {
    ss << "CUBLAS: alloc failed";
  } else if (stat == CUBLAS_STATUS_INVALID_VALUE) {
    ss << "CUBLAS: invalid value";
  } else if (stat == CUBLAS_STATUS_ARCH_MISMATCH) {
    ss << "CUBLAS: arch mismatch";
  } else if (stat == CUBLAS_STATUS_MAPPING_ERROR) {
    ss << "CUBLAS: mapping error";
  } else if (stat == CUBLAS_STATUS_EXECUTION_FAILED) {
    ss << "CUBLAS: execution failed";
  } else if (stat == CUBLAS_STATUS_INTERNAL_ERROR) {
    ss << "CUBLAS: internal error";
  } else if (stat == CUBLAS_STATUS_NOT_SUPPORTED) {
    ss << "CUBLAS: not supported";
  } else if (stat == CUBLAS_STATUS_LICENSE_ERROR) {
    ss << "CUBLAS: license error";
  }
  ss << ", " << message;
  throw std::runtime_error(ss.str());
}

inline void throw_on_error(cublasStatus_t stat) {
  const char* message = "";
  throw_on_error(stat, message);
}

#endif  // PADDLE_ONLY_CPU

inline void throw_on_error(int stat, const char* message) {
  if (stat) {
    throw std::runtime_error(message + (", stat = " + std::to_string(stat)));
  }
}

}  // namespace platform
}  // namespace paddle
@@ -0,0 +1,86 @@
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/platform/gpu_info.h"
#include "gflags/gflags.h"
#include "paddle/platform/error.h"

DEFINE_double(fraction_of_gpu_memory_to_use, 0.95,
              "Default use 95% of GPU memory for PaddlePaddle,"
              "reserve the rest for page tables, etc");

namespace paddle {
namespace platform {

int GetDeviceCount() {
  int count;
  throw_on_error(
      cudaGetDeviceCount(&count),
      "cudaGetDeviceCount failed in paddle::platform::GetDeviceCount");
  return count;
}

int GetCurrentDeviceId() {
  int device_id;
  throw_on_error(
      cudaGetDevice(&device_id),
      "cudaGetDevice failed in paddle::platform::GetCurrentDeviceId");
  return device_id;
}

void SetDeviceId(int id) {
  throw_on_error(cudaSetDevice(id),
                 "cudaSetDevice failed in paddle::platform::SetDeviceId");
}

void GpuMemoryUsage(size_t& available, size_t& total) {
  throw_on_error(cudaMemGetInfo(&available, &total),
                 "cudaMemGetInfo failed in paddle::platform::GetMemoryUsage");
}

size_t GpuMaxAllocSize() {
  size_t total = 0;
  size_t available = 0;

  GpuMemoryUsage(available, total);

  // Reserve the rest for page tables, etc.
  return static_cast<size_t>(total * FLAGS_fraction_of_gpu_memory_to_use);
}

size_t GpuMinChunkSize() {
  // Allow to allocate the minimum chunk size is 256 bytes.
  return 1 << 8;
}

size_t GpuMaxChunkSize() {
  size_t total = 0;
  size_t available = 0;

  GpuMemoryUsage(available, total);

  // Reserving the rest memory for page tables, etc.
  size_t reserving = (1 - FLAGS_fraction_of_gpu_memory_to_use) * total;

  // If available less than minimum chunk size, no usable memory exists.
  available = std::max(available, GpuMinChunkSize()) - GpuMinChunkSize();

  // If available less than reserving, no usable memory exists.
  size_t usable = std::max(available, reserving) - reserving;

  return usable;
}

}  // namespace platform
}  // namespace paddle
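As a worked example of GpuMaxChunkSize() above (illustrative numbers only): with 16 GB of device memory, 12 GB currently available, and the default fraction_of_gpu_memory_to_use of 0.95, the reservation is 0.05 * 16 GB = 0.8 GB; the available figure is first reduced by the 256-byte minimum chunk and then by the reservation, leaving roughly 11.2 GB as the largest chunk the buddy allocator will manage. The two std::max calls are there so the unsigned subtractions cannot underflow when available memory is already below the minimum chunk size or the reservation.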
Some files were not shown because too many files have changed in this diff.