commit f1a392a5fe
@@ -1,15 +1,12 @@
 add_subdirectory(detail)
+add_subdirectory(allocation)
-cc_library(malloc SRCS malloc.cc DEPS buddy_allocator place enforce)
+cc_library(malloc SRCS malloc.cc DEPS place enforce allocator_facade)
 cc_library(memcpy SRCS memcpy.cc DEPS place)

 cc_library(memory
     DEPS
     malloc
     memcpy)

-cc_test(malloc_test SRCS malloc_test.cc DEPS malloc)
-
 #if (WITH_GPU)
 #  nv_test(pinned_memory_test SRCS pinned_memory_test.cu DEPS place memory)
 #endif()
@@ -0,0 +1,64 @@
cc_library(allocator SRCS allocator.cc DEPS place)
cc_library(cpu_allocator SRCS cpu_allocator.cc DEPS allocator)
cc_library(best_fit_allocator SRCS best_fit_allocator.cc DEPS allocator)
cc_library(locked_allocator SRCS locked_allocator.cc DEPS allocator)
cc_library(buffered_allocator SRCS buffered_allocator.cc DEPS allocator)
cc_library(legacy_allocator SRCS legacy_allocator.cc DEPS allocator buddy_allocator)
cc_test(buffered_allocator_test SRCS buffered_allocator_test.cc DEPS best_fit_allocator locked_allocator buffered_allocator cpu_allocator)

if (WITH_GPU)
  nv_library(cuda_allocator SRCS cuda_allocator.cc DEPS allocator cuda_device_guard)
endif()

cc_library(retry_allocator SRCS retry_allocator.cc DEPS allocator)

if (WITH_GPU)
  nv_test(best_fit_allocator_test
          SRCS best_fit_allocator_test.cc
               best_fit_allocator_test.cu
          DEPS best_fit_allocator
               locked_allocator
               cpu_allocator
               cuda_allocator
               device_context
               memcpy)
else()
  cc_test(best_fit_allocator_test
          SRCS best_fit_allocator_test.cc
          DEPS best_fit_allocator
               locked_allocator
               cpu_allocator)
endif()

nv_library(pinned_allocator SRCS pinned_allocator.cc DEPS allocator)
if (WITH_GPU)
  set(AllocatorFacadeDeps gpu_info cuda_allocator pinned_allocator cuda_device_guard)
else ()
  set(AllocatorFacadeDeps)
endif()

cc_library(aligned_allocator SRCS aligned_allocator.cc DEPS allocator)
cc_library(auto_increment_allocator SRCS auto_increment_allocator.cc DEPS allocator)
cc_library(zero_size_allocator SRCS zero_size_allocator.cc DEPS allocator)
cc_library(conditional_allocator SRCS conditional_allocator.cc DEPS allocator)
cc_library(allocator_strategy SRCS allocator_strategy.cc DEPS gflags)
cc_library(allocator_facade SRCS allocator_facade.cc DEPS
  ${AllocatorFacadeDeps}
  cpu_allocator
  locked_allocator
  best_fit_allocator
  aligned_allocator
  auto_increment_allocator
  zero_size_allocator
  conditional_allocator
  retry_allocator
  buffered_allocator
  allocator_strategy
  legacy_allocator
  )

nv_test(allocation_and_eigen_test SRCS allocation_and_eigen_test.cu DEPS allocator_facade)

cc_test(retry_allocator_test SRCS retry_allocator_test.cc DEPS retry_allocator best_fit_allocator locked_allocator cpu_allocator)

cc_test(allocator_facade_test SRCS allocator_facade_test.cc DEPS allocator_facade)
@@ -0,0 +1,31 @@
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/memory/allocation/aligned_allocator.h"

namespace paddle {
namespace memory {
namespace allocation {

ThinAlignedAllocator::ThinAlignedAllocator(
    std::shared_ptr<Allocator> underlyning_allocator)
    : underlying_allocator_(std::move(underlyning_allocator)) {}

bool ThinAlignedAllocator::IsAllocThreadSafe() const {
  return underlying_allocator_->IsAllocThreadSafe();
}

}  // namespace allocation
}  // namespace memory
}  // namespace paddle
@@ -0,0 +1,100 @@
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once
#include <memory>
#include "paddle/fluid/memory/allocation/allocator.h"

namespace paddle {
namespace memory {
namespace allocation {

// The aligned allocation and allocator wrap a managed allocator and
// return an aligned pointer.
//
// NOTE(yy): For speed reasons, the alignment is taken as a template
// parameter; it could be a private member if necessary.
//
// NOTE(yy): kAlignment must be 2^N; this is enforced by the `static_assert`.
template <size_t kAlignment>
class AlignedAllocation : public Allocation {
  static_assert(kAlignment > 0 && (kAlignment & (kAlignment - 1)) == 0,
                "kAlignment must be 2^N");

 public:
  AlignedAllocation(AllocationPtr&& underlying_allocation, size_t size)
      : Allocation(AlignedPtr(underlying_allocation->ptr()),
                   size + kAlignment - Offset(underlying_allocation->ptr()),
                   underlying_allocation->place()),
        underlying_allocation_(std::move(underlying_allocation)) {}

 private:
  static void* AlignedPtr(void* ptr) {
    return reinterpret_cast<void*>(reinterpret_cast<uintptr_t>(ptr) +
                                   Offset(ptr));
  }

  // Offset to the aligned pointer.
  // If ptr is already aligned, returns 0.
  static size_t Offset(void* ptr) {
    auto ptr_addr = reinterpret_cast<intptr_t>(ptr);
    intptr_t aligned_addr = (ptr_addr & ~(kAlignment - 1));
    intptr_t diff = aligned_addr - ptr_addr;
    if (diff == 0) {
      return 0;
    } else {
      return kAlignment + diff;
    }
  }

  AllocationPtr underlying_allocation_;
};

// The thin aligned allocator is trivial and is used to keep the binary small.
//
// NOTE(yy): This is a trick for the template class. The common code is
// extracted into this `thin` class, so multiple specializations of the
// template class do not grow the binary size too much.
//
// NOTE(yy): This could be over-design. If it harms code readability, it
// could be removed later.
class ThinAlignedAllocator : public Allocator {
 public:
  explicit ThinAlignedAllocator(
      std::shared_ptr<Allocator> underlyning_allocator);

  bool IsAllocThreadSafe() const;

 protected:
  std::shared_ptr<Allocator> underlying_allocator_;
};

// An aligned allocator allocates `size + kAlignment` bytes and adjusts
// the pointer offset.
template <size_t kAlignment>
class AlignedAllocator : public ThinAlignedAllocator {
 public:
  using ThinAlignedAllocator::ThinAlignedAllocator;

 protected:
  Allocation* AllocateImpl(size_t size, Allocator::Attr attr) override {
    auto raw_allocation =
        underlying_allocator_->Allocate(size + kAlignment, attr);
    return new AlignedAllocation<kAlignment>(std::move(raw_allocation), size);
  }
};

}  // namespace allocation
}  // namespace memory
}  // namespace paddle
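To make the Offset() arithmetic in AlignedAllocation concrete, the following standalone sketch (not part of this commit; the addresses and the 64-byte alignment are made-up example values) reproduces the round-up computation:

#include <cstdint>
#include <cstdio>

// Standalone sketch of AlignedAllocation<kAlignment>::Offset() for
// kAlignment = 64. The offset is the distance from the raw pointer up to
// the next kAlignment boundary; an already-aligned pointer gets offset 0.
int main() {
  const std::uintptr_t kAlignment = 64;
  const std::uintptr_t addrs[] = {0x1000, 0x1003, 0x103F};
  for (std::uintptr_t ptr_addr : addrs) {
    std::uintptr_t aligned_down = ptr_addr & ~(kAlignment - 1);
    std::uintptr_t offset =
        (aligned_down == ptr_addr) ? 0
                                   : kAlignment - (ptr_addr - aligned_down);
    // e.g. raw 0x1003 -> offset 61, aligned pointer 0x1040
    std::printf("raw=0x%llx offset=%llu aligned=0x%llx\n",
                static_cast<unsigned long long>(ptr_addr),
                static_cast<unsigned long long>(offset),
                static_cast<unsigned long long>(ptr_addr + offset));
  }
  return 0;
}

With a raw allocation of `size + kAlignment` bytes (as requested by AllocateImpl above), the usable size after the aligned pointer is `size + kAlignment - offset`, which is exactly what the AlignedAllocation constructor records.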
@@ -0,0 +1,48 @@
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "gtest/gtest.h"
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/for_range.h"
#include "unsupported/Eigen/CXX11/Tensor"

// NOTE(yy): this unit test is not important. It is just used for debugging.
// It can be removed later.
struct FillZero {
 public:
  float* ptr_;

  __device__ void operator()(size_t i) { ptr_[i] = 0.0f; }
};

namespace paddle {
TEST(Eigen, main) {
  framework::Tensor tensor;
  platform::CUDAPlace gpu(0);
  float* ptr = tensor.mutable_data<float>({10, 10}, gpu);
  auto& dev_ctx = *reinterpret_cast<platform::CUDADeviceContext*>(
      platform::DeviceContextPool::Instance().Get(gpu));
  PADDLE_ENFORCE(cudaMemset(ptr, 0, sizeof(float) * 100));

  platform::ForRange<platform::CUDADeviceContext> for_range(dev_ctx, 100);
  for_range(FillZero{ptr});
  dev_ctx.Wait();

  auto eigen_vec = framework::EigenVector<float>::Flatten(tensor);
  auto& eigen_dev = *dev_ctx.eigen_device();
  eigen_vec.device(eigen_dev) = eigen_vec.constant(0.0f);
}
}  // namespace paddle
@@ -0,0 +1,33 @@
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include "paddle/fluid/memory/allocation/allocator.h"

namespace paddle {
namespace memory {
namespace allocation {

class AllocationWithUnderlying : public Allocation {
 public:
  explicit AllocationWithUnderlying(AllocationPtr allocation)
      : Allocation(allocation->ptr(), allocation->size(), allocation->place()),
        allocation_(std::move(allocation)) {}
  AllocationPtr allocation_;
};

}  // namespace allocation
}  // namespace memory
}  // namespace paddle
@@ -0,0 +1,45 @@
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/memory/allocation/allocator.h"

#include <functional>

namespace paddle {
namespace memory {
namespace allocation {
Allocation::~Allocation() {}

Allocator::~Allocator() {}

bool Allocator::IsAllocThreadSafe() const { return false; }

AllocationPtr Allocator::Allocate(size_t size, Allocator::Attr attr) {
  auto ptr = AllocateImpl(size, attr);
  ptr->set_allocator(this);
  return AllocationPtr(ptr);
}

void Allocator::Free(Allocation* allocation) { delete allocation; }

const char* BadAlloc::what() const noexcept { return msg_.c_str(); }

void AllocationDeleter::operator()(Allocation* allocation) const {
  auto* allocator = allocation->allocator();
  allocator->Free(allocation);
}

}  // namespace allocation
}  // namespace memory
}  // namespace paddle
@@ -0,0 +1,145 @@
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once
#include <memory>
#include <string>
#include "paddle/fluid/platform/place.h"

namespace paddle {
namespace memory {
namespace allocation {

// Exception thrown when `Alloc`/`AllocShared` fails
class BadAlloc : public std::exception {
 public:
  explicit BadAlloc(std::string msg) : msg_(std::move(msg)) {}
  const char* what() const noexcept override;

 private:
  std::string msg_;
};

class Allocation;
class AllocationDeleter {
 public:
  void operator()(Allocation* allocation) const;
};

class Allocator;
// Allocation is the object holding the actual pointer. Calling
// `Allocation::ptr()` returns the allocated pointer.
//
// NOTE: this is the base class of Allocation. Each allocator can use its own
//       allocation object.
// NOTE: `Allocation::ptr()` could be nullptr if the allocation size is 0.
class Allocation {
 public:
  Allocation(void* ptr, size_t size, platform::Place place)
      : allocator_(nullptr), ptr_(ptr), size_(size), place_(place) {}

  Allocation(const Allocation& o) = delete;
  Allocation& operator=(const Allocation& o) = delete;

  // Returns the held pointer.
  // NOTE: For performance reasons, it is better not to make this method
  // virtual. If we want to implement `defragmentation` later, we might need
  // to make the `ptr_` field protected and add a virtual method such as
  // `defragmentation` to change `ptr_`.
  void* ptr() const { return ptr_; }

  // Returns the size of this memory buffer, i.e., ptr() + size() - 1 is the
  // last valid element.
  //
  // NOTE: Some allocators might allocate more memory than requested, so the
  // size could be larger than the request. For example, the AlignedAllocator
  // always allocates `size + kAlignment` bytes. The raw pointer might not be
  // aligned, so an offset might be added to the raw pointer, and the size of
  // this allocation will be `size + kAlignment - offset`.
  size_t size() const { return size_; }

  const platform::Place& place() const { return place_; }

  Allocator* allocator() { return allocator_; }

  void set_allocator(Allocator* allocator) { allocator_ = allocator; }

  virtual ~Allocation();

 private:
  Allocator* allocator_;
  void* ptr_;
  size_t size_;
  platform::Place place_;
};

using AllocationPtr = std::unique_ptr<Allocation, AllocationDeleter>;

// Base interface class of memory Allocator.
// To allocate memory, an allocator needs two parameters:
//    1. size of bytes.
//    2. Attribute of memory.
// NOTE: the attribute of memory might be ignored if the allocator does not
// care about it.
class Allocator {
 public:
  enum Attr {
    kDefault = 0,  // Default attribute. Uses the fastest or most stable
                   // allocation algorithm.

    kFixedHuge = 1,  // The allocation may not be freed until the program
                     // ends. e.g., `Parameters` and `Momentum`.

    kFluxHuge = 2,  // The allocation may be created and freed frequently and
                    // is considerably large, like `activations` and
                    // gradients.

    kScratchpad =
        3,  // The `Scratchpad` memory is allocated and freed very soon,
            // usually within an operator or as aux memory.
            // Like CUDNN workspace, AUX memory in batch norm, etc.
            //
            // https://en.wikipedia.org/wiki/Scratchpad_memory

    kCrossDevice =
        4,  // The memory used for cross-device memory copy/communication.
            // For example:
            // 1. it can use a `pinned` memory for CPU-GPU
            //    communication.
            // 2. it can use a `registered` memory for RDMA
            //    communication.

    NumOfAttrs = 5  // The number of all attributes. It is used internally.
  };

  virtual ~Allocator();

  // Allocate an allocation.
  AllocationPtr Allocate(size_t size, Allocator::Attr attr = kDefault);

  // True if `Allocate` is thread safe.
  virtual bool IsAllocThreadSafe() const;

 protected:
  virtual void Free(Allocation* allocation);
  virtual Allocation* AllocateImpl(size_t size, Allocator::Attr attr) = 0;

 private:
  friend class AllocationDeleter;
};

}  // namespace allocation
}  // namespace memory
}  // namespace paddle
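To illustrate the contract between Allocator, Allocation and AllocationDeleter declared above, here is a minimal sketch of a custom allocator (not part of this commit; the MallocAllocator name and the use of std::malloc/std::free are illustrative assumptions):

#include <cstdlib>
#include "paddle/fluid/memory/allocation/allocator.h"

namespace paddle {
namespace memory {
namespace allocation {

// Hypothetical allocator used only to illustrate the interface contract:
// AllocateImpl() creates the Allocation, Allocate() attaches the allocator
// via set_allocator(), and AllocationDeleter routes destruction back to
// the owning allocator's Free().
class MallocAllocator : public Allocator {
 protected:
  Allocation* AllocateImpl(size_t size, Allocator::Attr attr) override {
    void* p = std::malloc(size);
    if (p == nullptr) {
      throw BadAlloc("malloc failed");
    }
    return new Allocation(p, size, platform::CPUPlace());
  }

  void Free(Allocation* allocation) override {
    std::free(allocation->ptr());
    delete allocation;
  }
};

}  // namespace allocation
}  // namespace memory
}  // namespace paddle

// Usage sketch:
//   MallocAllocator a;
//   auto buf = a.Allocate(256);  // AllocationPtr, a unique_ptr with
//                                // AllocationDeleter
//   // when buf goes out of scope, AllocationDeleter calls a.Free(...)

This is the same wiring that Allocator::Allocate in allocator.cc relies on: because set_allocator(this) is called on every allocation, the unique_ptr's deleter can always find its owning allocator.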
File diff suppressed because it is too large
@@ -0,0 +1,57 @@
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once
#include <memory>
#include "paddle/fluid/memory/allocation/allocator.h"
#include "paddle/fluid/platform/place.h"

namespace paddle {
namespace memory {
namespace allocation {

// AllocatorFacade is the interface exposed to other modules.
// All of the configuration, and any dirty code under development, should
// be hidden behind this facade.
//
// NOTE(yy): This class is a singleton.
// NOTE(yy): To create a stable ABI and make compilation faster, we use the
// Pimpl trick here.
class AllocatorFacadePrivate;
class AllocatorFacade {
 public:
  ~AllocatorFacade();
  AllocatorFacade(const AllocatorFacade& o) = delete;
  const AllocatorFacade& operator=(const AllocatorFacade& o) = delete;

  static AllocatorFacade& Instance();

  // Allocate a shared allocation.
  std::shared_ptr<Allocation> AllocShared(
      const platform::Place& place, size_t size,
      Allocator::Attr attr = Allocator::kDefault);

  // Allocate a unique allocation.
  AllocationPtr Alloc(const platform::Place& place, size_t size,
                      Allocator::Attr attr = Allocator::kDefault);

  // TODO(yy): Allocate a Copy-On-Write allocation?
 private:
  AllocatorFacade();
  AllocatorFacadePrivate* m_;
};

}  // namespace allocation
}  // namespace memory
}  // namespace paddle
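A minimal usage sketch of the facade declared above (not part of this commit; the FacadeUsageSketch function is illustrative and assumes gflags have already been parsed):

#include <memory>
#include "paddle/fluid/memory/allocation/allocator_facade.h"
#include "paddle/fluid/platform/place.h"

namespace alloc = paddle::memory::allocation;

void FacadeUsageSketch() {
  auto& facade = alloc::AllocatorFacade::Instance();

  // Unique ownership: freed when `buf` goes out of scope.
  alloc::AllocationPtr buf =
      facade.Alloc(paddle::platform::CPUPlace(), 1024);

  // Shared ownership: the last std::shared_ptr copy releases the memory.
  std::shared_ptr<alloc::Allocation> shared =
      facade.AllocShared(paddle::platform::CPUPlace(), 2048);
}

The allocator_facade_test.cc below exercises the same Alloc path for CPU, GPU, and CUDA-pinned places.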
@@ -0,0 +1,87 @@
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/memory/allocation/allocator_facade.h"
#include <gflags/gflags.h>
#include <gtest/gtest.h>

#ifdef PADDLE_WITH_CUDA
DECLARE_double(fraction_of_gpu_memory_to_use);
DECLARE_double(fraction_of_cuda_pinned_memory_to_use);
DECLARE_int64(gpu_allocator_retry_time);
#endif

namespace paddle {
namespace memory {
namespace allocation {

TEST(allocator, allocator) {
#ifdef PADDLE_WITH_CUDA
  FLAGS_fraction_of_gpu_memory_to_use = 0.01;
  FLAGS_gpu_allocator_retry_time = 500;
  FLAGS_fraction_of_cuda_pinned_memory_to_use = 0.5;
#endif

  auto &instance = AllocatorFacade::Instance();
  platform::Place place;
  size_t size = 1024;

  {
    place = platform::CPUPlace();
    size = 1024;
    auto cpu_allocation = instance.Alloc(place, size);
    ASSERT_NE(cpu_allocation, nullptr);
    ASSERT_NE(cpu_allocation->ptr(), nullptr);
    ASSERT_EQ(cpu_allocation->place(), place);
    ASSERT_EQ(cpu_allocation->size(), size);
  }

#ifdef PADDLE_WITH_CUDA
  {
    place = platform::CUDAPlace(0);
    size = 1024;
    auto gpu_allocation = instance.Alloc(place, size);
    ASSERT_NE(gpu_allocation, nullptr);
    ASSERT_NE(gpu_allocation->ptr(), nullptr);
    ASSERT_EQ(gpu_allocation->place(), place);
    ASSERT_GE(gpu_allocation->size(), size);
  }

  {
    // Allocate 2GB gpu memory
    place = platform::CUDAPlace(0);
    size = 2 * static_cast<size_t>(1 << 30);
    auto gpu_allocation = instance.Alloc(place, size);
    ASSERT_NE(gpu_allocation, nullptr);
    ASSERT_NE(gpu_allocation->ptr(), nullptr);
    ASSERT_EQ(gpu_allocation->place(), place);
    ASSERT_GE(gpu_allocation->size(), size);
  }

  {
    place = platform::CUDAPinnedPlace();
    size = (1 << 20);
    auto cuda_pinned_allocation =
        instance.Alloc(platform::CUDAPinnedPlace(), 1 << 20);
    ASSERT_NE(cuda_pinned_allocation, nullptr);
    ASSERT_NE(cuda_pinned_allocation->ptr(), nullptr);
    ASSERT_EQ(cuda_pinned_allocation->place(), place);
    ASSERT_GE(cuda_pinned_allocation->size(), size);
  }
#endif
}

}  // namespace allocation
}  // namespace memory
}  // namespace paddle
@@ -0,0 +1,41 @@
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/memory/allocation/allocator_strategy.h"
#include "gflags/gflags.h"

DEFINE_string(
    allocator_strategy, "legacy",
    "The allocation strategy. Legacy means the original allocator of Fluid. "
    "New means the experimental allocators of Fluid. in [legacy, new]");

namespace paddle {
namespace memory {
namespace allocation {

static AllocatorStrategy GetStrategyFromFlag() {
  return FLAGS_allocator_strategy == "legacy"
             ? AllocatorStrategy::kLegacy
             : AllocatorStrategy::kNaiveBestFit;
}

AllocatorStrategy GetAllocatorStrategy() {
  static AllocatorStrategy strategy = GetStrategyFromFlag();
  return strategy;
}

void UseAllocatorStrategyGFlag() {}
}  // namespace allocation
}  // namespace memory
}  // namespace paddle
@@ -0,0 +1,30 @@
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

namespace paddle {
namespace memory {
namespace allocation {

enum class AllocatorStrategy { kLegacy, kNaiveBestFit };

extern AllocatorStrategy GetAllocatorStrategy();

// Does nothing; it just makes sure the linker does not prune this file.
extern void UseAllocatorStrategyGFlag();

}  // namespace allocation
}  // namespace memory
}  // namespace paddle
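A short sketch of how the strategy above might be consumed (not part of this commit; the UseLegacyAllocator helper is illustrative):

#include "paddle/fluid/memory/allocation/allocator_strategy.h"

namespace alloc = paddle::memory::allocation;

// Illustrative dispatch on the configured strategy. The gflag is expected to
// be set before the first call, e.g. --allocator_strategy=legacy (default)
// or --allocator_strategy=new, since GetAllocatorStrategy() caches the value
// in a function-local static on first use.
bool UseLegacyAllocator() {
  alloc::UseAllocatorStrategyGFlag();  // keep the flag's object file linked in
  return alloc::GetAllocatorStrategy() == alloc::AllocatorStrategy::kLegacy;
}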
@@ -0,0 +1,78 @@
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/memory/allocation/auto_increment_allocator.h"

namespace paddle {
namespace memory {
namespace allocation {
bool AutoIncrementAllocator::IsAllocThreadSafe() const { return true; }

std::shared_ptr<Allocator> AutoIncrementAllocator::CreateNewAllocator() {
  std::lock_guard<std::mutex> guard(mtx_);
  auto old_size = allocator_num_.load();
  PADDLE_ENFORCE_LT(old_size, underlying_allocators_.size(),
                    "Allocator number exceeds capacity %d",
                    underlying_allocators_.size());
  underlying_allocators_[old_size] = creator_();
  prev_success_allocator_ = old_size;
  ++allocator_num_;
  PADDLE_ENFORCE(
      underlying_allocators_[old_size]->IsAllocThreadSafe(),
      "the underlying allocator must be thread safe. This is a program "
      "bug.");
  return underlying_allocators_[old_size];
}
Allocation *AutoIncrementAllocator::AllocateImpl(size_t size,
                                                 Allocator::Attr attr) {
  auto cur = prev_success_allocator_.load();
  size_t retry_count = allocator_num_.load();
  size_t allocator_num = retry_count;
  while (retry_count-- > 0) {  // until the retry count reaches zero
    try {
      auto res = underlying_allocators_[cur]->Allocate(size, attr);
      prev_success_allocator_ = cur;
      return res.release();
    } catch (BadAlloc &) {
      if (++cur >= allocator_num) {
        cur = 0;
      }
    } catch (...) {
      // if there is another type of exception, just rethrow it.
      throw;
    }
  }

  // This happens when the first allocator is exhausted and
  // there is more than one allocation request.
  // In this situation, the first allocation request would succeed
  // and the second allocation request would fail if we did not use
  // the allocator newly created by the first allocation request.
  for (cur = allocator_num; cur < allocator_num_; ++cur) {
    try {
      auto ret = underlying_allocators_[cur]->Allocate(size, attr);
      prev_success_allocator_ = cur;
      return ret.release();
    } catch (BadAlloc &) {
    } catch (...) {
      throw;
    }
  }
  // No suitable allocator
  return CreateNewAllocator()->Allocate(size, attr).release();
}

}  // namespace allocation
}  // namespace memory
}  // namespace paddle
@@ -0,0 +1,79 @@
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include <atomic>  // NOLINT
#include <functional>
#include <memory>
#include <mutex>  // NOLINT
#include <thread>  // NOLINT
#include <vector>
#include "paddle/fluid/memory/allocation/allocator.h"

namespace paddle {
namespace memory {
namespace allocation {

// The AutoIncrementAllocator manages many underlying allocators. If none of
// them can allocate the requested memory, a new allocator is created and its
// `allocate` method is invoked.
//
// NOTE(yy): The AutoIncrementAllocator prefers to allocate memory from the
// latest successful allocator.
//
// NOTE(yy): We may need to release an underlying allocator if it allocates
// nothing. However, that is generally not useful, since it would make
// performance undetermined.
//
// NOTE(yy): This allocator is only locked when creating a new underlying
// allocator. The allocation requests from many threads may be dispatched
// to the same underlying allocator. So the underlying allocator must be
// thread safe.
//
// NOTE(zjl): Add a capacity parameter to the constructor. A high-performance
// thread-safe std::vector with varying size is hard to implement.
// Fortunately, we can get the total GPU memory and each chunk size.
// Therefore, we can get the suitable capacity of AutoIncrementAllocator.
class AutoIncrementAllocator : public Allocator {
 public:
  // Creator is the method to create a new underlying allocator
  using AllocatorCreator = std::function<std::shared_ptr<Allocator>()>;

  explicit AutoIncrementAllocator(AllocatorCreator&& creator, size_t capacity)
      : creator_(std::move(creator)), underlying_allocators_(capacity) {}

  bool IsAllocThreadSafe() const override;

 private:
  std::shared_ptr<Allocator> CreateNewAllocator();

 protected:
  Allocation* AllocateImpl(size_t size, Allocator::Attr attr) override;

 private:
  AllocatorCreator creator_;

  std::vector<AllocatorCreator::result_type> underlying_allocators_;
  std::atomic<size_t> allocator_num_{0};

  // Use std::atomic rather than std::mutex, since std::atomic is usually
  // lock-free
  std::atomic<size_t> prev_success_allocator_{0};

  std::mutex mtx_;
};
}  // namespace allocation
}  // namespace memory
}  // namespace paddle
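A sketch of how an AutoIncrementAllocator might be constructed (not part of this commit; CreateThreadSafeChunkAllocator is a hypothetical factory standing in for however the facade actually wires up its per-chunk allocators):

#include <cstddef>
#include <memory>
#include "paddle/fluid/memory/allocation/auto_increment_allocator.h"

namespace alloc = paddle::memory::allocation;

// Hypothetical helper: builds one thread-safe allocator that manages a fixed
// chunk of memory. Any Allocator whose IsAllocThreadSafe() returns true and
// which throws BadAlloc when the chunk is exhausted would do.
std::shared_ptr<alloc::Allocator> CreateThreadSafeChunkAllocator();

std::unique_ptr<alloc::AutoIncrementAllocator> MakeGrowingAllocator() {
  // Capacity bounds how many underlying chunk allocators may ever be created;
  // a new one is created only after all existing ones throw BadAlloc.
  constexpr std::size_t kCapacity = 16;
  return std::unique_ptr<alloc::AutoIncrementAllocator>(
      new alloc::AutoIncrementAllocator(
          [] { return CreateThreadSafeChunkAllocator(); }, kCapacity));
}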
@@ -0,0 +1,168 @@
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/memory/allocation/best_fit_allocator.h"
#include <cmath>
#include <list>
#include <map>
#include <string>

namespace paddle {
namespace memory {
namespace allocation {

static int HighestBitPos(size_t N) {
  if (UNLIKELY(N == 0)) {
    return 0;
  } else {
#ifdef __GNUCC__
    return sizeof(unsigned int) * 8 - __builtin_clz(N);
#else
    return static_cast<int>(std::log2(N) + 1);
#endif
  }
}

BestFitAllocator::BestFitAllocator(Allocation* allocation)
    : allocation_(allocation) {
  details::Chunk chunk;
  chunk.size_ = allocation_->size();
  chunk.offset_ = 0;
  chunk.is_free = true;
  chunks_.emplace_back(chunk);
  free_chunks_[HighestBitPos(chunk.size_)].insert(
      {chunk.size_, chunks_.begin()});
}

size_t BestFitAllocator::FreeSize() const {
  size_t acc = 0;
  for (auto& array_item : free_chunks_) {
    for (auto& pair : array_item) {
      acc += pair.second->size_;
    }
  }
  return acc;
}

BestFitAllocator::ListIt BestFitAllocator::SplitChunk(size_t request_size,
                                                      size_t free_chunk_offset,
                                                      MapIt bin_iterator) {
  auto to_split_it = bin_iterator->second;
  free_chunks_[free_chunk_offset].erase(bin_iterator);

  PADDLE_ENFORCE(to_split_it->is_free);
  PADDLE_ENFORCE_GE(to_split_it->size_, request_size);

  auto remaining_size = to_split_it->size_ - request_size;
  details::Chunk to_use;
  details::Chunk remaining;
  to_use.size_ = request_size;
  to_use.is_free = false;
  remaining.size_ = remaining_size;
  remaining.is_free = true;

  // calc offsets
  to_use.offset_ = to_split_it->offset_;
  remaining.offset_ = to_use.offset_ + to_use.size_;

  // insert to chunk list
  auto to_use_it = chunks_.insert(to_split_it, to_use);
  if (remaining.size_ != 0) {
    auto bit_size = static_cast<size_t>(HighestBitPos(remaining.size_));
    free_chunks_[bit_size].insert(
        {remaining.size_, chunks_.insert(to_split_it, remaining)});
  }
  chunks_.erase(to_split_it);
  return to_use_it;
}

void BestFitAllocator::InsertFreeNode(const ListIt& it) {
  auto pos = static_cast<size_t>(HighestBitPos(it->size_));
  auto& free_map = free_chunks_[pos];
  free_map.insert({it->size_, it});
}
void BestFitAllocator::EraseFreeNode(const ListIt& it) {
  size_t pos = static_cast<size_t>(HighestBitPos(it->size_));
  auto& free_map = free_chunks_[pos];
  auto map_it = free_map.find(it->size_);
  while (map_it->second != it && map_it != free_map.end()) {
    ++map_it;
  }
  PADDLE_ENFORCE(map_it != free_map.end());
  free_map.erase(map_it);
}
size_t BestFitAllocator::NumFreeChunks() const {
  size_t num = 0;
  for (auto& array_item : free_chunks_) {
    num += array_item.size();
  }
  return num;
}
void BestFitAllocator::Free(Allocation* allocation) {
  auto* bf_allocation = dynamic_cast<BestFitAllocation*>(allocation);
  auto chunk_it = bf_allocation->ChunkIterator();
  PADDLE_ENFORCE(!chunk_it->is_free);
  chunk_it->is_free = true;
  if (chunk_it != chunks_.begin()) {
    auto prev_it = chunk_it;
    --prev_it;

    if (prev_it->is_free) {
      // Merge Left.
      EraseFreeNode(prev_it);
      prev_it->size_ += chunk_it->size_;
      chunks_.erase(chunk_it);
      chunk_it = prev_it;
    }
  }

  auto next_it = chunk_it;
  ++next_it;
  if (next_it != chunks_.end() && next_it->is_free) {
    EraseFreeNode(next_it);
    chunk_it->size_ += next_it->size_;
    chunks_.erase(next_it);
  }

  InsertFreeNode(chunk_it);
  delete allocation;
}
Allocation* BestFitAllocator::AllocateImpl(size_t size, Allocator::Attr attr) {
  auto highest_set_bit = static_cast<size_t>(HighestBitPos(size));
  MapIt map_it;
  for (; highest_set_bit < free_chunks_.size(); ++highest_set_bit) {
    map_it = free_chunks_[highest_set_bit].lower_bound(size);
    if (map_it != free_chunks_[highest_set_bit].end()) {
      break;
    }
  }
  if (UNLIKELY(highest_set_bit == free_chunks_.size())) {
    throw BadAlloc(string::Sprintf(
        "Cannot allocate %d, All fragments size is %d", size, FreeSize()));
  }
  auto chunk_it = SplitChunk(size, highest_set_bit, map_it);
  return new BestFitAllocation(this, chunk_it);
}

BestFitAllocation::BestFitAllocation(
    paddle::memory::allocation::BestFitAllocator* allocator,
    typename details::ChunkList::iterator chunk_it)
    : Allocation(reinterpret_cast<void*>(
                     reinterpret_cast<uintptr_t>(allocator->BasePtr()) +
                     chunk_it->offset_),
                 chunk_it->size_, allocator->Place()),
      chunk_it_(chunk_it) {}
}  // namespace allocation
}  // namespace memory
}  // namespace paddle
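The free lists above are bucketed by the highest set bit of a chunk's size. This standalone sketch (not part of this commit; the sizes are made-up examples) shows the bucketing that HighestBitPos() computes:

#include <cstddef>
#include <cstdio>

// Standalone sketch of HighestBitPos(): a chunk of `size` bytes is stored in
// bucket floor(log2(size)) + 1, so every chunk in bucket b has a size in
// [2^(b-1), 2^b - 1]. AllocateImpl() then scans buckets upward, starting at
// the requested size's bucket, and takes the first fit via lower_bound().
static int HighestBitPosSketch(std::size_t n) {
  int pos = 0;
  while (n != 0) {
    ++pos;
    n >>= 1;
  }
  return pos;
}

int main() {
  const std::size_t sizes[] = {1, 7, 8, 1000, 4096};
  for (std::size_t s : sizes) {
    std::printf("size=%zu -> bucket %d\n", s, HighestBitPosSketch(s));
  }
  // size=1 -> 1, size=7 -> 3, size=8 -> 4, size=1000 -> 10, size=4096 -> 13
  return 0;
}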
Some files were not shown because too many files have changed in this diff.