New feature: thread local allocator, test=develop (#23989)
* add the thread_local_allocator, test=develop * refactor the thread_local_allocator, test=develop * provides option setting strategy, test=develop
parent
80cf3c3c4d
commit
d2584a7082
@ -0,0 +1,76 @@
|
|||||||
|
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
//
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
// you may not use this file except in compliance with the License.
|
||||||
|
// You may obtain a copy of the License at
|
||||||
|
//
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
//
|
||||||
|
// Unless required by applicable law or agreed to in writing, software
|
||||||
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
// See the License for the specific language governing permissions and
|
||||||
|
// limitations under the License.
|
||||||
|
|
||||||
|
#include "paddle/fluid/memory/allocation/thread_local_allocator.h"

#include <algorithm>
#include <iterator>
#include <mutex>
|
||||||
|
|
||||||
|
namespace paddle {
|
||||||
|
namespace memory {
|
||||||
|
namespace allocation {
|
||||||
|
|
||||||
|
// Builds a per-thread allocator bound to place `p`. Only CUDAPlace is
// supported: the backing BuddyAllocator manages the memory of one GPU
// device, sized between GpuMinChunkSize() and GpuMaxChunkSize().
ThreadLocalAllocatorImpl::ThreadLocalAllocatorImpl(const platform::Place& p)
    : place_(p) {
  if (platform::is_gpu_place(place_)) {
    buddy_allocator_.reset(new memory::detail::BuddyAllocator(
        std::unique_ptr<memory::detail::SystemAllocator>(
            new memory::detail::GPUAllocator(
                boost::get<platform::CUDAPlace>(place_).device)),
        platform::GpuMinChunkSize(), platform::GpuMaxChunkSize()));
  } else {
    // Use the framework's error machinery (consistent with the
    // PADDLE_ENFORCE_LT usage in Get() below) rather than LOG(FATAL),
    // which aborts the process without a catchable exception.
    PADDLE_THROW(platform::errors::Unimplemented(
        "Thread local allocator only supports CUDAPlace now."));
  }
}
|
||||||
|
|
||||||
|
// Returns this thread's allocator for `gpu_id`, lazily creating it on
// first use. The pool itself is thread_local (see Instance() in the
// header), so each thread owns an independent allocator per device.
std::shared_ptr<ThreadLocalAllocatorImpl> ThreadLocalCUDAAllocatorPool::Get(
    int gpu_id) {
  // Translate the device id into its index within the selected-devices
  // list; std::find returns end() for unknown ids, which the ENFORCE
  // below rejects.
  auto pos = std::distance(devices_.begin(),
                           std::find(devices_.begin(), devices_.end(), gpu_id));
  PADDLE_ENFORCE_LT(
      pos, devices_.size(),
      platform::errors::InvalidArgument(
          "The position of device should be less than the size of devices."));
  // One-shot lazy construction per (thread, device). SetDeviceId runs
  // before the allocator is built so the underlying GPUAllocator binds to
  // the correct device.
  std::call_once(*init_flags_[pos], [this, pos, gpu_id] {
    platform::SetDeviceId(devices_[pos]);
    allocators_[pos].reset(
        new ThreadLocalAllocatorImpl(platform::CUDAPlace(gpu_id)));
  });
  return allocators_[pos];
}
|
||||||
|
|
||||||
|
// Discovers the selected CUDA devices and reserves one (initially empty)
// allocator slot plus one once-flag per device; the allocators themselves
// are created lazily by Get().
ThreadLocalCUDAAllocatorPool::ThreadLocalCUDAAllocatorPool()
    : devices_(platform::GetSelectedDevices()) {
  const auto device_count = devices_.size();
  allocators_.resize(device_count);
  init_flags_.reserve(device_count);
  size_t remaining = device_count;
  while (remaining-- > 0) {
    init_flags_.emplace_back(new std::once_flag());
  }
}
|
||||||
|
|
||||||
|
// Allocates `size` bytes from this thread's buddy allocator and wraps the
// raw pointer in a ThreadLocalAllocation owned by the caller.
ThreadLocalAllocation* ThreadLocalAllocatorImpl::AllocateImpl(size_t size) {
  VLOG(10) << "ThreadLocalAllocatorImpl::AllocateImpl " << size;
  void* ptr = buddy_allocator_->Alloc(size);
  auto* tl_allocation = new ThreadLocalAllocation(ptr, size, place_);
  // Stash a shared_ptr to this allocator inside the allocation so the
  // allocator stays alive until every allocation it produced is freed
  // (used by ThreadLocalCUDAAllocator::FreeImpl to route the free back).
  tl_allocation->SetThreadLocalAllocatorImpl(shared_from_this());
  return tl_allocation;
}
|
||||||
|
|
||||||
|
// Returns the allocation's memory to this allocator's buddy allocator and
// destroys the allocation wrapper.
void ThreadLocalAllocatorImpl::FreeImpl(ThreadLocalAllocation* allocation) {
  VLOG(10) << "ThreadLocalAllocatorImpl::FreeImpl " << allocation;
  buddy_allocator_->Free(allocation->ptr());
  delete allocation;
}
|
||||||
|
|
||||||
|
} // namespace allocation
|
||||||
|
} // namespace memory
|
||||||
|
} // namespace paddle
|
@ -0,0 +1,100 @@
|
|||||||
|
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
//
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
// you may not use this file except in compliance with the License.
|
||||||
|
// You may obtain a copy of the License at
|
||||||
|
//
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
//
|
||||||
|
// Unless required by applicable law or agreed to in writing, software
|
||||||
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
// See the License for the specific language governing permissions and
|
||||||
|
// limitations under the License.
|
||||||
|
|
||||||
|
#pragma once

#include <memory>
#include <mutex>
#include <utility>
#include <vector>

#include "paddle/fluid/memory/allocation/allocator.h"
#include "paddle/fluid/memory/detail/buddy_allocator.h"
#include "paddle/fluid/memory/detail/system_allocator.h"
#include "paddle/fluid/platform/gpu_info.h"
|
||||||
|
|
||||||
|
namespace paddle {
|
||||||
|
namespace memory {
|
||||||
|
namespace allocation {
|
||||||
|
|
||||||
|
class ThreadLocalAllocatorImpl;
|
||||||
|
|
||||||
|
class ThreadLocalAllocation : public Allocation {
|
||||||
|
public:
|
||||||
|
ThreadLocalAllocation(void* ptr, size_t size, platform::Place place)
|
||||||
|
: Allocation(ptr, size, place) {}
|
||||||
|
|
||||||
|
void SetThreadLocalAllocatorImpl(
|
||||||
|
std::shared_ptr<ThreadLocalAllocatorImpl> allocator) {
|
||||||
|
allocator_ = allocator;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::shared_ptr<ThreadLocalAllocatorImpl> GetAllocator() {
|
||||||
|
return allocator_;
|
||||||
|
}
|
||||||
|
|
||||||
|
private:
|
||||||
|
std::shared_ptr<ThreadLocalAllocatorImpl> allocator_;
|
||||||
|
};
|
||||||
|
|
||||||
|
// Per-thread, per-device allocator backed by a BuddyAllocator. Inherits
// enable_shared_from_this so each allocation can hold a shared_ptr back
// to its creating allocator (see ThreadLocalAllocation).
class ThreadLocalAllocatorImpl
    : public std::enable_shared_from_this<ThreadLocalAllocatorImpl> {
 public:
  // `p` must be a CUDAPlace; any other place is rejected at runtime.
  explicit ThreadLocalAllocatorImpl(const platform::Place& p);
  // Allocates `size` bytes; ownership of the returned allocation passes
  // to the caller.
  ThreadLocalAllocation* AllocateImpl(size_t size);
  // Releases `allocation`'s memory and deletes the wrapper.
  void FreeImpl(ThreadLocalAllocation* allocation);

 private:
  std::unique_ptr<memory::detail::BuddyAllocator> buddy_allocator_;
  platform::Place place_;
};
|
||||||
|
|
||||||
|
// Thread-local registry mapping each selected GPU to this thread's
// ThreadLocalAllocatorImpl. Instance() returns a distinct pool per thread
// (static thread_local), so allocators are never shared across threads.
class ThreadLocalCUDAAllocatorPool {
 public:
  static ThreadLocalCUDAAllocatorPool& Instance() {
    static thread_local ThreadLocalCUDAAllocatorPool pool;
    return pool;
  }

  // Returns (lazily creating) this thread's allocator for `gpu_id`.
  std::shared_ptr<ThreadLocalAllocatorImpl> Get(int gpu_id);

 private:
  ThreadLocalCUDAAllocatorPool();
  // Selected GPU device ids; parallel to the two vectors below.
  std::vector<int> devices_;
  // One once-flag per device guarding lazy allocator construction.
  std::vector<std::unique_ptr<std::once_flag>> init_flags_;
  std::vector<std::shared_ptr<ThreadLocalAllocatorImpl>> allocators_;
};
|
||||||
|
|
||||||
|
// Allocator facade for one GPU. Allocations are served from the calling
// thread's per-device allocator; frees are routed back to whichever
// allocator produced the allocation, so freeing from a different thread
// than the allocating one is safe.
class ThreadLocalCUDAAllocator : public Allocator {
 public:
  explicit ThreadLocalCUDAAllocator(const platform::CUDAPlace& p)
      : gpu_id_(p.device) {}

  // Thread-safe by construction: each thread allocates from its own pool.
  bool IsAllocThreadSafe() const override { return true; }

 protected:
  Allocation* AllocateImpl(size_t size) override {
    return ThreadLocalCUDAAllocatorPool::Instance().Get(gpu_id_)->AllocateImpl(
        size);
  }
  void FreeImpl(Allocation* allocation) override {
    // Dispatch the free to the allocator stored inside the allocation,
    // which may belong to another thread's pool.
    auto* tl_allocation = static_cast<ThreadLocalAllocation*>(allocation);
    auto allocator_impl = tl_allocation->GetAllocator();
    allocator_impl->FreeImpl(tl_allocation);
  }

 private:
  int gpu_id_;
};
|
||||||
|
|
||||||
|
} // namespace allocation
|
||||||
|
} // namespace memory
|
||||||
|
} // namespace paddle
|
@ -0,0 +1,93 @@
|
|||||||
|
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
//
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
// you may not use this file except in compliance with the License.
|
||||||
|
// You may obtain a copy of the License at
|
||||||
|
//
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
//
|
||||||
|
// Unless required by applicable law or agreed to in writing, software
|
||||||
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
// See the License for the specific language governing permissions and
|
||||||
|
// limitations under the License.
|
||||||
|
|
||||||
|
#include "paddle/fluid/memory/allocation/thread_local_allocator.h"
|
||||||
|
#include <algorithm>
|
||||||
|
#include <condition_variable> // NOLINT
|
||||||
|
#include <functional>
|
||||||
|
#include <iostream>
|
||||||
|
#include <thread> // NOLINT
|
||||||
|
#include <utility>
|
||||||
|
#include "gtest/gtest.h"
|
||||||
|
#include "paddle/fluid/memory/malloc.h"
|
||||||
|
#include "paddle/fluid/platform/gpu_info.h"
|
||||||
|
|
||||||
|
DECLARE_double(fraction_of_gpu_memory_to_use);
|
||||||
|
DECLARE_string(allocator_strategy);
|
||||||
|
|
||||||
|
namespace paddle {
|
||||||
|
namespace memory {
|
||||||
|
namespace allocation {
|
||||||
|
|
||||||
|
// Spawns several worker threads that allocate on every selected device,
// then checks that (a) each thread received a distinct allocator instance
// per device, and (b) allocations can be released after their creating
// threads have exited without crashing the process.
TEST(ThreadLocalAllocator, cross_scope_release) {
  FLAGS_fraction_of_gpu_memory_to_use = 0.1;
  FLAGS_allocator_strategy = "thread_local";

  const size_t thread_num = 5;
  const std::vector<int> devices = platform::GetSelectedDevices();

  // allocator_addresses[device][thread]: raw pointer identity of the
  // thread-local allocator, used below to prove per-thread uniqueness.
  std::vector<std::vector<void *>> allocator_addresses(devices.size());
  // thread_allocations[device][thread]: allocations deliberately kept
  // alive past the worker threads' lifetimes.
  std::vector<std::vector<AllocationPtr>> thread_allocations(devices.size());

  for (size_t i = 0; i < devices.size(); ++i) {
    allocator_addresses[i].resize(thread_num);
    thread_allocations[i].resize(thread_num);
  }

  std::vector<std::thread> threads(thread_num);
  std::mutex mutex;
  std::condition_variable cv;
  bool flag = false;

  for (size_t i = 0; i < threads.size(); ++i) {
    threads[i] = std::thread([&, i]() {
      {
        // Park until all workers exist so they run concurrently.
        std::unique_lock<std::mutex> lock(mutex);
        cv.wait(lock, [&] { return flag; });
      }
      for (size_t j = 0; j < devices.size(); ++j) {
        thread_allocations[j][i] =
            memory::Alloc(platform::CUDAPlace(devices[j]), 10);
        auto tl_allocator_impl =
            ThreadLocalCUDAAllocatorPool::Instance().Get(devices[j]);
        allocator_addresses[j][i] = tl_allocator_impl.get();
      }
    });
  }

  // Release all workers at once.
  {
    std::lock_guard<std::mutex> lock(mutex);
    flag = true;
    cv.notify_all();
  }

  for (auto &th : threads) {
    th.join();
  }

  // Per device, no two threads may have observed the same allocator:
  // sort + adjacent_find detects any duplicate pointer.
  for (auto &addresses : allocator_addresses) {
    std::sort(addresses.begin(), addresses.end());
    ASSERT_EQ(std::adjacent_find(addresses.begin(), addresses.end(),
                                 std::equal_to<void *>()),
              addresses.end());
  }

  // Freeing allocations whose creating threads are gone must exit
  // cleanly (exercised in a death-test subprocess).
  ::testing::FLAGS_gtest_death_test_style = "threadsafe";
  ASSERT_EXIT(([&]() { thread_allocations.clear(); }(), exit(0)),
              ::testing::ExitedWithCode(0), ".*");
}
|
||||||
|
|
||||||
|
} // namespace allocation
|
||||||
|
} // namespace memory
|
||||||
|
} // namespace paddle
|
Loading…
Reference in new issue