New feature: thread local allocator, test=develop (#23989)
* add the thread_local_allocator, test=develop
* refactor the thread_local_allocator, test=develop
* provide an option for setting the strategy, test=develop
parent 80cf3c3c4d
commit d2584a7082
@@ -0,0 +1,76 @@
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/memory/allocation/thread_local_allocator.h"

namespace paddle {
namespace memory {
namespace allocation {

ThreadLocalAllocatorImpl::ThreadLocalAllocatorImpl(const platform::Place& p)
    : place_(p) {
  if (platform::is_gpu_place(place_)) {
    buddy_allocator_.reset(new memory::detail::BuddyAllocator(
        std::unique_ptr<memory::detail::SystemAllocator>(
            new memory::detail::GPUAllocator(
                boost::get<platform::CUDAPlace>(place_).device)),
        platform::GpuMinChunkSize(), platform::GpuMaxChunkSize()));
  } else {
    LOG(FATAL) << "Thread local allocator only supports CUDAPlace now.";
  }
}

std::shared_ptr<ThreadLocalAllocatorImpl> ThreadLocalCUDAAllocatorPool::Get(
    int gpu_id) {
  auto pos = std::distance(devices_.begin(),
                           std::find(devices_.begin(), devices_.end(), gpu_id));
  PADDLE_ENFORCE_LT(
      pos, devices_.size(),
      platform::errors::InvalidArgument(
          "The position of device should be less than the size of devices."));
  std::call_once(*init_flags_[pos], [this, pos, gpu_id] {
    platform::SetDeviceId(devices_[pos]);
    allocators_[pos].reset(
        new ThreadLocalAllocatorImpl(platform::CUDAPlace(gpu_id)));
  });
  return allocators_[pos];
}

ThreadLocalCUDAAllocatorPool::ThreadLocalCUDAAllocatorPool()
    : devices_(platform::GetSelectedDevices()) {
  auto gpu_num = devices_.size();
  allocators_.resize(gpu_num);
  init_flags_.reserve(gpu_num);
  for (size_t i = 0; i < gpu_num; ++i) {
    init_flags_.emplace_back(new std::once_flag());
  }
}

ThreadLocalAllocation* ThreadLocalAllocatorImpl::AllocateImpl(size_t size) {
  VLOG(10) << "ThreadLocalAllocatorImpl::AllocateImpl " << size;
  void* ptr = buddy_allocator_->Alloc(size);
  auto* tl_allocation = new ThreadLocalAllocation(ptr, size, place_);
  tl_allocation->SetThreadLocalAllocatorImpl(shared_from_this());
  return tl_allocation;
}

void ThreadLocalAllocatorImpl::FreeImpl(ThreadLocalAllocation* allocation) {
  VLOG(10) << "ThreadLocalAllocatorImpl::FreeImpl " << allocation;
  buddy_allocator_->Free(allocation->ptr());
  delete allocation;
}

}  // namespace allocation
}  // namespace memory
}  // namespace paddle
@@ -0,0 +1,100 @@
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include <memory>
#include <mutex>  // NOLINT, for std::once_flag
#include <vector>
#include "paddle/fluid/memory/allocation/allocator.h"
#include "paddle/fluid/memory/detail/buddy_allocator.h"
#include "paddle/fluid/memory/detail/system_allocator.h"
#include "paddle/fluid/platform/gpu_info.h"

namespace paddle {
namespace memory {
namespace allocation {

class ThreadLocalAllocatorImpl;

class ThreadLocalAllocation : public Allocation {
 public:
  ThreadLocalAllocation(void* ptr, size_t size, platform::Place place)
      : Allocation(ptr, size, place) {}

  void SetThreadLocalAllocatorImpl(
      std::shared_ptr<ThreadLocalAllocatorImpl> allocator) {
    allocator_ = allocator;
  }

  std::shared_ptr<ThreadLocalAllocatorImpl> GetAllocator() {
    return allocator_;
  }

 private:
  std::shared_ptr<ThreadLocalAllocatorImpl> allocator_;
};

class ThreadLocalAllocatorImpl
    : public std::enable_shared_from_this<ThreadLocalAllocatorImpl> {
 public:
  explicit ThreadLocalAllocatorImpl(const platform::Place& p);
  ThreadLocalAllocation* AllocateImpl(size_t size);
  void FreeImpl(ThreadLocalAllocation* allocation);

 private:
  std::unique_ptr<memory::detail::BuddyAllocator> buddy_allocator_;
  platform::Place place_;
};

class ThreadLocalCUDAAllocatorPool {
 public:
  static ThreadLocalCUDAAllocatorPool& Instance() {
    static thread_local ThreadLocalCUDAAllocatorPool pool;
    return pool;
  }

  std::shared_ptr<ThreadLocalAllocatorImpl> Get(int gpu_id);

 private:
  ThreadLocalCUDAAllocatorPool();
  std::vector<int> devices_;
  std::vector<std::unique_ptr<std::once_flag>> init_flags_;
  std::vector<std::shared_ptr<ThreadLocalAllocatorImpl>> allocators_;
};

class ThreadLocalCUDAAllocator : public Allocator {
 public:
  explicit ThreadLocalCUDAAllocator(const platform::CUDAPlace& p)
      : gpu_id_(p.device) {}

  bool IsAllocThreadSafe() const override { return true; }

 protected:
  Allocation* AllocateImpl(size_t size) override {
    return ThreadLocalCUDAAllocatorPool::Instance().Get(gpu_id_)->AllocateImpl(
        size);
  }
  void FreeImpl(Allocation* allocation) override {
    auto* tl_allocation = static_cast<ThreadLocalAllocation*>(allocation);
    auto allocator_impl = tl_allocation->GetAllocator();
    allocator_impl->FreeImpl(tl_allocation);
  }

 private:
  int gpu_id_;
};

}  // namespace allocation
}  // namespace memory
}  // namespace paddle
@@ -0,0 +1,93 @@
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/memory/allocation/thread_local_allocator.h"
#include <algorithm>
#include <condition_variable>  // NOLINT
#include <functional>
#include <iostream>
#include <thread>  // NOLINT
#include <utility>
#include "gtest/gtest.h"
#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/platform/gpu_info.h"

DECLARE_double(fraction_of_gpu_memory_to_use);
DECLARE_string(allocator_strategy);

namespace paddle {
namespace memory {
namespace allocation {

TEST(ThreadLocalAllocator, cross_scope_release) {
  FLAGS_fraction_of_gpu_memory_to_use = 0.1;
  FLAGS_allocator_strategy = "thread_local";

  const size_t thread_num = 5;
  const std::vector<int> devices = platform::GetSelectedDevices();

  std::vector<std::vector<void *>> allocator_addresses(devices.size());
  std::vector<std::vector<AllocationPtr>> thread_allocations(devices.size());

  for (size_t i = 0; i < devices.size(); ++i) {
    allocator_addresses[i].resize(thread_num);
    thread_allocations[i].resize(thread_num);
  }

  std::vector<std::thread> threads(thread_num);
  std::mutex mutex;
  std::condition_variable cv;
  bool flag = false;

  for (size_t i = 0; i < threads.size(); ++i) {
    threads[i] = std::thread([&, i]() {
      {
        std::unique_lock<std::mutex> lock(mutex);
        cv.wait(lock, [&] { return flag; });
      }
      for (size_t j = 0; j < devices.size(); ++j) {
        thread_allocations[j][i] =
            memory::Alloc(platform::CUDAPlace(devices[j]), 10);
        auto tl_allocator_impl =
            ThreadLocalCUDAAllocatorPool::Instance().Get(devices[j]);
        allocator_addresses[j][i] = tl_allocator_impl.get();
      }
    });
  }

  {
    std::lock_guard<std::mutex> lock(mutex);
    flag = true;
    cv.notify_all();
  }

  for (auto &th : threads) {
    th.join();
  }

  for (auto &addresses : allocator_addresses) {
    std::sort(addresses.begin(), addresses.end());
    ASSERT_EQ(std::adjacent_find(addresses.begin(), addresses.end(),
                                 std::equal_to<void *>()),
              addresses.end());
  }

  ::testing::FLAGS_gtest_death_test_style = "threadsafe";
  ASSERT_EXIT(([&]() { thread_allocations.clear(); }(), exit(0)),
              ::testing::ExitedWithCode(0), ".*");
}

}  // namespace allocation
}  // namespace memory
}  // namespace paddle
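[Note, not part of the diff] The test above checks two properties. First, after sorting each device's collected allocator addresses, std::adjacent_find must find no duplicates, i.e. the five worker threads each received a distinct ThreadLocalAllocatorImpl per device. Second, the ASSERT_EXIT death test confirms that clearing thread_allocations from the main thread, after the worker threads that allocated them have already exited, terminates cleanly: the cross-scope release the test is named for. Since FLAGS_allocator_strategy and FLAGS_fraction_of_gpu_memory_to_use are ordinary gflags (both are declared above), the strategy can presumably also be selected at process startup, e.g. with a hypothetical binary name:

./your_paddle_app --allocator_strategy=thread_local --fraction_of_gpu_memory_to_use=0.1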