New feature: thread local allocator, test=develop (#23989)
* add the thread_local_allocator, test=develop * refactor the thread_local_allocator, test=develop * provides option setting strategy, test=develop
parent
80cf3c3c4d
commit
d2584a7082
@ -0,0 +1,76 @@
|
|||||||
|
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
//
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
// you may not use this file except in compliance with the License.
|
||||||
|
// You may obtain a copy of the License at
|
||||||
|
//
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
//
|
||||||
|
// Unless required by applicable law or agreed to in writing, software
|
||||||
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
// See the License for the specific language governing permissions and
|
||||||
|
// limitations under the License.
|
||||||
|
|
||||||
|
#include "paddle/fluid/memory/allocation/thread_local_allocator.h"

#include <algorithm>
#include <iterator>
#include <mutex>
|
||||||
|
|
||||||
|
namespace paddle {
|
||||||
|
namespace memory {
|
||||||
|
namespace allocation {
|
||||||
|
|
||||||
|
// Builds a per-thread allocator bound to place `p`. Only CUDAPlace is
// supported: the backing BuddyAllocator manages the memory of one GPU
// device, sized between GpuMinChunkSize() and GpuMaxChunkSize().
ThreadLocalAllocatorImpl::ThreadLocalAllocatorImpl(const platform::Place& p)
    : place_(p) {
  if (platform::is_gpu_place(place_)) {
    buddy_allocator_.reset(new memory::detail::BuddyAllocator(
        std::unique_ptr<memory::detail::SystemAllocator>(
            new memory::detail::GPUAllocator(
                boost::get<platform::CUDAPlace>(place_).device)),
        platform::GpuMinChunkSize(), platform::GpuMaxChunkSize()));
  } else {
    // Use the framework's error machinery (consistent with the
    // PADDLE_ENFORCE_LT usage in Get() below) rather than LOG(FATAL),
    // which aborts the process without a catchable exception.
    PADDLE_THROW(platform::errors::Unimplemented(
        "Thread local allocator only supports CUDAPlace now."));
  }
}
|
||||||
|
|
||||||
|
// Returns this thread's allocator for `gpu_id`, lazily creating it on
// first use. The pool itself is thread_local (see Instance() in the
// header), so each thread owns an independent allocator per device.
std::shared_ptr<ThreadLocalAllocatorImpl> ThreadLocalCUDAAllocatorPool::Get(
    int gpu_id) {
  // Translate the device id into its index within the selected-devices
  // list; std::find returns end() for unknown ids, which the ENFORCE
  // below rejects.
  auto pos = std::distance(devices_.begin(),
                           std::find(devices_.begin(), devices_.end(), gpu_id));
  PADDLE_ENFORCE_LT(
      pos, devices_.size(),
      platform::errors::InvalidArgument(
          "The position of device should be less than the size of devices."));
  // One-shot lazy construction per (thread, device). SetDeviceId runs
  // before the allocator is built so the underlying GPUAllocator binds to
  // the correct device.
  std::call_once(*init_flags_[pos], [this, pos, gpu_id] {
    platform::SetDeviceId(devices_[pos]);
    allocators_[pos].reset(
        new ThreadLocalAllocatorImpl(platform::CUDAPlace(gpu_id)));
  });
  return allocators_[pos];
}
|
||||||
|
|
||||||
|
// Discovers the selected CUDA devices and reserves one (initially empty)
// allocator slot plus one once-flag per device; the allocators themselves
// are created lazily by Get().
ThreadLocalCUDAAllocatorPool::ThreadLocalCUDAAllocatorPool()
    : devices_(platform::GetSelectedDevices()) {
  const auto device_count = devices_.size();
  allocators_.resize(device_count);
  init_flags_.reserve(device_count);
  size_t remaining = device_count;
  while (remaining-- > 0) {
    init_flags_.emplace_back(new std::once_flag());
  }
}
|
||||||
|
|
||||||
|
// Allocates `size` bytes from this thread's buddy allocator and wraps the
// raw pointer in a ThreadLocalAllocation owned by the caller.
ThreadLocalAllocation* ThreadLocalAllocatorImpl::AllocateImpl(size_t size) {
  VLOG(10) << "ThreadLocalAllocatorImpl::AllocateImpl " << size;
  void* ptr = buddy_allocator_->Alloc(size);
  auto* tl_allocation = new ThreadLocalAllocation(ptr, size, place_);
  // Stash a shared_ptr to this allocator inside the allocation so the
  // allocator stays alive until every allocation it produced is freed
  // (used by ThreadLocalCUDAAllocator::FreeImpl to route the free back).
  tl_allocation->SetThreadLocalAllocatorImpl(shared_from_this());
  return tl_allocation;
}
|
||||||
|
|
||||||
|
// Returns the allocation's memory to this allocator's buddy allocator and
// destroys the allocation wrapper.
void ThreadLocalAllocatorImpl::FreeImpl(ThreadLocalAllocation* allocation) {
  VLOG(10) << "ThreadLocalAllocatorImpl::FreeImpl " << allocation;
  buddy_allocator_->Free(allocation->ptr());
  delete allocation;
}
|
||||||
|
|
||||||
|
} // namespace allocation
|
||||||
|
} // namespace memory
|
||||||
|
} // namespace paddle
|
@ -0,0 +1,100 @@
|
|||||||
|
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
//
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
// you may not use this file except in compliance with the License.
|
||||||
|
// You may obtain a copy of the License at
|
||||||
|
//
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
//
|
||||||
|
// Unless required by applicable law or agreed to in writing, software
|
||||||
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
// See the License for the specific language governing permissions and
|
||||||
|
// limitations under the License.
|
||||||
|
|
||||||
|
#pragma once

#include <memory>
#include <mutex>
#include <utility>
#include <vector>

#include "paddle/fluid/memory/allocation/allocator.h"
#include "paddle/fluid/memory/detail/buddy_allocator.h"
#include "paddle/fluid/memory/detail/system_allocator.h"
#include "paddle/fluid/platform/gpu_info.h"
|
||||||
|
|
||||||
|
namespace paddle {
|
||||||
|
namespace memory {
|
||||||
|
namespace allocation {
|
||||||
|
|
||||||
|
class ThreadLocalAllocatorImpl;
|
||||||
|
|
||||||
|
class ThreadLocalAllocation : public Allocation {
|
||||||
|
public:
|
||||||
|
ThreadLocalAllocation(void* ptr, size_t size, platform::Place place)
|
||||||
|
: Allocation(ptr, size, place) {}
|
||||||
|
|
||||||
|
void SetThreadLocalAllocatorImpl(
|
||||||
|
std::shared_ptr<ThreadLocalAllocatorImpl> allocator) {
|
||||||
|
allocator_ = allocator;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::shared_ptr<ThreadLocalAllocatorImpl> GetAllocator() {
|
||||||
|
return allocator_;
|
||||||
|
}
|
||||||
|
|
||||||
|
private:
|
||||||
|
std::shared_ptr<ThreadLocalAllocatorImpl> allocator_;
|
||||||
|
};
|
||||||
|
|
||||||
|
// Per-thread, per-device allocator backed by a BuddyAllocator. Inherits
// enable_shared_from_this so each allocation can hold a shared_ptr back
// to its creating allocator (see ThreadLocalAllocation).
class ThreadLocalAllocatorImpl
    : public std::enable_shared_from_this<ThreadLocalAllocatorImpl> {
 public:
  // `p` must be a CUDAPlace; any other place is rejected at runtime.
  explicit ThreadLocalAllocatorImpl(const platform::Place& p);
  // Allocates `size` bytes; ownership of the returned allocation passes
  // to the caller.
  ThreadLocalAllocation* AllocateImpl(size_t size);
  // Releases `allocation`'s memory and deletes the wrapper.
  void FreeImpl(ThreadLocalAllocation* allocation);

 private:
  std::unique_ptr<memory::detail::BuddyAllocator> buddy_allocator_;
  platform::Place place_;
};
|
||||||
|
|
||||||
|
// Thread-local registry mapping each selected GPU to this thread's
// ThreadLocalAllocatorImpl. Instance() returns a distinct pool per thread
// (static thread_local), so allocators are never shared across threads.
class ThreadLocalCUDAAllocatorPool {
 public:
  static ThreadLocalCUDAAllocatorPool& Instance() {
    static thread_local ThreadLocalCUDAAllocatorPool pool;
    return pool;
  }

  // Returns (lazily creating) this thread's allocator for `gpu_id`.
  std::shared_ptr<ThreadLocalAllocatorImpl> Get(int gpu_id);

 private:
  ThreadLocalCUDAAllocatorPool();
  // Selected GPU device ids; parallel to the two vectors below.
  std::vector<int> devices_;
  // One once-flag per device guarding lazy allocator construction.
  std::vector<std::unique_ptr<std::once_flag>> init_flags_;
  std::vector<std::shared_ptr<ThreadLocalAllocatorImpl>> allocators_;
};
|
||||||
|
|
||||||
|
// Allocator facade for one GPU. Allocations are served from the calling
// thread's per-device allocator; frees are routed back to whichever
// allocator produced the allocation, so freeing from a different thread
// than the allocating one is safe.
class ThreadLocalCUDAAllocator : public Allocator {
 public:
  explicit ThreadLocalCUDAAllocator(const platform::CUDAPlace& p)
      : gpu_id_(p.device) {}

  // Thread-safe by construction: each thread allocates from its own pool.
  bool IsAllocThreadSafe() const override { return true; }

 protected:
  Allocation* AllocateImpl(size_t size) override {
    return ThreadLocalCUDAAllocatorPool::Instance().Get(gpu_id_)->AllocateImpl(
        size);
  }
  void FreeImpl(Allocation* allocation) override {
    // Dispatch the free to the allocator stored inside the allocation,
    // which may belong to another thread's pool.
    auto* tl_allocation = static_cast<ThreadLocalAllocation*>(allocation);
    auto allocator_impl = tl_allocation->GetAllocator();
    allocator_impl->FreeImpl(tl_allocation);
  }

 private:
  int gpu_id_;
};
|
||||||
|
|
||||||
|
} // namespace allocation
|
||||||
|
} // namespace memory
|
||||||
|
} // namespace paddle
|
@ -0,0 +1,93 @@
|
|||||||
|
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
//
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
// you may not use this file except in compliance with the License.
|
||||||
|
// You may obtain a copy of the License at
|
||||||
|
//
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
//
|
||||||
|
// Unless required by applicable law or agreed to in writing, software
|
||||||
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
// See the License for the specific language governing permissions and
|
||||||
|
// limitations under the License.
|
||||||
|
|
||||||
|
#include "paddle/fluid/memory/allocation/thread_local_allocator.h"
|
||||||
|
#include <algorithm>
|
||||||
|
#include <condition_variable> // NOLINT
|
||||||
|
#include <functional>
|
||||||
|
#include <iostream>
|
||||||
|
#include <thread> // NOLINT
|
||||||
|
#include <utility>
|
||||||
|
#include "gtest/gtest.h"
|
||||||
|
#include "paddle/fluid/memory/malloc.h"
|
||||||
|
#include "paddle/fluid/platform/gpu_info.h"
|
||||||
|
|
||||||
|
DECLARE_double(fraction_of_gpu_memory_to_use);
|
||||||
|
DECLARE_string(allocator_strategy);
|
||||||
|
|
||||||
|
namespace paddle {
|
||||||
|
namespace memory {
|
||||||
|
namespace allocation {
|
||||||
|
|
||||||
|
// Spawns several worker threads that allocate on every selected device,
// then checks that (a) each thread received a distinct allocator instance
// per device, and (b) allocations can be released after their creating
// threads have exited without crashing the process.
TEST(ThreadLocalAllocator, cross_scope_release) {
  FLAGS_fraction_of_gpu_memory_to_use = 0.1;
  FLAGS_allocator_strategy = "thread_local";

  const size_t thread_num = 5;
  const std::vector<int> devices = platform::GetSelectedDevices();

  // allocator_addresses[device][thread]: raw pointer identity of the
  // thread-local allocator, used below to prove per-thread uniqueness.
  std::vector<std::vector<void *>> allocator_addresses(devices.size());
  // thread_allocations[device][thread]: allocations deliberately kept
  // alive past the worker threads' lifetimes.
  std::vector<std::vector<AllocationPtr>> thread_allocations(devices.size());

  for (size_t i = 0; i < devices.size(); ++i) {
    allocator_addresses[i].resize(thread_num);
    thread_allocations[i].resize(thread_num);
  }

  std::vector<std::thread> threads(thread_num);
  std::mutex mutex;
  std::condition_variable cv;
  bool flag = false;

  for (size_t i = 0; i < threads.size(); ++i) {
    threads[i] = std::thread([&, i]() {
      {
        // Park until all workers exist so they run concurrently.
        std::unique_lock<std::mutex> lock(mutex);
        cv.wait(lock, [&] { return flag; });
      }
      for (size_t j = 0; j < devices.size(); ++j) {
        thread_allocations[j][i] =
            memory::Alloc(platform::CUDAPlace(devices[j]), 10);
        auto tl_allocator_impl =
            ThreadLocalCUDAAllocatorPool::Instance().Get(devices[j]);
        allocator_addresses[j][i] = tl_allocator_impl.get();
      }
    });
  }

  // Release all workers at once.
  {
    std::lock_guard<std::mutex> lock(mutex);
    flag = true;
    cv.notify_all();
  }

  for (auto &th : threads) {
    th.join();
  }

  // Per device, no two threads may have observed the same allocator:
  // sort + adjacent_find detects any duplicate pointer.
  for (auto &addresses : allocator_addresses) {
    std::sort(addresses.begin(), addresses.end());
    ASSERT_EQ(std::adjacent_find(addresses.begin(), addresses.end(),
                                 std::equal_to<void *>()),
              addresses.end());
  }

  // Freeing allocations whose creating threads are gone must exit
  // cleanly (exercised in a death-test subprocess).
  ::testing::FLAGS_gtest_death_test_style = "threadsafe";
  ASSERT_EXIT(([&]() { thread_allocations.clear(); }(), exit(0)),
              ::testing::ExitedWithCode(0), ".*");
}
|
||||||
|
|
||||||
|
} // namespace allocation
|
||||||
|
} // namespace memory
|
||||||
|
} // namespace paddle
|
Loading…
Reference in new issue