Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into refine_seq_concat
commit 24459501fe
@@ -0,0 +1,123 @@
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include <atomic>
#include <string>
#include <unordered_map>
#include <vector>

#include "paddle/fluid/framework/details/op_handle_base.h"
#include "paddle/fluid/framework/garbage_collector.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/tensor.h"

namespace paddle {
namespace framework {
namespace details {

using ReferenceCountMap = std::unordered_map<std::string, int>;
using AtomicReferenceCountMap =
    std::unordered_map<std::string, std::atomic<int>>;
using DeviceReferenceCountMap =
    std::unordered_map<int, std::unique_ptr<ReferenceCountMap>>;
using AtomicDeviceReferenceCountMap =
    std::unordered_map<int, std::unique_ptr<AtomicReferenceCountMap>>;
using DeviceGarbageCollectorMap =
    std::unordered_map<int,
                       std::unique_ptr<GarbageCollector<framework::Tensor>>>;

class ReferenceCountOpHandle : public OpHandleBase {
 public:
  ReferenceCountOpHandle(ir::Node *node, const Scope *scope,
                         const platform::CUDAPlace &place,
                         const std::vector<std::string> &var_names,
                         GarbageCollector<Tensor> *gc,
                         AtomicReferenceCountMap *ref_cnts)
      : OpHandleBase(node),
        scope_(scope),
        var_names_(var_names),
        gc_(gc),
        ref_cnts_(ref_cnts) {
    dev_ctx_ = static_cast<platform::CUDADeviceContext *>(
        platform::DeviceContextPool::Instance().Get(place));
    if (IsStreamGarabageCollector()) {
      PADDLE_ENFORCE(cudaSetDevice(place.device));
      PADDLE_ENFORCE(
          cudaEventCreateWithFlags(&event_, cudaEventDisableTiming));
    }
  }

  ~ReferenceCountOpHandle() {
    if (IsStreamGarabageCollector()) {
      auto gpu_place = boost::get<platform::CUDAPlace>(dev_ctx_->GetPlace());
      PADDLE_ENFORCE(cudaSetDevice(gpu_place.device));
      PADDLE_ENFORCE(cudaEventDestroy(event_));
    }
  }

  std::string Name() const override { return "reference_count"; }

 protected:
  void RunImpl() override {
    auto *exec_scope = scope_->FindVar(kLocalExecScopeName)->Get<Scope *>();
    std::vector<LoDTensor *> tensors;
    for (auto &name : var_names_) {
      auto it = ref_cnts_->find(name);
      if (it == ref_cnts_->end()) continue;

      auto *var = exec_scope->FindVar(name);
      if (var == nullptr || !var->IsType<LoDTensor>()) continue;

      if (it->second.fetch_sub(1) <= 1) {
        tensors.emplace_back(var->GetMutable<LoDTensor>());
      }
    }

    if (!tensors.empty()) {
      ClearTensors(tensors);
    }
  }

 private:
  void ClearTensors(const std::vector<LoDTensor *> &tensors) {
    auto *gc = dynamic_cast<StreamGarbageCollector<Tensor> *>(gc_);
    if (gc != nullptr) {
      auto compute_stream = dev_ctx_->stream();
      auto callback_stream = gc->stream();
      auto callback_func = [=]() {
        PADDLE_ENFORCE(cudaEventRecord(event_, compute_stream));
        PADDLE_ENFORCE(cudaStreamWaitEvent(callback_stream, event_, 0));
      };
      gc_->Add(tensors, callback_func);
    } else {
      gc_->Add(tensors);
    }
  }

  bool IsStreamGarabageCollector() const {
    return dynamic_cast<const StreamGarbageCollector<Tensor> *>(gc_) != nullptr;
  }

  const Scope *scope_;
  platform::CUDADeviceContext *dev_ctx_;
  std::vector<std::string> var_names_;
  GarbageCollector<Tensor> *gc_;       // not owned
  AtomicReferenceCountMap *ref_cnts_;  // not owned
  cudaEvent_t event_;
};

}  // namespace details
}  // namespace framework
}  // namespace paddle
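The heart of RunImpl above is the atomic decrement: fetch_sub(1) returns the count held *before* the subtraction, so a return value of 1 (or less) means this op handle released the last pending reference and the tensor can be handed to the garbage collector. Below is a minimal standalone sketch of that idiom using only the standard library; FakeTensor is a hypothetical stand-in, not Paddle's LoDTensor.

#include <atomic>
#include <iostream>
#include <string>
#include <unordered_map>

// Illustrative sketch only; FakeTensor is hypothetical, not Paddle's API.
struct FakeTensor {
  std::string name;
  void clear() { std::cout << "freed " << name << "\n"; }
};

int main() {
  std::unordered_map<std::string, std::atomic<int>> ref_cnts;
  ref_cnts["x"] = 3;  // "x" is consumed by three ops
  FakeTensor x{"x"};

  for (int op = 0; op < 3; ++op) {
    // fetch_sub returns the previous value; 1 means we held the last reference
    if (ref_cnts["x"].fetch_sub(1) <= 1) {
      x.clear();  // only the last consumer releases the memory
    }
  }
}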
@@ -0,0 +1,150 @@
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <string>
#include <vector>

#include "paddle/fluid/framework/details/computation_op_handle.h"
#include "paddle/fluid/framework/details/multi_devices_helper.h"
#include "paddle/fluid/framework/details/reference_count_pass.h"

namespace paddle {
namespace framework {
namespace details {

std::unique_ptr<ir::Graph> ReferenceCountPass::ApplyImpl(
    std::unique_ptr<ir::Graph> graph) const {
  auto &ref_cnts = Get<DeviceReferenceCountMap>(kGlobalReferenceCount);
  auto &cur_ref_cnts = Get<AtomicDeviceReferenceCountMap>(kCurReferenceCount);
  auto &gcs = Get<DeviceGarbageCollectorMap>(kGarbageCollector);

  // It is not easy to find the right reference counts of variables in a graph:
  // Step 1: Find all variables in computation ops.
  // Step 2: Find all variables in non-computation ops which refer to
  //         variables in computation ops.
  std::unordered_set<std::string> names;
  auto get_ref_cnts_from_compute_op = [&](
      const std::unique_ptr<OpHandleBase> &op,
      const std::vector<VarHandleBase *> &vars) {
    std::vector<std::string> var_names_in_op;
    auto *compute_op = dynamic_cast<ComputationOpHandle *>(op.get());
    if (compute_op == nullptr ||
        !platform::is_gpu_place(compute_op->GetPlace()))
      return var_names_in_op;
    auto place = boost::get<platform::CUDAPlace>(compute_op->GetPlace());
    for (VarHandleBase *var_handle_base : vars) {
      auto *var_handle = dynamic_cast<VarHandle *>(var_handle_base);
      if (var_handle == nullptr || !var_handle->Node()->IsVar()) continue;

      if (!platform::is_gpu_place(var_handle->place_) ||
          boost::get<platform::CUDAPlace>(var_handle->place_) != place)
        continue;

      VarDesc *var_desc = var_handle->Node()->Var();
      auto var_name = var_handle->Node()->Name();

      // This is weird, but there really are some variables without var_desc
      // in computation_op
      if (var_desc == nullptr) {
        if (compute_op->Node()->Op()->Block()->FindVar(var_name) == nullptr)
          continue;
      } else {
        if (var_desc->Persistable() ||
            var_desc->Proto()->type().type() != proto::VarType::LOD_TENSOR)
          continue;
      }

      // A compute op only runs on one device
      if (ref_cnts[place.device]->count(var_name))
        ++(*ref_cnts[place.device])[var_name];
      else
        (*ref_cnts[place.device])[var_name] = 1;

      names.insert(var_name);
      var_names_in_op.push_back(var_name);
    }
    return var_names_in_op;
  };

  auto update_ref_cnts_from_non_compute_op = [&](
      const std::unique_ptr<OpHandleBase> &op,
      const std::vector<VarHandleBase *> &vars) {
    if (dynamic_cast<ComputationOpHandle *>(op.get()) != nullptr) return;
    for (VarHandleBase *var_handle_base : vars) {
      auto *var_handle = dynamic_cast<VarHandle *>(var_handle_base);
      if (var_handle == nullptr || !var_handle->Node()->IsVar()) continue;

      auto var_name = var_handle->Node()->Name();
      auto var_place = var_handle->place_;
      if (!platform::is_gpu_place(var_place)) continue;
      auto place = boost::get<platform::CUDAPlace>(var_place);
      if (names.count(var_name) == 0) continue;
      if (ref_cnts.count(place.device) &&
          ref_cnts[place.device]->count(var_name)) {
        ++(*ref_cnts[place.device])[var_name];
      }
    }
  };

  std::unordered_map<OpHandleBase *, ReferenceCountOpHandle *>
      compute_ref_cnt_map;
  auto &all_ops = graph->Get<GraphOps>(kGraphOps);
  for (auto &op : all_ops) {
    auto in_var_names = get_ref_cnts_from_compute_op(op, op->Inputs());
    auto out_var_names = get_ref_cnts_from_compute_op(op, op->Outputs());
    if (in_var_names.empty() && out_var_names.empty()) continue;
    in_var_names.insert(in_var_names.end(), out_var_names.begin(),
                        out_var_names.end());
    auto *compute_op = dynamic_cast<ComputationOpHandle *>(op.get());
    auto place = boost::get<platform::CUDAPlace>(compute_op->GetPlace());
    ir::Node *ref_cnt_node =
        graph->CreateEmptyNode("reference_count", ir::Node::Type::kOperation);
    auto *ref_cnt_handle = new ReferenceCountOpHandle(
        ref_cnt_node, compute_op->GetScope(), place, in_var_names,
        gcs[place.device].get(), cur_ref_cnts[place.device].get());
    auto *dep_var = new DummyVarHandle(graph->CreateControlDepVar());
    compute_op->AddOutput(dep_var);
    ref_cnt_handle->AddInput(dep_var);
    graph->Get<GraphDepVars>(kGraphDepVars).emplace(dep_var);
    compute_ref_cnt_map[compute_op] = ref_cnt_handle;
  }

  for (auto &op : all_ops) {
    update_ref_cnts_from_non_compute_op(op, op->Inputs());
    update_ref_cnts_from_non_compute_op(op, op->Outputs());
  }

  std::vector<std::unique_ptr<OpHandleBase>> new_all_ops;
  new_all_ops.reserve(compute_ref_cnt_map.size() + all_ops.size());
  for (auto &op : all_ops) {
    new_all_ops.emplace_back(std::move(op));
    auto it = compute_ref_cnt_map.find(new_all_ops.back().get());
    if (it != compute_ref_cnt_map.end()) {
      new_all_ops.emplace_back(it->second);
    }
  }

  all_ops.swap(new_all_ops);
  return graph;
}

}  // namespace details
}  // namespace framework
}  // namespace paddle

REGISTER_PASS(reference_count_pass,
              paddle::framework::details::ReferenceCountPass)
    .RequirePassAttr(paddle::framework::details::kGlobalReferenceCount)
    .RequirePassAttr(paddle::framework::details::kCurReferenceCount)
    .RequirePassAttr(paddle::framework::details::kGarbageCollector);
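The two-step comment near the top of the pass is the key idea: computation ops seed the per-device reference counts, and non-computation ops only add references for variables already seen in step 1. A toy illustration of that two-pass counting over plain strings (hypothetical op lists, not Paddle's graph data structures):

#include <iostream>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>

int main() {
  // Hypothetical variable usages: which variables each op reads or writes.
  std::vector<std::vector<std::string>> compute_ops = {{"a", "b"}, {"b", "c"}};
  std::vector<std::vector<std::string>> other_ops = {{"b"}, {"d"}};

  std::unordered_map<std::string, int> ref_cnts;
  std::unordered_set<std::string> names;  // vars seen in computation ops

  // Step 1: every use inside a computation op counts as one reference.
  for (auto &vars : compute_ops)
    for (auto &v : vars) {
      ++ref_cnts[v];
      names.insert(v);
    }

  // Step 2: non-computation ops add references only for already-seen vars,
  // so "d" is ignored here.
  for (auto &vars : other_ops)
    for (auto &v : vars)
      if (names.count(v)) ++ref_cnts[v];

  for (auto &kv : ref_cnts)
    std::cout << kv.first << ": " << kv.second << "\n";  // a:1 b:3 c:1
}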
@@ -0,0 +1,37 @@
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include "paddle/fluid/framework/details/reference_count_op_handle.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/pass.h"

namespace paddle {
namespace framework {
namespace details {

constexpr char kGlobalReferenceCount[] = "reference_count";
constexpr char kCurReferenceCount[] = "current_reference_count";
constexpr char kGarbageCollector[] = "garbage_collector";

class ReferenceCountPass : public ir::Pass {
 protected:
  std::unique_ptr<ir::Graph> ApplyImpl(
      std::unique_ptr<ir::Graph> graph) const override;
};

}  // namespace details
}  // namespace framework
}  // namespace paddle
@@ -0,0 +1,163 @@
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include <algorithm>
#include <deque>
#include <functional>
#include <memory>
#include <mutex>  // NOLINT
#include "paddle/fluid/platform/device_context.h"

namespace paddle {
namespace framework {

// T should have memory_size() and clear() method
template <typename T>
class GarbageCollector {
 public:
  GarbageCollector(const platform::Place &place, size_t max_memory_size)
      : max_memory_size_(std::max(max_memory_size, static_cast<size_t>(1))) {
    garbages_.reset(new std::deque<T *>());
    dev_ctx_ = platform::DeviceContextPool::Instance().Get(place);
  }

  virtual ~GarbageCollector() {}

  void Reset() {
    std::lock_guard<std::mutex> guard(mutex_);
    garbages_.reset(new std::deque<T *>());
    cur_memory_size_ = 0;
  }

  template <typename Container>
  void Add(const Container &objs) {
    Add(objs, []() {});
  }

  template <typename Container, typename Callback>
  void Add(const Container &objs, Callback &&callback) {
    std::shared_ptr<std::deque<T *>> clear_deque;
    {
      std::lock_guard<std::mutex> guard(mutex_);
      for (auto *obj : objs) {
        garbages_->push_back(obj);
        cur_memory_size_ += obj->memory_size();
      }
      if (cur_memory_size_ >= max_memory_size_) {
        cur_memory_size_ = 0;
        clear_deque = garbages_;
        garbages_.reset(new std::deque<T *>());
      }
    }

    if (clear_deque != nullptr) {
      callback();
      ClearCallback([=]() {
        for (auto *obj : *clear_deque) obj->clear();
      });
    }
  }

  virtual void Wait() const {}

 protected:
  virtual void ClearCallback(const std::function<void()> &callback) = 0;

  platform::DeviceContext *dev_ctx_;
  std::shared_ptr<std::deque<T *>> garbages_;
  mutable std::mutex mutex_;
  const size_t max_memory_size_;
  size_t cur_memory_size_ = 0;
};

template <typename T>
class CPUGarbageCollector : public GarbageCollector<T> {
 public:
  CPUGarbageCollector(const platform::CPUPlace &place, size_t max_memory_size)
      : GarbageCollector<T>(place, max_memory_size) {}

 protected:
  void ClearCallback(const std::function<void()> &callback) override {
    callback();
  }
};

#ifdef PADDLE_WITH_CUDA
template <typename T>
class DefaultStreamGarbageCollector : public GarbageCollector<T> {
 public:
  DefaultStreamGarbageCollector(const platform::CUDAPlace &place,
                                size_t max_memory_size)
      : GarbageCollector<T>(place, max_memory_size) {}

  cudaStream_t stream() const {
    return static_cast<const platform::CUDADeviceContext *>(this->dev_ctx_)
        ->stream();
  }

  void Wait() const override {
    this->dev_ctx_->Wait();
    static_cast<const platform::CUDADeviceContext *>(this->dev_ctx_)
        ->WaitStreamCallback();
  }

 protected:
  void ClearCallback(const std::function<void()> &callback) override {
    static_cast<platform::CUDADeviceContext *>(this->dev_ctx_)
        ->AddStreamCallback(callback);
  }
};

template <typename T>
class StreamGarbageCollector : public GarbageCollector<T> {
 public:
  StreamGarbageCollector(const platform::CUDAPlace &place,
                         size_t max_memory_size)
      : GarbageCollector<T>(place, max_memory_size) {
    PADDLE_ENFORCE(cudaSetDevice(place.device));
    PADDLE_ENFORCE(cudaStreamCreate(&stream_));
    callback_manager_.reset(new platform::StreamCallbackManager(stream_));
  }

  ~StreamGarbageCollector() {
    auto place = boost::get<platform::CUDAPlace>(this->dev_ctx_->GetPlace());
    PADDLE_ENFORCE(cudaSetDevice(place.device));
    PADDLE_ENFORCE(cudaStreamSynchronize(stream_));
    PADDLE_ENFORCE(cudaStreamDestroy(stream_));
  }

  void Wait() const override {
    PADDLE_ENFORCE(cudaStreamSynchronize(stream_));
    std::lock_guard<std::mutex> guard(this->mutex_);
    callback_manager_->Wait();
  }

  cudaStream_t stream() const { return stream_; }

 protected:
  void ClearCallback(const std::function<void()> &callback) override {
    std::lock_guard<std::mutex> guard(this->mutex_);
    callback_manager_->AddCallback(callback);
  }

 private:
  cudaStream_t stream_;
  std::unique_ptr<platform::StreamCallbackManager> callback_manager_;
};
#endif

}  // namespace framework
}  // namespace paddle
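GarbageCollector::Add batches deallocation: objects accumulate under the mutex until their total memory_size() reaches max_memory_size_, and only then is the whole batch released through ClearCallback, outside the lock. Below is a rough standalone sketch of that batching behavior under simplifying assumptions (synchronous clearing, no device streams); Blob and ToyCollector are hypothetical names, not Paddle's API.

#include <deque>
#include <iostream>
#include <memory>
#include <mutex>

struct Blob {  // stand-in for a tensor-like object
  size_t bytes;
  size_t memory_size() const { return bytes; }
  void clear() { std::cout << "released " << bytes << " bytes\n"; }
};

class ToyCollector {  // synchronous analogue of CPUGarbageCollector
 public:
  explicit ToyCollector(size_t max_bytes) : max_bytes_(max_bytes) {}

  void Add(Blob *obj) {
    std::shared_ptr<std::deque<Blob *>> batch;
    {
      std::lock_guard<std::mutex> guard(mutex_);
      garbages_->push_back(obj);
      cur_bytes_ += obj->memory_size();
      if (cur_bytes_ >= max_bytes_) {  // threshold reached: swap out the batch
        cur_bytes_ = 0;
        batch = garbages_;
        garbages_.reset(new std::deque<Blob *>());
      }
    }
    if (batch)  // clear outside the lock
      for (auto *b : *batch) b->clear();
  }

 private:
  std::mutex mutex_;
  std::shared_ptr<std::deque<Blob *>> garbages_{new std::deque<Blob *>()};
  size_t cur_bytes_ = 0;
  const size_t max_bytes_;
};

int main() {
  ToyCollector gc(100);
  Blob a{40}, b{30}, c{50};
  gc.Add(&a);  // nothing freed yet (40 < 100)
  gc.Add(&b);  // nothing freed yet (70 < 100)
  gc.Add(&c);  // 120 >= 100: frees a, b and c together
}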
Some files were not shown because too many files have changed in this diff.