|
|
|
@ -21,35 +21,42 @@
|
|
|
|
|
|
|
|
|
|
#include "paddle/fluid/framework/details/computation_op_handle.h"
|
|
|
|
|
#include "paddle/fluid/framework/details/eager_deletion_op_handle.h"
|
|
|
|
|
#include "paddle/fluid/framework/details/eager_deletion_pass.h"
|
|
|
|
|
#include "paddle/fluid/framework/details/multi_devices_helper.h"
|
|
|
|
|
#include "paddle/fluid/framework/ir/graph_helper.h"
|
|
|
|
|
|
|
|
|
|
DEFINE_double(fraction_of_eager_deletion, 1.0, "Fraction of eager deletion");
|
|
|
|
|
DEFINE_bool(eager_delete_tensor_only, false, "");
|
|
|
|
|
DEFINE_double(memory_fraction_of_eager_deletion, 1.0,
|
|
|
|
|
"Fraction of eager deletion");
|
|
|
|
|
|
|
|
|
|
namespace paddle {
|
|
|
|
|
namespace framework {
|
|
|
|
|
namespace details {
|
|
|
|
|
|
|
|
|
|
namespace { // NOLINT
|
|
|
|
|
// op -> variables which can be deleted after op runs
|
|
|
|
|
using OpToVarNameSetMap =
|
|
|
|
|
std::unordered_map<ComputationOpHandle *, std::unordered_set<std::string>>;
|
|
|
|
|
} // NOLINT
|
|
|
|
|
|
|
|
|
|
// Check whether the variable is LoDTensor based on static VarDesc info
|
|
|
|
|
static bool IsLoDTensor(VarDesc *var) {
|
|
|
|
|
return var->Proto()->type().type() == proto::VarType::LOD_TENSOR;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static int64_t GetNumel(const GraphVars &vars, const std::string &var_name,
|
|
|
|
|
size_t scope_idx) {
|
|
|
|
|
auto *var_desc = TryGetLatestVarDesc(vars[scope_idx].at(var_name));
|
|
|
|
|
// Get memory size of LoDTensor
|
|
|
|
|
static int64_t GetMemorySize(
|
|
|
|
|
const std::unordered_map<std::string, std::vector<VarHandle *>> &vars,
|
|
|
|
|
const std::string &var_name) {
|
|
|
|
|
auto *var_desc = TryGetLatestVarDesc(vars.at(var_name));
|
|
|
|
|
PADDLE_ENFORCE_NOT_NULL(var_desc);
|
|
|
|
|
PADDLE_ENFORCE(IsLoDTensor(var_desc));
|
|
|
|
|
auto dims = var_desc->GetShape();
|
|
|
|
|
return std::accumulate(dims.begin(), dims.end(), static_cast<int64_t>(1),
|
|
|
|
|
return SizeOfType(var_desc->GetDataType()) *
|
|
|
|
|
std::accumulate(dims.begin(), dims.end(), static_cast<int64_t>(1),
|
|
|
|
|
std::multiplies<int64_t>());
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Split all variables in the graph into LoDTensor and Non-LoDTensor (e.g.
|
|
|
|
|
// SelectedRows, LoDTensorArray)
|
|
|
|
|
// Since partial GC is based on static analysis of memory size of each variable
|
|
|
|
|
// So we should skip SelectedRows and LoDTensorArray here
|
|
|
|
|
static void SplitIntoLoDTensorAndNonLoDTensorVars(
|
|
|
|
|
const OpToVarNameSetMap &m, const GraphVars &vars,
|
|
|
|
|
OpToVarNameSetMap *lod_tensors, OpToVarNameSetMap *other_vars) {
|
|
|
|
@ -69,76 +76,106 @@ static void SplitIntoLoDTensorAndNonLoDTensorVars(
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static OpToVarNameSetMap ShrinkGCVars(const OpToVarNameSetMap &m,
|
|
|
|
|
const GraphVars &vars,
|
|
|
|
|
double fraction_of_memory_size,
|
|
|
|
|
bool delete_lod_tensor_only = false) {
|
|
|
|
|
// Do not perform gc
|
|
|
|
|
struct GCVarInfo {
|
|
|
|
|
GCVarInfo(const std::string &name, int64_t memory_size,
|
|
|
|
|
ComputationOpHandle *op, size_t scope_idx)
|
|
|
|
|
: name_(name),
|
|
|
|
|
memory_size_(memory_size),
|
|
|
|
|
op_(op),
|
|
|
|
|
scope_idx_(scope_idx) {}
|
|
|
|
|
|
|
|
|
|
std::string name_; // variable name
|
|
|
|
|
int64_t memory_size_; // memory size
|
|
|
|
|
ComputationOpHandle *op_; // op after which the variable could be deleted
|
|
|
|
|
size_t scope_idx_; // scope index where the variable locates
|
|
|
|
|
|
|
|
|
|
int64_t AbsMemorySize() const { return std::abs(memory_size_); }
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
// The delete_lod_tensor_only parameter is not used currently
|
|
|
|
|
static OpToVarNameSetMap ShrinkGCVars(
|
|
|
|
|
const OpToVarNameSetMap &m, const GraphVars &vars,
|
|
|
|
|
const std::vector<platform::Place> &places, double fraction_of_memory_size,
|
|
|
|
|
bool delete_lod_tensor_only = false) {
|
|
|
|
|
// Do not perform gc when fraction_of_memory_size = 0
|
|
|
|
|
if (fraction_of_memory_size <= 0.0) return {};
|
|
|
|
|
|
|
|
|
|
// Perform complete gc
|
|
|
|
|
/**
|
|
|
|
|
* Step 1: Split all variables into LoDTensor and Non-LoDTensor.
|
|
|
|
|
* We can only calculate memory size of LoDTensors
|
|
|
|
|
*/
|
|
|
|
|
OpToVarNameSetMap lod_tensors, other_vars;
|
|
|
|
|
SplitIntoLoDTensorAndNonLoDTensorVars(m, vars, &lod_tensors, &other_vars);
|
|
|
|
|
|
|
|
|
|
// Perform complete gc when fraction_of_memory_size >= 1
|
|
|
|
|
if (fraction_of_memory_size >= 1.0) {
|
|
|
|
|
if (delete_lod_tensor_only) {
|
|
|
|
|
OpToVarNameSetMap lod_tensors, other_vars;
|
|
|
|
|
SplitIntoLoDTensorAndNonLoDTensorVars(m, vars, &lod_tensors, &other_vars);
|
|
|
|
|
return lod_tensors;
|
|
|
|
|
} else {
|
|
|
|
|
return m;
|
|
|
|
|
}
|
|
|
|
|
return delete_lod_tensor_only ? lod_tensors : m;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Perform partial gc
|
|
|
|
|
OpToVarNameSetMap lod_tensors, other_vars;
|
|
|
|
|
SplitIntoLoDTensorAndNonLoDTensorVars(m, vars, &lod_tensors, &other_vars);
|
|
|
|
|
/**
|
|
|
|
|
* Step 2: build GCVarInfos, and calculate total memory sizes of each device
|
|
|
|
|
*/
|
|
|
|
|
|
|
|
|
|
using TupleType = std::tuple<std::string, ComputationOpHandle *, int64_t>;
|
|
|
|
|
// place -> variable info (name, memory size, op, scope_idx)
|
|
|
|
|
std::map<platform::Place, std::vector<GCVarInfo>> place_to_vars;
|
|
|
|
|
|
|
|
|
|
std::unordered_map<size_t, std::vector<TupleType>> place_to_vars;
|
|
|
|
|
std::unordered_map<size_t, int64_t> total_memory_size;
|
|
|
|
|
// place -> total memory sizes
|
|
|
|
|
std::map<platform::Place, int64_t> place_to_size;
|
|
|
|
|
for (auto &op_vars_pair : lod_tensors) {
|
|
|
|
|
auto scope_idx = op_vars_pair.first->GetScopeIdx();
|
|
|
|
|
int64_t size = 0;
|
|
|
|
|
for (auto &var_name : op_vars_pair.second) {
|
|
|
|
|
auto var_size = GetNumel(vars, var_name, scope_idx);
|
|
|
|
|
size += std::abs(var_size);
|
|
|
|
|
place_to_vars[scope_idx].emplace_back(var_name, op_vars_pair.first,
|
|
|
|
|
var_size);
|
|
|
|
|
auto *op = op_vars_pair.first;
|
|
|
|
|
auto &var_names = op_vars_pair.second;
|
|
|
|
|
auto scope_idx = op->GetScopeIdx();
|
|
|
|
|
auto &place = places[scope_idx];
|
|
|
|
|
|
|
|
|
|
for (auto &var_name : var_names) {
|
|
|
|
|
auto var_size = GetMemorySize(vars[scope_idx], var_name);
|
|
|
|
|
GCVarInfo var_info(var_name, var_size, op, scope_idx);
|
|
|
|
|
place_to_size[place] += var_info.AbsMemorySize();
|
|
|
|
|
place_to_vars[place].emplace_back(std::move(var_info));
|
|
|
|
|
}
|
|
|
|
|
total_memory_size.emplace(scope_idx, size);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
for (auto &pair : place_to_vars) {
|
|
|
|
|
std::sort(pair.second.begin(), pair.second.end(),
|
|
|
|
|
[](const TupleType &t1, const TupleType &t2) {
|
|
|
|
|
return std::abs(std::get<2>(t1)) > std::abs(std::get<2>(t2));
|
|
|
|
|
/**
|
|
|
|
|
* Step 3: sort GCVarInfos, and only delete the largest variables.
|
|
|
|
|
*/
|
|
|
|
|
OpToVarNameSetMap partial_vars;
|
|
|
|
|
for (auto &place_to_var_pair : place_to_vars) {
|
|
|
|
|
auto &place = place_to_var_pair.first;
|
|
|
|
|
auto &gc_vars = place_to_var_pair.second;
|
|
|
|
|
std::sort(gc_vars.begin(), gc_vars.end(),
|
|
|
|
|
[](const GCVarInfo &var1, const GCVarInfo &var2) {
|
|
|
|
|
return var1.AbsMemorySize() > var2.AbsMemorySize();
|
|
|
|
|
});
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
OpToVarNameSetMap ret;
|
|
|
|
|
for (auto &pair : place_to_vars) {
|
|
|
|
|
auto desired_delete_size = static_cast<int64_t>(
|
|
|
|
|
fraction_of_memory_size * total_memory_size.at(pair.first));
|
|
|
|
|
int64_t cur_size = 0;
|
|
|
|
|
for (size_t i = 0; i < pair.second.size() && cur_size < desired_delete_size;
|
|
|
|
|
int64_t accumulated_size = 0;
|
|
|
|
|
int64_t size_threshold =
|
|
|
|
|
static_cast<int64_t>(fraction_of_memory_size * place_to_size[place]);
|
|
|
|
|
for (size_t i = 0; i < gc_vars.size() && accumulated_size < size_threshold;
|
|
|
|
|
++i) {
|
|
|
|
|
auto &var_name = std::get<0>(pair.second[i]);
|
|
|
|
|
auto *op = std::get<1>(pair.second[i]);
|
|
|
|
|
cur_size += std::get<2>(pair.second[i]);
|
|
|
|
|
ret[op].insert(var_name);
|
|
|
|
|
partial_vars[gc_vars[i].op_].insert(gc_vars[i].name_);
|
|
|
|
|
accumulated_size += gc_vars[i].AbsMemorySize();
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Step 4: Combine other vars (SelectedRows, LoDTensorArray)
|
|
|
|
|
*/
|
|
|
|
|
if (!delete_lod_tensor_only) {
|
|
|
|
|
for (auto &op_vars_pair : other_vars) {
|
|
|
|
|
for (auto &var_name : op_vars_pair.second) {
|
|
|
|
|
ret[op_vars_pair.first].insert(var_name);
|
|
|
|
|
}
|
|
|
|
|
partial_vars[op_vars_pair.first].insert(op_vars_pair.second.begin(),
|
|
|
|
|
op_vars_pair.second.end());
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
|
return partial_vars;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Graph pass derived from ir::Pass. It only overrides ApplyImpl, which takes
// ownership of the graph and hands back the resulting graph; the
// implementation is defined out of line.
class EagerDeletionPass : public ir::Pass {
 protected:
  // Consumes `graph` and returns the graph produced by this pass.
  std::unique_ptr<ir::Graph>
  ApplyImpl(std::unique_ptr<ir::Graph> graph) const override;
};
|
|
|
|
|
|
|
|
|
|
std::unique_ptr<ir::Graph> EagerDeletionPass::ApplyImpl(
|
|
|
|
|
std::unique_ptr<ir::Graph> graph) const {
|
|
|
|
|
auto &ref_cnts =
|
|
|
|
@ -166,9 +203,8 @@ std::unique_ptr<ir::Graph> EagerDeletionPass::ApplyImpl(
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
op_vars_map =
|
|
|
|
|
ShrinkGCVars(op_vars_map, vars, FLAGS_fraction_of_eager_deletion,
|
|
|
|
|
FLAGS_eager_delete_tensor_only);
|
|
|
|
|
op_vars_map = ShrinkGCVars(op_vars_map, vars, places,
|
|
|
|
|
FLAGS_memory_fraction_of_eager_deletion);
|
|
|
|
|
|
|
|
|
|
for (auto &pair : op_vars_map) {
|
|
|
|
|
auto *op = pair.first;
|
|
|
|
@ -200,12 +236,13 @@ std::unique_ptr<ir::Graph> EagerDeletionPass::ApplyImpl(
|
|
|
|
|
eager_deletion_op->AddOutput(dummy_leaf);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
VLOG(10) << "FLAGS_fraction_of_eager_deletion = "
|
|
|
|
|
<< FLAGS_fraction_of_eager_deletion;
|
|
|
|
|
VLOG(10) << "FLAGS_eager_delete_tensor_only = "
|
|
|
|
|
<< FLAGS_eager_delete_tensor_only;
|
|
|
|
|
VLOG(10) << "FLAGS_memory_fraction_of_eager_deletion = "
|
|
|
|
|
<< FLAGS_memory_fraction_of_eager_deletion;
|
|
|
|
|
VLOG(10) << "Create " << op_vars_map.size() << " EagerDeletionOpHandle(s)";
|
|
|
|
|
return graph;
|
|
|
|
|
|
|
|
|
|
auto while_op_eager_deletion_pass =
|
|
|
|
|
ir::PassRegistry::Instance().Get("while_op_eager_deletion_pass");
|
|
|
|
|
return while_op_eager_deletion_pass->Apply(std::move(graph));
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
} // namespace details
|
|
|
|
@ -218,3 +255,5 @@ REGISTER_PASS(eager_deletion_pass,
|
|
|
|
|
.RequirePassAttr(paddle::framework::details::kLastLiveOpsOfVars)
|
|
|
|
|
.RequirePassAttr(paddle::framework::details::kAllPlaces)
|
|
|
|
|
.RequirePassAttr(paddle::framework::details::kGarbageCollector);
|
|
|
|
|
|
|
|
|
|
USE_PASS(while_op_eager_deletion_pass);
|
|
|
|
|