squash commits. test=develop

Branch: revert-15296-async_double_buffered_py_reader
Author: dzhwinter (6 years ago)
Parent: 266e625d2e
Commit: 8f3b252392

@@ -200,6 +200,7 @@ cc_library(prune SRCS prune.cc DEPS framework_proto)
cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context)
cc_test(var_type_inference_test SRCS var_type_inference_test.cc DEPS op_registry
proto_desc)
cc_test(inplace_op_inference_test SRCS inplace_op_inference_test.cc DEPS op_registry proto_desc op_info)
cc_library(selected_rows SRCS selected_rows.cc DEPS tensor)
cc_test(selected_rows_test SRCS selected_rows_test.cc DEPS selected_rows)

@@ -50,7 +50,8 @@ cc_library(data_balance_op_handle SRCS data_balance_op_handle.cc DEPS op_handle_
cc_library(gather_op_handle SRCS gather_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor)
cc_library(fuse_vars_op_handle SRCS fuse_vars_op_handle.cc DEPS op_handle_base scope)
cc_library(memory_optimize_pass SRCS analysis_var_pass.cc memory_reuse_types.cc DEPS graph graph_helper pass)
cc_library(memory_optimize_pass SRCS memory_optimize_pass.cc memory_optimize_helper.cc DEPS graph graph_helper pass)
cc_library(inplace_op_pass SRCS inplace_op_pass.cc DEPS memory_optimize_pass op_info)
cc_library(modify_op_lock_and_record_event_pass SRCS modify_op_lock_and_record_event_pass.cc DEPS computation_op_handle op_graph_view multi_devices_helper)
cc_library(memory_early_delete_pass SRCS memory_early_delete_pass.cc DEPS memory_optimize_pass computation_op_handle scale_loss_grad_op_handle rpc_op_handle
all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle graph graph_helper pass)
@@ -65,12 +66,12 @@ cc_library(all_reduce_deps_pass SRCS all_reduce_deps_pass.cc DEPS graph graph_he
cc_library(multi_devices_graph_pass SRCS multi_devices_graph_pass.cc DEPS multi_devices_helper computation_op_handle
scale_loss_grad_op_handle rpc_op_handle all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle fused_broadcast_op_handle)
set(SSA_GRAPH_EXECUTOR_DEPS graph framework_proto sequential_execution_pass modify_op_lock_and_record_event_pass all_reduce_deps_pass reference_count_pass eager_deletion_pass memory_optimize_pass memory_early_delete_pass)
set(SSA_GRAPH_EXECUTOR_DEPS graph framework_proto sequential_execution_pass modify_op_lock_and_record_event_pass all_reduce_deps_pass reference_count_pass eager_deletion_pass memory_optimize_pass memory_early_delete_pass inplace_op_pass)
if (WITH_GPU)
list(APPEND SSA_GRAPH_EXECUTOR_DEPS reference_count_pass)
endif()
cc_test(memory_reuse_types_test SRCS memory_reuse_types_test.cc memory_reuse_types.cc DEPS framework_proto graph)
cc_test(analysis_var_pass_test SRCS analysis_var_pass_test.cc analysis_var_pass.cc memory_reuse_types.cc DEPS framework_proto graph graph_helper op_registry pass)
cc_test(memory_optimize_helper_test SRCS memory_optimize_helper_test.cc memory_optimize_helper.cc DEPS framework_proto graph)
cc_test(memory_optimize_pass_test SRCS memory_optimize_pass_test.cc memory_optimize_pass.cc memory_optimize_helper.cc DEPS framework_proto graph graph_helper op_registry pass)
cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ${SSA_GRAPH_EXECUTOR_DEPS})

@@ -17,7 +17,7 @@ limitations under the License. */
#include <glog/logging.h>
#include <memory>
#include "paddle/fluid/framework/details/memory_reuse_types.h"
#include "paddle/fluid/framework/details/memory_optimize_helper.h"
#include "paddle/fluid/framework/details/multi_devices_graph_pass.h"
#include "paddle/fluid/framework/details/multi_devices_graph_print_pass.h"
#include "paddle/fluid/framework/details/reduce_op_handle.h"
@@ -42,6 +42,9 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
public:
explicit ParallelExecutorPassBuilder(const BuildStrategy &strategy)
: ir::PassBuilder(), strategy_(strategy) {
if (strategy_.enable_inplace_) {
AppendPass("inplace_pass");
}
if (strategy_.enable_sequential_execution_) {
AppendPass("sequential_execution_pass");
}
@@ -87,7 +90,7 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
// A side-effect of that is that memory optimize cannot foresee the fetched vars,
// so the fetch list should be set persistable before calling the Run interface.
if (strategy.memory_optimize_) {
auto analysis_var_pass = AppendPass("analysis_var_pass");
auto memory_optimize_pass = AppendPass("memory_optimize_pass");
}
AppendMultiDevPass(strategy);
@@ -185,8 +188,7 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply(
pass->Erase("nccl_ctxs");
pass->SetNotOwned<platform::NCCLContextMap>("nccl_ctxs", nctx);
#endif
} else if (pass->Type() == "analysis_var_pass") {
} else if (pass->Type() == "memory_optimize_pass") {
const std::vector<OpDesc *> *all_op_descs =
new std::vector<OpDesc *>(main_program.Block(0).AllOps());
graph->Set<const std::vector<OpDesc *>>(kAllOpDescs,
@@ -213,6 +215,13 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply(
pass->Set<const std::vector<OpDesc *>>(
kAllOpDescs,
new std::vector<OpDesc *>(main_program.Block(0).AllOps()));
} else if (pass->Type() == "inplace_pass") {
if (graph->Has(kAllOpDescs)) {
graph->Erase(kAllOpDescs);
}
graph->Set<const std::vector<OpDesc *>>(
kAllOpDescs,
new std::vector<OpDesc *>(main_program.Block(0).AllOps()));
} else if (pass->Type() == "fuse_relu_depthwise_conv_pass") {
if (!use_cuda) {
LOG(WARNING) << "fuse_relu_depthwise_conv_pass is only supported on "
@@ -238,8 +247,9 @@ USE_PASS(allreduce_mode_multi_devices_pass);
USE_PASS(dist_multi_devices_pass);
USE_PASS(multi_devices_check_pass);
USE_PASS(multi_devices_print_pass);
USE_PASS(analysis_var_pass);
USE_PASS(memory_optimize_pass);
USE_PASS(sequential_execution_pass);
USE_PASS(all_reduce_deps_pass);
USE_PASS(modify_op_lock_and_record_event_pass);
USE_PASS(inplace_pass);
USE_PASS(lock_free_optimize_pass);

@@ -80,6 +80,8 @@ struct BuildStrategy {
bool memory_early_delete_{false};
bool enable_inplace_{false};
bool enable_sequential_execution_{false};
bool fuse_broadcast_op_{false};
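The new `enable_inplace_` switch sits beside the existing memory flags. A minimal hedged sketch of how a caller might set it; `MakeStrategy` is an illustrative helper, not part of this diff, and `ParallelExecutorPassBuilder` above appends "inplace_pass" when the flag is true:

```cpp
#include "paddle/fluid/framework/details/build_strategy.h"

// Hypothetical helper, for illustration only.
paddle::framework::details::BuildStrategy MakeStrategy() {
  paddle::framework::details::BuildStrategy strategy;
  strategy.enable_inplace_ = true;   // appends "inplace_pass" in the builder
  strategy.memory_optimize_ = true;  // appends the renamed "memory_optimize_pass"
  return strategy;
}
```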

(File diff suppressed because it is too large.)

@@ -0,0 +1,74 @@
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <map>
#include <string>
#include <unordered_set>
#include <vector>
#include "paddle/fluid/framework/details/memory_optimize_helper.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/pass.h"
namespace paddle {
namespace framework {
namespace details {
class GraphView {
public:
GraphView() = default;
void Build(ir::Graph* g);
const std::vector<ir::Node*> AllOps();
ir::Node* GetNodeByName(const std::string& name,
const std::vector<ir::Node*>& nodes) const;
std::vector<ir::Node*> PendingOpsOnVar(ir::Node* var);
bool OutConnectInputByCtrlVar(ir::Node* in_var, ir::Node* out_var);
private:
std::vector<ir::Node*> ops_;
};
class InplacePass : public ir::Pass {
public:
InplacePass();
protected:
std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override;
void InitSSAGraphNodes() const;
private:
void InplaceModifyVar(const std::string& in_var, const std::string& out_var,
const size_t& idx, ir::Graph* graph) const;
void InplaceModifyDesc(const std::string& in_var, const std::string& out_var,
const size_t& idx) const;
void TryInplaceOpInputOutput(ir::Node* op, ir::Graph* graph) const;
mutable std::map<std::string, std::vector<ir::Node*>> var_nodes_;
mutable std::unordered_set<std::string> whitelist_;
mutable GraphView view_;
};
} // namespace details
} // namespace framework
} // namespace paddle
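For orientation, a hedged sketch of how a registered pass like this is normally fetched and applied, assuming the usual `ir::PassRegistry` interface of this codebase; `RunInplacePass` is an illustrative wrapper, and the pass name matches the registration wired up in build_strategy.cc above:

```cpp
#include <memory>
#include <utility>
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/pass.h"

// Illustrative only.
std::unique_ptr<paddle::framework::ir::Graph> RunInplacePass(
    std::unique_ptr<paddle::framework::ir::Graph> graph) {
  auto pass = paddle::framework::ir::PassRegistry::Instance().Get("inplace_pass");
  return pass->Apply(std::move(graph));  // dispatches to InplacePass::ApplyImpl
}
```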

@@ -16,7 +16,7 @@
#include <queue>
#include <string>
#include <vector>
#include "paddle/fluid/framework/details/memory_reuse_types.h"
#include "paddle/fluid/framework/details/memory_optimize_helper.h"
#include "paddle/fluid/framework/details/multi_devices_helper.h"
#include "paddle/fluid/framework/details/reference_count_pass_helper.h"
#include "paddle/fluid/framework/ir/graph_helper.h"

@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/memory_reuse_types.h"
#include "paddle/fluid/framework/details/memory_optimize_helper.h"
#include <iostream>
#include <sstream>
#include <string>
@@ -83,7 +83,7 @@ struct NodeComparator {
}
};
void OrderedNodePairPool::Insert(ir::Node* var, ir::Node* op) {
void OrderedNodeList::Insert(ir::Node* var, ir::Node* op) {
PADDLE_ENFORCE(var->IsVar() && !var->IsCtrlVar());
PADDLE_ENFORCE(op->IsOp());
if (mark_table_.count(var->Name()) != 0) {
@@ -119,11 +119,11 @@ void OrderedNodePairPool::Insert(ir::Node* var, ir::Node* op) {
mark_table_[var->Name()] = it;
}
int OrderedNodePairPool::GetIndex(ir::Node* var) {
int OrderedNodeList::GetIndex(ir::Node* var) {
return std::distance(nodes_.begin(), mark_table_[var->Name()]);
}
ir::Node* OrderedNodePairPool::NodeMatch(ir::Node* var) const {
ir::Node* OrderedNodeList::NodeMatch(ir::Node* var) const {
ir::Node* found_node = nullptr;
NodeComparator compare_node;
@@ -136,13 +136,15 @@ ir::Node* OrderedNodePairPool::NodeMatch(ir::Node* var) const {
return found_node;
}
void OrderedNodePairPool::Erase(ir::Node* var) {
PADDLE_ENFORCE(mark_table_.count(var->Name()));
nodes_.erase(mark_table_[var->Name()]);
mark_table_.erase(var->Name());
void OrderedNodeList::Erase(ir::Node* var) { Erase(var->Name()); }
void OrderedNodeList::Erase(const std::string& var) {
PADDLE_ENFORCE(mark_table_.count(var));
nodes_.erase(mark_table_[var]);
mark_table_.erase(var);
}
std::string OrderedNodePairPool::ToString() const {
std::string OrderedNodeList::ToString() const {
std::stringstream ss;
for (auto it = nodes_.begin(); it != nodes_.end(); ++it) {
ss << DebugString(it->first) << " ";
@@ -150,6 +152,38 @@ std::string OrderedNodePairPool::ToString() const {
return ss.str();
}
bool NodeCanReused(ir::Node* node) {
if (node == nullptr || !node->IsVar() || node->IsCtrlVar()) return false;
auto* desc = node->Var();
auto type = desc->GetType();
if (desc->Persistable() || type != proto::VarType::LOD_TENSOR ||
desc->GetShape().empty()) {
return false;
}
// var names can be like @EMPTY@ or @LR_DECAY_REUSE_ID@, e.g. in while_grad
std::string name = node->Name();
if (!name.empty() && name[0] == '@' && name[name.size() - 1] == '@')
return false;
for (auto* op : node->inputs) {
if (op->Op()->HasAttr("force_cpu")) {
// the op's output is forced to be generated on CPU, so it cannot be reused.
return framework::AttrReader(op->Op()->GetAttrMap())
.Get<bool>("force_cpu") == 0;
}
}
return true;
}
bool OpHasSubBlock(OpDesc* desc) {
const AttributeMap& attrs = desc->GetAttrMap();
for (auto& attr : attrs) {
if (attr.second.type() == typeid(BlockDesc*) || // NOLINT
attr.second.type() == typeid(std::vector<BlockDesc*>)) // NOLINT
return true;
}
return false;
}
} // namespace details
} // namespace framework
} // namespace paddle
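A hedged sketch of the gating these two helpers support; `VisitOutputVar` and its arguments are illustrative, and the while/conditional_block remark follows the sub-block attribute check above:

```cpp
#include "paddle/fluid/framework/details/memory_optimize_helper.h"

// Illustrative only: how a reuse pass might consult the helpers above.
void VisitOutputVar(paddle::framework::ir::Node* op,
                    paddle::framework::ir::Node* out_var) {
  namespace details = paddle::framework::details;
  if (details::OpHasSubBlock(op->Op())) {
    // Ops carrying a BlockDesc attribute (while, conditional_block) need
    // sub-graph handling rather than plain output reuse.
    return;
  }
  if (details::NodeCanReused(out_var)) {
    // out_var is a non-persistable LoDTensor with a known shape and no
    // force_cpu producer: a legitimate reuse candidate.
  }
}
```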

@@ -43,7 +43,7 @@ using GraphNodePool = std::vector<
// For example,
// node0[-1, 1] node1[-1, 1, 1], node2[1,1], node3[1,1024], ..
// O(1) insert, delete
class OrderedNodePairPool {
class OrderedNodeList {
public:
using NodePair = std::pair<ir::Node*, std::unordered_set<ir::Node*>>;
using Iter = typename std::list<NodePair>::iterator;
@@ -53,8 +53,12 @@ class OrderedNodePairPool {
void Erase(ir::Node* var);
void Erase(const std::string& var);
bool Has(ir::Node* var) { return mark_table_.count(var->Name()); }
bool Has(const std::string& var) { return mark_table_.count(var); }
ir::Node* NodeMatch(ir::Node* var) const;
// the map stores non-const iterators, so constness cannot be promised
int GetIndex(ir::Node* var);
@@ -67,6 +71,11 @@ class OrderedNodePairPool {
ConstIter end() const { return nodes_.end(); }
size_t size() const { return nodes_.size(); }
void Clear() {
mark_table_.clear();
nodes_.clear();
}
private:
// for searching.
std::unordered_map<std::string, Iter> mark_table_;
@@ -74,14 +83,47 @@ class OrderedNodePairPool {
std::list<NodePair> nodes_;
};
// check whether a tensor can be reused
bool NodeCanReused(ir::Node* node);
// check whether the op has a sub-block
bool OpHasSubBlock(OpDesc* desc);
// node memory size in bytes
size_t NodeSizeInBytes(ir::Node* n);
std::string DebugString(ir::Node* var);
// std::string DebugString(VarDesc* var);
VarDesc* FindVarDescInBlock(ir::Node* n);
template <typename Container, typename Callback>
class FilterVariableImpl {
public:
void operator()(const Container& nodes, Callback callback) {
for (auto* node : nodes) {
callback(node);
}
}
};
// filter var nodes for op->inputs/outputs
template <typename Callback>
class FilterVariableImpl<std::vector<ir::Node*>, Callback> {
public:
void operator()(const std::vector<ir::Node*>& nodes, Callback callback) {
for (auto* var : nodes) {
if (var->IsVar() && !var->IsCtrlVar()) {
callback(var);
}
}
}
};
template <typename Container, typename Callback>
void FilterVariables(const Container& nodes, Callback callback) {
FilterVariableImpl<Container, Callback>()(nodes, callback);
}
} // namespace details
} // namespace framework
} // namespace paddle
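Putting the renamed pool and the relocated `FilterVariables` helper together, a hedged usage sketch; `SketchReuseStep` and its arguments are illustrative placeholders:

```cpp
#include "paddle/fluid/framework/details/memory_optimize_helper.h"

// Illustrative only: feed a pool and query it for a reusable node.
void SketchReuseStep(paddle::framework::ir::Node* op,
                     paddle::framework::ir::Node* candidate,
                     paddle::framework::details::OrderedNodeList* pool) {
  namespace details = paddle::framework::details;
  // Visit only real variable nodes; control-dependency vars are filtered out.
  details::FilterVariables(op->inputs, [&](paddle::framework::ir::Node* var) {
    if (details::NodeCanReused(var) && !pool->Has(var)) pool->Insert(var, op);
  });
  // Best-fit lookup, then retire the matched node before reusing its memory.
  if (paddle::framework::ir::Node* cache = pool->NodeMatch(candidate)) {
    pool->Erase(cache);
  }
}
```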

@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/memory_reuse_types.h"
#include "paddle/fluid/framework/details/memory_optimize_helper.h"
#include <algorithm>
#include <iostream>
#include <memory>
@@ -27,8 +27,8 @@ namespace paddle {
namespace framework {
namespace details {
TEST(OrderedNodePairPool, Normal) {
OrderedNodePairPool pool;
TEST(OrderedNodeList, Normal) {
OrderedNodeList pool;
std::vector<std::unique_ptr<ir::Node>> nodes;
// clang-format off

@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/analysis_var_pass.h"
#include "paddle/fluid/framework/details/memory_optimize_pass.h"
#include <algorithm>
#include <atomic>
#include <deque>
@@ -48,35 +48,7 @@ static inline bool IsSameDesc(OpDesc* op1, OpDesc* op2) {
op1->Outputs() == op2->Outputs();
}
template <typename Container, typename Callback>
class FilterVariableImpl {
public:
void operator()(const Container& nodes, Callback callback) {
for (auto* node : nodes) {
callback(node);
}
}
};
// filter var nodes for op->inputs/outputs
template <typename Callback>
class FilterVariableImpl<std::vector<ir::Node*>, Callback> {
public:
void operator()(const std::vector<ir::Node*>& nodes, Callback callback) {
for (auto* var : nodes) {
if (var->IsVar() && !var->IsCtrlVar()) {
callback(var);
}
}
}
};
template <typename Container, typename Callback>
void FilterVariables(const Container& nodes, Callback callback) {
FilterVariableImpl<Container, Callback>()(nodes, callback);
}
std::unique_ptr<ir::Graph> AnalysisVarPass::ApplyImpl(
std::unique_ptr<ir::Graph> MemoryOptimizePass::ApplyImpl(
std::unique_ptr<ir::Graph> graph) const {
auto nodes = graph->Nodes();
auto subblock_vars = GetSubBlockVars(nodes);
@@ -103,48 +75,53 @@ std::unique_ptr<ir::Graph> AnalysisVarPass::ApplyImpl(
}
for (auto& var : op->outputs) {
if (NodeCanReused(var) && cfg_->Use(op).count(var->Name()) == 0) {
ir::Node* cache = pool_.NodeMatch(var);
if (var->Name() == FLAGS_memory_optimize_debug) {
VLOG(3) << "start match var " << DebugString(var) << " of op "
<< op->Name();
VLOG(3) << pool_.ToString();
VLOG(3) << "matched in pool : "
<< ((cache == nullptr) ? "False" : "True");
}
if (cache != nullptr) {
if (var->Name() == cache->Name()) {
VLOG(3) << "The same cache variable is cascade reused."
<< var->Name() << " is re-filled to the pool after"
<< "the reused op is finished. Current op can not "
<< "replace it again. Skip this candidate.";
continue;
}
if (!NodeCanReused(var) || cfg_->Use(op).count(var->Name()) == 0 ||
skip_set_.count(var->Name()))
continue;
ir::Node* cache = pool_.NodeMatch(var);
if (var->Name() == FLAGS_memory_optimize_debug) {
VLOG(3) << "start match var " << DebugString(var) << " of op "
<< op->Name();
VLOG(3) << pool_.ToString();
VLOG(3) << "matched in pool : "
<< ((cache == nullptr) ? "False" : "True");
}
int node_idx_in_pool = pool_.GetIndex(cache);
VLOG(3) << string::Sprintf(
"!!! %s, %s => %s, cache idx %d, pool size %d",
std::to_string(reuse_id++), DebugString(var), DebugString(cache),
node_idx_in_pool, static_cast<int>(pool_.size()));
// update the CFG graph on the fly;
// the reused var may be re-filled into the pool
cfg_->RenameVarInCFGGraph(var->Name(), cache->Name(), idx);
// NOTE(dzhwinter): we need to update both the ProgramDesc
// and the IR graph, because op_desc/var_desc are used by CreateOp
// and CreateVar at run time, while the IR graph
// defines the dependence relationships between nodes.
RenameVarInGraphDesc(var->Name(), cache->Name(), idx);
RenameVarInGraphNode(var->Name(), cache->Name(), idx, graph.get());
pool_.Erase(cache);
if (cache == nullptr) continue;
if (var->Name() == cache->Name()) {
VLOG(3) << "The same cache variable is cascade reused." << var->Name()
<< " is re-filled to the pool after"
<< "the reused op is finished. Current op can not "
<< "replace it again. Skip this candidate.";
continue;
int node_idx_in_pool = pool_.GetIndex(cache);
VLOG(3) << string::Sprintf(
"!!! %s, %s => %s, cache idx %d, pool size %d",
std::to_string(reuse_id++), DebugString(var), DebugString(cache),
node_idx_in_pool, static_cast<int>(pool_.size()));
// update the CFG graph on the fly;
// the reused var may be re-filled into the pool
cfg_->RenameVarInCFGGraph(var->Name(), cache->Name(), idx);
// NOTE(dzhwinter): we need to update both the ProgramDesc
// and the IR graph, because op_desc/var_desc are used by CreateOp
// and CreateVar at run time, while the IR graph
// defines the dependence relationships between nodes.
RenameVarInGraphDesc(var->Name(), cache->Name(), idx);
RenameVarInGraphNode(var->Name(), cache->Name(), idx, graph.get());
pool_.Erase(cache);
}
// fill the pool
std::unordered_set<std::string> unlived_vars;
for (auto var : cfg_->LiveIn(op)) {
if (cfg_->LiveOut(op).count(var) == 0) {
unlived_vars.emplace(var);
}
}
}
// fill the pool
for (auto var : cfg_->LiveIn(op)) {
if (cfg_->LiveOut(op).count(var) == 0) {
for (auto var : unlived_vars) {
ir::Node* var_node = cfg_->GetNodeFromVarName(var, op);
if (var_node == nullptr) continue;
if (NodeCanReused(var_node) && !pool_.Has(var_node)) {
pool_.Insert(var_node, op);
}
@@ -177,7 +154,7 @@ std::unique_ptr<ir::Graph> AnalysisVarPass::ApplyImpl(
return graph;
}
void AnalysisVarPass::SubGraphOptimize(OpDesc* op_desc) const {
void MemoryOptimizePass::SubGraphOptimize(OpDesc* op_desc) const {
// conditional_block op, while op, and their grad ops
auto* sub_block_desc =
AttrReader(op_desc->GetAttrMap()).Get<BlockDesc*>("sub_block");
@@ -247,7 +224,7 @@ void AnalysisVarPass::SubGraphOptimize(OpDesc* op_desc) const {
}
}
std::unordered_set<std::string> AnalysisVarPass::GetSubBlockVars(
std::unordered_set<std::string> MemoryOptimizePass::GetSubBlockVars(
const std::unordered_set<ir::Node*>& nodes) const {
std::unordered_set<std::string> vars;
for (auto& op : nodes) {
@@ -263,9 +240,9 @@ std::unordered_set<std::string> AnalysisVarPass::GetSubBlockVars(
return vars;
}
void AnalysisVarPass::RenameVarInGraphDesc(const std::string& var,
const std::string& cache_var,
size_t idx) const {
void MemoryOptimizePass::RenameVarInGraphDesc(const std::string& var,
const std::string& cache_var,
size_t idx) const {
for (size_t i = idx; i < cfg_->Ops().size(); ++i) {
auto* op = cfg_->Ops()[i];
PADDLE_ENFORCE(op->IsOp() && op->Op());
@@ -277,7 +254,7 @@ void AnalysisVarPass::RenameVarInGraphDesc(const std::string& var,
}
}
void AnalysisVarPass::InitSSAGraphNodes() const {
void MemoryOptimizePass::InitSSAGraphNodes() const {
std::unordered_map<std::string, std::unordered_set<ir::Node*>> all_vars;
if (var_nodes_.empty()) {
for (auto* op : cfg_->Ops()) {
@@ -297,9 +274,10 @@ void AnalysisVarPass::InitSSAGraphNodes() const {
}
}
void AnalysisVarPass::RenameVarInGraphNode(const std::string& var,
const std::string& cache_var,
size_t idx, ir::Graph* graph) const {
void MemoryOptimizePass::RenameVarInGraphNode(const std::string& var,
const std::string& cache_var,
size_t idx,
ir::Graph* graph) const {
// if a replacement happens, we need to create a newer version of cache_var,
// but with the same dims/data_type as var.
PADDLE_ENFORCE(var_nodes_[var].size() >= 1 &&
@@ -358,39 +336,6 @@ void AnalysisVarPass::RenameVarInGraphNode(const std::string& var,
var_nodes_.at(var).clear();
}
bool AnalysisVarPass::NodeCanReused(ir::Node* node) const {
if (!node->IsVar() || node->IsCtrlVar()) return false;
auto* desc = node->Var();
auto type = desc->GetType();
if (desc->Persistable() || type != proto::VarType::LOD_TENSOR ||
desc->GetShape().empty()) {
return false;
}
// var names can be like @EMPTY@ or @LR_DECAY_REUSE_ID@, e.g. in while_grad
std::string name = node->Name();
if (!name.empty() && name[0] == '@' && name[name.size() - 1] == '@')
return false;
if (skip_set_.count(name)) return false;
for (auto* op : node->inputs) {
if (op->Op()->HasAttr("force_cpu")) {
// the op's output is forced to be generated on CPU, so it cannot be reused.
return framework::AttrReader(op->Op()->GetAttrMap())
.Get<bool>("force_cpu") == 0;
}
}
return true;
}
bool AnalysisVarPass::OpHasSubBlock(OpDesc* desc) const {
const AttributeMap& attrs = desc->GetAttrMap();
for (auto& attr : attrs) {
if (attr.second.type() == typeid(BlockDesc*) || // NOLINT
attr.second.type() == typeid(std::vector<BlockDesc*>)) // NOLINT
return true;
}
return false;
}
std::vector<ir::Node*> SortOpLikeDescOrder(const ir::Graph& graph) {
PADDLE_ENFORCE(graph.Has(kAllOpDescs),
"Graph has no attribute of kAllOpDescs.");
@@ -651,6 +596,7 @@ ir::Node* ControlFlowGraph::GetNodeFromVarName(const std::string& name,
} // namespace framework
} // namespace paddle
REGISTER_PASS(analysis_var_pass, paddle::framework::details::AnalysisVarPass)
REGISTER_PASS(memory_optimize_pass,
paddle::framework::details::MemoryOptimizePass)
.RequireGraphAttr(paddle::framework::details::kGraphNodePool)
.RequireGraphAttr(paddle::framework::details::kAllOpDescs);

@@ -25,7 +25,7 @@
#include <vector>
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/details/memory_reuse_types.h"
#include "paddle/fluid/framework/details/memory_optimize_helper.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/pass.h"
@@ -35,12 +35,10 @@ namespace details {
constexpr char kAllOpDescs[] = "all_op_descs";
std::vector<ir::Node*> SortOpLikeDescOrder(const ir::Graph& graph);
// sort ops in BFS order
std::vector<ir::Node*> BFSSortGraphOps(const ir::Graph& graph);
class ControlFlowGraph;
class AnalysisVarPass : public ir::Pass {
class MemoryOptimizePass : public ir::Pass {
protected:
std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override;
@@ -57,17 +55,13 @@ class AnalysisVarPass : public ir::Pass {
ir::Graph* graph) const;
void SubGraphOptimize(OpDesc* op_desc) const;
// check whether a tensor can be reused
bool NodeCanReused(ir::Node* node) const;
// scan subblock and collect the output/input variables.
std::unordered_set<std::string> GetSubBlockVars(
const std::unordered_set<ir::Node*>&) const;
// check whether the op has a sub-block
bool OpHasSubBlock(OpDesc* desc) const;
private:
// Reuse Node Pool, Owned.
mutable OrderedNodePairPool pool_;
mutable OrderedNodeList pool_;
// control-flow graph
mutable std::unique_ptr<ControlFlowGraph> cfg_;
// skip set

@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/analysis_var_pass.h"
#include "paddle/fluid/framework/details/memory_optimize_pass.h"
#include <algorithm>
#include <iostream>
#include <iterator>

@@ -18,6 +18,7 @@ limitations under the License. */
#include <tuple>
#include <vector>
#include "paddle/fluid/framework/grad_op_desc_maker.h"
#include "paddle/fluid/framework/inplace_op_inference.h"
#include "paddle/fluid/framework/op_info.h"
#include "paddle/fluid/framework/op_proto_maker.h"
#include "paddle/fluid/framework/operator.h"
@@ -32,7 +33,8 @@ enum OpInfoFillType {
kOpProtoAndCheckerMaker = 1,
kGradOpDescMaker = 2,
kVarTypeInference = 3,
kShapeInference = 4
kShapeInference = 4,
kInplaceOpInference = 5
};
template <typename T>
@@ -48,8 +50,11 @@ struct OpInfoFillTypeID {
? kVarTypeInference
: (std::is_base_of<InferShapeBase, T>::value
? kShapeInference
: static_cast<OpInfoFillType>(
-1)))));
: (std::is_base_of<
InplaceOpInference, T>::value
? kInplaceOpInference
: static_cast<OpInfoFillType>(
-1))))));
}
};
@@ -139,6 +144,16 @@ struct OpInfoFiller<T, kShapeInference> {
}
};
template <typename T>
struct OpInfoFiller<T, kInplaceOpInference> {
void operator()(const char* op_type, OpInfo* info) const {
info->infer_inplace_ = [](const OpDesc& op_desc, BlockDesc* block) {
T infer;
return infer(op_desc, block);
};
}
};
} // namespace details
} // namespace framework
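With the filler installed, `OpInfo::infer_inplace_` (declared in op_info.h further down) can be queried wherever an `OpInfo` is at hand. A hedged sketch, assuming the usual `OpInfoMap` accessor; `QueryInplacePairs` is illustrative:

```cpp
#include <string>
#include <unordered_map>
#include "paddle/fluid/framework/op_info.h"

// Illustrative only: invoke the functor installed by OpInfoFiller above.
std::unordered_map<std::string, std::string> QueryInplacePairs(
    const paddle::framework::OpDesc& op_desc,
    paddle::framework::BlockDesc* block) {
  auto& info = paddle::framework::OpInfoMap::Instance().Get(op_desc.Type());
  if (info.infer_inplace_) {
    return info.infer_inplace_(op_desc, block);  // e.g. {"X" -> "Out"}
  }
  return {};
}
```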

@@ -0,0 +1,135 @@
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <functional>
#include <numeric>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include "glog/logging.h"
#include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/framework/op_desc.h"
#include "paddle/fluid/framework/type_defs.h"
namespace paddle {
namespace framework {
/*
Inplace inference creates In->Out pairs for an in-place operator.
If we specify a pair of corresponding names, for example X->Out,
then Out will reuse X's memory in place. The base class performs
legality validation for both variables.
*/
class InplaceOpInference {
public:
virtual ~InplaceOpInference() {}
virtual std::unordered_map<std::string, std::string> operator()(
const OpDesc& op_desc, BlockDesc* block) const = 0;
};
class InplaceInToOut : public InplaceOpInference {
public:
std::unordered_map<std::string, std::string> operator()(
const OpDesc& op_desc, BlockDesc* block) const {
std::unordered_map<std::string, std::string> ret;
auto in_out_var_names_pair = this->Apply(op_desc, block);
for (auto& pair : in_out_var_names_pair) {
PADDLE_ENFORCE(!op_desc.Input(pair.first).empty(),
string::Sprintf("op %s does not have input %s!",
op_desc.Type(), pair.first));
PADDLE_ENFORCE(!op_desc.Output(pair.second).empty(),
string::Sprintf("op %s does not have output %s!",
op_desc.Type(), pair.second));
auto& in_name = op_desc.Input(pair.first).at(0);
auto& out_name = op_desc.Output(pair.second).at(0);
auto in = block->FindRecursiveOrCreateVar(in_name);
auto out = block->FindRecursiveOrCreateVar(out_name);
if (TryInplaceInputOutput(in, out)) ret.insert({in_name, out_name});
}
return ret;
}
protected:
virtual std::unordered_map<std::string, std::string> Apply(
const OpDesc& op_desc, BlockDesc* block) const = 0;
bool TryInplaceInputOutput(const VarDesc& in, const VarDesc& out) const {
auto var_can_reused = [&](const VarDesc& node) -> bool {
auto type = node.GetType();
if (node.Persistable() || type != proto::VarType::LOD_TENSOR ||
node.GetShape().empty()) {
return false;
}
// var names can be like @EMPTY@ or @LR_DECAY_REUSE_ID@, e.g. in while_grad
std::string name = node.Name();
if (!name.empty() && name[0] == '@' && name[name.size() - 1] == '@')
return false;
return true;
};
auto var_size_in_bytes = [&](const VarDesc& node) -> size_t {
auto shape = node.GetShape();
int size = std::accumulate(shape.begin(), shape.end(), 1,
std::multiplies<int>());
size_t type_size = SizeOfType(node.GetDataType());
return type_size * std::abs(size);
};
return in.Name() != out.Name() && var_can_reused(in) &&
var_can_reused(out) &&
var_size_in_bytes(out) <= var_size_in_bytes(in);
}
};
/*
Inplace In and Out for operators that have only one input and one output,
for example activation ops.
*/
class SingleOpInplaceInToOut : public InplaceInToOut {
protected:
std::unordered_map<std::string, std::string> Apply(
const OpDesc& op_desc, BlockDesc* block) const override {
PADDLE_ENFORCE(!op_desc.InputNames().empty(),
"Op inputs must not be empty");
PADDLE_ENFORCE(!op_desc.OutputNames().empty(),
"Op outputs must not be empty");
auto x_name = op_desc.InputNames().at(0);
auto out_name = op_desc.OutputNames().at(0);
return std::unordered_map<std::string, std::string>{{x_name, out_name}};
}
};
/*
Gradient op: an output reuses its corresponding input's memory in place,
e.g. the Input->Input@Grad reuse strategy.
*/
class GradOpInplaceInToOut : public InplaceInToOut {
protected:
std::unordered_map<std::string, std::string> Apply(
const OpDesc& op_desc, BlockDesc* block) const override {
std::unordered_map<std::string, std::string> ret;
std::unordered_set<std::string> output_names(op_desc.OutputNames().begin(),
op_desc.OutputNames().end());
for (auto& input_name : op_desc.InputNames()) {
if (output_names.count(GradVarName(input_name))) {
ret.insert({input_name, GradVarName(input_name)});
}
}
return ret;
}
};
} // namespace framework
} // namespace paddle
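A hedged example of what the two stock rules yield when queried directly; `DemoInplaceRules` is illustrative, and `op_desc`/`block` would come from the program being optimized:

```cpp
#include "paddle/fluid/framework/inplace_op_inference.h"

// Illustrative only.
void DemoInplaceRules(const paddle::framework::OpDesc& op_desc,
                      paddle::framework::BlockDesc* block) {
  // Forward rule: for e.g. a relu op, maps the first input var to the first
  // output var, provided both survive TryInplaceInputOutput's checks.
  paddle::framework::SingleOpInplaceInToOut single;
  auto fwd = single(op_desc, block);

  // Grad rule: each forward input X whose gradient X@GRAD is among the
  // outputs yields the candidate pair {X -> X@GRAD}.
  paddle::framework::GradOpInplaceInToOut grad;
  auto bwd = grad(op_desc, block);
  (void)fwd;
  (void)bwd;
}
```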

(File diff suppressed because it is too large.)

@@ -14,6 +14,7 @@ limitations under the License. */
#pragma once
#include <functional>
#include <string>
#include <typeindex>
#include <typeinfo>

@@ -38,6 +38,7 @@ struct OpInfo {
OpAttrChecker* checker_{nullptr};
InferVarTypeFN infer_var_type_;
InferShapeFN infer_shape_;
InferInplaceOpFN infer_inplace_;
bool HasOpProtoAndChecker() const {
return proto_ != nullptr && checker_ != nullptr;

@@ -57,5 +57,8 @@ using InferVarTypeFN =
using InferShapeFN = std::function<void(InferShapeContext*)>;
using InplacePair = std::unordered_map<std::string, std::string>;
using InferInplaceOpFN = std::function<InplacePair(const OpDesc&, BlockDesc*)>;
} // namespace framework
} // namespace paddle

@@ -547,12 +547,14 @@ namespace ops = paddle::operators;
__macro(Swish, swish); \
__macro(ThresholdedRelu, thresholded_relu);
#define REGISTER_INPLACE_ACTIVATION_OP(OP_NAME, KERNEL_TYPE) \
REGISTER_OPERATOR(KERNEL_TYPE, ::paddle::operators::ActivationOp, \
::paddle::operators::OP_NAME##OpMaker, \
::paddle::operators::ActivationOpInferVarType, \
::paddle::operators::OP_NAME##GradMaker); \
REGISTER_OPERATOR(KERNEL_TYPE##_grad, ::paddle::operators::ActivationOpGrad)
#define REGISTER_INPLACE_ACTIVATION_OP(OP_NAME, KERNEL_TYPE) \
REGISTER_OPERATOR(KERNEL_TYPE, ::paddle::operators::ActivationOp, \
::paddle::operators::OP_NAME##OpMaker, \
::paddle::operators::ActivationOpInferVarType, \
::paddle::operators::OP_NAME##GradMaker, \
::paddle::framework::SingleOpInplaceInToOut); \
REGISTER_OPERATOR(KERNEL_TYPE##_grad, ::paddle::operators::ActivationOpGrad, \
::paddle::framework::SingleOpInplaceInToOut)
#define REGISTER_ACTIVATION_OP(OP_NAME, KERNEL_TYPE) \
REGISTER_OPERATOR(KERNEL_TYPE, ::paddle::operators::ActivationOp, \

@@ -602,13 +602,48 @@ class BatchNormGradMaker : public framework::SingleGradOpDescMaker {
}
};
class BatchNormInplaceInToOut : public framework::InplaceInToOut {
public:
using InplaceInToOut::InplaceInToOut;
protected:
std::unordered_map<std::string, std::string> Apply(
const framework::OpDesc &op_desc,
framework::BlockDesc *block) const override {
std::unordered_map<std::string, std::string> inplace_in_to_out = {
{"Mean", "MeanOut"}, {"Variance", "VarianceOut"}, {"X", "Y"},
};
return inplace_in_to_out;
}
};
class BatchNormGradInplaceInToOut : public framework::InplaceInToOut {
public:
using InplaceInToOut::InplaceInToOut;
protected:
std::unordered_map<std::string, std::string> Apply(
const framework::OpDesc &op_desc,
framework::BlockDesc *block) const override {
std::unordered_map<std::string, std::string> inplace_in_to_out = {
// Scale, Bias, SavedMean, SavedVariance all have shape [C]
{framework::GradVarName("Y"), framework::GradVarName("X")},
{"SavedMean", framework::GradVarName("Scale")},
{"SavedVariance", framework::GradVarName("Bias")},
};
return inplace_in_to_out;
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(batch_norm, ops::BatchNormOp, ops::BatchNormOpMaker,
ops::BatchNormOpInferVarType, ops::BatchNormGradMaker);
REGISTER_OPERATOR(batch_norm_grad, ops::BatchNormGradOp);
ops::BatchNormOpInferVarType, ops::BatchNormGradMaker,
ops::BatchNormInplaceInToOut);
REGISTER_OPERATOR(batch_norm_grad, ops::BatchNormGradOp,
ops::BatchNormGradInplaceInToOut);
REGISTER_OP_CPU_KERNEL(
batch_norm, ops::BatchNormKernel<paddle::platform::CPUDeviceContext, float>,

@@ -18,6 +18,7 @@ namespace ops = paddle::operators;
REGISTER_ELEMWISE_GRAD_MAKER(elementwise_add, Add);
REGISTER_ELEMWISE_EXPLICIT_OP(elementwise_add, "Add", "Out = X + Y", "Out",
"X");
REGISTER_OP_CPU_KERNEL(
elementwise_add,
ops::ElementwiseAddKernel<paddle::platform::CPUDeviceContext, float>,

@@ -250,6 +250,20 @@ class ElemwiseGradKernel : public framework::OpKernel<T> {
}
};
class ElementwiseOpInplace : public framework::InplaceInToOut {
public:
using framework::InplaceInToOut::InplaceInToOut;
protected:
std::unordered_map<std::string, std::string> Apply(
const framework::OpDesc &op_desc,
framework::BlockDesc *block) const override {
return std::unordered_map<std::string, std::string>{
{"X", "Out"},
};
}
};
} // namespace operators
} // namespace paddle
@@ -299,6 +313,7 @@ class ElemwiseGradKernel : public framework::OpKernel<T> {
REGISTER_OPERATOR(op_type, ::paddle::operators::ElementwiseOp, \
__ElemwiseOp##op_type##Maker__, \
::paddle::operators::ElementwiseOpInferVarType, \
op_type##GradMaker); \
op_type##GradMaker, \
::paddle::operators::ElementwiseOpInplace); \
REGISTER_OPERATOR(op_type##_grad, \
::paddle::operators::ElementwiseOpExplicitGrad)
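For reference, a hedged expansion of the updated macro for `op_type = elementwise_add` (token pasting per `__ElemwiseOp##op_type##Maker__` and `op_type##GradMaker`; the pasted names are shown approximately, not copied from the diff):

```cpp
// Approximate expansion, for illustration only.
REGISTER_OPERATOR(elementwise_add, ::paddle::operators::ElementwiseOp,
                  __ElemwiseOpelementwise_addMaker__,
                  ::paddle::operators::ElementwiseOpInferVarType,
                  elementwise_addGradMaker,
                  ::paddle::operators::ElementwiseOpInplace);
REGISTER_OPERATOR(elementwise_add_grad,
                  ::paddle::operators::ElementwiseOpExplicitGrad);
```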

@@ -267,6 +267,35 @@ class Flatten2GradOp : public framework::OperatorBase {
}
};
class FlattenOpInplaceInToOut : public framework::InplaceInToOut {
public:
using InplaceInToOut::InplaceInToOut;
protected:
std::unordered_map<std::string, std::string> Apply(
const framework::OpDesc &op_desc,
framework::BlockDesc *block) const override {
std::unordered_map<std::string, std::string> inplace_in_to_out = {
{"X", "Out"},
};
return inplace_in_to_out;
}
};
class FlattenGradInplaceInToOut : public framework::InplaceInToOut {
public:
using InplaceInToOut::InplaceInToOut;
protected:
std::unordered_map<std::string, std::string> Apply(
const framework::OpDesc &op_desc,
framework::BlockDesc *block) const override {
std::unordered_map<std::string, std::string> inplace_in_to_out = {
{framework::GradVarName("Out"), framework::GradVarName("X")},
};
return inplace_in_to_out;
}
};
} // namespace operators
} // namespace paddle
@@ -275,10 +304,13 @@ USE_OP(reshape);
namespace ops = paddle::operators;
REGISTER_OPERATOR(flatten, ops::FlattenOp, ops::FlattenOpMaker,
ops::FlattenOpInferShape,
paddle::framework::DefaultGradOpDescMaker<true>);
REGISTER_OPERATOR(flatten_grad, ops::FlattenGradOp, ops::FlattenGradInferShape);
paddle::framework::DefaultGradOpDescMaker<true>,
ops::FlattenOpInplaceInToOut);
REGISTER_OPERATOR(flatten_grad, ops::FlattenGradOp, ops::FlattenGradInferShape,
ops::FlattenGradInplaceInToOut);
REGISTER_OPERATOR(flatten2, ops::Flatten2Op, ops::Flatten2OpMaker,
ops::Flatten2OpInferShape, ops::Flatten2GradOpMaker);
ops::Flatten2OpInferShape, ops::Flatten2GradOpMaker,
ops::FlattenOpInplaceInToOut);
REGISTER_OPERATOR(flatten2_grad, ops::Flatten2GradOp,
ops::Flatten2GradInferShape);
ops::Flatten2GradInferShape, ops::FlattenGradInplaceInToOut);

@@ -327,13 +327,44 @@ class Reshape2GradOp : public framework::OperatorWithKernel {
}
};
class ReshapeOpInplaceInToOut : public framework::InplaceInToOut {
public:
using InplaceInToOut::InplaceInToOut;
protected:
std::unordered_map<std::string, std::string> Apply(
const framework::OpDesc &op_desc,
framework::BlockDesc *block) const override {
std::unordered_map<std::string, std::string> inplace_in_to_out = {
{"X", "Out"},
};
return inplace_in_to_out;
}
};
class ReshapeGradInplaceInToOut : public framework::InplaceInToOut {
public:
using InplaceInToOut::InplaceInToOut;
protected:
std::unordered_map<std::string, std::string> Apply(
const framework::OpDesc &op_desc,
framework::BlockDesc *block) const override {
std::unordered_map<std::string, std::string> inplace_in_to_out = {
{framework::GradVarName("Out"), framework::GradVarName("X")},
};
return inplace_in_to_out;
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(reshape, ops::ReshapeOp, ops::ReshapeOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>);
REGISTER_OPERATOR(reshape_grad, ops::ReshapeGradOp);
paddle::framework::DefaultGradOpDescMaker<true>,
ops::ReshapeOpInplaceInToOut);
REGISTER_OPERATOR(reshape_grad, ops::ReshapeGradOp,
ops::ReshapeGradInplaceInToOut);
REGISTER_OP_CPU_KERNEL_FUNCTOR(reshape, float, ops::ReshapeKernel, double,
ops::ReshapeKernel, int, ops::ReshapeKernel,
int64_t, ops::ReshapeKernel);
@@ -343,8 +374,9 @@ REGISTER_OP_CPU_KERNEL_FUNCTOR(reshape_grad, float, ops::ReshapeGradKernel,
ops::ReshapeGradKernel);
REGISTER_OPERATOR(reshape2, ops::Reshape2Op, ops::Reshape2OpMaker,
ops::Reshape2GradMaker);
REGISTER_OPERATOR(reshape2_grad, ops::Reshape2GradOp);
ops::Reshape2GradMaker, ops::ReshapeOpInplaceInToOut);
REGISTER_OPERATOR(reshape2_grad, ops::Reshape2GradOp,
ops::ReshapeGradInplaceInToOut);
REGISTER_OP_CPU_KERNEL_FUNCTOR(reshape2, float, ops::ReshapeKernel, double,
ops::ReshapeKernel, int, ops::ReshapeKernel,
int64_t, ops::ReshapeKernel);

(Some files were not shown because too many files have changed in this diff.)
