Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into fix_shape_api_doc

test=develop
Branch: revert-15774-anakin_subgraph_engine
Author: chengduozh · 7 years ago
Commit: d79d2f686c

@@ -50,7 +50,12 @@ cc_library(data_balance_op_handle SRCS data_balance_op_handle.cc DEPS op_handle_
 cc_library(gather_op_handle SRCS gather_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor)
 cc_library(fuse_vars_op_handle SRCS fuse_vars_op_handle.cc DEPS op_handle_base scope)
-cc_library(memory_optimize_helper SRCS memory_optimize_helper.cc DEPS graph graph_helper)
+if(WITH_GPU)
+  cc_library(memory_optimize_helper SRCS memory_optimize_helper.cc DEPS graph graph_helper gpu_info)
+else()
+  cc_library(memory_optimize_helper SRCS memory_optimize_helper.cc DEPS graph graph_helper cpu_info)
+endif()
 cc_library(memory_optimize_pass SRCS memory_optimize_pass.cc DEPS memory_optimize_helper pass)
 cc_library(inplace_op_pass SRCS inplace_op_pass.cc DEPS memory_optimize_pass op_info)
 cc_library(modify_op_lock_and_record_event_pass SRCS modify_op_lock_and_record_event_pass.cc DEPS computation_op_handle op_graph_view multi_devices_helper)

@@ -240,7 +240,9 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply(
         continue;
       }
     }
+    VLOG(3) << "Start Apply Pass " << pass->Type();
     graph = pass->Apply(std::move(graph));
+    VLOG(3) << "Finish Apply Pass " << pass->Type();
   }
   return graph;
 }

@@ -49,7 +49,7 @@ DEFINE_bool(
     "If this option turns on, only these op in whitelist can be inplaced."
     "If it turns off, all of the running op can be candidate of inplaced op."
     "Such as scale, elementwise_add"
-    "By default, it's turned on");
+    "By default, it's turned off");

DECLARE_string(memory_optimize_debug);

@@ -13,13 +13,19 @@
 // limitations under the License.

 #include "paddle/fluid/framework/details/memory_optimize_helper.h"
+#include <algorithm>
 #include <deque>
 #include <functional>
-#include <iostream>
+#include <iterator>
 #include <numeric>
 #include <sstream>
 #include <string>
 #include "paddle/fluid/framework/var_desc.h"
+#include "paddle/fluid/platform/cpu_info.h"
+#ifdef PADDLE_WITH_CUDA
+#include "paddle/fluid/platform/gpu_info.h"
+#endif  // PADDLE_WITH_CUDA

 namespace paddle {
 namespace framework {
@@ -166,6 +172,11 @@ struct NodeComparator {
   bool operator()(ir::Node* lhs, ir::Node* rhs) const {
     auto* lhs_desc = FindVarDescInBlock(lhs);
     auto* rhs_desc = FindVarDescInBlock(rhs);
+    // match data type
+    if (lhs_desc->GetDataType() != rhs_desc->GetDataType()) {
+      return false;
+    }
+    // match shape
     auto lhs_shape = lhs_desc->GetShape();
     auto rhs_shape = rhs_desc->GetShape();
     if ((lhs_shape[0] == -1 && rhs_shape[0] == -1) ||
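Why the comparator now checks dtype before shape: two variables with identical shapes still differ in byte footprint per element, so a shape-only match could hand a narrower buffer to a wider type. A tiny standalone illustration:

```cpp
// Why NodeComparator must match dtype before shape: the same shape
// {128, 128} implies different byte counts for different element types,
// so a shape-only match could pair a float buffer with a double tensor.
#include <iostream>

int main() {
  const int elems = 128 * 128;
  std::cout << "fp32 bytes: " << elems * sizeof(float) << "\n";   // 65536
  std::cout << "fp64 bytes: " << elems * sizeof(double) << "\n";  // 131072
}
```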
@@ -230,6 +241,27 @@ ir::Node* OrderedSet::FindBestFitNode(ir::Node* var) const {
   return found_node;
 }

+ir::Node* OrderedSet::FindNextBestFitNode(ir::Node* var, ir::Node* prev) const {
+  ir::Node* found_node = nullptr;
+  NodeComparator functor;
+  auto it =
+      std::find_if(nodes_.begin(), nodes_.end(), [&](const NodeVector& v) {
+        if (v.front() == prev)
+          return true;
+        else
+          return false;
+      });
+  PADDLE_ENFORCE(it != nodes_.end(), "Not found previous in node list!");
+  for (it = std::next(it); it != nodes_.end(); ++it) {
+    auto& candidate = it->front();
+    if (functor(var, candidate)) {
+      found_node = candidate;
+      break;
+    }
+  }
+  return found_node;
+}
+
 bool OrderedSet::Has(ir::Node* var) const {
   if (mark_table_.count(var->Name())) {
     auto& node_in_samename = mark_table_.at(var->Name());
@@ -241,10 +273,15 @@ bool OrderedSet::Has(ir::Node* var) const {
   return false;
 }

+void OrderedSet::Erase(const std::string& var) {
+  PADDLE_ENFORCE(mark_table_.count(var));
+  nodes_.erase(mark_table_[var]);
+  mark_table_.erase(var);
+}
+
 void OrderedSet::Erase(ir::Node* var) {
-  PADDLE_ENFORCE(mark_table_.count(var->Name()));
-  nodes_.erase(mark_table_[var->Name()]);
-  mark_table_.erase(var->Name());
+  PADDLE_ENFORCE(var != nullptr);
+  Erase(var->Name());
 }

 std::string OrderedSet::ToString() const {
@@ -274,14 +311,35 @@ bool NodeCanReused(ir::Node* node) {
   return flag;
 }

+int MinChunkSize() {
+  int size{0};
+#ifdef PADDLE_WITH_CUDA
+  size = platform::GpuMinChunkSize();
+#else
+  size = platform::CpuMinChunkSize();
+#endif  // PADDLE_WITH_CUDA
+  return size;
+}
+
 bool NodeCanReused(const VarDesc& node) {
   auto type = node.GetType();
+  // only these types holds bulk of gpu memory
   if (!(type == proto::VarType::LOD_TENSOR ||
         type == proto::VarType::SELECTED_ROWS ||
         type == proto::VarType::LOD_TENSOR_ARRAY)) {
     return false;
   }
-  if (node.Persistable() || node.GetShape().empty()) {
+  // persistable variable is parameter
+  if (node.Persistable()) {
+    return false;
+  }
+  // shape < min_chunk_size is meaningless.
+  // further more, fetched loss always has size = 1
+  // which should not be reused.
+  auto shape = node.GetShape();
+  int size = std::abs(
+      std::accumulate(shape.begin(), shape.end(), 1, std::multiplies<int>()));
+  if (shape.empty() || size < MinChunkSize()) {
     return false;
   }
   // vars can be @EMPTY@, @LR_DECAY_REUSE_ID@. For example, while_grad
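The new size gate is easy to misread at a glance: shapes may carry a -1 placeholder for a dynamic batch dimension, and std::abs folds that sign away so only the static extents are compared against the allocator's minimum chunk. A minimal standalone sketch (the 4096 threshold is an assumed stand-in, not the value platform::CpuMinChunkSize()/GpuMinChunkSize() actually return):

```cpp
// Standalone sketch of the size gate in NodeCanReused above. The -1 models
// a dynamic batch dimension; std::abs drops its sign so only the static
// extents count. kAssumedMinChunkSize is an illustrative stand-in.
#include <cstdlib>
#include <functional>
#include <iostream>
#include <numeric>
#include <vector>

int main() {
  const int kAssumedMinChunkSize = 4096;  // stand-in threshold
  std::vector<int> shape = {-1, 32, 32};  // dynamic batch, 32x32 feature map
  int size = std::abs(
      std::accumulate(shape.begin(), shape.end(), 1, std::multiplies<int>()));
  std::cout << "element count: " << size << "\n";                      // 1024
  std::cout << "reusable: " << (size >= kAssumedMinChunkSize) << "\n";  // 0
}
```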
@@ -461,7 +519,9 @@ ir::Node* ControlFlowGraph::GetNodeByName(const std::string& name,
   for (auto* node : ops_) {
     if (node == op) break;
     for (auto& output : node->outputs) {
-      if (output->Name() == name) {
+      PADDLE_ENFORCE((output != nullptr && output->IsVar()),
+                     "Output is empty!");
+      if (output->Var() && output->Name() == name) {
         found_node = output;
       }
     }

@@ -55,6 +55,7 @@ class OrderedSet {
   void Insert(ir::Node* var);
   void Erase(ir::Node* var);
+  void Erase(const std::string& var);
   bool Has(ir::Node* var) const;
   void Clear() {
     mark_table_.clear();
@@ -62,6 +63,7 @@ class OrderedSet {
   }
   // find the bestfit shape node block with var.
   ir::Node* FindBestFitNode(ir::Node* var) const;
+  ir::Node* FindNextBestFitNode(ir::Node* var, ir::Node* prev) const;
   // map store non-const iterator, can not promise const
   int GetNodeIndexInPool(ir::Node* var);
   // pool all node to string

@@ -107,6 +107,52 @@ TEST(OrderedSet, Normal) {
     ASSERT_EQ(pool.GetNodeIndexInPool(cache), 5);  // match 4:[5,2]
   }
 }

+TEST(OrderedSet, FindBestFitNode) {
+  OrderedSet pool;
+  std::vector<std::unique_ptr<ir::Node>> nodes;
+  ProgramDesc prog;
+  BlockDesc* block_desc = prog.MutableBlock(0);
+  auto* op_desc = block_desc->AppendOp();
+  op_desc->SetType("dummy");
+  std::unique_ptr<ir::Node> op = ir::CreateNodeForTest(op_desc);
+
+  {
+    auto desc = block_desc->Var("a");
+    desc->SetShape({128, 128});
+    std::unique_ptr<ir::Node> node = ir::CreateNodeForTest(desc);
+    node->inputs.emplace_back(op.get());
+    nodes.emplace_back(std::move(node));
+  }
+  {
+    auto desc = block_desc->Var("b");
+    desc->SetShape({128, 129});
+    std::unique_ptr<ir::Node> node = ir::CreateNodeForTest(desc);
+    node->inputs.emplace_back(op.get());
+    nodes.emplace_back(std::move(node));
+  }
+  {
+    auto desc = block_desc->Var("c");
+    desc->SetShape({128, 128});
+    std::unique_ptr<ir::Node> node = ir::CreateNodeForTest(desc);
+    node->inputs.emplace_back(op.get());
+    nodes.emplace_back(std::move(node));
+  }
+
+  for (auto& node : nodes) {
+    pool.Insert(node.get());
+  }
+
+  // FindNextBestFitNode
+  auto* n = nodes[0].get();
+  auto* cache = pool.FindBestFitNode(n);
+  PADDLE_ENFORCE(cache->Name() == "a");
+  cache = pool.FindNextBestFitNode(n, cache);
+  PADDLE_ENFORCE(cache->Name() == "c");
+  cache = pool.FindNextBestFitNode(n, cache);
+  PADDLE_ENFORCE(cache->Name() == "b");
+}
+
 }  // namespace details
 }  // namespace framework
 }  // namespace paddle

@@ -69,55 +69,59 @@ std::unique_ptr<ir::Graph> MemoryOptimizePass::ApplyImpl(
       }

       for (auto& var : op->outputs) {
-        if (!NodeCanReused(var) || cfg_->Use(op).count(var->Name()) == 0 ||
-            skip_set_.count(var->Name()))
+        if (var->IsVar() && !var->IsCtrlVar() && skip_set_.count(var->Name())) {
+          VLOG(3) << "Skip set contains variable of " << var->Name()
+                  << "disable reuse on it. skipped";
           continue;
-        ir::Node* cache = pool_.FindBestFitNode(var);
-        if (var->Name() == FLAGS_memory_optimize_debug) {
-          VLOG(3) << "start match var " << DebugString(var) << " of op "
-                  << op->Name();
-          VLOG(3) << pool_.ToString();
-          VLOG(3) << "matched in pool : "
-                  << ((cache == nullptr) ? "False" : "True");
         }
-
-        if (cache == nullptr) continue;
-        if (var->Name() == cache->Name()) {
-          VLOG(3) << "The same cache variable is cascade reused." << var->Name()
-                  << " is re-filled to the pool after"
-                  << "the reused op is finished. Current op can not "
-                  << "replace it again. Skip this candidate.";
-          continue;
-        }
-
-        int node_idx_in_pool = pool_.GetNodeIndexInPool(cache);
-        VLOG(3) << string::Sprintf(
-            "!!! %s, %s => %s, cache idx %d, pool size %d",
-            std::to_string(reuse_id++), DebugString(var), DebugString(cache),
-            node_idx_in_pool, static_cast<int>(pool_.size()));
-
-        // update CFG Graph on the fly.
-        // reused var maybe re-fill into the pool
-        cfg_->RenameVarInCFGGraph(var->Name(), cache->Name(), idx);
-        // NOTE(dzhwinter): we need to both update the ProgramDesc
-        // and IR Graph. because op_desc/var_desc is used in CreateOp,
-        // CreateVar when running happens. But IR Graph
-        // define the dependence relationship between nodes.
-        RenameVarInGraphDesc(var->Name(), cache->Name(), idx);
-        RenameVarInGraphNode(var->Name(), cache->Name(), idx, graph.get());
-        pool_.Erase(cache);
-      }
-      // fill the pool
-      std::unordered_set<std::string> unlived_vars;
-      for (auto var : cfg_->LiveIn(op)) {
-        if (cfg_->LiveOut(op).count(var) == 0) {
-          unlived_vars.emplace(var);
+        if (NodeCanReused(var) && cfg_->Use(op).count(var->Name()) == 0) {
+          ir::Node* cache = pool_.FindBestFitNode(var);
+          while (cache != nullptr && var->Name() == cache->Name()) {
+            VLOG(3) << "The same cache variable is cascade reused. "
+                    << cache->Name() << " is re-filled to the pool after "
+                    << "the reused op is finished. Current op can not "
+                    << "replace it again. Skip this candidate.";
+            cache = pool_.FindNextBestFitNode(var, cache);
+          }
+          if (var->Name() == FLAGS_memory_optimize_debug) {
+            VLOG(3) << "start match var " << DebugString(var) << " of op "
+                    << op->Name();
+            VLOG(3) << pool_.ToString();
+            VLOG(3) << "matched in pool : "
+                    << ((cache == nullptr) ? "False" : "True");
+          }
+          if (cache != nullptr) {
+            int node_idx_in_pool = pool_.GetNodeIndexInPool(cache);
+            VLOG(3) << string::Sprintf(
+                "!!! %s, %s => %s, cache idx %d, pool size %d",
+                std::to_string(reuse_id++), DebugString(var), DebugString(cache),
+                node_idx_in_pool, static_cast<int>(pool_.size()));
+            // NOTE(dzhwinter): update the ProgramDesc/IR Graph
+            // and the CFG Graph on the fly.
+            //
+            // IR Graph define the dependence relationship between nodes.
+            //
+            // ProgramDesc defines the input/output vars. Its used in
+            // CreateOp, CreateVar when running happens.
+            //
+            // CFG Graph store the liveness information, when reuse happens
+            // we also need to update the variable liveness.
+            const std::string var_name = var->Name();
+            const std::string cache_name = cache->Name();
+
+            cfg_->RenameVarInCFGGraph(var_name, cache_name, idx);
+            RenameVarInGraphDesc(var_name, cache_name, idx);
+            RenameVarInGraphNode(var_name, cache_name, idx, graph.get());
+            pool_.Erase(cache_name);
+          }
         }
       }
-      for (auto var : unlived_vars) {
+      // fill the pool
+      for (auto var : cfg_->LiveIn(op)) {
+        if (cfg_->LiveOut(op).count(var) == 0) {
           ir::Node* var_node = cfg_->GetNodeByName(var, op);
+          if (var_node == nullptr || var_node->IsCtrlVar()) continue;
           if (NodeCanReused(var_node) && !pool_.Has(var_node)) {
             pool_.Insert(var_node);
           }
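A way to read the rewritten match loop: the best-fit candidate can be the variable itself, because a var is re-filled into the pool once the op that reused it finishes, so the pass now walks to the next best fit instead of abandoning the match. A self-contained sketch of that walk, with plain strings standing in for ir::Node and a simple vector standing in for the ordered pool:

```cpp
// Sketch of the "skip self, take the next best fit" walk performed above.
// The pool is ordered best-fit first; a candidate whose name equals the
// variable's own name is a cascade self-reuse and must be skipped.
#include <iostream>
#include <string>
#include <vector>

int main() {
  std::vector<std::string> pool = {"x", "cache_a", "cache_b"};  // best first
  const std::string var = "x";

  const std::string* cache = nullptr;
  for (const auto& candidate : pool) {
    if (candidate != var) {  // a same-name hit is the var itself; keep going
      cache = &candidate;
      break;
    }
  }
  std::cout << (cache ? *cache : "no match") << "\n";  // prints "cache_a"
}
```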
@@ -273,8 +277,7 @@ void MemoryOptimizePass::RenameVarInGraphNode(const std::string& var,
     // redirect the input to the latest version of cache_var
     for (auto* node : op->inputs) {
       if (node->Name() == var) {
-        ir::Node* cache_node = graph->CreateVarNode(var_desc.get());
-        var_nodes_[cache_var].emplace_back(cache_node);
+        ir::Node* cache_node = var_nodes_[cache_var].back();

         // swap node to cache_node
         cache_node->outputs.insert(cache_node->outputs.end(),
@@ -283,11 +286,15 @@ void MemoryOptimizePass::RenameVarInGraphNode(const std::string& var,
         auto* prev_op = node->inputs[0];
         std::replace(prev_op->outputs.begin(), prev_op->outputs.end(), node,
                      cache_node);
+        cache_node->inputs.emplace_back(prev_op);
         for (auto* next_op : node->outputs) {
           std::replace(next_op->inputs.begin(), next_op->inputs.end(), node,
                        cache_node);
         }
+
+        // erase unused node
+        auto& nodes = var_nodes_.at(var);
+        nodes.erase(std::remove(nodes.begin(), nodes.end(), node), nodes.end());
+        graph->RemoveNode(node);
       }
     }
@@ -307,15 +314,14 @@ void MemoryOptimizePass::RenameVarInGraphNode(const std::string& var,
           std::replace(next_op->inputs.begin(), next_op->inputs.end(), node,
                        cache_node);
         }
+
+        // erase unused node
+        auto& nodes = var_nodes_.at(var);
+        nodes.erase(std::remove(nodes.begin(), nodes.end(), node), nodes.end());
+        graph->RemoveNode(node);
       }
     }
   }
-
-  // release node of unused var in graph
-  for (auto* node : var_nodes_[var]) {
-    graph->RemoveNode(node);
-  }
+  var_nodes_.at(var).clear();
 }

 }  // namespace details

@@ -179,11 +179,11 @@ TEST(InferInplace, SingleOpInplaceInToOut) {
   op->SetOutput("Out", {"test2_out"});

   prog.MutableBlock(0)->Var("test2_a")->SetType(proto::VarType::LOD_TENSOR);
-  prog.MutableBlock(0)->Var("test2_a")->SetShape({32, 64});
+  prog.MutableBlock(0)->Var("test2_a")->SetShape({32, 64, 128, 128});
   prog.MutableBlock(0)->Var("test2_b")->SetType(proto::VarType::LOD_TENSOR);
   prog.MutableBlock(0)->Var("test2_c")->SetType(proto::VarType::LOD_TENSOR);
   prog.MutableBlock(0)->Var("test2_out");
-  prog.MutableBlock(0)->Var("test2_out")->SetShape({32, 16});
+  prog.MutableBlock(0)->Var("test2_out")->SetShape({32, 16, 128, 128});

   auto& infer_inplace = OpInfoMap::Instance().Get(op->Type()).infer_inplace_;
   auto in_to_outs = infer_inplace(*op, op->Block());
@@ -201,11 +201,11 @@ TEST(InferInplace, SingleGradOpInplaceInToOut) {
   op->SetOutput(GradVarName("X"), {"test2_a", "test2_b", "test2_c"});

   prog.MutableBlock(0)->Var("test2_a")->SetType(proto::VarType::LOD_TENSOR);
-  prog.MutableBlock(0)->Var("test2_a")->SetShape({32, 16});
+  prog.MutableBlock(0)->Var("test2_a")->SetShape({32, 16, 1024, 1024});
   prog.MutableBlock(0)->Var("test2_b")->SetType(proto::VarType::LOD_TENSOR);
   prog.MutableBlock(0)->Var("test2_c")->SetType(proto::VarType::LOD_TENSOR);
   prog.MutableBlock(0)->Var("test2_out");
-  prog.MutableBlock(0)->Var("test2_out")->SetShape({32, 16});
+  prog.MutableBlock(0)->Var("test2_out")->SetShape({32, 16, 1024, 1024});

   auto& infer_inplace = OpInfoMap::Instance().Get(op->Type()).infer_inplace_;
   auto in_to_outs = infer_inplace(*op, op->Block());
@@ -233,12 +233,12 @@ TEST(InferInplace, MultiOutInplaceInToOut) {
   prog.MutableBlock(0)->Var("o0");
   prog.MutableBlock(0)->Var("y0");
   prog.MutableBlock(0)->Var("z0");
-  prog.MutableBlock(0)->Var("a0")->SetShape({32, 16});
-  prog.MutableBlock(0)->Var("b0")->SetShape({32, 16});
-  prog.MutableBlock(0)->Var("c0")->SetShape({32, 16});
-  prog.MutableBlock(0)->Var("o0")->SetShape({32, 16});
-  prog.MutableBlock(0)->Var("y0")->SetShape({32, 16});
-  prog.MutableBlock(0)->Var("z0")->SetShape({32, 16});
+  prog.MutableBlock(0)->Var("a0")->SetShape({32, 16, 1024, 1024});
+  prog.MutableBlock(0)->Var("b0")->SetShape({32, 16, 1024, 1024});
+  prog.MutableBlock(0)->Var("c0")->SetShape({32, 16, 1024, 1024});
+  prog.MutableBlock(0)->Var("o0")->SetShape({32, 16, 1024, 1024});
+  prog.MutableBlock(0)->Var("y0")->SetShape({32, 16, 1024, 1024});
+  prog.MutableBlock(0)->Var("z0")->SetShape({32, 16, 1024, 1024});

   auto& infer_inplace = OpInfoMap::Instance().Get(op->Type()).infer_inplace_;
   auto in_to_outs = infer_inplace(*op, op->Block());
@@ -267,12 +267,12 @@ TEST(InferInplace, MultiGradInplaceInToOut) {
   prog.MutableBlock(0)->Var("o0");
   prog.MutableBlock(0)->Var("y0");
   prog.MutableBlock(0)->Var("z0");
-  prog.MutableBlock(0)->Var("a0")->SetShape({32, 16});
-  prog.MutableBlock(0)->Var("b0")->SetShape({32, 16});
-  prog.MutableBlock(0)->Var("c0")->SetShape({32, 16});
-  prog.MutableBlock(0)->Var("o0")->SetShape({32, 16});
-  prog.MutableBlock(0)->Var("y0")->SetShape({32, 16});
-  prog.MutableBlock(0)->Var("z0")->SetShape({32, 16});
+  prog.MutableBlock(0)->Var("a0")->SetShape({32, 16, 1024, 1024});
+  prog.MutableBlock(0)->Var("b0")->SetShape({32, 16, 1024, 1024});
+  prog.MutableBlock(0)->Var("c0")->SetShape({32, 16, 1024, 1024});
+  prog.MutableBlock(0)->Var("o0")->SetShape({32, 16, 1024, 1024});
+  prog.MutableBlock(0)->Var("y0")->SetShape({32, 16, 1024, 1024});
+  prog.MutableBlock(0)->Var("z0")->SetShape({32, 16, 1024, 1024});

   auto& infer_inplace = OpInfoMap::Instance().Get(op->Type()).infer_inplace_;
   auto in_to_outs = infer_inplace(*op, op->Block());

@@ -38,9 +38,13 @@ std::unique_ptr<ir::Graph> IdentityScaleOpCleanPass::ApplyImpl(
                     ->assert_is_op("scale")
                     ->assert_op_attr<float>("scale", 1.)
                     ->assert_op_attr<float>("bias", 0.);
-  auto scale_out = detector.mutable_pattern()
-                       ->NewNode("scale_out")
-                       ->assert_is_op_output("scale");
+  auto scale_out =
+      detector.mutable_pattern()
+          ->NewNode("scale_out")
+          ->assert_is_op_output("scale")
+          // scale's output var should has only one consumer, or it can't be
+          // removed.
+          ->assert_more([](Node* x) { return x->outputs.size() == 1UL; });

   pre_op->LinksTo({scale_in});
   scale_op->LinksFrom({scale_in}).LinksTo({scale_out});

@@ -207,7 +207,7 @@ framework::LoDTensor& VarBase::GradValue() {
 std::map<std::string, std::vector<VarBase*>> OpBase::ApplyGrad() {
   if (grad_op_descs_.empty() && backward_id_ <= 0) {
-    LOG(WARNING) << "op with no grad: " << op_desc_->Type();
+    VLOG(3) << "op with no grad: " << op_desc_->Type();
     return {};
   }

@@ -460,77 +460,6 @@ inline bool CheckNodeIndegreeEquals(const Node &node, size_t n) {
   return node.inputs.size() == n;
 }

-NodesTSIterator::NodesTSIterator(const std::vector<Node *> &source) {
-  PADDLE_ENFORCE(!source.empty(),
-                 "Start points of topological sorting should not be empty!");
-  // CHECK all the inputs' in-degree is 0
-  for (auto *node : source) {
-    PADDLE_ENFORCE(CheckNodeIndegreeEquals(*node, 0));
-  }
-
-  std::unordered_set<Node *> visited;
-  std::unordered_set<Node *> to_visit{source.begin(), source.end()};
-
-  std::vector<Node *> inlink_visited;
-  while (!to_visit.empty()) {
-    std::vector<Node *> queue(to_visit.begin(), to_visit.end());
-    for (auto *p : queue) {
-      if (Agent(p).deleted()) {
-        visited.insert(p);
-        to_visit.erase(p);
-      }
-
-      inlink_visited.clear();
-
-      std::copy_if(p->inputs.begin(), p->inputs.end(),
-                   std::back_inserter(inlink_visited),
-                   [&](Node *x) -> bool { return visited.count(x) != 0; });
-
-      if (inlink_visited.size() == p->inputs.size()) {
-        sorted_.push_back(p);
-        for (auto *_ : p->outputs) {
-          if (!visited.count(_)) {
-            to_visit.insert(_);
-          }
-        }
-
-        to_visit.erase(p);
-        visited.insert(p);
-      }
-    }
-  }
-}
-
-NodesTSIterator::NodesTSIterator(const NodesTSIterator &other)
-    : sorted_(other.sorted_), cursor_(other.cursor_) {}
-
-Node &NodesTSIterator::operator*() {
-  PADDLE_ENFORCE_LT(cursor_, sorted_.size());
-  return *sorted_[cursor_];
-}
-
-NodesTSIterator &NodesTSIterator::operator++() {
-  if (++cursor_ >= sorted_.size()) {
-    sorted_.clear();
-    cursor_ = 0;
-  }
-  return *this;
-}
-
-NodesTSIterator &NodesTSIterator::operator=(const NodesTSIterator &other) {
-  cursor_ = other.cursor_;
-  sorted_ = other.sorted_;
-  return *this;
-}
-
-bool NodesTSIterator::operator==(const NodesTSIterator &other) {
-  return sorted_ == other.sorted_ && cursor_ == other.cursor_;
-}
-
-Node *NodesTSIterator::operator->() {
-  PADDLE_ENFORCE_LT(cursor_, sorted_.size());
-  return sorted_[cursor_];
-}
-
 }  // namespace analysis
 }  // namespace inference
 }  // namespace paddle

@@ -30,6 +30,7 @@ namespace inference {
 namespace analysis {

 using framework::ir::Graph;
+using framework::ir::NodesTSIterator;

 const char kIsFunctionNode[] = "__is_function_node__";
 const char kFunctionNodeSubGraph[] = "__function_node_sub_graph__";
@@ -132,32 +133,6 @@ struct Agent {
   framework::ir::Node *x_;
 };

-// Topological sorting iterator on nodes.
-struct NodesTSIterator
-    : public std::iterator<std::forward_iterator_tag, framework::ir::Node *> {
-  NodesTSIterator() = default;
-  explicit NodesTSIterator(const std::vector<framework::ir::Node *> &source);
-  NodesTSIterator(NodesTSIterator &&other)
-      : sorted_(std::move(other.sorted_)), cursor_(other.cursor_) {
-    other.cursor_ = 0;
-  }
-  NodesTSIterator(const NodesTSIterator &other);
-
-  framework::ir::Node &operator*();
-  NodesTSIterator &operator++();
-  // TODO(Superjomn) current implementation just compare the first
-  // element, need to compare the graph and all the elements in the queue and
-  // set.
-  NodesTSIterator &operator=(const NodesTSIterator &other);
-  bool operator==(const NodesTSIterator &other);
-  bool operator!=(const NodesTSIterator &other) { return !(*this == other); }
-  framework::ir::Node *operator->();
-
- private:
-  std::vector<framework::ir::Node *> sorted_;
-  size_t cursor_{0};
-};
-
 // The nodes those have no input will be treated as start points.
 static std::vector<framework::ir::Node *> ExtractStartPoints(const Graph &g) {
   std::vector<framework::ir::Node *> result;

@@ -72,7 +72,7 @@ class DensityPriorBoxOpKernel : public framework::OpKernel<T> {
 #ifdef PADDLE_WITH_MKLML
 #pragma omp parallel for
 #endif
-    for (int i = 0; i < fixed_ratios.size(); i++) {
+    for (size_t i = 0; i < fixed_ratios.size(); i++) {
       sqrt_fixed_ratios.push_back(sqrt(fixed_ratios[i]));
     }

@@ -115,11 +115,10 @@ class DensityPriorBoxOpKernel : public framework::OpKernel<T> {
       }
     }
     if (clip) {
-      platform::Transform<platform::CPUDeviceContext> trans;
-      ClipFunctor<T> clip_func;
-      trans(ctx.template device_context<platform::CPUDeviceContext>(),
-            boxes->data<T>(), boxes->data<T>() + boxes->numel(),
-            boxes->data<T>(), clip_func);
+      T* dt = boxes->data<T>();
+      std::transform(dt, dt + boxes->numel(), dt, [](T v) -> T {
+        return std::min<T>(std::max<T>(v, 0.), 1.);
+      });
     }
     framework::Tensor var_t;
     var_t.mutable_data<T>(
@@ -141,7 +140,7 @@ class DensityPriorBoxOpKernel : public framework::OpKernel<T> {
 #pragma omp parallel for collapse(2)
 #endif
     for (int i = 0; i < box_num; ++i) {
-      for (int j = 0; j < variances.size(); ++j) {
+      for (size_t j = 0; j < variances.size(); ++j) {
        e_vars(i, j) = variances[j];
      }
    }
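The clip rewrite drops the device-generic platform::Transform plus ClipFunctor pair in favor of a plain std::transform over the CPU buffer. A runnable distillation of just that in-place clamp:

```cpp
// In-place clamp to [0, 1] with std::transform, as in the rewritten
// clip branch above (operating on a plain vector instead of a Tensor).
#include <algorithm>
#include <iostream>
#include <vector>

int main() {
  std::vector<float> boxes = {-0.2f, 0.4f, 1.3f, 0.9f};
  std::transform(boxes.begin(), boxes.end(), boxes.begin(), [](float v) {
    return std::min(std::max(v, 0.f), 1.f);
  });
  for (float v : boxes) std::cout << v << " ";  // 0 0.4 1 0.9
}
```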

@@ -46,13 +46,6 @@ inline void ExpandAspectRatios(const std::vector<float>& input_aspect_ratior,
   }
 }

-template <typename T>
-struct ClipFunctor {
-  HOSTDEVICE inline T operator()(T in) const {
-    return std::min<T>(std::max<T>(in, 0.), 1.);
-  }
-};
-
 template <typename T>
 class PriorBoxOpKernel : public framework::OpKernel<T> {
  public:
@@ -101,31 +94,30 @@ class PriorBoxOpKernel : public framework::OpKernel<T> {
     boxes->mutable_data<T>(ctx.GetPlace());
     vars->mutable_data<T>(ctx.GetPlace());

-    auto e_boxes = framework::EigenTensor<T, 4>::From(*boxes);
+    T* b_t = boxes->data<T>();
     for (int h = 0; h < feature_height; ++h) {
       for (int w = 0; w < feature_width; ++w) {
         T center_x = (w + offset) * step_width;
         T center_y = (h + offset) * step_height;
         T box_width, box_height;
-        int idx = 0;
         for (size_t s = 0; s < min_sizes.size(); ++s) {
           auto min_size = min_sizes[s];
           if (min_max_aspect_ratios_order) {
             box_width = box_height = min_size / 2.;
-            e_boxes(h, w, idx, 0) = (center_x - box_width) / img_width;
-            e_boxes(h, w, idx, 1) = (center_y - box_height) / img_height;
-            e_boxes(h, w, idx, 2) = (center_x + box_width) / img_width;
-            e_boxes(h, w, idx, 3) = (center_y + box_height) / img_height;
-            idx++;
+            b_t[0] = (center_x - box_width) / img_width;
+            b_t[1] = (center_y - box_height) / img_height;
+            b_t[2] = (center_x + box_width) / img_width;
+            b_t[3] = (center_y + box_height) / img_height;
+            b_t += 4;
             if (max_sizes.size() > 0) {
               auto max_size = max_sizes[s];
               // square prior with size sqrt(minSize * maxSize)
               box_width = box_height = sqrt(min_size * max_size) / 2.;
-              e_boxes(h, w, idx, 0) = (center_x - box_width) / img_width;
-              e_boxes(h, w, idx, 1) = (center_y - box_height) / img_height;
-              e_boxes(h, w, idx, 2) = (center_x + box_width) / img_width;
-              e_boxes(h, w, idx, 3) = (center_y + box_height) / img_height;
-              idx++;
+              b_t[0] = (center_x - box_width) / img_width;
+              b_t[1] = (center_y - box_height) / img_height;
+              b_t[2] = (center_x + box_width) / img_width;
+              b_t[3] = (center_y + box_height) / img_height;
+              b_t += 4;
             }
             // priors with different aspect ratios
             for (size_t r = 0; r < aspect_ratios.size(); ++r) {
@@ -135,11 +127,11 @@ class PriorBoxOpKernel : public framework::OpKernel<T> {
               }
               box_width = min_size * sqrt(ar) / 2.;
               box_height = min_size / sqrt(ar) / 2.;
-              e_boxes(h, w, idx, 0) = (center_x - box_width) / img_width;
-              e_boxes(h, w, idx, 1) = (center_y - box_height) / img_height;
-              e_boxes(h, w, idx, 2) = (center_x + box_width) / img_width;
-              e_boxes(h, w, idx, 3) = (center_y + box_height) / img_height;
-              idx++;
+              b_t[0] = (center_x - box_width) / img_width;
+              b_t[1] = (center_y - box_height) / img_height;
+              b_t[2] = (center_x + box_width) / img_width;
+              b_t[3] = (center_y + box_height) / img_height;
+              b_t += 4;
             }
           } else {
             // priors with different aspect ratios
@@ -147,21 +139,21 @@ class PriorBoxOpKernel : public framework::OpKernel<T> {
               float ar = aspect_ratios[r];
               box_width = min_size * sqrt(ar) / 2.;
               box_height = min_size / sqrt(ar) / 2.;
-              e_boxes(h, w, idx, 0) = (center_x - box_width) / img_width;
-              e_boxes(h, w, idx, 1) = (center_y - box_height) / img_height;
-              e_boxes(h, w, idx, 2) = (center_x + box_width) / img_width;
-              e_boxes(h, w, idx, 3) = (center_y + box_height) / img_height;
-              idx++;
+              b_t[0] = (center_x - box_width) / img_width;
+              b_t[1] = (center_y - box_height) / img_height;
+              b_t[2] = (center_x + box_width) / img_width;
+              b_t[3] = (center_y + box_height) / img_height;
+              b_t += 4;
             }
             if (max_sizes.size() > 0) {
               auto max_size = max_sizes[s];
               // square prior with size sqrt(minSize * maxSize)
               box_width = box_height = sqrt(min_size * max_size) / 2.;
-              e_boxes(h, w, idx, 0) = (center_x - box_width) / img_width;
-              e_boxes(h, w, idx, 1) = (center_y - box_height) / img_height;
-              e_boxes(h, w, idx, 2) = (center_x + box_width) / img_width;
-              e_boxes(h, w, idx, 3) = (center_y + box_height) / img_height;
-              idx++;
+              b_t[0] = (center_x - box_width) / img_width;
+              b_t[1] = (center_y - box_height) / img_height;
+              b_t[2] = (center_x + box_width) / img_width;
+              b_t[3] = (center_y + box_height) / img_height;
+              b_t += 4;
             }
           }
         }
@@ -169,11 +161,10 @@ class PriorBoxOpKernel : public framework::OpKernel<T> {
     }

     if (clip) {
-      platform::Transform<platform::CPUDeviceContext> trans;
-      ClipFunctor<T> clip_func;
-      trans(ctx.template device_context<platform::CPUDeviceContext>(),
-            boxes->data<T>(), boxes->data<T>() + boxes->numel(),
-            boxes->data<T>(), clip_func);
+      T* dt = boxes->data<T>();
+      std::transform(dt, dt + boxes->numel(), dt, [](T v) -> T {
+        return std::min<T>(std::max<T>(v, 0.), 1.);
+      });
     }

     framework::Tensor var_t;
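The kernel rewrite above also swaps Eigen's four-index writes for a raw pointer walk: every prior box occupies four consecutive floats, so filling b_t[0..3] and advancing by four covers the same memory that e_boxes(h, w, idx, 0..3) addressed, without materializing the Eigen view. A minimal sketch of that layout:

```cpp
// Pointer-walk layout used by the rewritten PriorBoxOpKernel: one box is
// four consecutive floats (xmin, ymin, xmax, ymax), written then skipped.
#include <iostream>
#include <vector>

int main() {
  const int num_boxes = 2;
  std::vector<float> boxes(num_boxes * 4);
  float* b_t = boxes.data();
  for (int i = 0; i < num_boxes; ++i) {
    b_t[0] = 0.00f + i;  // xmin (illustrative values)
    b_t[1] = 0.25f + i;  // ymin
    b_t[2] = 0.50f + i;  // xmax
    b_t[3] = 0.75f + i;  // ymax
    b_t += 4;            // advance to the next box's slot
  }
  for (float v : boxes) std::cout << v << " ";
}
```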

@@ -170,13 +170,48 @@ class GroupNormGradMaker : public framework::SingleGradOpDescMaker {
   }
 };

+class GroupNormInplaceInToOut : public framework::InplaceInToOut {
+ public:
+  using InplaceInToOut::InplaceInToOut;
+
+ protected:
+  std::unordered_map<std::string, std::string> Apply(
+      const framework::OpDesc &op_desc,
+      framework::BlockDesc *block) const override {
+    return {{"X", "Y"}};
+  }
+};
+
+class GroupNormGradInplaceInToOut : public framework::InplaceInToOut {
+ public:
+  using InplaceInToOut::InplaceInToOut;
+
+ protected:
+  std::unordered_map<std::string, std::string> Apply(
+      const framework::OpDesc &op_desc,
+      framework::BlockDesc *block) const override {
+    return {{framework::GradVarName("Y"), framework::GradVarName("X")}};
+  }
+};
+
+class GroupNormOpInferVarType
+    : public framework::PassInDtypeAndVarTypeToOutput {
+ protected:
+  std::unordered_map<std::string, std::string> GetInputOutputWithSameType()
+      const override {
+    return {{"X", /*->*/ "Y"}};
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle

 namespace ops = paddle::operators;
 REGISTER_OPERATOR(group_norm, ops::GroupNormOp, ops::GroupNormOpMaker,
-                  ops::GroupNormGradMaker);
-REGISTER_OPERATOR(group_norm_grad, ops::GroupNormGradOp);
+                  ops::GroupNormOpInferVarType, ops::GroupNormGradMaker,
+                  ops::GroupNormInplaceInToOut);
+REGISTER_OPERATOR(group_norm_grad, ops::GroupNormGradOp,
+                  ops::GroupNormGradInplaceInToOut);
 REGISTER_OP_CPU_KERNEL(
     group_norm, ops::GroupNormKernel<paddle::platform::CPUDeviceContext, float>,
     ops::GroupNormKernel<paddle::platform::CPUDeviceContext, double>);

@@ -339,6 +339,71 @@ void BenchSoftmaxKernel() {
   }
 }

+template <jit::KernelType KT, typename T, typename PlaceType>
+void BenchLayerNormKernel() {
+  const T epsilon = 9.99999975e-06;
+  for (int n : {1, 2, 10}) {
+    for (int x_dim_0 : {1, 9, 17, 50}) {
+      int left = n * x_dim_0;
+      for (int x_dim_1 : TestSizes()) {
+        int right = x_dim_1;
+        int sz = left * right;
+        Tensor x, mean, var, scale, bias, out;
+        x.Resize({n, x_dim_0, x_dim_1});
+        out.Resize({n, x_dim_0, x_dim_1});
+        mean.Resize({n, x_dim_0});
+        var.Resize({n, x_dim_0});
+        scale.Resize({x_dim_1});
+        bias.Resize({x_dim_1});
+
+        RandomVec<T>(sz, x.mutable_data<T>(PlaceType()), -2.f, 2.f);
+        RandomVec<T>(left, mean.mutable_data<T>(PlaceType()), -2.f, 2.f);
+        RandomVec<T>(left, var.mutable_data<T>(PlaceType()), -2.f, 2.f);
+        RandomVec<T>(right, scale.mutable_data<T>(PlaceType()), -2.f, 2.f);
+        RandomVec<T>(right, bias.mutable_data<T>(PlaceType()), -2.f, 2.f);
+
+        const T* scale_data = scale.data<T>();
+        const T* bias_data = bias.data<T>();
+        T* x_data = x.data<T>();
+        T* mean_data = mean.data<T>();
+        T* var_data = var.data<T>();
+        T* out_data = out.mutable_data<T>(PlaceType());
+
+        BenchAllImpls<KT, jit::LayerNormTuples<T>, PlaceType>(
+            right, x_data, out_data, mean_data, var_data, scale_data,
+            bias_data, left, epsilon, right);
+      }
+    }
+  }
+}
+
+template <jit::KernelType KT, typename T, typename PlaceType>
+void BenchCRFDecodingKernel() {
+  constexpr int state_trans_base_idx = 2;
+  for (int seq_len : {1, 11, 17, 50}) {
+    for (int tag_num : TestSizes()) {
+      int x_sz = seq_len * tag_num;
+      int w_sz = (tag_num + state_trans_base_idx) * tag_num;
+      Tensor x, w, alpha, track;
+      x.Resize({seq_len, tag_num});
+      w.Resize({tag_num + state_trans_base_idx, tag_num});
+      alpha.Resize({seq_len, tag_num});
+      track.Resize({seq_len, tag_num});
+
+      RandomVec<T>(x_sz, x.mutable_data<T>(PlaceType()), -2.f, 2.f);
+      RandomVec<T>(w_sz, w.mutable_data<T>(PlaceType()), -2.f, 2.f);
+
+      const T* x_data = x.data<T>();
+      const T* w_data = w.data<T>();
+      T* alpha_data = alpha.mutable_data<T>(PlaceType());
+      int* track_data = track.mutable_data<int>(PlaceType());
+
+      BenchAllImpls<KT, jit::CRFDecodingTuples<T>, PlaceType>(
+          tag_num, seq_len, x_data, w_data, alpha_data, track_data, tag_num);
+    }
+  }
+}
+
 using T = float;
 using CPUPlace = paddle::platform::CPUPlace;

@@ -382,6 +447,16 @@ BENCH_FP32_CPU(kMatMul) { BenchMatMulKernel<jit::kMatMul, T, CPUPlace>(); }
 // softmax
 BENCH_FP32_CPU(kSoftmax) { BenchSoftmaxKernel<jit::kSoftmax, T, CPUPlace>(); }

+// layernorm
+BENCH_FP32_CPU(kLayerNorm) {
+  BenchLayerNormKernel<jit::kLayerNorm, T, CPUPlace>();
+}
+
+// crfdecoding
+BENCH_FP32_CPU(kCRFDecoding) {
+  BenchCRFDecodingKernel<jit::kCRFDecoding, T, CPUPlace>();
+}
+
 // Benchmark all jit kernels including jitcode, mkl and refer.
 // To use this tool, run command: ./benchmark [options...]
 // Options:

@@ -292,6 +292,63 @@ struct TestFuncWithRefer<jit::MatMulTuples<T>, std::vector<T>, std::vector<T>,
   }
 };

+template <typename T>
+struct TestFuncWithRefer<jit::LayerNormTuples<T>, std::vector<T>,
+                         std::vector<T>, std::vector<T>, std::vector<T>,
+                         std::vector<T>, std::vector<T>, int, float, int> {
+  void operator()(const typename jit::LayerNormTuples<T>::func_type tgt,
+                  std::vector<T>& x, std::vector<T>& outref,  // NOLINT
+                  std::vector<T>& mean, std::vector<T>& var,  // NOLINT
+                  const std::vector<T>& scale, const std::vector<T>& bias,
+                  int left, const float epsilon, int right) {
+    EXPECT_TRUE(tgt != nullptr);
+    EXPECT_EQ(x.size(), static_cast<size_t>(left * right));
+    EXPECT_EQ(outref.size(), static_cast<size_t>(left * right));
+    EXPECT_EQ(mean.size(), static_cast<size_t>(left));
+    EXPECT_EQ(var.size(), static_cast<size_t>(left));
+    EXPECT_EQ(scale.size(), static_cast<size_t>(right));
+    EXPECT_EQ(bias.size(), static_cast<size_t>(right));
+    std::vector<T> outtgt(outref.size());
+    const T* scale_data = scale.data();
+    const T* bias_data = bias.data();
+    T* x_data = x.data();
+    T* mean_data = mean.data();
+    T* var_data = var.data();
+    T* outref_data = outref.data();
+    T* outtgt_data = outtgt.data();
+
+    tgt(x_data, outtgt_data, mean_data, var_data, scale_data, bias_data, left,
+        epsilon, right);
+    ExpectEQ<T>(outtgt_data, outref_data, left * right);
+  }
+};
+
+template <typename T>
+struct TestFuncWithRefer<jit::CRFDecodingTuples<T>, int, std::vector<T>,
+                         std::vector<T>, std::vector<T>, std::vector<int>,
+                         int> {
+  void operator()(const typename jit::CRFDecodingTuples<T>::func_type tgt,
+                  const int seq_len, const std::vector<T>& x,
+                  const std::vector<T>& w, std::vector<T>& alpharef,  // NOLINT
+                  std::vector<int>& trackref, int tag_num) {  // NOLINT
+    constexpr int state_trans_base_idx = 2;
+    EXPECT_TRUE(tgt != nullptr);
+    EXPECT_EQ(x.size(), static_cast<size_t>(seq_len * tag_num));
+    EXPECT_EQ(w.size(),
+              static_cast<size_t>((tag_num + state_trans_base_idx) * tag_num));
+    EXPECT_EQ(alpharef.size(), static_cast<size_t>(seq_len * tag_num));
+    EXPECT_EQ(trackref.size(), static_cast<size_t>(seq_len * tag_num));
+    std::vector<T> alphatgt(alpharef.size());
+    std::vector<int> tracktgt(trackref.size());
+
+    memcpy(trackref.data(), tracktgt.data(), tag_num * sizeof(int));
+    tgt(seq_len, (const T*)x.data(), (const T*)w.data(), alphatgt.data(),
+        tracktgt.data(), tag_num);
+    ExpectEQ<T>(alpharef.data(), alphatgt.data(), seq_len * tag_num);
+    ExpectEQ<int>(trackref.data(), tracktgt.data(), seq_len * tag_num);
+  }
+};
+
 template <jit::KernelType KT, typename KernelTuples, typename PlaceType,
           typename... Args>
 void TestAllImpls(const typename KernelTuples::attr_type& attr, Args... args) {
@@ -640,6 +697,71 @@ void TestNCHW16CMulNCKernel() {
   }
 }

+template <paddle::operators::jit::KernelType KT, typename T, typename PlaceType>
+void TestLayerNormKernel() {
+  VLOG(10) << "===== Test JITKernel " << jit::to_string(KT);
+  const T epsilon = 9.99999975e-06;
+  for (int n : {1, 2, 10}) {
+    for (int x_dim_0 : {1, 9, 17, 50}) {
+      int left = n * x_dim_0;
+      for (int x_dim_1 : TestSizes()) {
+        int right = x_dim_1;
+        auto ref = jit::GetRefer<KT, jit::LayerNormTuples<T>>();
+        EXPECT_TRUE(ref != nullptr);
+        int sz = left * right;
+        std::vector<T> x(sz), mean(left), var(left), scale(right), bias(right),
+            outref(sz);
+        RandomVec<T>(sz, x.data(), -2.f, 2.f);
+        RandomVec<T>(left, mean.data(), -2.f, 2.f);
+        RandomVec<T>(left, var.data(), -2.f, 2.f);
+        RandomVec<T>(right, scale.data(), -2.f, 2.f);
+        RandomVec<T>(right, bias.data(), -2.f, 2.f);
+
+        const T* scale_data = scale.data();
+        const T* bias_data = bias.data();
+        T* x_data = x.data();
+        T* mean_data = mean.data();
+        T* var_data = var.data();
+        T* outref_data = outref.data();
+
+        ref(x_data, outref_data, mean_data, var_data, scale_data, bias_data,
+            left, epsilon, right);
+
+        TestAllImpls<KT, jit::LayerNormTuples<T>, PlaceType, std::vector<T>,
+                     std::vector<T>, std::vector<T>, std::vector<T>,
+                     std::vector<T>, std::vector<T>, int, float>(
+            right, x, outref, mean, var, scale, bias, left, epsilon, right);
+      }
+    }
+  }
+}
+
+template <paddle::operators::jit::KernelType KT, typename T, typename PlaceType>
+void TestCRFDecodingKernel() {
+  VLOG(10) << "===== Test JITKernel " << jit::to_string(KT);
+  constexpr int state_trans_base_idx = 2;
+  for (int seq_len : {1, 11, 17, 50}) {
+    for (int tag_num : TestSizes()) {
+      auto ref = jit::GetRefer<KT, jit::CRFDecodingTuples<T>>();
+      EXPECT_TRUE(ref != nullptr);
+      int x_sz = seq_len * tag_num;
+      int w_sz = (tag_num + state_trans_base_idx) * tag_num;
+      std::vector<T> x(x_sz), w(w_sz), alpharef(x_sz);
+      std::vector<int> trackref(x_sz);
+      RandomVec<T>(x_sz, x.data(), -2.f, 2.f);
+      RandomVec<T>(w_sz, w.data(), -2.f, 2.f);
+
+      ref(seq_len, (const T*)x.data(), (const T*)w.data(), alpharef.data(),
+          trackref.data(), tag_num);
+
+      TestAllImpls<KT, jit::CRFDecodingTuples<T>, PlaceType, int,
+                   std::vector<T>, std::vector<T>, std::vector<T>,
+                   std::vector<int>, int>(tag_num, seq_len, x, w, alpharef,
+                                          trackref, tag_num);
+    }
+  }
+}
+
 // XYZNTuple
 TEST(JITKernel, kVMul) {
   TestXYZNKernel<jit::kVMul, float, CPUPlace>();
@@ -761,7 +883,16 @@ TEST(JITKernel, kNCHW16CMulNC) {
   TestNCHW16CMulNCKernel<jit::kNCHW16CMulNC, double, CPUPlace>();
 }

-// TODO(yihua/TJ): add crf decoding and layer norm unit tests
+TEST(JITKernel, kLayerNorm) {
+  TestLayerNormKernel<jit::kLayerNorm, float, paddle::platform::CPUPlace>();
+  TestLayerNormKernel<jit::kLayerNorm, double, paddle::platform::CPUPlace>();
+}
+
+TEST(JITKernel, kCRFDecoding) {
+  TestCRFDecodingKernel<jit::kCRFDecoding, float, paddle::platform::CPUPlace>();
+  TestCRFDecodingKernel<jit::kCRFDecoding, double,
+                        paddle::platform::CPUPlace>();
+}
+
 TEST(JITKernel, pool) {
   // TODO(TJ): add some test

@@ -64,7 +64,7 @@ class LoadCombineOp : public framework::OperatorBase {
       auto *tensor = out_var->GetMutable<framework::LoDTensor>();

       // Error checking
-      PADDLE_ENFORCE(static_cast<bool>(buffer), "Cannot read more");
+      PADDLE_ENFORCE(static_cast<bool>(*buffer), "Cannot read more");

       // Get data from fin to tensor
       DeserializeFromStream(*buffer, tensor, dev_ctx);
@@ -90,6 +90,10 @@ class LoadCombineOp : public framework::OperatorBase {
         tensor->ShareDataWith(fp16_tensor);
       }
     }
+    buffer->peek();
+    PADDLE_ENFORCE(buffer->eof(),
+                   "You are not allowed to load partial data via "
+                   "load_combine_op, use load_op instead.");
   }
 };
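The added check relies on a subtlety of iostreams: eofbit is only set once a read (or peek) actually runs past the end, so the peek() is what arms the eof() test. A standalone reproduction with std::istringstream:

```cpp
// Why buffer->peek() precedes the eof() check above: eof() stays false
// until something reads past the last byte, even if the stream is fully
// consumed. Peeking one byte past the end flips eofbit without consuming.
#include <iostream>
#include <sstream>

int main() {
  char sink[7];

  std::istringstream full("payload");
  full.read(sink, 7);  // consume all 7 bytes
  full.peek();         // poke past the end -> sets eofbit
  std::cout << std::boolalpha << full.eof() << "\n";  // true

  std::istringstream partial("payload+extra");
  partial.read(sink, 7);  // leaves "+extra" unread
  partial.peek();
  std::cout << partial.eof() << "\n";  // false: partial load detected
}
```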

@@ -311,6 +311,10 @@ class LSTMGradKernel : public framework::OpKernel<T> {
         lstm_grad.prev_state_grad = c0_g ? ordered_c0_g.data<T>() : nullptr;
       }

+      // lstm_value.output_value not used in bp, set to nullptr
+      // lstm_grad.state_active_grad not used in bp, set to nullptr
+      lstm_value.output_value = nullptr;
+      lstm_grad.state_active_grad = nullptr;
+
       int cur_batch_size = bend - bstart;
       math::LstmUnitGradFunctor<DeviceContext, T>::compute(
           device_ctx, lstm_value, lstm_grad, frame_size, cur_batch_size,

@@ -405,6 +405,11 @@ class LSTMPGradKernel : public framework::OpKernel<T> {
       }

       int cur_batch_size = bend - bstart;
+
+      // lstmp_value.output_value not used in bp, set to null
+      // lstmp_grad.state_active_grad not used in bp, set to null
+      lstmp_value.output_value = nullptr;
+      lstmp_grad.state_active_grad = nullptr;
+
       math::LstmUnitGradFunctor<DeviceContext, T>::compute(
           device_ctx, lstmp_value, lstmp_grad, frame_size, cur_batch_size,
           gate_act, cell_act, cand_act);

@@ -109,23 +109,23 @@ from future subsequences in a computationally efficient manner to improve
 unidirectional recurrent neural networks. The row convolution operator is
 different from the 1D sequence convolution, and is computed as follows:

-Given an input sequence $in$ of length $t$ and input dimension $d$,
-and a filter ($W$) of size $context \times d$,
+Given an input sequence $X$ of length $t$ and input dimension $D$,
+and a filter ($W$) of size $context \times D$,
 the output sequence is convolved as:

 $$
-out_{i, :} = \\sum_{j=i}^{i + context} in_{j,:} \\cdot W_{i-j, :}
+out_{i} = \\sum_{j=i}^{i + context - 1} X_{j} \\cdot W_{j-i}
 $$

 In the above equation:

 * $Out_{i}$: The i-th row of output variable with shape [1, D].
-* $\\tau$: Future context size.
+* $context$: Future context size.
 * $X_{j}$: The j-th row of input variable with shape [1, D].
-* $W_{i-j}$: The (i-j)-th row of parameters with shape [1, D].
+* $W_{j-i}$: The (j-i)-th row of parameters with shape [1, D].

 More details about row_conv please refer to
 the design document
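Concretely, the corrected bounds run over $context$ rows starting at the current one; for $context = 2$ the sum expands to:

```latex
% Worked instance of the corrected row_conv formula for context = 2:
% each output row mixes the current input row and the one after it.
$$
out_{i} = X_{i} \cdot W_{0} + X_{i+1} \cdot W_{1}
$$
```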

@@ -233,9 +233,11 @@ inline void throw_on_error(ncclResult_t stat, const std::string& msg) {
 #endif  // __APPLE__ and windows
 #endif  // PADDLE_WITH_CUDA

-#define PADDLE_THROW(...)                                            \
-  throw ::paddle::platform::EnforceNotMet(                           \
-      ::paddle::string::Sprintf(__VA_ARGS__), __FILE__, __LINE__)
+#define PADDLE_THROW(...)                                              \
+  do {                                                                 \
+    throw ::paddle::platform::EnforceNotMet(                           \
+        ::paddle::string::Sprintf(__VA_ARGS__), __FILE__, __LINE__);   \
+  } while (0)

 #define PADDLE_ENFORCE(COND, ...)                                         \
   do {                                                                    \
@@ -270,23 +272,25 @@ inline void throw_on_error(ncclResult_t stat, const std::string& msg) {
  * extra messages is also supported, for example:
  *    PADDLE_ENFORCE(a, b, "some simple enforce failed between %d numbers", 2)
  */
 #define PADDLE_ENFORCE_NOT_NULL(__VAL, ...)                   \
   do {                                                        \
     if (UNLIKELY(nullptr == (__VAL))) {                       \
       PADDLE_THROW(#__VAL " should not be null\n%s",          \
-                   paddle::string::Sprintf("" __VA_ARGS__));  \
+                   ::paddle::string::Sprintf(__VA_ARGS__));   \
     }                                                         \
   } while (0)

 #define __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, __CMP, __INV_CMP, ...)  \
   do {                                                                  \
-    if (UNLIKELY(!((__VAL0)__CMP(__VAL1)))) {                           \
+    auto __cond1__ = (__VAL0);                                          \
+    auto __cond2__ = (__VAL1);                                          \
+    if (UNLIKELY(!((__cond1__)__CMP(__cond2__)))) {                     \
       PADDLE_THROW("Enforce failed. Expected %s " #__CMP                \
                    " %s, but received %s:%s " #__INV_CMP " %s:%s.\n%s", \
                    #__VAL0, #__VAL1, #__VAL0,                           \
-                   paddle::string::to_string(__VAL0), #__VAL1,          \
-                   paddle::string::to_string(__VAL1),                   \
-                   paddle::string::Sprintf("" __VA_ARGS__));            \
+                   ::paddle::string::to_string(__cond1__), #__VAL1,     \
+                   ::paddle::string::to_string(__cond2__),              \
+                   ::paddle::string::Sprintf(__VA_ARGS__));             \
     }                                                                   \
   } while (0)
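Both macro changes are standard preprocessor hygiene: the do { ... } while (0) wrapper makes PADDLE_THROW a single statement that composes safely with if/else, and binding __VAL0/__VAL1 to __cond1__/__cond2__ ensures each operand is evaluated exactly once even though the expansion mentions it twice. A standalone demonstration of the double-evaluation hazard the locals remove (NAIVE_EXPECT_EQ is an illustrative macro, not Paddle's):

```cpp
// The hazard: a comparison macro that expands each operand twice runs the
// operands' side effects twice. The fixed __PADDLE_BINARY_COMPARE above
// avoids this by saving both operands into locals before comparing.
#include <iostream>

#define NAIVE_EXPECT_EQ(a, b)                                  \
  do {                                                         \
    if (!((a) == (b)))                                         \
      std::cout << "failed: " << (a) << " vs " << (b) << "\n"; \
  } while (0)

int main() {
  int calls = 0;
  auto next = [&calls] { return ++calls; };

  NAIVE_EXPECT_EQ(next(), 0);  // next() runs in the test AND in the message
  std::cout << "calls: " << calls << "\n";  // prints 2, not 1
}
```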

@@ -13,10 +13,12 @@
 // limitations under the License.

 #include "paddle/fluid/pybind/ir.h"
+#include <algorithm>
 #include <string>
 #include <unordered_map>
 #include <unordered_set>
 #include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/ir/graph_helper.h"
 #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
 #include "paddle/fluid/framework/ir/node.h"
 #include "paddle/fluid/framework/op_desc.h"
@@ -27,6 +29,10 @@ namespace py = pybind11;
 using paddle::framework::ir::Graph;
 using paddle::framework::ir::Node;
 using paddle::framework::ir::GraphSafeRemoveNodes;
+using paddle::framework::ir::HasCircle;
+using paddle::framework::ir::GraphNum;
+using paddle::framework::ir::TopologySortOperations;
+using paddle::framework::ir::BuildOperationAdjList;
 using paddle::framework::OpDesc;
 using paddle::framework::ProgramDesc;
 using paddle::framework::VarDesc;
@@ -36,6 +42,12 @@ namespace paddle {
 namespace pybind {
 void BindGraph(py::module *m) {
   m->def("graph_safe_remove_nodes", GraphSafeRemoveNodes);
+  m->def("has_circle", HasCircle);
+  m->def("graph_num", GraphNum);
+  m->def("topology_sort", TopologySortOperations,
+         return_value_policy::reference);
+  m->def("build_adjacency_list", BuildOperationAdjList,
+         return_value_policy::reference);
   py::class_<Graph, std::shared_ptr<Graph>>(
       *m, "Graph",
       "The graph is a Directed Acyclic Single Static Assignment Graph, see "
@@ -46,7 +58,6 @@ void BindGraph(py::module *m) {
       .def("get_float", &Graph::Get<float>)
       .def("get_double", &Graph::Get<double>)
       .def("get_string", &Graph::Get<std::string>)
-      .def("get_program", &Graph::Get<ProgramDesc>)
       .def("get_marked_nodes", &Graph::Get<std::unordered_set<const Node *>>)
       .def("set", [](Graph &self, const std::string &attr_name,
                      int attr) { return self.Set(attr_name, new int(attr)); })
@@ -63,11 +74,6 @@ void BindGraph(py::module *m) {
           [](Graph &self, const std::string &attr_name, double attr) {
             return self.Set(attr_name, new double(attr));
           })
-      .def("set",
-           [](Graph &self, const std::string &attr_name,
-              const ProgramDesc &attr) {
-             return self.Set(attr_name, new ProgramDesc(attr));
-           })
       .def("set",
           [](Graph &self, const std::string &attr_name,
             const std::unordered_set<const Node *> &attr) {
@@ -108,42 +114,42 @@ void BindNode(py::module *m) {
      .def("is_op", &Node::IsOp)
      .def("is_var", &Node::IsVar)
      .def("is_ctrl_var", &Node::IsCtrlVar)
+      .def("clear_inputs", [](Node &self) { self.inputs.clear(); })
      .def("inputs_remove",
           [](Node &self, int node_id) {
-             for (auto it = self.inputs.begin(); it != self.inputs.end();
-                  it++) {
-               if ((*it)->id() == node_id) {
-                 self.inputs.erase(it);
-               }
-             }
+             auto pos = std::find_if(
+                 self.inputs.begin(), self.inputs.end(),
+                 [&node_id](const Node *n) { return n->id() == node_id; });
+             if (pos != self.inputs.end()) {
+               self.inputs.erase(pos);
+             }
           })
      .def("inputs_remove",
           [](Node &self, Node &node) {
-             for (auto it = self.inputs.begin(); it != self.inputs.end();
-                  it++) {
-               if (*it == &node) {
-                 self.inputs.erase(it);
-               }
-             }
+             auto pos =
+                 std::find(self.inputs.begin(), self.inputs.end(), &node);
+             if (pos != self.inputs.end()) {
+               self.inputs.erase(pos);
+             }
           })
      .def("inputs_append",
           [](Node &self, Node &node) { self.inputs.push_back(&node); })
+      .def("clear_outputs", [](Node &self) { self.outputs.clear(); })
      .def("outputs_remove",
           [](Node &self, int node_id) {
-             for (auto it = self.outputs.begin(); it != self.outputs.end();
-                  it++) {
-               if ((*it)->id() == node_id) {
-                 self.outputs.erase(it);
-               }
-             }
+             auto pos = std::find_if(
+                 self.outputs.begin(), self.outputs.end(),
+                 [&node_id](const Node *n) { return n->id() == node_id; });
+             if (pos != self.outputs.end()) {
+               self.outputs.erase(pos);
+             }
           })
      .def("outputs_remove",
           [](Node &self, Node &node) {
-             for (auto it = self.outputs.begin(); it != self.outputs.end();
-                  it++) {
-               if (*it == &node) {
-                 self.outputs.erase(it);
-               }
-             }
+             auto pos =
+                 std::find(self.outputs.begin(), self.outputs.end(), &node);
+             if (pos != self.outputs.end()) {
+               self.outputs.erase(pos);
+             }
           })
      .def("outputs_append",

@@ -829,8 +829,7 @@ All parameter, weight, gradient are variables in Paddle.
   m.def("disable_profiler", platform::DisableProfiler);
   m.def("is_profiler_enabled", platform::IsProfileEnabled);
   m.def("reset_profiler", platform::ResetProfiler);
-  m.def("get_pass", [](const py::bytes &binary_str) {
-    std::string pass_type(binary_str);
+  m.def("get_pass", [](const std::string &pass_type) {
     auto pass = framework::ir::PassRegistry::Instance().Get(pass_type);
     return std::shared_ptr<framework::ir::Pass>(std::move(pass));
   });
@@ -838,10 +837,9 @@ All parameter, weight, gradient are variables in Paddle.
   py::class_<ir::Pass, std::shared_ptr<ir::Pass>> pass(m, "Pass");
   pass.def(py::init())
       .def("has", &ir::Pass::Has)
-      .def("set",
-           [](ir::Pass &self, const std::string &attr_name,
-              const ProgramDesc &attr) {
-             return self.Set(attr_name, new ProgramDesc(attr));
+      .def("set_not_owned",
+           [](ir::Pass &self, const std::string &attr_name, ProgramDesc &attr) {
+             self.SetNotOwned<ProgramDesc>(attr_name, &attr);
           })
       .def(
           "set",
@@ -850,7 +848,6 @@ All parameter, weight, gradient are variables in Paddle.
           })
       .def("set", [](ir::Pass &self, const std::string &name,
                      int val) { self.Set<const int>(name, new int(val)); })
-      .def("get_program", &ir::Pass::Get<ProgramDesc>)
       .def("type", &ir::Pass::Type)
       .def("apply", [](ir::Pass &self, std::shared_ptr<ir::Graph> graph) {
         std::unique_ptr<ir::Graph> origin_graph(graph.get());
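The rename from set to set_not_owned signals an ownership change, not just a spelling one: the old binding copied the ProgramDesc into an attribute the pass owned and deleted, while SetNotOwned stores only a pointer whose lifetime the Python caller must now guarantee. A sketch of the distinction with an illustrative AttrHolder (not Paddle's actual Pass class):

```cpp
// Owned vs. not-owned attributes, in miniature. The owned attribute dies
// with the holder; the not-owned one merely aliases an object the caller
// keeps alive (as the Python-side ProgramDesc must outlive the pass).
#include <iostream>
#include <memory>
#include <string>

struct AttrHolder {
  std::unique_ptr<std::string> owned;  // freed when the holder is destroyed
  std::string* not_owned = nullptr;    // caller retains ownership

  void Set(std::string v) {
    owned = std::make_unique<std::string>(std::move(v));
  }
  void SetNotOwned(std::string* v) { not_owned = v; }
};

int main() {
  std::string program = "ProgramDesc stand-in";
  AttrHolder pass;
  pass.Set("copied attr");     // holder owns its own copy
  pass.SetNotOwned(&program);  // holder only points at the caller's object
  std::cout << *pass.owned << " / " << *pass.not_owned << "\n";
}
```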

@@ -64,6 +64,7 @@ if (WITH_TESTING)
   add_subdirectory(paddle/dataset/tests)
   add_subdirectory(paddle/fluid/tests)
   add_subdirectory(paddle/fluid/contrib/tests)
+  add_subdirectory(paddle/fluid/contrib/slim/tests)
 endif()
 install(DIRECTORY ${PADDLE_PYTHON_PACKAGE_DIR}
         DESTINATION opt/paddle/share/wheels

Some files were not shown because too many files have changed in this diff.