From d384d39a68457c837b46df3260a343041e934637 Mon Sep 17 00:00:00 2001
From: nhzlx
Date: Thu, 19 Jul 2018 03:14:04 +0000
Subject: [PATCH 01/37] Temporarily add code with bug

---
 .../inference/tensorrt/convert/CMakeLists.txt |  5 +-
 .../inference/tensorrt/convert/pool2d_op.cc   | 75 +++++++++++++++++++
 .../tensorrt/convert/test_pool2d_op.cc        | 63 ++++++++++++++++
 3 files changed, 142 insertions(+), 1 deletion(-)
 create mode 100644 paddle/fluid/inference/tensorrt/convert/pool2d_op.cc
 create mode 100644 paddle/fluid/inference/tensorrt/convert/test_pool2d_op.cc

diff --git a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
index 748f5a084e..c841510ae6 100644
--- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
+++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
@@ -1,6 +1,6 @@
 # Add TRT tests
 nv_library(tensorrt_converter
-  SRCS mul_op.cc conv2d_op.cc fc_op.cc
+  SRCS mul_op.cc conv2d_op.cc fc_op.cc pool2d_op.cc
   DEPS tensorrt_engine mul_op)

 nv_test(test_op_converter SRCS test_op_converter.cc DEPS
@@ -13,3 +13,6 @@ nv_test(test_trt_fc_op SRCS test_fc_op.cc fc_op.cc
         DEPS ${FLUID_CORE_MODULES} tensorrt_engine mul_op SERIAL)
 nv_test(test_trt_activation_op SRCS test_activation_op.cc activation_op.cc
         DEPS ${FLUID_CORE_MODULES} tensorrt_engine activation_op SERIAL)
+
+nv_test(test_trt_pool2d_op SRCS test_pool2d_op.cc pool2d_op.cc
+        DEPS ${FLUID_CORE_MODULES} tensorrt_engine pool_op SERIAL)

diff --git a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc
new file mode 100644
index 0000000000..ea282a79d5
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc
@@ -0,0 +1,75 @@
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"

namespace paddle {
namespace inference {
namespace tensorrt {

/*
 * Pool2dOp, IPoolingLayer in TRT. This layer doesn't have weights.
+ */ +class Pool2dOpConverter : public OpConverter { + public: + void operator()(const framework::proto::OpDesc& op, + const framework::Scope& scope, bool test_mode) override { + VLOG(4) + << "convert a fluid pool2d op to tensorrt pool2d layer without bias"; + framework::OpDesc op_desc(op, nullptr); + // Declare inputs + auto* input1 = engine_->GetITensor(op_desc.Input("X")[0]); + std::string pool_type = + boost::get(op_desc.GetAttr("pooling_type")); + std::vector ksize = + boost::get>(op_desc.GetAttr("ksize")); + std::vector strides = + boost::get>(op_desc.GetAttr("strides")); + std::vector paddings = + boost::get>(op_desc.GetAttr("paddings")); + + const nvinfer1::DimsHW nv_ksize(ksize[0], ksize[1]); + const nvinfer1::DimsHW nv_strides(strides[0], strides[1]); + const nvinfer1::DimsHW nv_paddings(paddings[0], paddings[1]); + + nvinfer1::PoolingType pool_t = nvinfer1::PoolingType::kMAX; + if (pool_type == "max") { + pool_t = nvinfer1::PoolingType::kMAX; + } else if (pool_type == "avg") { + pool_t = nvinfer1::PoolingType::kAVERAGE; + } else { + PADDLE_THROW("TensorRT unsupported pooling type!"); + } + + auto* layer = TRT_ENGINE_ADD_LAYER(engine_, Pooling, + *const_cast(input1), + pool_t, nv_ksize); + PADDLE_ENFORCE_NOT_NULL(layer, "pool layer could not be created."); + layer->setStride(nv_strides); + layer->setPadding(nv_paddings); + + auto output_name = op_desc.Output("Out")[0]; + engine_->SetITensor(output_name, layer->getOutput(0)); + if (test_mode) { + engine_->DeclareOutput(output_name); + } + } +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +USE_OP(pool2d); +REGISTER_TRT_OP_CONVERTER(pool2d, Pool2dOpConverter); diff --git a/paddle/fluid/inference/tensorrt/convert/test_pool2d_op.cc b/paddle/fluid/inference/tensorrt/convert/test_pool2d_op.cc new file mode 100644 index 0000000000..261eb2c6ce --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/test_pool2d_op.cc @@ -0,0 +1,63 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +TEST(Pool2dOpConverter, main) { + framework::Scope scope; + std::unordered_set parameters; + TRTConvertValidation validator(10, parameters, scope, 1000); + validator.DeclInputVar("pool2d-X", nvinfer1::Dims4(10, 3, 2, 2)); + validator.DeclOutputVar("pool2d-Out", nvinfer1::Dims4(10, 3, 1, 1)); + + // Prepare Op description + framework::OpDesc desc; + desc.SetType("pool2d"); + desc.SetInput("X", {"pool2d-X"}); + desc.SetOutput("Out", {"pool2d-Out"}); + + std::vector ksize({2, 2}); + std::vector strides({1, 1}); + std::vector paddings({0, 0}); + std::string pooling_t = "max"; + + desc.SetAttr("pooling_type", pooling_t); + desc.SetAttr("ksize", ksize); + desc.SetAttr("strides", strides); + desc.SetAttr("paddings", paddings); + // std::string temp = ""; + // (*desc.Proto()).SerializeToString(&temp); + + // std::cout << temp << std::endl; + // std::ofstream f("__temp__", std::ios::out); + // f << temp; + + LOG(INFO) << "set OP"; + validator.SetOp(*desc.Proto()); + LOG(INFO) << "execute"; + + validator.Execute(10); +} + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +USE_OP(pool2d); From dcaf183daa2629d0032874632a0b90fa78d3d9ff Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Tue, 17 Jul 2018 21:34:00 +0800 Subject: [PATCH 02/37] builder SSA graph at the beginning. --- .../details/multi_devices_graph_builder.cc | 27 ++-- .../framework/details/ssa_graph_builder.cc | 11 ++ paddle/fluid/framework/ir/graph.cc | 133 ++++++++++++++++-- paddle/fluid/framework/ir/graph.h | 3 + paddle/fluid/framework/ir/graph_test.cc | 1 + paddle/fluid/framework/ir/node.h | 1 + 6 files changed, 150 insertions(+), 26 deletions(-) diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.cc b/paddle/fluid/framework/details/multi_devices_graph_builder.cc index f1f8674caf..dc9183e96a 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_builder.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_builder.cc @@ -221,15 +221,15 @@ std::unique_ptr MultiDevSSAGraphBuilder::Apply( // forward, backward nodes. E.g. you can't append an forward node // at the end of the node list. // TODO(panyx0718): FIXME: Needs to sort by forward->backward order. - for (auto &node : nodes) { - if (node->NodeType() != ir::Node::Type::kOperation) continue; + for (ir::Node *node : TopologySortOperationFromInToOut(nodes)) { + VLOG(3) << "apply node: " << node->Name() << reinterpret_cast(node); if (boost::get( node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) == static_cast(OpRole::kRPC)) { - CreateRPCOp(&result, node.get()); - } else if (IsDistTrainOp(node.get(), send_vars, recv_vars)) { - CreateDistTrainOp(&result, node.get()); - } else if (IsScaleLossOp(node.get())) { + CreateRPCOp(&result, node); + } else if (IsDistTrainOp(node, send_vars, recv_vars)) { + CreateDistTrainOp(&result, node); + } else if (IsScaleLossOp(node)) { // user can customize loss@grad if not use_default_grad_scale_ if (strategy_.gradient_scale_ != BuildStrategy::GradientScaleStrategy::kCustomized) { @@ -240,10 +240,11 @@ std::unique_ptr MultiDevSSAGraphBuilder::Apply( // It also assumes backward op will always follow the forward op in // the block. 
       is_forwarding = false;
+      LOG(ERROR) << "forward flipping!!!!!!!";
     } else {
-      int op_dev_id = GetOpDeviceID(node.get());
+      int op_dev_id = GetOpDeviceID(node);
       if (op_dev_id != -1) {  // This op only runs on one specific device.
-        CreateComputationalOp(&result, node.get(), op_dev_id);
+        CreateComputationalOp(&result, node, op_dev_id);
         for (ir::Node *n : node->outputs) {
           var_name_on_devices_.emplace(n->Name(), op_dev_id);
         }
@@ -252,13 +253,11 @@ std::unique_ptr<Graph> MultiDevSSAGraphBuilder::Apply(
         // gradients.
         if (node->Op()->Type() == "read" && strategy_.enable_data_balance_) {
           node->Op()->SetAttr("throw_eof_exp", false);
-          CreateComputationalOps(&result, node.get(), places_.size());
-          // TODO(paddle-dev): builder shouldn't depend on the out logic of
-          // a specific op.
+          CreateComputationalOps(&result, node, places_.size());
           const auto &data_var_names = node->Op()->Output("Out");
           InsertDataBalanceOp(&result, data_var_names);
         } else {
-          CreateComputationalOps(&result, node.get(), places_.size());
+          CreateComputationalOps(&result, node, places_.size());
         }

         if (!is_forwarding && places_.size() > 1) {
@@ -479,8 +478,8 @@ int MultiDevSSAGraphBuilder::GetOpDeviceID(ir::Node *node) const {

   PADDLE_ENFORCE_EQ(param_grad.size(), 2U);
   int dev_id = GetVarDeviceID(param_grad[1]);
-  PADDLE_ENFORCE_NE(dev_id, -1, "dev_id should not be -1.[%s, %s]",
-                    node->Op()->Type(), param_grad[0]);
+  PADDLE_ENFORCE_NE(dev_id, -1, "dev_id should not be -1.[%s, %s, %s]",
+                    node->Op()->Type(), param_grad[0], param_grad[1]);
   return dev_id;
 }

diff --git a/paddle/fluid/framework/details/ssa_graph_builder.cc b/paddle/fluid/framework/details/ssa_graph_builder.cc
index 7bc130ef6e..2be4bb009e 100644
--- a/paddle/fluid/framework/details/ssa_graph_builder.cc
+++ b/paddle/fluid/framework/details/ssa_graph_builder.cc
@@ -37,6 +37,17 @@ void SSAGraphBuilder::PolishGraphToSupportDataHazards(Graph *graph) {
           continue;
         }

+        bool has_dep = false;
+        for (auto read_out : read_op->Outputs()) {
+          for (auto write_in : write_op->Inputs()) {
+            if (read_out == write_in) {
+              has_dep = true;
+              break;
+            }
+          }
+        }
+        if (has_dep) continue;
+
         auto *dep_var = new DummyVarHandle(
             graph->CreateEmptyNode("dummy", ir::Node::Type::kVariable));
         read_op->AddOutput(dep_var);

diff --git a/paddle/fluid/framework/ir/graph.cc b/paddle/fluid/framework/ir/graph.cc
index e4021aa92b..f297461ab2 100644
--- a/paddle/fluid/framework/ir/graph.cc
+++ b/paddle/fluid/framework/ir/graph.cc
@@ -12,14 +12,35 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

+#include <algorithm>
+#include <unordered_set>
+
 #include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/op_proto_maker.h"
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/framework/var_desc.h"

 namespace paddle {
 namespace framework {
+namespace {
+void SortHelper(
+    const std::map<ir::Node *, std::unordered_set<ir::Node *>> &adj_list,
+    ir::Node *node, std::unordered_set<ir::Node *> *visited,
+    std::vector<ir::Node *> *ret) {
+  visited->insert(node);
+
+  for (auto adj : adj_list.at(node)) {
+    if (visited->find(adj) == visited->end()) {
+      SortHelper(adj_list, adj, visited, ret);
+    }
+  }
+
+  VLOG(3) << "topology sort insert: " << node->Name()
+          << reinterpret_cast<void *>(node) << " input " << node->inputs.size();
+  ret->push_back(node);
+}
+}  // namespace

-// NOTE(paddle-dev): This graph contains circle.
 Graph::Graph(const ProgramDesc &program) : program_(program) {
   VLOG(3) << "block in program:" << program_.Size();
   std::unordered_map<std::string, VarDesc *> all_vars;
   for (auto *var : program.Block(0).AllVars()) {
     all_vars.emplace(var->Name(), var);
   }

-  std::map<std::string, ir::Node *> var_nodes;
+  ir::Node *last_backward = nullptr;
+  std::vector<ir::Node *> optimize_ops;
+  std::map<std::string, std::vector<ir::Node *>> var_nodes;
   for (auto *op : program.Block(0).AllOps()) {
     ir::Node *node = CreateOpNode(op);
+    if (boost::get<int>(
+            op->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) ==
+        static_cast<int>(OpRole::kBackward)) {
+      last_backward = node;
+    } else if (boost::get<int>(
+                   op->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) ==
+               static_cast<int>(OpRole::kOptimize)) {
+      optimize_ops.push_back(node);
+    }

     for (auto &each_var_name : op->InputArgumentNames()) {
       ir::Node *var = nullptr;
       if (var_nodes.find(each_var_name) != var_nodes.end()) {
-        var = var_nodes.at(each_var_name);
+        var = var_nodes.at(each_var_name).back();
       } else if (all_vars.count(each_var_name) != 0) {
         var = CreateVarNode(all_vars.at(each_var_name));
-        var_nodes[each_var_name] = var;
+        var_nodes[each_var_name].push_back(var);
       } else {
         // TODO(paddle-dev): Seems some assumption doesn't hold?
         VLOG(3) << op->Type()
                 << " input var not in all_var list: " << each_var_name;
         var = CreateEmptyNode(each_var_name, ir::Node::Type::kVariable);
-        var_nodes[each_var_name] = var;
+        var_nodes[each_var_name].push_back(var);
       }
       node->inputs.push_back(var);
       var->outputs.push_back(node);
     }

     for (auto &each_var_name : op->OutputArgumentNames()) {
-      ir::Node *var = nullptr;
-      if (var_nodes.find(each_var_name) != var_nodes.end()) {
-        var = var_nodes.at(each_var_name);
-      } else {
-        var = CreateVarNode(all_vars.at(each_var_name));
-        var_nodes[each_var_name] = var;
-      }
+      ir::Node *var = CreateVarNode(all_vars.at(each_var_name));
+      var_nodes[each_var_name].push_back(var);
       node->outputs.push_back(var);
       var->inputs.push_back(node);
     }
   }
+
+  for (auto &var : var_nodes) {
+    auto &versions = var.second;
+    if (versions.size() <= 1) continue;
+
+    auto it_new = versions.rbegin();
+    auto it_old = versions.rbegin();
+    ++it_old;
+    for (; it_old != versions.rend(); it_new = it_old, ++it_old) {
+      ir::Node *write_op =
+          (*it_new)->inputs.empty() ? nullptr : (*it_new)->inputs[0];
+      const auto &read_ops = (*it_old)->outputs;
+
+      for (auto *read_op : read_ops) {
+        // Manually add a dependency var from read_op to write_op;
+        if (read_op == write_op) {
+          // Read Write is the same op.
+          continue;
+        }
+        ir::Node *dep_var = CreateEmptyNode("dummy", ir::Node::Type::kVariable);
+        read_op->outputs.push_back(dep_var);
+        dep_var->inputs.push_back(read_op);
+        write_op->inputs.push_back(dep_var);
+        dep_var->outputs.push_back(write_op);
+      }
+    }
+  }
+
+  if (last_backward) {
+    for (ir::Node *opt_node : optimize_ops) {
+      ir::Node *dep_var = CreateEmptyNode("dummy", ir::Node::Type::kVariable);
+      last_backward->outputs.push_back(dep_var);
+      dep_var->inputs.push_back(last_backward);
+      opt_node->inputs.push_back(dep_var);
+      dep_var->outputs.push_back(opt_node);
+      VLOG(3) << "appending connect: " << last_backward->Name()
+              << reinterpret_cast<void *>(last_backward) << "->"
+              << opt_node->Name() << reinterpret_cast<void *>(opt_node);
+    }
+  }
+}
+
+std::vector<ir::Node *> TopologySortOperationFromInToOut(
+    const std::vector<std::unique_ptr<ir::Node>> &nodes) {
+  std::map<ir::Node *, std::unordered_set<ir::Node *>> adj_list;
+  std::unordered_set<ir::Node *> visited;
+  std::vector<ir::Node *> ret;
+
+  for (auto &n : nodes) {
+    if (n->NodeType() != ir::Node::Type::kOperation) continue;
+    if (adj_list.find(n.get()) == adj_list.end()) {
+      adj_list[n.get()] = std::unordered_set<ir::Node *>();
+    }
+    for (auto &var : n->inputs) {
+      for (auto &adj_n : var->inputs) {
+        PADDLE_ENFORCE(adj_n->NodeType() == ir::Node::Type::kOperation);
+        adj_list[n.get()].insert(adj_n);
+        LOG(ERROR) << "adj " << adj_n->Name() << reinterpret_cast<void *>(adj_n)
+                   << " -> " << n->Name() << reinterpret_cast<void *>(n.get())
+                   << " via " << var->Name() << reinterpret_cast<void *>(var);
+      }
+    }
+  }
+
+  for (auto adj : adj_list) {
+    if (visited.find(adj.first) == visited.end()) {
+      SortHelper(adj_list, adj.first, &visited, &ret);
+    }
+  }
+
+  for (ir::Node *n : ret) {
+    std::unordered_set<ir::Node *> dummy;
+    n->inputs.erase(
+        std::remove_if(n->inputs.begin(), n->inputs.end(),
+                       [n](ir::Node *t) { return t->Name() == "dummy"; }),
+        n->inputs.end());
+    n->outputs.erase(
+        std::remove_if(n->outputs.begin(), n->outputs.end(),
+                       [n](ir::Node *t) { return t->Name() == "dummy"; }),
+        n->outputs.end());
+  }
+  return ret;
 }
+
 }  // namespace framework
 }  // namespace paddle

diff --git a/paddle/fluid/framework/ir/graph.h b/paddle/fluid/framework/ir/graph.h
index b4ac135b02..0242edecf4 100644
--- a/paddle/fluid/framework/ir/graph.h
+++ b/paddle/fluid/framework/ir/graph.h
@@ -78,5 +78,8 @@ class Graph {
   std::map<std::string, std::function<void(void)>> attr_dels_;
 };

+std::vector<ir::Node *> TopologySortOperationFromInToOut(
+    const std::vector<std::unique_ptr<ir::Node>>& nodes);
+
 }  // namespace framework
 }  // namespace paddle

diff --git a/paddle/fluid/framework/ir/graph_test.cc b/paddle/fluid/framework/ir/graph_test.cc
index 4e23bf124f..186047b370 100644
--- a/paddle/fluid/framework/ir/graph_test.cc
+++ b/paddle/fluid/framework/ir/graph_test.cc
@@ -76,6 +76,7 @@ TEST(GraphTest, Basic) {
   op->SetType("sum");
   op->SetInput("X", {"test_a", "test_b", "test_c"});
   op->SetOutput("Out", {"test_out"});
+  op->SetAttr("op_role", 1);

   prog.MutableBlock(0)->Var("test_a")->SetType(proto::VarType::SELECTED_ROWS);
   prog.MutableBlock(0)->Var("test_b")->SetType(proto::VarType::SELECTED_ROWS);

diff --git a/paddle/fluid/framework/ir/node.h b/paddle/fluid/framework/ir/node.h
index b98c29b81d..97b64a6017 100644
--- a/paddle/fluid/framework/ir/node.h
+++ b/paddle/fluid/framework/ir/node.h
@@ -50,6 +50,7 @@ class Node {
     PADDLE_ENFORCE(type_ == Type::kVariable);
     return var_desc_;
   }
+
   OpDesc* Op() {
     PADDLE_ENFORCE(type_ == Type::kOperation);
     return op_desc_;

From 0b3465d215f9639bffc79f2336d623acca08ba6d Mon Sep 17 00:00:00 2001
From: Xin Pan
Date: Fri, 20 Jul 2018 10:37:04 +0800
Subject: [PATCH 03/37] better

---
 paddle/fluid/framework/details/CMakeLists.txt |   2 +-
 .../details/multi_devices_graph_builder.cc    |  55 ++++++--
 paddle/fluid/framework/ir/CMakeLists.txt      |   2 +-
 paddle/fluid/framework/ir/graph.cc            | 136 +++++++++++++-----
 paddle/fluid/framework/ir/graph.h             |   3 -
 paddle/fluid/framework/ir/graph_helper.cc     | 116 +++++++++++++++
 paddle/fluid/framework/ir/graph_helper.h      |  35 +++++
 paddle/fluid/framework/ir/node.cc             |   6 +-
 paddle/fluid/framework/ir/node.h              |   2 +
 9 files changed, 308 insertions(+), 49 deletions(-)
 create mode 100644 paddle/fluid/framework/ir/graph_helper.cc
 create mode 100644 paddle/fluid/framework/ir/graph_helper.h

diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt
index df55b3d054..620d202d33 100644
--- a/paddle/fluid/framework/details/CMakeLists.txt
+++ b/paddle/fluid/framework/details/CMakeLists.txt
@@ -5,7 +5,7 @@ cc_library(fetch_op_handle SRCS fetch_op_handle.cc DEPS op_handle_base scope lod
 cc_library(computation_op_handle SRCS computation_op_handle.cc DEPS framework_proto scope place operator op_registry)
 cc_library(rpc_op_handle SRCS rpc_op_handle.cc DEPS framework_proto scope place operator op_registry)

-cc_library(ssa_graph_builder SRCS ssa_graph_builder.cc DEPS graph)
+cc_library(ssa_graph_builder SRCS ssa_graph_builder.cc DEPS graph graph_helper)
 cc_library(ssa_graph_printer SRCS ssa_graph_printer.cc DEPS ssa_graph_builder)
 cc_library(ssa_graph_checker SRCS ssa_graph_checker.cc DEPS ssa_graph_builder)

diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.cc b/paddle/fluid/framework/details/multi_devices_graph_builder.cc
index dc9183e96a..4cc6e5727b 100644
--- a/paddle/fluid/framework/details/multi_devices_graph_builder.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_builder.cc
@@ -25,6 +25,7 @@
 #include "paddle/fluid/framework/details/reduce_op_handle.h"
 #include "paddle/fluid/framework/details/rpc_op_handle.h"
 #include "paddle/fluid/framework/details/scale_loss_grad_op_handle.h"
+#include "paddle/fluid/framework/ir/graph_helper.h"
 #include "paddle/fluid/framework/ir/node.h"
 #include "paddle/fluid/framework/op_info.h"
 #include "paddle/fluid/framework/scope.h"
@@ -186,9 +187,55 @@ size_t MultiDevSSAGraphBuilder::GetAppropriateDeviceID(
   return dev_id;
 }

+// Topology sort the graph nodes from inputs to outputs.
+// Since SSAGraphBuilder depends on forward/backward nodes to assign devices
+// to parameter/gradients before optimizer ops, topo sort is insufficient
+// (some optimizer ops might not depend on any nodes), so we manually move all
+// optimizer nodes after the last backward nodes.
+std::vector<ir::Node *> SortOpsAndDelayOptimizeOp(const Graph &graph) {
+  std::vector<ir::Node *> ret = ir::TopologySort(graph);
+  size_t last_backward = 0;
+  std::vector<ir::Node *> optimize_ops;
+  std::vector<ir::Node *> sorted_ret;
+  for (size_t i = 0; i < ret.size(); ++i) {
+    if (boost::get<int>(
+            ret[i]->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) ==
+        static_cast<int>(OpRole::kBackward)) {
+      sorted_ret.push_back(ret[i]);
+      last_backward = sorted_ret.size();
+    } else if (boost::get<int>(ret[i]->Op()->GetAttr(
+                   OpProtoAndCheckerMaker::OpRoleAttrName())) ==
+               static_cast<int>(OpRole::kOptimize)) {
+      optimize_ops.push_back(ret[i]);
+    } else {
+      sorted_ret.push_back(ret[i]);
+    }
+  }
+
+  sorted_ret.insert(sorted_ret.begin() + last_backward, optimize_ops.begin(),
+                    optimize_ops.end());
+
+  for (ir::Node *n : sorted_ret) {
+    n->inputs.erase(std::remove_if(n->inputs.begin(), n->inputs.end(),
+                                   [n](ir::Node *t) {
+                                     return t->Name() ==
+                                            ir::Node::kControlDepVarName;
+                                   }),
+                    n->inputs.end());
+    n->outputs.erase(std::remove_if(n->outputs.begin(), n->outputs.end(),
+                                    [n](ir::Node *t) {
+                                      return t->Name() ==
+                                             ir::Node::kControlDepVarName;
+                                    }),
+                     n->outputs.end());
+  }
+  return sorted_ret;
+}
+
 std::unique_ptr<Graph> MultiDevSSAGraphBuilder::Apply(
     std::unique_ptr<Graph> graph) const {
   // Rebuild the graph structure.
+  std::vector<ir::Node *> sorted_ops = SortOpsAndDelayOptimizeOp(*graph);
   auto nodes = std::move(graph->nodes);
   graph->nodes.clear();

@@ -217,12 +264,7 @@ std::unique_ptr<Graph> MultiDevSSAGraphBuilder::Apply(
   size_t cur_device_id = 0;
   bool is_forwarding = true;

-  // NOTE: Currently, passes before SSAGraphBuilder cannot reorder
-  // forward, backward nodes. E.g. you can't append an forward node
-  // at the end of the node list.
-  // TODO(panyx0718): FIXME: Needs to sort by forward->backward order.
-  for (ir::Node *node : TopologySortOperationFromInToOut(nodes)) {
-    VLOG(3) << "apply node: " << node->Name() << reinterpret_cast<void *>(node);
+  for (ir::Node *node : sorted_ops) {
     if (boost::get<int>(
             node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) ==
         static_cast<int>(OpRole::kRPC)) {
@@ -240,10 +282,6 @@ std::unique_ptr<Graph> MultiDevSSAGraphBuilder::Apply(
       // It also assumes backward op will always follow the forward op in
       // the block.
       is_forwarding = false;
-      LOG(ERROR) << "forward flipping!!!!!!!";
     } else {

diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt
index ee0604383e..744696ebb0 100644
--- a/paddle/fluid/framework/ir/CMakeLists.txt
+++ b/paddle/fluid/framework/ir/CMakeLists.txt
@@ -1,5 +1,5 @@
 cc_library(node SRCS node.cc DEPS proto_desc)
 cc_library(graph SRCS graph.cc DEPS node)
+cc_library(graph_helper SRCS graph_helper.cc DEPS graph)
 cc_library(pass SRCS pass.cc DEPS graph node)
-
 cc_test(graph_test SRCS graph_test.cc DEPS graph proto_desc op_registry)

diff --git a/paddle/fluid/framework/ir/graph.cc b/paddle/fluid/framework/ir/graph.cc
index f297461ab2..46640fedcc 100644
--- a/paddle/fluid/framework/ir/graph.cc
+++ b/paddle/fluid/framework/ir/graph.cc
@@ -22,6 +22,7 @@ limitations under the License. */

 namespace paddle {
 namespace framework {
+/*
 namespace {
 void SortHelper(
     const std::map<ir::Node *, std::unordered_set<ir::Node *>> &adj_list,
@@ -41,7 +42,21 @@ void SortHelper(
           << reinterpret_cast<void *>(node) << " input " << node->inputs.size();
   ret->push_back(node);
 }
+
+std::vector<ir::Node *> TopologySort(
+    const std::map<ir::Node *, std::unordered_set<ir::Node *>> &adj_list) {
+  std::unordered_set<ir::Node *> visited;
+  std::vector<ir::Node *> ret;
+
+  for (auto adj : adj_list) {
+    if (visited.find(adj.first) == visited.end()) {
+      SortHelper(adj_list, adj.first, &visited, &ret);
+    }
+  }
+  return ret;
+}
 }  // namespace
+*/

 Graph::Graph(const ProgramDesc &program) : program_(program) {
   VLOG(3) << "block in program:" << program_.Size();
@@ -48,20 +63,9 @@ Graph::Graph(const ProgramDesc &program) : program_(program) {
     all_vars.emplace(var->Name(), var);
   }

-  ir::Node *last_backward = nullptr;
-  std::vector<ir::Node *> optimize_ops;
   std::map<std::string, std::vector<ir::Node *>> var_nodes;
   for (auto *op : program.Block(0).AllOps()) {
     ir::Node *node = CreateOpNode(op);
-    if (boost::get<int>(
-            op->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) ==
-        static_cast<int>(OpRole::kBackward)) {
-      last_backward = node;
-    } else if (boost::get<int>(
-                   op->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) ==
-               static_cast<int>(OpRole::kOptimize)) {
-      optimize_ops.push_back(node);
-    }

     for (auto &each_var_name : op->InputArgumentNames()) {
       ir::Node *var = nullptr;
       if (var_nodes.find(each_var_name) != var_nodes.end()) {
@@ -106,7 +110,8 @@ Graph::Graph(const ProgramDesc &program) : program_(program) {
           // Read Write is the same op.
           continue;
         }
-        ir::Node *dep_var = CreateEmptyNode("dummy", ir::Node::Type::kVariable);
+        ir::Node *dep_var = CreateEmptyNode(ir::Node::kControlDepVarName,
+                                            ir::Node::Type::kVariable);
         read_op->outputs.push_back(dep_var);
         dep_var->inputs.push_back(read_op);
         write_op->inputs.push_back(dep_var);
@@ -114,62 +119,121 @@ Graph::Graph(const ProgramDesc &program) : program_(program) {
     }
   }
+}

-  if (last_backward) {
-    for (ir::Node *opt_node : optimize_ops) {
-      ir::Node *dep_var = CreateEmptyNode("dummy", ir::Node::Type::kVariable);
-      last_backward->outputs.push_back(dep_var);
-      dep_var->inputs.push_back(last_backward);
-      opt_node->inputs.push_back(dep_var);
-      dep_var->outputs.push_back(opt_node);
-      VLOG(3) << "appending connect: " << last_backward->Name()
-              << reinterpret_cast<void *>(last_backward) << "->"
-              << opt_node->Name() << reinterpret_cast<void *>(opt_node);
-    }
-  }
-}
+/*
+bool HasCircleHelper(ir::Node* node,
+                     const std::map<ir::Node *, std::unordered_set<ir::Node *>>
+&adj_list,
+                     std::unordered_set<ir::Node *>* visited,
+                     std::unordered_set<ir::Node *>* in_trace) {
+  if (visited->find(node) == visited->end()) {
+    visited->insert(node);
+    in_trace->insert(node);
+
+    for (ir::Node *in : adj_list.at(node)) {
+      if (visited->find(in) == visited->end() &&
+          HasCircleHelper(in, adj_list, visited, in_trace)) {
+        return true;
+      } else if (in_trace->find(in) != in_trace->end()) {
+        return true;
+      }
+    }
+  }
+  in_trace->erase(node);
+  return false;
+}
+
+bool HasCircle(const std::map<ir::Node *, std::unordered_set<ir::Node *>>
+&adj_list) {
+  std::unordered_set<ir::Node *> visited;
+  std::unordered_set<ir::Node *> in_trace;
+  for (auto& adj : adj_list) {
+    if (HasCircleHelper(adj.first, adj_list, &visited, &in_trace)) {
+      return true;
+    }
+  }
+  return false;
+}
+
+std::map<ir::Node *, std::unordered_set<ir::Node *>> BuildAdjList(
+    const std::vector<ir::Node *> &nodes) {
+  std::map<ir::Node *, std::unordered_set<ir::Node *>> adj_list;
+
+  for (auto &n : nodes) {
+    if (n->NodeType() != ir::Node::Type::kOperation) continue;
+    if (adj_list.find(n) == adj_list.end()) {
+      adj_list[n] = std::unordered_set<ir::Node *>();
+    }
+    for (auto &var : n->inputs) {
+      for (auto &adj_n : var->inputs) {
         PADDLE_ENFORCE(adj_n->NodeType() == ir::Node::Type::kOperation);
-        adj_list[n.get()].insert(adj_n);
+        adj_list[n].insert(adj_n);
         LOG(ERROR) << "adj " << adj_n->Name() << reinterpret_cast<void *>(adj_n)
-                   << " -> " << n->Name() << reinterpret_cast<void *>(n.get())
+                   << " -> " << n->Name() << reinterpret_cast<void *>(n)
                    << " via " << var->Name() << reinterpret_cast<void *>(var);
       }
     }
   }
+  return adj_list;
+}

+std::vector<ir::Node *> TopologySortOperationFromInToOut(
+    const std::vector<std::unique_ptr<ir::Node>> &nodes) {
+  std::vector<ir::Node *> tmp;
+  for (auto& n : nodes) {
+    tmp.push_back(n.get());
+  }
+  std::map<ir::Node *, std::unordered_set<ir::Node *>> adj_list =
+BuildAdjList(tmp);
+
+  PADDLE_ENFORCE(!HasCircle(adj_list));
+  std::vector<ir::Node *> ret = TopologySort(adj_list);
+
+  ir::Node *last_backward = nullptr;
+  std::vector<ir::Node *> optimize_ops;
+  for (ir::Node* n : ret) {
+    if (boost::get<int>(
+            n->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) ==
+        static_cast<int>(OpRole::kBackward)) {
+      last_backward = n;
+    } else if (boost::get<int>(
+                   n->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) ==
+               static_cast<int>(OpRole::kOptimize)) {
+      optimize_ops.push_back(n);
+    }
+  }
+
+  if (last_backward) {
+    for (ir::Node *opt_node : optimize_ops) {
+      ir::Node *dep_var = CreateEmptyNode(ir::Node::kControlDepVarName,
+                                          ir::Node::Type::kVariable);
+      last_backward->outputs.push_back(dep_var);
+      dep_var->inputs.push_back(last_backward);
+      opt_node->inputs.push_back(dep_var);
+      dep_var->outputs.push_back(opt_node);
+      VLOG(3) << "appending connect: " << last_backward->Name()
+              << reinterpret_cast<void *>(last_backward) << "->"
+              << opt_node->Name() << reinterpret_cast<void *>(opt_node);
+    }
+  }
+
+  PADDLE_ENFORCE(!HasCircle(adj_list));
   for (ir::Node *n : ret) {
     std::unordered_set<ir::Node *> dummy;
     n->inputs.erase(
         std::remove_if(n->inputs.begin(), n->inputs.end(),
-                       [n](ir::Node *t) { return t->Name() == "dummy"; }),
+                       [n](ir::Node *t) {
+                         return t->Name() == ir::Node::kControlDepVarName; }),
         n->inputs.end());
     n->outputs.erase(
         std::remove_if(n->outputs.begin(), n->outputs.end(),
-                       [n](ir::Node *t) { return t->Name() == "dummy"; }),
+                       [n](ir::Node *t) {
+                         return t->Name() == ir::Node::kControlDepVarName; }),
         n->outputs.end());
   }
   return ret;
-}
+}*/

 }  // namespace framework
 }  // namespace paddle

diff --git a/paddle/fluid/framework/ir/graph.h b/paddle/fluid/framework/ir/graph.h
index 0242edecf4..b4ac135b02 100644
--- a/paddle/fluid/framework/ir/graph.h
+++ b/paddle/fluid/framework/ir/graph.h
@@ -78,8 +78,5 @@ class Graph {
   std::map<std::string, std::function<void(void)>> attr_dels_;
 };

-std::vector<ir::Node *> TopologySortOperationFromInToOut(
-    const std::vector<std::unique_ptr<ir::Node>>& nodes);
-
 }  // namespace framework
 }  // namespace paddle

diff --git a/paddle/fluid/framework/ir/graph_helper.cc b/paddle/fluid/framework/ir/graph_helper.cc
new file mode 100644
index 0000000000..ecd90f4f3e
--- /dev/null
+++ b/paddle/fluid/framework/ir/graph_helper.cc
@@ -0,0 +1,116 @@
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include <algorithm>
#include <unordered_set>

#include "paddle/fluid/framework/ir/graph_helper.h"

namespace paddle {
namespace framework {
namespace ir {
namespace {
void SortHelper(
    const std::map<ir::Node *, std::unordered_set<ir::Node *>> &adj_list,
    ir::Node *node, std::unordered_set<ir::Node *> *visited,
    std::vector<ir::Node *> *ret) {
  visited->insert(node);

  for (auto adj : adj_list.at(node)) {
    if (visited->find(adj) == visited->end()) {
      SortHelper(adj_list, adj, visited, ret);
    }
  }

  LOG(ERROR) << "topology sort insert: " << node->Name()
             << reinterpret_cast<void *>(node) << " input "
             << node->inputs.size();
  ret->push_back(node);
}

bool HasCircleHelper(
    ir::Node *node,
    const std::map<ir::Node *, std::unordered_set<ir::Node *>> &adj_list,
    std::unordered_set<ir::Node *> *visited,
    std::unordered_set<ir::Node *> *in_trace) {
  if (visited->find(node) == visited->end()) {
    visited->insert(node);
    in_trace->insert(node);

    for (ir::Node *in : adj_list.at(node)) {
      if (visited->find(in) == visited->end() &&
          HasCircleHelper(in, adj_list, visited, in_trace)) {
        return true;
      } else if (in_trace->find(in) != in_trace->end()) {
        return true;
      }
    }
  }
  in_trace->erase(node);
  return false;
}
}  // namespace

bool HasCircle(const Graph &graph) {
  std::map<ir::Node *, std::unordered_set<ir::Node *>> adj_list =
      BuildAdjList(graph);

  std::unordered_set<ir::Node *> visited;
  std::unordered_set<ir::Node *> in_trace;
  for (auto &adj : adj_list) {
    if (HasCircleHelper(adj.first, adj_list, &visited, &in_trace)) {
      return true;
    }
  }
  return false;
}

std::vector<ir::Node *> TopologySort(const Graph &graph) {
  std::map<ir::Node *, std::unordered_set<ir::Node *>> adj_list =
      BuildAdjList(graph);
  std::unordered_set<ir::Node *> visited;
  std::vector<ir::Node *> ret;
  for (auto adj : adj_list) {
    if (visited.find(adj.first) == visited.end()) {
      SortHelper(adj_list, adj.first, &visited, &ret);
    }
  }
  return ret;
}

std::map<ir::Node *, std::unordered_set<ir::Node *>> BuildAdjList(
    const Graph &graph) {
  std::map<ir::Node *, std::unordered_set<ir::Node *>> adj_list;

  for (auto &n : graph.nodes) {
    if (n->NodeType() != ir::Node::Type::kOperation) continue;
    if (adj_list.find(n.get()) == adj_list.end()) {
      adj_list[n.get()] = std::unordered_set<ir::Node *>();
    }
    for (auto &var : n->inputs) {
      for (auto &adj_n : var->inputs) {
        PADDLE_ENFORCE(adj_n->NodeType() == ir::Node::Type::kOperation);
        adj_list[n.get()].insert(adj_n);
        LOG(ERROR) << "adj " << adj_n->Name() << reinterpret_cast<void *>(adj_n)
                   << " -> " << n->Name() << reinterpret_cast<void *>(n.get())
                   << " via " << var->Name() << reinterpret_cast<void *>(var);
      }
    }
  }
  return adj_list;
}

}  // namespace ir
}  // namespace framework
}  // namespace paddle

diff --git a/paddle/fluid/framework/ir/graph_helper.h b/paddle/fluid/framework/ir/graph_helper.h
new file mode 100644
index 0000000000..b8714eb5be
--- /dev/null
+++ b/paddle/fluid/framework/ir/graph_helper.h
@@ -0,0 +1,35 @@
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once

#include <map>
#include <unordered_set>
#include <vector>

#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/node.h"

namespace paddle {
namespace framework {
namespace ir {
bool HasCircle(const Graph &graph);

std::vector<ir::Node *> TopologySort(const Graph &graph);

std::map<ir::Node *, std::unordered_set<ir::Node *>> BuildAdjList(
    const Graph &graph);
}  // namespace ir
}  // namespace framework
}  // namespace paddle

diff --git a/paddle/fluid/framework/ir/node.cc b/paddle/fluid/framework/ir/node.cc
index 86376e7e8b..aca77da8d6 100644
--- a/paddle/fluid/framework/ir/node.cc
+++ b/paddle/fluid/framework/ir/node.cc
@@ -15,5 +15,9 @@ limitations under the License. */
 #include "paddle/fluid/framework/ir/node.h"

 namespace paddle {
-namespace framework {}  // namespace framework
+namespace framework {
+namespace ir {
+const char Node::kControlDepVarName[] = "__control_var";
+}  // namespace ir
+}  // namespace framework
 }  // namespace paddle

diff --git a/paddle/fluid/framework/ir/node.h b/paddle/fluid/framework/ir/node.h
index 97b64a6017..b3138fccee 100644
--- a/paddle/fluid/framework/ir/node.h
+++ b/paddle/fluid/framework/ir/node.h
@@ -27,6 +27,8 @@ namespace ir {
 class Node {
  public:
   enum class Type { kOperation, kVariable };
+  static const char kControlDepVarName[];
+
   explicit Node(const std::string& name, Type type)
       : name_(name), var_desc_(nullptr), op_desc_(nullptr), type_(type) {}

From c3f6e0e8a2f2185559cff1a0e79c50431d41ae2f Mon Sep 17 00:00:00 2001
From: Xin Pan
Date: Fri, 20 Jul 2018 12:20:03 +0800
Subject: [PATCH 04/37] add namespace to Graph

---
 .../details/multi_devices_graph_builder.cc    | 35 ++++++----
 .../details/multi_devices_graph_builder.h     | 27 +++++++-------
 .../framework/details/ssa_graph_builder.cc    |  8 ++---
 .../framework/details/ssa_graph_builder.h     |  8 ++---
 .../framework/details/ssa_graph_checker.cc    |  2 +-
 .../framework/details/ssa_graph_checker.h     |  5 +--
 .../framework/details/ssa_graph_printer.cc    |  4 +--
 .../framework/details/ssa_graph_printer.h     |  7 ++--
 .../details/threaded_ssa_graph_executor.cc    |  3 +-
 .../details/threaded_ssa_graph_executor.h     |  4 +--
 paddle/fluid/framework/ir/graph.cc            | 12 ++++---
 paddle/fluid/framework/ir/graph.h             | 22 ++++++------
 paddle/fluid/framework/ir/graph_helper.cc     |  8 ++---
 paddle/fluid/framework/ir/graph_helper.h      |  4 +--
 paddle/fluid/framework/ir/graph_test.cc       |  2 +-
 paddle/fluid/framework/parallel_executor.cc   |  2 +-
 16 files changed, 82 insertions(+), 71 deletions(-)

diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.cc b/paddle/fluid/framework/details/multi_devices_graph_builder.cc
index 4cc6e5727b..4050424e70 100644
--- a/paddle/fluid/framework/details/multi_devices_graph_builder.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_builder.cc
@@ -68,7 +68,8 @@ MultiDevSSAGraphBuilder::MultiDevSSAGraphBuilder(
   }
 }

-void MultiDevSSAGraphBuilder::CreateOpHandleIOs(Graph *result, ir::Node *node,
+void MultiDevSSAGraphBuilder::CreateOpHandleIOs(ir::Graph *result,
+                                                ir::Node *node,
                                                 size_t place_id) const {
   auto p = places_[place_id];
   auto *op_handle = result->Get<GraphOps>("ops").back().get();
@@ -192,8 +193,9 @@ size_t MultiDevSSAGraphBuilder::GetAppropriateDeviceID(
 // Topology sort the graph nodes from inputs to outputs.
 // Since SSAGraphBuilder depends on forward/backward nodes to assign devices
 // to parameter/gradients before optimizer ops, topo sort is insufficient
 // (some optimizer ops might not depend on any nodes), so we manually move all
 // optimizer nodes after the last backward nodes.
-std::vector SortOpsAndDelayOptimizeOp(const Graph &graph) { - std::vector ret = ir::TopologySort(graph); +// However, the assumption by SSAGraphBuilder should be relaxed in the future. +std::vector SortOpsAndDelayOptimizeOp(const ir::Graph &graph) { + std::vector ret = ir::TopologySortOperations(graph); size_t last_backward = 0; std::vector optimize_ops; std::vector sorted_ret; @@ -232,8 +234,8 @@ std::vector SortOpsAndDelayOptimizeOp(const Graph &graph) { return sorted_ret; } -std::unique_ptr MultiDevSSAGraphBuilder::Apply( - std::unique_ptr graph) const { +std::unique_ptr MultiDevSSAGraphBuilder::Apply( + std::unique_ptr graph) const { // Rebuild the graph structure. std::vector sorted_ops = SortOpsAndDelayOptimizeOp(*graph); auto nodes = std::move(graph->nodes); @@ -245,7 +247,7 @@ std::unique_ptr MultiDevSSAGraphBuilder::Apply( } } - Graph &result = *graph; + ir::Graph &result = *graph; std::unordered_set og_has_been_broadcast; // We cannot invoke resize. It is a bug of GCC 4.8 @@ -397,7 +399,7 @@ void MultiDevSSAGraphBuilder::SetCommunicationContext( #endif } -void MultiDevSSAGraphBuilder::CreateBroadcastOp(Graph *result, +void MultiDevSSAGraphBuilder::CreateBroadcastOp(ir::Graph *result, const std::string &p_name, size_t src_dev_id) const { #ifdef PADDLE_WITH_CUDA @@ -427,7 +429,7 @@ void MultiDevSSAGraphBuilder::CreateBroadcastOp(Graph *result, } } -void MultiDevSSAGraphBuilder::CreateComputationalOp(Graph *result, +void MultiDevSSAGraphBuilder::CreateComputationalOp(ir::Graph *result, ir::Node *node, int dev_id) const { result->Get("ops").emplace_back( @@ -436,7 +438,7 @@ void MultiDevSSAGraphBuilder::CreateComputationalOp(Graph *result, CreateOpHandleIOs(result, node, dev_id); } -void MultiDevSSAGraphBuilder::InsertAllReduceOp(Graph *result, +void MultiDevSSAGraphBuilder::InsertAllReduceOp(ir::Graph *result, const std::string &og) const { #ifdef PADDLE_WITH_CUDA result->Get("ops").emplace_back(new AllReduceOpHandle( @@ -466,7 +468,7 @@ void MultiDevSSAGraphBuilder::InsertAllReduceOp(Graph *result, } void MultiDevSSAGraphBuilder::InsertDataBalanceOp( - Graph *result, const std::vector &datas) const { + ir::Graph *result, const std::vector &datas) const { #ifdef PADDLE_WITH_CUDA result->Get("ops").emplace_back(new DataBalanceOpHandle( result->CreateEmptyNode("data_balance", ir::Node::Type::kOperation), @@ -529,7 +531,7 @@ int MultiDevSSAGraphBuilder::GetVarDeviceID(const std::string &varname) const { return got == var_name_on_devices_.end() ? 
-1 : got->second; } -void MultiDevSSAGraphBuilder::CreateScaleLossGradOp(Graph *result) const { +void MultiDevSSAGraphBuilder::CreateScaleLossGradOp(ir::Graph *result) const { for (size_t i = 0; i < places_.size(); ++i) { // Insert ScaleCost OpHandle #ifdef PADDLE_WITH_CUDA @@ -559,7 +561,7 @@ void MultiDevSSAGraphBuilder::CreateScaleLossGradOp(Graph *result) const { } } -void MultiDevSSAGraphBuilder::CreateComputationalOps(Graph *result, +void MultiDevSSAGraphBuilder::CreateComputationalOps(ir::Graph *result, ir::Node *node, size_t num_places) const { for (size_t scope_idx = 0; scope_idx < num_places; ++scope_idx) { @@ -571,7 +573,7 @@ void MultiDevSSAGraphBuilder::CreateComputationalOps(Graph *result, } } -VarHandle *MultiDevSSAGraphBuilder::CreateReduceOp(Graph *result, +VarHandle *MultiDevSSAGraphBuilder::CreateReduceOp(ir::Graph *result, const std::string &og, int dst_dev_id) const { #ifdef PADDLE_WITH_CUDA @@ -604,7 +606,7 @@ VarHandle *MultiDevSSAGraphBuilder::CreateReduceOp(Graph *result, // Find the first occurence of `prev_op_name` and make current `op` depend // on it. -void MultiDevSSAGraphBuilder::ConnectOp(Graph *result, OpHandleBase *op, +void MultiDevSSAGraphBuilder::ConnectOp(ir::Graph *result, OpHandleBase *op, const std::string &prev_op_name) const { for (auto &prev_op : result->Get("ops")) { if (prev_op->Name() == prev_op_name) { @@ -617,7 +619,7 @@ void MultiDevSSAGraphBuilder::ConnectOp(Graph *result, OpHandleBase *op, } } -void MultiDevSSAGraphBuilder::CreateDistTrainOp(Graph *result, +void MultiDevSSAGraphBuilder::CreateDistTrainOp(ir::Graph *result, ir::Node *node) const { int op_dev_id = -1; std::vector input_var_names; @@ -664,7 +666,8 @@ void MultiDevSSAGraphBuilder::CreateDistTrainOp(Graph *result, } // Create RPC related op handles that connects its in ops and out ops. -void MultiDevSSAGraphBuilder::CreateRPCOp(Graph *result, ir::Node *node) const { +void MultiDevSSAGraphBuilder::CreateRPCOp(ir::Graph *result, + ir::Node *node) const { int op_dev_id = -1; if (node->Op()->Type() == "send") { op_dev_id = GetVarDeviceID(node->inputs[0]->Name()); diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.h b/paddle/fluid/framework/details/multi_devices_graph_builder.h index 2b7f4f586b..c2c764bb94 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_builder.h +++ b/paddle/fluid/framework/details/multi_devices_graph_builder.h @@ -46,11 +46,13 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder { const std::vector &local_scopes, const BuildStrategy &strategy); #endif - std::unique_ptr Apply(std::unique_ptr graph) const override; + std::unique_ptr Apply( + std::unique_ptr graph) const override; int GetVarDeviceID(const std::string &varname) const override; private: - void CreateOpHandleIOs(Graph *result, ir::Node *node, size_t device_id) const; + void CreateOpHandleIOs(ir::Graph *result, ir::Node *node, + size_t device_id) const; private: std::string loss_var_name_; @@ -64,8 +66,8 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder { bool IsScaleLossOp(ir::Node *node) const; - void CreateRPCOp(Graph *result, ir::Node *node) const; - void CreateDistTrainOp(Graph *result, ir::Node *node) const; + void CreateRPCOp(ir::Graph *result, ir::Node *node) const; + void CreateDistTrainOp(ir::Graph *result, ir::Node *node) const; /** * Is this operator as the end-point operator before/after send operator. 
@@ -79,16 +81,17 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder {
   std::vector<std::string> FindDistTrainRecvVars(
       const std::vector<std::unique_ptr<ir::Node>> &nodes) const;

-  void ConnectOp(Graph *result, OpHandleBase *op,
+  void ConnectOp(ir::Graph *result, OpHandleBase *op,
                  const std::string &prev_op_name) const;

-  void CreateComputationalOps(Graph *result, ir::Node *node,
+  void CreateComputationalOps(ir::Graph *result, ir::Node *node,
                               size_t num_places) const;

-  void CreateScaleLossGradOp(Graph *result) const;
-  VarHandle *CreateReduceOp(Graph *result, const std::string &og,
+  void CreateScaleLossGradOp(ir::Graph *result) const;
+  VarHandle *CreateReduceOp(ir::Graph *result, const std::string &og,
                             int dst_dev_id) const;

-  void CreateComputationalOp(Graph *result, ir::Node *node, int dev_id) const;
+  void CreateComputationalOp(ir::Graph *result, ir::Node *node,
+                             int dev_id) const;

   bool IsParameterGradientOnce(
       const std::string &og,
@@ -96,12 +99,12 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder {

   int GetOpDeviceID(ir::Node *node) const;

-  void InsertAllReduceOp(Graph *result, const std::string &og) const;
+  void InsertAllReduceOp(ir::Graph *result, const std::string &og) const;

-  void InsertDataBalanceOp(Graph *result,
+  void InsertDataBalanceOp(ir::Graph *result,
                            const std::vector<std::string> &datas) const;

-  void CreateBroadcastOp(Graph *result, const std::string &p_name,
+  void CreateBroadcastOp(ir::Graph *result, const std::string &p_name,
                          size_t src_dev_id) const;

   bool IsSparseGradient(const std::string &og) const;

diff --git a/paddle/fluid/framework/details/ssa_graph_builder.cc b/paddle/fluid/framework/details/ssa_graph_builder.cc
index 2be4bb009e..3c579f427e 100644
--- a/paddle/fluid/framework/details/ssa_graph_builder.cc
+++ b/paddle/fluid/framework/details/ssa_graph_builder.cc
@@ -17,7 +17,7 @@
 namespace paddle {
 namespace framework {
 namespace details {
-void SSAGraphBuilder::PolishGraphToSupportDataHazards(Graph *graph) {
+void SSAGraphBuilder::PolishGraphToSupportDataHazards(ir::Graph *graph) {
   for (auto &var_map : graph->Get<GraphVars>("vars")) {
     for (auto &name_pair : var_map) {
       if (name_pair.second.size() <= 1) {
@@ -60,7 +60,7 @@ void SSAGraphBuilder::PolishGraphToSupportDataHazards(ir::Graph *graph) {
 }

 VarHandle *SSAGraphBuilder::CreateOrGetLatestVarHandle(
-    Graph *graph, ir::Node *node, const platform::Place &place,
+    ir::Graph *graph, ir::Node *node, const platform::Place &place,
     size_t place_offset) {
   auto &var_holders = graph->Get<GraphVars>("vars")[place_offset];
   auto &var_holder = var_holders[node->Name()];
@@ -81,7 +81,7 @@ VarHandle *SSAGraphBuilder::CreateOrGetLatestVarHandle(
   return var;
 }

-void SSAGraphBuilder::CreateOpOutput(Graph *graph, OpHandleBase *op_handle,
+void SSAGraphBuilder::CreateOpOutput(ir::Graph *graph, OpHandleBase *op_handle,
                                      ir::Node *new_node,
                                      const platform::Place &place,
                                      size_t place_offset) {
@@ -93,7 +93,7 @@ void SSAGraphBuilder::CreateOpOutput(ir::Graph *graph, OpHandleBase *op_handle,
   op_handle->AddOutput(var);
 }

-void SSAGraphBuilder::AddOutputToLeafOps(Graph *graph) {
+void SSAGraphBuilder::AddOutputToLeafOps(ir::Graph *graph) {
   for (auto &op : graph->Get<GraphOps>("ops")) {
     if (!op->Outputs().empty()) {
       continue;

diff --git a/paddle/fluid/framework/details/ssa_graph_builder.h b/paddle/fluid/framework/details/ssa_graph_builder.h
index e8e8acdb38..f64445b470 100644
--- a/paddle/fluid/framework/details/ssa_graph_builder.h
+++ b/paddle/fluid/framework/details/ssa_graph_builder.h
@@ -64,19 +64,19 @@ class SSAGraphBuilder : public ir::Pass {
    *
    * https://en.wikipedia.org/wiki/Hazard_(computer_architecture)#Write_after_read_(WAR)
    */
-  static void PolishGraphToSupportDataHazards(Graph *graph);
+  static void PolishGraphToSupportDataHazards(ir::Graph *graph);

-  static VarHandle *CreateOrGetLatestVarHandle(Graph *graph, ir::Node *node,
+  static VarHandle *CreateOrGetLatestVarHandle(ir::Graph *graph, ir::Node *node,
                                                const platform::Place &place,
                                                size_t place_offset);

   // Add an output variable (each_var_name, place, place_offset) to op_handle,
   // which belongs to graph
-  static void CreateOpOutput(Graph *graph, OpHandleBase *op_handle,
+  static void CreateOpOutput(ir::Graph *graph, OpHandleBase *op_handle,
                              ir::Node *new_node, const platform::Place &place,
                              size_t place_offset);

-  static void AddOutputToLeafOps(Graph *graph);
+  static void AddOutputToLeafOps(ir::Graph *graph);
 };
 }  // namespace details
 }  // namespace framework

diff --git a/paddle/fluid/framework/details/ssa_graph_checker.cc b/paddle/fluid/framework/details/ssa_graph_checker.cc
index 7c79d7f1e8..0438b09610 100644
--- a/paddle/fluid/framework/details/ssa_graph_checker.cc
+++ b/paddle/fluid/framework/details/ssa_graph_checker.cc
@@ -20,7 +20,7 @@
 namespace paddle {
 namespace framework {
 namespace details {
-bool SSAGraghBuilderWithChecker::IsValidGraph(const Graph *graph) const {
+bool SSAGraghBuilderWithChecker::IsValidGraph(const ir::Graph *graph) const {
   std::unordered_map<OpHandleBase *, size_t> pending_ops;
   std::unordered_set<VarHandleBase *> pending_vars;
   std::unordered_set<VarHandleBase *> ready_vars;

diff --git a/paddle/fluid/framework/details/ssa_graph_checker.h b/paddle/fluid/framework/details/ssa_graph_checker.h
index 2e397e8682..51ce6e5eca 100644
--- a/paddle/fluid/framework/details/ssa_graph_checker.h
+++ b/paddle/fluid/framework/details/ssa_graph_checker.h
@@ -28,7 +28,8 @@ class SSAGraghBuilderWithChecker : public SSAGraphBuilder {
       std::unique_ptr<SSAGraphBuilder>&& builder)
       : builder_(std::move(builder)) {}

-  std::unique_ptr<Graph> Apply(std::unique_ptr<Graph> graph) const override {
+  std::unique_ptr<ir::Graph> Apply(
+      std::unique_ptr<ir::Graph> graph) const override {
     auto new_graph = builder_->Apply(std::move(graph));
     PADDLE_ENFORCE(IsValidGraph(new_graph.get()));
     return new_graph;
@@ -38,7 +39,7 @@ class SSAGraghBuilderWithChecker : public SSAGraphBuilder {
     return builder_->GetVarDeviceID(var_name);
   }

-  bool IsValidGraph(const Graph* graph) const;
+  bool IsValidGraph(const ir::Graph* graph) const;

  private:
   std::unique_ptr<SSAGraphBuilder> builder_;

diff --git a/paddle/fluid/framework/details/ssa_graph_printer.cc b/paddle/fluid/framework/details/ssa_graph_printer.cc
index 6dd6fd262e..20aab14644 100644
--- a/paddle/fluid/framework/details/ssa_graph_printer.cc
+++ b/paddle/fluid/framework/details/ssa_graph_printer.cc
@@ -21,7 +21,7 @@ namespace framework {
 namespace details {

 template <typename Callback>
-static inline void IterAllVar(const Graph &graph, Callback callback) {
+static inline void IterAllVar(const ir::Graph &graph, Callback callback) {
   for (auto &each : graph.Get<GraphVars>("vars")) {
     for (auto &pair1 : each) {
       for (auto &pair2 : pair1.second) {
@@ -35,7 +35,7 @@ static inline void IterAllVar(const ir::Graph &graph, Callback callback) {
   }
 }

-void GraphvizSSAGraphPrinter::Print(const Graph &graph,
+void GraphvizSSAGraphPrinter::Print(const ir::Graph &graph,
                                     std::ostream &sout) const {
   size_t var_id = 0;
   std::unordered_map<const VarHandleBase *, size_t> vars;

diff --git a/paddle/fluid/framework/details/ssa_graph_printer.h b/paddle/fluid/framework/details/ssa_graph_printer.h
index cd72162f44..a77c1bad3f 100644
--- a/paddle/fluid/framework/details/ssa_graph_printer.h
+++ b/paddle/fluid/framework/details/ssa_graph_printer.h
@@ -25,12 +25,12 @@ namespace details {
 class SSAGraphPrinter {
  public:
   virtual ~SSAGraphPrinter() {}
-  virtual void Print(const Graph& graph, std::ostream& sout) const = 0;
+  virtual void Print(const ir::Graph& graph, std::ostream& sout) const = 0;
 };

 class GraphvizSSAGraphPrinter : public SSAGraphPrinter {
  public:
-  void Print(const Graph& graph, std::ostream& sout) const override;
+  void Print(const ir::Graph& graph, std::ostream& sout) const override;
 };

 class SSAGraghBuilderWithPrinter : public SSAGraphBuilder {
@@ -50,7 +50,8 @@ class SSAGraghBuilderWithPrinter : public SSAGraphBuilder {
         stream_ptr_(std::move(sout)),
         stream_ref_(*stream_ptr_) {}

-  std::unique_ptr<Graph> Apply(std::unique_ptr<Graph> graph) const override {
+  std::unique_ptr<ir::Graph> Apply(
+      std::unique_ptr<ir::Graph> graph) const override {
     auto new_graph = builder_->Apply(std::move(graph));
     printer_->Print(*new_graph, stream_ref_);
     return new_graph;

diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
index f85c62dd6c..c19f74476f 100644
--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
@@ -21,7 +21,8 @@ namespace framework {
 namespace details {
 ThreadedSSAGraphExecutor::ThreadedSSAGraphExecutor(
     const ExecutionStrategy &strategy, const std::vector<Scope *> &local_scopes,
-    const std::vector<platform::Place> &places, std::unique_ptr<Graph> &&graph)
+    const std::vector<platform::Place> &places,
+    std::unique_ptr<ir::Graph> &&graph)
     : graph_(std::move(graph)),
       pool_(strategy.num_threads_ >= 2 ? new ::ThreadPool(strategy.num_threads_)
                                        : nullptr),

diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
index bf7c0a367a..3d67daa45e 100644
--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
@@ -40,7 +40,7 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
   ThreadedSSAGraphExecutor(const ExecutionStrategy &strategy,
                            const std::vector<Scope *> &local_scopes,
                            const std::vector<platform::Place> &places,
-                           std::unique_ptr<Graph> &&graph);
+                           std::unique_ptr<ir::Graph> &&graph);

   // Run a SSAGraph by a thread pool
   // Use topological sort algorithm
@@ -53,7 +53,7 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
                     details::OpHandleBase *op);

  private:
-  std::unique_ptr<Graph> graph_;
+  std::unique_ptr<ir::Graph> graph_;
   std::unique_ptr<::ThreadPool> pool_;
   std::vector<Scope *> local_scopes_;
   std::vector<platform::Place> places_;

diff --git a/paddle/fluid/framework/ir/graph.cc b/paddle/fluid/framework/ir/graph.cc
index 46640fedcc..18211f2e2b 100644
--- a/paddle/fluid/framework/ir/graph.cc
+++ b/paddle/fluid/framework/ir/graph.cc
@@ -22,6 +22,7 @@ limitations under the License. */

 namespace paddle {
 namespace framework {
+namespace ir {
 /*
 namespace {
 void SortHelper(
@@ -41,7 +42,7 @@ void SortHelper(
   ret->push_back(node);
 }

-std::vector<ir::Node *> TopologySort(
+std::vector<ir::Node *> TopologySortOperations(
     const std::map<ir::Node *, std::unordered_set<ir::Node *>> &adj_list) {
   std::unordered_set<ir::Node *> visited;
   std::vector<ir::Node *> ret;
@@ -156,7 +157,7 @@ bool HasCircle(const std::map<ir::Node *, std::unordered_set<ir::Node *>>
   return false;
 }

-std::map<ir::Node *, std::unordered_set<ir::Node *>> BuildAdjList(
+std::map<ir::Node *, std::unordered_set<ir::Node *>> BuildOperationAdjList(
     const std::vector<ir::Node *> &nodes) {
   std::map<ir::Node *, std::unordered_set<ir::Node *>> adj_list;

@@ -178,17 +179,17 @@ std::map<ir::Node *, std::unordered_set<ir::Node *>> BuildAdjList(
   return adj_list;
 }

-std::vector<ir::Node *> TopologySortOperationFromInToOut(
+std::vector<ir::Node *> TopologySortOperationsOperationFromInToOut(
     const std::vector<std::unique_ptr<ir::Node>> &nodes) {
   std::vector<ir::Node *> tmp;
   for (auto& n : nodes) {
     tmp.push_back(n.get());
   }
   std::map<ir::Node *, std::unordered_set<ir::Node *>> adj_list =
-BuildAdjList(tmp);
+BuildOperationAdjList(tmp);

   PADDLE_ENFORCE(!HasCircle(adj_list));
-  std::vector<ir::Node *> ret = TopologySort(adj_list);
+  std::vector<ir::Node *> ret = TopologySortOperations(adj_list);

   ir::Node *last_backward = nullptr;
   std::vector<ir::Node *> optimize_ops;
@@ -235,5 +236,6 @@ BuildOperationAdjList(tmp);
   return ret;
 }*/

+}  // namespace ir
 }  // namespace framework
 }  // namespace paddle

diff --git a/paddle/fluid/framework/ir/graph.h b/paddle/fluid/framework/ir/graph.h
index b4ac135b02..a1e39b08a4 100644
--- a/paddle/fluid/framework/ir/graph.h
+++ b/paddle/fluid/framework/ir/graph.h
@@ -26,13 +26,13 @@ limitations under the License. */

 namespace paddle {
 namespace framework {
-
+namespace ir {
 class Graph {
  public:
-  explicit Graph(const ProgramDesc& program);
+  explicit Graph(const ProgramDesc &program);

   virtual ~Graph() {
-    for (auto& attr : attrs_) {
+    for (auto &attr : attrs_) {
       attr_dels_[attr.first]();
     }
     attrs_.clear();
@@ -40,12 +40,12 @@ class Graph {
   }

   template <typename AttrType>
-  AttrType& Get(const std::string& attr_name) const {
-    return *boost::any_cast<AttrType*>(attrs_.at(attr_name));
+  AttrType &Get(const std::string &attr_name) const {
+    return *boost::any_cast<AttrType *>(attrs_.at(attr_name));
   }

   template <typename AttrType>
-  void Set(const std::string& attr_name, AttrType* attr) {
+  void Set(const std::string &attr_name, AttrType *attr) {
     PADDLE_ENFORCE(attrs_.count(attr_name) == 0);
     attrs_[attr_name] = attr;
     attr_dels_[attr_name] = [attr, attr_name]() {
@@ -54,17 +54,17 @@ class Graph {
     };
   }

-  ir::Node* CreateVarNode(VarDesc* var_desc) {
+  ir::Node *CreateVarNode(VarDesc *var_desc) {
     nodes.emplace_back(new ir::Node(var_desc));
     return nodes.back().get();
   }

-  ir::Node* CreateOpNode(OpDesc* op_desc) {
+  ir::Node *CreateOpNode(OpDesc *op_desc) {
     nodes.emplace_back(new ir::Node(op_desc));
     return nodes.back().get();
   }

-  ir::Node* CreateEmptyNode(const std::string& name, ir::Node::Type type) {
+  ir::Node *CreateEmptyNode(const std::string &name, ir::Node::Type type) {
     nodes.emplace_back(new ir::Node(name, type));
     return nodes.back().get();
   }

@@ -73,10 +73,10 @@ class Graph {

  private:
   // NOTE: program_ shouldn't be exposed to user.
-  const ProgramDesc& program_;
+  const ProgramDesc &program_;
   std::map<std::string, boost::any> attrs_;
   std::map<std::string, std::function<void(void)>> attr_dels_;
 };
-
+}  // namespace ir
 }  // namespace framework
 }  // namespace paddle

diff --git a/paddle/fluid/framework/ir/graph_helper.cc b/paddle/fluid/framework/ir/graph_helper.cc
index ecd90f4f3e..76458be135 100644
--- a/paddle/fluid/framework/ir/graph_helper.cc
+++ b/paddle/fluid/framework/ir/graph_helper.cc
@@ -64,7 +64,7 @@ bool HasCircleHelper(

 bool HasCircle(const Graph &graph) {
   std::map<ir::Node *, std::unordered_set<ir::Node *>> adj_list =
-      BuildAdjList(graph);
+      BuildOperationAdjList(graph);

   std::unordered_set<ir::Node *> visited;
   std::unordered_set<ir::Node *> in_trace;
@@ -76,9 +76,9 @@ bool HasCircle(const Graph &graph) {
   return false;
 }

-std::vector<ir::Node *> TopologySort(const Graph &graph) {
+std::vector<ir::Node *> TopologySortOperations(const Graph &graph) {
   std::map<ir::Node *, std::unordered_set<ir::Node *>> adj_list =
-      BuildAdjList(graph);
+      BuildOperationAdjList(graph);
   std::unordered_set<ir::Node *> visited;
   std::vector<ir::Node *> ret;
   for (auto adj : adj_list) {
@@ -89,7 +89,7 @@ std::vector<ir::Node *> TopologySortOperations(const Graph &graph) {
   return ret;
 }

-std::map<ir::Node *, std::unordered_set<ir::Node *>> BuildAdjList(
+std::map<ir::Node *, std::unordered_set<ir::Node *>> BuildOperationAdjList(
     const Graph &graph) {
   std::map<ir::Node *, std::unordered_set<ir::Node *>> adj_list;

diff --git a/paddle/fluid/framework/ir/graph_helper.h b/paddle/fluid/framework/ir/graph_helper.h
index b8714eb5be..55b2e3f5ca 100644
--- a/paddle/fluid/framework/ir/graph_helper.h
+++ b/paddle/fluid/framework/ir/graph_helper.h
@@ -26,9 +26,9 @@ namespace framework {
 namespace ir {
 bool HasCircle(const Graph &graph);

-std::vector<ir::Node *> TopologySort(const Graph &graph);
+std::vector<ir::Node *> TopologySortOperations(const Graph &graph);

-std::map<ir::Node *, std::unordered_set<ir::Node *>> BuildAdjList(
+std::map<ir::Node *, std::unordered_set<ir::Node *>> BuildOperationAdjList(
     const Graph &graph);
 }  // namespace ir
 }  // namespace framework

diff --git a/paddle/fluid/framework/ir/graph_test.cc b/paddle/fluid/framework/ir/graph_test.cc
index 186047b370..f8fbd2242d 100644
--- a/paddle/fluid/framework/ir/graph_test.cc
+++ b/paddle/fluid/framework/ir/graph_test.cc
@@ -93,7 +93,7 @@ TEST(GraphTest, Basic) {
   ASSERT_EQ(proto::VarType::LOD_TENSOR,
             prog.MutableBlock(0)->Var("test_out")->GetType());

-  std::unique_ptr<Graph> g(new Graph(prog));
+  std::unique_ptr<ir::Graph> g(new ir::Graph(prog));
   ASSERT_EQ(g->nodes[0]->Name(), "sum");
   ASSERT_EQ(g->nodes[0]->inputs[0]->Name(), "test_a");
   ASSERT_EQ(g->nodes[0]->inputs[1]->Name(), "test_b");

diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc
index 1e5bba62b5..02c836bea1 100644
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -132,7 +132,7 @@ ParallelExecutor::ParallelExecutor(
 #endif
   }
   builder_ = builder_factory.Create();
-  std::unique_ptr<Graph> graph(new Graph(main_program));
+  std::unique_ptr<ir::Graph> graph(new ir::Graph(main_program));
   graph = builder_->Apply(std::move(graph));
   member_->executor_.reset(new details::ThreadedSSAGraphExecutor(
       exec_strategy, member_->local_scopes_, places, std::move(graph)));

From f6d99d1f73f1b57ee94a0db7fb6c039ff72085de Mon Sep 17 00:00:00 2001
From: Xin Pan
Date: Fri, 20 Jul 2018 15:15:21 +0800
Subject: [PATCH 05/37] polish

---
 .../details/multi_devices_graph_builder.cc |  21 ---
 .../framework/details/ssa_graph_builder.cc |  42 -----
 .../framework/details/ssa_graph_builder.h  |   9 -
 paddle/fluid/framework/ir/graph.cc         | 155 +----------------
 4 files changed, 7 insertions(+), 220 deletions(-)

diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.cc b/paddle/fluid/framework/details/multi_devices_graph_builder.cc
index 4050424e70..f5e99c5748 100644
--- a/paddle/fluid/framework/details/multi_devices_graph_builder.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_builder.cc
@@ -216,21 +216,6 @@ std::vector<ir::Node *> SortOpsAndDelayOptimizeOp(const ir::Graph &graph) {

   sorted_ret.insert(sorted_ret.begin() + last_backward, optimize_ops.begin(),
                     optimize_ops.end());
-
-  for (ir::Node *n : sorted_ret) {
-    n->inputs.erase(std::remove_if(n->inputs.begin(), n->inputs.end(),
-                                   [n](ir::Node *t) {
-                                     return t->Name() ==
-                                            ir::Node::kControlDepVarName;
-                                   }),
-                    n->inputs.end());
-    n->outputs.erase(std::remove_if(n->outputs.begin(), n->outputs.end(),
-                                    [n](ir::Node *t) {
-                                      return t->Name() ==
-                                             ir::Node::kControlDepVarName;
-                                    }),
-                     n->outputs.end());
-  }
   return sorted_ret;
 }

@@ -365,12 +350,6 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::Apply(
     }
   }

-  /*
-  Dependency graph has been constructed. However, there are still data
-  hazards need to be handled.
-  */
-  PolishGraphToSupportDataHazards(&result);
-
   /*
    * Only variables should be the leaves of graph.
    */

diff --git a/paddle/fluid/framework/details/ssa_graph_builder.cc b/paddle/fluid/framework/details/ssa_graph_builder.cc
index 3c579f427e..dcdcb28ac4 100644
--- a/paddle/fluid/framework/details/ssa_graph_builder.cc
+++ b/paddle/fluid/framework/details/ssa_graph_builder.cc
@@ -17,48 +17,6 @@
 namespace paddle {
 namespace framework {
 namespace details {
-void SSAGraphBuilder::PolishGraphToSupportDataHazards(ir::Graph *graph) {
-  for (auto &var_map : graph->Get<GraphVars>("vars")) {
-    for (auto &name_pair : var_map) {
-      if (name_pair.second.size() <= 1) {
-        continue;
-      }
-      auto it_new = name_pair.second.rbegin();
-      auto it_old = name_pair.second.rbegin();
-      ++it_old;
-      for (; it_old != name_pair.second.rend(); it_new = it_old, ++it_old) {
-        OpHandleBase *write_op = (*it_new)->GeneratedOp();
-        const auto &read_ops = (*it_old)->PendingOps();
-
-        for (auto *read_op : read_ops) {
-          // Manually add a dependency var from read_op to write_op;
-          if (read_op == write_op) {
-            // Read Write is the same op.
-            continue;
-          }
-
-          bool has_dep = false;
-          for (auto read_out : read_op->Outputs()) {
-            for (auto write_in : write_op->Inputs()) {
-              if (read_out == write_in) {
-                has_dep = true;
-                break;
-              }
-            }
-          }
-          if (has_dep) continue;
-
-          auto *dep_var = new DummyVarHandle(
-              graph->CreateEmptyNode("dummy", ir::Node::Type::kVariable));
-          read_op->AddOutput(dep_var);
-          write_op->AddInput(dep_var);
-          graph->Get<GraphDepVars>("dep_vars").emplace(dep_var);
-        }
-      }
-    }
-  }
-}
-
 VarHandle *SSAGraphBuilder::CreateOrGetLatestVarHandle(
     ir::Graph *graph, ir::Node *node, const platform::Place &place,
     size_t place_offset) {

diff --git a/paddle/fluid/framework/details/ssa_graph_builder.h b/paddle/fluid/framework/details/ssa_graph_builder.h
index f64445b470..e99bab518e 100644
--- a/paddle/fluid/framework/details/ssa_graph_builder.h
+++ b/paddle/fluid/framework/details/ssa_graph_builder.h
@@ -57,15 +57,6 @@ class SSAGraphBuilder : public ir::Pass {
   DISABLE_COPY_AND_ASSIGN(SSAGraphBuilder);

  protected:
-  /**
-   * We only handle write after read(WAR), since it should not have a write
-   * after write in program. If there are write after write operators, we need
-   * prune them.
-   *
-   * https://en.wikipedia.org/wiki/Hazard_(computer_architecture)#Write_after_read_(WAR)
-   */
-  static void PolishGraphToSupportDataHazards(ir::Graph *graph);
-
   static VarHandle *CreateOrGetLatestVarHandle(ir::Graph *graph, ir::Node *node,
                                                const platform::Place &place,
                                                size_t place_offset);

diff --git a/paddle/fluid/framework/ir/graph.cc b/paddle/fluid/framework/ir/graph.cc
index 18211f2e2b..769dddbc59 100644
--- a/paddle/fluid/framework/ir/graph.cc
+++ b/paddle/fluid/framework/ir/graph.cc
@@ -23,39 +23,6 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
 namespace ir {
-/*
-namespace {
-void SortHelper(
-    const std::map<ir::Node *, std::unordered_set<ir::Node *>> &adj_list,
-    ir::Node *node, std::unordered_set<ir::Node *> *visited,
-    std::vector<ir::Node *> *ret) {
-  visited->insert(node);
-
-  for (auto adj : adj_list.at(node)) {
-    if (visited->find(adj) == visited->end()) {
-      SortHelper(adj_list, adj, visited, ret);
-    }
-  }
-
-  VLOG(3) << "topology sort insert: " << node->Name()
-          << reinterpret_cast<void *>(node) << " input " << node->inputs.size();
-  ret->push_back(node);
-}
-
-std::vector<ir::Node *> TopologySortOperations(
-    const std::map<ir::Node *, std::unordered_set<ir::Node *>> &adj_list) {
-  std::unordered_set<ir::Node *> visited;
-  std::vector<ir::Node *> ret;
-
-  for (auto adj : adj_list) {
-    if (visited.find(adj.first) == visited.end()) {
-      SortHelper(adj_list, adj.first, &visited, &ret);
-    }
-  }
-  return ret;
-}
-}  // namespace
-*/

 Graph::Graph(const ProgramDesc &program) : program_(program) {
   VLOG(3) << "block in program:" << program_.Size();
@@ -93,6 +60,13 @@ Graph::Graph(const ProgramDesc &program) : program_(program) {
       var->inputs.push_back(node);
     }
   }
+  /**
+   * We only handle write after read(WAR), since it should not have a write
+   * after write in program. If there are write after write operators, we need
+   * prune them.
+   *
+   * https://en.wikipedia.org/wiki/Hazard_(computer_architecture)#Write_after_read_(WAR)
+   */
   for (auto &var : var_nodes) {
     auto &versions = var.second;
     if (versions.size() <= 1) continue;
@@ -121,121 +95,6 @@ Graph::Graph(const ProgramDesc &program) : program_(program) {
     }
   }
 }
-
-/*
-bool HasCircleHelper(ir::Node* node,
-                     const std::map<ir::Node *, std::unordered_set<ir::Node *>>
-&adj_list,
-                     std::unordered_set<ir::Node *>* visited,
-                     std::unordered_set<ir::Node *>* in_trace) {
-  if (visited->find(node) == visited->end()) {
-    visited->insert(node);
-    in_trace->insert(node);
-
-    for (ir::Node *in : adj_list.at(node)) {
-      if (visited->find(in) == visited->end() &&
-          HasCircleHelper(in, adj_list, visited, in_trace)) {
-        return true;
-      } else if (in_trace->find(in) != in_trace->end()) {
-        return true;
-      }
-    }
-  }
-  in_trace->erase(node);
-  return false;
-}
-
-bool HasCircle(const std::map<ir::Node *, std::unordered_set<ir::Node *>>
-&adj_list) {
-  std::unordered_set<ir::Node *> visited;
-  std::unordered_set<ir::Node *> in_trace;
-  for (auto& adj : adj_list) {
-    if (HasCircleHelper(adj.first, adj_list, &visited, &in_trace)) {
-      return true;
-    }
-  }
-  return false;
-}
-
-std::map<ir::Node *, std::unordered_set<ir::Node *>> BuildOperationAdjList(
-    const std::vector<ir::Node *> &nodes) {
-  std::map<ir::Node *, std::unordered_set<ir::Node *>> adj_list;
-
-  for (auto &n : nodes) {
-    if (n->NodeType() != ir::Node::Type::kOperation) continue;
-    if (adj_list.find(n) == adj_list.end()) {
-      adj_list[n] = std::unordered_set<ir::Node *>();
-    }
-    for (auto &var : n->inputs) {
-      for (auto &adj_n : var->inputs) {
-        PADDLE_ENFORCE(adj_n->NodeType() == ir::Node::Type::kOperation);
-        adj_list[n].insert(adj_n);
-      }
-    }
-  }
-  return adj_list;
-}
-
-std::vector<ir::Node *> TopologySortOperationsOperationFromInToOut(
-    const std::vector<std::unique_ptr<ir::Node>> &nodes) {
-  std::vector<ir::Node *> tmp;
-  for
(auto& n : nodes) { - tmp.push_back(n.get()); - } - std::map> adj_list = -BuildOperationAdjList(tmp); - - PADDLE_ENFORCE(!HasCircle(adj_list)); - std::vector ret = TopologySortOperations(adj_list); - - ir::Node *last_backward = nullptr; - std::vector optimize_ops; - for (ir::Node* n : ret) { - if (boost::get( - n->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) == - static_cast(OpRole::kBackward)) { - last_backward = n; - } else if (boost::get( - n->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) == - static_cast(OpRole::kOptimize)) { - optimize_ops.push_back(n); - } - } - - if (last_backward) { - for (ir::Node *opt_node : optimize_ops) { - ir::Node *dep_var = CreateEmptyNode(ir::Node::kControlDepVarName, - ir::Node::Type::kVariable); - last_backward->outputs.push_back(dep_var); - dep_var->inputs.push_back(last_backward); - opt_node->inputs.push_back(dep_var); - dep_var->outputs.push_back(opt_node); - VLOG(3) << "appending connect: " << last_backward->Name() - << reinterpret_cast(last_backward) << "->" - << opt_node->Name() << reinterpret_cast(opt_node); - } - } - - PADDLE_ENFORCE(!HasCircle(adj_list)); - for (ir::Node *n : ret) { - std::unordered_set dummy; - n->inputs.erase( - std::remove_if(n->inputs.begin(), n->inputs.end(), - [n](ir::Node *t) { - return t->Name() == ir::Node::kControlDepVarName; }), - n->inputs.end()); - n->outputs.erase( - std::remove_if(n->outputs.begin(), n->outputs.end(), - [n](ir::Node *t) { - return t->Name() == ir::Node::kControlDepVarName; }), - n->outputs.end()); - } - return ret; -}*/ - } // namespace ir } // namespace framework } // namespace paddle From 93355cc0d222b558689f963805d21b366ee8699d Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Sat, 21 Jul 2018 17:27:25 +0800 Subject: [PATCH 06/37] fix control deps --- paddle/fluid/framework/details/CMakeLists.txt | 2 +- .../details/multi_devices_graph_builder.cc | 34 ++++++++------ .../details/multi_devices_graph_builder.h | 4 +- .../fluid/framework/details/rpc_op_handle.cc | 3 +- .../framework/details/ssa_graph_builder.cc | 33 +++++++++++++- .../framework/details/ssa_graph_builder.h | 9 ++++ paddle/fluid/framework/details/var_handle.cc | 2 +- paddle/fluid/framework/ir/graph.cc | 26 ++++++++--- paddle/fluid/framework/ir/graph.h | 45 ++++++++++++++++--- paddle/fluid/framework/ir/graph_helper.cc | 19 ++++---- paddle/fluid/framework/ir/graph_helper.h | 1 + paddle/fluid/framework/ir/graph_test.cc | 29 ++++++------ paddle/fluid/operators/CMakeLists.txt | 4 +- paddle/fluid/operators/send_recv_util.h | 5 ++- 14 files changed, 155 insertions(+), 61 deletions(-) diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index 620d202d33..9df7df1f42 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -1,4 +1,4 @@ -cc_library(var_handle SRCS var_handle.cc DEPS place framework_proto) +cc_library(var_handle SRCS var_handle.cc DEPS place framework_proto node) cc_library(op_handle_base SRCS op_handle_base.cc DEPS var_handle device_context lod_tensor) cc_library(scale_loss_grad_op_handle SRCS scale_loss_grad_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory) cc_library(fetch_op_handle SRCS fetch_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory) diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.cc b/paddle/fluid/framework/details/multi_devices_graph_builder.cc index f5e99c5748..150f1534c8 100644 --- 
a/paddle/fluid/framework/details/multi_devices_graph_builder.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_builder.cc @@ -94,12 +94,11 @@ void MultiDevSSAGraphBuilder::CreateOpHandleIOs(ir::Graph *result, } std::vector MultiDevSSAGraphBuilder::FindDistTrainSendVars( - const std::vector> &nodes) const { + const std::vector &nodes) const { std::vector send_vars; // since parameters are all in block 0, // it's enough to only scan send ops in block 0 for (auto &node : nodes) { - if (node->NodeType() != ir::Node::Type::kOperation) continue; OpDesc *op = node->Op(); // TODO(Yancey1989): use a graceful method to find send op, // instead of the the hard code string @@ -114,10 +113,9 @@ std::vector MultiDevSSAGraphBuilder::FindDistTrainSendVars( } std::vector MultiDevSSAGraphBuilder::FindDistTrainRecvVars( - const std::vector> &nodes) const { + const std::vector &nodes) const { std::vector recv_vars; for (auto &node : nodes) { - if (node->NodeType() != ir::Node::Type::kOperation) continue; OpDesc *op = node->Op(); // TODO(Yancey1989): use a graceful method to find recv op, // instead of the hard code string @@ -214,6 +212,19 @@ std::vector SortOpsAndDelayOptimizeOp(const ir::Graph &graph) { } } + // Verify that no operations before optimize ops depends on optimize ops. + std::unordered_set optimize_set(optimize_ops.begin(), + optimize_ops.end()); + for (size_t i = 0; i < last_backward; ++i) { + for (ir::Node *in : sorted_ret[i]->inputs) { + for (ir::Node *pre_n : in->inputs) { + PADDLE_ENFORCE(optimize_set.find(pre_n) == optimize_set.end(), + "optimize operations cannot be depended by forward " + "or backward node %s -> %s", + pre_n->Name(), sorted_ret[i]->Name()); + } + } + } sorted_ret.insert(sorted_ret.begin() + last_backward, optimize_ops.begin(), optimize_ops.end()); return sorted_ret; @@ -221,18 +232,16 @@ std::vector SortOpsAndDelayOptimizeOp(const ir::Graph &graph) { std::unique_ptr MultiDevSSAGraphBuilder::Apply( std::unique_ptr graph) const { - // Rebuild the graph structure. + // Give the topology sort order and rebuild the graph structure. std::vector sorted_ops = SortOpsAndDelayOptimizeOp(*graph); - auto nodes = std::move(graph->nodes); - graph->nodes.clear(); + auto nodes = graph->ReleaseNodes(); + ir::Graph &result = *graph; for (auto &node : nodes) { if (node->NodeType() == ir::Node::Type::kVariable) { all_vars_.emplace(node->Name(), node->Var()); } } - - ir::Graph &result = *graph; std::unordered_set og_has_been_broadcast; // We cannot invoke resize. 
It is a bug of GCC 4.8 @@ -242,8 +251,8 @@ std::unique_ptr MultiDevSSAGraphBuilder::Apply( // find send/recv vars so that we can place the distributed training // realted op in the place 0 - auto send_vars = FindDistTrainSendVars(nodes); - auto recv_vars = FindDistTrainRecvVars(nodes); + auto send_vars = FindDistTrainSendVars(sorted_ops); + auto recv_vars = FindDistTrainRecvVars(sorted_ops); std::vector> bcast_var_name_set; bcast_var_name_set.resize(places_.size()); @@ -589,8 +598,7 @@ void MultiDevSSAGraphBuilder::ConnectOp(ir::Graph *result, OpHandleBase *op, const std::string &prev_op_name) const { for (auto &prev_op : result->Get("ops")) { if (prev_op->Name() == prev_op_name) { - auto *dep_var = new DummyVarHandle( - result->CreateEmptyNode("dummy", ir::Node::Type::kVariable)); + auto *dep_var = new DummyVarHandle(result->CreateControlDepVar()); prev_op->AddOutput(dep_var); result->Get("dep_vars").emplace(dep_var); op->AddInput(dep_var); diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.h b/paddle/fluid/framework/details/multi_devices_graph_builder.h index c2c764bb94..55076f227b 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_builder.h +++ b/paddle/fluid/framework/details/multi_devices_graph_builder.h @@ -76,10 +76,10 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder { const std::vector &recv_vars) const; std::vector FindDistTrainSendVars( - const std::vector> &nodes) const; + const std::vector &nodes) const; std::vector FindDistTrainRecvVars( - const std::vector> &nodes) const; + const std::vector &nodes) const; void ConnectOp(ir::Graph *result, OpHandleBase *op, const std::string &prev_op_name) const; diff --git a/paddle/fluid/framework/details/rpc_op_handle.cc b/paddle/fluid/framework/details/rpc_op_handle.cc index 924ff4d118..c52a07530e 100644 --- a/paddle/fluid/framework/details/rpc_op_handle.cc +++ b/paddle/fluid/framework/details/rpc_op_handle.cc @@ -33,7 +33,8 @@ void RPCOpHandle::RunImpl() { for (auto *in : inputs_) { auto &p = static_cast(in)->place_; // FIXME(Yancey1989): need a better solution instead of use DebugString() - if (in->DebugString() == "dummy") { // HACK + if (in->Node()->Name().find(ir::Node::kControlDepVarName) != + std::string::npos) { // HACK continue; } if (in->GeneratedOp()) { diff --git a/paddle/fluid/framework/details/ssa_graph_builder.cc b/paddle/fluid/framework/details/ssa_graph_builder.cc index dcdcb28ac4..e7db8729cf 100644 --- a/paddle/fluid/framework/details/ssa_graph_builder.cc +++ b/paddle/fluid/framework/details/ssa_graph_builder.cc @@ -17,6 +17,36 @@ namespace paddle { namespace framework { namespace details { +void SSAGraphBuilder::PolishGraphToSupportDataHazards(ir::Graph *graph) { + for (auto &var_map : graph->Get("vars")) { + for (auto &name_pair : var_map) { + if (name_pair.second.size() <= 1) { + continue; + } + auto it_new = name_pair.second.rbegin(); + auto it_old = name_pair.second.rbegin(); + ++it_old; + for (; it_old != name_pair.second.rend(); it_new = it_old, ++it_old) { + OpHandleBase *write_op = (*it_new)->GeneratedOp(); + const auto &read_ops = (*it_old)->PendingOps(); + + for (auto *read_op : read_ops) { + // Manually add a dependency var from read_op to write_op; + if (read_op == write_op) { + // Read Write is the same op. 
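+ // (a node that both reads and writes the variable imposes no extra ordering)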
+ continue; + } + + auto *dep_var = new DummyVarHandle(graph->CreateControlDepVar()); + read_op->AddOutput(dep_var); + write_op->AddInput(dep_var); + graph->Get("dep_vars").emplace(dep_var); + } + } + } + } +} + VarHandle *SSAGraphBuilder::CreateOrGetLatestVarHandle( ir::Graph *graph, ir::Node *node, const platform::Place &place, size_t place_offset) { @@ -56,8 +86,7 @@ void SSAGraphBuilder::AddOutputToLeafOps(ir::Graph *graph) { if (!op->Outputs().empty()) { continue; } - auto *dummy_leaf = new DummyVarHandle( - graph->CreateEmptyNode("dummy", ir::Node::Type::kVariable)); + auto *dummy_leaf = new DummyVarHandle(graph->CreateControlDepVar()); graph->Get("dep_vars").emplace(dummy_leaf); op->AddOutput(dummy_leaf); } diff --git a/paddle/fluid/framework/details/ssa_graph_builder.h b/paddle/fluid/framework/details/ssa_graph_builder.h index e99bab518e..f64445b470 100644 --- a/paddle/fluid/framework/details/ssa_graph_builder.h +++ b/paddle/fluid/framework/details/ssa_graph_builder.h @@ -57,6 +57,15 @@ class SSAGraphBuilder : public ir::Pass { DISABLE_COPY_AND_ASSIGN(SSAGraphBuilder); protected: + /** + * We only handle write after read(WAR), since it should not have a write + * after write in program. If there are write after write operators, we need + * prune them. + * + * https://en.wikipedia.org/wiki/Hazard_(computer_architecture)#Write_after_read_(WAR) + */ + static void PolishGraphToSupportDataHazards(ir::Graph *graph); + static VarHandle *CreateOrGetLatestVarHandle(ir::Graph *graph, ir::Node *node, const platform::Place &place, size_t place_offset); diff --git a/paddle/fluid/framework/details/var_handle.cc b/paddle/fluid/framework/details/var_handle.cc index 6f00abd947..5457870e9f 100644 --- a/paddle/fluid/framework/details/var_handle.cc +++ b/paddle/fluid/framework/details/var_handle.cc @@ -26,7 +26,7 @@ std::string VarHandle::DebugString() const { return ss.str(); } -std::string DummyVarHandle::DebugString() const { return "dummy"; } +std::string DummyVarHandle::DebugString() const { return node_->Name(); } } // namespace details } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/graph.cc b/paddle/fluid/framework/ir/graph.cc index 769dddbc59..6ad8773567 100644 --- a/paddle/fluid/framework/ir/graph.cc +++ b/paddle/fluid/framework/ir/graph.cc @@ -34,7 +34,8 @@ Graph::Graph(const ProgramDesc &program) : program_(program) { std::map> var_nodes; for (auto *op : program.Block(0).AllOps()) { ir::Node *node = CreateOpNode(op); - + // For input args, reuse the same var name if it was created before. + // Otherwise, create a new one. for (auto &each_var_name : op->InputArgumentNames()) { ir::Node *var = nullptr; if (var_nodes.find(each_var_name) != var_nodes.end()) { @@ -43,16 +44,16 @@ Graph::Graph(const ProgramDesc &program) : program_(program) { var = CreateVarNode(all_vars.at(each_var_name)); var_nodes[each_var_name].push_back(var); } else { - // TODO(paddle-dev): Seems some assumption doesn't hold? - VLOG(3) << op->Type() - << " input var not in all_var list: " << each_var_name; + // Operation input var can be optional (dispensable). Which means + // the operation doesn't really need the var at runtime. In this + // case, the no-existed var is ready at the beginning. var = CreateEmptyNode(each_var_name, ir::Node::Type::kVariable); var_nodes[each_var_name].push_back(var); } node->inputs.push_back(var); var->outputs.push_back(node); } - + // For output args, always create a new var. 
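+ // Each write appends a fresh version node to var_nodes[name], keeping the
+ // versions in program order for the WAR handling below.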
for (auto &each_var_name : op->OutputArgumentNames()) { ir::Node *var = CreateVarNode(all_vars.at(each_var_name)); var_nodes[each_var_name].push_back(var); @@ -67,6 +68,7 @@ Graph::Graph(const ProgramDesc &program) : program_(program) { * * https://en.wikipedia.org/wiki/Hazard_(computer_architecture)#Write_after_read_(WAR) */ + for (auto &var : var_nodes) { auto &versions = var.second; if (versions.size() <= 1) continue; @@ -85,8 +87,18 @@ Graph::Graph(const ProgramDesc &program) : program_(program) { // Read Write is the same op. continue; } - ir::Node *dep_var = CreateEmptyNode(ir::Node::kControlDepVarName, - ir::Node::Type::kVariable); + // 2 ops might have been connected via other vars. + bool has_dep = false; + for (ir::Node *r_out : read_op->outputs) { + for (ir::Node *w_in : write_op->inputs) { + if (r_out == w_in) { + has_dep = true; + break; + } + } + } + if (has_dep) continue; + ir::Node *dep_var = CreateControlDepVar(); read_op->outputs.push_back(dep_var); dep_var->inputs.push_back(read_op); write_op->inputs.push_back(dep_var); diff --git a/paddle/fluid/framework/ir/graph.h b/paddle/fluid/framework/ir/graph.h index a1e39b08a4..fccad9fd4f 100644 --- a/paddle/fluid/framework/ir/graph.h +++ b/paddle/fluid/framework/ir/graph.h @@ -27,6 +27,7 @@ limitations under the License. */ namespace paddle { namespace framework { namespace ir { + class Graph { public: explicit Graph(const ProgramDesc &program); @@ -54,28 +55,58 @@ class Graph { }; } + const std::unordered_set &Nodes() const { return node_set_; } + ir::Node *CreateVarNode(VarDesc *var_desc) { - nodes.emplace_back(new ir::Node(var_desc)); - return nodes.back().get(); + return AddNode(new ir::Node(var_desc)); } ir::Node *CreateOpNode(OpDesc *op_desc) { - nodes.emplace_back(new ir::Node(op_desc)); - return nodes.back().get(); + return AddNode(new ir::Node(op_desc)); + } + + ir::Node *CreateControlDepVar() { + // TODO(panyx0718): control var name should be unique. + const std::string name = string::Sprintf( + "%s@%llu", ir::Node::kControlDepVarName, node_set_.size()); + return AddNode(new ir::Node(name, ir::Node::Type::kVariable)); } ir::Node *CreateEmptyNode(const std::string &name, ir::Node::Type type) { - nodes.emplace_back(new ir::Node(name, type)); - return nodes.back().get(); + return AddNode(new ir::Node(name, type)); } - std::vector> nodes; + std::vector> ReleaseNodes() { + std::vector> ret; + for (auto &n : nodes_) { + ret.emplace_back(n.second.release()); + } + nodes_.clear(); + node_set_.clear(); + return ret; + } private: + // This method takes ownership of `node`. + ir::Node *AddNode(ir::Node *node) { + PADDLE_ENFORCE(node_set_.find(node) == node_set_.end()); + nodes_[node].reset(node); + node_set_.insert(node); + return node; + } + + void RemoveNode(ir::Node *node) { + PADDLE_ENFORCE(node_set_.find(node) != node_set_.end()); + node_set_.erase(node); + nodes_.erase(node); + } + // NOTE: program_ shouldn't be exposed to user. 
const ProgramDesc &program_; std::map attrs_; std::map> attr_dels_; + std::map> nodes_; + std::unordered_set node_set_; }; } // namespace ir } // namespace framework diff --git a/paddle/fluid/framework/ir/graph_helper.cc b/paddle/fluid/framework/ir/graph_helper.cc index 76458be135..b829cf204d 100644 --- a/paddle/fluid/framework/ir/graph_helper.cc +++ b/paddle/fluid/framework/ir/graph_helper.cc @@ -33,9 +33,8 @@ void SortHelper( } } - LOG(ERROR) << "topology sort insert: " << node->Name() - << reinterpret_cast(node) << " input " - << node->inputs.size(); + VLOG(3) << "topology sort insert: " << node->Name() + << reinterpret_cast(node) << " input " << node->inputs.size(); ret->push_back(node); } @@ -93,18 +92,18 @@ std::map> BuildOperationAdjList( const Graph &graph) { std::map> adj_list; - for (auto &n : graph.nodes) { + for (auto &n : graph.Nodes()) { if (n->NodeType() != ir::Node::Type::kOperation) continue; - if (adj_list.find(n.get()) == adj_list.end()) { - adj_list[n.get()] = std::unordered_set(); + if (adj_list.find(n) == adj_list.end()) { + adj_list[n] = std::unordered_set(); } for (auto &var : n->inputs) { for (auto &adj_n : var->inputs) { PADDLE_ENFORCE(adj_n->NodeType() == ir::Node::Type::kOperation); - adj_list[n.get()].insert(adj_n); - LOG(ERROR) << "adj " << adj_n->Name() << reinterpret_cast(adj_n) - << " -> " << n->Name() << reinterpret_cast(n.get()) - << " via " << var->Name() << reinterpret_cast(var); + adj_list[n].insert(adj_n); + VLOG(3) << "adj " << adj_n->Name() << reinterpret_cast(adj_n) + << " -> " << n->Name() << reinterpret_cast(n) + << " via " << var->Name() << reinterpret_cast(var); } } } diff --git a/paddle/fluid/framework/ir/graph_helper.h b/paddle/fluid/framework/ir/graph_helper.h index 55b2e3f5ca..118c16ab7a 100644 --- a/paddle/fluid/framework/ir/graph_helper.h +++ b/paddle/fluid/framework/ir/graph_helper.h @@ -30,6 +30,7 @@ std::vector TopologySortOperations(const Graph &graph); std::map> BuildOperationAdjList( const Graph &graph); + } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/graph_test.cc b/paddle/fluid/framework/ir/graph_test.cc index f8fbd2242d..feb4e8e76b 100644 --- a/paddle/fluid/framework/ir/graph_test.cc +++ b/paddle/fluid/framework/ir/graph_test.cc @@ -94,20 +94,21 @@ TEST(GraphTest, Basic) { prog.MutableBlock(0)->Var("test_out")->GetType()); std::unique_ptr g(new ir::Graph(prog)); - ASSERT_EQ(g->nodes[0]->Name(), "sum"); - ASSERT_EQ(g->nodes[0]->inputs[0]->Name(), "test_a"); - ASSERT_EQ(g->nodes[0]->inputs[1]->Name(), "test_b"); - ASSERT_EQ(g->nodes[0]->inputs[2]->Name(), "test_c"); - ASSERT_EQ(g->nodes[0]->outputs[0]->Name(), "test_out"); - ASSERT_EQ(g->nodes[1]->Name(), "test_a"); - ASSERT_EQ(g->nodes[1]->outputs[0]->Name(), "sum"); - ASSERT_EQ(g->nodes[2]->Name(), "test_b"); - ASSERT_EQ(g->nodes[2]->outputs[0]->Name(), "sum"); - ASSERT_EQ(g->nodes[3]->Name(), "test_c"); - ASSERT_EQ(g->nodes[3]->outputs[0]->Name(), "sum"); - ASSERT_EQ(g->nodes[4]->Name(), "test_out"); - ASSERT_EQ(g->nodes[4]->inputs[0]->Name(), "sum"); - ASSERT_EQ(g->nodes.size(), 5); + std::vector nodes(g->Nodes().begin(), g->Nodes().end()); + ASSERT_EQ(nodes[0]->Name(), "sum"); + ASSERT_EQ(nodes[0]->inputs[0]->Name(), "test_a"); + ASSERT_EQ(nodes[0]->inputs[1]->Name(), "test_b"); + ASSERT_EQ(nodes[0]->inputs[2]->Name(), "test_c"); + ASSERT_EQ(nodes[0]->outputs[0]->Name(), "test_out"); + ASSERT_EQ(nodes[1]->Name(), "test_a"); + ASSERT_EQ(nodes[1]->outputs[0]->Name(), "sum"); + ASSERT_EQ(nodes[2]->Name(), "test_b"); + 
ASSERT_EQ(nodes[2]->outputs[0]->Name(), "sum"); + ASSERT_EQ(nodes[3]->Name(), "test_c"); + ASSERT_EQ(nodes[3]->outputs[0]->Name(), "sum"); + ASSERT_EQ(nodes[4]->Name(), "test_out"); + ASSERT_EQ(nodes[4]->inputs[0]->Name(), "sum"); + ASSERT_EQ(nodes.size(), 5); } } // namespace framework } // namespace paddle diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 4e2002ad24..9b56ad4c55 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -192,9 +192,9 @@ if(WITH_DISTRIBUTE) set(DISTRIBUTE_DEPS "") if(WITH_GRPC) - set(DISTRIBUTE_DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf) + set(DISTRIBUTE_DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf node) else() - set(DISTRIBUTE_DEPS sendrecvop_brpc brpc leveldb snappystream snappy protobuf ssl crypto zlib) + set(DISTRIBUTE_DEPS sendrecvop_brpc brpc leveldb snappystream snappy protobuf ssl crypto zlib node) if(WITH_BRPC_RDMA) find_library(IBVERBS_LIBRARY NAMES ibverbs) ADD_LIBRARY(ibverbs SHARED IMPORTED GLOBAL) diff --git a/paddle/fluid/operators/send_recv_util.h b/paddle/fluid/operators/send_recv_util.h index deab005149..500230d553 100644 --- a/paddle/fluid/operators/send_recv_util.h +++ b/paddle/fluid/operators/send_recv_util.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once #include +#include "paddle/fluid/framework/ir/node.h" namespace paddle { namespace operators { @@ -22,7 +23,9 @@ inline bool NeedSend(const framework::Scope& scope, const std::string& varname) { // dummy variable is only used in parallel executor to represent // some dependency relationship, we don't need to send/recv it. - if (varname == "dummy") return false; + if (varname.find(framework::ir::Node::kControlDepVarName) != + std::string::npos) + return false; auto* var = scope.FindVar(varname); PADDLE_ENFORCE_NOT_NULL(var, "Can not find variable '%s' in the send side.", varname); From 2782e71a11082edf83906efaecfc5c0a47731a72 Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Sun, 22 Jul 2018 12:43:27 +0800 Subject: [PATCH 07/37] fix graph_test --- paddle/fluid/framework/ir/graph_test.cc | 26 ++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/paddle/fluid/framework/ir/graph_test.cc b/paddle/fluid/framework/ir/graph_test.cc index feb4e8e76b..73ef55756c 100644 --- a/paddle/fluid/framework/ir/graph_test.cc +++ b/paddle/fluid/framework/ir/graph_test.cc @@ -95,19 +95,19 @@ TEST(GraphTest, Basic) { std::unique_ptr g(new ir::Graph(prog)); std::vector nodes(g->Nodes().begin(), g->Nodes().end()); - ASSERT_EQ(nodes[0]->Name(), "sum"); - ASSERT_EQ(nodes[0]->inputs[0]->Name(), "test_a"); - ASSERT_EQ(nodes[0]->inputs[1]->Name(), "test_b"); - ASSERT_EQ(nodes[0]->inputs[2]->Name(), "test_c"); - ASSERT_EQ(nodes[0]->outputs[0]->Name(), "test_out"); - ASSERT_EQ(nodes[1]->Name(), "test_a"); - ASSERT_EQ(nodes[1]->outputs[0]->Name(), "sum"); - ASSERT_EQ(nodes[2]->Name(), "test_b"); - ASSERT_EQ(nodes[2]->outputs[0]->Name(), "sum"); - ASSERT_EQ(nodes[3]->Name(), "test_c"); - ASSERT_EQ(nodes[3]->outputs[0]->Name(), "sum"); - ASSERT_EQ(nodes[4]->Name(), "test_out"); - ASSERT_EQ(nodes[4]->inputs[0]->Name(), "sum"); + for (ir::Node *n : nodes) { + if (n->Name() == "sum") { + ASSERT_EQ(n->inputs.size(), 3); + ASSERT_EQ(n->outputs.size(), 1); + } else if (n->Name() == "test_a" || n->Name() == "test_b" || + n->Name() == "test_c") { + ASSERT_EQ(n->inputs.size(), 0); + ASSERT_EQ(n->outputs.size(), 1); + } else if 
(n->Name() == "test_out") { + ASSERT_EQ(n->inputs.size(), 1); + ASSERT_EQ(n->outputs.size(), 0); + } + } ASSERT_EQ(nodes.size(), 5); } } // namespace framework From 21a45420f0dc4da9413b002d380e8b31bff88e05 Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Sun, 22 Jul 2018 21:10:46 +0800 Subject: [PATCH 08/37] polish and test --- .../details/multi_devices_graph_builder.cc | 8 ++ .../fluid/framework/details/rpc_op_handle.cc | 4 +- .../framework/details/ssa_graph_builder.cc | 30 ----- .../framework/details/ssa_graph_builder.h | 9 -- paddle/fluid/framework/ir/CMakeLists.txt | 3 +- paddle/fluid/framework/ir/graph.cc | 4 + paddle/fluid/framework/ir/graph.h | 13 +- paddle/fluid/framework/ir/graph_helper.cc | 13 +- paddle/fluid/framework/ir/graph_helper.h | 4 + .../fluid/framework/ir/graph_helper_test.cc | 125 ++++++++++++++++++ paddle/fluid/operators/send_recv_util.h | 1 + 11 files changed, 166 insertions(+), 48 deletions(-) create mode 100644 paddle/fluid/framework/ir/graph_helper_test.cc diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.cc b/paddle/fluid/framework/details/multi_devices_graph_builder.cc index 150f1534c8..de0cac0f1a 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_builder.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_builder.cc @@ -170,6 +170,7 @@ size_t MultiDevSSAGraphBuilder::GetAppropriateDeviceID( const std::vector &var_names) const { int64_t numel_sum = 0; for (auto var_name : var_names) { + if (all_vars_.find(var_name) == all_vars_.end()) continue; auto var_desc = all_vars_.at(var_name); PADDLE_ENFORCE_NOT_NULL(var_desc); auto dim = framework::make_ddim(var_desc->GetShape()); @@ -271,6 +272,7 @@ std::unique_ptr MultiDevSSAGraphBuilder::Apply( // user can customize loss@grad if not use_default_grad_scale_ if (strategy_.gradient_scale_ != BuildStrategy::GradientScaleStrategy::kCustomized) { + // TODO(paddle-dev): Why is there no input for this op_handle? CreateScaleLossGradOp(&result); } // This assumes the backward generating code will ensure IsScaleLossOp @@ -288,6 +290,7 @@ std::unique_ptr MultiDevSSAGraphBuilder::Apply( } else { // This op runs on all devices, and its output may have parameter's // gradients. + // TODO(paddle-dev): Why is so special about "read" op? if (node->Op()->Type() == "read" && strategy_.enable_data_balance_) { node->Op()->SetAttr("throw_eof_exp", false); CreateComputationalOps(&result, node, places_.size()); @@ -363,6 +366,7 @@ std::unique_ptr MultiDevSSAGraphBuilder::Apply( * Only variables should be the leaves of graph. */ AddOutputToLeafOps(&result); + PADDLE_ENFORCE(!ir::HasCircle(result)); return graph; } @@ -620,6 +624,7 @@ void MultiDevSSAGraphBuilder::CreateDistTrainOp(ir::Graph *result, if (node->Op()->Type() == "split_byref" || node->Op()->Type() == "split_selected_rows") { + // TODO(paddle-dev): getting the first var is not safe. op_dev_id = GetVarDeviceID(input_var_names[0]); if (strategy_.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce) { op_dev_id = GetAppropriateDeviceID(input_var_names); @@ -657,7 +662,10 @@ void MultiDevSSAGraphBuilder::CreateRPCOp(ir::Graph *result, ir::Node *node) const { int op_dev_id = -1; if (node->Op()->Type() == "send") { + // TODO(paddle-dev): getting the first var is not safe. 
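+ // inputs[0] is assumed to be the variable being sent; the ENFORCE below
+ // at least rejects control-dependency vars sneaking into that slot.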
op_dev_id = GetVarDeviceID(node->inputs[0]->Name()); + PADDLE_ENFORCE(!ir::IsControlDepVar(*node->inputs[0]), + "This hack no longer holds, please fix."); // the variable name which contains .block means it was splited by // split_byref op // so that we can balance the variable blocks to all the pserver diff --git a/paddle/fluid/framework/details/rpc_op_handle.cc b/paddle/fluid/framework/details/rpc_op_handle.cc index c52a07530e..f44b374edb 100644 --- a/paddle/fluid/framework/details/rpc_op_handle.cc +++ b/paddle/fluid/framework/details/rpc_op_handle.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/framework/details/rpc_op_handle.h" +#include "paddle/fluid/framework/ir/graph.h" namespace paddle { namespace framework { @@ -33,8 +34,7 @@ void RPCOpHandle::RunImpl() { for (auto *in : inputs_) { auto &p = static_cast(in)->place_; // FIXME(Yancey1989): need a better solution instead of use DebugString() - if (in->Node()->Name().find(ir::Node::kControlDepVarName) != - std::string::npos) { // HACK + if (ir::IsControlDepVar(*in->Node())) { // HACK continue; } if (in->GeneratedOp()) { diff --git a/paddle/fluid/framework/details/ssa_graph_builder.cc b/paddle/fluid/framework/details/ssa_graph_builder.cc index e7db8729cf..203d5fbbc1 100644 --- a/paddle/fluid/framework/details/ssa_graph_builder.cc +++ b/paddle/fluid/framework/details/ssa_graph_builder.cc @@ -17,36 +17,6 @@ namespace paddle { namespace framework { namespace details { -void SSAGraphBuilder::PolishGraphToSupportDataHazards(ir::Graph *graph) { - for (auto &var_map : graph->Get("vars")) { - for (auto &name_pair : var_map) { - if (name_pair.second.size() <= 1) { - continue; - } - auto it_new = name_pair.second.rbegin(); - auto it_old = name_pair.second.rbegin(); - ++it_old; - for (; it_old != name_pair.second.rend(); it_new = it_old, ++it_old) { - OpHandleBase *write_op = (*it_new)->GeneratedOp(); - const auto &read_ops = (*it_old)->PendingOps(); - - for (auto *read_op : read_ops) { - // Manually add a dependency var from read_op to write_op; - if (read_op == write_op) { - // Read Write is the same op. - continue; - } - - auto *dep_var = new DummyVarHandle(graph->CreateControlDepVar()); - read_op->AddOutput(dep_var); - write_op->AddInput(dep_var); - graph->Get("dep_vars").emplace(dep_var); - } - } - } - } -} - VarHandle *SSAGraphBuilder::CreateOrGetLatestVarHandle( ir::Graph *graph, ir::Node *node, const platform::Place &place, size_t place_offset) { diff --git a/paddle/fluid/framework/details/ssa_graph_builder.h b/paddle/fluid/framework/details/ssa_graph_builder.h index f64445b470..e99bab518e 100644 --- a/paddle/fluid/framework/details/ssa_graph_builder.h +++ b/paddle/fluid/framework/details/ssa_graph_builder.h @@ -57,15 +57,6 @@ class SSAGraphBuilder : public ir::Pass { DISABLE_COPY_AND_ASSIGN(SSAGraphBuilder); protected: - /** - * We only handle write after read(WAR), since it should not have a write - * after write in program. If there are write after write operators, we need - * prune them. 
- * - * https://en.wikipedia.org/wiki/Hazard_(computer_architecture)#Write_after_read_(WAR) - */ - static void PolishGraphToSupportDataHazards(ir::Graph *graph); - static VarHandle *CreateOrGetLatestVarHandle(ir::Graph *graph, ir::Node *node, const platform::Place &place, size_t place_offset); diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index 744696ebb0..6447452ae5 100644 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -2,4 +2,5 @@ cc_library(node SRCS node.cc DEPS proto_desc) cc_library(graph SRCS graph.cc DEPS node) cc_library(graph_helper SRCS graph_helper.cc DEPS graph) cc_library(pass SRCS pass.cc DEPS graph node) -cc_test(graph_test SRCS graph_test.cc DEPS graph proto_desc op_registry) +cc_test(graph_test SRCS graph_test.cc DEPS graph op_registry) +cc_test(graph_helper_test SRCS graph_helper_test.cc DEPS graph_helper op_registry) diff --git a/paddle/fluid/framework/ir/graph.cc b/paddle/fluid/framework/ir/graph.cc index 6ad8773567..c5b4851477 100644 --- a/paddle/fluid/framework/ir/graph.cc +++ b/paddle/fluid/framework/ir/graph.cc @@ -107,6 +107,10 @@ Graph::Graph(const ProgramDesc &program) : program_(program) { } } } + +bool IsControlDepVar(const ir::Node &var) { + return var.Name().find(ir::Node::kControlDepVarName) != std::string::npos; +} } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/graph.h b/paddle/fluid/framework/ir/graph.h index fccad9fd4f..4f59ec82a7 100644 --- a/paddle/fluid/framework/ir/graph.h +++ b/paddle/fluid/framework/ir/graph.h @@ -57,25 +57,34 @@ class Graph { const std::unordered_set &Nodes() const { return node_set_; } + // Create a normal variable with non-null VarDesc. ir::Node *CreateVarNode(VarDesc *var_desc) { return AddNode(new ir::Node(var_desc)); } + // Create a normal runnable operator with OpDesc. ir::Node *CreateOpNode(OpDesc *op_desc) { return AddNode(new ir::Node(op_desc)); } + // Create a control dependency var that connects 2 operations. The + // var doesn't hold any data. Other than that, it's no different from + // other var, considering dependency analysis. ir::Node *CreateControlDepVar() { - // TODO(panyx0718): control var name should be unique. + // TODO(panyx0718): control var name should be really unique. const std::string name = string::Sprintf( "%s@%llu", ir::Node::kControlDepVarName, node_set_.size()); return AddNode(new ir::Node(name, ir::Node::Type::kVariable)); } + // A more free style way of creating a graph node. Mostly use for test + // or "copy" from another node. Avoid using it if possible. ir::Node *CreateEmptyNode(const std::string &name, ir::Node::Type type) { return AddNode(new ir::Node(name, type)); } + // Clear all node information of the graph and return the ownership of the + // nodes. 
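+ // Callers keep the returned vector alive for as long as raw Node
+ // pointers obtained from this graph are still in use.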
std::vector> ReleaseNodes() { std::vector> ret; for (auto &n : nodes_) { @@ -108,6 +117,8 @@ class Graph { std::map> nodes_; std::unordered_set node_set_; }; + +bool IsControlDepVar(const ir::Node &var); } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/graph_helper.cc b/paddle/fluid/framework/ir/graph_helper.cc index b829cf204d..b1c19e6535 100644 --- a/paddle/fluid/framework/ir/graph_helper.cc +++ b/paddle/fluid/framework/ir/graph_helper.cc @@ -59,12 +59,9 @@ bool HasCircleHelper( in_trace->erase(node); return false; } -} // namespace - -bool HasCircle(const Graph &graph) { - std::map> adj_list = - BuildOperationAdjList(graph); +bool HasCircleInternal( + const std::map> &adj_list) { std::unordered_set visited; std::unordered_set in_trace; for (auto &adj : adj_list) { @@ -74,10 +71,16 @@ bool HasCircle(const Graph &graph) { } return false; } +} // namespace + +bool HasCircle(const Graph &graph) { + return HasCircleInternal(BuildOperationAdjList(graph)); +} std::vector TopologySortOperations(const Graph &graph) { std::map> adj_list = BuildOperationAdjList(graph); + PADDLE_ENFORCE(!HasCircleInternal(adj_list)); std::unordered_set visited; std::vector ret; for (auto adj : adj_list) { diff --git a/paddle/fluid/framework/ir/graph_helper.h b/paddle/fluid/framework/ir/graph_helper.h index 118c16ab7a..cd6c53a07f 100644 --- a/paddle/fluid/framework/ir/graph_helper.h +++ b/paddle/fluid/framework/ir/graph_helper.h @@ -24,10 +24,14 @@ limitations under the License. */ namespace paddle { namespace framework { namespace ir { +// Test if the graph contains circle. bool HasCircle(const Graph &graph); +// Topology Sort the operations in the graph from inputs to outputs. +// `graph` cannot contain circle. std::vector TopologySortOperations(const Graph &graph); +// Build an adjacency list of operations for the `graph`. std::map> BuildOperationAdjList( const Graph &graph); diff --git a/paddle/fluid/framework/ir/graph_helper_test.cc b/paddle/fluid/framework/ir/graph_helper_test.cc new file mode 100644 index 0000000000..b517442bb7 --- /dev/null +++ b/paddle/fluid/framework/ir/graph_helper_test.cc @@ -0,0 +1,125 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/ir/graph.h" +#include +#include "gtest/gtest.h" +#include "paddle/fluid/framework/ir/graph_helper.h" +#include "paddle/fluid/framework/program_desc.h" + +namespace paddle { +namespace framework { +namespace ir { + +void BuildCircleGraph(Graph* g) { + ir::Node* o1 = g->CreateEmptyNode("op1", Node::Type::kOperation); + ir::Node* v1 = g->CreateEmptyNode("var1", Node::Type::kVariable); + + o1->outputs.push_back(v1); + o1->inputs.push_back(v1); + v1->inputs.push_back(o1); + v1->outputs.push_back(o1); +} + +void BuildCircleGraph2(Graph* g) { + ir::Node* o1 = g->CreateEmptyNode("op1", Node::Type::kOperation); + ir::Node* o2 = g->CreateEmptyNode("op2", Node::Type::kOperation); + ir::Node* v1 = g->CreateEmptyNode("var1", Node::Type::kVariable); + ir::Node* v2 = g->CreateEmptyNode("var2", Node::Type::kVariable); + + o1->outputs.push_back(v1); + o2->inputs.push_back(v1); + v1->inputs.push_back(o1); + v1->outputs.push_back(o2); + + o2->outputs.push_back(v2); + o1->inputs.push_back(v2); + v2->inputs.push_back(o2); + v2->outputs.push_back(o1); +} + +void BuildNoCircleGraph(Graph* g) { + ir::Node* o1 = g->CreateEmptyNode("op1", Node::Type::kOperation); + ir::Node* o2 = g->CreateEmptyNode("op2", Node::Type::kOperation); + ir::Node* o3 = g->CreateEmptyNode("op3", Node::Type::kOperation); + ir::Node* o4 = g->CreateEmptyNode("op4", Node::Type::kOperation); + ir::Node* o5 = g->CreateEmptyNode("op5", Node::Type::kOperation); + ir::Node* v1 = g->CreateEmptyNode("var1", Node::Type::kVariable); + ir::Node* v2 = g->CreateEmptyNode("var2", Node::Type::kVariable); + ir::Node* v3 = g->CreateEmptyNode("var3", Node::Type::kVariable); + ir::Node* v4 = g->CreateEmptyNode("var4", Node::Type::kVariable); + + // o1->v1->o2 + o1->outputs.push_back(v1); + o2->inputs.push_back(v1); + v1->inputs.push_back(o1); + v1->outputs.push_back(o2); + // o2->v2->o3 + // o2->v2->o4 + o2->outputs.push_back(v2); + o3->inputs.push_back(v2); + o4->inputs.push_back(v2); + v2->inputs.push_back(o2); + v2->outputs.push_back(o3); + v2->outputs.push_back(o4); + // o2->v3->o5 + o2->outputs.push_back(v3); + o5->inputs.push_back(v3); + v3->inputs.push_back(o2); + v3->outputs.push_back(o5); + // o3-v4->o5 + o3->outputs.push_back(v4); + o5->inputs.push_back(v4); + v4->inputs.push_back(o3); + v4->outputs.push_back(o5); +} + +TEST(GraphHelperTest, Basic) { + ProgramDesc prog; + + Graph g(prog); + BuildCircleGraph(&g); + ASSERT_TRUE(HasCircle(g)); + + Graph g2(prog); + BuildCircleGraph2(&g2); + ASSERT_TRUE(HasCircle(g2)); + + auto adj_list = BuildOperationAdjList(g2); + for (auto& adj : adj_list) { + auto& adj_set = adj.second; + if (adj.first->Name() == "op1") { + ASSERT_EQ((*adj_set.begin())->Name(), "op2"); + } else if (adj.first->Name() == "op2") { + ASSERT_EQ((*adj_set.begin())->Name(), "op1"); + } else { + ASSERT_TRUE(false); + } + } + + Graph g3(prog); + BuildNoCircleGraph(&g3); + ASSERT_FALSE(HasCircle(g3)); + auto sorted = TopologySortOperations(g3); + std::map node_map; + for (size_t i = 0; i < sorted.size(); ++i) { + node_map[sorted[i]->Name()] = i; + } + ASSERT_EQ(node_map.at("op1"), 0); + ASSERT_EQ(node_map.at("op2"), 1); + ASSERT_TRUE(node_map.at("op3") < node_map.at("op5")); +} +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/operators/send_recv_util.h b/paddle/fluid/operators/send_recv_util.h index 500230d553..dc26c53c64 100644 --- a/paddle/fluid/operators/send_recv_util.h +++ b/paddle/fluid/operators/send_recv_util.h @@ -23,6 +23,7 @@ inline bool 
NeedSend(const framework::Scope& scope, const std::string& varname) { // dummy variable is only used in parallel executor to represent // some dependency relationship, we don't need to send/recv it. + // TODO(paddle-dev): Why would parallel executor logic leak into here? if (varname.find(framework::ir::Node::kControlDepVarName) != std::string::npos) return false; auto* var = scope.FindVar(varname); PADDLE_ENFORCE_NOT_NULL(var, "Can not find variable '%s' in the send side.", varname); From 5173a53c8a6ac2e398c4624aff76c125716f775d Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Mon, 23 Jul 2018 13:03:00 +0800 Subject: [PATCH 09/37] fix reorder issue. --- doc/fluid/design/ir/draft.md | 48 +++++++-------- .../details/multi_devices_graph_builder.cc | 59 +++++++++++-------- .../framework/details/ssa_graph_builder.cc | 40 +++++++++++++ .../framework/details/ssa_graph_builder.h | 6 ++ paddle/fluid/framework/ir/graph.cc | 1 + 5 files changed, 107 insertions(+), 47 deletions(-) diff --git a/doc/fluid/design/ir/draft.md b/doc/fluid/design/ir/draft.md index a141dcbca5..a33b5a9c93 100644 --- a/doc/fluid/design/ir/draft.md +++ b/doc/fluid/design/ir/draft.md @@ -1,16 +1,16 @@ ## Motivation -There is a ```gap``` between the ```Program``` defined by -user and the ```Executable``` that can be scheduled +There is a `gap` between the `Program` defined by +user and the `Executable` that can be scheduled efficiently on heterogeneous hardware, either locally or in a distributed setting. -Usually, the ```gap``` is bridged by +Usually, the `gap` is bridged by * A series of transformations with a defined order. * These transformations usually involve -```insert, delete, clustering, split, dependency analysis```. +`insert, delete, clustering, split, dependency analysis`. * A simple way to verify and debug each transformation. @@ -38,44 +38,44 @@ design below. #### Node -```Node``` represents an operation that performs some computation or +`Node` represents an operation that performs some computation or a variable that is an input or output of an operation. -```Node```s are connected to other ```Node```s via inputs and outputs. +`Node`s are connected to other `Node`s via inputs and outputs. Other properties (maybe device placement information) can be added -to ```Node``` in the future if it's a -common requirement of many other ```Pass```es. Otherwise, it should live -in a ```Node``` wrapper class that is private to some ```Pass``` or be -a local member of a ```Pass```. +to `Node` in the future if it's a +common requirement of many other `Pass`es. Otherwise, it should live +in a `Node` wrapper class that is private to some `Pass` or be +a local member of a `Pass`. #### Graph -```Graph``` contains a list of ```Node```s, which are connected to +`Graph` contains a list of `Node`s, which are connected to each other via inputs and outputs. TODO: Better definitions for the graph. -```Graph``` can also contain ```Attribute```s. ```Attribute```s -can be ``any`` thing. For example, it can be a list of "wraper" -nodes. The ```wrapper``` nodes compose ```Node```s and provide -helper method for execution or transformation. ```Attribute``` +`Graph` can also contain `Attribute`s. `Attribute`s +can be `any` thing. For example, it can be a list of "wrapper" +nodes. The `wrapper` nodes compose `Node`s and provide +helper methods for execution or transformation. `Attribute` can also contain other things that describe some properties of -the ```Graph``` or ```Graph``` nodes. ```Attribute``` can be passed -across ```Pass```. However, it should be used with care. +the `Graph` or `Graph` nodes. `Attribute` can be passed +across `Pass`. However, it should be used with care.
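To make `Attribute` more concrete, here is a minimal sketch of how one `Pass` could attach an attribute that a later `Pass` reads back; the attribute name `"stats"` and its payload type are illustrative only, not part of the design:

```c++
// Attach an attribute; the Graph takes ownership of the raw pointer and
// frees it through the registered deleter when the Graph is destroyed.
graph->Set("stats", new std::map<std::string, int>());

// A later Pass retrieves the same attribute by name and static type.
auto &stats = graph->Get<std::map<std::string, int>>("stats");
stats["fused_node_count"] = 0;  // illustrative payload
```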
#### Pass -```Pass``` represents a transformation of ```Graph```. Its input -is a ```Graph``` and its output is also a ```Graph```. For example, -a ```Pass``` can simply print out the ```Graph```. A ```Pass``` -can also fuse some ```Graph```'s ```Node```s. +`Pass` represents a transformation of `Graph`. Its input +is a `Graph` and its output is also a `Graph`. For example, +a `Pass` can simply print out the `Graph`. A `Pass` +can also fuse some `Graph`'s `Node`s. #### Optimize -```Optimize``` contains a series of ```Pass``` with defined order. -```Optimize``` transforms a ```Graph``` that only contains raw -modeling logic to a ```Graph``` that can be run efficiently while +`Optimize` contains a series of `Pass` with defined order. +`Optimize` transforms a `Graph` that only contains raw +modeling logic to a `Graph` that can be run efficiently while maintaining the original modeling logic. diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.cc b/paddle/fluid/framework/details/multi_devices_graph_builder.cc index de0cac0f1a..22f0cb20d0 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_builder.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_builder.cc @@ -196,38 +196,46 @@ size_t MultiDevSSAGraphBuilder::GetAppropriateDeviceID( std::vector SortOpsAndDelayOptimizeOp(const ir::Graph &graph) { std::vector ret = ir::TopologySortOperations(graph); size_t last_backward = 0; - std::vector optimize_ops; - std::vector sorted_ret; for (size_t i = 0; i < ret.size(); ++i) { if (boost::get( ret[i]->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) == static_cast(OpRole::kBackward)) { - sorted_ret.push_back(ret[i]); - last_backward = sorted_ret.size(); - } else if (boost::get(ret[i]->Op()->GetAttr( - OpProtoAndCheckerMaker::OpRoleAttrName())) == - static_cast(OpRole::kOptimize)) { - optimize_ops.push_back(ret[i]); - } else { - sorted_ret.push_back(ret[i]); + last_backward = i; } } - // Verify that no operations before optimize ops depends on optimize ops. - std::unordered_set optimize_set(optimize_ops.begin(), - optimize_ops.end()); - for (size_t i = 0; i < last_backward; ++i) { - for (ir::Node *in : sorted_ret[i]->inputs) { - for (ir::Node *pre_n : in->inputs) { - PADDLE_ENFORCE(optimize_set.find(pre_n) == optimize_set.end(), - "optimize operations cannot be depended by forward " - "or backward node %s -> %s", - pre_n->Name(), sorted_ret[i]->Name()); + std::vector optimize_ops; + std::vector sorted_ret; + for (size_t i = 0; i < ret.size(); ++i) { + if (i < last_backward) { + if (boost::get(ret[i]->Op()->GetAttr( + OpProtoAndCheckerMaker::OpRoleAttrName())) == + static_cast(OpRole::kOptimize)) { + optimize_ops.push_back(ret[i]); + } else { + sorted_ret.push_back(ret[i]); + } + } else if (i == last_backward) { + sorted_ret.push_back(ret[i]); + // Verify that no operations before optimize ops depends on optimize ops. 
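+ // i.e. no input of an op kept before the barrier may be produced by one
+ // of the delayed optimize ops.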
+ std::unordered_set optimize_set(optimize_ops.begin(), + optimize_ops.end()); + for (ir::Node *n : sorted_ret) { + for (ir::Node *in : n->inputs) { + for (ir::Node *pre_n : in->inputs) { + PADDLE_ENFORCE(optimize_set.find(pre_n) == optimize_set.end(), + "optimize operations cannot be depended by forward " + "or backward node %s -> %s", + pre_n->Name(), n->Name()); + } + } } + sorted_ret.insert(sorted_ret.end(), optimize_ops.begin(), + optimize_ops.end()); + } else { + sorted_ret.push_back(ret[i]); } } - sorted_ret.insert(sorted_ret.begin() + last_backward, optimize_ops.begin(), - optimize_ops.end()); return sorted_ret; } @@ -239,7 +247,7 @@ std::unique_ptr MultiDevSSAGraphBuilder::Apply( ir::Graph &result = *graph; for (auto &node : nodes) { - if (node->NodeType() == ir::Node::Type::kVariable) { + if (node->NodeType() == ir::Node::Type::kVariable && node->Var()) { all_vars_.emplace(node->Name(), node->Var()); } } @@ -361,6 +369,11 @@ std::unique_ptr MultiDevSSAGraphBuilder::Apply( } } } + /* + Dependency graph has been constructed. However, there are still data + hazards need to be handled. + */ + PolishGraphToSupportDataHazards(&result); /* * Only variables should be the leaves of graph. diff --git a/paddle/fluid/framework/details/ssa_graph_builder.cc b/paddle/fluid/framework/details/ssa_graph_builder.cc index 203d5fbbc1..506e7eb35c 100644 --- a/paddle/fluid/framework/details/ssa_graph_builder.cc +++ b/paddle/fluid/framework/details/ssa_graph_builder.cc @@ -17,6 +17,46 @@ namespace paddle { namespace framework { namespace details { +void SSAGraphBuilder::PolishGraphToSupportDataHazards(ir::Graph *graph) { + for (auto &var_map : graph->Get("vars")) { + for (auto &name_pair : var_map) { + if (name_pair.second.size() <= 1) { + continue; + } + auto it_new = name_pair.second.rbegin(); + auto it_old = name_pair.second.rbegin(); + ++it_old; + for (; it_old != name_pair.second.rend(); it_new = it_old, ++it_old) { + OpHandleBase *write_op = (*it_new)->GeneratedOp(); + const auto &read_ops = (*it_old)->PendingOps(); + + for (auto *read_op : read_ops) { + // Manually add a dependency var from read_op to write_op; + if (read_op == write_op) { + // Read Write is the same op. + continue; + } + bool has_dep = false; + for (auto *r_out : read_op->Outputs()) { + for (auto *w_in : write_op->Inputs()) { + if (r_out->Node() == w_in->Node()) { + has_dep = true; + break; + } + } + } + if (has_dep) continue; + + auto *dep_var = new DummyVarHandle(graph->CreateControlDepVar()); + read_op->AddOutput(dep_var); + write_op->AddInput(dep_var); + graph->Get("dep_vars").emplace(dep_var); + } + } + } + } +} + VarHandle *SSAGraphBuilder::CreateOrGetLatestVarHandle( ir::Graph *graph, ir::Node *node, const platform::Place &place, size_t place_offset) { diff --git a/paddle/fluid/framework/details/ssa_graph_builder.h b/paddle/fluid/framework/details/ssa_graph_builder.h index e99bab518e..2b4f31f2ff 100644 --- a/paddle/fluid/framework/details/ssa_graph_builder.h +++ b/paddle/fluid/framework/details/ssa_graph_builder.h @@ -57,6 +57,12 @@ class SSAGraphBuilder : public ir::Pass { DISABLE_COPY_AND_ASSIGN(SSAGraphBuilder); protected: + /* + Dependency graph has been constructed. However, there are still data + hazards need to be handled. 
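+ PolishGraphToSupportDataHazards adds the missing write-after-read
+ dependency edges.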
+ */ + static void PolishGraphToSupportDataHazards(ir::Graph *graph); + static VarHandle *CreateOrGetLatestVarHandle(ir::Graph *graph, ir::Node *node, const platform::Place &place, size_t place_offset); diff --git a/paddle/fluid/framework/ir/graph.cc b/paddle/fluid/framework/ir/graph.cc index c5b4851477..740acfafb7 100644 --- a/paddle/fluid/framework/ir/graph.cc +++ b/paddle/fluid/framework/ir/graph.cc @@ -98,6 +98,7 @@ Graph::Graph(const ProgramDesc &program) : program_(program) { } } if (has_dep) continue; + ir::Node *dep_var = CreateControlDepVar(); read_op->outputs.push_back(dep_var); dep_var->inputs.push_back(read_op); From 8a521c0b4da5118098d57e34c1c4150e276f140a Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Mon, 16 Jul 2018 17:44:14 +0800 Subject: [PATCH 10/37] Remove buggy get_test_program and refine c++ reader demo --- python/paddle/fluid/io.py | 98 --------------- .../convert_data_to_recordio.py | 8 +- .../tests/demo/text_classification/train.py | 115 +++++++++--------- 3 files changed, 62 insertions(+), 159 deletions(-) diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py index 0eb1194e27..32368d3c0c 100644 --- a/python/paddle/fluid/io.py +++ b/python/paddle/fluid/io.py @@ -789,101 +789,3 @@ def get_parameter_value_by_name(name, executor, program=None): program = default_main_program() var = program.global_block().var(name) return get_parameter_value(var, executor) - - -def get_test_program(filelist, program=None, startup_program=None): - """ - Transpile current train program to a program to read test dataset - if the program is using reader ops like "open_files_op". - """ - - def _copy_reader_var_(block, var, new_name=None): - if new_name == None: - new_name = var.name - new_var = block.create_var( - name=str(new_name), type=core.VarDesc.VarType.READER) - new_var.desc.set_shapes(var.desc.shapes()) - new_var.desc.set_dtypes(var.desc.dtypes()) - new_var.persistable = True - return new_var - - def _get_test_reader_name(train_reader_name): - return train_reader_name + "_test" - - def _is_reader_op(op): - block = op.block - if "Out" in op.output_names: - reader_out = block.vars[op.output("Out")[0]] - if reader_out.type == core.VarDesc.VarType.READER: - return True - return False - - if program == None: - program = default_main_program() - if startup_program == None: - startup_program = default_startup_program() - startup_block = startup_program.global_block() - - # 1. find out the orignal reader var name - startup_reader_op_list = [] - - for op in startup_block.ops: - if _is_reader_op(op): - startup_reader_op_list.append(op) - - if len(startup_reader_op_list) == 0: - return program - - root_reader_op = startup_reader_op_list[0] - train_test_reader_map = {} - # 2. 
add operators to startup to read open and read test data files - for op in startup_reader_op_list: - assert (len(op.output("Out")) == 1) - train_reader_name = op.output("Out")[0] - train_reader = startup_block.vars[train_reader_name] - test_reader = _copy_reader_var_( - startup_block, - train_reader, - new_name=_get_test_reader_name(train_reader_name)) - train_test_reader_map[train_reader.name] = test_reader - - test_op_inputs = {} - for name in op.input_names: - train_arg_names = op.input(name) - test_arg_vars = [] - for arg_name in train_arg_names: - arg_var = train_test_reader_map[ - arg_name] if name == "UnderlyingReader" else startup_block.vars[ - arg_name] - test_arg_vars.append(arg_var) - test_op_inputs[name] = test_arg_vars - - test_op = startup_block.append_op( - type=op.type, - inputs=test_op_inputs, - outputs={'Out': [test_reader]}, - attrs=op.attrs) - # root reader op's filelist attr for read test files - if op.type == root_reader_op.type: - test_op.set_attr("file_names", filelist) - if op.type == "create_multi_pass_reader": - test_op.set_attr("pass_num", 1) - - # 3. rename reader vars in inference program to different name - # to avoid read from train data. - main_block = program.global_block() - for var in main_block.vars.values(): - if var.type == core.VarDesc.VarType.READER: - main_block.rename_var( - str(var.name), str(_get_test_reader_name(var.name))) - - for op in main_block.ops: - if op.type == root_reader_op.type: - test_op.set_attr("file_names", filelist) - if op.type == "create_multi_pass_reader": - test_op.set_attr("pass_num", 1) - - startup_program.sync_with_cpp() - program.sync_with_cpp() - - return program diff --git a/python/paddle/fluid/tests/demo/text_classification/convert_data_to_recordio.py b/python/paddle/fluid/tests/demo/text_classification/convert_data_to_recordio.py index 9425d472a4..2dd8f352f7 100644 --- a/python/paddle/fluid/tests/demo/text_classification/convert_data_to_recordio.py +++ b/python/paddle/fluid/tests/demo/text_classification/convert_data_to_recordio.py @@ -31,8 +31,12 @@ def load_vocab(filename): # load word dict with paddle inner function -word_dict = load_vocab(sys.argv[1]) -word_dict[""] = len(word_dict) +if len(sys.argv) > 1: + word_dict = load_vocab(sys.argv[1]) + word_dict[""] = len(word_dict) +else: + word_dict = paddle.dataset.imdb.word_dict() + print "Dict dim = ", len(word_dict) # input text data diff --git a/python/paddle/fluid/tests/demo/text_classification/train.py b/python/paddle/fluid/tests/demo/text_classification/train.py index e408684c6e..9e930b67a4 100644 --- a/python/paddle/fluid/tests/demo/text_classification/train.py +++ b/python/paddle/fluid/tests/demo/text_classification/train.py @@ -19,7 +19,7 @@ import sys TRAIN_FILES = ['train.recordio'] TEST_FILES = ['test.recordio'] -DICT_DIM = 89528 +DICT_DIM = 5147 # embedding dim emb_dim = 128 @@ -33,33 +33,24 @@ hid_dim2 = 96 # class num class_dim = 2 +# epoch num +epoch_num = 10 -def network_cfg(is_train, pass_num=100): - with fluid.unique_name.guard(): - train_file_obj = fluid.layers.open_files( - filenames=TRAIN_FILES, - pass_num=pass_num, - shapes=[[-1, 1], [-1, 1]], - lod_levels=[1, 0], - dtypes=['int64', 'int64'], - thread_num=1) - - test_file_obj = fluid.layers.open_files( - filenames=TEST_FILES, - pass_num=1, - shapes=[[-1, 1], [-1, 1]], - lod_levels=[1, 0], - dtypes=['int64', 'int64'], - thread_num=1) - if is_train: - file_obj = fluid.layers.shuffle(train_file_obj, buffer_size=1000) - else: - file_obj = test_file_obj +def build_program(is_train): + 
file_obj_handle = fluid.layers.io.open_files( + filenames=TRAIN_FILES if is_train else TEST_FILES, + shapes=[[-1, 1], [-1, 1]], + lod_levels=[1, 0], + dtypes=['int64', 'int64'], + thread_num=1) + if is_train: + file_obj = fluid.layers.io.shuffle(file_obj_handle, buffer_size=1000) + else: + file_obj = file_obj_handle + file_obj = fluid.layers.io.double_buffer(file_obj) - file_obj = fluid.layers.double_buffer( - file_obj, - name="train_double_buffer" if is_train else 'test_double_buffer') + with fluid.unique_name.guard(): data, label = fluid.layers.read_file(file_obj) @@ -90,58 +81,64 @@ def network_cfg(is_train, pass_num=100): if is_train: # SGD optimizer - sgd_optimizer = fluid.optimizer.Adagrad(learning_rate=0.01) + sgd_optimizer = fluid.optimizer.Adagrad(learning_rate=0.001) sgd_optimizer.minimize(avg_cost) - return { - 'loss': avg_cost, - 'log': [avg_cost, acc], - 'file': train_file_obj if is_train else test_file_obj - } + return {'loss': avg_cost, 'log': [avg_cost, acc], 'file': file_obj_handle} def main(): train = fluid.Program() startup = fluid.Program() + test = fluid.Program() with fluid.program_guard(train, startup): - train_args = network_cfg(is_train=True) - - test = fluid.Program() + train_args = build_program(is_train=True) - with fluid.program_guard(test, fluid.Program()): - test_args = network_cfg(is_train=False) + with fluid.program_guard(test, startup): + test_args = build_program(is_train=False) + use_cuda = fluid.core.is_compiled_with_cuda() # startup - place = fluid.CUDAPlace(0) + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() exe = fluid.Executor(place=place) exe.run(startup) train_exe = fluid.ParallelExecutor( - use_cuda=True, loss_name=train_args['loss'].name, main_program=train) + use_cuda=use_cuda, + loss_name=train_args['loss'].name, + main_program=train) + test_exe = fluid.ParallelExecutor( + use_cuda=use_cuda, main_program=test, share_vars_from=train_exe) fetch_var_list = [var.name for var in train_args['log']] - for i in xrange(sys.maxint): - result = map(numpy.array, - train_exe.run(fetch_list=fetch_var_list - if i % 1000 == 0 else [])) - if len(result) != 0: - print 'Train: ', result - - if i % 1000 == 0: - test_exe = fluid.ParallelExecutor( - use_cuda=True, main_program=test, share_vars_from=train_exe) - loss = [] - acc = [] - try: - while True: - loss_np, acc_np = map( - numpy.array, test_exe.run(fetch_list=fetch_var_list)) - loss.append(loss_np[0]) - acc.append(acc_np[0]) - except: - test_args['file'].reset() - print 'TEST: ', numpy.mean(loss), numpy.mean(acc) + for epoch_id in range(epoch_num): + # train + try: + batch_id = 0 + while True: + result = map(numpy.array, + train_exe.run(fetch_list=fetch_var_list + if batch_id % 10 == 0 else [])) + if len(result) != 0: + print 'Train loss: ', result + batch_id += 1 + except fluid.core.EOFException: + print 'End of epoch', epoch_id + train_args['file'].reset() + + # test + loss = [] + acc = [] + try: + while True: + loss_np, acc_np = map(numpy.array, + test_exe.run(fetch_list=fetch_var_list)) + loss.append(loss_np[0]) + acc.append(acc_np[0]) + except: + test_args['file'].reset() + print 'TEST: ', numpy.mean(loss), numpy.mean(acc) if __name__ == '__main__': From aae994fd26f5d09488d5993efa29349a056ca83c Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Tue, 24 Jul 2018 14:49:45 +0800 Subject: [PATCH 11/37] refine im2col no padding --- paddle/fluid/operators/math/im2col.cc | 34 +++++++++++++++++++++++---- 1 file changed, 29 insertions(+), 5 deletions(-) diff --git 
a/paddle/fluid/operators/math/im2col.cc b/paddle/fluid/operators/math/im2col.cc index a50b9ace39..1a42ef1264 100644 --- a/paddle/fluid/operators/math/im2col.cc +++ b/paddle/fluid/operators/math/im2col.cc @@ -40,22 +40,46 @@ class Im2ColFunctordims()[1]; int filter_width = col->dims()[2]; - int col_height = col->dims()[3]; - int col_width = col->dims()[4]; + int output_height = col->dims()[3]; + int output_width = col->dims()[4]; int channels_col = im_channels * filter_height * filter_width; const T* im_data = im.data(); T* col_data = col->data(); + // TODO(TJ): change me to template + // further optimaze: + // 1. padding != 1 + // 2. could also support stride_h != 1 + if (stride[0] == 1 && stride[1] == 1 && dilation[0] == 1 && + dilation[1] == 1 && padding[0] == 0 && padding[1] == 0) { + int col_matrix_width = output_width * output_height; + for (int oh = 0; oh < output_height; ++oh) { + const T* im_data_start = im_data + oh * im_width; + T* dst_data = col_data + oh * output_width; + for (int ic = 0; ic < im_channels; ++ic) { + const T* src_data = im_data_start + ic * im_height * im_width; + for (int kh = 0; kh < filter_height; ++kh) { + for (int kw = 0; kw < filter_width; ++kw) { + std::memcpy(dst_data, src_data + kw, sizeof(T) * output_width); + dst_data = dst_data + col_matrix_width; + } + src_data = src_data + im_width; + } + } + } + return; + } + for (int c = 0; c < channels_col; ++c) { int w_offset = c % filter_width; int h_offset = (c / filter_width) % filter_height; int c_im = c / (filter_width * filter_height); - for (int h = 0; h < col_height; ++h) { + for (int h = 0; h < output_height; ++h) { int im_row_idx = h * stride[0] - padding[0] + h_offset * dilation[0]; - for (int w = 0; w < col_width; ++w) { + for (int w = 0; w < output_width; ++w) { int im_col_idx = w * stride[1] - padding[1] + w_offset * dilation[1]; - int col_idx = (c * col_height + h) * col_width + w; + int col_idx = (c * output_height + h) * output_width + w; int im_idx = (im_row_idx + c_im * im_height) * im_width + im_col_idx; col_data[col_idx] = (im_row_idx < 0 || im_row_idx >= im_height || From 5022b14de85ee6787629839506919363a3d85b84 Mon Sep 17 00:00:00 2001 From: qiaolongfei Date: Tue, 24 Jul 2018 16:18:22 +0800 Subject: [PATCH 12/37] fix mixed tensor compile and add cpu unit test --- paddle/fluid/framework/CMakeLists.txt | 7 ++- paddle/fluid/framework/mixed_vector.h | 10 ++-- ...ed_vector_test.cu => mixed_vector_test.cc} | 60 ++++++++++++------- 3 files changed, 51 insertions(+), 26 deletions(-) rename paddle/fluid/framework/{mixed_vector_test.cu => mixed_vector_test.cc} (92%) diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index de06c860f5..a526c47aa9 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -22,7 +22,12 @@ endif() cc_test(eigen_test SRCS eigen_test.cc DEPS tensor) -nv_test(mixed_vector_test SRCS mixed_vector_test.cu DEPS place memory device_context tensor) +if(WITH_GPU) + nv_test(mixed_vector_test SRCS mixed_vector_test.cc DEPS place memory device_context tensor) +else() + cc_test(mixed_vector_test SRCS mixed_vector_test.cc DEPS place memory device_context tensor) +endif() + cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto recordio) cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor memory) nv_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor) diff --git a/paddle/fluid/framework/mixed_vector.h b/paddle/fluid/framework/mixed_vector.h index 
71bebeea63..f4fe334342 100644 --- a/paddle/fluid/framework/mixed_vector.h +++ b/paddle/fluid/framework/mixed_vector.h @@ -16,6 +16,7 @@ #include #include +#include #include #include "paddle/fluid/framework/tensor.h" @@ -386,13 +387,14 @@ template class CPUVector : public std::vector> { public: CPUVector() : std::vector() {} - CPUVector(size_t count, const T &value = T()) + explicit CPUVector(size_t count, const T &value = T()) : std::vector(count, value) {} CPUVector(std::initializer_list init) : std::vector(init) {} - CPUVector(const std::vector &other) : std::vector(other) {} + explicit CPUVector(const std::vector &other) : std::vector(other) {} explicit CPUVector(const CPUVector &other) : std::vector(other) {} CPUVector(CPUVector &&other) : std::vector(std::move(other)) {} - CPUVector(std::vector &&other) : std::vector(std::move(other)) {} + explicit CPUVector(std::vector &&other) + : std::vector(std::move(other)) {} CPUVector &operator=(const CPUVector &other) { this->assign(other.begin(), other.end()); return *this; @@ -410,8 +412,6 @@ class CPUVector : public std::vector> { return os; } - void resize(size_t size) { this->resize(size); } - T &operator[](size_t id) { return this->at(id); } const T &operator[](size_t id) const { return this->at(id); } diff --git a/paddle/fluid/framework/mixed_vector_test.cu b/paddle/fluid/framework/mixed_vector_test.cc similarity index 92% rename from paddle/fluid/framework/mixed_vector_test.cu rename to paddle/fluid/framework/mixed_vector_test.cc index d57f825108..90d9a7c828 100644 --- a/paddle/fluid/framework/mixed_vector_test.cu +++ b/paddle/fluid/framework/mixed_vector_test.cc @@ -11,8 +11,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ + +#ifdef PADDLE_WITH_CUDA + #include +#endif + +#include + #include "glog/logging.h" #include "gtest/gtest.h" #include "paddle/fluid/framework/mixed_vector.h" @@ -41,6 +48,38 @@ TEST(mixed_vector, CPU_VECTOR) { } } +TEST(mixed_vector, InitWithCount) { + paddle::framework::Vector vec(10, 10); + for (int i = 0; i < 10; ++i) { + ASSERT_EQ(vec[i], 10); + } +} + +TEST(mixed_vector, ForEach) { + vec tmp; + for (auto& v : tmp) { + VLOG(3) << v; + } +} + +TEST(mixed_vector, Reserve) { + paddle::framework::Vector vec; + vec.reserve(1); + vec.push_back(0); + vec.push_back(0); + vec.push_back(0); +} + +TEST(mixed_vector, Resize) { + paddle::framework::Vector vec; + vec.resize(1); + vec.push_back(0); + vec.push_back(0); + vec.push_back(0); +} + +#ifdef PADDLE_WITH_CUDA + static __global__ void multiply_10(int* ptr) { for (int i = 0; i < 10; ++i) { ptr[i] *= 10; @@ -92,23 +131,4 @@ TEST(mixed_vector, MultiGPU) { } } -TEST(mixed_vector, InitWithCount) { - paddle::framework::Vector vec(10, 10); - for (int i = 0; i < 10; ++i) { - ASSERT_EQ(vec[i], 10); - } -} - -TEST(mixed_vector, ForEach) { - vec tmp; - for (auto& v : tmp) { - } -} - -TEST(mixed_vector, Reserve) { - paddle::framework::Vector vec; - vec.reserve(1); - vec.push_back(0); - vec.push_back(0); - vec.push_back(0); -} +#endif From 0388d1cb2ac73b28896e28b510cf929da7fa7776 Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Tue, 24 Jul 2018 16:36:33 +0800 Subject: [PATCH 13/37] some update --- .../.gitignore | 0 .../convert_data_to_recordio.py | 2 +- .../train.py | 41 ++++++++----------- 3 files changed, 18 insertions(+), 25 deletions(-) rename python/paddle/fluid/tests/demo/{text_classification => file_reader}/.gitignore (100%) rename python/paddle/fluid/tests/demo/{text_classification => file_reader}/convert_data_to_recordio.py (96%) rename python/paddle/fluid/tests/demo/{text_classification => file_reader}/train.py (78%) diff --git a/python/paddle/fluid/tests/demo/text_classification/.gitignore b/python/paddle/fluid/tests/demo/file_reader/.gitignore similarity index 100% rename from python/paddle/fluid/tests/demo/text_classification/.gitignore rename to python/paddle/fluid/tests/demo/file_reader/.gitignore diff --git a/python/paddle/fluid/tests/demo/text_classification/convert_data_to_recordio.py b/python/paddle/fluid/tests/demo/file_reader/convert_data_to_recordio.py similarity index 96% rename from python/paddle/fluid/tests/demo/text_classification/convert_data_to_recordio.py rename to python/paddle/fluid/tests/demo/file_reader/convert_data_to_recordio.py index aaa713df88..b839e14889 100644 --- a/python/paddle/fluid/tests/demo/text_classification/convert_data_to_recordio.py +++ b/python/paddle/fluid/tests/demo/file_reader/convert_data_to_recordio.py @@ -50,7 +50,7 @@ feeder = fluid.DataFeeder(feed_list=[data, label], place=fluid.CPUPlace()) BATCH_SIZE = 128 train_reader = paddle.batch( paddle.reader.shuffle( - paddle.dataset.imdb.train(word_dict), buf_size=10000), + paddle.dataset.imdb.train(word_dict), buf_size=25000), batch_size=BATCH_SIZE) test_reader = paddle.batch( diff --git a/python/paddle/fluid/tests/demo/text_classification/train.py b/python/paddle/fluid/tests/demo/file_reader/train.py similarity index 78% rename from python/paddle/fluid/tests/demo/text_classification/train.py rename to python/paddle/fluid/tests/demo/file_reader/train.py index 9e930b67a4..bc3a6dc81d 100644 --- a/python/paddle/fluid/tests/demo/text_classification/train.py +++ b/python/paddle/fluid/tests/demo/file_reader/train.py @@ -27,9 +27,6 @@ emb_dim = 128 # hidden 
dim hid_dim = 128 -# hidden dim2 -hid_dim2 = 96 - # class num class_dim = 2 @@ -42,13 +39,9 @@ def build_program(is_train): filenames=TRAIN_FILES if is_train else TEST_FILES, shapes=[[-1, 1], [-1, 1]], lod_levels=[1, 0], - dtypes=['int64', 'int64'], - thread_num=1) - if is_train: - file_obj = fluid.layers.io.shuffle(file_obj_handle, buffer_size=1000) - else: - file_obj = file_obj_handle - file_obj = fluid.layers.io.double_buffer(file_obj) + dtypes=['int64', 'int64']) + + file_obj = fluid.layers.io.double_buffer(file_obj_handle) with fluid.unique_name.guard(): @@ -56,22 +49,24 @@ def build_program(is_train): emb = fluid.layers.embedding(input=data, size=[DICT_DIM, emb_dim]) - # sequence conv with window size = 3 - win_size = 3 conv_3 = fluid.nets.sequence_conv_pool( input=emb, num_filters=hid_dim, - filter_size=win_size, + filter_size=3, act="tanh", - pool_type="max") + pool_type="sqrt") - # fc layer after conv - fc_1 = fluid.layers.fc(input=[conv_3], size=hid_dim2) + conv_4 = fluid.nets.sequence_conv_pool( + input=emb, + num_filters=hid_dim, + filter_size=4, + act="tanh", + pool_type="sqrt") - # probability of each class - prediction = fluid.layers.fc(input=[fc_1], + prediction = fluid.layers.fc(input=[conv_3, conv_4], size=class_dim, act="softmax") + # cross entropy loss cost = fluid.layers.cross_entropy(input=prediction, label=label) @@ -117,11 +112,9 @@ def main(): try: batch_id = 0 while True: - result = map(numpy.array, - train_exe.run(fetch_list=fetch_var_list - if batch_id % 10 == 0 else [])) - if len(result) != 0: - print 'Train loss: ', result + loss, acc = map(numpy.array, + train_exe.run(fetch_list=fetch_var_list)) + print 'Train epoch', epoch_id, 'batch', batch_id, 'loss:', loss, 'acc:', acc batch_id += 1 except fluid.core.EOFException: print 'End of epoch', epoch_id @@ -138,7 +131,7 @@ def main(): acc.append(acc_np[0]) except: test_args['file'].reset() - print 'TEST: ', numpy.mean(loss), numpy.mean(acc) + print 'Test loss:', numpy.mean(loss), 'acc:', numpy.mean(acc) if __name__ == '__main__': From b163e601b6a4886523fb869b2055eb07be0799b9 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Tue, 24 Jul 2018 17:38:40 +0800 Subject: [PATCH 14/37] add gtest --- paddle/fluid/operators/math/im2col_test.cc | 85 ++++++++++++++++++++++ 1 file changed, 85 insertions(+) diff --git a/paddle/fluid/operators/math/im2col_test.cc b/paddle/fluid/operators/math/im2col_test.cc index 8e3f0f2868..87bfc1e95a 100644 --- a/paddle/fluid/operators/math/im2col_test.cc +++ b/paddle/fluid/operators/math/im2col_test.cc @@ -160,8 +160,93 @@ void testIm2col() { delete context; } +void testIm2colCPU() { + paddle::framework::Tensor input; + paddle::framework::Tensor output; + int input_height = 3; + int input_width = 4; + int filter_size = 2; + int ic = 2; + std::vector stride({1, 1}); // stride_y, stride_x + std::vector padding({0, 0}); + std::vector dilation({1, 1}); // dilation_y, dilation_x + int output_height = + (input_height - filter_size + padding[0] * 2) / stride[0] + 1; + int output_width = + (input_width - filter_size + padding[1] * 2) / stride[1] + 1; + float* input_ptr = input.mutable_data({ic, input_height, input_width}, + paddle::platform::CPUPlace()); + for (int i = 0; i < input.numel(); ++i) { + input_ptr[i] = static_cast(i); + } + + paddle::platform::CPUPlace place; + paddle::platform::CPUDeviceContext context(place); + output.mutable_data( + {ic, filter_size, filter_size, output_height, output_width}, place); + paddle::operators::math::Im2ColFunctor< + paddle::operators::math::ColFormat::kCFO, + 
paddle::platform::CPUDeviceContext, float> + im2col; + im2col(context, input, dilation, stride, padding, &output); + auto ref_im2col = [&]( + const paddle::framework::Tensor& im, const std::vector& dilation, + const std::vector& stride, const std::vector& padding, + paddle::framework::Tensor* col) { + int im_channels = im.dims()[0]; + int im_height = im.dims()[1]; + int im_width = im.dims()[2]; + int filter_height = col->dims()[1]; + int filter_width = col->dims()[2]; + int output_height = col->dims()[3]; + int output_width = col->dims()[4]; + + int channels_col = im_channels * filter_height * filter_width; + + const float* im_data = im.data(); + float* col_data = col->data(); + for (int c = 0; c < channels_col; ++c) { + int w_offset = c % filter_width; + int h_offset = (c / filter_width) % filter_height; + int c_im = c / (filter_width * filter_height); + for (int h = 0; h < output_height; ++h) { + int im_row_idx = h * stride[0] - padding[0] + h_offset * dilation[0]; + for (int w = 0; w < output_width; ++w) { + int im_col_idx = w * stride[1] - padding[1] + w_offset * dilation[1]; + int col_idx = (c * output_height + h) * output_width + w; + int im_idx = (im_row_idx + c_im * im_height) * im_width + im_col_idx; + + col_data[col_idx] = (im_row_idx < 0 || im_row_idx >= im_height || + im_col_idx < 0 || im_col_idx >= im_width) + ? 0.f + : im_data[im_idx]; + } + } + } + }; + + paddle::framework::Tensor ref_output; + ref_output.mutable_data( + {ic, filter_size, filter_size, output_height, output_width}, place); + ref_im2col(input, dilation, stride, padding, &ref_output); + + float* out_cfo_ptr = output.data(); + for (int i = 0; i < ic * filter_size * filter_size; ++i) { + for (int j = 0; j < output_height * output_width; ++j) { + std::cout << out_cfo_ptr[i * output_height * output_width + j] << ","; + } + std::cout << std::endl; + } + + float* out_ref_ptr = ref_output.data(); + for (int i = 0; i < output.numel(); ++i) { + EXPECT_EQ(out_cfo_ptr[i], out_ref_ptr[i]); + } +} + TEST(math, im2col) { testIm2col(); + testIm2colCPU(); #ifdef PADDLE_WITH_CUDA testIm2col(); From 6788af4bf1a3d6fd6803b9506b09e77d79aeb18e Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Tue, 24 Jul 2018 17:50:35 +0800 Subject: [PATCH 15/37] refine test cases --- paddle/fluid/operators/math/im2col_test.cc | 45 ++++++++-------------- 1 file changed, 16 insertions(+), 29 deletions(-) diff --git a/paddle/fluid/operators/math/im2col_test.cc b/paddle/fluid/operators/math/im2col_test.cc index 87bfc1e95a..db61f68db3 100644 --- a/paddle/fluid/operators/math/im2col_test.cc +++ b/paddle/fluid/operators/math/im2col_test.cc @@ -160,30 +160,26 @@ void testIm2col() { delete context; } -void testIm2colCPU() { +void testIm2colCPU(int ic, int ih, int iw, int fh, int fw, int ph, int pw) { paddle::framework::Tensor input; paddle::framework::Tensor output; - int input_height = 3; - int input_width = 4; - int filter_size = 2; - int ic = 2; - std::vector stride({1, 1}); // stride_y, stride_x - std::vector padding({0, 0}); + paddle::framework::Tensor ref_output; + std::vector padding({ph, pw}); + std::vector stride({1, 1}); // stride_y, stride_x std::vector dilation({1, 1}); // dilation_y, dilation_x - int output_height = - (input_height - filter_size + padding[0] * 2) / stride[0] + 1; - int output_width = - (input_width - filter_size + padding[1] * 2) / stride[1] + 1; - float* input_ptr = input.mutable_data({ic, input_height, input_width}, - paddle::platform::CPUPlace()); + int output_height = (ih - fh + padding[0] * 2) / stride[0] + 1; + int 
output_width = (iw - fw + padding[1] * 2) / stride[1] + 1; + float* input_ptr = + input.mutable_data({ic, ih, iw}, paddle::platform::CPUPlace()); for (int i = 0; i < input.numel(); ++i) { - input_ptr[i] = static_cast(i); + input_ptr[i] = static_cast(i + 1); } paddle::platform::CPUPlace place; paddle::platform::CPUDeviceContext context(place); - output.mutable_data( - {ic, filter_size, filter_size, output_height, output_width}, place); + output.mutable_data({ic, fh, fw, output_height, output_width}, place); + ref_output.mutable_data({ic, fh, fw, output_height, output_width}, + place); paddle::operators::math::Im2ColFunctor< paddle::operators::math::ColFormat::kCFO, paddle::platform::CPUDeviceContext, float> @@ -200,7 +196,6 @@ void testIm2colCPU() { int filter_width = col->dims()[2]; int output_height = col->dims()[3]; int output_width = col->dims()[4]; - int channels_col = im_channels * filter_height * filter_width; const float* im_data = im.data(); @@ -215,7 +210,6 @@ void testIm2colCPU() { int im_col_idx = w * stride[1] - padding[1] + w_offset * dilation[1]; int col_idx = (c * output_height + h) * output_width + w; int im_idx = (im_row_idx + c_im * im_height) * im_width + im_col_idx; - col_data[col_idx] = (im_row_idx < 0 || im_row_idx >= im_height || im_col_idx < 0 || im_col_idx >= im_width) ? 0.f @@ -225,19 +219,9 @@ void testIm2colCPU() { } }; - paddle::framework::Tensor ref_output; - ref_output.mutable_data( - {ic, filter_size, filter_size, output_height, output_width}, place); ref_im2col(input, dilation, stride, padding, &ref_output); float* out_cfo_ptr = output.data(); - for (int i = 0; i < ic * filter_size * filter_size; ++i) { - for (int j = 0; j < output_height * output_width; ++j) { - std::cout << out_cfo_ptr[i * output_height * output_width + j] << ","; - } - std::cout << std::endl; - } - float* out_ref_ptr = ref_output.data(); for (int i = 0; i < output.numel(); ++i) { EXPECT_EQ(out_cfo_ptr[i], out_ref_ptr[i]); @@ -246,7 +230,10 @@ void testIm2colCPU() { TEST(math, im2col) { testIm2col(); - testIm2colCPU(); + testIm2colCPU(/*ic*/ 3, /*ih*/ 5, /*iw*/ 5, /*fh*/ 3, /*fw*/ 2, /*ph*/ 0, + /*pw*/ 0); + testIm2colCPU(/*ic*/ 2, /*ih*/ 5, /*iw*/ 4, /*fh*/ 3, /*fw*/ 3, /*ph*/ 1, + /*pw*/ 1); #ifdef PADDLE_WITH_CUDA testIm2col(); From b72befc5cc06b75f4fdc44ac27f13d9114fad12f Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Tue, 24 Jul 2018 23:22:53 +0800 Subject: [PATCH 16/37] reuse copy size --- paddle/fluid/operators/math/im2col.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/operators/math/im2col.cc b/paddle/fluid/operators/math/im2col.cc index 1a42ef1264..bb55ce21b0 100644 --- a/paddle/fluid/operators/math/im2col.cc +++ b/paddle/fluid/operators/math/im2col.cc @@ -54,6 +54,7 @@ class Im2ColFunctor Date: Wed, 25 Jul 2018 09:35:27 +0800 Subject: [PATCH 17/37] remove explicit for compile problem --- paddle/fluid/framework/mixed_vector.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/framework/mixed_vector.h b/paddle/fluid/framework/mixed_vector.h index f4fe334342..7836ecb127 100644 --- a/paddle/fluid/framework/mixed_vector.h +++ b/paddle/fluid/framework/mixed_vector.h @@ -387,13 +387,13 @@ template class CPUVector : public std::vector> { public: CPUVector() : std::vector() {} - explicit CPUVector(size_t count, const T &value = T()) + CPUVector(size_t count, const T &value = T()) // NOLINT : std::vector(count, value) {} CPUVector(std::initializer_list init) : std::vector(init) {} - explicit CPUVector(const 
std::vector &other) : std::vector(other) {} - explicit CPUVector(const CPUVector &other) : std::vector(other) {} + CPUVector(const std::vector &other) : std::vector(other) {} // NOLINT + CPUVector(const CPUVector &other) : std::vector(other) {} CPUVector(CPUVector &&other) : std::vector(std::move(other)) {} - explicit CPUVector(std::vector &&other) + CPUVector(std::vector &&other) // NOLINT : std::vector(std::move(other)) {} CPUVector &operator=(const CPUVector &other) { this->assign(other.begin(), other.end()); From aa3618ed3e585802fea5e895430c8d6ff02beb45 Mon Sep 17 00:00:00 2001 From: qiaolongfei Date: Wed, 25 Jul 2018 10:11:03 +0800 Subject: [PATCH 18/37] fix _create_prefetch_block in distribute_transpiler --- python/paddle/fluid/transpiler/distribute_transpiler.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py index fc58703eca..e7698d8c52 100644 --- a/python/paddle/fluid/transpiler/distribute_transpiler.py +++ b/python/paddle/fluid/transpiler/distribute_transpiler.py @@ -887,7 +887,8 @@ class DistributeTranspiler(object): # create table optimize block in pserver program table_opt_op = [ op for op in self.optimize_ops - if op.input("Param")[0] == self.table_name + if 'Param' in op.input_names and op.input("Param")[0] == + self.table_name ][0] table_opt_block = pserver_program.create_block(pre_block_idx) # only support sgd now From 65e5aebd437f5352e1e937ca6de83f8baa318c8e Mon Sep 17 00:00:00 2001 From: qiaolongfei Date: Wed, 25 Jul 2018 12:59:40 +0800 Subject: [PATCH 19/37] fix mixed_vector_test --- paddle/fluid/framework/CMakeLists.txt | 2 +- paddle/fluid/framework/mixed_vector_test.cc | 62 ----------------- paddle/fluid/framework/mixed_vector_test.cu | 75 +++++++++++++++++++++ 3 files changed, 76 insertions(+), 63 deletions(-) create mode 100644 paddle/fluid/framework/mixed_vector_test.cu diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index a526c47aa9..93ec047c80 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -23,7 +23,7 @@ endif() cc_test(eigen_test SRCS eigen_test.cc DEPS tensor) if(WITH_GPU) - nv_test(mixed_vector_test SRCS mixed_vector_test.cc DEPS place memory device_context tensor) + nv_test(mixed_vector_test SRCS mixed_vector_test.cc mixed_vector_test.cu DEPS place memory device_context tensor) else() cc_test(mixed_vector_test SRCS mixed_vector_test.cc DEPS place memory device_context tensor) endif() diff --git a/paddle/fluid/framework/mixed_vector_test.cc b/paddle/fluid/framework/mixed_vector_test.cc index 90d9a7c828..0599c8d384 100644 --- a/paddle/fluid/framework/mixed_vector_test.cc +++ b/paddle/fluid/framework/mixed_vector_test.cc @@ -12,18 +12,11 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ -#ifdef PADDLE_WITH_CUDA - -#include - -#endif - #include #include "glog/logging.h" #include "gtest/gtest.h" #include "paddle/fluid/framework/mixed_vector.h" -#include "paddle/fluid/platform/gpu_info.h" template using vec = paddle::framework::Vector; @@ -77,58 +70,3 @@ TEST(mixed_vector, Resize) { vec.push_back(0); vec.push_back(0); } - -#ifdef PADDLE_WITH_CUDA - -static __global__ void multiply_10(int* ptr) { - for (int i = 0; i < 10; ++i) { - ptr[i] *= 10; - } -} - -cudaStream_t GetCUDAStream(paddle::platform::CUDAPlace place) { - return reinterpret_cast( - paddle::platform::DeviceContextPool::Instance().Get(place)) - ->stream(); -} - -TEST(mixed_vector, GPU_VECTOR) { - vec tmp; - for (int i = 0; i < 10; ++i) { - tmp.push_back(i); - } - ASSERT_EQ(tmp.size(), 10UL); - paddle::platform::CUDAPlace gpu(0); - - multiply_10<<<1, 1, 0, GetCUDAStream(gpu)>>>(tmp.MutableData(gpu)); - - for (int i = 0; i < 10; ++i) { - ASSERT_EQ(tmp[i], i * 10); - } -} - -TEST(mixed_vector, MultiGPU) { - if (paddle::platform::GetCUDADeviceCount() < 2) { - LOG(WARNING) << "Skip mixed_vector.MultiGPU since there are not multiple " - "GPUs in your machine."; - return; - } - - vec tmp; - for (int i = 0; i < 10; ++i) { - tmp.push_back(i); - } - ASSERT_EQ(tmp.size(), 10UL); - paddle::platform::CUDAPlace gpu0(0); - paddle::platform::SetDeviceId(0); - multiply_10<<<1, 1, 0, GetCUDAStream(gpu0)>>>(tmp.MutableData(gpu0)); - paddle::platform::CUDAPlace gpu1(1); - auto* gpu1_ptr = tmp.MutableData(gpu1); - paddle::platform::SetDeviceId(1); - multiply_10<<<1, 1, 0, GetCUDAStream(gpu1)>>>(gpu1_ptr); - for (int i = 0; i < 10; ++i) { - ASSERT_EQ(tmp[i], i * 100); - } -} - -#endif diff --git a/paddle/fluid/framework/mixed_vector_test.cu b/paddle/fluid/framework/mixed_vector_test.cu new file mode 100644 index 0000000000..4b0caa8d35 --- /dev/null +++ b/paddle/fluid/framework/mixed_vector_test.cu @@ -0,0 +1,75 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + +#include +#include + +#include "glog/logging.h" +#include "gtest/gtest.h" +#include "paddle/fluid/framework/mixed_vector.h" +#include "paddle/fluid/platform/gpu_info.h" + +template +using vec = paddle::framework::Vector; + +static __global__ void multiply_10(int* ptr) { + for (int i = 0; i < 10; ++i) { + ptr[i] *= 10; + } +} + +cudaStream_t GetCUDAStream(paddle::platform::CUDAPlace place) { + return reinterpret_cast( + paddle::platform::DeviceContextPool::Instance().Get(place)) + ->stream(); +} + +TEST(mixed_vector, GPU_VECTOR) { + vec tmp; + for (int i = 0; i < 10; ++i) { + tmp.push_back(i); + } + ASSERT_EQ(tmp.size(), 10UL); + paddle::platform::CUDAPlace gpu(0); + + multiply_10<<<1, 1, 0, GetCUDAStream(gpu)>>>(tmp.MutableData(gpu)); + + for (int i = 0; i < 10; ++i) { + ASSERT_EQ(tmp[i], i * 10); + } +} + +TEST(mixed_vector, MultiGPU) { + if (paddle::platform::GetCUDADeviceCount() < 2) { + LOG(WARNING) << "Skip mixed_vector.MultiGPU since there are not multiple " + "GPUs in your machine."; + return; + } + + vec tmp; + for (int i = 0; i < 10; ++i) { + tmp.push_back(i); + } + ASSERT_EQ(tmp.size(), 10UL); + paddle::platform::CUDAPlace gpu0(0); + paddle::platform::SetDeviceId(0); + multiply_10<<<1, 1, 0, GetCUDAStream(gpu0)>>>(tmp.MutableData(gpu0)); + paddle::platform::CUDAPlace gpu1(1); + auto* gpu1_ptr = tmp.MutableData(gpu1); + paddle::platform::SetDeviceId(1); + multiply_10<<<1, 1, 0, GetCUDAStream(gpu1)>>>(gpu1_ptr); + for (int i = 0; i < 10; ++i) { + ASSERT_EQ(tmp[i], i * 100); + } +} From a4c5223713ab1c2d1683071bd8218f2a37cc15e1 Mon Sep 17 00:00:00 2001 From: chengduo Date: Wed, 25 Jul 2018 13:44:13 +0800 Subject: [PATCH 20/37] Update test_pe_mnist threshold (#12348) * update test_pe_mnist threshold * clean code --- .../unittests/test_parallel_executor_mnist.py | 85 ++++++------------- 1 file changed, 28 insertions(+), 57 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py index b21e16439a..76389d916f 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py @@ -107,44 +107,24 @@ class TestMNIST(TestParallelExecutorBase): label = np.ones(shape=[32, 1], dtype='int64') return img, label - # simple_fc - def check_simple_fc_convergence(self, use_cuda, use_reduce=False): + def _compare_reduce_and_allreduce(self, model, use_cuda, random_data=True): if use_cuda and not core.is_compiled_with_cuda(): return - self.check_network_convergence(simple_fc_net, use_cuda=use_cuda) self.check_network_convergence( - simple_fc_net, use_cuda=use_cuda, allow_op_delay=True) - - img, label = self._init_data() - + model, use_cuda=use_cuda, use_reduce=True) self.check_network_convergence( - simple_fc_net, - feed_dict={"image": img, - "label": label}, - use_cuda=use_cuda, - use_reduce=use_reduce) + model, use_cuda=use_cuda, allow_op_delay=True, use_reduce=True) - def check_simple_fc_convergence_with_Reduce(self, use_cuda): - if use_cuda and not core.is_compiled_with_cuda(): - return - self.check_network_convergence( - simple_fc_net, use_cuda=use_cuda, use_reduce=True) - self.check_network_convergence( - simple_fc_net, - use_cuda=use_cuda, - allow_op_delay=True, - use_reduce=True) - - img, label = self._init_data() + img, label = self._init_data(random_data) all_reduce_first_loss, all_reduce_last_loss = self.check_network_convergence( - simple_fc_net, + model, feed_dict={"image": img, "label": label}, 
use_cuda=use_cuda, use_reduce=False) reduce_first_loss, reduce_last_loss = self.check_network_convergence( - simple_fc_net, + model, feed_dict={"image": img, "label": label}, use_cuda=use_cuda, @@ -153,7 +133,24 @@ class TestMNIST(TestParallelExecutorBase): for loss in zip(all_reduce_first_loss, reduce_first_loss): self.assertAlmostEquals(loss[0], loss[1], delta=1e-6) for loss in zip(all_reduce_last_loss, reduce_last_loss): - self.assertAlmostEquals(loss[0], loss[1], delta=1e-6) + self.assertAlmostEquals(loss[0], loss[1], delta=1e-4) + + # simple_fc + def check_simple_fc_convergence(self, use_cuda, use_reduce=False): + if use_cuda and not core.is_compiled_with_cuda(): + return + self.check_network_convergence(simple_fc_net, use_cuda=use_cuda) + self.check_network_convergence( + simple_fc_net, use_cuda=use_cuda, allow_op_delay=True) + + img, label = self._init_data() + + self.check_network_convergence( + simple_fc_net, + feed_dict={"image": img, + "label": label}, + use_cuda=use_cuda, + use_reduce=use_reduce) def test_simple_fc(self): # use_cuda @@ -162,8 +159,8 @@ class TestMNIST(TestParallelExecutorBase): def test_simple_fc_with_new_strategy(self): # use_cuda, use_reduce - self.check_simple_fc_convergence_with_Reduce(True) - self.check_simple_fc_convergence_with_Reduce(False) + self._compare_reduce_and_allreduce(simple_fc_net, True) + self._compare_reduce_and_allreduce(simple_fc_net, False) def check_simple_fc_parallel_accuracy(self, use_cuda): if use_cuda and not core.is_compiled_with_cuda(): @@ -209,39 +206,13 @@ class TestMNIST(TestParallelExecutorBase): "label": label}, use_cuda=use_cuda) - def check_batchnorm_fc_convergence_use_reduce(self, use_cuda): - if use_cuda and not core.is_compiled_with_cuda(): - return - self.check_network_convergence( - fc_with_batchnorm, use_cuda=use_cuda, use_reduce=True) - - img, label = self._init_data() - - all_reduce_first_loss, all_reduce_last_loss = self.check_network_convergence( - fc_with_batchnorm, - feed_dict={"image": img, - "label": label}, - use_cuda=use_cuda, - use_reduce=False) - reduce_first_loss, reduce_last_loss = self.check_network_convergence( - fc_with_batchnorm, - feed_dict={"image": img, - "label": label}, - use_cuda=use_cuda, - use_reduce=True) - - for loss in zip(all_reduce_first_loss, reduce_first_loss): - self.assertAlmostEquals(loss[0], loss[1], delta=1e-6) - for loss in zip(all_reduce_last_loss, reduce_last_loss): - self.assertAlmostEquals(loss[0], loss[1], delta=1e-4) - def test_batchnorm_fc(self): self.check_batchnorm_fc_convergence(True) self.check_batchnorm_fc_convergence(False) def test_batchnorm_fc_with_new_strategy(self): - self.check_batchnorm_fc_convergence_use_reduce(True) - self.check_batchnorm_fc_convergence_use_reduce(False) + self._compare_reduce_and_allreduce(fc_with_batchnorm, True) + self._compare_reduce_and_allreduce(fc_with_batchnorm, False) if __name__ == '__main__': From 754e96a30c1812a658c5ccf1281a1c39fef9416b Mon Sep 17 00:00:00 2001 From: qiaolongfei Date: Wed, 25 Jul 2018 14:04:58 +0800 Subject: [PATCH 21/37] distribute lookup table work with parallel executor --- paddle/fluid/framework/details/multi_devices_graph_builder.cc | 2 +- python/paddle/fluid/transpiler/distribute_transpiler.py | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.cc b/paddle/fluid/framework/details/multi_devices_graph_builder.cc index f1f8674caf..e0fb92cc3c 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_builder.cc +++ 
b/paddle/fluid/framework/details/multi_devices_graph_builder.cc @@ -207,7 +207,7 @@ std::unique_ptr MultiDevSSAGraphBuilder::Apply( result.Set("ops", new GraphOps); // find send/recv vars so that we can place the distributed training - // realted op in the place 0 + // related op in the place 0 auto send_vars = FindDistTrainSendVars(nodes); auto recv_vars = FindDistTrainRecvVars(nodes); diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py index e7698d8c52..4a9ea6af74 100644 --- a/python/paddle/fluid/transpiler/distribute_transpiler.py +++ b/python/paddle/fluid/transpiler/distribute_transpiler.py @@ -779,7 +779,9 @@ class DistributeTranspiler(object): outputs={"Out": prefetch_output_vars}, attrs={ "epmap": pserver_endpoints, - RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE + # FIXME(qiao) temporarily disable this config because prefetch + # is not act as other rpc op, it's more like a forward op + # RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE }) # insert concat_op From 01566fb61b6605906339dad6eefa68a9a235299d Mon Sep 17 00:00:00 2001 From: nhzlx Date: Wed, 25 Jul 2018 06:25:03 +0000 Subject: [PATCH 22/37] 1. support mutil batch utest 2. support pool op --- .../inference/tensorrt/convert/pool2d_op.cc | 2 ++ .../tensorrt/convert/test_activation_op.cc | 5 +-- .../inference/tensorrt/convert/test_fc_op.cc | 8 ++--- .../inference/tensorrt/convert/test_mul_op.cc | 3 +- .../tensorrt/convert/test_pool2d_op.cc | 21 ++++++------ .../inference/tensorrt/convert/ut_helper.h | 26 +++++++++++---- .../fluid/inference/tensorrt/test_engine.cc | 33 ++++++++++++++++++- 7 files changed, 73 insertions(+), 25 deletions(-) diff --git a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc index ea282a79d5..2db551758c 100644 --- a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc @@ -43,6 +43,8 @@ class Pool2dOpConverter : public OpConverter { const nvinfer1::DimsHW nv_strides(strides[0], strides[1]); const nvinfer1::DimsHW nv_paddings(paddings[0], paddings[1]); + PADDLE_ENFORCE_EQ(input1->getDimensions().nbDims, 3UL); + nvinfer1::PoolingType pool_t = nvinfer1::PoolingType::kMAX; if (pool_type == "max") { pool_t = nvinfer1::PoolingType::kMAX; diff --git a/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc b/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc index 7dabfd9f6a..5a1543de81 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc @@ -23,7 +23,8 @@ namespace tensorrt { TEST(ReluOpConverter, main) { framework::Scope scope; std::unordered_set parameters; - TRTConvertValidation validator(10, parameters, scope, 1000); + int runtime_batch = 3; + TRTConvertValidation validator(10, parameters, scope, 1000, runtime_batch); validator.DeclInputVar("relu-X", nvinfer1::Dims2(10, 6)); validator.DeclOutputVar("relu-Out", nvinfer1::Dims2(10, 6)); @@ -37,7 +38,7 @@ TEST(ReluOpConverter, main) { validator.SetOp(*desc.Proto()); LOG(INFO) << "execute"; - validator.Execute(1); + validator.Execute(runtime_batch); } } // namespace tensorrt diff --git a/paddle/fluid/inference/tensorrt/convert/test_fc_op.cc b/paddle/fluid/inference/tensorrt/convert/test_fc_op.cc index 081f4d6059..0c62108332 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_fc_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_fc_op.cc @@ -23,10 +23,10 @@ 
namespace tensorrt { TEST(fc_op, test) { std::unordered_set parameters({"mul-Y"}); framework::Scope scope; - TRTConvertValidation validator(10, parameters, scope, 1000); - validator.DeclInputVar("mul-X", nvinfer1::Dims4(1, 10, 1, 1)); + int runtime_batch = 2; + TRTConvertValidation validator(10, parameters, scope, 1000, runtime_batch); + validator.DeclInputVar("mul-X", nvinfer1::Dims3(10, 1, 1)); validator.DeclParamVar("mul-Y", nvinfer1::Dims2(10, 2)); - // validator.DeclParamVar("mul-Y", nvinfer1::Dims2(8, 2)); validator.DeclOutputVar("mul-Out", nvinfer1::Dims2(1, 2)); // Prepare Op description @@ -38,7 +38,7 @@ TEST(fc_op, test) { validator.SetOp(*desc.Proto()); - validator.Execute(1); + validator.Execute(runtime_batch); } } // namespace tensorrt diff --git a/paddle/fluid/inference/tensorrt/convert/test_mul_op.cc b/paddle/fluid/inference/tensorrt/convert/test_mul_op.cc index 674f37f2fd..a54870566a 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_mul_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_mul_op.cc @@ -23,7 +23,8 @@ namespace tensorrt { TEST(MulOpConverter, main) { framework::Scope scope; std::unordered_set parameters; - TRTConvertValidation validator(10, parameters, scope, 1000); + int runtime_batch = 0; + TRTConvertValidation validator(10, parameters, scope, 1000, runtime_batch); validator.DeclInputVar("mul-X", nvinfer1::Dims2(10, 6)); validator.DeclInputVar("mul-Y", nvinfer1::Dims2(6, 10)); validator.DeclOutputVar("mul-Out", nvinfer1::Dims2(10, 10)); diff --git a/paddle/fluid/inference/tensorrt/convert/test_pool2d_op.cc b/paddle/fluid/inference/tensorrt/convert/test_pool2d_op.cc index 261eb2c6ce..57c982f2c0 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_pool2d_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_pool2d_op.cc @@ -23,9 +23,14 @@ namespace tensorrt { TEST(Pool2dOpConverter, main) { framework::Scope scope; std::unordered_set parameters; - TRTConvertValidation validator(10, parameters, scope, 1000); - validator.DeclInputVar("pool2d-X", nvinfer1::Dims4(10, 3, 2, 2)); - validator.DeclOutputVar("pool2d-Out", nvinfer1::Dims4(10, 3, 1, 1)); + int runtime_batch = 3; + TRTConvertValidation validator(5, parameters, scope, 1 << 15, runtime_batch); + + // We have already set the runtime batchsize, so the + // Dims should not contain the batch size. + // The ITensor's Dims of input and output should be C * H * W. 
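+  // With the 2x2 kernel, 2x2 strides and zero paddings set below, each
+  // 4x4 feature map is pooled down to 2x2, which gives the output dims.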
+ validator.DeclInputVar("pool2d-X", nvinfer1::Dims3(3, 4, 4)); + validator.DeclOutputVar("pool2d-Out", nvinfer1::Dims3(3, 2, 2)); // Prepare Op description framework::OpDesc desc; @@ -34,7 +39,7 @@ TEST(Pool2dOpConverter, main) { desc.SetOutput("Out", {"pool2d-Out"}); std::vector ksize({2, 2}); - std::vector strides({1, 1}); + std::vector strides({2, 2}); std::vector paddings({0, 0}); std::string pooling_t = "max"; @@ -42,18 +47,12 @@ TEST(Pool2dOpConverter, main) { desc.SetAttr("ksize", ksize); desc.SetAttr("strides", strides); desc.SetAttr("paddings", paddings); - // std::string temp = ""; - // (*desc.Proto()).SerializeToString(&temp); - - // std::cout << temp << std::endl; - // std::ofstream f("__temp__", std::ios::out); - // f << temp; LOG(INFO) << "set OP"; validator.SetOp(*desc.Proto()); LOG(INFO) << "execute"; - validator.Execute(10); + validator.Execute(runtime_batch); } } // namespace tensorrt diff --git a/paddle/fluid/inference/tensorrt/convert/ut_helper.h b/paddle/fluid/inference/tensorrt/convert/ut_helper.h index f14885b238..e20169c4cf 100644 --- a/paddle/fluid/inference/tensorrt/convert/ut_helper.h +++ b/paddle/fluid/inference/tensorrt/convert/ut_helper.h @@ -63,13 +63,15 @@ class TRTConvertValidation { public: TRTConvertValidation() = delete; - TRTConvertValidation(int batch_size, + TRTConvertValidation(int max_batch_size, const std::unordered_set& parameters, framework::Scope& scope, // NOLINT - int workspace_size = 1 << 10) - : parameters_(parameters), scope_(scope) { + int workspace_size = 1 << 10, int runtime_batch_size = 1) + : parameters_(parameters), + scope_(scope), + runtime_batch_size_(runtime_batch_size) { // create engine. - engine_.reset(new TensorRTEngine(batch_size, workspace_size, &stream_)); + engine_.reset(new TensorRTEngine(max_batch_size, workspace_size, &stream_)); engine_->InitNetwork(); PADDLE_ENFORCE_EQ(cudaStreamCreate(&stream_), 0); @@ -84,7 +86,7 @@ class TRTConvertValidation { // Declare a parameter varaible in the scope. void DeclParamVar(const std::string& name, const nvinfer1::Dims& dims) { - DeclVar(name, dims); + DeclVar(name, dims, true); } void DeclOutputVar(const std::string& name, const nvinfer1::Dims& dims) { @@ -92,12 +94,20 @@ class TRTConvertValidation { } // Declare a variable in a fluid Scope. - void DeclVar(const std::string& name, const nvinfer1::Dims& dims) { + void DeclVar(const std::string& name, const nvinfer1::Dims& dims, + bool is_param = false) { platform::CPUPlace place; platform::CPUDeviceContext ctx(place); // Init Fluid tensor. std::vector dim_vec(dims.d, dims.d + dims.nbDims); + // There is no batchsize in ITensor's shape, but We should add it to + // tensor's + // shape of fluid. If the variable is not parameter and the batch size + // greater than 0, + // add the batchsize to dim_vec. + if (is_param != true && runtime_batch_size_ > 0) + dim_vec.insert(dim_vec.begin(), runtime_batch_size_); auto* x = scope_.Var(name); auto* x_tensor = x->GetMutable(); x_tensor->Resize(framework::make_ddim(dim_vec)); @@ -167,6 +177,10 @@ class TRTConvertValidation { std::unique_ptr op_desc_; const std::unordered_set& parameters_; framework::Scope& scope_; + // It represents the runtime batchsize when we test. + // If the value greater than 0, we add this to + // the first dimension of tensor's shape of fluid. 
+ int runtime_batch_size_; }; } // namespace tensorrt diff --git a/paddle/fluid/inference/tensorrt/test_engine.cc b/paddle/fluid/inference/tensorrt/test_engine.cc index f8732e51b6..dc03702990 100644 --- a/paddle/fluid/inference/tensorrt/test_engine.cc +++ b/paddle/fluid/inference/tensorrt/test_engine.cc @@ -113,7 +113,7 @@ TEST_F(TensorRTEngineTest, add_layer_multi_dim) { ASSERT_EQ(y_cpu[1], 14.5); } -TEST_F(TensorRTEngineTest, test_conv2d_temp) { +TEST_F(TensorRTEngineTest, test_conv2d) { // Weight in CPU memory. float raw_weight[9] = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}; float raw_bias[1] = {0}; @@ -146,6 +146,37 @@ TEST_F(TensorRTEngineTest, test_conv2d_temp) { ASSERT_EQ(y_cpu[1], 6.0); } +TEST_F(TensorRTEngineTest, test_pool2d) { + // Weight in CPU memory. + auto* x = engine_->DeclareInput("x", nvinfer1::DataType::kFLOAT, + nvinfer1::Dims3{1, 2, 2}); + + nvinfer1::PoolingType pool_t = nvinfer1::PoolingType::kAVERAGE; + auto* pool_layer = + TRT_ENGINE_ADD_LAYER(engine_, Pooling, *const_cast(x), + pool_t, nvinfer1::DimsHW{2, 2}); + + PADDLE_ENFORCE(pool_layer != nullptr); + pool_layer->setStride(nvinfer1::DimsHW{1, 1}); + pool_layer->setPadding(nvinfer1::DimsHW{0, 0}); + + engine_->DeclareOutput(pool_layer, 0, "y"); + engine_->FreezeNetwork(); + ASSERT_EQ(engine_->engine()->getNbBindings(), 2); + + float x_v[8] = {1.0, 2.0, 5.0, 0.0, 2.0, 3.0, 5.0, 10.0}; + engine_->SetInputFromCPU("x", reinterpret_cast(&x_v), + 8 * sizeof(float)); + engine_->Execute(2); + + LOG(INFO) << "to get output"; + float* y_cpu = new float[2]; + engine_->GetOutputInCPU("y", &y_cpu[0], 2 * sizeof(float)); + + ASSERT_EQ(y_cpu[0], 2.0); + ASSERT_EQ(y_cpu[1], 5.0); +} + } // namespace tensorrt } // namespace inference } // namespace paddle From fd2d2c66e98e862d3b35e768149ebc18bc64c8a5 Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Wed, 25 Jul 2018 17:17:34 +0800 Subject: [PATCH 23/37] add flag to prevent unnessary memory free --- paddle/fluid/memory/detail/buddy_allocator.cc | 17 ++++-- python/paddle/fluid/__init__.py | 56 +++++++++---------- 2 files changed, 39 insertions(+), 34 deletions(-) diff --git a/paddle/fluid/memory/detail/buddy_allocator.cc b/paddle/fluid/memory/detail/buddy_allocator.cc index 01a8501dd4..c2f45fdc99 100644 --- a/paddle/fluid/memory/detail/buddy_allocator.cc +++ b/paddle/fluid/memory/detail/buddy_allocator.cc @@ -15,6 +15,10 @@ limitations under the License. 
*/ #include "paddle/fluid/memory/detail/buddy_allocator.h" #include "glog/logging.h" +DEFINE_bool(free_idle_memory, false, + "If it is true, Paddle will try to free idle memory trunks during " + "running time."); + namespace paddle { namespace memory { namespace detail { @@ -152,13 +156,14 @@ void BuddyAllocator::Free(void* p) { pool_.insert( IndexSizeAddress(block->index(cache_), block->total_size(cache_), block)); - // Clean up if existing too much free memory - - // Prefer freeing fallback allocation first - CleanIdleFallBackAlloc(); + if (FLAGS_free_idle_memory) { + // Clean up if existing too much free memory + // Prefer freeing fallback allocation first + CleanIdleFallBackAlloc(); - // Free normal allocation - CleanIdleNormalAlloc(); + // Free normal allocation + CleanIdleNormalAlloc(); + } } size_t BuddyAllocator::Used() { return total_used_; } diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index 9903047f74..c2d641600c 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -62,33 +62,33 @@ from paddle.fluid.layers.math_op_patch import monkey_patch_variable Tensor = LoDTensor __all__ = framework.__all__ + executor.__all__ + concurrency.__all__ + \ - trainer.__all__ + inferencer.__all__ + transpiler.__all__ + \ - parallel_executor.__all__ + lod_tensor.__all__ + [ - 'io', - 'initializer', - 'layers', - 'contrib', - 'transpiler', - 'nets', - 'optimizer', - 'learning_rate_decay', - 'backward', - 'regularizer', - 'LoDTensor', - 'LoDTensorArray', - 'CPUPlace', - 'CUDAPlace', - 'CUDAPinnedPlace', - 'Tensor', - 'ParamAttr', - 'WeightNormParamAttr', - 'DataFeeder', - 'clip', - 'profiler', - 'unique_name', - 'recordio_writer', - 'Scope', - ] + trainer.__all__ + inferencer.__all__ + transpiler.__all__ + \ + parallel_executor.__all__ + lod_tensor.__all__ + [ + 'io', + 'initializer', + 'layers', + 'contrib', + 'transpiler', + 'nets', + 'optimizer', + 'learning_rate_decay', + 'backward', + 'regularizer', + 'LoDTensor', + 'LoDTensorArray', + 'CPUPlace', + 'CUDAPlace', + 'CUDAPinnedPlace', + 'Tensor', + 'ParamAttr', + 'WeightNormParamAttr', + 'DataFeeder', + 'clip', + 'profiler', + 'unique_name', + 'recordio_writer', + 'Scope', + ] def __bootstrap__(): @@ -123,7 +123,7 @@ def __bootstrap__(): read_env_flags = [ 'use_pinned_memory', 'check_nan_inf', 'benchmark', 'warpctc_dir', 'eager_delete_scope', 'use_mkldnn', 'initial_cpu_memory_in_mb', - 'init_allocated_mem' + 'init_allocated_mem', 'free_idle_memory' ] if core.is_compiled_with_dist(): read_env_flags.append('rpc_deadline') From 5533400720d5f1d407320667ea09b58395d44607 Mon Sep 17 00:00:00 2001 From: nhzlx Date: Wed, 25 Jul 2018 11:48:19 +0000 Subject: [PATCH 24/37] fix comments --- .../inference/tensorrt/convert/pool2d_op.cc | 9 ++--- .../tensorrt/convert/test_activation_op.cc | 5 ++- .../inference/tensorrt/convert/test_fc_op.cc | 5 ++- .../inference/tensorrt/convert/test_mul_op.cc | 5 ++- .../tensorrt/convert/test_pool2d_op.cc | 10 +++--- .../inference/tensorrt/convert/ut_helper.h | 33 +++++++++++-------- 6 files changed, 35 insertions(+), 32 deletions(-) diff --git a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc index 2db551758c..c5852bdb53 100644 --- a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc @@ -30,6 +30,7 @@ class Pool2dOpConverter : public OpConverter { framework::OpDesc op_desc(op, nullptr); // Declare inputs auto* input1 = 
engine_->GetITensor(op_desc.Input("X")[0]); + std::string pool_type = boost::get(op_desc.GetAttr("pooling_type")); std::vector ksize = @@ -45,18 +46,18 @@ class Pool2dOpConverter : public OpConverter { PADDLE_ENFORCE_EQ(input1->getDimensions().nbDims, 3UL); - nvinfer1::PoolingType pool_t = nvinfer1::PoolingType::kMAX; + nvinfer1::PoolingType nv_pool_type = nvinfer1::PoolingType::kMAX; if (pool_type == "max") { - pool_t = nvinfer1::PoolingType::kMAX; + nv_pool_type = nvinfer1::PoolingType::kMAX; } else if (pool_type == "avg") { - pool_t = nvinfer1::PoolingType::kAVERAGE; + nv_pool_type = nvinfer1::PoolingType::kAVERAGE; } else { PADDLE_THROW("TensorRT unsupported pooling type!"); } auto* layer = TRT_ENGINE_ADD_LAYER(engine_, Pooling, *const_cast(input1), - pool_t, nv_ksize); + nv_pool_type, nv_ksize); PADDLE_ENFORCE_NOT_NULL(layer, "pool layer could not be created."); layer->setStride(nv_strides); layer->setPadding(nv_paddings); diff --git a/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc b/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc index 5a1543de81..e82762ea03 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc @@ -23,8 +23,7 @@ namespace tensorrt { TEST(ReluOpConverter, main) { framework::Scope scope; std::unordered_set parameters; - int runtime_batch = 3; - TRTConvertValidation validator(10, parameters, scope, 1000, runtime_batch); + TRTConvertValidation validator(10, parameters, scope, 1000); validator.DeclInputVar("relu-X", nvinfer1::Dims2(10, 6)); validator.DeclOutputVar("relu-Out", nvinfer1::Dims2(10, 6)); @@ -38,7 +37,7 @@ TEST(ReluOpConverter, main) { validator.SetOp(*desc.Proto()); LOG(INFO) << "execute"; - validator.Execute(runtime_batch); + validator.Execute(5); } } // namespace tensorrt diff --git a/paddle/fluid/inference/tensorrt/convert/test_fc_op.cc b/paddle/fluid/inference/tensorrt/convert/test_fc_op.cc index 0c62108332..1ae2668e73 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_fc_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_fc_op.cc @@ -23,8 +23,7 @@ namespace tensorrt { TEST(fc_op, test) { std::unordered_set parameters({"mul-Y"}); framework::Scope scope; - int runtime_batch = 2; - TRTConvertValidation validator(10, parameters, scope, 1000, runtime_batch); + TRTConvertValidation validator(10, parameters, scope, 1000); validator.DeclInputVar("mul-X", nvinfer1::Dims3(10, 1, 1)); validator.DeclParamVar("mul-Y", nvinfer1::Dims2(10, 2)); validator.DeclOutputVar("mul-Out", nvinfer1::Dims2(1, 2)); @@ -38,7 +37,7 @@ TEST(fc_op, test) { validator.SetOp(*desc.Proto()); - validator.Execute(runtime_batch); + validator.Execute(10); } } // namespace tensorrt diff --git a/paddle/fluid/inference/tensorrt/convert/test_mul_op.cc b/paddle/fluid/inference/tensorrt/convert/test_mul_op.cc index a54870566a..3d34cd7d5d 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_mul_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_mul_op.cc @@ -23,8 +23,7 @@ namespace tensorrt { TEST(MulOpConverter, main) { framework::Scope scope; std::unordered_set parameters; - int runtime_batch = 0; - TRTConvertValidation validator(10, parameters, scope, 1000, runtime_batch); + TRTConvertValidation validator(10, parameters, scope, 1000, false); validator.DeclInputVar("mul-X", nvinfer1::Dims2(10, 6)); validator.DeclInputVar("mul-Y", nvinfer1::Dims2(6, 10)); validator.DeclOutputVar("mul-Out", nvinfer1::Dims2(10, 10)); @@ -40,7 +39,7 @@ TEST(MulOpConverter, main) 
{ validator.SetOp(*desc.Proto()); LOG(INFO) << "execute"; - validator.Execute(1); + validator.Execute(2); } } // namespace tensorrt diff --git a/paddle/fluid/inference/tensorrt/convert/test_pool2d_op.cc b/paddle/fluid/inference/tensorrt/convert/test_pool2d_op.cc index 57c982f2c0..c5dddbc8cd 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_pool2d_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_pool2d_op.cc @@ -23,12 +23,10 @@ namespace tensorrt { TEST(Pool2dOpConverter, main) { framework::Scope scope; std::unordered_set parameters; - int runtime_batch = 3; - TRTConvertValidation validator(5, parameters, scope, 1 << 15, runtime_batch); + TRTConvertValidation validator(5, parameters, scope, 1 << 15); - // We have already set the runtime batchsize, so the - // Dims should not contain the batch size. - // The ITensor's Dims of input and output should be C * H * W. + // The ITensor's Dims should not contain the batch size, + // so the ITensor's Dims of input and output should be C * H * W. validator.DeclInputVar("pool2d-X", nvinfer1::Dims3(3, 4, 4)); validator.DeclOutputVar("pool2d-Out", nvinfer1::Dims3(3, 2, 2)); @@ -52,7 +50,7 @@ TEST(Pool2dOpConverter, main) { validator.SetOp(*desc.Proto()); LOG(INFO) << "execute"; - validator.Execute(runtime_batch); + validator.Execute(3); } } // namespace tensorrt diff --git a/paddle/fluid/inference/tensorrt/convert/ut_helper.h b/paddle/fluid/inference/tensorrt/convert/ut_helper.h index e20169c4cf..a4ab111eb4 100644 --- a/paddle/fluid/inference/tensorrt/convert/ut_helper.h +++ b/paddle/fluid/inference/tensorrt/convert/ut_helper.h @@ -66,10 +66,11 @@ class TRTConvertValidation { TRTConvertValidation(int max_batch_size, const std::unordered_set& parameters, framework::Scope& scope, // NOLINT - int workspace_size = 1 << 10, int runtime_batch_size = 1) + int workspace_size = 1 << 10, bool if_add_batch = true) : parameters_(parameters), scope_(scope), - runtime_batch_size_(runtime_batch_size) { + if_add_batch_(if_add_batch), + max_batch_size_(max_batch_size) { // create engine. engine_.reset(new TensorRTEngine(max_batch_size, workspace_size, &stream_)); engine_->InitNetwork(); @@ -102,12 +103,10 @@ class TRTConvertValidation { // Init Fluid tensor. std::vector dim_vec(dims.d, dims.d + dims.nbDims); // There is no batchsize in ITensor's shape, but We should add it to - // tensor's - // shape of fluid. If the variable is not parameter and the batch size - // greater than 0, - // add the batchsize to dim_vec. + // tensor's shape of fluid. If the variable is not a parameter and the + // if_add_batch_ flag is true, add the max batch size to dim_vec.
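+ // (For example, with max_batch_size = 5 and an input ITensor of + // Dims3(3, 4, 4), as in the pool2d test above, the fluid tensor is + // resized to (5, 3, 4, 4).)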
+ if (is_param != true && if_add_batch_ == true) + dim_vec.insert(dim_vec.begin(), max_batch_size_); auto* x = scope_.Var(name); auto* x_tensor = x->GetMutable(); x_tensor->Resize(framework::make_ddim(dim_vec)); @@ -141,6 +140,7 @@ class TRTConvertValidation { void Execute(int batch_size) { // Execute Fluid Op + PADDLE_ENFORCE_LE(batch_size, max_batch_size_); platform::CPUPlace place; platform::CPUDeviceContext ctx(place); op_->Run(scope_, place); @@ -159,9 +159,14 @@ class TRTConvertValidation { auto* var = scope_.FindVar(output); auto tensor = var->GetMutable(); framework::TensorToVector(*tensor, ctx, &fluid_out); + + size_t fluid_out_size = fluid_out.size(); + if (if_add_batch_ == true) { + fluid_out_size = batch_size * (tensor->dims().size() / max_batch_size_); + } // Compare two output ASSERT_FALSE(fluid_out.empty()); - for (size_t i = 0; i < fluid_out.size(); i++) { + for (size_t i = 0; i < fluid_out_size; i++) { // Loose the threshold for CI in different machine model. EXPECT_LT(std::abs(fluid_out[i] - trt_out[i]), 2e-5); } @@ -177,10 +182,12 @@ class TRTConvertValidation { std::unique_ptr op_desc_; const std::unordered_set& parameters_; framework::Scope& scope_; - // It represents the runtime batchsize when we test. - // If the value greater than 0, we add this to - // the first dimension of tensor's shape of fluid. - int runtime_batch_size_; + // The ITensor of trt does not contain the batch size, + // but, in most cases, we need to set the batch size for + // fluid's tensor shape. This variable indicates + // whether to add the batch size to fluid's tensor shape. + bool if_add_batch_; + int max_batch_size_; }; } // namespace tensorrt From c8adfb3451281aad51e1398023d5af80b31fa7f3 Mon Sep 17 00:00:00 2001 From: nhzlx Date: Wed, 25 Jul 2018 12:32:47 +0000 Subject: [PATCH 25/37] add paddle_enforce --- paddle/fluid/inference/tensorrt/convert/pool2d_op.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc index c5852bdb53..11cad95361 100644 --- a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc @@ -29,6 +29,8 @@ class Pool2dOpConverter : public OpConverter { << "convert a fluid pool2d op to tensorrt pool2d layer without bias"; framework::OpDesc op_desc(op, nullptr); // Declare inputs + PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1); + PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1); auto* input1 = engine_->GetITensor(op_desc.Input("X")[0]); std::string pool_type = From 83e59257d0a156b5bbe992d44d9b5d9ff5207d3d Mon Sep 17 00:00:00 2001 From: Luo Tao Date: Thu, 26 Jul 2018 12:16:14 +0800 Subject: [PATCH 26/37] fix manylinux1 Failed to publish artifacts --- paddle/fluid/inference/api/demo_ci/clean.sh | 4 ++++ paddle/scripts/paddle_build.sh | 1 + 2 files changed, 5 insertions(+) create mode 100755 paddle/fluid/inference/api/demo_ci/clean.sh diff --git a/paddle/fluid/inference/api/demo_ci/clean.sh b/paddle/fluid/inference/api/demo_ci/clean.sh new file mode 100755 index 0000000000..0d9f3d2aa2 --- /dev/null +++ b/paddle/fluid/inference/api/demo_ci/clean.sh @@ -0,0 +1,4 @@ +set -x +cd `dirname $0` +rm -rf build/ data/ +set +x diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 9e58a39eb0..0f334b2892 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -547,6 +547,7 @@ function test_fluid_inference_lib() { EOF cd ${PADDLE_ROOT}/paddle/fluid/inference/api/demo_ci
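# run.sh builds and runs the demo; the clean.sh added below then removes the demo's build/ and data/ outputs so stale artifacts cannot break publishing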
./run.sh ${PADDLE_ROOT} ${WITH_MKL:-ON} ${WITH_GPU:-OFF} + ./clean.sh fi } From 4f71a3b12bfb5401f1f4e49fe8e71634a5d1c645 Mon Sep 17 00:00:00 2001 From: nhzlx Date: Thu, 26 Jul 2018 04:33:08 +0000 Subject: [PATCH 27/37] fix a bug --- paddle/fluid/inference/tensorrt/convert/ut_helper.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/inference/tensorrt/convert/ut_helper.h b/paddle/fluid/inference/tensorrt/convert/ut_helper.h index a4ab111eb4..39529cc2c7 100644 --- a/paddle/fluid/inference/tensorrt/convert/ut_helper.h +++ b/paddle/fluid/inference/tensorrt/convert/ut_helper.h @@ -162,7 +162,8 @@ class TRTConvertValidation { size_t fluid_out_size = fluid_out.size(); if (if_add_batch_ == true) { - fluid_out_size = batch_size * (tensor->dims().size() / max_batch_size_); + fluid_out_size = + batch_size * (framework::product(tensor->dims()) / max_batch_size_); } // Compare two output ASSERT_FALSE(fluid_out.empty()); From 73fcfc06ec76bae730906583b44c649a7a008bc0 Mon Sep 17 00:00:00 2001 From: Wu Yi Date: Thu, 26 Jul 2018 13:46:08 +0800 Subject: [PATCH 28/37] refine conv cudnn enforce (#12353) * refine conv cudnn enforce * update * update all cudnn ops * fix --- paddle/fluid/operators/conv_cudnn_op.cu.cc | 26 +++++++++---------- .../operators/conv_transpose_cudnn_op.cu.cc | 18 ++++++------- paddle/fluid/operators/math/softmax.cu | 4 +-- paddle/fluid/operators/pool_cudnn_op.cu.cc | 4 +-- 4 files changed, 26 insertions(+), 26 deletions(-) diff --git a/paddle/fluid/operators/conv_cudnn_op.cu.cc b/paddle/fluid/operators/conv_cudnn_op.cu.cc index 1828be57b5..b3781ded01 100644 --- a/paddle/fluid/operators/conv_cudnn_op.cu.cc +++ b/paddle/fluid/operators/conv_cudnn_op.cu.cc @@ -77,7 +77,7 @@ class CUDNNConvOpKernel : public framework::OpKernel { // cudnn 7 can support groups, no need to do it mannually // FIXME(typhoonzero): find a better way to disable groups // rather than setting it to 1. 
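// Note: CUDNN_ENFORCE checks the cudnnStatus_t returned by the wrapped call // and throws on failure; PATCH 30 below refines the macro so the thrown // message carries the cuDNN error string instead of a generic "cuDNN call failed".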
- PADDLE_ENFORCE(platform::dynload::cudnnSetConvolutionGroupCount( + CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionGroupCount( cudnn_conv_desc, groups)); groups = 1; #endif @@ -129,7 +129,7 @@ class CUDNNConvOpKernel : public framework::OpKernel { auto& dev_ctx = ctx.template device_context(); auto handle = dev_ctx.cudnn_handle(); - PADDLE_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm( + CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm( handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc, cudnn_output_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, workspace_size_limit, &algo)); @@ -140,18 +140,18 @@ class CUDNNConvOpKernel : public framework::OpKernel { if (dev_ctx.GetComputeCapability() >= 70 && std::type_index(typeid(T)) == std::type_index(typeid(platform::float16))) { - PADDLE_ENFORCE(platform::dynload::cudnnSetConvolutionMathType( + CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType( cudnn_conv_desc, CUDNN_TENSOR_OP_MATH)); // Currently tensor core is only enabled using this algo algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM; } else { - PADDLE_ENFORCE(platform::dynload::cudnnSetConvolutionMathType( + CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType( cudnn_conv_desc, CUDNN_DEFAULT_MATH)); } #endif // get workspace size able to allocate - PADDLE_ENFORCE(platform::dynload::cudnnGetConvolutionForwardWorkspaceSize( + CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardWorkspaceSize( handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc, cudnn_output_desc, algo, &workspace_size_in_bytes)); // It is possible for float16 on Volta GPU to allocate more memory than @@ -165,7 +165,7 @@ class CUDNNConvOpKernel : public framework::OpKernel { // ------------------- cudnn conv forward --------------------- ScalingParamType alpha = 1.0f, beta = 0.0f; for (int i = 0; i < groups; i++) { - PADDLE_ENFORCE(platform::dynload::cudnnConvolutionForward( + CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward( handle, &alpha, cudnn_input_desc, input_data + i * group_offset_in, cudnn_filter_desc, filter_data + i * group_offset_filter, cudnn_conv_desc, algo, cudnn_workspace, workspace_size_in_bytes, @@ -218,7 +218,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { // cudnn 7 can support groups, no need to do it mannually // FIXME(typhoonzero): find a better way to disable groups // rather than setting it to 1. 
- PADDLE_ENFORCE(platform::dynload::cudnnSetConvolutionGroupCount( + CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionGroupCount( cudnn_conv_desc, groups)); groups = 1; #endif @@ -273,7 +273,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { auto handle = dev_ctx.cudnn_handle(); if (input_grad) { if (FLAGS_cudnn_deterministic) { - PADDLE_ENFORCE( + CUDNN_ENFORCE( platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm( handle, cudnn_filter_desc, // dyDesc: Handle to the previously initialized input @@ -289,7 +289,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { data_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1; } - PADDLE_ENFORCE( + CUDNN_ENFORCE( platform::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize( handle, cudnn_filter_desc, cudnn_output_grad_desc, cudnn_conv_desc, cudnn_input_desc, data_algo, &tmp_size)); @@ -298,7 +298,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { if (filter_grad) { if (FLAGS_cudnn_deterministic) { - PADDLE_ENFORCE( + CUDNN_ENFORCE( platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm( handle, cudnn_input_desc, cudnn_output_grad_desc, cudnn_conv_desc, cudnn_filter_desc, @@ -308,7 +308,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { filter_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1; } - PADDLE_ENFORCE( + CUDNN_ENFORCE( platform::dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize( handle, cudnn_input_desc, cudnn_output_grad_desc, cudnn_conv_desc, cudnn_filter_desc, filter_algo, &tmp_size)); @@ -326,7 +326,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { // Because beta is zero, it is unnecessary to reset input_grad. for (int i = 0; i < groups; i++) { - PADDLE_ENFORCE(platform::dynload::cudnnConvolutionBackwardData( + CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardData( handle, &alpha, cudnn_filter_desc, filter_data + i * group_offset_filter, cudnn_output_grad_desc, output_grad_data + i * group_offset_out, cudnn_conv_desc, data_algo, @@ -339,7 +339,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { T* filter_grad_data = filter_grad->mutable_data(ctx.GetPlace()); // Because beta is zero, it is unnecessary to reset filter_grad. for (int i = 0; i < groups; i++) { - PADDLE_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter( + CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter( handle, &alpha, cudnn_input_desc, input_data + i * group_offset_in, cudnn_output_grad_desc, output_grad_data + i * group_offset_out, cudnn_conv_desc, filter_algo, cudnn_workspace, diff --git a/paddle/fluid/operators/conv_transpose_cudnn_op.cu.cc b/paddle/fluid/operators/conv_transpose_cudnn_op.cu.cc index 038ea89990..82fff68e75 100644 --- a/paddle/fluid/operators/conv_transpose_cudnn_op.cu.cc +++ b/paddle/fluid/operators/conv_transpose_cudnn_op.cu.cc @@ -87,7 +87,7 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel { auto& dev_ctx = ctx.template device_context(); auto handle = dev_ctx.cudnn_handle(); // Get the algorithm - PADDLE_ENFORCE(platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm( + CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm( handle, cudnn_filter_desc, cudnn_input_desc, cudnn_conv_desc, // dxDesc: Handle to the previously initialized output tensor // descriptor. 
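// (Conv transpose forward is computed via cudnnConvolutionBackwardData, which // is why this forward kernel queries the "backward data" algorithm and workspace.)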
@@ -95,7 +95,7 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel { workspace_size_limit, &algo)); // get workspace size able to allocate - PADDLE_ENFORCE( + CUDNN_ENFORCE( platform::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize( handle, cudnn_filter_desc, cudnn_input_desc, cudnn_conv_desc, cudnn_output_desc, algo, &workspace_size_in_bytes)); @@ -110,7 +110,7 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel { int filter_offset = filter->numel() / groups; T alpha = 1.0f, beta = 0.0f; for (int g = 0; g < groups; g++) { - PADDLE_ENFORCE(platform::dynload::cudnnConvolutionBackwardData( + CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardData( handle, &alpha, cudnn_filter_desc, filter_data + filter_offset * g, cudnn_input_desc, input_data + input_offset * g, cudnn_conv_desc, algo, cudnn_workspace, workspace_size_in_bytes, &beta, @@ -178,11 +178,11 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel { auto handle = dev_ctx.cudnn_handle(); if (input_grad) { // choose backward algorithm for data - PADDLE_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm( + CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm( handle, cudnn_output_desc, cudnn_filter_desc, cudnn_conv_desc, cudnn_input_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, workspace_size_limit, &data_algo)); - PADDLE_ENFORCE(platform::dynload::cudnnGetConvolutionForwardWorkspaceSize( + CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardWorkspaceSize( handle, cudnn_output_desc, cudnn_filter_desc, cudnn_conv_desc, cudnn_input_desc, data_algo, &fwd_ws_size)); workspace_size_in_bytes = std::max(workspace_size_in_bytes, fwd_ws_size); @@ -190,7 +190,7 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel { if (filter_grad) { // choose backward algorithm for filter - PADDLE_ENFORCE( + CUDNN_ENFORCE( platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm( handle, cudnn_output_desc, cudnn_input_desc, cudnn_conv_desc, cudnn_filter_desc, @@ -198,7 +198,7 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel { workspace_size_limit, &filter_algo)); // get workspace for backwards filter algorithm - PADDLE_ENFORCE( + CUDNN_ENFORCE( platform::dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize( handle, cudnn_output_desc, cudnn_input_desc, cudnn_conv_desc, cudnn_filter_desc, filter_algo, &bwd_filter_ws_size)); @@ -222,7 +222,7 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel { T* input_grad_data = input_grad->mutable_data(ctx.GetPlace()); // Because beta is zero, it is unnecessary to reset input_grad. for (int g = 0; g < groups; g++) { - PADDLE_ENFORCE(platform::dynload::cudnnConvolutionForward( + CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward( handle, &alpha, cudnn_output_desc, output_grad_data + output_grad_offset * g, cudnn_filter_desc, filter_data + filter_offset * g, cudnn_conv_desc, data_algo, @@ -237,7 +237,7 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel { // Because beta is zero, it is unnecessary to reset filter_grad. 
// Gradient with respect to the filter for (int g = 0; g < groups; g++) { - PADDLE_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter( + CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter( handle, &alpha, cudnn_output_desc, output_grad_data + output_grad_offset * g, cudnn_input_desc, input_data + input_offset * g, cudnn_conv_desc, filter_algo, diff --git a/paddle/fluid/operators/math/softmax.cu b/paddle/fluid/operators/math/softmax.cu index a579182ec1..3effe77625 100644 --- a/paddle/fluid/operators/math/softmax.cu +++ b/paddle/fluid/operators/math/softmax.cu @@ -52,7 +52,7 @@ void SoftmaxCUDNNFunctor::operator()( xDesc.descriptor(layout, cudnn_tensor_dims); cudnnTensorDescriptor_t cudnn_y_desc = xDesc.descriptor(layout, cudnn_tensor_dims); - PADDLE_ENFORCE(platform::dynload::cudnnSoftmaxForward( + CUDNN_ENFORCE(platform::dynload::cudnnSoftmaxForward( context.cudnn_handle(), CUDNN_SOFTMAX_ACCURATE, CUDNN_SOFTMAX_MODE_INSTANCE, CudnnDataType::kOne(), cudnn_x_desc, X->data(), CudnnDataType::kZero(), cudnn_y_desc, @@ -83,7 +83,7 @@ void SoftmaxGradCUDNNFunctor::operator()( dxDesc.descriptor(layout, cudnn_tensor_dims); cudnnTensorDescriptor_t cudnn_ygrad_desc = dyDesc.descriptor(layout, cudnn_tensor_dims); - PADDLE_ENFORCE(platform::dynload::cudnnSoftmaxBackward( + CUDNN_ENFORCE(platform::dynload::cudnnSoftmaxBackward( context.cudnn_handle(), CUDNN_SOFTMAX_ACCURATE, CUDNN_SOFTMAX_MODE_INSTANCE, CudnnDataType::kOne(), cudnn_y_desc, Y->data(), cudnn_ygrad_desc, YGrad->data(), diff --git a/paddle/fluid/operators/pool_cudnn_op.cu.cc b/paddle/fluid/operators/pool_cudnn_op.cu.cc index be55bc43b1..31f083565f 100644 --- a/paddle/fluid/operators/pool_cudnn_op.cu.cc +++ b/paddle/fluid/operators/pool_cudnn_op.cu.cc @@ -81,7 +81,7 @@ class PoolCUDNNOpKernel : public framework::OpKernel { // ------------------- cudnn pool algorithm --------------------- auto handle = ctx.cuda_device_context().cudnn_handle(); ScalingParamType alpha = 1.0f, beta = 0.0f; - PADDLE_ENFORCE(platform::dynload::cudnnPoolingForward( + CUDNN_ENFORCE(platform::dynload::cudnnPoolingForward( handle, cudnn_pool_desc, &alpha, cudnn_input_desc, input_data, &beta, cudnn_output_desc, output_data)); } @@ -154,7 +154,7 @@ class PoolCUDNNGradOpKernel : public framework::OpKernel { T *input_grad_data = input_grad->mutable_data(ctx.GetPlace()); // Because beta is zero, it is unnecessary to reset input_grad. - PADDLE_ENFORCE(platform::dynload::cudnnPoolingBackward( + CUDNN_ENFORCE(platform::dynload::cudnnPoolingBackward( handle, cudnn_pool_desc, &alpha, cudnn_output_desc, output_data, cudnn_output_desc, output_grad_data, cudnn_input_desc, input_data, &beta, cudnn_input_desc, input_grad_data)); From 604bd85a458ea710342c705d45228dcd822653bf Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Thu, 26 Jul 2018 15:48:39 +0800 Subject: [PATCH 29/37] update inference_optimize() --- paddle/fluid/framework/block_desc.h | 5 ++--- python/paddle/fluid/framework.py | 23 ++++++++++++++++++++++- python/paddle/fluid/layers/io.py | 3 --- 3 files changed, 24 insertions(+), 7 deletions(-) diff --git a/paddle/fluid/framework/block_desc.h b/paddle/fluid/framework/block_desc.h index ce48548418..960ca39e1e 100644 --- a/paddle/fluid/framework/block_desc.h +++ b/paddle/fluid/framework/block_desc.h @@ -88,9 +88,8 @@ class BlockDesc { OpDesc *InsertOp(size_t index); /* - * Remove Op and its input/output variables. - * Note that for either input or output variable, if it is also an input or - * output variable of other ops, we should remain it. 
+ * Only remove the op itself; + * do nothing to its input and output variables. */ void RemoveOp(size_t s, size_t e); diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index db550eccf9..e10f8325e4 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -1540,7 +1540,12 @@ class Program(object): def inference_optimize(self): """ - This method will create a new program and change the :code:`is_test` + This method will create a new program and make the following adjustments to it: + 1. Remove all reader variables and their creator ops if they exist. + + 2. Remove the :code:`read_op` if it exists. + + 3. Change the :code:`is_test` attribute of operators to :code:`True`. All the :code:`Parameter` information will be lost. @@ -1554,6 +1559,22 @@ class Program(object): # core.inference_optimize being fixed. res = Program() res.desc = core.ProgramDesc(self.desc) + + # remove all readers and the read_op if they exist + read_op_idx = 0 + root_block = res.desc.block(0) + while True: + if read_op_idx >= root_block.op_size() or root_block.op( + read_op_idx).type() == 'read': + break + read_op_idx += 1 + if read_op_idx < root_block.op_size(): + root_block._remove_op(0, read_op_idx + 1) + for var in root_block.all_vars(): + if var.type() == core.VarDesc.VarType.READER: + root_block._remove_var(var.name()) + + # change all `is_test` attributes to True for i in xrange(res.desc.num_blocks()): block = res.desc.block(i) for j in xrange(block.op_size()): diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py index df6becabd1..fab4a92a0a 100644 --- a/python/paddle/fluid/layers/io.py +++ b/python/paddle/fluid/layers/io.py @@ -443,9 +443,6 @@ def random_data_generator(low, high, shapes, lod_levels, for_parallel=True): main_prog_var = _copy_reader_var_(default_main_program().current_block(), startup_var) - if for_parallel: - main_prog_var = parallel(reader=main_prog_var) - return monkey_patch_reader_methods(main_prog_var) From 54e9fd3f61fcba58e77a253ceab9af77754f176d Mon Sep 17 00:00:00 2001 From: typhoonzero Date: Thu, 26 Jul 2018 18:25:55 +0800 Subject: [PATCH 30/37] fix cudnn enforce --- paddle/fluid/platform/cudnn_helper.h | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/paddle/fluid/platform/cudnn_helper.h b/paddle/fluid/platform/cudnn_helper.h index 6ea4f8b7cb..bb8b14bb9f 100644 --- a/paddle/fluid/platform/cudnn_helper.h +++ b/paddle/fluid/platform/cudnn_helper.h @@ -59,13 +59,12 @@ inline const char* cudnnGetErrorString(cudnnStatus_t status) { #define CUDNN_VERSION_MIN(major, minor, patch) \ (CUDNN_VERSION >= ((major)*1000 + (minor)*100 + (patch))) -#define CUDNN_ENFORCE(condition) \ - do { \ - cudnnStatus_t status = condition; \ - if (status != CUDNN_STATUS_SUCCESS) { \ - VLOG(1) << ::paddle::platform::cudnnGetErrorString(status); \ - PADDLE_THROW("cuDNN call failed"); \ - } \ +#define CUDNN_ENFORCE(condition) \ + do { \ + cudnnStatus_t status = condition; \ + if (UNLIKELY(status != CUDNN_STATUS_SUCCESS)) { \ + PADDLE_THROW(::paddle::platform::cudnnGetErrorString(status)); \ + } \ } while (false) enum class DataLayout { // Not use From 10a1c2bb86d602f27490079adda1ab594fdeeed9 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Fri, 27 Jul 2018 10:16:14 +0800 Subject: [PATCH 31/37] control omp num_threads --- paddle/fluid/platform/cpu_helper.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/paddle/fluid/platform/cpu_helper.cc b/paddle/fluid/platform/cpu_helper.cc index 77ecb17011..234a04b5c2 100644 ---
a/paddle/fluid/platform/cpu_helper.cc +++ b/paddle/fluid/platform/cpu_helper.cc @@ -16,6 +16,7 @@ limitations under the License. */ #include "paddle/fluid/platform/enforce.h" #ifdef PADDLE_WITH_MKLML +#include #include "paddle/fluid/platform/dynload/mklml.h" #endif @@ -33,6 +34,7 @@ void SetNumThreads(int num_threads) { #elif defined(PADDLE_WITH_MKLML) int real_num_threads = num_threads > 1 ? num_threads : 1; platform::dynload::MKL_Set_Num_Threads(real_num_threads); + omp_set_num_threads(num_threads); #else PADDLE_ENFORCE(false, "To be implemented."); #endif From 9788e5ab8795c9f5cf19f4fa7f7dd37a20512d12 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Fri, 27 Jul 2018 10:36:06 +0800 Subject: [PATCH 32/37] add flags to control num_threads --- .../fluid/inference/tests/book/test_inference_nlp.cc | 11 ++--------- paddle/fluid/platform/cpu_helper.cc | 3 +++ paddle/fluid/platform/init.cc | 4 +++- 3 files changed, 8 insertions(+), 10 deletions(-) diff --git a/paddle/fluid/inference/tests/book/test_inference_nlp.cc b/paddle/fluid/inference/tests/book/test_inference_nlp.cc index 5cc1db12bb..e2a3e9d46e 100644 --- a/paddle/fluid/inference/tests/book/test_inference_nlp.cc +++ b/paddle/fluid/inference/tests/book/test_inference_nlp.cc @@ -20,9 +20,6 @@ limitations under the License. */ #include "gtest/gtest.h" #include "paddle/fluid/inference/tests/test_helper.h" #include "paddle/fluid/platform/cpu_helper.h" -#ifdef PADDLE_WITH_MKLML -#include -#endif DEFINE_string(model_path, "", "Directory of the inference model."); DEFINE_string(data_file, "", "File of input index data."); @@ -30,6 +27,7 @@ DEFINE_int32(repeat, 100, "Running the inference program repeat times"); DEFINE_bool(prepare_vars, true, "Prepare variables before executor"); DEFINE_int32(num_threads, 1, "Number of threads should be used"); DECLARE_bool(use_mkldnn); +DECLARE_int32(paddle_num_threads); inline double GetCurrentMs() { struct timeval time; @@ -160,12 +158,7 @@ TEST(inference, nlp) { std::unique_ptr scope( new paddle::framework::Scope()); -#ifdef PADDLE_WITH_MKLML - // only use 1 thread number per std::thread - omp_set_dynamic(0); - omp_set_num_threads(1); - paddle::platform::SetNumThreads(1); -#endif + paddle::platform::SetNumThreads(FLAGS_paddle_num_threads); double start_ms = 0, stop_ms = 0; if (FLAGS_num_threads > 1) { diff --git a/paddle/fluid/platform/cpu_helper.cc b/paddle/fluid/platform/cpu_helper.cc index 234a04b5c2..6841652b75 100644 --- a/paddle/fluid/platform/cpu_helper.cc +++ b/paddle/fluid/platform/cpu_helper.cc @@ -24,6 +24,9 @@ limitations under the License. */ #include #endif +DEFINE_int32(paddle_num_threads, 1, + "Number of threads for each paddle instance."); + namespace paddle { namespace platform { diff --git a/paddle/fluid/platform/init.cc b/paddle/fluid/platform/init.cc index 0b77652841..79a2ca96ed 100644 --- a/paddle/fluid/platform/init.cc +++ b/paddle/fluid/platform/init.cc @@ -23,6 +23,8 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/place.h" #include "paddle/fluid/string/piece.h" +DECLARE_int32(paddle_num_threads); + namespace paddle { namespace framework { @@ -115,7 +117,7 @@ void InitDevices(bool init_p2p, const std::vector devices) { places.emplace_back(platform::CPUPlace()); platform::DeviceContextPool::Init(places); #ifndef PADDLE_WITH_MKLDNN - platform::SetNumThreads(1); + platform::SetNumThreads(FLAGS_paddle_num_threads); #endif } From 23e3f790c0601136be1582cad7ad6da8b8e5ddb1 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Fri, 27 Jul 2018 11:00:41 +0800 Subject: [PATCH 33/37] enable read from python --- python/paddle/fluid/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index c2d641600c..d1d6dd75ee 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -123,7 +123,7 @@ def __bootstrap__(): read_env_flags = [ 'use_pinned_memory', 'check_nan_inf', 'benchmark', 'warpctc_dir', 'eager_delete_scope', 'use_mkldnn', 'initial_cpu_memory_in_mb', - 'init_allocated_mem', 'free_idle_memory' + 'init_allocated_mem', 'free_idle_memory', 'paddle_num_threads' ] if core.is_compiled_with_dist(): read_env_flags.append('rpc_deadline') From 4f0383f52ea7e7da06a7b383d736aeccb7d0e512 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Fri, 27 Jul 2018 13:29:38 +0800 Subject: [PATCH 34/37] fix unknown flag --- paddle/fluid/platform/cpu_helper.cc | 3 --- paddle/fluid/platform/init.cc | 3 ++- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/platform/cpu_helper.cc b/paddle/fluid/platform/cpu_helper.cc index 6841652b75..234a04b5c2 100644 --- a/paddle/fluid/platform/cpu_helper.cc +++ b/paddle/fluid/platform/cpu_helper.cc @@ -24,9 +24,6 @@ limitations under the License. */ #include #endif -DEFINE_int32(paddle_num_threads, 1, - "Number of threads for each paddle instance."); - namespace paddle { namespace platform { diff --git a/paddle/fluid/platform/init.cc b/paddle/fluid/platform/init.cc index 79a2ca96ed..6f1f0c4796 100644 --- a/paddle/fluid/platform/init.cc +++ b/paddle/fluid/platform/init.cc @@ -23,7 +23,8 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/place.h" #include "paddle/fluid/string/piece.h" -DECLARE_int32(paddle_num_threads); +DEFINE_int32(paddle_num_threads, 1, + "Number of threads for each paddle instance."); namespace paddle { namespace framework { From 57c8aa60c2e1b2f0a2e99b17c53c9b8ce56f2521 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Fri, 27 Jul 2018 13:50:15 +0800 Subject: [PATCH 35/37] Add self to author --- AUTHORS.md | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS.md b/AUTHORS.md index 8c4a113fc2..41b7193677 100644 --- a/AUTHORS.md +++ b/AUTHORS.md @@ -46,6 +46,7 @@ | tianbingsz | Tian-Bing Xu | | tpatejko | Tomasz Patejko | | typhoonzero | Yi Wu | +| velconia | Qi-Yang Min | | wanghaoshuang | Hao-Shuang Wang | | wangyang59 | Yang Wang | | wangzhen-nlp | Zhen Wang | From 2409d0f710aab9e030ff23b2b0b66d874294b931 Mon Sep 17 00:00:00 2001 From: chengduo Date: Fri, 27 Jul 2018 16:29:58 +0800 Subject: [PATCH 36/37] Refine regularization for selected_rows (#12369) * refine regularization for selected_rows * clean lookup_table * refine rpc_server_test * temporally disable rpc_server_test * fix rpc_server_test * add unit test --- paddle/fluid/operators/CMakeLists.txt | 1 + .../operators/distributed/CMakeLists.txt | 6 +- .../operators/distributed/rpc_server_test.cc | 25 ++--- paddle/fluid/operators/extract_rows_op.cc | 103 ++++++++++++++++++ paddle/fluid/operators/lookup_table_op.cc | 46 +++----- paddle/fluid/operators/lookup_table_op.cu | 73 +++++-------- paddle/fluid/operators/lookup_table_op.h | 40 +------ python/paddle/fluid/regularizer.py | 16 ++- .../tests/unittests/test_extract_rows_op.py | 58 ++++++++++ .../tests/unittests/test_lookup_table_op.py | 47 -------- 10 files changed, 240 insertions(+), 175 deletions(-) create mode 100644 paddle/fluid/operators/extract_rows_op.cc create mode 100644 python/paddle/fluid/tests/unittests/test_extract_rows_op.py diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 9b56ad4c55..2da52dbf48 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -270,6 +270,7 @@ op_library(cos_sim_op DEPS cos_sim_functor) op_library(parallel_do_op DEPS executor) op_library(unsqueeze_op DEPS reshape_op) op_library(squeeze_op DEPS reshape_op) +op_library(extract_rows_op DEPS memory) if (WITH_GPU) op_library(conv_op DEPS vol2col depthwise_conv im2col) diff --git a/paddle/fluid/operators/distributed/CMakeLists.txt b/paddle/fluid/operators/distributed/CMakeLists.txt index 1612927055..da5d20505e 100644 --- a/paddle/fluid/operators/distributed/CMakeLists.txt +++ b/paddle/fluid/operators/distributed/CMakeLists.txt @@ -17,9 +17,9 @@ if(WITH_GRPC) set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") set_source_files_properties(grpc_serde_test.cc rpc_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) cc_test(grpc_serde_test SRCS grpc_serde_test.cc - DEPS grpc++_unsecure grpc_unsecure gpr cares zlib protobuf sendrecvop_grpc scope profiler math_function SERIAL) - cc_test(rpc_server_test SRCS rpc_server_test.cc - DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor proto_desc lookup_table_op SERIAL) + DEPS grpc++_unsecure grpc_unsecure gpr cares zlib protobuf sendrecvop_grpc scope profiler math_function SERIAL) + cc_test(rpc_server_test SRCS rpc_server_test.cc + DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor proto_desc 
lookup_sparse_table_op SERIAL) return() endif() diff --git a/paddle/fluid/operators/distributed/rpc_server_test.cc b/paddle/fluid/operators/distributed/rpc_server_test.cc index 9f2360ec70..b50830c362 100644 --- a/paddle/fluid/operators/distributed/rpc_server_test.cc +++ b/paddle/fluid/operators/distributed/rpc_server_test.cc @@ -30,7 +30,7 @@ namespace framework = paddle::framework; namespace platform = paddle::platform; namespace distributed = paddle::operators::distributed; -USE_OP(lookup_table); +USE_NO_KERNEL_OP(lookup_sparse_table); std::unique_ptr g_rpc_service; std::unique_ptr g_req_handler; @@ -42,13 +42,13 @@ framework::BlockDesc* AppendPrefetchBlcok(framework::ProgramDesc* program) { framework::VariableNameMap input({{"W", {"w"}}, {"Ids", {"ids"}}}); framework::VariableNameMap output({{"Output", {"out"}}}); auto op = block->AppendOp(); - op->SetType("lookup_table"); + op->SetType("lookup_sparse_table"); op->SetInput("W", {"w"}); op->SetInput("Ids", {"ids"}); op->SetOutput("Out", {"out"}); auto& out = *root_block->Var("out"); - out.SetType(framework::proto::VarType::SELECTED_ROWS); + out.SetType(framework::proto::VarType::LOD_TENSOR); out.SetShape({10, 10}); return block; @@ -59,20 +59,19 @@ void CreateVarsOnScope(framework::Scope* scope, platform::CPUPlace* place) { w_var->GetMutable(); auto out_var = scope->Var("out"); - out_var->GetMutable(); + out_var->GetMutable(); auto ids_var = scope->Var("ids"); - ids_var->GetMutable(); + ids_var->GetMutable(); } void InitTensorsOnClient(framework::Scope* scope, platform::CPUPlace* place, int64_t rows_numel) { CreateVarsOnScope(scope, place); - auto ids_var = scope->Var("ids")->GetMutable(); - auto rows = ids_var->mutable_rows(); - for (int64_t i = 0; i < rows_numel; ++i) rows->push_back(i * 2); - ids_var->mutable_value()->Resize({rows_numel, 1}); - ids_var->mutable_value()->mutable_data(*place); + auto ids_var = scope->Var("ids")->GetMutable(); + int64_t* ids_ptr = + ids_var->mutable_data(framework::DDim({rows_numel, 1}), *place); + for (int64_t i = 0; i < rows_numel; ++i) ids_ptr[i] = i * 2; } void InitTensorsOnServer(framework::Scope* scope, platform::CPUPlace* place, @@ -148,11 +147,11 @@ TEST(PREFETCH, CPU) { client->AsyncPrefetchVar(ep, ctx, scope, in_var_name, out_var_name); client->Wait(); auto var = scope.Var(out_var_name); - auto value = var->GetMutable()->value(); - auto ptr = value.mutable_data(place); + auto value = var->GetMutable(); + auto ptr = value->mutable_data(place); for (int64_t i = 0; i < rows_numel; ++i) { - EXPECT_EQ(ptr[0 + i * value.dims()[1]], static_cast(i * 2)); + EXPECT_EQ(ptr[0 + i * value->dims()[1]], static_cast(i * 2)); } } diff --git a/paddle/fluid/operators/extract_rows_op.cc b/paddle/fluid/operators/extract_rows_op.cc new file mode 100644 index 0000000000..9a297d03cf --- /dev/null +++ b/paddle/fluid/operators/extract_rows_op.cc @@ -0,0 +1,103 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +class ExtractRowsOpInferShape : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of ExtractRowsOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of ExtractRowsOp should not be null."); + PADDLE_ENFORCE_EQ(ctx->GetInputsVarType("X")[0], + framework::proto::VarType::SELECTED_ROWS, + "The type of input(X) must be SelectedRows."); + auto in_dims = ctx->GetInputDim("X"); + + ctx->SetOutputDim( + "Out", framework::make_ddim(std::vector{in_dims[0], 1})); + } +}; + +class ExtractRowsOp : public framework::OperatorBase { + public: + ExtractRowsOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : framework::OperatorBase(type, inputs, outputs, attrs) {} + + private: + void RunImpl(const framework::Scope &scope, + const platform::Place &place) const override { + auto &in = scope.FindVar(Input("X"))->Get(); + auto out = scope.FindVar(Output("Out"))->GetMutable(); + + auto in_rows = in.rows(); + auto out_dim = framework::make_ddim( + std::vector{static_cast(in_rows.size()), 1}); + auto dst_ptr = out->mutable_data(out_dim, in.place()); + + if (paddle::platform::is_gpu_place(in.place())) { +#ifdef PADDLE_WITH_CUDA + platform::DeviceContextPool &pool = + platform::DeviceContextPool::Instance(); + auto *dev_ctx = pool.Get(in.place()); + auto src_ptr = in_rows.Data(in.place()); + auto stream = + reinterpret_cast(*dev_ctx) + .stream(); + memory::Copy(boost::get(out->place()), dst_ptr, + boost::get(in.place()), src_ptr, + in_rows.size() * sizeof(int64_t), stream); +#else + PADDLE_THROW("Not compiled with CUDA."); +#endif + } else { + memory::Copy(platform::CPUPlace(), dst_ptr, platform::CPUPlace(), + in_rows.data(), in_rows.size() * sizeof(int64_t)); + } + } +}; + +class ExtractRowsOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", + "(SelectedRows). The input of the extract_rows operator," + " whose type is SelectedRows."); + AddOutput("Out", "(Tensor). The rows of input(X)."); + + AddComment(R"DOC( + ExtractRows Operator. + +The extract_rows_op extracts the rows from the input(X), +whose type is SelectedRows.
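+ +For example, given an input(X) whose rows are [0, 4, 4, 7], the output is +a (4 x 1) Tensor holding the values [0, 4, 4, 7].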
+ + )DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(extract_rows, ops::ExtractRowsOp, ops::ExtractRowsOpMaker, + ops::ExtractRowsOpInferShape); diff --git a/paddle/fluid/operators/lookup_table_op.cc b/paddle/fluid/operators/lookup_table_op.cc index bda4994322..3e8f3ec5c5 100644 --- a/paddle/fluid/operators/lookup_table_op.cc +++ b/paddle/fluid/operators/lookup_table_op.cc @@ -33,19 +33,15 @@ class LookupTableOp : public framework::OperatorWithKernel { auto table_dims = ctx->GetInputDim("W"); auto ids_dims = ctx->GetInputDim("Ids"); - auto ids_var_type = ctx->GetInputsVarType("Ids").front(); - // The type of Ids(Input) is SelectedRows or LoDTensor, when Ids's type - // is LoDTensor, this tensor contains the ids to be looked up in W - // and it must be a column vector with rank = 2 while the 2nd dimension - // size must be 1, when Ids's type is SelectedRows, the rows of Ids - // contains the ids to be looked up in W; - if (ids_var_type == framework::proto::VarType::LOD_TENSOR) { - PADDLE_ENFORCE_EQ(ids_dims.size(), 2); - PADDLE_ENFORCE_EQ(ids_dims[1], 1); - } + PADDLE_ENFORCE_EQ(ids_dims.size(), 2); + PADDLE_ENFORCE_EQ(ids_dims[1], 1); ctx->SetOutputDim("Out", {ids_dims[0], table_dims[1]}); - ctx->ShareLoD("Ids", /*->*/ "Out"); + + if (ctx->GetOutputsVarType("Out")[0] == + framework::proto::VarType::LOD_TENSOR) { + ctx->ShareLoD("Ids", /*->*/ "Out"); + } } protected: @@ -62,17 +58,12 @@ class LookupTableOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("W", "(Tensor) The input represents embedding tensors, " "which is a learnable parameter."); - AddInput( - "Ids", - "(Tensor or SelectedRows) Ids's type can be Tensor or " - "SelectedRows, when Ids's type is Tensor, this tensor contains " - "the ids to be looked up in W and it must be a column vector with " - "rank = 2 while the 2nd dimension size must be 1; when Ids's type is " - "SelectedRows, the rows of Ids contains the ids to be looked up " - "in W."); - AddOutput("Out", - "(Tensor or SelectedRows) The lookup results, which have the " - "same type as W."); + AddInput("Ids", + "An input with type int32 or int64, " + "containing the ids to be looked up in W. " + "Ids must be a column vector with rank = 2. " + "The 2nd dimension size must be 1."); + AddOutput("Out", "The lookup results, which have the same type as W."); AddAttr("is_sparse", "(boolean, default false) " "Sparse update.") @@ -90,15 +81,10 @@ class LookupTableOpMaker : public framework::OpProtoAndCheckerMaker { Lookup Table Operator. This operator is used to perform lookups on the parameter W, -then concatenated into a dense or sparse tensor. - -The type of Ids(Input) is SelectedRows, Tensor or LoDTensor, when Ids's -type is SelectedRows, the rows of Ids contains the ids to be looked up in W; -when Ids's type is Tensor, this tensor contains the ids to be looked up in W -and it must be a column vector with rank = 2 while the 2nd dimension size must be 1, -at this time, Ids can carry the LoD (Level of Details) information, or not, and -the output only shares the LoD information with input Ids. +which are then concatenated into a dense tensor. +The input Ids can carry the LoD (Level of Details) information +or not, and the output only shares the LoD information with the input Ids.
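+ +For example, if W has shape (N, D) and Ids has shape (K, 1), then Out has +shape (K, D), where row i of Out is the Ids[i]-th row of W.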
)DOC"); } diff --git a/paddle/fluid/operators/lookup_table_op.cu b/paddle/fluid/operators/lookup_table_op.cu index 77722c50d3..27483372b9 100644 --- a/paddle/fluid/operators/lookup_table_op.cu +++ b/paddle/fluid/operators/lookup_table_op.cu @@ -23,7 +23,7 @@ namespace operators { template -__global__ void LookupTable(T* output, const T* table, const int64_t* ids, +__global__ void LookupTable(T *output, const T *table, const int64_t *ids, const int64_t N, const int64_t K, const int64_t D, const int64_t padding_idx) { int idx = threadIdx.x; @@ -33,8 +33,8 @@ __global__ void LookupTable(T* output, const T* table, const int64_t* ids, int64_t id = ids[idy]; PADDLE_ASSERT(id >= 0); PADDLE_ASSERT(id < N); - T* out = output + idy * D; - const T* tab = table + id * D; + T *out = output + idy * D; + const T *tab = table + id * D; for (int i = idx; i < D; i += BlockDimX) { if (PaddingFlag) { if (id == padding_idx) @@ -50,7 +50,7 @@ __global__ void LookupTable(T* output, const T* table, const int64_t* ids, } template -__global__ void LookupTableGrad(T* table, const T* output, const int64_t* ids, +__global__ void LookupTableGrad(T *table, const T *output, const int64_t *ids, const int64_t N, const int64_t K, const int64_t D) { int idx = threadIdx.x; @@ -60,8 +60,8 @@ __global__ void LookupTableGrad(T* table, const T* output, const int64_t* ids, int id = ids[idy]; PADDLE_ASSERT(id >= 0); PADDLE_ASSERT(id < N); - const T* out = output + idy * D; - T* tab = table + id * D; + const T *out = output + idy * D; + T *tab = table + id * D; for (int i = idx; i < D; i += BlockDimX) { paddle::platform::CudaAtomicAdd(&tab[i], out[i]); } @@ -72,36 +72,19 @@ __global__ void LookupTableGrad(T* table, const T* output, const int64_t* ids, template class LookupTableCUDAKernel : public framework::OpKernel { public: - void Compute(const framework::ExecutionContext& context) const override { - auto* table_t = context.Input("W"); + void Compute(const framework::ExecutionContext &context) const override { + auto *table_t = context.Input("W"); + auto *ids_t = context.Input("Ids"); + auto *output_t = context.Output("Out"); int64_t padding_idx = context.Attr("padding_idx"); - auto* ids_var = context.InputVar("Ids"); - Tensor* output_t = context.Output("Out"); - - int64_t* ids; - int64_t K; - - // The type of Ids(Input) is SelectedRows or LoDTensor, when Ids's type - // is LoDTensor, this tensor contains the ids to be looked up in W; - // when Ids's type is SelectedRows, the rows of Ids contains the - // ids to be looked up in W. 
- if (ids_var->IsType()) { - auto* ids_t = context.Input("Ids"); - ids = const_cast(ids_t->data()); - K = ids_t->numel(); - } else if (ids_var->IsType()) { - auto* ids_t = context.Input("Ids"); - ids = const_cast(ids_t->rows().CUDAData(context.GetPlace())); - K = ids_t->rows().size(); - output_t->Resize({K, table_t->dims()[1]}); - } else { - PADDLE_THROW("Unsupported Variable Type of Ids"); - } size_t N = table_t->dims()[0]; size_t D = table_t->dims()[1]; - auto* table = table_t->data(); - auto* output = output_t->mutable_data(context.GetPlace()); + size_t K = ids_t->numel(); + + auto *ids = ids_t->data(); + auto *table = table_t->data(); + auto *output = output_t->mutable_data(context.GetPlace()); dim3 threads(128, 8); dim3 grids(8, 1); @@ -122,19 +105,19 @@ class LookupTableCUDAKernel : public framework::OpKernel { template class LookupTableGradCUDAKernel : public framework::OpKernel { public: - void Compute(const framework::ExecutionContext& context) const override { - auto& dev_ctx = + void Compute(const framework::ExecutionContext &context) const override { + auto &dev_ctx = context.template device_context(); bool is_sparse = context.Attr("is_sparse"); // Since paddings are not trainable and fixed in forward, the gradient of // paddings makes no sense and we don't deal with it in backward. if (is_sparse) { - auto* ids = context.Input("Ids"); - auto* table = context.Input("W"); - auto* d_output = context.Input(framework::GradVarName("Out")); - auto* d_table = context.Output(framework::GradVarName("W")); + auto *ids = context.Input("Ids"); + auto *table = context.Input("W"); + auto *d_output = context.Input(framework::GradVarName("Out")); + auto *d_table = context.Output(framework::GradVarName("W")); - auto* ids_data = ids->data(); + auto *ids_data = ids->data(); auto ids_dim = ids->dims(); auto stream = dev_ctx.stream(); @@ -150,12 +133,12 @@ class LookupTableGradCUDAKernel : public framework::OpKernel { d_table->set_rows(new_rows); - auto* d_table_value = d_table->mutable_value(); + auto *d_table_value = d_table->mutable_value(); d_table_value->Resize({ids_dim[0], table->dims()[1]}); d_table_value->mutable_data(context.GetPlace()); - auto* d_table_data = d_table_value->data(); - auto* d_output_data = d_output->data(); + auto *d_table_data = d_table_value->data(); + auto *d_output_data = d_output->data(); PADDLE_ENFORCE_EQ(d_table_value->dims(), d_output->dims()); memory::Copy(gpu_place, d_table_data, gpu_place, d_output_data, d_output->numel() * sizeof(T), stream); @@ -168,9 +151,9 @@ class LookupTableGradCUDAKernel : public framework::OpKernel { int N = d_table_t->dims()[0]; int D = d_table_t->dims()[1]; int K = ids_t->numel(); - const int64_t* ids = ids_t->data(); - const T* d_output = d_output_t->data(); - T* d_table = d_table_t->mutable_data(context.GetPlace()); + const int64_t *ids = ids_t->data(); + const T *d_output = d_output_t->data(); + T *d_table = d_table_t->mutable_data(context.GetPlace()); auto t = framework::EigenVector::Flatten(*d_table_t); t.device(*dev_ctx.eigen_device()) = t.constant(static_cast(0)); diff --git a/paddle/fluid/operators/lookup_table_op.h b/paddle/fluid/operators/lookup_table_op.h index d482506bf0..c9f074ca0e 100644 --- a/paddle/fluid/operators/lookup_table_op.h +++ b/paddle/fluid/operators/lookup_table_op.h @@ -36,43 +36,13 @@ template class LookupTableKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { + auto *ids_t = context.Input("Ids"); // int tensor + auto *output_t = 
context.Output("Out"); // float tensor auto *table_var = context.InputVar("W"); - auto *ids_var = context.InputVar("Ids"); - Tensor *output_t = context.Output("Out"); - int64_t padding_idx = context.Attr("padding_idx"); - - DDim table_dim; - if (table_var->IsType()) { - table_dim = context.Input("W")->dims(); - } else if (table_var->IsType()) { - auto *table_t = context.Input("W"); - table_dim = table_t->value().dims(); - } else { - PADDLE_THROW( - "The parameter W of a LookupTable " - "must be either LoDTensor or SelectedRows"); - } - - int64_t *ids; - int64_t ids_numel; - - // The type of Ids(Input) is SelectedRows or LoDTensor, when Ids's type - // is LoDTensor, this tensor contains the ids to be looked up in W; - // when Ids's type is SelectedRows, the rows of Ids contains the - // ids to be looked up in W. - if (ids_var->IsType()) { - auto *ids_t = context.Input("Ids"); - ids = const_cast(ids_t->data()); - ids_numel = ids_t->numel(); - } else if (ids_var->IsType()) { - auto *ids_t = context.Input("Ids"); - ids = const_cast(ids_t->rows().data()); - ids_numel = ids_t->rows().size(); - output_t->Resize({ids_numel, table_dim[1]}); - } else { - PADDLE_THROW("Unsupported Variable Type of Ids"); - } + int64_t padding_idx = context.Attr("padding_idx"); + int64_t *ids = const_cast(ids_t->data()); + int64_t ids_numel = ids_t->numel(); if (table_var->IsType()) { auto *table_t = context.Input("W"); diff --git a/python/paddle/fluid/regularizer.py b/python/paddle/fluid/regularizer.py index 080c185420..3712955b3b 100644 --- a/python/paddle/fluid/regularizer.py +++ b/python/paddle/fluid/regularizer.py @@ -142,14 +142,20 @@ class L2DecayRegularizer(WeightDecayRegularizer): dtype="float32", shape=param.shape, lod_level=param.lod_level) if grad.type == core.VarDesc.VarType.SELECTED_ROWS: + idx = block.create_var( + dtype="int64", + shape=param.shape, + type=core.VarDesc.VarType.LOD_TENSOR) decay = block.create_var( dtype="float32", shape=param.shape, type=core.VarDesc.VarType.SELECTED_ROWS) + block.append_op( + type='extract_rows', inputs={'X': grad}, outputs={'Out': idx}) block.append_op( type='lookup_table', inputs={'W': param, - 'Ids': grad}, + 'Ids': idx}, outputs={'Out': decay}, attrs={'is_sparse': True}) param = decay @@ -216,14 +222,20 @@ class L1DecayRegularizer(WeightDecayRegularizer): dtype="float32", shape=param.shape, lod_level=param.lod_level) if grad.type == core.VarDesc.VarType.SELECTED_ROWS: + idx = block.create_var( + dtype="int64", + shape=param.shape, + type=core.VarDesc.VarType.LOD_TENSOR) decay = block.create_var( dtype="float32", shape=param.shape, type=core.VarDesc.VarType.SELECTED_ROWS) + block.append_op( + type='extract_rows', inputs={'X': grad}, outputs={'Out': idx}) block.append_op( type='lookup_table', inputs={'W': param, - 'Ids': grad}, + 'Ids': idx}, outputs={'Out': decay}, attrs={'is_sparse': True}) diff --git a/python/paddle/fluid/tests/unittests/test_extract_rows_op.py b/python/paddle/fluid/tests/unittests/test_extract_rows_op.py new file mode 100644 index 0000000000..6a41c44fe6 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_extract_rows_op.py @@ -0,0 +1,58 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import numpy as np +import paddle.fluid.core as core +from paddle.fluid.op import Operator +from op_test import OpTest + + +class TestExtractRows(OpTest): + def check_with_place(self, place): + scope = core.Scope() + + # create and initialize Variable + feature_len = 12 + rows = [0, 4, 4, 7] + np_array = np.ones((len(rows), feature_len)).astype("float32") + + in_x = scope.var('X').get_selected_rows() + in_x.set_height(len(rows)) + in_x.set_rows(rows) + in_x_tensor = in_x.get_tensor() + in_x_tensor.set(np_array, place) + + # create Out Variable + out_tensor = scope.var('Out').get_tensor() + + # create and run the extract_rows operator + extract_rows_op = Operator("extract_rows", X='X', Out='Out') + extract_rows_op.run(scope, place) + + # get result from Out + result_array = np.array(out_tensor) + result_array = [ele[0] for ele in result_array] + assert result_array == rows + + def test_extract_rows(self): + places = [core.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(core.CUDAPlace(0)) + for place in places: + self.check_with_place(place) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_lookup_table_op.py b/python/paddle/fluid/tests/unittests/test_lookup_table_op.py index f8d5785fbf..e16ab1d15f 100644 --- a/python/paddle/fluid/tests/unittests/test_lookup_table_op.py +++ b/python/paddle/fluid/tests/unittests/test_lookup_table_op.py @@ -49,53 +49,6 @@ class TestLookupTableOpWithPadding(TestLookupTableOp): pass -class TestLookupTableIdsIsSelectedRows(OpTest): - def check_with_place(self, place): - scope = core.Scope() - - # create and initialize Variable - height = 10 - rows = [0, 4, 4, 7] - row_numel = 12 - - # create and initialize W Variable - W = scope.var('W').get_tensor() - W_array = np.full((height, row_numel), 1.0).astype("float32") - for i in range(height): - W_array[i] *= i - W.set(W_array, place) - - # create and initialize Ids Variable - ids_selected_rows = scope.var('Ids').get_selected_rows() - ids_selected_rows.set_height(len(rows)) - ids_selected_rows.set_rows(rows) - np_array = np.ones((len(rows), row_numel)).astype("float32") - ids_tensor = ids_selected_rows.get_tensor() - ids_tensor.set(np_array, place) - - # create Out Variable - Out = scope.var('Out').get_selected_rows() - - # create and run lookup_table operator - concat_rows_op = Operator("lookup_table", W='W', Ids='Ids', Out='Out') - concat_rows_op.run(scope, place) - - # get result from Out - Out_tensor = Out.get_tensor() - result_array = np.array(Out_tensor) - - # all(): return True if all elements of the iterable are true (or if the iterable is empty) - for idx, row in enumerate(rows): - assert (row == result_array[idx]).all() - - def test_concat_rows(self): - places = [core.CPUPlace()] - if core.is_compiled_with_cuda(): - places.append(core.CUDAPlace(0)) - for place in places: - self.check_with_place(place) - - class TestLookupTableWIsSelectedRows(OpTest): def check_with_place(self, place): scope = core.Scope() From 4dbcb975f7817c9f88acac90fbfc80aeab1a09d5 Mon Sep 17 00:00:00 2001 From: Wu Yi Date: Fri, 27 Jul 2018 16:51:56
+0800 Subject: [PATCH 37/37] Enable dist se resnext (#12365) * enable dist se resnext * small batch size --- python/paddle/fluid/tests/unittests/dist_se_resnext.py | 8 ++------ .../paddle/fluid/tests/unittests/test_dist_se_resnext.py | 2 +- 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/dist_se_resnext.py b/python/paddle/fluid/tests/unittests/dist_se_resnext.py index 72bc1729b0..bf7816b246 100644 --- a/python/paddle/fluid/tests/unittests/dist_se_resnext.py +++ b/python/paddle/fluid/tests/unittests/dist_se_resnext.py @@ -278,7 +278,7 @@ class DistSeResneXt2x2: def run_trainer(self, place, endpoints, trainer_id, trainers, is_dist=True): test_program, avg_cost, train_reader, test_reader, batch_acc, predict = get_model( - batch_size=20) + batch_size=2) if is_dist: t = get_transpiler(trainer_id, fluid.default_main_program(), endpoints, @@ -294,11 +294,7 @@ class DistSeResneXt2x2: strategy.num_threads = 1 strategy.allow_op_delay = False exe = fluid.ParallelExecutor( - True, - loss_name=avg_cost.name, - exec_strategy=strategy, - num_trainers=trainers, - trainer_id=trainer_id) + True, loss_name=avg_cost.name, exec_strategy=strategy) feed_var_list = [ var for var in trainer_prog.global_block().vars.itervalues() diff --git a/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py b/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py index e3e7036f08..5ca13881bf 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py +++ b/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py @@ -56,7 +56,7 @@ class TestDistSeResneXt2x2(unittest.TestCase): except os.error: retry_times -= 1 - def non_test_with_place(self): + def test_with_place(self): # *ATTENTION* THIS TEST NEEDS AT LEAST 2GPUS TO RUN required_envs = { "PATH": os.getenv("PATH"),