From d8e00facf7884c20257d1faa1fc92620be048e7b Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Wed, 25 Jul 2018 23:46:48 +0800 Subject: [PATCH 01/40] reuse im_size --- paddle/fluid/operators/math/im2col.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/operators/math/im2col.cc b/paddle/fluid/operators/math/im2col.cc index bb55ce21b0..101e046acb 100644 --- a/paddle/fluid/operators/math/im2col.cc +++ b/paddle/fluid/operators/math/im2col.cc @@ -54,12 +54,13 @@ class Im2ColFunctor Date: Wed, 25 Jul 2018 23:59:24 +0800 Subject: [PATCH 02/40] enable padding!=0 and fill height padding with 0 --- paddle/fluid/operators/math/im2col.cc | 64 ++++++++++++++++++++------- 1 file changed, 49 insertions(+), 15 deletions(-) diff --git a/paddle/fluid/operators/math/im2col.cc b/paddle/fluid/operators/math/im2col.cc index 101e046acb..311401b3d7 100644 --- a/paddle/fluid/operators/math/im2col.cc +++ b/paddle/fluid/operators/math/im2col.cc @@ -48,29 +48,63 @@ class Im2ColFunctor(); T* col_data = col->data(); // TODO(TJ): change me to template - // further optimaze: - // 1. padding != 1 - // 2. 
could also support stride_h != 1 + // further optimize: padding == 1 need special if (stride[0] == 1 && stride[1] == 1 && dilation[0] == 1 && - dilation[1] == 1 && padding[0] == 0 && padding[1] == 0) { + dilation[1] == 1) { int col_matrix_width = output_width * output_height; int im_size = im_height * im_width; - size_t copy_size = sizeof(T) * output_width; - for (int oh = 0; oh < output_height; ++oh) { - const T* im_data_start = im_data + oh * im_width; - T* dst_data = col_data + oh * output_width; - for (int ic = 0; ic < im_channels; ++ic) { - const T* src_data = im_data_start + ic * im_size; - for (int kh = 0; kh < filter_height; ++kh) { + if (padding[0] == 0 && padding[1] == 0) { + size_t copy_size = sizeof(T) * output_width; + for (int oh = 0; oh < output_height; ++oh) { + const T* im_data_start = im_data + oh * im_width; + T* dst_data = col_data + oh * output_width; + for (int ic = 0; ic < im_channels; ++ic) { + const T* src_data = im_data_start + ic * im_size; + for (int kh = 0; kh < filter_height; ++kh) { + for (int kw = 0; kw < filter_width; ++kw) { + std::memcpy(dst_data, src_data + kw, copy_size); + dst_data = dst_data + col_matrix_width; + } + src_data = src_data + im_width; + } + } + } + return; + } else { + int plh = padding[0]; + // int plw = padding[1]; + int prh = + (output_height - 1) * stride[0] + filter_height - im_height - plh; + // int prw = (output_width - 1) * stride[1] + filter_width - im_width - + // plw; + + // fill height padding : 0 ~ plh-1, (oh-prh) ~ (oh-1) + // TODO(TJ): reuse sizes + assert(plh == prh); // because stride_h == 1 + for (int ph = 0; ph < plh; ++ph) { + size_t sz = sizeof(T) * output_width * (plh - ph); + T* col_start_l = col_data + ph * filter_width * col_matrix_width; + T* col_start_r = + col_data + + (filter_width - ph - 1) * filter_width * col_matrix_width + + col_matrix_width - output_width * (plh - ph); + for (int ic = 0; ic < im_channels; ++ic) { + T* dst_data_l = + col_start_l + + ic * filter_width * 
filter_height * col_matrix_width; + T* dst_data_r = + col_start_r + + ic * filter_width * filter_height * col_matrix_width; for (int kw = 0; kw < filter_width; ++kw) { - std::memcpy(dst_data, src_data + kw, copy_size); - dst_data = dst_data + col_matrix_width; + std::memset(dst_data_l, 0, sz); + std::memset(dst_data_r, 0, sz); + dst_data_l = dst_data_l + col_matrix_width; + dst_data_r = dst_data_r + col_matrix_width; } - src_data = src_data + im_width; } } + return; } - return; } for (int c = 0; c < channels_col; ++c) { From 92518c519fd56065ef142b5e21cfe350648d1c47 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Thu, 26 Jul 2018 00:25:19 +0800 Subject: [PATCH 03/40] reuse sizes saving time --- paddle/fluid/operators/math/im2col.cc | 27 ++++++++++++--------------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/paddle/fluid/operators/math/im2col.cc b/paddle/fluid/operators/math/im2col.cc index 311401b3d7..90783ba1c6 100644 --- a/paddle/fluid/operators/math/im2col.cc +++ b/paddle/fluid/operators/math/im2col.cc @@ -79,25 +79,22 @@ class Im2ColFunctor Date: Wed, 25 Jul 2018 09:44:44 +0800 Subject: [PATCH 04/40] graph viz pass --- paddle/fluid/framework/CMakeLists.txt | 2 +- paddle/fluid/framework/ir/CMakeLists.txt | 1 + paddle/fluid/framework/ir/graph_viz_pass.cc | 66 +++++++++++++++++++ paddle/fluid/framework/ir/graph_viz_pass.h | 44 +++++++++++++ paddle/fluid/framework/parallel_executor.cc | 11 ++++ .../unittests/parallel_executor_test_base.py | 1 + .../unittests/test_parallel_executor_mnist.py | 42 ++++++------ 7 files changed, 147 insertions(+), 20 deletions(-) create mode 100644 paddle/fluid/framework/ir/graph_viz_pass.cc create mode 100644 paddle/fluid/framework/ir/graph_viz_pass.h diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 93ec047c80..f3c1e7c528 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -99,7 +99,7 @@ else() endif() 
-cc_library(parallel_executor SRCS parallel_executor.cc DEPS ssa_graph_builder_factory threaded_ssa_graph_executor scope_buffered_ssa_graph_executor graph) +cc_library(parallel_executor SRCS parallel_executor.cc DEPS ssa_graph_builder_factory threaded_ssa_graph_executor scope_buffered_ssa_graph_executor graph graph_viz_pass) cc_library(prune SRCS prune.cc DEPS framework_proto) cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context) diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index 6447452ae5..a6bdd12b63 100644 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -2,5 +2,6 @@ cc_library(node SRCS node.cc DEPS proto_desc) cc_library(graph SRCS graph.cc DEPS node) cc_library(graph_helper SRCS graph_helper.cc DEPS graph) cc_library(pass SRCS pass.cc DEPS graph node) +cc_library(graph_viz_pass SRCS graph_viz_pass.cc DEPS graph pass graph_helper) cc_test(graph_test SRCS graph_test.cc DEPS graph op_registry) cc_test(graph_helper_test SRCS graph_helper_test.cc DEPS graph_helper op_registry) diff --git a/paddle/fluid/framework/ir/graph_viz_pass.cc b/paddle/fluid/framework/ir/graph_viz_pass.cc new file mode 100644 index 0000000000..c839ebadac --- /dev/null +++ b/paddle/fluid/framework/ir/graph_viz_pass.cc @@ -0,0 +1,66 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include + +#include "paddle/fluid/framework/ir/graph_viz_pass.h" + +namespace paddle { +namespace framework { +namespace ir { + +std::unique_ptr GraphVizPass::Apply( + std::unique_ptr graph) const { + std::unique_ptr fout(new std::ofstream(graph_viz_path_)); + PADDLE_ENFORCE(fout->good()); + std::ostream& sout = *fout; + + size_t var_id = 0; + std::unordered_map vars; + + sout << "digraph G {\n"; + + for (const ir::Node* n : graph->Nodes()) { + if (n->NodeType() != ir::Node::Type::kVariable) continue; + size_t cur_var_id = var_id++; + vars[n] = cur_var_id; + + sout << "var_" << cur_var_id << " [label=\"" << n->Name() << "\"]" + << std::endl; + } + + size_t op_id = 0; + for (const ir::Node* n : graph->Nodes()) { + if (n->NodeType() != ir::Node::Type::kOperation) continue; + std::string op_name = "op_" + std::to_string(op_id++); + sout << op_name << " [label=\"" << n->Name() << "\", shape=rect]" + << std::endl; + for (auto in : n->inputs) { + std::string var_name = "var_" + std::to_string(vars[in]); + sout << var_name << " -> " << op_name << std::endl; + } + + for (auto out : n->outputs) { + std::string var_name = "var_" + std::to_string(vars[out]); + sout << op_name << " -> " << var_name << std::endl; + } + } + + sout << "}\n"; + return graph; +} +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/graph_viz_pass.h b/paddle/fluid/framework/ir/graph_viz_pass.h new file mode 100644 index 0000000000..08c534f417 --- /dev/null +++ b/paddle/fluid/framework/ir/graph_viz_pass.h @@ -0,0 +1,44 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include +#include + +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/pass.h" + +namespace paddle { +namespace framework { +namespace ir { + +class GraphVizPass : public Pass { + public: + explicit GraphVizPass(const std::string& graph_viz_path) + : graph_viz_path_(graph_viz_path) {} + + std::unique_ptr Apply( + std::unique_ptr graph) const override; + + private: + const std::string graph_viz_path_; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 02c836bea1..fbd5acc3e5 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -19,6 +19,7 @@ limitations under the License. 
*/ #include #include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/graph_viz_pass.h" #ifdef PADDLE_WITH_CUDA #include "paddle/fluid/platform/nccl_helper.h" @@ -133,7 +134,17 @@ ParallelExecutor::ParallelExecutor( } builder_ = builder_factory.Create(); std::unique_ptr graph(new ir::Graph(main_program)); + if (!build_strategy.debug_graphviz_path_.empty()) { + const std::string origin_graph_path = string::Sprintf( + "%s%s", build_strategy.debug_graphviz_path_.c_str(), "_original_graph"); + graph = ir::GraphVizPass(origin_graph_path).Apply(std::move(graph)); + } graph = builder_->Apply(std::move(graph)); + if (!build_strategy.debug_graphviz_path_.empty()) { + const std::string origin_graph_path = string::Sprintf( + "%s%s", build_strategy.debug_graphviz_path_.c_str(), "_before_exec"); + graph = ir::GraphVizPass(origin_graph_path).Apply(std::move(graph)); + } member_->executor_.reset(new details::ThreadedSSAGraphExecutor( exec_strategy, member_->local_scopes_, places, std::move(graph))); member_->executor_.reset(new details::ScopeBufferedSSAGraphExecutor( diff --git a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py index fcf86cc583..c5e69e41be 100644 --- a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py +++ b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py @@ -71,6 +71,7 @@ class TestParallelExecutorBase(unittest.TestCase): exec_strategy.allow_op_delay = allow_op_delay build_strategy = fluid.BuildStrategy() + build_strategy.debug_graphviz_path = "/tmp/graphviz" build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce \ if use_reduce else fluid.BuildStrategy.ReduceStrategy.AllReduce diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py index 76389d916f..d740eb5443 100644 --- 
a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py @@ -152,16 +152,6 @@ class TestMNIST(TestParallelExecutorBase): use_cuda=use_cuda, use_reduce=use_reduce) - def test_simple_fc(self): - # use_cuda - self.check_simple_fc_convergence(True) - self.check_simple_fc_convergence(False) - - def test_simple_fc_with_new_strategy(self): - # use_cuda, use_reduce - self._compare_reduce_and_allreduce(simple_fc_net, True) - self._compare_reduce_and_allreduce(simple_fc_net, False) - def check_simple_fc_parallel_accuracy(self, use_cuda): if use_cuda and not core.is_compiled_with_cuda(): return @@ -188,10 +178,6 @@ class TestMNIST(TestParallelExecutorBase): for p_l in parallel_last_loss: self.assertAlmostEquals(p_l, single_last_loss[0], delta=1e-6) - def test_simple_fc_parallel_accuracy(self): - self.check_simple_fc_parallel_accuracy(True) - self.check_simple_fc_parallel_accuracy(False) - def check_batchnorm_fc_convergence(self, use_cuda): if use_cuda and not core.is_compiled_with_cuda(): return @@ -206,13 +192,31 @@ class TestMNIST(TestParallelExecutorBase): "label": label}, use_cuda=use_cuda) - def test_batchnorm_fc(self): - self.check_batchnorm_fc_convergence(True) - self.check_batchnorm_fc_convergence(False) + def check_batchnorm_fc_convergence_use_reduce(self, use_cuda): + if use_cuda and not core.is_compiled_with_cuda(): + return + self.check_network_convergence( + fc_with_batchnorm, use_cuda=use_cuda, use_reduce=False) + """ + img, label = self._init_data() + + all_reduce_first_loss, all_reduce_last_loss = self.check_network_convergence( + fc_with_batchnorm, + feed_dict={"image": img, + "label": label}, + use_cuda=use_cuda, + use_reduce=False) + reduce_first_loss, reduce_last_loss = self.check_network_convergence( + fc_with_batchnorm, + feed_dict={"image": img, + "label": label}, + use_cuda=use_cuda, + use_reduce=True) + """ def test_batchnorm_fc_with_new_strategy(self): - 
self._compare_reduce_and_allreduce(fc_with_batchnorm, True) - self._compare_reduce_and_allreduce(fc_with_batchnorm, False) + self.check_batchnorm_fc_convergence_use_reduce(True) + # self.check_batchnorm_fc_convergence_use_reduce(False) if __name__ == '__main__': From 142e832d21715c0ce651e4ac04f10554945e5ad7 Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Wed, 25 Jul 2018 19:59:24 +0800 Subject: [PATCH 05/40] pass registration --- .../details/multi_devices_graph_builder.cc | 31 ++--- .../details/multi_devices_graph_builder.h | 27 ++-- .../details/ssa_graph_builder_factory.cc | 33 ++--- .../framework/details/ssa_graph_checker.h | 12 +- .../framework/details/ssa_graph_printer.h | 34 ++--- paddle/fluid/framework/ir/graph_viz_pass.cc | 6 +- paddle/fluid/framework/ir/graph_viz_pass.h | 6 - paddle/fluid/framework/ir/pass.cc | 9 +- paddle/fluid/framework/ir/pass.h | 117 +++++++++++++++++- paddle/fluid/framework/parallel_executor.cc | 20 ++- 10 files changed, 191 insertions(+), 104 deletions(-) diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.cc b/paddle/fluid/framework/details/multi_devices_graph_builder.cc index 22f0cb20d0..4fad520f40 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_builder.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_builder.cc @@ -34,30 +34,16 @@ namespace paddle { namespace framework { namespace details { +void MultiDevSSAGraphBuilder::Init() const { + loss_var_name_ = Get("loss_var_name"); + places_ = Get>("places"); + local_scopes_ = Get>("local_scopes"); + strategy_ = Get("strategy"); #ifdef PADDLE_WITH_CUDA -MultiDevSSAGraphBuilder::MultiDevSSAGraphBuilder( - const std::vector &places, - const std::string &loss_var_name, - const std::unordered_set ¶ms, - const std::vector &local_scopes, - platform::NCCLContextMap *nccl_ctxs, const BuildStrategy &strategy) - : loss_var_name_(loss_var_name), - places_(places), - local_scopes_(local_scopes), - nccl_ctxs_(nccl_ctxs), - strategy_(strategy) { -#else 
-MultiDevSSAGraphBuilder::MultiDevSSAGraphBuilder( - const std::vector &places, - const std::string &loss_var_name, - const std::unordered_set ¶ms, - const std::vector &local_scopes, const BuildStrategy &strategy) - : loss_var_name_(loss_var_name), - places_(places), - local_scopes_(local_scopes), - strategy_(strategy) { + nccl_ctxs_ = &Get("nccl_ctxs"); #endif - for (auto &p : params) { + + for (auto &p : Get>("params")) { grad_names_.insert(GradVarName(p)); } balance_vars_.resize(places_.size(), 0); @@ -241,6 +227,7 @@ std::vector SortOpsAndDelayOptimizeOp(const ir::Graph &graph) { std::unique_ptr MultiDevSSAGraphBuilder::Apply( std::unique_ptr graph) const { + Init(); // Give the topology sort order and rebuild the graph structure. std::vector sorted_ops = SortOpsAndDelayOptimizeOp(*graph); auto nodes = graph->ReleaseNodes(); diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.h b/paddle/fluid/framework/details/multi_devices_graph_builder.h index 55076f227b..c8c1b2a438 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_builder.h +++ b/paddle/fluid/framework/details/multi_devices_graph_builder.h @@ -32,20 +32,6 @@ namespace details { class MultiDevSSAGraphBuilder : public SSAGraphBuilder { public: -#ifdef PADDLE_WITH_CUDA - MultiDevSSAGraphBuilder(const std::vector &places, - const std::string &loss_var_name, - const std::unordered_set ¶ms, - const std::vector &local_scopes, - platform::NCCLContextMap *nccl_ctxs, - const BuildStrategy &strategy); -#else - MultiDevSSAGraphBuilder(const std::vector &places, - const std::string &loss_var_name, - const std::unordered_set ¶ms, - const std::vector &local_scopes, - const BuildStrategy &strategy); -#endif std::unique_ptr Apply( std::unique_ptr graph) const override; int GetVarDeviceID(const std::string &varname) const override; @@ -53,15 +39,16 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder { private: void CreateOpHandleIOs(ir::Graph *result, ir::Node *node, size_t device_id) 
const; + void Init() const; private: - std::string loss_var_name_; - const std::vector &places_; - const std::vector &local_scopes_; - std::unordered_set grad_names_; + mutable std::string loss_var_name_; + mutable std::vector places_; + mutable std::vector local_scopes_; + mutable std::unordered_set grad_names_; #ifdef PADDLE_WITH_CUDA - platform::NCCLContextMap *nccl_ctxs_; + mutable platform::NCCLContextMap *nccl_ctxs_; #endif bool IsScaleLossOp(ir::Node *node) const; @@ -113,7 +100,7 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder { const std::vector &var_names) const; private: - BuildStrategy strategy_; + mutable BuildStrategy strategy_; mutable std::unordered_map all_vars_; mutable std::unordered_map var_name_on_devices_; mutable std::vector balance_vars_; diff --git a/paddle/fluid/framework/details/ssa_graph_builder_factory.cc b/paddle/fluid/framework/details/ssa_graph_builder_factory.cc index b4b49d3de6..e8d83943ac 100644 --- a/paddle/fluid/framework/details/ssa_graph_builder_factory.cc +++ b/paddle/fluid/framework/details/ssa_graph_builder_factory.cc @@ -22,26 +22,29 @@ namespace paddle { namespace framework { namespace details { std::unique_ptr SSAGraphBuilderFactory::Create() { - std::unique_ptr res( + std::unique_ptr res(new MultiDevSSAGraphBuilder); + res->SetNotOwned>("places", &places_); + res->SetNotOwned("loss_var_name", &loss_var_name_); + res->SetNotOwned>("params", ¶m_names_); + res->SetNotOwned>("local_scopes", &local_scopes_); + res->SetNotOwned("strategy", &strategy_); #ifdef PADDLE_WITH_CUDA - new MultiDevSSAGraphBuilder(places_, loss_var_name_, param_names_, - local_scopes_, nccl_ctxs_, strategy_) -#else - new MultiDevSSAGraphBuilder(places_, loss_var_name_, param_names_, - local_scopes_, strategy_) + res->SetNotOwned("nccl_ctxs", nccl_ctxs_); #endif - ); // NOLINT if (!strategy_.debug_graphviz_path_.empty()) { - std::unique_ptr fout( - new std::ofstream(strategy_.debug_graphviz_path_)); - PADDLE_ENFORCE(fout->good()); - 
std::unique_ptr graphviz_printer( - new GraphvizSSAGraphPrinter()); - res.reset(new SSAGraghBuilderWithPrinter( - std::move(fout), std::move(graphviz_printer), std::move(res))); + SSAGraphBuilder *previous_pass = res.release(); + res.reset(new SSAGraghBuilderWithPrinter); + res->Set("previous_pass", previous_pass); + res->SetNotOwned("debug_graphviz_path", + &strategy_.debug_graphviz_path_); + res->Set("graph_printer", + new GraphvizSSAGraphPrinter); } - res.reset(new SSAGraghBuilderWithChecker(std::move(res))); + + SSAGraphBuilder *previous_pass = res.release(); + res.reset(new SSAGraghBuilderWithChecker); + res->Set("previous_pass", previous_pass); return res; } diff --git a/paddle/fluid/framework/details/ssa_graph_checker.h b/paddle/fluid/framework/details/ssa_graph_checker.h index 51ce6e5eca..ae5ad16b0c 100644 --- a/paddle/fluid/framework/details/ssa_graph_checker.h +++ b/paddle/fluid/framework/details/ssa_graph_checker.h @@ -24,25 +24,19 @@ namespace details { class SSAGraghBuilderWithChecker : public SSAGraphBuilder { public: - explicit SSAGraghBuilderWithChecker( - std::unique_ptr&& builder) - : builder_(std::move(builder)) {} - std::unique_ptr Apply( std::unique_ptr graph) const override { - auto new_graph = builder_->Apply(std::move(graph)); + auto new_graph = + Get("previous_pass").Apply(std::move(graph)); PADDLE_ENFORCE(IsValidGraph(new_graph.get())); return new_graph; } int GetVarDeviceID(const std::string& var_name) const override { - return builder_->GetVarDeviceID(var_name); + return Get("previous_pass").GetVarDeviceID(var_name); } bool IsValidGraph(const ir::Graph* graph) const; - - private: - std::unique_ptr builder_; }; } // namespace details diff --git a/paddle/fluid/framework/details/ssa_graph_printer.h b/paddle/fluid/framework/details/ssa_graph_printer.h index a77c1bad3f..2a939ef4c9 100644 --- a/paddle/fluid/framework/details/ssa_graph_printer.h +++ b/paddle/fluid/framework/details/ssa_graph_printer.h @@ -14,7 +14,9 @@ #pragma once +#include 
#include +#include #include #include "paddle/fluid/framework/details/ssa_graph_builder.h" @@ -35,37 +37,21 @@ class GraphvizSSAGraphPrinter : public SSAGraphPrinter { class SSAGraghBuilderWithPrinter : public SSAGraphBuilder { public: - SSAGraghBuilderWithPrinter(std::ostream& sout, - std::unique_ptr&& printer, - std::unique_ptr&& builder) - : printer_(std::move(printer)), - builder_(std::move(builder)), - stream_ref_(sout) {} - - SSAGraghBuilderWithPrinter(std::unique_ptr&& sout, - std::unique_ptr&& printer, - std::unique_ptr&& builder) - : printer_(std::move(printer)), - builder_(std::move(builder)), - stream_ptr_(std::move(sout)), - stream_ref_(*stream_ptr_) {} - std::unique_ptr Apply( std::unique_ptr graph) const override { - auto new_graph = builder_->Apply(std::move(graph)); - printer_->Print(*new_graph, stream_ref_); + auto new_graph = + Get("previous_pass").Apply(std::move(graph)); + + std::unique_ptr fout( + new std::ofstream(Get("debug_graphviz_path"))); + PADDLE_ENFORCE(fout->good()); + Get("graph_printer").Print(*new_graph, *fout); return new_graph; } int GetVarDeviceID(const std::string& var_name) const override { - return builder_->GetVarDeviceID(var_name); + return Get("previous_pass").GetVarDeviceID(var_name); } - - private: - std::unique_ptr printer_; - std::unique_ptr builder_; - std::unique_ptr stream_ptr_; - std::ostream& stream_ref_; }; } // namespace details diff --git a/paddle/fluid/framework/ir/graph_viz_pass.cc b/paddle/fluid/framework/ir/graph_viz_pass.cc index c839ebadac..7d1cff7178 100644 --- a/paddle/fluid/framework/ir/graph_viz_pass.cc +++ b/paddle/fluid/framework/ir/graph_viz_pass.cc @@ -23,7 +23,8 @@ namespace ir { std::unique_ptr GraphVizPass::Apply( std::unique_ptr graph) const { - std::unique_ptr fout(new std::ofstream(graph_viz_path_)); + const std::string graph_viz_path = Get("graph_viz_path"); + std::unique_ptr fout(new std::ofstream(graph_viz_path)); PADDLE_ENFORCE(fout->good()); std::ostream& sout = *fout; @@ -61,6 +62,9 @@ 
std::unique_ptr GraphVizPass::Apply( sout << "}\n"; return graph; } + } // namespace ir } // namespace framework } // namespace paddle + +REGISTER_PASS(graph_viz_pass, paddle::framework::ir::GraphVizPass); diff --git a/paddle/fluid/framework/ir/graph_viz_pass.h b/paddle/fluid/framework/ir/graph_viz_pass.h index 08c534f417..04c0c35d12 100644 --- a/paddle/fluid/framework/ir/graph_viz_pass.h +++ b/paddle/fluid/framework/ir/graph_viz_pass.h @@ -29,14 +29,8 @@ namespace ir { class GraphVizPass : public Pass { public: - explicit GraphVizPass(const std::string& graph_viz_path) - : graph_viz_path_(graph_viz_path) {} - std::unique_ptr Apply( std::unique_ptr graph) const override; - - private: - const std::string graph_viz_path_; }; } // namespace ir diff --git a/paddle/fluid/framework/ir/pass.cc b/paddle/fluid/framework/ir/pass.cc index c05d7d0bb5..0e68ecb56f 100644 --- a/paddle/fluid/framework/ir/pass.cc +++ b/paddle/fluid/framework/ir/pass.cc @@ -15,5 +15,12 @@ limitations under the License. */ #include "paddle/fluid/framework/ir/pass.h" namespace paddle { -namespace framework {} // namespace framework +namespace framework { +namespace ir { +PassRegistry& PassRegistry::Instance() { + static PassRegistry g_pass_info_map; + return g_pass_info_map; +} +} // namespace ir +} // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/pass.h b/paddle/fluid/framework/ir/pass.h index f52ba788d5..9466924262 100644 --- a/paddle/fluid/framework/ir/pass.h +++ b/paddle/fluid/framework/ir/pass.h @@ -14,9 +14,14 @@ limitations under the License. 
*/ #pragma once +#include +#include +#include + #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/node.h" #include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/platform/variant.h" namespace paddle { namespace framework { @@ -25,10 +30,120 @@ namespace ir { class Pass { public: Pass() = default; - virtual ~Pass() {} + virtual ~Pass() { + for (auto &attr : attrs_) { + if (attr_dels_.find(attr.first) != attr_dels_.end()) { + attr_dels_[attr.first](); + } + } + attrs_.clear(); + attr_dels_.clear(); + } virtual std::unique_ptr Apply(std::unique_ptr graph) const = 0; + + template + AttrType &Get(const std::string &attr_name) const { + return *boost::any_cast(attrs_.at(attr_name)); + } + + template + void Set(const std::string &attr_name, AttrType *attr) { + PADDLE_ENFORCE(attrs_.count(attr_name) == 0); + attrs_[attr_name] = attr; + attr_dels_[attr_name] = [attr, attr_name]() { + VLOG(3) << "deleting " << attr_name; + delete attr; + }; + } + + template + void SetNotOwned(const std::string &attr_name, AttrType *attr) { + PADDLE_ENFORCE(attrs_.count(attr_name) == 0); + attrs_[attr_name] = attr; + } + + private: + std::map attrs_; + std::map> attr_dels_; +}; + +using PassCreator = std::function()>; + +class Registrar { + public: + // In our design, various kinds of passes, + // have their corresponding registry and registrar. The action of + // registration is in the constructor of a global registrar variable, which + // are not used in the code that calls package framework, and would + // be removed from the generated binary file by the linker. To avoid such + // removal, we add Touch to all registrar classes and make USE_PASS macros to + // call this method. So, as long as the callee code calls USE_PASS, the global + // registrar variable won't be removed by the linker. 
+ void Touch() {} }; + +class PassRegistry { + public: + static PassRegistry &Instance(); + + bool Has(const std::string &pass_type) const { + return map_.find(pass_type) != map_.end(); + } + + void Insert(const std::string &type, const PassCreator &pass_creator) { + PADDLE_ENFORCE(!Has(type), "Pass %s has been registered", type); + map_.insert({type, pass_creator}); + } + + std::unique_ptr Get(const std::string &type) const { + PADDLE_ENFORCE(Has(type), "Pass %s has not been registered", type); + return map_.at(type)(); + } + + private: + PassRegistry() = default; + std::unordered_map map_; + + DISABLE_COPY_AND_ASSIGN(PassRegistry); +}; + +template +struct PassRegistrar : public Registrar { + explicit PassRegistrar(const char *pass_type) { + PADDLE_ENFORCE(!PassRegistry::Instance().Has(pass_type), + "'%s' is registered more than once.", pass_type); + PassRegistry::Instance().Insert(pass_type, []() -> std::unique_ptr { + return std::unique_ptr(new PassType()); + }); + } +}; + +#define STATIC_ASSERT_PASS_GLOBAL_NAMESPACE(uniq_name, msg) \ + struct __test_global_namespace_##uniq_name##__ {}; \ + static_assert(std::is_same<::__test_global_namespace_##uniq_name##__, \ + __test_global_namespace_##uniq_name##__>::value, \ + msg) + +#define REGISTER_PASS(pass_type, pass_class) \ + STATIC_ASSERT_PASS_GLOBAL_NAMESPACE( \ + __reg_pass__##pass_type, \ + "REGISTER_PASS must be called in global namespace"); \ + static ::paddle::framework::ir::PassRegistrar \ + __pass_registrar_##pass_type##__(#pass_type); \ + int TouchPassRegistrar_##pass_type() { \ + __pass_registrar_##pass_type##__.Touch(); \ + return 0; \ + } + +#define USE_PASS(pass_type) \ + STATIC_ASSERT_PASS_GLOBAL_NAMESPACE( \ + __use_pass_itself_##pass_type, \ + "USE_PASS must be called in global namespace"); \ + extern int TouchPassRegistrar_##pass_type(); \ + static int use_pass_itself_##pass_type##_ __attribute__((unused)) = \ + TouchPassRegistrar_##pass_type() + } // namespace ir } // namespace framework } // 
namespace paddle diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index fbd5acc3e5..ff661d0013 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -132,19 +132,27 @@ ParallelExecutor::ParallelExecutor( PADDLE_THROW("Not compiled with CUDA."); #endif } - builder_ = builder_factory.Create(); + std::unique_ptr graph(new ir::Graph(main_program)); if (!build_strategy.debug_graphviz_path_.empty()) { - const std::string origin_graph_path = string::Sprintf( + auto viz_pass = ir::PassRegistry::Instance().Get("graph_viz_pass"); + const std::string graph_path = string::Sprintf( "%s%s", build_strategy.debug_graphviz_path_.c_str(), "_original_graph"); - graph = ir::GraphVizPass(origin_graph_path).Apply(std::move(graph)); + viz_pass->Set("graph_viz_path", new std::string(graph_path)); + graph = viz_pass->Apply(std::move(graph)); } + + builder_ = builder_factory.Create(); graph = builder_->Apply(std::move(graph)); + if (!build_strategy.debug_graphviz_path_.empty()) { - const std::string origin_graph_path = string::Sprintf( + auto viz_pass = ir::PassRegistry::Instance().Get("graph_viz_pass"); + const std::string graph_path = string::Sprintf( "%s%s", build_strategy.debug_graphviz_path_.c_str(), "_before_exec"); - graph = ir::GraphVizPass(origin_graph_path).Apply(std::move(graph)); + viz_pass->Set("graph_viz_path", new std::string(graph_path)); + graph = viz_pass->Apply(std::move(graph)); } + member_->executor_.reset(new details::ThreadedSSAGraphExecutor( exec_strategy, member_->local_scopes_, places, std::move(graph))); member_->executor_.reset(new details::ScopeBufferedSSAGraphExecutor( @@ -297,3 +305,5 @@ ParallelExecutor::~ParallelExecutor() { } // namespace framework } // namespace paddle + +USE_PASS(graph_viz_pass); From e3131e2d73252280e38b3096ca681951a122dddd Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Thu, 26 Jul 2018 10:27:00 +0800 Subject: [PATCH 06/40] 
enable width padding --- paddle/fluid/operators/math/im2col.cc | 64 +++++++++++++++++++++++++-- 1 file changed, 61 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/operators/math/im2col.cc b/paddle/fluid/operators/math/im2col.cc index 90783ba1c6..c29a137319 100644 --- a/paddle/fluid/operators/math/im2col.cc +++ b/paddle/fluid/operators/math/im2col.cc @@ -72,11 +72,11 @@ class Im2ColFunctor(0); // padding zero + for (int ic = 0; ic < im_channels; ++ic) { + // TODO(TJ): use add and resue stride + T* dst_data_ic = col_data + ic * col_block_ic; + for (int kh = 0; kh < filter_height; ++kh) { + T* dst_data_kh = dst_data_ic + kh * col_block_fh; + for (T* dst_data : + {dst_data_kh, dst_data_kh + + (filter_width - prw) * col_matrix_width + + output_width - 1}) { + // TODO(TJ): from plh, saving repeated assignment + for (int oh = 0; oh < output_height; ++oh) { + *dst_data = pad; + dst_data = dst_data + output_width; + } + } + } + } + } else { + // padding_size > 1 + for (int ic = 0; ic < im_channels; ++ic) { + // TODO(TJ): use add and resue stride + T* dst_data_ic = + col_data + ic * filter_width * filter_height * col_matrix_width; + for (int kh = 0; kh < filter_height; ++kh) { + T* dst_data_kh = + dst_data_ic + kh * filter_width * col_matrix_width; + for (int kw = 0; kw < plw; ++kw) { + // TODO(TJ): reuse array outside this for + size_t sz = sizeof(T) * (plw - kw); + T* dst_data = dst_data_kh + kw * col_matrix_width; + // TODO(TJ): from plh, saving repeated assignment + for (int oh = 0; oh < output_height; ++oh) { + std::memset(dst_data, 0, sz); + dst_data = dst_data + output_width; + } + } + // TODO(TJ): use reverse to save cache + for (int kw = 0; kw < prw; ++kw) { + // TODO(TJ): reuse array outside this for + auto num = (prw - kw); + size_t sz = sizeof(T) * num; + T* dst_data = dst_data_kh + + (filter_width - 1 - kw) * col_matrix_width + + output_width - num; + // TODO(TJ): from plh, saving repeated assignment + for (int oh = 0; oh < output_height; ++oh) { + 
std::memset(dst_data, 0, sz); + dst_data = dst_data + output_width; + } + } + } + } + } return; } } From e4d7d7ae8fd7198447df21188d3fd85868c8bafa Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Thu, 26 Jul 2018 11:06:12 +0800 Subject: [PATCH 07/40] pass refactoring --- .../details/multi_devices_graph_builder.cc | 46 ++++++--- .../details/multi_devices_graph_builder.h | 6 +- .../scope_buffered_ssa_graph_executor.h | 3 + .../framework/details/ssa_graph_builder.h | 4 +- .../details/ssa_graph_builder_factory.cc | 12 +-- .../details/ssa_graph_builder_factory.h | 14 +-- .../framework/details/ssa_graph_checker.cc | 3 + .../framework/details/ssa_graph_checker.h | 7 +- .../framework/details/ssa_graph_executor.h | 4 +- .../framework/details/ssa_graph_printer.cc | 3 + .../framework/details/ssa_graph_printer.h | 7 +- .../details/threaded_ssa_graph_executor.h | 1 + paddle/fluid/framework/ir/graph.h | 2 + paddle/fluid/framework/ir/pass.h | 2 + paddle/fluid/framework/parallel_executor.cc | 98 ++++++++++++------- paddle/fluid/framework/parallel_executor.h | 1 - .../operators/distributed/send_recv.proto | 97 ++++++++++++++++++ 17 files changed, 229 insertions(+), 81 deletions(-) create mode 100644 paddle/fluid/operators/distributed/send_recv.proto diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.cc b/paddle/fluid/framework/details/multi_devices_graph_builder.cc index 4fad520f40..d211f02689 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_builder.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_builder.cc @@ -244,6 +244,7 @@ std::unique_ptr MultiDevSSAGraphBuilder::Apply( result.Set("vars", new GraphVars(places_.size())); result.Set("dep_vars", new GraphDepVars); result.Set("ops", new GraphOps); + result.Set("sharded_var_device", new ShardedVarDevice); // find send/recv vars so that we can place the distributed training // realted op in the place 0 @@ -276,11 +277,12 @@ std::unique_ptr MultiDevSSAGraphBuilder::Apply( // the block. 
is_forwarding = false; } else { - int op_dev_id = GetOpDeviceID(node); + int op_dev_id = GetOpDeviceID(result, node); if (op_dev_id != -1) { // This op only runs on one specific device. CreateComputationalOp(&result, node, op_dev_id); for (ir::Node *n : node->outputs) { - var_name_on_devices_.emplace(n->Name(), op_dev_id); + graph->Get("sharded_var_device") + .emplace(n->Name(), op_dev_id); } } else { // This op runs on all devices, and its output may have parameter's @@ -317,7 +319,8 @@ std::unique_ptr MultiDevSSAGraphBuilder::Apply( case BuildStrategy::ReduceStrategy::kReduce: cur_device_id = GetAppropriateDeviceID({g_name}); CreateReduceOp(&result, g_name, cur_device_id); - var_name_on_devices_.emplace(g_name, cur_device_id); + graph->Get("sharded_var_device") + .emplace(g_name, cur_device_id); bcast_var_name_set[cur_device_id].emplace(p_name); break; case BuildStrategy::ReduceStrategy::kAllReduce: @@ -499,7 +502,8 @@ bool MultiDevSSAGraphBuilder::IsParameterGradientOnce( return is_pg_once; } -int MultiDevSSAGraphBuilder::GetOpDeviceID(ir::Node *node) const { +int MultiDevSSAGraphBuilder::GetOpDeviceID(const ir::Graph &graph, + ir::Node *node) const { if (strategy_.reduce_ != BuildStrategy::ReduceStrategy::kReduce) { return -1; } @@ -512,15 +516,17 @@ int MultiDevSSAGraphBuilder::GetOpDeviceID(ir::Node *node) const { node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName())); PADDLE_ENFORCE_EQ(param_grad.size(), 2U); - int dev_id = GetVarDeviceID(param_grad[1]); + int dev_id = GetVarDeviceID(graph, param_grad[1]); PADDLE_ENFORCE_NE(dev_id, -1, "dev_id should not be -1.[%s, %s, %s]", node->Op()->Type(), param_grad[0], param_grad[1]); return dev_id; } -int MultiDevSSAGraphBuilder::GetVarDeviceID(const std::string &varname) const { - auto got = var_name_on_devices_.find(varname); - return got == var_name_on_devices_.end() ? 
-1 : got->second; +int MultiDevSSAGraphBuilder::GetVarDeviceID(const ir::Graph &graph, + const std::string &varname) const { + auto &sharded_var_device = graph.Get("sharded_var_device"); + auto got = sharded_var_device.find(varname); + return got == sharded_var_device.end() ? -1 : got->second; } void MultiDevSSAGraphBuilder::CreateScaleLossGradOp(ir::Graph *result) const { @@ -625,20 +631,23 @@ void MultiDevSSAGraphBuilder::CreateDistTrainOp(ir::Graph *result, if (node->Op()->Type() == "split_byref" || node->Op()->Type() == "split_selected_rows") { // TODO(paddle-dev): getting the first var is not safe. - op_dev_id = GetVarDeviceID(input_var_names[0]); + op_dev_id = GetVarDeviceID(*result, input_var_names[0]); if (strategy_.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce) { op_dev_id = GetAppropriateDeviceID(input_var_names); for (auto &varname : input_var_names) { - var_name_on_devices_.emplace(varname, op_dev_id); + result->Get("sharded_var_device") + .emplace(varname, op_dev_id); } } for (auto &varname : output_var_names) { - var_name_on_devices_.emplace(varname, op_dev_id); + result->Get("sharded_var_device") + .emplace(varname, op_dev_id); } } else if (node->Op()->Type() == "concat") { - op_dev_id = GetVarDeviceID(input_var_names[0]); + op_dev_id = GetVarDeviceID(*result, input_var_names[0]); for (auto &varname : output_var_names) { - var_name_on_devices_.emplace(varname, op_dev_id); + result->Get("sharded_var_device") + .emplace(varname, op_dev_id); } } else { PADDLE_ENFORCE( @@ -663,7 +672,7 @@ void MultiDevSSAGraphBuilder::CreateRPCOp(ir::Graph *result, int op_dev_id = -1; if (node->Op()->Type() == "send") { // TODO(paddle-dev): getting the first var is not safe. 
- op_dev_id = GetVarDeviceID(node->inputs[0]->Name()); + op_dev_id = GetVarDeviceID(*result, node->inputs[0]->Name()); PADDLE_ENFORCE(!ir::IsControlDepVar(*node->inputs[0]), "This hack no longer holds, please fix."); // the variable name which contains .block means it was splited by @@ -678,7 +687,8 @@ void MultiDevSSAGraphBuilder::CreateRPCOp(ir::Graph *result, } op_dev_id = GetAppropriateDeviceID(input_var_names); for (auto &varname : input_var_names) { - var_name_on_devices_.emplace(varname, op_dev_id); + result->Get("sharded_var_device") + .emplace(varname, op_dev_id); } } } else if (node->Op()->Type() == "recv") { @@ -688,7 +698,8 @@ void MultiDevSSAGraphBuilder::CreateRPCOp(ir::Graph *result, } op_dev_id = GetAppropriateDeviceID(output_var_names); for (auto &varname : output_var_names) { - var_name_on_devices_.emplace(varname, op_dev_id); + result->Get("sharded_var_device") + .emplace(varname, op_dev_id); } } else { // send_barrier and fetch_barrier op can be scheduled on device 0 @@ -730,3 +741,6 @@ bool MultiDevSSAGraphBuilder::IsScaleLossOp(ir::Node *node) const { } // namespace details } // namespace framework } // namespace paddle + +REGISTER_PASS(multi_device_pass, + paddle::framework::details::MultiDevSSAGraphBuilder); diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.h b/paddle/fluid/framework/details/multi_devices_graph_builder.h index c8c1b2a438..baea091af3 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_builder.h +++ b/paddle/fluid/framework/details/multi_devices_graph_builder.h @@ -34,7 +34,6 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder { public: std::unique_ptr Apply( std::unique_ptr graph) const override; - int GetVarDeviceID(const std::string &varname) const override; private: void CreateOpHandleIOs(ir::Graph *result, ir::Node *node, @@ -51,6 +50,8 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder { mutable platform::NCCLContextMap *nccl_ctxs_; #endif + int GetVarDeviceID(const 
ir::Graph &graph, const std::string &varname) const; + bool IsScaleLossOp(ir::Node *node) const; void CreateRPCOp(ir::Graph *result, ir::Node *node) const; @@ -84,7 +85,7 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder { const std::string &og, std::unordered_set *og_has_been_broadcast) const; - int GetOpDeviceID(ir::Node *node) const; + int GetOpDeviceID(const ir::Graph &graph, ir::Node *node) const; void InsertAllReduceOp(ir::Graph *result, const std::string &og) const; @@ -102,7 +103,6 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder { private: mutable BuildStrategy strategy_; mutable std::unordered_map all_vars_; - mutable std::unordered_map var_name_on_devices_; mutable std::vector balance_vars_; void SetCommunicationContext(OpHandleBase *op_handle, diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h index cbfbcb1c0c..1b188aec59 100644 --- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h @@ -40,6 +40,9 @@ class ScopeBufferedSSAGraphExecutor : public SSAGraphExecutor { ExecutionStrategy strategy, std::vector local_scopes, std::vector var_infos, std::vector places, std::unique_ptr&& underlying_executor); + + const ir::Graph& Graph() const { return underlying_executor_->Graph(); } + FeedFetchList Run(const std::vector& fetch_tensors) override; private: diff --git a/paddle/fluid/framework/details/ssa_graph_builder.h b/paddle/fluid/framework/details/ssa_graph_builder.h index 2b4f31f2ff..e0ad027315 100644 --- a/paddle/fluid/framework/details/ssa_graph_builder.h +++ b/paddle/fluid/framework/details/ssa_graph_builder.h @@ -47,13 +47,13 @@ typedef std::unordered_set> GraphDepVars; // unordered. 
typedef std::vector> GraphOps; +typedef std::unordered_map ShardedVarDevice; + class SSAGraphBuilder : public ir::Pass { public: SSAGraphBuilder() {} virtual ~SSAGraphBuilder() {} - virtual int GetVarDeviceID(const std::string &var_name) const = 0; - DISABLE_COPY_AND_ASSIGN(SSAGraphBuilder); protected: diff --git a/paddle/fluid/framework/details/ssa_graph_builder_factory.cc b/paddle/fluid/framework/details/ssa_graph_builder_factory.cc index e8d83943ac..2254a3b41e 100644 --- a/paddle/fluid/framework/details/ssa_graph_builder_factory.cc +++ b/paddle/fluid/framework/details/ssa_graph_builder_factory.cc @@ -21,8 +21,8 @@ namespace paddle { namespace framework { namespace details { -std::unique_ptr SSAGraphBuilderFactory::Create() { - std::unique_ptr res(new MultiDevSSAGraphBuilder); +std::unique_ptr ParallelExecutorPassManager::Create() { + std::unique_ptr res(new MultiDevSSAGraphBuilder); res->SetNotOwned>("places", &places_); res->SetNotOwned("loss_var_name", &loss_var_name_); res->SetNotOwned>("params", &param_names_); @@ -33,18 +33,18 @@ std::unique_ptr SSAGraphBuilderFactory::Create() { #endif if (!strategy_.debug_graphviz_path_.empty()) { - SSAGraphBuilder *previous_pass = res.release(); + ir::Pass *previous_pass = res.release(); res.reset(new SSAGraghBuilderWithPrinter); - res->Set("previous_pass", previous_pass); + res->Set("previous_pass", previous_pass); res->SetNotOwned("debug_graphviz_path", &strategy_.debug_graphviz_path_); res->Set("graph_printer", new GraphvizSSAGraphPrinter); } - SSAGraphBuilder *previous_pass = res.release(); + ir::Pass *previous_pass = res.release(); res.reset(new SSAGraghBuilderWithChecker); - res->Set("previous_pass", previous_pass); + res->Set("previous_pass", previous_pass); return res; } diff --git a/paddle/fluid/framework/details/ssa_graph_builder_factory.h b/paddle/fluid/framework/details/ssa_graph_builder_factory.h index 91a119de83..1bfc3e71e8 100644 --- a/paddle/fluid/framework/details/ssa_graph_builder_factory.h +++ 
b/paddle/fluid/framework/details/ssa_graph_builder_factory.h @@ -29,13 +29,13 @@ namespace framework { class Scope; namespace details { -class SSAGraphBuilderFactory { +class ParallelExecutorPassManager { public: - SSAGraphBuilderFactory(const std::vector& places, - const std::string& loss_var_name, - const std::unordered_set& param_names, - const std::vector& local_scopes, - const BuildStrategy& strategy) + ParallelExecutorPassManager( + const std::vector& places, + const std::string& loss_var_name, + const std::unordered_set& param_names, + const std::vector& local_scopes, const BuildStrategy& strategy) : places_(places), loss_var_name_(loss_var_name), param_names_(param_names), @@ -52,7 +52,7 @@ class SSAGraphBuilderFactory { } #endif - std::unique_ptr Create(); + std::unique_ptr Create(); private: std::vector places_; diff --git a/paddle/fluid/framework/details/ssa_graph_checker.cc b/paddle/fluid/framework/details/ssa_graph_checker.cc index 0438b09610..2994329f48 100644 --- a/paddle/fluid/framework/details/ssa_graph_checker.cc +++ b/paddle/fluid/framework/details/ssa_graph_checker.cc @@ -85,3 +85,6 @@ bool SSAGraghBuilderWithChecker::IsValidGraph(const ir::Graph *graph) const { } // namespace details } // namespace framework } // namespace paddle + +REGISTER_PASS(multi_device_check_pass, + paddle::framework::details::SSAGraghBuilderWithChecker); diff --git a/paddle/fluid/framework/details/ssa_graph_checker.h b/paddle/fluid/framework/details/ssa_graph_checker.h index ae5ad16b0c..fb766fb415 100644 --- a/paddle/fluid/framework/details/ssa_graph_checker.h +++ b/paddle/fluid/framework/details/ssa_graph_checker.h @@ -26,16 +26,11 @@ class SSAGraghBuilderWithChecker : public SSAGraphBuilder { public: std::unique_ptr Apply( std::unique_ptr graph) const override { - auto new_graph = - Get("previous_pass").Apply(std::move(graph)); + auto new_graph = Get("previous_pass").Apply(std::move(graph)); PADDLE_ENFORCE(IsValidGraph(new_graph.get())); return new_graph; } - int 
GetVarDeviceID(const std::string& var_name) const override { - return Get("previous_pass").GetVarDeviceID(var_name); - } - bool IsValidGraph(const ir::Graph* graph) const; }; diff --git a/paddle/fluid/framework/details/ssa_graph_executor.h b/paddle/fluid/framework/details/ssa_graph_executor.h index 8815ec89b2..96fffb7d94 100644 --- a/paddle/fluid/framework/details/ssa_graph_executor.h +++ b/paddle/fluid/framework/details/ssa_graph_executor.h @@ -32,7 +32,9 @@ class SSAGraphExecutor { virtual ~SSAGraphExecutor(); - virtual FeedFetchList Run(const std::vector &fetch_tensors) = 0; + virtual const ir::Graph& Graph() const = 0; + + virtual FeedFetchList Run(const std::vector& fetch_tensors) = 0; }; } // namespace details } // namespace framework diff --git a/paddle/fluid/framework/details/ssa_graph_printer.cc b/paddle/fluid/framework/details/ssa_graph_printer.cc index 20aab14644..95d0641d72 100644 --- a/paddle/fluid/framework/details/ssa_graph_printer.cc +++ b/paddle/fluid/framework/details/ssa_graph_printer.cc @@ -81,3 +81,6 @@ void GraphvizSSAGraphPrinter::Print(const ir::Graph &graph, } // namespace details } // namespace framework } // namespace paddle + +REGISTER_PASS(multi_device_print_pass, + paddle::framework::details::SSAGraghBuilderWithPrinter); diff --git a/paddle/fluid/framework/details/ssa_graph_printer.h b/paddle/fluid/framework/details/ssa_graph_printer.h index 2a939ef4c9..b7d20aa983 100644 --- a/paddle/fluid/framework/details/ssa_graph_printer.h +++ b/paddle/fluid/framework/details/ssa_graph_printer.h @@ -39,8 +39,7 @@ class SSAGraghBuilderWithPrinter : public SSAGraphBuilder { public: std::unique_ptr Apply( std::unique_ptr graph) const override { - auto new_graph = - Get("previous_pass").Apply(std::move(graph)); + auto new_graph = Get("previous_pass").Apply(std::move(graph)); std::unique_ptr fout( new std::ofstream(Get("debug_graphviz_path"))); @@ -48,10 +47,6 @@ class SSAGraghBuilderWithPrinter : public SSAGraphBuilder { 
Get("graph_printer").Print(*new_graph, *fout); return new_graph; } - - int GetVarDeviceID(const std::string& var_name) const override { - return Get("previous_pass").GetVarDeviceID(var_name); - } }; } // namespace details diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h index 3d67daa45e..82d6b5272a 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h @@ -42,6 +42,7 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor { const std::vector &places, std::unique_ptr &&graph); + const ir::Graph &Graph() const { return *graph_; } // Run a SSAGraph by a thread pool // Use topological sort algorithm FeedFetchList Run(const std::vector &fetch_tensors) override; diff --git a/paddle/fluid/framework/ir/graph.h b/paddle/fluid/framework/ir/graph.h index 4f59ec82a7..49f39df4b9 100644 --- a/paddle/fluid/framework/ir/graph.h +++ b/paddle/fluid/framework/ir/graph.h @@ -42,6 +42,8 @@ class Graph { template AttrType &Get(const std::string &attr_name) const { + PADDLE_ENFORCE(attrs_.find(attr_name) != attrs_.end(), + "%s attr not registered for graph.", attr_name); return *boost::any_cast(attrs_.at(attr_name)); } diff --git a/paddle/fluid/framework/ir/pass.h b/paddle/fluid/framework/ir/pass.h index 9466924262..5ab7f9a1e2 100644 --- a/paddle/fluid/framework/ir/pass.h +++ b/paddle/fluid/framework/ir/pass.h @@ -44,6 +44,8 @@ class Pass { template AttrType &Get(const std::string &attr_name) const { + PADDLE_ENFORCE(attrs_.find(attr_name) != attrs_.end(), + "%s attr not registered for pass.", attr_name); return *boost::any_cast(attrs_.at(attr_name)); } diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index ff661d0013..a23fd2a41a 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -33,6 +33,48 @@ 
limitations under the License. */ namespace paddle { namespace framework { +std::unique_ptr ApplyParallelExecutorPass( + const ProgramDesc &main_program, const std::vector &places, + const std::string &loss_var_name, + const std::unordered_set &param_names, + const std::vector &local_scopes, const bool use_cuda, +#ifdef PADDLE_WITH_CUDA + const BuildStrategy &strategy, platform::NCCLContextMap *nccl_ctxs) { +#else + const BuildStrategy &strategy) { +#endif + details::ParallelExecutorPassManager builder_factory( + places, loss_var_name, param_names, local_scopes, strategy); + if (use_cuda) { +#ifdef PADDLE_WITH_CUDA + builder_factory.SetNCCLContextMap(nccl_ctxs); +#else + PADDLE_THROW("Not compiled with CUDA."); +#endif + } + + std::unique_ptr graph(new ir::Graph(main_program)); + if (!strategy.debug_graphviz_path_.empty()) { + auto viz_pass = ir::PassRegistry::Instance().Get("graph_viz_pass"); + const std::string graph_path = string::Sprintf( + "%s%s", strategy.debug_graphviz_path_.c_str(), "_original_graph"); + viz_pass->Set("graph_viz_path", new std::string(graph_path)); + graph = viz_pass->Apply(std::move(graph)); + } + + auto builder = builder_factory.Create(); + graph = builder->Apply(std::move(graph)); + + if (!strategy.debug_graphviz_path_.empty()) { + auto viz_pass = ir::PassRegistry::Instance().Get("graph_viz_pass"); + const std::string graph_path = string::Sprintf( + "%s%s", strategy.debug_graphviz_path_.c_str(), "_before_exec"); + viz_pass->Set("graph_viz_path", new std::string(graph_path)); + graph = viz_pass->Apply(std::move(graph)); + } + return graph; +} + class ParallelExecutorPrivate { public: explicit ParallelExecutorPrivate(const std::vector &places) @@ -120,38 +162,18 @@ ParallelExecutor::ParallelExecutor( var_infos.back().persistable_ = var->Persistable(); } - // Step 3. Convert main_program to SSA form and dependency graph. 
Also, insert - // ncclOp - details::SSAGraphBuilderFactory builder_factory( - member_->places_, loss_var_name, params, member_->local_scopes_, - build_strategy); - if (member_->use_cuda_) { +// Step 3. Convert main_program to SSA form and dependency graph. Also, insert +// ncclOp #ifdef PADDLE_WITH_CUDA - builder_factory.SetNCCLContextMap(member_->nccl_ctxs_.get()); + std::unique_ptr graph = ApplyParallelExecutorPass( + main_program, member_->places_, loss_var_name, params, + member_->local_scopes_, member_->use_cuda_, build_strategy, + member_->nccl_ctxs_.get()); #else - PADDLE_THROW("Not compiled with CUDA."); + std::unique_ptr graph = ApplyParallelExecutorPass( + main_program, member_->places_, loss_var_name, params, + member_->local_scopes_, member_->use_cuda_, build_strategy); #endif - } - - std::unique_ptr graph(new ir::Graph(main_program)); - if (!build_strategy.debug_graphviz_path_.empty()) { - auto viz_pass = ir::PassRegistry::Instance().Get("graph_viz_pass"); - const std::string graph_path = string::Sprintf( - "%s%s", build_strategy.debug_graphviz_path_.c_str(), "_original_graph"); - viz_pass->Set("graph_viz_path", new std::string(graph_path)); - graph = viz_pass->Apply(std::move(graph)); - } - - builder_ = builder_factory.Create(); - graph = builder_->Apply(std::move(graph)); - - if (!build_strategy.debug_graphviz_path_.empty()) { - auto viz_pass = ir::PassRegistry::Instance().Get("graph_viz_pass"); - const std::string graph_path = string::Sprintf( - "%s%s", build_strategy.debug_graphviz_path_.c_str(), "_before_exec"); - viz_pass->Set("graph_viz_path", new std::string(graph_path)); - graph = viz_pass->Apply(std::move(graph)); - } member_->executor_.reset(new details::ThreadedSSAGraphExecutor( exec_strategy, member_->local_scopes_, places, std::move(graph))); @@ -165,11 +187,18 @@ void ParallelExecutor::BCastParamsToDevices( // the initializing bcast, all vars would be bcast from device(0), // otherwise // bcast from the specified device. 
- bool initializing = builder_.get() == nullptr ? true : false; - + bool initializing = member_->executor_ ? false : true; for (auto &var : vars) { - int var_dev_id = - builder_.get() == nullptr ? -1 : builder_->GetVarDeviceID(var); + int var_dev_id = -1; + if (member_->executor_) { + auto &sharded_var_device = + member_->executor_->Graph().Get( + "sharded_var_device"); + if (sharded_var_device.find(var) != sharded_var_device.end()) { + var_dev_id = sharded_var_device.at(var); + } + } + if (!initializing && var_dev_id == -1) continue; framework::Variable *main_var = nullptr; @@ -307,3 +336,6 @@ ParallelExecutor::~ParallelExecutor() { } // namespace paddle USE_PASS(graph_viz_pass); +USE_PASS(multi_device_pass); +USE_PASS(multi_device_check_pass); +USE_PASS(multi_device_print_pass); diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h index ffb9934a2d..d624956acd 100644 --- a/paddle/fluid/framework/parallel_executor.h +++ b/paddle/fluid/framework/parallel_executor.h @@ -70,7 +70,6 @@ class ParallelExecutor { private: ParallelExecutorPrivate *member_; - std::unique_ptr builder_; }; } // namespace framework diff --git a/paddle/fluid/operators/distributed/send_recv.proto b/paddle/fluid/operators/distributed/send_recv.proto new file mode 100644 index 0000000000..d0595ef108 --- /dev/null +++ b/paddle/fluid/operators/distributed/send_recv.proto @@ -0,0 +1,97 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. Licensed under +the Apache License, Version 2.0 (the "License"); you may not use this file +except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +syntax = "proto3"; +package sendrecv; + +option cc_generic_services = false; + +service SendRecvService { + // For parameter server round-robin like hashing, do not split tensors. + // Send and recv only one tensor + // TODO(typhoonzero): add streaming API + rpc SendVariable(VariableMessage) returns (VoidMessage) {} + // Argument VariableMessage for GetVariable should only contain varname. + rpc GetVariable(VariableMessage) returns (VariableMessage) {} + // pre-fetch variable by given variable name and Ids + rpc PrefetchVariable(VariableMessage) returns (VariableMessage) {} + + rpc CheckpointNotify(VariableMessage) returns (VoidMessage) {} +} + +// VariableMessage is serialized paddle variable message. +// It can be: +// LoDTensor +// SelectedRows +enum VarType { + LOD_TENSOR = 0; + SELECTED_ROWS = 1; + NCCL_ID = 2; +} + +// NOTICE(gongwb):don't modify this proto if you are not +// familiar with how we serialize in sendrecvop_utils.h +// and deserialize it in variable_response.h. 
+message VariableMessage { + enum Type { + // Pod Types + BOOL = 0; + INT16 = 1; + INT32 = 2; + INT64 = 3; + FP16 = 4; + FP32 = 5; + FP64 = 6; + } + + message LodData { repeated int64 lod_data = 1; } + string varname = 1; + // TODO(Yancey1989): reference framework::proto::VarDesc::VarType + VarType type = 2; + // bool persistable is not needed for sending. + // tensor info: + Type data_type = 3; + repeated int64 dims = 4; + + // lod details: + int64 lod_level = 5; + repeated LodData lod = 6; + // selected_rows height, aka. original dim0 + int64 slr_height = 7; + // tensor data + bytes serialized = 8; + // selected_rows data + bytes rows = 9; + // Look up table block execution output variable name. + string out_varname = 10; + // If 1, the ps server will start profiling, the ps + // server stops profiling and generates a profile to /tmp/profile_ps_* + // when profile switches from 1 to 2. + int64 profile = 11; +} + +message VoidMessage {} From aa1085ddc54c5dc40ae91468d5f2a2a7ecdf85cc Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Thu, 26 Jul 2018 12:54:55 +0800 Subject: [PATCH 08/40] all passes add doc --- doc/fluid/design/ir/draft.md | 38 ++++++++++ paddle/fluid/framework/CMakeLists.txt | 2 +- paddle/fluid/framework/details/CMakeLists.txt | 3 - .../details/multi_devices_graph_builder.cc | 10 +-- .../details/ssa_graph_builder_factory.cc | 53 -------------- .../details/ssa_graph_builder_factory.h | 71 ------------------- .../framework/details/ssa_graph_checker.h | 5 +- .../framework/details/ssa_graph_printer.h | 8 +-- paddle/fluid/framework/ir/pass.h | 5 ++ paddle/fluid/framework/parallel_executor.cc | 46 ++++++++---- 10 files changed, 87 insertions(+), 154 deletions(-) delete mode 100644 paddle/fluid/framework/details/ssa_graph_builder_factory.cc delete mode 100644 paddle/fluid/framework/details/ssa_graph_builder_factory.h diff --git a/doc/fluid/design/ir/draft.md b/doc/fluid/design/ir/draft.md index a33b5a9c93..65bfaea6a1 100644 --- a/doc/fluid/design/ir/draft.md +++ 
b/doc/fluid/design/ir/draft.md @@ -71,6 +71,44 @@ is a `Graph` and its output is also a `Graph`. For example, a `Pass` can simply print out the `Graph`. A `Pass` can also fuse some `Graph`'s `Node`s. +```cpp +class Pass { + public: + + virtual std::unique_ptr Apply(std::unique_ptr graph) const = 0; + + // Get a reference to the attribute previously set. + template + AttrType &Get(const std::string &attr_name) const; + + // Set a pointer to the attribute. Pass takes ownership of the attribute. + template + void Set(const std::string &attr_name, AttrType *attr) ; + + // Set a pointer to the attribute. Pass doesn't take ownership. Caller + // should delete the attribute. + template + void SetNotOwned(const std::string &attr_name, AttrType *attr); +}; + +// In my_pass.cc +class MyPass : public Pass { + public: + std::unique_ptr Apply(std::unique_ptr graph) const override { + // do something. + return graph; + } +} +REGISTER_PASS(my_pass, MyPass); + + +// To use the pass. +auto my_pass = ir::PassRegistry::Instance().Get("my_pass"); +graph = my_pass->Apply(std::move(graph)); +// Note: to force link my_pass.cc, in the code: +USE_PASS(my_pass); +``` + #### Optimize `Optimize` contains a series of `Pass` with defined order. 
diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index f3c1e7c528..d822a1c9c4 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -99,7 +99,7 @@ else() endif() -cc_library(parallel_executor SRCS parallel_executor.cc DEPS ssa_graph_builder_factory threaded_ssa_graph_executor scope_buffered_ssa_graph_executor graph graph_viz_pass) +cc_library(parallel_executor SRCS parallel_executor.cc DEPS threaded_ssa_graph_executor scope_buffered_ssa_graph_executor graph graph_viz_pass multi_devices_graph_builder ssa_graph_printer ssa_graph_checker) cc_library(prune SRCS prune.cc DEPS framework_proto) cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context) diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index 9df7df1f42..5d652d3730 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -31,9 +31,6 @@ cc_library(fuse_vars_op_handle SRCS fuse_vars_op_handle.cc DEPS op_handle_base s cc_library(multi_devices_graph_builder SRCS multi_devices_graph_builder.cc DEPS ssa_graph_builder computation_op_handle scale_loss_grad_op_handle rpc_op_handle all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle) - -cc_library(ssa_graph_builder_factory SRCS ssa_graph_builder_factory.cc DEPS multi_devices_graph_builder ssa_graph_printer ssa_graph_checker) - cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS graph framework_proto) cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope simple_threadpool device_context) diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.cc b/paddle/fluid/framework/details/multi_devices_graph_builder.cc index d211f02689..ff90f31cdb 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_builder.cc +++ 
b/paddle/fluid/framework/details/multi_devices_graph_builder.cc @@ -35,15 +35,15 @@ namespace framework { namespace details { void MultiDevSSAGraphBuilder::Init() const { - loss_var_name_ = Get("loss_var_name"); - places_ = Get>("places"); - local_scopes_ = Get>("local_scopes"); - strategy_ = Get("strategy"); + loss_var_name_ = Get("loss_var_name"); + places_ = Get>("places"); + local_scopes_ = Get>("local_scopes"); + strategy_ = Get("strategy"); #ifdef PADDLE_WITH_CUDA nccl_ctxs_ = &Get("nccl_ctxs"); #endif - for (auto &p : Get>("params")) { + for (auto &p : Get>("params")) { grad_names_.insert(GradVarName(p)); } balance_vars_.resize(places_.size(), 0); diff --git a/paddle/fluid/framework/details/ssa_graph_builder_factory.cc b/paddle/fluid/framework/details/ssa_graph_builder_factory.cc deleted file mode 100644 index 2254a3b41e..0000000000 --- a/paddle/fluid/framework/details/ssa_graph_builder_factory.cc +++ /dev/null @@ -1,53 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/fluid/framework/details/ssa_graph_builder_factory.h" -#include -#include "paddle/fluid/framework/details/multi_devices_graph_builder.h" -#include "paddle/fluid/framework/details/ssa_graph_checker.h" -#include "paddle/fluid/framework/details/ssa_graph_printer.h" - -namespace paddle { -namespace framework { -namespace details { -std::unique_ptr ParallelExecutorPassManager::Create() { - std::unique_ptr res(new MultiDevSSAGraphBuilder); - res->SetNotOwned>("places", &places_); - res->SetNotOwned("loss_var_name", &loss_var_name_); - res->SetNotOwned>("params", ¶m_names_); - res->SetNotOwned>("local_scopes", &local_scopes_); - res->SetNotOwned("strategy", &strategy_); -#ifdef PADDLE_WITH_CUDA - res->SetNotOwned("nccl_ctxs", nccl_ctxs_); -#endif - - if (!strategy_.debug_graphviz_path_.empty()) { - ir::Pass *previous_pass = res.release(); - res.reset(new SSAGraghBuilderWithPrinter); - res->Set("previous_pass", previous_pass); - res->SetNotOwned("debug_graphviz_path", - &strategy_.debug_graphviz_path_); - res->Set("graph_printer", - new GraphvizSSAGraphPrinter); - } - - ir::Pass *previous_pass = res.release(); - res.reset(new SSAGraghBuilderWithChecker); - res->Set("previous_pass", previous_pass); - - return res; -} -} // namespace details -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/details/ssa_graph_builder_factory.h b/paddle/fluid/framework/details/ssa_graph_builder_factory.h deleted file mode 100644 index 1bfc3e71e8..0000000000 --- a/paddle/fluid/framework/details/ssa_graph_builder_factory.h +++ /dev/null @@ -1,71 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include -#include -#include -#include "paddle/fluid/framework/details/build_strategy.h" -#include "paddle/fluid/framework/details/ssa_graph_builder.h" -#include "paddle/fluid/platform/place.h" - -#ifdef PADDLE_WITH_CUDA -#include "paddle/fluid/platform/nccl_helper.h" -#endif - -namespace paddle { -namespace framework { -class Scope; -namespace details { - -class ParallelExecutorPassManager { - public: - ParallelExecutorPassManager( - const std::vector& places, - const std::string& loss_var_name, - const std::unordered_set& param_names, - const std::vector& local_scopes, const BuildStrategy& strategy) - : places_(places), - loss_var_name_(loss_var_name), - param_names_(param_names), - local_scopes_(local_scopes), - strategy_(strategy) { -#ifdef PADDLE_WITH_CUDA - nccl_ctxs_ = nullptr; -#endif - } - -#ifdef PADDLE_WITH_CUDA - void SetNCCLContextMap(platform::NCCLContextMap* nccl_ctxs) { - nccl_ctxs_ = nccl_ctxs; - } -#endif - - std::unique_ptr Create(); - - private: - std::vector places_; - std::string loss_var_name_; - std::unordered_set param_names_; - std::vector local_scopes_; - BuildStrategy strategy_; - -#ifdef PADDLE_WITH_CUDA - platform::NCCLContextMap* nccl_ctxs_; -#endif -}; - -} // namespace details -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/details/ssa_graph_checker.h b/paddle/fluid/framework/details/ssa_graph_checker.h index fb766fb415..25891cf74d 100644 --- a/paddle/fluid/framework/details/ssa_graph_checker.h +++ b/paddle/fluid/framework/details/ssa_graph_checker.h @@ -26,9 
+26,8 @@ class SSAGraghBuilderWithChecker : public SSAGraphBuilder { public: std::unique_ptr Apply( std::unique_ptr graph) const override { - auto new_graph = Get("previous_pass").Apply(std::move(graph)); - PADDLE_ENFORCE(IsValidGraph(new_graph.get())); - return new_graph; + PADDLE_ENFORCE(IsValidGraph(graph.get())); + return graph; } bool IsValidGraph(const ir::Graph* graph) const; diff --git a/paddle/fluid/framework/details/ssa_graph_printer.h b/paddle/fluid/framework/details/ssa_graph_printer.h index b7d20aa983..bd4498c061 100644 --- a/paddle/fluid/framework/details/ssa_graph_printer.h +++ b/paddle/fluid/framework/details/ssa_graph_printer.h @@ -39,13 +39,11 @@ class SSAGraghBuilderWithPrinter : public SSAGraphBuilder { public: std::unique_ptr Apply( std::unique_ptr graph) const override { - auto new_graph = Get("previous_pass").Apply(std::move(graph)); - std::unique_ptr fout( - new std::ofstream(Get("debug_graphviz_path"))); + new std::ofstream(Get("debug_graphviz_path"))); PADDLE_ENFORCE(fout->good()); - Get("graph_printer").Print(*new_graph, *fout); - return new_graph; + Get("graph_printer").Print(*graph, *fout); + return graph; } }; diff --git a/paddle/fluid/framework/ir/pass.h b/paddle/fluid/framework/ir/pass.h index 5ab7f9a1e2..f254ef62df 100644 --- a/paddle/fluid/framework/ir/pass.h +++ b/paddle/fluid/framework/ir/pass.h @@ -42,6 +42,7 @@ class Pass { virtual std::unique_ptr Apply(std::unique_ptr graph) const = 0; + // Get a reference to the attributed previously set. template AttrType &Get(const std::string &attr_name) const { PADDLE_ENFORCE(attrs_.find(attr_name) != attrs_.end(), @@ -49,6 +50,7 @@ class Pass { return *boost::any_cast(attrs_.at(attr_name)); } + // Set a pointer to the attribute. Pass takes ownership of the attribute. template void Set(const std::string &attr_name, AttrType *attr) { PADDLE_ENFORCE(attrs_.count(attr_name) == 0); @@ -59,6 +61,8 @@ class Pass { }; } + // Set a pointer to the attribute. Pass doesn't take ownership. 
Caller + // should delete the attribute. template void SetNotOwned(const std::string &attr_name, AttrType *attr) { PADDLE_ENFORCE(attrs_.count(attr_name) == 0); @@ -127,6 +131,7 @@ struct PassRegistrar : public Registrar { __test_global_namespace_##uniq_name##__>::value, \ msg) +// Register a new pass that can be applied on the IR. #define REGISTER_PASS(pass_type, pass_class) \ STATIC_ASSERT_PASS_GLOBAL_NAMESPACE( \ __reg_pass__##pass_type, \ diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index a23fd2a41a..77bed5c999 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -26,7 +26,8 @@ limitations under the License. */ #endif #include "paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h" -#include "paddle/fluid/framework/details/ssa_graph_builder_factory.h" +#include "paddle/fluid/framework/details/ssa_graph_checker.h" +#include "paddle/fluid/framework/details/ssa_graph_printer.h" #include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h" #include "paddle/fluid/platform/profiler.h" @@ -43,16 +44,6 @@ std::unique_ptr ApplyParallelExecutorPass( #else const BuildStrategy &strategy) { #endif - details::ParallelExecutorPassManager builder_factory( - places, loss_var_name, param_names, local_scopes, strategy); - if (use_cuda) { -#ifdef PADDLE_WITH_CUDA - builder_factory.SetNCCLContextMap(nccl_ctxs); -#else - PADDLE_THROW("Not compiled with CUDA."); -#endif - } - std::unique_ptr graph(new ir::Graph(main_program)); if (!strategy.debug_graphviz_path_.empty()) { auto viz_pass = ir::PassRegistry::Instance().Get("graph_viz_pass"); @@ -62,8 +53,37 @@ std::unique_ptr ApplyParallelExecutorPass( graph = viz_pass->Apply(std::move(graph)); } - auto builder = builder_factory.Create(); - graph = builder->Apply(std::move(graph)); + auto multi_device_pass = + ir::PassRegistry::Instance().Get("multi_device_pass"); + 
multi_device_pass->SetNotOwned>("places", + &places); + multi_device_pass->SetNotOwned("loss_var_name", + &loss_var_name); + multi_device_pass->SetNotOwned>( + "params", ¶m_names); + multi_device_pass->SetNotOwned>("local_scopes", + &local_scopes); + multi_device_pass->SetNotOwned("strategy", &strategy); + +#ifdef PADDLE_WITH_CUDA + platform::NCCLContextMap *nctx = use_cuda ? nccl_ctxs : nullptr; + multi_device_pass->SetNotOwned("nccl_ctxs", nctx); +#endif + graph = multi_device_pass->Apply(std::move(graph)); + + if (!strategy.debug_graphviz_path_.empty()) { + auto multi_device_print_pass = + ir::PassRegistry::Instance().Get("multi_device_print_pass"); + multi_device_print_pass->SetNotOwned( + "debug_graphviz_path", &strategy.debug_graphviz_path_); + multi_device_print_pass->Set( + "graph_printer", new details::GraphvizSSAGraphPrinter); + graph = multi_device_print_pass->Apply(std::move(graph)); + } + + auto multi_device_check_pass = + ir::PassRegistry::Instance().Get("multi_device_check_pass"); + graph = multi_device_check_pass->Apply(std::move(graph)); if (!strategy.debug_graphviz_path_.empty()) { auto viz_pass = ir::PassRegistry::Instance().Get("graph_viz_pass"); From 4eeed0b5e48db8bc822cd6de35a30e2a0d7705b4 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Thu, 26 Jul 2018 15:01:21 +0800 Subject: [PATCH 09/40] refine width padding and enable core copy --- paddle/fluid/operators/math/im2col.cc | 67 +++++++++++++++++++++++++-- 1 file changed, 63 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/operators/math/im2col.cc b/paddle/fluid/operators/math/im2col.cc index c29a137319..be373c99d1 100644 --- a/paddle/fluid/operators/math/im2col.cc +++ b/paddle/fluid/operators/math/im2col.cc @@ -126,11 +126,9 @@ class Im2ColFunctor 1 for (int ic = 0; ic < im_channels; ++ic) { // TODO(TJ): use add and resue stride - T* dst_data_ic = - col_data + ic * filter_width * filter_height * col_matrix_width; + T* dst_data_ic = col_data + ic * col_block_ic; for (int kh = 0; kh < 
filter_height; ++kh) { - T* dst_data_kh = - dst_data_ic + kh * filter_width * col_matrix_width; + T* dst_data_kh = dst_data_ic + kh * col_block_fh; for (int kw = 0; kw < plw; ++kw) { // TODO(TJ): reuse array outside this for size_t sz = sizeof(T) * (plw - kw); @@ -158,6 +156,67 @@ class Im2ColFunctor 2*pw: kw = 3, pw = 1 + // 0 x x x x ... x x x x 0 + // 1 1 1 1 1 1 + // ==> + // 0 x ... x x + // x x ... x x + // x x ... x 0 + // 2. kw < 2*pw: kw = 3, pw = 2 + // 0 0 x x x ... x x x 0 0 + // 1 1 1 1 1 1 + // ==> + // 0 0 x ... x x x + // 0 x x ... x x 0 + // x x x ... x 0 0 + + // TODO(TJ): use array like: size_t copy_size[kw]={sizeof(T) * + // (output_width-1)} + // length of copy_size is equal kw. + if (plw + prw < filter_width) { + for (int oh = 0; oh < output_height; ++oh) { + const T* im_data_start = + im_data + (oh - plh > 0 ? oh - plh : 0) * im_width; + T* dst_data = col_data + oh * output_width; + for (int ic = 0; ic < im_channels; ++ic) { + const T* src_data = im_data_start + ic * im_size; + for (int kh = 0; kh < filter_height; ++kh) { + if ((oh < plh && kh < plh) || + (oh > (output_height - prh - 1) && + kh > (filter_height - prh - 1))) { + dst_data = dst_data + filter_width * col_matrix_width; + continue; + } + // TODO(TJ): reuse plw-kw outside this for + // try to unify + for (int kw = 0; kw < plw; ++kw) { + std::memcpy(dst_data + (plw - kw), src_data, + sizeof(T) * (output_width - (plw - kw))); + dst_data = dst_data + col_matrix_width; + } + for (int kw = plw; kw < filter_width - prw; ++kw) { + std::memcpy(dst_data, src_data + (kw - plw), + sizeof(T) * output_width); + dst_data = dst_data + col_matrix_width; + } + int i = 1; + for (int kw = filter_width - prw; kw < filter_width; + ++kw, ++i) { + std::memcpy(dst_data, src_data + (kw - plw), + sizeof(T) * (output_width - i)); + dst_data = dst_data + col_matrix_width; + } + src_data = src_data + im_width; + } + } + } + } else { + LOG(FATAL) << "Not implement yet"; + } return; } } From 
ab72d28a5ec3efd4243df8c7cd3370b9354e009f Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Thu, 26 Jul 2018 15:02:51 +0800 Subject: [PATCH 10/40] clean up and correctness check --- doc/fluid/design/ir/draft.md | 10 +- .../details/multi_devices_graph_builder.cc | 103 ++++++++++-------- .../details/multi_devices_graph_builder.h | 4 +- .../framework/details/ssa_graph_builder.cc | 13 ++- .../framework/details/ssa_graph_builder.h | 4 + .../framework/details/ssa_graph_checker.cc | 12 +- .../framework/details/ssa_graph_checker.h | 4 +- .../framework/details/ssa_graph_printer.cc | 6 +- .../framework/details/ssa_graph_printer.h | 4 +- .../details/threaded_ssa_graph_executor.cc | 8 +- paddle/fluid/framework/ir/graph.h | 8 +- paddle/fluid/framework/ir/graph_viz_pass.cc | 8 +- paddle/fluid/framework/ir/graph_viz_pass.h | 4 +- paddle/fluid/framework/ir/pass.cc | 16 +++ paddle/fluid/framework/ir/pass.h | 70 +++++++++--- paddle/fluid/framework/parallel_executor.cc | 2 +- 16 files changed, 184 insertions(+), 92 deletions(-) diff --git a/doc/fluid/design/ir/draft.md b/doc/fluid/design/ir/draft.md index 65bfaea6a1..e141ce0959 100644 --- a/doc/fluid/design/ir/draft.md +++ b/doc/fluid/design/ir/draft.md @@ -75,7 +75,12 @@ can also fuse some `Graph`'s `Node`s. class Pass { public: - virtual std::unique_ptr Apply(std::unique_ptr graph) const = 0; + std::unique_ptr Apply(std::unique_ptr graph) const { + // Some correctness check. + auto new_graph = ApplyImpl(std::move(graph)); + // Some correctness check. + return new_graph; + } // Get a reference to the attributed previously set. template @@ -89,6 +94,9 @@ class Pass { // should delete the attribute. 
template void SetNotOwned(const std::string &attr_name, AttrType *attr); + + protected: + virtual std::unique_ptr ApplyImpl(std::unique_ptr graph) const = 0; }; // In my_pass.cc diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.cc b/paddle/fluid/framework/details/multi_devices_graph_builder.cc index ff90f31cdb..b63c2f695a 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_builder.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_builder.cc @@ -34,16 +34,22 @@ namespace paddle { namespace framework { namespace details { +static const char kLossVarName[] = "loss_var_name"; +static const char kPlaces[] = "places"; +static const char kParams[] = "params"; +static const char kLocalScopes[] = "local_scopes"; +static const char kStrategy[] = "strategy"; + void MultiDevSSAGraphBuilder::Init() const { - loss_var_name_ = Get("loss_var_name"); - places_ = Get>("places"); - local_scopes_ = Get>("local_scopes"); - strategy_ = Get("strategy"); + loss_var_name_ = Get(kLossVarName); + places_ = Get>(kPlaces); + local_scopes_ = Get>(kLocalScopes); + strategy_ = Get(kStrategy); #ifdef PADDLE_WITH_CUDA nccl_ctxs_ = &Get("nccl_ctxs"); #endif - for (auto &p : Get>("params")) { + for (auto &p : Get>(kParams)) { grad_names_.insert(GradVarName(p)); } balance_vars_.resize(places_.size(), 0); @@ -58,7 +64,7 @@ void MultiDevSSAGraphBuilder::CreateOpHandleIOs(ir::Graph *result, ir::Node *node, size_t place_id) const { auto p = places_[place_id]; - auto *op_handle = result->Get("ops").back().get(); + auto *op_handle = result->Get(kGraphOps).back().get(); op_handle->SetDeviceContext(p, platform::DeviceContextPool::Instance().Get(p)); @@ -225,7 +231,7 @@ std::vector SortOpsAndDelayOptimizeOp(const ir::Graph &graph) { return sorted_ret; } -std::unique_ptr MultiDevSSAGraphBuilder::Apply( +std::unique_ptr MultiDevSSAGraphBuilder::ApplyImpl( std::unique_ptr graph) const { Init(); // Give the topology sort order and rebuild the graph structure. 
@@ -241,10 +247,10 @@ std::unique_ptr MultiDevSSAGraphBuilder::Apply( std::unordered_set og_has_been_broadcast; // We cannot invoke resize. It is a bug of GCC 4.8 - result.Set("vars", new GraphVars(places_.size())); - result.Set("dep_vars", new GraphDepVars); - result.Set("ops", new GraphOps); - result.Set("sharded_var_device", new ShardedVarDevice); + result.Set(kGraphVars, new GraphVars(places_.size())); + result.Set(kGraphDepVars, new GraphDepVars); + result.Set(kGraphOps, new GraphOps); + result.Set(kShardedVarDevice, new ShardedVarDevice); // find send/recv vars so that we can place the distributed training // realted op in the place 0 @@ -281,7 +287,7 @@ std::unique_ptr MultiDevSSAGraphBuilder::Apply( if (op_dev_id != -1) { // This op only runs on one specific device. CreateComputationalOp(&result, node, op_dev_id); for (ir::Node *n : node->outputs) { - graph->Get("sharded_var_device") + graph->Get(kShardedVarDevice) .emplace(n->Name(), op_dev_id); } } else { @@ -319,7 +325,7 @@ std::unique_ptr MultiDevSSAGraphBuilder::Apply( case BuildStrategy::ReduceStrategy::kReduce: cur_device_id = GetAppropriateDeviceID({g_name}); CreateReduceOp(&result, g_name, cur_device_id); - graph->Get("sharded_var_device") + graph->Get(kShardedVarDevice) .emplace(g_name, cur_device_id); bcast_var_name_set[cur_device_id].emplace(p_name); break; @@ -406,16 +412,16 @@ void MultiDevSSAGraphBuilder::CreateBroadcastOp(ir::Graph *result, result->CreateEmptyNode("broadcast", ir::Node::Type::kOperation), local_scopes_, places_); #endif - result->Get("ops").emplace_back(op_handle); + result->Get(kGraphOps).emplace_back(op_handle); auto *in = - result->Get("vars").at(src_dev_id).at(p_name).back().get(); + result->Get(kGraphVars).at(src_dev_id).at(p_name).back().get(); op_handle->AddInput(in); for (size_t i = 0; i < places_.size(); ++i) { auto &p = places_[i]; SetCommunicationContext(op_handle, p); - auto &vars = result->Get("vars").at(i).at(p_name); + auto &vars = 
result->Get(kGraphVars).at(i).at(p_name); auto *out_var = new VarHandle( result->CreateEmptyNode(p_name, ir::Node::Type::kVariable), vars.size(), i, p_name, p); @@ -427,7 +433,7 @@ void MultiDevSSAGraphBuilder::CreateBroadcastOp(ir::Graph *result, void MultiDevSSAGraphBuilder::CreateComputationalOp(ir::Graph *result, ir::Node *node, int dev_id) const { - result->Get("ops").emplace_back( + result->Get(kGraphOps).emplace_back( new ComputationOpHandle(result->CreateOpNode(node->Op()), local_scopes_[dev_id], places_[dev_id])); CreateOpHandleIOs(result, node, dev_id); @@ -436,20 +442,20 @@ void MultiDevSSAGraphBuilder::CreateComputationalOp(ir::Graph *result, void MultiDevSSAGraphBuilder::InsertAllReduceOp(ir::Graph *result, const std::string &og) const { #ifdef PADDLE_WITH_CUDA - result->Get("ops").emplace_back(new AllReduceOpHandle( + result->Get(kGraphOps).emplace_back(new AllReduceOpHandle( result->CreateEmptyNode("allreduce", ir::Node::Type::kOperation), local_scopes_, places_, nccl_ctxs_)); #else - result->Get("ops").emplace_back(new AllReduceOpHandle( + result->Get(kGraphOps).emplace_back(new AllReduceOpHandle( result->CreateEmptyNode("allreduce", ir::Node::Type::kOperation), local_scopes_, places_)); #endif - auto *op_handle = result->Get("ops").back().get(); + auto *op_handle = result->Get(kGraphOps).back().get(); for (size_t i = 0; i < places_.size(); ++i) { auto &p = places_[i]; SetCommunicationContext(op_handle, p); - auto &vars = result->Get("vars")[i][og]; + auto &vars = result->Get(kGraphVars)[i][og]; PADDLE_ENFORCE(!vars.empty()); auto &prev_grad = vars.back(); op_handle->AddInput(prev_grad.get()); @@ -465,20 +471,20 @@ void MultiDevSSAGraphBuilder::InsertAllReduceOp(ir::Graph *result, void MultiDevSSAGraphBuilder::InsertDataBalanceOp( ir::Graph *result, const std::vector &datas) const { #ifdef PADDLE_WITH_CUDA - result->Get("ops").emplace_back(new DataBalanceOpHandle( + result->Get(kGraphOps).emplace_back(new DataBalanceOpHandle( 
result->CreateEmptyNode("data_balance", ir::Node::Type::kOperation), local_scopes_, places_, nccl_ctxs_)); #else - result->Get("ops").emplace_back(new DataBalanceOpHandle( + result->Get(kGraphOps).emplace_back(new DataBalanceOpHandle( result->CreateEmptyNode("data_balance", ir::Node::Type::kOperation), local_scopes_, places_)); #endif - auto *op_handle = result->Get("ops").back().get(); + auto *op_handle = result->Get(kGraphOps).back().get(); for (size_t i = 0; i < places_.size(); ++i) { auto &p = places_[i]; SetCommunicationContext(op_handle, p); for (const std::string &d_name : datas) { - auto &vars = result->Get("vars")[i][d_name]; + auto &vars = result->Get(kGraphVars)[i][d_name]; PADDLE_ENFORCE(!vars.empty()); op_handle->AddInput(vars.back().get()); auto var = new VarHandle( @@ -524,7 +530,7 @@ int MultiDevSSAGraphBuilder::GetOpDeviceID(const ir::Graph &graph, int MultiDevSSAGraphBuilder::GetVarDeviceID(const ir::Graph &graph, const std::string &varname) const { - auto &sharded_var_device = graph.Get("sharded_var_device"); + auto &sharded_var_device = graph.Get(kShardedVarDevice); auto got = sharded_var_device.find(varname); return got == sharded_var_device.end() ? -1 : got->second; } @@ -544,7 +550,7 @@ void MultiDevSSAGraphBuilder::CreateScaleLossGradOp(ir::Graph *result) const { result->CreateEmptyNode("scale_loss_grad", ir::Node::Type::kOperation), local_scopes_.size(), local_scopes_[i], places_[i], communication_dev_ctx); - result->Get("ops").emplace_back(op_handle); + result->Get(kGraphOps).emplace_back(op_handle); // FIXME: Currently ScaleLossGradOp only use device_count as scale // factor. So it does not depend on any other operators. 
@@ -565,7 +571,7 @@ void MultiDevSSAGraphBuilder::CreateComputationalOps(ir::Graph *result, for (size_t scope_idx = 0; scope_idx < num_places; ++scope_idx) { auto p = places_[scope_idx]; auto s = local_scopes_[scope_idx]; - result->Get("ops").emplace_back( + result->Get(kGraphOps).emplace_back( new ComputationOpHandle(result->CreateOpNode(node->Op()), s, p)); CreateOpHandleIOs(result, node, scope_idx); } @@ -575,25 +581,25 @@ VarHandle *MultiDevSSAGraphBuilder::CreateReduceOp(ir::Graph *result, const std::string &og, int dst_dev_id) const { #ifdef PADDLE_WITH_CUDA - result->Get("ops").emplace_back(new ReduceOpHandle( + result->Get(kGraphOps).emplace_back(new ReduceOpHandle( result->CreateEmptyNode("reduce", ir::Node::Type::kOperation), local_scopes_, places_, nccl_ctxs_)); #else - result->Get("ops").emplace_back(new ReduceOpHandle( + result->Get(kGraphOps).emplace_back(new ReduceOpHandle( result->CreateEmptyNode("reduce", ir::Node::Type::kOperation), local_scopes_, places_)); #endif - auto *op_handle = result->Get("ops").back().get(); + auto *op_handle = result->Get(kGraphOps).back().get(); for (size_t i = 0; i < places_.size(); ++i) { auto &p = places_[i]; SetCommunicationContext(op_handle, p); - auto &vars = result->Get("vars")[i][og]; + auto &vars = result->Get(kGraphVars)[i][og]; PADDLE_ENFORCE(!vars.empty()); auto &prev_grad = vars.back(); op_handle->AddInput(prev_grad.get()); } - auto &vars = result->Get("vars")[dst_dev_id][og]; + auto &vars = result->Get(kGraphVars)[dst_dev_id][og]; auto var = new VarHandle(result->CreateEmptyNode(og, ir::Node::Type::kVariable), vars.size(), dst_dev_id, og, places_[dst_dev_id]); @@ -606,11 +612,11 @@ VarHandle *MultiDevSSAGraphBuilder::CreateReduceOp(ir::Graph *result, // on it. 
void MultiDevSSAGraphBuilder::ConnectOp(ir::Graph *result, OpHandleBase *op, const std::string &prev_op_name) const { - for (auto &prev_op : result->Get("ops")) { + for (auto &prev_op : result->Get(kGraphOps)) { if (prev_op->Name() == prev_op_name) { auto *dep_var = new DummyVarHandle(result->CreateControlDepVar()); prev_op->AddOutput(dep_var); - result->Get("dep_vars").emplace(dep_var); + result->Get(kGraphDepVars).emplace(dep_var); op->AddInput(dep_var); } } @@ -635,18 +641,18 @@ void MultiDevSSAGraphBuilder::CreateDistTrainOp(ir::Graph *result, if (strategy_.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce) { op_dev_id = GetAppropriateDeviceID(input_var_names); for (auto &varname : input_var_names) { - result->Get("sharded_var_device") + result->Get(kShardedVarDevice) .emplace(varname, op_dev_id); } } for (auto &varname : output_var_names) { - result->Get("sharded_var_device") + result->Get(kShardedVarDevice) .emplace(varname, op_dev_id); } } else if (node->Op()->Type() == "concat") { op_dev_id = GetVarDeviceID(*result, input_var_names[0]); for (auto &varname : output_var_names) { - result->Get("sharded_var_device") + result->Get(kShardedVarDevice) .emplace(varname, op_dev_id); } } else { @@ -661,7 +667,7 @@ void MultiDevSSAGraphBuilder::CreateDistTrainOp(ir::Graph *result, CreateComputationalOp(result, node, op_dev_id); if (node->Op()->Type() == "concat") { - ConnectOp(result, result->Get("ops").back().get(), + ConnectOp(result, result->Get(kGraphOps).back().get(), "fetch_barrier"); } } @@ -687,7 +693,7 @@ void MultiDevSSAGraphBuilder::CreateRPCOp(ir::Graph *result, } op_dev_id = GetAppropriateDeviceID(input_var_names); for (auto &varname : input_var_names) { - result->Get("sharded_var_device") + result->Get(kShardedVarDevice) .emplace(varname, op_dev_id); } } @@ -698,7 +704,7 @@ void MultiDevSSAGraphBuilder::CreateRPCOp(ir::Graph *result, } op_dev_id = GetAppropriateDeviceID(output_var_names); for (auto &varname : output_var_names) { - 
result->Get("sharded_var_device") + result->Get(kShardedVarDevice) .emplace(varname, op_dev_id); } } else { @@ -709,17 +715,17 @@ void MultiDevSSAGraphBuilder::CreateRPCOp(ir::Graph *result, PADDLE_ENFORCE(op_dev_id != -1, "can not find the right place for rpc op: %s", node->Op()->Type()); - result->Get("ops").emplace_back(new RPCOpHandle( + result->Get(kGraphOps).emplace_back(new RPCOpHandle( result->CreateOpNode(node->Op()), *node->Op(), local_scopes_[op_dev_id], node->Op()->Type(), places_[op_dev_id])); if (node->Op()->Type() == "send_barrier") { - ConnectOp(result, result->Get("ops").back().get(), "send"); + ConnectOp(result, result->Get(kGraphOps).back().get(), "send"); } else if (node->Op()->Type() == "recv") { - ConnectOp(result, result->Get("ops").back().get(), + ConnectOp(result, result->Get(kGraphOps).back().get(), "send_barrier"); } else if (node->Op()->Type() == "fetch_barrier") { - ConnectOp(result, result->Get("ops").back().get(), "recv"); + ConnectOp(result, result->Get(kGraphOps).back().get(), "recv"); } else if (node->Op()->Type() == "send") { // do nothing } else { @@ -743,4 +749,9 @@ bool MultiDevSSAGraphBuilder::IsScaleLossOp(ir::Node *node) const { } // namespace paddle REGISTER_PASS(multi_device_pass, - paddle::framework::details::MultiDevSSAGraphBuilder); + paddle::framework::details::MultiDevSSAGraphBuilder) + .RequirePassAttr(paddle::framework::details::kLossVarName) + .RequirePassAttr(paddle::framework::details::kPlaces) + .RequirePassAttr(paddle::framework::details::kParams) + .RequirePassAttr(paddle::framework::details::kLocalScopes) + .RequirePassAttr(paddle::framework::details::kStrategy); diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.h b/paddle/fluid/framework/details/multi_devices_graph_builder.h index baea091af3..099dbe5abe 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_builder.h +++ b/paddle/fluid/framework/details/multi_devices_graph_builder.h @@ -31,8 +31,8 @@ class Scope; namespace 
details { class MultiDevSSAGraphBuilder : public SSAGraphBuilder { - public: - std::unique_ptr Apply( + protected: + std::unique_ptr ApplyImpl( std::unique_ptr graph) const override; private: diff --git a/paddle/fluid/framework/details/ssa_graph_builder.cc b/paddle/fluid/framework/details/ssa_graph_builder.cc index 506e7eb35c..575532540a 100644 --- a/paddle/fluid/framework/details/ssa_graph_builder.cc +++ b/paddle/fluid/framework/details/ssa_graph_builder.cc @@ -18,7 +18,7 @@ namespace paddle { namespace framework { namespace details { void SSAGraphBuilder::PolishGraphToSupportDataHazards(ir::Graph *graph) { - for (auto &var_map : graph->Get("vars")) { + for (auto &var_map : graph->Get(kGraphVars)) { for (auto &name_pair : var_map) { if (name_pair.second.size() <= 1) { continue; @@ -50,7 +50,7 @@ void SSAGraphBuilder::PolishGraphToSupportDataHazards(ir::Graph *graph) { auto *dep_var = new DummyVarHandle(graph->CreateControlDepVar()); read_op->AddOutput(dep_var); write_op->AddInput(dep_var); - graph->Get("dep_vars").emplace(dep_var); + graph->Get(kGraphDepVars).emplace(dep_var); } } } @@ -60,7 +60,7 @@ void SSAGraphBuilder::PolishGraphToSupportDataHazards(ir::Graph *graph) { VarHandle *SSAGraphBuilder::CreateOrGetLatestVarHandle( ir::Graph *graph, ir::Node *node, const platform::Place &place, size_t place_offset) { - auto &var_holders = graph->Get("vars")[place_offset]; + auto &var_holders = graph->Get(kGraphVars)[place_offset]; auto &var_holder = var_holders[node->Name()]; VarHandle *var = nullptr; if (var_holder.empty()) { @@ -83,7 +83,8 @@ void SSAGraphBuilder::CreateOpOutput(ir::Graph *graph, OpHandleBase *op_handle, ir::Node *new_node, const platform::Place &place, size_t place_offset) { - auto &vars = graph->Get("vars")[place_offset][new_node->Name()]; + auto &vars = + graph->Get(kGraphVars)[place_offset][new_node->Name()]; size_t version = vars.size(); auto var = new VarHandle(new_node, version, place_offset, new_node->Name(), place); @@ -92,12 +93,12 @@ void 
SSAGraphBuilder::CreateOpOutput(ir::Graph *graph, OpHandleBase *op_handle, } void SSAGraphBuilder::AddOutputToLeafOps(ir::Graph *graph) { - for (auto &op : graph->Get("ops")) { + for (auto &op : graph->Get(kGraphOps)) { if (!op->Outputs().empty()) { continue; } auto *dummy_leaf = new DummyVarHandle(graph->CreateControlDepVar()); - graph->Get("dep_vars").emplace(dummy_leaf); + graph->Get(kGraphDepVars).emplace(dummy_leaf); op->AddOutput(dummy_leaf); } } diff --git a/paddle/fluid/framework/details/ssa_graph_builder.h b/paddle/fluid/framework/details/ssa_graph_builder.h index e0ad027315..53a4ad003d 100644 --- a/paddle/fluid/framework/details/ssa_graph_builder.h +++ b/paddle/fluid/framework/details/ssa_graph_builder.h @@ -39,15 +39,19 @@ namespace details { typedef std::vector< std::unordered_map>>> GraphVars; +const char kGraphVars[] = "vars"; // aux variables to represent dependency. Useful to resolve data hazard. typedef std::unordered_set> GraphDepVars; +const char kGraphDepVars[] = "dep_vars"; // all operators. NOTE that even we use a vector here, the operators is // unordered. 
typedef std::vector> GraphOps; +const char kGraphOps[] = "ops"; typedef std::unordered_map ShardedVarDevice; +const char kShardedVarDevice[] = "sharded_var_device"; class SSAGraphBuilder : public ir::Pass { public: diff --git a/paddle/fluid/framework/details/ssa_graph_checker.cc b/paddle/fluid/framework/details/ssa_graph_checker.cc index 2994329f48..b9e1cda1f2 100644 --- a/paddle/fluid/framework/details/ssa_graph_checker.cc +++ b/paddle/fluid/framework/details/ssa_graph_checker.cc @@ -33,7 +33,7 @@ bool SSAGraghBuilderWithChecker::IsValidGraph(const ir::Graph *graph) const { } }; - for (auto &var_map : graph->Get("vars")) { + for (auto &var_map : graph->Get(kGraphVars)) { for (auto &name_pair : var_map) { for (auto &version_pair : name_pair.second) { insert_pending_var(version_pair.get()); @@ -41,11 +41,11 @@ bool SSAGraghBuilderWithChecker::IsValidGraph(const ir::Graph *graph) const { } } - for (auto &var : graph->Get("dep_vars")) { + for (auto &var : graph->Get(kGraphDepVars)) { insert_pending_var(var.get()); } - for (auto &op : graph->Get("ops")) { + for (auto &op : graph->Get(kGraphOps)) { if (op->Inputs().empty()) { ready_ops.insert(op.get()); } else { @@ -87,4 +87,8 @@ bool SSAGraghBuilderWithChecker::IsValidGraph(const ir::Graph *graph) const { } // namespace paddle REGISTER_PASS(multi_device_check_pass, - paddle::framework::details::SSAGraghBuilderWithChecker); + paddle::framework::details::SSAGraghBuilderWithChecker) + .RequireGraphAttr(paddle::framework::details::kGraphVars) + .RequireGraphAttr(paddle::framework::details::kGraphDepVars) + .RequireGraphAttr(paddle::framework::details::kGraphOps) + .RequireGraphAttr(paddle::framework::details::kShardedVarDevice); diff --git a/paddle/fluid/framework/details/ssa_graph_checker.h b/paddle/fluid/framework/details/ssa_graph_checker.h index 25891cf74d..0e861ecb23 100644 --- a/paddle/fluid/framework/details/ssa_graph_checker.h +++ b/paddle/fluid/framework/details/ssa_graph_checker.h @@ -23,8 +23,8 @@ namespace 
framework { namespace details { class SSAGraghBuilderWithChecker : public SSAGraphBuilder { - public: - std::unique_ptr Apply( + protected: + std::unique_ptr ApplyImpl( std::unique_ptr graph) const override { PADDLE_ENFORCE(IsValidGraph(graph.get())); return graph; diff --git a/paddle/fluid/framework/details/ssa_graph_printer.cc b/paddle/fluid/framework/details/ssa_graph_printer.cc index 95d0641d72..ec3f31ab8d 100644 --- a/paddle/fluid/framework/details/ssa_graph_printer.cc +++ b/paddle/fluid/framework/details/ssa_graph_printer.cc @@ -22,7 +22,7 @@ namespace details { template static inline void IterAllVar(const ir::Graph &graph, Callback callback) { - for (auto &each : graph.Get("vars")) { + for (auto &each : graph.Get(kGraphVars)) { for (auto &pair1 : each) { for (auto &pair2 : pair1.second) { callback(*pair2); @@ -30,7 +30,7 @@ static inline void IterAllVar(const ir::Graph &graph, Callback callback) { } } - for (auto &var : graph.Get("dep_vars")) { + for (auto &var : graph.Get(kGraphDepVars)) { callback(*var); } } @@ -61,7 +61,7 @@ void GraphvizSSAGraphPrinter::Print(const ir::Graph &graph, }); size_t op_id = 0; - for (auto &op : graph.Get("ops")) { + for (auto &op : graph.Get(kGraphOps)) { std::string op_name = "op_" + std::to_string(op_id++); sout << op_name << " [label=\"" << op->Name() << "\", shape=rect]" << std::endl; diff --git a/paddle/fluid/framework/details/ssa_graph_printer.h b/paddle/fluid/framework/details/ssa_graph_printer.h index bd4498c061..5eafd1805c 100644 --- a/paddle/fluid/framework/details/ssa_graph_printer.h +++ b/paddle/fluid/framework/details/ssa_graph_printer.h @@ -36,8 +36,8 @@ class GraphvizSSAGraphPrinter : public SSAGraphPrinter { }; class SSAGraghBuilderWithPrinter : public SSAGraphBuilder { - public: - std::unique_ptr Apply( + protected: + std::unique_ptr ApplyImpl( std::unique_ptr graph) const override { std::unique_ptr fout( new std::ofstream(Get("debug_graphviz_path"))); diff --git 
a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc index c19f74476f..eec4050733 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc @@ -45,18 +45,18 @@ FeedFetchList ThreadedSSAGraphExecutor::Run( std::unordered_set delayed_ops; // Transform SSAGraph to pending_ops & pending_vars - for (auto &var_map : graph_->Get("vars")) { + for (auto &var_map : graph_->Get(details::kGraphVars)) { for (auto &name_pair : var_map) { for (auto &version_pair : name_pair.second) { InsertPendingVar(&pending_vars, &ready_vars, version_pair.get()); } } } - for (auto &var : graph_->Get("dep_vars")) { + for (auto &var : graph_->Get(details::kGraphDepVars)) { InsertPendingVar(&pending_vars, &ready_vars, var.get()); } - for (auto &op : graph_->Get("ops")) { + for (auto &op : graph_->Get(details::kGraphOps)) { if (op->Inputs().empty()) { // Special case, Op has no input. 
ready_ops.insert(op.get()); } else { @@ -162,7 +162,7 @@ void ThreadedSSAGraphExecutor::InsertFetchOps( std::unordered_map> fetched_vars; for (auto &fetch_var_name : fetch_tensors) { - for (auto &var_map : graph_->Get("vars")) { + for (auto &var_map : graph_->Get(details::kGraphVars)) { auto it = var_map.find(fetch_var_name); if (it != var_map.end()) { fetched_vars[fetch_var_name].push_back(it->second.rbegin()->get()); diff --git a/paddle/fluid/framework/ir/graph.h b/paddle/fluid/framework/ir/graph.h index 49f39df4b9..78094e46fb 100644 --- a/paddle/fluid/framework/ir/graph.h +++ b/paddle/fluid/framework/ir/graph.h @@ -40,10 +40,14 @@ class Graph { attr_dels_.clear(); } + bool Has(const std::string &attr_name) const { + return attrs_.find(attr_name) != attrs_.end(); + } + template AttrType &Get(const std::string &attr_name) const { - PADDLE_ENFORCE(attrs_.find(attr_name) != attrs_.end(), - "%s attr not registered for graph.", attr_name); + PADDLE_ENFORCE(Has(attr_name), "%s attr not registered for graph.", + attr_name); return *boost::any_cast(attrs_.at(attr_name)); } diff --git a/paddle/fluid/framework/ir/graph_viz_pass.cc b/paddle/fluid/framework/ir/graph_viz_pass.cc index 7d1cff7178..8cb812d138 100644 --- a/paddle/fluid/framework/ir/graph_viz_pass.cc +++ b/paddle/fluid/framework/ir/graph_viz_pass.cc @@ -20,10 +20,11 @@ limitations under the License. 
*/ namespace paddle { namespace framework { namespace ir { +static const char kGraphVizPath[] = "graph_viz_path"; -std::unique_ptr GraphVizPass::Apply( +std::unique_ptr GraphVizPass::ApplyImpl( std::unique_ptr graph) const { - const std::string graph_viz_path = Get("graph_viz_path"); + const std::string graph_viz_path = Get(kGraphVizPath); std::unique_ptr fout(new std::ofstream(graph_viz_path)); PADDLE_ENFORCE(fout->good()); std::ostream& sout = *fout; @@ -67,4 +68,5 @@ std::unique_ptr GraphVizPass::Apply( } // namespace framework } // namespace paddle -REGISTER_PASS(graph_viz_pass, paddle::framework::ir::GraphVizPass); +REGISTER_PASS(graph_viz_pass, paddle::framework::ir::GraphVizPass) + .RequirePassAttr(paddle::framework::ir::kGraphVizPath); diff --git a/paddle/fluid/framework/ir/graph_viz_pass.h b/paddle/fluid/framework/ir/graph_viz_pass.h index 04c0c35d12..1fd8c8a26e 100644 --- a/paddle/fluid/framework/ir/graph_viz_pass.h +++ b/paddle/fluid/framework/ir/graph_viz_pass.h @@ -28,8 +28,8 @@ namespace framework { namespace ir { class GraphVizPass : public Pass { - public: - std::unique_ptr Apply( + protected: + std::unique_ptr ApplyImpl( std::unique_ptr graph) const override; }; diff --git a/paddle/fluid/framework/ir/pass.cc b/paddle/fluid/framework/ir/pass.cc index 0e68ecb56f..2ebc3c7430 100644 --- a/paddle/fluid/framework/ir/pass.cc +++ b/paddle/fluid/framework/ir/pass.cc @@ -17,6 +17,22 @@ limitations under the License. */ namespace paddle { namespace framework { namespace ir { +std::unique_ptr Pass::Apply(std::unique_ptr graph) const { + for (const std::string& attr : required_pass_attrs_) { + PADDLE_ENFORCE(attrs_.find(attr) != attrs_.end(), + "Required pass atrribute %s not registered.", attr); + } + for (const std::string& attr : required_graph_attrs_) { + PADDLE_ENFORCE(graph->Has(attr), "Required graph atrribute %s not exist.", + attr); + } + auto applied_graph = ApplyImpl(std::move(graph)); + // TODO(panyx0718): Add more verifications. 
+ PADDLE_ENFORCE(!HasCircle(*applied_graph), + "Illegal Pass. Generated graph shouldn't has cycle."); + return applied_graph; +} + PassRegistry& PassRegistry::Instance() { static PassRegistry g_pass_info_map; return g_pass_info_map; diff --git a/paddle/fluid/framework/ir/pass.h b/paddle/fluid/framework/ir/pass.h index f254ef62df..3f65794fab 100644 --- a/paddle/fluid/framework/ir/pass.h +++ b/paddle/fluid/framework/ir/pass.h @@ -19,6 +19,7 @@ limitations under the License. */ #include #include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/framework/ir/node.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/platform/variant.h" @@ -26,6 +27,8 @@ limitations under the License. */ namespace paddle { namespace framework { namespace ir { +template +struct PassRegistrar; class Pass { public: @@ -40,7 +43,7 @@ class Pass { attr_dels_.clear(); } - virtual std::unique_ptr Apply(std::unique_ptr graph) const = 0; + std::unique_ptr Apply(std::unique_ptr graph) const; // Get a reference to the attributed previously set. 
template @@ -69,7 +72,25 @@ class Pass { attrs_[attr_name] = attr; } + protected: + virtual std::unique_ptr ApplyImpl( + std::unique_ptr graph) const = 0; + private: + template + friend struct PassRegistrar; + + void RegisterRequiredPassAttrs(const std::unordered_set &attrs) { + required_pass_attrs_.insert(attrs.begin(), attrs.end()); + } + + void RegisterRequiredGraphAttrs( + const std::unordered_set &attrs) { + required_graph_attrs_.insert(attrs.begin(), attrs.end()); + } + + std::unordered_set required_pass_attrs_; + std::unordered_set required_graph_attrs_; std::map attrs_; std::map> attr_dels_; }; @@ -119,10 +140,28 @@ struct PassRegistrar : public Registrar { explicit PassRegistrar(const char *pass_type) { PADDLE_ENFORCE(!PassRegistry::Instance().Has(pass_type), "'%s' is registered more than once.", pass_type); - PassRegistry::Instance().Insert(pass_type, []() -> std::unique_ptr { - return std::unique_ptr(new PassType()); - }); + PassRegistry::Instance().Insert( + pass_type, [this]() -> std::unique_ptr { + std::unique_ptr pass(new PassType()); + pass->RegisterRequiredPassAttrs(this->required_pass_attrs_); + pass->RegisterRequiredGraphAttrs(this->required_graph_attrs_); + return pass; + }); } + + PassRegistrar &RequirePassAttr(const std::string &attr) { + required_pass_attrs_.insert(attr); + return *this; + } + + PassRegistrar &RequireGraphAttr(const std::string &attr) { + required_graph_attrs_.insert(attr); + return *this; + } + + private: + std::unordered_set required_pass_attrs_; + std::unordered_set required_graph_attrs_; }; #define STATIC_ASSERT_PASS_GLOBAL_NAMESPACE(uniq_name, msg) \ @@ -132,16 +171,19 @@ struct PassRegistrar : public Registrar { msg) // Register a new pass that can be applied on the IR. 
-#define REGISTER_PASS(pass_type, pass_class) \ - STATIC_ASSERT_PASS_GLOBAL_NAMESPACE( \ - __reg_pass__##pass_type, \ - "REGISTER_PASS must be called in global namespace"); \ - static ::paddle::framework::ir::PassRegistrar \ - __pass_registrar_##pass_type##__(#pass_type); \ - int TouchPassRegistrar_##pass_type() { \ - __pass_registrar_##pass_type##__.Touch(); \ - return 0; \ - } +#define REGISTER_PASS(pass_type, pass_class) \ + STATIC_ASSERT_PASS_GLOBAL_NAMESPACE( \ + __reg_pass__##pass_type, \ + "REGISTER_PASS must be called in global namespace"); \ + static ::paddle::framework::ir::PassRegistrar \ + __pass_registrar_##pass_type##__(#pass_type); \ + int TouchPassRegistrar_##pass_type() { \ + __pass_registrar_##pass_type##__.Touch(); \ + return 0; \ + } \ + static ::paddle::framework::ir::PassRegistrar \ + &__pass_tmp_registrar_##pass_type##__ __attribute__((unused)) = \ + __pass_registrar_##pass_type##__ #define USE_PASS(pass_type) \ STATIC_ASSERT_PASS_GLOBAL_NAMESPACE( \ diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 77bed5c999..112b48ca31 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -213,7 +213,7 @@ void ParallelExecutor::BCastParamsToDevices( if (member_->executor_) { auto &sharded_var_device = member_->executor_->Graph().Get( - "sharded_var_device"); + details::kShardedVarDevice); if (sharded_var_device.find(var) != sharded_var_device.end()) { var_dev_id = sharded_var_device.at(var); } From 12e9bf6c178aa1eb3d7839a2d28fe0c2a98dbead Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Thu, 26 Jul 2018 15:05:08 +0800 Subject: [PATCH 11/40] clean up --- .../operators/distributed/send_recv.proto | 97 ------------------- .../unittests/parallel_executor_test_base.py | 1 - .../unittests/test_parallel_executor_mnist.py | 42 ++++---- 3 files changed, 19 insertions(+), 121 deletions(-) delete mode 100644 paddle/fluid/operators/distributed/send_recv.proto 
diff --git a/paddle/fluid/operators/distributed/send_recv.proto b/paddle/fluid/operators/distributed/send_recv.proto deleted file mode 100644 index d0595ef108..0000000000 --- a/paddle/fluid/operators/distributed/send_recv.proto +++ /dev/null @@ -1,97 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. Licensed under -the Apache License, Version 2.0 (the "License"); you may not use this file -except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -syntax = "proto3"; -package sendrecv; - -option cc_generic_services = false; - -service SendRecvService { - // For parameter server round-robin like hashing, do not split tensors. - // Send and recv only one tensor - // TODO(typhoonzero): add streaming API - rpc SendVariable(VariableMessage) returns (VoidMessage) {} - // Argument VariableMessage for GetVariable should only contain varname. 
- rpc GetVariable(VariableMessage) returns (VariableMessage) {} - // pre-fetch variable by given variable name and Ids - rpc PrefetchVariable(VariableMessage) returns (VariableMessage) {} - - rpc CheckpointNotify(VariableMessage) returns (VoidMessage) {} -} - -// VariableMessage is serialized paddle variable message. -// It can be: -// LoDTensor -// SelectedRows -enum VarType { - LOD_TENSOR = 0; - SELECTED_ROWS = 1; - NCCL_ID = 2; -} - -// NOTICE(gongwb):don't modify this proto if you are not -// not familar with how we serialize in sendrecvop_utils.h -// and deserilize it in variable_response.h. -message VariableMessage { - enum Type { - // Pod Types - BOOL = 0; - INT16 = 1; - INT32 = 2; - INT64 = 3; - FP16 = 4; - FP32 = 5; - FP64 = 6; - } - - message LodData { repeated int64 lod_data = 1; } - string varname = 1; - // TODO(Yancey1989): reference framework::proto::VarDesc::VarType - VarType type = 2; - // bool persistable is not needed for sending. - // tensor info: - Type data_type = 3; - repeated int64 dims = 4; - - // lod details: - int64 lod_level = 5; - repeated LodData lod = 6; - // selected_rows height, aka. original dim0 - int64 slr_height = 7; - // tensor data - bytes serialized = 8; - // selected_rows data - bytes rows = 9; - // Look up table block execution output variable name. - string out_varname = 10; - // If 1, the ps server will start profiling, the ps - // server stops profiling and generates a profile to /tmp/profile_ps_* - // when profile switches from 1 to 2. 
- int64 profile = 11; -} - -message VoidMessage {} diff --git a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py index c5e69e41be..fcf86cc583 100644 --- a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py +++ b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py @@ -71,7 +71,6 @@ class TestParallelExecutorBase(unittest.TestCase): exec_strategy.allow_op_delay = allow_op_delay build_strategy = fluid.BuildStrategy() - build_strategy.debug_graphviz_path = "/tmp/graphviz" build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce \ if use_reduce else fluid.BuildStrategy.ReduceStrategy.AllReduce diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py index d740eb5443..76389d916f 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py @@ -152,6 +152,16 @@ class TestMNIST(TestParallelExecutorBase): use_cuda=use_cuda, use_reduce=use_reduce) + def test_simple_fc(self): + # use_cuda + self.check_simple_fc_convergence(True) + self.check_simple_fc_convergence(False) + + def test_simple_fc_with_new_strategy(self): + # use_cuda, use_reduce + self._compare_reduce_and_allreduce(simple_fc_net, True) + self._compare_reduce_and_allreduce(simple_fc_net, False) + def check_simple_fc_parallel_accuracy(self, use_cuda): if use_cuda and not core.is_compiled_with_cuda(): return @@ -178,6 +188,10 @@ class TestMNIST(TestParallelExecutorBase): for p_l in parallel_last_loss: self.assertAlmostEquals(p_l, single_last_loss[0], delta=1e-6) + def test_simple_fc_parallel_accuracy(self): + self.check_simple_fc_parallel_accuracy(True) + self.check_simple_fc_parallel_accuracy(False) + def check_batchnorm_fc_convergence(self, use_cuda): if use_cuda and not 
core.is_compiled_with_cuda(): return @@ -192,31 +206,13 @@ class TestMNIST(TestParallelExecutorBase): "label": label}, use_cuda=use_cuda) - def check_batchnorm_fc_convergence_use_reduce(self, use_cuda): - if use_cuda and not core.is_compiled_with_cuda(): - return - self.check_network_convergence( - fc_with_batchnorm, use_cuda=use_cuda, use_reduce=False) - """ - img, label = self._init_data() - - all_reduce_first_loss, all_reduce_last_loss = self.check_network_convergence( - fc_with_batchnorm, - feed_dict={"image": img, - "label": label}, - use_cuda=use_cuda, - use_reduce=False) - reduce_first_loss, reduce_last_loss = self.check_network_convergence( - fc_with_batchnorm, - feed_dict={"image": img, - "label": label}, - use_cuda=use_cuda, - use_reduce=True) - """ + def test_batchnorm_fc(self): + self.check_batchnorm_fc_convergence(True) + self.check_batchnorm_fc_convergence(False) def test_batchnorm_fc_with_new_strategy(self): - self.check_batchnorm_fc_convergence_use_reduce(True) - # self.check_batchnorm_fc_convergence_use_reduce(False) + self._compare_reduce_and_allreduce(fc_with_batchnorm, True) + self._compare_reduce_and_allreduce(fc_with_batchnorm, False) if __name__ == '__main__': From 507c14304788e3cf5160db354dc23bdc9e725ff0 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Thu, 26 Jul 2018 17:27:17 +0800 Subject: [PATCH 12/40] im2col cfo cpu code clean --- paddle/fluid/operators/math/im2col.cc | 203 +------------- paddle/fluid/operators/math/im2col_cfo_cpu.h | 265 +++++++++++++++++++ 2 files changed, 270 insertions(+), 198 deletions(-) create mode 100644 paddle/fluid/operators/math/im2col_cfo_cpu.h diff --git a/paddle/fluid/operators/math/im2col.cc b/paddle/fluid/operators/math/im2col.cc index be373c99d1..478900e1c2 100644 --- a/paddle/fluid/operators/math/im2col.cc +++ b/paddle/fluid/operators/math/im2col.cc @@ -14,6 +14,7 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/math/im2col.h" #include +#include "paddle/fluid/operators/math/im2col_cfo_cpu.h" namespace paddle { namespace operators { @@ -35,210 +36,16 @@ class Im2ColFunctordims().size() == 5); - int im_channels = im.dims()[0]; - int im_height = im.dims()[1]; - int im_width = im.dims()[2]; - int filter_height = col->dims()[1]; - int filter_width = col->dims()[2]; - int output_height = col->dims()[3]; - int output_width = col->dims()[4]; - - int channels_col = im_channels * filter_height * filter_width; - - const T* im_data = im.data(); - T* col_data = col->data(); - // TODO(TJ): change me to template - // further optimize: padding == 1 need special if (stride[0] == 1 && stride[1] == 1 && dilation[0] == 1 && dilation[1] == 1) { - int col_matrix_width = output_width * output_height; - int im_size = im_height * im_width; if (padding[0] == 0 && padding[1] == 0) { - size_t copy_size = sizeof(T) * output_width; - for (int oh = 0; oh < output_height; ++oh) { - const T* im_data_start = im_data + oh * im_width; - T* dst_data = col_data + oh * output_width; - for (int ic = 0; ic < im_channels; ++ic) { - const T* src_data = im_data_start + ic * im_size; - for (int kh = 0; kh < filter_height; ++kh) { - for (int kw = 0; kw < filter_width; ++kw) { - std::memcpy(dst_data, src_data + kw, copy_size); - dst_data = dst_data + col_matrix_width; - } - src_data = src_data + im_width; - } - } - } - return; + im2col_sh1sw1dh1dw1ph0pw0(im, col); } else { - int plh = padding[0]; - int plw = padding[1]; - int prh = - (output_height - 1) * stride[0] + filter_height - im_height - plh; - int prw = - (output_width - 1) * stride[1] + filter_width - im_width - plw; - - // fill height padding : 0 ~ plh-1, (oh-prh) ~ (oh-1) - // TODO(TJ): refine ph*xxx - assert(plh == prh); // because stride_h == 1 - int col_block_fh = filter_width * col_matrix_width; // fw*oh*ow - int col_block_ic = filter_height * col_block_fh; // fh*fw*oh*ow - for (int ph = 0; ph < plh; ++ph) { - int sz = 
output_width * (plh - ph); - size_t copy_sz = sizeof(T) * sz; - T* col_start_l = col_data + ph * col_block_fh; - T* col_start_r = col_data + (filter_height - ph - 1) * col_block_fh + - col_matrix_width - sz; - for (int ic = 0; ic < im_channels; ++ic) { - T* dst_data_l = col_start_l + ic * col_block_ic; - T* dst_data_r = col_start_r + ic * col_block_ic; - for (int kw = 0; kw < filter_width; ++kw) { - std::memset(dst_data_l, 0, copy_sz); - std::memset(dst_data_r, 0, copy_sz); - dst_data_l = dst_data_l + col_matrix_width; - dst_data_r = dst_data_r + col_matrix_width; - } - } - } - - // fill width padding - assert(plw == prw); // because stride_w == 1 - if (plw == 1) { - auto pad = static_cast(0); // padding zero - for (int ic = 0; ic < im_channels; ++ic) { - // TODO(TJ): use add and resue stride - T* dst_data_ic = col_data + ic * col_block_ic; - for (int kh = 0; kh < filter_height; ++kh) { - T* dst_data_kh = dst_data_ic + kh * col_block_fh; - for (T* dst_data : - {dst_data_kh, dst_data_kh + - (filter_width - prw) * col_matrix_width + - output_width - 1}) { - // TODO(TJ): from plh, saving repeated assignment - for (int oh = 0; oh < output_height; ++oh) { - *dst_data = pad; - dst_data = dst_data + output_width; - } - } - } - } - } else { - // padding_size > 1 - for (int ic = 0; ic < im_channels; ++ic) { - // TODO(TJ): use add and resue stride - T* dst_data_ic = col_data + ic * col_block_ic; - for (int kh = 0; kh < filter_height; ++kh) { - T* dst_data_kh = dst_data_ic + kh * col_block_fh; - for (int kw = 0; kw < plw; ++kw) { - // TODO(TJ): reuse array outside this for - size_t sz = sizeof(T) * (plw - kw); - T* dst_data = dst_data_kh + kw * col_matrix_width; - // TODO(TJ): from plh, saving repeated assignment - for (int oh = 0; oh < output_height; ++oh) { - std::memset(dst_data, 0, sz); - dst_data = dst_data + output_width; - } - } - // TODO(TJ): use reverse to save cache - for (int kw = 0; kw < prw; ++kw) { - // TODO(TJ): reuse array outside this for - auto num = (prw - 
kw); - size_t sz = sizeof(T) * num; - T* dst_data = dst_data_kh + - (filter_width - 1 - kw) * col_matrix_width + - output_width - num; - // TODO(TJ): from plh, saving repeated assignment - for (int oh = 0; oh < output_height; ++oh) { - std::memset(dst_data, 0, sz); - dst_data = dst_data + output_width; - } - } - } - } - } - - // fill im_data - // padding cover two cases: - // 1. kw > 2*pw: kw = 3, pw = 1 - // 0 x x x x ... x x x x 0 - // 1 1 1 1 1 1 - // ==> - // 0 x ... x x - // x x ... x x - // x x ... x 0 - // 2. kw < 2*pw: kw = 3, pw = 2 - // 0 0 x x x ... x x x 0 0 - // 1 1 1 1 1 1 - // ==> - // 0 0 x ... x x x - // 0 x x ... x x 0 - // x x x ... x 0 0 - - // TODO(TJ): use array like: size_t copy_size[kw]={sizeof(T) * - // (output_width-1)} - // length of copy_size is equal kw. - if (plw + prw < filter_width) { - for (int oh = 0; oh < output_height; ++oh) { - const T* im_data_start = - im_data + (oh - plh > 0 ? oh - plh : 0) * im_width; - T* dst_data = col_data + oh * output_width; - for (int ic = 0; ic < im_channels; ++ic) { - const T* src_data = im_data_start + ic * im_size; - for (int kh = 0; kh < filter_height; ++kh) { - if ((oh < plh && kh < plh) || - (oh > (output_height - prh - 1) && - kh > (filter_height - prh - 1))) { - dst_data = dst_data + filter_width * col_matrix_width; - continue; - } - // TODO(TJ): reuse plw-kw outside this for - // try to unify - for (int kw = 0; kw < plw; ++kw) { - std::memcpy(dst_data + (plw - kw), src_data, - sizeof(T) * (output_width - (plw - kw))); - dst_data = dst_data + col_matrix_width; - } - for (int kw = plw; kw < filter_width - prw; ++kw) { - std::memcpy(dst_data, src_data + (kw - plw), - sizeof(T) * output_width); - dst_data = dst_data + col_matrix_width; - } - int i = 1; - for (int kw = filter_width - prw; kw < filter_width; - ++kw, ++i) { - std::memcpy(dst_data, src_data + (kw - plw), - sizeof(T) * (output_width - i)); - dst_data = dst_data + col_matrix_width; - } - src_data = src_data + im_width; - } - } - } - } 
else { - LOG(FATAL) << "Not implement yet"; - } - return; - } - } - - for (int c = 0; c < channels_col; ++c) { - int w_offset = c % filter_width; - int h_offset = (c / filter_width) % filter_height; - int c_im = c / (filter_width * filter_height); - for (int h = 0; h < output_height; ++h) { - int im_row_idx = h * stride[0] - padding[0] + h_offset * dilation[0]; - for (int w = 0; w < output_width; ++w) { - int im_col_idx = w * stride[1] - padding[1] + w_offset * dilation[1]; - int col_idx = (c * output_height + h) * output_width + w; - int im_idx = (im_row_idx + c_im * im_height) * im_width + im_col_idx; - - col_data[col_idx] = (im_row_idx < 0 || im_row_idx >= im_height || - im_col_idx < 0 || im_col_idx >= im_width) - ? static_cast(0) - : im_data[im_idx]; - } + im2col_sh1sw1dh1dw1(im, padding, col); } + return; } + im2col_common(im, dilation, stride, padding, col); } }; diff --git a/paddle/fluid/operators/math/im2col_cfo_cpu.h b/paddle/fluid/operators/math/im2col_cfo_cpu.h new file mode 100644 index 0000000000..ebdb062b7a --- /dev/null +++ b/paddle/fluid/operators/math/im2col_cfo_cpu.h @@ -0,0 +1,265 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include "paddle/fluid/framework/tensor.h" + +namespace paddle { +namespace operators { +namespace math { + +/* + * The most common im2col algorithm. + * Support dilation, stride and padding. 
+ */ +template +inline void im2col_common(const framework::Tensor& im, + const std::vector& dilation, + const std::vector& stride, + const std::vector& padding, + framework::Tensor* col) { + int im_channels = im.dims()[0]; + int im_height = im.dims()[1]; + int im_width = im.dims()[2]; + int filter_height = col->dims()[1]; + int filter_width = col->dims()[2]; + int output_height = col->dims()[3]; + int output_width = col->dims()[4]; + int channels_col = im_channels * filter_height * filter_width; + + const T* im_data = im.data(); + T* col_data = col->data(); + for (int c = 0; c < channels_col; ++c) { + int w_offset = c % filter_width; + int h_offset = (c / filter_width) % filter_height; + int c_im = c / (filter_width * filter_height); + for (int h = 0; h < output_height; ++h) { + int im_row_idx = h * stride[0] - padding[0] + h_offset * dilation[0]; + for (int w = 0; w < output_width; ++w) { + int im_col_idx = w * stride[1] - padding[1] + w_offset * dilation[1]; + int col_idx = (c * output_height + h) * output_width + w; + int im_idx = (im_row_idx + c_im * im_height) * im_width + im_col_idx; + col_data[col_idx] = (im_row_idx < 0 || im_row_idx >= im_height || + im_col_idx < 0 || im_col_idx >= im_width) + ? 
static_cast(0) + : im_data[im_idx]; + } + } + } +} + +/* + * im2col algorithm with strides == 1, dilations == 1, paddings == 0 + * */ +template +inline void im2col_sh1sw1dh1dw1ph0pw0(const framework::Tensor& im, + framework::Tensor* col) { + int im_channels = im.dims()[0]; + int im_height = im.dims()[1]; + int im_width = im.dims()[2]; + int filter_height = col->dims()[1]; + int filter_width = col->dims()[2]; + int output_height = col->dims()[3]; + int output_width = col->dims()[4]; + + const T* im_data = im.data(); + T* col_data = col->data(); + int col_matrix_width = output_width * output_height; + int im_size = im_height * im_width; + size_t copy_size = sizeof(T) * output_width; + for (int oh = 0; oh < output_height; ++oh) { + const T* im_data_start = im_data + oh * im_width; + T* dst_data = col_data + oh * output_width; + for (int ic = 0; ic < im_channels; ++ic) { + const T* src_data = im_data_start + ic * im_size; + for (int kh = 0; kh < filter_height; ++kh) { + for (int kw = 0; kw < filter_width; ++kw) { + std::memcpy(dst_data, src_data + kw, copy_size); + dst_data = dst_data + col_matrix_width; + } + src_data = src_data + im_width; + } + } + } +} + +// further optimize: padding == 1 need special +template +inline void im2col_sh1sw1dh1dw1(const framework::Tensor& im, + const std::vector& padding, + framework::Tensor* col) { + int im_channels = im.dims()[0]; + int im_height = im.dims()[1]; + int im_width = im.dims()[2]; + int filter_height = col->dims()[1]; + int filter_width = col->dims()[2]; + int output_height = col->dims()[3]; + int output_width = col->dims()[4]; + const int sh = 1; + const int sw = 1; + + const T* im_data = im.data(); + T* col_data = col->data(); + int col_matrix_width = output_width * output_height; + int im_size = im_height * im_width; + + int plh = padding[0]; + int plw = padding[1]; + int prh = (output_height - 1) * sh + filter_height - im_height - plh; + int prw = (output_width - 1) * sw + filter_width - im_width - plw; + + // fill 
height padding : 0 ~ plh-1, (oh-prh) ~ (oh-1) + // TODO(TJ): refine ph*xxx + assert(plh == prh); // because stride_h == 1 + int col_block_fh = filter_width * col_matrix_width; // fw*oh*ow + int col_block_ic = filter_height * col_block_fh; // fh*fw*oh*ow + for (int ph = 0; ph < plh; ++ph) { + int sz = output_width * (plh - ph); + size_t copy_sz = sizeof(T) * sz; + T* col_start_l = col_data + ph * col_block_fh; + T* col_start_r = col_data + (filter_height - ph - 1) * col_block_fh + + col_matrix_width - sz; + for (int ic = 0; ic < im_channels; ++ic) { + T* dst_data_l = col_start_l + ic * col_block_ic; + T* dst_data_r = col_start_r + ic * col_block_ic; + for (int kw = 0; kw < filter_width; ++kw) { + std::memset(dst_data_l, 0, copy_sz); + std::memset(dst_data_r, 0, copy_sz); + dst_data_l = dst_data_l + col_matrix_width; + dst_data_r = dst_data_r + col_matrix_width; + } + } + } + + // fill width padding + assert(plw == prw); // because stride_w == 1 + if (plw == 1) { + auto pad = static_cast(0); // padding zero + for (int ic = 0; ic < im_channels; ++ic) { + // TODO(TJ): use add and resue stride + T* dst_data_ic = col_data + ic * col_block_ic; + for (int kh = 0; kh < filter_height; ++kh) { + T* dst_data_kh = dst_data_ic + kh * col_block_fh; + for (T* dst_data : + {dst_data_kh, dst_data_kh + + (filter_width - prw) * col_matrix_width + + output_width - 1}) { + // TODO(TJ): from plh, saving repeated assignment + for (int oh = 0; oh < output_height; ++oh) { + *dst_data = pad; + dst_data = dst_data + output_width; + } + } + } + } + } else { + // padding_size > 1 + for (int ic = 0; ic < im_channels; ++ic) { + // TODO(TJ): use add and resue stride + T* dst_data_ic = col_data + ic * col_block_ic; + for (int kh = 0; kh < filter_height; ++kh) { + T* dst_data_kh = dst_data_ic + kh * col_block_fh; + for (int kw = 0; kw < plw; ++kw) { + // TODO(TJ): reuse array outside this for + size_t sz = sizeof(T) * (plw - kw); + T* dst_data = dst_data_kh + kw * col_matrix_width; + // TODO(TJ): 
from plh, saving repeated assignment + for (int oh = 0; oh < output_height; ++oh) { + std::memset(dst_data, 0, sz); + dst_data = dst_data + output_width; + } + } + // TODO(TJ): use reverse to save cache + for (int kw = 0; kw < prw; ++kw) { + // TODO(TJ): reuse array outside this for + auto num = (prw - kw); + size_t sz = sizeof(T) * num; + T* dst_data = dst_data_kh + + (filter_width - 1 - kw) * col_matrix_width + + output_width - num; + // TODO(TJ): from plh, saving repeated assignment + for (int oh = 0; oh < output_height; ++oh) { + std::memset(dst_data, 0, sz); + dst_data = dst_data + output_width; + } + } + } + } + } + + // fill im_data + // padding cover two cases: + // 1. kw > 2*pw: kw = 3, pw = 1 + // 0 x x x x ... x x x x 0 + // 1 1 1 1 1 1 + // ==> + // 0 x ... x x + // x x ... x x + // x x ... x 0 + // 2. kw < 2*pw: kw = 3, pw = 2 + // 0 0 x x x ... x x x 0 0 + // 1 1 1 1 1 1 + // ==> + // 0 0 x ... x x x + // 0 x x ... x x 0 + // x x x ... x 0 0 + + // TODO(TJ): use array like: size_t copy_size[kw]={sizeof(T) * + // (output_width-1)} + // length of copy_size is equal kw. + if (plw + prw < filter_width) { + for (int oh = 0; oh < output_height; ++oh) { + const T* im_data_start = + im_data + (oh - plh > 0 ? 
oh - plh : 0) * im_width; + T* dst_data = col_data + oh * output_width; + for (int ic = 0; ic < im_channels; ++ic) { + const T* src_data = im_data_start + ic * im_size; + for (int kh = 0; kh < filter_height; ++kh) { + if ((oh < plh && kh < plh) || (oh > (output_height - prh - 1) && + kh > (filter_height - prh - 1))) { + dst_data = dst_data + filter_width * col_matrix_width; + continue; + } + // TODO(TJ): reuse plw-kw outside this for + // try to unify + for (int kw = 0; kw < plw; ++kw) { + std::memcpy(dst_data + (plw - kw), src_data, + sizeof(T) * (output_width - (plw - kw))); + dst_data = dst_data + col_matrix_width; + } + for (int kw = plw; kw < filter_width - prw; ++kw) { + std::memcpy(dst_data, src_data + (kw - plw), + sizeof(T) * output_width); + dst_data = dst_data + col_matrix_width; + } + int i = 1; + for (int kw = filter_width - prw; kw < filter_width; ++kw, ++i) { + std::memcpy(dst_data, src_data + (kw - plw), + sizeof(T) * (output_width - i)); + dst_data = dst_data + col_matrix_width; + } + src_data = src_data + im_width; + } + } + } + } else { + LOG(FATAL) << "Not implement yet"; + } +} + +} // namespace math +} // namespace operators +} // namespace paddle From 8d6be4fb5f2ad665b5cfa06c7573bd1cc895e3fb Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Thu, 26 Jul 2018 18:08:36 +0800 Subject: [PATCH 13/40] refine im2col test and add benchmark --- paddle/fluid/operators/math/im2col_test.cc | 140 +++++++++++---------- 1 file changed, 73 insertions(+), 67 deletions(-) diff --git a/paddle/fluid/operators/math/im2col_test.cc b/paddle/fluid/operators/math/im2col_test.cc index db61f68db3..b22d21a9a3 100644 --- a/paddle/fluid/operators/math/im2col_test.cc +++ b/paddle/fluid/operators/math/im2col_test.cc @@ -14,7 +14,9 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/math/im2col.h" #include +#include #include +#include "paddle/fluid/operators/math/im2col_cfo_cpu.h" template void testIm2col() { @@ -160,82 +162,86 @@ void testIm2col() { delete context; } +TEST(math, im2col) { + testIm2col(); +#ifdef PADDLE_WITH_CUDA + testIm2col(); +#endif +} + +#define PREPARE_IM2COL_CPU \ + paddle::platform::CPUPlace place; \ + paddle::platform::CPUDeviceContext context(place); \ + paddle::framework::Tensor input; \ + paddle::framework::Tensor out; \ + paddle::framework::Tensor ref; \ + std::vector padding({ph, pw}); \ + std::vector stride({1, 1}); \ + std::vector dilation({1, 1}); \ + float* input_ptr = input.mutable_data({ic, ih, iw}, place); \ + for (int i = 0; i < input.numel(); ++i) { \ + input_ptr[i] = static_cast(i + 1); \ + } \ + int output_height = (ih - fh + padding[0] * 2) / stride[0] + 1; \ + int output_width = (iw - fw + padding[1] * 2) / stride[1] + 1; \ + out.mutable_data({ic, fh, fw, output_height, output_width}, place); \ + ref.mutable_data({ic, fh, fw, output_height, output_width}, place); \ + paddle::operators::math::Im2ColFunctor< \ + paddle::operators::math::ColFormat::kCFO, \ + paddle::platform::CPUDeviceContext, float> \ + im2col + void testIm2colCPU(int ic, int ih, int iw, int fh, int fw, int ph, int pw) { - paddle::framework::Tensor input; - paddle::framework::Tensor output; - paddle::framework::Tensor ref_output; - std::vector padding({ph, pw}); - std::vector stride({1, 1}); // stride_y, stride_x - std::vector dilation({1, 1}); // dilation_y, dilation_x - int output_height = (ih - fh + padding[0] * 2) / stride[0] + 1; - int output_width = (iw - fw + padding[1] * 2) / stride[1] + 1; - float* input_ptr = - input.mutable_data({ic, ih, iw}, paddle::platform::CPUPlace()); - for (int i = 0; i < input.numel(); ++i) { - input_ptr[i] = static_cast(i + 1); + PREPARE_IM2COL_CPU; + + im2col(context, input, dilation, stride, padding, &out); + paddle::operators::math::im2col_common(input, 
dilation, stride, + padding, &ref); + + float* ref_data = ref.data(); + float* out_data = out.data(); + for (int i = 0; i < out.numel(); ++i) { + EXPECT_EQ(out_data[i], ref_data[i]); } +} - paddle::platform::CPUPlace place; - paddle::platform::CPUDeviceContext context(place); - output.mutable_data({ic, fh, fw, output_height, output_width}, place); - ref_output.mutable_data({ic, fh, fw, output_height, output_width}, - place); - paddle::operators::math::Im2ColFunctor< - paddle::operators::math::ColFormat::kCFO, - paddle::platform::CPUDeviceContext, float> - im2col; - im2col(context, input, dilation, stride, padding, &output); - auto ref_im2col = [&]( - const paddle::framework::Tensor& im, const std::vector& dilation, - const std::vector& stride, const std::vector& padding, - paddle::framework::Tensor* col) { - int im_channels = im.dims()[0]; - int im_height = im.dims()[1]; - int im_width = im.dims()[2]; - int filter_height = col->dims()[1]; - int filter_width = col->dims()[2]; - int output_height = col->dims()[3]; - int output_width = col->dims()[4]; - int channels_col = im_channels * filter_height * filter_width; - - const float* im_data = im.data(); - float* col_data = col->data(); - for (int c = 0; c < channels_col; ++c) { - int w_offset = c % filter_width; - int h_offset = (c / filter_width) % filter_height; - int c_im = c / (filter_width * filter_height); - for (int h = 0; h < output_height; ++h) { - int im_row_idx = h * stride[0] - padding[0] + h_offset * dilation[0]; - for (int w = 0; w < output_width; ++w) { - int im_col_idx = w * stride[1] - padding[1] + w_offset * dilation[1]; - int col_idx = (c * output_height + h) * output_width + w; - int im_idx = (im_row_idx + c_im * im_height) * im_width + im_col_idx; - col_data[col_idx] = (im_row_idx < 0 || im_row_idx >= im_height || - im_col_idx < 0 || im_col_idx >= im_width) - ? 
0.f - : im_data[im_idx]; - } - } - } +void benchIm2col(int ic, int ih, int iw, int fh, int fw, int ph, int pw) { + PREPARE_IM2COL_CPU; + constexpr int repeat = 30; + auto GetCurrentMs = []() -> double { + struct timeval time; + gettimeofday(&time, NULL); + return 1e+3 * time.tv_sec + 1e-3 * time.tv_usec; }; + auto t1 = GetCurrentMs(); + for (int i = 0; i < repeat; ++i) { + im2col(context, input, dilation, stride, padding, &out); + } + auto t2 = GetCurrentMs(); - ref_im2col(input, dilation, stride, padding, &ref_output); - - float* out_cfo_ptr = output.data(); - float* out_ref_ptr = ref_output.data(); - for (int i = 0; i < output.numel(); ++i) { - EXPECT_EQ(out_cfo_ptr[i], out_ref_ptr[i]); + for (int i = 0; i < repeat; ++i) { + paddle::operators::math::im2col_common(input, dilation, stride, + padding, &ref); } + auto t3 = GetCurrentMs(); + + LOG(INFO) << "before: " << (t3 - t2) / repeat + << ",after: " << (t2 - t1) / repeat; } -TEST(math, im2col) { - testIm2col(); - testIm2colCPU(/*ic*/ 3, /*ih*/ 5, /*iw*/ 5, /*fh*/ 3, /*fw*/ 2, /*ph*/ 0, +TEST(math, im2col_cputest) { + testIm2colCPU(/*ic*/ 2, /*ih*/ 5, /*iw*/ 4, /*fh*/ 3, /*fw*/ 3, /*ph*/ 0, /*pw*/ 0); testIm2colCPU(/*ic*/ 2, /*ih*/ 5, /*iw*/ 4, /*fh*/ 3, /*fw*/ 3, /*ph*/ 1, /*pw*/ 1); -#ifdef PADDLE_WITH_CUDA - testIm2col(); -#endif + + benchIm2col(/*ic*/ 3, /*ih*/ 224, /*iw*/ 224, /*fh*/ 3, /*fw*/ 3, /*ph*/ 1, + /*pw*/ 1); + benchIm2col(/*ic*/ 3, /*ih*/ 224, /*iw*/ 224, /*fh*/ 3, /*fw*/ 3, /*ph*/ 0, + /*pw*/ 0); + benchIm2col(/*ic*/ 3, /*ih*/ 224, /*iw*/ 224, /*fh*/ 5, /*fw*/ 5, /*ph*/ 1, + /*pw*/ 1); + benchIm2col(/*ic*/ 3, /*ih*/ 224, /*iw*/ 224, /*fh*/ 5, /*fw*/ 5, /*ph*/ 0, + /*pw*/ 0); } From 3017f4607642e0f124e6a1f12f310f853a9536b5 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Thu, 26 Jul 2018 21:47:10 +0800 Subject: [PATCH 14/40] add more test cases --- paddle/fluid/operators/math/im2col_cfo_cpu.h | 4 +- paddle/fluid/operators/math/im2col_test.cc | 40 +++++++++++++++----- 2 files changed, 33 
insertions(+), 11 deletions(-) diff --git a/paddle/fluid/operators/math/im2col_cfo_cpu.h b/paddle/fluid/operators/math/im2col_cfo_cpu.h index ebdb062b7a..af581f3218 100644 --- a/paddle/fluid/operators/math/im2col_cfo_cpu.h +++ b/paddle/fluid/operators/math/im2col_cfo_cpu.h @@ -108,8 +108,8 @@ inline void im2col_sh1sw1dh1dw1(const framework::Tensor& im, int filter_width = col->dims()[2]; int output_height = col->dims()[3]; int output_width = col->dims()[4]; - const int sh = 1; - const int sw = 1; + constexpr int sh = 1; + constexpr int sw = 1; const T* im_data = im.data(); T* col_data = col->data(); diff --git a/paddle/fluid/operators/math/im2col_test.cc b/paddle/fluid/operators/math/im2col_test.cc index b22d21a9a3..7897be2ec3 100644 --- a/paddle/fluid/operators/math/im2col_test.cc +++ b/paddle/fluid/operators/math/im2col_test.cc @@ -208,7 +208,7 @@ void testIm2colCPU(int ic, int ih, int iw, int fh, int fw, int ph, int pw) { void benchIm2col(int ic, int ih, int iw, int fh, int fw, int ph, int pw) { PREPARE_IM2COL_CPU; - constexpr int repeat = 30; + constexpr int repeat = 100; auto GetCurrentMs = []() -> double { struct timeval time; gettimeofday(&time, NULL); @@ -231,17 +231,39 @@ void benchIm2col(int ic, int ih, int iw, int fh, int fw, int ph, int pw) { } TEST(math, im2col_cputest) { - testIm2colCPU(/*ic*/ 2, /*ih*/ 5, /*iw*/ 4, /*fh*/ 3, /*fw*/ 3, /*ph*/ 0, - /*pw*/ 0); - testIm2colCPU(/*ic*/ 2, /*ih*/ 5, /*iw*/ 4, /*fh*/ 3, /*fw*/ 3, /*ph*/ 1, - /*pw*/ 1); + // padding_h == padding_w + for (int p = 0; p < 4; ++p) { + // width == height + testIm2colCPU(/*ic*/ 2, /*ih*/ 5, /*iw*/ 5, /*fh*/ 4, /*fw*/ 4, /*ph*/ p, + /*pw*/ p); + testIm2colCPU(/*ic*/ 2, /*ih*/ 4, /*iw*/ 4, /*fh*/ 3, /*fw*/ 3, /*ph*/ p, + /*pw*/ p); + testIm2colCPU(/*ic*/ 2, /*ih*/ 4, /*iw*/ 4, /*fh*/ 2, /*fw*/ 2, /*ph*/ p, + /*pw*/ p); - benchIm2col(/*ic*/ 3, /*ih*/ 224, /*iw*/ 224, /*fh*/ 3, /*fw*/ 3, /*ph*/ 1, - /*pw*/ 1); + // height != width + testIm2colCPU(/*ic*/ 2, /*ih*/ 5, /*iw*/ 4, /*fh*/ 2, 
/*fw*/ 3, /*ph*/ p, + /*pw*/ p); + + // filter == 1 + testIm2colCPU(/*ic*/ 3, /*ih*/ 4, /*iw*/ 4, /*fh*/ 1, /*fw*/ 1, /*ph*/ p, + /*pw*/ p); + testIm2colCPU(/*ic*/ 3, /*ih*/ 3, /*iw*/ 4, /*fh*/ 1, /*fw*/ 1, /*ph*/ p, + /*pw*/ p); + } + // padding_h != padding_w + testIm2colCPU(/*ic*/ 2, /*ih*/ 4, /*iw*/ 4, /*fh*/ 2, /*fw*/ 3, /*ph*/ 1, + /*pw*/ 2); + + // benchmark + LOG(INFO) << "padding == 0"; benchIm2col(/*ic*/ 3, /*ih*/ 224, /*iw*/ 224, /*fh*/ 3, /*fw*/ 3, /*ph*/ 0, /*pw*/ 0); - benchIm2col(/*ic*/ 3, /*ih*/ 224, /*iw*/ 224, /*fh*/ 5, /*fw*/ 5, /*ph*/ 1, - /*pw*/ 1); benchIm2col(/*ic*/ 3, /*ih*/ 224, /*iw*/ 224, /*fh*/ 5, /*fw*/ 5, /*ph*/ 0, /*pw*/ 0); + LOG(INFO) << "padding == 1"; + benchIm2col(/*ic*/ 3, /*ih*/ 224, /*iw*/ 224, /*fh*/ 3, /*fw*/ 3, /*ph*/ 1, + /*pw*/ 1); + benchIm2col(/*ic*/ 3, /*ih*/ 224, /*iw*/ 224, /*fh*/ 5, /*fw*/ 5, /*ph*/ 1, + /*pw*/ 1); } From 52eb86e30f6739a0d5ca115507b90f48bc8b89fd Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Thu, 26 Jul 2018 23:19:54 +0800 Subject: [PATCH 15/40] refine im2col benchmark --- paddle/fluid/operators/math/im2col_test.cc | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/paddle/fluid/operators/math/im2col_test.cc b/paddle/fluid/operators/math/im2col_test.cc index 7897be2ec3..789d8e684a 100644 --- a/paddle/fluid/operators/math/im2col_test.cc +++ b/paddle/fluid/operators/math/im2col_test.cc @@ -256,14 +256,11 @@ TEST(math, im2col_cputest) { /*pw*/ 2); // benchmark - LOG(INFO) << "padding == 0"; - benchIm2col(/*ic*/ 3, /*ih*/ 224, /*iw*/ 224, /*fh*/ 3, /*fw*/ 3, /*ph*/ 0, - /*pw*/ 0); - benchIm2col(/*ic*/ 3, /*ih*/ 224, /*iw*/ 224, /*fh*/ 5, /*fw*/ 5, /*ph*/ 0, - /*pw*/ 0); - LOG(INFO) << "padding == 1"; - benchIm2col(/*ic*/ 3, /*ih*/ 224, /*iw*/ 224, /*fh*/ 3, /*fw*/ 3, /*ph*/ 1, - /*pw*/ 1); - benchIm2col(/*ic*/ 3, /*ih*/ 224, /*iw*/ 224, /*fh*/ 5, /*fw*/ 5, /*ph*/ 1, - /*pw*/ 1); + for (int p : {0, 1, 2}) { + for (int k : {3, 5}) { + LOG(INFO) << "padding == " << p << 
", filter == " << k; + benchIm2col(/*ic*/ 3, /*ih*/ 224, /*iw*/ 224, /*fh*/ k, /*fw*/ k, + /*ph*/ p, /*pw*/ p); + } + } } From 99c0c20468cb3b816a34e039a8d2f6021d907437 Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Fri, 27 Jul 2018 09:48:57 +0800 Subject: [PATCH 16/40] add pass test --- doc/fluid/design/ir/draft.md | 58 +++++++++- paddle/fluid/framework/ir/CMakeLists.txt | 8 +- paddle/fluid/framework/ir/graph.h | 3 +- paddle/fluid/framework/ir/pass.cc | 8 +- paddle/fluid/framework/ir/pass.h | 18 ++-- paddle/fluid/framework/ir/pass_test.cc | 112 ++++++++++++++++++++ paddle/fluid/framework/parallel_executor.cc | 14 ++- 7 files changed, 195 insertions(+), 26 deletions(-) create mode 100644 paddle/fluid/framework/ir/pass_test.cc diff --git a/doc/fluid/design/ir/draft.md b/doc/fluid/design/ir/draft.md index e141ce0959..c29337cba1 100644 --- a/doc/fluid/design/ir/draft.md +++ b/doc/fluid/design/ir/draft.md @@ -64,6 +64,41 @@ can also contain other things that describe some properties of the `Graph` or `Graph` nodes. `Attribute` can be passed across `Pass`. However, it should be used with care. +```cpp +class Graph { + public: + explicit Graph(const ProgramDesc &program); + + bool Has(const std::string &attr_name) const; + + template + AttrType &Get(const std::string &attr_name) const; + + template + void Set(const std::string &attr_name, AttrType *attr); + const std::unordered_set &Nodes() const; + + // Create a normal variable with non-null VarDesc. + ir::Node *CreateVarNode(VarDesc *var_desc); + + // Create a normal runnable operator with OpDesc. + ir::Node *CreateOpNode(OpDesc *op_desc); + + // Create a control dependency var that connects 2 operations. The + // var doesn't hold any data. Other than that, it's no different from + // other var, considering dependency analysis. + ir::Node *CreateControlDepVar(); + + // A more free style way of creating a graph node. Mostly use for test + // or "copy" from another node. Avoid using it if possible. 
+ ir::Node *CreateEmptyNode(const std::string &name, ir::Node::Type type); + + // Clear all node information of the graph and return the ownership of the + // nodes. + std::vector> ReleaseNodes(); +}; +``` + #### Pass `Pass` represents a transformation of `Graph`. Its input @@ -101,13 +136,15 @@ class Pass { // In my_pass.cc class MyPass : public Pass { - public: - std::unique_ptr Apply(std::unique_ptr graph) const override { + protected: + std::unique_ptr ApplyImpl(std::unique_ptr graph) const override { // do something. return graph; } } -REGISTER_PASS(my_pass, MyPass); +REGISTER_PASS(my_pass, MyPass) +.RequirePassAttr("places") +.RequireGraphAttr("dep_vars"); // To use the pass. @@ -132,4 +169,17 @@ maintaining the original modeling logic. * Graph is transformed from raw model logic to a form that is efficient to execute. -Program->ProgramToGraph->Graph->Pass1->Graph->Pass2->Graph->Pass3->Graph->Executor +``` +// Program->ProgramToGraph->Graph->Pass1->Graph->Pass2->Graph->Pass3->Graph->Executor +auto graph = Graph(program); +graph = PassRegistry::Instance().Get("op_fuse_pass").Apply(std::move(grah)); +// For more complex Pass, Optimize Process can provide Pass attributes. 
+auto mem_opt_pass = PassRegistry::Instance().Get("memory_optimization_pass"); +mem_opt_pass.SetNotOwned("optimize_level", 1); +mem_opt_pass->Apply(std::move(graph)); +graph = PassRegistry::Instance().Get("multi_device_pass").Apply(std::move(grah)); +graph = PassRegistry::Instance().Get("multi_device_check_pass").Apply(std::move(grah)); +Executor exe; +exe.Run(graph); + +``` diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index a6bdd12b63..bf7d76a8a6 100644 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -1,7 +1,9 @@ cc_library(node SRCS node.cc DEPS proto_desc) cc_library(graph SRCS graph.cc DEPS node) cc_library(graph_helper SRCS graph_helper.cc DEPS graph) -cc_library(pass SRCS pass.cc DEPS graph node) +cc_library(pass SRCS pass.cc DEPS graph node graph_helper) cc_library(graph_viz_pass SRCS graph_viz_pass.cc DEPS graph pass graph_helper) -cc_test(graph_test SRCS graph_test.cc DEPS graph op_registry) -cc_test(graph_helper_test SRCS graph_helper_test.cc DEPS graph_helper op_registry) + +cc_test(pass_test SRCS pass_test.cc DEPS graph pass graph_helper) +cc_test(graph_test SRCS graph_test.cc DEPS graph graph_helper op_registry) +cc_test(graph_helper_test SRCS graph_helper_test.cc DEPS graph graph_helper op_registry) diff --git a/paddle/fluid/framework/ir/graph.h b/paddle/fluid/framework/ir/graph.h index 78094e46fb..c9d55fbf52 100644 --- a/paddle/fluid/framework/ir/graph.h +++ b/paddle/fluid/framework/ir/graph.h @@ -53,7 +53,8 @@ class Graph { template void Set(const std::string &attr_name, AttrType *attr) { - PADDLE_ENFORCE(attrs_.count(attr_name) == 0); + PADDLE_ENFORCE(attrs_.count(attr_name) == 0, "%s already set in the graph", + attr_name); attrs_[attr_name] = attr; attr_dels_[attr_name] = [attr, attr_name]() { VLOG(3) << "deleting " << attr_name; diff --git a/paddle/fluid/framework/ir/pass.cc b/paddle/fluid/framework/ir/pass.cc index 2ebc3c7430..d7158eba62 100644 
--- a/paddle/fluid/framework/ir/pass.cc +++ b/paddle/fluid/framework/ir/pass.cc @@ -13,23 +13,27 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/ir/pass.h" +#include "paddle/fluid/framework/ir/graph_helper.h" namespace paddle { namespace framework { namespace ir { std::unique_ptr Pass::Apply(std::unique_ptr graph) const { + PADDLE_ENFORCE(!applied_, "Pass can only Apply() once."); + PADDLE_ENFORCE(graph.get(), "graph passed to Pass::Apply() cannot be empty."); for (const std::string& attr : required_pass_attrs_) { PADDLE_ENFORCE(attrs_.find(attr) != attrs_.end(), - "Required pass atrribute %s not registered.", attr); + "Required pass atrribute %s not set.", attr); } for (const std::string& attr : required_graph_attrs_) { - PADDLE_ENFORCE(graph->Has(attr), "Required graph atrribute %s not exist.", + PADDLE_ENFORCE(graph->Has(attr), "Required graph atrribute %s not set.", attr); } auto applied_graph = ApplyImpl(std::move(graph)); // TODO(panyx0718): Add more verifications. PADDLE_ENFORCE(!HasCircle(*applied_graph), "Illegal Pass. Generated graph shouldn't has cycle."); + applied_ = true; return applied_graph; } diff --git a/paddle/fluid/framework/ir/pass.h b/paddle/fluid/framework/ir/pass.h index 3f65794fab..0f14083d25 100644 --- a/paddle/fluid/framework/ir/pass.h +++ b/paddle/fluid/framework/ir/pass.h @@ -19,7 +19,6 @@ limitations under the License. */ #include #include "paddle/fluid/framework/ir/graph.h" -#include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/framework/ir/node.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/platform/variant.h" @@ -56,7 +55,8 @@ class Pass { // Set a pointer to the attribute. Pass takes ownership of the attribute. 
template void Set(const std::string &attr_name, AttrType *attr) { - PADDLE_ENFORCE(attrs_.count(attr_name) == 0); + PADDLE_ENFORCE(attrs_.count(attr_name) == 0, "%s already set in the pass", + attr_name); attrs_[attr_name] = attr; attr_dels_[attr_name] = [attr, attr_name]() { VLOG(3) << "deleting " << attr_name; @@ -89,6 +89,7 @@ class Pass { required_graph_attrs_.insert(attrs.begin(), attrs.end()); } + mutable bool applied_{false}; std::unordered_set required_pass_attrs_; std::unordered_set required_graph_attrs_; std::map attrs_; @@ -118,14 +119,15 @@ class PassRegistry { return map_.find(pass_type) != map_.end(); } - void Insert(const std::string &type, const PassCreator &pass_creator) { - PADDLE_ENFORCE(!Has(type), "Pass %s has been registered", type); - map_.insert({type, pass_creator}); + void Insert(const std::string &pass_type, const PassCreator &pass_creator) { + PADDLE_ENFORCE(!Has(pass_type), "Pass %s has been registered", pass_type); + map_.insert({pass_type, pass_creator}); } - std::unique_ptr Get(const std::string &type) const { - PADDLE_ENFORCE(Has(type), "Pass %s has not been registered", type); - return map_.at(type)(); + std::unique_ptr Get(const std::string &pass_type) const { + PADDLE_ENFORCE(Has(pass_type), "Pass %s has not been registered", + pass_type); + return map_.at(pass_type)(); } private: diff --git a/paddle/fluid/framework/ir/pass_test.cc b/paddle/fluid/framework/ir/pass_test.cc new file mode 100644 index 0000000000..5b5011412e --- /dev/null +++ b/paddle/fluid/framework/ir/pass_test.cc @@ -0,0 +1,112 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/ir/pass.h" +#include +#include "gtest/gtest.h" +#include "paddle/fluid/framework/ir/graph.h" + +namespace paddle { +namespace framework { +namespace ir { +void BuildCircleGraph(Graph* g) { + ir::Node* o1 = g->CreateEmptyNode("op1", Node::Type::kOperation); + ir::Node* o2 = g->CreateEmptyNode("op2", Node::Type::kOperation); + ir::Node* v1 = g->CreateEmptyNode("var1", Node::Type::kVariable); + ir::Node* v2 = g->CreateEmptyNode("var2", Node::Type::kVariable); + + o1->outputs.push_back(v1); + o2->inputs.push_back(v1); + v1->inputs.push_back(o1); + v1->outputs.push_back(o2); + + o2->outputs.push_back(v2); + o1->inputs.push_back(v2); + v2->inputs.push_back(o2); + v2->outputs.push_back(o1); +} + +class TestPass : public Pass { + protected: + std::unique_ptr ApplyImpl(std::unique_ptr graph) const { + graph->Set("copy_test_pass_attr", new int); + graph->Set("copy_test_graph_attr", new int); + + int test_pass_attr = this->Get("test_pass_attr"); + graph->Get("copy_test_pass_attr") = test_pass_attr + 1; + + int test_graph_attr = graph->Get("test_graph_attr"); + graph->Get("copy_test_graph_attr") = test_graph_attr + 1; + return graph; + } +}; + +TEST(PassTest, TestPassAttrCheck) { + ProgramDesc prog; + auto pass = PassRegistry::Instance().Get("test_pass"); + std::unique_ptr graph(new Graph(prog)); + std::string exception; + try { + graph = pass->Apply(std::move(graph)); + } catch (paddle::platform::EnforceNotMet e) { + exception = std::string(e.what()); + } + ASSERT_TRUE(exception.find("test_pass_attr not set") != exception.npos); + + int val = 
1; + graph.reset(new Graph(prog)); + pass->SetNotOwned("test_pass_attr", &val); + + try { + graph = pass->Apply(std::move(graph)); + } catch (paddle::platform::EnforceNotMet e) { + exception = std::string(e.what()); + } + ASSERT_TRUE(exception.find("test_graph_attr not set") != exception.npos); + + graph.reset(new Graph(prog)); + graph->Set("test_graph_attr", new int); + graph->Get("test_graph_attr") = 1; + graph = pass->Apply(std::move(graph)); + ASSERT_EQ(graph->Get("copy_test_pass_attr"), 2); + ASSERT_EQ(graph->Get("copy_test_graph_attr"), 2); + + try { + graph = pass->Apply(std::move(graph)); + } catch (paddle::platform::EnforceNotMet e) { + exception = std::string(e.what()); + } + ASSERT_TRUE(exception.find("Pass can only Apply() once") != exception.npos); + + pass = PassRegistry::Instance().Get("test_pass"); + pass->SetNotOwned("test_pass_attr", &val); + graph.reset(new Graph(prog)); + BuildCircleGraph(graph.get()); + graph->Set("test_graph_attr", new int); + graph->Get("test_graph_attr") = 2; + try { + auto tmp = pass->Apply(std::move(graph)); + } catch (paddle::platform::EnforceNotMet e) { + exception = std::string(e.what()); + } + ASSERT_TRUE(exception.find("shouldn't has cycle") != exception.npos); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(test_pass, paddle::framework::ir::TestPass) + .RequirePassAttr("test_pass_attr") + .RequireGraphAttr("test_graph_attr"); diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 112b48ca31..b5f01a9a2b 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -44,7 +44,10 @@ std::unique_ptr ApplyParallelExecutorPass( #else const BuildStrategy &strategy) { #endif + // Convert the program to graph. std::unique_ptr graph(new ir::Graph(main_program)); + + // Apply a graph viz pass to record a graph. 
if (!strategy.debug_graphviz_path_.empty()) { auto viz_pass = ir::PassRegistry::Instance().Get("graph_viz_pass"); const std::string graph_path = string::Sprintf( @@ -53,6 +56,7 @@ std::unique_ptr ApplyParallelExecutorPass( graph = viz_pass->Apply(std::move(graph)); } + // Convert graph to run on multi-devices. auto multi_device_pass = ir::PassRegistry::Instance().Get("multi_device_pass"); multi_device_pass->SetNotOwned>("places", @@ -71,6 +75,7 @@ std::unique_ptr ApplyParallelExecutorPass( #endif graph = multi_device_pass->Apply(std::move(graph)); + // Apply a graph print pass to record a graph with device info. if (!strategy.debug_graphviz_path_.empty()) { auto multi_device_print_pass = ir::PassRegistry::Instance().Get("multi_device_print_pass"); @@ -81,17 +86,10 @@ std::unique_ptr ApplyParallelExecutorPass( graph = multi_device_print_pass->Apply(std::move(graph)); } + // Verify that the graph is correct for multi-device executor. auto multi_device_check_pass = ir::PassRegistry::Instance().Get("multi_device_check_pass"); graph = multi_device_check_pass->Apply(std::move(graph)); - - if (!strategy.debug_graphviz_path_.empty()) { - auto viz_pass = ir::PassRegistry::Instance().Get("graph_viz_pass"); - const std::string graph_path = string::Sprintf( - "%s%s", strategy.debug_graphviz_path_.c_str(), "_before_exec"); - viz_pass->Set("graph_viz_path", new std::string(graph_path)); - graph = viz_pass->Apply(std::move(graph)); - } return graph; } From 65d418f060507999d74c7adca0575e8b991e60b4 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Fri, 27 Jul 2018 15:27:08 +0800 Subject: [PATCH 17/40] complete im2col with padding==1 and speedup filter width==1 --- paddle/fluid/operators/math/im2col.cc | 8 +- paddle/fluid/operators/math/im2col_cfo_cpu.h | 218 +++++++++---------- paddle/fluid/operators/math/im2col_test.cc | 12 +- 3 files changed, 113 insertions(+), 125 deletions(-) diff --git a/paddle/fluid/operators/math/im2col.cc b/paddle/fluid/operators/math/im2col.cc index 
478900e1c2..1472edbbf4 100644 --- a/paddle/fluid/operators/math/im2col.cc +++ b/paddle/fluid/operators/math/im2col.cc @@ -40,10 +40,12 @@ class Im2ColFunctor(im, col); - } else { - im2col_sh1sw1dh1dw1(im, padding, col); + return; + } else if (padding[0] == 1 && padding[1] == 1) { + im2col_sh1sw1dh1dw1ph1pw1(im, col); + return; } - return; + // TODO(TJ): complete padding >=2 } im2col_common(im, dilation, stride, padding, col); } diff --git a/paddle/fluid/operators/math/im2col_cfo_cpu.h b/paddle/fluid/operators/math/im2col_cfo_cpu.h index af581f3218..ac843cdc71 100644 --- a/paddle/fluid/operators/math/im2col_cfo_cpu.h +++ b/paddle/fluid/operators/math/im2col_cfo_cpu.h @@ -21,7 +21,7 @@ namespace paddle { namespace operators { namespace math { -/* +/** * The most common im2col algorithm. * Support dilation, stride and padding. */ @@ -61,9 +61,9 @@ inline void im2col_common(const framework::Tensor& im, } } -/* +/** * im2col algorithm with strides == 1, dilations == 1, paddings == 0 - * */ + */ template inline void im2col_sh1sw1dh1dw1ph0pw0(const framework::Tensor& im, framework::Tensor* col) { @@ -96,11 +96,13 @@ inline void im2col_sh1sw1dh1dw1ph0pw0(const framework::Tensor& im, } } -// further optimize: padding == 1 need special +/** + * im2col algorithm with strides == 1, dilations == 1, paddings == 1 + * and filter_width == 1 have a special implementation + */ template -inline void im2col_sh1sw1dh1dw1(const framework::Tensor& im, - const std::vector& padding, - framework::Tensor* col) { +inline void im2col_sh1sw1dh1dw1ph1pw1(const framework::Tensor& im, + framework::Tensor* col) { int im_channels = im.dims()[0]; int im_height = im.dims()[1]; int im_width = im.dims()[2]; @@ -108,119 +110,57 @@ inline void im2col_sh1sw1dh1dw1(const framework::Tensor& im, int filter_width = col->dims()[2]; int output_height = col->dims()[3]; int output_width = col->dims()[4]; - constexpr int sh = 1; - constexpr int sw = 1; + + constexpr int plh = 1; + constexpr int prh = 1; + constexpr 
int plw = 1; + constexpr int prw = 1; const T* im_data = im.data(); T* col_data = col->data(); - int col_matrix_width = output_width * output_height; int im_size = im_height * im_width; - - int plh = padding[0]; - int plw = padding[1]; - int prh = (output_height - 1) * sh + filter_height - im_height - plh; - int prw = (output_width - 1) * sw + filter_width - im_width - plw; - - // fill height padding : 0 ~ plh-1, (oh-prh) ~ (oh-1) - // TODO(TJ): refine ph*xxx - assert(plh == prh); // because stride_h == 1 + int col_matrix_width = output_width * output_height; int col_block_fh = filter_width * col_matrix_width; // fw*oh*ow int col_block_ic = filter_height * col_block_fh; // fh*fw*oh*ow - for (int ph = 0; ph < plh; ++ph) { - int sz = output_width * (plh - ph); - size_t copy_sz = sizeof(T) * sz; - T* col_start_l = col_data + ph * col_block_fh; - T* col_start_r = col_data + (filter_height - ph - 1) * col_block_fh + - col_matrix_width - sz; + + // fill height padding + { + size_t copy_size = sizeof(T) * output_width; + T* col_start_l = col_data; + T* col_start_r = col_data + (filter_height - 1) * col_block_fh + + col_matrix_width - output_width; for (int ic = 0; ic < im_channels; ++ic) { + // TODO(TJ): move * outside T* dst_data_l = col_start_l + ic * col_block_ic; T* dst_data_r = col_start_r + ic * col_block_ic; for (int kw = 0; kw < filter_width; ++kw) { - std::memset(dst_data_l, 0, copy_sz); - std::memset(dst_data_r, 0, copy_sz); + std::memset(dst_data_l, 0, copy_size); + std::memset(dst_data_r, 0, copy_size); dst_data_l = dst_data_l + col_matrix_width; dst_data_r = dst_data_r + col_matrix_width; } } } - // fill width padding - assert(plw == prw); // because stride_w == 1 - if (plw == 1) { - auto pad = static_cast(0); // padding zero + auto pad = static_cast(0); + if (filter_width == 1) { + // fill width padding for (int ic = 0; ic < im_channels; ++ic) { - // TODO(TJ): use add and resue stride + // TODO(TJ): move * outside T* dst_data_ic = col_data + ic * 
col_block_ic; for (int kh = 0; kh < filter_height; ++kh) { - T* dst_data_kh = dst_data_ic + kh * col_block_fh; - for (T* dst_data : - {dst_data_kh, dst_data_kh + - (filter_width - prw) * col_matrix_width + - output_width - 1}) { - // TODO(TJ): from plh, saving repeated assignment - for (int oh = 0; oh < output_height; ++oh) { - *dst_data = pad; - dst_data = dst_data + output_width; - } + // TODO(TJ): move * outside + T* dst_data = dst_data_ic + kh * col_block_fh; + for (int oh = 0; oh < output_height; ++oh) { + *dst_data = pad; + dst_data = dst_data + output_width - 1; + *dst_data = pad; + ++dst_data; } } } - } else { - // padding_size > 1 - for (int ic = 0; ic < im_channels; ++ic) { - // TODO(TJ): use add and resue stride - T* dst_data_ic = col_data + ic * col_block_ic; - for (int kh = 0; kh < filter_height; ++kh) { - T* dst_data_kh = dst_data_ic + kh * col_block_fh; - for (int kw = 0; kw < plw; ++kw) { - // TODO(TJ): reuse array outside this for - size_t sz = sizeof(T) * (plw - kw); - T* dst_data = dst_data_kh + kw * col_matrix_width; - // TODO(TJ): from plh, saving repeated assignment - for (int oh = 0; oh < output_height; ++oh) { - std::memset(dst_data, 0, sz); - dst_data = dst_data + output_width; - } - } - // TODO(TJ): use reverse to save cache - for (int kw = 0; kw < prw; ++kw) { - // TODO(TJ): reuse array outside this for - auto num = (prw - kw); - size_t sz = sizeof(T) * num; - T* dst_data = dst_data_kh + - (filter_width - 1 - kw) * col_matrix_width + - output_width - num; - // TODO(TJ): from plh, saving repeated assignment - for (int oh = 0; oh < output_height; ++oh) { - std::memset(dst_data, 0, sz); - dst_data = dst_data + output_width; - } - } - } - } - } - - // fill im_data - // padding cover two cases: - // 1. kw > 2*pw: kw = 3, pw = 1 - // 0 x x x x ... x x x x 0 - // 1 1 1 1 1 1 - // ==> - // 0 x ... x x - // x x ... x x - // x x ... x 0 - // 2. kw < 2*pw: kw = 3, pw = 2 - // 0 0 x x x ... x x x 0 0 - // 1 1 1 1 1 1 - // ==> - // 0 0 x ... 
x x x - // 0 x x ... x x 0 - // x x x ... x 0 0 - - // TODO(TJ): use array like: size_t copy_size[kw]={sizeof(T) * - // (output_width-1)} - // length of copy_size is equal kw. - if (plw + prw < filter_width) { + // fill core + size_t copy_size = sizeof(T) * (output_width - plw - prw); for (int oh = 0; oh < output_height; ++oh) { const T* im_data_start = im_data + (oh - plh > 0 ? oh - plh : 0) * im_width; @@ -230,33 +170,73 @@ inline void im2col_sh1sw1dh1dw1(const framework::Tensor& im, for (int kh = 0; kh < filter_height; ++kh) { if ((oh < plh && kh < plh) || (oh > (output_height - prh - 1) && kh > (filter_height - prh - 1))) { - dst_data = dst_data + filter_width * col_matrix_width; - continue; - } - // TODO(TJ): reuse plw-kw outside this for - // try to unify - for (int kw = 0; kw < plw; ++kw) { - std::memcpy(dst_data + (plw - kw), src_data, - sizeof(T) * (output_width - (plw - kw))); - dst_data = dst_data + col_matrix_width; - } - for (int kw = plw; kw < filter_width - prw; ++kw) { - std::memcpy(dst_data, src_data + (kw - plw), - sizeof(T) * output_width); - dst_data = dst_data + col_matrix_width; - } - int i = 1; - for (int kw = filter_width - prw; kw < filter_width; ++kw, ++i) { - std::memcpy(dst_data, src_data + (kw - plw), - sizeof(T) * (output_width - i)); dst_data = dst_data + col_matrix_width; + continue; } + std::memcpy(dst_data + plw, src_data, copy_size); + dst_data = dst_data + col_matrix_width; src_data = src_data + im_width; } } } - } else { - LOG(FATAL) << "Not implement yet"; + return; + } + + // filter_width != 1 + // fill width padding + for (int ic = 0; ic < im_channels; ++ic) { + // TODO(TJ): move * outside + T* dst_data_ic = col_data + ic * col_block_ic; + for (int kh = 0; kh < filter_height; ++kh) { + // TODO(TJ): move * outside + T* dst_data_kh = dst_data_ic + kh * col_block_fh; + for (T* dst_data : + {dst_data_kh, dst_data_kh + (filter_width - prw) * col_matrix_width + + output_width - 1}) { + // TODO(TJ): from plh, saving repeated 
assignment + for (int oh = 0; oh < output_height; ++oh) { + *dst_data = pad; + dst_data = dst_data + output_width; + } + } + } + } + + // TODO(TJ): use array like: size_t copy_size[kw]={sizeof(T) * + // (output_width-1)} + // length of copy_size is equal kw. + for (int oh = 0; oh < output_height; ++oh) { + const T* im_data_start = im_data + (oh - plh > 0 ? oh - plh : 0) * im_width; + T* dst_data = col_data + oh * output_width; + for (int ic = 0; ic < im_channels; ++ic) { + const T* src_data = im_data_start + ic * im_size; + for (int kh = 0; kh < filter_height; ++kh) { + if ((oh < plh && kh < plh) || (oh > (output_height - prh - 1) && + kh > (filter_height - prh - 1))) { + dst_data = dst_data + filter_width * col_matrix_width; + continue; + } + // TODO(TJ): reuse plw-kw outside this for + // try to unify + for (int kw = 0; kw < plw; ++kw) { + std::memcpy(dst_data + (plw - kw), src_data, + sizeof(T) * (output_width - (plw - kw))); + dst_data = dst_data + col_matrix_width; + } + for (int kw = plw; kw < filter_width - prw; ++kw) { + std::memcpy(dst_data, src_data + (kw - plw), + sizeof(T) * output_width); + dst_data = dst_data + col_matrix_width; + } + int i = 1; + for (int kw = filter_width - prw; kw < filter_width; ++kw, ++i) { + std::memcpy(dst_data, src_data + (kw - plw), + sizeof(T) * (output_width - i)); + dst_data = dst_data + col_matrix_width; + } + src_data = src_data + im_width; + } + } } } diff --git a/paddle/fluid/operators/math/im2col_test.cc b/paddle/fluid/operators/math/im2col_test.cc index 789d8e684a..ae2c90b33a 100644 --- a/paddle/fluid/operators/math/im2col_test.cc +++ b/paddle/fluid/operators/math/im2col_test.cc @@ -227,7 +227,8 @@ void benchIm2col(int ic, int ih, int iw, int fh, int fw, int ph, int pw) { auto t3 = GetCurrentMs(); LOG(INFO) << "before: " << (t3 - t2) / repeat - << ",after: " << (t2 - t1) / repeat; + << ",after: " << (t2 - t1) / repeat + << ",boost: " << ((t3 - t2) / (t2 - t1) - 1) * 100 << "%"; } TEST(math, im2col_cputest) { @@ 
-244,6 +245,10 @@ TEST(math, im2col_cputest) { // height != width testIm2colCPU(/*ic*/ 2, /*ih*/ 5, /*iw*/ 4, /*fh*/ 2, /*fw*/ 3, /*ph*/ p, /*pw*/ p); + testIm2colCPU(/*ic*/ 2, /*ih*/ 5, /*iw*/ 4, /*fh*/ 1, /*fw*/ 3, /*ph*/ p, + /*pw*/ p); + testIm2colCPU(/*ic*/ 2, /*ih*/ 4, /*iw*/ 5, /*fh*/ 3, /*fw*/ 1, /*ph*/ p, + /*pw*/ p); // filter == 1 testIm2colCPU(/*ic*/ 3, /*ih*/ 4, /*iw*/ 4, /*fh*/ 1, /*fw*/ 1, /*ph*/ p, @@ -251,13 +256,14 @@ TEST(math, im2col_cputest) { testIm2colCPU(/*ic*/ 3, /*ih*/ 3, /*iw*/ 4, /*fh*/ 1, /*fw*/ 1, /*ph*/ p, /*pw*/ p); } + // padding_h != padding_w testIm2colCPU(/*ic*/ 2, /*ih*/ 4, /*iw*/ 4, /*fh*/ 2, /*fw*/ 3, /*ph*/ 1, /*pw*/ 2); // benchmark - for (int p : {0, 1, 2}) { - for (int k : {3, 5}) { + for (int p : {0, 1}) { + for (int k : {1, 3, 5}) { LOG(INFO) << "padding == " << p << ", filter == " << k; benchIm2col(/*ic*/ 3, /*ih*/ 224, /*iw*/ 224, /*fh*/ k, /*fw*/ k, /*ph*/ p, /*pw*/ p); From 2aa732ba34e4456242a597c3f3701c2c47e09b78 Mon Sep 17 00:00:00 2001 From: qiaolongfei Date: Sun, 29 Jul 2018 17:23:59 +0800 Subject: [PATCH 18/40] fix distribute transpiler to optimize selected rows --- paddle/fluid/framework/operator.cc | 2 ++ python/paddle/fluid/transpiler/distribute_transpiler.py | 1 + 2 files changed, 3 insertions(+) diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index d1dc5fcd97..7c1c29fd9a 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -679,6 +679,8 @@ void OperatorWithKernel::RunImpl(const Scope& scope, if (var == nullptr) continue; if (var->IsType()) { CheckTensorNANOrInf(vname, var->Get()); + } else if (var->IsType()) { + CheckTensorNANOrInf(vname, var->Get().value()); } } } diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py index 4a9ea6af74..4b03bad0dd 100644 --- a/python/paddle/fluid/transpiler/distribute_transpiler.py +++ 
b/python/paddle/fluid/transpiler/distribute_transpiler.py @@ -494,6 +494,7 @@ class DistributeTranspiler(object): pserver_index = self.pserver_endpoints.index(endpoint) table_opt_block = self._create_table_optimize_block( pserver_index, pserver_program, pre_block_idx, grad_to_block_id) + optimize_blocks.append(table_opt_block) prefetch_var_name_to_block_id = self._create_prefetch_block( pserver_index, pserver_program, table_opt_block) checkpoint_block_id = self._create_checkpoint_save_block( From 91f63cd40128dc74e1fe37e0ffaa072af22c10bb Mon Sep 17 00:00:00 2001 From: qiaolongfei Date: Sun, 29 Jul 2018 19:59:12 +0800 Subject: [PATCH 19/40] fix split_ids_op and add unit test --- paddle/fluid/operators/split_ids_op.h | 12 ++-- .../tests/unittests/test_split_ids_op.py | 60 +++++++++++++++++++ 2 files changed, 68 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/operators/split_ids_op.h b/paddle/fluid/operators/split_ids_op.h index d263426e07..86a3eaa5c4 100644 --- a/paddle/fluid/operators/split_ids_op.h +++ b/paddle/fluid/operators/split_ids_op.h @@ -14,6 +14,7 @@ limitations under the License. 
*/ #pragma once +#include #include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/selected_rows_functor.h" @@ -68,9 +69,11 @@ class SplitIdsOpKernel : public framework::OpKernel { auto outs = ctx.MultiOutput("Out"); const size_t shard_num = outs.size(); // get rows for outputs - for (auto &id : ids_rows) { - size_t shard_id = static_cast(id) % shard_num; - outs[shard_id]->mutable_rows()->push_back(id); + std::map id_to_index; + for (size_t i = 0; i < ids_rows.size(); ++i) { + id_to_index[ids_rows[i]] = i; + size_t shard_id = static_cast(ids_rows[i]) % shard_num; + outs[shard_id]->mutable_rows()->push_back(ids_rows[i]); } int64_t row_width = ids_dims[1]; @@ -80,7 +83,8 @@ class SplitIdsOpKernel : public framework::OpKernel { {static_cast(out->rows().size()), row_width}); T *output = out->mutable_value()->mutable_data(ddim, place); for (int64_t i = 0; i < ddim[0]; ++i) { - memcpy(output + i * row_width, ids + out->rows()[i] * row_width, + memcpy(output + i * row_width, + ids + id_to_index[out->rows()[i]] * row_width, row_width * sizeof(T)); } } diff --git a/python/paddle/fluid/tests/unittests/test_split_ids_op.py b/python/paddle/fluid/tests/unittests/test_split_ids_op.py index e9f0a06a56..adf3345f1d 100644 --- a/python/paddle/fluid/tests/unittests/test_split_ids_op.py +++ b/python/paddle/fluid/tests/unittests/test_split_ids_op.py @@ -15,6 +15,8 @@ import unittest import numpy as np from op_test import OpTest +import paddle.fluid.core as core +from paddle.fluid.op import Operator class TestSplitIdsOp(OpTest): @@ -31,5 +33,63 @@ class TestSplitIdsOp(OpTest): self.check_output() +class TestSpliteIds(unittest.TestCase): + def get_places(self): + places = [core.CPUPlace()] + return places + + def test_check_output(self): + for place in self.get_places(): + self.check_with_place(place) + + def check_with_place(self, place): + scope = core.Scope() + rows = [0, 5, 7, 4, 9] + height = 20 + row_numel = 2 + + # initialize input variable X 
+ x = scope.var('X').get_selected_rows() + x.set_rows(rows) + x.set_height(height) + np_array = np.ones((len(rows), row_numel)).astype("float32") + for i in range(len(rows)): + for j in range(row_numel): + np_array[i, j] = rows[i] + j + x_tensor = x.get_tensor() + x_tensor.set(np_array, place) + + outs_name = ["out%d" % i for i in xrange(3)] + outs = [ + scope.var(var_name).get_selected_rows() for var_name in outs_name + ] + + # expected output selected rows + expected_out0_rows = [0, 9] + expected_out1_rows = [7, 4] + expected_out2_rows = [5] + + op = Operator("split_ids", Ids="X", Out=outs_name) + + op.run(scope, place) + + self.assertEqual(outs[0].rows(), expected_out0_rows) + self.assertEqual(outs[1].rows(), expected_out1_rows) + self.assertEqual(outs[2].rows(), expected_out2_rows) + + self.assertAlmostEqual(0.0, np.array(outs[0].get_tensor())[0, 0]) + self.assertAlmostEqual(1.0, np.array(outs[0].get_tensor())[0, 1]) + self.assertAlmostEqual(9.0, np.array(outs[0].get_tensor())[1, 0]) + self.assertAlmostEqual(10.0, np.array(outs[0].get_tensor())[1, 1]) + + self.assertAlmostEqual(7.0, np.array(outs[1].get_tensor())[0, 0]) + self.assertAlmostEqual(8.0, np.array(outs[1].get_tensor())[0, 1]) + self.assertAlmostEqual(4.0, np.array(outs[1].get_tensor())[1, 0]) + self.assertAlmostEqual(5.0, np.array(outs[1].get_tensor())[1, 1]) + + self.assertAlmostEqual(5.0, np.array(outs[2].get_tensor())[0, 0]) + self.assertAlmostEqual(6.0, np.array(outs[2].get_tensor())[0, 1]) + + if __name__ == '__main__': unittest.main() From 41693b6ae7460f3c8f5a8bf49287980dfd2da466 Mon Sep 17 00:00:00 2001 From: qiaolongfei Date: Sun, 29 Jul 2018 20:13:03 +0800 Subject: [PATCH 20/40] optimize code --- .../tests/unittests/test_split_ids_op.py | 29 +++++++------------ 1 file changed, 10 insertions(+), 19 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_split_ids_op.py b/python/paddle/fluid/tests/unittests/test_split_ids_op.py index adf3345f1d..4001877290 100644 --- 
a/python/paddle/fluid/tests/unittests/test_split_ids_op.py +++ b/python/paddle/fluid/tests/unittests/test_split_ids_op.py @@ -65,30 +65,21 @@ class TestSpliteIds(unittest.TestCase): ] # expected output selected rows - expected_out0_rows = [0, 9] - expected_out1_rows = [7, 4] - expected_out2_rows = [5] + expected_out_rows = [[0, 9], [7, 4], [5]] op = Operator("split_ids", Ids="X", Out=outs_name) op.run(scope, place) - self.assertEqual(outs[0].rows(), expected_out0_rows) - self.assertEqual(outs[1].rows(), expected_out1_rows) - self.assertEqual(outs[2].rows(), expected_out2_rows) - - self.assertAlmostEqual(0.0, np.array(outs[0].get_tensor())[0, 0]) - self.assertAlmostEqual(1.0, np.array(outs[0].get_tensor())[0, 1]) - self.assertAlmostEqual(9.0, np.array(outs[0].get_tensor())[1, 0]) - self.assertAlmostEqual(10.0, np.array(outs[0].get_tensor())[1, 1]) - - self.assertAlmostEqual(7.0, np.array(outs[1].get_tensor())[0, 0]) - self.assertAlmostEqual(8.0, np.array(outs[1].get_tensor())[0, 1]) - self.assertAlmostEqual(4.0, np.array(outs[1].get_tensor())[1, 0]) - self.assertAlmostEqual(5.0, np.array(outs[1].get_tensor())[1, 1]) - - self.assertAlmostEqual(5.0, np.array(outs[2].get_tensor())[0, 0]) - self.assertAlmostEqual(6.0, np.array(outs[2].get_tensor())[0, 1]) + for i in range(len(outs)): + expected_rows = expected_out_rows[i] + self.assertEqual(outs[i].rows(), expected_rows) + for j in range(len(expected_rows)): + row = expected_rows[j] + self.assertAlmostEqual( + float(row), np.array(outs[i].get_tensor())[j, 0]) + self.assertAlmostEqual( + float(row + 1), np.array(outs[i].get_tensor())[j, 1]) if __name__ == '__main__': From 4d2405d851812daded4f2344bb817a61dc804f47 Mon Sep 17 00:00:00 2001 From: Superjomn Date: Mon, 30 Jul 2018 01:26:48 +0000 Subject: [PATCH 21/40] inference analysis support ssa --- .../inference/analysis/data_flow_graph.h | 2 ++ .../analysis/fluid_to_data_flow_graph_pass.cc | 19 +++++++++++++++---- .../analysis/fluid_to_data_flow_graph_pass.h | 2 +- 3 
files changed, 18 insertions(+), 5 deletions(-) diff --git a/paddle/fluid/inference/analysis/data_flow_graph.h b/paddle/fluid/inference/analysis/data_flow_graph.h index 1c60d5de21..bc1875f4d8 100644 --- a/paddle/fluid/inference/analysis/data_flow_graph.h +++ b/paddle/fluid/inference/analysis/data_flow_graph.h @@ -36,6 +36,8 @@ namespace analysis { /* * DataFlowGraph - A container of Value and Function Nodes. + * + * This is the base graph for any other type of graphs, such as SSA or CFG. */ struct DataFlowGraph { NodeMap nodes; diff --git a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc index 496921db9e..88fdf8c9cb 100644 --- a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc +++ b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc @@ -40,6 +40,8 @@ void FluidToDataFlowGraphPass::Run(DataFlowGraph *graph) { PADDLE_ENFORCE(graph); PADDLE_ENFORCE(desc_); // insert vars + // The `var2id` keeps a map from a variable's name to its Node-id, the Node-id + // will keep updating to its latest alias during the graph-building. std::unordered_map var2id; auto &main_block = desc_->blocks(framework::kRootBlockIndex); for (int i = 0; i < main_block.vars_size(); i++) { @@ -51,6 +53,15 @@ void FluidToDataFlowGraphPass::Run(DataFlowGraph *graph) { var2id[var.name()] = v->id(); } + // The variables in a SSA can only write once, so if a variable is written + // multiple times(quite common in our ProgramDesc design), multiple alias + // Nodes of this variable will be created, and each will just write once. + + // An set that keep all the names of the variables(the original, not alias) + // that have been written(as outputs). Once an Op's output variable hit the + // set, it should create a new alias and update the global alias for this + // variable. And that make a Data Flow Graph a SSA. 
+ std::unordered_set unique_written_vars; for (int i = 0; i < main_block.ops_size(); i++) { const auto &op = main_block.ops(i); auto *o = graph->nodes.Create(Node::Type::kFunction); @@ -62,33 +73,33 @@ void FluidToDataFlowGraphPass::Run(DataFlowGraph *graph) { o->SetPbMsg(op.SerializeAsString()); // set inputs and outputs - std::unordered_set inlinks; for (int j = 0; j < op.inputs_size(); j++) { auto &in_var = op.inputs(j); for (int k = 0; k < in_var.arguments_size(); k++) { auto *in = graph->nodes.GetMutable(var2id.at(in_var.arguments(k))); in->outlinks.push_back(o); o->inlinks.push_back(in); - inlinks.insert(in); } } for (int j = 0; j < op.outputs_size(); j++) { auto &out_var = op.outputs(j); for (int k = 0; k < out_var.arguments_size(); k++) { auto *out = graph->nodes.GetMutable(var2id[out_var.arguments(k)]); - if (inlinks.count(out)) { + if (unique_written_vars.count(out)) { // Loop found, for example, a = op(a), use SSA, change to a1 = op(a). auto *out_alias = graph->nodes.Create(Node::Type::kValue); out_alias->SetName(out->name()); out_alias->SetPbDesc(out->pb_desc()); out_alias->SetPbMsg(out->pb_msg()); - var2id[out_alias->name()] = out_alias->id(); // update a -> a0 + var2id[out_alias->name()] = + out_alias->id(); // update variable's alias Node LOG(INFO) << "loop found in graph, create SSA alias node [" << out_alias->repr() << "] for [" << out->repr() << "]"; out = out_alias; } out->inlinks.push_back(o); o->outlinks.push_back(out); + unique_written_vars.insert(out); } } } diff --git a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h index da8463b63b..fb948bf224 100644 --- a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h +++ b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h @@ -30,7 +30,7 @@ namespace inference { namespace analysis { /* - * Transform a FluidDesc to a data flow graph. + * Transform a FluidDesc to a SSA. 
*/ class FluidToDataFlowGraphPass final : public DataFlowGraphPass { public: From d8d2dbcfacfd9e53f9d4d4e6d70ab98bd05e5d38 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Mon, 30 Jul 2018 11:29:17 +0800 Subject: [PATCH 22/40] further optimize im2col using variables --- paddle/fluid/operators/math/im2col_cfo_cpu.h | 35 ++++++++++++-------- 1 file changed, 21 insertions(+), 14 deletions(-) diff --git a/paddle/fluid/operators/math/im2col_cfo_cpu.h b/paddle/fluid/operators/math/im2col_cfo_cpu.h index ac843cdc71..0d32bc5bd0 100644 --- a/paddle/fluid/operators/math/im2col_cfo_cpu.h +++ b/paddle/fluid/operators/math/im2col_cfo_cpu.h @@ -80,11 +80,13 @@ inline void im2col_sh1sw1dh1dw1ph0pw0(const framework::Tensor& im, int col_matrix_width = output_width * output_height; int im_size = im_height * im_width; size_t copy_size = sizeof(T) * output_width; + const T* im_data_oh = im_data; + T* dst_data_oh = col_data; for (int oh = 0; oh < output_height; ++oh) { - const T* im_data_start = im_data + oh * im_width; - T* dst_data = col_data + oh * output_width; + const T* src_data_ic = im_data_oh; + T* dst_data = dst_data_oh; for (int ic = 0; ic < im_channels; ++ic) { - const T* src_data = im_data_start + ic * im_size; + const T* src_data = src_data_ic; for (int kh = 0; kh < filter_height; ++kh) { for (int kw = 0; kw < filter_width; ++kw) { std::memcpy(dst_data, src_data + kw, copy_size); @@ -92,7 +94,10 @@ inline void im2col_sh1sw1dh1dw1ph0pw0(const framework::Tensor& im, } src_data = src_data + im_width; } + src_data_ic = src_data_ic + im_size; } + im_data_oh = im_data_oh + im_width; + dst_data_oh = dst_data_oh + output_width; } } @@ -130,34 +135,36 @@ inline void im2col_sh1sw1dh1dw1ph1pw1(const framework::Tensor& im, T* col_start_r = col_data + (filter_height - 1) * col_block_fh + col_matrix_width - output_width; for (int ic = 0; ic < im_channels; ++ic) { - // TODO(TJ): move * outside - T* dst_data_l = col_start_l + ic * col_block_ic; - T* dst_data_r = col_start_r + ic * 
col_block_ic; + T* dst_data_l = col_start_l; + T* dst_data_r = col_start_r; for (int kw = 0; kw < filter_width; ++kw) { std::memset(dst_data_l, 0, copy_size); std::memset(dst_data_r, 0, copy_size); dst_data_l = dst_data_l + col_matrix_width; dst_data_r = dst_data_r + col_matrix_width; } + col_start_l = col_start_l + col_block_ic; + col_start_r = col_start_r + col_block_ic; } } auto pad = static_cast(0); if (filter_width == 1) { // fill width padding + T* dst_data_ic = col_data; for (int ic = 0; ic < im_channels; ++ic) { - // TODO(TJ): move * outside - T* dst_data_ic = col_data + ic * col_block_ic; + T* dst_data_kh = dst_data_ic; for (int kh = 0; kh < filter_height; ++kh) { - // TODO(TJ): move * outside - T* dst_data = dst_data_ic + kh * col_block_fh; + T* dst_data = dst_data_kh; for (int oh = 0; oh < output_height; ++oh) { *dst_data = pad; dst_data = dst_data + output_width - 1; *dst_data = pad; ++dst_data; } + dst_data_kh = dst_data_kh + col_block_fh; } + dst_data_ic = dst_data_ic + col_block_ic; } // fill core size_t copy_size = sizeof(T) * (output_width - plw - prw); @@ -184,12 +191,10 @@ inline void im2col_sh1sw1dh1dw1ph1pw1(const framework::Tensor& im, // filter_width != 1 // fill width padding + T* dst_data_ic = col_data; for (int ic = 0; ic < im_channels; ++ic) { - // TODO(TJ): move * outside - T* dst_data_ic = col_data + ic * col_block_ic; + T* dst_data_kh = dst_data_ic; for (int kh = 0; kh < filter_height; ++kh) { - // TODO(TJ): move * outside - T* dst_data_kh = dst_data_ic + kh * col_block_fh; for (T* dst_data : {dst_data_kh, dst_data_kh + (filter_width - prw) * col_matrix_width + output_width - 1}) { @@ -199,7 +204,9 @@ inline void im2col_sh1sw1dh1dw1ph1pw1(const framework::Tensor& im, dst_data = dst_data + output_width; } } + dst_data_kh = dst_data_kh + col_block_fh; } + dst_data_ic = dst_data_ic + col_block_ic; } // TODO(TJ): use array like: size_t copy_size[kw]={sizeof(T) * From 91b114a7870b62764f0a493c520c5fab1d89f6a7 Mon Sep 17 00:00:00 2001 From: 
qiaolongfei Date: Mon, 30 Jul 2018 11:39:48 +0800 Subject: [PATCH 23/40] change map to unordered_map --- paddle/fluid/operators/split_ids_op.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/operators/split_ids_op.h b/paddle/fluid/operators/split_ids_op.h index 86a3eaa5c4..d6caf626cb 100644 --- a/paddle/fluid/operators/split_ids_op.h +++ b/paddle/fluid/operators/split_ids_op.h @@ -14,7 +14,7 @@ limitations under the License. */ #pragma once -#include +#include #include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/selected_rows_functor.h" @@ -69,7 +69,7 @@ class SplitIdsOpKernel : public framework::OpKernel { auto outs = ctx.MultiOutput("Out"); const size_t shard_num = outs.size(); // get rows for outputs - std::map id_to_index; + std::unordered_map id_to_index; for (size_t i = 0; i < ids_rows.size(); ++i) { id_to_index[ids_rows[i]] = i; size_t shard_id = static_cast(ids_rows[i]) % shard_num; From 147bf00ffee4f02882d852ca433ea9322c156e3d Mon Sep 17 00:00:00 2001 From: qiaolongfei Date: Mon, 30 Jul 2018 12:35:18 +0800 Subject: [PATCH 24/40] clear mutable rows for the output of split_ids_op --- paddle/fluid/operators/split_ids_op.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/paddle/fluid/operators/split_ids_op.h b/paddle/fluid/operators/split_ids_op.h index d6caf626cb..c4af5a65fc 100644 --- a/paddle/fluid/operators/split_ids_op.h +++ b/paddle/fluid/operators/split_ids_op.h @@ -68,6 +68,9 @@ class SplitIdsOpKernel : public framework::OpKernel { const auto &ids_rows = ids_selected_rows->rows(); auto outs = ctx.MultiOutput("Out"); const size_t shard_num = outs.size(); + for (auto &out : outs) { + out->mutable_rows()->clear(); + } // get rows for outputs std::unordered_map id_to_index; for (size_t i = 0; i < ids_rows.size(); ++i) { From f4c4c6179c9628055e13fc2237851a0fba801702 Mon Sep 17 00:00:00 2001 From: qiaolongfei Date: Mon, 30 Jul 2018 14:18:48 +0800 Subject: [PATCH 25/40] optimize 
unit test of test_split_ids_op --- .../tests/unittests/test_split_ids_op.py | 23 ++++++++++--------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_split_ids_op.py b/python/paddle/fluid/tests/unittests/test_split_ids_op.py index 4001877290..ca78613098 100644 --- a/python/paddle/fluid/tests/unittests/test_split_ids_op.py +++ b/python/paddle/fluid/tests/unittests/test_split_ids_op.py @@ -69,17 +69,18 @@ class TestSpliteIds(unittest.TestCase): op = Operator("split_ids", Ids="X", Out=outs_name) - op.run(scope, place) - - for i in range(len(outs)): - expected_rows = expected_out_rows[i] - self.assertEqual(outs[i].rows(), expected_rows) - for j in range(len(expected_rows)): - row = expected_rows[j] - self.assertAlmostEqual( - float(row), np.array(outs[i].get_tensor())[j, 0]) - self.assertAlmostEqual( - float(row + 1), np.array(outs[i].get_tensor())[j, 1]) + for _ in range(3): + op.run(scope, place) + + for i in range(len(outs)): + expected_rows = expected_out_rows[i] + self.assertEqual(outs[i].rows(), expected_rows) + for j in range(len(expected_rows)): + row = expected_rows[j] + self.assertAlmostEqual( + float(row), np.array(outs[i].get_tensor())[j, 0]) + self.assertAlmostEqual( + float(row + 1), np.array(outs[i].get_tensor())[j, 1]) if __name__ == '__main__': From f628b1dfad227ffd339fa57a27d13614af6ab958 Mon Sep 17 00:00:00 2001 From: typhoonzero Date: Mon, 30 Jul 2018 15:23:54 +0800 Subject: [PATCH 26/40] fix_tests_on_gcc482 --- paddle/fluid/platform/device_tracer.cc | 10 +++++++--- tools/manylinux1/Dockerfile.x64 | 2 +- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/platform/device_tracer.cc b/paddle/fluid/platform/device_tracer.cc index d9e2afadaf..696ed61981 100644 --- a/paddle/fluid/platform/device_tracer.cc +++ b/paddle/fluid/platform/device_tracer.cc @@ -273,14 +273,18 @@ class DeviceTracerImpl : public DeviceTracer { proto::Profile profile_pb; 
profile_pb.set_start_ns(start_ns_); profile_pb.set_end_ns(end_ns_); + std::string kernel_name; for (const KernelRecord &r : kernel_records_) { if (correlations_.find(r.correlation_id) == correlations_.end()) { - fprintf(stderr, "cannot relate a kernel activity\n"); - continue; + // fprintf(stderr, "cannot relate a kernel activity\n"); + // continue; + kernel_name = "Unknown"; + } else { + kernel_name = correlations_.at(r.correlation_id); } auto *event = profile_pb.add_events(); event->set_type(proto::Event::GPUKernel); - event->set_name(correlations_.at(r.correlation_id)); + event->set_name(kernel_name); event->set_start_ns(r.start_ns); event->set_end_ns(r.end_ns); event->set_sub_device_id(r.stream_id); diff --git a/tools/manylinux1/Dockerfile.x64 b/tools/manylinux1/Dockerfile.x64 index bca0b77ad7..0b72ea323b 100644 --- a/tools/manylinux1/Dockerfile.x64 +++ b/tools/manylinux1/Dockerfile.x64 @@ -13,7 +13,7 @@ ENV PATH /opt/rh/devtoolset-2/root/usr/bin:$PATH ENV LD_LIBRARY_PATH /opt/rh/devtoolset-2/root/usr/lib64:/opt/rh/devtoolset-2/root/usr/lib:/usr/local/lib64:/usr/local/lib:${LD_LIBRARY_PATH} ENV PKG_CONFIG_PATH=/usr/local/lib/pkgconfig -RUN yum install -y sqlite-devel zlib-devel openssl-devel pcre-devel vim tk-devel tkinter libtool xz +RUN yum install -y sqlite-devel zlib-devel openssl-devel pcre-devel vim tk-devel tkinter libtool xz graphviz COPY build_scripts /build_scripts RUN bash build_scripts/build.sh && \ bash build_scripts/install_nccl2.sh && rm -r build_scripts From b7b600225ec954cac89dd40485480567123f750b Mon Sep 17 00:00:00 2001 From: typhoonzero Date: Mon, 30 Jul 2018 15:31:39 +0800 Subject: [PATCH 27/40] clean up --- paddle/fluid/platform/device_tracer.cc | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/paddle/fluid/platform/device_tracer.cc b/paddle/fluid/platform/device_tracer.cc index 696ed61981..90ab85905d 100644 --- a/paddle/fluid/platform/device_tracer.cc +++ b/paddle/fluid/platform/device_tracer.cc @@ -276,15 +276,12 
@@ class DeviceTracerImpl : public DeviceTracer { std::string kernel_name; for (const KernelRecord &r : kernel_records_) { if (correlations_.find(r.correlation_id) == correlations_.end()) { - // fprintf(stderr, "cannot relate a kernel activity\n"); - // continue; - kernel_name = "Unknown"; - } else { - kernel_name = correlations_.at(r.correlation_id); + fprintf(stderr, "cannot relate a kernel activity\n"); + continue; } auto *event = profile_pb.add_events(); event->set_type(proto::Event::GPUKernel); - event->set_name(kernel_name); + event->set_name(correlations_.at(r.correlation_id)); event->set_start_ns(r.start_ns); event->set_end_ns(r.end_ns); event->set_sub_device_id(r.stream_id); From ff97c709dfe25f2470e489505db4293eb0206d95 Mon Sep 17 00:00:00 2001 From: typhoonzero Date: Mon, 30 Jul 2018 15:32:20 +0800 Subject: [PATCH 28/40] clean up --- paddle/fluid/platform/device_tracer.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/paddle/fluid/platform/device_tracer.cc b/paddle/fluid/platform/device_tracer.cc index 90ab85905d..d9e2afadaf 100644 --- a/paddle/fluid/platform/device_tracer.cc +++ b/paddle/fluid/platform/device_tracer.cc @@ -273,7 +273,6 @@ class DeviceTracerImpl : public DeviceTracer { proto::Profile profile_pb; profile_pb.set_start_ns(start_ns_); profile_pb.set_end_ns(end_ns_); - std::string kernel_name; for (const KernelRecord &r : kernel_records_) { if (correlations_.find(r.correlation_id) == correlations_.end()) { fprintf(stderr, "cannot relate a kernel activity\n"); From f372f27e3f724a55ed221f18d3c30b18b4cde8cf Mon Sep 17 00:00:00 2001 From: qingqing01 Date: Mon, 30 Jul 2018 15:36:15 +0800 Subject: [PATCH 29/40] Hidden APIs for While, StaticRNN, ParallelDo. (#12332) * Hidden APIs for While, StaticRNN, ParallelDo. 
--- paddle/fluid/API.spec | 5 +---- python/paddle/fluid/layers/control_flow.py | 22 +++++++++++++--------- 2 files changed, 14 insertions(+), 13 deletions(-) diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index 6efb03dabe..5f3bfa2965 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -170,6 +170,7 @@ paddle.fluid.layers.mean_iou ArgSpec(args=['input', 'label', 'num_classes'], var paddle.fluid.layers.relu ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.log ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.crop ArgSpec(args=['x', 'shape', 'offsets', 'name'], varargs=None, keywords=None, defaults=(None, None, None)) +paddle.fluid.layers.rank_loss ArgSpec(args=['label', 'left', 'right', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True)) paddle.fluid.layers.open_recordio_file ArgSpec(args=['filename', 'shapes', 'lod_levels', 'dtypes', 'pass_num', 'for_parallel'], varargs=None, keywords=None, defaults=(1, True)) paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None)) @@ -201,7 +202,6 @@ paddle.fluid.layers.zeros ArgSpec(args=['shape', 'dtype', 'force_cpu'], varargs= paddle.fluid.layers.reverse ArgSpec(args=['x', 'axis'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.While.__init__ ArgSpec(args=['self', 'cond', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.While.block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) -paddle.fluid.layers.While.complete ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.Switch.__init__ 
ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.Switch.case ArgSpec(args=['self', 'condition'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.Switch.default ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) @@ -225,17 +225,14 @@ paddle.fluid.layers.DynamicRNN.static_input ArgSpec(args=['self', 'x'], varargs= paddle.fluid.layers.DynamicRNN.step_input ArgSpec(args=['self', 'x'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.DynamicRNN.update_memory ArgSpec(args=['self', 'ex_mem', 'new_mem'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.StaticRNN.__init__ ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.layers.StaticRNN.complete_op ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.StaticRNN.memory ArgSpec(args=['self', 'init', 'shape', 'batch_ref', 'init_value', 'init_batch_dim_idx', 'ref_batch_dim_idx'], varargs=None, keywords=None, defaults=(None, None, None, 0.0, 0, 1)) paddle.fluid.layers.StaticRNN.output ArgSpec(args=['self'], varargs='outputs', keywords=None, defaults=None) -paddle.fluid.layers.StaticRNN.parent_block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.StaticRNN.step ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.StaticRNN.step_input ArgSpec(args=['self', 'x'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.StaticRNN.step_output ArgSpec(args=['self', 'o'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.StaticRNN.update_memory ArgSpec(args=['self', 'mem', 'var'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.reorder_lod_tensor_by_rank ArgSpec(args=['x', 'rank_table'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.ParallelDo.__init__ ArgSpec(args=['self', 'places', 'use_nccl', 'name'], varargs=None, 
keywords=None, defaults=(False, None)) -paddle.fluid.layers.ParallelDo.complete_op ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.ParallelDo.do ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.ParallelDo.get_parameters ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.ParallelDo.parent_block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) diff --git a/python/paddle/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py index f05ae6d5d1..3ee1c636ac 100644 --- a/python/paddle/fluid/layers/control_flow.py +++ b/python/paddle/fluid/layers/control_flow.py @@ -21,6 +21,7 @@ from ..layer_helper import LayerHelper, unique_name from ..initializer import force_init_on_cpu from ops import logical_and, logical_not, logical_or import numpy +import warnings __all__ = [ 'While', @@ -280,6 +281,9 @@ class ParallelDo(object): """ def __init__(self, places, use_nccl=False, name=None): + warnings.warn( + "API ParallelDo is deprecated since 0.15.0. 
Please use ParallelExecutor instead.", + Warning) self.helper = LayerHelper("parallel_do", name=name) self.inputs = [] self.places = places @@ -338,7 +342,7 @@ class ParallelDo(object): return [parent_block.var(name) for name in params] - def complete_op(self): + def _complete_op(self): main_program = self.helper.main_program current_block = main_program.current_block() parent_block = self.parent_block() @@ -394,7 +398,7 @@ class BlockGuardWithCompletion(BlockGuard): if exc_type is not None: return False self.rnn.status = StaticRNN.AFTER_RNN_BLOCK - self.rnn.complete_op() + self.rnn._complete_op() return super(BlockGuardWithCompletion, self).__exit__(exc_type, exc_val, exc_tb) @@ -470,7 +474,7 @@ class StaticRNN(object): if shape is None or batch_ref is None: raise ValueError( "if init is None, memory at least need shape and batch_ref") - parent_block = self.parent_block() + parent_block = self._parent_block() var_name = unique_name.generate("@".join( [self.helper.name, "memory_boot"])) boot_var = parent_block.create_var( @@ -527,7 +531,7 @@ class StaticRNN(object): outputs={'Out': tmp_o}, attrs={'dtype': o.dtype}) - out_var = self.parent_block().create_var( + out_var = self._parent_block().create_var( name=tmp_o.name, shape=[self.seq_len] + list(tmp_o.shape), dtype=tmp_o.dtype) @@ -543,7 +547,7 @@ class StaticRNN(object): raise TypeError("update memory should take variables") self.memories[mem.name].mem = var - def parent_block(self): + def _parent_block(self): prog = self.helper.main_program parent_idx = prog.current_block().parent_idx assert parent_idx >= 0 @@ -560,10 +564,10 @@ class StaticRNN(object): else: return self.outputs - def complete_op(self): + def _complete_op(self): main_program = self.helper.main_program rnn_block = main_program.current_block() - parent_block = self.parent_block() + parent_block = self._parent_block() local_inputs = set() @@ -643,7 +647,7 @@ class WhileGuard(BlockGuard): if exc_type is not None: return False self.while_op.status = 
While.AFTER_WHILE_BLOCK - self.while_op.complete() + self.while_op._complete() return super(WhileGuard, self).__exit__(exc_type, exc_val, exc_tb) @@ -690,7 +694,7 @@ class While(object): def block(self): return WhileGuard(self) - def complete(self): + def _complete(self): main_program = self.helper.main_program while_block = main_program.current_block() parent_block = main_program.block(main_program.current_block() From 3e4083ed1f7a75e17f67b3a31831db56d01a1bb4 Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Mon, 30 Jul 2018 19:35:05 +0800 Subject: [PATCH 30/40] Make exception handling of threaded_ssa_graph_executor an independent class --- .../framework/details/exception_holder.h | 83 +++++++++++++++++++ .../details/threaded_ssa_graph_executor.cc | 27 ++---- .../details/threaded_ssa_graph_executor.h | 4 +- 3 files changed, 90 insertions(+), 24 deletions(-) create mode 100644 paddle/fluid/framework/details/exception_holder.h diff --git a/paddle/fluid/framework/details/exception_holder.h b/paddle/fluid/framework/details/exception_holder.h new file mode 100644 index 0000000000..6e302a2923 --- /dev/null +++ b/paddle/fluid/framework/details/exception_holder.h @@ -0,0 +1,83 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace framework { +namespace details { + +class ExceptionHolder { + public: + void Catch(const platform::EnforceNotMet& exp) { + std::lock_guard lock(mu_); + exception_.reset(new platform::EnforceNotMet(exp)); + type_ = kEnforceNotMet; + } + + void Catch(const platform::EOFException& exp) { + std::lock_guard lock(mu_); + // EOFException will not cover up existing EnforceNotMet. + if (exception_.get() == nullptr) { + exception_.reset(new platform::EOFException(exp)); + type_ = kEOF; + } + } + + bool ExceptionCatched() const { + std::lock_guard lock(mu_); + return exception_.get() != nullptr; + } + + void Throw() { + std::lock_guard lock(mu_); + switch (type_) { + case kNone: + break; + case kEnforceNotMet: { + auto e = *static_cast(exception_.get()); + throw e; + break; + } + case kEOF: { + auto e = *static_cast(exception_.get()); + throw e; + break; + } + default: + LOG(FATAL) << "Unknown exception."; + } + exception_.reset(); + type_ = kNone; + } + + void Clear() { + std::lock_guard lock(mu_); + exception_.reset(); + type_ = kNone; + } + + private: + enum ExceptionType { kNone, kEnforceNotMet, kEOF }; + ExceptionType type_{kNone}; + + std::unique_ptr exception_; + mutable std::mutex mu_; +}; + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc index c19f74476f..00f1f262a6 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc @@ -83,7 +83,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run( // Clean run context run_op_futures_.clear(); - exception_.reset(); + exception_holder_.Clear(); // Step 3. 
Execution while (!pending_vars.empty()) { @@ -103,23 +103,11 @@ FeedFetchList ThreadedSSAGraphExecutor::Run( auto cur_ready_vars = ready_vars.PopAll(1, &timeout); if (timeout) { - std::unique_lock l(exception_mu_); - if (exception_) { - l.unlock(); + if (exception_holder_.ExceptionCatched()) { for (auto &run_op_future : run_op_futures_) { run_op_future.wait(); } - l.lock(); - std::exception *exp = exception_.get(); - if (dynamic_cast(exp)) { - auto e = *static_cast(exp); - throw e; - } else if (dynamic_cast(exp)) { - auto e = *static_cast(exp); - throw e; - } else { - LOG(FATAL) << "Unknown exception."; - } + exception_holder_.Throw(); } else { continue; } @@ -229,14 +217,9 @@ void ThreadedSSAGraphExecutor::RunOp( ready_var_q->Extend(op->Outputs()); VLOG(10) << op << " " << op->Name() << "Signal posted"; } catch (platform::EOFException ex) { - std::lock_guard l(exception_mu_); - // EOFException will not cover up existing EnforceNotMet. - if (exception_.get() == nullptr) { - exception_.reset(new platform::EOFException(ex)); - } + exception_holder_.Catch(ex); } catch (platform::EnforceNotMet ex) { - std::lock_guard l(exception_mu_); - exception_.reset(new platform::EnforceNotMet(ex)); + exception_holder_.Catch(ex); } catch (...) 
{ LOG(FATAL) << "Unknown exception catched"; } diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h index 3d67daa45e..4f3e5a6288 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h @@ -24,6 +24,7 @@ #include #include "ThreadPool.h" // ThreadPool in thrird party #include "paddle/fluid/framework/blocking_queue.h" +#include "paddle/fluid/framework/details/exception_holder.h" #include "paddle/fluid/framework/details/execution_strategy.h" #include "paddle/fluid/framework/details/fetch_op_handle.h" #include "paddle/fluid/framework/details/ssa_graph_executor.h" @@ -58,8 +59,7 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor { std::vector local_scopes_; std::vector places_; platform::DeviceContextPool fetch_ctxs_; - std::mutex exception_mu_; - std::unique_ptr exception_; + ExceptionHolder exception_holder_; std::atomic running_ops_; void InsertPendingOp(std::unordered_map *pending_ops, From 0b861bbca9ca8cf383e8c559a1d88f1e62b3d518 Mon Sep 17 00:00:00 2001 From: qiaolongfei Date: Mon, 30 Jul 2018 22:27:35 +0800 Subject: [PATCH 31/40] add profiler for listen_and_serv op --- paddle/fluid/operators/listen_and_serv_op.cc | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/paddle/fluid/operators/listen_and_serv_op.cc b/paddle/fluid/operators/listen_and_serv_op.cc index 438b44b42a..fb475e3253 100644 --- a/paddle/fluid/operators/listen_and_serv_op.cc +++ b/paddle/fluid/operators/listen_and_serv_op.cc @@ -25,6 +25,10 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/listen_and_serv_op.h" #include "paddle/fluid/platform/profiler.h" +DECLARE_int32(listen_and_serv_profile_period); +DEFINE_int32(listen_and_serv_profile_period, 0, + "the period of listen_and_serv to do profile"); + namespace paddle { namespace operators { @@ -122,7 +126,13 @@ void ListenAndServOp::RunSyncLoop( std::shared_ptr(nullptr)); rpc_service_->ResetBarrierCounter(); + + int32_t profile_step = 0; while (true) { + if (FLAGS_listen_and_serv_profile_period > 0 && profile_step == 0) { + auto pf_state = paddle::platform::ProfilerState::kCPU; + paddle::platform::EnableProfiler(pf_state); + } // Get from multiple trainers, we don't care about the order in which // the gradients arrives, just add suffix 0~n and merge the gradient. rpc_service_->SetCond(distributed::kRequestSend); @@ -164,6 +174,14 @@ void ListenAndServOp::RunSyncLoop( // reset received sparse vars to avoid reuse it in the next mini-batch dynamic_cast(request_send_handler_.get()) ->ResetSparseVarRecorder(); + if (FLAGS_listen_and_serv_profile_period > 0 && + profile_step == FLAGS_listen_and_serv_profile_period) { + paddle::platform::DisableProfiler( + paddle::platform::EventSortingKey::kTotal, "/dev/null"); + profile_step = 0; + } else { + profile_step++; + } } // while(true) } From c8c8c01a235f4e370073f5946b08122b9bb92c97 Mon Sep 17 00:00:00 2001 From: qiaolongfei Date: Mon, 30 Jul 2018 22:49:54 +0800 Subject: [PATCH 32/40] fix mac build of graph_executor --- .../framework/details/scope_buffered_ssa_graph_executor.h | 4 +++- paddle/fluid/framework/details/threaded_ssa_graph_executor.h | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h index 1b188aec59..5e87e0bf50 100644 --- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h 
@@ -41,7 +41,9 @@ class ScopeBufferedSSAGraphExecutor : public SSAGraphExecutor { std::vector var_infos, std::vector places, std::unique_ptr&& underlying_executor); - const ir::Graph& Graph() const { return underlying_executor_->Graph(); } + const ir::Graph& Graph() const override { + return underlying_executor_->Graph(); + } FeedFetchList Run(const std::vector& fetch_tensors) override; diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h index 82d6b5272a..b0aaf60701 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h @@ -42,7 +42,7 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor { const std::vector &places, std::unique_ptr &&graph); - const ir::Graph &Graph() const { return *graph_; } + const ir::Graph &Graph() const override { return *graph_; } // Run a SSAGraph by a thread pool // Use topological sort algorithm FeedFetchList Run(const std::vector &fetch_tensors) override; From 91fb0156ca1a97a247f571960440bae993c28a1c Mon Sep 17 00:00:00 2001 From: dzhwinter Date: Tue, 31 Jul 2018 09:09:33 +0800 Subject: [PATCH 33/40] Memory/reshape op (#12414) * "remove inplace in single op" * "fix ci" * "add transpiler case" * fix conflict * "fix reshape" * "delete reshape inplace attr" * "follo the comments" * "rerun ci" --- paddle/fluid/operators/reshape_op.cc | 31 +++---------------- python/paddle/fluid/layers/nn.py | 9 +++--- .../test_memory_optimization_transpiler.py | 24 ++++++++++++++ .../fluid/tests/unittests/test_reshape_op.py | 6 ++-- 4 files changed, 36 insertions(+), 34 deletions(-) diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc index a9fd1869c9..a1dfe39c3a 100644 --- a/paddle/fluid/operators/reshape_op.cc +++ b/paddle/fluid/operators/reshape_op.cc @@ -127,12 +127,6 @@ class ReshapeOpMaker : public framework::OpProtoAndCheckerMaker { 
AddOutput("Out", "(Tensor). The output tensor of reshape operator."); AddAttr>( "shape", "(std::vector) Target shape of reshape operator."); - AddAttr("inplace", - "(default: false) Change the source tensor's shape without " - "memory copy. When Attr(inplace) is set true, the output " - "tensor shares memory with Input(X), otherwise, a new output " - "tensor is created, and its data are copied from Input(x).") - .SetDefault(false); AddComment(R"DOC( Reshape Operator. @@ -233,16 +227,9 @@ class ReshapeKernel { "sequence_reshape op."); } - bool inplace = ctx.Attr("inplace"); + out->mutable_data(ctx.GetPlace(), in->type()); + framework::TensorCopySync(*in, ctx.GetPlace(), out); out->Resize(out_dims); - if (!inplace) { - out->mutable_data(ctx.GetPlace(), in->type()); - framework::TensorCopySync(*in, ctx.GetPlace(), out); - out->Resize(out_dims); - } else { - out->ShareDataWith(*in); - out->Resize(out_dims); - } } }; @@ -251,19 +238,11 @@ class ReshapeGradKernel { void operator()(const framework::ExecutionContext &ctx) const { auto *d_out = ctx.Input(framework::GradVarName("Out")); auto *d_x = ctx.Output(framework::GradVarName("X")); + auto in_dims = d_x->dims(); d_x->mutable_data(ctx.GetPlace(), d_out->type()); - bool inplace = ctx.Attr("inplace"); - - auto in_dims = d_x->dims(); - if (!inplace) { - framework::TensorCopy(*d_out, ctx.GetPlace(), ctx.device_context(), d_x); - ctx.device_context().Wait(); - d_x->Resize(in_dims); - } else { - d_x->ShareDataWith(*d_out); - d_x->Resize(in_dims); - } + framework::TensorCopySync(*d_out, ctx.GetPlace(), d_x); + d_x->Resize(in_dims); } }; diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 5d7f1eadd9..058acd4a50 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -4473,15 +4473,14 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=True, name=None): "except one unknown dimension.") helper = LayerHelper("reshape", **locals()) - reshaped = 
helper.create_tmp_variable(dtype=x.dtype) + out = helper.create_tmp_variable(dtype=x.dtype) helper.append_op( type="reshape", inputs=inputs, - attrs={"shape": shape, - "inplace": inplace}, - outputs={"Out": reshaped}) + attrs={"shape": shape}, + outputs={"Out": out}) - return helper.append_activation(reshaped) + return helper.append_activation(out) def lod_reset(x, y=None, target_lod=None): diff --git a/python/paddle/fluid/tests/unittests/test_memory_optimization_transpiler.py b/python/paddle/fluid/tests/unittests/test_memory_optimization_transpiler.py index cfd6e63e12..67733807f8 100644 --- a/python/paddle/fluid/tests/unittests/test_memory_optimization_transpiler.py +++ b/python/paddle/fluid/tests/unittests/test_memory_optimization_transpiler.py @@ -43,5 +43,29 @@ class TestControlFlowGraph(unittest.TestCase): print(str(result_program)) +class TestMemoryTranspiler2(unittest.TestCase): + def setUp(self): + program = Program() + with program_guard(program, startup_program=Program()): + x = layers.data(name='x', shape=[13], dtype='float32') + fc = layers.fc(input=x, size=10, act=None) + reshape = layers.reshape(x=fc, shape=[-1, 2, 5]) + fc = layers.reshape(x=reshape, shape=[-1, 5, 2]) + y_predict = layers.fc(input=fc, size=1, act=None) + y = layers.data(name='y', shape=[1], dtype='float32') + cost = layers.square_error_cost(input=y_predict, label=y) + avg_cost = layers.mean(cost) + opt = optimizer.SGD(learning_rate=0.001) + opt.minimize(avg_cost) + self.program = program + + def test_inplace_ops(self): + print("before optimization") + print(str(self.program)) + result_program = memory_optimize(self.program) + print("after optimization") + print(str(result_program)) + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_reshape_op.py b/python/paddle/fluid/tests/unittests/test_reshape_op.py index f51b5a7e99..2f5558578a 100644 --- a/python/paddle/fluid/tests/unittests/test_reshape_op.py +++ 
b/python/paddle/fluid/tests/unittests/test_reshape_op.py @@ -25,7 +25,7 @@ class TestReshapeOp(OpTest): self.op_type = "reshape" self.inputs = {"X": np.random.random(ori_shape).astype("float32")} - self.attrs = {"shape": new_shape, "inplace": False} + self.attrs = {"shape": new_shape} self.outputs = {"Out": self.inputs["X"].reshape(new_shape)} def test_check_output(self): @@ -42,7 +42,7 @@ class TestReshapeOpDimInfer1(OpTest): self.op_type = "reshape" self.inputs = {"X": np.random.random(ori_shape).astype("float32")} - self.attrs = {"shape": new_shape, "inplace": False} + self.attrs = {"shape": new_shape} self.outputs = {"Out": self.inputs["X"].reshape(self.attrs["shape"])} def test_check_output(self): @@ -60,7 +60,7 @@ class TestReshapeOpDimInfer2(OpTest): self.op_type = "reshape" self.inputs = {"X": np.random.random(ori_shape).astype("float32")} - self.attrs = {"shape": new_shape, "inplace": False} + self.attrs = {"shape": new_shape} self.outputs = {"Out": self.inputs["X"].reshape(infered_shape)} def test_check_output(self): From 0b62f61d2968cc34556c8e74e540469a45b21914 Mon Sep 17 00:00:00 2001 From: qiaolongfei Date: Tue, 31 Jul 2018 09:16:48 +0800 Subject: [PATCH 34/40] add init flag in __init__.py for listen_and_serv_profile_period --- paddle/fluid/CMakeLists.txt | 2 +- paddle/fluid/operators/listen_and_serv_op.cc | 3 ++- python/paddle/fluid/__init__.py | 1 + 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/CMakeLists.txt b/paddle/fluid/CMakeLists.txt index d274d96c29..e2e26fc5d4 100644 --- a/paddle/fluid/CMakeLists.txt +++ b/paddle/fluid/CMakeLists.txt @@ -6,4 +6,4 @@ add_subdirectory(pybind) add_subdirectory(string) add_subdirectory(recordio) # NOTE: please add subdirectory inference at last. 
-add_subdirectory(inference) +#add_subdirectory(inference) diff --git a/paddle/fluid/operators/listen_and_serv_op.cc b/paddle/fluid/operators/listen_and_serv_op.cc index fb475e3253..760cda60a9 100644 --- a/paddle/fluid/operators/listen_and_serv_op.cc +++ b/paddle/fluid/operators/listen_and_serv_op.cc @@ -19,13 +19,14 @@ limitations under the License. */ #include // NOLINT #include +#include "gflags/gflags.h" + #include "paddle/fluid/operators/detail/macros.h" #include "paddle/fluid/operators/distributed/request_handler_impl.h" #include "paddle/fluid/operators/listen_and_serv_op.h" #include "paddle/fluid/platform/profiler.h" -DECLARE_int32(listen_and_serv_profile_period); DEFINE_int32(listen_and_serv_profile_period, 0, "the period of listen_and_serv to do profile"); diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index d1d6dd75ee..956e3c4348 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -127,6 +127,7 @@ def __bootstrap__(): ] if core.is_compiled_with_dist(): read_env_flags.append('rpc_deadline') + read_env_flags.append('listen_and_serv_profile_period') if core.is_compiled_with_cuda(): read_env_flags += [ From d04dca37983c16ede418f8d0f1888394be8a5ec2 Mon Sep 17 00:00:00 2001 From: qiaolongfei Date: Tue, 31 Jul 2018 09:17:33 +0800 Subject: [PATCH 35/40] revert cmakelist --- paddle/fluid/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/CMakeLists.txt b/paddle/fluid/CMakeLists.txt index e2e26fc5d4..d274d96c29 100644 --- a/paddle/fluid/CMakeLists.txt +++ b/paddle/fluid/CMakeLists.txt @@ -6,4 +6,4 @@ add_subdirectory(pybind) add_subdirectory(string) add_subdirectory(recordio) # NOTE: please add subdirectory inference at last. 
-#add_subdirectory(inference) +add_subdirectory(inference) From 7e46a8d172f808b5731784d1703f1d2fa26b5bde Mon Sep 17 00:00:00 2001 From: qiaolongfei Date: Tue, 31 Jul 2018 09:35:23 +0800 Subject: [PATCH 36/40] fix logical bug, optimize code --- paddle/fluid/operators/listen_and_serv_op.cc | 26 ++++++++++++-------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/paddle/fluid/operators/listen_and_serv_op.cc b/paddle/fluid/operators/listen_and_serv_op.cc index 760cda60a9..e14b148cc0 100644 --- a/paddle/fluid/operators/listen_and_serv_op.cc +++ b/paddle/fluid/operators/listen_and_serv_op.cc @@ -130,9 +130,14 @@ void ListenAndServOp::RunSyncLoop( int32_t profile_step = 0; while (true) { - if (FLAGS_listen_and_serv_profile_period > 0 && profile_step == 0) { - auto pf_state = paddle::platform::ProfilerState::kCPU; - paddle::platform::EnableProfiler(pf_state); + PADDLE_ENFORCE_LE(profile_step, FLAGS_listen_and_serv_profile_period, + "profile_step should not be larger then " + "FLAGS_listen_and_serv_profile_period"); + if (FLAGS_listen_and_serv_profile_period > 0) { + if (profile_step == 0) { + auto pf_state = paddle::platform::ProfilerState::kCPU; + paddle::platform::EnableProfiler(pf_state); + } } // Get from multiple trainers, we don't care about the order in which // the gradients arrives, just add suffix 0~n and merge the gradient. 
@@ -175,13 +180,14 @@ void ListenAndServOp::RunSyncLoop( // reset received sparse vars to avoid reuse it in the next mini-batch dynamic_cast(request_send_handler_.get()) ->ResetSparseVarRecorder(); - if (FLAGS_listen_and_serv_profile_period > 0 && - profile_step == FLAGS_listen_and_serv_profile_period) { - paddle::platform::DisableProfiler( - paddle::platform::EventSortingKey::kTotal, "/dev/null"); - profile_step = 0; - } else { - profile_step++; + if (FLAGS_listen_and_serv_profile_period > 0) { + if (profile_step == FLAGS_listen_and_serv_profile_period) { + paddle::platform::DisableProfiler( + paddle::platform::EventSortingKey::kTotal, "/dev/null"); + profile_step = 0; + } else { + profile_step++; + } } } // while(true) } From 5e6f7bc569b767519881c4224c7d985262492714 Mon Sep 17 00:00:00 2001 From: Luo Tao Date: Tue, 31 Jul 2018 10:56:28 +0800 Subject: [PATCH 37/40] compress the fluid.tgz --- paddle/scripts/paddle_build.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 0f334b2892..a8bc16f1b5 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -534,7 +534,7 @@ EOF make -j `nproc` inference_lib_dist cd ${PADDLE_ROOT}/build cp -r fluid_install_dir fluid - tar -cf fluid.tgz fluid + tar -czf fluid.tgz fluid fi } From 6d3da458a77101e2bbbb8142db32e4d81be53ca2 Mon Sep 17 00:00:00 2001 From: dzhwinter Date: Tue, 31 Jul 2018 12:20:40 +0800 Subject: [PATCH 38/40] Fix/float16 style (#12446) * "rewrite the test case" * "follow comment" --- paddle/fluid/platform/cuda_helper_test.cu | 183 +++++++++++++--------- paddle/fluid/platform/cuda_primitives.h | 20 +-- 2 files changed, 119 insertions(+), 84 deletions(-) diff --git a/paddle/fluid/platform/cuda_helper_test.cu b/paddle/fluid/platform/cuda_helper_test.cu index 4a47ba5cca..ca5ca1caeb 100644 --- a/paddle/fluid/platform/cuda_helper_test.cu +++ b/paddle/fluid/platform/cuda_helper_test.cu @@ -13,7 +13,6 @@ // 
limitations under the License. #include -#include #include #include @@ -25,13 +24,13 @@ using paddle::platform::PADDLE_CUDA_NUM_THREADS; using paddle::platform::float16; -#define CUDA_ATOMIC_KERNEL(op, T) \ - __global__ void op##Kernel(const T* data_a, T* data_b, size_t num) { \ - for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num; \ - i += blockDim.x * gridDim.x) { \ - paddle::platform::CudaAtomic##op(&data_b[i], data_a[i]); \ - } \ +template +__global__ void AddKernel(const T* data_a, T* data_b, size_t num) { + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num; + i += blockDim.x * gridDim.x) { + paddle::platform::CudaAtomicAdd(&data_b[i], data_a[i]); } +} template struct AddFunctor { @@ -39,80 +38,116 @@ struct AddFunctor { }; template -struct SubFunctor { - T operator()(const T& a, const T& b) { return a - b; } -}; - -// NOTE(dzhwinter): the float16 add has small underflow/overflow -// so we use EXPECT_NEAR to check the result. -#define ARITHMETIC_KERNEL_LAUNCH(op, T) \ - void Test##T##op(size_t num) { \ - T *in1, *in2, *out; \ - T *d_in1, *d_in2; \ - size_t size = sizeof(T) * num; \ - cudaMalloc(reinterpret_cast(&d_in1), size); \ - cudaMalloc(reinterpret_cast(&d_in2), size); \ - in1 = reinterpret_cast(malloc(size)); \ - in2 = reinterpret_cast(malloc(size)); \ - out = reinterpret_cast(malloc(size)); \ - std::minstd_rand engine; \ - std::uniform_real_distribution dist(0.0, 1.0); \ - for (size_t i = 0; i < num; ++i) { \ - in1[i] = static_cast(dist(engine)); \ - in2[i] = static_cast(dist(engine)); \ - } \ - cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice); \ - cudaMemcpy(d_in2, in2, size, cudaMemcpyHostToDevice); \ - op##Kernel<<<1, PADDLE_CUDA_NUM_THREADS>>>(d_in1, d_in2, num); \ - cudaDeviceSynchronize(); \ - cudaMemcpy(out, d_in2, size, cudaMemcpyDeviceToHost); \ - cudaDeviceSynchronize(); \ - for (size_t i = 0; i < num; ++i) { \ - EXPECT_NEAR(static_cast(out[i]), \ - static_cast(op##Functor()(in1[i], in2[i])), \ - 0.001); \ - } \ - 
free(in1); \ - free(in2); \ - free(out); \ - cudaFree(d_in1); \ - cudaFree(d_in2); \ +void TestCase(size_t num) { + T *in1, *in2, *out; + T *d_in1, *d_in2; + size_t size = sizeof(T) * num; + cudaMalloc(reinterpret_cast(&d_in1), size); + cudaMalloc(reinterpret_cast(&d_in2), size); + in1 = reinterpret_cast(malloc(size)); + in2 = reinterpret_cast(malloc(size)); + out = reinterpret_cast(malloc(size)); + std::minstd_rand engine; + std::uniform_real_distribution dist(0.0, 1.0); + for (size_t i = 0; i < num; ++i) { + in1[i] = static_cast(dist(engine)); + in2[i] = static_cast(dist(engine)); } -CUDA_ATOMIC_KERNEL(Add, float); -CUDA_ATOMIC_KERNEL(Add, double); -CUDA_ATOMIC_KERNEL(Add, float16); - -ARITHMETIC_KERNEL_LAUNCH(Add, float); -ARITHMETIC_KERNEL_LAUNCH(Add, double); -ARITHMETIC_KERNEL_LAUNCH(Add, float16); - -namespace paddle { -namespace platform { -USE_CUDA_ATOMIC(Sub, int); -}; -}; -CUDA_ATOMIC_KERNEL(Sub, int); -ARITHMETIC_KERNEL_LAUNCH(Sub, int); + cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice); + cudaMemcpy(d_in2, in2, size, cudaMemcpyHostToDevice); + AddKernel<<<1, PADDLE_CUDA_NUM_THREADS>>>(d_in1, d_in2, num); + cudaDeviceSynchronize(); + cudaMemcpy(out, d_in2, size, cudaMemcpyDeviceToHost); + cudaDeviceSynchronize(); + for (size_t i = 0; i < num; ++i) { + // NOTE(dzhwinter): the float16 add has small underflow/overflow + // so we use EXPECT_NEAR to check the result. 
+ EXPECT_NEAR(static_cast(out[i]), + static_cast(AddFunctor()(in1[i], in2[i])), 0.001); + } + free(in1); + free(in2); + free(out); + cudaFree(d_in1); + cudaFree(d_in2); +} // cuda primitives TEST(CudaAtomic, Add) { - TestfloatAdd(static_cast(10)); - TestfloatAdd(static_cast(1024 * 1024)); - TestdoubleAdd(static_cast(10)); - TestdoubleAdd(static_cast(1024 * 1024)); -} + TestCase(static_cast(10)); + TestCase(static_cast(1024 * 1024)); -TEST(CudaAtomic, Sub) { - TestintSub(static_cast(10)); - TestintSub(static_cast(1024 * 1024)); + TestCase(static_cast(10)); + TestCase(static_cast(1024 * 1024)); } TEST(CudaAtomic, float16) { - using paddle::platform::float16; - Testfloat16Add(static_cast(1)); - Testfloat16Add(static_cast(2)); - Testfloat16Add(static_cast(3)); + TestCase(static_cast(1)); + TestCase(static_cast(2)); + TestCase(static_cast(3)); + + TestCase(static_cast(10)); + TestCase(static_cast(1024 * 1024)); +} + +// unalignment of uint8 +void TestUnalign(size_t num, const int shift_bit) { + PADDLE_ENFORCE(num % 2 == 0, "must be a multiple of 2"); + float16 *in1, *in2, *out; + float16 *d_in1, *d_in2; + size_t size = sizeof(uint8_t) * (num + shift_bit); + size_t array_size = sizeof(float16) * (num / 2); + + cudaMalloc(reinterpret_cast(&d_in1), size); + cudaMalloc(reinterpret_cast(&d_in2), size); + in1 = reinterpret_cast(malloc(size)); + in2 = reinterpret_cast(malloc(size)); + out = reinterpret_cast(malloc(size)); + + // right shift 1, mimic the unalignment of address + float16* r_in1 = + reinterpret_cast(reinterpret_cast(in1) + shift_bit); + float16* r_in2 = + reinterpret_cast(reinterpret_cast(in2) + shift_bit); + + std::minstd_rand engine; + std::uniform_real_distribution dist(0.0, 1.0); + for (size_t i = 0; i < num / 2; ++i) { + r_in1[i] = static_cast(dist(engine)); + r_in2[i] = static_cast(dist(engine)); + } + cudaMemcpy(d_in1, r_in1, array_size, cudaMemcpyHostToDevice); + cudaMemcpy(d_in2, r_in2, array_size, cudaMemcpyHostToDevice); + AddKernel<<<1, 
PADDLE_CUDA_NUM_THREADS>>>(d_in1, d_in2, num / 2); + cudaDeviceSynchronize(); + cudaMemcpy(out, d_in2, array_size, cudaMemcpyDeviceToHost); + cudaDeviceSynchronize(); + for (size_t i = 0; i < num / 2; ++i) { + // NOTE(dzhwinter): the float16 add has small underflow/overflow + // so we use EXPECT_NEAR to check the result. + EXPECT_NEAR(static_cast(out[i]), + static_cast(AddFunctor()(r_in1[i], r_in2[i])), + 0.001); + } + free(in1); + free(in2); + free(out); + cudaFree(d_in1); + cudaFree(d_in2); +} + +TEST(CudaAtomic, float16Unalign) { + // same with float16 testcase + TestUnalign(static_cast(2), /*shift_bit*/ 2); + TestUnalign(static_cast(1024), /*shift_bit*/ 2); + TestUnalign(static_cast(1024 * 1024), /*shift_bit*/ 2); + + // shift the address. + TestUnalign(static_cast(2), /*shift_bit*/ 1); + TestUnalign(static_cast(1024), /*shift_bit*/ 1); + TestUnalign(static_cast(1024 * 1024), /*shift_bit*/ 1); - Testfloat16Add(static_cast(10)); - Testfloat16Add(static_cast(1024 * 1024)); + TestUnalign(static_cast(2), /*shift_bit*/ 3); + TestUnalign(static_cast(1024), /*shift_bit*/ 3); + TestUnalign(static_cast(1024 * 1024), /*shift_bit*/ 3); } diff --git a/paddle/fluid/platform/cuda_primitives.h b/paddle/fluid/platform/cuda_primitives.h index 94ce83975a..67ea64833d 100644 --- a/paddle/fluid/platform/cuda_primitives.h +++ b/paddle/fluid/platform/cuda_primitives.h @@ -79,41 +79,41 @@ CUDA_ATOMIC_WRAPPER(Add, double) { // convert the value into float and do the add arithmetic. // then store the result into a uint32. 
-inline __device__ uint32_t add_to_low_half(uint32_t val, float x) { +inline static __device__ uint32_t add_to_low_half(uint32_t val, float x) { float16 low_half; // the float16 in lower 16bits - low_half.x = static_cast(val & 0xffffu); + low_half.x = static_cast(val & 0xFFFFu); low_half = static_cast(static_cast(low_half) + x); - return (val & 0xffff0000u) | low_half.x; + return (val & 0xFFFF0000u) | low_half.x; } -inline __device__ uint32_t add_to_high_half(uint32_t val, float x) { +inline static __device__ uint32_t add_to_high_half(uint32_t val, float x) { float16 high_half; // the float16 in higher 16bits high_half.x = static_cast(val >> 16); high_half = static_cast(static_cast(high_half) + x); - return (val & 0xffffu) | (static_cast(high_half.x) << 16); + return (val & 0xFFFFu) | (static_cast(high_half.x) << 16); } CUDA_ATOMIC_WRAPPER(Add, float16) { // concrete packed float16 value may exsits in lower or higher 16bits // of the 32bits address. - uint32_t *address_as_ui = - reinterpret_cast(reinterpret_cast(address) - - (reinterpret_cast(address) & 2)); + uint32_t *address_as_ui = reinterpret_cast( + reinterpret_cast(address) - + (reinterpret_cast(address) & 0x02)); float val_f = static_cast(val); uint32_t old = *address_as_ui; uint32_t sum; uint32_t newval; uint32_t assumed; - if (((size_t)address & 2) == 0) { + if (((uintptr_t)address & 0x02) == 0) { // the float16 value stay at lower 16 bits of the address. do { assumed = old; old = atomicCAS(address_as_ui, assumed, add_to_low_half(assumed, val_f)); } while (old != assumed); float16 ret; - ret.x = old & 0xffffu; + ret.x = old & 0xFFFFu; return ret; } else { // the float16 value stay at higher 16 bits of the address. 
From 31a2c8768899f218da188b8804cdeaface318d4d Mon Sep 17 00:00:00 2001 From: Yan Chunwei Date: Tue, 31 Jul 2018 14:48:11 +0800 Subject: [PATCH 39/40] fea/lightly support lod (#12451) --- .../inference/api/api_anakin_engine_tester.cc | 18 ++++++++++-------- paddle/fluid/inference/api/api_impl.cc | 11 +++++++++++ .../api/api_tensorrt_subgraph_engine_tester.cc | 9 ++++----- .../api/demo_ci/simple_on_word2vec.cc | 17 +++++++++-------- paddle/fluid/inference/api/demo_ci/vis_demo.cc | 10 +++++----- .../fluid/inference/api/paddle_inference_api.h | 2 +- 6 files changed, 40 insertions(+), 27 deletions(-) diff --git a/paddle/fluid/inference/api/api_anakin_engine_tester.cc b/paddle/fluid/inference/api/api_anakin_engine_tester.cc index d6d631bfba..7554fe4989 100644 --- a/paddle/fluid/inference/api/api_anakin_engine_tester.cc +++ b/paddle/fluid/inference/api/api_anakin_engine_tester.cc @@ -37,19 +37,21 @@ TEST(inference, anakin) { float data[1 * 3 * 224 * 224] = {1.0f}; - PaddleTensor tensor{.name = "input_0", - .shape = std::vector({1, 3, 224, 224}), - .data = PaddleBuf(data, sizeof(data)), - .dtype = PaddleDType::FLOAT32}; + PaddleTensor tensor; + tensor.name = "input_0"; + tensor.shape = std::vector({1, 3, 224, 224}); + tensor.data = PaddleBuf(data, sizeof(data)); + tensor.dtype = PaddleDType::FLOAT32; // For simplicity, we set all the slots with the same data. 
std::vector paddle_tensor_feeds; paddle_tensor_feeds.emplace_back(std::move(tensor)); - PaddleTensor tensor_out{.name = "prob_out", - .shape = std::vector({1000, 1}), - .data = PaddleBuf(), - .dtype = PaddleDType::FLOAT32}; + PaddleTensor tensor_out; + tensor_out.name = "prob_out"; + tensor_out.shape = std::vector({1000, 1}); + tensor_out.data = PaddleBuf(); + tensor_out.dtype = PaddleDType::FLOAT32; std::vector outputs; outputs.emplace_back(std::move(tensor_out)); diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc index 58fd7c6f8b..08d7af6d3a 100644 --- a/paddle/fluid/inference/api/api_impl.cc +++ b/paddle/fluid/inference/api/api_impl.cc @@ -183,6 +183,13 @@ bool NativePaddlePredictor::SetFeed(const std::vector &inputs, // TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy. std::memcpy(static_cast(input_ptr), inputs[i].data.data(), inputs[i].data.length()); + // TODO(Superjomn) Low performance, need optimization for heavy LoD copy. + framework::LoD lod; + for (auto &level : inputs[i].lod) { + lod.emplace_back(level); + } + input.set_lod(lod); + feeds->push_back(input); } return true; @@ -248,6 +255,10 @@ bool NativePaddlePredictor::GetFetch( buffer.Resize(sizeof(float) * data.size()); } std::memcpy(buffer.data(), data.data(), buffer.length()); + // copy LoD + for (const auto &level : fetchs[i].lod()) { + outputs->at(i).lod.emplace_back(level); + } outputs->at(i).dtype = PaddleDType::FLOAT32; // TODO(panyx0718): support other types? fill tensor name? avoid a copy. 
} diff --git a/paddle/fluid/inference/api/api_tensorrt_subgraph_engine_tester.cc b/paddle/fluid/inference/api/api_tensorrt_subgraph_engine_tester.cc index 62d98a7967..fcbf9b89d6 100644 --- a/paddle/fluid/inference/api/api_tensorrt_subgraph_engine_tester.cc +++ b/paddle/fluid/inference/api/api_tensorrt_subgraph_engine_tester.cc @@ -49,11 +49,10 @@ void CompareTensorRTWithFluid(bool enable_tensorrt) { std::vector data(20); for (int i = 0; i < 20; i++) data[i] = i; - PaddleTensor tensor{ - .name = "", - .shape = std::vector({10, 1}), - .data = PaddleBuf(data.data(), data.size() * sizeof(int64_t)), - .dtype = PaddleDType::INT64}; + PaddleTensor tensor; + tensor.shape = std::vector({10, 1}); + tensor.data = PaddleBuf(data.data(), data.size() * sizeof(int64_t)); + tensor.dtype = PaddleDType::INT64; // For simplicity, we set all the slots with the same data. std::vector slots(4, tensor); diff --git a/paddle/fluid/inference/api/demo_ci/simple_on_word2vec.cc b/paddle/fluid/inference/api/demo_ci/simple_on_word2vec.cc index 5f96fecf93..03ac79e9ed 100644 --- a/paddle/fluid/inference/api/demo_ci/simple_on_word2vec.cc +++ b/paddle/fluid/inference/api/demo_ci/simple_on_word2vec.cc @@ -47,10 +47,10 @@ void Main(bool use_gpu) { //# 2. Prepare input. int64_t data[4] = {1, 2, 3, 4}; - PaddleTensor tensor{.name = "", - .shape = std::vector({4, 1}), - .data = PaddleBuf(data, sizeof(data)), - .dtype = PaddleDType::INT64}; + PaddleTensor tensor; + tensor.shape = std::vector({4, 1}); + tensor.data = PaddleBuf(data, sizeof(data)); + tensor.dtype = PaddleDType::INT64; // For simplicity, we set all the slots with the same data. std::vector slots(4, tensor); @@ -94,10 +94,11 @@ void MainThreads(int num_threads, bool use_gpu) { for (int batch_id = 0; batch_id < num_batches; ++batch_id) { // 2. 
Dummy Input Data int64_t data[4] = {1, 2, 3, 4}; - PaddleTensor tensor{.name = "", - .shape = std::vector({4, 1}), - .data = PaddleBuf(data, sizeof(data)), - .dtype = PaddleDType::INT64}; + PaddleTensor tensor; + tensor.shape = std::vector({4, 1}); + tensor.data = PaddleBuf(data, sizeof(data)); + tensor.dtype = PaddleDType::INT64; + std::vector inputs(4, tensor); std::vector outputs; // 3. Run diff --git a/paddle/fluid/inference/api/demo_ci/vis_demo.cc b/paddle/fluid/inference/api/demo_ci/vis_demo.cc index 0a2a2b713a..ddfe05a502 100644 --- a/paddle/fluid/inference/api/demo_ci/vis_demo.cc +++ b/paddle/fluid/inference/api/demo_ci/vis_demo.cc @@ -123,11 +123,11 @@ void Main(bool use_gpu) { file.close(); // Inference. - PaddleTensor input{ - .name = "xx", - .shape = record.shape, - .data = PaddleBuf(record.data.data(), record.data.size() * sizeof(float)), - .dtype = PaddleDType::FLOAT32}; + PaddleTensor input; + input.shape = record.shape; + input.data = + PaddleBuf(record.data.data(), record.data.size() * sizeof(float)); + input.dtype = PaddleDType::FLOAT32; VLOG(3) << "run executor"; std::vector output; diff --git a/paddle/fluid/inference/api/paddle_inference_api.h b/paddle/fluid/inference/api/paddle_inference_api.h index 2f8b4f8596..3342ee3c25 100644 --- a/paddle/fluid/inference/api/paddle_inference_api.h +++ b/paddle/fluid/inference/api/paddle_inference_api.h @@ -67,9 +67,9 @@ struct PaddleTensor { PaddleTensor() = default; std::string name; // variable name. std::vector shape; - // TODO(Superjomn) for LoD support, add a vector> field if needed. PaddleBuf data; // blob of data. 
PaddleDType dtype; + std::vector> lod; // lod data }; enum class PaddleEngineKind { From dcfbc6a6615ed63c2de78686d52c8fb847ad7c46 Mon Sep 17 00:00:00 2001 From: Yan Chunwei Date: Tue, 31 Jul 2018 14:48:33 +0800 Subject: [PATCH 40/40] inference analyzer as bin (#12450) --- .../fluid/inference/analysis/CMakeLists.txt | 3 + paddle/fluid/inference/analysis/analyzer.cc | 7 +++ paddle/fluid/inference/analysis/analyzer.h | 30 ++++----- .../fluid/inference/analysis/analyzer_main.cc | 33 ++++++++++ .../inference/analysis/analyzer_tester.cc | 8 ++- paddle/fluid/inference/analysis/argument.h | 13 ++++ .../analysis/data_flow_graph_tester.cc | 4 +- .../data_flow_graph_to_fluid_pass_tester.cc | 10 +-- .../analysis/dfg_graphviz_draw_pass_tester.cc | 12 +++- .../analysis/fluid_to_data_flow_graph_pass.cc | 17 +++++- .../fluid_to_data_flow_graph_pass_tester.cc | 3 +- paddle/fluid/inference/analysis/helper.h | 15 +++++ .../inference/analysis/model_store_pass.cc | 61 +++++++++++++++++++ .../inference/analysis/model_store_pass.h | 51 ++++++++++++++++ .../analysis/model_store_pass_tester.cc | 43 +++++++++++++ paddle/fluid/inference/analysis/pass.h | 1 + .../inference/analysis/pass_manager_tester.cc | 7 ++- .../analysis/subgraph_splitter_tester.cc | 8 +-- ...tensorrt_subgraph_node_mark_pass_tester.cc | 6 +- .../analysis/tensorrt_subgraph_pass_tester.cc | 7 +-- paddle/fluid/inference/analysis/ut_helper.h | 21 +------ .../api/api_tensorrt_subgraph_engine.cc | 12 ++++ 22 files changed, 309 insertions(+), 63 deletions(-) create mode 100644 paddle/fluid/inference/analysis/analyzer_main.cc create mode 100644 paddle/fluid/inference/analysis/model_store_pass.cc create mode 100644 paddle/fluid/inference/analysis/model_store_pass.h create mode 100644 paddle/fluid/inference/analysis/model_store_pass_tester.cc diff --git a/paddle/fluid/inference/analysis/CMakeLists.txt b/paddle/fluid/inference/analysis/CMakeLists.txt index 67d355d10d..27fe575cb6 100644 --- 
a/paddle/fluid/inference/analysis/CMakeLists.txt +++ b/paddle/fluid/inference/analysis/CMakeLists.txt @@ -6,9 +6,11 @@ cc_library(analysis SRCS pass_manager.cc dot.cc node.cc data_flow_graph.cc graph tensorrt_subgraph_node_mark_pass.cc analyzer.cc helper.cc + model_store_pass.cc DEPS framework_proto proto_desc) cc_test(test_node SRCS node_tester.cc DEPS analysis) cc_test(test_dot SRCS dot_tester.cc DEPS analysis) +cc_binary(inference_analyzer SRCS analyzer_main.cc DEPS analysis) set(PYTHON_TESTS_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests) @@ -40,3 +42,4 @@ inference_analysis_test(test_tensorrt_subgraph_pass SRCS tensorrt_subgraph_pass_ inference_analysis_test(test_pass_manager SRCS pass_manager_tester.cc) inference_analysis_test(test_tensorrt_subgraph_node_mark_pass SRCS tensorrt_subgraph_node_mark_pass_tester.cc) inference_analysis_test(test_analyzer SRCS analyzer_tester.cc) +inference_analysis_test(test_model_store_pass SRCS model_store_pass_tester.cc) diff --git a/paddle/fluid/inference/analysis/analyzer.cc b/paddle/fluid/inference/analysis/analyzer.cc index b3a1075e5a..98bdfcc00b 100644 --- a/paddle/fluid/inference/analysis/analyzer.cc +++ b/paddle/fluid/inference/analysis/analyzer.cc @@ -17,6 +17,7 @@ #include "paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h" #include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h" #include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h" +#include "paddle/fluid/inference/analysis/model_store_pass.h" #include "paddle/fluid/inference/analysis/pass_manager.h" #include "paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.h" #include "paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h" @@ -29,6 +30,9 @@ DEFINE_bool(inference_analysis_enable_tensorrt_subgraph_engine, false, DEFINE_string(inference_analysis_graphviz_log_root, "./", "Graphviz debuger for data flow graphs."); +DEFINE_string(inference_analysis_output_storage_path, "", + "optimized model output 
path"); + namespace inference { namespace analysis { @@ -47,6 +51,9 @@ class DfgPassManagerImpl final : public DfgPassManager { AddPass("tensorrt-subgraph", new TensorRTSubGraphPass(trt_teller)); } AddPass("data-flow-graph-to-fluid", new DataFlowGraphToFluidPass); + if (!FLAGS_inference_analysis_output_storage_path.empty()) { + AddPass("model-store-pass", new ModelStorePass); + } } std::string repr() const override { return "dfg-pass-manager"; } diff --git a/paddle/fluid/inference/analysis/analyzer.h b/paddle/fluid/inference/analysis/analyzer.h index 0132bf5b9c..c82fdfff86 100644 --- a/paddle/fluid/inference/analysis/analyzer.h +++ b/paddle/fluid/inference/analysis/analyzer.h @@ -16,28 +16,23 @@ limitations under the License. */ /* * This file contains Analyzer, an class that exposed as a library that analyze - * and optimize - * Fluid ProgramDesc for inference. Similar to LLVM, it has multiple flags to - * control whether - * an process is applied on the program. + * and optimize Fluid ProgramDesc for inference. Similar to LLVM, it has + * multiple flags to + * control whether an process is applied on the program. * * The processes are called Passes in analysis, the Passes are placed in a - * pipeline, the first - * Pass is the FluidToDataFlowGraphPass which transforms a Fluid ProgramDesc to - * a data flow - * graph, the last Pass is DataFlowGraphToFluidPass which transforms a data flow - * graph to a - * Fluid ProgramDesc. The passes in the middle of the pipeline can be any Passes - * which take a - * node or data flow graph as input. + * pipeline, the first Pass is the FluidToDataFlowGraphPass which transforms a + * Fluid ProgramDesc to + * a data flow graph, the last Pass is DataFlowGraphToFluidPass which transforms + * a data flow graph to a Fluid ProgramDesc. The passes in the middle of the + * pipeline can be any Passes + * which take a node or data flow graph as input. 
* * The Analyzer can be used in two methods, the first is a executable file which - * can be used to - * pre-process the inference model and can be controlled by passing difference - * command flags; + * can be used to pre-process the inference model and can be controlled by + * passing difference command flags; * the other way is to compose inside the inference API as a runtime pre-process - * phase in the - * inference service. + * phase in the inference service. */ #include @@ -50,6 +45,7 @@ namespace paddle { // flag if not available. DECLARE_bool(inference_analysis_enable_tensorrt_subgraph_engine); DECLARE_string(inference_analysis_graphviz_log_root); +DECLARE_string(inference_analysis_output_storage_path); namespace inference { namespace analysis { diff --git a/paddle/fluid/inference/analysis/analyzer_main.cc b/paddle/fluid/inference/analysis/analyzer_main.cc new file mode 100644 index 0000000000..5e1fe3eb79 --- /dev/null +++ b/paddle/fluid/inference/analysis/analyzer_main.cc @@ -0,0 +1,33 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/* + * This file implements analysizer -- an executation help to analyze and + * optimize trained model. 
+ */ +#include "paddle/fluid/inference/analysis/analyzer.h" +#include +#include + +int main(int argc, char** argv) { + google::ParseCommandLineFlags(&argc, &argv, true); + using paddle::inference::analysis::Analyzer; + using paddle::inference::analysis::Argument; + + Argument argument; + Analyzer analyzer; + analyzer.Run(&argument); + + return 0; +} diff --git a/paddle/fluid/inference/analysis/analyzer_tester.cc b/paddle/fluid/inference/analysis/analyzer_tester.cc index 25a440e7e7..24bfb3993c 100644 --- a/paddle/fluid/inference/analysis/analyzer_tester.cc +++ b/paddle/fluid/inference/analysis/analyzer_tester.cc @@ -20,14 +20,18 @@ namespace paddle { namespace inference { namespace analysis { -TEST_F(DFG_Tester, analysis_without_tensorrt) { +TEST(Analyzer, analysis_without_tensorrt) { FLAGS_inference_analysis_enable_tensorrt_subgraph_engine = false; + Argument argument; + argument.fluid_model_dir.reset(new std::string(FLAGS_inference_model_dir)); Analyzer analyser; analyser.Run(&argument); } -TEST_F(DFG_Tester, analysis_with_tensorrt) { +TEST(Analyzer, analysis_with_tensorrt) { FLAGS_inference_analysis_enable_tensorrt_subgraph_engine = true; + Argument argument; + argument.fluid_model_dir.reset(new std::string(FLAGS_inference_model_dir)); Analyzer analyser; analyser.Run(&argument); } diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h index 6d316f20bf..9e1c2e4586 100644 --- a/paddle/fluid/inference/analysis/argument.h +++ b/paddle/fluid/inference/analysis/argument.h @@ -36,6 +36,16 @@ namespace analysis { * All the fields should be registered here for clearness. */ struct Argument { + Argument() = default; + explicit Argument(const std::string& fluid_model_dir) + : fluid_model_dir(new std::string(fluid_model_dir)) {} + // The directory of the trained model. + std::unique_ptr fluid_model_dir; + // The path of `__model__` and `param`, this is used when the file name of + // model and param is changed. 
+ std::unique_ptr fluid_model_program_path; + std::unique_ptr fluid_model_param_path; + // The graph that process by the Passes or PassManagers. std::unique_ptr main_dfg; @@ -44,6 +54,9 @@ struct Argument { // The processed program desc. std::unique_ptr transformed_program_desc; + + // The output storage path of ModelStorePass. + std::unique_ptr model_output_store_path; }; #define UNLIKELY(condition) __builtin_expect(static_cast(condition), 0) diff --git a/paddle/fluid/inference/analysis/data_flow_graph_tester.cc b/paddle/fluid/inference/analysis/data_flow_graph_tester.cc index 7912f8d7f1..a881262665 100644 --- a/paddle/fluid/inference/analysis/data_flow_graph_tester.cc +++ b/paddle/fluid/inference/analysis/data_flow_graph_tester.cc @@ -20,7 +20,7 @@ namespace inference { namespace analysis { TEST(DataFlowGraph, BFS) { - auto desc = LoadProgramDesc(); + auto desc = LoadProgramDesc(FLAGS_inference_model_dir + "/__model__"); auto dfg = ProgramDescToDFG(desc); dfg.Build(); @@ -44,7 +44,7 @@ TEST(DataFlowGraph, BFS) { } TEST(DataFlowGraph, DFS) { - auto desc = LoadProgramDesc(); + auto desc = LoadProgramDesc(FLAGS_inference_model_dir + "/__model__"); auto dfg = ProgramDescToDFG(desc); dfg.Build(); GraphTraits trait(&dfg); diff --git a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass_tester.cc b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass_tester.cc index d8fc5e580a..4ef381db29 100644 --- a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass_tester.cc +++ b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass_tester.cc @@ -26,21 +26,21 @@ namespace paddle { namespace inference { namespace analysis { -TEST_F(DFG_Tester, Test) { - DataFlowGraph graph; +TEST(DataFlowGraph, Test) { + Argument argument(FLAGS_inference_model_dir); FluidToDataFlowGraphPass pass0; DataFlowGraphToFluidPass pass1; ASSERT_TRUE(pass0.Initialize(&argument)); ASSERT_TRUE(pass1.Initialize(&argument)); - pass0.Run(&graph); - pass1.Run(&graph); + 
pass0.Run(argument.main_dfg.get()); + pass1.Run(argument.main_dfg.get()); pass0.Finalize(); pass1.Finalize(); - LOG(INFO) << graph.nodes.size(); + LOG(INFO) << argument.main_dfg->nodes.size(); } }; // namespace analysis diff --git a/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass_tester.cc b/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass_tester.cc index 65842b1e85..928be79170 100644 --- a/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass_tester.cc +++ b/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass_tester.cc @@ -23,12 +23,18 @@ namespace paddle { namespace inference { namespace analysis { -TEST_F(DFG_Tester, dfg_graphviz_draw_pass_tester) { - auto dfg = ProgramDescToDFG(*argument.origin_program_desc); +TEST(DFG_GraphvizDrawPass, dfg_graphviz_draw_pass_tester) { + Argument argument(FLAGS_inference_model_dir); + FluidToDataFlowGraphPass pass0; + ASSERT_TRUE(pass0.Initialize(&argument)); + pass0.Run(argument.main_dfg.get()); + + // auto dfg = ProgramDescToDFG(*argument.origin_program_desc); + DFG_GraphvizDrawPass::Config config("./", "test"); DFG_GraphvizDrawPass pass(config); pass.Initialize(&argument); - pass.Run(&dfg); + pass.Run(argument.main_dfg.get()); // test content std::ifstream file("./0-graph_test.dot"); diff --git a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc index 88fdf8c9cb..511631d3e0 100644 --- a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc +++ b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#include #include #include @@ -25,8 +26,20 @@ namespace analysis { bool FluidToDataFlowGraphPass::Initialize(Argument *argument) { ANALYSIS_ARGUMENT_CHECK_FIELD(argument); - ANALYSIS_ARGUMENT_CHECK_FIELD(argument->origin_program_desc); - PADDLE_ENFORCE(argument); + if (argument->origin_program_desc) { + LOG(WARNING) << "argument's origin_program_desc is already set, might " + "duplicate called"; + } + if (!argument->fluid_model_program_path) { + ANALYSIS_ARGUMENT_CHECK_FIELD(argument->fluid_model_dir); + argument->fluid_model_program_path.reset( + new std::string(*argument->fluid_model_dir + "/__model__")); + } + ANALYSIS_ARGUMENT_CHECK_FIELD(argument->fluid_model_program_path); + auto program = LoadProgramDesc(*argument->fluid_model_program_path); + argument->origin_program_desc.reset( + new framework::proto::ProgramDesc(program)); + if (!argument->main_dfg) { argument->main_dfg.reset(new DataFlowGraph); } diff --git a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass_tester.cc b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass_tester.cc index dadb84059d..d218dcd050 100644 --- a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass_tester.cc +++ b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass_tester.cc @@ -21,8 +21,9 @@ namespace paddle { namespace inference { namespace analysis { -TEST_F(DFG_Tester, Init) { +TEST(FluidToDataFlowGraphPass, Test) { FluidToDataFlowGraphPass pass; + Argument argument(FLAGS_inference_model_dir); pass.Initialize(&argument); pass.Run(argument.main_dfg.get()); // Analysis is sensitive to ProgramDesc, careful to change the original model. diff --git a/paddle/fluid/inference/analysis/helper.h b/paddle/fluid/inference/analysis/helper.h index f1064cd20f..a0f912b251 100644 --- a/paddle/fluid/inference/analysis/helper.h +++ b/paddle/fluid/inference/analysis/helper.h @@ -15,6 +15,7 @@ limitations under the License. 
*/ #pragma once #include +#include #include #include #include @@ -136,6 +137,20 @@ static void ExecShellCommand(const std::string &cmd, std::string *message) { } } +static framework::proto::ProgramDesc LoadProgramDesc( + const std::string &model_path) { + std::ifstream fin(model_path, std::ios::in | std::ios::binary); + PADDLE_ENFORCE(fin.is_open(), "Cannot open file %s", model_path); + fin.seekg(0, std::ios::end); + std::string buffer(fin.tellg(), ' '); + fin.seekg(0, std::ios::beg); + fin.read(&buffer[0], buffer.size()); + fin.close(); + framework::proto::ProgramDesc program_desc; + program_desc.ParseFromString(buffer); + return program_desc; +} + } // namespace analysis } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/analysis/model_store_pass.cc b/paddle/fluid/inference/analysis/model_store_pass.cc new file mode 100644 index 0000000000..db7be3c0cd --- /dev/null +++ b/paddle/fluid/inference/analysis/model_store_pass.cc @@ -0,0 +1,61 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/inference/analysis/model_store_pass.h" +#include +#include +#include "paddle/fluid/inference/analysis/analyzer.h" +#include "paddle/fluid/inference/analysis/argument.h" + +namespace paddle { +namespace inference { +namespace analysis { + +void ModelStorePass::Run(DataFlowGraph *x) { + if (!argument_->fluid_model_param_path) { + PADDLE_ENFORCE_NOT_NULL(argument_->fluid_model_dir); + argument_->fluid_model_param_path.reset( + new std::string(*argument_->fluid_model_dir + "param")); + } + PADDLE_ENFORCE_NOT_NULL(argument_->model_output_store_path); + // Directly copy param file to destination. + std::stringstream ss; + // NOTE these commands only works on linux. + ss << "mkdir -p " << *argument_->model_output_store_path; + LOG(INFO) << "run command: " << ss.str(); + PADDLE_ENFORCE_EQ(system(ss.str().c_str()), 0); + ss.str(""); + + ss << "cp " << *argument_->fluid_model_dir << "/*" + << " " << *argument_->model_output_store_path; + LOG(INFO) << "run command: " << ss.str(); + PADDLE_ENFORCE_EQ(system(ss.str().c_str()), 0); + + // Store program + PADDLE_ENFORCE_NOT_NULL(argument_->transformed_program_desc, + "program desc is not transformed, should call " + "DataFlowGraphToFluidPass first."); + const std::string program_output_path = + *argument_->model_output_store_path + "/__model__"; + std::ofstream file(program_output_path, std::ios::binary); + PADDLE_ENFORCE(file.is_open(), "failed to open %s to write.", + program_output_path); + const std::string serialized_message = + argument_->transformed_program_desc->SerializeAsString(); + file.write(serialized_message.c_str(), serialized_message.size()); +} + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/model_store_pass.h b/paddle/fluid/inference/analysis/model_store_pass.h new file mode 100644 index 0000000000..713e8783ea --- /dev/null +++ b/paddle/fluid/inference/analysis/model_store_pass.h @@ -0,0 +1,51 @@ +// Copyright (c) 
2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/* + * This file defines ModelStorePass, which store the runtime DFG to a Paddle + * model in the disk, and that model can be reloaded for prediction. + */ + +#include "paddle/fluid/inference/analysis/pass.h" + +namespace paddle { +namespace inference { +namespace analysis { + +class ModelStorePass : public DataFlowGraphPass { + public: + bool Initialize(Argument* argument) override { + if (!argument) { + LOG(ERROR) << "invalid argument"; + return false; + } + argument_ = argument; + return true; + } + + void Run(DataFlowGraph* x) override; + + std::string repr() const override { return "DFG-store-pass"; } + std::string description() const override { + return R"DD(This file defines ModelStorePass, which store the runtime DFG to a Paddle + model in the disk, and that model can be reloaded for prediction again.)DD"; + } + + private: + Argument* argument_{nullptr}; +}; + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/model_store_pass_tester.cc b/paddle/fluid/inference/analysis/model_store_pass_tester.cc new file mode 100644 index 0000000000..5f3526dd50 --- /dev/null +++ b/paddle/fluid/inference/analysis/model_store_pass_tester.cc @@ -0,0 +1,43 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/analysis/model_store_pass.h" + +#include +#include +#include "paddle/fluid/inference/analysis/analyzer.h" + +namespace paddle { +namespace inference { +namespace analysis { + +DEFINE_string(inference_model_dir, "", "Model path"); + +TEST(DFG_StorePass, test) { + Analyzer analyzer; + Argument argument(FLAGS_inference_model_dir); + argument.model_output_store_path.reset( + new std::string("./_dfg_store_pass_tmp")); + // disable storage in alalyzer + FLAGS_inference_analysis_output_storage_path = ""; + analyzer.Run(&argument); + + ModelStorePass pass; + pass.Initialize(&argument); + pass.Run(argument.main_dfg.get()); +} + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/pass.h b/paddle/fluid/inference/analysis/pass.h index 6b4dbb3bb5..6806f9ff7d 100644 --- a/paddle/fluid/inference/analysis/pass.h +++ b/paddle/fluid/inference/analysis/pass.h @@ -50,6 +50,7 @@ class Pass { // Create a debugger Pass that draw the DFG by graphviz toolkit. virtual Pass *CreateGraphvizDebugerPass() const { return nullptr; } + virtual void Run() { LOG(FATAL) << "not valid"; } // Run on a single Node. virtual void Run(Node *x) { LOG(FATAL) << "not valid"; } // Run on a single Function. 
diff --git a/paddle/fluid/inference/analysis/pass_manager_tester.cc b/paddle/fluid/inference/analysis/pass_manager_tester.cc index dac1c509d7..13423e4837 100644 --- a/paddle/fluid/inference/analysis/pass_manager_tester.cc +++ b/paddle/fluid/inference/analysis/pass_manager_tester.cc @@ -56,7 +56,7 @@ class TestNodePass final : public NodePass { std::string description() const override { return "some doc"; } }; -TEST_F(DFG_Tester, DFG_pass_manager) { +TEST(PassManager, DFG_pass_manager) { TestDfgPassManager manager; DFG_GraphvizDrawPass::Config config("./", "dfg.dot"); @@ -64,12 +64,15 @@ TEST_F(DFG_Tester, DFG_pass_manager) { manager.Register("graphviz", new DFG_GraphvizDrawPass(config)); manager.Register("dfg-to-fluid", new DataFlowGraphToFluidPass); + Argument argument(FLAGS_inference_model_dir); + ASSERT_TRUE(&argument); ASSERT_TRUE(manager.Initialize(&argument)); manager.RunAll(); } -TEST_F(DFG_Tester, Node_pass_manager) { +TEST(PassManager, Node_pass_manager) { + Argument argument(FLAGS_inference_model_dir); // Pre-process: initialize the DFG with the ProgramDesc first. 
FluidToDataFlowGraphPass pass0; pass0.Initialize(&argument); diff --git a/paddle/fluid/inference/analysis/subgraph_splitter_tester.cc b/paddle/fluid/inference/analysis/subgraph_splitter_tester.cc index 67dd4da54b..39cc433b40 100644 --- a/paddle/fluid/inference/analysis/subgraph_splitter_tester.cc +++ b/paddle/fluid/inference/analysis/subgraph_splitter_tester.cc @@ -31,8 +31,8 @@ SubGraphSplitter::NodeInsideSubgraphTeller teller = [](const Node* node) { return false; }; -TEST_F(DFG_Tester, Split) { - auto desc = LoadProgramDesc(); +TEST(SubGraphSplitter, Split) { + auto desc = LoadProgramDesc(FLAGS_inference_model_dir + "/__model__"); auto dfg = ProgramDescToDFG(desc); LOG(INFO) << "spliter\n" << dfg.DotString(); @@ -63,8 +63,8 @@ TEST_F(DFG_Tester, Split) { ASSERT_EQ(subgraphs.back().size(), 6UL); } -TEST_F(DFG_Tester, Fuse) { - auto desc = LoadProgramDesc(); +TEST(SubGraphSplitter, Fuse) { + auto desc = LoadProgramDesc(FLAGS_inference_model_dir + "/__model__"); auto dfg = ProgramDescToDFG(desc); size_t count0 = dfg.nodes.size(); diff --git a/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass_tester.cc b/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass_tester.cc index a6c15e848b..c1d932878e 100644 --- a/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass_tester.cc +++ b/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass_tester.cc @@ -22,11 +22,11 @@ namespace paddle { namespace inference { namespace analysis { -TEST_F(DFG_Tester, tensorrt_subgraph_node_mark_pass) { +TEST(TensorRTSubgraphNodeMarkPass, test) { // init FluidToDataFlowGraphPass pass; + Argument argument(FLAGS_inference_model_dir); ASSERT_TRUE(pass.Initialize(&argument)); - argument.main_dfg.reset(new DataFlowGraph); pass.Run(argument.main_dfg.get()); TensorRTSubgraphNodeMarkPass::teller_t teller = [](const Node* node) { @@ -41,7 +41,7 @@ TEST_F(DFG_Tester, tensorrt_subgraph_node_mark_pass) { for (auto& node : argument.main_dfg->nodes.nodes()) { 
counter += node->attr(ATTR_supported_by_tensorrt).Bool(); } - + ASSERT_EQ(counter, 2); LOG(INFO) << counter << " nodes marked"; } diff --git a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass_tester.cc b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass_tester.cc index 1d749d3fa3..67a5af83d8 100644 --- a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass_tester.cc +++ b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass_tester.cc @@ -25,7 +25,7 @@ namespace analysis { DEFINE_string(dot_dir, "./", ""); -TEST_F(DFG_Tester, tensorrt_single_pass) { +TEST(TensorRTSubGraphPass, main) { std::unordered_set teller_set( {"elementwise_add", "mul", "sigmoid"}); SubGraphSplitter::NodeInsideSubgraphTeller teller = [&](const Node* node) { @@ -35,7 +35,8 @@ TEST_F(DFG_Tester, tensorrt_single_pass) { return false; }; - LOG(INFO) << "init"; + Argument argument(FLAGS_inference_model_dir); + DFG_GraphvizDrawPass::Config config{FLAGS_dot_dir, "origin"}; DFG_GraphvizDrawPass::Config config1{FLAGS_dot_dir, "fusion"}; @@ -44,13 +45,11 @@ TEST_F(DFG_Tester, tensorrt_single_pass) { FluidToDataFlowGraphPass pass0; TensorRTSubGraphPass trt_pass(std::move(teller)); - LOG(INFO) << "Initialize"; dfg_pass.Initialize(&argument); dfg_pass1.Initialize(&argument); pass0.Initialize(&argument); trt_pass.Initialize(&argument); - LOG(INFO) << "Run"; argument.main_dfg.reset(new DataFlowGraph); pass0.Run(argument.main_dfg.get()); dfg_pass.Run(argument.main_dfg.get()); diff --git a/paddle/fluid/inference/analysis/ut_helper.h b/paddle/fluid/inference/analysis/ut_helper.h index ce1191a567..1073a6f686 100644 --- a/paddle/fluid/inference/analysis/ut_helper.h +++ b/paddle/fluid/inference/analysis/ut_helper.h @@ -20,7 +20,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/executor.h" #include "paddle/fluid/inference/analysis/data_flow_graph.h" #include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h" -#include "paddle/fluid/inference/analysis/ut_helper.h" +#include "paddle/fluid/inference/analysis/helper.h" namespace paddle { namespace inference { @@ -32,27 +32,12 @@ namespace analysis { DEFINE_string(inference_model_dir, "", "inference test model dir"); -static framework::proto::ProgramDesc LoadProgramDesc( - const std::string& model_dir = FLAGS_inference_model_dir) { - std::string msg; - std::string net_file = FLAGS_inference_model_dir + "/__model__"; - std::ifstream fin(net_file, std::ios::in | std::ios::binary); - PADDLE_ENFORCE(static_cast(fin), "Cannot open file %s", net_file); - fin.seekg(0, std::ios::end); - msg.resize(fin.tellg()); - fin.seekg(0, std::ios::beg); - fin.read(&(msg.at(0)), msg.size()); - fin.close(); - framework::proto::ProgramDesc program_desc; - program_desc.ParseFromString(msg); - return program_desc; -} - static DataFlowGraph ProgramDescToDFG( const framework::proto::ProgramDesc& desc) { DataFlowGraph graph; FluidToDataFlowGraphPass pass; Argument argument; + argument.fluid_model_dir.reset(new std::string(FLAGS_inference_model_dir)); argument.origin_program_desc.reset(new framework::proto::ProgramDesc(desc)); pass.Initialize(&argument); pass.Run(&graph); @@ -63,7 +48,7 @@ static DataFlowGraph ProgramDescToDFG( class DFG_Tester : public ::testing::Test { protected: void SetUp() override { - auto desc = LoadProgramDesc(FLAGS_inference_model_dir); + auto desc = LoadProgramDesc(FLAGS_inference_model_dir + "/__model__"); argument.origin_program_desc.reset(new framework::proto::ProgramDesc(desc)); } diff --git a/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc b/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc index c0891e9c28..45b5a7638b 100644 --- a/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc +++ 
b/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc @@ -90,6 +90,18 @@ class TensorRTSubgraphPredictor : public NativePaddlePredictor { void OptimizeInferenceProgram() { // Analyze inference_program Argument argument; + if (!config_.model_dir.empty()) { + argument.fluid_model_dir.reset(new std::string(config_.model_dir)); + } else { + PADDLE_ENFORCE( + !config_.param_file.empty(), + "Either model_dir or (param_file, prog_file) should be set."); + PADDLE_ENFORCE(!config_.prog_file.empty()); + argument.fluid_model_program_path.reset( + new std::string(config_.prog_file)); + argument.fluid_model_param_path.reset( + new std::string(config_.param_file)); + } argument.origin_program_desc.reset( new ProgramDesc(*inference_program_->Proto())); Singleton::Global().Run(&argument);