From 9ac785be396bd21d3f152a299f5fa7cb5e268e08 Mon Sep 17 00:00:00 2001 From: chengduoZH Date: Thu, 7 Jun 2018 15:40:58 +0800 Subject: [PATCH 01/23] check graph's validation --- .../details/multi_devices_graph_builder.cc | 1 - .../framework/details/ssa_graph_builder.cc | 70 ++++++++++++++++++- .../framework/details/ssa_graph_builder.h | 3 + .../details/threaded_ssa_graph_executor.cc | 1 + 4 files changed, 73 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.cc b/paddle/fluid/framework/details/multi_devices_graph_builder.cc index 0c4d369e88..81d5b079b8 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_builder.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_builder.cc @@ -272,7 +272,6 @@ std::unique_ptr MultiDevSSAGraphBuilder::Build( * Only variables should be the leaves of graph. */ AddOutputToLeafOps(&result); - return std::unique_ptr(graph); } diff --git a/paddle/fluid/framework/details/ssa_graph_builder.cc b/paddle/fluid/framework/details/ssa_graph_builder.cc index 211113c797..d70f95a9f5 100644 --- a/paddle/fluid/framework/details/ssa_graph_builder.cc +++ b/paddle/fluid/framework/details/ssa_graph_builder.cc @@ -11,8 +11,8 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. - #include "paddle/fluid/framework/details/ssa_graph_builder.h" +#include namespace paddle { namespace framework { @@ -83,6 +83,74 @@ void SSAGraphBuilder::AddOutputToLeafOps(SSAGraph *graph) { op->AddOutput(dummy_leaf); } } + +std::unique_ptr SSAGraphBuilder::BuildAndCheck( + const ProgramDesc &program) final { + std::unique_ptr graph = Build(program); + PADDLE_ENFORCE(IsValidGraph(graph.get())); + return std::move(graph); +} + +bool SSAGraphBuilder::IsValidGraph(const SSAGraph *graph) const { + std::unordered_map pending_ops; + std::unordered_set pending_vars; + std::unordered_set ready_vars; + std::unordered_set ready_ops; + + auto insert_pending_var = [&](VarHandleBase *var) { + pending_vars.insert(var); + if (var->generated_op_ == nullptr) { + ready_vars.emplace(var); + } + }; + + for (auto &var_map : graph->vars_) { + for (auto &name_pair : var_map) { + for (auto &version_pair : name_pair.second) { + insert_pending_var(version_pair.get()); + } + } + } + + for (auto &var : graph->dep_vars_) { + insert_pending_var(var.get()); + } + + for (auto &op : graph->ops_) { + if (op->Inputs().empty()) { + ready_ops.insert(op.get()); + } else { + pending_ops.insert({op.get(), op.get()->NoDupInputSize()}); + } + } + + auto run_all_ops = [&](std::unordered_set &set) { + for (auto *op : set) { + for (auto out : op->Outputs()) { + ready_vars.emplace(out); + } + } + set.clear(); + }; + + while (!pending_vars.empty()) { + run_all_ops(ready_ops); + if (ready_vars.empty()) { + return false; + } + for (auto ready_var : ready_vars.) { + pending_vars.erase(ready_var); + for (auto *op : ready_var->pending_ops_) { + auto &deps = --pending_ops[op]; + if (deps == 0) { + ready_ops.insert(op); + } + } + } + ready_vars.clear(); + } + return true; +} } // namespace details } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/details/ssa_graph_builder.h b/paddle/fluid/framework/details/ssa_graph_builder.h index 5fc12a44b5..da9298ac8d 100644 --- a/paddle/fluid/framework/details/ssa_graph_builder.h +++ b/paddle/fluid/framework/details/ssa_graph_builder.h @@ -31,6 +31,8 @@ class SSAGraphBuilder { virtual ~SSAGraphBuilder() {} virtual std::unique_ptr Build(const ProgramDesc &program) const = 0; + std::unique_ptr BuildAndCheck(const ProgramDesc &program) final; + DISABLE_COPY_AND_ASSIGN(SSAGraphBuilder); protected: @@ -48,6 +50,7 @@ class SSAGraphBuilder { const platform::Place &place, size_t place_offset); + bool IsValidGraph(const SSAGraph *graph) const; // Add an output variable (each_var_name, place, place_offset) to op_handle, // which belongs to graph static void CreateOpOutput(SSAGraph *graph, OpHandleBase *op_handle, diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc index 496fadd04d..bcbf573626 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc @@ -185,6 +185,7 @@ void ThreadedSSAGraphExecutor::InsertPendingVar( ready_vars->Push(var); } } + void ThreadedSSAGraphExecutor::RunOp( BlockingQueue *ready_var_q, details::OpHandleBase *op) { auto op_run = [ready_var_q, op, this] { From 8291b916d6cf053db779598b01dd59191fb5a1df Mon Sep 17 00:00:00 2001 From: chengduoZH Date: Thu, 7 Jun 2018 16:24:23 +0800 Subject: [PATCH 02/23] replace graph_builder_factory with ssa_graph_builder_factory --- paddle/fluid/framework/CMakeLists.txt | 2 +- paddle/fluid/framework/details/CMakeLists.txt | 2 +- paddle/fluid/framework/details/multi_devices_graph_builder.cc | 1 + paddle/fluid/framework/details/ssa_graph_builder.cc | 4 ++-- paddle/fluid/framework/details/ssa_graph_builder.h | 2 +- ...{graph_builder_factory.cc => ssa_graph_builder_factory.cc} | 2 +- .../{graph_builder_factory.h => ssa_graph_builder_factory.h} | 0 paddle/fluid/framework/parallel_executor.cc | 4 ++-- 8 files changed, 9 insertions(+), 8 deletions(-) rename paddle/fluid/framework/details/{graph_builder_factory.cc => ssa_graph_builder_factory.cc} (96%) rename paddle/fluid/framework/details/{graph_builder_factory.h => ssa_graph_builder_factory.h} (100%) diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 627370cd2d..4271e4c1bb 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -87,7 +87,7 @@ cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method) -cc_library(parallel_executor SRCS parallel_executor.cc DEPS graph_builder_factory threaded_ssa_graph_executor scope_buffered_ssa_graph_executor) +cc_library(parallel_executor SRCS parallel_executor.cc DEPS ssa_graph_builder_factory threaded_ssa_graph_executor scope_buffered_ssa_graph_executor) cc_library(prune SRCS prune.cc DEPS framework_proto) cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context) diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index c106761f72..ced063a097 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -30,7 +30,7 @@ cc_library(multi_devices_graph_builder SRCS multi_devices_graph_builder.cc DEPS scale_loss_grad_op_handle rpc_op_handle ${multi_devices_graph_builder_deps} reduce_op_handle broadcast_op_handle) -cc_library(graph_builder_factory SRCS graph_builder_factory.cc DEPS multi_devices_graph_builder ssa_graph_printer) +cc_library(ssa_graph_builder_factory SRCS ssa_graph_builder_factory.cc DEPS multi_devices_graph_builder ssa_graph_printer) cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ssa_graph framework_proto) cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.cc b/paddle/fluid/framework/details/multi_devices_graph_builder.cc index 81d5b079b8..0c4d369e88 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_builder.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_builder.cc @@ -272,6 +272,7 @@ std::unique_ptr MultiDevSSAGraphBuilder::Build( * Only variables should be the leaves of graph. */ AddOutputToLeafOps(&result); + return std::unique_ptr(graph); } diff --git a/paddle/fluid/framework/details/ssa_graph_builder.cc b/paddle/fluid/framework/details/ssa_graph_builder.cc index d70f95a9f5..d24669a8f8 100644 --- a/paddle/fluid/framework/details/ssa_graph_builder.cc +++ b/paddle/fluid/framework/details/ssa_graph_builder.cc @@ -85,7 +85,7 @@ void SSAGraphBuilder::AddOutputToLeafOps(SSAGraph *graph) { } std::unique_ptr SSAGraphBuilder::BuildAndCheck( - const ProgramDesc &program) final { + const ProgramDesc &program) { std::unique_ptr graph = Build(program); PADDLE_ENFORCE(IsValidGraph(graph.get())); return std::move(graph); @@ -138,7 +138,7 @@ bool SSAGraphBuilder::IsValidGraph(const SSAGraph *graph) const { if (ready_vars.empty()) { return false; } - for (auto ready_var : ready_vars.) { + for (auto ready_var : ready_vars) { pending_vars.erase(ready_var); for (auto *op : ready_var->pending_ops_) { auto &deps = --pending_ops[op]; diff --git a/paddle/fluid/framework/details/ssa_graph_builder.h b/paddle/fluid/framework/details/ssa_graph_builder.h index da9298ac8d..e99a988407 100644 --- a/paddle/fluid/framework/details/ssa_graph_builder.h +++ b/paddle/fluid/framework/details/ssa_graph_builder.h @@ -31,7 +31,7 @@ class SSAGraphBuilder { virtual ~SSAGraphBuilder() {} virtual std::unique_ptr Build(const ProgramDesc &program) const = 0; - std::unique_ptr BuildAndCheck(const ProgramDesc &program) final; + std::unique_ptr BuildAndCheck(const ProgramDesc &program); DISABLE_COPY_AND_ASSIGN(SSAGraphBuilder); diff --git a/paddle/fluid/framework/details/graph_builder_factory.cc b/paddle/fluid/framework/details/ssa_graph_builder_factory.cc similarity index 96% rename from paddle/fluid/framework/details/graph_builder_factory.cc rename to paddle/fluid/framework/details/ssa_graph_builder_factory.cc index a04b9bb63c..b5e90d6b05 100644 --- a/paddle/fluid/framework/details/graph_builder_factory.cc +++ b/paddle/fluid/framework/details/ssa_graph_builder_factory.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/framework/details/graph_builder_factory.h" +#include "paddle/fluid/framework/details/ssa_graph_builder_factory.h" #include #include "paddle/fluid/framework/details/multi_devices_graph_builder.h" #include "paddle/fluid/framework/details/ssa_graph_printer.h" diff --git a/paddle/fluid/framework/details/graph_builder_factory.h b/paddle/fluid/framework/details/ssa_graph_builder_factory.h similarity index 100% rename from paddle/fluid/framework/details/graph_builder_factory.h rename to paddle/fluid/framework/details/ssa_graph_builder_factory.h diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index ce56f55e41..f1ab337070 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -22,8 +22,8 @@ limitations under the License. */ #include "paddle/fluid/platform/nccl_helper.h" #endif -#include "paddle/fluid/framework/details/graph_builder_factory.h" #include "paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h" +#include "paddle/fluid/framework/details/ssa_graph_builder_factory.h" #include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h" #include "paddle/fluid/platform/profiler.h" @@ -114,7 +114,7 @@ ParallelExecutor::ParallelExecutor( member_->executor_.reset(new details::ThreadedSSAGraphExecutor( exec_strategy, member_->local_scopes_, places, - builder_factory.Create()->Build(main_program))); + builder_factory.Create()->BuildAndCheck(main_program))); member_->executor_.reset(new details::ScopeBufferedSSAGraphExecutor( exec_strategy, member_->local_scopes_, std::move(var_infos), From 1076e85135d3eadd97324c6d06bf4a6a30852148 Mon Sep 17 00:00:00 2001 From: chengduoZH Date: Thu, 7 Jun 2018 17:01:41 +0800 Subject: [PATCH 03/23] refine logic --- paddle/fluid/framework/details/ssa_graph_builder.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/paddle/fluid/framework/details/ssa_graph_builder.cc b/paddle/fluid/framework/details/ssa_graph_builder.cc index d24669a8f8..c4ee088507 100644 --- a/paddle/fluid/framework/details/ssa_graph_builder.cc +++ b/paddle/fluid/framework/details/ssa_graph_builder.cc @@ -135,9 +135,11 @@ bool SSAGraphBuilder::IsValidGraph(const SSAGraph *graph) const { while (!pending_vars.empty()) { run_all_ops(ready_ops); + if (ready_vars.empty()) { return false; } + for (auto ready_var : ready_vars) { pending_vars.erase(ready_var); for (auto *op : ready_var->pending_ops_) { From 0c851cab2294356dd292b9b4458379d1bde4eadd Mon Sep 17 00:00:00 2001 From: chengduoZH Date: Fri, 8 Jun 2018 15:02:33 +0800 Subject: [PATCH 04/23] add SSA graph checker --- paddle/fluid/framework/details/CMakeLists.txt | 3 +- .../framework/details/ssa_graph_builder.cc | 70 --------------- .../framework/details/ssa_graph_builder.h | 3 - .../details/ssa_graph_builder_factory.cc | 3 + .../framework/details/ssa_graph_checker.cc | 87 +++++++++++++++++++ .../framework/details/ssa_graph_checker.h | 44 ++++++++++ paddle/fluid/framework/parallel_executor.cc | 2 +- 7 files changed, 137 insertions(+), 75 deletions(-) create mode 100644 paddle/fluid/framework/details/ssa_graph_checker.cc create mode 100644 paddle/fluid/framework/details/ssa_graph_checker.h diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index ced063a097..dbd118d338 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -8,6 +8,7 @@ cc_library(rpc_op_handle SRCS rpc_op_handle.cc DEPS framework_proto scope place cc_library(ssa_graph SRCS ssa_graph.cc DEPS var_handle op_handle_base) cc_library(ssa_graph_builder SRCS ssa_graph_builder.cc DEPS ssa_graph) cc_library(ssa_graph_printer SRCS ssa_graph_printer.cc DEPS ssa_graph_builder) +cc_library(ssa_graph_checker SRCS ssa_graph_checker.cc DEPS ssa_graph_builder) cc_library(variable_visitor SRCS variable_visitor.cc DEPS lod_tensor selected_rows) @@ -30,7 +31,7 @@ cc_library(multi_devices_graph_builder SRCS multi_devices_graph_builder.cc DEPS scale_loss_grad_op_handle rpc_op_handle ${multi_devices_graph_builder_deps} reduce_op_handle broadcast_op_handle) -cc_library(ssa_graph_builder_factory SRCS ssa_graph_builder_factory.cc DEPS multi_devices_graph_builder ssa_graph_printer) +cc_library(ssa_graph_builder_factory SRCS ssa_graph_builder_factory.cc DEPS multi_devices_graph_builder ssa_graph_printer ssa_graph_checker) cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ssa_graph framework_proto) cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope diff --git a/paddle/fluid/framework/details/ssa_graph_builder.cc b/paddle/fluid/framework/details/ssa_graph_builder.cc index c4ee088507..88a21f4887 100644 --- a/paddle/fluid/framework/details/ssa_graph_builder.cc +++ b/paddle/fluid/framework/details/ssa_graph_builder.cc @@ -83,76 +83,6 @@ void SSAGraphBuilder::AddOutputToLeafOps(SSAGraph *graph) { op->AddOutput(dummy_leaf); } } - -std::unique_ptr SSAGraphBuilder::BuildAndCheck( - const ProgramDesc &program) { - std::unique_ptr graph = Build(program); - PADDLE_ENFORCE(IsValidGraph(graph.get())); - return std::move(graph); -} - -bool SSAGraphBuilder::IsValidGraph(const SSAGraph *graph) const { - std::unordered_map pending_ops; - std::unordered_set pending_vars; - std::unordered_set ready_vars; - std::unordered_set ready_ops; - - auto insert_pending_var = [&](VarHandleBase *var) { - pending_vars.insert(var); - if (var->generated_op_ == nullptr) { - ready_vars.emplace(var); - } - }; - - for (auto &var_map : graph->vars_) { - for (auto &name_pair : var_map) { - for (auto &version_pair : name_pair.second) { - insert_pending_var(version_pair.get()); - } - } - } - - for (auto &var : graph->dep_vars_) { - insert_pending_var(var.get()); - } - - for (auto &op : graph->ops_) { - if (op->Inputs().empty()) { - ready_ops.insert(op.get()); - } else { - pending_ops.insert({op.get(), op.get()->NoDupInputSize()}); - } - } - - auto run_all_ops = [&](std::unordered_set &set) { - for (auto *op : set) { - for (auto out : op->Outputs()) { - ready_vars.emplace(out); - } - } - set.clear(); - }; - - while (!pending_vars.empty()) { - run_all_ops(ready_ops); - - if (ready_vars.empty()) { - return false; - } - - for (auto ready_var : ready_vars) { - pending_vars.erase(ready_var); - for (auto *op : ready_var->pending_ops_) { - auto &deps = --pending_ops[op]; - if (deps == 0) { - ready_ops.insert(op); - } - } - } - ready_vars.clear(); - } - return true; -} } // namespace details } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/details/ssa_graph_builder.h b/paddle/fluid/framework/details/ssa_graph_builder.h index e99a988407..5fc12a44b5 100644 --- a/paddle/fluid/framework/details/ssa_graph_builder.h +++ b/paddle/fluid/framework/details/ssa_graph_builder.h @@ -31,8 +31,6 @@ class SSAGraphBuilder { virtual ~SSAGraphBuilder() {} virtual std::unique_ptr Build(const ProgramDesc &program) const = 0; - std::unique_ptr BuildAndCheck(const ProgramDesc &program); - DISABLE_COPY_AND_ASSIGN(SSAGraphBuilder); protected: @@ -50,7 +48,6 @@ class SSAGraphBuilder { const platform::Place &place, size_t place_offset); - bool IsValidGraph(const SSAGraph *graph) const; // Add an output variable (each_var_name, place, place_offset) to op_handle, // which belongs to graph static void CreateOpOutput(SSAGraph *graph, OpHandleBase *op_handle, diff --git a/paddle/fluid/framework/details/ssa_graph_builder_factory.cc b/paddle/fluid/framework/details/ssa_graph_builder_factory.cc index b5e90d6b05..b4b49d3de6 100644 --- a/paddle/fluid/framework/details/ssa_graph_builder_factory.cc +++ b/paddle/fluid/framework/details/ssa_graph_builder_factory.cc @@ -15,6 +15,7 @@ #include "paddle/fluid/framework/details/ssa_graph_builder_factory.h" #include #include "paddle/fluid/framework/details/multi_devices_graph_builder.h" +#include "paddle/fluid/framework/details/ssa_graph_checker.h" #include "paddle/fluid/framework/details/ssa_graph_printer.h" namespace paddle { @@ -40,6 +41,8 @@ std::unique_ptr SSAGraphBuilderFactory::Create() { res.reset(new SSAGraghBuilderWithPrinter( std::move(fout), std::move(graphviz_printer), std::move(res))); } + res.reset(new SSAGraghBuilderWithChecker(std::move(res))); + return res; } } // namespace details diff --git a/paddle/fluid/framework/details/ssa_graph_checker.cc b/paddle/fluid/framework/details/ssa_graph_checker.cc new file mode 100644 index 0000000000..da5428946e --- /dev/null +++ b/paddle/fluid/framework/details/ssa_graph_checker.cc @@ -0,0 +1,87 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/details/ssa_graph.h" +#include +#include "paddle/fluid/framework/details/ssa_graph_checker.h" + +namespace paddle { +namespace framework { +namespace details { + +bool SSAGraghBuilderWithChecker::IsValidGraph(const SSAGraph *graph) const { + std::unordered_map pending_ops; + std::unordered_set pending_vars; + std::unordered_set ready_vars; + std::unordered_set ready_ops; + + auto insert_pending_var = [&](VarHandleBase *var) { + pending_vars.insert(var); + if (var->generated_op_ == nullptr) { + ready_vars.emplace(var); + } + }; + + for (auto &var_map : graph->vars_) { + for (auto &name_pair : var_map) { + for (auto &version_pair : name_pair.second) { + insert_pending_var(version_pair.get()); + } + } + } + + for (auto &var : graph->dep_vars_) { + insert_pending_var(var.get()); + } + + for (auto &op : graph->ops_) { + if (op->Inputs().empty()) { + ready_ops.insert(op.get()); + } else { + pending_ops.insert({op.get(), op.get()->NoDupInputSize()}); + } + } + + auto run_all_ops = [&](std::unordered_set &set) { + for (auto *op : set) { + for (auto out : op->Outputs()) { + ready_vars.emplace(out); + } + } + set.clear(); + }; + + while (!pending_vars.empty()) { + run_all_ops(ready_ops); + + if (ready_vars.empty()) { + return false; + } + + for (auto ready_var : ready_vars) { + pending_vars.erase(ready_var); + for (auto *op : ready_var->pending_ops_) { + auto &deps = --pending_ops[op]; + if (deps == 0) { + ready_ops.insert(op); + } + } + } + ready_vars.clear(); + } + return true; +} +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/ssa_graph_checker.h b/paddle/fluid/framework/details/ssa_graph_checker.h new file mode 100644 index 0000000000..542c4a1728 --- /dev/null +++ b/paddle/fluid/framework/details/ssa_graph_checker.h @@ -0,0 +1,44 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/framework/details/ssa_graph_builder.h" + +namespace paddle { +namespace framework { +namespace details { +class SSAGraph; + +class SSAGraghBuilderWithChecker : public SSAGraphBuilder { + public: + explicit SSAGraghBuilderWithChecker( + std::unique_ptr&& builder) + : builder_(std::move(builder)) {} + + std::unique_ptr Build(const ProgramDesc& program) const override { + auto graph = builder_->Build(program); + PADDLE_ENFORCE(IsValidGraph(graph.get())); + return graph; + } + + bool IsValidGraph(const SSAGraph* graph) const; + + private: + std::unique_ptr builder_; +}; + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index f1ab337070..5d95dc214a 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -114,7 +114,7 @@ ParallelExecutor::ParallelExecutor( member_->executor_.reset(new details::ThreadedSSAGraphExecutor( exec_strategy, member_->local_scopes_, places, - builder_factory.Create()->BuildAndCheck(main_program))); + builder_factory.Create()->Build(main_program))); member_->executor_.reset(new details::ScopeBufferedSSAGraphExecutor( exec_strategy, member_->local_scopes_, std::move(var_infos), From b000e0de5d382ff9f1b3c85b03c8a501f563f89d Mon Sep 17 00:00:00 2001 From: yuyang18 Date: Fri, 8 Jun 2018 15:47:17 +0800 Subject: [PATCH 05/23] Simplize API Reference Documentation --- paddle/fluid/operators/batch_size_like.h | 14 +++--- .../fill_constant_batch_size_like_op.cc | 14 +++--- paddle/fluid/operators/load_op.cc | 15 ++---- paddle/fluid/operators/max_sequence_len_op.cc | 13 +++-- python/paddle/fluid/layers/control_flow.py | 28 +++++------ python/paddle/fluid/layers/io.py | 29 ++++++++++- .../fluid/layers/layer_function_generator.py | 1 - python/paddle/fluid/layers/tensor.py | 50 +++++++------------ 8 files changed, 84 insertions(+), 80 deletions(-) diff --git a/paddle/fluid/operators/batch_size_like.h b/paddle/fluid/operators/batch_size_like.h index 483c9f8c21..fc15d56891 100644 --- a/paddle/fluid/operators/batch_size_like.h +++ b/paddle/fluid/operators/batch_size_like.h @@ -54,18 +54,18 @@ class BatchSizeLikeOp : public framework::OperatorWithKernel { class BatchSizeLikeOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() final { - AddInput("Input", - "(Tensor) Tensor " - "whose input_dim_idx'th dimension specifies the batch_size"); + AddInput( + "Input", + "Tensor whose input_dim_idx'th dimension specifies the batch_size"); AddOutput("Out", - "(Tensor) Tensor of specified shape will be filled " + "Tensor of specified shape will be filled " "with the specified value"); - AddAttr>("shape", "(vector) The shape of the output"); + AddAttr>("shape", "The shape of the output"); AddAttr("input_dim_idx", - "(int, default 0) The index of input's batch size dimension") + "default 0. The index of input's batch size dimension") .SetDefault(0); AddAttr("output_dim_idx", - "(int, default 0) The index of output's batch size dimension") + "default 0. The index of output's batch size dimension") .SetDefault(0); Apply(); } diff --git a/paddle/fluid/operators/fill_constant_batch_size_like_op.cc b/paddle/fluid/operators/fill_constant_batch_size_like_op.cc index 1ae78675a0..453a1b32a0 100644 --- a/paddle/fluid/operators/fill_constant_batch_size_like_op.cc +++ b/paddle/fluid/operators/fill_constant_batch_size_like_op.cc @@ -32,16 +32,16 @@ class FillConstantBatchSizeLikeOp : public BatchSizeLikeOp { class FillConstantBatchSizeLikeOpMaker : public BatchSizeLikeOpMaker { protected: void Apply() override { - AddAttr("dtype", - "(int, default 5 (FP32)) " - "Output data type") + AddAttr( + "dtype", + "It could be numpy.dtype. Output data type. Default is float32") .SetDefault(framework::proto::VarType::FP32); - AddAttr("value", "(float, default 0) The value to be filled") + AddAttr("value", "default 0. The value to be filled") .SetDefault(0.0f); AddComment(R"DOC( -FillConstantBatchSizeLike Operator. - -Fill up a variable with specified constant value. +This function creates a tensor of specified *shape*, *dtype* and batch size, +and initializes this with a constant supplied in *value*. The batch size is +obtained from the `input` tensor. )DOC"); } diff --git a/paddle/fluid/operators/load_op.cc b/paddle/fluid/operators/load_op.cc index 93f45cff8a..8f4b504927 100644 --- a/paddle/fluid/operators/load_op.cc +++ b/paddle/fluid/operators/load_op.cc @@ -74,25 +74,18 @@ class LoadOp : public framework::OperatorBase { class LoadOpProtoMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { - AddOutput("Out", "(Tensor) The tensor need to be loaded"); + AddOutput("Out", "The tensor need to be loaded"); AddAttr( "load_as_fp16", - "(boolean, default false)" "If true, the tensor will be first loaded and then " "converted to float16 data type. Otherwise, the tensor will be " - "directly loaded without data type conversion.") + "directly loaded without data type conversion. Default is false.") .SetDefault(false); AddAttr("file_path", - "(string) " - "Variable will be loaded from \"file_path\".") + R"(Variable will be loaded from "file_path")") .AddCustomChecker( [](const std::string &path) { return !path.empty(); }); - AddComment(R"DOC( -Load Operator. - -Load operator will load a tensor variable from disk file. - -)DOC"); + AddComment("Load operator will load a tensor variable from disk file."); } }; } // namespace operators diff --git a/paddle/fluid/operators/max_sequence_len_op.cc b/paddle/fluid/operators/max_sequence_len_op.cc index 8e508b68ee..b1e69f375d 100644 --- a/paddle/fluid/operators/max_sequence_len_op.cc +++ b/paddle/fluid/operators/max_sequence_len_op.cc @@ -42,10 +42,15 @@ class MaxSeqenceLenOp : public framework::OperatorBase { class MaxSeqenceLenOpProtoMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { - AddInput("RankTable", "The lod_rank_table."); - AddOutput("Out", "The max sequence length."); - AddComment( - R"DOC(Calculate the max sequence length through lod_rank_table.)DOC"); + AddInput("RankTable", "Input variable which is a LoDRankTable object"); + AddOutput("Out", "The max sequence length"); + AddComment(R"DOC( + Given a LoDRankTable object, this layer returns the max length of + a batch of sequences. In fact, a LoDRankTable object contains a list of + tuples() and the list is already sorted by + sequence length in descending order, so the operator just returns the + sequence length of the first tuple element +)DOC"); } }; diff --git a/python/paddle/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py index d1ea9f1485..b32248ad3d 100644 --- a/python/paddle/fluid/layers/control_flow.py +++ b/python/paddle/fluid/layers/control_flow.py @@ -13,7 +13,7 @@ # limitations under the License. import contextlib -from layer_function_generator import autodoc +from layer_function_generator import autodoc, templatedoc from tensor import assign, fill_constant from .. import core from ..framework import Program, Variable, Operator @@ -721,26 +721,22 @@ def lod_rank_table(x, level=0): return table +@templatedoc() def max_sequence_len(rank_table): - """Max Sequence Len Operator. Given a LoDRankTable object, this layer - returns the max length of a batch of sequences. In fact, a LoDRankTable - object contains a list of tuples() and - the list is already sorted by sequence length in descending order, so the - operator just returns the sequence length of the first tuple element. + """ + ${comment} + + >>> import paddle.fluid as fluid + >>> x = fluid.layers.data(name='x', shape=[10], dtype='float32', + >>> lod_level=1) + >>> rank_table = layers.lod_rank_table(x=x, level=0) + >>> max_seq_len = layers.max_sequence_len(rank_table) Args: - rank_table (Variable): Input variable which is a LoDRankTable object. + rank_table(${rank_table_type}): ${rank_table_comment}. Returns: - Variable: The max length of sequence. - - Examples: - .. code-block:: python - - x = fluid.layers.data(name='x', shape=[10], - dtype='float32', lod_level=1) - rank_table = layers.lod_rank_table(x=x, level=0) - max_seq_len = layers.max_sequence_len(rank_table) + (${out_type}): ${out_comment} """ helper = LayerHelper("max_seqence_len", **locals()) res = helper.create_tmp_variable(dtype="int64") diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py index a56f3ea9db..9de88e2c32 100644 --- a/python/paddle/fluid/layers/io.py +++ b/python/paddle/fluid/layers/io.py @@ -19,11 +19,12 @@ from ..unique_name import generate as unique_name from control_flow import BlockGuard from ..layer_helper import LayerHelper from ..executor import global_scope +from layer_function_generator import generate_layer_fn, templatedoc __all__ = [ 'data', 'BlockGuardServ', 'ListenAndServ', 'Send', 'open_recordio_file', 'open_files', 'read_file', 'shuffle', 'batch', 'double_buffer', - 'random_data_generator', 'Preprocessor' + 'random_data_generator', 'Preprocessor', 'load' ] @@ -662,3 +663,29 @@ class Preprocessor(object): "sink_var_names": self.sink_var_names }) return monkey_patch_reader_methods(self.reader) + + +@templatedoc() +def load(out, file_path, load_as_fp16=None): + """ + ${comment} + + >>> import paddle.fluid as fluid + >>> tmp_tensor = fluid.layers.create_tensor(dtype='float32') + >>> fluid.layers.load(tmp_tensor, "./tmp_tensor.bin") + + Args: + out(${out_type}): ${out_comment}. + + file_path(${file_path_type}): ${file_path_comment}. + + load_as_fp16(${load_as_fp16_type}): ${load_as_fp16_comment}. + + Returns: + None + """ + helper = LayerHelper("load", **locals()) + attrs = {"file_path": file_path} + if load_as_fp16 is not None: + attrs['load_as_fp16'] = load_as_fp16 + helper.append_op(type="load", inputs={}, output={"Out": out}, args=attrs) diff --git a/python/paddle/fluid/layers/layer_function_generator.py b/python/paddle/fluid/layers/layer_function_generator.py index 904413cc11..e6a7e7c3aa 100644 --- a/python/paddle/fluid/layers/layer_function_generator.py +++ b/python/paddle/fluid/layers/layer_function_generator.py @@ -263,7 +263,6 @@ def templatedoc(): output_name = _convert_(each_opt.name) args["{0}_comment".format(output_name)] = each_opt.comment args["{0}_type".format(output_name)] = "Variable" - func.__doc__ = tmpl.substitute(args) return func diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py index 75d3bf8797..601b06cdd8 100644 --- a/python/paddle/fluid/layers/tensor.py +++ b/python/paddle/fluid/layers/tensor.py @@ -18,6 +18,7 @@ from ..framework import convert_np_dtype_to_dtype_ from ..framework import Variable from ..initializer import Constant, force_init_on_cpu from ..core import VarDesc +from layer_function_generator import templatedoc import numpy __all__ = [ @@ -266,6 +267,7 @@ def fill_constant(shape, dtype, value, force_cpu=False, out=None): return out +@templatedoc() def fill_constant_batch_size_like(input, shape, dtype, @@ -273,30 +275,28 @@ def fill_constant_batch_size_like(input, input_dim_idx=0, output_dim_idx=0): """ - **fill_constant_batch_size_like** - - This function creates a tensor of specified *shape*, *dtype* and batch size, - and initializes this with a constant supplied in *value*. The batch size is - obtained from the `input` tensor. + ${comment} It also sets *stop_gradient* to True. + >>> data = fluid.layers.fill_constant_batch_size_like( + >>> input=like, shape=[1], value=0, dtype='int64') + Args: - input(Variable): Tensor whose dimensions will be used to get batch size - shape(tuple|list|None): Shape of output tensor - dtype(np.dtype|core.VarDesc.VarType|str): Data type of output tensor - value(float): Constant value to initialize the output tensor - input_dim_idx(int): Index of input's batch size dimension - output_dim_idx(int): Index of output's batch size dimension + input(${input_type}): ${input_comment}. - Returns: - Variable: The tensor variable storing the output + shape(${shape_type}): ${shape_comment}. - Examples: - .. code-block:: python + dtype(${dtype_type}): ${dtype_comment}. + + value(${value_type}): ${value_comment}. - data = fluid.layers.fill_constant_batch_size_like( - input=like, shape=[1], value=0, dtype='int64') + input_dim_idx(${input_dim_idx_type}): ${input_dim_idx_comment}. + + output_dim_idx(${output_dim_idx_type}): ${output_dim_idx_comment}. + + Returns: + ${out_comment} """ helper = LayerHelper("fill_constant_batch_size_like", **locals()) out = helper.create_tmp_variable(dtype=dtype) @@ -437,22 +437,6 @@ def save_combine(x, file_path, overwrite=True): "overwrite": overwrite}) -def load(out, file_path): - """ - Loads a variable from a given file. - - Args: - out(variable): The variable to be read from the disk file. - file_path(str): The path of the disk file. - """ - helper = LayerHelper("load", **locals()) - helper.append_op( - type="load", - inputs={}, - output={"Out": out}, - args={"file_path": file_path}) - - def load_combine(out, file_path): """ Loads a list of vairables from a single file. From 439a265760e7421cec2c7ca3ea1b9e6a3c24b673 Mon Sep 17 00:00:00 2001 From: yuyang18 Date: Fri, 8 Jun 2018 16:01:35 +0800 Subject: [PATCH 06/23] Better trim dot --- python/paddle/fluid/layers/control_flow.py | 2 +- .../fluid/layers/layer_function_generator.py | 14 ++++++++++---- python/paddle/fluid/layers/tensor.py | 2 +- 3 files changed, 12 insertions(+), 6 deletions(-) diff --git a/python/paddle/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py index b32248ad3d..51fad6e68c 100644 --- a/python/paddle/fluid/layers/control_flow.py +++ b/python/paddle/fluid/layers/control_flow.py @@ -736,7 +736,7 @@ def max_sequence_len(rank_table): rank_table(${rank_table_type}): ${rank_table_comment}. Returns: - (${out_type}): ${out_comment} + (${out_type}): ${out_comment}. """ helper = LayerHelper("max_seqence_len", **locals()) res = helper.create_tmp_variable(dtype="int64") diff --git a/python/paddle/fluid/layers/layer_function_generator.py b/python/paddle/fluid/layers/layer_function_generator.py index e6a7e7c3aa..7dc4c214be 100644 --- a/python/paddle/fluid/layers/layer_function_generator.py +++ b/python/paddle/fluid/layers/layer_function_generator.py @@ -238,6 +238,9 @@ def templatedoc(): Decorated function. """ + def trim_ending_dot(msg): + return msg.rstrip('.') + def __impl__(func): op_proto = OpProtoHolder.instance().get_op_proto(func.__name__) tmpl = string.Template(func.__doc__) @@ -249,19 +252,22 @@ def templatedoc(): comment += line comment += "\n" - args = {"comment": comment} + args = {"comment": trim_ending_dot(comment)} for each_input in op_proto.inputs: input_name = _convert_(each_input.name) - args["{0}_comment".format(input_name)] = each_input.comment + args["{0}_comment".format(input_name)] = trim_ending_dot( + each_input.comment) args["{0}_type".format(input_name)] = "Variable" for each_attr in op_proto.attrs: input_name = _convert_(each_attr.name) - args["{0}_comment".format(input_name)] = each_attr.comment + args["{0}_comment".format(input_name)] = trim_ending_dot( + each_attr.comment) args["{0}_type".format(input_name)] = _type_to_str_(each_attr.type) for each_opt in op_proto.outputs: output_name = _convert_(each_opt.name) - args["{0}_comment".format(output_name)] = each_opt.comment + args["{0}_comment".format(output_name)] = trim_ending_dot( + each_opt.comment) args["{0}_type".format(output_name)] = "Variable" func.__doc__ = tmpl.substitute(args) return func diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py index 601b06cdd8..66db6fe13f 100644 --- a/python/paddle/fluid/layers/tensor.py +++ b/python/paddle/fluid/layers/tensor.py @@ -296,7 +296,7 @@ def fill_constant_batch_size_like(input, output_dim_idx(${output_dim_idx_type}): ${output_dim_idx_comment}. Returns: - ${out_comment} + ${out_comment}. """ helper = LayerHelper("fill_constant_batch_size_like", **locals()) out = helper.create_tmp_variable(dtype=dtype) From 0d29e6592479122288a38a90c751efa6c2afd3ab Mon Sep 17 00:00:00 2001 From: yuyang18 Date: Fri, 8 Jun 2018 16:40:09 +0800 Subject: [PATCH 07/23] Add resize_bilinear --- paddle/fluid/operators/bilinear_interp_op.cc | 11 ++++----- .../fluid/layers/layer_function_generator.py | 15 ++++++++---- python/paddle/fluid/layers/nn.py | 23 ++++++++++++------- 3 files changed, 30 insertions(+), 19 deletions(-) diff --git a/paddle/fluid/operators/bilinear_interp_op.cc b/paddle/fluid/operators/bilinear_interp_op.cc index 3321adf274..2572e813d6 100644 --- a/paddle/fluid/operators/bilinear_interp_op.cc +++ b/paddle/fluid/operators/bilinear_interp_op.cc @@ -56,17 +56,16 @@ class BilinearInterpOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { AddInput("X", - "(Tensor) The input tensor of bilinear interpolation, " + "The input tensor of bilinear interpolation, " "This is a 4-D tensor with shape of (N x C x h x w)"); AddInput("OutSize", - "(Tensor) This is a 1-D tensor with two number. " + "This is a 1-D tensor with two number. " "The first number is height and the second number is width.") .AsDispensable(); - AddOutput("Out", - "(Tensor) The dimension of output is (N x C x out_h x out_w]"); + AddOutput("Out", "The dimension of output is (N x C x out_h x out_w)"); - AddAttr("out_h", "(int) output height of bilinear interpolation op."); - AddAttr("out_w", "(int) output width of bilinear interpolation op."); + AddAttr("out_h", "output height of bilinear interpolation op."); + AddAttr("out_w", "output width of bilinear interpolation op."); AddComment(R"DOC( Bilinear interpolation is an extension of linear interpolation for interpolating functions of two variables (e.g. H-direction and diff --git a/python/paddle/fluid/layers/layer_function_generator.py b/python/paddle/fluid/layers/layer_function_generator.py index 7dc4c214be..79aa9ff604 100644 --- a/python/paddle/fluid/layers/layer_function_generator.py +++ b/python/paddle/fluid/layers/layer_function_generator.py @@ -224,7 +224,7 @@ def autodoc(comment=""): return __impl__ -def templatedoc(): +def templatedoc(op_type=None): """ Decorator of layer function. It will use the docstring from the layer function as the template. The template arguments are: @@ -242,15 +242,20 @@ def templatedoc(): return msg.rstrip('.') def __impl__(func): - op_proto = OpProtoHolder.instance().get_op_proto(func.__name__) + if op_type is None: + op_type_name = func.__name__ + else: + op_type_name = op_type + op_proto = OpProtoHolder.instance().get_op_proto(op_type_name) tmpl = string.Template(func.__doc__) comment_lines = op_proto.comment.split("\n") comment = "" for line in comment_lines: - line = line.lstrip() - comment += line - comment += "\n" + line = line.strip() + if len(line) != 0: + comment += line + comment += " " args = {"comment": trim_ending_dot(comment)} for each_input in op_proto.inputs: diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index ddaeb415af..b9ea74fc81 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -4037,18 +4037,25 @@ def image_resize(input, return out +@templatedoc(op_type="bilinear_interp") def resize_bilinear(input, out_shape=None, scale=None, name=None): """ - This is an alias of layer 'image_resize' with bilinear interpolation. + ${comment} + + Args: + input(${x_type}): ${x_comment}. + + out_shape(${out_size_type}): ${out_size_comment}. - The mathematical meaning of resize bilinear layer is - Bilinear interpolation. - Bilinear interpolation is an extension of linear interpolation for - interpolating functions of two variables (e.g. H-direction and - W-direction in this layer) on a rectilinear 2D grid. + scale(float|None): The multiplier for the input height or width. At + least one of out_shape or scale must be set. And out_shape has + a higher priority than scale. Default: None. + + name(str|None): The output variable name. + + Returns: - For details, please refer to Wikipedia: - https://en.wikipedia.org/wiki/Bilinear_interpolation + ${out_comment}. """ return image_resize(input, out_shape, scale, name, 'BILINEAR') From 568c4e5ec43083c81335c1b8094472b844c704d0 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Fri, 8 Jun 2018 09:30:17 +0000 Subject: [PATCH 08/23] recommit using account sneaxiy --- paddle/fluid/operators/arg_max_op.cc | 40 +++++ paddle/fluid/operators/arg_max_op.cu | 34 ++++ paddle/fluid/operators/arg_max_op.h | 16 ++ paddle/fluid/operators/arg_min_max_op_base.h | 157 ++++++++++++++++++ paddle/fluid/operators/arg_min_op.cc | 40 +++++ paddle/fluid/operators/arg_min_op.cu | 34 ++++ paddle/fluid/operators/arg_min_op.h | 16 ++ python/paddle/fluid/layers/tensor.py | 64 +++++++ .../tests/unittests/test_arg_min_max_op.py | 82 +++++++++ 9 files changed, 483 insertions(+) create mode 100644 paddle/fluid/operators/arg_max_op.cc create mode 100644 paddle/fluid/operators/arg_max_op.cu create mode 100644 paddle/fluid/operators/arg_max_op.h create mode 100644 paddle/fluid/operators/arg_min_max_op_base.h create mode 100644 paddle/fluid/operators/arg_min_op.cc create mode 100644 paddle/fluid/operators/arg_min_op.cu create mode 100644 paddle/fluid/operators/arg_min_op.h create mode 100644 python/paddle/fluid/tests/unittests/test_arg_min_max_op.py diff --git a/paddle/fluid/operators/arg_max_op.cc b/paddle/fluid/operators/arg_max_op.cc new file mode 100644 index 0000000000..5603607357 --- /dev/null +++ b/paddle/fluid/operators/arg_max_op.cc @@ -0,0 +1,40 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/arg_max_op.h" +/* +REGISTER_ARG_MINMAX_OP_WITHOUT_GRADIENT(arg_max, ArgMax); + +REGISTER_ARG_MINMAX_KERNEL(arg_max, ArgMax, CPU); +*/ + +REGISTER_OPERATOR(arg_max, paddle::operators::ArgMaxOp, + paddle::operators::ArgMaxOpMaker, + paddle::framework::EmptyGradOpMaker); + +REGISTER_OP_CPU_KERNEL( + arg_max, paddle::operators::ArgMaxKernel, + paddle::operators::ArgMaxKernel, + paddle::operators::ArgMaxKernel, + paddle::operators::ArgMaxKernel, + paddle::operators::ArgMaxKernel, + paddle::operators::ArgMaxKernel, + paddle::operators::ArgMaxKernel); diff --git a/paddle/fluid/operators/arg_max_op.cu b/paddle/fluid/operators/arg_max_op.cu new file mode 100644 index 0000000000..8f57c63beb --- /dev/null +++ b/paddle/fluid/operators/arg_max_op.cu @@ -0,0 +1,34 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/arg_max_op.h" + +// REGISTER_ARG_MINMAX_KERNEL(arg_max, ArgMax, CUDA); + +REGISTER_OP_CUDA_KERNEL( + arg_max, + paddle::operators::ArgMaxKernel, + paddle::operators::ArgMaxKernel, + paddle::operators::ArgMaxKernel, + paddle::operators::ArgMaxKernel, + paddle::operators::ArgMaxKernel, + paddle::operators::ArgMaxKernel, + paddle::operators::ArgMaxKernel); diff --git a/paddle/fluid/operators/arg_max_op.h b/paddle/fluid/operators/arg_max_op.h new file mode 100644 index 0000000000..d232a85699 --- /dev/null +++ b/paddle/fluid/operators/arg_max_op.h @@ -0,0 +1,16 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/operators/arg_min_max_op_base.h" diff --git a/paddle/fluid/operators/arg_min_max_op_base.h b/paddle/fluid/operators/arg_min_max_op_base.h new file mode 100644 index 0000000000..8c20461a34 --- /dev/null +++ b/paddle/fluid/operators/arg_min_max_op_base.h @@ -0,0 +1,157 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include +#include "paddle/fluid/framework/ddim.h" +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/string/printf.h" + +namespace paddle { +namespace operators { + +enum ArgMinMaxType { kArgMin, kArgMax }; + +template +struct ArgMinMaxFunctor {}; + +#define DECLARE_ARG_MIN_MAX_FUNCTOR(eigen_op_type, enum_argminmax_value) \ + template \ + struct ArgMinMaxFunctor { \ + void operator()(const DeviceContext& ctx, const framework::LoDTensor& in, \ + framework::LoDTensor& out, int64_t axis) { \ + auto in_eigen = framework::EigenTensor::From(in); \ + auto out_eigen = framework::EigenTensor::From(out); \ + out_eigen.device(*(ctx.eigen_device())) = \ + in_eigen.eigen_op_type(axis).template cast(); \ + } \ + } + +DECLARE_ARG_MIN_MAX_FUNCTOR(argmin, ArgMinMaxType::kArgMin); +DECLARE_ARG_MIN_MAX_FUNCTOR(argmax, ArgMinMaxType::kArgMax); + +template +class ArgMinMaxKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto& x = *(ctx.Input("X")); + auto& out = *(ctx.Output("Out")); + out.mutable_data(ctx.GetPlace()); + auto axis = ctx.Attr("axis"); + auto& dev_ctx = ctx.template device_context(); + +#define CALL_ARG_MINMAX_FUNCTOR(rank) \ + ArgMinMaxFunctor \ + functor##rank; \ + functor##rank(dev_ctx, x, out, axis) + + switch (x.dims().size()) { + case 1: + CALL_ARG_MINMAX_FUNCTOR(1); + break; + case 2: + CALL_ARG_MINMAX_FUNCTOR(2); + break; + case 3: + CALL_ARG_MINMAX_FUNCTOR(3); + break; + case 4: + CALL_ARG_MINMAX_FUNCTOR(4); + break; + case 5: + CALL_ARG_MINMAX_FUNCTOR(5); + break; + case 6: + CALL_ARG_MINMAX_FUNCTOR(6); + break; + default: + PADDLE_THROW( + "%s operator doesn't supports tensors whose ranks are greater " + "than 6.", + (EnumArgMinMaxValue == kArgMin ? "argmin" : "argmax")); + break; + } + } +}; + +template +using ArgMinKernel = + ArgMinMaxKernel; + +template +using ArgMaxKernel = + ArgMinMaxKernel; + +typedef class BaseArgMinMaxOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null"); + PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should not be null"); + const auto& x_dims = ctx->GetInputDim("X"); + int64_t axis = ctx->Attrs().Get("axis"); + PADDLE_ENFORCE(axis >= -x_dims.size() && axis < x_dims.size(), + "'axis' must be inside [-Rank(X), Rank(X))"); + + auto x_rank = x_dims.size(); + if (axis < 0) axis += x_rank; + + std::vector vec; + for (int64_t i = 0; i < axis; i++) vec.push_back(x_dims[i]); + for (int64_t i = axis + 1; i < x_rank; i++) vec.push_back(x_dims[i]); + ctx->SetOutputDim("Out", framework::make_ddim(vec)); + } +} ArgMinOp, ArgMaxOp; + +class BaseArgMinMaxOpMaker : public framework::OpProtoAndCheckerMaker { + protected: + virtual const char* OpName() const = 0; + virtual const char* Name() const = 0; + + public: + void Make() override { + AddInput("X", "Input tensor."); + AddOutput("Out", "Output tensor."); + AddAttr("axis", "The axis in which to compute the arg indics."); + AddComment(::paddle::string::Sprintf(R"DOC( + %s Operator. + + Computes the indices of the %s elements of the input tensor's element along the provided axis. +)DOC", + OpName(), Name())); + } +}; + +class ArgMinOpMaker : public BaseArgMinMaxOpMaker { + protected: + const char* OpName() const override { return "ArgMin"; } + const char* Name() const override { return "min"; } +}; + +class ArgMaxOpMaker : public BaseArgMinMaxOpMaker { + protected: + const char* OpName() const override { return "ArgMax"; } + const char* Name() const override { return "max"; } +}; +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/arg_min_op.cc b/paddle/fluid/operators/arg_min_op.cc new file mode 100644 index 0000000000..fe17ed711b --- /dev/null +++ b/paddle/fluid/operators/arg_min_op.cc @@ -0,0 +1,40 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/arg_min_op.h" +/* +REGISTER_ARG_MINMAX_OP_WITHOUT_GRADIENT(arg_min, ArgMin); + +REGISTER_ARG_MINMAX_KERNEL(arg_min, ArgMin, CPU); +*/ + +REGISTER_OPERATOR(arg_min, paddle::operators::ArgMinOp, + paddle::operators::ArgMinOpMaker, + paddle::framework::EmptyGradOpMaker); + +REGISTER_OP_CPU_KERNEL( + arg_min, paddle::operators::ArgMinKernel, + paddle::operators::ArgMinKernel, + paddle::operators::ArgMinKernel, + paddle::operators::ArgMinKernel, + paddle::operators::ArgMinKernel, + paddle::operators::ArgMinKernel, + paddle::operators::ArgMinKernel); diff --git a/paddle/fluid/operators/arg_min_op.cu b/paddle/fluid/operators/arg_min_op.cu new file mode 100644 index 0000000000..da9262044a --- /dev/null +++ b/paddle/fluid/operators/arg_min_op.cu @@ -0,0 +1,34 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/arg_min_op.h" + +// REGISTER_ARG_MINMAX_KERNEL(arg_min, ArgMin, CUDA); + +REGISTER_OP_CUDA_KERNEL( + arg_min, + paddle::operators::ArgMinKernel, + paddle::operators::ArgMinKernel, + paddle::operators::ArgMinKernel, + paddle::operators::ArgMinKernel, + paddle::operators::ArgMinKernel, + paddle::operators::ArgMinKernel, + paddle::operators::ArgMinKernel); diff --git a/paddle/fluid/operators/arg_min_op.h b/paddle/fluid/operators/arg_min_op.h new file mode 100644 index 0000000000..d232a85699 --- /dev/null +++ b/paddle/fluid/operators/arg_min_op.h @@ -0,0 +1,16 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/operators/arg_min_max_op_base.h" diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py index 75d3bf8797..3dfacfff6a 100644 --- a/python/paddle/fluid/layers/tensor.py +++ b/python/paddle/fluid/layers/tensor.py @@ -30,6 +30,8 @@ __all__ = [ 'assign', 'fill_constant_batch_size_like', 'fill_constant', + 'argmin', + 'argmax', 'ones', 'zeros', ] @@ -315,6 +317,68 @@ def fill_constant_batch_size_like(input, return out +def argmin(x, axis=0): + """ + **argmin** + + This function computes the indices of the min elements + of the input tensor's element along the provided axis. + + Args: + x(Variable): The input to compute the indices of + the min elements. + axis(int): Axis to compute indices along. + + Returns: + Variable: The tensor variable storing the output + + Examples: + .. code-block:: python + + out = fluid.layers.argmin(x=in, axis=0) + out = fluid.layers.argmin(x=in, axis=-1) + """ + helper = LayerHelper("arg_min", **locals()) + out = helper.create_tmp_variable(VarDesc.VarType.INT64) + helper.append_op( + type='arg_min', + inputs={'X': x}, + outputs={'Out': [out]}, + attrs={'axis': axis}) + return out + + +def argmax(x, axis=0): + """ + **argmax** + + This function computes the indices of the max elements + of the input tensor's element along the provided axis. + + Args: + x(Variable): The input to compute the indices of + the max elements. + axis(int): Axis to compute indices along. + + Returns: + Variable: The tensor variable storing the output + + Examples: + .. code-block:: python + + out = fluid.layers.argmax(x=in, axis=0) + out = fluid.layers.argmax(x=in, axis=-1) + """ + helper = LayerHelper("arg_max", **locals()) + out = helper.create_tmp_variable(VarDesc.VarType.INT64) + helper.append_op( + type='arg_max', + inputs={'X': x}, + outputs={'Out': [out]}, + attrs={'axis': axis}) + return out + + def ones(shape, dtype, force_cpu=False): """ **ones** diff --git a/python/paddle/fluid/tests/unittests/test_arg_min_max_op.py b/python/paddle/fluid/tests/unittests/test_arg_min_max_op.py new file mode 100644 index 0000000000..e04412f809 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_arg_min_max_op.py @@ -0,0 +1,82 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import numpy as np +from op_test import OpTest + + +class BaseTestCase(OpTest): + def initTestCase(self): + self.op_type = 'arg_min' + self.dims = (3, 4, 5) + self.dtype = 'float32' + self.axis = 0 + + def setUp(self): + self.initTestCase() + self.x = (1000 * np.random.random(self.dims)).astype(self.dtype) + self.inputs = {'X': self.x} + self.attrs = {'axis': self.axis} + if self.op_type == "arg_min": + self.outputs = {'Out': np.argmin(self.x, axis=self.axis)} + else: + self.outputs = {'Out': np.argmax(self.x, axis=self.axis)} + + def test_check_output(self): + self.check_output() + + +class TestCase0(BaseTestCase): + def initTestCase(self): + self.op_type = 'arg_max' + self.dims = (3, 4, 5) + self.dtype = 'float32' + self.axis = 0 + + +class TestCase1(BaseTestCase): + def initTestCase(self): + self.op_type = 'arg_min' + self.dims = (3, 4) + self.dtype = 'float64' + self.axis = 1 + + +class TestCase2(BaseTestCase): + def initTestCase(self): + self.op_type = 'arg_max' + self.dims = (3, 4) + self.dtype = 'int64' + self.axis = 0 + + +class TestCase3(BaseTestCase): + def initTestCase(self): + self.op_type = 'arg_max' + self.dims = (3, ) + self.dtype = 'int64' + self.axis = 0 + + +class TestCase4(BaseTestCase): + def initTestCase(self): + self.op_type = 'arg_min' + self.dims = (1, ) + self.dtype = 'int32' + self.axis = 0 + + +if __name__ == '__main__': + unittest.main() From 8c9041f486fd6ac4004e0bd93e829dfa051293b9 Mon Sep 17 00:00:00 2001 From: yuyang18 Date: Fri, 8 Jun 2018 17:42:22 +0800 Subject: [PATCH 09/23] Refine LinearCRF --- paddle/fluid/operators/linear_chain_crf_op.cc | 2 -- .../fluid/layers/layer_function_generator.py | 10 +++++- .../fluid/layers/learning_rate_scheduler.py | 36 ++++++++++--------- 3 files changed, 28 insertions(+), 20 deletions(-) diff --git a/paddle/fluid/operators/linear_chain_crf_op.cc b/paddle/fluid/operators/linear_chain_crf_op.cc index e38525cd7f..a711da3627 100644 --- a/paddle/fluid/operators/linear_chain_crf_op.cc +++ b/paddle/fluid/operators/linear_chain_crf_op.cc @@ -67,8 +67,6 @@ class LinearChainCRFOpMaker : public framework::OpProtoAndCheckerMaker { "mini-batch. Note: S is equal to the sequence number in a mini-batch. " "The output is no longer a LoDTensor."); AddComment(R"DOC( -LinearChainCRF Operator. - Conditional Random Field defines an undirected probabilistic graph with nodes denoting random variables and edges denoting dependencies between these variables. CRF learns the conditional probability $P(Y|X)$, where diff --git a/python/paddle/fluid/layers/layer_function_generator.py b/python/paddle/fluid/layers/layer_function_generator.py index 79aa9ff604..cb60a3aec9 100644 --- a/python/paddle/fluid/layers/layer_function_generator.py +++ b/python/paddle/fluid/layers/layer_function_generator.py @@ -224,6 +224,9 @@ def autodoc(comment=""): return __impl__ +_inline_math_single_dollar = re.compile(r"\$([^\$]+)\$") + + def templatedoc(op_type=None): """ Decorator of layer function. It will use the docstring from the layer @@ -241,6 +244,9 @@ def templatedoc(op_type=None): def trim_ending_dot(msg): return msg.rstrip('.') + def escape_inline_math(msg): + return _inline_math_single_dollar.sub(repl=r':math:`\1`', string=msg) + def __impl__(func): if op_type is None: op_type_name = func.__name__ @@ -254,8 +260,10 @@ def templatedoc(op_type=None): for line in comment_lines: line = line.strip() if len(line) != 0: - comment += line + comment += escape_inline_math(line) comment += " " + elif len(comment) != 0: + comment += "\n \n " args = {"comment": trim_ending_dot(comment)} for each_input in op_proto.inputs: diff --git a/python/paddle/fluid/layers/learning_rate_scheduler.py b/python/paddle/fluid/layers/learning_rate_scheduler.py index d13c54daa5..716cc7824e 100644 --- a/python/paddle/fluid/layers/learning_rate_scheduler.py +++ b/python/paddle/fluid/layers/learning_rate_scheduler.py @@ -11,6 +11,14 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +""" +When training a model, it's often useful to decay the +learning rate during training process, this is called +learning_rate_decay. There are many strategies to do +this, this module will provide some classical method. +User can also implement their own learning_rate_decay +strategy according to this module. +""" import control_flow import nn @@ -22,14 +30,6 @@ __all__ = [ 'exponential_decay', 'natural_exp_decay', 'inverse_time_decay', 'polynomial_decay', 'piecewise_decay', 'noam_decay' ] -""" -When training a model, it's often useful to decay the -learning rate during training process, this is called -learning_rate_decay. There are many strategies to do -this, this module will provide some classical method. -User can also implement their own learning_rate_decay -strategy according to this module. -""" def _decay_step_counter(begin=0): @@ -41,18 +41,20 @@ def _decay_step_counter(begin=0): def noam_decay(d_model, warmup_steps): - """Apply decay to learning rate. - ```python - lr_value = np.power(d_model, -0.5) * np.min([ - np.power(current_steps, -0.5), - np.power(warmup_steps, -1.5) * current_steps - ]) - ``` + """ + Noam decay method. The numpy implementation of noam decay as follows. + + >>> import numpy as np + >>> lr_value = np.power(d_model, -0.5) * np.min([ + >>> np.power(current_steps, -0.5), + >>> np.power(warmup_steps, -1.5) * current_steps]) + + Please reference `attention is all you need + `_. Args: d_model(Variable): The dimensionality of input and output of model. - Reference: attention is all you need - https://arxiv.org/pdf/1706.03762.pdf + warmup_steps(Variable): A super parameter. Returns: From dd26329b3c14473ed25620d9724e05ebaadfec87 Mon Sep 17 00:00:00 2001 From: yuyang18 Date: Fri, 8 Jun 2018 17:46:59 +0800 Subject: [PATCH 10/23] Remove return types --- python/paddle/fluid/layers/control_flow.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py index 51fad6e68c..80e8ff484a 100644 --- a/python/paddle/fluid/layers/control_flow.py +++ b/python/paddle/fluid/layers/control_flow.py @@ -736,7 +736,7 @@ def max_sequence_len(rank_table): rank_table(${rank_table_type}): ${rank_table_comment}. Returns: - (${out_type}): ${out_comment}. + ${out_comment}. """ helper = LayerHelper("max_seqence_len", **locals()) res = helper.create_tmp_variable(dtype="int64") From c6d230e03e00dfcca7db67a4a8d4069f3d926d6e Mon Sep 17 00:00:00 2001 From: Luo Tao Date: Fri, 8 Jun 2018 18:34:53 +0800 Subject: [PATCH 11/23] add FLAGS_use_mkldnn to global control use_mkldnn --- paddle/fluid/framework/executor.cc | 18 ++++++++++++- paddle/fluid/framework/executor.h | 2 ++ .../test_inference_image_classification.cc | 5 +--- .../tests/book/test_inference_nlp.cc | 4 --- paddle/fluid/inference/tests/test_helper.h | 25 ++++++------------- paddle/testing/paddle_gtest_main.cc | 2 +- python/paddle/fluid/__init__.py | 2 +- 7 files changed, 29 insertions(+), 29 deletions(-) diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index 3d68c5fb87..d4d6c34108 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -24,6 +24,7 @@ limitations under the License. */ #include "paddle/fluid/platform/profiler.h" DECLARE_bool(benchmark); +DEFINE_bool(use_mkldnn, false, "Use MKLDNN to run"); namespace paddle { namespace framework { @@ -115,6 +116,7 @@ void Executor::CreateVariables(const ProgramDesc& pdesc, Scope* scope, void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id, bool create_local_scope, bool create_vars) { platform::RecordBlock b(block_id); + if (FLAGS_use_mkldnn) EnableMKLDNN(pdesc); auto ctx = Prepare(pdesc, block_id); RunPreparedContext(ctx.get(), scope, create_local_scope, create_vars); } @@ -214,6 +216,7 @@ void Executor::Run(const ProgramDesc& program, Scope* scope, const std::string& feed_holder_name, const std::string& fetch_holder_name) { platform::RecordBlock b(kProgramId); + if (FLAGS_use_mkldnn) EnableMKLDNN(program); bool has_feed_ops = has_feed_operators(program.Block(0), *feed_targets, feed_holder_name); bool has_fetch_ops = @@ -225,7 +228,6 @@ void Executor::Run(const ProgramDesc& program, Scope* scope, unique_ptr_of_copy_program.reset(new ProgramDesc(program)); copy_program = unique_ptr_of_copy_program.get(); } - auto* global_block = copy_program->MutableBlock(0); if (!has_feed_ops) { @@ -378,5 +380,19 @@ void Executor::RunPreparedContext( } } +void Executor::EnableMKLDNN(const ProgramDesc& program) { +#ifdef PADDLE_WITH_MKLDNN + VLOG(3) << "use_mkldnn=True"; + for (size_t bid = 0; bid < program.Size(); ++bid) { + auto* block = const_cast(program).MutableBlock(bid); + for (auto* op : block->AllOps()) { + if (op->HasAttr("use_mkldnn")) { + op->SetAttr("use_mkldnn", true); + } + } + } +#endif +} + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/executor.h b/paddle/fluid/framework/executor.h index 0c3c23611d..e6f9c3d31c 100644 --- a/paddle/fluid/framework/executor.h +++ b/paddle/fluid/framework/executor.h @@ -81,6 +81,8 @@ class Executor { const std::string& feed_holder_name = "feed", const std::string& fetch_holder_name = "fetch"); + void EnableMKLDNN(const ProgramDesc& program); + private: const platform::Place place_; }; diff --git a/paddle/fluid/inference/tests/book/test_inference_image_classification.cc b/paddle/fluid/inference/tests/book/test_inference_image_classification.cc index 987da18116..60c761c528 100644 --- a/paddle/fluid/inference/tests/book/test_inference_image_classification.cc +++ b/paddle/fluid/inference/tests/book/test_inference_image_classification.cc @@ -21,7 +21,6 @@ DEFINE_string(fp16_dirname, "", "Directory of the float16 inference model."); DEFINE_int32(batch_size, 1, "Batch size of input data"); DEFINE_int32(repeat, 1, "Running the inference program repeat times"); DEFINE_bool(skip_cpu, false, "Skip the cpu test"); -DEFINE_bool(use_mkldnn, false, "Use MKLDNN to run inference"); TEST(inference, image_classification) { if (FLAGS_dirname.empty() || FLAGS_batch_size < 1 || FLAGS_repeat < 1) { @@ -59,10 +58,8 @@ TEST(inference, image_classification) { // Run inference on CPU LOG(INFO) << "--- CPU Runs: ---"; LOG(INFO) << "Batch size is " << FLAGS_batch_size; - LOG(INFO) << "FLAGS_use_mkldnn: " << FLAGS_use_mkldnn; TestInference( - dirname, cpu_feeds, cpu_fetchs1, FLAGS_repeat, is_combined, - FLAGS_use_mkldnn); + dirname, cpu_feeds, cpu_fetchs1, FLAGS_repeat, is_combined); LOG(INFO) << output1.dims(); } diff --git a/paddle/fluid/inference/tests/book/test_inference_nlp.cc b/paddle/fluid/inference/tests/book/test_inference_nlp.cc index a0e83a1705..9dcd79c3bb 100644 --- a/paddle/fluid/inference/tests/book/test_inference_nlp.cc +++ b/paddle/fluid/inference/tests/book/test_inference_nlp.cc @@ -27,7 +27,6 @@ limitations under the License. */ DEFINE_string(model_path, "", "Directory of the inference model."); DEFINE_string(data_file, "", "File of input index data."); DEFINE_int32(repeat, 100, "Running the inference program repeat times"); -DEFINE_bool(use_mkldnn, false, "Use MKLDNN to run inference"); DEFINE_bool(prepare_vars, true, "Prepare variables before executor"); DEFINE_int32(num_threads, 1, "Number of threads should be used"); @@ -190,9 +189,6 @@ TEST(inference, nlp) { std::unique_ptr inference_program; inference_program = InitProgram(&executor, scope.get(), FLAGS_model_path, /*model combined*/ false); - if (FLAGS_use_mkldnn) { - EnableMKLDNN(inference_program); - } // always prepare context std::unique_ptr ctx; ctx = executor.Prepare(*inference_program, 0); diff --git a/paddle/fluid/inference/tests/test_helper.h b/paddle/fluid/inference/tests/test_helper.h index 01b8dc0be6..44c36b1683 100644 --- a/paddle/fluid/inference/tests/test_helper.h +++ b/paddle/fluid/inference/tests/test_helper.h @@ -22,6 +22,8 @@ limitations under the License. */ #include "paddle/fluid/inference/io.h" #include "paddle/fluid/platform/profiler.h" +DECLARE_bool(use_mkldnn); + template void SetupTensor(paddle::framework::LoDTensor* input, paddle::framework::DDim dims, T lower, T upper) { @@ -133,24 +135,11 @@ std::vector> GetFeedTargetShapes( return feed_target_shapes; } -void EnableMKLDNN( - const std::unique_ptr& program) { - for (size_t bid = 0; bid < program->Size(); ++bid) { - auto* block = program->MutableBlock(bid); - for (auto* op : block->AllOps()) { - if (op->HasAttr("use_mkldnn")) { - op->SetAttr("use_mkldnn", true); - } - } - } -} - template void TestInference(const std::string& dirname, const std::vector& cpu_feeds, const std::vector& cpu_fetchs, - const int repeat = 1, const bool is_combined = false, - const bool use_mkldnn = false) { + const int repeat = 1, const bool is_combined = false) { // 1. Define place, executor, scope auto place = Place(); auto executor = paddle::framework::Executor(place); @@ -182,9 +171,6 @@ void TestInference(const std::string& dirname, "init_program", paddle::platform::DeviceContextPool::Instance().Get(place)); inference_program = InitProgram(&executor, scope, dirname, is_combined); - if (use_mkldnn) { - EnableMKLDNN(inference_program); - } } // Disable the profiler and print the timing information paddle::platform::DisableProfiler(paddle::platform::EventSortingKey::kDefault, @@ -210,7 +196,10 @@ void TestInference(const std::string& dirname, fetch_targets[fetch_target_names[i]] = cpu_fetchs[i]; } - // 6. Run the inference program + // 6. If export Flags_use_mkldnn=True, use mkldnn related ops. + if (FLAGS_use_mkldnn) executor.EnableMKLDNN(*inference_program); + + // 7. Run the inference program { if (!CreateVars) { // If users don't want to create and destroy variables every time they diff --git a/paddle/testing/paddle_gtest_main.cc b/paddle/testing/paddle_gtest_main.cc index 586ec48477..507479c862 100644 --- a/paddle/testing/paddle_gtest_main.cc +++ b/paddle/testing/paddle_gtest_main.cc @@ -30,7 +30,7 @@ int main(int argc, char** argv) { new_argv.push_back( strdup("--tryfromenv=fraction_of_gpu_memory_to_use,use_pinned_memory")); #else - new_argv.push_back(strdup("--tryfromenv=use_pinned_memory")); + new_argv.push_back(strdup("--tryfromenv=use_pinned_memory,use_mkldnn")); #endif int new_argc = static_cast(new_argv.size()); char** new_argv_address = new_argv.data(); diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index c4fad620f0..4914719279 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -116,7 +116,7 @@ def __bootstrap__(): read_env_flags = [ 'use_pinned_memory', 'check_nan_inf', 'benchmark', 'warpctc_dir', - 'eager_delete_scope' + 'eager_delete_scope', 'use_mkldnn' ] if core.is_compiled_with_cuda(): read_env_flags += [ From 6d32e96096b63663111dcab202be357ed00ef93a Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Fri, 8 Jun 2018 11:42:13 +0000 Subject: [PATCH 12/23] remove redundant comments --- paddle/fluid/operators/arg_max_op.cc | 5 ----- paddle/fluid/operators/arg_max_op.cu | 2 -- paddle/fluid/operators/arg_min_op.cc | 5 ----- paddle/fluid/operators/arg_min_op.cu | 2 -- 4 files changed, 14 deletions(-) diff --git a/paddle/fluid/operators/arg_max_op.cc b/paddle/fluid/operators/arg_max_op.cc index 5603607357..859cccd1b2 100644 --- a/paddle/fluid/operators/arg_max_op.cc +++ b/paddle/fluid/operators/arg_max_op.cc @@ -13,11 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/arg_max_op.h" -/* -REGISTER_ARG_MINMAX_OP_WITHOUT_GRADIENT(arg_max, ArgMax); - -REGISTER_ARG_MINMAX_KERNEL(arg_max, ArgMax, CPU); -*/ REGISTER_OPERATOR(arg_max, paddle::operators::ArgMaxOp, paddle::operators::ArgMaxOpMaker, diff --git a/paddle/fluid/operators/arg_max_op.cu b/paddle/fluid/operators/arg_max_op.cu index 8f57c63beb..c9c102bdcc 100644 --- a/paddle/fluid/operators/arg_max_op.cu +++ b/paddle/fluid/operators/arg_max_op.cu @@ -14,8 +14,6 @@ limitations under the License. */ #include "paddle/fluid/operators/arg_max_op.h" -// REGISTER_ARG_MINMAX_KERNEL(arg_max, ArgMax, CUDA); - REGISTER_OP_CUDA_KERNEL( arg_max, paddle::operators::ArgMaxKernel Date: Sun, 10 Jun 2018 23:07:07 +0200 Subject: [PATCH 13/23] MKLDNN layout: Support for pool operator --- paddle/fluid/operators/pool_mkldnn_op.cc | 182 ++++++++++++++++------- 1 file changed, 128 insertions(+), 54 deletions(-) diff --git a/paddle/fluid/operators/pool_mkldnn_op.cc b/paddle/fluid/operators/pool_mkldnn_op.cc index a045f9e98d..5341187d1c 100644 --- a/paddle/fluid/operators/pool_mkldnn_op.cc +++ b/paddle/fluid/operators/pool_mkldnn_op.cc @@ -18,9 +18,14 @@ limitations under the License. */ namespace paddle { namespace operators { -using mkldnn::memory; // Note: paddle has also "memory" namespace -using mkldnn::pooling_forward; +using framework::DataLayout; +using mkldnn::memory; using mkldnn::pooling_backward; +using mkldnn::pooling_forward; +using mkldnn::primitive; +using mkldnn::reorder; +using mkldnn::stream; +using platform::to_void_cast; // Generate keys for storing/retriving primitives for this operator // TODO(jczaja): Make hashing function more optimial @@ -55,8 +60,9 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel { const Tensor* input = ctx.Input("X"); Tensor* output = ctx.Output("Out"); - // Get an unique name from "argument" name of "Out" variable - // This name will be used as key when saving info into device context + PADDLE_ENFORCE(input->layout() == DataLayout::kMKLDNN && + input->format() != memory::format::format_undef, + "Wrong layout/format set for Input tensor"); std::string pooling_type = ctx.Attr("pooling_type"); std::vector ksize = ctx.Attr>("ksize"); @@ -82,6 +88,9 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel { std::vector src_tz = paddle::framework::vectorize2int(input->dims()); std::vector dst_tz = paddle::framework::vectorize2int(output->dims()); + auto input_format = input->format(); + memory::format output_format{memory::format::format_undef}; + const std::string key = gethash(src_tz, pooling_type, ksize, strides, paddings, ctx.op().Output("Out")); const std::string key_pool_p = key + "@pool_p"; @@ -94,16 +103,17 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel { auto pool_p = std::static_pointer_cast(dev_ctx.GetBlob(key_pool_p)); if (pool_p == nullptr) { - // TODO(pzelazko-intel): support more formats + auto src_md = platform::MKLDNNMemDesc( + src_tz, platform::MKLDNNGetDataType(), input_format); - auto src_md = - platform::MKLDNNMemDesc(src_tz, platform::MKLDNNGetDataType(), - mkldnn::memory::format::nchw); - auto dst_md = - platform::MKLDNNMemDesc(dst_tz, platform::MKLDNNGetDataType(), - mkldnn::memory::format::nchw); + /* create memory descriptor for pooling without specified format + * ('any') which lets a primitive (pooling in this case) choose + * the memory format preferred for best performance + */ + auto dst_md = platform::MKLDNNMemDesc(dst_tz, mkldnn::memory::f32, + mkldnn::memory::format::any); - std::shared_ptr pool_pd = + std::shared_ptr pool_pd = CreatePrimitiveDesc(src_md, dst_md, strides, paddings, ksize, pooling_type, mkldnn_engine); @@ -116,20 +126,22 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel { // save pool_workspace_memory to be referred in backward path dev_ctx.SetBlob(key_pool_workspace_memory, workspace_memory); - auto pool_src_memory_p = std::make_shared( - memory::primitive_desc{src_md, mkldnn_engine}, - static_cast(const_cast(input_data))); - dev_ctx.SetBlob(key_pool_src_mem_p, pool_src_memory_p); + auto src_memory = std::make_shared(pool_pd->src_primitive_desc(), + to_void_cast(input_data)); + auto dst_memory = + std::make_shared(pool_pd->dst_primitive_desc(), output_data); - auto pool_dst_memory_p = std::make_shared( - memory::primitive_desc{dst_md, mkldnn_engine}, - static_cast(output_data)); - dev_ctx.SetBlob(key_pool_dst_mem_p, pool_dst_memory_p); + dev_ctx.SetBlob(key_pool_src_mem_p, src_memory); + dev_ctx.SetBlob(key_pool_dst_mem_p, dst_memory); + + pool_p = std::make_shared(*pool_pd, *(src_memory.get()), + *(dst_memory.get()), + *workspace_memory); - pool_p = std::make_shared( - *pool_pd, *(pool_src_memory_p.get()), *(pool_dst_memory_p.get()), - *workspace_memory); dev_ctx.SetBlob(key_pool_p, pool_p); + + output_format = + (memory::format)dst_memory->get_primitive_desc().desc().data.format; } else { // Primitives already exist auto pool_src_memory_p = @@ -140,14 +152,20 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel { std::static_pointer_cast(dev_ctx.GetBlob(key_pool_dst_mem_p)); PADDLE_ENFORCE(pool_dst_memory_p != nullptr, "Fail to find pooling dst mem_p in device context"); - pool_src_memory_p->set_data_handle( - reinterpret_cast(const_cast(input_data))); + pool_src_memory_p->set_data_handle(to_void_cast(input_data)); pool_dst_memory_p->set_data_handle(output_data); + + output_format = (memory::format)pool_dst_memory_p->get_primitive_desc() + .desc() + .data.format; } // push primitive to stream and wait until it's executed std::vector pipeline{*(pool_p.get())}; - mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait(); + stream(stream::kind::eager).submit(pipeline).wait(); + + output->set_layout(DataLayout::kMKLDNN); + output->set_format(output_format); } private: @@ -194,6 +212,13 @@ class PoolMKLDNNGradOpKernel : public paddle::framework::OpKernel { const Tensor* out_grad = ctx.Input(framework::GradVarName("Out")); Tensor* in_x_grad = ctx.Output(framework::GradVarName("X")); + PADDLE_ENFORCE(in_x->layout() == DataLayout::kMKLDNN && + in_x->format() != memory::format::format_undef, + "Wrong layout/format set for Input X tensor"); + PADDLE_ENFORCE(out_grad->layout() == DataLayout::kMKLDNN && + out_grad->format() != memory::format::format_undef, + "Wrong layout/format set for Input output_grad tensor"); + std::string pooling_type = ctx.Attr("pooling_type"); std::vector ksize = ctx.Attr>("ksize"); std::vector strides = ctx.Attr>("strides"); @@ -212,6 +237,7 @@ class PoolMKLDNNGradOpKernel : public paddle::framework::OpKernel { const T* out_grad_data = out_grad->data(); T* in_x_grad_data = in_x_grad->mutable_data(ctx.GetPlace()); + memory::format in_x_grad_format{memory::format::format_undef}; std::vector diff_src_tz = paddle::framework::vectorize2int(in_x_grad->dims()); @@ -225,39 +251,48 @@ class PoolMKLDNNGradOpKernel : public paddle::framework::OpKernel { const std::string key_pool_bwd_p = key + "@pool_bwd_p"; const std::string key_pool_diff_src_mem_p = key + "@pool_diff_src_mem_p"; const std::string key_pool_diff_dst_mem_p = key + "@pool_diff_dst_mem_p"; + const std::string key_pool_src_mem_p = key + "@pool_src_mem_p"; + const std::string key_pool_dst_mem_p = key + "@pool_dst_mem_p"; const std::string key_pool_pd = key + "@pool_pd"; const std::string key_pool_workspace_memory = key + "@pool_workspace_memory"; + auto user_diff_dst_memory = + memory({{{diff_dst_tz}, memory::data_type::f32, out_grad->format()}, + mkldnn_engine}, + to_void_cast(out_grad_data)); + + std::shared_ptr diff_src_memory; + std::shared_ptr diff_dst_memory; + auto dst_memory = + std::static_pointer_cast(dev_ctx.GetBlob(key_pool_dst_mem_p)); + PADDLE_ENFORCE(dst_memory != nullptr, + "Fail to find dst_memory in device context"); + + primitive reorder_diff_dst; + bool is_diff_dst_reordered = false; auto pool_bwd_p = std::static_pointer_cast( dev_ctx.GetBlob(key_pool_bwd_p)); if (pool_bwd_p == nullptr) { - auto diff_src_md = - platform::MKLDNNMemDesc(diff_src_tz, platform::MKLDNNGetDataType(), - mkldnn::memory::format::nchw); - auto diff_dst_md = - platform::MKLDNNMemDesc(diff_dst_tz, platform::MKLDNNGetDataType(), - mkldnn::memory::format::nchw); + // Retrieve src_memory/dst_memory saved in forward pass + auto src_memory = + std::static_pointer_cast(dev_ctx.GetBlob(key_pool_src_mem_p)); + PADDLE_ENFORCE(src_memory != nullptr, + "Fail to find src_memory in device context"); // Retrieve pool_pd/pool_workspace_memory from device context auto pool_pd = std::static_pointer_cast( dev_ctx.GetBlob(key_pool_pd)); PADDLE_ENFORCE(pool_pd != nullptr, "Fail to find pool_pd in device context"); - - auto workspace_memory = std::static_pointer_cast( + auto workspace_memory = std::static_pointer_cast( dev_ctx.GetBlob(key_pool_workspace_memory)); PADDLE_ENFORCE(workspace_memory != nullptr, "Fail to find workspace_memory in device context"); - auto pool_diff_src_memory_p = std::make_shared(memory( - {diff_src_md, mkldnn_engine}, static_cast(in_x_grad_data))); - dev_ctx.SetBlob(key_pool_diff_src_mem_p, pool_diff_src_memory_p); - - auto pool_diff_dst_memory_p = std::make_shared( - memory({diff_dst_md, mkldnn_engine}, - static_cast(const_cast(out_grad_data)))); - dev_ctx.SetBlob(key_pool_diff_dst_mem_p, pool_diff_dst_memory_p); + // create memory descriptors for pooling + auto diff_src_md = src_memory.get()->get_primitive_desc().desc(); + auto diff_dst_md = dst_memory.get()->get_primitive_desc().desc(); auto pool_bwd_desc = mkldnn::pooling_backward::desc( pooling_type == "max" ? mkldnn::algorithm::pooling_max @@ -267,35 +302,74 @@ class PoolMKLDNNGradOpKernel : public paddle::framework::OpKernel { auto pool_bwd_pd = mkldnn::pooling_backward::primitive_desc( pool_bwd_desc, mkldnn_engine, *pool_pd); + // reorder between user_diff_dst and pool diff_dst if needed + diff_dst_memory = std::make_shared(user_diff_dst_memory); + if (memory::primitive_desc(dst_memory->get_primitive_desc()) != + user_diff_dst_memory.get_primitive_desc()) { + diff_dst_memory = + std::make_shared(dst_memory.get()->get_primitive_desc()); + reorder_diff_dst = reorder(user_diff_dst_memory, *diff_dst_memory); + is_diff_dst_reordered = true; + } + + diff_src_memory = std::make_shared( + pool_bwd_pd.diff_src_primitive_desc(), in_x_grad_data); + + dev_ctx.SetBlob(key_pool_diff_src_mem_p, diff_src_memory); + dev_ctx.SetBlob(key_pool_diff_dst_mem_p, diff_dst_memory); + pool_bwd_p = std::make_shared( - pool_bwd_pd, *(pool_diff_dst_memory_p.get()), *workspace_memory, - *(pool_diff_src_memory_p)); + pool_bwd_pd, *(diff_dst_memory.get()), *workspace_memory, + *(diff_src_memory)); dev_ctx.SetBlob(key_pool_bwd_p, pool_bwd_p); + } else { // Primitives already exist - auto pool_diff_src_memory_p = std::static_pointer_cast( + diff_src_memory = std::static_pointer_cast( dev_ctx.GetBlob(key_pool_diff_src_mem_p)); - PADDLE_ENFORCE(pool_diff_src_memory_p != nullptr, + PADDLE_ENFORCE(diff_src_memory != nullptr, "Fail to find pooling src mem_p in device context"); - auto pool_diff_dst_memory_p = std::static_pointer_cast( + diff_dst_memory = std::static_pointer_cast( dev_ctx.GetBlob(key_pool_diff_dst_mem_p)); - PADDLE_ENFORCE(pool_diff_dst_memory_p != nullptr, + PADDLE_ENFORCE(diff_dst_memory != nullptr, "Fail to find pooling dst mem_p in device context"); - pool_diff_src_memory_p->set_data_handle( - reinterpret_cast(in_x_grad_data)); - pool_diff_dst_memory_p->set_data_handle(const_cast(out_grad_data)); + + diff_src_memory->set_data_handle(reinterpret_cast(in_x_grad_data)); + diff_dst_memory->set_data_handle(const_cast(out_grad_data)); + + // reorder between user_diff_dst and pool diff_dst if needed + if (memory::primitive_desc(dst_memory->get_primitive_desc()) != + user_diff_dst_memory.get_primitive_desc()) { + diff_dst_memory = + std::make_shared(dst_memory.get()->get_primitive_desc()); + reorder_diff_dst = reorder(user_diff_dst_memory, *diff_dst_memory); + is_diff_dst_reordered = true; + } } + in_x_grad_format = (memory::format)diff_src_memory->get_primitive_desc() + .desc() + .data.format; + // push primitive to stream and wait until it's executed - std::vector pipeline{*(pool_bwd_p.get())}; + std::vector pipeline; + if (is_diff_dst_reordered) { + pipeline.push_back(reorder_diff_dst); + } + pipeline.push_back(*(pool_bwd_p.get())); mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait(); + + in_x_grad->set_layout(DataLayout::kMKLDNN); + in_x_grad->set_format(in_x_grad_format); } // Compute() }; } // namespace operators } // namespace paddle +namespace ops = paddle::operators; + REGISTER_OP_KERNEL(pool2d, MKLDNN, ::paddle::platform::CPUPlace, - paddle::operators::PoolMKLDNNOpKernel); + ops::PoolMKLDNNOpKernel); REGISTER_OP_KERNEL(pool2d_grad, MKLDNN, ::paddle::platform::CPUPlace, - paddle::operators::PoolMKLDNNGradOpKernel); + ops::PoolMKLDNNGradOpKernel); From 045589fae4a3bfaf42a9247bff243f0c3864c59d Mon Sep 17 00:00:00 2001 From: Luo Tao Date: Mon, 11 Jun 2018 10:27:57 +0800 Subject: [PATCH 14/23] fix compiler error in high-level api --- .../contrib/inference/test_paddle_inference_api_impl.cc | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/paddle/contrib/inference/test_paddle_inference_api_impl.cc b/paddle/contrib/inference/test_paddle_inference_api_impl.cc index 4b6cb7b051..5d843010e0 100644 --- a/paddle/contrib/inference/test_paddle_inference_api_impl.cc +++ b/paddle/contrib/inference/test_paddle_inference_api_impl.cc @@ -109,7 +109,6 @@ void MainWord2Vec(bool use_gpu) { void MainImageClassification(bool use_gpu) { int batch_size = 2; - bool use_mkldnn = false; bool repeat = false; NativeConfig config = GetConfig(); config.use_gpu = use_gpu; @@ -134,12 +133,8 @@ void MainImageClassification(bool use_gpu) { std::vector cpu_fetchs1; cpu_fetchs1.push_back(&output1); - TestInference(config.model_dir, - cpu_feeds, - cpu_fetchs1, - repeat, - is_combined, - use_mkldnn); + TestInference( + config.model_dir, cpu_feeds, cpu_fetchs1, repeat, is_combined); auto predictor = CreatePaddlePredictor(config); std::vector paddle_tensor_feeds; From 9b43edeae05d0a9419c787f0166b95f6a70ba4f7 Mon Sep 17 00:00:00 2001 From: yuyang18 Date: Mon, 11 Jun 2018 12:26:16 +0800 Subject: [PATCH 15/23] Polish arg_min_max_op * Remove unused arg_max/min_op.h * Remove reference parameter. Use pointer insteaded. * undef macro * Always set OutT as int64_t. --- paddle/fluid/operators/arg_max_op.cc | 28 +++++++++---------- paddle/fluid/operators/arg_max_op.cu | 19 ++++++------- paddle/fluid/operators/arg_max_op.h | 16 ----------- paddle/fluid/operators/arg_min_max_op_base.h | 29 +++++++++++--------- paddle/fluid/operators/arg_min_op.cc | 28 +++++++++---------- paddle/fluid/operators/arg_min_op.cu | 19 ++++++------- paddle/fluid/operators/arg_min_op.h | 16 ----------- 7 files changed, 60 insertions(+), 95 deletions(-) delete mode 100644 paddle/fluid/operators/arg_max_op.h delete mode 100644 paddle/fluid/operators/arg_min_op.h diff --git a/paddle/fluid/operators/arg_max_op.cc b/paddle/fluid/operators/arg_max_op.cc index 859cccd1b2..8174d37358 100644 --- a/paddle/fluid/operators/arg_max_op.cc +++ b/paddle/fluid/operators/arg_max_op.cc @@ -12,24 +12,22 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/arg_max_op.h" +#include "paddle/fluid/operators/arg_min_max_op_base.h" -REGISTER_OPERATOR(arg_max, paddle::operators::ArgMaxOp, +REGISTER_OPERATOR(arg_max, paddle::operators::ArgMinMaxOp, paddle::operators::ArgMaxOpMaker, paddle::framework::EmptyGradOpMaker); REGISTER_OP_CPU_KERNEL( - arg_max, paddle::operators::ArgMaxKernel, - paddle::operators::ArgMaxKernel, + paddle::operators::ArgMaxKernel, + paddle::operators::ArgMaxKernel, - paddle::operators::ArgMaxKernel, - paddle::operators::ArgMaxKernel, - paddle::operators::ArgMaxKernel, - paddle::operators::ArgMaxKernel, - paddle::operators::ArgMaxKernel); + paddle::operators::ArgMaxKernel, + paddle::operators::ArgMaxKernel, + paddle::operators::ArgMaxKernel, + paddle::operators::ArgMaxKernel); diff --git a/paddle/fluid/operators/arg_max_op.cu b/paddle/fluid/operators/arg_max_op.cu index c9c102bdcc..a147d77a9e 100644 --- a/paddle/fluid/operators/arg_max_op.cu +++ b/paddle/fluid/operators/arg_max_op.cu @@ -12,21 +12,20 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/arg_max_op.h" +#include "paddle/fluid/operators/arg_min_max_op_base.h" REGISTER_OP_CUDA_KERNEL( arg_max, - paddle::operators::ArgMaxKernel, - paddle::operators::ArgMaxKernel, + paddle::operators::ArgMaxKernel, + paddle::operators::ArgMaxKernel, paddle::operators::ArgMaxKernel, + int32_t>, paddle::operators::ArgMaxKernel, + int16_t>, paddle::operators::ArgMaxKernel, - paddle::operators::ArgMaxKernel, + size_t>, paddle::operators::ArgMaxKernel); + uint8_t>); diff --git a/paddle/fluid/operators/arg_max_op.h b/paddle/fluid/operators/arg_max_op.h deleted file mode 100644 index d232a85699..0000000000 --- a/paddle/fluid/operators/arg_max_op.h +++ /dev/null @@ -1,16 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include "paddle/fluid/operators/arg_min_max_op_base.h" diff --git a/paddle/fluid/operators/arg_min_max_op_base.h b/paddle/fluid/operators/arg_min_max_op_base.h index 8c20461a34..6cbdaefeda 100644 --- a/paddle/fluid/operators/arg_min_max_op_base.h +++ b/paddle/fluid/operators/arg_min_max_op_base.h @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once +#include #include #include #include "paddle/fluid/framework/ddim.h" @@ -37,9 +38,9 @@ struct ArgMinMaxFunctor {}; struct ArgMinMaxFunctor { \ void operator()(const DeviceContext& ctx, const framework::LoDTensor& in, \ - framework::LoDTensor& out, int64_t axis) { \ + framework::LoDTensor* out, int64_t axis) { \ auto in_eigen = framework::EigenTensor::From(in); \ - auto out_eigen = framework::EigenTensor::From(out); \ + auto out_eigen = framework::EigenTensor::From(*out); \ out_eigen.device(*(ctx.eigen_device())) = \ in_eigen.eigen_op_type(axis).template cast(); \ } \ @@ -62,7 +63,7 @@ class ArgMinMaxKernel : public framework::OpKernel { #define CALL_ARG_MINMAX_FUNCTOR(rank) \ ArgMinMaxFunctor \ functor##rank; \ - functor##rank(dev_ctx, x, out, axis) + functor##rank(dev_ctx, x, &out, axis) switch (x.dims().size()) { case 1: @@ -89,19 +90,20 @@ class ArgMinMaxKernel : public framework::OpKernel { "than 6.", (EnumArgMinMaxValue == kArgMin ? "argmin" : "argmax")); break; +#undef CALL_ARG_MINMAX_FUNCTOR } } }; -template +template using ArgMinKernel = - ArgMinMaxKernel; + ArgMinMaxKernel; -template +template using ArgMaxKernel = - ArgMinMaxKernel; + ArgMinMaxKernel; -typedef class BaseArgMinMaxOp : public framework::OperatorWithKernel { +class ArgMinMaxOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; @@ -121,7 +123,7 @@ typedef class BaseArgMinMaxOp : public framework::OperatorWithKernel { for (int64_t i = axis + 1; i < x_rank; i++) vec.push_back(x_dims[i]); ctx->SetOutputDim("Out", framework::make_ddim(vec)); } -} ArgMinOp, ArgMaxOp; +}; class BaseArgMinMaxOpMaker : public framework::OpProtoAndCheckerMaker { protected: @@ -133,12 +135,13 @@ class BaseArgMinMaxOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("X", "Input tensor."); AddOutput("Out", "Output tensor."); AddAttr("axis", "The axis in which to compute the arg indics."); - AddComment(::paddle::string::Sprintf(R"DOC( - %s Operator. + AddComment(string::Sprintf(R"DOC( + %s Operator. - Computes the indices of the %s elements of the input tensor's element along the provided axis. + Computes the indices of the %s elements of the input tensor's element + along the provided axis. )DOC", - OpName(), Name())); + OpName(), Name())); } }; diff --git a/paddle/fluid/operators/arg_min_op.cc b/paddle/fluid/operators/arg_min_op.cc index 18c0884a04..41f188029f 100644 --- a/paddle/fluid/operators/arg_min_op.cc +++ b/paddle/fluid/operators/arg_min_op.cc @@ -12,24 +12,22 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/arg_min_op.h" +#include "paddle/fluid/operators/arg_min_max_op_base.h" -REGISTER_OPERATOR(arg_min, paddle::operators::ArgMinOp, +REGISTER_OPERATOR(arg_min, paddle::operators::ArgMinMaxOp, paddle::operators::ArgMinOpMaker, paddle::framework::EmptyGradOpMaker); REGISTER_OP_CPU_KERNEL( - arg_min, paddle::operators::ArgMinKernel, - paddle::operators::ArgMinKernel, + paddle::operators::ArgMinKernel, + paddle::operators::ArgMinKernel, - paddle::operators::ArgMinKernel, - paddle::operators::ArgMinKernel, - paddle::operators::ArgMinKernel, - paddle::operators::ArgMinKernel, - paddle::operators::ArgMinKernel); + paddle::operators::ArgMinKernel, + paddle::operators::ArgMinKernel, + paddle::operators::ArgMinKernel, + paddle::operators::ArgMinKernel); diff --git a/paddle/fluid/operators/arg_min_op.cu b/paddle/fluid/operators/arg_min_op.cu index 6d5aaa9596..4d02050850 100644 --- a/paddle/fluid/operators/arg_min_op.cu +++ b/paddle/fluid/operators/arg_min_op.cu @@ -12,21 +12,20 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/arg_min_op.h" +#include "paddle/fluid/operators/arg_min_max_op_base.h" REGISTER_OP_CUDA_KERNEL( arg_min, - paddle::operators::ArgMinKernel, - paddle::operators::ArgMinKernel, + paddle::operators::ArgMinKernel, + paddle::operators::ArgMinKernel, paddle::operators::ArgMinKernel, + int32_t>, paddle::operators::ArgMinKernel, + int16_t>, paddle::operators::ArgMinKernel, - paddle::operators::ArgMinKernel, + size_t>, paddle::operators::ArgMinKernel); + uint8_t>); diff --git a/paddle/fluid/operators/arg_min_op.h b/paddle/fluid/operators/arg_min_op.h deleted file mode 100644 index d232a85699..0000000000 --- a/paddle/fluid/operators/arg_min_op.h +++ /dev/null @@ -1,16 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include "paddle/fluid/operators/arg_min_max_op_base.h" From a1254a86bab4fd99a7801fd3eb3f3cccb0130ba1 Mon Sep 17 00:00:00 2001 From: yuyang18 Date: Mon, 11 Jun 2018 12:43:41 +0800 Subject: [PATCH 16/23] Add lock to record_event. --- paddle/fluid/platform/device_context.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index 6b82d93237..292ffef1ae 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -11,6 +11,7 @@ limitations under the License. */ #pragma once #include +#include // NOLINT #include #include #include @@ -100,6 +101,7 @@ class CUDADeviceContext : public DeviceContext { template void RecordEvent(cudaEvent_t ev, Callback callback) { + std::lock_guard guard(mtx_); callback(); PADDLE_ENFORCE(cudaEventRecord(ev, stream_)); } @@ -116,6 +118,8 @@ class CUDADeviceContext : public DeviceContext { int compute_capability; int multi_process; int max_threads_per_mp; + + std::mutex mtx_; }; template <> From 4b6d584f4fb3741fe3d3268c36b54b8469444f60 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Mon, 11 Jun 2018 06:01:41 +0000 Subject: [PATCH 17/23] fix identifier error of 'dshape' --- benchmark/fluid/models/vgg.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/benchmark/fluid/models/vgg.py b/benchmark/fluid/models/vgg.py index 6092cdeb88..932601302d 100644 --- a/benchmark/fluid/models/vgg.py +++ b/benchmark/fluid/models/vgg.py @@ -82,7 +82,8 @@ def get_model(args): data_file, batch_size=args.batch_size)) images, label = fluid.layers.read_file(data_file) else: - images = fluid.layers.data(name='data', shape=dshape, dtype='float32') + images = fluid.layers.data( + name='data', shape=data_shape, dtype='float32') label = fluid.layers.data(name='label', shape=[1], dtype='int64') # Train program From 799a3aa6bc27c1ceaa8e610f66ab51767cb8895c Mon Sep 17 00:00:00 2001 From: "yi.wu" Date: Mon, 11 Jun 2018 15:50:58 +0800 Subject: [PATCH 18/23] fix fluid_benchmark stacked_dynamic_lstm model error --- benchmark/fluid/models/stacked_dynamic_lstm.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/benchmark/fluid/models/stacked_dynamic_lstm.py b/benchmark/fluid/models/stacked_dynamic_lstm.py index e1c4857f1a..211869af4e 100644 --- a/benchmark/fluid/models/stacked_dynamic_lstm.py +++ b/benchmark/fluid/models/stacked_dynamic_lstm.py @@ -104,8 +104,9 @@ def get_model(args): loss = fluid.layers.mean(x=loss) # add acc + batch_size_tensor = fluid.layers.create_tensor(dtype='int64') batch_acc = fluid.layers.accuracy(input=logit, label=fluid.layers.data(name='label', \ - shape=[1], dtype='int64')) + shape=[1], dtype='int64'), total=batch_size_tensor) inference_program = fluid.default_main_program().clone() with fluid.program_guard(inference_program): From 1cfd3cb13b0ea4bf091757d59ec24e021563518f Mon Sep 17 00:00:00 2001 From: Qiyang Min Date: Mon, 11 Jun 2018 03:11:22 -0500 Subject: [PATCH 19/23] Add some dist-training robust cases into fluid benchmark test (#11207) * 1. add weight decay feature into fluid benchmark test 2. add learning rate decay feature into fluid benchmark test 3. add L1&L2 regularization feature into fluid benchmark test 4. add error clipping feature into fluid benchmark test 5. add gradient clipping feature into fluid benchmark test * Add some document to README.md under benchmark/fluid/ repo * Add model_base.py * Fix bugs in test_listen_and_serv_op * 1. remove args out of fluid_benchmark.py 2. remove lr_decay, regularization, clipping out of fluid_benchmark.py * add async_mode description to doc and remove the clipping description out * for restart build * to restart build * remove optimization args from args.py * 1. remove optimization from models 2. fix bug in test_listen_and_serv_op * change the name retry_times to left_time * change retry_times to the pserver start left time --- benchmark/fluid/README.md | 4 +- benchmark/fluid/args.py | 126 ++++++++++++++++++ benchmark/fluid/fluid_benchmark.py | 114 ++-------------- .../unittests/test_listen_and_serv_op.py | 9 +- 4 files changed, 143 insertions(+), 110 deletions(-) create mode 100644 benchmark/fluid/args.py diff --git a/benchmark/fluid/README.md b/benchmark/fluid/README.md index f40f3c1297..28cade4634 100644 --- a/benchmark/fluid/README.md +++ b/benchmark/fluid/README.md @@ -24,10 +24,12 @@ Currently supported `--model` argument include: * Run the following command to start a benchmark job locally: ```bash - python fluid_benchmark.py --model mnist --device GPU + python fluid_benchmark.py --model mnist --device GPU ``` You can choose to use GPU/CPU training. With GPU training, you can specify `--gpus ` to run multi GPU training. + You can set async mode parameter server. With async mode, you can specify + `--async_mode` to train model asynchronous. * Run distributed training with parameter servers: * see [run_fluid_benchmark.sh](https://github.com/PaddlePaddle/Paddle/blob/develop/benchmark/fluid/run_fluid_benchmark.sh) as an example. * start parameter servers: diff --git a/benchmark/fluid/args.py b/benchmark/fluid/args.py new file mode 100644 index 0000000000..68a3d42d7a --- /dev/null +++ b/benchmark/fluid/args.py @@ -0,0 +1,126 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse + +__all__ = ['parse_args', ] + +BENCHMARK_MODELS = [ + "machine_translation", "resnet", "vgg", "mnist", "stacked_dynamic_lstm" +] + + +def parse_args(): + parser = argparse.ArgumentParser('Fluid model benchmarks.') + parser.add_argument( + '--model', + type=str, + choices=BENCHMARK_MODELS, + default='resnet', + help='The model to run benchmark with.') + parser.add_argument( + '--batch_size', type=int, default=32, help='The minibatch size.') + # args related to learning rate + parser.add_argument( + '--learning_rate', type=float, default=0.001, help='The learning rate.') + # TODO(wuyi): add "--use_fake_data" option back. + parser.add_argument( + '--skip_batch_num', + type=int, + default=5, + help='The first num of minibatch num to skip, for better performance test' + ) + parser.add_argument( + '--iterations', type=int, default=80, help='The number of minibatches.') + parser.add_argument( + '--pass_num', type=int, default=100, help='The number of passes.') + parser.add_argument( + '--data_format', + type=str, + default='NCHW', + choices=['NCHW', 'NHWC'], + help='The data data_format, now only support NCHW.') + parser.add_argument( + '--device', + type=str, + default='GPU', + choices=['CPU', 'GPU'], + help='The device type.') + parser.add_argument( + '--gpus', + type=int, + default=1, + help='If gpus > 1, will use ParallelExecutor to run, else use Executor.') + # this option is available only for vgg and resnet. + parser.add_argument( + '--cpus', + type=int, + default=1, + help='If cpus > 1, will use ParallelDo to run, else use Executor.') + parser.add_argument( + '--data_set', + type=str, + default='flowers', + choices=['cifar10', 'flowers'], + help='Optional dataset for benchmark.') + parser.add_argument( + '--infer_only', action='store_true', help='If set, run forward only.') + parser.add_argument( + '--use_cprof', action='store_true', help='If set, use cProfile.') + parser.add_argument( + '--use_nvprof', + action='store_true', + help='If set, use nvprof for CUDA.') + parser.add_argument( + '--no_test', + action='store_true', + help='If set, do not test the testset during training.') + parser.add_argument( + '--memory_optimize', + action='store_true', + help='If set, optimize runtime memory before start.') + parser.add_argument( + '--use_fake_data', + action='store_true', + help='If set ommit the actual read data operators.') + parser.add_argument( + '--profile', action='store_true', help='If set, profile a few steps.') + parser.add_argument( + '--update_method', + type=str, + default='local', + choices=['local', 'pserver', 'nccl2'], + help='Choose parameter update method, can be local, pserver, nccl2.') + parser.add_argument( + '--no_split_var', + action='store_true', + default=False, + help='Whether split variables into blocks when update_method is pserver') + parser.add_argument( + '--async_mode', + action='store_true', + default=False, + help='Whether start pserver in async mode to support ASGD') + parser.add_argument( + '--use_reader_op', + action='store_true', + help='Whether to use reader op, and must specify the data path if set this to true.' + ) + parser.add_argument( + '--data_path', + type=str, + default="", + help='Directory that contains all the training recordio files.') + args = parser.parse_args() + return args diff --git a/benchmark/fluid/fluid_benchmark.py b/benchmark/fluid/fluid_benchmark.py index 62a05234c4..902dca209f 100644 --- a/benchmark/fluid/fluid_benchmark.py +++ b/benchmark/fluid/fluid_benchmark.py @@ -24,108 +24,7 @@ import paddle.fluid.core as core import paddle.fluid.profiler as profiler import paddle.fluid.transpiler.distribute_transpiler as distribute_transpiler -BENCHMARK_MODELS = [ - "machine_translation", "resnet", "vgg", "mnist", "stacked_dynamic_lstm" -] - - -def parse_args(): - parser = argparse.ArgumentParser('Fluid model benchmarks.') - parser.add_argument( - '--model', - type=str, - choices=BENCHMARK_MODELS, - default='resnet', - help='The model to run benchmark with.') - parser.add_argument( - '--batch_size', - type=int, - default=32, - help='The batch size on each gpu.') - parser.add_argument( - '--learning_rate', type=float, default=0.001, help='The learning rate.') - parser.add_argument( - '--skip_batch_num', - type=int, - default=5, - help='The first num of minibatch num to skip, for better performance test' - ) - parser.add_argument( - '--iterations', - type=int, - default=80, - help='The number of minibatches, set to -1 to run all batches.') - parser.add_argument( - '--pass_num', type=int, default=100, help='The number of passes.') - parser.add_argument( - '--data_format', - type=str, - default='NCHW', - choices=['NCHW', 'NHWC'], - help='The data data_format, now only support NCHW.') - parser.add_argument( - '--device', - type=str, - default='GPU', - choices=['CPU', 'GPU'], - help='The device type.') - parser.add_argument( - '--gpus', - type=int, - default=1, - help='If gpus > 1, will use ParallelExecutor to run, else use Executor.') - # this option is available only for vgg and resnet. - parser.add_argument( - '--cpus', - type=int, - default=1, - help='If cpus > 1, will use ParallelDo to run, else use Executor.') - parser.add_argument( - '--data_set', - type=str, - default='flowers', - choices=['cifar10', 'flowers', 'imagenet'], - help='Optional dataset for benchmark.') - parser.add_argument( - '--infer_only', action='store_true', help='If set, run forward only.') - parser.add_argument( - '--use_cprof', action='store_true', help='If set, use cProfile.') - parser.add_argument( - '--use_nvprof', - action='store_true', - help='If set, use nvprof for CUDA.') - parser.add_argument( - '--no_test', - action='store_true', - help='If set, do not test the testset during training.') - parser.add_argument( - '--memory_optimize', - action='store_true', - help='If set, optimize runtime memory before start.') - parser.add_argument( - '--use_fake_data', - action='store_true', - help='If set ommit the actual read data operators.') - parser.add_argument( - '--profile', action='store_true', help='If set, profile a few steps.') - parser.add_argument( - '--update_method', - type=str, - default='local', - choices=['local', 'pserver', 'nccl2'], - help='Choose parameter update method, can be local, pserver, nccl2.') - parser.add_argument( - '--use_reader_op', - action='store_true', - help='Whether to use reader op, and must specify the data path if set this to true.' - ) - parser.add_argument( - '--data_path', - type=str, - default="", - help='Directory that contains all the training recordio files.') - args = parser.parse_args() - return args +from args import * def append_nccl2_prepare(trainer_id): @@ -160,7 +59,7 @@ def append_nccl2_prepare(trainer_id): "nccl-based dist train.") -def dist_transpile(trainer_id): +def dist_transpile(trainer_id, args): if trainer_id < 0: return None, None @@ -182,7 +81,12 @@ def dist_transpile(trainer_id): training_role = os.getenv("PADDLE_TRAINING_ROLE") t = distribute_transpiler.DistributeTranspiler() - t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers) + t.transpile( + trainer_id, + pservers=pserver_endpoints, + trainers=trainers, + sync_mode=not args.async_mode, + slice_var_up=not args.no_split_var) if training_role == "PSERVER": pserver_program = t.get_pserver_program(current_endpoint) pserver_startup_program = t.get_startup_program(current_endpoint, @@ -417,7 +321,7 @@ def main(): fluid.memory_optimize(fluid.default_main_program()) if args.update_method == "pserver": - train_prog, startup_prog = dist_transpile(trainer_id) + train_prog, startup_prog = dist_transpile(trainer_id, args) if not train_prog: raise Exception( "Must configure correct environments to run dist train.") diff --git a/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py b/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py index 1226027ddc..d1d709551c 100644 --- a/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py +++ b/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py @@ -70,17 +70,18 @@ class TestListenAndServOp(OpTest): return p.pid def _wait_ps_ready(self, pid): - retry_times = self.ps_timeout + start_left_time = self.ps_timeout + sleep_time = 0.5 while True: - assert retry_times >= 0, "wait ps ready failed" - time.sleep(0.5) + assert start_left_time >= 0, "wait ps ready failed" + time.sleep(sleep_time) try: # the listen_and_serv_op would touch a file which contains the listen port # on the /tmp directory until it was ready to process all the RPC call. os.stat("/tmp/paddle.%d.port" % pid) return except os.error: - retry_times -= 1 + start_left_time -= sleep_time def test_rpc_interfaces(self): # TODO(Yancey1989): need to make sure the rpc interface correctly. From 627d7a64f8cb6cac25e6bc0ac524f48c430fd62a Mon Sep 17 00:00:00 2001 From: gongweibao Date: Mon, 11 Jun 2018 16:54:50 +0800 Subject: [PATCH 20/23] Clean `sendop` `recv` operator. (#11309) --- .../details/multi_devices_graph_builder.cc | 8 +- paddle/fluid/operators/CMakeLists.txt | 11 +- paddle/fluid/operators/recv_op.cc | 8 +- paddle/fluid/operators/send_op.cc | 50 +++------ paddle/fluid/operators/send_vars_op.cc | 101 ------------------ .../tests/unittests/test_dist_transpiler.py | 5 +- .../unittests/test_simple_dist_transpiler.py | 4 +- .../fluid/transpiler/distribute_transpiler.py | 14 +-- 8 files changed, 39 insertions(+), 162 deletions(-) delete mode 100644 paddle/fluid/operators/send_vars_op.cc diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.cc b/paddle/fluid/framework/details/multi_devices_graph_builder.cc index 97242ebf2a..b568c1344b 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_builder.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_builder.cc @@ -89,7 +89,7 @@ std::vector MultiDevSSAGraphBuilder::FindDistTrainSendVars( for (auto *op : program.Block(0).AllOps()) { // TODO(Yancey1989): use a graceful method to find send op, // instead of the the hard code string - if (op->Type() == "send_vars") { + if (op->Type() == "send") { auto op_vars = op->InputArgumentNames(); send_vars.reserve(send_vars.size() + std::distance(op_vars.begin(), op_vars.end())); @@ -468,17 +468,17 @@ void MultiDevSSAGraphBuilder::CreateRPCOp(SSAGraph *result, new RPCOpHandle(op, local_scopes_[0], op.Type(), places_[0])); if (op.Type() == "send_barrier") { - ConnectOp(result, result->ops_.back().get(), "send_vars"); + ConnectOp(result, result->ops_.back().get(), "send"); } else if (op.Type() == "recv") { ConnectOp(result, result->ops_.back().get(), "send_barrier"); } else if (op.Type() == "fetch_barrier") { ConnectOp(result, result->ops_.back().get(), "recv"); - } else if (op.Type() == "send_vars") { + } else if (op.Type() == "send") { // do nothing } else { PADDLE_THROW( "rpc op should be in [" - "send_vars, send_barrier. recv, fetch_barrier]"); + "send, send_barrier. recv, fetch_barrier]"); } // TODO(Yancey1989): schedule rpc op on different place may diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 5e86b16ba1..0f4050250d 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -189,16 +189,14 @@ if(WITH_DISTRIBUTE) set(DISTRIBUTE_DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf) set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") - op_library(send_op DEPS ${DISTRIBUTE_DEPS}) - set_source_files_properties(send_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) op_library(prefetch_op DEPS ${DISTRIBUTE_DEPS}) set_source_files_properties(prefetch_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) op_library(recv_op DEPS ${DISTRIBUTE_DEPS}) set_source_files_properties(recv_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) op_library(listen_and_serv_op DEPS ${DISTRIBUTE_DEPS}) set_source_files_properties(listen_and_serv_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) - op_library(send_vars_op DEPS ${DISTRIBUTE_DEPS}) - set_source_files_properties(send_vars_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + op_library(send_op DEPS ${DISTRIBUTE_DEPS}) + set_source_files_properties(send_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) op_library(send_barrier_op DEPS ${DISTRIBUTE_DEPS}) op_library(fetch_barrier_op DEPS ${DISTRIBUTE_DEPS}) set_source_files_properties(send_barrier_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) @@ -208,15 +206,14 @@ if(WITH_DISTRIBUTE) # listen_and_serv_op sum_op executor SERIAL) if(WITH_GPU) set_source_files_properties(test_send_nccl_id.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) - cc_test(test_send_nccl_id SRCS test_send_nccl_id.cc DEPS send_op - listen_and_serv_op executor SERIAL) + cc_test(test_send_nccl_id SRCS test_send_nccl_id.cc DEPS listen_and_serv_op executor SERIAL) op_library(gen_nccl_id_op DEPS nccl_common sendrecvop_grpc) set_source_files_properties(gen_nccl_id_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) else() set(DEPS_OPS ${DEPS_OPS} gen_nccl_id_op) endif() else() - set(DEPS_OPS ${DEPS_OPS} send_op prefetch_op recv_op listen_and_serv_op send_vars_op send_barrier_op fetch_barrier_op gen_nccl_id_op) + set(DEPS_OPS ${DEPS_OPS} prefetch_op recv_op listen_and_serv_op send_op send_barrier_op fetch_barrier_op gen_nccl_id_op) endif() op_library(cross_entropy_op DEPS cross_entropy) diff --git a/paddle/fluid/operators/recv_op.cc b/paddle/fluid/operators/recv_op.cc index 1ea1cc458b..4198c3ee56 100644 --- a/paddle/fluid/operators/recv_op.cc +++ b/paddle/fluid/operators/recv_op.cc @@ -78,9 +78,15 @@ This operator can get variables from server side. } }; +class RecvOpShapeInference : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext* ctx) const override {} +}; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; -REGISTER_OPERATOR(recv, ops::RecvOp, ops::RecvOpMaker); +REGISTER_OPERATOR(recv, ops::RecvOp, paddle::framework::EmptyGradOpMaker, + ops::RecvOpMaker, ops::RecvOpShapeInference); diff --git a/paddle/fluid/operators/send_op.cc b/paddle/fluid/operators/send_op.cc index 9697579707..2a2fe53c71 100644 --- a/paddle/fluid/operators/send_op.cc +++ b/paddle/fluid/operators/send_op.cc @@ -16,7 +16,6 @@ limitations under the License. */ #include #include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/detail/grpc_client.h" @@ -36,12 +35,9 @@ class SendOp : public framework::OperatorBase { void RunImpl(const framework::Scope& scope, const platform::Place& place) const override { auto ins = Inputs("X"); - auto outs = Outputs("Out"); - std::vector epmap = Attr>("epmap"); - std::vector endpoints = - Attr>("endpoints"); - bool sync_mode = Attr("sync_mode"); + std::vector epmap = Attr>("epmap"); + int sync_send = Attr("sync_mode"); platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); auto& ctx = *pool.Get(place); @@ -55,32 +51,14 @@ class SendOp : public framework::OperatorBase { for (size_t i = 0; i < ins.size(); i++) { if (NeedSend(scope, ins[i])) { VLOG(3) << "sending " << ins[i] << " to " << epmap[i]; + // TODO(Yancey1989): we need to use an IO threadpool which has + // a larger number of threads than the computing threadpool. rpc_client->AsyncSendVar(epmap[i], ctx, scope, ins[i]); } else { VLOG(3) << "don't send no-initialied variable: " << ins[i]; } } - rpc_client->Wait(); - - if (sync_mode) { - for (auto& ep : endpoints) { - VLOG(3) << "batch barrier, ep: " << ep; - rpc_client->AsyncSendBatchBarrier(ep); - } - rpc_client->Wait(); - } - - if (outs.size() > 0) { - for (size_t i = 0; i < outs.size(); i++) { - VLOG(2) << "getting " << outs[i] << " from " << epmap[i]; - rpc_client->AsyncGetVar(epmap[i], ctx, scope, outs[i]); - } - rpc_client->Wait(); - // tell pservers that current trainer have called fetch - for (auto& ep : endpoints) { - VLOG(2) << "send fetch barrier, ep: " << ep; - rpc_client->AsyncSendFetchBarrier(ep); - } + if (sync_send) { rpc_client->Wait(); } } @@ -89,26 +67,22 @@ class SendOp : public framework::OperatorBase { class SendOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() { - AddInput("X", "(Tensor) Input tensor to be sent").AsDuplicable(); - AddOutput("Out", "(Tensor) Output tensor to be received from server") + AddInput("X", "(Tensor, SelectedRows) Input variables to be sent") .AsDuplicable(); AddComment(R"DOC( Send operator -This operator will send tensor to recv_op at the parameter server. +This operator will send variables to listen_and_serve op at the parameter server. )DOC"); - // TODO(typhoonzero): remove this attr generate de-duplicated vector from - // epmap when initializing. - AddAttr>("endpoints", - "(string vector, default 127.0.0.1:6164)" - "Server endpoints to send variables to.") - .SetDefault({}); + AddAttr("sync_mode", + "(int, default 0)" + "sync send or async send.") + .SetDefault(0); AddAttr>("epmap", "(string vector, default 127.0.0.1:6164)" "Server endpoints in the order of input " "variables for mapping") - .SetDefault({}); - AddAttr("sync_mode", "work in sync_mode or not").SetDefault(true); + .SetDefault({"127.0.0.1:6164"}); } }; diff --git a/paddle/fluid/operators/send_vars_op.cc b/paddle/fluid/operators/send_vars_op.cc deleted file mode 100644 index 564e40461f..0000000000 --- a/paddle/fluid/operators/send_vars_op.cc +++ /dev/null @@ -1,101 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include // NOLINT -#include - -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/detail/grpc_client.h" -#include "paddle/fluid/operators/send_recv_util.h" -#include "paddle/fluid/platform/profiler.h" - -namespace paddle { -namespace operators { - -class SendVarsOp : public framework::OperatorBase { - public: - SendVarsOp(const std::string& type, const framework::VariableNameMap& inputs, - const framework::VariableNameMap& outputs, - const framework::AttributeMap& attrs) - : OperatorBase(type, inputs, outputs, attrs) {} - - void RunImpl(const framework::Scope& scope, - const platform::Place& place) const override { - auto ins = Inputs("X"); - - std::vector epmap = Attr>("epmap"); - int sync_send = Attr("sync_send"); - - platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - auto& ctx = *pool.Get(place); - - // For profiling - platform::RecordEvent record_event(Type(), &ctx); - - detail::RPCClient* rpc_client = - detail::RPCClient::GetInstance(); - - for (size_t i = 0; i < ins.size(); i++) { - if (NeedSend(scope, ins[i])) { - VLOG(3) << "sending " << ins[i] << " to " << epmap[i]; - // TODO(Yancey1989): we need to use an IO threadpool which has - // a larger number of threads than the computing threadpool. - rpc_client->AsyncSendVar(epmap[i], ctx, scope, ins[i]); - } else { - VLOG(3) << "don't send no-initialied variable: " << ins[i]; - } - } - if (sync_send) { - rpc_client->Wait(); - } - } -}; - -class SendVarsOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() { - AddInput("X", "(Tensor, SelectedRows) Input variables to be sent") - .AsDuplicable(); - AddComment(R"DOC( -Send operator - -This operator will send variables to listen_and_serve op at the parameter server. -)DOC"); - AddAttr("sync_send", - "(int, default 0)" - "sync send or async send.") - .SetDefault(0); - AddAttr>("epmap", - "(string vector, default 127.0.0.1:6164)" - "Server endpoints in the order of input " - "variables for mapping") - .SetDefault({"127.0.0.1:6164"}); - } -}; - -class SendVarsOpShapeInference : public framework::InferShapeBase { - public: - void operator()(framework::InferShapeContext* ctx) const override {} -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OPERATOR(send_vars, ops::SendVarsOp, - paddle::framework::EmptyGradOpMaker, ops::SendVarsOpMaker, - ops::SendVarsOpShapeInference); diff --git a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py index 32647f9aa8..b4379ad447 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py +++ b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import unittest import paddle.fluid as fluid from paddle.fluid.transpiler.distribute_transpiler import delete_ops @@ -54,10 +55,10 @@ class TestDistTranspiler(TranspilerTest): delete_ops(trainer.global_block(), optimize_ops) ops = [op.type for op in trainer.global_block().ops] + [ - "split_byref", "send_vars", "send_barrier", "recv", "recv", + "split_byref", "send", "send_barrier", "recv", "recv", "fetch_barrier", "concat" ] - ops.insert(ops.index("elementwise_add_grad") + 1, "send_vars") + ops.insert(ops.index("elementwise_add_grad") + 1, "send") return ops diff --git a/python/paddle/fluid/tests/unittests/test_simple_dist_transpiler.py b/python/paddle/fluid/tests/unittests/test_simple_dist_transpiler.py index 5ae2844e29..f4aa7426bc 100644 --- a/python/paddle/fluid/tests/unittests/test_simple_dist_transpiler.py +++ b/python/paddle/fluid/tests/unittests/test_simple_dist_transpiler.py @@ -59,9 +59,9 @@ class TestSimpleDistTranspiler(TranspilerTest): delete_ops(trainer.global_block(), optimize_ops) ops = [op.type for op in trainer.global_block().ops] + [ - "send_vars", "send_barrier", "recv", "recv", "fetch_barrier" + "send", "send_barrier", "recv", "recv", "fetch_barrier" ] - ops.insert(ops.index("elementwise_add_grad") + 1, "send_vars") + ops.insert(ops.index("elementwise_add_grad") + 1, "send") return ops def _transpiler_instance(self): diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py index c7ab300e0f..5b299a0f29 100644 --- a/python/paddle/fluid/transpiler/distribute_transpiler.py +++ b/python/paddle/fluid/transpiler/distribute_transpiler.py @@ -24,9 +24,9 @@ Steps to transpile trainer: 1. split variable to multiple blocks, aligned by product(dim[1:]) (width). 2. rename splited grad variables to add trainer_id suffix ".trainer_%d". 3. modify trainer program add split_op to each grad variable. -4. append send_op to send splited variables to server and fetch - params(splited blocks or origin param) from server. -5. append concat_op to merge splited blocks to update local weights. +4. append send_op to send splited variables to server and +5. add recv_op to fetch params(splited blocks or origin param) from server. +6. append concat_op to merge splited blocks to update local weights. Steps to transpile pserver: 1. create new program for parameter server. @@ -317,7 +317,7 @@ class DistributeTranspiler: program.global_block().insert_op( index=index + 1, - type="send_vars", + type="send", inputs={"X": splited_vars}, outputs={}, attrs={ @@ -678,7 +678,7 @@ class DistributeTranspiler: break def _split_table_grad_and_add_send_vars(self, program, pserver_endpoints): - # 2. add split_ids_op and send_vars_op to send gradient to pservers + # 2. add split_ids_op and send_op to send gradient to pservers # there should only be one table_name all_ops = program.global_block().ops table_grad_name = grad_var_name(self.table_name) @@ -695,11 +695,11 @@ class DistributeTranspiler: outputs={"Out": self.trainer_side_table_grad_list}) program.global_block().insert_op( index=op_index + 2, - type="send_vars", + type="send", inputs={'X': self.trainer_side_table_grad_list}, outputs={}, attrs={ - "sync_send": True, + "sync_mode": True, "epmap": pserver_endpoints, RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE }) From 9087c6687f661119225a9d9f9b47ae4bc21bdf76 Mon Sep 17 00:00:00 2001 From: gongweibao Date: Mon, 11 Jun 2018 18:49:26 +0800 Subject: [PATCH 21/23] polish (#11363) --- paddle/fluid/framework/op_info.cc | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/paddle/fluid/framework/op_info.cc b/paddle/fluid/framework/op_info.cc index b99e82f8c4..f1261dee03 100644 --- a/paddle/fluid/framework/op_info.cc +++ b/paddle/fluid/framework/op_info.cc @@ -17,12 +17,11 @@ limitations under the License. */ namespace paddle { namespace framework { -static OpInfoMap* g_op_info_map = nullptr; - +// C++11 removes the need for manual locking. Concurrent execution shall wait if +// a static local variable is already being initialized. +// https://stackoverflow.com/questions/11711920/how-to-implement-multithread-safe-singleton-in-c11-without-using-mutex OpInfoMap& OpInfoMap::Instance() { - if (g_op_info_map == nullptr) { - g_op_info_map = new OpInfoMap(); - } + static OpInfoMap* g_op_info_map = new OpInfoMap(); return *g_op_info_map; } } // namespace framework From bfa3fd6f152227c5a527b3fa89bbdd34e37ca94a Mon Sep 17 00:00:00 2001 From: dzhwinter Date: Mon, 11 Jun 2018 19:38:33 +0800 Subject: [PATCH 22/23] add inplace attribute to op_proto_maker (#10665) * "add inplace attribute" * "register inplace attribute" * "change se-next model for memory-reuse" * "fix typo" * repick * fix merge conflict * "fix stupid error" --- paddle/fluid/framework/framework.proto | 1 + paddle/fluid/framework/op_proto_maker.cc | 19 ++++++++++++++++++ paddle/fluid/framework/op_proto_maker.h | 9 +++++++++ paddle/fluid/framework/op_proto_maker_test.cc | 20 +++++++++++++++++++ paddle/fluid/operators/activation_op.cc | 2 +- paddle/fluid/operators/adam_op.cc | 6 +++--- paddle/fluid/operators/batch_norm_op.cc | 8 +++++--- paddle/fluid/operators/conv_op.cc | 6 ++++-- paddle/fluid/operators/cross_entropy_op.cc | 3 ++- paddle/fluid/operators/elementwise_op.h | 2 +- paddle/fluid/operators/mean_op.cc | 2 +- paddle/fluid/operators/pool_op.cc | 6 ++++-- paddle/fluid/operators/sgd_op.cc | 3 ++- paddle/fluid/operators/softmax_op.cc | 3 ++- paddle/fluid/operators/sum_op.cc | 2 +- paddle/fluid/operators/top_k_op.cc | 2 +- 16 files changed, 76 insertions(+), 18 deletions(-) diff --git a/paddle/fluid/framework/framework.proto b/paddle/fluid/framework/framework.proto index d35125fe8c..68fcc104d4 100644 --- a/paddle/fluid/framework/framework.proto +++ b/paddle/fluid/framework/framework.proto @@ -71,6 +71,7 @@ message OpProto { optional bool duplicable = 3 [ default = false ]; optional bool intermediate = 4 [ default = false ]; optional bool dispensable = 5 [ default = false ]; + optional string reuse = 6; } // AttrProto describes the C++ type Attribute. diff --git a/paddle/fluid/framework/op_proto_maker.cc b/paddle/fluid/framework/op_proto_maker.cc index ae9f4efd44..001b5cb5a8 100644 --- a/paddle/fluid/framework/op_proto_maker.cc +++ b/paddle/fluid/framework/op_proto_maker.cc @@ -21,6 +21,7 @@ namespace framework { void OpProtoAndCheckerMaker::Validate() { validated_ = true; CheckNoDuplicatedInOutAttrs(); + CheckReuseVars(); } OpProtoAndCheckerMaker::VariableBuilder OpProtoAndCheckerMaker::AddInput( @@ -56,6 +57,24 @@ void OpProtoAndCheckerMaker::CheckNoDuplicatedInOutAttrs() { } } +void OpProtoAndCheckerMaker::CheckReuseVars() { + std::unordered_set names; + for (auto& input : proto_->inputs()) { + names.insert(input.name()); + } + auto checker = [&](const std::string& name, const std::string& reused) { + PADDLE_ENFORCE( + names.count(reused), + "Output [%s] reuse Input [%s], but the input is not registered.", name, + reused); + }; + for (auto& output : proto_->outputs()) { + if (output.has_reuse()) { + checker(output.name(), output.reuse()); + } + } +} + void OpProtoAndCheckerMaker::operator()(proto::OpProto* proto, OpAttrChecker* attr_checker) { proto_ = proto; diff --git a/paddle/fluid/framework/op_proto_maker.h b/paddle/fluid/framework/op_proto_maker.h index 8493b9d8b3..92f86bb5de 100644 --- a/paddle/fluid/framework/op_proto_maker.h +++ b/paddle/fluid/framework/op_proto_maker.h @@ -14,6 +14,8 @@ limitations under the License. */ #pragma once #include +#include + #include "glog/logging.h" #include "paddle/fluid/framework/attribute.h" #include "paddle/fluid/framework/framework.pb.h" @@ -64,6 +66,11 @@ class OpProtoAndCheckerMaker { var_->set_dispensable(true); return *this; } + + VariableBuilder &Reuse(const std::string &name) { + var_->set_reuse(name); + return *this; + } }; VariableBuilder AddInput(const std::string &name, const std::string &comment); @@ -89,6 +96,8 @@ class OpProtoAndCheckerMaker { void CheckNoDuplicatedInOutAttrs(); void Validate(); + void CheckReuseVars(); + proto::OpProto *proto_; OpAttrChecker *op_checker_; bool validated_{false}; diff --git a/paddle/fluid/framework/op_proto_maker_test.cc b/paddle/fluid/framework/op_proto_maker_test.cc index a8030d377f..58f70cb39c 100644 --- a/paddle/fluid/framework/op_proto_maker_test.cc +++ b/paddle/fluid/framework/op_proto_maker_test.cc @@ -47,3 +47,23 @@ TEST(ProtoMaker, DuplicatedInOut) { ASSERT_THROW(proto_maker(&op_proto, &op_checker), paddle::platform::EnforceNotMet); } + +class TestInplaceProtoMaker : public paddle::framework::OpProtoAndCheckerMaker { + public: + void Make() { + AddInput("X", "input of test op"); + AddOutput("XOut", "output of test op").Reuse("X"); + AddOutput("NoOut", "output of test op").Reuse("NotExists"); + } +}; + +TEST(ProtoMaker, InplaceOutput) { + paddle::framework::proto::OpProto op_proto; + paddle::framework::OpAttrChecker op_checker; + TestInplaceProtoMaker proto_maker; + ASSERT_THROW(proto_maker(&op_proto, &op_checker), + paddle::platform::EnforceNotMet); + // proto_maker(&op_proto, &op_checker); + // proto_maker.Make(); + // ASSERT_THROW(proto_maker.Validate(), paddle::platform::EnforceNotMet); +} diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc index 96e4c0e04c..af1d85047e 100644 --- a/paddle/fluid/operators/activation_op.cc +++ b/paddle/fluid/operators/activation_op.cc @@ -25,7 +25,7 @@ namespace operators { public: \ void Make() override { \ AddInput("X", "Input of " #OP_NAME " operator"); \ - AddOutput("Out", "Output of " #OP_NAME " operator"); \ + AddOutput("Out", "Output of " #OP_NAME " operator").Reuse("X"); \ AddAttr("use_mkldnn", \ "(bool, default false) Only used in mkldnn kernel") \ .SetDefault(false); \ diff --git a/paddle/fluid/operators/adam_op.cc b/paddle/fluid/operators/adam_op.cc index 99b0239855..6ee73c3000 100644 --- a/paddle/fluid/operators/adam_op.cc +++ b/paddle/fluid/operators/adam_op.cc @@ -89,9 +89,9 @@ class AdamOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("Beta1Pow", "(Tensor) Input beta1 power accumulator"); AddInput("Beta2Pow", "(Tensor) Input beta2 power accumulator"); - AddOutput("ParamOut", "(Tensor) Output parameter"); - AddOutput("Moment1Out", "(Tensor) Output first moment"); - AddOutput("Moment2Out", "(Tensor) Output second moment"); + AddOutput("ParamOut", "(Tensor) Output parameter").Reuse("Param"); + AddOutput("Moment1Out", "(Tensor) Output first moment").Reuse("Moment1"); + AddOutput("Moment2Out", "(Tensor) Output second moment").Reuse("Moment2"); AddAttr("beta1", "(float, default 0.9) " diff --git a/paddle/fluid/operators/batch_norm_op.cc b/paddle/fluid/operators/batch_norm_op.cc index d7e0af28c1..92fbb9adaf 100644 --- a/paddle/fluid/operators/batch_norm_op.cc +++ b/paddle/fluid/operators/batch_norm_op.cc @@ -151,13 +151,15 @@ class BatchNormOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("Variance", "The global variance (for training) " "or estimated Variance (for testing)"); - AddOutput("Y", "result after normalization"); + AddOutput("Y", "result after normalization").Reuse("X"); AddOutput("MeanOut", "Share memory with Mean. " - "Store the global mean when training"); + "Store the global mean when training") + .Reuse("Mean"); AddOutput("VarianceOut", "Share memory with Variance. " - "Store the global Variance when training"); + "Store the global Variance when training") + .Reuse("Variance"); AddOutput("SavedMean", "Mean of the current mini batch, " "will apply to output when training") diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc index 850297a232..27f1313116 100644 --- a/paddle/fluid/operators/conv_op.cc +++ b/paddle/fluid/operators/conv_op.cc @@ -125,7 +125,8 @@ void Conv2DOpMaker::Make() { "input image channels divided by the groups."); AddOutput("Output", "(Tensor) The output tensor of convolution operator. " - "The format of output tensor is also NCHW."); + "The format of output tensor is also NCHW.") + .Reuse("Input"); AddAttr>("strides", "(vector default:{1, 1}), the " "strides(h_stride, w_stride) of " @@ -220,7 +221,8 @@ void Conv3DOpMaker::Make() { "input image channels divided by the groups."); AddOutput("Output", "(Tensor) The output tensor of convolution operator." - "The format of output tensor is also NCDHW."); + "The format of output tensor is also NCDHW.") + .Reuse("Input"); AddAttr>("strides", "(vector, default:{1, 1, 1}), the " "strides(d_stride, h_stride, w_stride) of " diff --git a/paddle/fluid/operators/cross_entropy_op.cc b/paddle/fluid/operators/cross_entropy_op.cc index a3bec3da45..d5e095f9ca 100644 --- a/paddle/fluid/operators/cross_entropy_op.cc +++ b/paddle/fluid/operators/cross_entropy_op.cc @@ -124,7 +124,8 @@ class CrossEntropyOpMaker : public framework::OpProtoAndCheckerMaker { "Tensor with shape [N x D]."); AddOutput("Y", "(Tensor, default Tensor), a 2-D tensor with shape " - "[N x 1]. The cross entropy loss."); + "[N x 1]. The cross entropy loss.") + .Reuse("X"); AddAttr("soft_label", "(bool, default false), a flag indicating whether to " "interpretate the given labels as soft labels.") diff --git a/paddle/fluid/operators/elementwise_op.h b/paddle/fluid/operators/elementwise_op.h index f4cec8ad97..0803a6035d 100644 --- a/paddle/fluid/operators/elementwise_op.h +++ b/paddle/fluid/operators/elementwise_op.h @@ -59,7 +59,7 @@ class ElementwiseOpMaker : public framework::OpProtoAndCheckerMaker { void Make() final { AddInput("X", "(Tensor), The first input tensor of elementwise op."); AddInput("Y", "(Tensor), The second input tensor of elementwise op."); - AddOutput("Out", "The output of elementwise op."); + AddOutput("Out", "The output of elementwise op.").Reuse("X"); AddAttr("axis", "(int, default -1). The start dimension index " "for broadcasting Y onto X.") diff --git a/paddle/fluid/operators/mean_op.cc b/paddle/fluid/operators/mean_op.cc index 74477eb439..4881cff4a3 100644 --- a/paddle/fluid/operators/mean_op.cc +++ b/paddle/fluid/operators/mean_op.cc @@ -34,7 +34,7 @@ class MeanOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { AddInput("X", "The input of mean op"); - AddOutput("Out", "The output of mean op"); + AddOutput("Out", "The output of mean op").Reuse("X"); AddComment(R"DOC( Mean Operator. diff --git a/paddle/fluid/operators/pool_op.cc b/paddle/fluid/operators/pool_op.cc index 18aa2bd352..6707cdded4 100644 --- a/paddle/fluid/operators/pool_op.cc +++ b/paddle/fluid/operators/pool_op.cc @@ -151,7 +151,8 @@ void Pool2dOpMaker::Make() { "The format of output tensor is also NCHW, " "where N is batch size, C is the number of channels, " "H is the height of the feature, " - "and W is the width of the feature."); + "and W is the width of the feature.") + .Reuse("X"); AddAttr("pooling_type", "(string), pooling type, can be \"max\" for max-pooling " @@ -244,7 +245,8 @@ void Pool3dOpMaker::Make() { "The format of output tensor is also NCDHW, " "where N is batch size, C is " "the number of channels, and D, H and W is the depth, height and " - "width of the feature, respectively."); + "width of the feature, respectively.") + .Reuse("X"); AddAttr("pooling_type", "(string) Pooling type, can be \"max\" for max-pooling " diff --git a/paddle/fluid/operators/sgd_op.cc b/paddle/fluid/operators/sgd_op.cc index 7a2bdeac09..fef230e42d 100644 --- a/paddle/fluid/operators/sgd_op.cc +++ b/paddle/fluid/operators/sgd_op.cc @@ -74,7 +74,8 @@ class SGDOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("Grad", "(Tensor or SelectedRows) Input gradient"); AddOutput("ParamOut", "(Tensor or SelectedRows, same with Param) " - "Output parameter, should share the same memory with Param"); + "Output parameter, should share the same memory with Param") + .Reuse("Param"); AddComment(R"DOC( SGD operator diff --git a/paddle/fluid/operators/softmax_op.cc b/paddle/fluid/operators/softmax_op.cc index c90a3be964..847b3cbd1b 100644 --- a/paddle/fluid/operators/softmax_op.cc +++ b/paddle/fluid/operators/softmax_op.cc @@ -83,7 +83,8 @@ class SoftmaxOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("X", "The input tensor of softmax. " "2-D with shape [batch_size, input_feature_dimensions]."); - AddOutput("Out", "The normalized values with the same shape as X."); + AddOutput("Out", "The normalized values with the same shape as X.") + .Reuse("X"); AddAttr( "use_cudnn", "(bool, default false) Only used in cudnn kernel, need install cudnn") diff --git a/paddle/fluid/operators/sum_op.cc b/paddle/fluid/operators/sum_op.cc index bcc5e22d4a..863baba9ea 100644 --- a/paddle/fluid/operators/sum_op.cc +++ b/paddle/fluid/operators/sum_op.cc @@ -115,7 +115,7 @@ class SumOpMaker : public framework::OpProtoAndCheckerMaker { void Make() override { AddInput("X", "(vector) The input tensors of sum operator.") .AsDuplicable(); - AddOutput("Out", "(Tensor) The output tensor of sum operator."); + AddOutput("Out", "(Tensor) The output tensor of sum operator.").Reuse("X"); AddComment(R"DOC( Sum operator. diff --git a/paddle/fluid/operators/top_k_op.cc b/paddle/fluid/operators/top_k_op.cc index c17d1afc30..4a8ac441cf 100644 --- a/paddle/fluid/operators/top_k_op.cc +++ b/paddle/fluid/operators/top_k_op.cc @@ -50,7 +50,7 @@ class TopkOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { AddInput("X", "(Tensor) The input of Topk op"); - AddOutput("Out", "(Tensor) The output tensor of Topk op"); + AddOutput("Out", "(Tensor) The output tensor of Topk op").Reuse("X"); AddOutput("Indices", "(Tensor) The indices of Topk elements of input"); AddComment(R"DOC( Top K operator From d9de6b8621cd7f3bf94ef67cef9ffccb3e70eb36 Mon Sep 17 00:00:00 2001 From: gongweibao Date: Mon, 11 Jun 2018 20:24:56 +0800 Subject: [PATCH 23/23] Add brpc surpport. (#11263) --- CMakeLists.txt | 14 +- cmake/configure.cmake | 4 + cmake/external/brpc.cmake | 58 ++++++ cmake/external/leveldb.cmake | 44 +++++ cmake/generic.cmake | 18 ++ .../inference/tensorrt/convert/ut_helper.h | 3 +- paddle/fluid/operators/CMakeLists.txt | 16 +- paddle/fluid/operators/detail/CMakeLists.txt | 34 +++- paddle/fluid/operators/detail/brpc_client.cc | 180 ++++++++++++++++++ paddle/fluid/operators/detail/brpc_client.h | 100 ++++++++++ paddle/fluid/operators/detail/brpc_server.cc | 144 ++++++++++++++ paddle/fluid/operators/detail/brpc_server.h | 53 ++++++ paddle/fluid/operators/detail/grpc_client.cc | 1 + .../{serde_test.cc => grpc_serde_test.cc} | 0 paddle/fluid/operators/detail/macros.h | 27 +++ .../fluid/operators/detail/request_handler.h | 5 +- .../operators/detail/request_handler_impl.cc | 3 - .../operators/detail/request_handler_impl.h | 1 - paddle/fluid/operators/detail/rpc_client.h | 2 + ...grpc_server_test.cc => rpc_server_test.cc} | 16 +- paddle/fluid/operators/detail/send_recv.proto | 2 + .../fluid/operators/detail/sendrecvop_utils.h | 10 - paddle/fluid/operators/fetch_barrier_op.cc | 6 +- paddle/fluid/operators/gen_nccl_id_op.cc | 24 +-- paddle/fluid/operators/listen_and_serv_op.cc | 17 +- paddle/fluid/operators/prefetch_op.cc | 4 +- paddle/fluid/operators/recv_op.cc | 5 +- paddle/fluid/operators/send_barrier_op.cc | 4 +- paddle/fluid/operators/send_op.cc | 4 +- paddle/fluid/operators/test_send_nccl_id.cc | 20 +- 30 files changed, 748 insertions(+), 71 deletions(-) create mode 100644 cmake/external/brpc.cmake create mode 100644 cmake/external/leveldb.cmake create mode 100644 paddle/fluid/operators/detail/brpc_client.cc create mode 100644 paddle/fluid/operators/detail/brpc_client.h create mode 100644 paddle/fluid/operators/detail/brpc_server.cc create mode 100644 paddle/fluid/operators/detail/brpc_server.h rename paddle/fluid/operators/detail/{serde_test.cc => grpc_serde_test.cc} (100%) create mode 100644 paddle/fluid/operators/detail/macros.h rename paddle/fluid/operators/detail/{grpc_server_test.cc => rpc_server_test.cc} (92%) diff --git a/CMakeLists.txt b/CMakeLists.txt index cfaab206e1..b35290e12f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -55,12 +55,13 @@ option(WITH_FLUID_ONLY "Compile PaddlePaddle fluid only" OFF) option(WITH_GOLANG "Compile PaddlePaddle with GOLANG" OFF) option(GLIDE_INSTALL "Download and install go dependencies " ON) option(USE_NNPACK "Compile PaddlePaddle with NNPACK library" OFF) -option(WITH_DISTRIBUTE "Compile with grpc distributed support" OFF) +option(WITH_DISTRIBUTE "Compile with distributed support" OFF) option(USE_EIGEN_FOR_BLAS "Use matrix multiplication in Eigen" OFF) option(EIGEN_USE_THREADS "Compile with multi-threaded Eigen" OFF) option(WITH_ARM_FP16 "Use half precision support on armv8.2-a cpu" OFF) option(WITH_FAST_BUNDLE_TEST "Bundle tests that can be run in a single process together to reduce launch overhead" OFF) option(WITH_CONTRIB "Compile the third-party contributation" OFF) +option(WITH_GRPC "Use grpc as the default rpc framework" ${WITH_DISTRIBUTE}) # CMAKE_BUILD_TYPE if(NOT CMAKE_BUILD_TYPE) @@ -147,7 +148,16 @@ include(external/any) # download libn::any include(external/eigen) # download eigen3 include(external/pybind11) # download pybind11 include(external/cares) -include(external/grpc) + +if(WITH_DISTRIBUTE) + if(WITH_GRPC) + include(external/grpc) + else() + include(external/leveldb) + include(external/brpc) + endif() +endif() + include(external/snappy) # download snappy include(external/snappystream) include(external/threadpool) diff --git a/cmake/configure.cmake b/cmake/configure.cmake index 4158d0528a..89d7a62fe9 100644 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -166,3 +166,7 @@ if(WITH_GOLANG) endif() endif(WITH_GOLANG) + +if(WITH_GRPC) + add_definitions(-DPADDLE_WITH_GRPC) +endif(WITH_GRPC) diff --git a/cmake/external/brpc.cmake b/cmake/external/brpc.cmake new file mode 100644 index 0000000000..8e2c913b2c --- /dev/null +++ b/cmake/external/brpc.cmake @@ -0,0 +1,58 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +INCLUDE(ExternalProject) + +SET(BRPC_SOURCES_DIR ${THIRD_PARTY_PATH}/brpc) +SET(BRPC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/brpc) +SET(BRPC_INCLUDE_DIR "${BRPC_INSTALL_DIR}/include" CACHE PATH "brpc include directory." FORCE) +SET(BRPC_LIBRARIES "${BRPC_INSTALL_DIR}/lib/libbrpc.a" CACHE FILEPATH "brpc library." FORCE) + +INCLUDE_DIRECTORIES(${BRPC_INCLUDE_DIR}) + +# Reference https://stackoverflow.com/questions/45414507/pass-a-list-of-prefix-paths-to-externalproject-add-in-cmake-args +set(prefix_path "${THIRD_PARTY_PATH}/install/gflags|${THIRD_PARTY_PATH}/install/leveldb|${THIRD_PARTY_PATH}/install/snappy|${THIRD_PARTY_PATH}/install/gtest|${THIRD_PARTY_PATH}/install/protobuf") + +# If minimal .a is need, you can set WITH_DEBUG_SYMBOLS=OFF +ExternalProject_Add( + extern_brpc + ${EXTERNAL_PROJECT_LOG_ARGS} + GIT_REPOSITORY "https://github.com/brpc/brpc" + GIT_TAG "6d153dd7ff00f960ae6895c9c5fff0ce9f07aff2" + PREFIX ${BRPC_SOURCES_DIR} + UPDATE_COMMAND "" + CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} + -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} + -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} + -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} + -DCMAKE_INSTALL_PREFIX=${BRPC_INSTALL_DIR} + -DCMAKE_INSTALL_LIBDIR=${BRPC_INSTALL_DIR}/lib + -DCMAKE_POSITION_INDEPENDENT_CODE=ON + -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} + -DCMAKE_PREFIX_PATH=${prefix_path} + -DBRPC_WITH_GLOG=ON + ${EXTERNAL_OPTIONAL_ARGS} + LIST_SEPARATOR | + CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${BRPC_INSTALL_DIR} + -DCMAKE_INSTALL_LIBDIR:PATH=${BRPC_INSTALL_DIR}/lib + -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON + -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} +) +ADD_DEPENDENCIES(extern_brpc protobuf leveldb gflags glog gtest snappy) +ADD_LIBRARY(brpc STATIC IMPORTED GLOBAL) +SET_PROPERTY(TARGET brpc PROPERTY IMPORTED_LOCATION ${BRPC_LIBRARIES}) +ADD_DEPENDENCIES(brpc extern_brpc) + + +LIST(APPEND external_project_dependencies brpc) diff --git a/cmake/external/leveldb.cmake b/cmake/external/leveldb.cmake new file mode 100644 index 0000000000..fb5091731d --- /dev/null +++ b/cmake/external/leveldb.cmake @@ -0,0 +1,44 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +INCLUDE(ExternalProject) + +SET(LEVELDB_SOURCES_DIR ${THIRD_PARTY_PATH}/leveldb) +SET(LEVELDB_INSTALL_DIR ${THIRD_PARTY_PATH}/install/leveldb) +SET(LEVELDB_INCLUDE_DIR "${LEVELDB_INSTALL_DIR}/include" CACHE PATH "leveldb include directory." FORCE) +SET(LEVELDB_LIBRARIES "${LEVELDB_INSTALL_DIR}/lib/libleveldb.a" CACHE FILEPATH "leveldb library." FORCE) +INCLUDE_DIRECTORIES(${LEVELDB_INCLUDE_DIR}) + +ExternalProject_Add( + extern_leveldb + ${EXTERNAL_PROJECT_LOG_ARGS} + PREFIX ${LEVELDB_SOURCES_DIR} + URL "https://github.com/google/leveldb/archive/v1.18.tar.gz" + URL_MD5 "73770de34a2a5ab34498d2e05b2b7fa0" + CONFIGURE_COMMAND "" + BUILD_COMMAND CXXFLAGS=-fPIC make -j ${NUM_OF_PROCESSOR} libleveldb.a + INSTALL_COMMAND mkdir -p ${LEVELDB_INSTALL_DIR}/lib/ + && cp ${LEVELDB_SOURCES_DIR}/src/extern_leveldb/libleveldb.a ${LEVELDB_LIBRARIES} + && cp -r ${LEVELDB_SOURCES_DIR}/src/extern_leveldb/include ${LEVELDB_INSTALL_DIR}/ + BUILD_IN_SOURCE 1 +) + +ADD_DEPENDENCIES(extern_leveldb snappy) + +ADD_LIBRARY(leveldb STATIC IMPORTED GLOBAL) +SET_PROPERTY(TARGET leveldb PROPERTY IMPORTED_LOCATION ${LEVELDB_LIBRARIES}) +ADD_DEPENDENCIES(leveldb extern_leveldb) + +LIST(APPEND external_project_dependencies leveldb) + diff --git a/cmake/generic.cmake b/cmake/generic.cmake index 9ddd05b3d9..0e2df86c19 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -610,3 +610,21 @@ function(grpc_library TARGET_NAME) COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") cc_library("${TARGET_NAME}" SRCS "${grpc_library_SRCS}" DEPS "${TARGET_NAME}_grpc" "${TARGET_NAME}_proto" "${grpc_library_DEPS}") endfunction() + + +function(brpc_library TARGET_NAME) + set(oneValueArgs PROTO) + set(multiValueArgs SRCS DEPS) + set(options "") + cmake_parse_arguments(brpc_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + + message(STATUS "generating brpc ${brpc_library_PROTO}") + + get_filename_component(ABS_PROTO ${brpc_library_PROTO} ABSOLUTE) + get_filename_component(PROTO_WE ${brpc_library_PROTO} NAME_WE) + get_filename_component(PROTO_PATH ${ABS_PROTO} PATH) + + protobuf_generate_cpp(brpc_proto_srcs brpc_proto_hdrs "${ABS_PROTO}") + cc_library("${TARGET_NAME}_proto" SRCS "${brpc_proto_srcs}") + cc_library("${TARGET_NAME}" SRCS "${brpc_library_SRCS}" DEPS "${TARGET_NAME}_proto" "${brpc_library_DEPS}") +endfunction() diff --git a/paddle/fluid/inference/tensorrt/convert/ut_helper.h b/paddle/fluid/inference/tensorrt/convert/ut_helper.h index 236d169017..3b1f531adc 100644 --- a/paddle/fluid/inference/tensorrt/convert/ut_helper.h +++ b/paddle/fluid/inference/tensorrt/convert/ut_helper.h @@ -64,7 +64,8 @@ class TRTConvertValidation { TRTConvertValidation(int batch_size, const std::unordered_set& parameters, - framework::Scope& scope, int workspace_size = 1 << 10) + framework::Scope& scope, // NOLINT + int workspace_size = 1 << 10) : parameters_(parameters), scope_(scope) { // create engine. engine_.reset(new TensorRTEngine(10, 1 << 10, &stream_)); diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 0f4050250d..d6a36eff09 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -186,8 +186,14 @@ endif() add_subdirectory(detail) if(WITH_DISTRIBUTE) - - set(DISTRIBUTE_DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf) + + set(DISTRIBUTE_DEPS "") + if(WITH_GRPC) + set(DISTRIBUTE_DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf) + else() + set(DISTRIBUTE_DEPS sendrecvop_brpc brpc leveldb snappystream snappy protobuf ssl crypto zlib) + endif() + set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") op_library(prefetch_op DEPS ${DISTRIBUTE_DEPS}) set_source_files_properties(prefetch_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) @@ -207,7 +213,11 @@ if(WITH_DISTRIBUTE) if(WITH_GPU) set_source_files_properties(test_send_nccl_id.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) cc_test(test_send_nccl_id SRCS test_send_nccl_id.cc DEPS listen_and_serv_op executor SERIAL) - op_library(gen_nccl_id_op DEPS nccl_common sendrecvop_grpc) + if(WITH_GRPC) + op_library(gen_nccl_id_op DEPS nccl_common sendrecvop_grpc) + else() + op_library(gen_nccl_id_op DEPS nccl_common sendrecvop_brpc) + endif() set_source_files_properties(gen_nccl_id_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) else() set(DEPS_OPS ${DEPS_OPS} gen_nccl_id_op) diff --git a/paddle/fluid/operators/detail/CMakeLists.txt b/paddle/fluid/operators/detail/CMakeLists.txt index c29dc5d7e0..abc5aad043 100644 --- a/paddle/fluid/operators/detail/CMakeLists.txt +++ b/paddle/fluid/operators/detail/CMakeLists.txt @@ -1,12 +1,38 @@ -if(WITH_DISTRIBUTE) +if(NOT WITH_DISTRIBUTE) + return() +endif() + + +if(WITH_GRPC) grpc_library(sendrecvop_grpc SRCS bytebuffer_stream.cc sendrecvop_utils.cc grpc_client.cc request_handler_impl.cc rpc_client.cc rpc_server.cc grpc_server.cc variable_response.cc PROTO send_recv.proto DEPS lod_tensor selected_rows memory) set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") - set_source_files_properties(serde_test.cc grpc_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) - cc_test(serde_test SRCS serde_test.cc variable_response.cc DEPS grpc++_unsecure grpc_unsecure gpr + set_source_files_properties(grpc_serde_test.cc rpc_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + cc_test(serde_test SRCS grpc_serde_test.cc variable_response.cc DEPS grpc++_unsecure grpc_unsecure gpr cares zlib protobuf sendrecvop_grpc SERIAL) - cc_test(grpc_server_test SRCS grpc_server_test.cc DEPS sendrecvop_grpc + cc_test(grpc_server_test SRCS rpc_server_test.cc DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor proto_desc lookup_table_op SERIAL) + return() endif() + + +set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") +set_source_files_properties(brpc_server.cc brpc_client.cc rpc_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +brpc_library(sendrecvop_brpc SRCS brpc_client.cc brpc_server.cc rpc_server.cc rpc_client.cc request_handler_impl.cc + PROTO send_recv.proto + DEPS lod_tensor selected_rows memory) + +find_library(OPENSSL_CRYPTO_LIBRARY_STATIC NAMES libcrypto.so) +ADD_LIBRARY(crypto SHARED IMPORTED GLOBAL) +SET_PROPERTY(TARGET crypto PROPERTY IMPORTED_LOCATION ${OPENSSL_CRYPTO_LIBRARY_STATIC}) + + +find_library(OPENSSL_SSL_LIBRARY_STATIC NAMES libssl.so) +ADD_LIBRARY(ssl SHARED IMPORTED GLOBAL) +SET_PROPERTY(TARGET ssl PROPERTY IMPORTED_LOCATION ${OPENSSL_SSL_LIBRARY_STATIC}) + +cc_test(brpc_server_test SRCS rpc_server_test.cc DEPS sendrecvop_brpc + brpc protobuf leveldb gflags glog + protobuf executor proto_desc lookup_table_op snappystream snappy ssl crypto SERIAL) diff --git a/paddle/fluid/operators/detail/brpc_client.cc b/paddle/fluid/operators/detail/brpc_client.cc new file mode 100644 index 0000000000..9a4e410f1d --- /dev/null +++ b/paddle/fluid/operators/detail/brpc_client.cc @@ -0,0 +1,180 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/detail/brpc_client.h" +#include "paddle/fluid/framework/threadpool.h" + +namespace paddle { +namespace operators { +namespace detail { + +DEFINE_int32(brpc_channel_num, 24, + "Number of channels to send requests connected to one server"); +DEFINE_int32(timeout_ms, 30000, "RPC timeout in milliseconds"); +DEFINE_int32(max_retry, 3, "Max retries(not including the first RPC)"); + +BRPCClient::~BRPCClient() { Wait(); } + +void HandleSendResponse(brpc::Controller* cntl, + sendrecv::VoidMessage* response) { + // std::unique_ptr makes sure cntl/response will be deleted before returning. + std::unique_ptr cntl_guard(cntl); + std::unique_ptr response_guard(response); + + if (cntl->Failed()) { + LOG(WARNING) << "Fail to send EchoRequest, " << cntl->ErrorText(); + return; + } + LOG(INFO) << "Received response from " << cntl->remote_side() + << " latency=" << cntl->latency_us() << "us"; +} + +bool BRPCClient::AsyncSendVar(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& var_name, int64_t time_out) { + const platform::DeviceContext* p_ctx = &ctx; + const std::string ep_val = ep; + const std::string var_name_val = var_name; + const framework::Scope* p_scope = &scope; + const auto ch_ptr = GetChannel(ep_val); + + framework::AsyncIO( + [var_name_val, p_ctx, ep_val, p_scope, time_out, ch_ptr, this] { + auto ch_ctx = ch_ptr->Pop(); + brpc::Controller* cntl = new brpc::Controller(); + sendrecv::VoidMessage* response = new sendrecv::VoidMessage(); + cntl->set_timeout_ms(time_out); + + google::protobuf::Closure* done = + brpc::NewCallback(&HandleSendResponse, cntl, response); + + sendrecv::VariableMessage request; + ch_ctx->stub->SendVariable(cntl, &request, response, done); + }); + req_count_++; + + return true; +} + +void HandleGetResponse(brpc::Controller* cntl, + sendrecv::VariableMessage* response) { + // std::unique_ptr makes sure cntl/response will be deleted before returning. + std::unique_ptr cntl_guard(cntl); + std::unique_ptr response_guard(response); + + if (cntl->Failed()) { + LOG(WARNING) << "Fail to send EchoRequest, " << cntl->ErrorText(); + return; + } + LOG(INFO) << "Received response from " << cntl->remote_side() + << " latency=" << cntl->latency_us() << "us"; + + // framework::Variable* outvar = nullptr; + // DeserializeFromByteBuffer(ret_msg, *var_h.ctx, var_h.scope, &outvar); +} + +bool BRPCClient::AsyncGetVar(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& var_name, int64_t time_out) { + const platform::DeviceContext* p_ctx = &ctx; + const std::string ep_val = ep; + const std::string var_name_val = var_name; + const framework::Scope* p_scope = &scope; + const auto ch = GetChannel(ep_val); + + framework::AsyncIO( + [var_name_val, ep_val, p_scope, p_ctx, time_out, ch, this] {}); + + req_count_++; + + return true; +} + +bool BRPCClient::AsyncPrefetchVar(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& in_var_name, + const std::string& out_var_name, + int64_t time_out) { + const platform::DeviceContext* p_ctx = &ctx; + const std::string ep_val = ep; + const std::string in_var_name_val = in_var_name; + const std::string out_var_name_val = out_var_name; + const framework::Scope* p_scope = &scope; + const auto ch = GetChannel(ep_val); + + framework::AsyncIO([in_var_name_val, out_var_name_val, ep_val, p_scope, p_ctx, + time_out, ch, this] {}); + + req_count_++; + return true; +} + +void BRPCClient::AsyncSendBatchBarrier(const std::string& ep, + int64_t time_out) { + req_count_++; +} + +void BRPCClient::AsyncSendFetchBarrier(const std::string& ep, + int64_t time_out) { + req_count_++; +} + +void BRPCClient::Wait() { + std::unique_lock lk(sync_mutex_); + sync_cond_.wait(lk, [this] { return req_count_ == 0; }); +} + +ChannelQueuePtr BRPCClient::GetChannel(const std::string& ep) { + { + std::lock_guard guard(chan_mutex_); + auto it = channels_.find(ep); + if (it != channels_.end()) { + return it->second; + } + } + + ChannelQueuePtr q(new framework::BlockingQueue()); + + brpc::ChannelOptions options; + options.protocol = "baidu_std"; + options.connection_type = "pooled"; + options.connect_timeout_ms = 100; + options.timeout_ms = FLAGS_timeout_ms /*milliseconds*/; + options.max_retry = FLAGS_max_retry; + for (int i = 0; i < FLAGS_brpc_channel_num; ++i) { + std::shared_ptr c(new ChannelContext()); + if (c->channel.Init(ep.c_str(), &options) != 0) { + LOG(ERROR) << "Fail to initialize channel"; + return nullptr; + } + + c->stub.reset(new sendrecv::SendRecvService_Stub( + static_cast(&c->channel))); + q->Push(c); + } + + { + std::lock_guard guard(chan_mutex_); + channels_[ep] = q; + } + + return q; +} + +} // namespace detail +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/detail/brpc_client.h b/paddle/fluid/operators/detail/brpc_client.h new file mode 100644 index 0000000000..1e953ea431 --- /dev/null +++ b/paddle/fluid/operators/detail/brpc_client.h @@ -0,0 +1,100 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include + +#include // NOLINT +#include +#include +#include +#include +#include // NOLINT +#include +#include + +#include "brpc/channel.h" +#include "paddle/fluid/framework/blocking_queue.h" +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/operators/detail/rpc_client.h" +#include "paddle/fluid/operators/detail/send_recv.pb.h" +#include "paddle/fluid/platform/macros.h" // for DISABLE_COPY_AND_ASSIGN + +namespace paddle { +namespace operators { +namespace detail { + +struct ChannelContext { + brpc::Channel channel; + std::shared_ptr stub; +}; + +typedef std::shared_ptr ChannelContextPtr; +typedef std::shared_ptr> + ChannelQueuePtr; + +class BRPCClient : public RPCClient { + public: + BRPCClient() {} + virtual ~BRPCClient(); + + bool AsyncSendVar(const std::string& ep, const platform::DeviceContext& ctx, + const framework::Scope& scope, const std::string& var_name, + int64_t time_out = RPCClient::rpc_time_out) override; + + bool AsyncGetVar(const std::string& ep, const platform::DeviceContext& ctx, + const framework::Scope& scope, const std::string& var_name, + int64_t time_out = RPCClient::rpc_time_out) override; + + bool AsyncPrefetchVar(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& in_var_name, + const std::string& out_var_name, + int64_t time_out = RPCClient::rpc_time_out) override; + + void AsyncSendBatchBarrier( + const std::string& ep, + int64_t time_out = RPCClient::rpc_time_out) override; + + void AsyncSendFetchBarrier( + const std::string& ep, + int64_t time_out = RPCClient::rpc_time_out) override; + + void Wait() override; + + private: + void Proceed(); + ChannelQueuePtr GetChannel(const std::string& ep); + + private: + std::unordered_map channels_; + + // mutex for Wait client sync + std::mutex sync_mutex_; + std::condition_variable sync_cond_; + std::atomic req_count_{0}; + + // mutex for GetChannel thread safety + std::mutex chan_mutex_; + DISABLE_COPY_AND_ASSIGN(BRPCClient); +}; + +} // namespace detail +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/detail/brpc_server.cc b/paddle/fluid/operators/detail/brpc_server.cc new file mode 100644 index 0000000000..2170abe679 --- /dev/null +++ b/paddle/fluid/operators/detail/brpc_server.cc @@ -0,0 +1,144 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/detail/brpc_server.h" +#include "paddle/fluid/operators/detail/request_handler.h" + +namespace sendrecv { + +typedef std::unordered_map + HandlerMap; + +class BRPCServiceImpl : public SendRecvService { + public: + explicit BRPCServiceImpl(const HandlerMap& rpc_call_map) + : request_send_h_(nullptr), + request_get_h_(nullptr), + request_prefetch_h_(nullptr) { + auto it = rpc_call_map.find(paddle::operators::detail::kRequestSend); + if (it != rpc_call_map.end()) { + request_send_h_ = it->second; + } + + it = rpc_call_map.find(paddle::operators::detail::kRequestSend); + if (it != rpc_call_map.end()) { + request_get_h_ = it->second; + } + + it = rpc_call_map.find(paddle::operators::detail::kRequestPrefetch); + if (it != rpc_call_map.end()) { + request_prefetch_h_ = it->second; + } + } + + virtual ~BRPCServiceImpl() {} + + void SendVariable(google::protobuf::RpcController* cntl_butil, + const VariableMessage* request, VoidMessage* response, + google::protobuf::Closure* done) override { + PADDLE_ENFORCE(request_send_h_ != nullptr, + "RequestSend handler should be registed first!"); + brpc::ClosureGuard done_guard(done); + + paddle::framework::Scope* local_scope = request_send_h_->scope(); + paddle::framework::Variable* outvar = nullptr; + paddle::framework::Variable* invar = nullptr; + + std::string varname = request->varname(); + + if (!request_send_h_->sync_mode()) { + local_scope = &request_send_h_->scope()->NewScope(); + invar = local_scope->Var(varname); + } else { + invar = local_scope->FindVar(varname); + } + + request_send_h_->Handle(varname, local_scope, invar, &outvar); + + if (!request_send_h_->sync_mode()) { + request_send_h_->scope()->DeleteScope(local_scope); + } + } + + void GetVariable(google::protobuf::RpcController* cntl_butil, + const VariableMessage* request, VariableMessage* response, + google::protobuf::Closure* done) override { + PADDLE_ENFORCE(request_get_h_ != nullptr, + "RequestGet handler should be registed first!"); + } + + void PrefetchVariable(google::protobuf::RpcController* cntl_butil, + const VariableMessage* request, + VariableMessage* response, + google::protobuf::Closure* done) override { + PADDLE_ENFORCE(request_prefetch_h_ != nullptr, + "kRequestPrefetch handler should be registed first!"); + } + + private: + paddle::operators::detail::RequestHandler* request_send_h_; + paddle::operators::detail::RequestHandler* request_get_h_; + paddle::operators::detail::RequestHandler* request_prefetch_h_; +}; +} // namespace sendrecv + +namespace paddle { +namespace operators { +namespace detail { + +void AsyncBRPCServer::StartServer() { + // Instance of your service. + sendrecv::BRPCServiceImpl service_impl(rpc_call_map_); + + // Add the service into server. Notice the second parameter, because the + // service is put on stack, we don't want server to delete it, otherwise + // use brpc::SERVER_OWNS_SERVICE. + if (server_.AddService(&service_impl, brpc::SERVER_DOESNT_OWN_SERVICE) != 0) { + LOG(FATAL) << "Fail to add service"; + return; + } + + brpc::ServerOptions options; + options.idle_timeout_sec = idle_timeout_s_; + options.max_concurrency = max_concurrency_; + if (server_.Start(bind_address_.c_str(), &options) != 0) { + LOG(FATAL) << "Fail to start EchoServer" << bind_address_; + return; + } + + butil::EndPoint ep = server_.listen_address(); + selected_port_ = ep.port; + + { + std::lock_guard lock(this->mutex_ready_); + ready_ = 1; + } + condition_ready_.notify_all(); + + server_.Join(); +} + +void AsyncBRPCServer::ShutDownImpl() { server_.Stop(1000); } + +void AsyncBRPCServer::WaitServerReady() { + VLOG(3) << "AsyncGRPCServer is wait server ready"; + std::unique_lock lock(this->mutex_ready_); + condition_ready_.wait(lock, [=] { return this->ready_ == 1; }); + VLOG(3) << "AsyncGRPCServer WaitSeverReady"; +} + +}; // namespace detail +}; // namespace operators +}; // namespace paddle diff --git a/paddle/fluid/operators/detail/brpc_server.h b/paddle/fluid/operators/detail/brpc_server.h new file mode 100644 index 0000000000..0105c8074a --- /dev/null +++ b/paddle/fluid/operators/detail/brpc_server.h @@ -0,0 +1,53 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include // NOLINT +#include // NOLINT +#include + +#include "brpc/server.h" +#include "paddle/fluid/operators/detail/rpc_server.h" +#include "paddle/fluid/operators/detail/send_recv.pb.h" + +namespace paddle { +namespace operators { +namespace detail { + +class AsyncBRPCServer final : public RPCServer { + public: + explicit AsyncBRPCServer(const std::string& address, int client_num) + : RPCServer(address, client_num), ready_(0) {} + + virtual ~AsyncBRPCServer() {} + void StartServer() override; + void WaitServerReady() override; + + private: + void ShutDownImpl() override; + + brpc::Server server_; + + static constexpr int idle_timeout_s_ = -1; + static constexpr int max_concurrency_ = 0; + + std::mutex mutex_ready_; + std::condition_variable condition_ready_; + int ready_; +}; + +}; // namespace detail +}; // namespace operators +}; // namespace paddle diff --git a/paddle/fluid/operators/detail/grpc_client.cc b/paddle/fluid/operators/detail/grpc_client.cc index fae39418b4..6b8373b150 100644 --- a/paddle/fluid/operators/detail/grpc_client.cc +++ b/paddle/fluid/operators/detail/grpc_client.cc @@ -19,6 +19,7 @@ limitations under the License. */ #include #include "paddle/fluid/framework/threadpool.h" +#include "paddle/fluid/operators/detail/request_handler.h" #include "paddle/fluid/platform/profiler.h" namespace paddle { diff --git a/paddle/fluid/operators/detail/serde_test.cc b/paddle/fluid/operators/detail/grpc_serde_test.cc similarity index 100% rename from paddle/fluid/operators/detail/serde_test.cc rename to paddle/fluid/operators/detail/grpc_serde_test.cc diff --git a/paddle/fluid/operators/detail/macros.h b/paddle/fluid/operators/detail/macros.h new file mode 100644 index 0000000000..da1de72dad --- /dev/null +++ b/paddle/fluid/operators/detail/macros.h @@ -0,0 +1,27 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#ifdef PADDLE_WITH_GRPC +#include "paddle/fluid/operators/detail/grpc_client.h" +#include "paddle/fluid/operators/detail/grpc_server.h" +#define RPCSERVER_T detail::AsyncGRPCServer +#define RPCCLIENT_T detail::GRPCClient +#else +#include "paddle/fluid/operators/detail/brpc_client.h" +#include "paddle/fluid/operators/detail/brpc_server.h" +#define RPCSERVER_T detail::AsyncBRPCServer +#define RPCCLIENT_T detail::BRPCClient +#endif diff --git a/paddle/fluid/operators/detail/request_handler.h b/paddle/fluid/operators/detail/request_handler.h index d74206aaba..fa979024e3 100644 --- a/paddle/fluid/operators/detail/request_handler.h +++ b/paddle/fluid/operators/detail/request_handler.h @@ -28,7 +28,6 @@ #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/var_type.h" -#include "paddle/fluid/operators/detail/sendrecvop_utils.h" namespace paddle { namespace operators { @@ -38,6 +37,10 @@ constexpr char kRequestSend[] = "RequestSend"; constexpr char kRequestGet[] = "RequestGet"; constexpr char kRequestPrefetch[] = "RequestPrefetch"; +#define LISTEN_TERMINATE_MESSAGE "TERMINATE@RECV" +#define BATCH_BARRIER_MESSAGE "BATCH_BARRIER@RECV" +#define FETCH_BARRIER_MESSAGE "FETCH_BARRIER@RECV" + class RPCServer; class RequestHandler { diff --git a/paddle/fluid/operators/detail/request_handler_impl.cc b/paddle/fluid/operators/detail/request_handler_impl.cc index 9473dce550..5f1a346e93 100644 --- a/paddle/fluid/operators/detail/request_handler_impl.cc +++ b/paddle/fluid/operators/detail/request_handler_impl.cc @@ -16,15 +16,12 @@ #include #include -#include "paddle/fluid/framework/blocking_queue.h" #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/operators/detail/request_handler_impl.h" #include "paddle/fluid/operators/detail/rpc_server.h" -#include "paddle/fluid/operators/detail/sendrecvop_utils.h" -#include "paddle/fluid/operators/detail/variable_response.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/detail/request_handler_impl.h b/paddle/fluid/operators/detail/request_handler_impl.h index 443d951914..c392267cfa 100644 --- a/paddle/fluid/operators/detail/request_handler_impl.h +++ b/paddle/fluid/operators/detail/request_handler_impl.h @@ -29,7 +29,6 @@ #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/var_type.h" #include "paddle/fluid/operators/detail/request_handler.h" -#include "paddle/fluid/operators/detail/sendrecvop_utils.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/detail/rpc_client.h b/paddle/fluid/operators/detail/rpc_client.h index 8e3717f076..7e76ac0348 100644 --- a/paddle/fluid/operators/detail/rpc_client.h +++ b/paddle/fluid/operators/detail/rpc_client.h @@ -26,6 +26,8 @@ namespace detail { class RPCClient { public: + RPCClient() {} + virtual ~RPCClient() {} virtual bool AsyncSendVar(const std::string& ep, const platform::DeviceContext& ctx, const framework::Scope& scope, diff --git a/paddle/fluid/operators/detail/grpc_server_test.cc b/paddle/fluid/operators/detail/rpc_server_test.cc similarity index 92% rename from paddle/fluid/operators/detail/grpc_server_test.cc rename to paddle/fluid/operators/detail/rpc_server_test.cc index a1f9ba15e6..f49274a7b5 100644 --- a/paddle/fluid/operators/detail/grpc_server_test.cc +++ b/paddle/fluid/operators/detail/rpc_server_test.cc @@ -17,15 +17,14 @@ limitations under the License. */ #include // NOLINT #include "gtest/gtest.h" -#include "paddle/fluid/operators/detail/grpc_client.h" -#include "paddle/fluid/operators/detail/grpc_server.h" -#include "paddle/fluid/operators/detail/rpc_client.h" - #include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/operators/detail/macros.h" #include "paddle/fluid/operators/detail/request_handler_impl.h" +#include "paddle/fluid/operators/detail/rpc_client.h" +#include "paddle/fluid/operators/detail/rpc_server.h" namespace framework = paddle::framework; namespace platform = paddle::platform; @@ -33,7 +32,7 @@ namespace detail = paddle::operators::detail; USE_OP(lookup_table); -std::unique_ptr g_rpc_service; +std::unique_ptr g_rpc_service; std::unique_ptr g_req_handler; framework::BlockDesc* AppendPrefetchBlcok(framework::ProgramDesc* program) { @@ -112,20 +111,19 @@ void StartServer() { g_req_handler->SetRPCServer(g_rpc_service.get()); std::thread server_thread( - std::bind(&detail::AsyncGRPCServer::StartServer, g_rpc_service.get())); + std::bind(&detail::RPCServer::StartServer, g_rpc_service.get())); server_thread.join(); } TEST(PREFETCH, CPU) { g_req_handler.reset(new detail::RequestPrefetchHandler(true)); - g_rpc_service.reset(new detail::AsyncGRPCServer("127.0.0.1:0", 1)); + g_rpc_service.reset(new RPCSERVER_T("127.0.0.1:0", 1)); + detail::RPCClient* client = detail::RPCClient::GetInstance(); std::thread server_thread(StartServer); g_rpc_service->WaitServerReady(); - detail::RPCClient* client = - detail::RPCClient::GetInstance(); int port = g_rpc_service->GetSelectedPort(); std::string ep = paddle::string::Sprintf("127.0.0.1:%d", port); diff --git a/paddle/fluid/operators/detail/send_recv.proto b/paddle/fluid/operators/detail/send_recv.proto index a244afc46f..54cb93e04d 100644 --- a/paddle/fluid/operators/detail/send_recv.proto +++ b/paddle/fluid/operators/detail/send_recv.proto @@ -14,6 +14,8 @@ limitations under the License. */ syntax = "proto3"; package sendrecv; +// option cc_generic_services = true; + service SendRecvService { // For parameter server round-robin like hashing, do not split tensors. // Send and recv only one tensor diff --git a/paddle/fluid/operators/detail/sendrecvop_utils.h b/paddle/fluid/operators/detail/sendrecvop_utils.h index c72e1bd076..bd16bf1dab 100644 --- a/paddle/fluid/operators/detail/sendrecvop_utils.h +++ b/paddle/fluid/operators/detail/sendrecvop_utils.h @@ -32,16 +32,6 @@ namespace paddle { namespace operators { namespace detail { -#define LISTEN_TERMINATE_MESSAGE "TERMINATE@RECV" -#define BATCH_BARRIER_MESSAGE "BATCH_BARRIER@RECV" -#define FETCH_BARRIER_MESSAGE "FETCH_BARRIER@RECV" - -static int64_t GetTimestamp() { - struct timeval tp; - gettimeofday(&tp, NULL); - return tp.tv_sec * 1000 + tp.tv_usec / 1000; -} - typedef void (*DestroyCallback)(void*); void SerializeToByteBuffer(const std::string& name, framework::Variable* var, diff --git a/paddle/fluid/operators/fetch_barrier_op.cc b/paddle/fluid/operators/fetch_barrier_op.cc index ad67585e48..98b051afb5 100644 --- a/paddle/fluid/operators/fetch_barrier_op.cc +++ b/paddle/fluid/operators/fetch_barrier_op.cc @@ -19,9 +19,7 @@ limitations under the License. */ #include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" - -#include "paddle/fluid/operators/detail/grpc_client.h" -#include "paddle/fluid/operators/detail/rpc_client.h" +#include "paddle/fluid/operators/detail/macros.h" #include "paddle/fluid/platform/profiler.h" namespace paddle { @@ -45,7 +43,7 @@ class FetchBarrierOp : public framework::OperatorBase { platform::RecordEvent record_event(Type(), &ctx); detail::RPCClient* rpc_client = - detail::RPCClient::GetInstance(); + detail::RPCClient::GetInstance(); rpc_client->Wait(); diff --git a/paddle/fluid/operators/gen_nccl_id_op.cc b/paddle/fluid/operators/gen_nccl_id_op.cc index 547de4fa49..111e58844c 100644 --- a/paddle/fluid/operators/gen_nccl_id_op.cc +++ b/paddle/fluid/operators/gen_nccl_id_op.cc @@ -21,8 +21,7 @@ limitations under the License. */ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/threadpool.h" -#include "paddle/fluid/operators/detail/grpc_client.h" -#include "paddle/fluid/operators/detail/grpc_server.h" +#include "paddle/fluid/operators/detail/macros.h" #include "paddle/fluid/operators/detail/request_handler_impl.h" #include "paddle/fluid/platform/nccl_helper.h" @@ -61,8 +60,8 @@ class GenNCCLIdOp : public framework::OperatorBase { std::vector endpoint_list = Attr>("endpoint_list"); - detail::RPCClient* client = - detail::RPCClient::GetInstance(); + detail::RPCClient* client = detail::RPCClient::GetInstance(); + for (auto& ep : endpoint_list) { VLOG(3) << "sending nccl id to " << ep; client->AsyncSendVar(ep, dev_ctx, *scope, NCCL_ID_VARNAME); @@ -78,9 +77,11 @@ class GenNCCLIdOp : public framework::OperatorBase { // deleter will call GRPC Server's base class's dtor and // that will cause a wired crash. detail::RequestSendHandler rpc_h(true); - detail::AsyncGRPCServer rpc_service(endpoint, 1); - rpc_service.RegisterRPC(detail::kRequestSend, &rpc_h); - rpc_h.SetRPCServer(&rpc_service); + std::unique_ptr rpc_service( + new RPCSERVER_T(endpoint, 1)); + + rpc_service->RegisterRPC(detail::kRequestSend, &rpc_h); + rpc_h.SetRPCServer(rpc_service.get()); framework::ProgramDesc empty_program; framework::Executor executor(dev_ctx.GetPlace()); @@ -90,12 +91,13 @@ class GenNCCLIdOp : public framework::OperatorBase { rpc_h.SetExecutor(&executor); std::thread server_thread( - std::bind(&detail::AsyncGRPCServer::StartServer, &rpc_service)); - rpc_service.SetCond(detail::kRequestSend); + std::bind(&detail::RPCServer::StartServer, rpc_service.get())); + + rpc_service->SetCond(detail::kRequestSend); VLOG(3) << "start getting nccl id from trainer 0..."; - rpc_service.WaitBarrier(detail::kRequestSend); + rpc_service->WaitBarrier(detail::kRequestSend); VLOG(3) << "got nccl id and stop server..."; - rpc_service.ShutDown(); + rpc_service->ShutDown(); VLOG(3) << "rpc server stopped"; server_thread.join(); } diff --git a/paddle/fluid/operators/listen_and_serv_op.cc b/paddle/fluid/operators/listen_and_serv_op.cc index 66d31c8895..7610600440 100644 --- a/paddle/fluid/operators/listen_and_serv_op.cc +++ b/paddle/fluid/operators/listen_and_serv_op.cc @@ -19,7 +19,8 @@ limitations under the License. */ #include // NOLINT #include -#include "paddle/fluid/operators/detail/grpc_server.h" +#include "paddle/fluid/operators/detail/macros.h" + #include "paddle/fluid/operators/detail/request_handler_impl.h" #include "paddle/fluid/operators/listen_and_serv_op.h" #include "paddle/fluid/platform/profiler.h" @@ -89,6 +90,12 @@ void ListenAndServOp::SavePort() const { rpc_service_->SavePort(); } +static int64_t GetTimestamp() { + struct timeval tp; + gettimeofday(&tp, NULL); + return tp.tv_sec * 1000 + tp.tv_usec / 1000; +} + void ListenAndServOp::RunSyncLoop(framework::Executor *executor, framework::ProgramDesc *program, framework::Scope *recv_scope, @@ -127,7 +134,7 @@ void ListenAndServOp::RunSyncLoop(framework::Executor *executor, int32_t last_parent_blkid = program->Block(1).Parent(); std::vector parallel_blkids; parallel_blkids.push_back(1); - double ts = detail::GetTimestamp(); + double ts = GetTimestamp(); for (size_t blkid = 2; blkid < num_blocks; ++blkid) { if (blkid != static_cast(prefetch_block->ID())) { if (program->Block(blkid).Parent() != last_parent_blkid) { @@ -141,7 +148,7 @@ void ListenAndServOp::RunSyncLoop(framework::Executor *executor, } ParallelExecuteBlocks(parallel_blkids, executor, optimize_prepared, program, recv_scope); - VLOG(2) << "run all blocks spent " << detail::GetTimestamp() - ts << "(ms)"; + VLOG(2) << "run all blocks spent " << GetTimestamp() - ts << "(ms)"; rpc_service_->SetCond(detail::kRequestGet); rpc_service_->WaitBarrier(detail::kRequestGet); @@ -235,8 +242,8 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope, LOG(INFO) << "sync_mode:" << sync_mode << ", fan_in:" << fan_in << ", end_point:" << endpoint; - // request_handler_.reset(new detail::GRPCRequestSendHandler(sync_mode)); - rpc_service_.reset(new detail::AsyncGRPCServer(endpoint, fan_in)); + rpc_service_.reset(new RPCSERVER_T(endpoint, fan_in)); + request_send_handler_.reset(new detail::RequestSendHandler(sync_mode)); request_get_handler_.reset(new detail::RequestGetHandler(sync_mode)); request_prefetch_handler_.reset( diff --git a/paddle/fluid/operators/prefetch_op.cc b/paddle/fluid/operators/prefetch_op.cc index d96359d6be..f71ba84b31 100644 --- a/paddle/fluid/operators/prefetch_op.cc +++ b/paddle/fluid/operators/prefetch_op.cc @@ -18,7 +18,7 @@ limitations under the License. */ #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/detail/grpc_client.h" +#include "paddle/fluid/operators/detail/macros.h" #include "paddle/fluid/operators/send_recv_util.h" namespace paddle { @@ -42,7 +42,7 @@ class PrefetchOp : public framework::OperatorBase { auto& ctx = *pool.Get(place); detail::RPCClient* rpc_client = - detail::RPCClient::GetInstance(); + detail::RPCClient::GetInstance(); for (size_t i = 0; i < ins.size(); i++) { if (NeedSend(scope, ins[i])) { diff --git a/paddle/fluid/operators/recv_op.cc b/paddle/fluid/operators/recv_op.cc index 4198c3ee56..15dfb5469b 100644 --- a/paddle/fluid/operators/recv_op.cc +++ b/paddle/fluid/operators/recv_op.cc @@ -19,8 +19,7 @@ limitations under the License. */ #include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" - -#include "paddle/fluid/operators/detail/grpc_client.h" +#include "paddle/fluid/operators/detail/macros.h" #include "paddle/fluid/platform/profiler.h" namespace paddle { @@ -45,7 +44,7 @@ class RecvOp : public framework::OperatorBase { platform::RecordEvent record_event(Type(), &ctx); detail::RPCClient* rpc_client = - detail::RPCClient::GetInstance(); + detail::RPCClient::GetInstance(); for (size_t i = 0; i < outs.size(); i++) { VLOG(3) << "getting " << outs[i] << " from " << epmap[i]; diff --git a/paddle/fluid/operators/send_barrier_op.cc b/paddle/fluid/operators/send_barrier_op.cc index 511ad75387..c6c975a23c 100644 --- a/paddle/fluid/operators/send_barrier_op.cc +++ b/paddle/fluid/operators/send_barrier_op.cc @@ -19,8 +19,8 @@ limitations under the License. */ #include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/detail/macros.h" -#include "paddle/fluid/operators/detail/grpc_client.h" #include "paddle/fluid/platform/profiler.h" namespace paddle { @@ -45,7 +45,7 @@ class SendBarrierOp : public framework::OperatorBase { platform::RecordEvent record_event(Type(), &ctx); detail::RPCClient* rpc_client = - detail::RPCClient::GetInstance(); + detail::RPCClient::GetInstance(); VLOG(3) << "SendBarrierOp sync_mode:" << sync_mode; diff --git a/paddle/fluid/operators/send_op.cc b/paddle/fluid/operators/send_op.cc index 2a2fe53c71..84ec366253 100644 --- a/paddle/fluid/operators/send_op.cc +++ b/paddle/fluid/operators/send_op.cc @@ -18,7 +18,7 @@ limitations under the License. */ #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/detail/grpc_client.h" +#include "paddle/fluid/operators/detail/macros.h" #include "paddle/fluid/operators/send_recv_util.h" #include "paddle/fluid/platform/profiler.h" @@ -46,7 +46,7 @@ class SendOp : public framework::OperatorBase { platform::RecordEvent record_event(Type(), &ctx); detail::RPCClient* rpc_client = - detail::RPCClient::GetInstance(); + detail::RPCClient::GetInstance(); for (size_t i = 0; i < ins.size(); i++) { if (NeedSend(scope, ins[i])) { diff --git a/paddle/fluid/operators/test_send_nccl_id.cc b/paddle/fluid/operators/test_send_nccl_id.cc index a01a75cbb0..5015b10055 100644 --- a/paddle/fluid/operators/test_send_nccl_id.cc +++ b/paddle/fluid/operators/test_send_nccl_id.cc @@ -20,8 +20,7 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/detail/grpc_client.h" -#include "paddle/fluid/operators/detail/grpc_server.h" +#include "paddle/fluid/operators/detail/macros.h" #include "paddle/fluid/operators/detail/request_handler_impl.h" #include "paddle/fluid/operators/listen_and_serv_op.h" #include "paddle/fluid/operators/math/math_function.h" @@ -29,6 +28,10 @@ limitations under the License. */ #include "paddle/fluid/platform/nccl_helper.h" #include "paddle/fluid/string/printf.h" +#ifdef PADDLE_WITH_GRPC +#include "paddle/fluid/operators/send_recv_util.h" +#endif + USE_NO_KERNEL_OP(listen_and_serv); namespace f = paddle::framework; @@ -37,7 +40,7 @@ namespace m = paddle::operators::math; namespace detail = paddle::operators::detail; namespace string = paddle::string; -std::unique_ptr g_rpc_service; +std::unique_ptr g_rpc_service; std::unique_ptr g_req_handler; void StartServer() { @@ -58,7 +61,7 @@ void StartServer() { g_req_handler->SetRPCServer(g_rpc_service.get()); std::thread server_thread( - std::bind(&detail::AsyncGRPCServer::StartServer, g_rpc_service.get())); + std::bind(&detail::RPCServer::StartServer, g_rpc_service.get())); g_rpc_service->SetCond(detail::kRequestSend); g_rpc_service->WaitBarrier(detail::kRequestSend); @@ -68,9 +71,9 @@ void StartServer() { server_thread.join(); } -TEST(SendNcclId, GrpcServer) { +TEST(SendNcclId, RPCServer) { g_req_handler.reset(new detail::RequestSendHandler(true)); - g_rpc_service.reset(new detail::AsyncGRPCServer("127.0.0.1:0", 1)); + g_rpc_service.reset(new RPCSERVER_T("127.0.0.1:0", 1)); std::thread server_thread(StartServer); g_rpc_service->WaitServerReady(); @@ -87,8 +90,9 @@ TEST(SendNcclId, GrpcServer) { int port = g_rpc_service->GetSelectedPort(); std::string ep = string::Sprintf("127.0.0.1:%d", port); - detail::RPCClient* client = - detail::RPCClient::GetInstance(); + + detail::RPCClient* client = detail::RPCClient::GetInstance(); + LOG(INFO) << "connect to server" << ep; client->AsyncSendVar(ep, dev_ctx, scope, NCCL_ID_VARNAME); client->Wait();