From 92a6c7a04906e7d26196ac795eccace84156d42d Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Wed, 16 Jan 2019 10:08:14 +0800 Subject: [PATCH 01/98] init async ssa executor --- .../details/async_ssa_graph_executor.cc | 99 +++++++++++++++++++ .../details/async_ssa_graph_executor.h | 51 ++++++++++ 2 files changed, 150 insertions(+) create mode 100644 paddle/fluid/framework/details/async_ssa_graph_executor.cc create mode 100644 paddle/fluid/framework/details/async_ssa_graph_executor.h diff --git a/paddle/fluid/framework/details/async_ssa_graph_executor.cc b/paddle/fluid/framework/details/async_ssa_graph_executor.cc new file mode 100644 index 0000000000..9b26fdd545 --- /dev/null +++ b/paddle/fluid/framework/details/async_ssa_graph_executor.cc @@ -0,0 +1,99 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/details/async_ssa_graph_executor.h" + +namespace paddle { +namespace framework { +namespace details { + +AsyncSSAGraphExecutor::AsyncSSAGraphExecutor( + const ExecutionStrategy &strategy, const std::vector &local_scopes, + const std::vector &places, + std::vector> &&graphs) + : strategy_(std::move(strategy)), + local_scopes_(std::move(local_scopes)), + pool_(places.size() >= 2 ? new ::ThreadPool(places.size()) : nullptr), + places_(std::move(places)), + graphs_(std::move(graphs)) { + PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size()); + + // set the correct size of thread pool to each device. + strategy_.num_threads_ = strategy_.num_threads_ < places_.size() + ? 1UL + : strategy_.num_threads_ / places_.size(); + VLOG(1) << "set num_threads: " << strategy_.num_threads_ + << " to run the operators of the graph on each device."; + for (size_t i = 0; i < places.size(); ++i) { + executors_.emplace_back(new details::ThreadedSSAGraphExecutor( + strategy_, {local_scopes_[i]}, {places_[i]}, std::move(graphs_[i]))); + } +} + +FeedFetchList AsyncSSAGraphExecutor::Run( + const std::vector &fetch_tensors) { + std::vector> run_futures; + + std::vector fetch_data; + FeedFetchList ret; + + fetch_data.reserve(places_.size()); + ret.reserve(fetch_tensors.size()); + exception_holder_.Clear(); + + for (size_t i = 0; i < places_.size(); ++i) { + auto call = [this, i, &fetch_tensors]() -> FeedFetchList { + try { + return executors_[i]->Run(fetch_tensors); + } catch (...) 
{ + exception_holder_.Catch(std::current_exception()); + } + return FeedFetchList(); + }; + + if (pool_) { + run_futures.emplace_back(pool_->enqueue(std::move(call))); + } else { + fetch_data.emplace_back(std::move(call())); + } + } + + if (pool_) { + for (auto &f : run_futures) { + if (exception_holder_.IsCaught()) { + f.wait(); + } else { + fetch_data.emplace_back(std::move(f.get())); + } + } + } + if (exception_holder_.IsCaught()) { + exception_holder_.ReThrow(); + } + + for (size_t fetch_idx = 0; fetch_idx < fetch_tensors.size(); ++fetch_idx) { + std::vector lodtensor_ptrs; + lodtensor_ptrs.reserve(local_scopes_.size()); + for (size_t scope_idx = 0; scope_idx < local_scopes_.size(); ++scope_idx) { + lodtensor_ptrs.push_back(&fetch_data.at(scope_idx).at(fetch_idx)); + } + ret.emplace_back(); + ret.back().MergeLoDTensor(lodtensor_ptrs, platform::CPUPlace()); + } + return ret; +} + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/async_ssa_graph_executor.h b/paddle/fluid/framework/details/async_ssa_graph_executor.h new file mode 100644 index 0000000000..4091c56d74 --- /dev/null +++ b/paddle/fluid/framework/details/async_ssa_graph_executor.h @@ -0,0 +1,51 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include + +#include "ThreadPool.h" +#include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h" + +namespace paddle { +namespace framework { +namespace details { + +class AsyncSSAGraphExecutor : public SSAGraphExecutor { + public: + AsyncSSAGraphExecutor(const ExecutionStrategy &strategy, + const std::vector &local_scopes, + const std::vector &places, + std::vector> &&graphs); + ~AsyncSSAGraphExecutor() final = default; + const ir::Graph &Graph() const override { return *graphs_[0]; } + + FeedFetchList Run(const std::vector &fetch_tensors) override; + + private: + ExecutionStrategy strategy_; + std::vector local_scopes_; + std::unique_ptr<::ThreadPool> pool_{nullptr}; + std::vector places_; + std::vector> graphs_; + + std::vector> executors_; + ExceptionHolder exception_holder_; +}; + +} // namespace details +} // namespace framework +} // namespace paddle From afda84012643353fbf9849fb5f26bbcd0c45bcea Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Wed, 16 Jan 2019 10:32:56 +0800 Subject: [PATCH 02/98] init communicator --- paddle/fluid/framework/communicator.h | 45 +++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) create mode 100644 paddle/fluid/framework/communicator.h diff --git a/paddle/fluid/framework/communicator.h b/paddle/fluid/framework/communicator.h new file mode 100644 index 0000000000..e459729f5c --- /dev/null +++ b/paddle/fluid/framework/communicator.h @@ -0,0 +1,45 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include +#include +#include "paddle/fluid/framework/data_layout.h" +#include "paddle/fluid/framework/ddim.h" +#include "paddle/fluid/framework/framework.pb.h" +#include "paddle/fluid/memory/memory.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/place.h" + +namespace paddle { + +namespace framework { + +class Communicator { + public: + Communicator() {} + ~Communicator() {} + + private: +}; + +} // namespace framework +} // namespace paddle + +#include "paddle/fluid/framework/tensor_impl.h" From ea66979684c53743b9eb749106e0400542ec83da Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Thu, 17 Jan 2019 13:28:15 +0800 Subject: [PATCH 03/98] can run --- paddle/fluid/framework/CMakeLists.txt | 2 +- paddle/fluid/framework/details/CMakeLists.txt | 2 + .../details/async_ssa_graph_executor.cc | 1 + .../fluid/framework/details/build_strategy.cc | 5 +- .../fluid/framework/details/build_strategy.h | 1 + .../details/multi_devices_graph_pass.cc | 2 + .../details/multi_devices_graph_pass.h | 16 ++++++- paddle/fluid/framework/parallel_executor.cc | 46 +++++++++++++++---- paddle/fluid/pybind/pybind.cc | 3 ++ 9 files changed, 65 insertions(+), 13 deletions(-) diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index a167511160..e22c7f8a40 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -184,7 +184,7 @@ endif() target_link_libraries(executor garbage_collector) cc_library(parallel_executor SRCS parallel_executor.cc DEPS - threaded_ssa_graph_executor scope_buffered_ssa_graph_executor parallel_ssa_graph_executor + threaded_ssa_graph_executor scope_buffered_ssa_graph_executor parallel_ssa_graph_executor async_ssa_graph_executor graph build_strategy fast_threaded_ssa_graph_executor variable_helper) diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index c1ba6606f1..01c24b0d82 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -79,6 +79,8 @@ cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS cc_library(parallel_ssa_graph_executor SRCS parallel_ssa_graph_executor.cc DEPS threaded_ssa_graph_executor) +cc_library(async_ssa_graph_executor SRCS async_ssa_graph_executor.cc DEPS threaded_ssa_graph_executor) + cc_test(broadcast_op_test SRCS broadcast_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory device_context broadcast_op_handle) cc_test(gather_op_test SRCS gather_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory diff --git a/paddle/fluid/framework/details/async_ssa_graph_executor.cc b/paddle/fluid/framework/details/async_ssa_graph_executor.cc index 9b26fdd545..d3e4573e22 100644 --- a/paddle/fluid/framework/details/async_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/async_ssa_graph_executor.cc @@ -27,6 +27,7 @@ AsyncSSAGraphExecutor::AsyncSSAGraphExecutor( pool_(places.size() >= 2 ? 
new ::ThreadPool(places.size()) : nullptr), places_(std::move(places)), graphs_(std::move(graphs)) { + VLOG(3) << "build AsyncSSAGraphExecutor"; PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size()); // set the correct size of thread pool to each device. diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc index df0ff772c9..f8911cd9ad 100644 --- a/paddle/fluid/framework/details/build_strategy.cc +++ b/paddle/fluid/framework/details/build_strategy.cc @@ -116,7 +116,10 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { // Convert graph to run on multi-devices. void AppendMultiDevPass(const BuildStrategy &strategy) { ir::Pass *multi_devices_pass; - if (strategy_.is_distribution_) { + + if (strategy_.async_mode_) { + multi_devices_pass = AppendPass("async_multi_devices_pass").get(); + } else if (strategy_.is_distribution_) { multi_devices_pass = AppendPass("dist_multi_devices_pass").get(); } else { if (strategy.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce) { diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h index 15c2e01b61..1632483965 100644 --- a/paddle/fluid/framework/details/build_strategy.h +++ b/paddle/fluid/framework/details/build_strategy.h @@ -86,6 +86,7 @@ struct BuildStrategy { // num_trainers is 1, so the current fields of build_strategy doesn't tell if // it's distributed model. bool is_distribution_{false}; + bool async_mode_{false}; int num_trainers_{1}; int trainer_id_{0}; std::vector trainers_endpoints_; diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc index 75f922d2cc..d7a4b5692b 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc @@ -975,3 +975,5 @@ REGISTER_MULTI_DEVICES_PASS( paddle::framework::details::AllReduceSSAGraphBuilder); REGISTER_MULTI_DEVICES_PASS(dist_multi_devices_pass, paddle::framework::details::DistSSAGraphBuilder); +REGISTER_MULTI_DEVICES_PASS(async_multi_devices_pass, + paddle::framework::details::AsyncSSAGraphBuilder); diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.h b/paddle/fluid/framework/details/multi_devices_graph_pass.h index 6d4386538e..e91397816c 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.h +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.h @@ -55,7 +55,7 @@ class MultiDevSSAGraphBuilderBase : public ir::Pass { bool UseGPU() const; - bool NeedCollectiveOps() const; + virtual bool NeedCollectiveOps() const; bool IsScaleLossOp(ir::Node *node) const; @@ -116,6 +116,20 @@ class AllReduceSSAGraphBuilder : public MultiDevSSAGraphBuilderBase { virtual void InsertPostprocessOps(ir::Graph *result) const {} }; +class AsyncSSAGraphBuilder : public MultiDevSSAGraphBuilderBase { + protected: + virtual void InsertCollectiveOp(ir::Graph *result, const std::string &p_name, + const std::string &g_name) const {} + + bool NeedCollectiveOps() const override { return false; } + + virtual bool DealWithSpecialOp(ir::Graph *result, ir::Node *node) const { + return false; + } + + virtual void InsertPostprocessOps(ir::Graph *result) const {} +}; + class BalanceVarSSAGraphBuilder : public MultiDevSSAGraphBuilderBase { protected: int GetVarDeviceID(const std::string &varname) const; diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 
f61c9e3a91..4173b39e10 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -21,6 +21,7 @@ limitations under the License. */ #include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/details/async_ssa_graph_executor.h" #include "paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h" #include "paddle/fluid/framework/details/multi_devices_helper.h" #include "paddle/fluid/framework/details/parallel_ssa_graph_executor.h" @@ -282,10 +283,19 @@ ParallelExecutor::ParallelExecutor( graphs.push_back(std::move(graph)); } #else - std::unique_ptr graph = build_strategy.Apply( - main_program, member_->places_, loss_var_name, member_->local_scopes_, - member_->nranks_, member_->use_cuda_); - graphs.push_back(std::move(graph)); + if (build_strategy.async_mode_) { + for (size_t i = 0; i < member_->places_.size(); ++i) { + std::unique_ptr graph = build_strategy.Apply( + main_program, {member_->places_[i]}, loss_var_name, + {member_->local_scopes_[i]}, member_->nranks_, member_->use_cuda_); + graphs.push_back(std::move(graph)); + } + } else { + std::unique_ptr graph = build_strategy.Apply( + main_program, member_->places_, loss_var_name, member_->local_scopes_, + member_->nranks_, member_->use_cuda_); + graphs.push_back(std::move(graph)); + } #endif auto max_memory_size = GetEagerDeletionThreshold(); if (max_memory_size >= 0) { @@ -323,23 +333,31 @@ ParallelExecutor::ParallelExecutor( "please don't pass loss_var_name."; } } - - if (build_strategy.enable_parallel_graph_) { + if (build_strategy.async_mode_) { + VLOG(3) << "use AsyncSSAGraphExecutor"; + member_->executor_.reset(new details::AsyncSSAGraphExecutor( + exec_strategy, member_->local_scopes_, member_->places_, + std::move(graphs))); + } else if (build_strategy.enable_parallel_graph_) { + VLOG(3) << "use ParallelSSAGraphExecutor"; member_->executor_.reset(new details::ParallelSSAGraphExecutor( exec_strategy, member_->local_scopes_, member_->places_, std::move(graphs))); } else { if (exec_strategy.type_ == ExecutionStrategy::kDefault) { + VLOG(3) << "use ThreadedSSAGraphExecutor"; member_->executor_.reset(new details::ThreadedSSAGraphExecutor( exec_strategy, member_->local_scopes_, member_->places_, std::move(graphs[0]))); } else { + VLOG(3) << "use FastThreadedSSAGraphExecutor"; member_->executor_.reset(new details::FastThreadedSSAGraphExecutor( exec_strategy, member_->local_scopes_, member_->places_, std::move(graphs[0]))); } } + VLOG(3) << "use ScopeBufferedSSAGraphExecutor"; member_->executor_.reset(new details::ScopeBufferedSSAGraphExecutor( exec_strategy, member_->local_scopes_, std::move(var_infos), member_->places_, std::move(member_->executor_))); @@ -401,14 +419,22 @@ void ParallelExecutor::BCastParamsToDevices( auto local_scope = member_->local_scopes_[i]; auto *t = local_scope->Var(var)->GetMutable(); - // FIXME(zcd): LR_DECAY_COUNTER should not be shared. This is a hot fix. - if (member_->use_all_reduce_ || member_->use_cuda_ || - var == "@LR_DECAY_COUNTER@") { + auto share_memory = [&] { t->Resize(dims); t->mutable_data(cpu, main_tensor.type()); paddle::framework::TensorCopy(main_tensor, cpu, t); + }; + + auto copy_memory = [&] { t->ShareDataWith(main_tensor); }; + + // FIXME(zcd): LR_DECAY_COUNTER should not be shared. This is a hot fix. 
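      // Note: in this revision the lambda that performs the TensorCopy is
      // named share_memory and the lambda that calls ShareDataWith is named
      // copy_memory; PATCH 09/98 later in this series swaps the two
      // definitions so that the names match the behaviour.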
+ if (member_->build_strategy_.async_mode_) { + share_memory(); + } else if (member_->use_all_reduce_ || member_->use_cuda_ || + var == "@LR_DECAY_COUNTER@") { + copy_memory(); } else { - t->ShareDataWith(main_tensor); + share_memory(); } } } diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index f3f4854a9e..88d12c69b7 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -1030,6 +1030,9 @@ All parameter, weight, gradient are variables in Paddle. "is_distribution", [](const BuildStrategy &self) { return self.is_distribution_; }, [](BuildStrategy &self, bool b) { self.is_distribution_ = b; }) + .def_property("async_mode", + [](const BuildStrategy &self) { return self.async_mode_; }, + [](BuildStrategy &self, bool b) { self.async_mode_ = b; }) .def_property( "memory_early_delete", [](const BuildStrategy &self) { return self.memory_early_delete_; }, From 88d71fa2f9655c206d398088effe3cb1a43dafc4 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Thu, 17 Jan 2019 17:30:27 +0800 Subject: [PATCH 04/98] support num_iteration_per_run --- .../framework/details/async_ssa_graph_executor.cc | 3 +++ paddle/fluid/framework/details/execution_strategy.h | 2 ++ paddle/fluid/pybind/pybind.cc | 11 +++++++++++ 3 files changed, 16 insertions(+) diff --git a/paddle/fluid/framework/details/async_ssa_graph_executor.cc b/paddle/fluid/framework/details/async_ssa_graph_executor.cc index d3e4573e22..ba2e90d052 100644 --- a/paddle/fluid/framework/details/async_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/async_ssa_graph_executor.cc @@ -56,6 +56,9 @@ FeedFetchList AsyncSSAGraphExecutor::Run( for (size_t i = 0; i < places_.size(); ++i) { auto call = [this, i, &fetch_tensors]() -> FeedFetchList { try { + for (size_t j = 0; j < strategy_.num_iteration_per_run_ - 1; ++j) { + executors_[i]->Run(fetch_tensors); + } return executors_[i]->Run(fetch_tensors); } catch (...) { exception_holder_.Catch(std::current_exception()); diff --git a/paddle/fluid/framework/details/execution_strategy.h b/paddle/fluid/framework/details/execution_strategy.h index 37b07e5736..dec4589cad 100644 --- a/paddle/fluid/framework/details/execution_strategy.h +++ b/paddle/fluid/framework/details/execution_strategy.h @@ -28,6 +28,8 @@ struct ExecutionStrategy { size_t num_iteration_per_drop_scope_{1}; ExecutorType type_{kDefault}; bool dry_run_{false}; + size_t num_iteration_per_run_{1}; // only use with async_ssa_graph_executor + // and pyreader with data queue }; } // namespace details diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 88d12c69b7..b52f99f324 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -892,6 +892,17 @@ All parameter, weight, gradient are variables in Paddle. 2. In some NLP model, it may cause the GPU memory is insufficient, in this case, you should reduce `num_iteration_per_drop_scope`. 
)DOC") + .def_property( + "num_iteration_per_run", + [](const ExecutionStrategy &self) { + return self.num_iteration_per_run_; + }, + [](ExecutionStrategy &self, size_t num_iteration_per_run) { + self.num_iteration_per_run_ = num_iteration_per_run; + }, + R"DOC(This config that how many iteration the executor will run when + user call pe.run() in python + )DOC") .def_property("_dry_run", [](const ExecutionStrategy &self) { return self.dry_run_; }, [](ExecutionStrategy &self, bool dry_run) { From 69484f71e0c842633df77470c80dc26222f6fd3b Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Fri, 18 Jan 2019 12:25:30 +0800 Subject: [PATCH 05/98] remote communicator --- paddle/fluid/framework/communicator.h | 45 --------------------------- 1 file changed, 45 deletions(-) delete mode 100644 paddle/fluid/framework/communicator.h diff --git a/paddle/fluid/framework/communicator.h b/paddle/fluid/framework/communicator.h deleted file mode 100644 index e459729f5c..0000000000 --- a/paddle/fluid/framework/communicator.h +++ /dev/null @@ -1,45 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include -#include -#include -#include -#include "paddle/fluid/framework/data_layout.h" -#include "paddle/fluid/framework/ddim.h" -#include "paddle/fluid/framework/framework.pb.h" -#include "paddle/fluid/memory/memory.h" -#include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/place.h" - -namespace paddle { - -namespace framework { - -class Communicator { - public: - Communicator() {} - ~Communicator() {} - - private: -}; - -} // namespace framework -} // namespace paddle - -#include "paddle/fluid/framework/tensor_impl.h" From 7021979bc2a3c03ae8fa601b967539a4416ab325 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Fri, 18 Jan 2019 12:52:19 +0800 Subject: [PATCH 06/98] init communicator --- paddle/fluid/framework/communicator.h | 51 +++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) create mode 100644 paddle/fluid/framework/communicator.h diff --git a/paddle/fluid/framework/communicator.h b/paddle/fluid/framework/communicator.h new file mode 100644 index 0000000000..ba8fb3e173 --- /dev/null +++ b/paddle/fluid/framework/communicator.h @@ -0,0 +1,51 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include +#include +#include +#include +#include +#include "paddle/fluid/framework/data_layout.h" +#include "paddle/fluid/framework/ddim.h" +#include "paddle/fluid/framework/framework.pb.h" +#include "paddle/fluid/memory/memory.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/place.h" + +namespace paddle { + +namespace framework { + +class Communicator { + public: + Communicator() {} + ~Communicator() {} + + // send grad + void send() {} + + void receive() {} + + void wait() {} + + private: + std::unique_ptr communicate_thread_; +}; + +} // namespace framework +} // namespace paddle From 9958775b312e7a4802f574dfd4ea6162a773ed28 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Fri, 18 Jan 2019 14:52:15 +0800 Subject: [PATCH 07/98] add NewTmpScope to scope --- paddle/fluid/framework/scope.cc | 2 ++ paddle/fluid/framework/scope.h | 2 ++ .../operators/distributed/grpc/grpc_server.cc | 3 +++ .../operators/distributed/parameter_prefetch.cc | 16 ++++++++-------- .../operators/distributed/request_handler.h | 6 +++++- .../distributed/request_handler_impl.cc | 10 +++------- .../operators/distributed/variable_response.h | 12 ++++++++---- 7 files changed, 31 insertions(+), 20 deletions(-) diff --git a/paddle/fluid/framework/scope.cc b/paddle/fluid/framework/scope.cc index 9536185609..c774eaf4c8 100644 --- a/paddle/fluid/framework/scope.cc +++ b/paddle/fluid/framework/scope.cc @@ -81,6 +81,8 @@ Scope& Scope::NewScope() const { return *child; } +Scope* Scope::NewTmpScope() const { return new Scope(this); } + Variable* Scope::Var(const std::string& name) { SCOPE_VARS_WRITER_LOCK return VarInternal(name); diff --git a/paddle/fluid/framework/scope.h b/paddle/fluid/framework/scope.h index f0915d2eee..0e9b8edeb3 100644 --- a/paddle/fluid/framework/scope.h +++ b/paddle/fluid/framework/scope.h @@ -55,6 +55,8 @@ class Scope { /// Mark it to const because that new kid scope cannot change parent scope. Scope& NewScope() const; + Scope* NewTmpScope() const; + /// Create a variable with given name if it doesn't exist. /// Caller doesn't own the returned Variable. 
Variable* Var(const std::string& name); diff --git a/paddle/fluid/operators/distributed/grpc/grpc_server.cc b/paddle/fluid/operators/distributed/grpc/grpc_server.cc index 08f777e279..8bc8d5772f 100644 --- a/paddle/fluid/operators/distributed/grpc/grpc_server.cc +++ b/paddle/fluid/operators/distributed/grpc/grpc_server.cc @@ -107,6 +107,9 @@ class RequestSend final : public RequestBase { int trainer_id = request_->GetTrainerId(); framework::Variable* outvar = nullptr; + if (!request_handler_->sync_mode()) { + request_->ReleaseOwnershipOfLocalScope(); + } request_handler_->Handle(varname, scope, invar, &outvar, trainer_id); Finish(reply_, &responder_); } diff --git a/paddle/fluid/operators/distributed/parameter_prefetch.cc b/paddle/fluid/operators/distributed/parameter_prefetch.cc index c63d653488..9dfbc80870 100644 --- a/paddle/fluid/operators/distributed/parameter_prefetch.cc +++ b/paddle/fluid/operators/distributed/parameter_prefetch.cc @@ -180,7 +180,7 @@ void prefetch(const std::string& id_name, const std::string& out_name, const std::vector& height_sections, const framework::ExecutionContext& context, const framework::Scope& scope) { - auto& local_scope = scope.NewScope(); + framework::Scope* local_scope = scope.NewTmpScope(); platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); auto& cpu_ctx = *pool.Get(platform::CPUPlace()); @@ -224,22 +224,22 @@ void prefetch(const std::string& id_name, const std::string& out_name, #endif } - auto splited_ids = SplitIds(ids_vector, height_sections, &local_scope); + auto splited_ids = SplitIds(ids_vector, height_sections, local_scope); SplitIdsIntoMultipleVarsBySection(in_var_names, height_sections, splited_ids, - &local_scope); + local_scope); // create output var in local scope for (auto& name : out_var_names) { - local_scope.Var(name)->GetMutable(); + local_scope->Var(name)->GetMutable(); } std::vector rets; for (size_t i = 0; i < in_var_names.size(); i++) { - if (NeedSend(local_scope, in_var_names[i])) { + if (NeedSend(*local_scope, in_var_names[i])) { VLOG(3) << "sending " << in_var_names[i] << " to " << epmap[i] << " to get " << out_var_names[i] << " back"; rets.push_back(rpc_client->AsyncPrefetchVar( - epmap[i], cpu_ctx, local_scope, in_var_names[i], out_var_names[i], + epmap[i], cpu_ctx, *local_scope, in_var_names[i], out_var_names[i], table_names[i])); } else { VLOG(3) << "don't send no-initialied variable: " << out_var_names[i]; @@ -252,8 +252,8 @@ void prefetch(const std::string& id_name, const std::string& out_name, MergeMultipleVarsIntoOneBySection(id_name, ids_vector, out_name, out_var_names, height_sections, splited_ids, - context, &local_scope, &actual_ctx); - scope.DeleteScope(&local_scope); + context, local_scope, &actual_ctx); + delete local_scope; } }; // namespace distributed diff --git a/paddle/fluid/operators/distributed/request_handler.h b/paddle/fluid/operators/distributed/request_handler.h index 62b24f150b..f58c2bc380 100644 --- a/paddle/fluid/operators/distributed/request_handler.h +++ b/paddle/fluid/operators/distributed/request_handler.h @@ -58,13 +58,15 @@ class VarHandle { VarHandle(const std::string ep, const std::string& method, const std::string& name, const platform::DeviceContext* p_ctx = nullptr, - const framework::Scope* p_scope = nullptr) + const framework::Scope* p_scope = nullptr, + bool delete_local_scope = false) : status_(kDefaultState) { ep_ = ep; ctx_ = p_ctx; scope_ = p_scope; name_ = name; method_ = method; + delete_local_scope_ = delete_local_scope; } virtual ~VarHandle() {} @@ 
-86,6 +88,7 @@ class VarHandle { std::unique_lock lk(sync_mutex_); status_ = ok ? kFinishState : kErrorState; } + if (delete_local_scope_ && scope_) delete scope_; VLOG(7) << "VarHandle finish:" << ok; wait_cond_.notify_all(); } @@ -112,6 +115,7 @@ class VarHandle { std::string name_; // RPC method name. std::string method_; + bool delete_local_scope_; protected: std::mutex sync_mutex_; diff --git a/paddle/fluid/operators/distributed/request_handler_impl.cc b/paddle/fluid/operators/distributed/request_handler_impl.cc index 9722f8c96e..1625e55d5a 100644 --- a/paddle/fluid/operators/distributed/request_handler_impl.cc +++ b/paddle/fluid/operators/distributed/request_handler_impl.cc @@ -53,13 +53,9 @@ bool RequestSendHandler::Handle(const std::string& varname, // Async if (!sync_mode_) { VLOG(3) << "async process var: " << varname; - try { - executor_->RunPreparedContext((*grad_to_prepared_ctx_)[varname].get(), - scope); - } catch (std::exception& e) { - LOG(ERROR) << "async: run sub program error " << e.what(); - return false; - } + executor_->RunPreparedContext((*grad_to_prepared_ctx_)[varname].get(), + scope); + delete scope; return true; } else { // sync rpc_server_->WaitCond(kRequestSend); diff --git a/paddle/fluid/operators/distributed/variable_response.h b/paddle/fluid/operators/distributed/variable_response.h index 294cae5f44..3ecb696069 100644 --- a/paddle/fluid/operators/distributed/variable_response.h +++ b/paddle/fluid/operators/distributed/variable_response.h @@ -60,14 +60,12 @@ class VariableResponse { bool create_scope = false) : scope_(scope), dev_ctx_(dev_ctx), create_scope_(create_scope) { if (create_scope) { - local_scope_ = &scope->NewScope(); + local_scope_ = scope->NewTmpScope(); } } virtual ~VariableResponse() { - if (create_scope_) { - scope_->DeleteScope(local_scope_); - } + if (local_scope_) delete local_scope_; } int Parse(Source* source, const sendrecv::VariableMessage& meta) { @@ -86,6 +84,12 @@ class VariableResponse { inline std::string Varname() const { return meta_.varname(); } inline std::string OutVarname() const { return meta_.out_varname(); } inline std::string TableName() const { return meta_.table_name(); } + inline void ReleaseOwnershipOfLocalScope() { + PADDLE_ENFORCE(create_scope_, + "only when create_scope_ is true can you release the " + "ownership of local scope"); + local_scope_ = nullptr; + } // should call parse first. 
framework::Variable* GetVar() { From b5aefc8b6d4c2aa2d28fbb1546d64ac52a754a26 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Fri, 18 Jan 2019 15:07:55 +0800 Subject: [PATCH 08/98] fix compile problem --- paddle/fluid/operators/distributed/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/operators/distributed/CMakeLists.txt b/paddle/fluid/operators/distributed/CMakeLists.txt index 1249ef9a9b..ed819ac9f0 100644 --- a/paddle/fluid/operators/distributed/CMakeLists.txt +++ b/paddle/fluid/operators/distributed/CMakeLists.txt @@ -50,7 +50,7 @@ endif() cc_test(rpc_server_test SRCS rpc_server_test.cc DEPS ${RPC_DEPS} executor proto_desc lookup_sparse_table_op SERIAL) -cc_test(varhandle_test SRCS varhandle_test.cc DEPS profiler) +cc_test(varhandle_test SRCS varhandle_test.cc DEPS profiler scope) cc_library(parameter_prefetch SRCS parameter_prefetch.cc DEPS sendrecvop_rpc memory) if(WITH_GPU) cc_test(collective_server_test SRCS collective_server_test.cc From f3210b60ba3a5f23cfed95148c44e5d5db298f35 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Fri, 18 Jan 2019 15:49:32 +0800 Subject: [PATCH 09/98] fix copy_memory and share_memory --- paddle/fluid/framework/parallel_executor.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 4173b39e10..3997294f17 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -419,13 +419,13 @@ void ParallelExecutor::BCastParamsToDevices( auto local_scope = member_->local_scopes_[i]; auto *t = local_scope->Var(var)->GetMutable(); - auto share_memory = [&] { + auto copy_memory = [&] { t->Resize(dims); t->mutable_data(cpu, main_tensor.type()); paddle::framework::TensorCopy(main_tensor, cpu, t); }; - auto copy_memory = [&] { t->ShareDataWith(main_tensor); }; + auto share_memory = [&] { t->ShareDataWith(main_tensor); }; // FIXME(zcd): LR_DECAY_COUNTER should not be shared. This is a hot fix. 
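      // Note: with this swap the names match the behaviour again:
      // copy_memory materialises an independent per-scope copy via TensorCopy,
      // while share_memory aliases the main tensor through ShareDataWith.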
if (member_->build_strategy_.async_mode_) { From ca5d96bb3d376be0ade29db4f58700ba2c81b88a Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Thu, 24 Jan 2019 16:36:48 +0800 Subject: [PATCH 10/98] complete send lod tensor --- paddle/fluid/framework/communicator.h | 2 + .../operators/distributed/CMakeLists.txt | 3 +- .../operators/distributed/parameter_send.cc | 189 ++++++++++++++++++ .../operators/distributed/parameter_send.h | 35 ++++ .../operators/distributed_ops/send_op.cc | 15 ++ 5 files changed, 243 insertions(+), 1 deletion(-) create mode 100644 paddle/fluid/operators/distributed/parameter_send.cc create mode 100644 paddle/fluid/operators/distributed/parameter_send.h diff --git a/paddle/fluid/framework/communicator.h b/paddle/fluid/framework/communicator.h index ba8fb3e173..0e90ba02e6 100644 --- a/paddle/fluid/framework/communicator.h +++ b/paddle/fluid/framework/communicator.h @@ -41,6 +41,8 @@ class Communicator { void receive() {} + void prefetch() {} + void wait() {} private: diff --git a/paddle/fluid/operators/distributed/CMakeLists.txt b/paddle/fluid/operators/distributed/CMakeLists.txt index cb361e95e8..fa8abf4cec 100644 --- a/paddle/fluid/operators/distributed/CMakeLists.txt +++ b/paddle/fluid/operators/distributed/CMakeLists.txt @@ -30,7 +30,7 @@ if(WITH_GRPC) else() set(BRPC_SRCS brpc/brpc_client.cc brpc/brpc_server.cc brpc/brpc_sendrecvop_utils.cc brpc/brpc_variable_response.cc brpc/brpc_rdma_pool.cc) - set_source_files_properties(${BRPC_SRCS} parameter_prefetch.cc rpc_server_test.cc brpc/brpc_serde_test.cc collective_server.cc collective_server_test.cc collective_client.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + set_source_files_properties(${BRPC_SRCS} parameter_prefetch.cc parameter_send.cc rpc_server_test.cc brpc/brpc_serde_test.cc collective_server.cc collective_server_test.cc collective_client.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) brpc_library(sendrecvop_rpc SRCS sendrecvop_utils.cc request_handler_impl.cc rpc_client.cc rpc_server.cc @@ -50,6 +50,7 @@ cc_test(rpc_server_test SRCS rpc_server_test.cc DEPS ${RPC_DEPS} executor proto_desc lookup_sparse_table_op SERIAL) cc_test(varhandle_test SRCS varhandle_test.cc DEPS profiler scope) cc_library(parameter_prefetch SRCS parameter_prefetch.cc DEPS sendrecvop_rpc memory) +cc_library(parameter_send SRCS parameter_send.cc DEPS sendrecvop_rpc memory) if(WITH_GPU) cc_test(collective_server_test SRCS collective_server_test.cc DEPS sendrecvop_rpc executor ${RPC_DEPS} diff --git a/paddle/fluid/operators/distributed/parameter_send.cc b/paddle/fluid/operators/distributed/parameter_send.cc new file mode 100644 index 0000000000..01e7341f15 --- /dev/null +++ b/paddle/fluid/operators/distributed/parameter_send.cc @@ -0,0 +1,189 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
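A minimal, self-contained sketch (not part of the patch) of the section arithmetic that the SplitIds helper in this new file relies on: per-pserver heights are first accumulated into absolute row offsets, and each row id is then mapped to a section index plus a local offset. The helper names mirror the ones defined below; the int64_t types and the main() driver are illustrative assumptions.

#include <cstdint>
#include <iostream>
#include <vector>

// Turn per-section heights, e.g. {3, 5}, into absolute row offsets {0, 3}.
std::vector<int64_t> ToAbsoluteSection(const std::vector<int64_t> &heights) {
  std::vector<int64_t> abs_sections(heights.size(), 0);
  for (size_t i = 1; i < heights.size(); ++i) {
    abs_sections[i] = abs_sections[i - 1] + heights[i - 1];
  }
  return abs_sections;
}

// Map a global row id to the section (i.e. the pserver shard) that owns it.
size_t GetSectionIndex(int64_t id, const std::vector<int64_t> &abs_sections) {
  for (size_t i = 1; i < abs_sections.size(); ++i) {
    if (id < abs_sections[i]) return i - 1;
  }
  return abs_sections.size() - 1;
}

int main() {
  const std::vector<int64_t> heights = {3, 5};
  const auto abs_sections = ToAbsoluteSection(heights);
  for (int64_t id : std::vector<int64_t>{0, 2, 3, 4, 7}) {
    const size_t section = GetSectionIndex(id, abs_sections);
    // e.g. id 4 -> section 1, local row 1
    std::cout << "id " << id << " -> section " << section << ", local row "
              << id - abs_sections[section] << std::endl;
  }
  return 0;
}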
+ +#include +#include +#include + +#include "paddle/fluid/operators/distributed/parameter_send.h" + +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/tensor.h" + +#include "paddle/fluid/operators/distributed/distributed.h" +#include "paddle/fluid/operators/distributed/rpc_client.h" +#include "paddle/fluid/operators/distributed/variable_response.h" +#include "paddle/fluid/operators/distributed_ops/send_recv_util.h" + +namespace paddle { +namespace operators { +namespace distributed { + +using LoDTensor = framework::LoDTensor; +using LoDTensor = framework::LoDTensor; +using SelectedRows = framework::SelectedRows; +using DDim = framework::DDim; + +static size_t GetSectionIndex(int64_t id, + const std::vector& abs_sections) { + for (size_t i = 1; i < abs_sections.size(); ++i) { + if (id < abs_sections[i]) { + return i - 1; + } + } + return abs_sections.size() - 1; +} + +static std::vector ToAbsoluteSection( + const std::vector& height_sections) { + std::vector abs_sections; + abs_sections.resize(height_sections.size()); + abs_sections[0] = 0; + for (size_t i = 1; i < height_sections.size(); ++i) { + abs_sections[i] = height_sections[i - 1] + abs_sections[i - 1]; + } + return abs_sections; +} + +static std::vector> SplitIds( + const std::vector& ids_vector, + const std::vector& height_section, framework::Scope* scope) { + std::set all_ids; + for (auto id : ids_vector) { + all_ids.insert(id); + } + + auto abs_sections = ToAbsoluteSection(height_section); + std::vector> splited_ids; + splited_ids.resize(height_section.size() + 1); + for (auto& id : all_ids) { + auto section_index = GetSectionIndex(id, abs_sections); + splited_ids[section_index].push_back(id - abs_sections[section_index]); + } + return splited_ids; +} + +static void SplitIdsIntoMultipleVarsBySection( + const std::vector& in_var_names, + const std::vector& height_section, + const std::vector>& splited_ids, + framework::Scope* scope) { + PADDLE_ENFORCE_EQ(in_var_names.size(), height_section.size(), ""); + + auto place = platform::CPUPlace(); + + for (size_t i = 0; i < in_var_names.size(); ++i) { + auto* id_tensor = + scope->Var(in_var_names[i])->GetMutable(); + auto& ids = splited_ids[i]; + if (!ids.empty()) { + auto* id_tensor_data = id_tensor->mutable_data( + framework::make_ddim({static_cast(ids.size()), 1}), place); + memcpy(id_tensor_data, ids.data(), sizeof(int64_t) * ids.size()); + } + } +} + +void send(const std::string& var_name, + const std::vector& send_varnames, + const std::vector& epmap, + const std::vector& height_sections, + const framework::ExecutionContext& context, + const framework::Scope& scope, bool sync) { + framework::Scope* local_scope = scope.NewTmpScope(); + + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + auto& cpu_ctx = *pool.Get(platform::CPUPlace()); + auto& actual_ctx = *pool.Get(context.GetPlace()); + + distributed::RPCClient* rpc_client = + distributed::RPCClient::GetInstance( + context.Attr("trainer_id")); + + auto* send_var = scope.FindVar(var_name); + size_t out_num = send_varnames.size(); + if (send_var->IsType()) { + auto& send_tensor = send_var->Get(); + auto& send_tensor_dims = send_tensor.dims(); + std::vector outs_dims; + outs_dims.reserve(out_num); + + // infer output shape + int num = context.Attr("num"); + if (num > 0) { + int64_t in_axis_dim = send_tensor_dims[0]; + PADDLE_ENFORCE_EQ(in_axis_dim % num, 0, + "tensor split does not 
result" + " in an equal division"); + size_t out_axis_dim = in_axis_dim / num; + for (size_t i = 0; i < out_num; ++i) { + auto dim = send_tensor_dims; + dim[0] = out_axis_dim; + outs_dims.push_back(dim); + } + } else if (height_sections.size() > 0) { + PADDLE_ENFORCE_EQ(height_sections.size(), out_num, + "tensor split sections size" + "should be equal to output size."); + for (size_t i = 0; i < out_num; ++i) { + auto dim = send_tensor_dims; + dim[0] = height_sections[i]; + outs_dims.push_back(dim); + } + } + + // create output var in local scope + size_t row_offset = 0; + for (auto i = 0; i < out_num; ++i) { + auto* out = + local_scope->Var(send_varnames[i])->GetMutable(); + *out = send_tensor.Slice(row_offset, row_offset + outs_dims[i][0]); + row_offset += outs_dims[i][0]; + } + } else if (send_var->IsType()) { + // create output var in local scope + for (auto& name : send_varnames) { + local_scope->Var(name)->GetMutable(); + } + } else { + PADDLE_THROW("unsupported var type"); + } + + std::vector rets; + for (size_t i = 0; i < send_varnames.size(); i++) { + auto& send_var_name = send_varnames[i]; + auto& endpoint = epmap[i]; + if (NeedSend(*local_scope, send_var_name)) { + VLOG(3) << "sending " << send_var_name << " to " << endpoint; + rets.push_back(rpc_client->AsyncSendVar(endpoint, cpu_ctx, *local_scope, + send_var_name)); + } else { + VLOG(3) << "don't send non-initialized variable: " << send_varnames[i]; + } + } + + if (sync) { + for (size_t i = 0; i < rets.size(); i++) { + PADDLE_ENFORCE(rets[i]->Wait(), "internal error in RPCClient"); + } + } + + delete local_scope; +} + +}; // namespace distributed +}; // namespace operators +}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/parameter_send.h b/paddle/fluid/operators/distributed/parameter_send.h new file mode 100644 index 0000000000..ee4da997b7 --- /dev/null +++ b/paddle/fluid/operators/distributed/parameter_send.h @@ -0,0 +1,35 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
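Before the header body, a hedged sketch (not part of the patch) of how a send-style operator kernel could forward its attributes to the send() helper declared below; the attribute names follow the send_op additions in this patch, while the function name and the exact attribute types are assumptions.

#include <string>
#include <vector>

#include "paddle/fluid/operators/distributed/parameter_send.h"

namespace paddle {
namespace operators {

// Illustrative only: slice the input variable by `sections`, ship each slice
// to the endpoint with the same index, and block until every RPC finishes.
static void SendParamSections(const framework::ExecutionContext &ctx) {
  const std::string var_name = ctx.Inputs("X").front();
  const auto send_varnames =
      ctx.Attr<std::vector<std::string>>("send_varnames");
  const auto epmap = ctx.Attr<std::vector<std::string>>("epmap");
  const auto sections = ctx.Attr<std::vector<int>>("sections");
  distributed::send(var_name, send_varnames, epmap, sections, ctx, ctx.scope(),
                    /*sync=*/true);
}

}  // namespace operators
}  // namespace paddle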
+ +#pragma once + +#include +#include + +#include "paddle/fluid/framework/operator.h" + +namespace paddle { +namespace operators { +namespace distributed { + +void send(const std::string& var_name, + const std::vector& send_varnames, + const std::vector& epmap, + const std::vector& height_sections, + const framework::ExecutionContext& context, + const framework::Scope& scope, bool sync); + +}; // namespace distributed +}; // namespace operators +}; // namespace paddle diff --git a/paddle/fluid/operators/distributed_ops/send_op.cc b/paddle/fluid/operators/distributed_ops/send_op.cc index e2c2147ab5..02397bb6b3 100644 --- a/paddle/fluid/operators/distributed_ops/send_op.cc +++ b/paddle/fluid/operators/distributed_ops/send_op.cc @@ -88,6 +88,21 @@ This operator will send variables to listen_and_serve op at the parameter server "Server endpoints in the order of input " "variables for mapping") .SetDefault({"127.0.0.1:6164"}); + AddAttr>("sections", + "(vector) " + "the length of each output along the " + "specified axis.") + .SetDefault(std::vector{}); + AddAttr>( + "send_varnames", + "(vector) " + "the splited output varnames to send to pserver") + .SetDefault(std::vector{}); + AddAttr("num", + "(int, default 0)" + "Number of sub-tensors. This must evenly divide " + "Input.dims()[axis]") + .SetDefault(0); } }; From 1866d2dbefbaa630eac57da6838b8423d1074dd8 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Thu, 24 Jan 2019 17:16:32 +0800 Subject: [PATCH 11/98] parameter send support selected_rows --- .../operators/distributed/parameter_send.cc | 84 +++++++++++++++++-- .../operators/distributed/parameter_send.h | 1 + 2 files changed, 77 insertions(+), 8 deletions(-) diff --git a/paddle/fluid/operators/distributed/parameter_send.cc b/paddle/fluid/operators/distributed/parameter_send.cc index 01e7341f15..d79ea8cdb9 100644 --- a/paddle/fluid/operators/distributed/parameter_send.cc +++ b/paddle/fluid/operators/distributed/parameter_send.cc @@ -47,6 +47,15 @@ static size_t GetSectionIndex(int64_t id, return abs_sections.size() - 1; } +static int FindOutIdx(int row, const std::vector& abs_sections) { + for (size_t i = 1; i < abs_sections.size(); ++i) { + if (row < abs_sections[i]) { + return i - 1; + } + } + return abs_sections.size() - 1; +} + static std::vector ToAbsoluteSection( const std::vector& height_sections) { std::vector abs_sections; @@ -97,21 +106,22 @@ static void SplitIdsIntoMultipleVarsBySection( } } +template void send(const std::string& var_name, const std::vector& send_varnames, const std::vector& epmap, const std::vector& height_sections, - const framework::ExecutionContext& context, - const framework::Scope& scope, bool sync) { + const framework::ExecutionContext& ctx, const framework::Scope& scope, + bool sync) { framework::Scope* local_scope = scope.NewTmpScope(); platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); auto& cpu_ctx = *pool.Get(platform::CPUPlace()); - auto& actual_ctx = *pool.Get(context.GetPlace()); + auto& actual_ctx = *pool.Get(ctx.GetPlace()); distributed::RPCClient* rpc_client = distributed::RPCClient::GetInstance( - context.Attr("trainer_id")); + ctx.Attr("trainer_id")); auto* send_var = scope.FindVar(var_name); size_t out_num = send_varnames.size(); @@ -122,7 +132,7 @@ void send(const std::string& var_name, outs_dims.reserve(out_num); // infer output shape - int num = context.Attr("num"); + int num = ctx.Attr("num"); if (num > 0) { int64_t in_axis_dim = send_tensor_dims[0]; PADDLE_ENFORCE_EQ(in_axis_dim % num, 0, @@ -153,13 +163,71 @@ 
void send(const std::string& var_name, *out = send_tensor.Slice(row_offset, row_offset + outs_dims[i][0]); row_offset += outs_dims[i][0]; } - } else if (send_var->IsType()) { + } else if (send_var->IsType()) { + auto& send_slr = send_var->Get(); + auto abs_sections = ToAbsoluteSection(height_sections); + + auto send_rows = send_slr.rows(); + std::vector> outs_rows_idx; + std::vector> outs_dense_idx; + + outs_rows_idx.resize(out_num); + outs_dense_idx.resize(out_num); + + auto row_numel = send_slr.value().numel() / send_slr.value().dims()[0]; + auto src = send_slr.value().data(); + // create output var in local scope + std::vector outs; for (auto& name : send_varnames) { - local_scope->Var(name)->GetMutable(); + auto* out = local_scope->Var(name)->GetMutable(); + outs.push_back(out); + } + + // split rows index into output sparse vars + for (size_t i = 0; i < send_rows.size(); ++i) { + int out_idx = FindOutIdx(send_rows[i], abs_sections); + outs_rows_idx[out_idx].push_back(send_rows[i]); + outs_dense_idx[out_idx].push_back(i); } + auto place = ctx.GetPlace(); + + for (size_t i = 0; i < outs_rows_idx.size(); ++i) { + auto rows_idx = outs_rows_idx[i]; + outs[i]->set_height(height_sections[i]); + auto dims = send_slr.GetCompleteDims(); + dims[0] = rows_idx.size(); + outs[i]->mutable_value()->mutable_data(dims, send_slr.place()); + outs[i]->mutable_rows()->clear(); + if (rows_idx.size() > 0) { + for (auto idx : rows_idx) { + outs[i]->mutable_rows()->push_back(idx - abs_sections[i]); + } + auto dst = outs[i]->mutable_value()->mutable_data(ctx.GetPlace()); + for (size_t j = 0; j < rows_idx.size(); j++) { + if (platform::is_cpu_place(place)) { + memory::Copy( + platform::CPUPlace(), dst + j * row_numel, platform::CPUPlace(), + src + outs_dense_idx[i][j] * row_numel, sizeof(T) * row_numel); + } else { +#ifdef PADDLE_WITH_CUDA + auto stream = ctx.cuda_device_context().stream(); + memory::Copy(platform::CUDAPlace(), dst + j * row_numel, + platform::CUDAPlace(), + src + outs_dense_idx[i][j] * row_numel, + sizeof(T) * row_numel, stream); +#else + PADDLE_THROW("Paddle is not compiled with GPU"); +#endif + } + } + } + PADDLE_ENFORCE_EQ(rows_idx.size(), outs[i]->rows().size(), + "rows should has the same size with tensor dim 0"); + } + } else { - PADDLE_THROW("unsupported var type"); + PADDLE_THROW("unsupported var type to send!"); } std::vector rets; diff --git a/paddle/fluid/operators/distributed/parameter_send.h b/paddle/fluid/operators/distributed/parameter_send.h index ee4da997b7..e337649cf2 100644 --- a/paddle/fluid/operators/distributed/parameter_send.h +++ b/paddle/fluid/operators/distributed/parameter_send.h @@ -23,6 +23,7 @@ namespace paddle { namespace operators { namespace distributed { +template void send(const std::string& var_name, const std::vector& send_varnames, const std::vector& epmap, From 74040cb4aad1c8390fcc080c32f0c12bee46a05b Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Thu, 24 Jan 2019 18:38:52 +0800 Subject: [PATCH 12/98] code clean --- .../distributed/parameter_prefetch.cc | 29 ++------ .../distributed/parameter_prefetch.h | 4 +- .../operators/distributed/parameter_send.cc | 71 +------------------ .../operators/distributed/parameter_send.h | 2 +- .../operators/distributed_ops/send_op.cc | 10 +-- .../distributed_ops/send_recv_util.h | 36 ++++++++++ .../operators/hierarchical_sigmoid_op.cc | 6 +- .../fluid/operators/hierarchical_sigmoid_op.h | 2 +- paddle/fluid/operators/lookup_table_op.cc | 6 +- paddle/fluid/operators/lookup_table_op.h | 3 +- 
paddle/fluid/operators/nce_op.cc | 6 +- paddle/fluid/operators/nce_op.h | 3 +- .../fluid/operators/split_selected_rows_op.h | 21 +----- 13 files changed, 64 insertions(+), 135 deletions(-) diff --git a/paddle/fluid/operators/distributed/parameter_prefetch.cc b/paddle/fluid/operators/distributed/parameter_prefetch.cc index 9dfbc80870..7434265929 100644 --- a/paddle/fluid/operators/distributed/parameter_prefetch.cc +++ b/paddle/fluid/operators/distributed/parameter_prefetch.cc @@ -37,30 +37,9 @@ using LoDTensor = framework::LoDTensor; using SelectedRows = framework::SelectedRows; using DDim = framework::DDim; -static size_t GetSectionIndex(int64_t id, - const std::vector& abs_sections) { - for (size_t i = 1; i < abs_sections.size(); ++i) { - if (id < abs_sections[i]) { - return i - 1; - } - } - return abs_sections.size() - 1; -} - -static std::vector ToAbsoluteSection( - const std::vector& height_sections) { - std::vector abs_sections; - abs_sections.resize(height_sections.size()); - abs_sections[0] = 0; - for (size_t i = 1; i < height_sections.size(); ++i) { - abs_sections[i] = height_sections[i - 1] + abs_sections[i - 1]; - } - return abs_sections; -} - static std::vector> SplitIds( const std::vector& ids_vector, - const std::vector& height_section, framework::Scope* scope) { + const std::vector& height_section, framework::Scope* scope) { std::set all_ids; for (auto id : ids_vector) { all_ids.insert(id); @@ -78,7 +57,7 @@ static std::vector> SplitIds( static void SplitIdsIntoMultipleVarsBySection( const std::vector& in_var_names, - const std::vector& height_section, + const std::vector& height_section, const std::vector>& splited_ids, framework::Scope* scope) { PADDLE_ENFORCE_EQ(in_var_names.size(), height_section.size(), ""); @@ -100,7 +79,7 @@ static void SplitIdsIntoMultipleVarsBySection( static void MergeMultipleVarsIntoOneBySection( const std::string& id_name, const std::vector& ids_vector, const std::string& out_name, const std::vector& out_var_names, - const std::vector& height_section, + const std::vector& height_section, const std::vector>& splited_ids, const framework::ExecutionContext& context, framework::Scope* scope, platform::DeviceContext* actual_ctx) { @@ -177,7 +156,7 @@ static void MergeMultipleVarsIntoOneBySection( void prefetch(const std::string& id_name, const std::string& out_name, const std::vector& table_names, const std::vector& epmap, - const std::vector& height_sections, + const std::vector& height_sections, const framework::ExecutionContext& context, const framework::Scope& scope) { framework::Scope* local_scope = scope.NewTmpScope(); diff --git a/paddle/fluid/operators/distributed/parameter_prefetch.h b/paddle/fluid/operators/distributed/parameter_prefetch.h index 2f850a0332..0429ec4415 100644 --- a/paddle/fluid/operators/distributed/parameter_prefetch.h +++ b/paddle/fluid/operators/distributed/parameter_prefetch.h @@ -26,7 +26,7 @@ namespace distributed { void prefetch(const std::string& id_name, const std::string& out_name, const std::vector& table_names, const std::vector& epmap, - const std::vector& height_sections, + const std::vector& height_sections, const framework::ExecutionContext& context, const framework::Scope& scope); @@ -35,7 +35,7 @@ void prefetch_with_reconstruct(const std::string& id_name, const std::string& out_name, const std::vector& table_names, const std::vector& epmap, - const std::vector& height_sections, + const std::vector& height_sections, const framework::ExecutionContext& context, const framework::Scope& scope, 
framework::LoDTensor* original) { diff --git a/paddle/fluid/operators/distributed/parameter_send.cc b/paddle/fluid/operators/distributed/parameter_send.cc index d79ea8cdb9..09fce06b5a 100644 --- a/paddle/fluid/operators/distributed/parameter_send.cc +++ b/paddle/fluid/operators/distributed/parameter_send.cc @@ -37,80 +37,11 @@ using LoDTensor = framework::LoDTensor; using SelectedRows = framework::SelectedRows; using DDim = framework::DDim; -static size_t GetSectionIndex(int64_t id, - const std::vector& abs_sections) { - for (size_t i = 1; i < abs_sections.size(); ++i) { - if (id < abs_sections[i]) { - return i - 1; - } - } - return abs_sections.size() - 1; -} - -static int FindOutIdx(int row, const std::vector& abs_sections) { - for (size_t i = 1; i < abs_sections.size(); ++i) { - if (row < abs_sections[i]) { - return i - 1; - } - } - return abs_sections.size() - 1; -} - -static std::vector ToAbsoluteSection( - const std::vector& height_sections) { - std::vector abs_sections; - abs_sections.resize(height_sections.size()); - abs_sections[0] = 0; - for (size_t i = 1; i < height_sections.size(); ++i) { - abs_sections[i] = height_sections[i - 1] + abs_sections[i - 1]; - } - return abs_sections; -} - -static std::vector> SplitIds( - const std::vector& ids_vector, - const std::vector& height_section, framework::Scope* scope) { - std::set all_ids; - for (auto id : ids_vector) { - all_ids.insert(id); - } - - auto abs_sections = ToAbsoluteSection(height_section); - std::vector> splited_ids; - splited_ids.resize(height_section.size() + 1); - for (auto& id : all_ids) { - auto section_index = GetSectionIndex(id, abs_sections); - splited_ids[section_index].push_back(id - abs_sections[section_index]); - } - return splited_ids; -} - -static void SplitIdsIntoMultipleVarsBySection( - const std::vector& in_var_names, - const std::vector& height_section, - const std::vector>& splited_ids, - framework::Scope* scope) { - PADDLE_ENFORCE_EQ(in_var_names.size(), height_section.size(), ""); - - auto place = platform::CPUPlace(); - - for (size_t i = 0; i < in_var_names.size(); ++i) { - auto* id_tensor = - scope->Var(in_var_names[i])->GetMutable(); - auto& ids = splited_ids[i]; - if (!ids.empty()) { - auto* id_tensor_data = id_tensor->mutable_data( - framework::make_ddim({static_cast(ids.size()), 1}), place); - memcpy(id_tensor_data, ids.data(), sizeof(int64_t) * ids.size()); - } - } -} - template void send(const std::string& var_name, const std::vector& send_varnames, const std::vector& epmap, - const std::vector& height_sections, + const std::vector& height_sections, const framework::ExecutionContext& ctx, const framework::Scope& scope, bool sync) { framework::Scope* local_scope = scope.NewTmpScope(); diff --git a/paddle/fluid/operators/distributed/parameter_send.h b/paddle/fluid/operators/distributed/parameter_send.h index e337649cf2..6272cc5d25 100644 --- a/paddle/fluid/operators/distributed/parameter_send.h +++ b/paddle/fluid/operators/distributed/parameter_send.h @@ -27,7 +27,7 @@ template void send(const std::string& var_name, const std::vector& send_varnames, const std::vector& epmap, - const std::vector& height_sections, + const std::vector& height_sections, const framework::ExecutionContext& context, const framework::Scope& scope, bool sync); diff --git a/paddle/fluid/operators/distributed_ops/send_op.cc b/paddle/fluid/operators/distributed_ops/send_op.cc index 02397bb6b3..f8b9a1d15a 100644 --- a/paddle/fluid/operators/distributed_ops/send_op.cc +++ b/paddle/fluid/operators/distributed_ops/send_op.cc @@ 
-88,11 +88,11 @@ This operator will send variables to listen_and_serve op at the parameter server "Server endpoints in the order of input " "variables for mapping") .SetDefault({"127.0.0.1:6164"}); - AddAttr>("sections", - "(vector) " - "the length of each output along the " - "specified axis.") - .SetDefault(std::vector{}); + AddAttr>("sections", + "(vector) " + "the length of each output along the " + "specified axis.") + .SetDefault(std::vector{}); AddAttr>( "send_varnames", "(vector) " diff --git a/paddle/fluid/operators/distributed_ops/send_recv_util.h b/paddle/fluid/operators/distributed_ops/send_recv_util.h index dc26c53c64..1e91f0dd51 100644 --- a/paddle/fluid/operators/distributed_ops/send_recv_util.h +++ b/paddle/fluid/operators/distributed_ops/send_recv_util.h @@ -13,8 +13,14 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once + #include +#include + #include "paddle/fluid/framework/ir/node.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/selected_rows.h" namespace paddle { namespace operators { @@ -42,5 +48,35 @@ inline bool NeedSend(const framework::Scope& scope, return false; } +inline int FindOutIdx(int row, const std::vector& abs_sections) { + for (size_t i = 1; i < abs_sections.size(); ++i) { + if (row < abs_sections[i]) { + return i - 1; + } + } + return abs_sections.size() - 1; +} + +inline std::vector ToAbsoluteSection( + const std::vector& height_sections) { + std::vector abs_sections; + abs_sections.resize(height_sections.size()); + abs_sections[0] = 0; + for (size_t i = 1; i < height_sections.size(); ++i) { + abs_sections[i] = height_sections[i - 1] + abs_sections[i - 1]; + } + return abs_sections; +} + +inline size_t GetSectionIndex(int64_t id, + const std::vector& abs_sections) { + for (size_t i = 1; i < abs_sections.size(); ++i) { + if (id < abs_sections[i]) { + return i - 1; + } + } + return abs_sections.size() - 1; +} + } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/hierarchical_sigmoid_op.cc b/paddle/fluid/operators/hierarchical_sigmoid_op.cc index 6ca6f0bc04..13820e54aa 100644 --- a/paddle/fluid/operators/hierarchical_sigmoid_op.cc +++ b/paddle/fluid/operators/hierarchical_sigmoid_op.cc @@ -134,9 +134,9 @@ class HierarchicalSigmoidOpMaker : public framework::OpProtoAndCheckerMaker { // for parameter prefetch AddAttr("remote_prefetch", "").SetDefault(false); AddAttr("trainer_id", "trainer id from 0 ~ worker_num.").SetDefault(0); - AddAttr>("height_sections", - "Height for each output SelectedRows.") - .SetDefault(std::vector({})); + AddAttr>("height_sections", + "Height for each output SelectedRows.") + .SetDefault(std::vector({})); AddAttr>( "epmap", "(string vector, default 127.0.0.1:6164)" diff --git a/paddle/fluid/operators/hierarchical_sigmoid_op.h b/paddle/fluid/operators/hierarchical_sigmoid_op.h index 1a7ca96301..2247131137 100644 --- a/paddle/fluid/operators/hierarchical_sigmoid_op.h +++ b/paddle/fluid/operators/hierarchical_sigmoid_op.h @@ -70,7 +70,7 @@ class HierarchicalSigmoidOpKernel : public framework::OpKernel { // if epmap is not empty, then the parameter will be fetched from remote // parameter // server - auto height_sections = ctx.Attr>("height_sections"); + auto height_sections = ctx.Attr>("height_sections"); auto table_names = ctx.Attr>("table_names"); std::vector real_rows = PathToRows(*path); framework::Scope& local_scope = ctx.scope().NewScope(); diff --git 
a/paddle/fluid/operators/lookup_table_op.cc b/paddle/fluid/operators/lookup_table_op.cc index 0029932bc0..9f6fbe05fa 100644 --- a/paddle/fluid/operators/lookup_table_op.cc +++ b/paddle/fluid/operators/lookup_table_op.cc @@ -91,9 +91,9 @@ class LookupTableOpMaker : public framework::OpProtoAndCheckerMaker { // for parameter prefetch AddAttr("remote_prefetch", "").SetDefault(false); AddAttr("trainer_id", "trainer id from 0 ~ worker_num.").SetDefault(0); - AddAttr>("height_sections", - "Height for each output SelectedRows.") - .SetDefault(std::vector({})); + AddAttr>("height_sections", + "Height for each output SelectedRows.") + .SetDefault(std::vector({})); AddAttr>( "epmap", "(string vector, default 127.0.0.1:6164)" diff --git a/paddle/fluid/operators/lookup_table_op.h b/paddle/fluid/operators/lookup_table_op.h index a7d0fd4856..f95f29356f 100644 --- a/paddle/fluid/operators/lookup_table_op.h +++ b/paddle/fluid/operators/lookup_table_op.h @@ -50,7 +50,8 @@ class LookupTableKernel : public framework::OpKernel { // for remote prefetch auto epmap = context.Attr>("epmap"); - auto height_sections = context.Attr>("height_sections"); + auto height_sections = + context.Attr>("height_sections"); auto table_names = context.Attr>("table_names"); if (!epmap.empty()) { diff --git a/paddle/fluid/operators/nce_op.cc b/paddle/fluid/operators/nce_op.cc index 256da34912..8160f45e74 100644 --- a/paddle/fluid/operators/nce_op.cc +++ b/paddle/fluid/operators/nce_op.cc @@ -156,9 +156,9 @@ class NCEOpMaker : public framework::OpProtoAndCheckerMaker { // for parameter prefetch AddAttr("remote_prefetch", "").SetDefault(false); AddAttr("trainer_id", "trainer id from 0 ~ worker_num.").SetDefault(0); - AddAttr>("height_sections", - "Height for each output SelectedRows.") - .SetDefault(std::vector({})); + AddAttr>("height_sections", + "Height for each output SelectedRows.") + .SetDefault(std::vector({})); AddAttr>( "epmap", "(string vector, default 127.0.0.1:6164)" diff --git a/paddle/fluid/operators/nce_op.h b/paddle/fluid/operators/nce_op.h index 2c97eef096..fab46a5971 100644 --- a/paddle/fluid/operators/nce_op.h +++ b/paddle/fluid/operators/nce_op.h @@ -167,7 +167,8 @@ class NCEKernel : public framework::OpKernel { framework::Scope &local_scope = context.scope().NewScope(); - auto height_sections = context.Attr>("height_sections"); + auto height_sections = + context.Attr>("height_sections"); auto table_names = context.Attr>("table_names"); auto *ids = local_scope.Var("Ids@Prefetch"); diff --git a/paddle/fluid/operators/split_selected_rows_op.h b/paddle/fluid/operators/split_selected_rows_op.h index 1fef2b3d37..c29065649e 100644 --- a/paddle/fluid/operators/split_selected_rows_op.h +++ b/paddle/fluid/operators/split_selected_rows_op.h @@ -16,31 +16,12 @@ limitations under the License. 
*/ #include #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/distributed_ops/send_recv_util.h" #include "paddle/fluid/operators/math/selected_rows_functor.h" namespace paddle { namespace operators { -static int FindOutIdx(int row, const std::vector& abs_sections) { - for (size_t i = 1; i < abs_sections.size(); ++i) { - if (row < abs_sections[i]) { - return i - 1; - } - } - return abs_sections.size() - 1; -} - -static std::vector ToAbsoluteSection( - const std::vector& height_sections) { - std::vector abs_sections; - abs_sections.resize(height_sections.size()); - abs_sections[0] = 0; - for (size_t i = 1; i < height_sections.size(); ++i) { - abs_sections[i] = height_sections[i - 1] + abs_sections[i - 1]; - } - return abs_sections; -} - template class SplitSelectedRowsOpKernel : public framework::OpKernel { public: From 1edc0423d2f2a96a342acdd8750e3608aa7b8ce9 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Thu, 24 Jan 2019 19:26:07 +0800 Subject: [PATCH 13/98] update send_op --- .../operators/distributed_ops/send_op.cc | 59 ++++++++++++------- 1 file changed, 38 insertions(+), 21 deletions(-) diff --git a/paddle/fluid/operators/distributed_ops/send_op.cc b/paddle/fluid/operators/distributed_ops/send_op.cc index f8b9a1d15a..2136670103 100644 --- a/paddle/fluid/operators/distributed_ops/send_op.cc +++ b/paddle/fluid/operators/distributed_ops/send_op.cc @@ -20,6 +20,7 @@ limitations under the License. */ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/distributed/distributed.h" +#include "paddle/fluid/operators/distributed/parameter_send.h" #include "paddle/fluid/operators/distributed_ops/send_recv_util.h" #include "paddle/fluid/platform/profiler.h" @@ -37,30 +38,46 @@ class SendOp : public framework::OperatorBase { const platform::Place& place) const override { auto ins = Inputs("X"); - std::vector epmap = Attr>("epmap"); + auto epmap = Attr>("epmap"); int sync_send = Attr("sync_mode"); - platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - auto& ctx = *pool.Get(place); - - distributed::RPCClient* rpc_client = - distributed::RPCClient::GetInstance( - Attr("trainer_id")); - - std::vector rets; - for (size_t i = 0; i < ins.size(); i++) { - if (NeedSend(scope, ins[i])) { - VLOG(3) << "sending " << ins[i] << " to " << epmap[i]; - rets.push_back(rpc_client->AsyncSendVar(epmap[i], ctx, scope, ins[i])); - } else { - VLOG(3) << "don't send no-initialied variable: " << ins[i]; + auto send_varnames = Attr>("send_varnames"); + auto height_sections = Attr>("height_sections"); + + if (send_varnames.size() > 0) { + PADDLE_ENFORCE_EQ(ins.size(), 1, ""); + framework::RuntimeContext ctx(Inputs(), Outputs(), scope); + platform::DeviceContextPool& pool = + platform::DeviceContextPool::Instance(); + auto* dev_ctx = pool.Get(place); + auto exe_ctx = framework::ExecutionContext(*this, scope, *dev_ctx, ctx); + distributed::send(ins[0], send_varnames, epmap, height_sections, + exe_ctx, scope, static_cast(sync_send)); + } else { + platform::DeviceContextPool& pool = + platform::DeviceContextPool::Instance(); + auto& ctx = *pool.Get(place); + + distributed::RPCClient* rpc_client = + distributed::RPCClient::GetInstance( + Attr("trainer_id")); + + std::vector rets; + for (size_t i = 0; i < ins.size(); i++) { + if (NeedSend(scope, ins[i])) { + VLOG(3) << "sending " << ins[i] << " to " << epmap[i]; + rets.push_back( + rpc_client->AsyncSendVar(epmap[i], ctx, scope, ins[i])); + } else { 
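// The "height_sections" read just above are what the helpers now shared in
// send_recv_util.h operate on: ToAbsoluteSection turns per-shard heights into
// absolute row offsets, and GetSectionIndex maps an id to its shard. A minimal
// sketch of that mapping, assuming int64_t section elements (the sample values
// below are illustrative only, not taken from this patch):
//
//   std::vector<int64_t> height_sections = {10, 20, 30};
//   auto abs_sections = ToAbsoluteSection(height_sections);  // {0, 10, 30}
//   size_t shard = GetSectionIndex(25, abs_sections);        // id 25 -> shard 1
//   int64_t local_row = 25 - abs_sections[shard];            // row 15 in shard 1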
+ VLOG(3) << "don't send no-initialied variable: " << ins[i]; + } } - } - if (sync_send) { - for (size_t i = 0; i < rets.size(); i++) { - VLOG(7) << "before sync_send " << ins[i] << "from " << epmap[i]; - PADDLE_ENFORCE(rets[i]->Wait(), "internal error in RPCClient"); - VLOG(7) << "after sync_send " << ins[i] << "from " << epmap[i]; + if (sync_send) { + for (size_t i = 0; i < rets.size(); i++) { + VLOG(7) << "before sync_send " << ins[i] << "from " << epmap[i]; + PADDLE_ENFORCE(rets[i]->Wait(), "internal error in RPCClient"); + VLOG(7) << "after sync_send " << ins[i] << "from " << epmap[i]; + } } } } From fab8457e6b117be26e23171b649a1bfda14531b2 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Sat, 26 Jan 2019 23:12:23 +0800 Subject: [PATCH 14/98] code optimize --- .../details/async_ssa_graph_executor.cc | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/paddle/fluid/framework/details/async_ssa_graph_executor.cc b/paddle/fluid/framework/details/async_ssa_graph_executor.cc index ba2e90d052..7dc269242f 100644 --- a/paddle/fluid/framework/details/async_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/async_ssa_graph_executor.cc @@ -68,20 +68,18 @@ FeedFetchList AsyncSSAGraphExecutor::Run( if (pool_) { run_futures.emplace_back(pool_->enqueue(std::move(call))); + for (auto &f : run_futures) { + if (exception_holder_.IsCaught()) { + f.wait(); + } else { + fetch_data.emplace_back(std::move(f.get())); + } + } } else { fetch_data.emplace_back(std::move(call())); } } - if (pool_) { - for (auto &f : run_futures) { - if (exception_holder_.IsCaught()) { - f.wait(); - } else { - fetch_data.emplace_back(std::move(f.get())); - } - } - } if (exception_holder_.IsCaught()) { exception_holder_.ReThrow(); } From 62549e071402530e862285ab1613eb8e8e5e5150 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Sun, 27 Jan 2019 17:10:45 +0800 Subject: [PATCH 15/98] add GenParentScopeTreeDebugInfo --- paddle/fluid/framework/parallel_executor.cc | 1 + paddle/fluid/framework/scope.cc | 29 +++++++++++++++++++++ paddle/fluid/framework/scope.h | 1 + 3 files changed, 31 insertions(+) diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 3997294f17..f0bc3acccc 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -365,6 +365,7 @@ ParallelExecutor::ParallelExecutor( void ParallelExecutor::BCastParamsToDevices( const std::unordered_set &vars) const { + VLOG(3) << "BCastParamsToDevices"; // the initializing bcast, all vars would be bcast from device(0). 
for (auto &var : vars) { framework::Variable *main_var = member_->local_scopes_[0]->FindVar(var); diff --git a/paddle/fluid/framework/scope.cc b/paddle/fluid/framework/scope.cc index 9536185609..884ad3b34b 100644 --- a/paddle/fluid/framework/scope.cc +++ b/paddle/fluid/framework/scope.cc @@ -259,5 +259,34 @@ std::string GenScopeTreeDebugInfo(Scope* root) { return os.str(); } +std::string GenParentScopeTreeDebugInfo(Scope* leaf) { + std::stringstream os; + + if (!leaf) return ""; + + // level traversal + std::vector scopes; + const Scope* current_scope = leaf; + + while (current_scope != nullptr) { + scopes.push_back(current_scope); + current_scope = current_scope->parent(); + // end of a level + os << "\n------------------------------------------\n"; + } + + os << "\nDetails:\n\n"; + + for (auto* q : scopes) { + os << "====\n"; + os << q << ":\n"; + for (auto& var : q->LocalVarNames()) { + os << " - " << var << "\n"; + } + } + + return os.str(); +} + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/scope.h b/paddle/fluid/framework/scope.h index f0915d2eee..eb5c12def6 100644 --- a/paddle/fluid/framework/scope.h +++ b/paddle/fluid/framework/scope.h @@ -144,6 +144,7 @@ class Scope { // Generate some debug string about the inherience structure of scope, quite // naive. std::string GenScopeTreeDebugInfo(Scope*); +std::string GenParentScopeTreeDebugInfo(Scope*); } // namespace framework } // namespace paddle From be738a646e2f760a53c36a658c7d07c4f75cd814 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Sun, 27 Jan 2019 21:56:25 +0800 Subject: [PATCH 16/98] add some debug infor --- .../details/async_ssa_graph_executor.cc | 17 ++++++++++------- .../details/multi_devices_graph_pass.cc | 2 ++ paddle/fluid/framework/scope.cc | 12 +++++------- 3 files changed, 17 insertions(+), 14 deletions(-) diff --git a/paddle/fluid/framework/details/async_ssa_graph_executor.cc b/paddle/fluid/framework/details/async_ssa_graph_executor.cc index 7dc269242f..c259ff4f74 100644 --- a/paddle/fluid/framework/details/async_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/async_ssa_graph_executor.cc @@ -68,18 +68,21 @@ FeedFetchList AsyncSSAGraphExecutor::Run( if (pool_) { run_futures.emplace_back(pool_->enqueue(std::move(call))); - for (auto &f : run_futures) { - if (exception_holder_.IsCaught()) { - f.wait(); - } else { - fetch_data.emplace_back(std::move(f.get())); - } - } } else { fetch_data.emplace_back(std::move(call())); } } + if (pool_) { + for (auto &f : run_futures) { + if (exception_holder_.IsCaught()) { + f.wait(); + } else { + fetch_data.emplace_back(std::move(f.get())); + } + } + } + if (exception_holder_.IsCaught()) { exception_holder_.ReThrow(); } diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc index d7a4b5692b..f1347e2b0d 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc @@ -249,6 +249,8 @@ void MultiDevSSAGraphBuilderBase::InsertScaleLossGradOp( break; } + VLOG(3) << "loss_scale: " << loss_scale; + if (loss_scale) { // TODO(paddle-dev): Why is there no input for this op_handle? 
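// The parent-scope debug helper declared in scope.h above is intended for
// spots like these VLOG additions: it walks from a leaf scope up through
// parent() and lists each level's local variables. A minimal usage sketch
// (the wrapper function and variable name are illustrative, not part of this
// patch):
//
//   #include "paddle/fluid/framework/scope.h"
//
//   void TraceScopeChain(paddle::framework::Scope *leaf) {
//     VLOG(3) << paddle::framework::GenParentScopeTreeDebugInfo(leaf);
//   }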
auto loss_grad_name = node->Op()->OutputArgumentNames()[0]; diff --git a/paddle/fluid/framework/scope.cc b/paddle/fluid/framework/scope.cc index 884ad3b34b..2c76ab22f6 100644 --- a/paddle/fluid/framework/scope.cc +++ b/paddle/fluid/framework/scope.cc @@ -271,16 +271,14 @@ std::string GenParentScopeTreeDebugInfo(Scope* leaf) { while (current_scope != nullptr) { scopes.push_back(current_scope); current_scope = current_scope->parent(); - // end of a level - os << "\n------------------------------------------\n"; } - os << "\nDetails:\n\n"; + os << "\n--------------GenParentScopeTreeDebugInfo--------------\n"; - for (auto* q : scopes) { - os << "====\n"; - os << q << ":\n"; - for (auto& var : q->LocalVarNames()) { + for (int i = scopes.size() - 1; i >= 0; --i) { + os << "=======level [" << i << "]=======\n"; + os << scopes[i] << ":\n"; + for (auto& var : scopes[i]->LocalVarNames()) { os << " - " << var << "\n"; } } From 9da96aba956abe13aec945c1e71e338df56a13b5 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Sun, 27 Jan 2019 23:04:50 +0800 Subject: [PATCH 17/98] clean code of test_async_ssa_graph_executor_mnist --- .../test_async_ssa_graph_executor_mnist.py | 214 ++++++++++++++++++ 1 file changed, 214 insertions(+) create mode 100644 python/paddle/fluid/tests/unittests/test_async_ssa_graph_executor_mnist.py diff --git a/python/paddle/fluid/tests/unittests/test_async_ssa_graph_executor_mnist.py b/python/paddle/fluid/tests/unittests/test_async_ssa_graph_executor_mnist.py new file mode 100644 index 0000000000..e2b3b2b0f2 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_async_ssa_graph_executor_mnist.py @@ -0,0 +1,214 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import os +from PIL import Image +import numpy +import paddle +import paddle.fluid as fluid + +BATCH_SIZE = 64 +PASS_NUM = 5 + + +def loss_net(hidden, label): + prediction = fluid.layers.fc(input=hidden, size=10, act='softmax') + loss = fluid.layers.cross_entropy(input=prediction, label=label) + avg_loss = fluid.layers.mean(loss) + acc = fluid.layers.accuracy(input=prediction, label=label) + return prediction, avg_loss, acc + + +def convolutional_neural_network(img, label): + conv_pool_1 = fluid.nets.simple_img_conv_pool( + input=img, + filter_size=5, + num_filters=20, + pool_size=2, + pool_stride=2, + act="relu") + conv_pool_1 = fluid.layers.batch_norm(conv_pool_1) + conv_pool_2 = fluid.nets.simple_img_conv_pool( + input=conv_pool_1, + filter_size=5, + num_filters=50, + pool_size=2, + pool_stride=2, + act="relu") + return loss_net(conv_pool_2, label) + + +def train(use_cuda, + save_dirname=None, + model_filename=None, + params_filename=None): + if use_cuda and not fluid.core.is_compiled_with_cuda(): + return + + img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32') + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + + prediction, avg_loss, acc = convolutional_neural_network(img, label) + + test_program = fluid.default_main_program().clone(for_test=True) + + optimizer = fluid.optimizer.Adam(learning_rate=0.001) + optimizer.minimize(avg_loss) + + def train_test(train_test_program, train_test_feed, train_test_reader): + acc_set = [] + avg_loss_set = [] + for test_data in train_test_reader(): + acc_np, avg_loss_np = exe.run(program=train_test_program, + feed=train_test_feed.feed(test_data), + fetch_list=[acc, avg_loss]) + acc_set.append(float(acc_np)) + avg_loss_set.append(float(avg_loss_np)) + # get test acc and loss + acc_val_mean = numpy.array(acc_set).mean() + avg_loss_val_mean = numpy.array(avg_loss_set).mean() + return avg_loss_val_mean, acc_val_mean + + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + + exe = fluid.Executor(place) + + train_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.mnist.train(), buf_size=500), + batch_size=BATCH_SIZE) + test_reader = paddle.batch( + paddle.dataset.mnist.test(), batch_size=BATCH_SIZE) + feeder = fluid.DataFeeder(feed_list=[img, label], place=place) + + exe.run(fluid.default_startup_program()) + main_program = fluid.default_main_program() + + exec_strategy = fluid.ExecutionStrategy() + build_strategy = fluid.BuildStrategy() + + cpu_num = int(os.environ.get('CPU_NUM')) + thread_num = int(os.getenv("NUM_THREADS")) + + print("cpu_num:" + str(cpu_num)) + print("thread_num:" + str(thread_num)) + + build_strategy.async_mode = True + + exec_strategy.num_threads = thread_num + exec_strategy.num_iteration_per_drop_scope = 1 + exec_strategy.num_iteration_per_run = 10 + + pe = fluid.ParallelExecutor( + use_cuda=False, + loss_name=avg_loss.name, + main_program=main_program, + build_strategy=build_strategy, + exec_strategy=exec_strategy) + + lists = [] + step = 0 + for epoch_id in range(PASS_NUM): + for step_id, data in enumerate(train_reader()): + loss_val, acc_val = pe.run(feed=feeder.feed(data), + fetch_list=[avg_loss.name, acc.name]) + loss_val = numpy.mean(loss_val) + acc_val = numpy.mean(acc_val) + if step % 100 == 0: + print("Pass %d, Batch %d, Cost %f" % (epoch_id, step, loss_val)) + step += 1 + # test for epoch + avg_loss_val, acc_val = train_test( + train_test_program=test_program, + train_test_reader=test_reader, + train_test_feed=feeder) + 
+ print("Test with Epoch %d, avg_cost: %s, acc: %s" % + (epoch_id, avg_loss_val, acc_val)) + lists.append((epoch_id, avg_loss_val, acc_val)) + if save_dirname is not None: + fluid.io.save_inference_model( + save_dirname, ["img"], [prediction], + exe, + model_filename=model_filename, + params_filename=params_filename) + + # find the best pass + best = sorted(lists, key=lambda list: float(list[1]))[0] + print('Best pass is %s, testing Avgcost is %s' % (best[0], best[1])) + print('The classification accuracy is %.2f%%' % (float(best[2]) * 100)) + + +def infer(use_cuda, + save_dirname=None, + model_filename=None, + params_filename=None): + if save_dirname is None: + return + + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + exe = fluid.Executor(place) + + def load_image(file): + im = Image.open(file).convert('L') + im = im.resize((28, 28), Image.ANTIALIAS) + im = numpy.array(im).reshape(1, 1, 28, 28).astype(numpy.float32) + im = im / 255.0 * 2.0 - 1.0 + return im + + cur_dir = os.path.dirname(os.path.realpath(__file__)) + tensor_img = load_image(cur_dir + '/image/infer_3.png') + + inference_scope = fluid.core.Scope() + with fluid.scope_guard(inference_scope): + # Use fluid.io.load_inference_model to obtain the inference program desc, + # the feed_target_names (the names of variables that will be feeded + # data using feed operators), and the fetch_targets (variables that + # we want to obtain data from using fetch operators). + [inference_program, feed_target_names, + fetch_targets] = fluid.io.load_inference_model( + save_dirname, exe, model_filename, params_filename) + + # Construct feed as a dictionary of {feed_target_name: feed_target_data} + # and results will contain a list of data corresponding to fetch_targets. + results = exe.run(inference_program, + feed={feed_target_names[0]: tensor_img}, + fetch_list=fetch_targets) + lab = numpy.argsort(results) + print("Inference result of image/infer_3.png is: %d" % lab[0][0][-1]) + + +def main(use_cuda): + model_filename = None + params_filename = None + save_dirname = "recognize_digits" + ".inference.model" + + # call train() with is_local argument to run distributed train + train( + use_cuda=use_cuda, + save_dirname=save_dirname, + model_filename=model_filename, + params_filename=params_filename) + infer( + use_cuda=use_cuda, + save_dirname=save_dirname, + model_filename=model_filename, + params_filename=params_filename) + + +if __name__ == '__main__': + use_cuda = False + main(use_cuda=use_cuda) From 7e145b7c0e8a877ce78135dc74d3d65090e9c704 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Mon, 28 Jan 2019 10:13:09 +0800 Subject: [PATCH 18/98] optimize test_async_ssa_graph_executor_mnist --- .../test_async_ssa_graph_executor_mnist.py | 138 ++++-------------- 1 file changed, 31 insertions(+), 107 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_async_ssa_graph_executor_mnist.py b/python/paddle/fluid/tests/unittests/test_async_ssa_graph_executor_mnist.py index e2b3b2b0f2..03d7df8852 100644 --- a/python/paddle/fluid/tests/unittests/test_async_ssa_graph_executor_mnist.py +++ b/python/paddle/fluid/tests/unittests/test_async_ssa_graph_executor_mnist.py @@ -15,13 +15,13 @@ from __future__ import print_function import os -from PIL import Image +import unittest + import numpy import paddle import paddle.fluid as fluid BATCH_SIZE = 64 -PASS_NUM = 5 def loss_net(hidden, label): @@ -51,11 +51,9 @@ def convolutional_neural_network(img, label): return loss_net(conv_pool_2, label) -def train(use_cuda, - save_dirname=None, - 
model_filename=None, - params_filename=None): +def train(use_cuda, thread_num, cpu_num): if use_cuda and not fluid.core.is_compiled_with_cuda(): + print("paddle is not compiled with cuda, exit!") return img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32') @@ -84,8 +82,6 @@ def train(use_cuda, place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() - exe = fluid.Executor(place) - train_reader = paddle.batch( paddle.reader.shuffle( paddle.dataset.mnist.train(), buf_size=500), @@ -94,24 +90,22 @@ def train(use_cuda, paddle.dataset.mnist.test(), batch_size=BATCH_SIZE) feeder = fluid.DataFeeder(feed_list=[img, label], place=place) + exe = fluid.Executor(place) exe.run(fluid.default_startup_program()) - main_program = fluid.default_main_program() - exec_strategy = fluid.ExecutionStrategy() - build_strategy = fluid.BuildStrategy() - - cpu_num = int(os.environ.get('CPU_NUM')) - thread_num = int(os.getenv("NUM_THREADS")) + os.environ['CPU_NUM'] = str(cpu_num) print("cpu_num:" + str(cpu_num)) print("thread_num:" + str(thread_num)) - build_strategy.async_mode = True + build_strategy = fluid.BuildStrategy() + build_strategy.async_mode = True # enable async mode + exec_strategy = fluid.ExecutionStrategy() exec_strategy.num_threads = thread_num - exec_strategy.num_iteration_per_drop_scope = 1 - exec_strategy.num_iteration_per_run = 10 + exec_strategy.num_iteration_per_run = 2 + main_program = fluid.default_main_program() pe = fluid.ParallelExecutor( use_cuda=False, loss_name=avg_loss.name, @@ -119,96 +113,26 @@ def train(use_cuda, build_strategy=build_strategy, exec_strategy=exec_strategy) - lists = [] step = 0 - for epoch_id in range(PASS_NUM): - for step_id, data in enumerate(train_reader()): - loss_val, acc_val = pe.run(feed=feeder.feed(data), - fetch_list=[avg_loss.name, acc.name]) - loss_val = numpy.mean(loss_val) - acc_val = numpy.mean(acc_val) - if step % 100 == 0: - print("Pass %d, Batch %d, Cost %f" % (epoch_id, step, loss_val)) - step += 1 - # test for epoch - avg_loss_val, acc_val = train_test( - train_test_program=test_program, - train_test_reader=test_reader, - train_test_feed=feeder) - - print("Test with Epoch %d, avg_cost: %s, acc: %s" % - (epoch_id, avg_loss_val, acc_val)) - lists.append((epoch_id, avg_loss_val, acc_val)) - if save_dirname is not None: - fluid.io.save_inference_model( - save_dirname, ["img"], [prediction], - exe, - model_filename=model_filename, - params_filename=params_filename) - - # find the best pass - best = sorted(lists, key=lambda list: float(list[1]))[0] - print('Best pass is %s, testing Avgcost is %s' % (best[0], best[1])) - print('The classification accuracy is %.2f%%' % (float(best[2]) * 100)) - - -def infer(use_cuda, - save_dirname=None, - model_filename=None, - params_filename=None): - if save_dirname is None: - return + for step_id, data in enumerate(train_reader()): + loss_val = pe.run(feed=feeder.feed(data), fetch_list=[avg_loss.name]) + loss_val = numpy.mean(loss_val) + if step % 100 == 0: + print("Batch %d, Cost %f" % (step, loss_val)) + step += 1 + # test for epoch + avg_loss_val, acc_val = train_test( + train_test_program=test_program, + train_test_reader=test_reader, + train_test_feed=feeder) + + print("Test: avg_cost: %s, acc: %s" % (avg_loss_val, acc_val)) + + +class TestAsyncSSAGraphExecutor(unittest.TestCase): + def test_check_async_ssa_exe_train(self): + train(use_cuda=False, thread_num=2, cpu_num=2) - place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() - exe = fluid.Executor(place) - def load_image(file): - im = 
Image.open(file).convert('L') - im = im.resize((28, 28), Image.ANTIALIAS) - im = numpy.array(im).reshape(1, 1, 28, 28).astype(numpy.float32) - im = im / 255.0 * 2.0 - 1.0 - return im - - cur_dir = os.path.dirname(os.path.realpath(__file__)) - tensor_img = load_image(cur_dir + '/image/infer_3.png') - - inference_scope = fluid.core.Scope() - with fluid.scope_guard(inference_scope): - # Use fluid.io.load_inference_model to obtain the inference program desc, - # the feed_target_names (the names of variables that will be feeded - # data using feed operators), and the fetch_targets (variables that - # we want to obtain data from using fetch operators). - [inference_program, feed_target_names, - fetch_targets] = fluid.io.load_inference_model( - save_dirname, exe, model_filename, params_filename) - - # Construct feed as a dictionary of {feed_target_name: feed_target_data} - # and results will contain a list of data corresponding to fetch_targets. - results = exe.run(inference_program, - feed={feed_target_names[0]: tensor_img}, - fetch_list=fetch_targets) - lab = numpy.argsort(results) - print("Inference result of image/infer_3.png is: %d" % lab[0][0][-1]) - - -def main(use_cuda): - model_filename = None - params_filename = None - save_dirname = "recognize_digits" + ".inference.model" - - # call train() with is_local argument to run distributed train - train( - use_cuda=use_cuda, - save_dirname=save_dirname, - model_filename=model_filename, - params_filename=params_filename) - infer( - use_cuda=use_cuda, - save_dirname=save_dirname, - model_filename=model_filename, - params_filename=params_filename) - - -if __name__ == '__main__': - use_cuda = False - main(use_cuda=use_cuda) +if __name__ == "__main__": + unittest.main() From 02dab46ab8101873663a63614f88931ead7846d9 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Mon, 28 Jan 2019 16:23:06 +0800 Subject: [PATCH 19/98] add some debug info --- .../details/async_ssa_graph_executor.cc | 2 ++ .../framework/details/exception_holder.h | 17 ++++++++++++ .../fluid/operators/reader/blocking_queue.h | 1 + .../test_async_ssa_graph_executor_mnist.py | 27 ++++++++++++++++++- 4 files changed, 46 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/framework/details/async_ssa_graph_executor.cc b/paddle/fluid/framework/details/async_ssa_graph_executor.cc index c259ff4f74..e21d5fb96d 100644 --- a/paddle/fluid/framework/details/async_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/async_ssa_graph_executor.cc @@ -84,6 +84,8 @@ FeedFetchList AsyncSSAGraphExecutor::Run( } if (exception_holder_.IsCaught()) { + VLOG(3) << "caught exception " << exception_holder_.Type() + << ", rethrow it"; exception_holder_.ReThrow(); } diff --git a/paddle/fluid/framework/details/exception_holder.h b/paddle/fluid/framework/details/exception_holder.h index 1b1afce04e..77ca03b86e 100644 --- a/paddle/fluid/framework/details/exception_holder.h +++ b/paddle/fluid/framework/details/exception_holder.h @@ -14,6 +14,8 @@ #pragma once +#include + #include "glog/logging.h" #include "paddle/fluid/platform/enforce.h" @@ -64,6 +66,21 @@ class ExceptionHolder { ClearImpl(); } + std::string Type() { + std::lock_guard lock(mu_); + switch (type_) { + case kNone: + return "None"; + case kEnforceNotMet: { + return "EnforceNotMet"; + } + case kEOF: { + return "EOF"; + } + } + return "unknown"; + } + private: void ClearImpl() { exception_.reset(); diff --git a/paddle/fluid/operators/reader/blocking_queue.h b/paddle/fluid/operators/reader/blocking_queue.h index 51b980acb5..45c3ad802f 100644 --- 
a/paddle/fluid/operators/reader/blocking_queue.h +++ b/paddle/fluid/operators/reader/blocking_queue.h @@ -79,6 +79,7 @@ class BlockingQueue { return true; } else { PADDLE_ENFORCE(closed_); + VLOG(3) << "queue is closed! return nothing."; return false; } } diff --git a/python/paddle/fluid/tests/unittests/test_async_ssa_graph_executor_mnist.py b/python/paddle/fluid/tests/unittests/test_async_ssa_graph_executor_mnist.py index 03d7df8852..6a2f829654 100644 --- a/python/paddle/fluid/tests/unittests/test_async_ssa_graph_executor_mnist.py +++ b/python/paddle/fluid/tests/unittests/test_async_ssa_graph_executor_mnist.py @@ -59,6 +59,13 @@ def train(use_cuda, thread_num, cpu_num): img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32') label = fluid.layers.data(name='label', shape=[1], dtype='int64') + py_reader = fluid.layers.create_py_reader_by_data( + capacity=64, + feed_list=[img, label], + name='py_reader', + use_double_buffer=True) + img, label = fluid.layers.read_file(py_reader) + prediction, avg_loss, acc = convolutional_neural_network(img, label) test_program = fluid.default_main_program().clone(for_test=True) @@ -103,7 +110,7 @@ def train(use_cuda, thread_num, cpu_num): exec_strategy = fluid.ExecutionStrategy() exec_strategy.num_threads = thread_num - exec_strategy.num_iteration_per_run = 2 + exec_strategy.num_iteration_per_run = 1 main_program = fluid.default_main_program() pe = fluid.ParallelExecutor( @@ -113,6 +120,22 @@ def train(use_cuda, thread_num, cpu_num): build_strategy=build_strategy, exec_strategy=exec_strategy) + py_reader.decorate_paddle_reader(train_reader) + py_reader.start() + + step = 0 + try: + while True: + print("step %d in" % step) + loss_val = pe.run(fetch_list=[avg_loss.name]) + loss_val = numpy.mean(loss_val) + if step % 1 == 0: + print("Batch %d, Cost %f, queue size %d" % + (step, loss_val, py_reader.queue.size())) + step += 1 + except fluid.core.EOFException: + py_reader.reset() + """ step = 0 for step_id, data in enumerate(train_reader()): loss_val = pe.run(feed=feeder.feed(data), fetch_list=[avg_loss.name]) @@ -120,6 +143,8 @@ def train(use_cuda, thread_num, cpu_num): if step % 100 == 0: print("Batch %d, Cost %f" % (step, loss_val)) step += 1 + """ + # test for epoch avg_loss_val, acc_val = train_test( train_test_program=test_program, From 4a172611f989eaae04638784cf96c3a2be3c6b8c Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Mon, 28 Jan 2019 17:11:48 +0800 Subject: [PATCH 20/98] complete test_async_ssa_graph_executor_mnist test=develop --- .../test_async_ssa_graph_executor_mnist.py | 162 ++++++++++-------- 1 file changed, 91 insertions(+), 71 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_async_ssa_graph_executor_mnist.py b/python/paddle/fluid/tests/unittests/test_async_ssa_graph_executor_mnist.py index 6a2f829654..1104604970 100644 --- a/python/paddle/fluid/tests/unittests/test_async_ssa_graph_executor_mnist.py +++ b/python/paddle/fluid/tests/unittests/test_async_ssa_graph_executor_mnist.py @@ -18,60 +18,61 @@ import os import unittest import numpy +import time import paddle import paddle.fluid as fluid BATCH_SIZE = 64 -def loss_net(hidden, label): - prediction = fluid.layers.fc(input=hidden, size=10, act='softmax') - loss = fluid.layers.cross_entropy(input=prediction, label=label) - avg_loss = fluid.layers.mean(loss) - acc = fluid.layers.accuracy(input=prediction, label=label) - return prediction, avg_loss, acc - - -def convolutional_neural_network(img, label): - conv_pool_1 = fluid.nets.simple_img_conv_pool( - 
input=img, - filter_size=5, - num_filters=20, - pool_size=2, - pool_stride=2, - act="relu") - conv_pool_1 = fluid.layers.batch_norm(conv_pool_1) - conv_pool_2 = fluid.nets.simple_img_conv_pool( - input=conv_pool_1, - filter_size=5, - num_filters=50, - pool_size=2, - pool_stride=2, - act="relu") - return loss_net(conv_pool_2, label) - - -def train(use_cuda, thread_num, cpu_num): - if use_cuda and not fluid.core.is_compiled_with_cuda(): - print("paddle is not compiled with cuda, exit!") - return - - img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32') - label = fluid.layers.data(name='label', shape=[1], dtype='int64') - - py_reader = fluid.layers.create_py_reader_by_data( - capacity=64, - feed_list=[img, label], - name='py_reader', - use_double_buffer=True) - img, label = fluid.layers.read_file(py_reader) - - prediction, avg_loss, acc = convolutional_neural_network(img, label) +def convolutional_neural_network(use_py_reader): + with fluid.unique_name.guard(): + img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32') + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + + py_reader = None + if use_py_reader: + py_reader = fluid.layers.create_py_reader_by_data( + capacity=64, + feed_list=[img, label], + name='py_reader', + use_double_buffer=True) + img, label = fluid.layers.read_file(py_reader) + + conv_pool_1 = fluid.nets.simple_img_conv_pool( + input=img, + filter_size=5, + num_filters=20, + pool_size=2, + pool_stride=2, + act="relu") + conv_pool_1 = fluid.layers.batch_norm(conv_pool_1) + conv_pool_2 = fluid.nets.simple_img_conv_pool( + input=conv_pool_1, + filter_size=5, + num_filters=50, + pool_size=2, + pool_stride=2, + act="relu") + + prediction = fluid.layers.fc(input=conv_pool_2, size=10, act='softmax') + loss = fluid.layers.cross_entropy(input=prediction, label=label) + avg_loss = fluid.layers.mean(loss) + acc = fluid.layers.accuracy(input=prediction, label=label) + + return img, label, prediction, avg_loss, acc, py_reader + + +def test(): + place = fluid.CPUPlace() + exe = fluid.Executor(place) - test_program = fluid.default_main_program().clone(for_test=True) + test_reader = paddle.batch( + paddle.dataset.mnist.test(), batch_size=BATCH_SIZE) - optimizer = fluid.optimizer.Adam(learning_rate=0.001) - optimizer.minimize(avg_loss) + img, label, prediction, avg_loss, acc, py_reader = convolutional_neural_network( + use_py_reader=False) + feeder = fluid.DataFeeder(feed_list=[img, label], place=place) def train_test(train_test_program, train_test_feed, train_test_reader): acc_set = [] @@ -87,16 +88,33 @@ def train(use_cuda, thread_num, cpu_num): avg_loss_val_mean = numpy.array(avg_loss_set).mean() return avg_loss_val_mean, acc_val_mean - place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + # test for epoch + avg_loss_val, acc_val = train_test( + train_test_program=fluid.default_main_program(), + train_test_reader=test_reader, + train_test_feed=feeder) + + print("Test: avg_cost: %s, acc: %s" % (avg_loss_val, acc_val)) + assert acc_val > 0.96 + + +def train(use_cuda, thread_num, cpu_num): + if use_cuda and not fluid.core.is_compiled_with_cuda(): + print("paddle is not compiled with cuda, exit!") + return + + img, label, prediction, avg_loss, acc, py_reader = convolutional_neural_network( + use_py_reader=True) + + optimizer = fluid.optimizer.Adam(learning_rate=0.001) + optimizer.minimize(avg_loss) train_reader = paddle.batch( paddle.reader.shuffle( paddle.dataset.mnist.train(), buf_size=500), batch_size=BATCH_SIZE) - test_reader = 
paddle.batch( - paddle.dataset.mnist.test(), batch_size=BATCH_SIZE) - feeder = fluid.DataFeeder(feed_list=[img, label], place=place) + place = fluid.CPUPlace() exe = fluid.Executor(place) exe.run(fluid.default_startup_program()) @@ -106,11 +124,11 @@ def train(use_cuda, thread_num, cpu_num): print("thread_num:" + str(thread_num)) build_strategy = fluid.BuildStrategy() - build_strategy.async_mode = True # enable async mode + build_strategy.async_mode = True exec_strategy = fluid.ExecutionStrategy() exec_strategy.num_threads = thread_num - exec_strategy.num_iteration_per_run = 1 + exec_strategy.num_iteration_per_run = 10 main_program = fluid.default_main_program() pe = fluid.ParallelExecutor( @@ -126,37 +144,39 @@ def train(use_cuda, thread_num, cpu_num): step = 0 try: while True: - print("step %d in" % step) loss_val = pe.run(fetch_list=[avg_loss.name]) loss_val = numpy.mean(loss_val) - if step % 1 == 0: + if step % 100 == 0: print("Batch %d, Cost %f, queue size %d" % (step, loss_val, py_reader.queue.size())) step += 1 except fluid.core.EOFException: + print("train end") py_reader.reset() - """ - step = 0 - for step_id, data in enumerate(train_reader()): - loss_val = pe.run(feed=feeder.feed(data), fetch_list=[avg_loss.name]) - loss_val = numpy.mean(loss_val) - if step % 100 == 0: - print("Batch %d, Cost %f" % (step, loss_val)) - step += 1 - """ - - # test for epoch - avg_loss_val, acc_val = train_test( - train_test_program=test_program, - train_test_reader=test_reader, - train_test_feed=feeder) - print("Test: avg_cost: %s, acc: %s" % (avg_loss_val, acc_val)) + return step class TestAsyncSSAGraphExecutor(unittest.TestCase): def test_check_async_ssa_exe_train(self): - train(use_cuda=False, thread_num=2, cpu_num=2) + step_list = [] + for cpu_num in [1, 2, 4]: + scope = fluid.core.Scope() + with fluid.scope_guard(scope): + with fluid.program_guard( + fluid.Program(), startup_program=fluid.Program()): + start_time = time.time() + step = train( + use_cuda=False, thread_num=cpu_num, cpu_num=cpu_num) + end_time = time.time() + step_list.append(step) + print("cpu_num -> " + str(cpu_num) + " step -> " + str(step) + + " time -> " + str(end_time - start_time)) + with fluid.program_guard( + fluid.Program(), startup_program=fluid.Program()): + test() + assert step_list[0] / 2 == step_list[1] + assert step_list[1] / 2 == step_list[2] if __name__ == "__main__": From 657a4f9430913da999b025a55c213c5c9e603a73 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Mon, 28 Jan 2019 21:40:51 +0800 Subject: [PATCH 21/98] code can compile --- .../operators/distributed/parameter_send.cc | 48 ++++++++++--------- .../operators/distributed/parameter_send.h | 14 +++--- .../operators/distributed_ops/CMakeLists.txt | 4 +- .../operators/distributed_ops/send_op.cc | 5 +- 4 files changed, 38 insertions(+), 33 deletions(-) diff --git a/paddle/fluid/operators/distributed/parameter_send.cc b/paddle/fluid/operators/distributed/parameter_send.cc index 09fce06b5a..38b64c3fcd 100644 --- a/paddle/fluid/operators/distributed/parameter_send.cc +++ b/paddle/fluid/operators/distributed/parameter_send.cc @@ -38,27 +38,27 @@ using SelectedRows = framework::SelectedRows; using DDim = framework::DDim; template -void send(const std::string& var_name, - const std::vector& send_varnames, - const std::vector& epmap, - const std::vector& height_sections, - const framework::ExecutionContext& ctx, const framework::Scope& scope, - bool sync) { - framework::Scope* local_scope = scope.NewTmpScope(); - - platform::DeviceContextPool& pool = 
platform::DeviceContextPool::Instance(); - auto& cpu_ctx = *pool.Get(platform::CPUPlace()); - auto& actual_ctx = *pool.Get(ctx.GetPlace()); - - distributed::RPCClient* rpc_client = +void ParameterSend::operator()(const std::string &var_name, + const std::vector &send_varnames, + const std::vector &epmap, + const std::vector &height_sections, + const framework::ExecutionContext &ctx, + const framework::Scope &scope, bool sync) { + framework::Scope *local_scope = scope.NewTmpScope(); + + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &cpu_ctx = *pool.Get(platform::CPUPlace()); + auto &actual_ctx = *pool.Get(ctx.GetPlace()); + + distributed::RPCClient *rpc_client = distributed::RPCClient::GetInstance( ctx.Attr("trainer_id")); - auto* send_var = scope.FindVar(var_name); + auto *send_var = scope.FindVar(var_name); size_t out_num = send_varnames.size(); if (send_var->IsType()) { - auto& send_tensor = send_var->Get(); - auto& send_tensor_dims = send_tensor.dims(); + auto &send_tensor = send_var->Get(); + auto &send_tensor_dims = send_tensor.dims(); std::vector outs_dims; outs_dims.reserve(out_num); @@ -89,13 +89,13 @@ void send(const std::string& var_name, // create output var in local scope size_t row_offset = 0; for (auto i = 0; i < out_num; ++i) { - auto* out = + auto *out = local_scope->Var(send_varnames[i])->GetMutable(); *out = send_tensor.Slice(row_offset, row_offset + outs_dims[i][0]); row_offset += outs_dims[i][0]; } } else if (send_var->IsType()) { - auto& send_slr = send_var->Get(); + auto &send_slr = send_var->Get(); auto abs_sections = ToAbsoluteSection(height_sections); auto send_rows = send_slr.rows(); @@ -109,9 +109,9 @@ void send(const std::string& var_name, auto src = send_slr.value().data(); // create output var in local scope - std::vector outs; - for (auto& name : send_varnames) { - auto* out = local_scope->Var(name)->GetMutable(); + std::vector outs; + for (auto &name : send_varnames) { + auto *out = local_scope->Var(name)->GetMutable(); outs.push_back(out); } @@ -163,8 +163,8 @@ void send(const std::string& var_name, std::vector rets; for (size_t i = 0; i < send_varnames.size(); i++) { - auto& send_var_name = send_varnames[i]; - auto& endpoint = epmap[i]; + auto &send_var_name = send_varnames[i]; + auto &endpoint = epmap[i]; if (NeedSend(*local_scope, send_var_name)) { VLOG(3) << "sending " << send_var_name << " to " << endpoint; rets.push_back(rpc_client->AsyncSendVar(endpoint, cpu_ctx, *local_scope, @@ -183,6 +183,8 @@ void send(const std::string& var_name, delete local_scope; } +template struct ParameterSend; + }; // namespace distributed }; // namespace operators }; // namespace paddle diff --git a/paddle/fluid/operators/distributed/parameter_send.h b/paddle/fluid/operators/distributed/parameter_send.h index 6272cc5d25..1746377228 100644 --- a/paddle/fluid/operators/distributed/parameter_send.h +++ b/paddle/fluid/operators/distributed/parameter_send.h @@ -24,12 +24,14 @@ namespace operators { namespace distributed { template -void send(const std::string& var_name, - const std::vector& send_varnames, - const std::vector& epmap, - const std::vector& height_sections, - const framework::ExecutionContext& context, - const framework::Scope& scope, bool sync); +struct ParameterSend { + void operator()(const std::string &var_name, + const std::vector &send_varnames, + const std::vector &epmap, + const std::vector &height_sections, + const framework::ExecutionContext &context, + const framework::Scope &scope, bool sync); +}; }; // namespace 
distributed }; // namespace operators diff --git a/paddle/fluid/operators/distributed_ops/CMakeLists.txt b/paddle/fluid/operators/distributed_ops/CMakeLists.txt index a8bb597cbd..0eb30ce695 100644 --- a/paddle/fluid/operators/distributed_ops/CMakeLists.txt +++ b/paddle/fluid/operators/distributed_ops/CMakeLists.txt @@ -2,9 +2,9 @@ include(operators) set(DISTRIBUTE_DEPS "") if(WITH_GRPC) - set(DISTRIBUTE_DEPS sendrecvop_rpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf node) + set(DISTRIBUTE_DEPS sendrecvop_rpc parameter_send grpc++_unsecure grpc_unsecure gpr cares zlib protobuf node) else() - set(DISTRIBUTE_DEPS sendrecvop_rpc brpc leveldb snappystream snappy protobuf ssl crypto zlib node) + set(DISTRIBUTE_DEPS sendrecvop_rpc parameter_send brpc leveldb snappystream snappy protobuf ssl crypto zlib node) if(WITH_BRPC_RDMA) find_library(IBVERBS_LIBRARY NAMES ibverbs) ADD_LIBRARY(ibverbs SHARED IMPORTED GLOBAL) diff --git a/paddle/fluid/operators/distributed_ops/send_op.cc b/paddle/fluid/operators/distributed_ops/send_op.cc index 2136670103..e7ccaa83de 100644 --- a/paddle/fluid/operators/distributed_ops/send_op.cc +++ b/paddle/fluid/operators/distributed_ops/send_op.cc @@ -51,8 +51,9 @@ class SendOp : public framework::OperatorBase { platform::DeviceContextPool::Instance(); auto* dev_ctx = pool.Get(place); auto exe_ctx = framework::ExecutionContext(*this, scope, *dev_ctx, ctx); - distributed::send(ins[0], send_varnames, epmap, height_sections, - exe_ctx, scope, static_cast(sync_send)); + auto send_functor = distributed::ParameterSend(); + send_functor(ins[0], send_varnames, epmap, height_sections, exe_ctx, + scope, static_cast(sync_send)); } else { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); From 249f48e5397359696f1c2844473f4dcf55ce0ebe Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Tue, 29 Jan 2019 07:10:00 +0800 Subject: [PATCH 22/98] update test test=develop --- .../tests/unittests/test_async_ssa_graph_executor_mnist.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_async_ssa_graph_executor_mnist.py b/python/paddle/fluid/tests/unittests/test_async_ssa_graph_executor_mnist.py index 1104604970..41fa39e06b 100644 --- a/python/paddle/fluid/tests/unittests/test_async_ssa_graph_executor_mnist.py +++ b/python/paddle/fluid/tests/unittests/test_async_ssa_graph_executor_mnist.py @@ -175,8 +175,8 @@ class TestAsyncSSAGraphExecutor(unittest.TestCase): with fluid.program_guard( fluid.Program(), startup_program=fluid.Program()): test() - assert step_list[0] / 2 == step_list[1] - assert step_list[1] / 2 == step_list[2] + assert int(step_list[0] / 2) == int(step_list[1]) + assert int(step_list[1] / 2) == int(step_list[2]) if __name__ == "__main__": From b1fe8d45709e0d7d0dcde4e969b5fc4e833320c6 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Mon, 4 Feb 2019 09:48:00 +0800 Subject: [PATCH 23/98] add a check for async_ssa_graph_exe test=develop --- .../framework/details/async_ssa_graph_executor.cc | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/paddle/fluid/framework/details/async_ssa_graph_executor.cc b/paddle/fluid/framework/details/async_ssa_graph_executor.cc index e21d5fb96d..79b390dde4 100644 --- a/paddle/fluid/framework/details/async_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/async_ssa_graph_executor.cc @@ -30,6 +30,19 @@ AsyncSSAGraphExecutor::AsyncSSAGraphExecutor( VLOG(3) << "build AsyncSSAGraphExecutor"; PADDLE_ENFORCE_EQ(places_.size(), 
local_scopes_.size()); + if (strategy_.num_iteration_per_run_ > 1) { + int read_op_num = 0; + for (auto *node : graphs_[0]->Nodes()) { + if (node->IsOp() && node->Name() == "read") { + read_op_num++; + } + } + if (read_op_num == 0) { + LOG(WARNING) << "when num_iteration_per_run_ is larger then 1, the model " + "should use pyreader to feed data!"; + } + } + // set the correct size of thread pool to each device. strategy_.num_threads_ = strategy_.num_threads_ < places_.size() ? 1UL From 741b7cfda9e6b921fba69b7a6ed904a3b5406f02 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Mon, 4 Feb 2019 23:02:47 +0800 Subject: [PATCH 24/98] fix compile test=develop --- paddle/fluid/operators/distributed/parameter_send.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/paddle/fluid/operators/distributed/parameter_send.cc b/paddle/fluid/operators/distributed/parameter_send.cc index 38b64c3fcd..efe094fd1f 100644 --- a/paddle/fluid/operators/distributed/parameter_send.cc +++ b/paddle/fluid/operators/distributed/parameter_send.cc @@ -48,7 +48,6 @@ void ParameterSend::operator()(const std::string &var_name, platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto &cpu_ctx = *pool.Get(platform::CPUPlace()); - auto &actual_ctx = *pool.Get(ctx.GetPlace()); distributed::RPCClient *rpc_client = distributed::RPCClient::GetInstance( From 4356f186b4a3015ea1a2877e60f1d8a05fe5312d Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Wed, 6 Feb 2019 11:08:12 +0800 Subject: [PATCH 25/98] complete parameter_send --- .../operators/distributed/parameter_send.cc | 42 ++++++----------- .../operators/distributed_ops/send_op.cc | 2 +- .../fluid/tests/unittests/test_dist_base.py | 5 ++ .../fluid/transpiler/distribute_transpiler.py | 47 +++++++++++++------ 4 files changed, 54 insertions(+), 42 deletions(-) diff --git a/paddle/fluid/operators/distributed/parameter_send.cc b/paddle/fluid/operators/distributed/parameter_send.cc index efe094fd1f..47ca42c790 100644 --- a/paddle/fluid/operators/distributed/parameter_send.cc +++ b/paddle/fluid/operators/distributed/parameter_send.cc @@ -56,25 +56,13 @@ void ParameterSend::operator()(const std::string &var_name, auto *send_var = scope.FindVar(var_name); size_t out_num = send_varnames.size(); if (send_var->IsType()) { - auto &send_tensor = send_var->Get(); - auto &send_tensor_dims = send_tensor.dims(); - std::vector outs_dims; - outs_dims.reserve(out_num); - - // infer output shape - int num = ctx.Attr("num"); - if (num > 0) { - int64_t in_axis_dim = send_tensor_dims[0]; - PADDLE_ENFORCE_EQ(in_axis_dim % num, 0, - "tensor split does not result" - " in an equal division"); - size_t out_axis_dim = in_axis_dim / num; - for (size_t i = 0; i < out_num; ++i) { - auto dim = send_tensor_dims; - dim[0] = out_axis_dim; - outs_dims.push_back(dim); - } - } else if (height_sections.size() > 0) { + if (out_num > 1) { + auto &send_tensor = send_var->Get(); + auto &send_tensor_dims = send_tensor.dims(); + std::vector outs_dims; + outs_dims.reserve(out_num); + + // infer output shape PADDLE_ENFORCE_EQ(height_sections.size(), out_num, "tensor split sections size" "should be equal to output size."); @@ -83,15 +71,15 @@ void ParameterSend::operator()(const std::string &var_name, dim[0] = height_sections[i]; outs_dims.push_back(dim); } - } - // create output var in local scope - size_t row_offset = 0; - for (auto i = 0; i < out_num; ++i) { - auto *out = - local_scope->Var(send_varnames[i])->GetMutable(); - *out = send_tensor.Slice(row_offset, row_offset + outs_dims[i][0]); - row_offset 
+= outs_dims[i][0]; + // create output var in local scope + size_t row_offset = 0; + for (auto i = 0; i < out_num; ++i) { + auto *out = + local_scope->Var(send_varnames[i])->GetMutable(); + *out = send_tensor.Slice(row_offset, row_offset + outs_dims[i][0]); + row_offset += outs_dims[i][0]; + } } } else if (send_var->IsType()) { auto &send_slr = send_var->Get(); diff --git a/paddle/fluid/operators/distributed_ops/send_op.cc b/paddle/fluid/operators/distributed_ops/send_op.cc index e7ccaa83de..0f0ad6b8f9 100644 --- a/paddle/fluid/operators/distributed_ops/send_op.cc +++ b/paddle/fluid/operators/distributed_ops/send_op.cc @@ -42,7 +42,7 @@ class SendOp : public framework::OperatorBase { int sync_send = Attr("sync_mode"); auto send_varnames = Attr>("send_varnames"); - auto height_sections = Attr>("height_sections"); + auto height_sections = Attr>("sections"); if (send_varnames.size() > 0) { PADDLE_ENFORCE_EQ(ins.size(), 1, ""); diff --git a/python/paddle/fluid/tests/unittests/test_dist_base.py b/python/paddle/fluid/tests/unittests/test_dist_base.py index 0968ace62b..758c510dc7 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_base.py +++ b/python/paddle/fluid/tests/unittests/test_dist_base.py @@ -48,6 +48,7 @@ class TestDistRunnerBase(object): # NOTE: import fluid until runtime, or else forking processes will cause error. config = fluid.DistributeTranspilerConfig() config.enable_dc_asgd = dc_asgd + config.runtime_split_send_recv = True t = fluid.DistributeTranspiler(config=config) t.transpile( trainer_id=trainer_id, @@ -87,6 +88,9 @@ class TestDistRunnerBase(object): args.endpoints, args.trainers, args.sync_mode, args.dc_asgd) trainer_prog = t.get_trainer_program() + with open("/tmp/trainer." + str(args.trainer_id) + ".proto", + "w") as f: + f.write(str(trainer_prog)) elif args.update_method == "nccl2": # transpile for nccl2 config = fluid.DistributeTranspilerConfig() @@ -115,6 +119,7 @@ class TestDistRunnerBase(object): strategy.allow_op_delay = False build_stra = fluid.BuildStrategy() + build_stra.debug_graphviz_path = "/tmp/graph-" + str(args.trainer_id) if args.use_reduce: build_stra.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py index a3293afbbd..1b1b416593 100644 --- a/python/paddle/fluid/transpiler/distribute_transpiler.py +++ b/python/paddle/fluid/transpiler/distribute_transpiler.py @@ -156,6 +156,8 @@ class DistributeTranspilerConfig(object): mode = "pserver" print_log = False wait_port = True + # split the send recv var in runtime + runtime_split_send_recv = False class DistributeTranspiler(object): @@ -398,8 +400,10 @@ class DistributeTranspiler(object): orig_var = program.global_block().vars[splited_grad_varname] index = find_op_by_output_arg( program.global_block(), splited_grad_varname, reverse=True) - self._insert_split_op(program, orig_var, index, splited_vars) - index += 1 + if not self.config.runtime_split_send_recv: + self._insert_split_op(program, orig_var, index, + splited_vars) + index += 1 else: AssertionError("Can not insert the send op by original " "variable name :", splited_grad_varname) @@ -408,6 +412,17 @@ class DistributeTranspiler(object): name=framework.generate_control_dev_var_name()) self.grad_name_to_send_dummy_out[grad_varname] = dummy_output + if self.config.runtime_split_send_recv: + send_input_vars = [ + program.global_block().vars[splited_grad_varname] + ] + sections = 
self._get_splited_var_sections(splited_vars) + send_varnames = [var.name for var in splited_vars] + else: + send_input_vars = splited_vars + sections = [] + send_varnames = [] + # get send op_role_var, if not splited, the grad should have .trainer suffix # if splited, grad should be the original grad var name (split_by_ref and send # will be on the same place). ParallelExecutor @@ -415,10 +430,12 @@ class DistributeTranspiler(object): program.global_block()._insert_op( index=index + 1, type="send", - inputs={"X": splited_vars}, + inputs={"X": send_input_vars}, outputs={"Out": dummy_output}, attrs={ "epmap": eplist, + "sections": sections, + "send_varnames": send_varnames, RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE, OP_ROLE_VAR_ATTR_NAME: [ self.grad_name_to_param_name[grad_varname], @@ -1372,9 +1389,8 @@ class DistributeTranspiler(object): # create table param and grad var in pserver program # create table optimize block in pserver program table_opt_op = [ - op for op in self.optimize_ops - if 'Param' in op.input_names and op.input("Param")[0] == - self.table_name + op for op in self.optimize_ops if 'Param' in op.input_names and + op.input("Param")[0] == self.table_name ][0] origin_param_var = self.origin_program.global_block().vars[ @@ -1548,11 +1564,17 @@ class DistributeTranspiler(object): lod_level=var.lod_level, persistable=persistable) + @staticmethod + def _get_splited_var_sections(splited_vars): + height_sections = [] + for v in splited_vars: + height_sections.append(v.shape[0]) + return height_sections + def _insert_split_op(self, program, orig_var, index, splited_vars): + height_sections = self._get_splited_var_sections(splited_vars) + if orig_var.type == core.VarDesc.VarType.SELECTED_ROWS: - height_sections = [] - for v in splited_vars: - height_sections.append(v.shape[0]) sparse_param_name = self.grad_name_to_param_name[orig_var.name] if self._is_input_of_remote_sparse_update_op(sparse_param_name): self.sparse_param_to_height_sections[ @@ -1567,16 +1589,13 @@ class DistributeTranspiler(object): RPC_OP_ROLE_ATTR_NAME: DIST_OP_ROLE_ATTR_VALUE }) elif orig_var.type == core.VarDesc.VarType.LOD_TENSOR: - sections = [] - for v in splited_vars: - sections.append(v.shape[0]) program.global_block()._insert_op( index=index + 1, type="split_byref", inputs={"X": orig_var}, outputs={"Out": splited_vars}, attrs={ - "sections": sections, + "sections": height_sections, RPC_OP_ROLE_ATTR_NAME: DIST_OP_ROLE_ATTR_VALUE }) else: @@ -2048,7 +2067,7 @@ class DistributeTranspiler(object): Get optimizer operators, parameters and gradients from origin_program Returns: opt_ops (list): optimize operators. - params_grads (dict): paramter->gradient. + params_grads (dict): parameter->gradient. 
""" block = self.origin_program.global_block() opt_ops = [] From 5c36eb8b6962446e95840f775f87308d0df32ff6 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Wed, 6 Feb 2019 20:36:31 +0800 Subject: [PATCH 26/98] fix build --- paddle/fluid/operators/distributed/parameter_send.cc | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/operators/distributed/parameter_send.cc b/paddle/fluid/operators/distributed/parameter_send.cc index 47ca42c790..fd97926623 100644 --- a/paddle/fluid/operators/distributed/parameter_send.cc +++ b/paddle/fluid/operators/distributed/parameter_send.cc @@ -75,8 +75,8 @@ void ParameterSend::operator()(const std::string &var_name, // create output var in local scope size_t row_offset = 0; for (auto i = 0; i < out_num; ++i) { - auto *out = - local_scope->Var(send_varnames[i])->GetMutable(); + framework::Tensor *out = local_scope->Var(send_varnames[i]) + ->GetMutable(); *out = send_tensor.Slice(row_offset, row_offset + outs_dims[i][0]); row_offset += outs_dims[i][0]; } @@ -161,7 +161,8 @@ void ParameterSend::operator()(const std::string &var_name, } } - if (sync) { + // note!! only support sync send now + if (true || sync) { for (size_t i = 0; i < rets.size(); i++) { PADDLE_ENFORCE(rets[i]->Wait(), "internal error in RPCClient"); } From 5cf0092825a9625018e8856931cbdb8ff15b71a5 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Thu, 7 Feb 2019 14:19:21 +0800 Subject: [PATCH 27/98] add more log and fix test_dist_base in multi_batch_merge_pass --- paddle/fluid/framework/details/build_strategy.cc | 2 ++ paddle/fluid/framework/ir/pass.cc | 1 + python/paddle/fluid/tests/unittests/test_dist_base.py | 3 +-- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc index 51ce973272..ca9843057d 100644 --- a/paddle/fluid/framework/details/build_strategy.cc +++ b/paddle/fluid/framework/details/build_strategy.cc @@ -177,11 +177,13 @@ std::unique_ptr BuildStrategy::Apply( #else const bool use_cuda) const { #endif + VLOG(3) << "apply all passes"; // Create a default one if not finalized by user. 
CreatePassesFromStrategy(false); std::unique_ptr graph(new ir::Graph(main_program)); for (std::shared_ptr &pass : pass_builder_->AllPasses()) { + VLOG(3) << "apply " << pass->Type(); if (IsMultiDevPass(pass->Type())) { pass->Erase(kPlaces); pass->SetNotOwned>(kPlaces, &places); diff --git a/paddle/fluid/framework/ir/pass.cc b/paddle/fluid/framework/ir/pass.cc index 33ccee6aa0..823697495e 100644 --- a/paddle/fluid/framework/ir/pass.cc +++ b/paddle/fluid/framework/ir/pass.cc @@ -19,6 +19,7 @@ namespace paddle { namespace framework { namespace ir { std::unique_ptr Pass::Apply(std::unique_ptr graph) const { + VLOG(3) << "apply pass -> " << Type(); PADDLE_ENFORCE(graph.get(), "graph passed to Pass::Apply() cannot be empty."); for (const std::string& attr : required_pass_attrs_) { PADDLE_ENFORCE(attrs_.find(attr) != attrs_.end(), diff --git a/python/paddle/fluid/tests/unittests/test_dist_base.py b/python/paddle/fluid/tests/unittests/test_dist_base.py index 758c510dc7..98e6923c11 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_base.py +++ b/python/paddle/fluid/tests/unittests/test_dist_base.py @@ -128,8 +128,7 @@ class TestDistRunnerBase(object): if args.batch_merge_repeat > 1: pass_builder = build_stra._finalize_strategy_and_create_passes() - mypass = pass_builder.insert_pass( - len(pass_builder.all_passes()) - 3, "multi_batch_merge_pass") + mypass = pass_builder.insert_pass(0, "multi_batch_merge_pass") mypass.set("num_repeats", args.batch_merge_repeat) if args.update_method == "nccl2": From a0585d08ed42aa9caeefe1973549b6dd69d46823 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Thu, 7 Feb 2019 20:44:18 +0800 Subject: [PATCH 28/98] init parameter recv --- .../operators/distributed/CMakeLists.txt | 3 +- .../operators/distributed/parameter_recv.cc | 178 ++++++++++++++++++ .../operators/distributed/parameter_recv.h | 38 ++++ .../operators/distributed_ops/CMakeLists.txt | 4 +- .../operators/distributed_ops/recv_op.cc | 5 + 5 files changed, 225 insertions(+), 3 deletions(-) create mode 100644 paddle/fluid/operators/distributed/parameter_recv.cc create mode 100644 paddle/fluid/operators/distributed/parameter_recv.h diff --git a/paddle/fluid/operators/distributed/CMakeLists.txt b/paddle/fluid/operators/distributed/CMakeLists.txt index 03f47b594d..231f4b3bc4 100644 --- a/paddle/fluid/operators/distributed/CMakeLists.txt +++ b/paddle/fluid/operators/distributed/CMakeLists.txt @@ -30,7 +30,7 @@ if(WITH_GRPC) else() set(BRPC_SRCS brpc/brpc_client.cc brpc/brpc_server.cc brpc/brpc_sendrecvop_utils.cc brpc/brpc_variable_response.cc brpc/brpc_rdma_pool.cc) - set_source_files_properties(${BRPC_SRCS} parameter_prefetch.cc parameter_send.cc rpc_server_test.cc brpc/brpc_serde_test.cc collective_server.cc collective_server_test.cc collective_client.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + set_source_files_properties(${BRPC_SRCS} parameter_prefetch.cc parameter_send.cc parameter_recv.cc rpc_server_test.cc brpc/brpc_serde_test.cc collective_server.cc collective_server_test.cc collective_client.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set(BRPC_DEPS brpc ssl crypto protobuf leveldb snappystream snappy zlib) @@ -53,6 +53,7 @@ cc_test(rpc_server_test SRCS rpc_server_test.cc cc_test(varhandle_test SRCS varhandle_test.cc DEPS profiler scope) cc_library(parameter_prefetch SRCS parameter_prefetch.cc DEPS sendrecvop_rpc memory) cc_library(parameter_send SRCS parameter_send.cc DEPS sendrecvop_rpc memory) +cc_library(parameter_recv SRCS parameter_recv.cc DEPS sendrecvop_rpc memory) 
if(WITH_GPU) cc_test(collective_server_test SRCS collective_server_test.cc DEPS sendrecvop_rpc executor ${RPC_DEPS} diff --git a/paddle/fluid/operators/distributed/parameter_recv.cc b/paddle/fluid/operators/distributed/parameter_recv.cc new file mode 100644 index 0000000000..e5b486d121 --- /dev/null +++ b/paddle/fluid/operators/distributed/parameter_recv.cc @@ -0,0 +1,178 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include + +#include "paddle/fluid/operators/distributed/parameter_recv.h" + +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/tensor.h" + +#include "paddle/fluid/operators/distributed/distributed.h" +#include "paddle/fluid/operators/distributed/rpc_client.h" +#include "paddle/fluid/operators/distributed/variable_response.h" +#include "paddle/fluid/operators/distributed_ops/send_recv_util.h" + +namespace paddle { +namespace operators { +namespace distributed { + +using LoDTensor = framework::LoDTensor; +using LoDTensor = framework::LoDTensor; +using SelectedRows = framework::SelectedRows; +using DDim = framework::DDim; + +template +void ParameterRecv::operator()(const std::string &var_name, + const std::vector &send_varnames, + const std::vector &epmap, + const std::vector &height_sections, + const framework::ExecutionContext &ctx, + const framework::Scope &scope, bool sync) { + framework::Scope *local_scope = scope.NewTmpScope(); + + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &cpu_ctx = *pool.Get(platform::CPUPlace()); + + distributed::RPCClient *rpc_client = + distributed::RPCClient::GetInstance( + ctx.Attr("trainer_id")); + + auto *send_var = scope.FindVar(var_name); + size_t out_num = send_varnames.size(); + if (send_var->IsType()) { + if (out_num > 1) { + auto &send_tensor = send_var->Get(); + auto &send_tensor_dims = send_tensor.dims(); + std::vector outs_dims; + outs_dims.reserve(out_num); + + // infer output shape + PADDLE_ENFORCE_EQ(height_sections.size(), out_num, + "tensor split sections size" + "should be equal to output size."); + for (size_t i = 0; i < out_num; ++i) { + auto dim = send_tensor_dims; + dim[0] = height_sections[i]; + outs_dims.push_back(dim); + } + + // create output var in local scope + size_t row_offset = 0; + for (auto i = 0; i < out_num; ++i) { + framework::Tensor *out = local_scope->Var(send_varnames[i]) + ->GetMutable(); + *out = send_tensor.Slice(row_offset, row_offset + outs_dims[i][0]); + row_offset += outs_dims[i][0]; + } + } + } else if (send_var->IsType()) { + auto &send_slr = send_var->Get(); + auto abs_sections = ToAbsoluteSection(height_sections); + + auto send_rows = send_slr.rows(); + std::vector> outs_rows_idx; + std::vector> outs_dense_idx; + + outs_rows_idx.resize(out_num); + outs_dense_idx.resize(out_num); + + auto row_numel = send_slr.value().numel() / 
send_slr.value().dims()[0]; + auto src = send_slr.value().data(); + + // create output var in local scope + std::vector outs; + for (auto &name : send_varnames) { + auto *out = local_scope->Var(name)->GetMutable(); + outs.push_back(out); + } + + // split rows index into output sparse vars + for (size_t i = 0; i < send_rows.size(); ++i) { + int out_idx = FindOutIdx(send_rows[i], abs_sections); + outs_rows_idx[out_idx].push_back(send_rows[i]); + outs_dense_idx[out_idx].push_back(i); + } + auto place = ctx.GetPlace(); + + for (size_t i = 0; i < outs_rows_idx.size(); ++i) { + auto rows_idx = outs_rows_idx[i]; + outs[i]->set_height(height_sections[i]); + auto dims = send_slr.GetCompleteDims(); + dims[0] = rows_idx.size(); + outs[i]->mutable_value()->mutable_data(dims, send_slr.place()); + outs[i]->mutable_rows()->clear(); + if (rows_idx.size() > 0) { + for (auto idx : rows_idx) { + outs[i]->mutable_rows()->push_back(idx - abs_sections[i]); + } + auto dst = outs[i]->mutable_value()->mutable_data(ctx.GetPlace()); + for (size_t j = 0; j < rows_idx.size(); j++) { + if (platform::is_cpu_place(place)) { + memory::Copy( + platform::CPUPlace(), dst + j * row_numel, platform::CPUPlace(), + src + outs_dense_idx[i][j] * row_numel, sizeof(T) * row_numel); + } else { +#ifdef PADDLE_WITH_CUDA + auto stream = ctx.cuda_device_context().stream(); + memory::Copy(platform::CUDAPlace(), dst + j * row_numel, + platform::CUDAPlace(), + src + outs_dense_idx[i][j] * row_numel, + sizeof(T) * row_numel, stream); +#else + PADDLE_THROW("Paddle is not compiled with GPU"); +#endif + } + } + } + PADDLE_ENFORCE_EQ(rows_idx.size(), outs[i]->rows().size(), + "rows should has the same size with tensor dim 0"); + } + + } else { + PADDLE_THROW("unsupported var type to send!"); + } + + std::vector rets; + for (size_t i = 0; i < send_varnames.size(); i++) { + auto &send_var_name = send_varnames[i]; + auto &endpoint = epmap[i]; + if (NeedSend(*local_scope, send_var_name)) { + VLOG(3) << "sending " << send_var_name << " to " << endpoint; + rets.push_back(rpc_client->AsyncSendVar(endpoint, cpu_ctx, *local_scope, + send_var_name)); + } else { + VLOG(3) << "don't send non-initialized variable: " << send_varnames[i]; + } + } + + // note!! only support sync send now + if (true || sync) { + for (size_t i = 0; i < rets.size(); i++) { + PADDLE_ENFORCE(rets[i]->Wait(), "internal error in RPCClient"); + } + } + + delete local_scope; +} + +template struct ParameterRecv; + +}; // namespace distributed +}; // namespace operators +}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/parameter_recv.h b/paddle/fluid/operators/distributed/parameter_recv.h new file mode 100644 index 0000000000..817115e2d1 --- /dev/null +++ b/paddle/fluid/operators/distributed/parameter_recv.h @@ -0,0 +1,38 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include + +#include "paddle/fluid/framework/operator.h" + +namespace paddle { +namespace operators { +namespace distributed { + +template +struct ParameterRecv { + void operator()(const std::string &var_name, + const std::vector &send_varnames, + const std::vector &epmap, + const std::vector &height_sections, + const framework::ExecutionContext &context, + const framework::Scope &scope, bool sync); +}; + +}; // namespace distributed +}; // namespace operators +}; // namespace paddle diff --git a/paddle/fluid/operators/distributed_ops/CMakeLists.txt b/paddle/fluid/operators/distributed_ops/CMakeLists.txt index 0eb30ce695..3bcfc532e8 100644 --- a/paddle/fluid/operators/distributed_ops/CMakeLists.txt +++ b/paddle/fluid/operators/distributed_ops/CMakeLists.txt @@ -2,9 +2,9 @@ include(operators) set(DISTRIBUTE_DEPS "") if(WITH_GRPC) - set(DISTRIBUTE_DEPS sendrecvop_rpc parameter_send grpc++_unsecure grpc_unsecure gpr cares zlib protobuf node) + set(DISTRIBUTE_DEPS sendrecvop_rpc parameter_send parameter_recv grpc++_unsecure grpc_unsecure gpr cares zlib protobuf node) else() - set(DISTRIBUTE_DEPS sendrecvop_rpc parameter_send brpc leveldb snappystream snappy protobuf ssl crypto zlib node) + set(DISTRIBUTE_DEPS sendrecvop_rpc parameter_send parameter_recv brpc leveldb snappystream snappy protobuf ssl crypto zlib node) if(WITH_BRPC_RDMA) find_library(IBVERBS_LIBRARY NAMES ibverbs) ADD_LIBRARY(ibverbs SHARED IMPORTED GLOBAL) diff --git a/paddle/fluid/operators/distributed_ops/recv_op.cc b/paddle/fluid/operators/distributed_ops/recv_op.cc index 120c65f296..5e004a7a3c 100644 --- a/paddle/fluid/operators/distributed_ops/recv_op.cc +++ b/paddle/fluid/operators/distributed_ops/recv_op.cc @@ -110,6 +110,11 @@ This operator can get variables from server side. "for example: we need var named 'moment_1@127.0.0.1:1001', " "and it real name on parameter server is 'moment_1'. 
") .SetDefault({}); + AddAttr>( + "recv_varnames", + "(vector) " + "the splited parameter varnames to be recved from pserver") + .SetDefault(std::vector{}); } }; From a804a2ae2ada43244774cebc349b08b6bd65ecfd Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Fri, 8 Feb 2019 11:14:58 +0800 Subject: [PATCH 29/98] complete parameter recv --- .../operators/distributed/parameter_recv.cc | 141 ++++-------------- .../operators/distributed/parameter_recv.h | 5 +- 2 files changed, 34 insertions(+), 112 deletions(-) diff --git a/paddle/fluid/operators/distributed/parameter_recv.cc b/paddle/fluid/operators/distributed/parameter_recv.cc index e5b486d121..2664a89ed6 100644 --- a/paddle/fluid/operators/distributed/parameter_recv.cc +++ b/paddle/fluid/operators/distributed/parameter_recv.cc @@ -27,6 +27,7 @@ #include "paddle/fluid/operators/distributed/rpc_client.h" #include "paddle/fluid/operators/distributed/variable_response.h" #include "paddle/fluid/operators/distributed_ops/send_recv_util.h" +#include "paddle/fluid/operators/strided_memcpy.h" namespace paddle { namespace operators { @@ -39,11 +40,10 @@ using DDim = framework::DDim; template void ParameterRecv::operator()(const std::string &var_name, - const std::vector &send_varnames, + const std::vector &recv_varnames, const std::vector &epmap, - const std::vector &height_sections, const framework::ExecutionContext &ctx, - const framework::Scope &scope, bool sync) { + const framework::Scope &scope) { framework::Scope *local_scope = scope.NewTmpScope(); platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); @@ -53,118 +53,41 @@ void ParameterRecv::operator()(const std::string &var_name, distributed::RPCClient::GetInstance( ctx.Attr("trainer_id")); - auto *send_var = scope.FindVar(var_name); - size_t out_num = send_varnames.size(); - if (send_var->IsType()) { - if (out_num > 1) { - auto &send_tensor = send_var->Get(); - auto &send_tensor_dims = send_tensor.dims(); - std::vector outs_dims; - outs_dims.reserve(out_num); - - // infer output shape - PADDLE_ENFORCE_EQ(height_sections.size(), out_num, - "tensor split sections size" - "should be equal to output size."); - for (size_t i = 0; i < out_num; ++i) { - auto dim = send_tensor_dims; - dim[0] = height_sections[i]; - outs_dims.push_back(dim); - } - - // create output var in local scope - size_t row_offset = 0; - for (auto i = 0; i < out_num; ++i) { - framework::Tensor *out = local_scope->Var(send_varnames[i]) - ->GetMutable(); - *out = send_tensor.Slice(row_offset, row_offset + outs_dims[i][0]); - row_offset += outs_dims[i][0]; - } + auto *recv_var = scope.FindVar(var_name); + + std::vector recved_tensors; + + // recv all vars to local scope + if (recv_var->IsType()) { + std::vector rets; + for (size_t i = 0; i < recv_varnames.size(); i++) { + auto &recv_var_name = recv_varnames[i]; + framework::Tensor *t = + local_scope->Var(recv_var_name)->GetMutable(); + recved_tensors.push_back(t); + VLOG(3) << "recv " << recv_var_name << " from " << epmap[i]; + rets.push_back(rpc_client->AsyncGetVar(epmap[i], cpu_ctx, *local_scope, + recv_var_name, recv_var_name)); } - } else if (send_var->IsType()) { - auto &send_slr = send_var->Get(); - auto abs_sections = ToAbsoluteSection(height_sections); - - auto send_rows = send_slr.rows(); - std::vector> outs_rows_idx; - std::vector> outs_dense_idx; - - outs_rows_idx.resize(out_num); - outs_dense_idx.resize(out_num); - - auto row_numel = send_slr.value().numel() / send_slr.value().dims()[0]; - auto src = send_slr.value().data(); - - // create output 
var in local scope - std::vector outs; - for (auto &name : send_varnames) { - auto *out = local_scope->Var(name)->GetMutable(); - outs.push_back(out); - } - - // split rows index into output sparse vars - for (size_t i = 0; i < send_rows.size(); ++i) { - int out_idx = FindOutIdx(send_rows[i], abs_sections); - outs_rows_idx[out_idx].push_back(send_rows[i]); - outs_dense_idx[out_idx].push_back(i); - } - auto place = ctx.GetPlace(); - - for (size_t i = 0; i < outs_rows_idx.size(); ++i) { - auto rows_idx = outs_rows_idx[i]; - outs[i]->set_height(height_sections[i]); - auto dims = send_slr.GetCompleteDims(); - dims[0] = rows_idx.size(); - outs[i]->mutable_value()->mutable_data(dims, send_slr.place()); - outs[i]->mutable_rows()->clear(); - if (rows_idx.size() > 0) { - for (auto idx : rows_idx) { - outs[i]->mutable_rows()->push_back(idx - abs_sections[i]); - } - auto dst = outs[i]->mutable_value()->mutable_data(ctx.GetPlace()); - for (size_t j = 0; j < rows_idx.size(); j++) { - if (platform::is_cpu_place(place)) { - memory::Copy( - platform::CPUPlace(), dst + j * row_numel, platform::CPUPlace(), - src + outs_dense_idx[i][j] * row_numel, sizeof(T) * row_numel); - } else { -#ifdef PADDLE_WITH_CUDA - auto stream = ctx.cuda_device_context().stream(); - memory::Copy(platform::CUDAPlace(), dst + j * row_numel, - platform::CUDAPlace(), - src + outs_dense_idx[i][j] * row_numel, - sizeof(T) * row_numel, stream); -#else - PADDLE_THROW("Paddle is not compiled with GPU"); -#endif - } - } - } - PADDLE_ENFORCE_EQ(rows_idx.size(), outs[i]->rows().size(), - "rows should has the same size with tensor dim 0"); + for (size_t i = 0; i < rets.size(); i++) { + PADDLE_ENFORCE(rets[i]->Wait(), "internal error in RPCClient"); } - } else { PADDLE_THROW("unsupported var type to send!"); } - std::vector rets; - for (size_t i = 0; i < send_varnames.size(); i++) { - auto &send_var_name = send_varnames[i]; - auto &endpoint = epmap[i]; - if (NeedSend(*local_scope, send_var_name)) { - VLOG(3) << "sending " << send_var_name << " to " << endpoint; - rets.push_back(rpc_client->AsyncSendVar(endpoint, cpu_ctx, *local_scope, - send_var_name)); - } else { - VLOG(3) << "don't send non-initialized variable: " << send_varnames[i]; - } - } - - // note!! 
only support sync send now - if (true || sync) { - for (size_t i = 0; i < rets.size(); i++) { - PADDLE_ENFORCE(rets[i]->Wait(), "internal error in RPCClient"); + // concat recved tensor into one var + { + size_t output_offset = 0; + framework::Tensor *recv_tensor = + recv_var->GetMutable(); + for (auto *in : recved_tensors) { + auto in_stride = framework::stride_numel(in->dims()); + auto out_stride = framework::stride_numel(recv_tensor->dims()); + StridedNumelCopyWithAxis( + ctx.device_context(), 0, recv_tensor->data() + output_offset, + out_stride, in->data(), in_stride, in_stride[0]); + output_offset += in_stride[0]; } } diff --git a/paddle/fluid/operators/distributed/parameter_recv.h b/paddle/fluid/operators/distributed/parameter_recv.h index 817115e2d1..bc6f5f5adf 100644 --- a/paddle/fluid/operators/distributed/parameter_recv.h +++ b/paddle/fluid/operators/distributed/parameter_recv.h @@ -26,11 +26,10 @@ namespace distributed { template struct ParameterRecv { void operator()(const std::string &var_name, - const std::vector &send_varnames, + const std::vector &recv_varnames, const std::vector &epmap, - const std::vector &height_sections, const framework::ExecutionContext &context, - const framework::Scope &scope, bool sync); + const framework::Scope &scope); }; }; // namespace distributed From fbd186bd5d6dced8255607f9b6266cd438c564dc Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Fri, 8 Feb 2019 14:18:14 +0800 Subject: [PATCH 30/98] complete recv op --- .../operators/distributed_ops/recv_op.cc | 58 ++++++++++++------- .../fluid/transpiler/distribute_transpiler.py | 25 +++++--- 2 files changed, 53 insertions(+), 30 deletions(-) diff --git a/paddle/fluid/operators/distributed_ops/recv_op.cc b/paddle/fluid/operators/distributed_ops/recv_op.cc index 5e004a7a3c..a0185d66f0 100644 --- a/paddle/fluid/operators/distributed_ops/recv_op.cc +++ b/paddle/fluid/operators/distributed_ops/recv_op.cc @@ -20,6 +20,7 @@ limitations under the License. */ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/distributed/distributed.h" +#include "paddle/fluid/operators/distributed/parameter_recv.h" #include "paddle/fluid/platform/profiler.h" namespace paddle { @@ -48,32 +49,45 @@ class RecvOp : public framework::OperatorBase { distributed::RPCClient::GetInstance( Attr("trainer_id")); - if (with_barrier) { - std::vector rets; - for (size_t i = 0; i < outs.size(); i++) { - std::string varname = varnames.size() == 0 ? outs[i] : varnames[i]; - VLOG(4) << "recv " << outs[i] << " from " << epmap[i] << " with " - << varname << " and with AsyncGetVar"; - rets.push_back( - rpc_client->AsyncGetVar(epmap[i], ctx, scope, varname, outs[i])); - } - if (sync_mode) { + std::vector recv_varnames = + Attr>("recv_varnames"); + + if (recv_varnames.size() > 0) { + framework::RuntimeContext ctx(Inputs(), Outputs(), scope); + platform::DeviceContextPool &pool = + platform::DeviceContextPool::Instance(); + auto *dev_ctx = pool.Get(place); + auto exe_ctx = framework::ExecutionContext(*this, scope, *dev_ctx, ctx); + auto recv_functor = distributed::ParameterRecv(); + recv_functor(outs[0], recv_varnames, epmap, exe_ctx, scope); + } else { + if (with_barrier) { + std::vector rets; + for (size_t i = 0; i < outs.size(); i++) { + std::string varname = varnames.size() == 0 ? 
outs[i] : varnames[i]; + VLOG(4) << "recv " << outs[i] << " from " << epmap[i] << " with " + << varname << " and with AsyncGetVar"; + rets.push_back( + rpc_client->AsyncGetVar(epmap[i], ctx, scope, varname, outs[i])); + } + if (sync_mode) { + for (size_t i = 0; i < rets.size(); i++) { + PADDLE_ENFORCE(rets[i]->Wait(), "internal error in RPCClient"); + } + } + } else { + std::vector rets; + for (size_t i = 0; i < outs.size(); i++) { + std::string varname = varnames.size() == 0 ? outs[i] : varnames[i]; + VLOG(4) << "recv " << outs[i] << " from " << epmap[i] << " with " + << varname << " and with AsyncGetVarNoBarrier"; + rets.push_back(rpc_client->AsyncGetVarNoBarrier(epmap[i], ctx, scope, + varname, outs[i])); + } for (size_t i = 0; i < rets.size(); i++) { PADDLE_ENFORCE(rets[i]->Wait(), "internal error in RPCClient"); } } - } else { - std::vector rets; - for (size_t i = 0; i < outs.size(); i++) { - std::string varname = varnames.size() == 0 ? outs[i] : varnames[i]; - VLOG(4) << "recv " << outs[i] << " from " << epmap[i] << " with " - << varname << " and with AsyncGetVarNoBarrier"; - rets.push_back(rpc_client->AsyncGetVarNoBarrier(epmap[i], ctx, scope, - varname, outs[i])); - } - for (size_t i = 0; i < rets.size(); i++) { - PADDLE_ENFORCE(rets[i]->Wait(), "internal error in RPCClient"); - } } } }; diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py index 1b1b416593..ae7deda897 100644 --- a/python/paddle/fluid/transpiler/distribute_transpiler.py +++ b/python/paddle/fluid/transpiler/distribute_transpiler.py @@ -519,12 +519,20 @@ class DistributeTranspiler(object): param_varname, height_sections, eps, table_names) else: all_recv_outputs.extend(splited_var) + + recv_varnames = [] + if self.config.runtime_split_send_recv: + orig_param = program.global_block().vars[param_varname] + recv_varnames = [var.name for var in splited_vars] + splited_var = [orig_param] + program.global_block().append_op( type="recv", inputs={"X": [recv_dep_in]}, outputs={"Out": splited_var}, attrs={ "epmap": eps, + "recv_varnames": recv_varnames, "trainer_id": self.trainer_id, RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE, OP_ROLE_VAR_ATTR_NAME: @@ -549,14 +557,15 @@ class DistributeTranspiler(object): continue orig_param = program.global_block().vars[param_varname] if param_varname not in self.sparse_param_to_height_sections: - program.global_block().append_op( - type="concat", - inputs={"X": splited_var}, - outputs={"Out": [orig_param]}, - attrs={ - "axis": 0, - RPC_OP_ROLE_ATTR_NAME: DIST_OP_ROLE_ATTR_VALUE - }) + if not self.config.runtime_split_send_recv: + program.global_block().append_op( + type="concat", + inputs={"X": splited_var}, + outputs={"Out": [orig_param]}, + attrs={ + "axis": 0, + RPC_OP_ROLE_ATTR_NAME: DIST_OP_ROLE_ATTR_VALUE + }) self._get_trainer_startup_program(recv_vars=recv_vars, eplist=eplist) From 8bda4ab213c52871435fc6d74ef51d16b9f3235e Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Fri, 8 Feb 2019 18:22:50 +0800 Subject: [PATCH 31/98] parameter recv can run --- python/paddle/fluid/transpiler/distribute_transpiler.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py index ae7deda897..b9b0cd24eb 100644 --- a/python/paddle/fluid/transpiler/distribute_transpiler.py +++ b/python/paddle/fluid/transpiler/distribute_transpiler.py @@ -518,13 +518,12 @@ class DistributeTranspiler(object): 
self._update_remote_sparse_update_op( param_varname, height_sections, eps, table_names) else: - all_recv_outputs.extend(splited_var) - recv_varnames = [] if self.config.runtime_split_send_recv: orig_param = program.global_block().vars[param_varname] - recv_varnames = [var.name for var in splited_vars] + recv_varnames = [var.name for var in splited_var] splited_var = [orig_param] + all_recv_outputs.extend(splited_var) program.global_block().append_op( type="recv", From e72637ddd22765dd915119b96bc1821734cd28ef Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Sat, 9 Feb 2019 17:11:46 +0800 Subject: [PATCH 32/98] ThreadedSSAGraphExecutor support num_iteration_per_run test=develop --- .../details/async_ssa_graph_executor.cc | 16 ------------ .../details/threaded_ssa_graph_executor.cc | 25 +++++++++++++++++-- .../details/threaded_ssa_graph_executor.h | 1 + 3 files changed, 24 insertions(+), 18 deletions(-) diff --git a/paddle/fluid/framework/details/async_ssa_graph_executor.cc b/paddle/fluid/framework/details/async_ssa_graph_executor.cc index 79b390dde4..5ce92ad826 100644 --- a/paddle/fluid/framework/details/async_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/async_ssa_graph_executor.cc @@ -30,19 +30,6 @@ AsyncSSAGraphExecutor::AsyncSSAGraphExecutor( VLOG(3) << "build AsyncSSAGraphExecutor"; PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size()); - if (strategy_.num_iteration_per_run_ > 1) { - int read_op_num = 0; - for (auto *node : graphs_[0]->Nodes()) { - if (node->IsOp() && node->Name() == "read") { - read_op_num++; - } - } - if (read_op_num == 0) { - LOG(WARNING) << "when num_iteration_per_run_ is larger then 1, the model " - "should use pyreader to feed data!"; - } - } - // set the correct size of thread pool to each device. strategy_.num_threads_ = strategy_.num_threads_ < places_.size() ? 1UL @@ -69,9 +56,6 @@ FeedFetchList AsyncSSAGraphExecutor::Run( for (size_t i = 0; i < places_.size(); ++i) { auto call = [this, i, &fetch_tensors]() -> FeedFetchList { try { - for (size_t j = 0; j < strategy_.num_iteration_per_run_ - 1; ++j) { - executors_[i]->Run(fetch_tensors); - } return executors_[i]->Run(fetch_tensors); } catch (...) 
{ exception_holder_.Catch(std::current_exception()); diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc index 677a293794..16fa2a6db6 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc @@ -32,9 +32,22 @@ ThreadedSSAGraphExecutor::ThreadedSSAGraphExecutor( places_(places), fetch_ctxs_(places), running_ops_(0), - strategy_(strategy) {} + strategy_(strategy) { + if (strategy_.num_iteration_per_run_ > 1) { + int read_op_num = 0; + for (auto *node : graph_->Nodes()) { + if (node->IsOp() && node->Name() == "read") { + read_op_num++; + } + } + if (read_op_num == 0) { + LOG(WARNING) << "when num_iteration_per_run_ is larger then 1, the model " + "should use pyreader to feed data!"; + } + } +} -FeedFetchList ThreadedSSAGraphExecutor::Run( +inline FeedFetchList ThreadedSSAGraphExecutor::RunImpl( const std::vector &fetch_tensors) { std::unique_ptr event( new platform::RecordEvent("ThreadedSSAGraphExecutorPrepare", nullptr)); @@ -140,6 +153,14 @@ FeedFetchList ThreadedSSAGraphExecutor::Run( return fetch_data; } +FeedFetchList ThreadedSSAGraphExecutor::Run( + const std::vector &fetch_tensors) { + for (size_t j = 0; j < strategy_.num_iteration_per_run_ - 1; ++j) { + RunImpl({}); + } + return RunImpl(fetch_tensors); +} + void ThreadedSSAGraphExecutor::InsertFetchOps( const std::vector &fetch_tensors, std::vector *fetch_ops, diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h index 24da56c09e..3809b6e9ae 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h @@ -51,6 +51,7 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor { ~ThreadedSSAGraphExecutor() final = default; private: + inline FeedFetchList RunImpl(const std::vector &fetch_tensors); void RunOp(const std::shared_ptr> &ready_var_q, details::OpHandleBase *op); From 84367cf8bc4195d82dc1851d116980746f7c68b6 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Sun, 10 Feb 2019 19:58:50 +0800 Subject: [PATCH 33/98] support async mode in dist mode parallel executor --- .../details/multi_devices_graph_pass.cc | 35 ++++++++++++++++--- .../details/multi_devices_graph_pass.h | 12 +++---- 2 files changed, 36 insertions(+), 11 deletions(-) diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc index f1347e2b0d..a2bbfc91b7 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc @@ -167,6 +167,10 @@ std::unique_ptr MultiDevSSAGraphBuilderBase::ApplyImpl( bool is_forwarding = true; bool insert_collection_ops = NeedCollectiveOps(); + if (strategy_.async_mode_) { + // async mode did not need to merge gradient + insert_collection_ops = false; + } for (ir::Node *node : sorted_ops) { if (DealWithSpecialOp(&result, node)) { @@ -192,8 +196,22 @@ std::unique_ptr MultiDevSSAGraphBuilderBase::ApplyImpl( static_cast(boost::get(node->Op()->GetAttr( OpProtoAndCheckerMaker::OpRoleAttrName())) & static_cast(OpRole::kBackward)); + // optimize op is already processed in DealWithSpecialOp, + // here we only consider backward op if (!is_bk_op) continue; + /* + * the op that will generate the gradient of on parameter will have + one attr op_role_var + * to record the parameter 
and gradient, like: + attrs { + name: "op_role_var" + type: STRINGS + strings: "fc_1.b_0" + strings: "fc_1.b_0@GRAD" + } + */ + // Currently, we assume that once gradient is generated, it can be // broadcast, and each gradient is only broadcast once. auto backward_vars = @@ -204,7 +222,7 @@ std::unique_ptr MultiDevSSAGraphBuilderBase::ApplyImpl( for (size_t i = 0; i < backward_vars.size(); i += 2) { auto &p_name = backward_vars[i]; auto &g_name = backward_vars[i + 1]; - VLOG(10) << "Bcast " << g_name << " for parameter " << p_name; + VLOG(3) << "Bcast " << g_name << " for parameter " << p_name; InsertCollectiveOp(&result, p_name, g_name); } @@ -385,7 +403,7 @@ void MultiDevSSAGraphBuilderBase::CreateFusedBroadcastOp( void MultiDevSSAGraphBuilderBase::CreateComputationalOp(ir::Graph *result, ir::Node *node, - int dev_id) const { + size_t dev_id) const { result->Get(kGraphOps).emplace_back( new ComputationOpHandle(result->CreateOpNode(node->Op()), local_scopes_[dev_id], places_[dev_id], dev_id)); @@ -454,9 +472,8 @@ void MultiDevSSAGraphBuilderBase::CreateComputationalOps( } } -VarHandle *MultiDevSSAGraphBuilderBase::CreateReduceOp(ir::Graph *result, - const std::string &og, - int dst_dev_id) const { +VarHandle *MultiDevSSAGraphBuilderBase::CreateReduceOp( + ir::Graph *result, const std::string &og, size_t dst_dev_id) const { #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) result->Get(kGraphOps).emplace_back(new ReduceOpHandle( result->CreateEmptyNode("reduce", ir::Node::Type::kOperation), @@ -720,6 +737,10 @@ bool DistSSAGraphBuilder::DealWithSpecialOp(ir::Graph *result, ir::Node *node) const { bool insert_op = false; if (OpHaveRole(*node, OpRole::kRPC)) { + // in async_mode, each graph will send it's own gradient. + if (strategy_.async_mode_ && node->Op()->Type() == "send") { + return false; + } int op_dev_id = CreateRPCOp(result, node); PADDLE_ENFORCE(op_dev_id != -1, "Can not schedule the RPC operator to the right place."); @@ -737,6 +758,8 @@ bool DistSSAGraphBuilder::DealWithSpecialOp(ir::Graph *result, } else if (OpHaveRole(*node, OpRole::kDist)) { int op_dev_id = CreateDistTrainOp(result, node); if (node->Op()->Type() == "concat") { + // the input(block of parameter) of concat is on different device, + // the output(parameter) will on one device. auto origin_param_name = node->Op()->OutputArgumentNames()[0]; bcast_var_name_set_[op_dev_id].emplace(origin_param_name); } @@ -744,6 +767,7 @@ bool DistSSAGraphBuilder::DealWithSpecialOp(ir::Graph *result, } else { int op_dev_id = GetOpDeviceID(node); if (op_dev_id != -1) { // This op only runs on one specific device. + // optimize op will be processed here. 
CreateComputationalOp(result, node, op_dev_id); for (ir::Node *n : node->outputs) { sharded_var_device_.emplace(n->Name(), op_dev_id); @@ -905,6 +929,7 @@ int DistSSAGraphBuilder::CreateDistTrainOp(ir::Graph *result, void DistSSAGraphBuilder::InsertCollectiveOp(ir::Graph *result, const std::string &p_name, const std::string &g_name) const { + // collective gradient to each device size_t cur_device_id = 0; switch (strategy_.reduce_) { case BuildStrategy::ReduceStrategy::kReduce: diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.h b/paddle/fluid/framework/details/multi_devices_graph_pass.h index e91397816c..377ba50fcc 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.h +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.h @@ -68,10 +68,10 @@ class MultiDevSSAGraphBuilderBase : public ir::Pass { proto::VarType::Type dtype) const; VarHandle *CreateReduceOp(ir::Graph *result, const std::string &og, - int dst_dev_id) const; + size_t dst_dev_id) const; void CreateComputationalOp(ir::Graph *result, ir::Node *node, - int dev_id) const; + size_t dev_id) const; bool IsSparseGradient(const std::string &og) const; @@ -118,16 +118,16 @@ class AllReduceSSAGraphBuilder : public MultiDevSSAGraphBuilderBase { class AsyncSSAGraphBuilder : public MultiDevSSAGraphBuilderBase { protected: - virtual void InsertCollectiveOp(ir::Graph *result, const std::string &p_name, - const std::string &g_name) const {} + void InsertCollectiveOp(ir::Graph *result, const std::string &p_name, + const std::string &g_name) const override {} bool NeedCollectiveOps() const override { return false; } - virtual bool DealWithSpecialOp(ir::Graph *result, ir::Node *node) const { + bool DealWithSpecialOp(ir::Graph *result, ir::Node *node) const override { return false; } - virtual void InsertPostprocessOps(ir::Graph *result) const {} + void InsertPostprocessOps(ir::Graph *result) const override {} }; class BalanceVarSSAGraphBuilder : public MultiDevSSAGraphBuilderBase { From c4ded17e8cbcbf33e68145c1a4ffe777582bf3ab Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Mon, 11 Feb 2019 09:19:48 +0800 Subject: [PATCH 34/98] async mode support dist train --- paddle/fluid/framework/details/build_strategy.cc | 6 +++--- paddle/fluid/framework/details/multi_devices_graph_pass.cc | 7 ++++++- paddle/fluid/framework/parallel_executor.cc | 2 +- 3 files changed, 10 insertions(+), 5 deletions(-) diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc index a286cb30a2..e917395259 100644 --- a/paddle/fluid/framework/details/build_strategy.cc +++ b/paddle/fluid/framework/details/build_strategy.cc @@ -133,10 +133,10 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { void AppendMultiDevPass(const BuildStrategy &strategy) { ir::Pass *multi_devices_pass; - if (strategy_.async_mode_) { - multi_devices_pass = AppendPass("async_multi_devices_pass").get(); - } else if (strategy_.is_distribution_) { + if (strategy_.is_distribution_) { multi_devices_pass = AppendPass("dist_multi_devices_pass").get(); + } else if (strategy_.async_mode_) { + multi_devices_pass = AppendPass("async_multi_devices_pass").get(); } else { if (strategy.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce) { multi_devices_pass = diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc index a2bbfc91b7..572d374b50 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc +++ 
b/paddle/fluid/framework/details/multi_devices_graph_pass.cc @@ -756,6 +756,11 @@ bool DistSSAGraphBuilder::DealWithSpecialOp(ir::Graph *result, insert_op = true; need_broadcast_var_ = true; } else if (OpHaveRole(*node, OpRole::kDist)) { + // in async_mode, each graph will send it's own gradient, do not need to + // merge gradient. + if (strategy_.async_mode_ && node->Op()->Type() != "concat") { + return false; + } int op_dev_id = CreateDistTrainOp(result, node); if (node->Op()->Type() == "concat") { // the input(block of parameter) of concat is on different device, @@ -827,7 +832,7 @@ int DistSSAGraphBuilder::CreateRPCOp(ir::Graph *result, ir::Node *node) const { } auto recv_param_grad = boost::get>( node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName())); - if (recv_param_grad.size() == 2U) { + if (recv_param_grad.size() == 2U && !strategy_.async_mode_) { op_dev_id = GetVarDeviceID(recv_param_grad[1]); VLOG(10) << "recv param " << recv_param_grad[0] << " get grad place: " << recv_param_grad[1] diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index f0bc3acccc..c85fe4f200 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -283,7 +283,7 @@ ParallelExecutor::ParallelExecutor( graphs.push_back(std::move(graph)); } #else - if (build_strategy.async_mode_) { + if (build_strategy.async_mode_ && !build_strategy.is_distribution_) { for (size_t i = 0; i < member_->places_.size(); ++i) { std::unique_ptr graph = build_strategy.Apply( main_program, {member_->places_[i]}, loss_var_name, From 2171aa77f100b53c59b8dfd615f2a7ebcf447b77 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Mon, 11 Feb 2019 09:29:36 +0800 Subject: [PATCH 35/98] async ssa exe only support local mode --- paddle/fluid/framework/parallel_executor.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index c85fe4f200..e8531cd8d8 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -333,7 +333,7 @@ ParallelExecutor::ParallelExecutor( "please don't pass loss_var_name."; } } - if (build_strategy.async_mode_) { + if (build_strategy.async_mode_ && !build_strategy.is_distribution_) { VLOG(3) << "use AsyncSSAGraphExecutor"; member_->executor_.reset(new details::AsyncSSAGraphExecutor( exec_strategy, member_->local_scopes_, member_->places_, From 9465c3d0c393f7e7c5665f561433ca65e193396c Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Thu, 21 Feb 2019 16:28:38 +0800 Subject: [PATCH 36/98] fix compile problem --- paddle/fluid/framework/parallel_executor.cc | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index dfadfb57db..67ccf04d05 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -21,8 +21,8 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/ir/graph.h" -#include "paddle/fluid/framework/details/async_ssa_graph_executor.h" #include "paddle/fluid/framework/details/all_reduce_deps_pass.h" +#include "paddle/fluid/framework/details/async_ssa_graph_executor.h" #include "paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h" #include "paddle/fluid/framework/details/multi_devices_helper.h" #include "paddle/fluid/framework/details/parallel_ssa_graph_executor.h" @@ -260,6 +260,7 @@ ParallelExecutor::ParallelExecutor( // Step 2. Convert main_program to SSA form and dependency graph. Also, insert // ncclOp std::unique_ptr graph; + std::vector> graphs; #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) graph = build_strategy.Apply(main_program, member_->places_, loss_var_name, member_->local_scopes_, member_->nranks_, @@ -273,10 +274,9 @@ ParallelExecutor::ParallelExecutor( graphs.push_back(std::move(graph)); } } else { - std::unique_ptr graph = build_strategy.Apply( - main_program, member_->places_, loss_var_name, member_->local_scopes_, - member_->nranks_, member_->use_cuda_); - graphs.push_back(std::move(graph)); + graph = build_strategy.Apply(main_program, member_->places_, loss_var_name, + member_->local_scopes_, member_->nranks_, + member_->use_cuda_); } #endif auto max_memory_size = GetEagerDeletionThreshold(); From 7f3be09045e349ef9028337083604c1d3a126169 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Thu, 21 Feb 2019 17:08:56 +0800 Subject: [PATCH 37/98] fix multi graph test=develop --- .../fluid/framework/details/build_strategy.cc | 1 + paddle/fluid/framework/parallel_executor.cc | 46 +++++++++++-------- 2 files changed, 27 insertions(+), 20 deletions(-) diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc index 1b0ec02910..e5c108f890 100644 --- a/paddle/fluid/framework/details/build_strategy.cc +++ b/paddle/fluid/framework/details/build_strategy.cc @@ -249,6 +249,7 @@ std::unique_ptr BuildStrategy::Apply( graph = pass->Apply(std::move(graph)); VLOG(3) << "Finish Apply Pass " << pass->Type(); } + VLOG(3) << "All Passes Applied"; return graph; } diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 67ccf04d05..ecae729124 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -259,14 +259,15 @@ ParallelExecutor::ParallelExecutor( // Step 2. Convert main_program to SSA form and dependency graph. 
Also, insert // ncclOp - std::unique_ptr graph; std::vector> graphs; #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) - graph = build_strategy.Apply(main_program, member_->places_, loss_var_name, - member_->local_scopes_, member_->nranks_, - member_->use_cuda_, member_->nccl_ctxs_.get()); + std::unique_ptr graph = build_strategy.Apply( + main_program, member_->places_, loss_var_name, member_->local_scopes_, + member_->nranks_, member_->use_cuda_, member_->nccl_ctxs_.get()); + graphs.push_back(std::move(graph)); #else if (build_strategy.async_mode_ && !build_strategy.is_distribution_) { + VLOG(3) << "use local async mode"; for (size_t i = 0; i < member_->places_.size(); ++i) { std::unique_ptr graph = build_strategy.Apply( main_program, {member_->places_[i]}, loss_var_name, @@ -274,39 +275,44 @@ ParallelExecutor::ParallelExecutor( graphs.push_back(std::move(graph)); } } else { - graph = build_strategy.Apply(main_program, member_->places_, loss_var_name, - member_->local_scopes_, member_->nranks_, - member_->use_cuda_); + std::unique_ptr graph = build_strategy.Apply( + main_program, member_->places_, loss_var_name, member_->local_scopes_, + member_->nranks_, member_->use_cuda_); + graphs.push_back(std::move(graph)); } #endif auto max_memory_size = GetEagerDeletionThreshold(); VLOG(10) << "Eager Deletion Threshold " << static_cast(max_memory_size) / (1 << 30); if (max_memory_size >= 0) { - graph = member_->PrepareGCAndRefCnts(std::move(graph), - static_cast(max_memory_size)); + for (size_t i = 0; i < graphs.size(); ++i) { + graphs[i] = member_->PrepareGCAndRefCnts( + std::move(graphs[i]), static_cast(max_memory_size)); + } } // Step 3. Create vars in each scope. Passes may also create new vars. // skip control vars and empty vars std::vector var_infos; - for (auto &node : graph->Nodes()) { - if (node->IsVar() && !node->IsCtrlVar() && node->Var()) { - var_infos.emplace_back(); - var_infos.back().name_ = node->Var()->Name(); - var_infos.back().type_ = node->Var()->GetType(); - var_infos.back().persistable_ = node->Var()->Persistable(); + for (auto &graph : graphs) { + for (auto &node : graph->Nodes()) { + if (node->IsVar() && !node->IsCtrlVar() && node->Var()) { + var_infos.emplace_back(); + var_infos.back().name_ = node->Var()->Name(); + var_infos.back().type_ = node->Var()->GetType(); + var_infos.back().persistable_ = node->Var()->Persistable(); + } } } // If the loss_var_name is given, the number of graph should be only one. if (loss_var_name.size()) { - size_t graph_num = ir::GraphNum(*graph); + size_t graph_num = ir::GraphNum(*graphs[0]); if (graph_num > 1) { LOG(WARNING) << "The number of graph should be only one, " "but the current graph has " - << ir::GraphNum(*graph) + << ir::GraphNum(*graphs[0]) << " sub_graphs. If you want to see the nodes of the " "sub_graphs, you should use 'FLAGS_print_sub_graph_dir' " "to specify the output dir. NOTES: if you not do training, " @@ -326,7 +332,7 @@ ParallelExecutor::ParallelExecutor( // allreduce_seq_pass doesn't need it as the attr. 
member_->executor_.reset(new details::ParallelSSAGraphExecutor( exec_strategy, member_->local_scopes_, member_->places_, main_program, - std::move(graph))); + std::move(graphs[0]))); #else PADDLE_THROW( "Paddle should be compiled with CUDA for ParallelGraph Execution."); @@ -336,12 +342,12 @@ ParallelExecutor::ParallelExecutor( VLOG(3) << "use ThreadedSSAGraphExecutor"; member_->executor_.reset(new details::ThreadedSSAGraphExecutor( exec_strategy, member_->local_scopes_, member_->places_, - std::move(graph))); + std::move(graphs[0]))); } else { VLOG(3) << "use FastThreadedSSAGraphExecutor"; member_->executor_.reset(new details::FastThreadedSSAGraphExecutor( exec_strategy, member_->local_scopes_, member_->places_, - std::move(graph))); + std::move(graphs[0]))); } } From 12f6b8c3d623d166e77b77eb11837783ffc5fe42 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Thu, 21 Feb 2019 18:23:31 +0800 Subject: [PATCH 38/98] change the include of ThreadPool.h test=develop --- paddle/fluid/framework/details/threaded_ssa_graph_executor.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h index 3809b6e9ae..ae9cb1ebca 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h @@ -21,8 +21,8 @@ #include #include +#include // ThreadPool in thrird party #include -#include "ThreadPool.h" // ThreadPool in thrird party #include "paddle/fluid/framework/blocking_queue.h" #include "paddle/fluid/framework/details/exception_holder.h" #include "paddle/fluid/framework/details/execution_strategy.h" From f4f4816b0c1ffdf7689523f732cd728c196e5aff Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Fri, 22 Feb 2019 16:26:50 +0800 Subject: [PATCH 39/98] fix gpu error test=develop --- .../details/async_ssa_graph_executor.cc | 1 + paddle/fluid/framework/parallel_executor.cc | 19 +++++++++++++++---- 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/framework/details/async_ssa_graph_executor.cc b/paddle/fluid/framework/details/async_ssa_graph_executor.cc index 5ce92ad826..0780fb040a 100644 --- a/paddle/fluid/framework/details/async_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/async_ssa_graph_executor.cc @@ -29,6 +29,7 @@ AsyncSSAGraphExecutor::AsyncSSAGraphExecutor( graphs_(std::move(graphs)) { VLOG(3) << "build AsyncSSAGraphExecutor"; PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size()); + PADDLE_ENFORCE_EQ(graphs_.size, local_scopes_.size()); // set the correct size of thread pool to each device. 
strategy_.num_threads_ = strategy_.num_threads_ < places_.size() diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index ecae729124..cfd6609a4b 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -261,10 +261,21 @@ ParallelExecutor::ParallelExecutor( // ncclOp std::vector> graphs; #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) - std::unique_ptr graph = build_strategy.Apply( - main_program, member_->places_, loss_var_name, member_->local_scopes_, - member_->nranks_, member_->use_cuda_, member_->nccl_ctxs_.get()); - graphs.push_back(std::move(graph)); + if (build_strategy.async_mode_ && !build_strategy.is_distribution_) { + VLOG(3) << "use local async mode"; + for (size_t i = 0; i < member_->places_.size(); ++i) { + std::unique_ptr graph = build_strategy.Apply( + main_program, {member_->places_[i]}, loss_var_name, + {member_->local_scopes_[i]}, member_->nranks_, member_->use_cuda_, + member_->nccl_ctxs_.get()); + graphs.push_back(std::move(graph)); + } + } else { + std::unique_ptr graph = build_strategy.Apply( + main_program, member_->places_, loss_var_name, member_->local_scopes_, + member_->nranks_, member_->use_cuda_, member_->nccl_ctxs_.get()); + graphs.push_back(std::move(graph)); + } #else if (build_strategy.async_mode_ && !build_strategy.is_distribution_) { VLOG(3) << "use local async mode"; From ecedd531c1ba9b68a1f24bce9b7b98ced67cc128 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Fri, 22 Feb 2019 16:37:40 +0800 Subject: [PATCH 40/98] fix code bug test=develop --- paddle/fluid/framework/details/async_ssa_graph_executor.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/framework/details/async_ssa_graph_executor.cc b/paddle/fluid/framework/details/async_ssa_graph_executor.cc index 0780fb040a..a584b3a708 100644 --- a/paddle/fluid/framework/details/async_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/async_ssa_graph_executor.cc @@ -29,7 +29,7 @@ AsyncSSAGraphExecutor::AsyncSSAGraphExecutor( graphs_(std::move(graphs)) { VLOG(3) << "build AsyncSSAGraphExecutor"; PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size()); - PADDLE_ENFORCE_EQ(graphs_.size, local_scopes_.size()); + PADDLE_ENFORCE_EQ(graphs_.size(), local_scopes_.size()); // set the correct size of thread pool to each device. 
strategy_.num_threads_ = strategy_.num_threads_ < places_.size() From b5b8e6cc9c0b219d9fea2c43944798509f035d04 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Sat, 23 Feb 2019 09:28:56 +0800 Subject: [PATCH 41/98] revert the change of scope test=develop --- paddle/fluid/framework/scope.cc | 27 --------------------------- paddle/fluid/framework/scope.h | 1 - 2 files changed, 28 deletions(-) diff --git a/paddle/fluid/framework/scope.cc b/paddle/fluid/framework/scope.cc index 4fe843dde9..87f0f307d3 100644 --- a/paddle/fluid/framework/scope.cc +++ b/paddle/fluid/framework/scope.cc @@ -255,32 +255,5 @@ std::string GenScopeTreeDebugInfo(Scope* root) { return os.str(); } -std::string GenParentScopeTreeDebugInfo(Scope* leaf) { - std::stringstream os; - - if (!leaf) return ""; - - // level traversal - std::vector scopes; - const Scope* current_scope = leaf; - - while (current_scope != nullptr) { - scopes.push_back(current_scope); - current_scope = current_scope->parent(); - } - - os << "\n--------------GenParentScopeTreeDebugInfo--------------\n"; - - for (int i = scopes.size() - 1; i >= 0; --i) { - os << "=======level [" << i << "]=======\n"; - os << scopes[i] << ":\n"; - for (auto& var : scopes[i]->LocalVarNames()) { - os << " - " << var << "\n"; - } - } - - return os.str(); -} - } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/scope.h b/paddle/fluid/framework/scope.h index eb5c12def6..f0915d2eee 100644 --- a/paddle/fluid/framework/scope.h +++ b/paddle/fluid/framework/scope.h @@ -144,7 +144,6 @@ class Scope { // Generate some debug string about the inherience structure of scope, quite // naive. std::string GenScopeTreeDebugInfo(Scope*); -std::string GenParentScopeTreeDebugInfo(Scope*); } // namespace framework } // namespace paddle From 10393dd0d16e57203b8cb039174cff97b6efbc89 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Mon, 25 Feb 2019 10:09:25 +0800 Subject: [PATCH 42/98] add some check test=develop --- paddle/fluid/framework/parallel_executor.cc | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index cfd6609a4b..8236773672 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -217,6 +217,11 @@ ParallelExecutor::ParallelExecutor( } } + if (build_strategy.async_mode_) { + PADDLE_ENFORCE(!member_->use_cuda_, + "gpu mode does not support async_mode_ now!"); + } + // FIXME(Yancey1989): parallel graph mode get better performance // in GPU allreduce distributed training. Need an elegant way to // choice the execution strategy. 
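[Editor's illustration] The executor work in the surrounding patches settles on a simple fan-out pattern: a single SSA graph shared by all workers (see the "use one graph" change below), one local scope and one CPU place per worker, each worker running the graph independently, and any exception being recorded during the run and re-thrown only after every worker has finished. The following minimal C++ sketch shows that pattern using the standard library only; Graph, Scope, Place and RunOneGraph are illustrative stand-ins, not Paddle types or APIs, and details such as fetch-result merging, the thread pool and ExceptionHolder are deliberately simplified.

// Minimal sketch of the per-place asynchronous fan-out pattern.
// NOT Paddle code: all names below are stand-ins.
#include <exception>
#include <future>
#include <iostream>
#include <mutex>
#include <string>
#include <vector>

struct Graph { std::string name; };  // shared, read-only graph
struct Scope { int id; };            // one local scope per worker
struct Place { int device_id; };     // one CPU place per worker

// Stand-in for running the shared graph on a single place.
std::string RunOneGraph(const Graph& g, Scope* scope, const Place& place) {
  return g.name + " ran on place " + std::to_string(place.device_id) +
         " with scope " + std::to_string(scope->id);
}

int main() {
  Graph graph{"ssa_graph"};
  std::vector<Scope> scopes{{0}, {1}, {2}};
  std::vector<Place> places{{0}, {1}, {2}};

  std::vector<std::future<std::string>> futures;
  std::exception_ptr first_exception;  // plays the role of ExceptionHolder
  std::mutex ex_mu;

  // Fan out: one asynchronous run per place, all sharing the same graph.
  for (size_t i = 0; i < places.size(); ++i) {
    futures.emplace_back(
        std::async(std::launch::async, [&, i]() -> std::string {
          try {
            return RunOneGraph(graph, &scopes[i], places[i]);
          } catch (...) {
            std::lock_guard<std::mutex> lock(ex_mu);
            if (!first_exception) first_exception = std::current_exception();
            return std::string();
          }
        }));
  }

  // Join every worker first, then surface the first captured exception.
  std::vector<std::string> results;
  for (auto& f : futures) results.emplace_back(f.get());
  if (first_exception) std::rethrow_exception(first_exception);

  for (const auto& r : results) std::cout << r << "\n";
  return 0;
}

Recording exceptions inside the worker and re-throwing them from the caller, rather than letting a worker throw across threads, mirrors the Catch/ReThrow split visible in the executor diffs above and keeps sibling workers from being torn down mid-iteration.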
From 43c82376cba493bf622d452741c395da275f0a1b Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Mon, 25 Feb 2019 22:39:34 +0800 Subject: [PATCH 43/98] use one graph --- .../details/async_ssa_graph_executor.cc | 7 +- .../details/async_ssa_graph_executor.h | 6 +- paddle/fluid/framework/parallel_executor.cc | 66 ++++++++----------- 3 files changed, 33 insertions(+), 46 deletions(-) diff --git a/paddle/fluid/framework/details/async_ssa_graph_executor.cc b/paddle/fluid/framework/details/async_ssa_graph_executor.cc index a584b3a708..b6d1ee5073 100644 --- a/paddle/fluid/framework/details/async_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/async_ssa_graph_executor.cc @@ -21,15 +21,14 @@ namespace details { AsyncSSAGraphExecutor::AsyncSSAGraphExecutor( const ExecutionStrategy &strategy, const std::vector &local_scopes, const std::vector &places, - std::vector> &&graphs) + std::unique_ptr &&graph) : strategy_(std::move(strategy)), local_scopes_(std::move(local_scopes)), pool_(places.size() >= 2 ? new ::ThreadPool(places.size()) : nullptr), places_(std::move(places)), - graphs_(std::move(graphs)) { + graph_(std::move(graph)) { VLOG(3) << "build AsyncSSAGraphExecutor"; PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size()); - PADDLE_ENFORCE_EQ(graphs_.size(), local_scopes_.size()); // set the correct size of thread pool to each device. strategy_.num_threads_ = strategy_.num_threads_ < places_.size() @@ -39,7 +38,7 @@ AsyncSSAGraphExecutor::AsyncSSAGraphExecutor( << " to run the operators of the graph on each device."; for (size_t i = 0; i < places.size(); ++i) { executors_.emplace_back(new details::ThreadedSSAGraphExecutor( - strategy_, {local_scopes_[i]}, {places_[i]}, std::move(graphs_[i]))); + strategy_, {local_scopes_[i]}, {places_[i]}, graph_.get())); } } diff --git a/paddle/fluid/framework/details/async_ssa_graph_executor.h b/paddle/fluid/framework/details/async_ssa_graph_executor.h index 4091c56d74..50f207361f 100644 --- a/paddle/fluid/framework/details/async_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/async_ssa_graph_executor.h @@ -29,9 +29,9 @@ class AsyncSSAGraphExecutor : public SSAGraphExecutor { AsyncSSAGraphExecutor(const ExecutionStrategy &strategy, const std::vector &local_scopes, const std::vector &places, - std::vector> &&graphs); + std::unique_ptr &&graph); ~AsyncSSAGraphExecutor() final = default; - const ir::Graph &Graph() const override { return *graphs_[0]; } + const ir::Graph &Graph() const override { return *graph_; } FeedFetchList Run(const std::vector &fetch_tensors) override; @@ -40,7 +40,7 @@ class AsyncSSAGraphExecutor : public SSAGraphExecutor { std::vector local_scopes_; std::unique_ptr<::ThreadPool> pool_{nullptr}; std::vector places_; - std::vector> graphs_; + std::unique_ptr graph_; std::vector> executors_; ExceptionHolder exception_holder_; diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 8236773672..129d3a7f0d 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -264,71 +264,59 @@ ParallelExecutor::ParallelExecutor( // Step 2. Convert main_program to SSA form and dependency graph. 
Also, insert // ncclOp - std::vector> graphs; + std::unique_ptr graph; #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) if (build_strategy.async_mode_ && !build_strategy.is_distribution_) { VLOG(3) << "use local async mode"; - for (size_t i = 0; i < member_->places_.size(); ++i) { - std::unique_ptr graph = build_strategy.Apply( - main_program, {member_->places_[i]}, loss_var_name, - {member_->local_scopes_[i]}, member_->nranks_, member_->use_cuda_, - member_->nccl_ctxs_.get()); - graphs.push_back(std::move(graph)); - } + graph = + build_strategy.Apply(main_program, {member_->places_[0]}, loss_var_name, + {member_->local_scopes_[0]}, member_->nranks_, + member_->use_cuda_, member_->nccl_ctxs_.get()); } else { - std::unique_ptr graph = build_strategy.Apply( - main_program, member_->places_, loss_var_name, member_->local_scopes_, - member_->nranks_, member_->use_cuda_, member_->nccl_ctxs_.get()); - graphs.push_back(std::move(graph)); + graph = build_strategy.Apply(main_program, member_->places_, loss_var_name, + member_->local_scopes_, member_->nranks_, + member_->use_cuda_, member_->nccl_ctxs_.get()); } #else if (build_strategy.async_mode_ && !build_strategy.is_distribution_) { VLOG(3) << "use local async mode"; - for (size_t i = 0; i < member_->places_.size(); ++i) { - std::unique_ptr graph = build_strategy.Apply( - main_program, {member_->places_[i]}, loss_var_name, - {member_->local_scopes_[i]}, member_->nranks_, member_->use_cuda_); - graphs.push_back(std::move(graph)); - } + graph = build_strategy.Apply(main_program, {member_->places_[0]}, + loss_var_name, {member_->local_scopes_[0]}, + member_->nranks_, member_->use_cuda_); } else { - std::unique_ptr graph = build_strategy.Apply( - main_program, member_->places_, loss_var_name, member_->local_scopes_, - member_->nranks_, member_->use_cuda_); - graphs.push_back(std::move(graph)); + graph = build_strategy.Apply(main_program, member_->places_, loss_var_name, + member_->local_scopes_, member_->nranks_, + member_->use_cuda_); } #endif auto max_memory_size = GetEagerDeletionThreshold(); VLOG(10) << "Eager Deletion Threshold " << static_cast(max_memory_size) / (1 << 30); if (max_memory_size >= 0) { - for (size_t i = 0; i < graphs.size(); ++i) { - graphs[i] = member_->PrepareGCAndRefCnts( - std::move(graphs[i]), static_cast(max_memory_size)); - } + graph = member_->PrepareGCAndRefCnts(std::move(graph), + static_cast(max_memory_size)); } // Step 3. Create vars in each scope. Passes may also create new vars. // skip control vars and empty vars std::vector var_infos; - for (auto &graph : graphs) { - for (auto &node : graph->Nodes()) { - if (node->IsVar() && !node->IsCtrlVar() && node->Var()) { - var_infos.emplace_back(); - var_infos.back().name_ = node->Var()->Name(); - var_infos.back().type_ = node->Var()->GetType(); - var_infos.back().persistable_ = node->Var()->Persistable(); - } + for (auto &node : graph->Nodes()) { + if (node->IsVar() && !node->IsCtrlVar() && node->Var()) { + var_infos.emplace_back(); + var_infos.back().name_ = node->Var()->Name(); + var_infos.back().type_ = node->Var()->GetType(); + var_infos.back().persistable_ = node->Var()->Persistable(); } } // If the loss_var_name is given, the number of graph should be only one. if (loss_var_name.size()) { - size_t graph_num = ir::GraphNum(*graphs[0]); + size_t graph_num = ir::GraphNum(*graph); if (graph_num > 1) { LOG(WARNING) << "The number of graph should be only one, " "but the current graph has " - << ir::GraphNum(*graphs[0]) + << ir::GraphNum(*graph) << " sub_graphs. 
If you want to see the nodes of the " "sub_graphs, you should use 'FLAGS_print_sub_graph_dir' " "to specify the output dir. NOTES: if you not do training, " @@ -340,7 +328,7 @@ ParallelExecutor::ParallelExecutor( VLOG(3) << "use AsyncSSAGraphExecutor"; member_->executor_.reset(new details::AsyncSSAGraphExecutor( exec_strategy, member_->local_scopes_, member_->places_, - std::move(graphs))); + std::move(graph))); } else if (build_strategy.enable_parallel_graph_) { VLOG(3) << "use ParallelSSAGraphExecutor"; #ifdef PADDLE_WITH_CUDA @@ -358,12 +346,12 @@ ParallelExecutor::ParallelExecutor( VLOG(3) << "use ThreadedSSAGraphExecutor"; member_->executor_.reset(new details::ThreadedSSAGraphExecutor( exec_strategy, member_->local_scopes_, member_->places_, - std::move(graphs[0]))); + std::move(graph))); } else { VLOG(3) << "use FastThreadedSSAGraphExecutor"; member_->executor_.reset(new details::FastThreadedSSAGraphExecutor( exec_strategy, member_->local_scopes_, member_->places_, - std::move(graphs[0]))); + std::move(graph))); } } From dab7f36909a61af51beacd145228bb2a4acc4db5 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Mon, 25 Feb 2019 22:49:03 +0800 Subject: [PATCH 44/98] optimize code test=develop --- .../details/async_ssa_graph_executor.cc | 6 ++-- .../details/async_ssa_graph_executor.h | 4 +-- paddle/fluid/framework/parallel_executor.cc | 30 +++++++++---------- 3 files changed, 20 insertions(+), 20 deletions(-) diff --git a/paddle/fluid/framework/details/async_ssa_graph_executor.cc b/paddle/fluid/framework/details/async_ssa_graph_executor.cc index b6d1ee5073..8757842996 100644 --- a/paddle/fluid/framework/details/async_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/async_ssa_graph_executor.cc @@ -21,12 +21,12 @@ namespace details { AsyncSSAGraphExecutor::AsyncSSAGraphExecutor( const ExecutionStrategy &strategy, const std::vector &local_scopes, const std::vector &places, - std::unique_ptr &&graph) + ir::Graph* graph) : strategy_(std::move(strategy)), local_scopes_(std::move(local_scopes)), pool_(places.size() >= 2 ? 
new ::ThreadPool(places.size()) : nullptr), places_(std::move(places)), - graph_(std::move(graph)) { + graph_(graph) { VLOG(3) << "build AsyncSSAGraphExecutor"; PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size()); @@ -38,7 +38,7 @@ AsyncSSAGraphExecutor::AsyncSSAGraphExecutor( << " to run the operators of the graph on each device."; for (size_t i = 0; i < places.size(); ++i) { executors_.emplace_back(new details::ThreadedSSAGraphExecutor( - strategy_, {local_scopes_[i]}, {places_[i]}, graph_.get())); + strategy_, {local_scopes_[i]}, {places_[i]}, graph_)); } } diff --git a/paddle/fluid/framework/details/async_ssa_graph_executor.h b/paddle/fluid/framework/details/async_ssa_graph_executor.h index 50f207361f..8536852a00 100644 --- a/paddle/fluid/framework/details/async_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/async_ssa_graph_executor.h @@ -29,7 +29,7 @@ class AsyncSSAGraphExecutor : public SSAGraphExecutor { AsyncSSAGraphExecutor(const ExecutionStrategy &strategy, const std::vector &local_scopes, const std::vector &places, - std::unique_ptr &&graph); + ir::Graph *graph); ~AsyncSSAGraphExecutor() final = default; const ir::Graph &Graph() const override { return *graph_; } @@ -40,7 +40,7 @@ class AsyncSSAGraphExecutor : public SSAGraphExecutor { std::vector local_scopes_; std::unique_ptr<::ThreadPool> pool_{nullptr}; std::vector places_; - std::unique_ptr graph_; + ir::Graph *graph_; std::vector> executors_; ExceptionHolder exception_holder_; diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index a498ec5b0b..081d06b6aa 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -269,25 +269,26 @@ ParallelExecutor::ParallelExecutor( #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) if (build_strategy.async_mode_ && !build_strategy.is_distribution_) { VLOG(3) << "use local async mode"; - temp_owned_graph = - build_strategy.Apply(std::move(temp_owned_graph), {member_->places_[0]}, loss_var_name, - {member_->local_scopes_[0]}, member_->nranks_, - member_->use_cuda_, member_->nccl_ctxs_.get()); + temp_owned_graph = build_strategy.Apply( + std::move(temp_owned_graph), {member_->places_[0]}, loss_var_name, + {member_->local_scopes_[0]}, member_->nranks_, member_->use_cuda_, + member_->nccl_ctxs_.get()); } else { - temp_owned_graph = build_strategy.Apply(std::move(temp_owned_graph), member_->places_, loss_var_name, - member_->local_scopes_, member_->nranks_, - member_->use_cuda_, member_->nccl_ctxs_.get()); + temp_owned_graph = build_strategy.Apply( + std::move(temp_owned_graph), member_->places_, loss_var_name, + member_->local_scopes_, member_->nranks_, member_->use_cuda_, + member_->nccl_ctxs_.get()); } #else if (build_strategy.async_mode_ && !build_strategy.is_distribution_) { VLOG(3) << "use local async mode"; - temp_owned_graph = build_strategy.Apply(std::move(temp_owned_graph), {member_->places_[0]}, - loss_var_name, {member_->local_scopes_[0]}, - member_->nranks_, member_->use_cuda_); + temp_owned_graph = build_strategy.Apply( + std::move(temp_owned_graph), {member_->places_[0]}, loss_var_name, + {member_->local_scopes_[0]}, member_->nranks_, member_->use_cuda_); } else { - temp_owned_graph = build_strategy.Apply(std::move(temp_owned_graph), member_->places_, loss_var_name, - member_->local_scopes_, member_->nranks_, - member_->use_cuda_); + temp_owned_graph = build_strategy.Apply( + std::move(temp_owned_graph), member_->places_, loss_var_name, + member_->local_scopes_, 
member_->nranks_, member_->use_cuda_); } #endif @@ -333,8 +334,7 @@ ParallelExecutor::ParallelExecutor( if (build_strategy.async_mode_ && !build_strategy.is_distribution_) { VLOG(3) << "use AsyncSSAGraphExecutor"; member_->executor_.reset(new details::AsyncSSAGraphExecutor( - exec_strategy, member_->local_scopes_, member_->places_, - graph)); + exec_strategy, member_->local_scopes_, member_->places_, graph)); } else if (build_strategy.enable_parallel_graph_) { VLOG(3) << "use ParallelSSAGraphExecutor"; #ifdef PADDLE_WITH_CUDA From ff01d705835c5e1ccac4d9f1e109725bf6efeb53 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Mon, 25 Feb 2019 23:31:56 +0800 Subject: [PATCH 45/98] fix style test=develop --- paddle/fluid/framework/details/async_ssa_graph_executor.cc | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/paddle/fluid/framework/details/async_ssa_graph_executor.cc b/paddle/fluid/framework/details/async_ssa_graph_executor.cc index 8757842996..21741667a3 100644 --- a/paddle/fluid/framework/details/async_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/async_ssa_graph_executor.cc @@ -20,8 +20,7 @@ namespace details { AsyncSSAGraphExecutor::AsyncSSAGraphExecutor( const ExecutionStrategy &strategy, const std::vector &local_scopes, - const std::vector &places, - ir::Graph* graph) + const std::vector &places, ir::Graph *graph) : strategy_(std::move(strategy)), local_scopes_(std::move(local_scopes)), pool_(places.size() >= 2 ? new ::ThreadPool(places.size()) : nullptr), From f768fbf7157e4b500de3aa456beddaa138f00cd5 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Tue, 26 Feb 2019 15:01:59 +0800 Subject: [PATCH 46/98] support multi graph test=develop --- .../details/async_ssa_graph_executor.cc | 6 +-- .../details/async_ssa_graph_executor.h | 6 +-- paddle/fluid/framework/parallel_executor.cc | 40 ++++++++++++++----- paddle/fluid/framework/parallel_executor.h | 2 +- .../fluid/operators/reader/blocking_queue.h | 1 + .../operators/reader/create_py_reader_op.cc | 5 ++- paddle/fluid/pybind/pybind.cc | 2 +- python/paddle/fluid/parallel_executor.py | 9 ++++- 8 files changed, 50 insertions(+), 21 deletions(-) diff --git a/paddle/fluid/framework/details/async_ssa_graph_executor.cc b/paddle/fluid/framework/details/async_ssa_graph_executor.cc index 21741667a3..dfb9d73dcb 100644 --- a/paddle/fluid/framework/details/async_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/async_ssa_graph_executor.cc @@ -20,12 +20,12 @@ namespace details { AsyncSSAGraphExecutor::AsyncSSAGraphExecutor( const ExecutionStrategy &strategy, const std::vector &local_scopes, - const std::vector &places, ir::Graph *graph) + const std::vector &places, std::vector graphs) : strategy_(std::move(strategy)), local_scopes_(std::move(local_scopes)), pool_(places.size() >= 2 ? 
new ::ThreadPool(places.size()) : nullptr), places_(std::move(places)), - graph_(graph) { + graphs_(std::move(graphs)) { VLOG(3) << "build AsyncSSAGraphExecutor"; PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size()); @@ -37,7 +37,7 @@ AsyncSSAGraphExecutor::AsyncSSAGraphExecutor( << " to run the operators of the graph on each device."; for (size_t i = 0; i < places.size(); ++i) { executors_.emplace_back(new details::ThreadedSSAGraphExecutor( - strategy_, {local_scopes_[i]}, {places_[i]}, graph_)); + strategy_, {local_scopes_[i]}, {places_[i]}, graphs_[i])); } } diff --git a/paddle/fluid/framework/details/async_ssa_graph_executor.h b/paddle/fluid/framework/details/async_ssa_graph_executor.h index 8536852a00..ff85ba2c6c 100644 --- a/paddle/fluid/framework/details/async_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/async_ssa_graph_executor.h @@ -29,9 +29,9 @@ class AsyncSSAGraphExecutor : public SSAGraphExecutor { AsyncSSAGraphExecutor(const ExecutionStrategy &strategy, const std::vector &local_scopes, const std::vector &places, - ir::Graph *graph); + std::vector graphs); ~AsyncSSAGraphExecutor() final = default; - const ir::Graph &Graph() const override { return *graph_; } + const ir::Graph &Graph() const override { return *graphs_[0]; } FeedFetchList Run(const std::vector &fetch_tensors) override; @@ -40,7 +40,7 @@ class AsyncSSAGraphExecutor : public SSAGraphExecutor { std::vector local_scopes_; std::unique_ptr<::ThreadPool> pool_{nullptr}; std::vector places_; - ir::Graph *graph_; + std::vector graphs_; std::vector> executors_; ExceptionHolder exception_holder_; diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 081d06b6aa..b1f4091148 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -188,7 +188,7 @@ ParallelExecutor::ParallelExecutor( const std::string &loss_var_name, Scope *scope, const std::vector &local_scopes, const ExecutionStrategy &exec_strategy, const BuildStrategy &build_strategy, - ir::Graph *graph) + std::vector graphs) : member_(new ParallelExecutorPrivate(places)) { member_->global_scope_ = scope; member_->use_cuda_ = exec_strategy.use_cuda_; @@ -222,6 +222,8 @@ ParallelExecutor::ParallelExecutor( PADDLE_ENFORCE(!member_->use_cuda_, "gpu mode does not support async_mode_ now!"); } + + ir::Graph *graph = graphs[0]; std::unique_ptr temp_owned_graph(graph); // FIXME(Yancey1989): parallel graph mode get better performance @@ -262,17 +264,26 @@ ParallelExecutor::ParallelExecutor( if (member_->local_scopes_.size() != 1 && local_scopes.empty()) { BCastParamsToDevices(bcast_vars); } -// Startup Program has been run. All local scopes has correct parameters. + // Startup Program has been run. All local scopes has correct parameters. -// Step 2. Convert main_program to SSA form and dependency graph. Also, insert -// ncclOp + // Step 2. Convert main_program to SSA form and dependency graph. 
Also, insert + // ncclOp + std::vector async_graphs(places.size()); #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) if (build_strategy.async_mode_ && !build_strategy.is_distribution_) { VLOG(3) << "use local async mode"; - temp_owned_graph = build_strategy.Apply( - std::move(temp_owned_graph), {member_->places_[0]}, loss_var_name, - {member_->local_scopes_[0]}, member_->nranks_, member_->use_cuda_, - member_->nccl_ctxs_.get()); + temp_owned_graph = + build_strategy.Apply(std::move(temp_owned_graph), {member_->places_[0]}, + loss_var_name, {member_->local_scopes_[0]}, 1, + member_->use_cuda_, member_->nccl_ctxs_.get()); + for (int i = 1; i < member_->places_.size(); ++i) { + std::unique_ptr temp_graph(graphs[i]); + temp_graph = + build_strategy.Apply(std::move(temp_graph), {member_->places_[i]}, + loss_var_name, {member_->local_scopes_[i]}, 1, + member_->use_cuda_, member_->nccl_ctxs_.get()); + async_graphs[i] = temp_graph.release(); + } } else { temp_owned_graph = build_strategy.Apply( std::move(temp_owned_graph), member_->places_, loss_var_name, @@ -284,7 +295,14 @@ ParallelExecutor::ParallelExecutor( VLOG(3) << "use local async mode"; temp_owned_graph = build_strategy.Apply( std::move(temp_owned_graph), {member_->places_[0]}, loss_var_name, - {member_->local_scopes_[0]}, member_->nranks_, member_->use_cuda_); + {member_->local_scopes_[0]}, 1, member_->use_cuda_); + for (int i = 1; i < member_->places_.size(); ++i) { + std::unique_ptr temp_graph(graphs[i]); + temp_graph = build_strategy.Apply( + std::move(temp_graph), {member_->places_[i]}, loss_var_name, + {member_->local_scopes_[i]}, 1, member_->use_cuda_); + async_graphs[i] = temp_graph.release(); + } } else { temp_owned_graph = build_strategy.Apply( std::move(temp_owned_graph), member_->places_, loss_var_name, @@ -304,6 +322,8 @@ ParallelExecutor::ParallelExecutor( graph = temp_owned_graph.release(); } + async_graphs[0] = graph; + // Step 3. Create vars in each scope. Passes may also create new vars. 
// skip control vars and empty vars std::vector var_infos; @@ -334,7 +354,7 @@ ParallelExecutor::ParallelExecutor( if (build_strategy.async_mode_ && !build_strategy.is_distribution_) { VLOG(3) << "use AsyncSSAGraphExecutor"; member_->executor_.reset(new details::AsyncSSAGraphExecutor( - exec_strategy, member_->local_scopes_, member_->places_, graph)); + exec_strategy, member_->local_scopes_, member_->places_, async_graphs)); } else if (build_strategy.enable_parallel_graph_) { VLOG(3) << "use ParallelSSAGraphExecutor"; #ifdef PADDLE_WITH_CUDA diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h index ddf60b3946..0e05b2a460 100644 --- a/paddle/fluid/framework/parallel_executor.h +++ b/paddle/fluid/framework/parallel_executor.h @@ -50,7 +50,7 @@ class ParallelExecutor { const std::vector &local_scopes, const ExecutionStrategy &exec_strategy, const BuildStrategy &build_strategy, - ir::Graph *graph); + std::vector graphs); ~ParallelExecutor(); diff --git a/paddle/fluid/operators/reader/blocking_queue.h b/paddle/fluid/operators/reader/blocking_queue.h index 45c3ad802f..c99b2bc593 100644 --- a/paddle/fluid/operators/reader/blocking_queue.h +++ b/paddle/fluid/operators/reader/blocking_queue.h @@ -95,6 +95,7 @@ class BlockingQueue { void Close() { std::lock_guard lock(mutex_); + VLOG(3) << "close queue"; closed_ = true; send_cv_.notify_all(); receive_cv_.notify_all(); diff --git a/paddle/fluid/operators/reader/create_py_reader_op.cc b/paddle/fluid/operators/reader/create_py_reader_op.cc index 901a92ab5b..b2469ad0eb 100644 --- a/paddle/fluid/operators/reader/create_py_reader_op.cc +++ b/paddle/fluid/operators/reader/create_py_reader_op.cc @@ -35,7 +35,10 @@ class PyReader : public framework::FileReader { ~PyReader() { queue_->Close(); } - void Shutdown() override { queue_->Close(); } + void Shutdown() override { + VLOG(3) << "PyReader shutdown!"; + queue_->Close(); + } void Start() override { queue_->ReOpen(); } diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index f9e7366779..fdee5a6d66 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -1230,7 +1230,7 @@ All parameter, weight, gradient are variables in Paddle. pe.def(py::init &, const std::unordered_set &, const std::string &, Scope *, std::vector &, const ExecutionStrategy &, - const BuildStrategy &, ir::Graph *>()) + const BuildStrategy &, std::vector>()) // NOTE: even we return a vec* to Python use reference policy. // We still cannot get local_scope from this vector, since the element // of vec will be freed by Python GC. We can only return Scope* diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py index 889156ff74..9c578ef662 100644 --- a/python/paddle/fluid/parallel_executor.py +++ b/python/paddle/fluid/parallel_executor.py @@ -177,12 +177,17 @@ class ParallelExecutor(object): # step7: init ParallelExecutor # ParallelExecutor API will be deprecated, don't support parallel graph. 
- self._graph = core.Graph(main.desc) + self._graphs = [] + if build_strategy.async_mode: + for _ in range(cpu_num): + self._graphs.append(core.Graph(main.desc)) + else: + self._graphs.append(core.Graph(main.desc)) self.executor = core.ParallelExecutor( places, persistable_vars, cpt.to_text(loss_name) if loss_name else six.u(''), scope, - local_scopes, exec_strategy, build_strategy, self._graph) + local_scopes, exec_strategy, build_strategy, self._graphs) self.scope = scope From 02425b2f648f5dbb5773b0eab8901a42bf955f33 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Wed, 27 Feb 2019 09:31:27 +0800 Subject: [PATCH 47/98] fix compile --- paddle/fluid/operators/distributed_ops/recv_op.cc | 2 +- paddle/fluid/operators/distributed_ops/send_op.cc | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/operators/distributed_ops/recv_op.cc b/paddle/fluid/operators/distributed_ops/recv_op.cc index a0185d66f0..bcb16ff2e5 100644 --- a/paddle/fluid/operators/distributed_ops/recv_op.cc +++ b/paddle/fluid/operators/distributed_ops/recv_op.cc @@ -57,7 +57,7 @@ class RecvOp : public framework::OperatorBase { platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto *dev_ctx = pool.Get(place); - auto exe_ctx = framework::ExecutionContext(*this, scope, *dev_ctx, ctx); + auto exe_ctx = framework::ExecutionContext(*this, scope, *dev_ctx, ctx, nullptr); auto recv_functor = distributed::ParameterRecv(); recv_functor(outs[0], recv_varnames, epmap, exe_ctx, scope); } else { diff --git a/paddle/fluid/operators/distributed_ops/send_op.cc b/paddle/fluid/operators/distributed_ops/send_op.cc index 0f0ad6b8f9..801909e2c0 100644 --- a/paddle/fluid/operators/distributed_ops/send_op.cc +++ b/paddle/fluid/operators/distributed_ops/send_op.cc @@ -50,7 +50,7 @@ class SendOp : public framework::OperatorBase { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); auto* dev_ctx = pool.Get(place); - auto exe_ctx = framework::ExecutionContext(*this, scope, *dev_ctx, ctx); + auto exe_ctx = framework::ExecutionContext(*this, scope, *dev_ctx, ctx, nullptr); auto send_functor = distributed::ParameterSend(); send_functor(ins[0], send_varnames, epmap, height_sections, exe_ctx, scope, static_cast(sync_send)); From 847e4f4e854b3f73625816d152f65ca5f5c7a27e Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Fri, 1 Mar 2019 11:24:14 +0800 Subject: [PATCH 48/98] pure async mode train --- .../details/async_ssa_graph_executor.cc | 114 ++++++++++++------ .../details/async_ssa_graph_executor.h | 12 ++ .../details/threaded_ssa_graph_executor.cc | 2 + paddle/fluid/framework/parallel_executor.cc | 8 +- paddle/fluid/framework/reader.cc | 5 +- paddle/fluid/framework/reader.h | 10 +- .../fluid/operators/reader/blocking_queue.h | 3 +- .../fluid/operators/reader/buffered_reader.cc | 3 + .../operators/reader/create_py_reader_op.cc | 7 +- .../reader/lod_tensor_blocking_queue.h | 5 +- paddle/fluid/pybind/pybind.cc | 1 + .../test_async_ssa_graph_executor_mnist.py | 41 ++++--- 12 files changed, 148 insertions(+), 63 deletions(-) diff --git a/paddle/fluid/framework/details/async_ssa_graph_executor.cc b/paddle/fluid/framework/details/async_ssa_graph_executor.cc index dfb9d73dcb..69f770afee 100644 --- a/paddle/fluid/framework/details/async_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/async_ssa_graph_executor.cc @@ -14,10 +14,31 @@ #include "paddle/fluid/framework/details/async_ssa_graph_executor.h" +#include "paddle/fluid/framework/variable_helper.h" + namespace paddle { 
namespace framework { namespace details { +inline void NewTempScopeAndInitVars(const std::vector &var_infos, + Scope *scope) { + Scope &local_scope = scope->NewScope(); + *scope->Var(details::kLocalExecScopeName)->GetMutable() = + &local_scope; + + for (auto &info : var_infos) { + if (scope->FindVar(info.name_) != nullptr) { + continue; + } + + if (info.persistable_) { // Persistable + InitializeVariable(scope->Var(info.name_), info.type_); + } else { + InitializeVariable(local_scope.Var(info.name_), info.type_); + } + } +} + AsyncSSAGraphExecutor::AsyncSSAGraphExecutor( const ExecutionStrategy &strategy, const std::vector &local_scopes, const std::vector &places, std::vector graphs) @@ -39,58 +60,81 @@ AsyncSSAGraphExecutor::AsyncSSAGraphExecutor( executors_.emplace_back(new details::ThreadedSSAGraphExecutor( strategy_, {local_scopes_[i]}, {places_[i]}, graphs_[i])); } -} -FeedFetchList AsyncSSAGraphExecutor::Run( - const std::vector &fetch_tensors) { - std::vector> run_futures; - - std::vector fetch_data; - FeedFetchList ret; - - fetch_data.reserve(places_.size()); - ret.reserve(fetch_tensors.size()); - exception_holder_.Clear(); + for (auto &node : graphs_[0]->Nodes()) { + if (node->IsVar() && !node->IsCtrlVar() && node->Var()) { + var_infos_.emplace_back(); + var_infos_.back().name_ = node->Var()->Name(); + var_infos_.back().type_ = node->Var()->GetType(); + var_infos_.back().persistable_ = node->Var()->Persistable(); + } + } + for (auto *scope : local_scopes_) { + NewTempScopeAndInitVars(var_infos_, scope); + } +} - for (size_t i = 0; i < places_.size(); ++i) { - auto call = [this, i, &fetch_tensors]() -> FeedFetchList { +void AsyncSSAGraphExecutor::StartOffPythonTrainLoop() { + VLOG(3) << "StartOffPythonTrainLoop size = " << places_.size(); + for (size_t i = 1; i < places_.size(); ++i) { + auto call = [this, i]() -> void { + VLOG(3) << "start off python thread " << i; try { - return executors_[i]->Run(fetch_tensors); + while (true) { + executors_[i]->Run({}); + } } catch (...) { exception_holder_.Catch(std::current_exception()); + VLOG(3) << "get exception type = " << exception_holder_.Type(); } - return FeedFetchList(); + VLOG(3) << "thread " << i << " exited!"; }; - - if (pool_) { - run_futures.emplace_back(pool_->enqueue(std::move(call))); - } else { - fetch_data.emplace_back(std::move(call())); - } - } - - if (pool_) { - for (auto &f : run_futures) { - if (exception_holder_.IsCaught()) { - f.wait(); - } else { - fetch_data.emplace_back(std::move(f.get())); - } - } + run_futures_.emplace_back(pool_->enqueue(std::move(call))); } +} +void AsyncSSAGraphExecutor::HandleException() { if (exception_holder_.IsCaught()) { + for (auto &f : run_futures_) { + VLOG(3) << "wait future"; + f.wait(); + } VLOG(3) << "caught exception " << exception_holder_.Type() << ", rethrow it"; + run_futures_.clear(); exception_holder_.ReThrow(); } +} + +FeedFetchList AsyncSSAGraphExecutor::Run( + const std::vector &fetch_tensors) { + // init once + if (run_futures_.size() == 0 && places_.size() > 1) { + exception_holder_.Clear(); + StartOffPythonTrainLoop(); + } + + if (places_.size() == 1) { + exception_holder_.Clear(); + } else { + HandleException(); + } + + FeedFetchList fetch_data; + fetch_data.reserve(fetch_tensors.size()); + + try { + fetch_data = executors_[0]->Run(fetch_tensors); + } catch (...) 
{ + exception_holder_.Catch(std::current_exception()); + } + + HandleException(); + FeedFetchList ret; for (size_t fetch_idx = 0; fetch_idx < fetch_tensors.size(); ++fetch_idx) { std::vector lodtensor_ptrs; - lodtensor_ptrs.reserve(local_scopes_.size()); - for (size_t scope_idx = 0; scope_idx < local_scopes_.size(); ++scope_idx) { - lodtensor_ptrs.push_back(&fetch_data.at(scope_idx).at(fetch_idx)); - } + lodtensor_ptrs.push_back(&fetch_data.at(fetch_idx)); ret.emplace_back(); ret.back().MergeLoDTensor(lodtensor_ptrs, platform::CPUPlace()); } diff --git a/paddle/fluid/framework/details/async_ssa_graph_executor.h b/paddle/fluid/framework/details/async_ssa_graph_executor.h index ff85ba2c6c..7d7296772d 100644 --- a/paddle/fluid/framework/details/async_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/async_ssa_graph_executor.h @@ -24,6 +24,12 @@ namespace paddle { namespace framework { namespace details { +struct VarInfo { + std::string name_; + proto::VarType::Type type_; + bool persistable_; +}; + class AsyncSSAGraphExecutor : public SSAGraphExecutor { public: AsyncSSAGraphExecutor(const ExecutionStrategy &strategy, @@ -35,6 +41,10 @@ class AsyncSSAGraphExecutor : public SSAGraphExecutor { FeedFetchList Run(const std::vector &fetch_tensors) override; + private: + void StartOffPythonTrainLoop(); + void HandleException(); + private: ExecutionStrategy strategy_; std::vector local_scopes_; @@ -44,6 +54,8 @@ class AsyncSSAGraphExecutor : public SSAGraphExecutor { std::vector> executors_; ExceptionHolder exception_holder_; + std::vector> run_futures_; + std::vector var_infos_; }; } // namespace details diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc index 8436626362..fa0c90e1f4 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc @@ -119,6 +119,8 @@ inline FeedFetchList ThreadedSSAGraphExecutor::RunImpl( if (timeout) { if (exception_holder_.IsCaught()) { + VLOG(3) << "caught exception " << exception_holder_.Type() + << ", rethrow it"; for (auto &run_op_future : run_op_futures_) { run_op_future.wait(); } diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index b1f4091148..c133772e6e 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -379,9 +379,11 @@ ParallelExecutor::ParallelExecutor( } VLOG(3) << "use ScopeBufferedSSAGraphExecutor"; - member_->executor_.reset(new details::ScopeBufferedSSAGraphExecutor( - exec_strategy, member_->local_scopes_, std::move(var_infos), - member_->places_, std::move(member_->executor_))); + if (!build_strategy.async_mode_) { + member_->executor_.reset(new details::ScopeBufferedSSAGraphExecutor( + exec_strategy, member_->local_scopes_, std::move(var_infos), + member_->places_, std::move(member_->executor_))); + } } void ParallelExecutor::BCastParamsToDevices( diff --git a/paddle/fluid/framework/reader.cc b/paddle/fluid/framework/reader.cc index 40eafda9bf..d3513fb7db 100644 --- a/paddle/fluid/framework/reader.cc +++ b/paddle/fluid/framework/reader.cc @@ -69,6 +69,9 @@ void ReaderBase::Start() { ReaderBase::~ReaderBase() {} -DecoratedReader::~DecoratedReader() { reader_->Shutdown(); } +DecoratedReader::~DecoratedReader() { + VLOG(1) << "~DecoratedReader"; + reader_->Shutdown(); +} } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/reader.h 
b/paddle/fluid/framework/reader.h index 82562bf883..6cf0ec2937 100644 --- a/paddle/fluid/framework/reader.h +++ b/paddle/fluid/framework/reader.h @@ -77,7 +77,10 @@ class DecoratedReader : public ReaderBase, ~DecoratedReader(); protected: - void ShutdownImpl() override { reader_->Shutdown(); } + void ShutdownImpl() override { + VLOG(1) << "ShutdownImpl"; + reader_->Shutdown(); + } void StartImpl() override { reader_->Start(); } @@ -98,6 +101,8 @@ class ReaderHolder { reader_ = reader_base; } + ~ReaderHolder() { VLOG(1) << "~ReaderHolder"; } + const std::shared_ptr& Get() const { return reader_; } void ReadNext(std::vector* out) { @@ -106,6 +111,7 @@ class ReaderHolder { } void ResetAll() { + VLOG(1) << "ResetAll"; auto end_readers = reader_->GetEndPoints(); for (auto* reader : end_readers) { reader->Shutdown(); @@ -116,11 +122,13 @@ class ReaderHolder { } void Shutdown() { + VLOG(1) << "Shutdown"; PADDLE_ENFORCE_NOT_NULL(reader_); reader_->Shutdown(); } void Start() { + VLOG(1) << "start"; PADDLE_ENFORCE_NOT_NULL(reader_); reader_->Start(); } diff --git a/paddle/fluid/operators/reader/blocking_queue.h b/paddle/fluid/operators/reader/blocking_queue.h index c99b2bc593..fe3f2f4031 100644 --- a/paddle/fluid/operators/reader/blocking_queue.h +++ b/paddle/fluid/operators/reader/blocking_queue.h @@ -86,6 +86,7 @@ class BlockingQueue { void ReOpen() { std::lock_guard lock(mutex_); + VLOG(1) << "reopen queue"; closed_ = false; std::deque new_deque; queue_.swap(new_deque); @@ -95,7 +96,7 @@ class BlockingQueue { void Close() { std::lock_guard lock(mutex_); - VLOG(3) << "close queue"; + VLOG(1) << "close queue"; closed_ = true; send_cv_.notify_all(); receive_cv_.notify_all(); diff --git a/paddle/fluid/operators/reader/buffered_reader.cc b/paddle/fluid/operators/reader/buffered_reader.cc index defc29b91f..db80fda695 100644 --- a/paddle/fluid/operators/reader/buffered_reader.cc +++ b/paddle/fluid/operators/reader/buffered_reader.cc @@ -20,6 +20,7 @@ namespace paddle { namespace operators { namespace reader { BufferedReader::~BufferedReader() { + VLOG(1) << "~BufferedReader"; reader_->Shutdown(); while (!position_.empty()) { position_.front().wait(); @@ -41,6 +42,7 @@ BufferedReader::BufferedReader( thread_pool_(1), place_(place), buffer_size_(buffer_size) { + VLOG(1) << "BufferedReader"; #ifdef PADDLE_WITH_CUDA if (platform::is_gpu_place(place_)) { platform::SetDeviceId(boost::get(place_).device); @@ -121,6 +123,7 @@ void BufferedReader::ReadAsync(size_t i) { } void BufferedReader::ShutdownImpl() { + VLOG(1) << "ShutdownImpl"; reader_->Shutdown(); while (!position_.empty()) { position_.pop(); diff --git a/paddle/fluid/operators/reader/create_py_reader_op.cc b/paddle/fluid/operators/reader/create_py_reader_op.cc index b2469ad0eb..2916be618c 100644 --- a/paddle/fluid/operators/reader/create_py_reader_op.cc +++ b/paddle/fluid/operators/reader/create_py_reader_op.cc @@ -33,10 +33,13 @@ class PyReader : public framework::FileReader { if (!success) out->clear(); } - ~PyReader() { queue_->Close(); } + ~PyReader() { + VLOG(1) << "~PyReader"; + queue_->Close(); + } void Shutdown() override { - VLOG(3) << "PyReader shutdown!"; + VLOG(1) << "PyReader shutdown!"; queue_->Close(); } diff --git a/paddle/fluid/operators/reader/lod_tensor_blocking_queue.h b/paddle/fluid/operators/reader/lod_tensor_blocking_queue.h index 5b53edff5d..eeba330d66 100644 --- a/paddle/fluid/operators/reader/lod_tensor_blocking_queue.h +++ b/paddle/fluid/operators/reader/lod_tensor_blocking_queue.h @@ -57,7 +57,10 @@ class 
LoDTensorBlockingQueue { inline void ReOpen() { queue_.ReOpen(); } - inline void Close() { queue_.Close(); } + inline void Close() { + VLOG(1) << "LoDTensorBlockingQueue close"; + queue_.Close(); + } inline bool IsClosed() const { return queue_.IsClosed(); } diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index fdee5a6d66..af049127aa 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -557,6 +557,7 @@ All parameter, weight, gradient are variables in Paddle. m.def("init_lod_tensor_blocking_queue", [](Variable &var, size_t capacity) -> std::shared_ptr { + VLOG(1) << "init_lod_tensor_blocking_queue"; auto *holder = var.GetMutable(); holder->InitOnce(capacity, FLAGS_reader_queue_speed_test_mode); return holder->GetQueue(); diff --git a/python/paddle/fluid/tests/unittests/test_async_ssa_graph_executor_mnist.py b/python/paddle/fluid/tests/unittests/test_async_ssa_graph_executor_mnist.py index 41fa39e06b..4fbda407f1 100644 --- a/python/paddle/fluid/tests/unittests/test_async_ssa_graph_executor_mnist.py +++ b/python/paddle/fluid/tests/unittests/test_async_ssa_graph_executor_mnist.py @@ -36,7 +36,7 @@ def convolutional_neural_network(use_py_reader): capacity=64, feed_list=[img, label], name='py_reader', - use_double_buffer=True) + use_double_buffer=False) img, label = fluid.layers.read_file(py_reader) conv_pool_1 = fluid.nets.simple_img_conv_pool( @@ -139,20 +139,21 @@ def train(use_cuda, thread_num, cpu_num): exec_strategy=exec_strategy) py_reader.decorate_paddle_reader(train_reader) - py_reader.start() - - step = 0 - try: - while True: - loss_val = pe.run(fetch_list=[avg_loss.name]) - loss_val = numpy.mean(loss_val) - if step % 100 == 0: - print("Batch %d, Cost %f, queue size %d" % - (step, loss_val, py_reader.queue.size())) - step += 1 - except fluid.core.EOFException: - print("train end") - py_reader.reset() + + for pass_id in range(2): + step = 0 + py_reader.start() + try: + while True: + loss_val = pe.run(fetch_list=[avg_loss.name]) + loss_val = numpy.mean(loss_val) + if step % 10 == 0: + print("Pass %d, Batch %d, Cost %f, queue size %d" % + (pass_id, step, loss_val, py_reader.queue.size())) + step += 1 + except fluid.core.EOFException: + print("train end pass = " + str(pass_id)) + py_reader.reset() return step @@ -161,10 +162,11 @@ class TestAsyncSSAGraphExecutor(unittest.TestCase): def test_check_async_ssa_exe_train(self): step_list = [] for cpu_num in [1, 2, 4]: - scope = fluid.core.Scope() - with fluid.scope_guard(scope): + print("run cpu_num -> " + str(cpu_num)) + with fluid.scope_guard(fluid.core.Scope()): with fluid.program_guard( - fluid.Program(), startup_program=fluid.Program()): + main_program=fluid.Program(), + startup_program=fluid.Program()): start_time = time.time() step = train( use_cuda=False, thread_num=cpu_num, cpu_num=cpu_num) @@ -173,7 +175,8 @@ class TestAsyncSSAGraphExecutor(unittest.TestCase): print("cpu_num -> " + str(cpu_num) + " step -> " + str(step) + " time -> " + str(end_time - start_time)) with fluid.program_guard( - fluid.Program(), startup_program=fluid.Program()): + main_program=fluid.Program(), + startup_program=fluid.Program()): test() assert int(step_list[0] / 2) == int(step_list[1]) assert int(step_list[1] / 2) == int(step_list[2]) From 3691a46fa36750bb5a3c828d2eaf55305aa88f69 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Mon, 4 Mar 2019 10:29:42 +0800 Subject: [PATCH 49/98] improve communicator --- paddle/fluid/framework/communicator.h | 53 ------- paddle/fluid/framework/variable_helper.cc | 26 
+++- paddle/fluid/framework/variable_helper.h | 3 +- .../operators/distributed/CMakeLists.txt | 1 + .../operators/distributed/communicator.cc | 113 +++++++++++++++ .../operators/distributed/communicator.h | 129 ++++++++++++++++++ .../distributed/parameter_prefetch.cc | 4 +- .../operators/distributed/parameter_recv.cc | 2 +- .../fluid/operators/distributed/rpc_common.h | 33 +++++ .../operators/math/selected_rows_functor.h | 2 +- 10 files changed, 306 insertions(+), 60 deletions(-) delete mode 100644 paddle/fluid/framework/communicator.h create mode 100644 paddle/fluid/operators/distributed/communicator.cc create mode 100644 paddle/fluid/operators/distributed/communicator.h create mode 100644 paddle/fluid/operators/distributed/rpc_common.h diff --git a/paddle/fluid/framework/communicator.h b/paddle/fluid/framework/communicator.h deleted file mode 100644 index 0e90ba02e6..0000000000 --- a/paddle/fluid/framework/communicator.h +++ /dev/null @@ -1,53 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include -#include -#include -#include -#include "paddle/fluid/framework/data_layout.h" -#include "paddle/fluid/framework/ddim.h" -#include "paddle/fluid/framework/framework.pb.h" -#include "paddle/fluid/memory/memory.h" -#include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/place.h" - -namespace paddle { - -namespace framework { - -class Communicator { - public: - Communicator() {} - ~Communicator() {} - - // send grad - void send() {} - - void receive() {} - - void prefetch() {} - - void wait() {} - - private: - std::unique_ptr communicate_thread_; -}; - -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/variable_helper.cc b/paddle/fluid/framework/variable_helper.cc index fc4525549c..d59f3ea7dc 100644 --- a/paddle/fluid/framework/variable_helper.cc +++ b/paddle/fluid/framework/variable_helper.cc @@ -27,7 +27,7 @@ limitations under the License. 
*/ namespace paddle { namespace framework { -void InitializeVariable(Variable* var, proto::VarType::Type var_type) { +void InitializeVariable(Variable *var, proto::VarType::Type var_type) { if (var_type == proto::VarType::LOD_TENSOR) { var->GetMutable(); } else if (var_type == proto::VarType::SELECTED_ROWS) { @@ -37,7 +37,7 @@ void InitializeVariable(Variable* var, proto::VarType::Type var_type) { } else if (var_type == proto::VarType::FETCH_LIST) { var->GetMutable(); } else if (var_type == proto::VarType::STEP_SCOPES) { - var->GetMutable>(); + var->GetMutable>(); } else if (var_type == proto::VarType::LOD_RANK_TABLE) { var->GetMutable(); } else if (var_type == proto::VarType::LOD_TENSOR_ARRAY) { @@ -56,5 +56,27 @@ void InitializeVariable(Variable* var, proto::VarType::Type var_type) { var_type); } } + +void CopyVariable(const Variable &src_var, Variable *dst_var) { + // only support cpu now + auto cpu_place = platform::CPUPlace(); + + if (src_var.IsType()) { + auto *tmp_grad_tensor = dst_var->GetMutable(); + auto &src_tensor = src_var.Get(); + tmp_grad_tensor->set_lod(src_tensor.lod()); + framework::TensorCopy(src_tensor, cpu_place, tmp_grad_tensor); + } else if (src_var.IsType()) { + auto &src_slr = src_var.Get(); + auto *tmp_grad_slr = dst_var->GetMutable(); + tmp_grad_slr->set_rows(src_slr.rows()); + tmp_grad_slr->set_height(src_slr.height()); + auto &src_t = src_slr.value(); + auto *dst_t = tmp_grad_slr->mutable_value(); + framework::TensorCopy(src_t, cpu_place, dst_t); + } else { + PADDLE_THROW("unknown var type to copy"); + } +} } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/variable_helper.h b/paddle/fluid/framework/variable_helper.h index 0e0c72c362..f8e90d5396 100644 --- a/paddle/fluid/framework/variable_helper.h +++ b/paddle/fluid/framework/variable_helper.h @@ -17,6 +17,7 @@ limitations under the License. */ #include "paddle/fluid/framework/variable.h" namespace paddle { namespace framework { -void InitializeVariable(Variable *var, proto::VarType::Type var_type); +void InitializeVariable(Variable* var, proto::VarType::Type var_type); +void CopyVariable(const Variable& src_var, Variable* dst_var); } } diff --git a/paddle/fluid/operators/distributed/CMakeLists.txt b/paddle/fluid/operators/distributed/CMakeLists.txt index 231f4b3bc4..22f44c4217 100644 --- a/paddle/fluid/operators/distributed/CMakeLists.txt +++ b/paddle/fluid/operators/distributed/CMakeLists.txt @@ -54,6 +54,7 @@ cc_test(varhandle_test SRCS varhandle_test.cc DEPS profiler scope) cc_library(parameter_prefetch SRCS parameter_prefetch.cc DEPS sendrecvop_rpc memory) cc_library(parameter_send SRCS parameter_send.cc DEPS sendrecvop_rpc memory) cc_library(parameter_recv SRCS parameter_recv.cc DEPS sendrecvop_rpc memory) +cc_library(communicator SRCS communicator.cc DEPS scope selected_rows tensor variable_helper selected_rows_functor) if(WITH_GPU) cc_test(collective_server_test SRCS collective_server_test.cc DEPS sendrecvop_rpc executor ${RPC_DEPS} diff --git a/paddle/fluid/operators/distributed/communicator.cc b/paddle/fluid/operators/distributed/communicator.cc new file mode 100644 index 0000000000..fb9ecfa808 --- /dev/null +++ b/paddle/fluid/operators/distributed/communicator.cc @@ -0,0 +1,113 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/distributed/communicator.h" + +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/framework/variable_helper.h" +#include "paddle/fluid/operators/distributed/parameter_recv.h" +#include "paddle/fluid/operators/distributed/parameter_send.h" +#include "paddle/fluid/operators/math/selected_rows_functor.h" + +namespace paddle { +namespace operators { +namespace distributed { + +static void MergeVars(const std::string &var_name, + const std::vector> &vars, + Scope *scope) { + PADDLE_ENFORCE(!vars.empty(), "should have value to merge!"); + auto cpu_place = platform::CPUPlace(); + auto &var0 = vars[0]; + auto *out_var = scope->Var(var_name); + if (var0->IsType()) { + auto *out_t = out_var->GetMutable(); + auto *out_ptr = out_t->mutable_data( + var0->Get().dims(), cpu_place); + auto numel = out_t->numel(); + for (auto i = 0; i < numel; ++i) { + out_ptr[i] = 0; + for (auto &var : vars) { + auto &var_t = var->Get(); + PADDLE_ENFORCE_EQ(var_t.numel(), numel, "should have the same dims"); + out_ptr[i] += var_t.data()[i]; + } + } + } else if (var0->IsType()) { + auto *out_slr = out_var->GetMutable(); + std::vector inputs; + inputs.reserve(vars.size()); + for (auto &var : vars) { + inputs.push_back(&var->Get()); + } + math::scatter::MergeAdd + merge_add; + auto dev_ctx = paddle::platform::CPUDeviceContext(); + merge_add(dev_ctx, inputs, out_slr, false); + } else { + PADDLE_THROW("unsupported var type!"); + } +} + +void Communicator::SendThread() { + for (auto &iter : send_varname_to_queue_) { + auto &var_name = iter.first; + VLOG(3) << "merge var " << var_name << " and send"; + auto &var_queue = iter.second; + std::vector> vars; + const size_t max_merge_var_num = 20; + size_t merged_var_num = 0; + while (var_queue->Size() > 0 && merged_var_num < max_merge_var_num) { + vars.push_back(var_queue->Pop()); + merged_var_num++; + } + MergeVars(var_name, vars, send_scope_.get()); + auto send_functor = distributed::ParameterSend(); + // send_functor(var_name, send_varname_to_ctx_[var_name], exe_ctx, + // send_scope_, true); + } +} + +void Communicator::RecvThread() { + // parallel run recv graph + for (auto &iter : recv_varname_to_ctx_) { + auto &var_name = iter.first; + VLOG(3) << "recv var " << iter.first; + auto recv_functor = distributed::ParameterRecv(); + // recv_functor(var_name, iter.second, exe_ctx, recv_scope_); + } +} + +void Communicator::Send(const std::string &var_name, + const framework::Scope &scope) { + // push var into send queue by var_name + auto *grad_var = scope.FindVar(var_name); + PADDLE_ENFORCE(grad_var->IsInitialized(), "grad var should be inited"); + auto tmp_grad_var = std::make_shared(); + framework::CopyVariable(*grad_var, tmp_grad_var.get()); + send_varname_to_queue_[var_name]->Push(tmp_grad_var); +} + +void Communicator::Start() { + // start send and recv thread + send_thread_.reset( + new std::thread(std::bind(&Communicator::SendThread, this))); + recv_thread_.reset( + new std::thread(std::bind(&Communicator::RecvThread, this))); +} + +} // namespace distributed +} // namespace 
operators +} // namespace paddle diff --git a/paddle/fluid/operators/distributed/communicator.h b/paddle/fluid/operators/distributed/communicator.h new file mode 100644 index 0000000000..614d6ade81 --- /dev/null +++ b/paddle/fluid/operators/distributed/communicator.h @@ -0,0 +1,129 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include + +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/variable.h" +#include "paddle/fluid/operators/distributed/rpc_common.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/place.h" + +namespace paddle { +namespace operators { +namespace distributed { + +using Scope = framework::Scope; +using Variable = framework::Variable; + +template +class BlockingQueue { + public: + explicit BlockingQueue(size_t capacity) : capacity_(capacity) { + PADDLE_ENFORCE_GT(capacity_, 0, "The capacity must be greater than 0."); + } + + bool Push(const T& elem) { + std::unique_lock lock(mutex_); + send_cv_.wait(lock, [&] { return queue_.size() < capacity_; }); + PADDLE_ENFORCE_LT(queue_.size(), capacity_); + queue_.push_back(elem); + recv_cv_.notify_one(); + return true; + } + + bool Push(T&& elem) { + std::unique_lock lock(mutex_); + send_cv_.wait(lock, [&] { return queue_.size() < capacity_; }); + PADDLE_ENFORCE_LT(queue_.size(), capacity_); + queue_.emplace_back(std::move(elem)); + recv_cv_.notify_one(); + return true; + } + + T Pop() { + std::unique_lock lock(mutex_); + recv_cv_.wait(lock, [=] { return !queue_.empty(); }); + T rc(std::move(queue_.front())); + queue_.pop_front(); + return rc; + } + + size_t Cap() const { + std::lock_guard lock(mutex_); + return capacity_; + } + + size_t Size() const { + std::lock_guard lock(mutex_); + return queue_.size(); + } + + private: + const size_t capacity_; + std::deque queue_; + + mutable std::mutex mutex_; + std::condition_variable recv_cv_; + std::condition_variable send_cv_; +}; + +class Communicator { + public: + Communicator( + const std::unordered_map& send_varname_to_ctx, + const std::unordered_map& recv_varname_to_ctx, + Scope* recv_scope) + : send_varname_to_ctx_(send_varname_to_ctx), + recv_varname_to_ctx_(recv_varname_to_ctx), + recv_scope_(recv_scope) { + // get all send information from graph, build vars_to_send + send_scope_.reset(new Scope()); + for (auto& iter : send_varname_to_ctx_) { + send_varname_to_queue_[iter.first] = + std::make_shared>>(10); + } + } + + ~Communicator() {} + + void Start(); + + // send grad + void Send(const std::string& var_name, const framework::Scope& scope); + + private: + void SendThread(); + void RecvThread(); + + std::unordered_map>>> + send_varname_to_queue_; + std::unordered_map send_varname_to_ctx_; + std::unordered_map recv_varname_to_ctx_; + std::unique_ptr send_thread_; + std::unique_ptr recv_thread_; + Scope* recv_scope_; // should be global scope + std::unique_ptr send_scope_; // 
an independent scope +}; + +} // namespace distributed +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/distributed/parameter_prefetch.cc b/paddle/fluid/operators/distributed/parameter_prefetch.cc index 7434265929..539a038099 100644 --- a/paddle/fluid/operators/distributed/parameter_prefetch.cc +++ b/paddle/fluid/operators/distributed/parameter_prefetch.cc @@ -39,7 +39,7 @@ using DDim = framework::DDim; static std::vector> SplitIds( const std::vector& ids_vector, - const std::vector& height_section, framework::Scope* scope) { + const std::vector& height_section) { std::set all_ids; for (auto id : ids_vector) { all_ids.insert(id); @@ -203,7 +203,7 @@ void prefetch(const std::string& id_name, const std::string& out_name, #endif } - auto splited_ids = SplitIds(ids_vector, height_sections, local_scope); + auto splited_ids = SplitIds(ids_vector, height_sections); SplitIdsIntoMultipleVarsBySection(in_var_names, height_sections, splited_ids, local_scope); diff --git a/paddle/fluid/operators/distributed/parameter_recv.cc b/paddle/fluid/operators/distributed/parameter_recv.cc index 2664a89ed6..b8d3b77ae4 100644 --- a/paddle/fluid/operators/distributed/parameter_recv.cc +++ b/paddle/fluid/operators/distributed/parameter_recv.cc @@ -73,7 +73,7 @@ void ParameterRecv::operator()(const std::string &var_name, PADDLE_ENFORCE(rets[i]->Wait(), "internal error in RPCClient"); } } else { - PADDLE_THROW("unsupported var type to send!"); + PADDLE_THROW("unsupported var type to recv!"); } // concat recved tensor into one var diff --git a/paddle/fluid/operators/distributed/rpc_common.h b/paddle/fluid/operators/distributed/rpc_common.h new file mode 100644 index 0000000000..dc50414b9a --- /dev/null +++ b/paddle/fluid/operators/distributed/rpc_common.h @@ -0,0 +1,33 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include +#include + +namespace paddle { +namespace operators { +namespace distributed { + +struct RpcContext { + std::string var_name; + std::vector splited_var_names; + std::vector epmap; + std::vector height_sections; +}; + +} // namespace distributed +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/selected_rows_functor.h b/paddle/fluid/operators/math/selected_rows_functor.h index 222d761ef9..db0ee9bc16 100644 --- a/paddle/fluid/operators/math/selected_rows_functor.h +++ b/paddle/fluid/operators/math/selected_rows_functor.h @@ -95,7 +95,7 @@ struct MergeAdd { enum class ScatterOps { ASSIGN, ADD, SUB, SUBBY, MUL, DIV, DIVBY }; -// out = seleted_rows_in / tensor +// out = selected_rows_in / tensor template struct UpdateToTensor { void operator()(const DeviceContext& context, const ScatterOps& op, From 9573d610ef7e364c91ea3346aa2d0903041c2f72 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Mon, 4 Mar 2019 11:10:19 +0800 Subject: [PATCH 50/98] use rpc common in parameter send and recv --- .../operators/distributed/parameter_recv.cc | 17 +++++------ .../operators/distributed/parameter_recv.h | 5 ++-- .../operators/distributed/parameter_send.cc | 30 +++++++++---------- .../operators/distributed/parameter_send.h | 6 ++-- .../fluid/operators/distributed/rpc_common.h | 7 +++++ .../operators/distributed_ops/recv_op.cc | 7 +++-- .../operators/distributed_ops/send_op.cc | 9 ++++-- 7 files changed, 44 insertions(+), 37 deletions(-) diff --git a/paddle/fluid/operators/distributed/parameter_recv.cc b/paddle/fluid/operators/distributed/parameter_recv.cc index b8d3b77ae4..00956d8e6d 100644 --- a/paddle/fluid/operators/distributed/parameter_recv.cc +++ b/paddle/fluid/operators/distributed/parameter_recv.cc @@ -39,9 +39,7 @@ using SelectedRows = framework::SelectedRows; using DDim = framework::DDim; template -void ParameterRecv::operator()(const std::string &var_name, - const std::vector &recv_varnames, - const std::vector &epmap, +void ParameterRecv::operator()(const RpcContext &rpc_ctx, const framework::ExecutionContext &ctx, const framework::Scope &scope) { framework::Scope *local_scope = scope.NewTmpScope(); @@ -53,21 +51,22 @@ void ParameterRecv::operator()(const std::string &var_name, distributed::RPCClient::GetInstance( ctx.Attr("trainer_id")); - auto *recv_var = scope.FindVar(var_name); + auto *recv_var = scope.FindVar(rpc_ctx.var_name); std::vector recved_tensors; // recv all vars to local scope if (recv_var->IsType()) { std::vector rets; - for (size_t i = 0; i < recv_varnames.size(); i++) { - auto &recv_var_name = recv_varnames[i]; + for (size_t i = 0; i < rpc_ctx.splited_var_names.size(); i++) { + auto &recv_var_name = rpc_ctx.splited_var_names[i]; framework::Tensor *t = local_scope->Var(recv_var_name)->GetMutable(); recved_tensors.push_back(t); - VLOG(3) << "recv " << recv_var_name << " from " << epmap[i]; - rets.push_back(rpc_client->AsyncGetVar(epmap[i], cpu_ctx, *local_scope, - recv_var_name, recv_var_name)); + VLOG(3) << "recv " << recv_var_name << " from " << rpc_ctx.epmap[i]; + rets.push_back(rpc_client->AsyncGetVar(rpc_ctx.epmap[i], cpu_ctx, + *local_scope, recv_var_name, + recv_var_name)); } for (size_t i = 0; i < rets.size(); i++) { PADDLE_ENFORCE(rets[i]->Wait(), "internal error in RPCClient"); diff --git a/paddle/fluid/operators/distributed/parameter_recv.h b/paddle/fluid/operators/distributed/parameter_recv.h index bc6f5f5adf..e25594024a 100644 --- a/paddle/fluid/operators/distributed/parameter_recv.h +++ 
b/paddle/fluid/operators/distributed/parameter_recv.h @@ -18,6 +18,7 @@ #include #include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/operators/distributed/rpc_common.h" namespace paddle { namespace operators { @@ -25,9 +26,7 @@ namespace distributed { template struct ParameterRecv { - void operator()(const std::string &var_name, - const std::vector &recv_varnames, - const std::vector &epmap, + void operator()(const RpcContext &rpc_ctx, const framework::ExecutionContext &context, const framework::Scope &scope); }; diff --git a/paddle/fluid/operators/distributed/parameter_send.cc b/paddle/fluid/operators/distributed/parameter_send.cc index fd97926623..eaa1c3ae8e 100644 --- a/paddle/fluid/operators/distributed/parameter_send.cc +++ b/paddle/fluid/operators/distributed/parameter_send.cc @@ -38,10 +38,7 @@ using SelectedRows = framework::SelectedRows; using DDim = framework::DDim; template -void ParameterSend::operator()(const std::string &var_name, - const std::vector &send_varnames, - const std::vector &epmap, - const std::vector &height_sections, +void ParameterSend::operator()(const RpcContext &rpc_ctx, const framework::ExecutionContext &ctx, const framework::Scope &scope, bool sync) { framework::Scope *local_scope = scope.NewTmpScope(); @@ -53,8 +50,8 @@ void ParameterSend::operator()(const std::string &var_name, distributed::RPCClient::GetInstance( ctx.Attr("trainer_id")); - auto *send_var = scope.FindVar(var_name); - size_t out_num = send_varnames.size(); + auto *send_var = scope.FindVar(rpc_ctx.var_name); + size_t out_num = rpc_ctx.splited_var_names.size(); if (send_var->IsType()) { if (out_num > 1) { auto &send_tensor = send_var->Get(); @@ -63,19 +60,19 @@ void ParameterSend::operator()(const std::string &var_name, outs_dims.reserve(out_num); // infer output shape - PADDLE_ENFORCE_EQ(height_sections.size(), out_num, + PADDLE_ENFORCE_EQ(rpc_ctx.height_sections.size(), out_num, "tensor split sections size" "should be equal to output size."); for (size_t i = 0; i < out_num; ++i) { auto dim = send_tensor_dims; - dim[0] = height_sections[i]; + dim[0] = rpc_ctx.height_sections[i]; outs_dims.push_back(dim); } // create output var in local scope size_t row_offset = 0; for (auto i = 0; i < out_num; ++i) { - framework::Tensor *out = local_scope->Var(send_varnames[i]) + framework::Tensor *out = local_scope->Var(rpc_ctx.splited_var_names[i]) ->GetMutable(); *out = send_tensor.Slice(row_offset, row_offset + outs_dims[i][0]); row_offset += outs_dims[i][0]; @@ -83,7 +80,7 @@ void ParameterSend::operator()(const std::string &var_name, } } else if (send_var->IsType()) { auto &send_slr = send_var->Get(); - auto abs_sections = ToAbsoluteSection(height_sections); + auto abs_sections = ToAbsoluteSection(rpc_ctx.height_sections); auto send_rows = send_slr.rows(); std::vector> outs_rows_idx; @@ -97,7 +94,7 @@ void ParameterSend::operator()(const std::string &var_name, // create output var in local scope std::vector outs; - for (auto &name : send_varnames) { + for (auto &name : rpc_ctx.splited_var_names) { auto *out = local_scope->Var(name)->GetMutable(); outs.push_back(out); } @@ -112,7 +109,7 @@ void ParameterSend::operator()(const std::string &var_name, for (size_t i = 0; i < outs_rows_idx.size(); ++i) { auto rows_idx = outs_rows_idx[i]; - outs[i]->set_height(height_sections[i]); + outs[i]->set_height(rpc_ctx.height_sections[i]); auto dims = send_slr.GetCompleteDims(); dims[0] = rows_idx.size(); outs[i]->mutable_value()->mutable_data(dims, send_slr.place()); @@ -149,15 +146,16 @@ void 
ParameterSend::operator()(const std::string &var_name, } std::vector rets; - for (size_t i = 0; i < send_varnames.size(); i++) { - auto &send_var_name = send_varnames[i]; - auto &endpoint = epmap[i]; + for (size_t i = 0; i < rpc_ctx.splited_var_names.size(); i++) { + auto &send_var_name = rpc_ctx.splited_var_names[i]; + auto &endpoint = rpc_ctx.epmap[i]; if (NeedSend(*local_scope, send_var_name)) { VLOG(3) << "sending " << send_var_name << " to " << endpoint; rets.push_back(rpc_client->AsyncSendVar(endpoint, cpu_ctx, *local_scope, send_var_name)); } else { - VLOG(3) << "don't send non-initialized variable: " << send_varnames[i]; + VLOG(3) << "don't send non-initialized variable: " + << rpc_ctx.splited_var_names[i]; } } diff --git a/paddle/fluid/operators/distributed/parameter_send.h b/paddle/fluid/operators/distributed/parameter_send.h index 1746377228..4500497163 100644 --- a/paddle/fluid/operators/distributed/parameter_send.h +++ b/paddle/fluid/operators/distributed/parameter_send.h @@ -18,6 +18,7 @@ #include #include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/operators/distributed/rpc_common.h" namespace paddle { namespace operators { @@ -25,10 +26,7 @@ namespace distributed { template struct ParameterSend { - void operator()(const std::string &var_name, - const std::vector &send_varnames, - const std::vector &epmap, - const std::vector &height_sections, + void operator()(const RpcContext &rpc_ctx, const framework::ExecutionContext &context, const framework::Scope &scope, bool sync); }; diff --git a/paddle/fluid/operators/distributed/rpc_common.h b/paddle/fluid/operators/distributed/rpc_common.h index dc50414b9a..7dede07b5a 100644 --- a/paddle/fluid/operators/distributed/rpc_common.h +++ b/paddle/fluid/operators/distributed/rpc_common.h @@ -22,6 +22,13 @@ namespace operators { namespace distributed { struct RpcContext { + RpcContext(const std::string& name, const std::vector& names, + const std::vector& emap, + const std::vector& sections) + : var_name(name), + splited_var_names(names), + epmap(emap), + height_sections(sections) {} std::string var_name; std::vector splited_var_names; std::vector epmap; diff --git a/paddle/fluid/operators/distributed_ops/recv_op.cc b/paddle/fluid/operators/distributed_ops/recv_op.cc index bcb16ff2e5..a4a5ab89a7 100644 --- a/paddle/fluid/operators/distributed_ops/recv_op.cc +++ b/paddle/fluid/operators/distributed_ops/recv_op.cc @@ -21,6 +21,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/distributed/distributed.h" #include "paddle/fluid/operators/distributed/parameter_recv.h" +#include "paddle/fluid/operators/distributed/rpc_common.h" #include "paddle/fluid/platform/profiler.h" namespace paddle { @@ -57,9 +58,11 @@ class RecvOp : public framework::OperatorBase { platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto *dev_ctx = pool.Get(place); - auto exe_ctx = framework::ExecutionContext(*this, scope, *dev_ctx, ctx, nullptr); + auto exe_ctx = + framework::ExecutionContext(*this, scope, *dev_ctx, ctx, nullptr); auto recv_functor = distributed::ParameterRecv(); - recv_functor(outs[0], recv_varnames, epmap, exe_ctx, scope); + auto rpc_ctx = distributed::RpcContext(outs[0], recv_varnames, epmap, {}); + recv_functor(rpc_ctx, exe_ctx, scope); } else { if (with_barrier) { std::vector rets; diff --git a/paddle/fluid/operators/distributed_ops/send_op.cc b/paddle/fluid/operators/distributed_ops/send_op.cc index 801909e2c0..1823d89897 100644 --- a/paddle/fluid/operators/distributed_ops/send_op.cc +++ b/paddle/fluid/operators/distributed_ops/send_op.cc @@ -21,6 +21,7 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/distributed/distributed.h" #include "paddle/fluid/operators/distributed/parameter_send.h" +#include "paddle/fluid/operators/distributed/rpc_common.h" #include "paddle/fluid/operators/distributed_ops/send_recv_util.h" #include "paddle/fluid/platform/profiler.h" @@ -50,10 +51,12 @@ class SendOp : public framework::OperatorBase { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); auto* dev_ctx = pool.Get(place); - auto exe_ctx = framework::ExecutionContext(*this, scope, *dev_ctx, ctx, nullptr); + auto exe_ctx = + framework::ExecutionContext(*this, scope, *dev_ctx, ctx, nullptr); auto send_functor = distributed::ParameterSend(); - send_functor(ins[0], send_varnames, epmap, height_sections, exe_ctx, - scope, static_cast(sync_send)); + auto rpc_ctx = distributed::RpcContext(ins[0], send_varnames, epmap, + height_sections); + send_functor(rpc_ctx, exe_ctx, scope, static_cast(sync_send)); } else { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); From 3c6b733d14c0db61eb70208aa79c3999f29efc1d Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Mon, 4 Mar 2019 12:11:21 +0800 Subject: [PATCH 51/98] remove exe context --- .../operators/distributed/parameter_recv.cc | 9 +++--- .../operators/distributed/parameter_recv.h | 4 +-- .../operators/distributed/parameter_send.cc | 29 ++++++++++--------- .../operators/distributed/parameter_send.h | 5 ++-- .../operators/distributed_ops/recv_op.cc | 2 +- .../operators/distributed_ops/send_op.cc | 2 +- 6 files changed, 24 insertions(+), 27 deletions(-) diff --git a/paddle/fluid/operators/distributed/parameter_recv.cc b/paddle/fluid/operators/distributed/parameter_recv.cc index 00956d8e6d..fecc76955d 100644 --- a/paddle/fluid/operators/distributed/parameter_recv.cc +++ b/paddle/fluid/operators/distributed/parameter_recv.cc @@ -40,7 +40,6 @@ using DDim = framework::DDim; template void ParameterRecv::operator()(const RpcContext &rpc_ctx, - const framework::ExecutionContext &ctx, const framework::Scope &scope) { framework::Scope *local_scope = scope.NewTmpScope(); @@ -48,8 +47,7 @@ void ParameterRecv::operator()(const RpcContext &rpc_ctx, auto &cpu_ctx = *pool.Get(platform::CPUPlace()); distributed::RPCClient *rpc_client = - 
distributed::RPCClient::GetInstance( - ctx.Attr("trainer_id")); + distributed::RPCClient::GetInstance(0); auto *recv_var = scope.FindVar(rpc_ctx.var_name); @@ -80,12 +78,13 @@ void ParameterRecv::operator()(const RpcContext &rpc_ctx, size_t output_offset = 0; framework::Tensor *recv_tensor = recv_var->GetMutable(); + auto dev_ctx = paddle::platform::CPUDeviceContext(); for (auto *in : recved_tensors) { auto in_stride = framework::stride_numel(in->dims()); auto out_stride = framework::stride_numel(recv_tensor->dims()); StridedNumelCopyWithAxis( - ctx.device_context(), 0, recv_tensor->data() + output_offset, - out_stride, in->data(), in_stride, in_stride[0]); + dev_ctx, 0, recv_tensor->data() + output_offset, out_stride, + in->data(), in_stride, in_stride[0]); output_offset += in_stride[0]; } } diff --git a/paddle/fluid/operators/distributed/parameter_recv.h b/paddle/fluid/operators/distributed/parameter_recv.h index e25594024a..e955fca725 100644 --- a/paddle/fluid/operators/distributed/parameter_recv.h +++ b/paddle/fluid/operators/distributed/parameter_recv.h @@ -26,9 +26,7 @@ namespace distributed { template struct ParameterRecv { - void operator()(const RpcContext &rpc_ctx, - const framework::ExecutionContext &context, - const framework::Scope &scope); + void operator()(const RpcContext &rpc_ctx, const framework::Scope &scope); }; }; // namespace distributed diff --git a/paddle/fluid/operators/distributed/parameter_send.cc b/paddle/fluid/operators/distributed/parameter_send.cc index eaa1c3ae8e..3fe3be193a 100644 --- a/paddle/fluid/operators/distributed/parameter_send.cc +++ b/paddle/fluid/operators/distributed/parameter_send.cc @@ -39,7 +39,6 @@ using DDim = framework::DDim; template void ParameterSend::operator()(const RpcContext &rpc_ctx, - const framework::ExecutionContext &ctx, const framework::Scope &scope, bool sync) { framework::Scope *local_scope = scope.NewTmpScope(); @@ -47,8 +46,7 @@ void ParameterSend::operator()(const RpcContext &rpc_ctx, auto &cpu_ctx = *pool.Get(platform::CPUPlace()); distributed::RPCClient *rpc_client = - distributed::RPCClient::GetInstance( - ctx.Attr("trainer_id")); + distributed::RPCClient::GetInstance(0); auto *send_var = scope.FindVar(rpc_ctx.var_name); size_t out_num = rpc_ctx.splited_var_names.size(); @@ -105,7 +103,7 @@ void ParameterSend::operator()(const RpcContext &rpc_ctx, outs_rows_idx[out_idx].push_back(send_rows[i]); outs_dense_idx[out_idx].push_back(i); } - auto place = ctx.GetPlace(); + auto place = platform::CPUPlace(); for (size_t i = 0; i < outs_rows_idx.size(); ++i) { auto rows_idx = outs_rows_idx[i]; @@ -118,22 +116,25 @@ void ParameterSend::operator()(const RpcContext &rpc_ctx, for (auto idx : rows_idx) { outs[i]->mutable_rows()->push_back(idx - abs_sections[i]); } - auto dst = outs[i]->mutable_value()->mutable_data(ctx.GetPlace()); + auto dst = outs[i]->mutable_value()->mutable_data(place); for (size_t j = 0; j < rows_idx.size(); j++) { if (platform::is_cpu_place(place)) { memory::Copy( platform::CPUPlace(), dst + j * row_numel, platform::CPUPlace(), src + outs_dense_idx[i][j] * row_numel, sizeof(T) * row_numel); } else { -#ifdef PADDLE_WITH_CUDA - auto stream = ctx.cuda_device_context().stream(); - memory::Copy(platform::CUDAPlace(), dst + j * row_numel, - platform::CUDAPlace(), - src + outs_dense_idx[i][j] * row_numel, - sizeof(T) * row_numel, stream); -#else - PADDLE_THROW("Paddle is not compiled with GPU"); -#endif + PADDLE_THROW("do not support GPU now"); + /* + #ifdef PADDLE_WITH_CUDA + auto stream = 
ctx.cuda_device_context().stream(); + memory::Copy(platform::CUDAPlace(), dst + j * row_numel, + platform::CUDAPlace(), + src + outs_dense_idx[i][j] * row_numel, + sizeof(T) * row_numel, stream); + #else + PADDLE_THROW("Paddle is not compiled with GPU"); + #endif + */ } } } diff --git a/paddle/fluid/operators/distributed/parameter_send.h b/paddle/fluid/operators/distributed/parameter_send.h index 4500497163..9077f4a4fb 100644 --- a/paddle/fluid/operators/distributed/parameter_send.h +++ b/paddle/fluid/operators/distributed/parameter_send.h @@ -26,9 +26,8 @@ namespace distributed { template struct ParameterSend { - void operator()(const RpcContext &rpc_ctx, - const framework::ExecutionContext &context, - const framework::Scope &scope, bool sync); + void operator()(const RpcContext &rpc_ctx, const framework::Scope &scope, + bool sync); }; }; // namespace distributed diff --git a/paddle/fluid/operators/distributed_ops/recv_op.cc b/paddle/fluid/operators/distributed_ops/recv_op.cc index a4a5ab89a7..41701d3a3e 100644 --- a/paddle/fluid/operators/distributed_ops/recv_op.cc +++ b/paddle/fluid/operators/distributed_ops/recv_op.cc @@ -62,7 +62,7 @@ class RecvOp : public framework::OperatorBase { framework::ExecutionContext(*this, scope, *dev_ctx, ctx, nullptr); auto recv_functor = distributed::ParameterRecv(); auto rpc_ctx = distributed::RpcContext(outs[0], recv_varnames, epmap, {}); - recv_functor(rpc_ctx, exe_ctx, scope); + recv_functor(rpc_ctx, scope); } else { if (with_barrier) { std::vector rets; diff --git a/paddle/fluid/operators/distributed_ops/send_op.cc b/paddle/fluid/operators/distributed_ops/send_op.cc index 1823d89897..5585ad21ce 100644 --- a/paddle/fluid/operators/distributed_ops/send_op.cc +++ b/paddle/fluid/operators/distributed_ops/send_op.cc @@ -56,7 +56,7 @@ class SendOp : public framework::OperatorBase { auto send_functor = distributed::ParameterSend(); auto rpc_ctx = distributed::RpcContext(ins[0], send_varnames, epmap, height_sections); - send_functor(rpc_ctx, exe_ctx, scope, static_cast(sync_send)); + send_functor(rpc_ctx, scope, static_cast(sync_send)); } else { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); From c2cce6bafaabe8b2b32c42fc885c7e6a09586c8f Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Mon, 4 Mar 2019 13:20:34 +0800 Subject: [PATCH 52/98] simplify parameter send and recv --- paddle/fluid/operators/distributed/communicator.cc | 10 +++++----- paddle/fluid/operators/distributed_ops/recv_op.cc | 6 ------ paddle/fluid/operators/distributed_ops/send_op.cc | 6 ------ 3 files changed, 5 insertions(+), 17 deletions(-) diff --git a/paddle/fluid/operators/distributed/communicator.cc b/paddle/fluid/operators/distributed/communicator.cc index fb9ecfa808..bc0a57f344 100644 --- a/paddle/fluid/operators/distributed/communicator.cc +++ b/paddle/fluid/operators/distributed/communicator.cc @@ -74,9 +74,9 @@ void Communicator::SendThread() { merged_var_num++; } MergeVars(var_name, vars, send_scope_.get()); - auto send_functor = distributed::ParameterSend(); - // send_functor(var_name, send_varname_to_ctx_[var_name], exe_ctx, - // send_scope_, true); + // auto send_functor = distributed::ParameterSend(); + // send_functor(var_name, send_varname_to_ctx_[var_name], exe_ctx, + // send_scope_, true); } } @@ -85,8 +85,8 @@ void Communicator::RecvThread() { for (auto &iter : recv_varname_to_ctx_) { auto &var_name = iter.first; VLOG(3) << "recv var " << iter.first; - auto recv_functor = distributed::ParameterRecv(); - // recv_functor(var_name, iter.second, 
exe_ctx, recv_scope_); + // auto recv_functor = distributed::ParameterRecv(); + // recv_functor(var_name, iter.second, exe_ctx, recv_scope_); } } diff --git a/paddle/fluid/operators/distributed_ops/recv_op.cc b/paddle/fluid/operators/distributed_ops/recv_op.cc index 41701d3a3e..680b484d41 100644 --- a/paddle/fluid/operators/distributed_ops/recv_op.cc +++ b/paddle/fluid/operators/distributed_ops/recv_op.cc @@ -54,12 +54,6 @@ class RecvOp : public framework::OperatorBase { Attr>("recv_varnames"); if (recv_varnames.size() > 0) { - framework::RuntimeContext ctx(Inputs(), Outputs(), scope); - platform::DeviceContextPool &pool = - platform::DeviceContextPool::Instance(); - auto *dev_ctx = pool.Get(place); - auto exe_ctx = - framework::ExecutionContext(*this, scope, *dev_ctx, ctx, nullptr); auto recv_functor = distributed::ParameterRecv(); auto rpc_ctx = distributed::RpcContext(outs[0], recv_varnames, epmap, {}); recv_functor(rpc_ctx, scope); diff --git a/paddle/fluid/operators/distributed_ops/send_op.cc b/paddle/fluid/operators/distributed_ops/send_op.cc index 5585ad21ce..8b09cf86d7 100644 --- a/paddle/fluid/operators/distributed_ops/send_op.cc +++ b/paddle/fluid/operators/distributed_ops/send_op.cc @@ -47,12 +47,6 @@ class SendOp : public framework::OperatorBase { if (send_varnames.size() > 0) { PADDLE_ENFORCE_EQ(ins.size(), 1, ""); - framework::RuntimeContext ctx(Inputs(), Outputs(), scope); - platform::DeviceContextPool& pool = - platform::DeviceContextPool::Instance(); - auto* dev_ctx = pool.Get(place); - auto exe_ctx = - framework::ExecutionContext(*this, scope, *dev_ctx, ctx, nullptr); auto send_functor = distributed::ParameterSend(); auto rpc_ctx = distributed::RpcContext(ins[0], send_varnames, epmap, height_sections); From 50601501e52ce6bd0b34864dc2410e1a6083a3cd Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Mon, 4 Mar 2019 15:01:22 +0800 Subject: [PATCH 53/98] improve communicator --- .../operators/distributed/CMakeLists.txt | 2 +- .../operators/distributed/communicator.cc | 69 ++++++++++++------- .../operators/distributed/communicator.h | 16 ++++- .../fluid/operators/distributed/rpc_common.h | 8 +++ 4 files changed, 70 insertions(+), 25 deletions(-) diff --git a/paddle/fluid/operators/distributed/CMakeLists.txt b/paddle/fluid/operators/distributed/CMakeLists.txt index 22f44c4217..1301467fa7 100644 --- a/paddle/fluid/operators/distributed/CMakeLists.txt +++ b/paddle/fluid/operators/distributed/CMakeLists.txt @@ -54,7 +54,7 @@ cc_test(varhandle_test SRCS varhandle_test.cc DEPS profiler scope) cc_library(parameter_prefetch SRCS parameter_prefetch.cc DEPS sendrecvop_rpc memory) cc_library(parameter_send SRCS parameter_send.cc DEPS sendrecvop_rpc memory) cc_library(parameter_recv SRCS parameter_recv.cc DEPS sendrecvop_rpc memory) -cc_library(communicator SRCS communicator.cc DEPS scope selected_rows tensor variable_helper selected_rows_functor) +cc_library(communicator SRCS communicator.cc DEPS scope selected_rows tensor variable_helper selected_rows_functor simple_threadpool) if(WITH_GPU) cc_test(collective_server_test SRCS collective_server_test.cc DEPS sendrecvop_rpc executor ${RPC_DEPS} diff --git a/paddle/fluid/operators/distributed/communicator.cc b/paddle/fluid/operators/distributed/communicator.cc index bc0a57f344..403fcf4b16 100644 --- a/paddle/fluid/operators/distributed/communicator.cc +++ b/paddle/fluid/operators/distributed/communicator.cc @@ -25,9 +25,9 @@ namespace paddle { namespace operators { namespace distributed { -static void MergeVars(const std::string &var_name, - 
const std::vector> &vars, - Scope *scope) { +static inline void MergeVars(const std::string &var_name, + const std::vector> &vars, + Scope *scope) { PADDLE_ENFORCE(!vars.empty(), "should have value to merge!"); auto cpu_place = platform::CPUPlace(); auto &var0 = vars[0]; @@ -62,31 +62,53 @@ static void MergeVars(const std::string &var_name, } void Communicator::SendThread() { - for (auto &iter : send_varname_to_queue_) { - auto &var_name = iter.first; - VLOG(3) << "merge var " << var_name << " and send"; - auto &var_queue = iter.second; - std::vector> vars; - const size_t max_merge_var_num = 20; - size_t merged_var_num = 0; - while (var_queue->Size() > 0 && merged_var_num < max_merge_var_num) { - vars.push_back(var_queue->Pop()); - merged_var_num++; + while (running_) { + std::vector> task_futures; + task_futures.reserve(send_varname_to_ctx_.size()); + for (auto &iter : send_varname_to_queue_) { + auto send_task = [this, &iter] { + auto &var_name = iter.first; + VLOG(3) << "merge var " << var_name << " and send"; + auto &var_queue = iter.second; + std::vector> vars; + const size_t max_merge_var_num = 20; + size_t merged_var_num = 0; + while (var_queue->Size() > 0 && merged_var_num < max_merge_var_num) { + vars.push_back(var_queue->Pop()); + merged_var_num++; + } + MergeVars(var_name, vars, send_scope_.get()); + auto send_functor = distributed::ParameterSend(); + auto &ctx = send_varname_to_ctx_.at(var_name); + send_functor(ctx, *send_scope_, true); + }; + task_futures.emplace_back( + send_threadpool_->enqueue(std::move(send_task))); + } + for (auto &task_f : task_futures) { + task_f.wait(); } - MergeVars(var_name, vars, send_scope_.get()); - // auto send_functor = distributed::ParameterSend(); - // send_functor(var_name, send_varname_to_ctx_[var_name], exe_ctx, - // send_scope_, true); } } void Communicator::RecvThread() { - // parallel run recv graph - for (auto &iter : recv_varname_to_ctx_) { - auto &var_name = iter.first; - VLOG(3) << "recv var " << iter.first; - // auto recv_functor = distributed::ParameterRecv(); - // recv_functor(var_name, iter.second, exe_ctx, recv_scope_); + while (running_) { + // parallel run recv graph + std::vector> task_futures; + task_futures.reserve(recv_varname_to_ctx_.size()); + for (auto &iter : recv_varname_to_ctx_) { + auto recv_task = [this, &iter] { + auto &var_name = iter.first; + VLOG(3) << "recv var " << var_name; + auto recv_functor = distributed::ParameterRecv(); + recv_functor(iter.second, *recv_scope_); + }; + task_futures.emplace_back( + recv_threadpool_->enqueue(std::move(recv_task))); + } + for (auto &task : task_futures) { + task.wait(); + } } } @@ -101,6 +123,7 @@ void Communicator::Send(const std::string &var_name, } void Communicator::Start() { + running_ = true; // start send and recv thread send_thread_.reset( new std::thread(std::bind(&Communicator::SendThread, this))); diff --git a/paddle/fluid/operators/distributed/communicator.h b/paddle/fluid/operators/distributed/communicator.h index 614d6ade81..ffdfa38b12 100644 --- a/paddle/fluid/operators/distributed/communicator.h +++ b/paddle/fluid/operators/distributed/communicator.h @@ -19,6 +19,8 @@ limitations under the License. 
*/ #include #include +#include + #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/variable.h" #include "paddle/fluid/operators/distributed/rpc_common.h" @@ -100,9 +102,18 @@ class Communicator { send_varname_to_queue_[iter.first] = std::make_shared>>(10); } + // TODO(qiao): default 5, need to config + send_threadpool_.reset(new ::ThreadPool(5)); + recv_threadpool_.reset(new ::ThreadPool(5)); } - ~Communicator() {} + ~Communicator() { + VLOG(3) << "~Communicator"; + running_ = false; + send_thread_->join(); + recv_thread_->join(); + VLOG(3) << "~Communicator done"; + } void Start(); @@ -113,6 +124,7 @@ class Communicator { void SendThread(); void RecvThread(); + bool running_ = false; std::unordered_map>>> send_varname_to_queue_; @@ -122,6 +134,8 @@ class Communicator { std::unique_ptr recv_thread_; Scope* recv_scope_; // should be global scope std::unique_ptr send_scope_; // an independent scope + std::unique_ptr<::ThreadPool> send_threadpool_{nullptr}; + std::unique_ptr<::ThreadPool> recv_threadpool_{nullptr}; }; } // namespace distributed diff --git a/paddle/fluid/operators/distributed/rpc_common.h b/paddle/fluid/operators/distributed/rpc_common.h index 7dede07b5a..39eb2d078c 100644 --- a/paddle/fluid/operators/distributed/rpc_common.h +++ b/paddle/fluid/operators/distributed/rpc_common.h @@ -29,6 +29,14 @@ struct RpcContext { splited_var_names(names), epmap(emap), height_sections(sections) {} + + RpcContext(const RpcContext& ctx) { + var_name = ctx.var_name; + splited_var_names = ctx.splited_var_names; + epmap = ctx.epmap; + height_sections = ctx.height_sections; + } + std::string var_name; std::vector splited_var_names; std::vector epmap; From 13e8b5bf8962eea9aafe0e6c32f761e386767cea Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Mon, 4 Mar 2019 15:31:56 +0800 Subject: [PATCH 54/98] clear gradient before merge --- paddle/fluid/operators/distributed/communicator.cc | 3 +++ 1 file changed, 3 insertions(+) diff --git a/paddle/fluid/operators/distributed/communicator.cc b/paddle/fluid/operators/distributed/communicator.cc index 403fcf4b16..a88b764474 100644 --- a/paddle/fluid/operators/distributed/communicator.cc +++ b/paddle/fluid/operators/distributed/communicator.cc @@ -47,6 +47,8 @@ static inline void MergeVars(const std::string &var_name, } } else if (var0->IsType()) { auto *out_slr = out_var->GetMutable(); + out_slr->mutable_rows()->clear(); + out_slr->mutable_value()->mutable_data({{}}, cpu_place); std::vector inputs; inputs.reserve(vars.size()); for (auto &var : vars) { @@ -71,6 +73,7 @@ void Communicator::SendThread() { VLOG(3) << "merge var " << var_name << " and send"; auto &var_queue = iter.second; std::vector> vars; + // TODO(qiao): need to be configurable const size_t max_merge_var_num = 20; size_t merged_var_num = 0; while (var_queue->Size() > 0 && merged_var_num < max_merge_var_num) { From 8744f9a083719626c56190672b66eb7ac24d32be Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Mon, 4 Mar 2019 22:54:26 +0800 Subject: [PATCH 55/98] fix parallel executor async mode --- paddle/fluid/framework/parallel_executor.cc | 10 ++++++++-- paddle/fluid/framework/parallel_executor.h | 3 ++- paddle/fluid/pybind/pybind.cc | 2 +- 3 files changed, 11 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index c133772e6e..ae7cd800ad 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -188,7 +188,7 @@ 
ParallelExecutor::ParallelExecutor( const std::string &loss_var_name, Scope *scope, const std::vector &local_scopes, const ExecutionStrategy &exec_strategy, const BuildStrategy &build_strategy, - std::vector graphs) + ir::Graph *graph) : member_(new ParallelExecutorPrivate(places)) { member_->global_scope_ = scope; member_->use_cuda_ = exec_strategy.use_cuda_; @@ -218,12 +218,18 @@ ParallelExecutor::ParallelExecutor( } } + std::vector graphs; if (build_strategy.async_mode_) { PADDLE_ENFORCE(!member_->use_cuda_, "gpu mode does not support async_mode_ now!"); + graphs.push_back(graph); + for (int i = 1; i < places.size(); ++i) { + auto *tmp_graph = new ir::Graph(graph->OriginProgram()); + async_graphs_.emplace_back(tmp_graph); + graphs.push_back(tmp_graph); + } } - ir::Graph *graph = graphs[0]; std::unique_ptr temp_owned_graph(graph); // FIXME(Yancey1989): parallel graph mode get better performance diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h index 0e05b2a460..987f715066 100644 --- a/paddle/fluid/framework/parallel_executor.h +++ b/paddle/fluid/framework/parallel_executor.h @@ -50,7 +50,7 @@ class ParallelExecutor { const std::vector &local_scopes, const ExecutionStrategy &exec_strategy, const BuildStrategy &build_strategy, - std::vector graphs); + ir::Graph *graph); ~ParallelExecutor(); @@ -76,6 +76,7 @@ class ParallelExecutor { const BuildStrategy &build_strategy) const; ParallelExecutorPrivate *member_; + std::vector> async_graphs_; #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) std::unique_ptr local_nccl_id_; #endif diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 6d1fc0be23..69cfe280c6 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -1271,7 +1271,7 @@ All parameter, weight, gradient are variables in Paddle. pe.def(py::init &, const std::unordered_set &, const std::string &, Scope *, std::vector &, const ExecutionStrategy &, - const BuildStrategy &, std::vector>()) + const BuildStrategy &, ir::Graph *>()) // NOTE: even we return a vec* to Python use reference policy. // We still cannot get local_scope from this vector, since the element // of vec will be freed by Python GC. 
We can only return Scope* From 8c38aca95401324a44a0aab8e017cae26a179b65 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Tue, 5 Mar 2019 16:49:52 +0800 Subject: [PATCH 56/98] tmp commit --- paddle/fluid/framework/details/CMakeLists.txt | 2 +- .../details/async_ssa_graph_executor.cc | 38 +++++++++++++++++++ .../operators/distributed/communicator.h | 36 +++++++++++++++--- 3 files changed, 69 insertions(+), 7 deletions(-) diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index b39673e229..88e7dd3f88 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -82,7 +82,7 @@ cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS cc_library(parallel_ssa_graph_executor SRCS parallel_ssa_graph_executor.cc DEPS threaded_ssa_graph_executor) -cc_library(async_ssa_graph_executor SRCS async_ssa_graph_executor.cc DEPS threaded_ssa_graph_executor) +cc_library(async_ssa_graph_executor SRCS async_ssa_graph_executor.cc DEPS threaded_ssa_graph_executor communicator) cc_test(broadcast_op_test SRCS broadcast_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory device_context broadcast_op_handle) diff --git a/paddle/fluid/framework/details/async_ssa_graph_executor.cc b/paddle/fluid/framework/details/async_ssa_graph_executor.cc index 69f770afee..43391804c5 100644 --- a/paddle/fluid/framework/details/async_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/async_ssa_graph_executor.cc @@ -15,6 +15,7 @@ #include "paddle/fluid/framework/details/async_ssa_graph_executor.h" #include "paddle/fluid/framework/variable_helper.h" +#include "paddle/fluid/operators/distributed/communicator.h" namespace paddle { namespace framework { @@ -39,6 +40,43 @@ inline void NewTempScopeAndInitVars(const std::vector &var_infos, } } +// get RpcContext and remote send and recv op +void ProcessGraph(std::vector graphs, Scope *scope) { + using RpcCtxMap = operators::distributed::RpcCtxMap; + RpcCtxMap send_varname_to_ctx; + RpcCtxMap recv_varname_to_ctx; + for (auto i = 0; i < graphs.size(); ++i) { + for (auto &node : graphs[i]->Nodes()) { + if (node->IsOp()) { + if (node->Op()->Type() == "send") { + auto send_var_name = node->Op()->Input("X")[0]; + auto send_varnames = boost::get>( + node->Op()->GetNullableAttr("send_varnames")); + auto epmap = boost::get>( + node->Op()->GetNullableAttr("epmap")); + auto height_section = boost::get>( + node->Op()->GetNullableAttr("sections")); + send_varname_to_ctx[send_var_name] = + operators::distributed::RpcContext(send_var_name, send_varnames, + epmap, height_section); + } else if (node->Op()->Type() == "recv") { + auto recv_var_name = node->Op()->Input("X")[0]; + auto recv_varnames = boost::get>( + node->Op()->GetNullableAttr("recv_varnames")); + auto epmap = boost::get>( + node->Op()->GetNullableAttr("epmap")); + recv_varname_to_ctx[recv_var_name] = + operators::distributed::RpcContext(recv_var_name, recv_varnames, + epmap, {}); + } + } + } + } + // init communicator here + operators::distributed::Communicator::Init(send_varname_to_ctx, + recv_varname_to_ctx, scope); +} + AsyncSSAGraphExecutor::AsyncSSAGraphExecutor( const ExecutionStrategy &strategy, const std::vector &local_scopes, const std::vector &places, std::vector graphs) diff --git a/paddle/fluid/operators/distributed/communicator.h b/paddle/fluid/operators/distributed/communicator.h index ffdfa38b12..44e2aa3be7 100644 --- a/paddle/fluid/operators/distributed/communicator.h +++ 
b/paddle/fluid/operators/distributed/communicator.h @@ -87,12 +87,12 @@ class BlockingQueue { std::condition_variable send_cv_; }; +using RpcCtxMap = std::unordered_map; + class Communicator { public: - Communicator( - const std::unordered_map& send_varname_to_ctx, - const std::unordered_map& recv_varname_to_ctx, - Scope* recv_scope) + Communicator(const RpcCtxMap& send_varname_to_ctx, + const RpcCtxMap& recv_varname_to_ctx, Scope* recv_scope) : send_varname_to_ctx_(send_varname_to_ctx), recv_varname_to_ctx_(recv_varname_to_ctx), recv_scope_(recv_scope) { @@ -128,14 +128,38 @@ class Communicator { std::unordered_map>>> send_varname_to_queue_; - std::unordered_map send_varname_to_ctx_; - std::unordered_map recv_varname_to_ctx_; + RpcCtxMap send_varname_to_ctx_; + RpcCtxMap recv_varname_to_ctx_; std::unique_ptr send_thread_; std::unique_ptr recv_thread_; Scope* recv_scope_; // should be global scope std::unique_ptr send_scope_; // an independent scope std::unique_ptr<::ThreadPool> send_threadpool_{nullptr}; std::unique_ptr<::ThreadPool> recv_threadpool_{nullptr}; + + // the following code is for initialize the commnunicator + public: + static void Init(const RpcCtxMap& send_varname_to_ctx, + const RpcCtxMap& recv_varname_to_ctx, Scope* recv_scope) { + InitImpl(send_varname_to_ctx, recv_varname_to_ctx, recv_scope); + } + + static Communicator* GetInstance() { return communicator_.get(); } + + private: + // Init is called by GetInstance. + static void InitImpl(const RpcCtxMap& send_varname_to_ctx, + const RpcCtxMap& recv_varname_to_ctx, + Scope* recv_scope) { + if (communicator_ == nullptr) { + communicator_.reset(new Communicator(send_varname_to_ctx, + recv_varname_to_ctx, recv_scope)); + } + } + + private: + static std::once_flag init_flag_; + static std::unique_ptr communicator_; }; } // namespace distributed From e92ad8a2097ecffdfa412306b60dba4df68b8541 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Tue, 5 Mar 2019 16:56:56 +0800 Subject: [PATCH 57/98] optimize test_async_ssa_graph_executor_mnist test=develop --- .../tests/unittests/test_async_ssa_graph_executor_mnist.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_async_ssa_graph_executor_mnist.py b/python/paddle/fluid/tests/unittests/test_async_ssa_graph_executor_mnist.py index 4fbda407f1..5e77ce9b81 100644 --- a/python/paddle/fluid/tests/unittests/test_async_ssa_graph_executor_mnist.py +++ b/python/paddle/fluid/tests/unittests/test_async_ssa_graph_executor_mnist.py @@ -178,8 +178,8 @@ class TestAsyncSSAGraphExecutor(unittest.TestCase): main_program=fluid.Program(), startup_program=fluid.Program()): test() - assert int(step_list[0] / 2) == int(step_list[1]) - assert int(step_list[1] / 2) == int(step_list[2]) + assert abs(int(step_list[0] / 2) - int(step_list[1])) < 5 + assert abs(int(step_list[1] / 2) - int(step_list[2])) < 5 if __name__ == "__main__": From f28c25845330cf47250f7f6cba67f6f4cdaae97d Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Tue, 5 Mar 2019 17:10:17 +0800 Subject: [PATCH 58/98] code clean test=develop --- .../framework/details/multi_devices_graph_pass.cc | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc index 109037c3e6..c8e9c5d687 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc @@ -167,10 +167,6 @@ 
std::unique_ptr MultiDevSSAGraphBuilderBase::ApplyImpl( bool is_forwarding = true; bool insert_collection_ops = NeedCollectiveOps(); - if (strategy_.async_mode_) { - // async mode did not need to merge gradient - insert_collection_ops = false; - } for (ir::Node *node : sorted_ops) { if (DealWithSpecialOp(&result, node)) { @@ -749,10 +745,6 @@ bool DistSSAGraphBuilder::DealWithSpecialOp(ir::Graph *result, ir::Node *node) const { bool insert_op = false; if (OpHaveRole(*node, OpRole::kRPC)) { - // in async_mode, each graph will send it's own gradient. - if (strategy_.async_mode_ && node->Op()->Type() == "send") { - return false; - } int op_dev_id = CreateRPCOp(result, node); PADDLE_ENFORCE(op_dev_id != -1, "Can not schedule the RPC operator to the right place."); @@ -768,11 +760,6 @@ bool DistSSAGraphBuilder::DealWithSpecialOp(ir::Graph *result, insert_op = true; need_broadcast_var_ = true; } else if (OpHaveRole(*node, OpRole::kDist)) { - // in async_mode, each graph will send it's own gradient, do not need to - // merge gradient. - if (strategy_.async_mode_ && node->Op()->Type() != "concat") { - return false; - } int op_dev_id = CreateDistTrainOp(result, node); if (node->Op()->Type() == "concat") { // the input(block of parameter) of concat is on different device, @@ -844,7 +831,7 @@ int DistSSAGraphBuilder::CreateRPCOp(ir::Graph *result, ir::Node *node) const { } auto recv_param_grad = boost::get>( node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName())); - if (recv_param_grad.size() == 2U && !strategy_.async_mode_) { + if (recv_param_grad.size() == 2U) { op_dev_id = GetVarDeviceID(recv_param_grad[1]); VLOG(10) << "recv param " << recv_param_grad[0] << " get grad place: " << recv_param_grad[1] From c09477b05755da2c61862b37c82fc4031bbf04b1 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Tue, 5 Mar 2019 23:13:00 +0800 Subject: [PATCH 59/98] revert change --- python/paddle/fluid/parallel_executor.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py index 84beb37c1d..2ebaab3b10 100644 --- a/python/paddle/fluid/parallel_executor.py +++ b/python/paddle/fluid/parallel_executor.py @@ -104,6 +104,7 @@ class ParallelExecutor(object): main_program = main_program if main_program is not None \ else framework.default_main_program() + self._compiled_program = compiler.CompiledProgram(main_program) self._compiled_program.with_data_parallel( loss_name=loss_name, From 4e218dabc5cb24c753186503389fd533087bae81 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Tue, 5 Mar 2019 23:29:09 +0800 Subject: [PATCH 60/98] code format test=develop --- paddle/fluid/framework/details/async_ssa_graph_executor.cc | 3 +++ paddle/fluid/framework/details/build_strategy.cc | 1 + paddle/fluid/framework/details/build_strategy.h | 1 + paddle/fluid/framework/details/exception_holder.h | 1 + paddle/fluid/framework/details/multi_devices_graph_pass.cc | 3 +++ paddle/fluid/framework/details/multi_devices_graph_pass.h | 3 +++ paddle/fluid/framework/details/threaded_ssa_graph_executor.h | 2 ++ paddle/fluid/framework/parallel_executor.h | 1 + paddle/fluid/operators/reader/blocking_queue.h | 1 + paddle/fluid/operators/reader/lod_tensor_blocking_queue.h | 1 + 10 files changed, 17 insertions(+) diff --git a/paddle/fluid/framework/details/async_ssa_graph_executor.cc b/paddle/fluid/framework/details/async_ssa_graph_executor.cc index 69f770afee..83fd8a50c3 100644 --- a/paddle/fluid/framework/details/async_ssa_graph_executor.cc +++ 
b/paddle/fluid/framework/details/async_ssa_graph_executor.cc @@ -14,6 +14,9 @@ #include "paddle/fluid/framework/details/async_ssa_graph_executor.h" +#include +#include + #include "paddle/fluid/framework/variable_helper.h" namespace paddle { diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc index 4c5384af61..c073f10d8c 100644 --- a/paddle/fluid/framework/details/build_strategy.cc +++ b/paddle/fluid/framework/details/build_strategy.cc @@ -16,6 +16,7 @@ limitations under the License. */ #include #include +#include #include "paddle/fluid/framework/details/memory_optimize_helper.h" #include "paddle/fluid/framework/details/multi_devices_graph_pass.h" diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h index 8cb57ad674..9c807560f5 100644 --- a/paddle/fluid/framework/details/build_strategy.h +++ b/paddle/fluid/framework/details/build_strategy.h @@ -14,6 +14,7 @@ #pragma once +#include #include #include diff --git a/paddle/fluid/framework/details/exception_holder.h b/paddle/fluid/framework/details/exception_holder.h index 77ca03b86e..f8fd395bd9 100644 --- a/paddle/fluid/framework/details/exception_holder.h +++ b/paddle/fluid/framework/details/exception_holder.h @@ -14,6 +14,7 @@ #pragma once +#include #include #include "glog/logging.h" diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc index c8e9c5d687..8e4f049721 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc @@ -13,7 +13,10 @@ // limitations under the License. #include #include +#include #include +#include +#include #include #include diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.h b/paddle/fluid/framework/details/multi_devices_graph_pass.h index 377ba50fcc..f7ec9d28de 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.h +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.h @@ -14,7 +14,10 @@ #pragma once +#include #include +#include +#include #include #include diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h index 923e940884..778bbab505 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h @@ -16,7 +16,9 @@ #include #include +#include #include +#include #include #include #include diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h index 987f715066..9a9f4e08fe 100644 --- a/paddle/fluid/framework/parallel_executor.h +++ b/paddle/fluid/framework/parallel_executor.h @@ -14,6 +14,7 @@ limitations under the License. 
*/ #pragma once +#include #include #include #include diff --git a/paddle/fluid/operators/reader/blocking_queue.h b/paddle/fluid/operators/reader/blocking_queue.h index fe3f2f4031..2b7cb16bc7 100644 --- a/paddle/fluid/operators/reader/blocking_queue.h +++ b/paddle/fluid/operators/reader/blocking_queue.h @@ -16,6 +16,7 @@ #include // NOLINT #include +#include #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/operators/reader/lod_tensor_blocking_queue.h b/paddle/fluid/operators/reader/lod_tensor_blocking_queue.h index eeba330d66..be044085f1 100644 --- a/paddle/fluid/operators/reader/lod_tensor_blocking_queue.h +++ b/paddle/fluid/operators/reader/lod_tensor_blocking_queue.h @@ -15,6 +15,7 @@ #pragma once #include +#include #include #include "paddle/fluid/framework/ddim.h" From 5e8de51409e52b9bc0210f32cf0759b5925995d4 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Wed, 6 Mar 2019 09:31:34 +0800 Subject: [PATCH 61/98] code format test=develop --- paddle/fluid/framework/details/async_ssa_graph_executor.cc | 3 --- paddle/fluid/framework/details/async_ssa_graph_executor.h | 2 ++ paddle/fluid/framework/parallel_executor.cc | 1 + paddle/fluid/framework/reader.h | 1 + 4 files changed, 4 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/framework/details/async_ssa_graph_executor.cc b/paddle/fluid/framework/details/async_ssa_graph_executor.cc index 83fd8a50c3..69f770afee 100644 --- a/paddle/fluid/framework/details/async_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/async_ssa_graph_executor.cc @@ -14,9 +14,6 @@ #include "paddle/fluid/framework/details/async_ssa_graph_executor.h" -#include -#include - #include "paddle/fluid/framework/variable_helper.h" namespace paddle { diff --git a/paddle/fluid/framework/details/async_ssa_graph_executor.h b/paddle/fluid/framework/details/async_ssa_graph_executor.h index 7d7296772d..6aaf8f9a16 100644 --- a/paddle/fluid/framework/details/async_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/async_ssa_graph_executor.h @@ -14,7 +14,9 @@ #pragma once +#include #include +#include #include #include "ThreadPool.h" diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index ae7cd800ad..6c5f246f95 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -16,6 +16,7 @@ limitations under the License. 
*/ #include #include #include +#include #include #include "paddle/fluid/framework/ir/graph_helper.h" diff --git a/paddle/fluid/framework/reader.h b/paddle/fluid/framework/reader.h index 6cf0ec2937..4b400e72a4 100644 --- a/paddle/fluid/framework/reader.h +++ b/paddle/fluid/framework/reader.h @@ -16,6 +16,7 @@ #include #include +#include #include #include "paddle/fluid/framework/ddim.h" From 255b36dad2a3500a108977cee2b5eb041b086d2b Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Wed, 6 Mar 2019 14:39:14 +0800 Subject: [PATCH 62/98] can run --- .../details/async_ssa_graph_executor.cc | 13 +++++-- .../operators/distributed/CMakeLists.txt | 2 +- .../operators/distributed/communicator.cc | 6 ++++ .../operators/distributed/communicator.h | 2 +- .../fluid/operators/distributed/rpc_common.h | 36 ++++++++++++++++--- .../operators/distributed_ops/CMakeLists.txt | 4 +-- .../operators/distributed_ops/send_op.cc | 11 +++--- 7 files changed, 60 insertions(+), 14 deletions(-) diff --git a/paddle/fluid/framework/details/async_ssa_graph_executor.cc b/paddle/fluid/framework/details/async_ssa_graph_executor.cc index 43391804c5..18fba0d19b 100644 --- a/paddle/fluid/framework/details/async_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/async_ssa_graph_executor.cc @@ -59,6 +59,8 @@ void ProcessGraph(std::vector graphs, Scope *scope) { send_varname_to_ctx[send_var_name] = operators::distributed::RpcContext(send_var_name, send_varnames, epmap, height_section); + VLOG(3) << "find and init an send op: " + << send_varname_to_ctx[send_var_name]; } else if (node->Op()->Type() == "recv") { auto recv_var_name = node->Op()->Input("X")[0]; auto recv_varnames = boost::get>( @@ -68,13 +70,19 @@ void ProcessGraph(std::vector graphs, Scope *scope) { recv_varname_to_ctx[recv_var_name] = operators::distributed::RpcContext(recv_var_name, recv_varnames, epmap, {}); + graphs[i]->RemoveNode(node); + VLOG(3) << "find and remove an recv op: " + << recv_varname_to_ctx[recv_var_name]; } } } } // init communicator here - operators::distributed::Communicator::Init(send_varname_to_ctx, - recv_varname_to_ctx, scope); + if (send_varname_to_ctx.size() > 0) { + VLOG(3) << "this is distribute mode, will use "; + operators::distributed::Communicator::Init(send_varname_to_ctx, + recv_varname_to_ctx, scope); + } } AsyncSSAGraphExecutor::AsyncSSAGraphExecutor( @@ -110,6 +118,7 @@ AsyncSSAGraphExecutor::AsyncSSAGraphExecutor( for (auto *scope : local_scopes_) { NewTempScopeAndInitVars(var_infos_, scope); } + ProcessGraph(graphs_, local_scopes_[0]); } void AsyncSSAGraphExecutor::StartOffPythonTrainLoop() { diff --git a/paddle/fluid/operators/distributed/CMakeLists.txt b/paddle/fluid/operators/distributed/CMakeLists.txt index 1301467fa7..6a269a4fbe 100644 --- a/paddle/fluid/operators/distributed/CMakeLists.txt +++ b/paddle/fluid/operators/distributed/CMakeLists.txt @@ -30,7 +30,7 @@ if(WITH_GRPC) else() set(BRPC_SRCS brpc/brpc_client.cc brpc/brpc_server.cc brpc/brpc_sendrecvop_utils.cc brpc/brpc_variable_response.cc brpc/brpc_rdma_pool.cc) - set_source_files_properties(${BRPC_SRCS} parameter_prefetch.cc parameter_send.cc parameter_recv.cc rpc_server_test.cc brpc/brpc_serde_test.cc collective_server.cc collective_server_test.cc collective_client.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + set_source_files_properties(${BRPC_SRCS} parameter_prefetch.cc parameter_send.cc parameter_recv.cc communicator.cc rpc_server_test.cc brpc/brpc_serde_test.cc collective_server.cc collective_server_test.cc collective_client.cc PROPERTIES 
COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set(BRPC_DEPS brpc ssl crypto protobuf leveldb snappystream snappy zlib) diff --git a/paddle/fluid/operators/distributed/communicator.cc b/paddle/fluid/operators/distributed/communicator.cc index a88b764474..e800cd5f41 100644 --- a/paddle/fluid/operators/distributed/communicator.cc +++ b/paddle/fluid/operators/distributed/communicator.cc @@ -63,6 +63,9 @@ static inline void MergeVars(const std::string &var_name, } } +std::unique_ptr Communicator::communicator_(nullptr); +std::once_flag Communicator::init_flag_; + void Communicator::SendThread() { while (running_) { std::vector> task_futures; @@ -117,6 +120,7 @@ void Communicator::RecvThread() { void Communicator::Send(const std::string &var_name, const framework::Scope &scope) { + VLOG(3) << "communicator send " << var_name; // push var into send queue by var_name auto *grad_var = scope.FindVar(var_name); PADDLE_ENFORCE(grad_var->IsInitialized(), "grad var should be inited"); @@ -125,6 +129,8 @@ void Communicator::Send(const std::string &var_name, send_varname_to_queue_[var_name]->Push(tmp_grad_var); } +Communicator *Communicator::GetInstance() { return communicator_.get(); } + void Communicator::Start() { running_ = true; // start send and recv thread diff --git a/paddle/fluid/operators/distributed/communicator.h b/paddle/fluid/operators/distributed/communicator.h index 44e2aa3be7..bc753bb75e 100644 --- a/paddle/fluid/operators/distributed/communicator.h +++ b/paddle/fluid/operators/distributed/communicator.h @@ -144,7 +144,7 @@ class Communicator { InitImpl(send_varname_to_ctx, recv_varname_to_ctx, recv_scope); } - static Communicator* GetInstance() { return communicator_.get(); } + static Communicator* GetInstance(); private: // Init is called by GetInstance. diff --git a/paddle/fluid/operators/distributed/rpc_common.h b/paddle/fluid/operators/distributed/rpc_common.h index 39eb2d078c..3de89c2ae8 100644 --- a/paddle/fluid/operators/distributed/rpc_common.h +++ b/paddle/fluid/operators/distributed/rpc_common.h @@ -14,6 +14,7 @@ limitations under the License. 
*/ #pragma once +#include #include #include @@ -22,15 +23,17 @@ namespace operators { namespace distributed { struct RpcContext { - RpcContext(const std::string& name, const std::vector& names, - const std::vector& emap, - const std::vector& sections) + RpcContext() = default; + + RpcContext(const std::string &name, const std::vector &names, + const std::vector &emap, + const std::vector §ions) : var_name(name), splited_var_names(names), epmap(emap), height_sections(sections) {} - RpcContext(const RpcContext& ctx) { + RpcContext(const RpcContext &ctx) { var_name = ctx.var_name; splited_var_names = ctx.splited_var_names; epmap = ctx.epmap; @@ -43,6 +46,31 @@ struct RpcContext { std::vector height_sections; }; +inline std::ostream &operator<<(std::ostream &os, const RpcContext &rpc_ctx) { + os << "{"; + os << "var_name: " << rpc_ctx.var_name << "\n"; + + os << "splited_var_names: ["; + for (auto &name : rpc_ctx.splited_var_names) { + os << name << ", "; + } + os << "]\n"; + + os << "epmap: ["; + for (auto &ep : rpc_ctx.epmap) { + os << ep << ", "; + } + os << "]\n"; + + os << "height_sections: ["; + for (auto §ion : rpc_ctx.height_sections) { + os << section << ", "; + } + os << "]\n"; + os << "}"; + return os; +} + } // namespace distributed } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/distributed_ops/CMakeLists.txt b/paddle/fluid/operators/distributed_ops/CMakeLists.txt index 3bcfc532e8..a1ef1af39f 100644 --- a/paddle/fluid/operators/distributed_ops/CMakeLists.txt +++ b/paddle/fluid/operators/distributed_ops/CMakeLists.txt @@ -2,9 +2,9 @@ include(operators) set(DISTRIBUTE_DEPS "") if(WITH_GRPC) - set(DISTRIBUTE_DEPS sendrecvop_rpc parameter_send parameter_recv grpc++_unsecure grpc_unsecure gpr cares zlib protobuf node) + set(DISTRIBUTE_DEPS sendrecvop_rpc parameter_send parameter_recv communicator grpc++_unsecure grpc_unsecure gpr cares zlib protobuf node) else() - set(DISTRIBUTE_DEPS sendrecvop_rpc parameter_send parameter_recv brpc leveldb snappystream snappy protobuf ssl crypto zlib node) + set(DISTRIBUTE_DEPS sendrecvop_rpc parameter_send parameter_recv communicator brpc leveldb snappystream snappy protobuf ssl crypto zlib node) if(WITH_BRPC_RDMA) find_library(IBVERBS_LIBRARY NAMES ibverbs) ADD_LIBRARY(ibverbs SHARED IMPORTED GLOBAL) diff --git a/paddle/fluid/operators/distributed_ops/send_op.cc b/paddle/fluid/operators/distributed_ops/send_op.cc index 8b09cf86d7..347395b7cc 100644 --- a/paddle/fluid/operators/distributed_ops/send_op.cc +++ b/paddle/fluid/operators/distributed_ops/send_op.cc @@ -19,6 +19,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/distributed/communicator.h" #include "paddle/fluid/operators/distributed/distributed.h" #include "paddle/fluid/operators/distributed/parameter_send.h" #include "paddle/fluid/operators/distributed/rpc_common.h" @@ -47,10 +48,12 @@ class SendOp : public framework::OperatorBase { if (send_varnames.size() > 0) { PADDLE_ENFORCE_EQ(ins.size(), 1, ""); - auto send_functor = distributed::ParameterSend(); - auto rpc_ctx = distributed::RpcContext(ins[0], send_varnames, epmap, - height_sections); - send_functor(rpc_ctx, scope, static_cast(sync_send)); + // auto send_functor = distributed::ParameterSend(); + // auto rpc_ctx = distributed::RpcContext(ins[0], send_varnames, + // epmap, + // height_sections); + // send_functor(rpc_ctx, scope, static_cast(sync_send)); + distributed::Communicator::GetInstance()->Send(ins[0], scope); } else { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); From 7d5dc4ef06dcfce01b7489f92ccb18c7ef7e67b4 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Wed, 6 Mar 2019 15:47:20 +0800 Subject: [PATCH 63/98] fix cmake list --- paddle/fluid/operators/distributed/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/operators/distributed/CMakeLists.txt b/paddle/fluid/operators/distributed/CMakeLists.txt index 6a269a4fbe..750aac8dd0 100644 --- a/paddle/fluid/operators/distributed/CMakeLists.txt +++ b/paddle/fluid/operators/distributed/CMakeLists.txt @@ -54,7 +54,7 @@ cc_test(varhandle_test SRCS varhandle_test.cc DEPS profiler scope) cc_library(parameter_prefetch SRCS parameter_prefetch.cc DEPS sendrecvop_rpc memory) cc_library(parameter_send SRCS parameter_send.cc DEPS sendrecvop_rpc memory) cc_library(parameter_recv SRCS parameter_recv.cc DEPS sendrecvop_rpc memory) -cc_library(communicator SRCS communicator.cc DEPS scope selected_rows tensor variable_helper selected_rows_functor simple_threadpool) +cc_library(communicator SRCS communicator.cc DEPS scope selected_rows tensor variable_helper selected_rows_functor simple_threadpool parameter_send parameter_recv) if(WITH_GPU) cc_test(collective_server_test SRCS collective_server_test.cc DEPS sendrecvop_rpc executor ${RPC_DEPS} From a23f1ee85a0a08497fd372e28360e41a2818c14c Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Thu, 7 Mar 2019 09:46:40 +0800 Subject: [PATCH 64/98] optimize code --- .../details/async_ssa_graph_executor.cc | 21 +++++--- paddle/fluid/framework/parallel_executor.cc | 6 +-- .../operators/distributed/communicator.cc | 48 +++++++++++-------- .../operators/distributed/communicator.h | 6 +++ .../operators/distributed/variable_response.h | 6 ++- 5 files changed, 57 insertions(+), 30 deletions(-) diff --git a/paddle/fluid/framework/details/async_ssa_graph_executor.cc b/paddle/fluid/framework/details/async_ssa_graph_executor.cc index 18fba0d19b..3f4d9f6ca4 100644 --- a/paddle/fluid/framework/details/async_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/async_ssa_graph_executor.cc @@ -23,6 +23,7 @@ namespace details { inline void NewTempScopeAndInitVars(const std::vector &var_infos, Scope *scope) { + VLOG(3) << "NewTempScopeAndInitVars"; Scope &local_scope = scope->NewScope(); *scope->Var(details::kLocalExecScopeName)->GetMutable() = &local_scope; @@ -43,12 +44,15 @@ inline void NewTempScopeAndInitVars(const std::vector &var_infos, // get RpcContext and remote send 
and recv op void ProcessGraph(std::vector graphs, Scope *scope) { using RpcCtxMap = operators::distributed::RpcCtxMap; + VLOG(3) << "ProcessGraph"; RpcCtxMap send_varname_to_ctx; RpcCtxMap recv_varname_to_ctx; for (auto i = 0; i < graphs.size(); ++i) { for (auto &node : graphs[i]->Nodes()) { - if (node->IsOp()) { - if (node->Op()->Type() == "send") { + VLOG(3) << "node name " << node->Name(); + std::vector nodes_to_delete; + if (node && node->IsOp()) { + if (node->Name() == "send") { auto send_var_name = node->Op()->Input("X")[0]; auto send_varnames = boost::get>( node->Op()->GetNullableAttr("send_varnames")); @@ -61,8 +65,8 @@ void ProcessGraph(std::vector graphs, Scope *scope) { epmap, height_section); VLOG(3) << "find and init an send op: " << send_varname_to_ctx[send_var_name]; - } else if (node->Op()->Type() == "recv") { - auto recv_var_name = node->Op()->Input("X")[0]; + } else if (node->Name() == "recv") { + auto recv_var_name = node->Op()->Output("Out")[0]; auto recv_varnames = boost::get>( node->Op()->GetNullableAttr("recv_varnames")); auto epmap = boost::get>( @@ -70,18 +74,23 @@ void ProcessGraph(std::vector graphs, Scope *scope) { recv_varname_to_ctx[recv_var_name] = operators::distributed::RpcContext(recv_var_name, recv_varnames, epmap, {}); - graphs[i]->RemoveNode(node); + nodes_to_delete.push_back(node); VLOG(3) << "find and remove an recv op: " << recv_varname_to_ctx[recv_var_name]; } + VLOG(3) << "delete all recv ops"; + for (auto *node : nodes_to_delete) { + graphs[i]->RemoveNode(node); + } } } } // init communicator here if (send_varname_to_ctx.size() > 0) { - VLOG(3) << "this is distribute mode, will use "; + VLOG(3) << "this is distribute mode, will use communicator"; operators::distributed::Communicator::Init(send_varname_to_ctx, recv_varname_to_ctx, scope); + operators::distributed::Communicator::GetInstance()->Start(); } } diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 6c5f246f95..6c710abd7a 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -277,7 +277,7 @@ ParallelExecutor::ParallelExecutor( // ncclOp std::vector async_graphs(places.size()); #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) - if (build_strategy.async_mode_ && !build_strategy.is_distribution_) { + if (build_strategy.async_mode_) { VLOG(3) << "use local async mode"; temp_owned_graph = build_strategy.Apply(std::move(temp_owned_graph), {member_->places_[0]}, @@ -298,7 +298,7 @@ ParallelExecutor::ParallelExecutor( member_->nccl_ctxs_.get()); } #else - if (build_strategy.async_mode_ && !build_strategy.is_distribution_) { + if (build_strategy.async_mode_) { VLOG(3) << "use local async mode"; temp_owned_graph = build_strategy.Apply( std::move(temp_owned_graph), {member_->places_[0]}, loss_var_name, @@ -358,7 +358,7 @@ ParallelExecutor::ParallelExecutor( } } - if (build_strategy.async_mode_ && !build_strategy.is_distribution_) { + if (build_strategy.async_mode_) { VLOG(3) << "use AsyncSSAGraphExecutor"; member_->executor_.reset(new details::AsyncSSAGraphExecutor( exec_strategy, member_->local_scopes_, member_->places_, async_graphs)); diff --git a/paddle/fluid/operators/distributed/communicator.cc b/paddle/fluid/operators/distributed/communicator.cc index e800cd5f41..b2bb8fb403 100644 --- a/paddle/fluid/operators/distributed/communicator.cc +++ b/paddle/fluid/operators/distributed/communicator.cc @@ -14,6 +14,9 @@ limitations under the License. 
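// Note (not part of the original patch): taken together, the hunks above wire
// async training into the Communicator. ProcessGraph() walks every trainer
// graph, turns each send/recv op's attributes (send_varnames / recv_varnames,
// epmap, height sections) into an RpcContext, then calls Communicator::Init()
// followed by Communicator::GetInstance()->Start(); recv ops themselves are no
// longer executed by the graph. ParallelExecutor now takes the async build
// path whenever async_mode_ is set, instead of also requiring
// !is_distribution_.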
*/ #include "paddle/fluid/operators/distributed/communicator.h" +#include // NOLINT +#include // NOLINT + #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/framework/variable_helper.h" @@ -28,6 +31,7 @@ namespace distributed { static inline void MergeVars(const std::string &var_name, const std::vector> &vars, Scope *scope) { + VLOG(3) << "merge " << vars.size() << " vars " << var_name << " to one"; PADDLE_ENFORCE(!vars.empty(), "should have value to merge!"); auto cpu_place = platform::CPUPlace(); auto &var0 = vars[0]; @@ -67,29 +71,32 @@ std::unique_ptr Communicator::communicator_(nullptr); std::once_flag Communicator::init_flag_; void Communicator::SendThread() { + VLOG("SendThread start!"); while (running_) { std::vector> task_futures; task_futures.reserve(send_varname_to_ctx_.size()); for (auto &iter : send_varname_to_queue_) { - auto send_task = [this, &iter] { - auto &var_name = iter.first; - VLOG(3) << "merge var " << var_name << " and send"; - auto &var_queue = iter.second; - std::vector> vars; - // TODO(qiao): need to be configurable - const size_t max_merge_var_num = 20; - size_t merged_var_num = 0; - while (var_queue->Size() > 0 && merged_var_num < max_merge_var_num) { - vars.push_back(var_queue->Pop()); - merged_var_num++; - } - MergeVars(var_name, vars, send_scope_.get()); - auto send_functor = distributed::ParameterSend(); - auto &ctx = send_varname_to_ctx_.at(var_name); - send_functor(ctx, *send_scope_, true); - }; - task_futures.emplace_back( - send_threadpool_->enqueue(std::move(send_task))); + auto &var_name = iter.first; + auto &var_queue = iter.second; + if (var_queue->NotEmpty()) { // will block if queue is empty + auto send_task = [this, &var_name, &var_queue] { + VLOG(3) << "merge var " << var_name << " and send"; + std::vector> vars; + // TODO(qiao): need to be configurable + const size_t max_merge_var_num = 20; + size_t merged_var_num = 0; + while (var_queue->Size() > 0 && merged_var_num < max_merge_var_num) { + vars.push_back(var_queue->Pop()); + merged_var_num++; + } + MergeVars(var_name, vars, send_scope_.get()); + auto send_functor = distributed::ParameterSend(); + auto &ctx = send_varname_to_ctx_.at(var_name); + send_functor(ctx, *send_scope_, true); + }; + task_futures.emplace_back( + send_threadpool_->enqueue(std::move(send_task))); + } } for (auto &task_f : task_futures) { task_f.wait(); @@ -98,6 +105,7 @@ void Communicator::SendThread() { } void Communicator::RecvThread() { + VLOG(3) << "RecvThread start!"; while (running_) { // parallel run recv graph std::vector> task_futures; @@ -115,6 +123,8 @@ void Communicator::RecvThread() { for (auto &task : task_futures) { task.wait(); } + // TODO(qiao) need to be configuable + std::this_thread::sleep_for(std::chrono::milliseconds(200)); } } diff --git a/paddle/fluid/operators/distributed/communicator.h b/paddle/fluid/operators/distributed/communicator.h index bc753bb75e..c93ad02555 100644 --- a/paddle/fluid/operators/distributed/communicator.h +++ b/paddle/fluid/operators/distributed/communicator.h @@ -68,6 +68,12 @@ class BlockingQueue { return rc; } + bool NotEmpty() { + std::unique_lock lock(mutex_); + recv_cv_.wait(lock, [=] { return !queue_.empty(); }); + return true; + } + size_t Cap() const { std::lock_guard lock(mutex_); return capacity_; diff --git a/paddle/fluid/operators/distributed/variable_response.h b/paddle/fluid/operators/distributed/variable_response.h index 3ecb696069..edc12e2091 100644 --- 
a/paddle/fluid/operators/distributed/variable_response.h +++ b/paddle/fluid/operators/distributed/variable_response.h @@ -60,12 +60,14 @@ class VariableResponse { bool create_scope = false) : scope_(scope), dev_ctx_(dev_ctx), create_scope_(create_scope) { if (create_scope) { - local_scope_ = scope->NewTmpScope(); + local_scope_ = &scope->NewScope(); } } virtual ~VariableResponse() { - if (local_scope_) delete local_scope_; + if (local_scope_) { + scope_->DeleteScope(local_scope_); + } } int Parse(Source* source, const sendrecv::VariableMessage& meta) { From 446fdf95634df26dd18388a3834ff9a556764296 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Thu, 7 Mar 2019 10:00:27 +0800 Subject: [PATCH 65/98] fix compile problem --- paddle/fluid/framework/details/build_strategy.cc | 6 +++--- paddle/fluid/operators/distributed/communicator.cc | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc index 92b69334b8..22ce1b52c1 100644 --- a/paddle/fluid/framework/details/build_strategy.cc +++ b/paddle/fluid/framework/details/build_strategy.cc @@ -136,11 +136,11 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { void AppendMultiDevPass(const BuildStrategy &strategy) { ir::Pass *multi_devices_pass; - if (strategy_.is_distribution_) { + if (strategy_.async_mode_) { + multi_devices_pass = AppendPass("async_multi_devices_pass").get(); + } else if (strategy_.is_distribution_) { VLOG(3) << "multi device parameter server mode"; multi_devices_pass = AppendPass("dist_multi_devices_pass").get(); - } else if (strategy_.async_mode_) { - multi_devices_pass = AppendPass("async_multi_devices_pass").get(); } else { if (strategy.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce) { VLOG(3) << "multi devices collective mode with allreduce"; diff --git a/paddle/fluid/operators/distributed/communicator.cc b/paddle/fluid/operators/distributed/communicator.cc index b2bb8fb403..506c5fbebd 100644 --- a/paddle/fluid/operators/distributed/communicator.cc +++ b/paddle/fluid/operators/distributed/communicator.cc @@ -71,7 +71,7 @@ std::unique_ptr Communicator::communicator_(nullptr); std::once_flag Communicator::init_flag_; void Communicator::SendThread() { - VLOG("SendThread start!"); + VLOG(3) << "SendThread start!"; while (running_) { std::vector> task_futures; task_futures.reserve(send_varname_to_ctx_.size()); From fe6a8409241f69d52661e555fb02a1e1daca3cf7 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Thu, 7 Mar 2019 11:41:55 +0800 Subject: [PATCH 66/98] fix delete recv ops --- .../framework/details/async_ssa_graph_executor.cc | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/paddle/fluid/framework/details/async_ssa_graph_executor.cc b/paddle/fluid/framework/details/async_ssa_graph_executor.cc index 3f4d9f6ca4..e7cc14b0d1 100644 --- a/paddle/fluid/framework/details/async_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/async_ssa_graph_executor.cc @@ -48,9 +48,9 @@ void ProcessGraph(std::vector graphs, Scope *scope) { RpcCtxMap send_varname_to_ctx; RpcCtxMap recv_varname_to_ctx; for (auto i = 0; i < graphs.size(); ++i) { + std::vector nodes_to_delete; for (auto &node : graphs[i]->Nodes()) { VLOG(3) << "node name " << node->Name(); - std::vector nodes_to_delete; if (node && node->IsOp()) { if (node->Name() == "send") { auto send_var_name = node->Op()->Input("X")[0]; @@ -78,12 +78,12 @@ void ProcessGraph(std::vector graphs, Scope *scope) { VLOG(3) << "find and 
remove an recv op: " << recv_varname_to_ctx[recv_var_name]; } - VLOG(3) << "delete all recv ops"; - for (auto *node : nodes_to_delete) { - graphs[i]->RemoveNode(node); - } } } + VLOG(3) << "delete all recv ops"; + for (auto *node : nodes_to_delete) { + graphs[i]->RemoveNode(node); + } } // init communicator here if (send_varname_to_ctx.size() > 0) { From 3225e195912b1c467558bce192c6468d7f0e8540 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Thu, 7 Mar 2019 14:54:59 +0800 Subject: [PATCH 67/98] fix remove recv op --- .../details/async_ssa_graph_executor.cc | 21 +++++++++++++++++++ .../operators/distributed/communicator.cc | 2 +- 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/framework/details/async_ssa_graph_executor.cc b/paddle/fluid/framework/details/async_ssa_graph_executor.cc index e7cc14b0d1..b36ed8af9a 100644 --- a/paddle/fluid/framework/details/async_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/async_ssa_graph_executor.cc @@ -82,6 +82,27 @@ void ProcessGraph(std::vector graphs, Scope *scope) { } VLOG(3) << "delete all recv ops"; for (auto *node : nodes_to_delete) { + // delete input edge + for (auto *in : node->inputs) { + auto &in_outs = in->outputs; + for (auto iter = in_outs.begin(); iter != in_outs.end();) { + if (*iter == node) { + VLOG(3) << "delete input edge from " << in->Name() << " for " + << node->Name(); + iter = in_outs.erase(iter); + } else { + ++iter; + } + } + } + // delete output edge + for (auto *out : node->outputs) { + PADDLE_ENFORCE_EQ(out->outputs.size(), 0, "%s should have no outputs", + out->Name()); + VLOG(3) << "delete output edge to " << out->Name(); + graphs[i]->RemoveNode(out); + } + VLOG(3) << "delete node " << node->Name(); graphs[i]->RemoveNode(node); } } diff --git a/paddle/fluid/operators/distributed/communicator.cc b/paddle/fluid/operators/distributed/communicator.cc index 506c5fbebd..f5d274b66d 100644 --- a/paddle/fluid/operators/distributed/communicator.cc +++ b/paddle/fluid/operators/distributed/communicator.cc @@ -31,7 +31,7 @@ namespace distributed { static inline void MergeVars(const std::string &var_name, const std::vector> &vars, Scope *scope) { - VLOG(3) << "merge " << vars.size() << " vars " << var_name << " to one"; + VLOG(3) << "merge " << vars.size() << " vars " << var_name << " to 1"; PADDLE_ENFORCE(!vars.empty(), "should have value to merge!"); auto cpu_place = platform::CPUPlace(); auto &var0 = vars[0]; From ff8054c5a7f4ea34f6f112c318c03a16adf37e64 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Fri, 8 Mar 2019 10:23:54 +0800 Subject: [PATCH 68/98] can run --- paddle/fluid/framework/details/async_ssa_graph_executor.cc | 2 ++ paddle/fluid/framework/details/multi_devices_graph_pass.h | 4 ++++ paddle/fluid/operators/distributed_ops/recv_op.cc | 6 ++++++ 3 files changed, 12 insertions(+) diff --git a/paddle/fluid/framework/details/async_ssa_graph_executor.cc b/paddle/fluid/framework/details/async_ssa_graph_executor.cc index b36ed8af9a..12822c64e9 100644 --- a/paddle/fluid/framework/details/async_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/async_ssa_graph_executor.cc @@ -80,6 +80,7 @@ void ProcessGraph(std::vector graphs, Scope *scope) { } } } + /* VLOG(3) << "delete all recv ops"; for (auto *node : nodes_to_delete) { // delete input edge @@ -105,6 +106,7 @@ void ProcessGraph(std::vector graphs, Scope *scope) { VLOG(3) << "delete node " << node->Name(); graphs[i]->RemoveNode(node); } + */ } // init communicator here if (send_varname_to_ctx.size() > 0) { diff --git 
a/paddle/fluid/framework/details/multi_devices_graph_pass.h b/paddle/fluid/framework/details/multi_devices_graph_pass.h index f7ec9d28de..0b9061ad60 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.h +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.h @@ -127,6 +127,10 @@ class AsyncSSAGraphBuilder : public MultiDevSSAGraphBuilderBase { bool NeedCollectiveOps() const override { return false; } bool DealWithSpecialOp(ir::Graph *result, ir::Node *node) const override { + if (node->Op()->Type() == "recv") { + node->Op()->SetAttr("do_not_run", true); + node->Op()->Flush(); + } return false; } diff --git a/paddle/fluid/operators/distributed_ops/recv_op.cc b/paddle/fluid/operators/distributed_ops/recv_op.cc index 680b484d41..afbf7a4a23 100644 --- a/paddle/fluid/operators/distributed_ops/recv_op.cc +++ b/paddle/fluid/operators/distributed_ops/recv_op.cc @@ -36,6 +36,11 @@ class RecvOp : public framework::OperatorBase { void RunImpl(const framework::Scope &scope, const platform::Place &place) const override { + bool do_not_run = Attr("do_not_run"); + if (do_not_run) { + VLOG(3) << "recv do not run!"; + return; + } std::vector epmap = Attr>("epmap"); std::vector varnames = Attr>("varnames"); @@ -126,6 +131,7 @@ This operator can get variables from server side. "(vector) " "the splited parameter varnames to be recved from pserver") .SetDefault(std::vector{}); + AddAttr("do_not_run", "").SetDefault(false); } }; From c0e5941e31000447c10dd64fe5dfc47309ec33c7 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Fri, 8 Mar 2019 10:35:01 +0800 Subject: [PATCH 69/98] add commnet for recv do_not_run --- paddle/fluid/operators/distributed_ops/recv_op.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/operators/distributed_ops/recv_op.cc b/paddle/fluid/operators/distributed_ops/recv_op.cc index afbf7a4a23..3fd0700a07 100644 --- a/paddle/fluid/operators/distributed_ops/recv_op.cc +++ b/paddle/fluid/operators/distributed_ops/recv_op.cc @@ -131,7 +131,7 @@ This operator can get variables from server side. 
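// Note (not part of the original patch): do_not_run is how the async path
// keeps recv ops in the program, so variable dependencies stay intact, while
// skipping their execution: AsyncSSAGraphBuilder sets do_not_run=true on every
// recv op (multi_devices_graph_pass.h hunk above), RecvOp::RunImpl returns
// early when it sees the flag, and the Communicator's recv side pulls the
// parameters instead. A minimal sketch of the early return added in recv_op.cc
// above, assuming the usual Attr<bool> accessor:
//
//   if (Attr<bool>("do_not_run")) {
//     VLOG(3) << "recv do not run!";
//     return;  // parameters are fetched by the Communicator
//   }
//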
"(vector) " "the splited parameter varnames to be recved from pserver") .SetDefault(std::vector{}); - AddAttr("do_not_run", "").SetDefault(false); + AddAttr("do_not_run", "if recv need to really run").SetDefault(false); } }; From 63cd70a8b84905adc83d0fc082e4eaf15d91361b Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Fri, 8 Mar 2019 17:36:02 +0800 Subject: [PATCH 70/98] fix blocking problem --- .../operators/distributed/communicator.cc | 51 +++++++++++-------- .../operators/distributed/communicator.h | 38 +++++++------- .../operators/distributed/parameter_recv.cc | 2 + .../operators/distributed_ops/send_op.cc | 13 +++-- 4 files changed, 60 insertions(+), 44 deletions(-) diff --git a/paddle/fluid/operators/distributed/communicator.cc b/paddle/fluid/operators/distributed/communicator.cc index f5d274b66d..a7bce26234 100644 --- a/paddle/fluid/operators/distributed/communicator.cc +++ b/paddle/fluid/operators/distributed/communicator.cc @@ -75,10 +75,11 @@ void Communicator::SendThread() { while (running_) { std::vector> task_futures; task_futures.reserve(send_varname_to_ctx_.size()); + VLOG(3) << "run send graph"; for (auto &iter : send_varname_to_queue_) { auto &var_name = iter.first; auto &var_queue = iter.second; - if (var_queue->NotEmpty()) { // will block if queue is empty + if (var_queue->Size() > 0) { auto send_task = [this, &var_name, &var_queue] { VLOG(3) << "merge var " << var_name << " and send"; std::vector> vars; @@ -96,33 +97,41 @@ void Communicator::SendThread() { }; task_futures.emplace_back( send_threadpool_->enqueue(std::move(send_task))); + } else { + VLOG(3) << var_name << " queue empty"; } } for (auto &task_f : task_futures) { task_f.wait(); } + VLOG(3) << "run send graph done"; + RecvAll(); } } +void Communicator::RecvAll() { + VLOG(3) << "parallel run recv graph"; + std::vector> task_futures; + task_futures.reserve(recv_varname_to_ctx_.size()); + for (auto &iter : recv_varname_to_ctx_) { + auto recv_task = [this, &iter] { + auto &var_name = iter.first; + VLOG(3) << "recv var " << var_name; + auto recv_functor = distributed::ParameterRecv(); + recv_functor(iter.second, *recv_scope_); + }; + task_futures.emplace_back(recv_threadpool_->enqueue(std::move(recv_task))); + } + for (auto &task : task_futures) { + task.wait(); + } + VLOG(3) << "run recv graph done"; +} + void Communicator::RecvThread() { VLOG(3) << "RecvThread start!"; while (running_) { - // parallel run recv graph - std::vector> task_futures; - task_futures.reserve(recv_varname_to_ctx_.size()); - for (auto &iter : recv_varname_to_ctx_) { - auto recv_task = [this, &iter] { - auto &var_name = iter.first; - VLOG(3) << "recv var " << var_name; - auto recv_functor = distributed::ParameterRecv(); - recv_functor(iter.second, *recv_scope_); - }; - task_futures.emplace_back( - recv_threadpool_->enqueue(std::move(recv_task))); - } - for (auto &task : task_futures) { - task.wait(); - } + RecvAll(); // TODO(qiao) need to be configuable std::this_thread::sleep_for(std::chrono::milliseconds(200)); } @@ -136,7 +145,9 @@ void Communicator::Send(const std::string &var_name, PADDLE_ENFORCE(grad_var->IsInitialized(), "grad var should be inited"); auto tmp_grad_var = std::make_shared(); framework::CopyVariable(*grad_var, tmp_grad_var.get()); - send_varname_to_queue_[var_name]->Push(tmp_grad_var); + auto &queue = send_varname_to_queue_.at(var_name); + VLOG(3) << "send " << var_name << " queue size " << queue->Size(); + queue->Push(tmp_grad_var); } Communicator *Communicator::GetInstance() { return communicator_.get(); } @@ -146,8 
+157,8 @@ void Communicator::Start() { // start send and recv thread send_thread_.reset( new std::thread(std::bind(&Communicator::SendThread, this))); - recv_thread_.reset( - new std::thread(std::bind(&Communicator::RecvThread, this))); + // recv_thread_.reset( + // new std::thread(std::bind(&Communicator::RecvThread, this))); } } // namespace distributed diff --git a/paddle/fluid/operators/distributed/communicator.h b/paddle/fluid/operators/distributed/communicator.h index c93ad02555..3c98b36b74 100644 --- a/paddle/fluid/operators/distributed/communicator.h +++ b/paddle/fluid/operators/distributed/communicator.h @@ -43,37 +43,36 @@ class BlockingQueue { } bool Push(const T& elem) { - std::unique_lock lock(mutex_); - send_cv_.wait(lock, [&] { return queue_.size() < capacity_; }); - PADDLE_ENFORCE_LT(queue_.size(), capacity_); - queue_.push_back(elem); - recv_cv_.notify_one(); + { + std::unique_lock lock(mutex_); + cv_.wait(lock, [&] { return queue_.size() < capacity_; }); + PADDLE_ENFORCE_LT(queue_.size(), capacity_); + queue_.push_back(elem); + } + cv_.notify_one(); return true; } bool Push(T&& elem) { - std::unique_lock lock(mutex_); - send_cv_.wait(lock, [&] { return queue_.size() < capacity_; }); - PADDLE_ENFORCE_LT(queue_.size(), capacity_); - queue_.emplace_back(std::move(elem)); - recv_cv_.notify_one(); + { + std::unique_lock lock(mutex_); + cv_.wait(lock, [&] { return queue_.size() < capacity_; }); + PADDLE_ENFORCE_LT(queue_.size(), capacity_); + queue_.emplace_back(std::move(elem)); + } + cv_.notify_one(); return true; } T Pop() { std::unique_lock lock(mutex_); - recv_cv_.wait(lock, [=] { return !queue_.empty(); }); + cv_.wait(lock, [=] { return !queue_.empty(); }); T rc(std::move(queue_.front())); queue_.pop_front(); + cv_.notify_one(); return rc; } - bool NotEmpty() { - std::unique_lock lock(mutex_); - recv_cv_.wait(lock, [=] { return !queue_.empty(); }); - return true; - } - size_t Cap() const { std::lock_guard lock(mutex_); return capacity_; @@ -89,8 +88,7 @@ class BlockingQueue { std::deque queue_; mutable std::mutex mutex_; - std::condition_variable recv_cv_; - std::condition_variable send_cv_; + std::condition_variable cv_; }; using RpcCtxMap = std::unordered_map; @@ -127,6 +125,8 @@ class Communicator { void Send(const std::string& var_name, const framework::Scope& scope); private: + // recv all parameter + void RecvAll(); void SendThread(); void RecvThread(); diff --git a/paddle/fluid/operators/distributed/parameter_recv.cc b/paddle/fluid/operators/distributed/parameter_recv.cc index fecc76955d..c3238f28f6 100644 --- a/paddle/fluid/operators/distributed/parameter_recv.cc +++ b/paddle/fluid/operators/distributed/parameter_recv.cc @@ -41,6 +41,7 @@ using DDim = framework::DDim; template void ParameterRecv::operator()(const RpcContext &rpc_ctx, const framework::Scope &scope) { + VLOG(3) << "ParameterRecv in"; framework::Scope *local_scope = scope.NewTmpScope(); platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); @@ -90,6 +91,7 @@ void ParameterRecv::operator()(const RpcContext &rpc_ctx, } delete local_scope; + VLOG(3) << "ParameterRecv out"; } template struct ParameterRecv; diff --git a/paddle/fluid/operators/distributed_ops/send_op.cc b/paddle/fluid/operators/distributed_ops/send_op.cc index 347395b7cc..67de7b4185 100644 --- a/paddle/fluid/operators/distributed_ops/send_op.cc +++ b/paddle/fluid/operators/distributed_ops/send_op.cc @@ -48,12 +48,15 @@ class SendOp : public framework::OperatorBase { if (send_varnames.size() > 0) { 
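// Note (not part of the original patch): after this change the async send
// path is, roughly:
//   SendOp::RunImpl          -> Communicator::Send(var_name, scope), which
//                               copies the gradient variable and pushes it
//                               into that variable's bounded BlockingQueue
//                               (single condition variable, see above);
//   Communicator::SendThread -> pops queued gradients, MergeVars() them in
//                               send_scope_, then ParameterSend ships the
//                               merged value to the pservers.
// The queue is the only hand-off between trainer threads and the send thread,
// which is why the hunk below routes SendOp through the Communicator instead
// of calling ParameterSend directly.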
PADDLE_ENFORCE_EQ(ins.size(), 1, ""); - // auto send_functor = distributed::ParameterSend(); - // auto rpc_ctx = distributed::RpcContext(ins[0], send_varnames, - // epmap, - // height_sections); - // send_functor(rpc_ctx, scope, static_cast(sync_send)); + /* + auto send_functor = distributed::ParameterSend(); + auto rpc_ctx = distributed::RpcContext(ins[0], send_varnames, epmap, + height_sections); + send_functor(rpc_ctx, scope, static_cast(sync_send)); + */ + VLOG(3) << "send " << ins[0]; distributed::Communicator::GetInstance()->Send(ins[0], scope); + VLOG(3) << "send " << ins[0] << " done"; } else { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); From 0a828fef8286c6b9cd7a5ca2345d19057762dc79 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Sun, 10 Mar 2019 23:16:50 +0800 Subject: [PATCH 71/98] add some flags for communicator --- .../operators/distributed/communicator.cc | 54 +++++++++++++++++-- .../operators/distributed/communicator.h | 23 +------- python/paddle/fluid/__init__.py | 4 ++ 3 files changed, 55 insertions(+), 26 deletions(-) diff --git a/paddle/fluid/operators/distributed/communicator.cc b/paddle/fluid/operators/distributed/communicator.cc index a7bce26234..73b9800d43 100644 --- a/paddle/fluid/operators/distributed/communicator.cc +++ b/paddle/fluid/operators/distributed/communicator.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include "paddle/fluid/operators/distributed/communicator.h" +#include #include // NOLINT #include // NOLINT @@ -24,6 +25,13 @@ limitations under the License. */ #include "paddle/fluid/operators/distributed/parameter_send.h" #include "paddle/fluid/operators/math/selected_rows_functor.h" +DEFINE_bool(communicator_independent_recv_thread, true, + "use an independent to recv vars from parameter server"); +DEFINE_int32(communicator_send_queue_size, 20, + "queue size to recv gradient before send"); +DEFINE_int32(communicator_recv_wait_ms, 200, "wait time between each recv"); +DEFINE_int32(communicator_thread_pool_size, 5, "wait time between each recv"); + namespace paddle { namespace operators { namespace distributed { @@ -70,6 +78,38 @@ static inline void MergeVars(const std::string &var_name, std::unique_ptr Communicator::communicator_(nullptr); std::once_flag Communicator::init_flag_; +Communicator::Communicator(const RpcCtxMap &send_varname_to_ctx, + const RpcCtxMap &recv_varname_to_ctx, + Scope *recv_scope) + : send_varname_to_ctx_(send_varname_to_ctx), + recv_varname_to_ctx_(recv_varname_to_ctx), + recv_scope_(recv_scope) { + // get all send information from graph, build vars_to_send + VLOG(0) << "communicator_independent_recv_thread: " + << FLAGS_communicator_independent_recv_thread; + VLOG(0) << "communicator_send_queue_size: " + << FLAGS_communicator_send_queue_size; + VLOG(0) << "communicator_recv_wait_ms: " << FLAGS_communicator_recv_wait_ms; + VLOG(0) << "communicator_thread_pool_size: " + << FLAGS_communicator_thread_pool_size; + send_scope_.reset(new Scope()); + for (auto &iter : send_varname_to_ctx_) { + send_varname_to_queue_[iter.first] = + std::make_shared>>( + FLAGS_communicator_send_queue_size); + } + send_threadpool_.reset(new ::ThreadPool(FLAGS_communicator_thread_pool_size)); + recv_threadpool_.reset(new ::ThreadPool(FLAGS_communicator_thread_pool_size)); +} + +Communicator::~Communicator() { + VLOG(3) << "~Communicator"; + running_ = false; + if (send_thread_) send_thread_->join(); + if (recv_thread_) recv_thread_->join(); + VLOG(3) << "~Communicator done"; +} + void Communicator::SendThread() 
{ VLOG(3) << "SendThread start!"; while (running_) { @@ -105,7 +145,9 @@ void Communicator::SendThread() { task_f.wait(); } VLOG(3) << "run send graph done"; - RecvAll(); + if (!FLAGS_communicator_independent_recv_thread) { + RecvAll(); + } } } @@ -132,8 +174,8 @@ void Communicator::RecvThread() { VLOG(3) << "RecvThread start!"; while (running_) { RecvAll(); - // TODO(qiao) need to be configuable - std::this_thread::sleep_for(std::chrono::milliseconds(200)); + std::this_thread::sleep_for( + std::chrono::milliseconds(FLAGS_communicator_recv_wait_ms)); } } @@ -157,8 +199,10 @@ void Communicator::Start() { // start send and recv thread send_thread_.reset( new std::thread(std::bind(&Communicator::SendThread, this))); - // recv_thread_.reset( - // new std::thread(std::bind(&Communicator::RecvThread, this))); + if (FLAGS_communicator_independent_recv_thread) { + recv_thread_.reset( + new std::thread(std::bind(&Communicator::RecvThread, this))); + } } } // namespace distributed diff --git a/paddle/fluid/operators/distributed/communicator.h b/paddle/fluid/operators/distributed/communicator.h index 3c98b36b74..4104cb20a3 100644 --- a/paddle/fluid/operators/distributed/communicator.h +++ b/paddle/fluid/operators/distributed/communicator.h @@ -96,28 +96,9 @@ using RpcCtxMap = std::unordered_map; class Communicator { public: Communicator(const RpcCtxMap& send_varname_to_ctx, - const RpcCtxMap& recv_varname_to_ctx, Scope* recv_scope) - : send_varname_to_ctx_(send_varname_to_ctx), - recv_varname_to_ctx_(recv_varname_to_ctx), - recv_scope_(recv_scope) { - // get all send information from graph, build vars_to_send - send_scope_.reset(new Scope()); - for (auto& iter : send_varname_to_ctx_) { - send_varname_to_queue_[iter.first] = - std::make_shared>>(10); - } - // TODO(qiao): default 5, need to config - send_threadpool_.reset(new ::ThreadPool(5)); - recv_threadpool_.reset(new ::ThreadPool(5)); - } + const RpcCtxMap& recv_varname_to_ctx, Scope* recv_scope); - ~Communicator() { - VLOG(3) << "~Communicator"; - running_ = false; - send_thread_->join(); - recv_thread_->join(); - VLOG(3) << "~Communicator done"; - } + ~Communicator(); void Start(); diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index d12f04a6ab..8af5e1c509 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -150,6 +150,10 @@ def __bootstrap__(): read_env_flags.append('rpc_get_thread_num') read_env_flags.append('rpc_prefetch_thread_num') read_env_flags.append('rpc_disable_reuse_port') + read_env_flags.append('communicator_independent_recv_thread') + read_env_flags.append('communicator_send_queue_size') + read_env_flags.append('communicator_recv_wait_ms') + read_env_flags.append('communicator_thread_pool_size') if core.is_compiled_with_brpc(): read_env_flags.append('max_body_size') #set brpc max body size From eb6af305d62f233bc70a313f8c24ef5088d4bac6 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Sun, 10 Mar 2019 23:18:09 +0800 Subject: [PATCH 72/98] change embedding interface addnremote_prefetch --- python/paddle/fluid/layers/nn.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index efb400ccc6..48a46a0ff0 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -306,7 +306,8 @@ def embedding(input, is_distributed=False, padding_idx=None, param_attr=None, - dtype='float32'): + dtype='float32', + remote_prefetch=False): """ **Embedding Layer** @@ -345,7 
+346,7 @@ def embedding(input, """ helper = LayerHelper('embedding', **locals()) - remote_prefetch = is_sparse and (not is_distributed) + remote_prefetch = is_sparse and (not is_distributed) and remote_prefetch if remote_prefetch: assert is_sparse is True and is_distributed is False w = helper.create_parameter( From ad5a2b3edfb437a225d7f42ab5c35b65a3b9d49e Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Mon, 11 Mar 2019 11:02:54 +0800 Subject: [PATCH 73/98] add some debug flags for communicator --- .../operators/distributed/communicator.cc | 22 ++++++++++++++----- 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/paddle/fluid/operators/distributed/communicator.cc b/paddle/fluid/operators/distributed/communicator.cc index 73b9800d43..06f7859f4f 100644 --- a/paddle/fluid/operators/distributed/communicator.cc +++ b/paddle/fluid/operators/distributed/communicator.cc @@ -30,7 +30,11 @@ DEFINE_bool(communicator_independent_recv_thread, true, DEFINE_int32(communicator_send_queue_size, 20, "queue size to recv gradient before send"); DEFINE_int32(communicator_recv_wait_ms, 200, "wait time between each recv"); -DEFINE_int32(communicator_thread_pool_size, 5, "wait time between each recv"); +DEFINE_int32(communicator_thread_pool_size, 5, "thread num to do send or recv"); +DEFINE_int32(communicator_max_merge_var_num, 20, + "max var num to merge and send"); +DEFINE_bool(communicator_fake_rpc, false, + "fake mode does not really send any thing"); namespace paddle { namespace operators { @@ -92,6 +96,9 @@ Communicator::Communicator(const RpcCtxMap &send_varname_to_ctx, VLOG(0) << "communicator_recv_wait_ms: " << FLAGS_communicator_recv_wait_ms; VLOG(0) << "communicator_thread_pool_size: " << FLAGS_communicator_thread_pool_size; + VLOG(0) << "communicator_max_merge_var_num" + << FLAGS_communicator_max_merge_var_num; + VLOG(0) << "communicator_fake_rpc: " << FLAGS_communicator_fake_rpc; send_scope_.reset(new Scope()); for (auto &iter : send_varname_to_ctx_) { send_varname_to_queue_[iter.first] = @@ -123,17 +130,18 @@ void Communicator::SendThread() { auto send_task = [this, &var_name, &var_queue] { VLOG(3) << "merge var " << var_name << " and send"; std::vector> vars; - // TODO(qiao): need to be configurable - const size_t max_merge_var_num = 20; size_t merged_var_num = 0; - while (var_queue->Size() > 0 && merged_var_num < max_merge_var_num) { + while (var_queue->Size() > 0 && + merged_var_num < FLAGS_communicator_max_merge_var_num) { vars.push_back(var_queue->Pop()); merged_var_num++; } MergeVars(var_name, vars, send_scope_.get()); auto send_functor = distributed::ParameterSend(); auto &ctx = send_varname_to_ctx_.at(var_name); - send_functor(ctx, *send_scope_, true); + if (!FLAGS_communicator_fake_rpc) { + send_functor(ctx, *send_scope_, true); + } }; task_futures.emplace_back( send_threadpool_->enqueue(std::move(send_task))); @@ -160,7 +168,9 @@ void Communicator::RecvAll() { auto &var_name = iter.first; VLOG(3) << "recv var " << var_name; auto recv_functor = distributed::ParameterRecv(); - recv_functor(iter.second, *recv_scope_); + if (!FLAGS_communicator_fake_rpc) { + recv_functor(iter.second, *recv_scope_); + } }; task_futures.emplace_back(recv_threadpool_->enqueue(std::move(recv_task))); } From 43378ad626460e11e7afd1cf8176c51fe592396b Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Mon, 11 Mar 2019 12:37:57 +0800 Subject: [PATCH 74/98] add flags to init --- paddle/fluid/operators/distributed/communicator.cc | 2 +- python/paddle/fluid/__init__.py | 5 +++++ 2 files changed, 6 
insertions(+), 1 deletion(-) diff --git a/paddle/fluid/operators/distributed/communicator.cc b/paddle/fluid/operators/distributed/communicator.cc index 06f7859f4f..6acb572de9 100644 --- a/paddle/fluid/operators/distributed/communicator.cc +++ b/paddle/fluid/operators/distributed/communicator.cc @@ -96,7 +96,7 @@ Communicator::Communicator(const RpcCtxMap &send_varname_to_ctx, VLOG(0) << "communicator_recv_wait_ms: " << FLAGS_communicator_recv_wait_ms; VLOG(0) << "communicator_thread_pool_size: " << FLAGS_communicator_thread_pool_size; - VLOG(0) << "communicator_max_merge_var_num" + VLOG(0) << "communicator_max_merge_var_num: " << FLAGS_communicator_max_merge_var_num; VLOG(0) << "communicator_fake_rpc: " << FLAGS_communicator_fake_rpc; send_scope_.reset(new Scope()); diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index 8af5e1c509..c478c8ceee 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -143,6 +143,7 @@ def __bootstrap__(): read_env_flags.append('use_mkldnn') if core.is_compiled_with_dist(): + #env for rpc read_env_flags.append('rpc_deadline') read_env_flags.append('rpc_server_profile_path') read_env_flags.append('enable_rpc_profiler') @@ -150,10 +151,14 @@ def __bootstrap__(): read_env_flags.append('rpc_get_thread_num') read_env_flags.append('rpc_prefetch_thread_num') read_env_flags.append('rpc_disable_reuse_port') + + # env for communicator read_env_flags.append('communicator_independent_recv_thread') read_env_flags.append('communicator_send_queue_size') read_env_flags.append('communicator_recv_wait_ms') read_env_flags.append('communicator_thread_pool_size') + read_env_flags.append('communicator_max_merge_var_num') + read_env_flags.append('communicator_fake_rpc') if core.is_compiled_with_brpc(): read_env_flags.append('max_body_size') #set brpc max body size From d3a14377d5cf0376a5f0170406fecd336e3fc41a Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Mon, 11 Mar 2019 15:08:38 +0800 Subject: [PATCH 75/98] add fake rpc to send --- .../operators/distributed/communicator.cc | 20 ++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/paddle/fluid/operators/distributed/communicator.cc b/paddle/fluid/operators/distributed/communicator.cc index 6acb572de9..d3b77a758c 100644 --- a/paddle/fluid/operators/distributed/communicator.cc +++ b/paddle/fluid/operators/distributed/communicator.cc @@ -191,15 +191,17 @@ void Communicator::RecvThread() { void Communicator::Send(const std::string &var_name, const framework::Scope &scope) { - VLOG(3) << "communicator send " << var_name; - // push var into send queue by var_name - auto *grad_var = scope.FindVar(var_name); - PADDLE_ENFORCE(grad_var->IsInitialized(), "grad var should be inited"); - auto tmp_grad_var = std::make_shared(); - framework::CopyVariable(*grad_var, tmp_grad_var.get()); - auto &queue = send_varname_to_queue_.at(var_name); - VLOG(3) << "send " << var_name << " queue size " << queue->Size(); - queue->Push(tmp_grad_var); + if (!FLAGS_communicator_fake_rpc) { + VLOG(3) << "communicator send " << var_name; + // push var into send queue by var_name + auto *grad_var = scope.FindVar(var_name); + PADDLE_ENFORCE(grad_var->IsInitialized(), "grad var should be inited"); + auto tmp_grad_var = std::make_shared(); + framework::CopyVariable(*grad_var, tmp_grad_var.get()); + auto &queue = send_varname_to_queue_.at(var_name); + VLOG(3) << "send " << var_name << " queue size " << queue->Size(); + queue->Push(tmp_grad_var); + } } Communicator 
*Communicator::GetInstance() { return communicator_.get(); } From 23d3929a4bb758b70c1aafe31b3eabedc5d2ea3d Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Tue, 12 Mar 2019 17:20:08 +0800 Subject: [PATCH 76/98] optimize merge vars --- .../operators/distributed/communicator.cc | 85 ++++++++++++++----- 1 file changed, 63 insertions(+), 22 deletions(-) diff --git a/paddle/fluid/operators/distributed/communicator.cc b/paddle/fluid/operators/distributed/communicator.cc index d3b77a758c..91e2417d0c 100644 --- a/paddle/fluid/operators/distributed/communicator.cc +++ b/paddle/fluid/operators/distributed/communicator.cc @@ -18,12 +18,15 @@ limitations under the License. */ #include // NOLINT #include // NOLINT +#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/framework/variable_helper.h" #include "paddle/fluid/operators/distributed/parameter_recv.h" #include "paddle/fluid/operators/distributed/parameter_send.h" +#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/selected_rows_functor.h" +#include "paddle/fluid/platform/device_context.h" DEFINE_bool(communicator_independent_recv_thread, true, "use an independent to recv vars from parameter server"); @@ -40,28 +43,54 @@ namespace paddle { namespace operators { namespace distributed { +template +using EigenVector = framework::EigenVector; + +inline double GetCurrentUS() { + struct timeval time; + gettimeofday(&time, NULL); + return 1e+6 * time.tv_sec + time.tv_usec; +} + static inline void MergeVars(const std::string &var_name, const std::vector> &vars, Scope *scope) { - VLOG(3) << "merge " << vars.size() << " vars " << var_name << " to 1"; PADDLE_ENFORCE(!vars.empty(), "should have value to merge!"); auto cpu_place = platform::CPUPlace(); auto &var0 = vars[0]; auto *out_var = scope->Var(var_name); if (var0->IsType()) { + VLOG(3) << "merge " << var_name << " LoDTensor" + << var0->Get().dims(); + + // init output tensor auto *out_t = out_var->GetMutable(); auto *out_ptr = out_t->mutable_data( var0->Get().dims(), cpu_place); auto numel = out_t->numel(); - for (auto i = 0; i < numel; ++i) { - out_ptr[i] = 0; - for (auto &var : vars) { - auto &var_t = var->Get(); - PADDLE_ENFORCE_EQ(var_t.numel(), numel, "should have the same dims"); - out_ptr[i] += var_t.data()[i]; - } + + // check the input dims + for (auto &var : vars) { + auto &var_t = var->Get(); + PADDLE_ENFORCE_EQ(var_t.numel(), numel, "should have the same dims"); + } + + // set output tensor to 0. 
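// Note (not part of the original patch): the added lines below replace the
// earlier per-element accumulation loop. The output tensor is zero-filled with
// math::SetConstant and every input is then added through Eigen, i.e.
// out = in_0 + in_1 + ... + in_{n-1} computed as whole-tensor vector ops on
// the CPU rather than a scalar loop over numel.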
+ auto cpu_ctx = paddle::platform::CPUDeviceContext(); + math::SetConstant + constant_functor; + constant_functor(cpu_ctx, out_t, static_cast(0)); + + // sum all vars to out + auto result = EigenVector::Flatten(*out_t); + for (auto &var : vars) { + auto &in_t = var->Get(); + auto in = EigenVector::Flatten(in_t); + result.device(*cpu_ctx.eigen_device()) = result + in; } } else if (var0->IsType()) { + auto &slr0 = var0->Get(); auto *out_slr = out_var->GetMutable(); out_slr->mutable_rows()->clear(); out_slr->mutable_value()->mutable_data({{}}, cpu_place); @@ -74,6 +103,8 @@ static inline void MergeVars(const std::string &var_name, merge_add; auto dev_ctx = paddle::platform::CPUDeviceContext(); merge_add(dev_ctx, inputs, out_slr, false); + VLOG(3) << "merge " << var_name << " SelectedRows height: " << slr0.height() + << " dims: " << slr0.value().dims(); } else { PADDLE_THROW("unsupported var type!"); } @@ -123,12 +154,13 @@ void Communicator::SendThread() { std::vector> task_futures; task_futures.reserve(send_varname_to_ctx_.size()); VLOG(3) << "run send graph"; + auto before_run_send_graph = GetCurrentUS(); for (auto &iter : send_varname_to_queue_) { auto &var_name = iter.first; auto &var_queue = iter.second; if (var_queue->Size() > 0) { auto send_task = [this, &var_name, &var_queue] { - VLOG(3) << "merge var " << var_name << " and send"; + VLOG(3) << var_name << " merge and send"; std::vector> vars; size_t merged_var_num = 0; while (var_queue->Size() > 0 && @@ -136,12 +168,19 @@ void Communicator::SendThread() { vars.push_back(var_queue->Pop()); merged_var_num++; } + auto before_merge = GetCurrentUS(); MergeVars(var_name, vars, send_scope_.get()); + auto after_merge = GetCurrentUS(); + VLOG(3) << "merge " << var_name << " use time " + << after_merge - before_merge; auto send_functor = distributed::ParameterSend(); auto &ctx = send_varname_to_ctx_.at(var_name); if (!FLAGS_communicator_fake_rpc) { send_functor(ctx, *send_scope_, true); } + auto after_send = GetCurrentUS(); + VLOG(3) << "send " << var_name << " use time " + << after_send - after_merge; }; task_futures.emplace_back( send_threadpool_->enqueue(std::move(send_task))); @@ -152,7 +191,9 @@ void Communicator::SendThread() { for (auto &task_f : task_futures) { task_f.wait(); } - VLOG(3) << "run send graph done"; + auto after_run_send_graph = GetCurrentUS(); + VLOG(3) << "run send graph use time " + << after_run_send_graph - before_run_send_graph; if (!FLAGS_communicator_independent_recv_thread) { RecvAll(); } @@ -161,6 +202,7 @@ void Communicator::SendThread() { void Communicator::RecvAll() { VLOG(3) << "parallel run recv graph"; + auto before_send = GetCurrentUS(); std::vector> task_futures; task_futures.reserve(recv_varname_to_ctx_.size()); for (auto &iter : recv_varname_to_ctx_) { @@ -177,7 +219,8 @@ void Communicator::RecvAll() { for (auto &task : task_futures) { task.wait(); } - VLOG(3) << "run recv graph done"; + auto after_recv = GetCurrentUS(); + VLOG(3) << "run recv graph use time " << after_recv - before_send; } void Communicator::RecvThread() { @@ -191,17 +234,15 @@ void Communicator::RecvThread() { void Communicator::Send(const std::string &var_name, const framework::Scope &scope) { - if (!FLAGS_communicator_fake_rpc) { - VLOG(3) << "communicator send " << var_name; - // push var into send queue by var_name - auto *grad_var = scope.FindVar(var_name); - PADDLE_ENFORCE(grad_var->IsInitialized(), "grad var should be inited"); - auto tmp_grad_var = std::make_shared(); - framework::CopyVariable(*grad_var, tmp_grad_var.get()); - 
auto &queue = send_varname_to_queue_.at(var_name); - VLOG(3) << "send " << var_name << " queue size " << queue->Size(); - queue->Push(tmp_grad_var); - } + VLOG(3) << "communicator send " << var_name; + // push var into send queue by var_name + auto *grad_var = scope.FindVar(var_name); + PADDLE_ENFORCE(grad_var->IsInitialized(), "grad var should be inited"); + auto tmp_grad_var = std::make_shared(); + framework::CopyVariable(*grad_var, tmp_grad_var.get()); + auto &queue = send_varname_to_queue_.at(var_name); + VLOG(3) << "send " << var_name << " queue size " << queue->Size(); + queue->Push(tmp_grad_var); } Communicator *Communicator::GetInstance() { return communicator_.get(); } From 9b74707cbf293f17e3b8a84c319f14ee3370f53d Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Tue, 12 Mar 2019 17:24:05 +0800 Subject: [PATCH 77/98] fix compile problem --- paddle/fluid/operators/distributed/communicator.cc | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/paddle/fluid/operators/distributed/communicator.cc b/paddle/fluid/operators/distributed/communicator.cc index 91e2417d0c..f17af56400 100644 --- a/paddle/fluid/operators/distributed/communicator.cc +++ b/paddle/fluid/operators/distributed/communicator.cc @@ -66,8 +66,6 @@ static inline void MergeVars(const std::string &var_name, // init output tensor auto *out_t = out_var->GetMutable(); - auto *out_ptr = out_t->mutable_data( - var0->Get().dims(), cpu_place); auto numel = out_t->numel(); // check the input dims @@ -83,7 +81,7 @@ static inline void MergeVars(const std::string &var_name, constant_functor(cpu_ctx, out_t, static_cast(0)); // sum all vars to out - auto result = EigenVector::Flatten(*out_t); + auto result = EigenVector::Flatten(*out_t); for (auto &var : vars) { auto &in_t = var->Get(); auto in = EigenVector::Flatten(in_t); From 0fcdae8418b8bbc06013ca540d8a7b8d2e4d790e Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Tue, 12 Mar 2019 23:08:55 +0800 Subject: [PATCH 78/98] add communicator_test --- .../operators/distributed/CMakeLists.txt | 1 + .../operators/distributed/communicator.cc | 62 ---------- .../operators/distributed/communicator.h | 61 ++++++++++ .../distributed/communicator_test.cc | 110 ++++++++++++++++++ 4 files changed, 172 insertions(+), 62 deletions(-) create mode 100644 paddle/fluid/operators/distributed/communicator_test.cc diff --git a/paddle/fluid/operators/distributed/CMakeLists.txt b/paddle/fluid/operators/distributed/CMakeLists.txt index 750aac8dd0..972b4f67a8 100644 --- a/paddle/fluid/operators/distributed/CMakeLists.txt +++ b/paddle/fluid/operators/distributed/CMakeLists.txt @@ -55,6 +55,7 @@ cc_library(parameter_prefetch SRCS parameter_prefetch.cc DEPS sendrecvop_rpc mem cc_library(parameter_send SRCS parameter_send.cc DEPS sendrecvop_rpc memory) cc_library(parameter_recv SRCS parameter_recv.cc DEPS sendrecvop_rpc memory) cc_library(communicator SRCS communicator.cc DEPS scope selected_rows tensor variable_helper selected_rows_functor simple_threadpool parameter_send parameter_recv) +cc_test(communicator_test SRCS communicator_test.cc DEPS communicator) if(WITH_GPU) cc_test(collective_server_test SRCS collective_server_test.cc DEPS sendrecvop_rpc executor ${RPC_DEPS} diff --git a/paddle/fluid/operators/distributed/communicator.cc b/paddle/fluid/operators/distributed/communicator.cc index f17af56400..72f26e91b2 100644 --- a/paddle/fluid/operators/distributed/communicator.cc +++ b/paddle/fluid/operators/distributed/communicator.cc @@ -24,9 +24,6 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/variable_helper.h" #include "paddle/fluid/operators/distributed/parameter_recv.h" #include "paddle/fluid/operators/distributed/parameter_send.h" -#include "paddle/fluid/operators/math/math_function.h" -#include "paddle/fluid/operators/math/selected_rows_functor.h" -#include "paddle/fluid/platform/device_context.h" DEFINE_bool(communicator_independent_recv_thread, true, "use an independent to recv vars from parameter server"); @@ -43,71 +40,12 @@ namespace paddle { namespace operators { namespace distributed { -template -using EigenVector = framework::EigenVector; - inline double GetCurrentUS() { struct timeval time; gettimeofday(&time, NULL); return 1e+6 * time.tv_sec + time.tv_usec; } -static inline void MergeVars(const std::string &var_name, - const std::vector> &vars, - Scope *scope) { - PADDLE_ENFORCE(!vars.empty(), "should have value to merge!"); - auto cpu_place = platform::CPUPlace(); - auto &var0 = vars[0]; - auto *out_var = scope->Var(var_name); - if (var0->IsType()) { - VLOG(3) << "merge " << var_name << " LoDTensor" - << var0->Get().dims(); - - // init output tensor - auto *out_t = out_var->GetMutable(); - auto numel = out_t->numel(); - - // check the input dims - for (auto &var : vars) { - auto &var_t = var->Get(); - PADDLE_ENFORCE_EQ(var_t.numel(), numel, "should have the same dims"); - } - - // set output tensor to 0. - auto cpu_ctx = paddle::platform::CPUDeviceContext(); - math::SetConstant - constant_functor; - constant_functor(cpu_ctx, out_t, static_cast(0)); - - // sum all vars to out - auto result = EigenVector::Flatten(*out_t); - for (auto &var : vars) { - auto &in_t = var->Get(); - auto in = EigenVector::Flatten(in_t); - result.device(*cpu_ctx.eigen_device()) = result + in; - } - } else if (var0->IsType()) { - auto &slr0 = var0->Get(); - auto *out_slr = out_var->GetMutable(); - out_slr->mutable_rows()->clear(); - out_slr->mutable_value()->mutable_data({{}}, cpu_place); - std::vector inputs; - inputs.reserve(vars.size()); - for (auto &var : vars) { - inputs.push_back(&var->Get()); - } - math::scatter::MergeAdd - merge_add; - auto dev_ctx = paddle::platform::CPUDeviceContext(); - merge_add(dev_ctx, inputs, out_slr, false); - VLOG(3) << "merge " << var_name << " SelectedRows height: " << slr0.height() - << " dims: " << slr0.value().dims(); - } else { - PADDLE_THROW("unsupported var type!"); - } -} - std::unique_ptr Communicator::communicator_(nullptr); std::once_flag Communicator::init_flag_; diff --git a/paddle/fluid/operators/distributed/communicator.h b/paddle/fluid/operators/distributed/communicator.h index 4104cb20a3..3fe2a21232 100644 --- a/paddle/fluid/operators/distributed/communicator.h +++ b/paddle/fluid/operators/distributed/communicator.h @@ -24,6 +24,8 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/variable.h" #include "paddle/fluid/operators/distributed/rpc_common.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/math/selected_rows_functor.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/place.h" @@ -91,6 +93,65 @@ class BlockingQueue { std::condition_variable cv_; }; +template +using EigenVector = framework::EigenVector; + +inline void MergeVars(const std::string& var_name, + const std::vector>& vars, + Scope* scope) { + PADDLE_ENFORCE(!vars.empty(), "should have value to merge!"); + auto cpu_place = platform::CPUPlace(); + auto& var0 = vars[0]; + auto* out_var = scope->Var(var_name); + if (var0->IsType()) { + auto dims = var0->Get().dims(); + VLOG(3) << "merge " << var_name << " LoDTensor " << dims; + + // init output tensor + auto* out_t = out_var->GetMutable(); + out_t->mutable_data(dims, cpu_place); + + // check the input dims + for (auto& var : vars) { + auto& var_t = var->Get(); + PADDLE_ENFORCE_EQ(var_t.dims(), dims, "should have the same dims"); + } + + // set output tensor to 0. + auto cpu_ctx = paddle::platform::CPUDeviceContext(); + math::SetConstant + constant_functor; + constant_functor(cpu_ctx, out_t, static_cast(0)); + + // sum all vars to out + auto result = EigenVector::Flatten(*out_t); + for (auto& var : vars) { + auto& in_t = var->Get(); + auto in = EigenVector::Flatten(in_t); + result.device(*cpu_ctx.eigen_device()) = result + in; + } + } else if (var0->IsType()) { + auto& slr0 = var0->Get(); + auto* out_slr = out_var->GetMutable(); + out_slr->mutable_rows()->clear(); + out_slr->mutable_value()->mutable_data({{}}, cpu_place); + std::vector inputs; + inputs.reserve(vars.size()); + for (auto& var : vars) { + inputs.push_back(&var->Get()); + } + math::scatter::MergeAdd + merge_add; + auto dev_ctx = paddle::platform::CPUDeviceContext(); + merge_add(dev_ctx, inputs, out_slr, false); + VLOG(3) << "merge " << var_name << " SelectedRows height: " << slr0.height() + << " dims: " << slr0.value().dims(); + } else { + PADDLE_THROW("unsupported var type!"); + } +} + using RpcCtxMap = std::unordered_map; class Communicator { diff --git a/paddle/fluid/operators/distributed/communicator_test.cc b/paddle/fluid/operators/distributed/communicator_test.cc new file mode 100644 index 0000000000..5294ac33d1 --- /dev/null +++ b/paddle/fluid/operators/distributed/communicator_test.cc @@ -0,0 +1,110 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
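// Note (not part of the original patch): the two tests that follow exercise
// MergeVars for both supported variable types. The LoDTensor case merges ten
// dense inputs filled with the constants 0..9 and expects every output element
// to equal their sum (45); the SelectedRows case relies on MergeAdd to union
// the rows and add values for duplicate row ids, so merged row r holds
// r * (10 - r).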
+ +#include +#include +#include +#include +#include + +#include "paddle/fluid/operators/distributed/communicator.h" + +namespace paddle { +namespace operators { +namespace distributed { + +using LoDTensor = framework::LoDTensor; +using SelectedRows = framework::SelectedRows; + +TEST(communicator, merge_lod_tensors) { + auto cpu_place = platform::CPUPlace(); + auto dims = framework::make_ddim({2, 3}); + std::vector> in_vars; + float out_value = 0; + for (auto i = 0; i < 10; ++i) { + auto var = std::make_shared(); + in_vars.emplace_back(var); + auto *tensor = var->GetMutable(); + auto *data = tensor->mutable_data(dims, cpu_place); + for (auto j = 0; j < tensor->numel(); ++j) { + data[j] = static_cast(i); + } + out_value += static_cast(i); + } + const std::string out_name = "Out"; + std::unique_ptr scope; + scope.reset(new framework::Scope()); + scope->Var(out_name); + for (auto i = 0; i < 10; ++i) { + MergeVars(out_name, in_vars, scope.get()); + } + auto &out_tensor = scope->FindVar(out_name)->Get(); + auto *out_data = out_tensor.data(); + ASSERT_EQ(out_tensor.dims(), dims); + for (auto i = 0; i < out_tensor.numel(); ++i) { + ASSERT_EQ(out_data[i], out_value); + } +} + +TEST(communicator, merge_selected_rows) { + auto cpu_place = platform::CPUPlace(); + int64_t width = 10; + std::vector> in_vars; + const int64_t height = 100; + for (auto i = 0; i < 10; ++i) { + std::vector rows; + for (auto k = 0; k <= i; ++k) { + rows.push_back(k); + } + auto var = std::make_shared(); + in_vars.emplace_back(var); + auto *slr = var->GetMutable(); + slr->set_height(height); + slr->set_rows(rows); + auto dims = + framework::make_ddim({static_cast(rows.size()), width}); + auto *data = slr->mutable_value()->mutable_data(dims, cpu_place); + for (auto i = 0; i < rows.size(); ++i) { + for (auto j = 0; j < width; ++j) { + data[i * width + j] = static_cast(rows[i]); + } + } + } + const std::string out_name = "Out"; + std::unique_ptr scope; + scope.reset(new framework::Scope()); + scope->Var(out_name); + for (auto i = 0; i < 10; ++i) { + MergeVars(out_name, in_vars, scope.get()); + } + auto &out_slr = scope->FindVar(out_name)->Get(); + auto &out_t = out_slr.value(); + auto *out_data = out_t.data(); + ASSERT_EQ(out_t.dims(), framework::make_ddim({10, width})); + std::vector out_values; + out_values.reserve(10); + for (auto i = 0; i < 10; ++i) { + out_values.push_back(static_cast(i * (10 - i))); + } + for (auto i = 0; i < out_slr.rows().size(); ++i) { + ASSERT_EQ(out_slr.rows()[i], i); + for (auto j = 0; j < width; ++j) { + ASSERT_EQ(out_data[i * width + j], out_values[i]); + } + } +} + +} // namespace distributed +} // namespace operators +} // namespace paddle From c567debcd94e4d5aaf46dddccb1d17f06b992c89 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Wed, 13 Mar 2019 19:01:53 +0800 Subject: [PATCH 79/98] optimize log --- paddle/fluid/operators/distributed/communicator.cc | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/operators/distributed/communicator.cc b/paddle/fluid/operators/distributed/communicator.cc index 72f26e91b2..3661c2763d 100644 --- a/paddle/fluid/operators/distributed/communicator.cc +++ b/paddle/fluid/operators/distributed/communicator.cc @@ -128,8 +128,11 @@ void Communicator::SendThread() { task_f.wait(); } auto after_run_send_graph = GetCurrentUS(); - VLOG(3) << "run send graph use time " - << after_run_send_graph - before_run_send_graph; + auto send_graph_use_time = after_run_send_graph - before_run_send_graph; + if (send_graph_use_time > 10) { + VLOG(1) 
<< "run send graph use time " + << after_run_send_graph - before_run_send_graph; + } if (!FLAGS_communicator_independent_recv_thread) { RecvAll(); } @@ -156,7 +159,7 @@ void Communicator::RecvAll() { task.wait(); } auto after_recv = GetCurrentUS(); - VLOG(3) << "run recv graph use time " << after_recv - before_send; + VLOG(1) << "run recv graph use time " << after_recv - before_send; } void Communicator::RecvThread() { From 347178bd977eb1323402d10a64bc3c3f6b157ae6 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Thu, 14 Mar 2019 15:50:08 +0800 Subject: [PATCH 80/98] fix pserver memory leak --- paddle/fluid/operators/distributed/grpc/grpc_server.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/paddle/fluid/operators/distributed/grpc/grpc_server.cc b/paddle/fluid/operators/distributed/grpc/grpc_server.cc index a0ed79201d..f32681738c 100644 --- a/paddle/fluid/operators/distributed/grpc/grpc_server.cc +++ b/paddle/fluid/operators/distributed/grpc/grpc_server.cc @@ -107,9 +107,11 @@ class RequestSend final : public RequestBase { int trainer_id = request_->GetTrainerId(); framework::Variable* outvar = nullptr; + /* if (!request_handler_->sync_mode()) { request_->ReleaseOwnershipOfLocalScope(); } + */ request_handler_->Handle(varname, scope, invar, &outvar, trainer_id); Finish(reply_, &responder_); } From 065b68b6ca53b3eb140a9f3ebe95b8cdd856fef4 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Thu, 14 Mar 2019 23:34:25 +0800 Subject: [PATCH 81/98] clean code --- .../fluid/operators/distributed/grpc/grpc_server.cc | 6 ------ paddle/fluid/operators/distributed/parameter_send.cc | 6 +++--- paddle/fluid/operators/distributed/request_handler.h | 6 +----- .../operators/distributed/request_handler_impl.cc | 11 ++--------- .../fluid/operators/distributed/variable_response.h | 11 +++-------- .../fluid/operators/distributed_ops/send_recv_util.h | 1 + 6 files changed, 10 insertions(+), 31 deletions(-) diff --git a/paddle/fluid/operators/distributed/grpc/grpc_server.cc b/paddle/fluid/operators/distributed/grpc/grpc_server.cc index f32681738c..b86f0a53c4 100644 --- a/paddle/fluid/operators/distributed/grpc/grpc_server.cc +++ b/paddle/fluid/operators/distributed/grpc/grpc_server.cc @@ -106,12 +106,6 @@ class RequestSend final : public RequestBase { auto invar = request_->GetVar(); int trainer_id = request_->GetTrainerId(); framework::Variable* outvar = nullptr; - - /* - if (!request_handler_->sync_mode()) { - request_->ReleaseOwnershipOfLocalScope(); - } - */ request_handler_->Handle(varname, scope, invar, &outvar, trainer_id); Finish(reply_, &responder_); } diff --git a/paddle/fluid/operators/distributed/parameter_send.cc b/paddle/fluid/operators/distributed/parameter_send.cc index 3fe3be193a..388bc781c1 100644 --- a/paddle/fluid/operators/distributed/parameter_send.cc +++ b/paddle/fluid/operators/distributed/parameter_send.cc @@ -80,7 +80,7 @@ void ParameterSend::operator()(const RpcContext &rpc_ctx, auto &send_slr = send_var->Get(); auto abs_sections = ToAbsoluteSection(rpc_ctx.height_sections); - auto send_rows = send_slr.rows(); + auto &send_rows = send_slr.rows(); std::vector> outs_rows_idx; std::vector> outs_dense_idx; @@ -88,7 +88,7 @@ void ParameterSend::operator()(const RpcContext &rpc_ctx, outs_dense_idx.resize(out_num); auto row_numel = send_slr.value().numel() / send_slr.value().dims()[0]; - auto src = send_slr.value().data(); + auto *src = send_slr.value().data(); // create output var in local scope std::vector outs; @@ -110,8 +110,8 @@ void ParameterSend::operator()(const RpcContext 
&rpc_ctx, outs[i]->set_height(rpc_ctx.height_sections[i]); auto dims = send_slr.GetCompleteDims(); dims[0] = rows_idx.size(); - outs[i]->mutable_value()->mutable_data(dims, send_slr.place()); outs[i]->mutable_rows()->clear(); + outs[i]->mutable_value()->mutable_data(dims, send_slr.place()); if (rows_idx.size() > 0) { for (auto idx : rows_idx) { outs[i]->mutable_rows()->push_back(idx - abs_sections[i]); diff --git a/paddle/fluid/operators/distributed/request_handler.h b/paddle/fluid/operators/distributed/request_handler.h index e777d515ce..991158ac72 100644 --- a/paddle/fluid/operators/distributed/request_handler.h +++ b/paddle/fluid/operators/distributed/request_handler.h @@ -71,15 +71,13 @@ class VarHandle { VarHandle(const std::string ep, const std::string& method, const std::string& name, const platform::DeviceContext* p_ctx = nullptr, - const framework::Scope* p_scope = nullptr, - bool delete_local_scope = false) + const framework::Scope* p_scope = nullptr) : status_(kDefaultState) { ep_ = ep; ctx_ = p_ctx; scope_ = p_scope; name_ = name; method_ = method; - delete_local_scope_ = delete_local_scope; } virtual ~VarHandle() {} @@ -101,7 +99,6 @@ class VarHandle { std::unique_lock lk(sync_mutex_); status_ = ok ? kFinishState : kErrorState; } - if (delete_local_scope_ && scope_) delete scope_; VLOG(7) << "VarHandle finish:" << ok; wait_cond_.notify_all(); } @@ -128,7 +125,6 @@ class VarHandle { std::string name_; // RPC method name. std::string method_; - bool delete_local_scope_; protected: std::mutex sync_mutex_; diff --git a/paddle/fluid/operators/distributed/request_handler_impl.cc b/paddle/fluid/operators/distributed/request_handler_impl.cc index e5318f98ca..e289ec929d 100644 --- a/paddle/fluid/operators/distributed/request_handler_impl.cc +++ b/paddle/fluid/operators/distributed/request_handler_impl.cc @@ -59,15 +59,8 @@ bool RequestSendHandler::Handle(const std::string& varname, "async mode should not recv BATCH_BARRIER_MESSAGE or " "COMPLETE_MESSAGE"); } - - try { - executor_->RunPreparedContext((*grad_to_prepared_ctx_)[varname].get(), - scope); - delete scope; - } catch (std::exception& e) { - LOG(ERROR) << "async: run sub program error " << e.what(); - return false; - } + executor_->RunPreparedContext((*grad_to_prepared_ctx_)[varname].get(), + scope); return true; } else { // sync rpc_server_->WaitCond(kRequestSend); diff --git a/paddle/fluid/operators/distributed/variable_response.h b/paddle/fluid/operators/distributed/variable_response.h index edc12e2091..eb3265e092 100644 --- a/paddle/fluid/operators/distributed/variable_response.h +++ b/paddle/fluid/operators/distributed/variable_response.h @@ -60,13 +60,14 @@ class VariableResponse { bool create_scope = false) : scope_(scope), dev_ctx_(dev_ctx), create_scope_(create_scope) { if (create_scope) { - local_scope_ = &scope->NewScope(); + local_scope_ = scope->NewTmpScope(); } } virtual ~VariableResponse() { if (local_scope_) { - scope_->DeleteScope(local_scope_); + delete local_scope_; + local_scope_ = nullptr; } } @@ -86,12 +87,6 @@ class VariableResponse { inline std::string Varname() const { return meta_.varname(); } inline std::string OutVarname() const { return meta_.out_varname(); } inline std::string TableName() const { return meta_.table_name(); } - inline void ReleaseOwnershipOfLocalScope() { - PADDLE_ENFORCE(create_scope_, - "only when create_scope_ is true can you release the " - "ownership of local scope"); - local_scope_ = nullptr; - } // should call parse first. 
framework::Variable* GetVar() { diff --git a/paddle/fluid/operators/distributed_ops/send_recv_util.h b/paddle/fluid/operators/distributed_ops/send_recv_util.h index 1e91f0dd51..01caee9a92 100644 --- a/paddle/fluid/operators/distributed_ops/send_recv_util.h +++ b/paddle/fluid/operators/distributed_ops/send_recv_util.h @@ -54,6 +54,7 @@ inline int FindOutIdx(int row, const std::vector& abs_sections) { return i - 1; } } + PADDLE_ENFORCE_LT(row, abs_sections.back(), "row should be less then max id"); return abs_sections.size() - 1; } From ea0df4e8a2cf291a0e6626771c58d1d75635b3c1 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Sat, 16 Mar 2019 15:11:45 +0800 Subject: [PATCH 82/98] add some check --- .../fluid/operators/distributed/parameter_recv.cc | 3 +++ .../fluid/operators/distributed/parameter_send.cc | 2 +- .../operators/distributed_ops/send_recv_util.h | 10 ---------- paddle/fluid/operators/split_selected_rows_op.h | 13 +++++++++---- 4 files changed, 13 insertions(+), 15 deletions(-) diff --git a/paddle/fluid/operators/distributed/parameter_recv.cc b/paddle/fluid/operators/distributed/parameter_recv.cc index c3238f28f6..ae6516b246 100644 --- a/paddle/fluid/operators/distributed/parameter_recv.cc +++ b/paddle/fluid/operators/distributed/parameter_recv.cc @@ -80,7 +80,9 @@ void ParameterRecv::operator()(const RpcContext &rpc_ctx, framework::Tensor *recv_tensor = recv_var->GetMutable(); auto dev_ctx = paddle::platform::CPUDeviceContext(); + int64_t recv_numel = 0; for (auto *in : recved_tensors) { + recv_numel += in->numel(); auto in_stride = framework::stride_numel(in->dims()); auto out_stride = framework::stride_numel(recv_tensor->dims()); StridedNumelCopyWithAxis( @@ -88,6 +90,7 @@ void ParameterRecv::operator()(const RpcContext &rpc_ctx, in->data(), in_stride, in_stride[0]); output_offset += in_stride[0]; } + PADDLE_ENFORCE_EQ(recv_numel, recv_tensor->numel()); } delete local_scope; diff --git a/paddle/fluid/operators/distributed/parameter_send.cc b/paddle/fluid/operators/distributed/parameter_send.cc index 388bc781c1..ec2884c252 100644 --- a/paddle/fluid/operators/distributed/parameter_send.cc +++ b/paddle/fluid/operators/distributed/parameter_send.cc @@ -99,7 +99,7 @@ void ParameterSend::operator()(const RpcContext &rpc_ctx, // split rows index into output sparse vars for (size_t i = 0; i < send_rows.size(); ++i) { - int out_idx = FindOutIdx(send_rows[i], abs_sections); + int out_idx = GetSectionIndex(send_rows[i], abs_sections); outs_rows_idx[out_idx].push_back(send_rows[i]); outs_dense_idx[out_idx].push_back(i); } diff --git a/paddle/fluid/operators/distributed_ops/send_recv_util.h b/paddle/fluid/operators/distributed_ops/send_recv_util.h index 01caee9a92..c05a1ff1da 100644 --- a/paddle/fluid/operators/distributed_ops/send_recv_util.h +++ b/paddle/fluid/operators/distributed_ops/send_recv_util.h @@ -48,16 +48,6 @@ inline bool NeedSend(const framework::Scope& scope, return false; } -inline int FindOutIdx(int row, const std::vector& abs_sections) { - for (size_t i = 1; i < abs_sections.size(); ++i) { - if (row < abs_sections[i]) { - return i - 1; - } - } - PADDLE_ENFORCE_LT(row, abs_sections.back(), "row should be less then max id"); - return abs_sections.size() - 1; -} - inline std::vector ToAbsoluteSection( const std::vector& height_sections) { std::vector abs_sections; diff --git a/paddle/fluid/operators/split_selected_rows_op.h b/paddle/fluid/operators/split_selected_rows_op.h index c29065649e..9ec459e2a6 100644 --- a/paddle/fluid/operators/split_selected_rows_op.h +++ 
b/paddle/fluid/operators/split_selected_rows_op.h @@ -32,7 +32,8 @@ class SplitSelectedRowsOpKernel : public framework::OpKernel { auto abs_sections = ToAbsoluteSection(height_sections); - auto x_rows = x->rows(); + auto& x_rows = x->rows(); + auto height = x->height(); std::vector> outs_rows_idx; std::vector> outs_dense_idx; @@ -44,8 +45,10 @@ class SplitSelectedRowsOpKernel : public framework::OpKernel { // split rows index into output sparse vars for (size_t i = 0; i < x_rows.size(); ++i) { - int out_idx = FindOutIdx(x_rows[i], abs_sections); - outs_rows_idx[out_idx].push_back(x_rows[i]); + auto& id = x_rows[i]; + PADDLE_ENFORCE_LT(id, height); + int out_idx = GetSectionIndex(id, abs_sections); + outs_rows_idx[out_idx].push_back(id); outs_dense_idx[out_idx].push_back(i); } auto place = ctx.GetPlace(); @@ -59,7 +62,9 @@ class SplitSelectedRowsOpKernel : public framework::OpKernel { outs[i]->mutable_rows()->clear(); if (rows_idx.size() > 0) { for (auto idx : rows_idx) { - outs[i]->mutable_rows()->push_back(idx - abs_sections[i]); + auto id_offset = idx - abs_sections[i]; + PADDLE_ENFORCE_LT(id_offset, height_sections[i]); + outs[i]->mutable_rows()->push_back(id_offset); } auto dst = outs[i]->mutable_value()->mutable_data(ctx.GetPlace()); for (size_t j = 0; j < rows_idx.size(); j++) { From 039d783db5ed14a5eabadb3177c800697afec39d Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Mon, 18 Mar 2019 13:35:37 +0800 Subject: [PATCH 83/98] change communicator_recv_wait_ms to communicator_max_send_grad_num_before_recv --- .../operators/distributed/communicator.cc | 23 ++++++++++++++----- .../operators/distributed/communicator.h | 2 ++ python/paddle/fluid/__init__.py | 2 +- 3 files changed, 20 insertions(+), 7 deletions(-) diff --git a/paddle/fluid/operators/distributed/communicator.cc b/paddle/fluid/operators/distributed/communicator.cc index 3661c2763d..eba18c6777 100644 --- a/paddle/fluid/operators/distributed/communicator.cc +++ b/paddle/fluid/operators/distributed/communicator.cc @@ -29,7 +29,8 @@ DEFINE_bool(communicator_independent_recv_thread, true, "use an independent to recv vars from parameter server"); DEFINE_int32(communicator_send_queue_size, 20, "queue size to recv gradient before send"); -DEFINE_int32(communicator_recv_wait_ms, 200, "wait time between each recv"); +DEFINE_int32(communicator_max_send_grad_num_before_recv, 20, + "max grad num to send before recv parameters"); DEFINE_int32(communicator_thread_pool_size, 5, "thread num to do send or recv"); DEFINE_int32(communicator_max_merge_var_num, 20, "max var num to merge and send"); @@ -60,7 +61,8 @@ Communicator::Communicator(const RpcCtxMap &send_varname_to_ctx, << FLAGS_communicator_independent_recv_thread; VLOG(0) << "communicator_send_queue_size: " << FLAGS_communicator_send_queue_size; - VLOG(0) << "communicator_recv_wait_ms: " << FLAGS_communicator_recv_wait_ms; + VLOG(0) << "communicator_max_send_grad_num_before_recv: " + << FLAGS_communicator_max_send_grad_num_before_recv; VLOG(0) << "communicator_thread_pool_size: " << FLAGS_communicator_thread_pool_size; VLOG(0) << "communicator_max_merge_var_num: " @@ -102,6 +104,10 @@ void Communicator::SendThread() { while (var_queue->Size() > 0 && merged_var_num < FLAGS_communicator_max_merge_var_num) { vars.push_back(var_queue->Pop()); + // only count the send number of the first var + if (var_name == send_varname_to_queue_.begin()->first) { + grad_num_.fetch_add(1, std::memory_order_relaxed); + } merged_var_num++; } auto before_merge = GetCurrentUS(); @@ -129,7 +135,7 @@ void 
Communicator::SendThread() { } auto after_run_send_graph = GetCurrentUS(); auto send_graph_use_time = after_run_send_graph - before_run_send_graph; - if (send_graph_use_time > 10) { + if (send_graph_use_time > 100) { VLOG(1) << "run send graph use time " << after_run_send_graph - before_run_send_graph; } @@ -165,9 +171,14 @@ void Communicator::RecvAll() { void Communicator::RecvThread() { VLOG(3) << "RecvThread start!"; while (running_) { - RecvAll(); - std::this_thread::sleep_for( - std::chrono::milliseconds(FLAGS_communicator_recv_wait_ms)); + auto grad_num = grad_num_.load(); + if (grad_num > FLAGS_communicator_max_send_grad_num_before_recv) { + VLOG(1) << "current grad num " << grad_num; + RecvAll(); + grad_num_.store(0); + } else { + std::this_thread::sleep_for(std::chrono::milliseconds(10)); + } } } diff --git a/paddle/fluid/operators/distributed/communicator.h b/paddle/fluid/operators/distributed/communicator.h index 3fe2a21232..859c0a7f51 100644 --- a/paddle/fluid/operators/distributed/communicator.h +++ b/paddle/fluid/operators/distributed/communicator.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once +#include #include #include #include @@ -184,6 +185,7 @@ class Communicator { std::unique_ptr send_scope_; // an independent scope std::unique_ptr<::ThreadPool> send_threadpool_{nullptr}; std::unique_ptr<::ThreadPool> recv_threadpool_{nullptr}; + std::atomic_uint grad_num_{0}; // the num of gradient sent since last recv // the following code is for initialize the commnunicator public: diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index c478c8ceee..97ac7fd97b 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -155,7 +155,7 @@ def __bootstrap__(): # env for communicator read_env_flags.append('communicator_independent_recv_thread') read_env_flags.append('communicator_send_queue_size') - read_env_flags.append('communicator_recv_wait_ms') + read_env_flags.append('communicator_max_send_grad_num_before_recv') read_env_flags.append('communicator_thread_pool_size') read_env_flags.append('communicator_max_merge_var_num') read_env_flags.append('communicator_fake_rpc') From 37f6b9ab7a24ace68167b68bfc3bce746a8abf7a Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Wed, 27 Mar 2019 12:20:38 +0800 Subject: [PATCH 84/98] fix build test=develop --- .../fluid/framework/details/multi_devices_graph_pass.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.h b/paddle/fluid/framework/details/multi_devices_graph_pass.h index 1d9ce17c50..21b0687f63 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.h +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.h @@ -54,8 +54,8 @@ class MultiDevSSAGraphBuilderBase : public ir::Pass { bool UseGPU() const; - bool NeedCollectiveForGrad(const std::string &grad_name, - std::vector ops) const; + virtual bool NeedCollectiveForGrad(const std::string &grad_name, + std::vector ops) const; bool IsScaleLossOp(ir::Node *node) const; @@ -117,7 +117,10 @@ class AsyncSSAGraphBuilder : public MultiDevSSAGraphBuilderBase { void InsertCollectiveOp(ir::Graph *result, const std::string &p_name, const std::string &g_name) const override {} - bool NeedCollectiveOps() const override { return false; } + bool NeedCollectiveForGrad(const std::string &grad_name, + std::vector ops) const { + return false; + } bool DealWithSpecialOp(ir::Graph *result, ir::Node *node) const override { if (node->Op()->Type() 
== "recv") { From d640c6cfa93179a592b662df36025e6e57c6fb17 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Wed, 27 Mar 2019 12:55:51 +0800 Subject: [PATCH 85/98] fix pylint --- python/paddle/fluid/transpiler/distribute_transpiler.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py index 4ddfc084e0..41e5f47976 100644 --- a/python/paddle/fluid/transpiler/distribute_transpiler.py +++ b/python/paddle/fluid/transpiler/distribute_transpiler.py @@ -1401,8 +1401,9 @@ class DistributeTranspiler(object): # create table param and grad var in pserver program # create table optimize block in pserver program table_opt_op = [ - op for op in self.optimize_ops if 'Param' in op.input_names and - op.input("Param")[0] == self.table_name + op for op in self.optimize_ops + if 'Param' in op.input_names and op.input("Param")[0] == + self.table_name ][0] origin_param_var = self.origin_program.global_block().vars[ From 392e97aae5451b5135ff3c971b4d8cc95ec9ae99 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Wed, 27 Mar 2019 13:04:00 +0800 Subject: [PATCH 86/98] fix cpplint test=develop --- paddle/fluid/framework/details/threaded_ssa_graph_executor.h | 2 +- paddle/fluid/framework/ir/pass.cc | 4 ++++ paddle/fluid/operators/distributed/communicator.h | 2 ++ paddle/fluid/operators/distributed/grpc/grpc_server.cc | 1 + paddle/fluid/operators/distributed/parameter_prefetch.cc | 1 + paddle/fluid/operators/hierarchical_sigmoid_op.h | 3 +++ 6 files changed, 12 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h index 51f625cf2d..ec0a0064c4 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h @@ -25,7 +25,7 @@ #include #include // ThreadPool in thrird party -#include + #include "paddle/fluid/framework/blocking_queue.h" #include "paddle/fluid/framework/details/exception_holder.h" #include "paddle/fluid/framework/details/execution_strategy.h" diff --git a/paddle/fluid/framework/ir/pass.cc b/paddle/fluid/framework/ir/pass.cc index 823697495e..a03ba10b94 100644 --- a/paddle/fluid/framework/ir/pass.cc +++ b/paddle/fluid/framework/ir/pass.cc @@ -13,6 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/ir/pass.h" + +#include +#include + #include "paddle/fluid/framework/ir/graph_helper.h" namespace paddle { diff --git a/paddle/fluid/operators/distributed/communicator.h b/paddle/fluid/operators/distributed/communicator.h index 859c0a7f51..41155bfc31 100644 --- a/paddle/fluid/operators/distributed/communicator.h +++ b/paddle/fluid/operators/distributed/communicator.h @@ -18,6 +18,8 @@ limitations under the License. */ #include #include #include +#include +#include #include #include diff --git a/paddle/fluid/operators/distributed/grpc/grpc_server.cc b/paddle/fluid/operators/distributed/grpc/grpc_server.cc index b86f0a53c4..0eb313f75d 100644 --- a/paddle/fluid/operators/distributed/grpc/grpc_server.cc +++ b/paddle/fluid/operators/distributed/grpc/grpc_server.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include +#include #include #include "paddle/fluid/operators/distributed/grpc/grpc_serde.h" diff --git a/paddle/fluid/operators/distributed/parameter_prefetch.cc b/paddle/fluid/operators/distributed/parameter_prefetch.cc index 539a038099..a1eba34662 100644 --- a/paddle/fluid/operators/distributed/parameter_prefetch.cc +++ b/paddle/fluid/operators/distributed/parameter_prefetch.cc @@ -14,6 +14,7 @@ #include #include +#include #include #include "paddle/fluid/operators/distributed/parameter_prefetch.h" diff --git a/paddle/fluid/operators/hierarchical_sigmoid_op.h b/paddle/fluid/operators/hierarchical_sigmoid_op.h index 751091478e..ed97878240 100644 --- a/paddle/fluid/operators/hierarchical_sigmoid_op.h +++ b/paddle/fluid/operators/hierarchical_sigmoid_op.h @@ -13,11 +13,14 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once + #include #include +#include #include #include #include + #include "paddle/fluid/framework/mixed_vector.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/clip_op.h" From b542639dc04f55584a70cb44413ca4ba9c8f2abe Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Wed, 27 Mar 2019 15:58:52 +0800 Subject: [PATCH 87/98] code clean test=develop --- .../details/async_ssa_graph_executor.cc | 27 ------------------- .../operators/distributed_ops/send_op.cc | 17 ++++++------ 2 files changed, 8 insertions(+), 36 deletions(-) diff --git a/paddle/fluid/framework/details/async_ssa_graph_executor.cc b/paddle/fluid/framework/details/async_ssa_graph_executor.cc index 12822c64e9..5ca676ccde 100644 --- a/paddle/fluid/framework/details/async_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/async_ssa_graph_executor.cc @@ -80,33 +80,6 @@ void ProcessGraph(std::vector graphs, Scope *scope) { } } } - /* - VLOG(3) << "delete all recv ops"; - for (auto *node : nodes_to_delete) { - // delete input edge - for (auto *in : node->inputs) { - auto &in_outs = in->outputs; - for (auto iter = in_outs.begin(); iter != in_outs.end();) { - if (*iter == node) { - VLOG(3) << "delete input edge from " << in->Name() << " for " - << node->Name(); - iter = in_outs.erase(iter); - } else { - ++iter; - } - } - } - // delete output edge - for (auto *out : node->outputs) { - PADDLE_ENFORCE_EQ(out->outputs.size(), 0, "%s should have no outputs", - out->Name()); - VLOG(3) << "delete output edge to " << out->Name(); - graphs[i]->RemoveNode(out); - } - VLOG(3) << "delete node " << node->Name(); - graphs[i]->RemoveNode(node); - } - */ } // init communicator here if (send_varname_to_ctx.size() > 0) { diff --git a/paddle/fluid/operators/distributed_ops/send_op.cc b/paddle/fluid/operators/distributed_ops/send_op.cc index 67de7b4185..47688d0ad4 100644 --- a/paddle/fluid/operators/distributed_ops/send_op.cc +++ b/paddle/fluid/operators/distributed_ops/send_op.cc @@ -48,15 +48,14 @@ class SendOp : public framework::OperatorBase { if (send_varnames.size() > 0) { PADDLE_ENFORCE_EQ(ins.size(), 1, ""); - /* - auto send_functor = distributed::ParameterSend(); - auto rpc_ctx = distributed::RpcContext(ins[0], send_varnames, epmap, - height_sections); - send_functor(rpc_ctx, scope, static_cast(sync_send)); - */ - VLOG(3) << "send " << ins[0]; - distributed::Communicator::GetInstance()->Send(ins[0], scope); - VLOG(3) << "send " << ins[0] << " done"; + if (distributed::Communicator::GetInstance() == nullptr) { + auto send_functor = distributed::ParameterSend(); + auto rpc_ctx = distributed::RpcContext(ins[0], send_varnames, epmap, 
+ height_sections); + send_functor(rpc_ctx, scope, static_cast(sync_send)); + } else { + distributed::Communicator::GetInstance()->Send(ins[0], scope); + } } else { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); From 33be014535609d3e4d58a36bf5243390cd8cc265 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Wed, 27 Mar 2019 17:12:58 +0800 Subject: [PATCH 88/98] fix distribute compile problem test=develop --- paddle/fluid/framework/details/CMakeLists.txt | 6 +++++- paddle/fluid/framework/details/async_ssa_graph_executor.cc | 5 +++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index a3a10eade8..9c4634bcbc 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -89,7 +89,11 @@ cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS cc_library(parallel_ssa_graph_executor SRCS parallel_ssa_graph_executor.cc DEPS threaded_ssa_graph_executor) -cc_library(async_ssa_graph_executor SRCS async_ssa_graph_executor.cc DEPS threaded_ssa_graph_executor communicator) +set(ASYNC_SSA_GRAPH_EXECUTOR_DEPS threaded_ssa_graph_executor) +if(WITH_DISTRIBUTE) + list(APPEND ASYNC_SSA_GRAPH_EXECUTOR_DEPS communicator) +endif() +cc_library(async_ssa_graph_executor SRCS async_ssa_graph_executor.cc DEPS ${ASYNC_SSA_GRAPH_EXECUTOR_DEPS}) cc_test(broadcast_op_test SRCS broadcast_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory device_context broadcast_op_handle) diff --git a/paddle/fluid/framework/details/async_ssa_graph_executor.cc b/paddle/fluid/framework/details/async_ssa_graph_executor.cc index 5ca676ccde..e9aad5d264 100644 --- a/paddle/fluid/framework/details/async_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/async_ssa_graph_executor.cc @@ -15,7 +15,10 @@ #include "paddle/fluid/framework/details/async_ssa_graph_executor.h" #include "paddle/fluid/framework/variable_helper.h" + +#ifdef PADDLE_WITH_DISTRIBUTE #include "paddle/fluid/operators/distributed/communicator.h" +#endif namespace paddle { namespace framework { @@ -43,6 +46,7 @@ inline void NewTempScopeAndInitVars(const std::vector &var_infos, // get RpcContext and remote send and recv op void ProcessGraph(std::vector graphs, Scope *scope) { +#ifdef PADDLE_WITH_DISTRIBUTE using RpcCtxMap = operators::distributed::RpcCtxMap; VLOG(3) << "ProcessGraph"; RpcCtxMap send_varname_to_ctx; @@ -88,6 +92,7 @@ void ProcessGraph(std::vector graphs, Scope *scope) { recv_varname_to_ctx, scope); operators::distributed::Communicator::GetInstance()->Start(); } +#endif } AsyncSSAGraphExecutor::AsyncSSAGraphExecutor( From b68f84090bfc00c2c73aa49aca5f760bd2859352 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Wed, 27 Mar 2019 19:09:58 +0800 Subject: [PATCH 89/98] fix test_split_selected_rows_op test=develop --- .../paddle/fluid/tests/unittests/test_split_selected_rows_op.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/fluid/tests/unittests/test_split_selected_rows_op.py b/python/paddle/fluid/tests/unittests/test_split_selected_rows_op.py index f8847e1570..d8c57d964d 100644 --- a/python/paddle/fluid/tests/unittests/test_split_selected_rows_op.py +++ b/python/paddle/fluid/tests/unittests/test_split_selected_rows_op.py @@ -38,7 +38,7 @@ class TestSpliteSelectedRows(unittest.TestCase): def check_with_place(self, place): scope = core.Scope() rows = [0, 5, 7, 4, 20] - height = 20 + height = 21 row_numel 
= 2 # initialize input variable X From 34890fd3b129f85f28489453ddd1d5f62dd526f7 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Thu, 28 Mar 2019 09:07:50 +0800 Subject: [PATCH 90/98] fix gpu build for lookup_table_op test=develop --- paddle/fluid/operators/lookup_table_op.cu | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/operators/lookup_table_op.cu b/paddle/fluid/operators/lookup_table_op.cu index 0af8b9e69c..a863af4af9 100644 --- a/paddle/fluid/operators/lookup_table_op.cu +++ b/paddle/fluid/operators/lookup_table_op.cu @@ -84,7 +84,8 @@ class LookupTableCUDAKernel : public framework::OpKernel { // for remote prefetch auto epmap = context.Attr>("epmap"); - auto height_sections = context.Attr>("height_sections"); + auto height_sections = + context.Attr>("height_sections"); auto table_names = context.Attr>("table_names"); if (!epmap.empty()) { From 61912e879d23811e966fc6dae8eeaf080056b4e4 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Fri, 29 Mar 2019 21:24:29 +0800 Subject: [PATCH 91/98] test_dist_base set runtime_split_send_recv to false test=develop --- python/paddle/fluid/tests/unittests/test_dist_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/fluid/tests/unittests/test_dist_base.py b/python/paddle/fluid/tests/unittests/test_dist_base.py index 9fd2fe739e..a5d8cd4660 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_base.py +++ b/python/paddle/fluid/tests/unittests/test_dist_base.py @@ -52,7 +52,7 @@ class TestDistRunnerBase(object): # NOTE: import fluid until runtime, or else forking processes will cause error. config = fluid.DistributeTranspilerConfig() config.enable_dc_asgd = dc_asgd - config.runtime_split_send_recv = True + # config.runtime_split_send_recv = True t = fluid.DistributeTranspiler(config=config) t.transpile( trainer_id=trainer_id, From a1821a04493152facc8ff63a2bcd6b339028d7a5 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Sat, 30 Mar 2019 22:52:19 +0800 Subject: [PATCH 92/98] remote remote_prefetch in embedding layer test=develop --- paddle/fluid/framework/details/async_ssa_graph_executor.cc | 3 +++ paddle/fluid/operators/lookup_table_op.h | 3 ++- python/paddle/fluid/layers/nn.py | 5 ++--- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/framework/details/async_ssa_graph_executor.cc b/paddle/fluid/framework/details/async_ssa_graph_executor.cc index e9aad5d264..8fe4cdc709 100644 --- a/paddle/fluid/framework/details/async_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/async_ssa_graph_executor.cc @@ -81,6 +81,9 @@ void ProcessGraph(std::vector graphs, Scope *scope) { nodes_to_delete.push_back(node); VLOG(3) << "find and remove an recv op: " << recv_varname_to_ctx[recv_var_name]; + } else if (node->Name() == "lookup_table") { + VLOG(0) << "set lookup_table op remote_prefetch to false"; + node->Op()->SetAttr("remote_prefetch", false); } } } diff --git a/paddle/fluid/operators/lookup_table_op.h b/paddle/fluid/operators/lookup_table_op.h index 524565a439..62e298e066 100644 --- a/paddle/fluid/operators/lookup_table_op.h +++ b/paddle/fluid/operators/lookup_table_op.h @@ -50,11 +50,12 @@ class LookupTableKernel : public framework::OpKernel { // for remote prefetch auto epmap = context.Attr>("epmap"); + auto remote_prefetch = context.Attr("remote_prefetch"); auto height_sections = context.Attr>("height_sections"); auto table_names = context.Attr>("table_names"); - if (!epmap.empty()) { + if (remote_prefetch && !epmap.empty()) { // if epmap is not empty, 
then the parameter will be fetched from remote // parameter // server diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 9743cfa727..f2413f6033 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -341,8 +341,7 @@ def embedding(input, is_distributed=False, padding_idx=None, param_attr=None, - dtype='float32', - remote_prefetch=False): + dtype='float32'): """ **Embedding Layer** @@ -381,7 +380,7 @@ def embedding(input, """ helper = LayerHelper('embedding', **locals()) - remote_prefetch = is_sparse and (not is_distributed) and remote_prefetch + remote_prefetch = is_sparse and (not is_distributed) if remote_prefetch: assert is_sparse is True and is_distributed is False w = helper.create_parameter( From df45c8c538bddc1d43f933438413d4143c588fce Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Sat, 30 Mar 2019 23:00:17 +0800 Subject: [PATCH 93/98] update nce and hierarchical_sigmoid remote_prefetch test=develop --- paddle/fluid/framework/details/async_ssa_graph_executor.cc | 5 +++-- paddle/fluid/operators/hierarchical_sigmoid_op.h | 3 ++- paddle/fluid/operators/nce_op.h | 3 ++- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/framework/details/async_ssa_graph_executor.cc b/paddle/fluid/framework/details/async_ssa_graph_executor.cc index 8fe4cdc709..52641260a6 100644 --- a/paddle/fluid/framework/details/async_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/async_ssa_graph_executor.cc @@ -81,8 +81,9 @@ void ProcessGraph(std::vector graphs, Scope *scope) { nodes_to_delete.push_back(node); VLOG(3) << "find and remove an recv op: " << recv_varname_to_ctx[recv_var_name]; - } else if (node->Name() == "lookup_table") { - VLOG(0) << "set lookup_table op remote_prefetch to false"; + } else if (node->Name() == "lookup_table" || node->Name() == "nce" || + node->Name() == "hierarchical_sigmoid") { + VLOG(0) << "set " << node->Name() << " op remote_prefetch to false"; node->Op()->SetAttr("remote_prefetch", false); } } diff --git a/paddle/fluid/operators/hierarchical_sigmoid_op.h b/paddle/fluid/operators/hierarchical_sigmoid_op.h index ed97878240..82c8171ca5 100644 --- a/paddle/fluid/operators/hierarchical_sigmoid_op.h +++ b/paddle/fluid/operators/hierarchical_sigmoid_op.h @@ -68,8 +68,9 @@ class HierarchicalSigmoidOpKernel : public framework::OpKernel { size_t num_classes = static_cast(ctx.Attr("num_classes")); // for remote prefetch + auto remote_prefetch = ctx.Attr("remote_prefetch"); auto epmap = ctx.Attr>("epmap"); - if (!epmap.empty()) { + if (remote_prefetch && !epmap.empty()) { // if epmap is not empty, then the parameter will be fetched from remote // parameter // server diff --git a/paddle/fluid/operators/nce_op.h b/paddle/fluid/operators/nce_op.h index 25b6ed851b..12f3118ec7 100644 --- a/paddle/fluid/operators/nce_op.h +++ b/paddle/fluid/operators/nce_op.h @@ -156,9 +156,10 @@ class NCEKernel : public framework::OpKernel { auto input_mat = EigenMatrix::From(*(context.Input("Input"))); // for remote prefetch + auto remote_prefetch = context.Attr("remote_prefetch"); auto epmap = context.Attr>("epmap"); - if (!epmap.empty()) { + if (remote_prefetch && !epmap.empty()) { // if epmap is not empty, then the parameter will be fetched from remote // parameter // server From 8342f12e3159c74cb6753be15c6661a3bf5ac789 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Sun, 31 Mar 2019 09:02:50 +0800 Subject: [PATCH 94/98] fix set remote_prefetch test=develop --- 
paddle/fluid/framework/details/async_ssa_graph_executor.cc | 4 ---- paddle/fluid/framework/details/multi_devices_graph_pass.h | 5 +++++ 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/framework/details/async_ssa_graph_executor.cc b/paddle/fluid/framework/details/async_ssa_graph_executor.cc index 52641260a6..e9aad5d264 100644 --- a/paddle/fluid/framework/details/async_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/async_ssa_graph_executor.cc @@ -81,10 +81,6 @@ void ProcessGraph(std::vector graphs, Scope *scope) { nodes_to_delete.push_back(node); VLOG(3) << "find and remove an recv op: " << recv_varname_to_ctx[recv_var_name]; - } else if (node->Name() == "lookup_table" || node->Name() == "nce" || - node->Name() == "hierarchical_sigmoid") { - VLOG(0) << "set " << node->Name() << " op remote_prefetch to false"; - node->Op()->SetAttr("remote_prefetch", false); } } } diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.h b/paddle/fluid/framework/details/multi_devices_graph_pass.h index a3fe9e8b13..82d003fad7 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.h +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.h @@ -127,8 +127,13 @@ class AsyncSSAGraphBuilder : public MultiDevSSAGraphBuilderBase { bool DealWithSpecialOp(ir::Graph *result, ir::Node *node) const override { if (node->Op()->Type() == "recv") { + VLOG(0) << "set recv op do_not_run to true"; node->Op()->SetAttr("do_not_run", true); node->Op()->Flush(); + } else if (node->Name() == "lookup_table" || node->Name() == "nce" || + node->Name() == "hierarchical_sigmoid") { + VLOG(0) << "set " << node->Name() << " op remote_prefetch to false"; + node->Op()->SetAttr("remote_prefetch", false); } return false; } From 9db1a9e1288433878128ba40f88a32e4ef5a1691 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Sun, 31 Mar 2019 12:03:08 +0800 Subject: [PATCH 95/98] change log level test=develop --- paddle/fluid/framework/details/multi_devices_graph_pass.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.h b/paddle/fluid/framework/details/multi_devices_graph_pass.h index 82d003fad7..26fc8dc198 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.h +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.h @@ -127,12 +127,12 @@ class AsyncSSAGraphBuilder : public MultiDevSSAGraphBuilderBase { bool DealWithSpecialOp(ir::Graph *result, ir::Node *node) const override { if (node->Op()->Type() == "recv") { - VLOG(0) << "set recv op do_not_run to true"; + VLOG(1) << "set recv op do_not_run to true"; node->Op()->SetAttr("do_not_run", true); node->Op()->Flush(); } else if (node->Name() == "lookup_table" || node->Name() == "nce" || node->Name() == "hierarchical_sigmoid") { - VLOG(0) << "set " << node->Name() << " op remote_prefetch to false"; + VLOG(1) << "set " << node->Name() << " op remote_prefetch to false"; node->Op()->SetAttr("remote_prefetch", false); } return false; From fb6cc3a1bd40378b3a9d560bd975ab22b730eb2d Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Mon, 1 Apr 2019 09:06:33 +0800 Subject: [PATCH 96/98] follow commnet, optimize code and add comment test=develop --- .../framework/details/multi_devices_graph_pass.h | 3 +++ paddle/fluid/framework/scope.h | 4 ++++ .../fluid/operators/distributed/parameter_send.cc | 13 ++++++------- paddle/fluid/operators/distributed_ops/send_op.cc | 2 +- 4 files changed, 14 insertions(+), 8 deletions(-) diff --git 
a/paddle/fluid/framework/details/multi_devices_graph_pass.h b/paddle/fluid/framework/details/multi_devices_graph_pass.h index 26fc8dc198..7cc68dd2d5 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.h +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.h @@ -132,8 +132,11 @@ class AsyncSSAGraphBuilder : public MultiDevSSAGraphBuilderBase { node->Op()->Flush(); } else if (node->Name() == "lookup_table" || node->Name() == "nce" || node->Name() == "hierarchical_sigmoid") { + // in async_mode, we do not need remote prefetch, because communicator + // will do async parameter recv. VLOG(1) << "set " << node->Name() << " op remote_prefetch to false"; node->Op()->SetAttr("remote_prefetch", false); + node->Op()->Flush(); } return false; } diff --git a/paddle/fluid/framework/scope.h b/paddle/fluid/framework/scope.h index cd752077d6..6665458d4c 100644 --- a/paddle/fluid/framework/scope.h +++ b/paddle/fluid/framework/scope.h @@ -52,6 +52,10 @@ class Scope { /// Mark it to const because that new kid scope cannot change parent scope. Scope& NewScope() const; + /// Create a sub-scope for current scope but do not record it in the kids to + /// avoid performance problems. + /// Note!!! You should delete the result pointer yourself to avoid memory + /// leak! Scope* NewTmpScope() const; /// Create a variable with given name if it doesn't exist. diff --git a/paddle/fluid/operators/distributed/parameter_send.cc b/paddle/fluid/operators/distributed/parameter_send.cc index ec2884c252..4858dbe84e 100644 --- a/paddle/fluid/operators/distributed/parameter_send.cc +++ b/paddle/fluid/operators/distributed/parameter_send.cc @@ -81,8 +81,8 @@ void ParameterSend::operator()(const RpcContext &rpc_ctx, auto abs_sections = ToAbsoluteSection(rpc_ctx.height_sections); auto &send_rows = send_slr.rows(); - std::vector> outs_rows_idx; - std::vector> outs_dense_idx; + std::vector> outs_rows_idx; + std::vector> outs_dense_idx; outs_rows_idx.resize(out_num); outs_dense_idx.resize(out_num); @@ -99,7 +99,7 @@ void ParameterSend::operator()(const RpcContext &rpc_ctx, // split rows index into output sparse vars for (size_t i = 0; i < send_rows.size(); ++i) { - int out_idx = GetSectionIndex(send_rows[i], abs_sections); + size_t out_idx = GetSectionIndex(send_rows[i], abs_sections); outs_rows_idx[out_idx].push_back(send_rows[i]); outs_dense_idx[out_idx].push_back(i); } @@ -160,10 +160,9 @@ void ParameterSend::operator()(const RpcContext &rpc_ctx, } } - // note!! 
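A minimal usage sketch for the temporary-scope contract documented in the hunk above (illustrative only, not part of the patch): the caller owns the scope returned by NewTmpScope() and must free it itself, because the child is not recorded in the parent's kids. The helper function and variable name below are hypothetical; note that PATCH 97 later in this series changes the return type to std::unique_ptr<Scope>, which removes the manual delete.

    // Illustrative sketch, assuming the raw-pointer NewTmpScope() shown above.
    #include "paddle/fluid/framework/scope.h"

    void RunInTmpScope(const paddle::framework::Scope &parent) {
      // The temporary scope is not tracked by the parent, so the parent will
      // never free it for us.
      paddle::framework::Scope *tmp = parent.NewTmpScope();
      tmp->Var("tmp_out");  // scratch variables live only in the tmp scope
      // ... run whatever work needs the scratch scope here ...
      delete tmp;  // required, otherwise the scope leaks
    }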
only support sync send now - if (true || sync) { - for (size_t i = 0; i < rets.size(); i++) { - PADDLE_ENFORCE(rets[i]->Wait(), "internal error in RPCClient"); + if (sync) { + for (auto &handle : rets) { + PADDLE_ENFORCE(handle->Wait(), "internal error in RPCClient"); } } diff --git a/paddle/fluid/operators/distributed_ops/send_op.cc b/paddle/fluid/operators/distributed_ops/send_op.cc index 47688d0ad4..b08cd0942f 100644 --- a/paddle/fluid/operators/distributed_ops/send_op.cc +++ b/paddle/fluid/operators/distributed_ops/send_op.cc @@ -52,7 +52,7 @@ class SendOp : public framework::OperatorBase { auto send_functor = distributed::ParameterSend(); auto rpc_ctx = distributed::RpcContext(ins[0], send_varnames, epmap, height_sections); - send_functor(rpc_ctx, scope, static_cast(sync_send)); + send_functor(rpc_ctx, scope, true); } else { distributed::Communicator::GetInstance()->Send(ins[0], scope); } From 9861a92f6f014b826050b1c292eff3fb1b6ea5dc Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Mon, 1 Apr 2019 12:19:40 +0800 Subject: [PATCH 97/98] change the return type of NewTempScope to unique ptr test=develop --- paddle/fluid/framework/scope.cc | 4 +++- paddle/fluid/framework/scope.h | 4 +--- .../operators/distributed/parameter_prefetch.cc | 13 ++++++------- .../fluid/operators/distributed/parameter_recv.cc | 5 ++--- .../fluid/operators/distributed/parameter_send.cc | 10 ++++------ .../fluid/operators/distributed/variable_response.h | 2 +- 6 files changed, 17 insertions(+), 21 deletions(-) diff --git a/paddle/fluid/framework/scope.cc b/paddle/fluid/framework/scope.cc index aa1039baf0..49e22a5ad3 100644 --- a/paddle/fluid/framework/scope.cc +++ b/paddle/fluid/framework/scope.cc @@ -59,7 +59,9 @@ Scope& Scope::NewScope() const { return *child; } -Scope* Scope::NewTmpScope() const { return new Scope(this); } +std::unique_ptr Scope::NewTmpScope() const { + return std::unique_ptr(new Scope(this)); +} Variable* Scope::Var(const std::string& name) { SCOPE_VARS_WRITER_LOCK diff --git a/paddle/fluid/framework/scope.h b/paddle/fluid/framework/scope.h index 6665458d4c..5f3d106e09 100644 --- a/paddle/fluid/framework/scope.h +++ b/paddle/fluid/framework/scope.h @@ -54,9 +54,7 @@ class Scope { /// Create a sub-scope for current scope but do not record it in the kids to /// avoid performance problems. - /// Note!!! You should delete the result pointer yourself to avoid memory - /// leak! - Scope* NewTmpScope() const; + std::unique_ptr NewTmpScope() const; /// Create a variable with given name if it doesn't exist. /// Caller doesn't own the returned Variable. 
diff --git a/paddle/fluid/operators/distributed/parameter_prefetch.cc b/paddle/fluid/operators/distributed/parameter_prefetch.cc index a686672813..7c33153ba7 100644 --- a/paddle/fluid/operators/distributed/parameter_prefetch.cc +++ b/paddle/fluid/operators/distributed/parameter_prefetch.cc @@ -160,7 +160,7 @@ void prefetch(const std::string& id_name, const std::string& out_name, const std::vector& height_sections, const framework::ExecutionContext& context, const framework::Scope& scope) { - framework::Scope* local_scope = scope.NewTmpScope(); + std::unique_ptr local_scope = scope.NewTmpScope(); platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); auto& cpu_ctx = *pool.Get(platform::CPUPlace()); @@ -206,7 +206,7 @@ void prefetch(const std::string& id_name, const std::string& out_name, auto splited_ids = SplitIds(ids_vector, height_sections); SplitIdsIntoMultipleVarsBySection(in_var_names, height_sections, splited_ids, - local_scope); + local_scope.get()); // create output var in local scope for (auto& name : out_var_names) { @@ -215,12 +215,12 @@ void prefetch(const std::string& id_name, const std::string& out_name, std::vector rets; for (size_t i = 0; i < in_var_names.size(); i++) { - if (NeedSend(*local_scope, in_var_names[i])) { + if (NeedSend(*local_scope.get(), in_var_names[i])) { VLOG(3) << "sending " << in_var_names[i] << " to " << epmap[i] << " to get " << out_var_names[i] << " back"; rets.push_back(rpc_client->AsyncPrefetchVar( - epmap[i], cpu_ctx, *local_scope, in_var_names[i], out_var_names[i], - table_names[i])); + epmap[i], cpu_ctx, *local_scope.get(), in_var_names[i], + out_var_names[i], table_names[i])); } else { VLOG(3) << "don't send no-initialied variable: " << out_var_names[i]; } @@ -232,8 +232,7 @@ void prefetch(const std::string& id_name, const std::string& out_name, MergeMultipleVarsIntoOneBySection(id_name, ids_vector, out_name, out_var_names, height_sections, splited_ids, - context, local_scope, &actual_ctx); - delete local_scope; + context, local_scope.get(), &actual_ctx); } }; // namespace distributed diff --git a/paddle/fluid/operators/distributed/parameter_recv.cc b/paddle/fluid/operators/distributed/parameter_recv.cc index ae6516b246..2466be3254 100644 --- a/paddle/fluid/operators/distributed/parameter_recv.cc +++ b/paddle/fluid/operators/distributed/parameter_recv.cc @@ -42,7 +42,7 @@ template void ParameterRecv::operator()(const RpcContext &rpc_ctx, const framework::Scope &scope) { VLOG(3) << "ParameterRecv in"; - framework::Scope *local_scope = scope.NewTmpScope(); + std::unique_ptr local_scope = scope.NewTmpScope(); platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto &cpu_ctx = *pool.Get(platform::CPUPlace()); @@ -64,7 +64,7 @@ void ParameterRecv::operator()(const RpcContext &rpc_ctx, recved_tensors.push_back(t); VLOG(3) << "recv " << recv_var_name << " from " << rpc_ctx.epmap[i]; rets.push_back(rpc_client->AsyncGetVar(rpc_ctx.epmap[i], cpu_ctx, - *local_scope, recv_var_name, + *local_scope.get(), recv_var_name, recv_var_name)); } for (size_t i = 0; i < rets.size(); i++) { @@ -93,7 +93,6 @@ void ParameterRecv::operator()(const RpcContext &rpc_ctx, PADDLE_ENFORCE_EQ(recv_numel, recv_tensor->numel()); } - delete local_scope; VLOG(3) << "ParameterRecv out"; } diff --git a/paddle/fluid/operators/distributed/parameter_send.cc b/paddle/fluid/operators/distributed/parameter_send.cc index 4858dbe84e..c8a00cce7e 100644 --- a/paddle/fluid/operators/distributed/parameter_send.cc +++ 
b/paddle/fluid/operators/distributed/parameter_send.cc @@ -40,7 +40,7 @@ using DDim = framework::DDim; template void ParameterSend::operator()(const RpcContext &rpc_ctx, const framework::Scope &scope, bool sync) { - framework::Scope *local_scope = scope.NewTmpScope(); + std::unique_ptr local_scope = scope.NewTmpScope(); platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto &cpu_ctx = *pool.Get(platform::CPUPlace()); @@ -150,10 +150,10 @@ void ParameterSend::operator()(const RpcContext &rpc_ctx, for (size_t i = 0; i < rpc_ctx.splited_var_names.size(); i++) { auto &send_var_name = rpc_ctx.splited_var_names[i]; auto &endpoint = rpc_ctx.epmap[i]; - if (NeedSend(*local_scope, send_var_name)) { + if (NeedSend(*local_scope.get(), send_var_name)) { VLOG(3) << "sending " << send_var_name << " to " << endpoint; - rets.push_back(rpc_client->AsyncSendVar(endpoint, cpu_ctx, *local_scope, - send_var_name)); + rets.push_back(rpc_client->AsyncSendVar( + endpoint, cpu_ctx, *local_scope.get(), send_var_name)); } else { VLOG(3) << "don't send non-initialized variable: " << rpc_ctx.splited_var_names[i]; @@ -165,8 +165,6 @@ void ParameterSend::operator()(const RpcContext &rpc_ctx, PADDLE_ENFORCE(handle->Wait(), "internal error in RPCClient"); } } - - delete local_scope; } template struct ParameterSend; diff --git a/paddle/fluid/operators/distributed/variable_response.h b/paddle/fluid/operators/distributed/variable_response.h index eb3265e092..3cabcd22cd 100644 --- a/paddle/fluid/operators/distributed/variable_response.h +++ b/paddle/fluid/operators/distributed/variable_response.h @@ -60,7 +60,7 @@ class VariableResponse { bool create_scope = false) : scope_(scope), dev_ctx_(dev_ctx), create_scope_(create_scope) { if (create_scope) { - local_scope_ = scope->NewTmpScope(); + local_scope_ = scope->NewTmpScope().release(); } } From 4031c1a7b1248f0f909dc30dd852aacedb4a4daa Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Mon, 1 Apr 2019 13:25:58 +0800 Subject: [PATCH 98/98] fix ci build test=develop --- paddle/fluid/operators/distributed/parameter_prefetch.cc | 1 + paddle/fluid/operators/distributed/parameter_recv.cc | 1 + paddle/fluid/operators/distributed/parameter_send.cc | 1 + 3 files changed, 3 insertions(+) diff --git a/paddle/fluid/operators/distributed/parameter_prefetch.cc b/paddle/fluid/operators/distributed/parameter_prefetch.cc index 7c33153ba7..0e8d877e08 100644 --- a/paddle/fluid/operators/distributed/parameter_prefetch.cc +++ b/paddle/fluid/operators/distributed/parameter_prefetch.cc @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include #include #include #include diff --git a/paddle/fluid/operators/distributed/parameter_recv.cc b/paddle/fluid/operators/distributed/parameter_recv.cc index 2466be3254..e7d4c262aa 100644 --- a/paddle/fluid/operators/distributed/parameter_recv.cc +++ b/paddle/fluid/operators/distributed/parameter_recv.cc @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include #include #include #include diff --git a/paddle/fluid/operators/distributed/parameter_send.cc b/paddle/fluid/operators/distributed/parameter_send.cc index c8a00cce7e..9ce4244452 100644 --- a/paddle/fluid/operators/distributed/parameter_send.cc +++ b/paddle/fluid/operators/distributed/parameter_send.cc @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+#include #include #include #include
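The series above (in particular PATCH 79 and PATCH 83) moves the communicator from a fixed recv wait interval to a counter-based scheme: the send thread counts how many merged gradients have gone out, and the recv thread pulls parameters back only after that counter passes FLAGS_communicator_max_send_grad_num_before_recv, then resets it. The sketch below is a simplified, self-contained model of that coordination pattern; it is illustrative only, uses plain std::thread instead of the actual Communicator and ThreadPool classes, and replaces the gflags with hypothetical constants.

    // Simplified model of the send/recv coordination from PATCH 83 (illustrative only).
    #include <atomic>
    #include <chrono>
    #include <iostream>
    #include <thread>

    int main() {
      const unsigned max_send_grad_num_before_recv = 20;  // stand-in for the flag
      std::atomic<unsigned> grad_num{0};  // gradients sent since the last recv
      std::atomic<bool> running{true};

      std::thread send_thread([&] {
        for (int step = 0; step < 100; ++step) {
          // pretend one merged gradient of the "first" grad var was sent
          grad_num.fetch_add(1, std::memory_order_relaxed);
          std::this_thread::sleep_for(std::chrono::milliseconds(1));
        }
        running = false;
      });

      std::thread recv_thread([&] {
        while (running) {
          unsigned n = grad_num.load();
          if (n > max_send_grad_num_before_recv) {
            std::cout << "recv parameters after " << n << " sends\n";
            grad_num.store(0);  // reset, as RecvThread does after RecvAll()
          } else {
            std::this_thread::sleep_for(std::chrono::milliseconds(10));
          }
        }
      });

      send_thread.join();
      recv_thread.join();
      return 0;
    }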