Paddle/paddle/fluid/framework/parallel_executor.cc

/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/framework/parallel_executor.h"

#include <string>
#include <vector>

#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/nccl_helper.h"
#endif

#include "paddle/fluid/framework/details/multi_devices_graph_builder.h"
#include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h"
#include "paddle/fluid/platform/profiler.h"

namespace paddle {
namespace framework {

// Holds the state that ParallelExecutor reaches through its `member_`
// pointer.
class ParallelExecutorPrivate {
 public:
  // Stores a copy of `places`; every other member is filled in later by
  // ParallelExecutor's constructor.
  explicit ParallelExecutorPrivate(const std::vector<platform::Place> &places)
      : places_(places) {}

  // Places the executor runs on.
  std::vector<platform::Place> places_;
  // One scope per place: local_scopes_[i] is used together with places_[i].
  std::vector<Scope *> local_scopes_;
  // Scope handed to ParallelExecutor's constructor; Run() writes the fetched
  // results into it. Initialised to nullptr so the pointer is never
  // indeterminate before the constructor assigns it.
  Scope *global_scope_ = nullptr;
  // Executor that runs the graph built from the main program.
  std::unique_ptr<details::SSAGraphExecutor> executor_;

#ifdef PADDLE_WITH_CUDA
  // NCCL contexts created from places_; only present in CUDA builds.
  std::unique_ptr<platform::NCCLContextMap> nccl_ctxs_;
#endif
};

// Returns a mutable reference to the per-place local scopes held by this
// executor.
std::vector<Scope *> &ParallelExecutor::GetLocalScopes() {
  auto &scopes = member_->local_scopes_;
  return scopes;
}

// Constructs the executor: prepares one local scope per place, optionally
// broadcasts `bcast_vars`, builds the multi-device graph from `main_program`
// and creates the variables of block 0 in every local scope.
//
//   num_threads, use_event, allow_op_delay
//       forwarded unchanged to details::ThreadedSSAGraphExecutor.
//   places         places to run on; must not be empty.
//   params, loss_var_name
//       forwarded to details::MultiDevSSAGraphBuilder.
//   bcast_vars     variable names handed to BCastParamsToGPUs.
//   main_program   program converted into the execution graph.
//   scope          global scope; new local scopes are created from it via
//                  NewScope() when `local_scopes` is empty.
//   local_scopes   when non-empty, these scopes are used as the local scopes
//                  (exactly one per place is required) and no broadcast is
//                  performed.
//
// Throws (PADDLE_THROW / PADDLE_ENFORCE_EQ) when `places` is empty or when
// `local_scopes` is non-empty but its size differs from `places`.
ParallelExecutor::ParallelExecutor(
    size_t num_threads, bool use_event,
    const std::vector<platform::Place> &places,
    const std::unordered_set<std::string> &params,
    const std::unordered_set<std::string> &bcast_vars,
    const ProgramDesc &main_program, const std::string &loss_var_name,
    Scope *scope, const std::vector<Scope *> &local_scopes, bool allow_op_delay)
    : member_(new ParallelExecutorPrivate(places)) {
  // places[0] is inspected below; indexing an empty vector is undefined
  // behaviour, so reject that case explicitly.
  // NOTE(review): like the PADDLE_ENFORCE_EQ below, this throws after member_
  // has been allocated; whether it is released depends on how member_ is
  // declared in the header -- confirm.
  if (places.empty()) {
    PADDLE_THROW("ParallelExecutor requires at least one place");
  }

  member_->global_scope_ = scope;

  // Step 1. Bcast the params to devs.
  // Create local scopes
  if (local_scopes.empty()) {
    for (size_t i = 0; i < member_->places_.size(); ++i) {
      member_->local_scopes_.push_back(&scope->NewScope());
    }
  } else {
    PADDLE_ENFORCE_EQ(member_->places_.size(), local_scopes.size());
    for (size_t i = 0; i < member_->places_.size(); ++i) {
      member_->local_scopes_.push_back(local_scopes[i]);
    }
  }

// Bcast Parameters to all GPUs
#ifdef PADDLE_WITH_CUDA
  member_->nccl_ctxs_.reset(new platform::NCCLContextMap(member_->places_));
#endif
  // Broadcast only when running on GPU places, with more than one local
  // scope, and when the local scopes were created above rather than supplied
  // by the caller.
  if (platform::is_gpu_place(places[0]) && member_->local_scopes_.size() != 1 &&
      local_scopes.empty()) {  // Is CUDA
    BCastParamsToGPUs(bcast_vars);
  }
// Startup Program has been run. All local scopes have correct parameters.

// Step 2. Convert main_program to SSA form and dependency graph. Also, insert
// ncclOp
#ifdef PADDLE_WITH_CUDA
  details::MultiDevSSAGraphBuilder builder(member_->places_, loss_var_name,
                                           params, member_->local_scopes_,
                                           member_->nccl_ctxs_.get());
#else
  details::MultiDevSSAGraphBuilder builder(member_->places_, loss_var_name,
                                           params, member_->local_scopes_);
#endif
  auto graph = builder.Build(main_program);

  // The graph is moved into the executor; `graph` must not be used afterwards.
  member_->executor_.reset(new details::ThreadedSSAGraphExecutor(
      num_threads, use_event, member_->local_scopes_, places, std::move(graph),
      allow_op_delay));

  // Step 3. Create vars in each scope.
  // The loop variable is deliberately not called `scope`, so it does not
  // shadow the constructor parameter of that name.
  for (auto *local_scope : member_->local_scopes_) {
    for (auto *var : main_program.Block(0).AllVars()) {
      // Variables that FindVar already resolves are left untouched.
      if (local_scope->FindVar(var->Name()) != nullptr) {
        continue;
      }

      InitializeVariable(local_scope->Var(var->Name()), var->GetType());
    }
  }
}

// Copies every variable named in `vars` from local_scopes_[0] into the other
// local scopes.
//
// Names that local_scopes_[0] cannot resolve, and variables that do not hold
// a LoDTensor, are skipped. Tensors located on a GPU place are sent with
// ncclBcast using rank 0 as root; all other tensors are copied with
// TensorCopy to a CPUPlace.
//
// Throws via PADDLE_THROW when built without PADDLE_WITH_CUDA.
void ParallelExecutor::BCastParamsToGPUs(
    const std::unordered_set<std::string> &vars) const {
#ifdef PADDLE_WITH_CUDA
  auto *main_scope = member_->local_scopes_[0];

  for (auto &var : vars) {
    auto *main_var = main_scope->FindVar(var);
    // FindVar returns nullptr for a name it cannot resolve; dereferencing it
    // would crash, so such names are skipped just like non-LoDTensor ones.
    if (main_var == nullptr || !main_var->IsType<LoDTensor>()) {
      continue;
    }

    auto &main_tensor = main_var->Get<LoDTensor>();

    auto &dims = main_tensor.dims();

    if (paddle::platform::is_gpu_place(main_tensor.place())) {
      size_t numel = main_tensor.numel();
      ncclDataType_t data_type = platform::ToNCCLDataType(main_tensor.type());
      // Scoped guard from nccl_helper.h that lives for the whole loop below
      // (presumably brackets the ncclBcast calls in one NCCL group -- confirm).
      platform::NCCLGroupGuard guard;
      for (size_t i = 0; i < member_->places_.size(); ++i) {
        auto place = member_->places_[i];
        void *buffer;
        if (i == 0) {
          // Source buffer: the tensor is only reachable through a const
          // reference here, while ncclBcast takes a non-const pointer.
          buffer = const_cast<void *>(main_tensor.data<void>());
        } else {
          // Destination buffer: create/resize the tensor in scope i and
          // obtain its storage on that scope's place.
          auto local_scope = member_->local_scopes_[i];
          auto *t = local_scope->Var(var)->GetMutable<LoDTensor>();
          t->Resize(dims);
          buffer = t->mutable_data(place, main_tensor.type());
        }
        auto &nccl_ctx = member_->nccl_ctxs_->at(place);
        platform::dynload::ncclBcast(buffer, numel, data_type, 0,
                                     nccl_ctx.comm_, nccl_ctx.stream());
      }
    } else {
      platform::CPUPlace cpu;
      // Index 0 is the source scope itself, so copying starts at 1.
      for (size_t i = 1; i < member_->places_.size(); ++i) {
        auto local_scope = member_->local_scopes_[i];
        auto *t = local_scope->Var(var)->GetMutable<LoDTensor>();
        t->Resize(dims);
        t->mutable_data(cpu, main_tensor.type());
        paddle::framework::TensorCopy(main_tensor, cpu, t);
      }
    }
    // Called once per variable, after either branch, before moving on to the
    // next name.
    member_->nccl_ctxs_->WaitAll();
  }
#else
  PADDLE_THROW("Not compiled with CUDA");
#endif
}

// Runs one step of the executor.
//
//   fetch_tensors     names passed to the underlying executor's Run().
//   fetched_var_name  name of the variable in the global scope that receives
//                     the returned fetch data as a FeedFetchList.
//   feed_tensors      input tensors, distributed over the local scopes by
//                     SplitTensorToPlaces() before the run.
void ParallelExecutor::Run(
    const std::vector<std::string> &fetch_tensors,
    const std::string &fetched_var_name,
    const std::unordered_map<std::string, LoDTensor> &feed_tensors) {
  // Scoped object from platform/profiler.h; stays alive until this function
  // returns.
  platform::RecordBlock profile_guard(0);

  SplitTensorToPlaces(feed_tensors);

  auto fetched = member_->executor_->Run(fetch_tensors);

  // Look up (or create) the destination variable only after the run, then
  // store a copy of the results in it.
  auto *result_var = member_->global_scope_->Var(fetched_var_name);
  auto *result_list = result_var->GetMutable<FeedFetchList>();
  *result_list = fetched;
}

// Splits every tensor in `feed_tensors` across the executor's places and
// makes the variable of the same name in each local scope share the data of
// its piece.
//
// Throws via PADDLE_ENFORCE_EQ when SplitLoDTensor yields a different number
// of pieces than there are places.
void ParallelExecutor::SplitTensorToPlaces(
    const std::unordered_map<std::string, LoDTensor> &feed_tensors) {
  // Bind by const reference: iterating by value would copy the name string
  // and the LoDTensor object of every entry.
  // NOTE(review): this relies on LoDTensor::SplitLoDTensor being
  // const-qualified -- confirm against lod_tensor.h.
  for (const auto &feed : feed_tensors) {
    auto lod_tensors = feed.second.SplitLoDTensor(member_->places_);
    PADDLE_ENFORCE_EQ(
        member_->places_.size(), lod_tensors.size(),
        "The number of samples of current batch is less than the count of "
        "devices, currently, it is not allowed. (%d vs %d)",
        member_->places_.size(), lod_tensors.size());
    for (size_t j = 0; j < member_->places_.size(); ++j) {
      // TODO(panxy0718): Do I need to delete this var?
      member_->local_scopes_[j]
          ->Var(feed.first)
          ->GetMutable<LoDTensor>()
          ->ShareDataWith(lod_tensors[j]);
    }
  }
}

}  // namespace framework
}  // namespace paddle
init commit 7 years ago			`/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.`

			`Licensed under the Apache License, Version 2.0 (the "License");`
			`you may not use this file except in compliance with the License.`
			`You may obtain a copy of the License at`

			`http://www.apache.org/licenses/LICENSE-2.0`

			`Unless required by applicable law or agreed to in writing, software`
			`distributed under the License is distributed on an "AS IS" BASIS,`
			`WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`See the License for the specific language governing permissions and`
			`limitations under the License. */`

			`#include "paddle/fluid/framework/parallel_executor.h"`
fix mac compile 7 years ago
refine parallel 7 years ago			`#include <string>`
fix mac compile 7 years ago			`#include <vector>`
Extract Executors to indie modules 7 years ago
Fix CPU compile 7 years ago			`#ifdef PADDLE_WITH_CUDA`
extract multi devices graph builder 7 years ago			`#include "paddle/fluid/platform/nccl_helper.h"`
Fix CPU compile 7 years ago			`#endif`
init commit 7 years ago
Extract Executors to indie modules 7 years ago			`#include "paddle/fluid/framework/details/multi_devices_graph_builder.h"`
			`#include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h"`
Support testing during training by ParallelExecutor. (#9738) * Support testing during training by ParallelExecutor. * Add unit test. * Improve the interface. * Follow comments. 7 years ago			`#include "paddle/fluid/platform/profiler.h"`
Extract Executors to indie modules 7 years ago
init commit 7 years ago			`namespace paddle {`
ParallelExecutor And dependency engine 7 years ago			`namespace framework {`

Extract GraphExecutor 7 years ago			`class ParallelExecutorPrivate {`
			`public:`
			`explicit ParallelExecutorPrivate(const std::vector<platform::Place> &places)`
Clean code 7 years ago			`: places_(places) {}`
Extract GraphExecutor 7 years ago
			`std::vector<platform::Place> places_;`
			`std::vector<Scope *> local_scopes_;`
			`Scope *global_scope_;`
Clean code 7 years ago			`std::unique_ptr<details::SSAGraphExecutor> executor_;`
Extract GraphExecutor 7 years ago
Clean code 7 years ago			`#ifdef PADDLE_WITH_CUDA`
Extract GraphExecutor 7 years ago			`std::unique_ptr<platform::NCCLContextMap> nccl_ctxs_;`
Clean code 7 years ago			`#endif`
Add ncclAllReduce 7 years ago			`};`

Support testing during training by ParallelExecutor. (#9738) * Support testing during training by ParallelExecutor. * Add unit test. * Improve the interface. * Follow comments. 7 years ago			`std::vector<Scope *> &ParallelExecutor::GetLocalScopes() {`
			`return member_->local_scopes_;`
			`}`

ParallelExecutor And dependency engine 7 years ago			`ParallelExecutor::ParallelExecutor(`
Add simple python wrapper for ParallelExecutor 7 years ago			`size_t num_threads, bool use_event,`
			`const std::vector<platform::Place> &places,`
ParallelExecutor And dependency engine 7 years ago			`const std::unordered_set<std::string> &params,`
Support testing during training by ParallelExecutor. (#9738) * Support testing during training by ParallelExecutor. * Add unit test. * Improve the interface. * Follow comments. 7 years ago			`const std::unordered_set<std::string> &bcast_vars,`
			`const ProgramDesc &main_program, const std::string &loss_var_name,`
			`Scope scope, const std::vector<Scope > &local_scopes, bool allow_op_delay)`
Extract GraphExecutor 7 years ago			`: member_(new ParallelExecutorPrivate(places)) {`
Complete fetch op 7 years ago			`member_->global_scope_ = scope;`
AddInput/AddOutput for OpHandle 7 years ago
Support testing during training by ParallelExecutor. (#9738) * Support testing during training by ParallelExecutor. * Add unit test. * Improve the interface. * Follow comments. 7 years ago			`// Step 1. Bcast the params to devs.`
ParallelExecutor And dependency engine 7 years ago			`// Create local scopes`
Support testing during training by ParallelExecutor. (#9738) * Support testing during training by ParallelExecutor. * Add unit test. * Improve the interface. * Follow comments. 7 years ago			`if (local_scopes.empty()) {`
			`for (size_t i = 0; i < member_->places_.size(); ++i) {`
			`member_->local_scopes_.push_back(&scope->NewScope());`
			`}`
			`} else {`
			`PADDLE_ENFORCE_EQ(member_->places_.size(), local_scopes.size());`
			`for (size_t i = 0; i < member_->places_.size(); ++i) {`
			`member_->local_scopes_.push_back(local_scopes[i]);`
			`}`
ParallelExecutor And dependency engine 7 years ago			`}`

Clean code 7 years ago			`// Bcast Parameters to all GPUs`
			`#ifdef PADDLE_WITH_CUDA`
			`member_->nccl_ctxs_.reset(new platform::NCCLContextMap(member_->places_));`
			`#endif`
Support testing during training by ParallelExecutor. (#9738) * Support testing during training by ParallelExecutor. * Add unit test. * Improve the interface. * Follow comments. 7 years ago			`if (platform::is_gpu_place(places[0]) && member_->local_scopes_.size() != 1 &&`
			`local_scopes.empty()) { // Is CUDA`
			`BCastParamsToGPUs(bcast_vars);`
ParallelExecutor And dependency engine 7 years ago			`}`
Fix CPU compile 7 years ago			`// Startup Program has been run. All local scopes has correct parameters.`
ParallelExecutor And dependency engine 7 years ago
Fix CPU compile 7 years ago			`// Step 2. Convert main_program to SSA form and dependency graph. Also, insert`
			`// ncclOp`
			`#ifdef PADDLE_WITH_CUDA`
extract multi devices graph builder 7 years ago			`details::MultiDevSSAGraphBuilder builder(member_->places_, loss_var_name,`
			`params, member_->local_scopes_,`
			`member_->nccl_ctxs_.get());`
Fix CPU compile 7 years ago			`#else`
			`details::MultiDevSSAGraphBuilder builder(member_->places_, loss_var_name,`
			`params, member_->local_scopes_);`
			`#endif`
Make executor steal graph inside 7 years ago			`auto graph = builder.Build(main_program);`
Single GPU ParallelExecutor complete 7 years ago
Extract Executors to indie modules 7 years ago			`member_->executor_.reset(new details::ThreadedSSAGraphExecutor(`
Add enable/disable for delayed ops 7 years ago			`num_threads, use_event, member_->local_scopes_, places, std::move(graph),`
			`allow_op_delay));`
Extract GraphExecutor 7 years ago
Single GPU ParallelExecutor complete 7 years ago			`// Step 3. Create vars in each scope;`
Refactor local_scopes 7 years ago			`for (auto *scope : member_->local_scopes_) {`
Single GPU ParallelExecutor complete 7 years ago			`for (auto *var : main_program.Block(0).AllVars()) {`
			`if (scope->FindVar(var->Name()) != nullptr) {`
			`continue;`
			`}`

			`InitializeVariable(scope->Var(var->Name()), var->GetType());`
			`}`
			`}`
ParallelExecutor And dependency engine 7 years ago			`}`

			`void ParallelExecutor::BCastParamsToGPUs(`
Support testing during training by ParallelExecutor. (#9738) * Support testing during training by ParallelExecutor. * Add unit test. * Improve the interface. * Follow comments. 7 years ago			`const std::unordered_set<std::string> &vars) const {`
Polish code 7 years ago			`#ifdef PADDLE_WITH_CUDA`
Refactor local_scopes 7 years ago			`auto *main_scope = member_->local_scopes_[0];`
Polish code 7 years ago
Support testing during training by ParallelExecutor. (#9738) * Support testing during training by ParallelExecutor. * Add unit test. * Improve the interface. * Follow comments. 7 years ago			`for (auto &var : vars) {`
			`auto *main_var = main_scope->FindVar(var);`
			`if (!main_var->IsType<LoDTensor>()) {`
			`continue;`
			`}`

			`auto &main_tensor = main_var->Get<LoDTensor>();`

			`auto &dims = main_tensor.dims();`

			`if (paddle::platform::is_gpu_place(main_tensor.place())) {`
			`size_t numel = main_tensor.numel();`
			`ncclDataType_t data_type = platform::ToNCCLDataType(main_tensor.type());`
			`platform::NCCLGroupGuard guard;`
			`for (size_t i = 0; i < member_->places_.size(); ++i) {`
			`auto place = member_->places_[i];`
			`void *buffer;`
			`if (i == 0) {`
			`buffer = const_cast<void *>(main_tensor.data<void>());`
			`} else {`
Refactor local_scopes 7 years ago			`auto local_scope = member_->local_scopes_[i];`
Support testing during training by ParallelExecutor. (#9738) * Support testing during training by ParallelExecutor. * Add unit test. * Improve the interface. * Follow comments. 7 years ago			`auto *t = local_scope->Var(var)->GetMutable<LoDTensor>();`
Update 7 years ago			`t->Resize(dims);`
Support testing during training by ParallelExecutor. (#9738) * Support testing during training by ParallelExecutor. * Add unit test. * Improve the interface. * Follow comments. 7 years ago			`buffer = t->mutable_data(place, main_tensor.type());`
Update 7 years ago			`}`
Support testing during training by ParallelExecutor. (#9738) * Support testing during training by ParallelExecutor. * Add unit test. * Improve the interface. * Follow comments. 7 years ago			`auto &nccl_ctx = member_->nccl_ctxs_->at(place);`
			`platform::dynload::ncclBcast(buffer, numel, data_type, 0,`
			`nccl_ctx.comm_, nccl_ctx.stream());`
			`}`
			`} else {`
			`platform::CPUPlace cpu;`
			`for (size_t i = 1; i < member_->places_.size(); ++i) {`
			`auto local_scope = member_->local_scopes_[i];`
			`auto *t = local_scope->Var(var)->GetMutable<LoDTensor>();`
			`t->Resize(dims);`
			`t->mutable_data(cpu, main_tensor.type());`
			`paddle::framework::TensorCopy(main_tensor, cpu, t);`
ParallelExecutor And dependency engine 7 years ago			`}`
Stash 7 years ago			`}`
Extract NCCLCtxMap 7 years ago			`member_->nccl_ctxs_->WaitAll();`
Stash 7 years ago			`}`
Polish code 7 years ago			`#else`
			`PADDLE_THROW("Not compiled with CUDA");`
			`#endif`
			`}`
ParallelExecutor And dependency engine 7 years ago
Add feed to ParallelExecutor 7 years ago			`void ParallelExecutor::Run(`
			`const std::vector<std::string> &fetch_tensors,`
			`const std::string &fetched_var_name,`
			`const std::unordered_map<std::string, LoDTensor> &feed_tensors) {`
Improve ParallelExecutor performance 7 years ago			`platform::RecordBlock b(0);`
Add feed to ParallelExecutor 7 years ago			`SplitTensorToPlaces(feed_tensors);`
Make executor steal graph inside 7 years ago			`auto fetch_data = member_->executor_->Run(fetch_tensors);`
			`*member_->global_scope_->Var(fetched_var_name)->GetMutable<FeedFetchList>() =`
			`fetch_data;`
ParallelExecutor And dependency engine 7 years ago			`}`
Single GPU ParallelExecutor complete 7 years ago
Add feed to ParallelExecutor 7 years ago			`void ParallelExecutor::SplitTensorToPlaces(`
			`const std::unordered_map<std::string, LoDTensor> &feed_tensors) {`
			`for (auto it : feed_tensors) {`
			`auto lod_tensors = it.second.SplitLoDTensor(member_->places_);`
when the number of samples of current batch is less than the count of devices, let it crash. 7 years ago			`PADDLE_ENFORCE_EQ(`
			`member_->places_.size(), lod_tensors.size(),`
			`"The number of samples of current batch is less than the count of "`
			`"devices, currently, it is not allowed. (%d vs %d)",`
			`member_->places_.size(), lod_tensors.size());`
Add feed to ParallelExecutor 7 years ago			`for (size_t j = 0; j < member_->places_.size(); ++j) {`
			`// TODO(panxy0718): Do I need to delete this var?`
			`member_->local_scopes_[j]`
			`->Var(it.first)`
			`->GetMutable<LoDTensor>()`
			`->ShareDataWith(lod_tensors[j]);`
			`}`
			`}`
			`}`

ParallelExecutor And dependency engine 7 years ago			`} // namespace framework`
init commit 7 years ago			`} // namespace paddle`