Paddle/paddle/fluid/framework/details/all_reduce_op_handle.cc

//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <algorithm>

#include "paddle/fluid/framework/details/all_reduce_op_handle.h"
#include "paddle/fluid/framework/details/container_cast.h"
#include "paddle/fluid/framework/details/reduce_and_gather.h"
#include "paddle/fluid/framework/details/variable_visitor.h"
#include "paddle/fluid/platform/profiler.h"

namespace paddle {
namespace framework {
namespace details {

#ifdef PADDLE_WITH_CUDA
// CUDA build constructor.
//
// Keeps the per-device scopes, the places and the (possibly null) NCCL
// context map. When a context map is supplied, the device context it returns
// for each place is registered on this handle via SetDeviceContext.
AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
                                     const std::vector<Scope *> &local_scopes,
                                     const std::vector<platform::Place> &places,
                                     const platform::NCCLContextMap *ctxs)
    : OpHandleBase(node),
      local_scopes_(local_scopes),
      places_(places),
      nccl_ctxs_(ctxs) {
  // No NCCL contexts were given, so there is nothing to bind here.
  if (!nccl_ctxs_) {
    return;
  }

  for (const auto &place : places_) {
    SetDeviceContext(place, nccl_ctxs_->DevCtx(place));
  }
}
#else
// Constructor used when Paddle is built without CUDA.
//
// Only stores the per-device scopes and the places; this constructor does not
// register any device context.
AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
                                     const std::vector<Scope *> &local_scopes,
                                     const std::vector<platform::Place> &places)
    : OpHandleBase(node),
      local_scopes_(local_scopes),
      places_(places) {
}
#endif

void AllReduceOpHandle::RunImpl() {
  platform::RecordEvent record_event(Name(), dev_ctxes_.cbegin()->second);

  if (NoDummyInputSize() == 1) {
    return;  // No need to all reduce when GPU count = 1;
  } else {
    // Wait input done
    WaitInputVarGenerated();
    auto in_var_handles = DynamicCast<VarHandle>(this->Inputs());
    auto out_var_handles = DynamicCast<VarHandle>(this->Outputs());
    PADDLE_ENFORCE_EQ(
        in_var_handles.size(), places_.size(),
        "The NoDummyInputSize should be equal to the number of places.");
    PADDLE_ENFORCE_EQ(
        in_var_handles.size(), out_var_handles.size(),
        "The NoDummyInputSize and NoDummyOutputSize should be equal.");

    std::vector<const LoDTensor *> lod_tensors;
    for (size_t i = 0; i < local_scopes_.size(); ++i) {
      auto *s = local_scopes_[i];
      auto &local_scope = *s->FindVar(kLocalExecScopeName)->Get<Scope *>();
      auto &lod_tensor =
          local_scope.FindVar(in_var_handles[i]->name_)->Get<LoDTensor>();
      lod_tensors.emplace_back(&lod_tensor);
      PADDLE_ENFORCE_EQ(in_var_handles[i]->name_, out_var_handles[i]->name_,
                        "The name of input and output should be equal.");
    }

    if (platform::is_gpu_place(lod_tensors[0]->place())) {
#ifdef PADDLE_WITH_CUDA
      PADDLE_ENFORCE(nccl_ctxs_, "nccl_ctxs should not be nullptr.");
      int dtype = -1;
      size_t numel = 0;
      std::vector<std::function<void()>> all_reduce_calls;
      for (size_t i = 0; i < local_scopes_.size(); ++i) {
        auto &p = places_[i];
        auto &lod_tensor = *lod_tensors[i];
        void *buffer = const_cast<void *>(lod_tensor.data<void>());

        if (dtype == -1) {
          dtype = platform::ToNCCLDataType(lod_tensor.type());
        }

        if (numel == 0) {
          numel = static_cast<size_t>(lod_tensor.numel());
        }

        int dev_id = boost::get<platform::CUDAPlace>(p).device;
        auto &nccl_ctx = nccl_ctxs_->at(dev_id);
        auto stream = nccl_ctx.stream();
        auto comm = nccl_ctx.comm_;
        all_reduce_calls.emplace_back([=] {
          PADDLE_ENFORCE(platform::dynload::ncclAllReduce(
              buffer, buffer, numel, static_cast<ncclDataType_t>(dtype),
              ncclSum, comm, stream));
        });
      }
      this->RunAndRecordEvent([&] {
        platform::NCCLGroupGuard guard;
        for (auto &call : all_reduce_calls) {
          call();
        }
      });
#else
      PADDLE_THROW("Not compiled with CUDA");
#endif
    } else {  // Special handle CPU only Operator's gradient. Like CRF
      auto &trg = *this->local_scopes_[0]
                       ->FindVar(kLocalExecScopeName)
                       ->Get<Scope *>()
                       ->FindVar(out_var_handles[0]->name_)
                       ->GetMutable<framework::LoDTensor>();

      // Reduce All Tensor to trg in CPU
      ReduceLoDTensor func(lod_tensors, &trg);
      VisitDataType(ToDataType(lod_tensors[0]->type()), func);

      for (size_t i = 1; i < local_scopes_.size(); ++i) {
        auto &scope =
            *local_scopes_[i]->FindVar(kLocalExecScopeName)->Get<Scope *>();
        auto &p = places_[i];
        auto *var = scope.FindVar(out_var_handles[i]->name_);
        auto *dev_ctx = dev_ctxes_.at(p);

        RunAndRecordEvent(p, [&trg, var, dev_ctx, p] {
          auto &tensor_gpu = *var->GetMutable<framework::LoDTensor>();
          auto &tensor_cpu = trg;
          TensorCopy(tensor_cpu, p, *dev_ctx, &tensor_gpu);
        });
      }
    }
  }
}

// Returns the fixed name of this op handle; RunImpl passes it to the
// profiler's RecordEvent.
std::string AllReduceOpHandle::Name() const {
  return "all_reduce";
}
}  // namespace details
}  // namespace framework
}  // namespace paddle
Reorganize Code 7 years ago			`// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.`
			`//`
			`// Licensed under the Apache License, Version 2.0 (the "License");`
			`// you may not use this file except in compliance with the License.`
			`// You may obtain a copy of the License at`
			`//`
			`// http://www.apache.org/licenses/LICENSE-2.0`
			`//`
			`// Unless required by applicable law or agreed to in writing, software`
			`// distributed under the License is distributed on an "AS IS" BASIS,`
			`// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`// See the License for the specific language governing permissions and`
			`// limitations under the License.`
add fuse var op handle 7 years ago			`#include <algorithm>`
Reorganize Code 7 years ago
nccl_all_reduce_op_handle => all_reduce_op_handle 7 years ago			`#include "paddle/fluid/framework/details/all_reduce_op_handle.h"`
add fuse var op handle 7 years ago			`#include "paddle/fluid/framework/details/container_cast.h"`
make unit test to work 7 years ago			`#include "paddle/fluid/framework/details/reduce_and_gather.h"`
add fuse var op handle 7 years ago			`#include "paddle/fluid/framework/details/variable_visitor.h"`
make profiler use thread_id from g_thread_id Add a few more RecordEvent. Cleanup 7 years ago			`#include "paddle/fluid/platform/profiler.h"`
Stash 7 years ago
Reorganize Code 7 years ago			`namespace paddle {`
			`namespace framework {`
			`namespace details {`
fix in c++ side 7 years ago
			`#ifdef PADDLE_WITH_CUDA`
op compose node and update nodes. 7 years ago			`AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,`
			`const std::vector<Scope *> &local_scopes,`
nccl_all_reduce_op_handle => all_reduce_op_handle 7 years ago			`const std::vector<platform::Place> &places,`
			`const platform::NCCLContextMap *ctxs)`
op compose node and update nodes. 7 years ago			`: OpHandleBase(node),`
			`local_scopes_(local_scopes),`
			`places_(places),`
			`nccl_ctxs_(ctxs) {`
nccl_all_reduce_op_handle => all_reduce_op_handle 7 years ago			`if (nccl_ctxs_) {`
fix in c++ side 7 years ago			`for (auto &p : places_) {`
refine op_handle (#14178) test=develop 6 years ago			`this->SetDeviceContext(p, nccl_ctxs_->DevCtx(p));`
fix in c++ side 7 years ago			`}`
Reorganize Code 7 years ago			`}`
			`}`
fix in c++ side 7 years ago			`#else`
op compose node and update nodes. 7 years ago			`AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,`
			`const std::vector<Scope *> &local_scopes,`
nccl_all_reduce_op_handle => all_reduce_op_handle 7 years ago			`const std::vector<platform::Place> &places)`
op compose node and update nodes. 7 years ago			`: OpHandleBase(node), local_scopes_(local_scopes), places_(places) {}`
fix in c++ side 7 years ago			`#endif`
Reorganize Code 7 years ago
nccl_all_reduce_op_handle => all_reduce_op_handle 7 years ago			`void AllReduceOpHandle::RunImpl() {`
refine op_handle (#14178) test=develop 6 years ago			`platform::RecordEvent record_event(Name(), dev_ctxes_.cbegin()->second);`
move bcast op into pass 7 years ago
add fuse var op handle 7 years ago			`if (NoDummyInputSize() == 1) {`
Reorganize Code 7 years ago			`return; // No need to all reduce when GPU count = 1;`
			`} else {`
			`// Wait input done`
refine pe 7 years ago			`WaitInputVarGenerated();`
add fuse var op handle 7 years ago			`auto in_var_handles = DynamicCast<VarHandle>(this->Inputs());`
			`auto out_var_handles = DynamicCast<VarHandle>(this->Outputs());`
			`PADDLE_ENFORCE_EQ(`
			`in_var_handles.size(), places_.size(),`
			`"The NoDummyInputSize should be equal to the number of places.");`
			`PADDLE_ENFORCE_EQ(`
			`in_var_handles.size(), out_var_handles.size(),`
			`"The NoDummyInputSize and NoDummyOutputSize should be equal.");`
Reorganize Code 7 years ago
fix scope of gather broadcast 7 years ago			`std::vector<const LoDTensor *> lod_tensors;`
Reorganize Code 7 years ago			`for (size_t i = 0; i < local_scopes_.size(); ++i) {`
			`auto *s = local_scopes_[i];`
DebugCode 7 years ago			`auto &local_scope = s->FindVar(kLocalExecScopeName)->Get<Scope >();`
add fuse var op handle 7 years ago			`auto &lod_tensor =`
			`local_scope.FindVar(in_var_handles[i]->name_)->Get<LoDTensor>();`
fix scope of gather broadcast 7 years ago			`lod_tensors.emplace_back(&lod_tensor);`
add fuse var op handle 7 years ago			`PADDLE_ENFORCE_EQ(in_var_handles[i]->name_, out_var_handles[i]->name_,`
			`"The name of input and output should be equal.");`
Stash 7 years ago			`}`

fix scope of gather broadcast 7 years ago			`if (platform::is_gpu_place(lod_tensors[0]->place())) {`
fix in c++ side 7 years ago			`#ifdef PADDLE_WITH_CUDA`
add cpu test 7 years ago			`PADDLE_ENFORCE(nccl_ctxs_, "nccl_ctxs should not be nullptr.");`
add fuse var op handle 7 years ago			`int dtype = -1;`
			`size_t numel = 0;`
Stash 7 years ago			`std::vector<std::function<void()>> all_reduce_calls;`
			`for (size_t i = 0; i < local_scopes_.size(); ++i) {`
			`auto &p = places_[i];`
fix scope of gather broadcast 7 years ago			`auto &lod_tensor = *lod_tensors[i];`
Stash 7 years ago			`void buffer = const_cast<void >(lod_tensor.data<void>());`
Reorganize Code 7 years ago
Stash 7 years ago			`if (dtype == -1) {`
			`dtype = platform::ToNCCLDataType(lod_tensor.type());`
			`}`

			`if (numel == 0) {`
			`numel = static_cast<size_t>(lod_tensor.numel());`
			`}`

			`int dev_id = boost::get<platform::CUDAPlace>(p).device;`
fix in c++ side 7 years ago			`auto &nccl_ctx = nccl_ctxs_->at(dev_id);`
Stash 7 years ago			`auto stream = nccl_ctx.stream();`
			`auto comm = nccl_ctx.comm_;`
			`all_reduce_calls.emplace_back([=] {`
			`PADDLE_ENFORCE(platform::dynload::ncclAllReduce(`
			`buffer, buffer, numel, static_cast<ncclDataType_t>(dtype),`
			`ncclSum, comm, stream));`
			`});`
Reorganize Code 7 years ago			`}`
Merge branch 'feature/fix_transformer_hang' into feature/mix_cpu_gpu_op 7 years ago			`this->RunAndRecordEvent([&] {`
			`platform::NCCLGroupGuard guard;`
			`for (auto &call : all_reduce_calls) {`
			`call();`
			`}`
			`});`
fix in c++ side 7 years ago			`#else`
			`PADDLE_THROW("Not compiled with CUDA");`
			`#endif`
Stash 7 years ago			`} else { // Special handle CPU only Operator's gradient. Like CRF`
DebugCode 7 years ago			`auto &trg = *this->local_scopes_[0]`
			`->FindVar(kLocalExecScopeName)`
			`->Get<Scope *>()`
Add cpu test for parallel_executor_crf executor_fetch_feed, and enable these tests 7 years ago			`->FindVar(out_var_handles[0]->name_)`
DebugCode 7 years ago			`->GetMutable<framework::LoDTensor>();`
Refine allreduce op 7 years ago
Stash 7 years ago			`// Reduce All Tensor to trg in CPU`
			`ReduceLoDTensor func(lod_tensors, &trg);`
fix scope of gather broadcast 7 years ago			`VisitDataType(ToDataType(lod_tensors[0]->type()), func);`
Support CPU/GPU mixture for ParallelExecutor 7 years ago
Add cpu test for parallel_executor_crf executor_fetch_feed, and enable these tests 7 years ago			`for (size_t i = 1; i < local_scopes_.size(); ++i) {`
DebugCode 7 years ago			`auto &scope =`
			`local_scopes_[i]->FindVar(kLocalExecScopeName)->Get<Scope >();`
Support CPU/GPU mixture for ParallelExecutor 7 years ago			`auto &p = places_[i];`
Add cpu test for parallel_executor_crf executor_fetch_feed, and enable these tests 7 years ago			`auto *var = scope.FindVar(out_var_handles[i]->name_);`
refine op_handle (#14178) test=develop 6 years ago			`auto *dev_ctx = dev_ctxes_.at(p);`
Support CPU/GPU mixture for ParallelExecutor 7 years ago
			`RunAndRecordEvent(p, [&trg, var, dev_ctx, p] {`
			`auto &tensor_gpu = *var->GetMutable<framework::LoDTensor>();`
			`auto &tensor_cpu = trg;`
			`TensorCopy(tensor_cpu, p, *dev_ctx, &tensor_gpu);`
			`});`
			`}`
Reorganize Code 7 years ago			`}`
			`}`
			`}`
Add Graphviz output 7 years ago
ADD CPU_NUM 7 years ago			`std::string AllReduceOpHandle::Name() const { return "all_reduce"; }`
Reorganize Code 7 years ago			`} // namespace details`
			`} // namespace framework`
			`} // namespace paddle`