parent 3621d9a3d7
commit 0621c327f1
@@ -0,0 +1,52 @@
# ParallelExecutor Design Doc

## Introduction

We introduce `ParallelExecutor` to run multi-GPU training in PaddlePaddle Fluid. It supports

1. keeping a copy of the parameters on each GPU
1. allreduce on a separate stream, allowing computation and communication to overlap

An example of switching single-GPU training to multiple GPUs:

```python
import paddle.fluid as fluid

avg_cost = your_neural_network()
opt = fluid.optimizer.SGDOptimizer(learning_rate=0.001)
opt.minimize(avg_cost)

# change Executor -> ParallelExecutor
exe = fluid.ParallelExecutor(gpu_list=[0, 1])

for i in xrange(iter_num):
    exe.run()
```

## Design

In the constructor, a list of parameters whose gradients need to be allreduced is given.

At runtime, `ParallelExecutor` starts `#gpu` threads, one `Executor` per GPU. Every
operator run on a GPU automatically syncs with the other streams when necessary:

```c++
// if op's input is params' grad:
// sync with allreduce stream
// e.g. sgd should wait for allreduce to be finished
SyncMultipleStreams(op);

op->Run(*local_scope, place_);

// if op's output is params' grad:
// sync with computation stream
// e.g. allreduce should wait for fc_grad to be finished
SyncMultipleStreams(op);
```
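
To make the threading model concrete, here is a minimal sketch of how `ParallelExecutor::Run` could dispatch one worker thread per GPU. It is an illustration under assumptions, not the final implementation; the member names (`exes_`, `scopes_`) come from the draft header in this PR, and stream synchronization is left to `SyncMultipleStreams` as above.

```c++
#include <thread>
#include <vector>

void ParallelExecutor::Run(const ProgramDesc& prog, Scope* scope, int block_id,
                           bool create_local_scope, bool create_vars) {
  // The caller's scope is not used directly; each GPU runs in its own scope
  // (see the API section below).
  std::vector<std::thread> workers;
  for (size_t i = 0; i < exes_.size(); ++i) {
    // One thread per GPU; each Executor runs the same block on its own device.
    workers.emplace_back([&, i] {
      exes_[i].Run(prog, scopes_[i], block_id, create_local_scope, create_vars);
    });
  }
  for (auto& worker : workers) {
    worker.join();
  }
}
```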

## API

`ParallelExecutor.run` has a similar interface to `Executor.run`, with the following differences
(see the usage sketch after this list):

1. Scope: we don't expose `scope` in `ParallelExecutor.run`, since `ParallelExecutor` has its
   own scope to maintain NCCL.
1. Feed: we don't expose `feed` in the API either, because the whole point of implementing
   `ParallelExecutor` is speed. The input for the NN should be provided by a reader OP.
1. Fetch: we return the fetched values on all GPUs as a list, e.g. `exe.run(..., fetch=loss)`
   will return `[loss_on_gpu0, loss_on_gpu1]`.
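
For example, a hypothetical training loop under the proposed API (assuming `loss` is the fetch target built by the network and `iter_num` is defined as in the introduction):

```python
exe = fluid.ParallelExecutor(gpu_list=[0, 1])

for i in xrange(iter_num):
    # no feed argument: input comes from a reader OP inside the program
    loss_per_gpu = exe.run(fetch=loss)
    # one fetched value per GPU, e.g. [loss_on_gpu0, loss_on_gpu1]
    print(loss_per_gpu)
```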

@@ -0,0 +1,19 @@
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/framework/parallel_executor.h"

namespace paddle {
namespace framework {}  // namespace framework
}  // namespace paddle
@@ -0,0 +1,61 @@
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once

#include <unordered_set>

#include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/framework/op_info.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/tensor.h"

#include "paddle/fluid/operators/nccl/nccl_gpu_common.h"
#include "paddle/fluid/platform/device_context.h"

namespace paddle {
namespace framework {

// Callback applied around each operator's run; allreduces the gradients of
// the parameters listed in param_grad_names_.
struct AllReduceCallBack {
  void operator()(framework::OperatorBase* op);

  std::unordered_set<std::string> param_grad_names_;
  platform::DeviceContext* dev_ctx;  // context holding the allreduce stream
};

class ParallelExecutor {
 public:
  explicit ParallelExecutor(const std::vector<platform::Place>& places,
                            const std::unordered_set<std::string>& params);

  /* @Brief
   * Runtime evaluation of the given ProgramDesc under a certain Scope
   *
   * @param
   *  ProgramDesc
   *  Scope
   */
  void Run(const ProgramDesc& prog, Scope* scope, int block_id,
           bool create_local_scope = true, bool create_vars = true);

 private:
  std::vector<framework::Executor> exes_;
  std::vector<framework::Scope*> scopes_;
  AllReduceCallBack all_reduce_callbacks_;
  std::unordered_set<std::string> params_;  // where to initialize it?
  platform::Communicator nccl_com_;
};

}  // namespace framework
}  // namespace paddle