"move Tensor to LoDTensor"

8 years ago · 5200c657a7
parent 63fb41b399
commit 5200c657a7
4 changed files with 186 additions and 105 deletions
--- a/paddle/operators/nccl_op.cc
+++ b/paddle/operators/nccl_op.cc
@ -74,8 +74,15 @@ class NCCLAllReduceOp : public framework::OperatorWithKernel {
    //                 reduction == "ncclMin" || reduction == "ncclMax"),
    //                "invalid reduction.");

+    // auto in_dim = x_dims[0];
    ctx->SetOutputsDim("Out", x_dims);
    ctx->ShareLoD("X", /*->*/ "Out");
+    size_t N = x_dims.size();
+    auto out_dims = ctx->GetOutputsDim("Out");
+    for (size_t i = 0; i < N; ++i) {
+      VLOG(1) << " inference (X) " << framework::product(x_dims[i]) << " (Out)"
+              << framework::product(out_dims[i]);
+    }
  }
 };

--- a/paddle/operators/nccl_op.cu
+++ b/paddle/operators/nccl_op.cu
@ -12,6 +12,7 @@ limitations under the License. */
 #define EIGEN_USE_GPU
 #include <functional>

+#include "paddle/framework/lod_tensor.h"
 #include "paddle/framework/op_registry.h"
 #include "paddle/operators/nccl/nccl_gpu_common.h"

@ -20,6 +21,7 @@ namespace operators {

 using framework::Tensor;
 using platform::Communicator;
+using framework::LoDTensor;

 template <typename Type>
 class NCCLTypeWrapper;
@ -43,8 +45,8 @@ class NCCLAllReduceKernel : public framework::OpKernel<T> {
    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
                   "This kernel only runs on GPU device.");

-    auto ins = ctx.MultiInput<Tensor>("X");
-    auto outs = ctx.MultiOutput<Tensor>("Out");
+    auto ins = ctx.MultiInput<LoDTensor>("X");
+    auto outs = ctx.MultiOutput<LoDTensor>("Out");

    auto* comm = ctx.Input<Communicator>("Communicator");

@ -56,12 +58,24 @@ class NCCLAllReduceKernel : public framework::OpKernel<T> {
        boost::get<platform::GPUPlace>(ctx.GetPlace()).GetDeviceId();
    int idx = comm->GetCommId(device_id);

+    size_t N = ins.size();
+    for (size_t i = 0; i < N; ++i) {
+      VLOG(1) << " inference (X) " << framework::product(ins[i]->dims())
+              << " (Out)" << framework::product(outs[i]->dims());
+    }
+
    for (size_t i = 0; i < ins.size(); ++i) {
+      VLOG(1) << " invoke allreduce. send " << ins[i]->numel() << " recv "
+              << outs[i]->numel();
+
      PADDLE_ENFORCE(platform::dynload::ncclAllReduce(
          ins[i]->data<T>(), outs[i]->mutable_data<T>(ctx.GetPlace()),
-          outs[i]->numel() * sizeof(T), NCCLTypeWrapper<T>::type, ncclSum,
+          outs[i]->numel(), NCCLTypeWrapper<T>::type, ncclSum,
          comm->comms_[idx], stream));
      PADDLE_ENFORCE(cudaStreamSynchronize(stream));
+
+      VLOG(1) << " finished allreduce. send " << ins[i]->numel() << " recv "
+              << outs[i]->numel();
    }
  }
 };
--- a/paddle/operators/nccl_op.h
+++ b/paddle/operators/nccl_op.h
@ -1,50 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-   http://www.apache.org/licenses/LICENSE-2.0
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
-
-#pragma once
-#include "paddle/framework/op_registry.h"
-#include "paddle/operators/nccl/nccl_gpu_common.h"
-
-#include <string.h>
-
-namespace paddle {
-namespace operators {
-
-using framework::Tensor;
-using platform::Communicator;
-
-template <typename Type>
-class NCCLTypeWrapper;
-
-template <>
-class NCCLTypeWrapper<float> {
- public:
-  static const ncclDataType_t type = ncclFloat;
-};
-
-template <>
-class NCCLTypeWrapper<double> {
- public:
-  static const ncclDataType_t type = ncclDouble;
-};
-
-template <typename T>
-class NCCLInitKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    std::vector<int> gpus = ctx.Attr<std::vector<int>>("gpus");
-    auto* comm = ctx.Output<Communicator>("Communicator");
-    comm->InitAll(gpus);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
--- a/paddle/operators/nccl_op_test.cu
+++ b/paddle/operators/nccl_op_test.cu