add npu kernel for elementwise_sub and elementwise_sub_grad (#30973)
* add npu sub op
* fix typo
* rename test
* fix bug
* fix bug
* add fp16 kernel
* fix typo
* support sub grad op
* support elementwise_sub_grad op

Co-authored-by: frankwhzhang <frankwhzhang@126.com>
parent c687edecd8
commit 5cb20f30fc
@@ -1,87 +0,0 @@
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#ifndef _WIN32
#include <unistd.h>
#endif

#include <string>
#include <thread>  // NOLINT
#include <vector>

#include "gtest/gtest.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/operators/dropout_op.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/string/printf.h"

namespace f = paddle::framework;
namespace p = paddle::platform;
namespace m = paddle::operators::math;

USE_OP(elementwise_add);
USE_OP_DEVICE_KERNEL(elementwise_add, NPU);

void Compare(f::Scope* scope, const p::DeviceContext& ctx) {
  // init
  auto x = scope->Var("X");
  auto tensor_x = x->GetMutable<f::LoDTensor>();

  auto y = scope->Var("Y");
  auto tensor_y = y->GetMutable<f::LoDTensor>();

  std::vector<float> init;
  for (int64_t i = 0; i < 10 * 10; ++i) {
    init.push_back(1.0);
  }

  TensorFromVector(init, ctx, tensor_x);
  tensor_x->Resize({10, 10});
  TensorFromVector(init, ctx, tensor_y);
  tensor_y->Resize({10, 10});

  ctx.Wait();

  auto place = ctx.GetPlace();
  auto out = scope->Var("Out");
  auto tensor_out = out->GetMutable<f::LoDTensor>();
  tensor_out->Resize({10, 10});
  tensor_out->mutable_data<float>(place);  // allocate

  // run
  f::AttributeMap attrs;
  auto op =
      f::OpRegistry::CreateOp("elementwise_add", {{"X", {"X"}}, {"Y", {"Y"}}},
                              {{"Out", {"Out"}}}, attrs);

  op->Run(*scope, place);

  std::vector<float> out_vec;
  TensorToVector(*tensor_out, ctx, &out_vec);

  ctx.Wait();

  EXPECT_EQ(out_vec.size(), init.size());
  for (uint32_t i = 0; i < out_vec.size(); i++) {
    EXPECT_EQ(out_vec[i], 2.0);
  }
}

TEST(elementwise_add, NPU) {
  f::Scope scope;
  p::NPUDeviceContext ctx(p::NPUPlace(0));
  Compare(&scope, ctx);
}
@@ -0,0 +1,181 @@
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#ifndef _WIN32
#include <unistd.h>
#endif

#include <string>
#include <thread>  // NOLINT
#include <vector>

#include "gtest/gtest.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/operators/dropout_op.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/string/printf.h"

namespace f = paddle::framework;
namespace p = paddle::platform;
namespace m = paddle::operators::math;

USE_OP(elementwise_add);
USE_OP_DEVICE_KERNEL(elementwise_add, NPU);
USE_OP(elementwise_sub);
USE_OP_DEVICE_KERNEL(elementwise_sub, NPU);

template <typename T>
void Compare(f::Scope* scope, const p::DeviceContext& ctx,
             std::string op_type) {
  // init
  auto x = scope->Var("X");
  auto tensor_x = x->GetMutable<f::LoDTensor>();

  auto y = scope->Var("Y");
  auto tensor_y = y->GetMutable<f::LoDTensor>();

  std::vector<T> init_x;
  for (int64_t i = 0; i < 10 * 10; ++i) {
    init_x.push_back(static_cast<T>(1.0));
  }

  std::vector<T> init_y;
  for (int64_t i = 0; i < 10 * 10; ++i) {
    init_y.push_back(static_cast<T>(2.0));
  }

  TensorFromVector(init_x, ctx, tensor_x);
  tensor_x->Resize({10, 10});
  TensorFromVector(init_y, ctx, tensor_y);
  tensor_y->Resize({10, 10});

  ctx.Wait();

  auto place = ctx.GetPlace();
  auto out = scope->Var("Out");
  auto tensor_out = out->GetMutable<f::LoDTensor>();

  // run
  f::AttributeMap attrs;
  auto op = f::OpRegistry::CreateOp(op_type, {{"X", {"X"}}, {"Y", {"Y"}}},
                                    {{"Out", {"Out"}}}, attrs);

  op->Run(*scope, place);

  std::vector<T> out_vec;
  TensorToVector(*tensor_out, ctx, &out_vec);

  ctx.Wait();
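  // x is filled with 1.0 and y with 2.0, so element-wise the result is
  // 1 + 2 = 3 for elementwise_add and 1 - 2 = -1 for elementwise_sub.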
  float expected;
  if (op_type == "elementwise_add") {
    expected = 3.0;
  } else if (op_type == "elementwise_sub") {
    expected = -1.0;
  }
  EXPECT_EQ(out_vec.size(), init_x.size());
  for (uint32_t i = 0; i < out_vec.size(); i++) {
    EXPECT_EQ(out_vec[i], static_cast<T>(expected));
  }
}

template <typename T>
void CompareGrad(f::Scope* scope, const p::DeviceContext& ctx,
                 std::string op_type) {
  // init
  auto dout = scope->Var("DOut");
  auto tensor_dout = dout->GetMutable<f::LoDTensor>();
  tensor_dout->Resize({2, 3, 5});

  auto x = scope->Var("X");
  auto tensor_x = x->GetMutable<f::LoDTensor>();
  tensor_x->Resize({2, 3, 5});

  auto y = scope->Var("Y");
  auto tensor_y = y->GetMutable<f::LoDTensor>();
  tensor_y->Resize({1, 5});

  auto dx = scope->Var("DX");
  auto tensor_dx = dx->GetMutable<f::LoDTensor>();

  auto dy = scope->Var("DY");
  auto tensor_dy = dy->GetMutable<f::LoDTensor>();

  std::vector<T> init_dout;
  for (int64_t i = 0; i < tensor_dout->numel(); ++i) {
    init_dout.push_back(static_cast<T>(1.0));
  }

  TensorFromVector(init_dout, ctx, tensor_dout);
  tensor_dout->Resize({2, 3, 5});

  ctx.Wait();

  // run
  f::AttributeMap attrs;
  auto op = f::OpRegistry::CreateOp(
      op_type, {{"Out@GRAD", {"DOut"}}, {"X", {"X"}}, {"Y", {"Y"}}},
      {{"X@GRAD", {"DX"}}, {"Y@GRAD", {"DY"}}}, attrs);

  auto place = ctx.GetPlace();
  op->Run(*scope, place);

  std::vector<T> dx_vec;
  TensorToVector(*tensor_dx, ctx, &dx_vec);

  std::vector<T> dy_vec;
  TensorToVector(*tensor_dy, ctx, &dy_vec);

  ctx.Wait();
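  // dout is all 1.0 with shape [2, 3, 5]; dx matches dout's shape, while
  // each dy element (shape [1, 5]) accumulates 2 * 3 = 6 dout values, with
  // the sign flipped for sub.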
  float expected_x, expected_y;
  if (op_type == "elementwise_add_grad") {
    expected_x = 1.0;
    expected_y = 6.0;
  } else if (op_type == "elementwise_sub_grad") {
    expected_x = 1.0;
    expected_y = -6.0;
  }

  for (uint32_t i = 0; i < dx_vec.size(); i++) {
    EXPECT_EQ(dx_vec[i], static_cast<T>(expected_x));
  }
  for (uint32_t i = 0; i < dy_vec.size(); i++) {
    EXPECT_EQ(dy_vec[i], static_cast<T>(expected_y));
  }
}

TEST(elementwise_add, NPU_fp32) {
  f::Scope scope;
  p::NPUDeviceContext ctx(p::NPUPlace(0));
  Compare<float>(&scope, ctx, "elementwise_add");
}

TEST(elementwise_sub, NPU_fp32) {
  f::Scope scope;
  p::NPUDeviceContext ctx(p::NPUPlace(0));
  Compare<float>(&scope, ctx, "elementwise_sub");
}

TEST(elementwise_sub, NPU_fp16) {
  f::Scope scope;
  p::NPUDeviceContext ctx(p::NPUPlace(0));
  Compare<p::float16>(&scope, ctx, "elementwise_sub");
}

TEST(elementwise_sub_grad, NPU) {
  f::Scope scope;
  p::NPUDeviceContext ctx(p::NPUPlace(0));
  CompareGrad<float>(&scope, ctx, "elementwise_sub_grad");
}
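CompareGrad already computes expectations for elementwise_add_grad (dx = dout, dy = dout reduced), so an add-grad test is a natural extension of this file. A minimal sketch, assuming an NPU elementwise_add_grad kernel exists and is declared via USE_OP_DEVICE_KERNEL(elementwise_add_grad, NPU):

TEST(elementwise_add_grad, NPU) {
  f::Scope scope;
  p::NPUDeviceContext ctx(p::NPUPlace(0));
  CompareGrad<float>(&scope, ctx, "elementwise_add_grad");
}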
@@ -0,0 +1,171 @@
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#ifdef PADDLE_WITH_ASCEND_CL
#include <memory>
#include <string>

#include "paddle/fluid/operators/elementwise/elementwise_sub_op.h"
#include "paddle/fluid/operators/npu_op_runner.h"

namespace paddle {
namespace operators {

using Tensor = framework::Tensor;

template <typename DeviceContext, typename T>
class ElementwiseSubNPUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* x = ctx.Input<Tensor>("X");
    auto* y = ctx.Input<Tensor>("Y");
    auto* out = ctx.Output<Tensor>("Out");

    out->mutable_data<T>(ctx.GetPlace());

    auto runner = NpuOpRunner("Sub", {*x, *y}, {*out}, {});

    auto stream =
        ctx.template device_context<paddle::platform::NPUDeviceContext>()
            .stream();
    runner.Run(stream);
  }
};

template <typename DeviceContext, typename T>
class ElementwiseSubGradNPUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
    auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
    auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));

    dx->mutable_data<T>(ctx.GetPlace());
    dy->mutable_data<T>(ctx.GetPlace());

    // NOTE(zhiqiu): It seems Ascend's Sub follows the broadcast semantics
    // with default axis=-1, so sub_grad should do the reduce if needed.
    // For example, the shape of each variable in elementwise_sub:
    // x, dx: [2, 3, 5]
    // y, dy: [1, 5]
    // out, dout: [2, 3, 5]
    // Then, out = x - y => dx = dout, dy = -dout
    // And the shape of dy can be computed by a two-stage reduce:
    // 1. [2, 3, 5] => [3, 5], ReduceSumD on axis = 0, keep_dims = false.
    // 2. [3, 5] => [1, 5], ReduceSumD on axis = 0, keep_dims = true.
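    // In this example dx already matches dout's shape, so the dx branch
    // below degenerates to a plain copy; the two-stage ReduceSumD path only
    // fires for dy.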

    auto stream =
        ctx.template device_context<paddle::platform::NPUDeviceContext>()
            .stream();
    // For dx
    // stage 1
    auto reduce_ndim = dout->dims().size() - dx->dims().size();
    std::vector<int> axes;
    for (auto i = 0; i < reduce_ndim; ++i) {
      axes.push_back(i);
    }
    Tensor* tmp_dout = const_cast<Tensor*>(dout);
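    // tmp_dout aliases dout when no leading-axis reduction is needed; the
    // const_cast only unifies the pointer type, dout is never written
    // through it.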
    Tensor reduced_dout(dx->type());
    if (axes.size() != 0) {
      std::vector<int64_t> reduced_dout_dims;
      for (auto i = reduce_ndim; i < dout->dims().size(); ++i) {
        reduced_dout_dims.push_back(dout->dims()[i]);
      }
      reduced_dout.Resize(framework::make_ddim(reduced_dout_dims));
      reduced_dout.mutable_data<T>(ctx.GetPlace());
      auto runner = NpuOpRunner("ReduceSumD", {*dout}, {reduced_dout},
                                {{"axes", axes}, {"keep_dims", false}});
      runner.Run(stream);
      tmp_dout = &reduced_dout;
    }

    // stage 2
    axes.clear();
    for (auto i = 0; i < dx->dims().size(); ++i) {
      if (dx->dims()[i] == 1) {
        axes.push_back(i);
      }
    }
    if (axes.size() != 0) {
      auto runner = NpuOpRunner("ReduceSumD", {*tmp_dout}, {*dx},
                                {{"axes", axes}, {"keep_dims", true}});
      runner.Run(stream);
    } else {
      framework::TensorCopySync(*tmp_dout, ctx.GetPlace(), dx);
    }

    // For dy
    // stage 1
    reduce_ndim = dout->dims().size() - dy->dims().size();
    axes.clear();
    for (auto i = 0; i < reduce_ndim; ++i) {
      axes.push_back(i);
    }
    tmp_dout = const_cast<Tensor*>(dout);
    Tensor reduced_dy(dy->type());

    if (axes.size() != 0) {
      std::vector<int64_t> reduced_dout_dims;
      for (auto i = reduce_ndim; i < dout->dims().size(); ++i) {
        reduced_dout_dims.push_back(dout->dims()[i]);
      }
      reduced_dout.Resize(framework::make_ddim(reduced_dout_dims));
      reduced_dout.mutable_data<T>(ctx.GetPlace());
      auto runner = NpuOpRunner("ReduceSumD", {*dout}, {reduced_dout},
                                {{"axes", axes}, {"keep_dims", false}});
      runner.Run(stream);
      tmp_dout = &reduced_dout;
    }

    // stage 2
    axes.clear();
    Tensor* tmp_dy = tmp_dout;
    for (auto i = 0; i < dy->dims().size(); ++i) {
      if (dy->dims()[i] == 1) {
        axes.push_back(i);
      }
    }
    if (axes.size() != 0) {
      reduced_dy.Resize(dy->dims());
      reduced_dy.mutable_data<T>(ctx.GetPlace());
      auto runner = NpuOpRunner("ReduceSumD", {*tmp_dout}, {reduced_dy},
                                {{"axes", axes}, {"keep_dims", true}});
      runner.Run(stream);
      tmp_dy = &reduced_dy;
    }

    // stage 3, negative
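    // out = x - y gives dy = -(reduced dout), hence this extra Neg that the
    // dx path does not need.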
    auto runner = NpuOpRunner("Neg", {*tmp_dy}, {*dy}, {});
    runner.Run(stream);
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;

REGISTER_OP_NPU_KERNEL(
    elementwise_sub,
    ops::ElementwiseSubNPUKernel<paddle::platform::NPUDeviceContext, float>,
    ops::ElementwiseSubNPUKernel<paddle::platform::NPUDeviceContext,
                                 paddle::platform::float16>);

REGISTER_OP_NPU_KERNEL(
    elementwise_sub_grad,
    ops::ElementwiseSubGradNPUKernel<paddle::platform::NPUDeviceContext, float>,
    ops::ElementwiseSubGradNPUKernel<paddle::platform::NPUDeviceContext,
                                     paddle::platform::float16>);
#endif