add npu kernel for elementwise_sub and elementwise_sub_grad (#30973)
* add npu sub op
* fix typo
* rename test
* fix bug
* fix bug
* add fp16 kernel
* fix typo
* support sub grad op
* support elementwise_sub_grad op

Co-authored-by: frankwhzhang <frankwhzhang@126.com>
parent c687edecd8
commit 5cb20f30fc
@@ -1,87 +0,0 @@
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#ifndef _WIN32
#include <unistd.h>
#endif

#include <string>
#include <thread>  // NOLINT
#include <vector>

#include "gtest/gtest.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/operators/dropout_op.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/string/printf.h"

namespace f = paddle::framework;
namespace p = paddle::platform;
namespace m = paddle::operators::math;

USE_OP(elementwise_add);
USE_OP_DEVICE_KERNEL(elementwise_add, NPU);

void Compare(f::Scope* scope, const p::DeviceContext& ctx) {
  // init
  auto x = scope->Var("X");
  auto tensor_x = x->GetMutable<f::LoDTensor>();

  auto y = scope->Var("Y");
  auto tensor_y = y->GetMutable<f::LoDTensor>();

  std::vector<float> init;
  for (int64_t i = 0; i < 10 * 10; ++i) {
    init.push_back(1.0);
  }

  TensorFromVector(init, ctx, tensor_x);
  tensor_x->Resize({10, 10});
  TensorFromVector(init, ctx, tensor_y);
  tensor_y->Resize({10, 10});

  ctx.Wait();

  auto place = ctx.GetPlace();
  auto out = scope->Var("Out");
  auto tensor_out = out->GetMutable<f::LoDTensor>();
  tensor_out->Resize({10, 10});
  tensor_out->mutable_data<float>(place);  // allocate

  // run
  f::AttributeMap attrs;
  auto op =
      f::OpRegistry::CreateOp("elementwise_add", {{"X", {"X"}}, {"Y", {"Y"}}},
                              {{"Out", {"Out"}}}, attrs);

  op->Run(*scope, place);

  std::vector<float> out_vec;
  TensorToVector(*tensor_out, ctx, &out_vec);

  ctx.Wait();

  EXPECT_EQ(out_vec.size(), init.size());
  for (uint32_t i = 0; i < out_vec.size(); i++) {
    EXPECT_EQ(out_vec[i], 2.0);
  }
}

TEST(elementwise_add, NPU) {
  f::Scope scope;
  p::NPUDeviceContext ctx(p::NPUPlace(0));
  Compare(&scope, ctx);
}
@@ -0,0 +1,181 @@
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#ifndef _WIN32
#include <unistd.h>
#endif

#include <string>
#include <thread>  // NOLINT
#include <vector>

#include "gtest/gtest.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/operators/dropout_op.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/string/printf.h"

namespace f = paddle::framework;
namespace p = paddle::platform;
namespace m = paddle::operators::math;

USE_OP(elementwise_add);
USE_OP_DEVICE_KERNEL(elementwise_add, NPU);
USE_OP(elementwise_sub);
USE_OP_DEVICE_KERNEL(elementwise_sub, NPU);

template <typename T>
void Compare(f::Scope* scope, const p::DeviceContext& ctx,
             std::string op_type) {
  // init
  auto x = scope->Var("X");
  auto tensor_x = x->GetMutable<f::LoDTensor>();

  auto y = scope->Var("Y");
  auto tensor_y = y->GetMutable<f::LoDTensor>();

  std::vector<T> init_x;
  for (int64_t i = 0; i < 10 * 10; ++i) {
    init_x.push_back(static_cast<T>(1.0));
  }

  std::vector<T> init_y;
  for (int64_t i = 0; i < 10 * 10; ++i) {
    init_y.push_back(static_cast<T>(2.0));
  }

  TensorFromVector(init_x, ctx, tensor_x);
  tensor_x->Resize({10, 10});
  TensorFromVector(init_y, ctx, tensor_y);
  tensor_y->Resize({10, 10});

  ctx.Wait();

  auto place = ctx.GetPlace();
  auto out = scope->Var("Out");
  auto tensor_out = out->GetMutable<f::LoDTensor>();

  // run
  f::AttributeMap attrs;
  auto op = f::OpRegistry::CreateOp(op_type, {{"X", {"X"}}, {"Y", {"Y"}}},
                                    {{"Out", {"Out"}}}, attrs);

  op->Run(*scope, place);

  std::vector<T> out_vec;
  TensorToVector(*tensor_out, ctx, &out_vec);

  ctx.Wait();
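  // x is filled with 1.0 and y with 2.0, so element-wise the result is
  // 1 + 2 = 3 for elementwise_add and 1 - 2 = -1 for elementwise_sub.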
  float expected;
  if (op_type == "elementwise_add") {
    expected = 3.0;
  } else if (op_type == "elementwise_sub") {
    expected = -1.0;
  }
  EXPECT_EQ(out_vec.size(), init_x.size());
  for (uint32_t i = 0; i < out_vec.size(); i++) {
    EXPECT_EQ(out_vec[i], static_cast<T>(expected));
  }
}

template <typename T>
void CompareGrad(f::Scope* scope, const p::DeviceContext& ctx,
                 std::string op_type) {
  // init
  auto dout = scope->Var("DOut");
  auto tensor_dout = dout->GetMutable<f::LoDTensor>();
  tensor_dout->Resize({2, 3, 5});

  auto x = scope->Var("X");
  auto tensor_x = x->GetMutable<f::LoDTensor>();
  tensor_x->Resize({2, 3, 5});

  auto y = scope->Var("Y");
  auto tensor_y = y->GetMutable<f::LoDTensor>();
  tensor_y->Resize({1, 5});

  auto dx = scope->Var("DX");
  auto tensor_dx = dx->GetMutable<f::LoDTensor>();

  auto dy = scope->Var("DY");
  auto tensor_dy = dy->GetMutable<f::LoDTensor>();

  std::vector<T> init_dout;
  for (int64_t i = 0; i < tensor_dout->numel(); ++i) {
    init_dout.push_back(static_cast<T>(1.0));
  }

  TensorFromVector(init_dout, ctx, tensor_dout);
  tensor_dout->Resize({2, 3, 5});

  ctx.Wait();

  // run
  f::AttributeMap attrs;
  auto op = f::OpRegistry::CreateOp(
      op_type, {{"Out@GRAD", {"DOut"}}, {"X", {"X"}}, {"Y", {"Y"}}},
      {{"X@GRAD", {"DX"}}, {"Y@GRAD", {"DY"}}}, attrs);

  auto place = ctx.GetPlace();
  op->Run(*scope, place);

  std::vector<T> dx_vec;
  TensorToVector(*tensor_dx, ctx, &dx_vec);

  std::vector<T> dy_vec;
  TensorToVector(*tensor_dy, ctx, &dy_vec);

  ctx.Wait();
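  // dout is all 1.0 with shape [2, 3, 5]; dx matches dout's shape, while
  // each dy element (shape [1, 5]) accumulates 2 * 3 = 6 dout values, with
  // the sign flipped for sub.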
  float expected_x, expected_y;
  if (op_type == "elementwise_add_grad") {
    expected_x = 1.0;
    expected_y = 6.0;
  } else if (op_type == "elementwise_sub_grad") {
    expected_x = 1.0;
    expected_y = -6.0;
  }

  for (uint32_t i = 0; i < dx_vec.size(); i++) {
    EXPECT_EQ(dx_vec[i], static_cast<T>(expected_x));
  }
  for (uint32_t i = 0; i < dy_vec.size(); i++) {
    EXPECT_EQ(dy_vec[i], static_cast<T>(expected_y));
  }
}

TEST(elementwise_add, NPU_fp32) {
  f::Scope scope;
  p::NPUDeviceContext ctx(p::NPUPlace(0));
  Compare<float>(&scope, ctx, "elementwise_add");
}

TEST(elementwise_sub, NPU_fp32) {
  f::Scope scope;
  p::NPUDeviceContext ctx(p::NPUPlace(0));
  Compare<float>(&scope, ctx, "elementwise_sub");
}

TEST(elementwise_sub, NPU_fp16) {
  f::Scope scope;
  p::NPUDeviceContext ctx(p::NPUPlace(0));
  Compare<p::float16>(&scope, ctx, "elementwise_sub");
}

TEST(elementwise_sub_grad, NPU) {
  f::Scope scope;
  p::NPUDeviceContext ctx(p::NPUPlace(0));
  CompareGrad<float>(&scope, ctx, "elementwise_sub_grad");
}
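CompareGrad already computes expectations for elementwise_add_grad (dx = dout, dy = dout reduced), so an add-grad test is a natural extension of this file. A minimal sketch, assuming an NPU elementwise_add_grad kernel exists and is declared via USE_OP_DEVICE_KERNEL(elementwise_add_grad, NPU):

TEST(elementwise_add_grad, NPU) {
  f::Scope scope;
  p::NPUDeviceContext ctx(p::NPUPlace(0));
  CompareGrad<float>(&scope, ctx, "elementwise_add_grad");
}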
@@ -0,0 +1,171 @@
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#ifdef PADDLE_WITH_ASCEND_CL
#include <memory>
#include <string>

#include "paddle/fluid/operators/elementwise/elementwise_sub_op.h"
#include "paddle/fluid/operators/npu_op_runner.h"

namespace paddle {
namespace operators {

using Tensor = framework::Tensor;

template <typename DeviceContext, typename T>
class ElementwiseSubNPUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* x = ctx.Input<Tensor>("X");
    auto* y = ctx.Input<Tensor>("Y");
    auto* out = ctx.Output<Tensor>("Out");

    out->mutable_data<T>(ctx.GetPlace());

    auto runner = NpuOpRunner("Sub", {*x, *y}, {*out}, {});

    auto stream =
        ctx.template device_context<paddle::platform::NPUDeviceContext>()
            .stream();
    runner.Run(stream);
  }
};

template <typename DeviceContext, typename T>
class ElementwiseSubGradNPUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
    auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
    auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));

    dx->mutable_data<T>(ctx.GetPlace());
    dy->mutable_data<T>(ctx.GetPlace());

    // NOTE(zhiqiu): It seems Ascend's Sub follows the broadcast semantics
    // with default axis=-1, so sub_grad should do the reduce if needed.
    // For example, the shape of each variable in elementwise_sub:
    // x, dx: [2, 3, 5]
    // y, dy: [1, 5]
    // out, dout: [2, 3, 5]
    // Then, out = x - y => dx = dout, dy = -dout
    // And the shape of dy can be computed by a two-stage reduce:
    // 1. [2, 3, 5] => [3, 5], ReduceSumD on axis = 0, keep_dims = false.
    // 2. [3, 5] => [1, 5], ReduceSumD on axis = 0, keep_dims = true.
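    // In this example dx already matches dout's shape, so the dx branch
    // below degenerates to a plain copy; the two-stage ReduceSumD path only
    // fires for dy.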

    auto stream =
        ctx.template device_context<paddle::platform::NPUDeviceContext>()
            .stream();
    // For dx
    // stage 1
    auto reduce_ndim = dout->dims().size() - dx->dims().size();
    std::vector<int> axes;
    for (auto i = 0; i < reduce_ndim; ++i) {
      axes.push_back(i);
    }
    Tensor* tmp_dout = const_cast<Tensor*>(dout);
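    // tmp_dout aliases dout when no leading-axis reduction is needed; the
    // const_cast only unifies the pointer type, dout is never written
    // through it.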
    Tensor reduced_dout(dx->type());
    if (axes.size() != 0) {
      std::vector<int64_t> reduced_dout_dims;
      for (auto i = reduce_ndim; i < dout->dims().size(); ++i) {
        reduced_dout_dims.push_back(dout->dims()[i]);
      }
      reduced_dout.Resize(framework::make_ddim(reduced_dout_dims));
      reduced_dout.mutable_data<T>(ctx.GetPlace());
      auto runner = NpuOpRunner("ReduceSumD", {*dout}, {reduced_dout},
                                {{"axes", axes}, {"keep_dims", false}});
      runner.Run(stream);
      tmp_dout = &reduced_dout;
    }

    // stage 2
    axes.clear();
    for (auto i = 0; i < dx->dims().size(); ++i) {
      if (dx->dims()[i] == 1) {
        axes.push_back(i);
      }
    }
    if (axes.size() != 0) {
      auto runner = NpuOpRunner("ReduceSumD", {*tmp_dout}, {*dx},
                                {{"axes", axes}, {"keep_dims", true}});
      runner.Run(stream);
    } else {
      framework::TensorCopySync(*tmp_dout, ctx.GetPlace(), dx);
    }

    // For dy
    // stage 1
    reduce_ndim = dout->dims().size() - dy->dims().size();
    axes.clear();
    for (auto i = 0; i < reduce_ndim; ++i) {
      axes.push_back(i);
    }
    tmp_dout = const_cast<Tensor*>(dout);
    Tensor reduced_dy(dy->type());

    if (axes.size() != 0) {
      std::vector<int64_t> reduced_dout_dims;
      for (auto i = reduce_ndim; i < dout->dims().size(); ++i) {
        reduced_dout_dims.push_back(dout->dims()[i]);
      }
      reduced_dout.Resize(framework::make_ddim(reduced_dout_dims));
      reduced_dout.mutable_data<T>(ctx.GetPlace());
      auto runner = NpuOpRunner("ReduceSumD", {*dout}, {reduced_dout},
                                {{"axes", axes}, {"keep_dims", false}});
      runner.Run(stream);
      tmp_dout = &reduced_dout;
    }

    // stage 2
    axes.clear();
    Tensor* tmp_dy = tmp_dout;
    for (auto i = 0; i < dy->dims().size(); ++i) {
      if (dy->dims()[i] == 1) {
        axes.push_back(i);
      }
    }
    if (axes.size() != 0) {
      reduced_dy.Resize(dy->dims());
      reduced_dy.mutable_data<T>(ctx.GetPlace());
      auto runner = NpuOpRunner("ReduceSumD", {*tmp_dout}, {reduced_dy},
                                {{"axes", axes}, {"keep_dims", true}});
      runner.Run(stream);
      tmp_dy = &reduced_dy;
    }

    // stage 3, negative
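    // out = x - y gives dy = -(reduced dout), hence this extra Neg that the
    // dx path does not need.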
    auto runner = NpuOpRunner("Neg", {*tmp_dy}, {*dy}, {});
    runner.Run(stream);
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;

REGISTER_OP_NPU_KERNEL(
    elementwise_sub,
    ops::ElementwiseSubNPUKernel<paddle::platform::NPUDeviceContext, float>,
    ops::ElementwiseSubNPUKernel<paddle::platform::NPUDeviceContext,
                                 paddle::platform::float16>);

REGISTER_OP_NPU_KERNEL(
    elementwise_sub_grad,
    ops::ElementwiseSubGradNPUKernel<paddle::platform::NPUDeviceContext, float>,
    ops::ElementwiseSubGradNPUKernel<paddle::platform::NPUDeviceContext,
                                     paddle::platform::float16>);
#endif