add npu kernel for elementwise_sub and elementwise_sub_grad (#30973)
* add npu sub op
* fix typo
* rename test
* fix bug
* fix bug
* add fp16 kernel
* fix typo
* support sub grad op
* support elementwise_sub_grad op

Co-authored-by: frankwhzhang <frankwhzhang@126.com>
commit 5cb20f30fc (parent c687edecd8)
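For orientation before the full diffs, here is a minimal sketch of how the new elementwise_sub NPU kernel is exercised through the operator registry. It is condensed from the test added in this change, assumes a Paddle build with Ascend NPU support (PADDLE_WITH_ASCEND_CL), and the helper name RunSubOnNPU is only for illustration.

/* Minimal sketch, condensed from elementwise_op_npu_test below. */
#include <vector>

#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/platform/device_context.h"

namespace f = paddle::framework;
namespace p = paddle::platform;

USE_OP(elementwise_sub);
USE_OP_DEVICE_KERNEL(elementwise_sub, NPU);

void RunSubOnNPU() {
  f::Scope scope;
  p::NPUDeviceContext ctx(p::NPUPlace(0));

  // Fill X and Y with constants, then run Out = X - Y on the NPU.
  auto* tensor_x = scope.Var("X")->GetMutable<f::LoDTensor>();
  auto* tensor_y = scope.Var("Y")->GetMutable<f::LoDTensor>();
  auto* tensor_out = scope.Var("Out")->GetMutable<f::LoDTensor>();

  std::vector<float> init_x(10 * 10, 1.0f);
  std::vector<float> init_y(10 * 10, 2.0f);
  TensorFromVector(init_x, ctx, tensor_x);
  tensor_x->Resize({10, 10});
  TensorFromVector(init_y, ctx, tensor_y);
  tensor_y->Resize({10, 10});
  ctx.Wait();

  f::AttributeMap attrs;
  auto op = f::OpRegistry::CreateOp("elementwise_sub",
                                    {{"X", {"X"}}, {"Y", {"Y"}}},
                                    {{"Out", {"Out"}}}, attrs);
  op->Run(scope, ctx.GetPlace());

  std::vector<float> out_vec;
  TensorToVector(*tensor_out, ctx, &out_vec);
  ctx.Wait();  // each element should be 1.0 - 2.0 = -1.0
}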
@@ -1,87 +0,0 @@
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#ifndef _WIN32
#include <unistd.h>
#endif

#include <string>
#include <thread>  // NOLINT
#include <vector>

#include "gtest/gtest.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/operators/dropout_op.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/string/printf.h"

namespace f = paddle::framework;
namespace p = paddle::platform;
namespace m = paddle::operators::math;

USE_OP(elementwise_add);
USE_OP_DEVICE_KERNEL(elementwise_add, NPU);

void Compare(f::Scope* scope, const p::DeviceContext& ctx) {
  // init
  auto x = scope->Var("X");
  auto tensor_x = x->GetMutable<f::LoDTensor>();

  auto y = scope->Var("Y");
  auto tensor_y = y->GetMutable<f::LoDTensor>();

  std::vector<float> init;
  for (int64_t i = 0; i < 10 * 10; ++i) {
    init.push_back(1.0);
  }

  TensorFromVector(init, ctx, tensor_x);
  tensor_x->Resize({10, 10});
  TensorFromVector(init, ctx, tensor_y);
  tensor_y->Resize({10, 10});

  ctx.Wait();

  auto place = ctx.GetPlace();
  auto out = scope->Var("Out");
  auto tensor_out = out->GetMutable<f::LoDTensor>();
  tensor_out->Resize({10, 10});
  tensor_out->mutable_data<float>(place);  // allocate

  // run
  f::AttributeMap attrs;
  auto op =
      f::OpRegistry::CreateOp("elementwise_add", {{"X", {"X"}}, {"Y", {"Y"}}},
                              {{"Out", {"Out"}}}, attrs);

  op->Run(*scope, place);

  std::vector<float> out_vec;
  TensorToVector(*tensor_out, ctx, &out_vec);

  ctx.Wait();

  EXPECT_EQ(out_vec.size(), init.size());
  for (uint32_t i = 0; i < out_vec.size(); i++) {
    EXPECT_EQ(out_vec[i], 2.0);
  }
}

TEST(elementwise_add, NPU) {
  f::Scope scope;
  p::NPUDeviceContext ctx(p::NPUPlace(0));
  Compare(&scope, ctx);
}
@@ -0,0 +1,181 @@
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#ifndef _WIN32
#include <unistd.h>
#endif

#include <string>
#include <thread>  // NOLINT
#include <vector>

#include "gtest/gtest.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/operators/dropout_op.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/string/printf.h"

namespace f = paddle::framework;
namespace p = paddle::platform;
namespace m = paddle::operators::math;

USE_OP(elementwise_add);
USE_OP_DEVICE_KERNEL(elementwise_add, NPU);
USE_OP(elementwise_sub);
USE_OP_DEVICE_KERNEL(elementwise_sub, NPU);

template <typename T>
void Compare(f::Scope* scope, const p::DeviceContext& ctx,
             std::string op_type) {
  // init
  auto x = scope->Var("X");
  auto tensor_x = x->GetMutable<f::LoDTensor>();

  auto y = scope->Var("Y");
  auto tensor_y = y->GetMutable<f::LoDTensor>();

  std::vector<T> init_x;
  for (int64_t i = 0; i < 10 * 10; ++i) {
    init_x.push_back(static_cast<T>(1.0));
  }

  std::vector<T> init_y;
  for (int64_t i = 0; i < 10 * 10; ++i) {
    init_y.push_back(static_cast<T>(2.0));
  }

  TensorFromVector(init_x, ctx, tensor_x);
  tensor_x->Resize({10, 10});
  TensorFromVector(init_y, ctx, tensor_y);
  tensor_y->Resize({10, 10});

  ctx.Wait();

  auto place = ctx.GetPlace();
  auto out = scope->Var("Out");
  auto tensor_out = out->GetMutable<f::LoDTensor>();

  // run
  f::AttributeMap attrs;
  auto op = f::OpRegistry::CreateOp(op_type, {{"X", {"X"}}, {"Y", {"Y"}}},
                                    {{"Out", {"Out"}}}, attrs);

  op->Run(*scope, place);

  std::vector<T> out_vec;
  TensorToVector(*tensor_out, ctx, &out_vec);

  ctx.Wait();
  float expected;
  if (op_type == "elementwise_add") {
    expected = 3.0;
  } else if (op_type == "elementwise_sub") {
    expected = -1.0;
  }
  EXPECT_EQ(out_vec.size(), init_x.size());
  for (uint32_t i = 0; i < out_vec.size(); i++) {
    EXPECT_EQ(out_vec[i], static_cast<T>(expected));
  }
}

template <typename T>
void CompareGrad(f::Scope* scope, const p::DeviceContext& ctx,
                 std::string op_type) {
  // init
  auto dout = scope->Var("DOut");
  auto tensor_dout = dout->GetMutable<f::LoDTensor>();
  tensor_dout->Resize({2, 3, 5});

  auto x = scope->Var("X");
  auto tensor_x = x->GetMutable<f::LoDTensor>();
  tensor_x->Resize({2, 3, 5});

  auto y = scope->Var("Y");
  auto tensor_y = y->GetMutable<f::LoDTensor>();
  tensor_y->Resize({1, 5});

  auto dx = scope->Var("DX");
  auto tensor_dx = dx->GetMutable<f::LoDTensor>();

  auto dy = scope->Var("DY");
  auto tensor_dy = dy->GetMutable<f::LoDTensor>();

  std::vector<T> init_dout;
  for (int64_t i = 0; i < tensor_dout->numel(); ++i) {
    init_dout.push_back(static_cast<T>(1.0));
  }

  TensorFromVector(init_dout, ctx, tensor_dout);
  tensor_dout->Resize({2, 3, 5});

  ctx.Wait();

  // run
  f::AttributeMap attrs;
  auto op = f::OpRegistry::CreateOp(
      op_type, {{"Out@GRAD", {"DOut"}}, {"X", {"X"}}, {"Y", {"Y"}}},
      {{"X@GRAD", {"DX"}}, {"Y@GRAD", {"DY"}}}, attrs);

  auto place = ctx.GetPlace();
  op->Run(*scope, place);

  std::vector<T> dx_vec;
  TensorToVector(*tensor_dx, ctx, &dx_vec);

  std::vector<T> dy_vec;
  TensorToVector(*tensor_dy, ctx, &dy_vec);

  ctx.Wait();
  float expected_x, expected_y;
  if (op_type == "elementwise_add_grad") {
    expected_x = 1.0;
    expected_y = 6.0;
  } else if (op_type == "elementwise_sub_grad") {
    expected_x = 1.0;
    expected_y = -6.0;
  }

  for (uint32_t i = 0; i < dx_vec.size(); i++) {
    EXPECT_EQ(dx_vec[i], static_cast<T>(expected_x));
  }
  for (uint32_t i = 0; i < dy_vec.size(); i++) {
    EXPECT_EQ(dy_vec[i], static_cast<T>(expected_y));
  }
}

TEST(elementwise_add, NPU_fp32) {
  f::Scope scope;
  p::NPUDeviceContext ctx(p::NPUPlace(0));
  Compare<float>(&scope, ctx, "elementwise_add");
}

TEST(elementwise_sub, NPU_fp32) {
  f::Scope scope;
  p::NPUDeviceContext ctx(p::NPUPlace(0));
  Compare<float>(&scope, ctx, "elementwise_sub");
}

TEST(elementwise_sub, NPU_fp16) {
  f::Scope scope;
  p::NPUDeviceContext ctx(p::NPUPlace(0));
  Compare<p::float16>(&scope, ctx, "elementwise_sub");
}

TEST(elementwise_sub_grad, NPU) {
  f::Scope scope;
  p::NPUDeviceContext ctx(p::NPUPlace(0));
  CompareGrad<float>(&scope, ctx, "elementwise_sub_grad");
}
@@ -0,0 +1,171 @@
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#ifdef PADDLE_WITH_ASCEND_CL
#include <memory>
#include <string>

#include "paddle/fluid/operators/elementwise/elementwise_sub_op.h"
#include "paddle/fluid/operators/npu_op_runner.h"

namespace paddle {
namespace operators {

using Tensor = framework::Tensor;

template <typename DeviceContext, typename T>
class ElementwiseSubNPUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* x = ctx.Input<Tensor>("X");
    auto* y = ctx.Input<Tensor>("Y");
    auto* out = ctx.Output<Tensor>("Out");

    out->mutable_data<T>(ctx.GetPlace());

    auto runner = NpuOpRunner("Sub", {*x, *y}, {*out}, {});

    auto stream =
        ctx.template device_context<paddle::platform::NPUDeviceContext>()
            .stream();
    runner.Run(stream);
  }
};

template <typename DeviceContext, typename T>
class ElementwiseSubGradNPUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
    auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
    auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));

    dx->mutable_data<T>(ctx.GetPlace());
    dy->mutable_data<T>(ctx.GetPlace());

    // NOTE(zhiqiu): It seems Ascend Sub follows the broadcast semantics with
    // default axis=-1?
    // So, sub_grad should reduce if needed.
    // For example, the shape of each variable in elementwise_sub:
    // x, dx: [2, 3, 5]
    // y, dy: [1, 5]
    // out, dout: [2, 3, 5]
    // Then, out = x - y => dx = dout, dy = -dout
    // And the shape of dy can be computed by a two-stage reduce:
    // 1. [2, 3, 5] => [3, 5], ReduceSumD on axis = 0, keep_dims = false.
    // 2. [3, 5] => [1, 5], ReduceSumD on axis = 0, keep_dims = true.

    auto stream =
        ctx.template device_context<paddle::platform::NPUDeviceContext>()
            .stream();
    // For dx
    // stage 1
    auto reduce_ndim = dout->dims().size() - dx->dims().size();
    std::vector<int> axes;
    for (auto i = 0; i < reduce_ndim; ++i) {
      axes.push_back(i);
    }
    Tensor* tmp_dout = const_cast<Tensor*>(dout);
    Tensor reduced_dout(dx->type());
    if (axes.size() != 0) {
      std::vector<int64_t> reduced_dout_dims;
      for (auto i = reduce_ndim; i < dout->dims().size(); ++i) {
        reduced_dout_dims.push_back(dout->dims()[i]);
      }
      reduced_dout.Resize(framework::make_ddim(reduced_dout_dims));
      reduced_dout.mutable_data<T>(ctx.GetPlace());
      auto runner = NpuOpRunner("ReduceSumD", {*dout}, {reduced_dout},
                                {{"axes", axes}, {"keep_dims", false}});
      runner.Run(stream);
      tmp_dout = &reduced_dout;
    }

    // stage 2
    axes.clear();
    for (auto i = 0; i < dx->dims().size(); ++i) {
      if (dx->dims()[i] == 1) {
        axes.push_back(i);
      }
    }
    if (axes.size() != 0) {
      auto runner = NpuOpRunner("ReduceSumD", {*tmp_dout}, {*dx},
                                {{"axes", axes}, {"keep_dims", true}});
      runner.Run(stream);
    } else {
      framework::TensorCopySync(*tmp_dout, ctx.GetPlace(), dx);
    }

    // For dy
    // stage 1
    reduce_ndim = dout->dims().size() - dy->dims().size();
    axes.clear();
    for (auto i = 0; i < reduce_ndim; ++i) {
      axes.push_back(i);
    }
    tmp_dout = const_cast<Tensor*>(dout);
    Tensor reduced_dy(dy->type());

    if (axes.size() != 0) {
      std::vector<int64_t> reduced_dout_dims;
      for (auto i = reduce_ndim; i < dout->dims().size(); ++i) {
        reduced_dout_dims.push_back(dout->dims()[i]);
      }
      reduced_dout.Resize(framework::make_ddim(reduced_dout_dims));
      reduced_dout.mutable_data<T>(ctx.GetPlace());
      auto runner = NpuOpRunner("ReduceSumD", {*dout}, {reduced_dout},
                                {{"axes", axes}, {"keep_dims", false}});
      runner.Run(stream);
      tmp_dout = &reduced_dout;
    }

    // stage 2
    axes.clear();
    Tensor* tmp_dy = tmp_dout;
    for (auto i = 0; i < dy->dims().size(); ++i) {
      if (dy->dims()[i] == 1) {
        axes.push_back(i);
      }
    }
    if (axes.size() != 0) {
      reduced_dy.Resize(dy->dims());
      reduced_dy.mutable_data<T>(ctx.GetPlace());
      auto runner = NpuOpRunner("ReduceSumD", {*tmp_dout}, {reduced_dy},
                                {{"axes", axes}, {"keep_dims", true}});
      runner.Run(stream);
      tmp_dy = &reduced_dy;
    }

    // stage 3, negative
    auto runner = NpuOpRunner("Neg", {*tmp_dy}, {*dy}, {});
    runner.Run(stream);
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;

REGISTER_OP_NPU_KERNEL(
    elementwise_sub,
    ops::ElementwiseSubNPUKernel<paddle::platform::NPUDeviceContext, float>,
    ops::ElementwiseSubNPUKernel<paddle::platform::NPUDeviceContext,
                                 paddle::platform::float16>);

REGISTER_OP_NPU_KERNEL(
    elementwise_sub_grad,
    ops::ElementwiseSubGradNPUKernel<paddle::platform::NPUDeviceContext, float>,
    ops::ElementwiseSubGradNPUKernel<paddle::platform::NPUDeviceContext,
                                     paddle::platform::float16>);
#endif