【NPU】Support npu kernel for update_loss_scaling op (#31830)

* add update_loss_scaling_npu NPU kernel
* change TensorFromVec to Memset (ascendrc)
parent b2407af6e3
commit 0279486b02

@@ -0,0 +1,219 @@
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/operators/amp/update_loss_scaling_op.h"
#include <cmath>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/npu_op_runner.h"

namespace paddle {
namespace operators {

using Tensor = framework::Tensor;
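
// Host-side driver for dynamic loss scaling. If the previous step produced
// inf/nan gradients, the good-step counter is reset and the bad-step counter
// is incremented; once it reaches decr_every_n_nan_or_inf, the scale is
// multiplied by decr_ratio (and clamped below at 1). Otherwise the good-step
// counter is incremented and, once it reaches incr_every_n_steps, the scale
// is multiplied by incr_ratio (and rolled back if the result is no longer
// finite).
//
// The arithmetic on the scale is expressed through the Ascend "Power"
// operator, which computes y = (scale * x + shift) ^ power; with power = 1
// that is a plain affine map:
//   scale = decr_ratio, shift = 0  ->  y = decr_ratio * x  (decay the scale)
//   scale = 0,          shift = 1  ->  y = 1               (reset to 1)
//   scale = 1,          shift = 0  ->  y = x               (keep unchanged)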
template <typename T>
void Update(const platform::NPUDeviceContext& ctx,
            const std::vector<bool> found_inf_vec,
            const Tensor* pre_loss_scaling_tensor, const Tensor* good_in_tensor,
            const Tensor* bad_in_tensor, const int incr_every_n_steps,
            const int decr_every_n_nan_or_inf, const float incr_ratio,
            const float decr_ratio, Tensor* updated_loss_scaling_tensor,
            Tensor* good_out_tensor, Tensor* bad_out_tensor) {
  auto place = ctx.GetPlace();
  auto stream = ctx.stream();
  if (found_inf_vec[0]) {
    // good_out_data = 0
    auto g = good_out_tensor->mutable_data<int>(place);
    platform::NPUMemsetAsync(static_cast<void*>(g), 0,
                             good_out_tensor->numel() * sizeof(int), stream);
    // bad_out_data = bad_in_data + 1
    Tensor factor_tensor(bad_out_tensor->type());
    factor_tensor.mutable_data<int>({1}, place);
    TensorFromVector(std::vector<int>{1}, ctx, &factor_tensor);
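    // The scalar increment 1 is materialized as a one-element device tensor
    // so it can be passed to the NPU "Add" operator as a regular input.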
auto runner_p2 = NpuOpRunner("Add", {*bad_in_tensor, factor_tensor},
|
||||
{*bad_out_tensor}, {});
|
||||
runner_p2.Run(stream);
|
||||
|
||||
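    // Copy the one-element counter back to the host so the threshold check
    // below can run as ordinary CPU control flow.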
    std::vector<int> bad_out_data;
    TensorToVector(*bad_out_tensor, ctx, &bad_out_data);
    if (bad_out_data[0] == decr_every_n_nan_or_inf) {
      // updated_loss_scaling_data = pre_loss_scaling_data * decr_ratio
      auto runner_p3 = NpuOpRunner("Power", {*pre_loss_scaling_tensor},
                                   {*updated_loss_scaling_tensor},
                                   {{"power", static_cast<float>(1)},
                                    {"scale", decr_ratio},
                                    {"shift", static_cast<float>(0)}});
      runner_p3.Run(stream);

      std::vector<T> new_loss_scaling;
      TensorToVector(*updated_loss_scaling_tensor, ctx, &new_loss_scaling);
      if (new_loss_scaling[0] < static_cast<T>(1)) {
        // updated_loss_scaling_data = 1
        auto runner_p4 = NpuOpRunner("Power", {*pre_loss_scaling_tensor},
                                     {*updated_loss_scaling_tensor},
                                     {{"power", static_cast<float>(1)},
                                      {"scale", static_cast<float>(0)},
                                      {"shift", static_cast<float>(1)}});
        runner_p4.Run(stream);
      }

      // bad_out_data = 0
      auto b = bad_out_tensor->mutable_data<int>(place);
      platform::NPUMemsetAsync(static_cast<void*>(b), 0,
                               bad_out_tensor->numel() * sizeof(int), stream);
    }
  } else {
    // bad_out_data = 0
    auto b = bad_out_tensor->mutable_data<int>(place);
    platform::NPUMemsetAsync(static_cast<void*>(b), 0,
                             bad_out_tensor->numel() * sizeof(int), stream);

    // good_out_data = good_in_data + 1
    Tensor factor_tensor(good_out_tensor->type());
    factor_tensor.mutable_data<int>({1}, place);
    TensorFromVector(std::vector<int>{1}, ctx, &factor_tensor);
    auto runner_p2 = NpuOpRunner("Add", {*good_in_tensor, factor_tensor},
                                 {*good_out_tensor}, {});
    runner_p2.Run(stream);

    std::vector<int> good_out_data;
    TensorToVector(*good_out_tensor, ctx, &good_out_data);

    if (good_out_data[0] == incr_every_n_steps) {
      // updated_loss_scaling_data = pre_loss_scaling_data * incr_ratio
      auto runner_p3 = NpuOpRunner("Power", {*pre_loss_scaling_tensor},
                                   {*updated_loss_scaling_tensor},
                                   {{"power", static_cast<float>(1)},
                                    {"scale", incr_ratio},
                                    {"shift", static_cast<float>(0)}});
      runner_p3.Run(stream);

      std::vector<T> new_loss_scaling;
      TensorToVector(*updated_loss_scaling_tensor, ctx, &new_loss_scaling);
      if (!std::isfinite(new_loss_scaling[0])) {
        // updated_loss_scaling_data = pre_loss_scaling_data
        auto runner_p4 = NpuOpRunner("Power", {*pre_loss_scaling_tensor},
                                     {*updated_loss_scaling_tensor},
                                     {{"power", static_cast<float>(1)},
                                      {"scale", static_cast<float>(1)},
                                      {"shift", static_cast<float>(0)}});
        runner_p4.Run(stream);
      }
      // good_out_data = 0
      auto g = good_out_tensor->mutable_data<int>(place);
      platform::NPUMemsetAsync(static_cast<void*>(g), 0,
                               good_out_tensor->numel() * sizeof(int), stream);
    }
  }
}
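
// Specialization of the generic UpdateLossScalingFunctor (declared in
// update_loss_scaling_op.h) for the NPU device context; it forwards
// directly to Update<T> above.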
template <typename T>
class UpdateLossScalingFunctor<platform::NPUDeviceContext, T> {
 public:
  void operator()(const platform::NPUDeviceContext& dev_ctx,
                  const std::vector<bool> found_inf_vec,
                  const Tensor* pre_loss_scaling_tensor,
                  const Tensor* good_in_tensor, const Tensor* bad_in_tensor,
                  const int incr_every_n_steps,
                  const int decr_every_n_nan_or_inf, const float incr_ratio,
                  const float decr_ratio, Tensor* updated_loss_scaling_tensor,
                  Tensor* good_out_tensor, Tensor* bad_out_tensor) const {
    Update<T>(dev_ctx, found_inf_vec, pre_loss_scaling_tensor, good_in_tensor,
              bad_in_tensor, incr_every_n_steps, decr_every_n_nan_or_inf,
              incr_ratio, decr_ratio, updated_loss_scaling_tensor,
              good_out_tensor, bad_out_tensor);
  }
};
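
// Zeroes every output gradient when an overflow was detected. When no
// overflow is found, the outputs are left untouched; this assumes Out reuses
// X's buffer (the op is applied in-place on the gradients).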
template <typename T>
class LazyZerosNPU {
 public:
  void operator()(const platform::NPUDeviceContext& dev_ctx,
                  const std::vector<bool> found_inf_vec,
                  const std::vector<const framework::Tensor*>& xs,
                  const std::vector<framework::Tensor*>& outs) const {
    for (size_t i = 0; i < xs.size(); ++i) {
      auto* out = outs[i];
      if (found_inf_vec[0]) {
        VLOG(1) << "-- UpdateLossScaling: Found infinite grads. --";

        auto place = dev_ctx.GetPlace();
        auto stream = dev_ctx.stream();
        auto g = out->mutable_data<int>(place);
        platform::NPUMemsetAsync(static_cast<void*>(g), 0,
                                 out->numel() * sizeof(int), stream);
      }
    }
  }
};
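
// OpKernel entry point: reads FoundInfinite back to the host once, lazily
// zeroes the gradient outputs, and, unless stop_update is set, advances the
// loss-scaling state through UpdateLossScalingFunctor.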
template <typename DeviceContext, typename T>
class UpdateLossScalingNPUKernel : public framework::OpKernel<T> {
  using MPDType = typename details::MPTypeTrait<T>::Type;

 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto& dev_ctx = ctx.template device_context<DeviceContext>();

    const auto xs = ctx.MultiInput<framework::Tensor>("X");
    auto outs = ctx.MultiOutput<framework::Tensor>("Out");
    const auto* found_inf = ctx.Input<Tensor>("FoundInfinite");
    PADDLE_ENFORCE_EQ(found_inf->numel(), 1,
                      platform::errors::InvalidArgument(
                          "FoundInfinite must have only one element."));

    std::vector<bool> found_inf_vec;
    TensorToVector(*found_inf, ctx.device_context(), &found_inf_vec);
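    // found_inf_vec now holds the single overflow flag on the host side.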
    LazyZerosNPU<T>{}(dev_ctx, found_inf_vec, xs, outs);
    const bool stop_update = ctx.Attr<bool>("stop_update");
    if (stop_update) {
      return;
    }

    const auto* pre_loss_scaling = ctx.Input<Tensor>("PrevLossScaling");
    const auto* good_in = ctx.Input<Tensor>("InGoodSteps");
    const auto* bad_in = ctx.Input<Tensor>("InBadSteps");
    auto* updated_loss_scaling = ctx.Output<Tensor>("LossScaling");
    auto* good_out = ctx.Output<Tensor>("OutGoodSteps");
    auto* bad_out = ctx.Output<Tensor>("OutBadSteps");

    updated_loss_scaling->mutable_data<MPDType>(dev_ctx.GetPlace());
    good_out->mutable_data<int>(dev_ctx.GetPlace());
    bad_out->mutable_data<int>(dev_ctx.GetPlace());

    const int incr_every_n_steps = ctx.Attr<int>("incr_every_n_steps");
    const int decr_every_n_nan_or_inf =
        ctx.Attr<int>("decr_every_n_nan_or_inf");
    const float incr_ratio = ctx.Attr<float>("incr_ratio");
    const float decr_ratio = ctx.Attr<float>("decr_ratio");
    UpdateLossScalingFunctor<DeviceContext, MPDType>{}(
        dev_ctx, found_inf_vec, pre_loss_scaling, good_in, bad_in,
        incr_every_n_steps, decr_every_n_nan_or_inf, incr_ratio, decr_ratio,
        updated_loss_scaling, good_out, bad_out);
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
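// The loss-scaling state is float or double; MPTypeTrait maps the kernel's T
// to the type actually used to store the scale.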
REGISTER_OP_NPU_KERNEL(
    update_loss_scaling,
    ops::UpdateLossScalingNPUKernel<paddle::platform::NPUDeviceContext, float>,
    ops::UpdateLossScalingNPUKernel<paddle::platform::NPUDeviceContext,
                                    double>);