!13579 [ms][lite][cpu] power master fp16 optimize
From: @lzkcode
Reviewed-by: Signed-off-by:
pull/13579/MERGE
commit 403c434f55
nnacl/fp16/power_fp16.c (new file)
@@ -0,0 +1,117 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "nnacl/fp16/power_fp16.h"
#include "nnacl/errorcode.h"
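
// Fast path for integer-valued exponents: exponentiation by squaring needs
// only O(log n) multiplies per element instead of one powf call each.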
#if defined(ENABLE_NEON)
float16x8_t OptimizedPowerSimdFp16(float16x8_t x, const void *exponent) {
  int tmp = (int)(*(float16_t *)exponent);
  int exp = abs(tmp);
  float16x8_t result = vmovq_n_f16(1.0f);
  while (exp) {
    if (exp % 2) {
      result *= x;
    }
    x *= x;
    exp = exp / 2;
  }
  if (tmp >= 0) {
    return result;
  }
  return 1 / result;
}
#endif

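// Scalar version of the same square-and-multiply loop; it serves the
// non-vectorized tail and non-NEON builds.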
float16_t OptimizedPowerScalarFp16(float16_t x, const void *exponent) {
  int tmp = *(float16_t *)exponent;
  int exp = abs(tmp);
  float16_t result = 1;
  while (exp) {
    if (exp % 2) {
      result *= x;
    }
    x *= x;
    exp = exp / 2;
  }
  return tmp >= 0 ? result : 1 / result;
}

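// Broadcast case: a single exponent applies to the whole slice, so the
// integer check runs once and the SIMD/scalar kernels are chosen up front.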
void PowerBroadCastFp16(const float16_t *input, const float16_t *exponent, float16_t *output, int len, float scale,
                        float shift) {
  PowerScalarFunFp16 PowerScalarFunFp16_ = NULL;
#if defined(ENABLE_NEON)
  PowerSimdFunFp16 PowerSimdFunFp16_ = NULL;
#endif

  if (CheckInteger(*exponent)) {
#if defined(ENABLE_NEON)
    PowerSimdFunFp16_ = OptimizedPowerSimdFp16;
#endif
    PowerScalarFunFp16_ = OptimizedPowerScalarFp16;
  } else {
#if defined(ENABLE_NEON)
    PowerSimdFunFp16_ = StdPowerSimdFp16;
#endif
    PowerScalarFunFp16_ = StdPowerScalarFp16;
  }
  int i = 0;
#ifdef ENABLE_NEON
  // Vectorize full 8-lane blocks only; the scalar loop below handles the
  // tail without reading or writing past the slice.
  int len_c8 = len / C8NUM * C8NUM;
  float16x8_t scale_8 = vmovq_n_f16(scale);
  float16x8_t shift_8 = vmovq_n_f16(shift);
  for (; i < len_c8; i += C8NUM) {
    float16x8_t result = PowerSimdFunFp16_(scale_8 * vld1q_f16(input + i) + shift_8, exponent);
    vst1q_f16(output + i, result);
  }
#endif
  for (; i < len; ++i) {
    output[i] = PowerScalarFunFp16_(scale * input[i] + shift, exponent);
  }
}

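// Element-wise case: each element carries its own exponent, so the
// integer-exponent check must run per element and only the scale/shift part
// is vectorized.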
void PowerSingleFp16(const float16_t *input, const float16_t *exponent, float16_t *output, int len, float scale,
                     float shift) {
  int i = 0;
  PowerScalarFunFp16 PowerScalarFunFp16_ = NULL;
#ifdef ENABLE_NEON
  // As in the broadcast path, only full 8-lane blocks are vectorized so the
  // scalar loop below can finish the tail in bounds.
  int len_c8 = len / C8NUM * C8NUM;
  float16x8_t scale_8 = vmovq_n_f16(scale);
  float16x8_t shift_8 = vmovq_n_f16(shift);
  for (; i < len_c8; i += C8NUM) {
    float16x8_t tmp_8 = scale_8 * vld1q_f16(input + i) + shift_8;
    for (int j = 0; j < 8; ++j) {
      PowerScalarFunFp16_ = CheckInteger(exponent[i + j]) ? OptimizedPowerScalarFp16 : StdPowerScalarFp16;
      output[i + j] = PowerScalarFunFp16_(tmp_8[j], exponent + i + j);
    }
  }
#endif
  for (; i < len; ++i) {
    PowerScalarFunFp16_ = CheckInteger(exponent[i]) ? OptimizedPowerScalarFp16 : StdPowerScalarFp16;
    output[i] = PowerScalarFunFp16_(scale * input[i] + shift, exponent + i);
  }
}

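// Entry point: validates the pointers, then dispatches on whether the
// exponent is broadcast or per-element.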
int PowerFp16(const float16_t *input, const float16_t *exponent, float16_t *output, int len, float scale, float shift,
              bool broadcast) {
  if (input == NULL || exponent == NULL || output == NULL) {
    return NNACL_NULL_PTR;
  }
  PowerFunFp16 PowerFunFp16_ = broadcast ? PowerBroadCastFp16 : PowerSingleFp16;
  PowerFunFp16_(input, exponent, output, len, scale, shift);
  return NNACL_OK;
}
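
For orientation, a minimal sketch of how this entry point can be driven in a
standalone aarch64 build (buffer sizes and values are illustrative, not part
of the patch):

#include <stdio.h>
#include "nnacl/errorcode.h"
#include "nnacl/fp16/power_fp16.h"

int main(void) {
  // Illustrative data, not from the patch.
  float16_t input[4] = {1.0f, 2.0f, 3.0f, 4.0f};
  float16_t exponent = 2.0f;  // integer-valued, so the optimized path is taken
  float16_t output[4] = {0};
  // broadcast = true: one exponent for all elements; scale = 1, shift = 0.
  if (PowerFp16(input, &exponent, output, 4, 1.0f, 0.0f, true) != NNACL_OK) {
    return 1;
  }
  for (int i = 0; i < 4; ++i) {
    printf("%f\n", (float)output[i]);  // prints 1, 4, 9, 16
  }
  return 0;
}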

nnacl/fp16/power_fp16.h (new file)
@@ -0,0 +1,63 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef MINDSPORE_LITE_NNACL_FP16_POWER_FP16_H_
#define MINDSPORE_LITE_NNACL_FP16_POWER_FP16_H_

#include <math.h>
#include "nnacl/op_base.h"
#include "nnacl/power_parameter.h"

#if defined(ENABLE_NEON)
typedef float16x8_t (*PowerSimdFunFp16)(float16x8_t x, const void *exponent);
#endif
typedef float16_t (*PowerScalarFunFp16)(float16_t x, const void *exponent);
typedef void (*PowerFunFp16)(const float16_t *, const float16_t *, float16_t *, int, float, float);

#ifdef __cplusplus
extern "C" {
#endif
static inline bool CheckInteger(float16_t f) { return floorf(f) == f; }

static inline float16_t StdPowerScalarFp16(float16_t x, const void *exponent) {
  return powf(x, *(float16_t *)exponent);
}

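// NEON has no vector pow instruction, so the non-integer fallback calls
// powf lane by lane (each lane is computed through fp32).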
#if defined(ENABLE_NEON)
static inline float16x8_t StdPowerSimdFp16(float16x8_t x, const void *exponent) {
  float16x8_t result;
  result[0] = powf(x[0], *(float16_t *)exponent);
  result[1] = powf(x[1], *(float16_t *)exponent);
  result[2] = powf(x[2], *(float16_t *)exponent);
  result[3] = powf(x[3], *(float16_t *)exponent);
  result[4] = powf(x[4], *(float16_t *)exponent);
  result[5] = powf(x[5], *(float16_t *)exponent);
  result[6] = powf(x[6], *(float16_t *)exponent);
  result[7] = powf(x[7], *(float16_t *)exponent);
  return result;
}
#endif
int PowerFp16(const float16_t *input, const float16_t *exponent, float16_t *output, int len, float scale, float shift,
              bool broadcast);
void PowerSingleFp16(const float16_t *input, const float16_t *exponent, float16_t *output, int len, float scale,
                     float shift);
void PowerBroadCastFp16(const float16_t *input, const float16_t *exponent, float16_t *output, int len, float scale,
                        float shift);
#ifdef __cplusplus
}
#endif

#endif // MINDSPORE_LITE_NNACL_FP16_POWER_FP16_H_

src/runtime/kernel/arm/fp16/power_fp16.cc (new file)
@@ -0,0 +1,129 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "src/runtime/kernel/arm/fp16/power_fp16.h"
#include "schema/model_generated.h"
#include "src/kernel_registry.h"
#include "src/runtime/runtime_api.h"
#include "include/errorcode.h"

using mindspore::lite::KernelRegistrar;
using mindspore::lite::RET_ERROR;
using mindspore::lite::RET_NULL_PTR;
using mindspore::lite::RET_OK;
using mindspore::schema::PrimitiveType_PowFusion;

namespace mindspore::kernel {
int PowerFp16CPUKernel::Init() {
  MS_ASSERT(in_tensors_.size() == 2);
  exp_tensor_ = in_tensors_[1];
  MS_ASSERT(exp_tensor_ != nullptr);
  if (exp_tensor_->IsConst()) {
    auto ret = GetExpData();
    if (ret != RET_OK) {
      MS_LOG(ERROR) << "GetExpData failed in Init!";
      return ret;
    }
  }
  return RET_OK;
}

int PowerFp16CPUKernel::ReSize() { return RET_OK; }

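// A constant fp32 exponent tensor is converted once into an owned fp16 copy;
// an fp16 exponent tensor is used in place without copying.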
int PowerFp16CPUKernel::GetExpData() {
  exp_data_type_ = exp_tensor_->data_type();
  if (exp_data_type_ == kNumberTypeFloat || exp_data_type_ == kNumberTypeFloat32) {
    exp_data_ = reinterpret_cast<float16_t *>(malloc(exp_tensor_->ElementsNum() * sizeof(float16_t)));
    if (exp_data_ == nullptr) {
      MS_LOG(ERROR) << "exp_data_ is nullptr";
      return RET_NULL_PTR;
    }
    auto exp = reinterpret_cast<float *>(exp_tensor_->MutableData());
    if (exp == nullptr) {
      MS_LOG(ERROR) << "exp is nullptr!";
      return RET_NULL_PTR;
    }
    for (int i = 0; i < exp_tensor_->ElementsNum(); ++i) {
      exp_data_[i] = (float16_t)(exp[i]);
    }
  } else {
    exp_data_ = reinterpret_cast<float16_t *>(exp_tensor_->MutableData());
    if (exp_data_ == nullptr) {
      MS_LOG(ERROR) << "exp_data_ is nullptr";
      return RET_NULL_PTR;
    }
  }
  return RET_OK;
}

int PowerImplFp16(void *cdata, int task_id) {
  auto kernel = reinterpret_cast<PowerFp16CPUKernel *>(cdata);
  auto ret = kernel->RunImpl(task_id);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "PowerImplFp16 error: " << ret;
    return ret;
  }
  return RET_OK;
}

int PowerFp16CPUKernel::Run() {
  if (exp_data_ == nullptr) {
    auto ret = GetExpData();
    if (ret != RET_OK) {
      MS_LOG(ERROR) << "GetExpData failed in Run!";
      return ret;
    }
  }
  auto ret = ParallelLaunch(this->context_->thread_pool_, PowerImplFp16, this, thread_count_);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "PowerFp16CPUKernel error: " << ret;
    return RET_ERROR;
  }
  return RET_OK;
}

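// Each task handles one stride-sized slice of the flattened tensor. With a
// broadcast exponent every slice shares the same exponent pointer; otherwise
// the exponent advances together with the input slice.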
int PowerFp16CPUKernel::RunImpl(int task_id) {
  auto x_addr = reinterpret_cast<float16_t *>(in_tensors_.at(0)->MutableData());
  MS_ASSERT(x_addr);
  auto output_addr = reinterpret_cast<float16_t *>(out_tensors_.at(0)->MutableData());
  MS_ASSERT(output_addr);
  auto size = in_tensors_.at(0)->ElementsNum();
  int stride = UP_DIV(size, thread_count_);
  int len = MSMIN(stride, size - stride * task_id);
  if (len <= 0) {
    return RET_OK;
  }
  bool broadcast = in_tensors_[0]->shape() != in_tensors_[1]->shape();
  float16_t *cur_exp = nullptr;
  if (broadcast) {
    cur_exp = exp_data_;
  } else {
    cur_exp = exp_data_ + stride * task_id;
  }
  PowerFp16(x_addr + stride * task_id, cur_exp, output_addr + stride * task_id, len, scale_, shift_, broadcast);
  return RET_OK;
}

PowerFp16CPUKernel::~PowerFp16CPUKernel() {
  // exp_data_ is owned by this kernel only when it was converted from fp32.
  if ((exp_data_type_ == kNumberTypeFloat || exp_data_type_ == kNumberTypeFloat32) && exp_data_ != nullptr) {
    free(exp_data_);
    exp_data_ = nullptr;
  }
}

REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_PowFusion, LiteKernelCreator<PowerFp16CPUKernel>)
} // namespace mindspore::kernel

src/runtime/kernel/arm/fp16/power_fp16.h (new file)
@@ -0,0 +1,52 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_POWER_H_
#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_POWER_H_

#include <vector>
#include "src/lite_kernel.h"
#include "include/context.h"
#include "mindspore/lite/nnacl/fp16/power_fp16.h"

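// exp_data_ caches the exponent in fp16; exp_data_type_ records the source
// tensor type so the destructor knows whether exp_data_ is an owned
// fp32-to-fp16 copy or borrowed tensor memory.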
namespace mindspore::kernel {
class PowerFp16CPUKernel : public LiteKernel {
 public:
  PowerFp16CPUKernel(OpParameter *param, const std::vector<lite::Tensor *> &inputs,
                     const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx)
      : LiteKernel(param, inputs, outputs, ctx),
        thread_count_(ctx->thread_num_),
        scale_(reinterpret_cast<PowerParameter *>(op_parameter_)->scale_),
        shift_(reinterpret_cast<PowerParameter *>(op_parameter_)->shift_) {}
  ~PowerFp16CPUKernel() override;

  int Init() override;
  int ReSize() override;
  int Run() override;
  int RunImpl(int task_id);

 private:
  int GetExpData();
  int thread_count_;
  float scale_;
  float shift_;
  float16_t *exp_data_ = nullptr;
  lite::Tensor *exp_tensor_ = nullptr;
  // Initialized so the destructor's type check is well defined even if
  // GetExpData() never ran.
  TypeId exp_data_type_ = kNumberTypeFloat16;
};
} // namespace mindspore::kernel

#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_POWER_H_