!12967 [ms][lite][cpu] biasadd bug fix

From: @lzkcode
Reviewed-by: @zhang_xue_tong,@zhanghaibo5
Signed-off-by: @zhang_xue_tong
pull/12967/MERGE
Committed by: mindspore-ci-bot (via Gitee)
Commit: b50381ebf9

@@ -31,99 +31,57 @@ int8_t MinInt8(int8_t a, int8_t b) { return b ^ ((a ^ b) & -(a < b)); }
 int8_t MaxInt8(int8_t a, int8_t b) { return a ^ ((a ^ b) & -(a < b)); }

 void ReluFp32(float *data, float *dst, int ele_num) {
-  int four_block = UP_DIV(ele_num, C4NUM);
-  for (int i = 0; i < four_block - 1; i++) {
-    int index = i * C4NUM;
-#ifdef ENABLE_NEON
-    float32x4_t relu_data = vld1q_f32(data + index);
-    float32x4_t zero_data = vdupq_n_f32(0);
-    relu_data = vmaxq_f32(relu_data, zero_data);
-    vst1q_f32(dst + index, relu_data);
-#else
-    data[index] = data[index] < 0 ? 0 : data[index];
-    data[index + 1] = data[index + 1] < 0 ? 0 : data[index + 1];
-    data[index + 2] = data[index + 2] < 0 ? 0 : data[index + 2];
-    data[index + 3] = data[index + 3] < 0 ? 0 : data[index + 3];
-#endif
-  }
-  for (int j = (four_block - 1) * C4NUM; j < ele_num; ++j) {
-    data[j] = data[j] < 0 ? 0 : data[j];
+  int index = 0;
+#ifdef ENABLE_AVX
+  int c8_block = DOWN_DIV(ele_num, C8NUM) * C8NUM;
+  for (; index < c8_block; index += C8NUM) {
+    MS_FLOAT32X8 relu_data = MS_LD256_F32(data + index);
+    MS_FLOAT32X8 zero_data = MS_MOV256_F32(0.0f);
+    relu_data = MS_MAX256_F32(relu_data, zero_data);
+    MS_ST256_F32(dst + index, relu_data);
+  }
+#endif
+#if defined(ENABLE_NEON) || defined(ENABLE_SSE)
+  int c4_block = DOWN_DIV(ele_num, C4NUM) * C4NUM;
+  for (; index < c4_block; index += C4NUM) {
+    MS_FLOAT32X4 relu_data = MS_LDQ_F32(data + index);
+    MS_FLOAT32X4 zero_data = MS_MOVQ_F32(0.0f);
+    relu_data = MS_MAXQ_F32(relu_data, zero_data);
+    MS_STQ_F32(dst + index, relu_data);
+  }
+#endif
+  for (; index < ele_num; ++index) {
+    data[index] = data[index] < 0.0f ? 0.0f : data[index];
   }
 }

-void Relu6Fp32(float *data, float *dst, int ele_num) {
-  int four_block = UP_DIV(ele_num, C4NUM);
-  for (int i = 0; i < four_block - 1; i++) {
-    int index = i * C4NUM;
-#ifdef ENABLE_NEON
-    float32x4_t relu6_data = vld1q_f32(data + index);
-    float32x4_t zero_data = vdupq_n_f32(0);
-    float32x4_t six_data = vdupq_n_f32(6);
-    relu6_data = vmaxq_f32(relu6_data, zero_data);
-    relu6_data = vminq_f32(relu6_data, six_data);
-    vst1q_f32(dst + index, relu6_data);
-#else
-    data[index] = data[index] < 0 ? 0 : data[index];
-    data[index] = data[index] > 6 ? 6 : data[index];
-    data[index + 1] = data[index + 1] < 0 ? 0 : data[index + 1];
-    data[index + 1] = data[index + 1] > 6 ? 6 : data[index + 1];
-    data[index + 2] = data[index + 2] < 0 ? 0 : data[index + 2];
-    data[index + 2] = data[index + 2] > 6 ? 6 : data[index + 2];
-    data[index + 3] = data[index + 3] < 0 ? 0 : data[index + 3];
-    data[index + 3] = data[index + 3] > 6 ? 6 : data[index + 3];
-#endif
-  }
-  for (int j = (four_block - 1) * C4NUM; j < ele_num; ++j) {
-    data[j] = data[j] < 0 ? 0 : data[j];
-    data[j] = data[j] > 6 ? 6 : data[j];
-  }
-}
-
-#ifdef ENABLE_AVX
-#ifdef WIN32
-void ReluFp32C8(float *data, float *dst, int ele_num) {
-  int four_block = UP_DIV(ele_num, C8NUM);
-  for (int i = 0; i < four_block - 1; i++) {
-    int index = i * C8NUM;
-    data[index] = data[index] < 0 ? 0 : data[index];
-    data[index + 1] = data[index + 1] < 0 ? 0 : data[index + 1];
-    data[index + 2] = data[index + 2] < 0 ? 0 : data[index + 2];
-    data[index + 3] = data[index + 3] < 0 ? 0 : data[index + 3];
-    data[index + 4] = data[index + 4] < 0 ? 0 : data[index + 4];
-    data[index + 5] = data[index + 5] < 0 ? 0 : data[index + 5];
-    data[index + 6] = data[index + 6] < 0 ? 0 : data[index + 6];
-    data[index + 7] = data[index + 7] < 0 ? 0 : data[index + 7];
-  }
-  for (int j = (four_block - 1) * C8NUM; j < ele_num; ++j) {
-    data[j] = data[j] < 0 ? 0 : data[j];
-  }
-}
-void Relu6Fp32C8(float *data, float *dst, int ele_num) {
-  int four_block = UP_DIV(ele_num, C8NUM);
-  for (int i = 0; i < four_block - 1; i++) {
-    int index = i * C8NUM;
-    data[index] = data[index] < 0 ? 0 : data[index];
-    data[index] = data[index] > 6 ? 6 : data[index];
-    data[index + 1] = data[index + 1] < 0 ? 0 : data[index + 1];
-    data[index + 1] = data[index + 1] > 6 ? 6 : data[index + 1];
-    data[index + 2] = data[index + 2] < 0 ? 0 : data[index + 2];
-    data[index + 2] = data[index + 2] > 6 ? 6 : data[index + 2];
-    data[index + 3] = data[index + 3] < 0 ? 0 : data[index + 3];
-    data[index + 3] = data[index + 3] > 6 ? 6 : data[index + 3];
-    data[index + 4] = data[index + 4] < 0 ? 0 : data[index + 4];
-    data[index + 4] = data[index + 4] > 6 ? 6 : data[index + 4];
-    data[index + 5] = data[index + 5] < 0 ? 0 : data[index + 5];
-    data[index + 5] = data[index + 5] > 6 ? 6 : data[index + 5];
-    data[index + 6] = data[index + 6] < 0 ? 0 : data[index + 6];
-    data[index + 6] = data[index + 6] > 6 ? 6 : data[index + 6];
-    data[index + 7] = data[index + 7] < 0 ? 0 : data[index + 7];
-    data[index + 7] = data[index + 7] > 6 ? 6 : data[index + 7];
-  }
-  for (int j = (four_block - 1) * C8NUM; j < ele_num; ++j) {
-    data[j] = data[j] < 0 ? 0 : data[j];
-    data[j] = data[j] > 6 ? 6 : data[j];
-  }
-}
-#endif
-#endif
+void Relu6Fp32(float *data, float *dst, int ele_num) {
+  int index = 0;
+#ifdef ENABLE_AVX
+  int c8_block = DOWN_DIV(ele_num, C8NUM) * C8NUM;
+  for (; index < c8_block; index += C8NUM) {
+    MS_FLOAT32X8 relu6_data = MS_LD256_F32(data + index);
+    MS_FLOAT32X8 zero_data = MS_MOV256_F32(0.0f);
+    MS_FLOAT32X8 six_data = MS_MOV256_F32(6.0f);
+    relu6_data = MS_MAX256_F32(relu6_data, zero_data);
+    relu6_data = MS_MIN256_F32(relu6_data, six_data);
+    MS_ST256_F32(dst + index, relu6_data);
+  }
+#endif
+#if defined(ENABLE_NEON) || defined(ENABLE_SSE)
+  int c4_block = DOWN_DIV(ele_num, C4NUM) * C4NUM;
+  for (; index < c4_block; index += C4NUM) {
+    MS_FLOAT32X4 relu6_data = MS_LDQ_F32(data + index);
+    MS_FLOAT32X4 zero_data = MS_MOVQ_F32(0.0f);
+    MS_FLOAT32X4 six_data = MS_MOVQ_F32(6.0f);
+    relu6_data = MS_MAXQ_F32(relu6_data, zero_data);
+    relu6_data = MS_MINQ_F32(relu6_data, six_data);
+    MS_STQ_F32(dst + index, relu6_data);
+  }
+#endif
+  for (; index < ele_num; ++index) {
+    data[index] = data[index] < 0.0f ? 0.0f : data[index];
+    data[index] = data[index] > 6.0f ? 6.0f : data[index];
+  }
+}
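For readers unfamiliar with the nnacl loop style: the rewritten ReluFp32/Relu6Fp32 replace the old "UP_DIV minus one block" arithmetic and the separate Win32 C8 variants with a single cursor that first walks DOWN_DIV-sized SIMD blocks (8-wide under AVX, then 4-wide under NEON/SSE) and finishes the leftover 0..3 elements scalarly. The MS_* names appear to be MindSpore's cross-platform SIMD wrapper macros; the self-contained sketch below is illustration only, substituting a plain scalar block for the intrinsics and defining UP_DIV/DOWN_DIV locally.

#include <cstdio>

#define UP_DIV(x, y) (((x) + (y) - 1) / (y))    // blocks needed to cover x elements (old style)
#define DOWN_DIV(x, y) ((x) / (y))              // blocks that fit entirely in x elements (new style)
#define C4NUM 4

// Scalar stand-in for one 4-wide SIMD iteration (vld1q/vmaxq or the MS_*Q macros).
static void ReluBlock4(const float *src, float *dst) {
  for (int i = 0; i < C4NUM; ++i) {
    dst[i] = src[i] < 0.0f ? 0.0f : src[i];
  }
}

// Same control flow as the rewritten ReluFp32: full blocks first, scalar tail after.
void ReluSketch(const float *src, float *dst, int ele_num) {
  int index = 0;
  int c4_block = DOWN_DIV(ele_num, C4NUM) * C4NUM;  // largest multiple of 4 <= ele_num
  for (; index < c4_block; index += C4NUM) {
    ReluBlock4(src + index, dst + index);
  }
  for (; index < ele_num; ++index) {              // 0..3 leftover elements
    dst[index] = src[index] < 0.0f ? 0.0f : src[index];
  }
}

int main() {
  float in[6] = {-1.0f, 2.0f, -3.0f, 4.0f, -5.0f, 6.0f};
  float out[6] = {0};
  ReluSketch(in, out, 6);
  for (int i = 0; i < 6; ++i) printf("%.1f ", out[i]);  // 0.0 2.0 0.0 4.0 0.0 6.0
  printf("\n");
  return 0;
}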

@@ -17,7 +17,7 @@
 #include <vector>
 #include "include/errorcode.h"
 #include "schema/model_generated.h"
-#include "src/runtime/kernel/arm/fp16/bias_fp16.h"
+#include "src/runtime/kernel/arm/fp16/biasadd_fp16.h"
 #include "src/kernel_registry.h"

 using mindspore::kernel::KERNEL_ARCH::kCPU;
@@ -29,7 +29,7 @@ using mindspore::schema::PrimitiveType_BiasAdd;

 namespace mindspore::kernel {
-int BiasCPUFp16Kernel::ReSize() {
+int BiasAddCPUFp16Kernel::ReSize() {
   auto dims = in_tensors_.at(0)->shape();
   bias_param_->ndim_ = dims.size();
   if (bias_param_->ndim_ < 1 || bias_param_->ndim_ > 5) {
@@ -45,13 +45,20 @@ int BiasCPUFp16Kernel::ReSize() {
   return RET_OK;
 }

-int BiasCPUFp16Kernel::Run() {
+int BiasAddCPUFp16Kernel::Run() {
+  if (bias_data_ == nullptr) {
+    auto ret = GetBiasData();
+    if (ret != RET_OK) {
+      MS_LOG(ERROR) << "GetBiasData is error in run!";
+      return ret;
+    }
+  }
   auto in = reinterpret_cast<float16_t *>(in_tensors_.at(0)->MutableData());
   auto out = reinterpret_cast<float16_t *>(out_tensors_.at(0)->MutableData());
   size_t data_size = in_tensors_.at(0)->ElementsNum();
   MS_ASSERT(context_->allocator != nullptr);
-  auto *tile_in = reinterpret_cast<float16_t *>(context_->allocator->Malloc(data_size * sizeof(float16_t)));
-  auto *tile_bias = reinterpret_cast<float16_t *>(context_->allocator->Malloc(data_size * sizeof(float16_t)));
+  auto tile_in = reinterpret_cast<float16_t *>(context_->allocator->Malloc(data_size * sizeof(float16_t)));
+  auto tile_bias = reinterpret_cast<float16_t *>(context_->allocator->Malloc(data_size * sizeof(float16_t)));
   if (tile_in == nullptr || tile_bias == nullptr) {
     MS_LOG(ERROR) << "Memory allocation failed";
     context_->allocator->Free(tile_in);
@@ -64,43 +71,54 @@ int BiasCPUFp16Kernel::Run() {
   return RET_OK;
 }

-BiasCPUFp16Kernel::~BiasCPUFp16Kernel() {
+BiasAddCPUFp16Kernel::~BiasAddCPUFp16Kernel() {
   if ((bias_data_type_ == kNumberTypeFloat || bias_data_type_ == kNumberTypeFloat32) && bias_data_ != nullptr) {
     free(bias_data_);
     bias_data_ = nullptr;
   }
 }

-int BiasCPUFp16Kernel::Init() {
-  auto bias_tensor = in_tensors_.at(1);
-  MS_ASSERT(bias_tensor != nullptr);
-  bias_data_type_ = bias_tensor->data_type();
+int BiasAddCPUFp16Kernel::GetBiasData() {
+  bias_data_type_ = bias_tensor_->data_type();
   if (bias_data_type_ == kNumberTypeFloat || bias_data_type_ == kNumberTypeFloat32) {
-    bias_data_ = reinterpret_cast<float16_t *>(malloc(bias_tensor->ElementsNum() * sizeof(float16_t)));
+    bias_data_ = reinterpret_cast<float16_t *>(malloc(bias_tensor_->ElementsNum() * sizeof(float16_t)));
     if (bias_data_ == nullptr) {
       MS_LOG(ERROR) << "bias_data_ is nullptr";
       return RET_NULL_PTR;
     }
-    auto *bias = reinterpret_cast<float *>(bias_tensor->MutableData());
+    auto bias = reinterpret_cast<float *>(bias_tensor_->MutableData());
     if (bias == nullptr) {
       MS_LOG(ERROR) << "bias is nullptr!";
       return RET_NULL_PTR;
     }
-    for (int i = 0; i < bias_tensor->ElementsNum(); ++i) {
+    for (int i = 0; i < bias_tensor_->ElementsNum(); ++i) {
       bias_data_[i] = (float16_t)(bias[i]);
     }
   } else {
-    bias_data_ = reinterpret_cast<float16_t *>(bias_tensor->MutableData());
+    bias_data_ = reinterpret_cast<float16_t *>(bias_tensor_->MutableData());
     if (bias_data_ == nullptr) {
       MS_LOG(ERROR) << "bias_data_ is nullptr";
       return RET_NULL_PTR;
     }
   }
+  return RET_OK;
+}
+
+int BiasAddCPUFp16Kernel::Init() {
+  bias_tensor_ = in_tensors_.at(1);
+  MS_ASSERT(bias_tensor_ != nullptr);
+  if (bias_tensor_->IsConst()) {
+    auto ret = GetBiasData();
+    if (ret != RET_OK) {
+      MS_LOG(ERROR) << "GetBiasData is error in Init()!";
+      return ret;
+    }
+  }
   if (!InferShapeDone()) {
     return RET_OK;
   }
   return ReSize();
 }

-REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_BiasAdd, LiteKernelCreator<BiasCPUFp16Kernel>)
+REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_BiasAdd, LiteKernelCreator<BiasAddCPUFp16Kernel>)
 }  // namespace mindspore::kernel
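The Init()/Run() reshuffle is the kernel-side part of the fix: the old BiasCPUFp16Kernel converted the bias tensor to float16 unconditionally in Init(), while the renamed BiasAddCPUFp16Kernel moves that conversion into GetBiasData() and calls it from Init() only when the bias tensor IsConst(), falling back to the first Run() otherwise, presumably because a non-const bias has no data available yet at Init time. Below is a minimal, hypothetical sketch of that lazy-conversion pattern; FakeTensor and BiasSketchKernel are illustration-only stand-ins, not the lite::Tensor / LiteKernel API.

#include <cstdio>
#include <vector>

struct FakeTensor {                 // illustration-only stand-in for lite::Tensor
  bool is_const = false;
  std::vector<float> data;          // fp32 payload; may be empty until runtime
};

class BiasSketchKernel {
 public:
  explicit BiasSketchKernel(FakeTensor *bias) : bias_tensor_(bias) {}

  int Init() {
    // Const bias: the data already exists, convert it once up front.
    if (bias_tensor_->is_const) {
      return GetBiasData();
    }
    return 0;                       // non-const bias: defer conversion to Run()
  }

  int Run() {
    // First Run() with a non-const bias: convert now that the data exists.
    if (bias_data_.empty()) {
      int ret = GetBiasData();
      if (ret != 0) {
        return ret;
      }
    }
    for (float v : bias_data_) printf("%.1f ", v);
    printf("\n");
    return 0;
  }

 private:
  int GetBiasData() {
    if (bias_tensor_->data.empty()) {
      return -1;                    // nothing to convert yet
    }
    // The real kernel converts fp32 to float16_t; a plain copy is enough here.
    bias_data_.assign(bias_tensor_->data.begin(), bias_tensor_->data.end());
    return 0;
  }

  FakeTensor *bias_tensor_ = nullptr;
  std::vector<float> bias_data_;
};

int main() {
  FakeTensor bias;                  // non-const: data arrives only before Run()
  BiasSketchKernel kernel(&bias);
  kernel.Init();                    // conversion deferred
  bias.data = {1.0f, 2.0f, 3.0f};
  return kernel.Run() == 0 ? 0 : 1; // prints "1.0 2.0 3.0"
}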

@@ -14,31 +14,33 @@
  * limitations under the License.
  */

-#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_BIAS_H_
-#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_BIAS_H_
+#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_BIASADD_H_
+#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_BIASADD_H_

 #include <vector>
 #include "src/lite_kernel.h"
 #include "nnacl/fp16/arithmetic_fp16.h"

 namespace mindspore::kernel {
-class BiasCPUFp16Kernel : public LiteKernel {
+class BiasAddCPUFp16Kernel : public LiteKernel {
  public:
-  BiasCPUFp16Kernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
-                    const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx)
+  BiasAddCPUFp16Kernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
+                       const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx)
       : LiteKernel(parameter, inputs, outputs, ctx) {
     bias_param_ = reinterpret_cast<ArithmeticParameter *>(parameter);
   }
-  ~BiasCPUFp16Kernel() override;
+  ~BiasAddCPUFp16Kernel() override;

   int Init() override;
   int ReSize() override;
   int Run() override;

  private:
+  int GetBiasData();
   ArithmeticParameter *bias_param_ = nullptr;
   float16_t *bias_data_ = nullptr;
+  lite::Tensor *bias_tensor_ = nullptr;
   TypeId bias_data_type_;
 };
 }  // namespace mindspore::kernel

-#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_BIAS_H_
+#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_BIASADD_H_