diff --git a/mindspore/lite/nnacl/fp16/scale_fp16.c b/mindspore/lite/nnacl/fp16/scale_fp16.c new file mode 100644 index 0000000000..1b96499484 --- /dev/null +++ b/mindspore/lite/nnacl/fp16/scale_fp16.c @@ -0,0 +1,223 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "nnacl/fp16/scale_fp16.h" + +void ScaleInner(float16_t *in_data, float16_t *out_data, float16_t *scale, float16_t *offset, int outer_start, + int outer_end, int axis_size, int inner_size) { + for (int out = outer_start; out < outer_end; out++) { + int out_offset = out * axis_size * inner_size; + for (int i = 0; i < axis_size; i++) { + int axis_offset = out_offset + i * inner_size; + int in_index = 0; +#ifdef ENABLE_ARM64 + for (; in_index < inner_size - 8; in_index += 8) { + int in_offset = axis_offset + in_index; + float16x8_t data = vld1q_f16(in_data + in_offset); + float16x8_t scale_8 = vdupq_n_f16(scale[i]); + float16x8_t offset_8 = vdupq_n_f16(offset[i]); + float16x8_t reslut = vfmaq_f16(offset_8, data, scale_8); + + vst1q_f16(out_data + in_offset, reslut); + } +#endif + for (; in_index < inner_size; in_index++) { + int in_offset = axis_offset + in_index; + out_data[in_offset] = in_data[in_offset] * scale[i] + offset[i]; + } + } + } +} + +void ScaleAxis(float16_t *in_data, float16_t *out_data, float16_t *scale, float16_t *offset, int outer_start, + int outer_end, int axis_size) { + for (int out = outer_start; out < outer_end; out++) { + int out_offset = out * axis_size; + int index = 0; +#ifdef ENABLE_ARM64 + for (; index < axis_size - 8; index += 8) { + int in_offset = out_offset + index; + float16x8_t data = vld1q_f16(in_data + in_offset); + float16x8_t scale_8 = vld1q_f16(scale + index); + float16x8_t offset_8 = vld1q_f16(offset + index); + float16x8_t reslut = vfmaq_f16(offset_8, data, scale_8); + vst1q_f16(out_data + in_offset, reslut); + } +#endif + for (; index < axis_size; index++) { + int in_offset = out_offset + index; + out_data[in_offset] = in_data[in_offset] * scale[index] + offset[index]; + } + } +} + +void DoScaleFp16(float16_t *in_data, float16_t *out_data, float16_t *scale, float16_t *offset, int task_id, + ScaleParameter *scale_param) { + int outer_step = UP_DIV(scale_param->outer_size_, scale_param->op_parameter_.thread_num_); + int outer_start = task_id * outer_step; + int outer_end = MSMIN(outer_start + outer_step, scale_param->outer_size_); + + if (scale_param->inner_size_ == 1) { + ScaleAxis(in_data, out_data, scale, offset, outer_start, outer_end, scale_param->axis_size_); + } else { + ScaleInner(in_data, out_data, scale, offset, outer_start, outer_end, scale_param->axis_size_, + scale_param->inner_size_); + } +} + +void ScaleInnerRelu(float16_t *in_data, float16_t *out_data, float16_t *scale, float16_t *offset, int outer_start, + int outer_end, int axis_size, int inner_size) { +#ifdef ENABLE_ARM64 + float16x8_t zeros = {0, 0, 0, 0, 0, 0, 0, 0}; +#endif + for (int out = outer_start; out < outer_end; out++) { + int out_offset = out * axis_size * inner_size; + for (int i = 0; i < axis_size; i++) { + int axis_offset = out_offset + i * inner_size; + int in_index = 0; +#ifdef ENABLE_ARM64 + for (; in_index < inner_size - 8; in_index += 8) { + int in_offset = axis_offset + in_index; + float16x8_t data = vld1q_f16(in_data + in_offset); + float16x8_t scale_8 = vdupq_n_f16(scale[i]); + float16x8_t offset_8 = vdupq_n_f16(offset[i]); + float16x8_t tmp = vfmaq_f16(offset_8, data, scale_8); + float16x8_t result = vmaxq_f16(tmp, zeros); + vst1q_f16(out_data + in_offset, result); + } +#endif + for (; in_index < inner_size; in_index++) { + int in_offset = axis_offset + in_index; + float tmp = in_data[in_offset] * scale[i] + offset[i]; + out_data[in_offset] = tmp > 0.0f ? tmp : 0.0f; + } + } + } +} + +void ScaleAxisRelu(float16_t *in_data, float16_t *out_data, float16_t *scale, float16_t *offset, int outer_start, + int outer_end, int axis_size) { +#ifdef ENABLE_ARM64 + float16x8_t zeros = {0, 0, 0, 0, 0, 0, 0, 0}; +#endif + for (int out = outer_start; out < outer_end; out++) { + int out_offset = out * axis_size; + int index = 0; +#ifdef ENABLE_ARM64 + for (; index < axis_size - 8; index += 8) { + int in_offset = out_offset + index; + float16x8_t data = vld1q_f16(in_data + in_offset); + float16x8_t scale_8 = vld1q_f16(scale + index); + float16x8_t offset_8 = vld1q_f16(offset + index); + float16x8_t tmp = vfmaq_f16(offset_8, data, scale_8); + float16x8_t result = vmaxq_f16(tmp, zeros); + vst1q_f16(out_data + in_offset, result); + } +#endif + for (; index < axis_size; index++) { + int in_offset = out_offset + index; + float tmp = in_data[in_offset] * scale[index] + offset[index]; + out_data[in_offset] = tmp > 0.0f ? tmp : 0.0f; + } + } +} + +void DoScaleReluFp16(float16_t *in_data, float16_t *out_data, float16_t *scale, float16_t *offset, int task_id, + ScaleParameter *scale_param) { + int outer_step = UP_DIV(scale_param->outer_size_, scale_param->op_parameter_.thread_num_); + int outer_start = task_id * outer_step; + int outer_end = MSMIN(outer_start + outer_step, scale_param->outer_size_); + + if (scale_param->inner_size_ == 1) { + ScaleAxisRelu(in_data, out_data, scale, offset, outer_start, outer_end, scale_param->axis_size_); + } else { + ScaleInnerRelu(in_data, out_data, scale, offset, outer_start, outer_end, scale_param->axis_size_, + scale_param->inner_size_); + } +} + +void ScaleInnerRelu6(float16_t *in_data, float16_t *out_data, float16_t *scale, float16_t *offset, int outer_start, + int outer_end, int axis_size, int inner_size) { +#ifdef ENABLE_ARM64 + float16x8_t zeros = {0, 0, 0, 0, 0, 0, 0, 0}; + float16x8_t bounds = {6, 6, 6, 6, 6, 6, 6, 6}; +#endif + for (int out = outer_start; out < outer_end; out++) { + int out_offset = out * axis_size * inner_size; + for (int i = 0; i < axis_size; i++) { + int axis_offset = out_offset + i * inner_size; + int in_index = 0; +#ifdef ENABLE_ARM64 + for (; in_index < inner_size - 8; in_index += 8) { + int in_offset = axis_offset + in_index; + float16x8_t data = vld1q_f16(in_data + in_offset); + float16x8_t scale_8 = vdupq_n_f16(scale[i]); + float16x8_t offset_8 = vdupq_n_f16(offset[i]); + float16x8_t tmp = vfmaq_f16(offset_8, data, scale_8); + float16x8_t result = vminq_f16(vmaxq_f16(tmp, zeros), bounds); + vst1q_f16(out_data + in_offset, result); + } +#endif + for (; in_index < inner_size; in_index++) { + int in_offset = axis_offset + in_index; + float tmp = in_data[in_offset] * scale[i] + offset[i]; + out_data[in_offset] = MSMIN(MSMAX(tmp, 0.0f), 6.0f); + } + } + } +} + +void ScaleAxisRelu6(float16_t *in_data, float16_t *out_data, float16_t *scale, float16_t *offset, int outer_start, + int outer_end, int axis_size) { +#ifdef ENABLE_ARM64 + float16x8_t zeros = {0, 0, 0, 0, 0, 0, 0, 0}; + float16x8_t bounds = {6, 6, 6, 6, 6, 6, 6, 6}; +#endif + for (int out = outer_start; out < outer_end; out++) { + int out_offset = out * axis_size; + int index = 0; +#ifdef ENABLE_ARM64 + for (; index < axis_size - 8; index += 8) { + int in_offset = out_offset + index; + float16x8_t data = vld1q_f16(in_data + in_offset); + float16x8_t scale_8 = vld1q_f16(scale + index); + float16x8_t offset_8 = vld1q_f16(offset + index); + float16x8_t tmp = vfmaq_f16(offset_8, data, scale_8); + float16x8_t result = vminq_f16(vmaxq_f16(tmp, zeros), bounds); + vst1q_f16(out_data + in_offset, result); + } +#endif + for (; index < axis_size; index++) { + int in_offset = out_offset + index; + float tmp = in_data[in_offset] * scale[index] + offset[index]; + out_data[in_offset] = MSMIN(MSMAX(tmp, 0.0f), 6.0f); + } + } +} + +void DoScaleRelu6Fp16(float16_t *in_data, float16_t *out_data, float16_t *scale, float16_t *offset, int task_id, + ScaleParameter *scale_param) { + int outer_step = UP_DIV(scale_param->outer_size_, scale_param->op_parameter_.thread_num_); + int outer_start = task_id * outer_step; + int outer_end = MSMIN(outer_start + outer_step, scale_param->outer_size_); + + if (scale_param->inner_size_ == 1) { + ScaleAxisRelu6(in_data, out_data, scale, offset, outer_start, outer_end, scale_param->axis_size_); + } else { + ScaleInnerRelu6(in_data, out_data, scale, offset, outer_start, outer_end, scale_param->axis_size_, + scale_param->inner_size_); + } +} diff --git a/mindspore/lite/nnacl/fp16/scale_fp16.h b/mindspore/lite/nnacl/fp16/scale_fp16.h new file mode 100644 index 0000000000..036d02a7e7 --- /dev/null +++ b/mindspore/lite/nnacl/fp16/scale_fp16.h @@ -0,0 +1,38 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_LITE_NNACL_SCALE_FP16_H_ +#define MINDSPORE_LITE_NNACL_SCALE_FP16_H_ + +#include "nnacl/op_base.h" +#include "nnacl/scale.h" +#ifdef ENABLE_NEON +#include +#endif +#ifdef __cplusplus +extern "C" { +#endif +void DoScaleFp16(float16_t *in_data, float16_t *out_data, float16_t *scale, float16_t *offset, int task_id, + ScaleParameter *scale_param); +void DoScaleReluFp16(float16_t *in_data, float16_t *out_data, float16_t *scale, float16_t *offset, int task_id, + ScaleParameter *scale_param); +void DoScaleRelu6Fp16(float16_t *in_data, float16_t *out_data, float16_t *scale, float16_t *offset, int task_id, + ScaleParameter *scale_param); +#ifdef __cplusplus +} +#endif + +#endif // MINDSPORE_LITE_NNACL_SCALE_FP16_H_ diff --git a/mindspore/lite/nnacl/fp16/stack_fp16.c b/mindspore/lite/nnacl/fp16/stack_fp16.c new file mode 100644 index 0000000000..1a282d91dd --- /dev/null +++ b/mindspore/lite/nnacl/fp16/stack_fp16.c @@ -0,0 +1,54 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "nnacl/fp16/stack_fp16.h" +#include "nnacl/arithmetic_common.h" + +size_t GetStackCopyNum(int axis, int *in_shape, size_t shape_size) { + size_t one_input_size = 1; + for (size_t i = 0; i < shape_size; ++i) { + one_input_size *= in_shape[i]; + } + int in_strides[4]; + ComputeStrides(in_shape, in_strides, shape_size); + + size_t copy_num = axis > 0 ? in_strides[axis - 1] : one_input_size; + return copy_num; +} + +size_t GetStackPreAxisCount(const int *in_shape, int axis) { + size_t pre_axis_count = 1; + for (size_t i = 0; i < axis; ++i) { + pre_axis_count *= in_shape[i]; + } + return pre_axis_count; +} + +void DoStackFp16(const float16_t *const *inputs, size_t input_num, int *in_shape, size_t shape_size, int axis, + float16_t *output) { + size_t copy_num = GetStackCopyNum(axis, in_shape, shape_size); + size_t copy_size = copy_num * sizeof(float16_t); + size_t pre_axis_count = GetStackPreAxisCount(in_shape, axis); + size_t in_offset = 0; + size_t out_offset = 0; + for (size_t i = 0; i < pre_axis_count; ++i) { + for (size_t j = 0; j < input_num; ++j) { + memcpy(output + out_offset, inputs[j] + in_offset, copy_size); + out_offset += copy_num; + } + in_offset += copy_num; + } +} diff --git a/mindspore/lite/nnacl/fp16/stack_fp16.h b/mindspore/lite/nnacl/fp16/stack_fp16.h new file mode 100644 index 0000000000..83062a7925 --- /dev/null +++ b/mindspore/lite/nnacl/fp16/stack_fp16.h @@ -0,0 +1,33 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef MINDSPORE_LITE_NNACL_FP16_STACK_FP16_H_ +#define MINDSPORE_LITE_NNACL_FP16_STACK_FP16_H_ + +#include "nnacl/op_base.h" +#ifdef ENABLE_NEON +#include +#endif + +#ifdef __cplusplus +extern "C" { +#endif +void DoStackFp16(const float16_t *const *inputs, size_t input_num, int *in_shape, size_t shape_size, int axis, + float16_t *output); +#ifdef __cplusplus +} +#endif + +#endif // MINDSPORE_LITE_NNACL_FP16_STACK_FP16_H_ diff --git a/mindspore/lite/nnacl/fp32/stack.h b/mindspore/lite/nnacl/fp32/stack.h index 276cf77243..bf12641a90 100644 --- a/mindspore/lite/nnacl/fp32/stack.h +++ b/mindspore/lite/nnacl/fp32/stack.h @@ -18,11 +18,6 @@ #include "nnacl/op_base.h" -typedef struct StackParameter { - OpParameter op_parameter_; - int32_t axis_; -} StackParameter; - #ifdef __cplusplus extern "C" { #endif diff --git a/mindspore/lite/nnacl/stack_parameter.h b/mindspore/lite/nnacl/stack_parameter.h new file mode 100644 index 0000000000..aff9581dab --- /dev/null +++ b/mindspore/lite/nnacl/stack_parameter.h @@ -0,0 +1,26 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_LITE_NNACL_STACK_PARAMETER_H_ +#define MINDSPORE_LITE_NNACL_STACK_PARAMETER_H_ + +#include "nnacl/op_base.h" +typedef struct StackParameter { + OpParameter op_parameter_; + int32_t axis_; +} StackParameter; + +#endif // MINDSPORE_LITE_NNACL_STACK_PARAMETER_H_ diff --git a/mindspore/lite/src/populate_parameter.cc b/mindspore/lite/src/populate_parameter.cc index a938e49e0e..f9d93610c5 100644 --- a/mindspore/lite/src/populate_parameter.cc +++ b/mindspore/lite/src/populate_parameter.cc @@ -126,7 +126,7 @@ #include "nnacl/prelu_parameter.h" #include "nnacl/shape.h" #include "nnacl/fp32/constant_of_shape.h" -#include "nnacl/fp32/stack.h" +#include "nnacl/stack_parameter.h" #include "nnacl/unstack.h" #include "nnacl/depth_to_space.h" #include "nnacl/conv_parameter.h" diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/scale_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/scale_fp16.cc new file mode 100644 index 0000000000..c278491ec6 --- /dev/null +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/scale_fp16.cc @@ -0,0 +1,214 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "src/runtime/kernel/arm/fp16/scale_fp16.h" +#include +#include +#include "schema/model_generated.h" +#include "src/kernel_registry.h" +#include "include/errorcode.h" +#include "src/runtime/runtime_api.h" +#include "src/runtime/kernel/arm/fp16/common_fp16.h" +#include "nnacl/fp16/scale_fp16.h" +#include "nnacl/fp16/cast_fp16.h" + +using mindspore::lite::KernelRegistrar; +using mindspore::lite::RET_ERROR; +using mindspore::lite::RET_OK; +using mindspore::schema::PrimitiveType_Scale; + +namespace mindspore::kernel { + +int ScaleFp16CPUKernel::InitScaleOffset() { + auto input_tensor = in_tensors_.at(0); + malloc_input_ = input_tensor->data_type() == kNumberTypeFloat32; + + auto scale_tensor = in_tensors_.at(1); + malloc_scale_ = scale_tensor->data_type() == kNumberTypeFloat32; + + if (in_tensors_.size() == 2) { + malloc_offset_ = true; + } else { + auto offset_tensor = in_tensors_.at(2); + malloc_offset_ = offset_tensor->data_type() == kNumberTypeFloat32; + } + + auto output_tensor = out_tensors_.at(0); + malloc_output_ = output_tensor->data_type() == kNumberTypeFloat32; + return RET_OK; +} + +int ScaleFp16CPUKernel::Init() { + if (in_tensors_.size() < 2 || in_tensors_.size() > 3) { + MS_LOG(ERROR) << "inputs to Scale operator should be 2 or 3, but " << in_tensors_.size() << " is given."; + return RET_ERROR; + } + + if (!InferShapeDone()) { + return RET_OK; + } + ReSize(); + return RET_OK; +} + +int ScaleFp16CPUKernel::ReSize() { + auto ret = CalculateParameter(); + if (ret != RET_OK) { + MS_LOG(ERROR) << "Scale fp16 CalculateParameter failed."; + return RET_ERROR; + } + + return RET_OK; +} + +int ScaleFp16CPUKernel::Scale(int task_id) { + switch (scale_param_->activation_type_) { + case schema::ActivationType_RELU6: + DoScaleRelu6Fp16(input_, output_, scale_, offset_, task_id, scale_param_); + break; + case schema::ActivationType_RELU: + DoScaleReluFp16(input_, output_, scale_, offset_, task_id, scale_param_); + break; + case schema::ActivationType_NO_ACTIVATION: + DoScaleFp16(input_, output_, scale_, offset_, task_id, scale_param_); + break; + default: + MS_LOG(ERROR) << "ScaleFp16 does not support activation type " << scale_param_->activation_type_; + return RET_ERROR; + } + return RET_OK; +} + +int ScaleRun(void *cdata, int task_id) { + auto scale = reinterpret_cast(cdata); + auto ret = scale->Scale(task_id); + if (ret != RET_OK) { + MS_LOG(ERROR) << "ScaleRun error task_id[" << task_id << "] error_code[" << ret << "]"; + return RET_ERROR; + } + return RET_OK; +} + +int ScaleFp16CPUKernel::Run() { + auto ret = Prepare(); + if (ret != RET_OK) { + MS_LOG(ERROR) << "Prepare fail!ret: " << ret; + return ret; + } + ret = InitScaleOffset(); + if (ret != RET_OK) { + MS_LOG(ERROR) << "Scale fp16 InitScaleOffset failed."; + return RET_ERROR; + } + + ret = MallocAssignTmpBuffer(); + if (ret != RET_OK) { + MS_LOG(ERROR) << "Scale Fp16 malloc tmp buffer failed"; + FreeTmpBuffer(); + return ret; + } + + ret = ParallelLaunch(this->context_->thread_pool_, ScaleRun, this, op_parameter_->thread_num_); + if (ret != RET_OK) { + MS_LOG(ERROR) << "Scale error error_code[" << ret << "]"; + return RET_ERROR; + } + + // if output tensor is fp32, we need to transform + if (malloc_output_) { + auto out_tensor = out_tensors_.at(0); + Float16ToFloat32(output_, reinterpret_cast(out_tensor->MutableData()), out_tensor->ElementsNum()); + } + FreeTmpBuffer(); + return RET_OK; +} + +int ScaleFp16CPUKernel::MallocAssignTmpBuffer() { + input_ = ConvertInputFp32toFp16(in_tensors_.at(0), context_); + if (input_ == nullptr) { + return RET_ERROR; + } + scale_ = ConvertInputFp32toFp16(in_tensors_.at(1), context_); + if (scale_ == nullptr) { + return RET_ERROR; + } + if (in_tensors_.size() == 3) { + offset_ = ConvertInputFp32toFp16(in_tensors_.at(2), context_); + if (offset_ == nullptr) { + return RET_ERROR; + } + } else { + offset_ = + reinterpret_cast(context_->allocator->Malloc(in_tensors_.at(1)->ElementsNum() * sizeof(float16_t))); + if (offset_ == nullptr) { + MS_LOG(ERROR) << "Malloc data failed"; + return RET_ERROR; + } + memset(offset_, 0, in_tensors_.at(1)->ElementsNum() * sizeof(float16_t)); + } + output_ = MallocOutputFp16(out_tensors_.at(0), context_); + if (output_ == nullptr) { + return RET_ERROR; + } + return RET_OK; +} + +void ScaleFp16CPUKernel::FreeTmpBuffer() { + if (malloc_input_ && input_ != nullptr) { + context_->allocator->Free(input_); + input_ = nullptr; + } + if (malloc_scale_ && scale_ != nullptr) { + context_->allocator->Free(scale_); + scale_ = nullptr; + } + if (malloc_offset_ && offset_ != nullptr) { + context_->allocator->Free(offset_); + offset_ = nullptr; + } + if (malloc_output_ && output_ != nullptr) { + context_->allocator->Free(output_); + output_ = nullptr; + } +} + +kernel::LiteKernel *CpuScaleFp16KernelCreator(const std::vector &inputs, + const std::vector &outputs, OpParameter *opParameter, + const lite::InnerContext *ctx, const kernel::KernelKey &desc, + const mindspore::lite::PrimitiveC *primitive) { + MS_ASSERT(desc.type == schema::PrimitiveType_Scale); + if (opParameter == nullptr) { + MS_LOG(ERROR) << "opParameter is nullptr"; + return nullptr; + } + + auto *kernel = new (std::nothrow) ScaleFp16CPUKernel(opParameter, inputs, outputs, ctx, primitive); + if (kernel == nullptr) { + MS_LOG(ERROR) << "New kernel fails."; + return nullptr; + } + auto ret = kernel->Init(); + if (ret != RET_OK) { + MS_LOG(ERROR) << "Init kernel failed, name: " << opParameter->name_ << ", type: " + << schema::EnumNamePrimitiveType(static_cast(opParameter->type_)); + delete kernel; + return nullptr; + } + return kernel; +} + +REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_Scale, CpuScaleFp16KernelCreator) +} // namespace mindspore::kernel diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/scale_fp16.h b/mindspore/lite/src/runtime/kernel/arm/fp16/scale_fp16.h new file mode 100644 index 0000000000..a8c4749702 --- /dev/null +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/scale_fp16.h @@ -0,0 +1,58 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_SCALE_FP16_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_SCALE_FP16_H_ + +#include +#include "src/lite_kernel.h" +#include "src/runtime/kernel/arm/fp32/scale.h" +#include "nnacl/scale.h" + +namespace mindspore::kernel { + +class ScaleFp16CPUKernel : public ScaleCPUKernel { + public: + ScaleFp16CPUKernel(OpParameter *parameter, const std::vector &inputs, + const std::vector &outputs, const lite::InnerContext *ctx, + const mindspore::lite::PrimitiveC *primitive) + : ScaleCPUKernel(parameter, inputs, outputs, ctx, primitive) {} + ~ScaleFp16CPUKernel() = default; + + int Init() override; + int ReSize() override; + int Run() override; + int InitScaleOffset() override; + int Scale(int task_id); + + private: + int MallocAssignTmpBuffer(); + void FreeTmpBuffer(); + + private: + bool malloc_input_ = false; + bool malloc_scale_ = false; + bool malloc_offset_ = false; + bool malloc_output_ = false; + + float16_t *input_ = nullptr; + float16_t *scale_ = nullptr; + float16_t *offset_ = nullptr; + float16_t *output_ = nullptr; +}; +} // namespace mindspore::kernel + +#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_SCALE_FP16_H_ diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/stack_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/stack_fp16.cc new file mode 100644 index 0000000000..6f0684155b --- /dev/null +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/stack_fp16.cc @@ -0,0 +1,134 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "src/runtime/kernel/arm/fp16/stack_fp16.h" +#include +#include "schema/model_generated.h" +#include "src/kernel_registry.h" +#include "nnacl/stack_parameter.h" +#include "include/errorcode.h" +#include "src/runtime/kernel/arm/fp16/common_fp16.h" +#include "nnacl/fp16/cast_fp16.h" +#include "nnacl/fp16/stack_fp16.h" + +using mindspore::lite::KernelRegistrar; +using mindspore::lite::RET_ERROR; +using mindspore::lite::RET_OK; +using mindspore::schema::PrimitiveType_Stack; + +namespace mindspore::kernel { + +int StackFp16CPUKernel::Init() { + if (!InferShapeDone()) { + return RET_OK; + } + return ReSize(); +} + +void StackFp16CPUKernel::InitMallocFlags() { + malloc_buffers_.resize(in_tensors_.size()); + for (size_t i = 0; i < in_tensors_.size(); ++i) { + malloc_buffers_[i] = in_tensors_[i]->data_type() == kNumberTypeFloat32; + } + malloc_out = out_tensors_[0]->data_type() == kNumberTypeFloat32; +} + +int StackFp16CPUKernel::MallocAssignBuffer() { + buffers_.resize(in_tensors_.size(), nullptr); + for (size_t i = 0; i < in_tensors_.size(); ++i) { + buffers_[i] = ConvertInputFp32toFp16(in_tensors_[i], context_); + if (buffers_[i] == nullptr) { + return RET_ERROR; + } + } + + out_buffer_ = nullptr; + out_buffer_ = MallocOutputFp16(out_tensors_[0], context_); + if (out_buffer_ == nullptr) { + return RET_ERROR; + } + return RET_OK; +} + +void StackFp16CPUKernel::FreeBuffer() { + for (size_t i = 0; i < buffers_.size(); ++i) { + if (malloc_buffers_[i] && buffers_[i] != nullptr) { + context_->allocator->Free(buffers_[i]); + buffers_[i] = nullptr; + } + } + if (malloc_out && out_buffer_ != nullptr) { + context_->allocator->Free(out_buffer_); + out_buffer_ = nullptr; + } +} + +int StackFp16CPUKernel::Run() { + auto ret = Prepare(); + if (ret != RET_OK) { + MS_LOG(ERROR) << "Prepare fail!ret: " << ret; + return ret; + } + size_t inputs_num = in_tensors_.size(); + auto input0 = in_tensors_[0]; + if (inputs_num == 1) { + memcpy(out_tensors_[0]->MutableData(), input0->MutableData(), input0->Size()); + return RET_OK; + } + InitMallocFlags(); + ret = MallocAssignBuffer(); + if (ret != RET_OK) { + FreeBuffer(); + return ret; + } + auto input0_shape = input0->shape(); + DoStackFp16(buffers_.data(), inputs_num, input0_shape.data(), input0_shape.size(), axis_, out_buffer_); + // if output tensor is fp32, we need to transform + if (malloc_out) { + auto out_tensor = out_tensors_.at(0); + Float16ToFloat32(out_buffer_, reinterpret_cast(out_tensor->MutableData()), out_tensor->ElementsNum()); + } + + FreeBuffer(); + return RET_OK; +} + +kernel::LiteKernel *CpuStackFp16KernelCreator(const std::vector &inputs, + const std::vector &outputs, OpParameter *op_parameter, + const lite::InnerContext *ctx, const kernel::KernelKey &desc, + const mindspore::lite::PrimitiveC *primitive) { + if (op_parameter == nullptr) { + MS_LOG(ERROR) << "Input op_parameter is nullptr!"; + return nullptr; + } + MS_ASSERT(desc.type == schema::PrimitiveType_Stack); + auto *kernel = new (std::nothrow) StackFp16CPUKernel(op_parameter, inputs, outputs, ctx, primitive); + if (kernel == nullptr) { + MS_LOG(ERROR) << "new StackFp16CPUKernel fail!"; + return nullptr; + } + + auto ret = kernel->Init(); + if (ret != RET_OK) { + delete kernel; + MS_LOG(ERROR) << "Init kernel failed, name: " << op_parameter->name_ << ", type: " + << schema::EnumNamePrimitiveType(static_cast(op_parameter->type_)); + return nullptr; + } + return kernel; +} + +REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_Stack, CpuStackFp16KernelCreator) +} // namespace mindspore::kernel diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/stack_fp16.h b/mindspore/lite/src/runtime/kernel/arm/fp16/stack_fp16.h new file mode 100644 index 0000000000..98ca30fc5c --- /dev/null +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/stack_fp16.h @@ -0,0 +1,49 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_STACK_FP16_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_STACK_FP16_H_ + +#include +#include "src/lite_kernel.h" +#include "src/runtime/kernel/arm/fp32/stack.h" + +namespace mindspore::kernel { +class StackFp16CPUKernel : public StackCPUKernel { + public: + StackFp16CPUKernel(OpParameter *parameter, const std::vector &inputs, + const std::vector &outputs, const lite::InnerContext *ctx, + const mindspore::lite::PrimitiveC *primitive) + : StackCPUKernel(parameter, inputs, outputs, ctx, primitive) {} + + ~StackFp16CPUKernel() = default; + + int Init() override; + int Run() override; + + private: + void InitMallocFlags(); + int MallocAssignBuffer(); + void FreeBuffer(); + + private: + std::vector malloc_buffers_; + std::vector buffers_; + float16_t *out_buffer_; + bool malloc_out; +}; +} // namespace mindspore::kernel + +#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_STACK_FP16_H_ diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/scale.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/scale.cc index 3663a7ab63..4b2d79b8d2 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/scale.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/scale.cc @@ -138,7 +138,7 @@ int ScaleCPUKernel::Init() { int ScaleCPUKernel::ReSize() { auto ret = CalculateParameter(); if (ret != RET_OK) { - MS_LOG(ERROR) << "Scale fp32 InitParameter failed."; + MS_LOG(ERROR) << "Scale fp32 CalculateParameter failed."; return RET_ERROR; } diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/scale.h b/mindspore/lite/src/runtime/kernel/arm/fp32/scale.h index 7fc1531921..765a7352c7 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/scale.h +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/scale.h @@ -37,15 +37,17 @@ class ScaleCPUKernel : public LiteKernel { int ReSize() override; int Run() override; int CalculateParameter(); - int InitScaleOffset(); + virtual int InitScaleOffset(); int Scale(int task_id); + protected: + ScaleParameter *scale_param_; + private: float *input_ptr_ = nullptr; float *scale_ = nullptr; float *offset_ = nullptr; float *output_ptr_ = nullptr; - ScaleParameter *scale_param_; }; } // namespace mindspore::kernel diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/stack.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/stack.cc index f3c6d6f078..b7fb7900a7 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/stack.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/stack.cc @@ -18,6 +18,7 @@ #include "schema/model_generated.h" #include "src/kernel_registry.h" #include "nnacl/fp32/stack.h" +#include "nnacl/stack_parameter.h" #include "include/errorcode.h" using mindspore::lite::KernelRegistrar; diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/stack.h b/mindspore/lite/src/runtime/kernel/arm/fp32/stack.h index deafdd5d6c..db42b29613 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/stack.h +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/stack.h @@ -33,7 +33,7 @@ class StackCPUKernel : public LiteKernel { int ReSize() override; int Run() override; - private: + protected: int axis_; }; } // namespace mindspore::kernel