diff --git a/mindspore/lite/nnacl/assembly/fp16/Float16ToFloat32.S b/mindspore/lite/nnacl/assembly/fp16/Float16ToFloat32.S
index 6bbc60a82c..2cb3219589 100644
--- a/mindspore/lite/nnacl/assembly/fp16/Float16ToFloat32.S
+++ b/mindspore/lite/nnacl/assembly/fp16/Float16ToFloat32.S
@@ -14,6 +14,8 @@ Float16ToFloat32:
 // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
 // x19 ~ x29 should be also preserved
 // whereas our coding style do not permit such amount of parameters
+    cmp x2, #0
+    beq LoopEnd
     cmp x2, #64
     blt Loop
 Loop64:
diff --git a/mindspore/lite/nnacl/assembly/fp16/Float32ToFloat16.S b/mindspore/lite/nnacl/assembly/fp16/Float32ToFloat16.S
index 10e029d246..a321b16a34 100644
--- a/mindspore/lite/nnacl/assembly/fp16/Float32ToFloat16.S
+++ b/mindspore/lite/nnacl/assembly/fp16/Float32ToFloat16.S
@@ -14,6 +14,8 @@ Float32ToFloat16:
 // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
 // x19 ~ x29 should be also preserved
 // whereas our coding style do not permit such amount of parameters
+    cmp x2, #0
+    beq LoopEnd
     cmp x2, #64
     blt Loop
 Loop64:
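Why the new guard is needed: with only `cmp x2, #64; blt Loop`, a count of zero in x2 falls into the scalar tail loop, which converts at least one element before re-testing the counter. A minimal C++ analogue of that control flow (an illustrative sketch, not the MindSpore source; the do/while tail and the ARM `__fp16` type are assumptions about the aarch64 build):

    void Float16ToFloat32Sketch(const __fp16 *input, float *output, int count) {
      // The new `cmp x2, #0; beq LoopEnd` guard: without it, a zero count fell
      // straight through `cmp x2, #64; blt Loop` into the tail loop below.
      if (count == 0) {
        return;
      }
      while (count >= 64) {  // Loop64: bulk path, 64 elements per pass
        for (int i = 0; i < 64; ++i) {
          output[i] = static_cast<float>(input[i]);
        }
        input += 64;
        output += 64;
        count -= 64;
      }
      if (count == 0) {  // exact multiple of 64: nothing left for the tail loop
        return;
      }
      do {  // Loop: converts at least one element per iteration
        *output++ = static_cast<float>(*input++);
      } while (--count > 0);
    }
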
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/matmul_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/matmul_fp16.cc
index 47486d23e4..100ae61a0d 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/matmul_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/matmul_fp16.cc
@@ -34,98 +34,119 @@ MatmulFP16CPUKernel::~MatmulFP16CPUKernel() { FreeTmpBuffer(); }
 void MatmulFP16CPUKernel::FreeTmpBuffer() {
   if (a_pack_ptr_ != nullptr) {
-    ctx_->allocator->Free(a_pack_ptr_);
+    free(a_pack_ptr_);
     a_pack_ptr_ = nullptr;
   }
   if (b_pack_ptr_ != nullptr) {
-    ctx_->allocator->Free(b_pack_ptr_);
+    free(b_pack_ptr_);
     b_pack_ptr_ = nullptr;
   }
   if (bias_ptr_ != nullptr) {
-    ctx_->allocator->Free(bias_ptr_);
+    free(bias_ptr_);
     bias_ptr_ = nullptr;
   }
-  if (output_ptr_ != nullptr) {
-    ctx_->allocator->Free(output_ptr_);
-    output_ptr_ = nullptr;
-  }
 }
 
-int MatmulFP16CPUKernel::ReSize() {
-  FreeTmpBuffer();
-  int batch = 1;
+int MatmulFP16CPUKernel::MallocMatrixABuffer() {
   auto a_shape = in_tensors_[0]->shape();
-  auto c_shape = out_tensors_[0]->shape();
-  if (in_tensors_.size() == 3) {
-    auto bias_shape = in_tensors_[2]->shape();
-    if (bias_shape[bias_shape.size() - 1] != c_shape[c_shape.size() - 1]) {
-      MS_LOG(ERROR) << "The bias' dimension is not equal with column";
-      return RET_INPUT_TENSOR_ERROR;
-    }
-  }
-
+  int batch = 1;
   for (size_t i = 0; i < a_shape.size() - 2; ++i) {
     batch *= a_shape[i];
   }
   params_->batch = batch;
-  params_->row_ = c_shape[c_shape.size() - 2];
-  params_->col_ = c_shape[c_shape.size() - 1];
+  params_->row_ = params_->a_transpose_ ? a_shape[a_shape.size() - 1] : a_shape[a_shape.size() - 2];
   params_->deep_ = params_->a_transpose_ ? a_shape[a_shape.size() - 2] : a_shape[a_shape.size() - 1];
   params_->row_16_ = UP_ROUND(params_->row_, C16NUM);
-  params_->col_8_ = UP_ROUND(params_->col_, C8NUM);
-  thread_count_ = MSMIN(thread_count_, UP_DIV(params_->col_, C8NUM));
-  thread_stride_ = UP_DIV(UP_DIV(params_->col_, C8NUM), thread_count_) * C8NUM;
-  a_pack_ptr_ = reinterpret_cast<float16_t *>(
-    ctx_->allocator->Malloc(params_->batch * params_->row_16_ * params_->deep_ * sizeof(float16_t)));
+  a_pack_ptr_ =
+    reinterpret_cast<float16_t *>(malloc(params_->batch * params_->row_16_ * params_->deep_ * sizeof(float16_t)));
   if (a_pack_ptr_ == nullptr) {
     FreeTmpBuffer();
     return RET_MEMORY_FAILED;
   }
   memset(a_pack_ptr_, 0, params_->batch * params_->row_16_ * params_->deep_ * sizeof(float16_t));
+  return RET_OK;
+}
+
+int MatmulFP16CPUKernel::MallocMatrixBBuffer() {
+  auto b_shape = in_tensors_[1]->shape();
+  if (b_shape.empty()) {
+    return RET_OK;
+  }
+  int batch = 1;
+  for (size_t i = 0; i < b_shape.size() - 2; ++i) {
+    batch *= b_shape[i];
+  }
+  params_->batch = batch;
+  params_->col_ = params_->b_transpose_ ? b_shape[b_shape.size() - 2] : b_shape[b_shape.size() - 1];
+  params_->col_8_ = UP_ROUND(params_->col_, 8);
+  params_->deep_ = params_->b_transpose_ ? b_shape[b_shape.size() - 1] : b_shape[b_shape.size() - 2];
-  b_pack_ptr_ = reinterpret_cast<float16_t *>(
-    ctx_->allocator->Malloc(params_->batch * params_->col_8_ * params_->deep_ * sizeof(float16_t)));
+  b_pack_ptr_ =
+    reinterpret_cast<float16_t *>(malloc(params_->batch * params_->col_8_ * params_->deep_ * sizeof(float16_t)));
   if (b_pack_ptr_ == nullptr) {
     FreeTmpBuffer();
     return RET_MEMORY_FAILED;
   }
   memset(b_pack_ptr_, 0, params_->batch * params_->col_8_ * params_->deep_ * sizeof(float16_t));
+  thread_count_ = MSMIN(thread_count_, UP_DIV(params_->col_, C8NUM));
+  thread_stride_ = UP_DIV(UP_DIV(params_->col_, C8NUM), thread_count_) * C8NUM;
+  return RET_OK;
+}
 
-  params_->a_const_ = (in_tensors_[0]->data_c() != nullptr);
-  params_->b_const_ = (in_tensors_[1]->data_c() != nullptr);
-  if (params_->a_const_ == true) {
-    if (in_tensors_[0]->data_type() == kNumberTypeFloat32) {
-      InitMatrixA(reinterpret_cast<float *>(in_tensors_[0]->data_c()), a_pack_ptr_);
-    } else {
-      InitMatrixA(reinterpret_cast<float16_t *>(in_tensors_[0]->data_c()), a_pack_ptr_);
-    }
-  }
-  if (params_->b_const_ == true) {
-    if (in_tensors_[1]->data_type() == kNumberTypeFloat32) {
-      InitMatrixB(reinterpret_cast<float *>(in_tensors_[1]->data_c()), b_pack_ptr_);
-    } else {
-      InitMatrixB(reinterpret_cast<float16_t *>(in_tensors_[1]->data_c()), b_pack_ptr_);
-    }
-  }
-
+int MatmulFP16CPUKernel::InitBias() {
   if (in_tensors_.size() == 3) {
-    bias_ptr_ = reinterpret_cast<float16_t *>(ctx_->allocator->Malloc(params_->col_8_ * sizeof(float16_t)));
+    auto c_shape = out_tensors_[0]->shape();
+    auto bias_shape = in_tensors_[2]->shape();
+    if (bias_shape[bias_shape.size() - 1] != c_shape[c_shape.size() - 1]) {
+      MS_LOG(ERROR) << "The bias' dimension is not equal with column";
+      FreeTmpBuffer();
+      return RET_INPUT_TENSOR_ERROR;
+    }
+    auto col = c_shape[c_shape.size() - 1];
+    auto col_8 = UP_ROUND(col, 8);
+    bias_ptr_ = reinterpret_cast<float16_t *>(malloc(col_8 * sizeof(float16_t)));
     if (bias_ptr_ == nullptr) {
       FreeTmpBuffer();
       return RET_MEMORY_FAILED;
     }
-    memset(bias_ptr_, 0, params_->col_8_ * sizeof(float16_t));
-    Float32ToFloat16(reinterpret_cast<float *>(in_tensors_[2]->data_c()), bias_ptr_, params_->col_);
+    memset(bias_ptr_, 0, col_8 * sizeof(float16_t));
+    Float32ToFloat16(reinterpret_cast<float *>(in_tensors_[2]->data_c()), bias_ptr_, col);
   }
+  return RET_OK;
+}
 
-  if (out_tensors_[0]->data_type() == kNumberTypeFloat32) {
-    output_ptr_ = reinterpret_cast<float16_t *>(
-      ctx_->allocator->Malloc(params_->batch * params_->row_ * params_->col_ * sizeof(float16_t)));
-    if (output_ptr_ == nullptr) {
-      MS_LOG(ERROR) << "malloc output_ptr_ failed.";
-      return RET_MEMORY_FAILED;
+int MatmulFP16CPUKernel::ReSize() {
+  if (params_->a_const_ == false || params_->a_init_shape_ == false) {
+    if (a_pack_ptr_ != nullptr) {
+      free(a_pack_ptr_);
+      a_pack_ptr_ = nullptr;
     }
+    auto ret = MallocMatrixABuffer();
+    if (ret != RET_OK) {
+      MS_LOG(ERROR) << "Matmul fp16 malloc matrix a buffer failed";
+      return RET_ERROR;
+    }
+  }
+  if (params_->b_const_ == false || params_->b_init_shape_ == false) {
+    if (b_pack_ptr_ != nullptr) {
+      free(b_pack_ptr_);
+      b_pack_ptr_ = nullptr;
+    }
+    auto ret = MallocMatrixBBuffer();
+    if (ret != RET_OK) {
+      MS_LOG(ERROR) << "Matmul fp16 malloc matrix b buffer failed";
+      return RET_ERROR;
+    }
+  }
+  if (bias_ptr_ != nullptr) {
+    free(bias_ptr_);
+    bias_ptr_ = nullptr;
+  }
+  auto ret = InitBias();
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "Matmul fp16 init bias failed";
+    return RET_ERROR;
   }
   return RET_OK;
 }
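The helpers above pad the packed buffers to whole tiles and split the 8-column tiles across threads. A self-contained sketch of that arithmetic, assuming UP_DIV and UP_ROUND are the usual nnacl ceiling-divide and round-up-to-multiple macros, with made-up shapes:

    #include <algorithm>
    #include <cstdio>

    constexpr int UpDiv(int x, int y) { return (x + y - 1) / y; }    // ceil(x / y)
    constexpr int UpRound(int x, int y) { return UpDiv(x, y) * y; }  // next multiple of y

    int main() {
      const int row = 50, col = 21, max_threads = 4;
      const int row_16 = UpRound(row, 16);  // 64: A is packed in 16-row tiles
      const int col_8 = UpRound(col, 8);    // 24: B is packed in 8-column tiles
      // thread_count_ / thread_stride_: each thread gets whole 8-column tiles.
      const int thread_count = std::min(max_threads, UpDiv(col, 8));     // 3
      const int thread_stride = UpDiv(UpDiv(col, 8), thread_count) * 8;  // 8
      std::printf("row_16=%d col_8=%d threads=%d stride=%d\n", row_16, col_8,
                  thread_count, thread_stride);
      return 0;
    }
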
@@ -179,10 +200,61 @@ void MatmulFP16CPUKernel::InitMatrixB(float16_t *b_ptr, float16_t *b_pack_ptr) {
 }
 
 int MatmulFP16CPUKernel::Init() {
+  params_->a_init_shape_ = (in_tensors_[0]->shape().size() != 0);
+  params_->b_init_shape_ = (in_tensors_[1]->shape().size() != 0);
+  if (params_->a_init_shape_ == true) {
+    auto ret = MallocMatrixABuffer();
+    if (ret != RET_OK) {
+      MS_LOG(ERROR) << "Matmul fp16 malloc matrix a buffer failed";
+      return RET_ERROR;
+    }
+  }
+  if (params_->b_init_shape_ == true) {
+    auto ret = MallocMatrixBBuffer();
+    if (ret != RET_OK) {
+      MS_LOG(ERROR) << "Matmul fp16 malloc matrix b buffer failed";
+      return RET_ERROR;
+    }
+  }
+
+  params_->a_const_ = (in_tensors_[0]->data_c() != nullptr);
+  params_->b_const_ = (in_tensors_[1]->data_c() != nullptr);
+  if (params_->a_const_ == true) {
+    if (in_tensors_[0]->data_type() == kNumberTypeFloat32) {
+      InitMatrixA(reinterpret_cast<float *>(in_tensors_[0]->data_c()), a_pack_ptr_);
+    } else {
+      InitMatrixA(reinterpret_cast<float16_t *>(in_tensors_[0]->data_c()), a_pack_ptr_);
+    }
+  }
+  if (params_->b_const_ == true) {
+    if (in_tensors_[1]->data_type() == kNumberTypeFloat32) {
+      InitMatrixB(reinterpret_cast<float *>(in_tensors_[1]->data_c()), b_pack_ptr_);
+    } else {
+      InitMatrixB(reinterpret_cast<float16_t *>(in_tensors_[1]->data_c()), b_pack_ptr_);
+    }
+  }
+
   if (!InferShapeDone()) {
     return RET_OK;
   }
-  return ReSize();
+  auto ret = InitBias();
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "Matmul fp16 init bias failed";
+    return RET_ERROR;
+  }
+  return RET_OK;
+}
+
+int MatmulFP16CPUKernel::MallocFp16Output() {
+  if (out_tensors_[0]->data_type() == kNumberTypeFloat32) {
+    output_ptr_ = reinterpret_cast<float16_t *>(
+      ctx_->allocator->Malloc(params_->batch * params_->row_ * params_->col_ * sizeof(float16_t)));
+    if (output_ptr_ == nullptr) {
+      MS_LOG(ERROR) << "malloc output_ptr_ failed.";
+      return RET_MEMORY_FAILED;
+    }
+  }
+  return RET_OK;
 }
 
 int MatmulFP16CPUKernel::RunImpl(int task_id) {
@@ -211,6 +283,11 @@ int MatmulFP16Run(void *cdata, int task_id) {
 
 int MatmulFP16CPUKernel::Run() {
   auto out_tensor = out_tensors_[0];
+  auto ret = MallocFp16Output();
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "Matmul MallocFp16Output failed";
+    return RET_ERROR;
+  }
   float16_t *c_ptr = nullptr;
   if (out_tensor->data_type() == kNumberTypeFloat32) {
     c_ptr = output_ptr_;
@@ -241,6 +318,7 @@ int MatmulFP16CPUKernel::Run() {
     auto size = out_tensor->ElementsNum();
     auto out_tensor_data = reinterpret_cast<float *>(out_tensor->data_c());
     Float16ToFloat32(output_ptr_, out_tensor_data, size);
+    ctx_->allocator->Free(output_ptr_);
   }
   return RET_OK;
 }
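The header diff below declares the four helpers introduced above. The net effect of the refactor is an ownership split: the packed A/B and bias buffers now live on the plain C heap (malloc/free) across Run() calls, while the fp16 output staging buffer is taken from ctx_->allocator inside a single Run() and released right after Float16ToFloat32. A standalone sketch of the heap-owned side (names and the float16_t stand-in are illustrative, not the kernel's real members):

    #include <cstddef>
    #include <cstdlib>
    #include <cstring>

    using float16_t = unsigned short;  // stand-in for the 2-byte fp16 storage type

    struct PackedBuffers {
      float16_t *a_pack = nullptr;  // reallocated in ReSize when A is not constant
      float16_t *b_pack = nullptr;  // reallocated in ReSize when B is not constant
      float16_t *bias = nullptr;

      bool Alloc(size_t a_elems, size_t b_elems, size_t bias_elems) {
        a_pack = static_cast<float16_t *>(malloc(a_elems * sizeof(float16_t)));
        b_pack = static_cast<float16_t *>(malloc(b_elems * sizeof(float16_t)));
        bias = static_cast<float16_t *>(malloc(bias_elems * sizeof(float16_t)));
        if (a_pack == nullptr || b_pack == nullptr || bias == nullptr) {
          Free();  // mirrors FreeTmpBuffer() being called on any failed malloc
          return false;
        }
        memset(a_pack, 0, a_elems * sizeof(float16_t));
        memset(b_pack, 0, b_elems * sizeof(float16_t));
        memset(bias, 0, bias_elems * sizeof(float16_t));
        return true;
      }

      void Free() {  // like FreeTmpBuffer(): null after free, safe to call twice
        free(a_pack); a_pack = nullptr;
        free(b_pack); b_pack = nullptr;
        free(bias); bias = nullptr;
      }
    };

    int main() {
      PackedBuffers bufs;
      if (!bufs.Alloc(64 * 32, 24 * 32, 24)) {
        return 1;
      }
      // ... pack matrices, run the fp16 matmul, convert the output ...
      bufs.Free();
      return 0;
    }
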
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/matmul_fp16.h b/mindspore/lite/src/runtime/kernel/arm/fp16/matmul_fp16.h
index dc87c1798d..9b64f424a3 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/matmul_fp16.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/matmul_fp16.h
@@ -39,6 +39,10 @@ class MatmulFP16CPUKernel : public MatmulBaseCPUKernel {
   int RunImpl(int task_id);
 
  private:
+  int MallocMatrixABuffer();
+  int MallocMatrixBBuffer();
+  int InitBias();
+  int MallocFp16Output();
   void InitMatrixA(float *a_ptr, float16_t *a_pack_ptr);
   void InitMatrixA(float16_t *a_ptr, float16_t *a_pack_ptr);
   void InitMatrixB(float *b_ptr, float16_t *b_pack_ptr);
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/transpose_fp32.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/transpose_fp32.cc
index ed9f0ce29f..a2e40fddcb 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/transpose_fp32.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/transpose_fp32.cc
@@ -162,7 +162,7 @@ int TransposeCPUKernel::Run() {
     return ret;
   }
   return ret;
-}  // namespace mindspore::kernel
+}
 
 kernel::LiteKernel *CpuTransposeFp32KernelCreator(const std::vector<lite::Tensor *> &inputs,
                                                   const std::vector<lite::Tensor *> &outputs, OpParameter *opParameter,