!10998 [MSLITE] fp32 matmul base

From: @ling_qiao_min
Reviewed-by: 
Signed-off-by:
pull/10998/MERGE
mindspore-ci-bot 4 years ago committed by Gitee
commit f2b4db1cc8

File diff suppressed because it is too large Load Diff

@ -23,12 +23,23 @@
#include "nnacl/matmul_parameter.h"
#include "nnacl/op_base.h"
/* Adds the per-channel bias to an accumulator when a bias buffer is present.
 * NOTE: each macro below is braced so it expands to exactly ONE statement;
 * the original DO_RELU6 expanded to two independent `if` statements, which
 * silently drops the second clamp in any unbraced loop/if body. The braced-if
 * form (rather than do/while(0)) keeps call sites that omit a trailing
 * semicolon compiling, since the original expansions already ended in `;`. */
#define ADD_BIAS(value, bias, c)     \
  if ((bias) != NULL) {              \
    (value) = (value) + (bias)[c];   \
  }
/* Applies ReLU (max(0, value)) when act_type requests it. */
#define DO_RELU(value, act_type)               \
  if ((act_type) == ActType_Relu) {            \
    (value) = MSMAX(0.0f, (value));            \
  }
/* Applies ReLU6: clamps value into [0, 6] when act_type requests it. */
#define DO_RELU6(value, act_type)              \
  if ((act_type) == ActType_Relu6) {           \
    (value) = MSMIN(6.0f, (value));            \
    (value) = MSMAX(0.0f, (value));            \
  }
#ifdef __cplusplus
extern "C" {
#endif
void MatMulOpt(const float *a, const float *b, float *c, const float *bias, ActType act_type, int deep, int row,
int col, size_t stride, int out_type);
void MatVecMul(const float *a, const float *b, float *c, const float *bias, ActType act_type, int depth, int col);
void MatVecMulFp32(const float *a, const float *b, float *c, const float *bias, int act_type, int depth, int col);
void RowMajor2ColMajor(const float *src_ptr, float *dst_ptr, int row, int col);
void RowMajor2Row4Major(const float *src_ptr, float *dst_ptr, int row, int col);
void RowMajor2Row6Major(const float *src_ptr, float *dst_ptr, int row, int col);
@ -40,9 +51,7 @@ void RowMajor2Col6Major(const float *src_ptr, float *dst_ptr, size_t row, size_t
void RowMajor2Col8Major(const float *src_ptr, float *dst_ptr, size_t row, size_t col);
void RowMajor2Col12Major(const float *src_ptr, float *dst_ptr, size_t row, size_t col);
void RowMajor2Col16Major(const float *src_ptr, float *dst_ptr, size_t row, size_t col);
#ifdef ENABLE_ARM
void MatVecMulFp32(const float *a, const float *b, float *c, const float *bias, int act_type, int depth, int col);
#endif
#ifdef ENABLE_ARM64
void MatmulFloatNeon64(const float *a, const float *b, float *c, const float *bias, int act_type, int depth, int row,
int col, size_t stride, size_t writeNhwc, size_t WriteWino);
@ -67,10 +76,6 @@ void MatmulFloatAvxOpt(const float *a, const float *b, float *c, const float *bi
#endif
#endif
#ifdef ENABLE_NNACL_INFER_SHAPE
int MatMulInferShape(int **in_shape, int in_num, size_t *dim_size, int *out_shape, int *in_format, int *out_format,
int *in_datatype, int *out_datatype, OpParameter *param);
#endif
#ifdef __cplusplus
}
#endif

@ -44,14 +44,11 @@ typedef struct MatMulParameter {
int col_;
int row_4_;
int row_6_;
int row_8_;
int row_12_;
int row_16_;
int row_align_;
int col_2_;
int col_4_;
int col_8_;
int col_16_;
int col_align_;
int deep_;
int deep_4_;
@ -61,8 +58,6 @@ typedef struct MatMulParameter {
bool b_transpose_; /* true : col-major */
bool a_const_;
bool b_const_;
bool a_init_shape_;
bool b_init_shape_;
ActType act_type_;
} MatMulParameter;

@ -62,11 +62,8 @@ void Convolution1x1CPUKernel::InitConv1x1MatmulParam() {
matmul_param_->row_ = conv_param_->output_h_ * conv_param_->output_w_;
matmul_param_->col_ = conv_param_->output_channel_;
matmul_param_->deep_ = conv_param_->input_channel_;
matmul_param_->row_4_ = UP_ROUND(matmul_param_->row_, C4NUM);
matmul_param_->row_6_ = UP_ROUND(matmul_param_->row_, C6NUM);
matmul_param_->row_12_ = UP_ROUND(matmul_param_->row_, C12NUM);
matmul_param_->row_align_ = UP_ROUND(matmul_param_->row_, row_tile_);
matmul_param_->col_8_ = UP_ROUND(matmul_param_->col_, C8NUM);
matmul_param_->col_align_ = UP_ROUND(matmul_param_->col_, col_tile_);
matmul_param_->act_type_ = conv_param_->act_type_;
return;
}
@ -76,20 +73,6 @@ int Convolution1x1CPUKernel::InitConv1x1BiasWeight() {
auto input_channel = filter_tensor->Channel();
auto output_channel = filter_tensor->Batch();
#ifdef ENABLE_AVX
row_tile_ = C6NUM;
col_tile_ = C16NUM;
#elif defined(ENABLE_SSE)
row_tile_ = C4NUM;
col_tile_ = C8NUM;
#elif defined(ENABLE_ARM32)
row_tile_ = C12NUM;
col_tile_ = C4NUM;
#else
row_tile_ = C12NUM;
col_tile_ = C8NUM;
#endif
if (in_tensors_.size() == 3) {
int size = UP_ROUND(output_channel, col_tile_) * sizeof(float);
int weight_size = output_channel * sizeof(float);
@ -146,6 +129,19 @@ int Convolution1x1CPUKernel::InitConv1x1Param() {
}
int Convolution1x1CPUKernel::Init() {
#ifdef ENABLE_AVX
row_tile_ = C6NUM;
col_tile_ = C16NUM;
#elif defined(ENABLE_SSE)
row_tile_ = C4NUM;
col_tile_ = C8NUM;
#elif defined(ENABLE_ARM32)
row_tile_ = C12NUM;
col_tile_ = C4NUM;
#else
row_tile_ = C12NUM;
col_tile_ = C8NUM;
#endif
matmul_param_ = new (std::nothrow) MatMulParameter;
if (matmul_param_ == nullptr) {
MS_LOG(ERROR) << "Memory allocation failed";
@ -270,20 +266,6 @@ void Convolution1x1CPUKernel::PackWeight() {
auto input_channel = filter_tensor->Channel();
auto output_channel = filter_tensor->Batch();
#ifdef ENABLE_AVX
row_tile_ = C6NUM;
col_tile_ = C16NUM;
#elif defined(ENABLE_SSE)
row_tile_ = C4NUM;
col_tile_ = C8NUM;
#elif defined(ENABLE_ARM32)
row_tile_ = C12NUM;
col_tile_ = C4NUM;
#else
row_tile_ = C12NUM;
col_tile_ = C8NUM;
#endif
int size = input_channel * UP_ROUND(output_channel, col_tile_) * sizeof(float);
int down_size = input_channel * DOWN_DIV(output_channel, col_tile_) * col_tile_ * sizeof(float);
memset(reinterpret_cast<char *>(weight_ptr_) + down_size, 0, size - down_size);

@ -21,43 +21,19 @@
#include "include/context.h"
#include "include/errorcode.h"
#include "nnacl/fp32/matmul_fp32.h"
#include "src/lite_kernel.h"
#include "src/runtime/kernel/arm/fp32/matmul_fp32_base.h"
using mindspore::lite::InnerContext;
namespace mindspore::kernel {
class FullconnectionCPUKernel : public LiteKernel {
class FullconnectionCPUKernel : public MatmulFp32BaseCPUKernel {
public:
FullconnectionCPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, const InnerContext *ctx,
const std::vector<lite::Tensor *> &outputs, const mindspore::lite::InnerContext *ctx,
const mindspore::lite::PrimitiveC *primitive)
: LiteKernel(parameter, inputs, outputs, ctx, primitive) {
fc_param_ = reinterpret_cast<MatMulParameter *>(op_parameter_);
}
~FullconnectionCPUKernel() override;
: MatmulFp32BaseCPUKernel(parameter, inputs, outputs, ctx, primitive) {}
~FullconnectionCPUKernel() = default;
int Init() override;
int ReSize() override;
int Run() override;
public:
int DoMatmul(int task_id);
void FreeBuf();
private:
void InitMatrixA(const float *src_ptr, float *dst_ptr);
void InitMatrixB(const float *src_ptr, float *dst_ptr);
private:
MatMulParameter *fc_param_ = nullptr;
float *a_pack_ptr_ = nullptr;
float *b_pack_ptr_ = nullptr;
float *c_ptr_ = nullptr;
float *bias_ptr_ = nullptr;
float *a_ptr_ = nullptr;
float *b_ptr_ = nullptr;
bool is_vector_input_ = false;
int thread_count_ = 1;
int thread_stride_ = 0;
};
} // namespace mindspore::kernel
#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_FULLCONNECTION_H_

File diff suppressed because it is too large Load Diff

@ -19,47 +19,20 @@
#include <vector>
#include "nnacl/matmul_parameter.h"
#include "src/lite_kernel.h"
#include "src/runtime/kernel/arm/fp32/matmul_fp32_base.h"
namespace mindspore::kernel {
class MatmulCPUKernel : public LiteKernel {
class MatmulCPUKernel : public MatmulFp32BaseCPUKernel {
public:
explicit MatmulCPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx,
const mindspore::lite::PrimitiveC *primitive)
: LiteKernel(parameter, inputs, outputs, ctx, primitive) {
params_ = reinterpret_cast<MatMulParameter *>(op_parameter_);
}
~MatmulCPUKernel() override;
: MatmulFp32BaseCPUKernel(parameter, inputs, outputs, ctx, primitive) {}
~MatmulCPUKernel() = default;
int Init() override;
int ReSize() override;
int Run() override;
int RunImpl(int task_id);
int Eval() override;
private:
int MallocMatrixABuffer();
int MallocMatrixBBuffer();
int InitBias();
void InitMatrixA(const float *src_ptr, float *dst_ptr);
void InitMatrixB(const float *src_ptr, float *dst_ptr);
void FreeTmpBuffer();
private:
MatMulParameter *params_ = nullptr;
float *a_pack_ptr_ = nullptr;
float *b_pack_ptr_ = nullptr;
float *bias_ptr_ = nullptr;
float *a_ptr_ = nullptr;
float *b_ptr_ = nullptr;
float *cur_a_ptr_ = nullptr;
float *cur_b_ptr_ = nullptr;
float *cur_c_ptr_ = nullptr;
bool is_vector_a_ = false;
int col_tile_ = 0;
int thread_stride_ = 0;
int thread_count_ = 0;
};
} // namespace mindspore::kernel
#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_MATMUL_H_

@ -0,0 +1,77 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_MATMUL_FP32_BASE_H_
#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_MATMUL_FP32_BASE_H_
#include <vector>
#include "src/lite_kernel.h"
#include "nnacl/matmul_parameter.h"
#include "include/errorcode.h"
using mindspore::lite::RET_ERROR;
using mindspore::lite::RET_MEMORY_FAILED;
using mindspore::lite::RET_OK;
namespace mindspore::kernel {
// Shared fp32 matmul base kernel: owns the packed A/B buffers, bias buffer,
// and tiling/threading parameters, and exposes the common Init/ReSize/Run
// pipeline. FullConnection and MatMul fp32 kernels derive from this class
// (the `protected` members/hooks are the extension points they use).
class MatmulFp32BaseCPUKernel : public LiteKernel {
 public:
  MatmulFp32BaseCPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
                          const std::vector<lite::Tensor *> &outputs, const mindspore::lite::InnerContext *ctx,
                          const mindspore::lite::PrimitiveC *primitive)
      : LiteKernel(parameter, inputs, outputs, ctx, primitive) {
    // op_parameter_ is set by the LiteKernel base; reinterpret it as the
    // matmul-specific parameter struct. vec_matmul_ needs no assignment here:
    // its in-class initializer below already sets it to false (the original
    // constructor re-assigned it redundantly).
    params_ = reinterpret_cast<MatMulParameter *>(op_parameter_);
  }
  ~MatmulFp32BaseCPUKernel();
  int Init() override;
  int ReSize() override;
  int Run() override;

 public:
  // Per-thread entry point: computes this task's column strip of the output.
  int FloatRun(int task_id);

 protected:
  int InitBufferA();
  int InitBufferB();
  int InitMatrixA(const float *src_ptr);
  int InitMatrixB(const float *src_ptr);
  void FreeBiasBuf();
  int InitBiasData();
  void InitParameter();

 private:
  void ResizeParameter();
  void FreeResizeBufA();
  void FreeResizeBufB();

 protected:
  MatMulParameter *params_ = nullptr;
  float *a_pack_ptr_ = nullptr;  // packed (tiled) copy of input A
  float *b_pack_ptr_ = nullptr;  // packed (tiled) copy of input B
  float *c_ptr_ = nullptr;       // output buffer
  float *bias_ptr_ = nullptr;
  float *batch_a_ptr_ = nullptr;  // per-batch cursors into the packed buffers
  float *batch_b_ptr_ = nullptr;
  float *batch_c_ptr_ = nullptr;
  int col_tile_ = 0;
  int row_tile_ = 0;
  int thread_stride_ = 0;
  int thread_count_ = 0;
  bool vec_matmul_ = false;  // true when A degenerates to a vector (row_ == 1)
};
} // namespace mindspore::kernel
#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_MATMUL_FP32_BASE_H_

@ -318,12 +318,7 @@ int Convolution1x1Int8CPUKernel::InitParam() {
matmul_param_->row_ = conv_param_->output_h_ * conv_param_->output_w_;
matmul_param_->deep_ = conv_param_->input_channel_;
matmul_param_->col_ = conv_param_->output_channel_;
matmul_param_->col_2_ = UP_ROUND(matmul_param_->col_, C2NUM);
matmul_param_->col_4_ = UP_ROUND(matmul_param_->col_, C4NUM);
matmul_param_->col_8_ = UP_ROUND(matmul_param_->col_, C8NUM);
matmul_param_->col_16_ = UP_ROUND(matmul_param_->col_, C16NUM);
matmul_param_->row_4_ = UP_ROUND(matmul_param_->row_, C4NUM);
matmul_param_->row_8_ = UP_ROUND(matmul_param_->row_, C8NUM);
matmul_param_->deep_4_ = UP_ROUND(matmul_param_->deep_, C4NUM);
matmul_param_->deep_16_ = UP_ROUND(matmul_param_->deep_, C16NUM);

@ -156,12 +156,8 @@ void FullconnectionInt8CPUKernel::InitParam() {
fc_param_->deep_ = (in_tensors_.at(1)->shape()).at(1);
fc_param_->row_4_ = UP_ROUND(fc_param_->row_, C4NUM);
fc_param_->row_8_ = UP_ROUND(fc_param_->row_, C8NUM);
fc_param_->col_2_ = UP_ROUND(fc_param_->col_, C2NUM);
fc_param_->col_4_ = UP_ROUND(fc_param_->col_, C4NUM);
fc_param_->col_8_ = UP_ROUND(fc_param_->col_, C8NUM);
fc_param_->col_16_ = UP_ROUND(fc_param_->col_, C16NUM);
fc_param_->deep_4_ = UP_ROUND(fc_param_->deep_, C4NUM);
fc_param_->deep_16_ = UP_ROUND(fc_param_->deep_, C16NUM);
thread_count_ = MSMIN(op_parameter_->thread_num_, UP_DIV(fc_param_->col_4_, C4NUM));

Loading…
Cancel
Save