Optimize fullconnection kernel for vector input

pull/7393/head
zhanyuan 4 years ago
parent 550c3fe0d2
commit 2635dc0f97

@ -0,0 +1,178 @@
#ifdef __aarch64__
.text
.align 5
.global MatmulFp16Neon64_1xN
#ifndef __APPLE__
.type MatmulFp16Neon64_1xN, %function
#endif
// void MatmulFp16Neon64_1xN(const float16_t *a, const float16_t *b, float16_t *c, const float16_t *bias, int act_type, int depth, int col)
// x0: a
// x1: b
// x2: c
// x3: bias
// w4: act_type
// w5: depth
// w6: col
MatmulFp16Neon64_1xN:
sub sp, sp, #128
st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64
st1 {v12.8h, v13.8h, v14.8h, v15.8h}, [sp], #64
mov w14, #2 // sizeof(float16)
mul w8, w14, w5 // rhs depthx1 block stride
mov w14, #4
mul w13, w8, w14 // rhs depthx4 block stride
Loop:
mov x15, x0 // reload a ptr
mov x7, x1 // reload b ptr
mov w9, w5 // reload depth
cmp w6, #4
blt Loop1x1
Loop1x4:
dup v5.8h, wzr // v5-v8: 8-lane partial-sum accumulators for rhs columns 0-3
dup v6.8h, wzr
dup v7.8h, wzr
dup v8.8h, wzr
dup v9.8h, wzr // v9 lanes 0-3: accumulator for the depth tail of columns 0-3
dup v10.8h, wzr
dup v11.8h, wzr
dup v12.8h, wzr
dup v13.8h, wzr
add x10, x7, x8 // x7, x10, x11, x12: pointers to rhs columns 0-3
add x11, x10, x8
add x12, x11, x8
Depth8_1x4: // main loop: 8 depth elements per iteration
cmp w9, #8
blt Depth1_1x4
ld1 {v0.8h}, [x15], #16
ld1 {v1.8h}, [x7], #16
ld1 {v2.8h}, [x10], #16
ld1 {v3.8h}, [x11], #16
ld1 {v4.8h}, [x12], #16
fmla v5.8h, v1.8h, v0.8h
fmla v6.8h, v2.8h, v0.8h
fmla v7.8h, v3.8h, v0.8h
fmla v8.8h, v4.8h, v0.8h
sub w9, w9, #8
cbz w9, End1x4
b Depth8_1x4
Depth1_1x4: // depth tail: 1 element per iteration
ld1 {v0.h}[0], [x15], #2
ld1 {v1.h}[0], [x7], #2
ld1 {v1.h}[1], [x10], #2
ld1 {v1.h}[2], [x11], #2
ld1 {v1.h}[3], [x12], #2
fmla v9.8h, v1.8h, v0.h[0]
sub w9, w9, #1
cbz w9, End1x4
b Depth1_1x4
End1x4:
faddp v10.8h, v5.8h, v6.8h // pairwise-add the four 8-lane sums
faddp v11.8h, v7.8h, v8.8h // down to one value per column ...
faddp v12.8h, v10.8h, v11.8h
faddp v13.8h, v12.8h, v12.8h // ... v13.4h = sums for columns 0-3
fadd v13.8h, v13.8h, v9.8h // add the depth-tail sums
cbz x3, Act1x4
ld1 {v14.4h}, [x3], #8
fadd v13.8h, v13.8h, v14.8h
Act1x4:
cmp w4, #3
beq Relu6_1x4
cmp w4, #1
beq Relu1x4
b Write1x4
Relu6_1x4:
movi v14.8h, #0x46, lsl #8 // 0x4600 = 6.0 in fp16
fmin v13.8h, v13.8h, v14.8h // clamp to 6.0, then fall through to the zero clamp
Relu1x4:
dup v14.8h, wzr
fmax v13.8h, v13.8h, v14.8h
Write1x4:
st1 {v13.4h}, [x2], #8
sub w6, w6, #4
cbz w6, End
add x1, x1, x13
b Loop
Loop1x1:
dup v2.8h, wzr
dup v3.8h, wzr
dup v4.8h, wzr
dup v5.8h, wzr
dup v6.8h, wzr
Depth8_1x1:
cmp w9, #8
blt Depth1_1x1
ld1 {v0.8h}, [x15], #16
ld1 {v1.8h}, [x7], #16
fmla v2.8h, v1.8h, v0.8h
sub w9, w9, #8
cbz w9, End1x1
b Depth8_1x1
Depth1_1x1:
ld1 {v0.h}[0], [x15], #2
ld1 {v1.h}[0], [x7], #2
fmla v3.8h, v1.8h, v0.h[0]
sub w9, w9, #1
cbz w9, End1x1
b Depth1_1x1
End1x1:
faddp v4.8h, v2.8h, v2.8h // reduce the 8 partial sums in v2 ...
faddp v5.8h, v4.8h, v4.8h
faddp v6.8h, v5.8h, v5.8h // ... to a single sum (lane 0 of v6)
fadd v6.8h, v6.8h, v3.8h // add the depth-tail sum (lane 0 of v3)
cbz x3, Act1x1
ld1 {v7.h}[0], [x3], #2
fadd v6.8h, v6.8h, v7.8h
Act1x1:
cmp w4, #3
beq Relu6_1x1
cmp w4, #1
beq Relu1x1
b Write1x1
Relu6_1x1:
movi v7.8h, #0x46, lsl #8 // 0x4600 = 6.0 in fp16
fmin v6.8h, v6.8h, v7.8h // clamp to 6.0, then fall through to the zero clamp
Relu1x1:
dup v7.8h, wzr
fmax v6.8h, v6.8h, v7.8h
Write1x1:
st1 {v6.h}[0], [x2], #2
sub w6, w6, #1
cbz w6, End
add x1, x1, x8
b Loop
End:
sub sp, sp, #128
ld1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64
ld1 {v12.8h, v13.8h, v14.8h, v15.8h}, [sp], #64
ret
#endif
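For reference, the following scalar C++ sketch spells out what the kernel above computes: a 1 x depth lhs row times a rhs stored as col contiguous depth-length blocks (matching the "rhs depthx1 block stride" above), plus an optional bias and the Relu/Relu6 clamps selected by act_type 1/3. It is only an illustration of the semantics; unlike the assembly it accumulates in float rather than fp16, and the float16_t typedef is an assumption standing in for the nnacl definition.

// Scalar reference for MatmulFp16Neon64_1xN (illustration only, not the optimized path).
typedef __fp16 float16_t;  // assumption: AArch64 half-precision scalar type

void MatVecMulFp16Ref(const float16_t *a, const float16_t *b, float16_t *c,
                      const float16_t *bias, int act_type, int depth, int col) {
  for (int j = 0; j < col; ++j) {
    float sum = (bias != nullptr) ? (float)bias[j] : 0.0f;
    const float16_t *bj = b + j * depth;  // rhs column j: depth contiguous fp16 values
    for (int k = 0; k < depth; ++k) {
      sum += (float)a[k] * (float)bj[k];
    }
    if (act_type == 3) sum = (sum > 6.0f) ? 6.0f : sum;                   // Relu6 clamps at 6 ...
    if (act_type == 1 || act_type == 3) sum = (sum < 0.0f) ? 0.0f : sum;  // ... and both clamp at 0
    c[j] = (float16_t)sum;
  }
}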

@ -289,6 +289,11 @@ void MatMulFp16(const float16_t *a, const float16_t *b, float16_t *c, const floa
return;
}
+void MatVecMulFp16(const float16_t *a, const float16_t *b, float16_t *c, const float16_t *bias, int act_type, int depth,
+int col) {
+MatmulFp16Neon64_1xN(a, b, c, bias, act_type, depth, col);
+}
void RowMajor2Col16MajorFp16Opt(float16_t *src_ptr, float16_t *dst_ptr, size_t row, size_t col) {
size_t row_up_16 = UP_ROUND(row, C16NUM);
size_t row16 = row / C16NUM * C16NUM;
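A minimal call sketch for the new wrapper, with hypothetical sizes and buffer names (nothing here comes from the patch except the MatVecMulFp16 signature); act_type follows the kernel's branch values, 1 for Relu, 3 for Relu6, anything else for no activation:

// Hypothetical single-row fullconnection y = Relu(W * x + bias), W stored row-major as col x depth.
constexpr int kDepth = 256;  // illustrative input size
constexpr int kCol = 1000;   // illustrative output size
float16_t x[kDepth], w[kCol * kDepth], bias[kCol], y[kCol];
// ... fill x, w, bias with fp16 data ...
MatVecMulFp16(x, w, y, bias, 1 /* Relu */, kDepth, kCol);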

@ -35,6 +35,9 @@ void MatMul16x8(const float16_t *a, const float16_t *b, float16_t *dst, const fl
void MatMulFp16(const float16_t *a, const float16_t *b, float16_t *c, const float16_t *bias, ActType act_type,
int depth, int row, int col, int stride, int out_type);
+void MatVecMulFp16(const float16_t *a, const float16_t *b, float16_t *c, const float16_t *bias, int act_type, int depth,
+int col);
void ColMajor2Row8MajorFp16(void *src_ptr, float16_t *dst_ptr, size_t row, size_t col, bool src_float16);
void RowMajor2Col16MajorFp16Opt(float16_t *src_ptr, float16_t *dst_ptr, size_t row, size_t col);
@ -45,6 +48,9 @@ void MatmulFp16Neon64(const float16_t *a, const float16_t *b, float16_t *c, cons
void MatmulFp16Neon64Opt(const float16_t *a, const float16_t *b, float16_t *c, const float16_t *bias, int act_type,
size_t depth, size_t row, size_t col, size_t stride, size_t write_nhwc);
+void MatmulFp16Neon64_1xN(const float16_t *a, const float16_t *b, float16_t *c, const float16_t *bias, int act_type,
+int depth, int col);
void RowMajor2Col16MajorFp16(void *src, float16_t *dst, int row, int col, bool is_fp32_src);
void RowMajor2Row16MajorFp16(void *src, float16_t *dst, int row, int col, bool is_fp32_src);

@ -62,38 +62,59 @@ int FullconnectionFP16CPUKernel::ReSize() {
thread_count_ = MSMIN(thread_count_, UP_DIV(fc_param_->col_, C8NUM));
thread_stride_ = UP_DIV(UP_DIV(fc_param_->col_, C8NUM), thread_count_) * C8NUM;
+if (row == 1) is_vector_input_ = true;
+int a_pack_row = 0;
+int b_pack_col = 0;
+if (is_vector_input_) {
+a_pack_row = 1;
+b_pack_col = fc_param_->col_;
+} else {
+a_pack_row = fc_param_->row_16_;
+b_pack_col = fc_param_->col_8_;
+}
a_pack_ptr_ =
-reinterpret_cast<float16_t *>(ctx_->allocator->Malloc(fc_param_->row_16_ * fc_param_->deep_ * sizeof(float16_t)));
+reinterpret_cast<float16_t *>(ctx_->allocator->Malloc(a_pack_row * fc_param_->deep_ * sizeof(float16_t)));
if (a_pack_ptr_ == nullptr) {
FreeTmpBuffer();
return RET_MEMORY_FAILED;
}
-memset(a_pack_ptr_, 0, fc_param_->row_16_ * fc_param_->deep_ * sizeof(float16_t));
+memset(a_pack_ptr_, 0, a_pack_row * fc_param_->deep_ * sizeof(float16_t));
b_pack_ptr_ =
-reinterpret_cast<float16_t *>(ctx_->allocator->Malloc(fc_param_->col_8_ * fc_param_->deep_ * sizeof(float16_t)));
+reinterpret_cast<float16_t *>(ctx_->allocator->Malloc(b_pack_col * fc_param_->deep_ * sizeof(float16_t)));
if (b_pack_ptr_ == nullptr) {
FreeTmpBuffer();
return RET_MEMORY_FAILED;
}
-memset(b_pack_ptr_, 0, fc_param_->col_8_ * fc_param_->deep_ * sizeof(float16_t));
+memset(b_pack_ptr_, 0, b_pack_col * fc_param_->deep_ * sizeof(float16_t));
fc_param_->b_const_ = (in_tensors_[1]->data_c() != nullptr);
if (fc_param_->b_const_) {
if (in_tensors_[1]->data_type() == kNumberTypeFloat32) {
-InitMatrixB(reinterpret_cast<float *>(in_tensors_[1]->data_c()), b_pack_ptr_);
+if (is_vector_input_) {
+Float32ToFloat16(reinterpret_cast<float *>(in_tensors_[1]->data_c()), b_pack_ptr_,
+fc_param_->col_ * fc_param_->deep_);
+} else {
+InitMatrixB(reinterpret_cast<float *>(in_tensors_[1]->data_c()), b_pack_ptr_);
+}
} else {
-InitMatrixB(reinterpret_cast<float16_t *>(in_tensors_[1]->data_c()), b_pack_ptr_);
+if (is_vector_input_) {
+memcpy(b_pack_ptr_, reinterpret_cast<float16_t *>(in_tensors_[1]->data_c()),
+fc_param_->col_ * fc_param_->deep_ * sizeof(float16_t));
+} else {
+InitMatrixB(reinterpret_cast<float16_t *>(in_tensors_[1]->data_c()), b_pack_ptr_);
+}
}
+b_ptr_ = b_pack_ptr_;
}
if (in_tensors_.size() == 3) {
-bias_ptr_ = reinterpret_cast<float16_t *>(ctx_->allocator->Malloc(fc_param_->col_8_ * sizeof(float16_t)));
+bias_ptr_ = reinterpret_cast<float16_t *>(ctx_->allocator->Malloc(b_pack_col * sizeof(float16_t)));
if (bias_ptr_ == nullptr) {
FreeTmpBuffer();
return RET_MEMORY_FAILED;
}
-memset(bias_ptr_, 0, fc_param_->col_8_ * sizeof(float16_t));
+memset(bias_ptr_, 0, b_pack_col * sizeof(float16_t));
Float32ToFloat16(reinterpret_cast<float *>(in_tensors_[2]->data_c()), bias_ptr_, fc_param_->col_);
}
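As a rough illustration of what the vector branch changes in ReSize, take an assumed shape of row = 1, deep = 256, col = 1000, and assume row_16_ / col_8_ are row and col rounded up to multiples of 16 and 8, as the names suggest:

packed path : a_pack_ptr_ holds row_16_ * deep = 16 * 256 = 4096 fp16 values (15/16 of them zero padding)
              and A is repacked Row-major to Col16-major; b_pack_ptr_ holds col_8_ * deep values in the
              8-column-tiled layout produced by InitMatrixB.
vector path : a_pack_ptr_ holds 1 * deep = 256 fp16 values (an fp32 input is only converted, an fp16 input
              is later used in place, see the Run() hunk below); b_pack_ptr_ keeps the original col x deep
              layout, so a const fp16 weight is copied verbatim instead of being repacked.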
@ -102,7 +123,7 @@ int FullconnectionFP16CPUKernel::ReSize() {
reinterpret_cast<float16_t *>(ctx_->allocator->Malloc(fc_param_->row_ * fc_param_->col_ * sizeof(float16_t)));
}
return RET_OK;
}
} // namespace mindspore::kernel
void FullconnectionFP16CPUKernel::InitMatrixA(float *a_ptr, float16_t *a_pack_ptr) {
RowMajor2Col16MajorFp16(reinterpret_cast<void *>(a_ptr), a_pack_ptr, fc_param_->row_, fc_param_->deep_, true);
@ -133,11 +154,16 @@ int FullconnectionFP16CPUKernel::RunImpl(int task_id) {
if (cur_oc <= 0) {
return RET_OK;
}
-auto b = b_pack_ptr_ + task_id * thread_stride_ * fc_param_->deep_;
+auto b = b_ptr_ + task_id * thread_stride_ * fc_param_->deep_;
auto bias = (bias_ptr_ == nullptr) ? nullptr : bias_ptr_ + thread_stride_ * task_id;
auto c = output_ptr_ + task_id * thread_stride_;
-MatMulFp16(a_pack_ptr_, b, c, bias, fc_param_->act_type_, fc_param_->deep_, fc_param_->row_, cur_oc, fc_param_->col_,
-OutType_Nhwc);
+if (is_vector_input_) {
+MatVecMulFp16(a_ptr_, b, c, bias, fc_param_->act_type_, fc_param_->deep_, cur_oc);
+} else {
+MatMulFp16(a_ptr_, b, c, bias, fc_param_->act_type_, fc_param_->deep_, fc_param_->row_, cur_oc, fc_param_->col_,
+OutType_Nhwc);
+}
return RET_OK;
}
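For the threaded column split, a worked example with assumed numbers (col = 1000, thread_count_ = 4, C8NUM = 8), using the thread_stride_ formula from the ReSize hunk above; cur_oc is computed outside the shown lines and is presumably the remaining columns capped at thread_stride_:

thread_stride_ = UP_DIV(UP_DIV(1000, 8), 4) * 8 = UP_DIV(125, 4) * 8 = 32 * 8 = 256
tasks 0-2 : columns 0-255, 256-511, 512-767 (cur_oc = 256 each)
task 3    : columns 768-999 (cur_oc = 232)

Each task's rhs slice starts at b_ptr_ + task_id * thread_stride_ * deep_, which works unchanged in the vector path because B keeps its contiguous col x deep layout.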
@ -163,16 +189,39 @@ int FullconnectionFP16CPUKernel::Run() {
} else {
output_ptr_ = reinterpret_cast<float16_t *>(out_tensor->data_c());
}
if (in_tensors_[0]->data_type() == kNumberTypeFloat32) {
-InitMatrixA(reinterpret_cast<float *>(in_tensors_[0]->data_c()), a_pack_ptr_);
+if (is_vector_input_) {
+Float32ToFloat16(reinterpret_cast<float *>(in_tensors_[0]->data_c()), a_pack_ptr_, fc_param_->deep_);
+} else {
+InitMatrixA(reinterpret_cast<float *>(in_tensors_[0]->data_c()), a_pack_ptr_);
+}
+a_ptr_ = a_pack_ptr_;
} else {
-InitMatrixA(reinterpret_cast<float16_t *>(in_tensors_[0]->data_c()), a_pack_ptr_);
+if (is_vector_input_) {
+a_ptr_ = reinterpret_cast<float16_t *>(in_tensors_[0]->data_c());
+} else {
+InitMatrixA(reinterpret_cast<float16_t *>(in_tensors_[0]->data_c()), a_pack_ptr_);
+a_ptr_ = a_pack_ptr_;
+}
}
if (!fc_param_->b_const_) {
if (in_tensors_[1]->data_type() == kNumberTypeFloat32) {
-InitMatrixB(reinterpret_cast<float *>(in_tensors_[1]->data_c()), b_pack_ptr_);
+if (is_vector_input_) {
+Float32ToFloat16(reinterpret_cast<float *>(in_tensors_[1]->data_c()), b_pack_ptr_,
+fc_param_->col_ * fc_param_->deep_);
+} else {
+InitMatrixB(reinterpret_cast<float *>(in_tensors_[1]->data_c()), b_pack_ptr_);
+}
+b_ptr_ = b_pack_ptr_;
} else {
-InitMatrixB(reinterpret_cast<float16_t *>(in_tensors_[1]->data_c()), b_pack_ptr_);
+if (is_vector_input_) {
+b_ptr_ = reinterpret_cast<float16_t *>(in_tensors_[1]->data_c());
+} else {
+InitMatrixB(reinterpret_cast<float16_t *>(in_tensors_[1]->data_c()), b_pack_ptr_);
+b_ptr_ = b_pack_ptr_;
+}
}
}
ParallelLaunch(this->context_->thread_pool_, FcFP16Run, this, thread_count_);

@ -51,6 +51,9 @@ class FullconnectionFP16CPUKernel : public FullconnectionBaseCPUKernel {
float16_t *bias_ptr_ = nullptr;
float16_t *output_fp16_ = nullptr;
float16_t *output_ptr_ = nullptr;
+float16_t *a_ptr_ = nullptr;
+float16_t *b_ptr_ = nullptr;
+bool is_vector_input_ = false;
};
} // namespace mindspore::kernel
