fix fp16 matmul bug

pull/7059/head
lixian 4 years ago
parent 0e8b56c1e5
commit d573a1180d

@@ -1195,8 +1195,6 @@ LoopRow:
 st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x19], #64
 st1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x19], #64
 st1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x19], #64
-st1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x19], #64
-st1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x19], #64
 add x11, x11, x16
 b WriteEnd
 WriteWino:
@@ -1217,14 +1215,6 @@ LoopRow:
 st1 {v29.8h}, [x11], x15
 st1 {v30.8h}, [x11], x15
 st1 {v31.8h}, [x11], x15
-st1 {v24.8h}, [x11], x15
-st1 {v25.8h}, [x11], x15
-st1 {v26.8h}, [x11], x15
-st1 {v27.8h}, [x11], x15
-st1 {v28.8h}, [x11], x15
-st1 {v29.8h}, [x11], x15
-st1 {v30.8h}, [x11], x15
-st1 {v31.8h}, [x11], x15
 b WriteEnd
 Write8:
 add x2, x2, #16
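
Note on the two assembly hunks above: the removed lines stored the same result registers (v24-v31) a second time. Because these st1 stores use post-indexed addressing, every extra store also advances the write pointer (by #64 or by x15), so the tile was written twice and the second copy spilled into the memory reserved for the next rows of the output. A minimal C sketch of the post-indexed store semantics, for illustration only (the helper name and types below are mine, not from the patch):

#include <stdint.h>
#include <string.h>

typedef uint16_t fp16_raw; /* raw 16-bit lane, stand-in for one float16 element */

/* Emulates "st1 {vN.8h}, [x11], x15": write 8 fp16 lanes, then bump the
 * destination pointer by the row stride.  Issuing the same store twice both
 * duplicates the payload and lands the second copy where the next row's
 * result belongs, which is the corruption this commit removes. */
static inline void st1_8h_post(fp16_raw **dst, const fp16_raw lanes[8], size_t stride_bytes) {
  memcpy(*dst, lanes, 8 * sizeof(fp16_raw));
  *dst = (fp16_raw *)((uint8_t *)*dst + stride_bytes);
}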

@@ -205,8 +205,8 @@ void ConvWinogardFp16(float16_t *input_data, float16_t *trans_weight, const floa
 float16_t *tmp_col_ptr = col_buffer + task_id * col_buffer_offset;
 for (int i = 0; i < input_unit_square; ++i) {
 RowMajor2Col16MajorFp16Opt(src_ptr + i * tile_num * in_channel, tmp_col_ptr, tile_num, in_channel);
-MatMul16x8(tmp_col_ptr, trans_weight + i * in_channel * oc8 * C8NUM, dst_ptr + i * C8NUM, NULL, 0, in_channel,
-cal_num, oc8 * C8NUM, input_unit_square, false);
+MatMulFp16(tmp_col_ptr, trans_weight + i * in_channel * oc8 * C8NUM, dst_ptr + i * C8NUM, NULL, 0, in_channel,
+cal_num, oc8 * C8NUM, input_unit_square, OutType_TileC8);
 }
 // step 4 : output transform

@@ -104,11 +104,11 @@ void MatMul16x8(const float16_t *a, const float16_t *b, float16_t *dst, const fl
 }
 void MatMulFp16(const float16_t *a, const float16_t *b, float16_t *c, const float16_t *bias, ActType act_type,
-int depth, int row, int col, int stride, bool write_nhwc) {
-if (!write_nhwc) {
-MatmulFp16Neon64(a, b, c, bias, (int)act_type, depth, row, col, stride, write_nhwc);
+int depth, int row, int col, int stride, int out_type) {
+if (out_type == OutType_C8) {
+MatmulFp16Neon64(a, b, c, bias, (int)act_type, depth, row, col, stride, false);
 } else {
-MatmulFp16Neon64Opt(a, b, c, bias, (int)act_type, depth, row, col, stride, 1);
+MatmulFp16Neon64Opt(a, b, c, bias, (int)act_type, depth, row, col, stride, out_type);
 }
 return;
 }
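
Note: MatMulFp16 now takes an out_type selector instead of the write_nhwc flag; OutType_C8 keeps the packed MatmulFp16Neon64 path, and every other value is forwarded to MatmulFp16Neon64Opt. A minimal sketch of the caller-side migration implied by this patch; the compat wrapper and the header path are assumptions, only the MatMulFp16 signature and the OutType_* names come from the diff:

#include <stdbool.h>
#include "nnacl/fp16/matmul_fp16.h" /* header path assumed; declares MatMulFp16, float16_t, ActType, OutType_* */

/* Hypothetical helper: maps the old bool flag onto the new selector the same
 * way the call sites below were updated (true -> OutType_Nhwc, false -> OutType_C8). */
static void MatMulFp16Compat(const float16_t *a, const float16_t *b, float16_t *c, const float16_t *bias,
                             ActType act_type, int depth, int row, int col, int stride, bool write_nhwc) {
  MatMulFp16(a, b, c, bias, act_type, depth, row, col, stride, write_nhwc ? OutType_Nhwc : OutType_C8);
}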

@@ -33,7 +33,7 @@ void MatMul16x8(const float16_t *a, const float16_t *b, float16_t *dst, const fl
 int deep, int row, int col, int stride, bool write_nhwc);
 void MatMulFp16(const float16_t *a, const float16_t *b, float16_t *c, const float16_t *bias, ActType act_type,
-int depth, int row, int col, int stride, bool write_nhwc);
+int depth, int row, int col, int stride, int out_type);
 void ColMajor2Row8MajorFp16(void *src_ptr, float16_t *dst_ptr, size_t row, size_t col, bool src_float16);
@@ -43,7 +43,7 @@ void MatmulFp16Neon64(const float16_t *a, const float16_t *b, float16_t *c, cons
 size_t depth, size_t row, size_t col, size_t stride, bool write_nhwc);
 void MatmulFp16Neon64Opt(const float16_t *a, const float16_t *b, float16_t *c, const float16_t *bias, int act_type,
-size_t depth, size_t row, size_t col, size_t stride, int write_nhwc);
+size_t depth, size_t row, size_t col, size_t stride, size_t write_nhwc);
 void RowMajor2Col16MajorFp16(void *src, float16_t *dst, int row, int col, bool is_fp32_src);
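
Note: the OutType_* values used throughout this patch (OutType_C8, OutType_Nhwc, OutType_TileC8) are not defined in these hunks; they are expected to come from the project's matmul parameter header. For reading the diff, a placeholder with the names taken from the call sites (numeric values assumed, the real definition lives elsewhere in the tree) would look like:

typedef enum OutType {
  OutType_C8,     /* column-8 packed output  (deconv path, replaces write_nhwc == false)            */
  OutType_Nhwc,   /* plain NHWC output       (1x1 conv / fullconnection / matmul, replaces == true) */
  OutType_TileC8  /* tiled C8 output         (Winograd convolution, previously the MatMul16x8 path) */
} OutType;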

@@ -171,7 +171,7 @@ int Convolution1x1FP16CPUKernel::RunOc(int task_id) {
 MatMulFp16(pack_input_, weight_ptr_ + task_id * thread_stride_ * matmul_param_->deep_,
 output_ptr_ + task_id * thread_stride_, bias, matmul_param_->act_type_, matmul_param_->deep_,
-matmul_param_->row_, cur_oc, matmul_param_->col_, true);
+matmul_param_->row_, cur_oc, matmul_param_->col_, OutType_Nhwc);
 return RET_OK;
 }
@@ -189,7 +189,8 @@ int Convolution1x1FP16CPUKernel::RunHw(int task_id) {
 float16_t *thread_output_ptr = output_ptr_ + task_id * thread_stride_ * matmul_param_->col_;
 MatMulFp16(thread_pack_input, weight_ptr_, thread_output_ptr, reinterpret_cast<float16_t *>(bias_data_),
-matmul_param_->act_type_, matmul_param_->deep_, cur_hw_, matmul_param_->col_, matmul_param_->col_, true);
+matmul_param_->act_type_, matmul_param_->deep_, cur_hw_, matmul_param_->col_, matmul_param_->col_,
+OutType_Nhwc);
 return RET_OK;
 }

@@ -156,7 +156,7 @@ int DeConvolutionFp16CPUKernel::DoDeconv(int task_id) {
 auto tmp_buf = tmp_buffer_ + task_id * thread_stride_ * C8NUM * kernel_plane_ * matmul_param_->row_16_;
 MatMulFp16(pack_input_, execute_weight_ + task_id * thread_stride_ * C8NUM * kernel_plane_ * matmul_param_->deep_,
 tmp_buf, nullptr, ActType_No, matmul_param_->deep_, matmul_param_->row_, oc * C8NUM * kernel_plane_, 0,
-false);
+OutType_C8);
 DeConvPostFp16(tmp_buf, pack_output_ + task_id * thread_stride_ * C8NUM * output_plane_,
 reinterpret_cast<float16_t *>(bias_data_) + task_id * thread_stride_ * C8NUM,
 execute_output_ + task_id * thread_stride_ * C8NUM, oc_res, conv_param_);

@@ -137,7 +137,7 @@ int FullconnectionFP16CPUKernel::RunImpl(int task_id) {
 auto bias = (bias_ptr_ == nullptr) ? nullptr : bias_ptr_ + thread_stride_ * task_id;
 auto c = output_ptr_ + task_id * thread_stride_;
 MatMulFp16(a_pack_ptr_, b, c, bias, fc_param_->act_type_, fc_param_->deep_, fc_param_->row_, cur_oc, fc_param_->col_,
-true);
+OutType_Nhwc);
 return RET_OK;
 }

@@ -193,7 +193,7 @@ int MatmulFP16CPUKernel::RunImpl(int task_id) {
 auto b = current_b_ + task_id * thread_stride_ * params_->deep_;
 auto bias = (bias_ptr_ == nullptr) ? nullptr : bias_ptr_ + thread_stride_ * task_id;
 auto c = current_c_ + task_id * thread_stride_;
-MatMulFp16(current_a_, b, c, bias, ActType_No, params_->deep_, params_->row_, cur_oc, params_->col_, true);
+MatMulFp16(current_a_, b, c, bias, ActType_No, params_->deep_, params_->row_, cur_oc, params_->col_, OutType_Nhwc);
 return RET_OK;
 }
