@@ -62,6 +62,7 @@ void Convolution1x1CPUKernel::InitConv1x1MatmulParam() {
   matmul_param_->row_4_ = UP_ROUND(matmul_param_->row_, C4NUM);
   matmul_param_->row_6_ = UP_ROUND(matmul_param_->row_, C6NUM);
   matmul_param_->row_12_ = UP_ROUND(matmul_param_->row_, C12NUM);
+  matmul_param_->row_align_ = UP_ROUND(matmul_param_->row_, row_tile_);
   matmul_param_->col_8_ = UP_ROUND(matmul_param_->col_, C8NUM);
   matmul_param_->act_type_ = conv_param_->act_type_;
   return;
@@ -73,14 +74,21 @@ int Convolution1x1CPUKernel::InitConv1x1BiasWeight() {
   auto output_channel = filter_tensor->Batch();
 
 #ifdef ENABLE_AVX
-  int col_tile = C16NUM;
+  row_tile_ = C6NUM;
+  col_tile_ = C16NUM;
+#elif defined(ENABLE_SSE)
+  row_tile_ = C4NUM;
+  col_tile_ = C8NUM;
 #elif defined(ENABLE_ARM32)
-  int col_tile = C4NUM;
+  row_tile_ = C12NUM;
+  col_tile_ = C4NUM;
 #else
-  int col_tile = C8NUM;
+  row_tile_ = C12NUM;
+  col_tile_ = C8NUM;
 #endif
   if (in_tensors_.size() == 3) {
-    int size = UP_ROUND(output_channel, col_tile) * sizeof(float);
+    int size = UP_ROUND(output_channel, col_tile_) * sizeof(float);
     int weight_size = output_channel * sizeof(float);
     bias_data_ = malloc(size);
     if (bias_data_ == nullptr) {
@@ -91,8 +99,8 @@ int Convolution1x1CPUKernel::InitConv1x1BiasWeight() {
     memset(reinterpret_cast<char *>(bias_data_) + weight_size, 0, size - weight_size);
   }
 
-  int size = input_channel * UP_ROUND(output_channel, col_tile) * sizeof(float);
-  int down_size = input_channel * DOWN_DIV(output_channel, col_tile) * col_tile * sizeof(float);
+  int size = input_channel * UP_ROUND(output_channel, col_tile_) * sizeof(float);
+  int down_size = input_channel * DOWN_DIV(output_channel, col_tile_) * col_tile_ * sizeof(float);
   weight_ptr_ = reinterpret_cast<float *>(malloc(size));
   if (weight_ptr_ == nullptr) {
     MS_LOG(ERROR) << "Conv1x1 Malloc weight_ptr_ error!";
@@ -113,27 +121,14 @@ int Convolution1x1CPUKernel::InitConv1x1BiasWeight() {
 }
 
 int Convolution1x1CPUKernel::InitConv1x1Param() {
-  int hw_tile = C12NUM;
-#ifdef ENABLE_AVX
-  hw_tile = C6NUM;
-#elif defined(ENABLE_SSE)
-  hw_tile = C4NUM;
-#endif
-  if ((matmul_param_->row_ > (hw_tile * op_parameter_->thread_num_)) && (matmul_param_->row_ > matmul_param_->col_)) {
+  if ((matmul_param_->row_ > (row_tile_ * op_parameter_->thread_num_)) && (matmul_param_->row_ > matmul_param_->col_)) {
     multi_thread_by_hw_ = true;
-    thread_count_ = MSMIN(op_parameter_->thread_num_, UP_DIV(matmul_param_->row_, hw_tile));
-    thread_stride_ = UP_DIV(UP_DIV(matmul_param_->row_, hw_tile), thread_count_) * hw_tile;
+    thread_count_ = MSMIN(op_parameter_->thread_num_, UP_DIV(matmul_param_->row_, row_tile_));
+    thread_stride_ = UP_DIV(UP_DIV(matmul_param_->row_, row_tile_), thread_count_) * row_tile_;
   } else {
-#ifdef ENABLE_AVX
-    int col_tile = C16NUM;
-#elif defined(ENABLE_ARM32)
-    int col_tile = C4NUM;
-#else
-    int col_tile = C8NUM;
-#endif
     multi_thread_by_hw_ = false;
-    thread_count_ = MSMIN(op_parameter_->thread_num_, UP_DIV(matmul_param_->col_, col_tile));
-    thread_stride_ = UP_DIV(UP_DIV(matmul_param_->col_, col_tile), thread_count_) * col_tile;
+    thread_count_ = MSMIN(op_parameter_->thread_num_, UP_DIV(matmul_param_->col_, col_tile_));
+    thread_stride_ = UP_DIV(UP_DIV(matmul_param_->col_, col_tile_), thread_count_) * col_tile_;
   }
 
   pre_trans_input_ = (conv_param_->pad_u_ != 0 || conv_param_->pad_l_ != 0 || conv_param_->stride_h_ != 1 ||
@@ -167,6 +162,16 @@ int Convolution1x1CPUKernel::Init() {
   return ReSize();
 }
 
+void Convolution1x1CPUKernel::PackMatmulInput(const float *src_ptr, float *dst_ptr, int row, int col) {
+#if ENABLE_AVX
+  RowMajor2Col6Major(src_ptr, dst_ptr, row, col);
+#elif defined(ENABLE_SSE)
+  RowMajor2Col4Major(src_ptr, dst_ptr, row, col);
+#else
+  RowMajor2Col12Major(src_ptr, dst_ptr, row, col);
+#endif
+}
+
 int Convolution1x1CPUKernel::DoConv1x1(int task_id) {
   int res_stride = matmul_param_->col_ - task_id * thread_stride_;
   int cur_oc = MSMIN(thread_stride_, res_stride);
@@ -198,20 +203,20 @@ int Convolution1x1CPUKernel::DoConv1x1Hw(int task_id) {
   }
 
   float *thread_input_ptr = input_ptr_ + task_id * thread_stride_ * matmul_param_->deep_;
-  float *thread_pack_input = pack_input_ + task_id * thread_stride_ * matmul_param_->deep_;
-
-#if ENABLE_AVX
-  RowMajor2Col6Major(thread_input_ptr, thread_pack_input, cur_hw_, matmul_param_->deep_);
-#elif defined(ENABLE_SSE)
-  RowMajor2Col4Major(thread_input_ptr, thread_pack_input, cur_hw_, matmul_param_->deep_);
-#else
-  RowMajor2Col12Major(thread_input_ptr, thread_pack_input, cur_hw_, matmul_param_->deep_);
-#endif
-
+  float *thread_pack_input = pack_input_ + task_id * row_tile_ * matmul_param_->deep_;
   float *thread_output_ptr = output_ptr_ + task_id * thread_stride_ * matmul_param_->col_;
-  MatMulOpt(thread_pack_input, weight_ptr_, thread_output_ptr, reinterpret_cast<float *>(bias_data_),
-            matmul_param_->act_type_, matmul_param_->deep_, cur_hw_, matmul_param_->col_, matmul_param_->col_,
+  float *cur_intput = thread_input_ptr;
+  float *cur_output = thread_output_ptr;
+  for (int i = 0; i < cur_hw_; i += row_tile_) {
+    int cur_rows = (cur_hw_ - i >= row_tile_) ? row_tile_ : (cur_hw_ - i);
+    PackMatmulInput(cur_intput, thread_pack_input, cur_rows, matmul_param_->deep_);
+    MatMulOpt(thread_pack_input, weight_ptr_, cur_output, reinterpret_cast<float *>(bias_data_),
+              matmul_param_->act_type_, matmul_param_->deep_, cur_rows, matmul_param_->col_, matmul_param_->col_,
               OutType_Nhwc);
+    cur_intput += row_tile_ * matmul_param_->deep_;
+    cur_output += row_tile_ * matmul_param_->col_;
+  }
 
   return RET_OK;
 }
@@ -228,17 +233,9 @@ int Convolution1x1RunHw(void *cdata, int task_id) {
 int Convolution1x1CPUKernel::Run() {
   auto src_in = reinterpret_cast<float *>(in_tensors_[0]->MutableData());
   auto src_out = reinterpret_cast<float *>(out_tensors_[0]->MutableData());
-
-#ifdef ENABLE_AVX
-  pack_input_ =
-    reinterpret_cast<float *>(ctx_->allocator->Malloc(matmul_param_->row_6_ * matmul_param_->deep_ * sizeof(float)));
-#elif defined(ENABLE_SSE)
-  pack_input_ =
-    reinterpret_cast<float *>(ctx_->allocator->Malloc(matmul_param_->row_4_ * matmul_param_->deep_ * sizeof(float)));
-#else
-  pack_input_ =
-    reinterpret_cast<float *>(ctx_->allocator->Malloc(matmul_param_->row_12_ * matmul_param_->deep_ * sizeof(float)));
-#endif
+  int pack_input_size = multi_thread_by_hw_ ? (thread_count_ * row_tile_ * matmul_param_->deep_)
+                                            : (matmul_param_->row_align_ * matmul_param_->deep_);
+  pack_input_ = reinterpret_cast<float *>(ctx_->allocator->Malloc(pack_input_size * sizeof(float)));
   if (pack_input_ == nullptr) {
     MS_LOG(ERROR) << "Conv1x1 Malloc pack_input_ error!";
     return RET_MEMORY_FAILED;
@@ -256,13 +253,7 @@ int Convolution1x1CPUKernel::Run() {
     if (multi_thread_by_hw_) {
       ParallelLaunch(this->context_->thread_pool_, Convolution1x1RunHw, this, thread_count_);
     } else {
-#ifdef ENABLE_AVX
-      RowMajor2Col6Major(input_ptr_, pack_input_, matmul_param_->row_, matmul_param_->deep_);
-#elif defined(ENABLE_SSE)
-      RowMajor2Col4Major(input_ptr_, pack_input_, matmul_param_->row_, matmul_param_->deep_);
-#else
-      RowMajor2Col12Major(input_ptr_, pack_input_, matmul_param_->row_, matmul_param_->deep_);
-#endif
+      PackMatmulInput(input_ptr_, pack_input_, matmul_param_->row_, matmul_param_->deep_);
       ParallelLaunch(this->context_->thread_pool_, Convolution1x1Run, this, thread_count_);
     }
   }
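
For readers skimming the patch, here is a small stand-alone sketch (not part of the change) of the tiling arithmetic the new row_tile_-based path relies on: how thread_count_, thread_stride_, and the per-task pack_input_ buffer are derived on the multi-thread-by-hw path, and the row_tile_-sized loop DoConv1x1Hw now runs. The UP_DIV/UP_ROUND semantics, the concrete sizes, and the UpDiv/UpRound helper names are assumptions for illustration; only the formulas mirror the diff.

#include <algorithm>
#include <cstdio>

// Assumed semantics of the round-up helpers used throughout the patch.
constexpr int UpDiv(int x, int y) { return (x + y - 1) / y; }
constexpr int UpRound(int x, int y) { return UpDiv(x, y) * y; }

int main() {
  const int row = 1000;      // matmul_param_->row_  (output_h * output_w)
  const int deep = 64;       // matmul_param_->deep_ (input channel)
  const int row_tile = 12;   // row_tile_ on the plain C path (C12NUM)
  const int thread_num = 4;  // op_parameter_->thread_num_

  // InitConv1x1Param, multi-thread-by-hw branch: each task gets a contiguous,
  // tile-aligned stride of output rows.
  const int thread_count = std::min(thread_num, UpDiv(row, row_tile));
  const int thread_stride = UpDiv(UpDiv(row, row_tile), thread_count) * row_tile;

  // Run(): one row_tile-sized pack buffer per task replaces the old
  // row_12_/row_6_/row_4_-sized packing of the whole input.
  const int pack_input_size = thread_count * row_tile * deep;
  const int old_pack_size = UpRound(row, row_tile) * deep;
  std::printf("threads=%d stride=%d pack=%d floats (previously %d)\n", thread_count, thread_stride,
              pack_input_size, old_pack_size);

  // DoConv1x1Hw: pack and multiply row_tile rows at a time.
  const int task_id = 0;
  const int cur_hw = std::min(thread_stride, row - task_id * thread_stride);
  int tiles = 0;
  for (int i = 0; i < cur_hw; i += row_tile) {
    const int cur_rows = (cur_hw - i >= row_tile) ? row_tile : (cur_hw - i);
    // PackMatmulInput(cur_input, pack_buf, cur_rows, deep);
    // MatMulOpt(pack_buf, weight, cur_out, bias, act, deep, cur_rows, col, col, OutType_Nhwc);
    (void)cur_rows;
    ++tiles;
  }
  std::printf("task %d covers %d rows in %d tiles of at most %d\n", task_id, cur_hw, tiles, row_tile);
  return 0;
}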