|
|
|
@ -29,15 +29,15 @@ MatmulCPUKernel::~MatmulCPUKernel() { FreeTmpBuffer(); }
|
|
|
|
|
|
|
|
|
|
void MatmulCPUKernel::FreeTmpBuffer() {
|
|
|
|
|
if (a_c12_ptr_ != nullptr) {
|
|
|
|
|
ctx_->allocator->Free(a_c12_ptr_);
|
|
|
|
|
free(a_c12_ptr_);
|
|
|
|
|
a_c12_ptr_ = nullptr;
|
|
|
|
|
}
|
|
|
|
|
if (b_r8_ptr_ != nullptr) {
|
|
|
|
|
ctx_->allocator->Free(b_r8_ptr_);
|
|
|
|
|
free(b_r8_ptr_);
|
|
|
|
|
b_r8_ptr_ = nullptr;
|
|
|
|
|
}
|
|
|
|
|
if (bias_ptr_ != nullptr) {
|
|
|
|
|
ctx_->allocator->Free(bias_ptr_);
|
|
|
|
|
free(bias_ptr_);
|
|
|
|
|
bias_ptr_ = nullptr;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
@ -67,23 +67,28 @@ int MatmulCPUKernel::ReSize() {
|
|
|
|
|
thread_count_ = MSMIN(thread_count_, UP_DIV(params_->col_8_, 8));
|
|
|
|
|
thread_stride_ = UP_DIV(UP_DIV(params_->col_8_, 8), thread_count_);
|
|
|
|
|
|
|
|
|
|
a_c12_ptr_ = reinterpret_cast<float *>(ctx_->allocator->Malloc(params_->row_12_ * params_->deep_ * sizeof(float)));
|
|
|
|
|
a_c12_ptr_ = reinterpret_cast<float *>(malloc(params_->batch * params_->row_12_ * params_->deep_ * sizeof(float)));
|
|
|
|
|
if (a_c12_ptr_ == nullptr) {
|
|
|
|
|
FreeTmpBuffer();
|
|
|
|
|
return RET_MEMORY_FAILED;
|
|
|
|
|
}
|
|
|
|
|
memset(a_c12_ptr_, 0, params_->row_12_ * params_->deep_ * sizeof(float));
|
|
|
|
|
b_r8_ptr_ = reinterpret_cast<float *>(ctx_->allocator->Malloc(params_->col_8_ * params_->deep_ * sizeof(float)));
|
|
|
|
|
|
|
|
|
|
b_r8_ptr_ = reinterpret_cast<float *>(malloc(params_->batch * params_->col_8_ * params_->deep_ * sizeof(float)));
|
|
|
|
|
if (b_r8_ptr_ == nullptr) {
|
|
|
|
|
FreeTmpBuffer();
|
|
|
|
|
return RET_MEMORY_FAILED;
|
|
|
|
|
}
|
|
|
|
|
memset(b_r8_ptr_, 0, params_->col_8_ * params_->deep_ * sizeof(float));
|
|
|
|
|
|
|
|
|
|
params_->a_const_ = false;
|
|
|
|
|
params_->b_const_ = false;
|
|
|
|
|
InitMatrixA(reinterpret_cast<float *>(in_tensors_[0]->Data()), a_c12_ptr_);
|
|
|
|
|
InitMatrixB(reinterpret_cast<float *>(in_tensors_[1]->Data()), b_r8_ptr_);
|
|
|
|
|
params_->a_const_ = (in_tensors_[0]->Data() != nullptr);
|
|
|
|
|
params_->b_const_ = (in_tensors_[1]->Data() != nullptr);
|
|
|
|
|
if (params_->a_const_ == true) {
|
|
|
|
|
InitMatrixA(reinterpret_cast<float *>(in_tensors_[0]->Data()), a_c12_ptr_);
|
|
|
|
|
}
|
|
|
|
|
if (params_->b_const_ == true) {
|
|
|
|
|
InitMatrixB(reinterpret_cast<float *>(in_tensors_[1]->Data()), b_r8_ptr_);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
bias_ptr_ = reinterpret_cast<float *>(malloc(params_->col_8_ * sizeof(float)));
|
|
|
|
|
if (bias_ptr_ == nullptr) {
|
|
|
|
@ -99,35 +104,27 @@ int MatmulCPUKernel::ReSize() {
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void MatmulCPUKernel::InitMatrixA(float *src_ptr, float *dst_ptr) {
|
|
|
|
|
if (params_->a_const_ == true) {
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
if (src_ptr == nullptr) {
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
params_->a_const_ = true;
|
|
|
|
|
|
|
|
|
|
if (params_->a_transpose_) {
|
|
|
|
|
RowMajor2Row12Major(src_ptr, dst_ptr, params_->deep_, params_->row_);
|
|
|
|
|
} else {
|
|
|
|
|
RowMajor2Col12Major(src_ptr, dst_ptr, params_->row_, params_->deep_);
|
|
|
|
|
for (int i = 0; i < params_->batch; i++) {
|
|
|
|
|
float *src = src_ptr + i * params_->deep_ * params_->row_;
|
|
|
|
|
float *dst = dst_ptr + i * params_->deep_ * params_->row_12_;
|
|
|
|
|
if (params_->a_transpose_) {
|
|
|
|
|
RowMajor2Row12Major(src, dst, params_->deep_, params_->row_);
|
|
|
|
|
} else {
|
|
|
|
|
RowMajor2Col12Major(src, dst, params_->row_, params_->deep_);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void MatmulCPUKernel::InitMatrixB(float *src_ptr, float *dst_ptr) {
|
|
|
|
|
if (params_->b_const_ == true) {
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
if (src_ptr == nullptr) {
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
params_->b_const_ = true;
|
|
|
|
|
|
|
|
|
|
if (params_->b_transpose_) {
|
|
|
|
|
RowMajor2Col8Major(src_ptr, dst_ptr, params_->col_, params_->deep_);
|
|
|
|
|
} else {
|
|
|
|
|
RowMajor2Row8Major(src_ptr, dst_ptr, params_->deep_, params_->col_);
|
|
|
|
|
for (int i = 0; i < params_->batch; i++) {
|
|
|
|
|
float *src = src_ptr + i * params_->deep_ * params_->col_;
|
|
|
|
|
float *dst = dst_ptr + i * params_->deep_ * params_->col_8_;
|
|
|
|
|
if (params_->b_transpose_) {
|
|
|
|
|
RowMajor2Col8Major(src, dst, params_->col_, params_->deep_);
|
|
|
|
|
} else {
|
|
|
|
|
RowMajor2Row8Major(src, dst, params_->deep_, params_->col_);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
@ -144,8 +141,8 @@ int MatmulCPUKernel::RunImpl(int task_id) {
|
|
|
|
|
if (cur_oc <= 0) {
|
|
|
|
|
return RET_OK;
|
|
|
|
|
}
|
|
|
|
|
MatMulOpt(a_c12_ptr_, b_r8_ptr_ + task_id * thread_stride_ * C8NUM * params_->deep_,
|
|
|
|
|
c_r_ptr_ + task_id * thread_stride_ * C8NUM, bias_ptr_ + task_id * thread_stride_ * C8NUM, ActType_No,
|
|
|
|
|
MatMulOpt(a_ptr_, b_ptr_ + task_id * thread_stride_ * C8NUM * params_->deep_,
|
|
|
|
|
c_ptr_ + task_id * thread_stride_ * C8NUM, bias_ptr_ + task_id * thread_stride_ * C8NUM, ActType_No,
|
|
|
|
|
params_->deep_, params_->row_, cur_oc, params_->col_, OutType_Nhwc);
|
|
|
|
|
return RET_OK;
|
|
|
|
|
}
|
|
|
|
@ -166,20 +163,21 @@ int MatmulCPUKernel::Run() {
|
|
|
|
|
MS_LOG(ERROR) << "Prepare fail!ret: " << prepare_ret;
|
|
|
|
|
return prepare_ret;
|
|
|
|
|
}
|
|
|
|
|
auto a_ptr = reinterpret_cast<float *>(in_tensors_[0]->Data());
|
|
|
|
|
auto b_ptr = reinterpret_cast<float *>(in_tensors_[1]->Data());
|
|
|
|
|
auto c_ptr = reinterpret_cast<float *>(out_tensors_[0]->Data());
|
|
|
|
|
auto a_stride = params_->row_ * params_->deep_;
|
|
|
|
|
auto b_stride = params_->deep_ * params_->col_;
|
|
|
|
|
auto c_stride = params_->row_ * params_->col_;
|
|
|
|
|
for (int i = 0; i < params_->batch; ++i) {
|
|
|
|
|
auto cur_a_ptr = a_ptr + i * a_stride;
|
|
|
|
|
auto cur_b_ptr = b_ptr + i * b_stride;
|
|
|
|
|
c_r_ptr_ = c_ptr + i * c_stride;
|
|
|
|
|
auto a_src = reinterpret_cast<float *>(in_tensors_[0]->Data());
|
|
|
|
|
auto b_src = reinterpret_cast<float *>(in_tensors_[1]->Data());
|
|
|
|
|
auto c_src = reinterpret_cast<float *>(out_tensors_[0]->Data());
|
|
|
|
|
|
|
|
|
|
InitMatrixA(cur_a_ptr, a_c12_ptr_);
|
|
|
|
|
InitMatrixB(cur_b_ptr, b_r8_ptr_);
|
|
|
|
|
if (params_->a_const_ == false) {
|
|
|
|
|
InitMatrixA(a_src, a_c12_ptr_);
|
|
|
|
|
}
|
|
|
|
|
if (params_->b_const_ == false) {
|
|
|
|
|
InitMatrixB(b_src, b_r8_ptr_);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
for (int i = 0; i < params_->batch; ++i) {
|
|
|
|
|
a_ptr_ = a_c12_ptr_ + i * params_->row_12_ * params_->deep_;
|
|
|
|
|
b_ptr_ = b_r8_ptr_ + i * params_->deep_ * params_->col_8_;
|
|
|
|
|
c_ptr_ = c_src + i * params_->row_ * params_->col_;
|
|
|
|
|
LiteBackendParallelLaunch(MatmulFloatRun, this, thread_count_);
|
|
|
|
|
}
|
|
|
|
|
return RET_OK;
|
|
|
|
|