@@ -48,16 +48,9 @@ int ConvolutionWinogradCPUKernel::InitWeightBias() {
   conv_param_->input_channel_ = in_channel;
   conv_param_->output_channel_ = out_channel;
 
-  int oc4 = UP_DIV(out_channel, C4NUM);
-#ifdef ENABLE_AVX
-  const int oc_block = C16NUM;
-#else
-  const int oc_block = C8NUM;
-#endif
-  int oc_block_num = UP_DIV(out_channel, oc_block);
-
   // set data
-  auto trans_matrix_data_size = input_unit_ * input_unit_ * in_channel * oc_block_num * oc_block * sizeof(float);
+  auto trans_matrix_data_size =
+    input_unit_ * input_unit_ * in_channel * UP_ROUND(out_channel, oc_block_) * sizeof(float);
   if (trans_weight_ == nullptr) {
     trans_weight_ = reinterpret_cast<float *>(malloc(trans_matrix_data_size));
     if (trans_weight_ == nullptr) {
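Context for the size change above: oc_block_num * oc_block is simply out_channel rounded up to the next multiple of the block, so UP_ROUND(out_channel, oc_block_) keeps the packed-weight buffer the same size while dropping the local bookkeeping. A minimal standalone sketch, assuming the usual nnacl-style definitions of the two macros (the concrete values are illustrative):

    #include <cassert>

    // Assumed nnacl-style helpers; the real ones live in nnacl/op_base.h.
    #define UP_DIV(x, y) (((x) + (y) - 1) / (y))
    #define UP_ROUND(x, y) (UP_DIV(x, y) * (y))

    int main() {
      const int out_channel = 21;                              // illustrative channel count
      const int oc_block = 8;                                  // e.g. C8NUM on non-AVX builds
      const int oc_block_num = UP_DIV(out_channel, oc_block);  // 3 blocks of 8
      // Old expression vs. new expression: both give 24 padded output channels.
      assert(oc_block_num * oc_block == UP_ROUND(out_channel, oc_block));
      return 0;
    }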
@@ -83,14 +76,15 @@ int ConvolutionWinogradCPUKernel::InitWeightBias() {
     MS_LOG(ERROR) << "get matrix g from CookToomFilter failed.";
     return ret;
   }
-  ret = WinogradFilterTransform(origin_weight_, matrix_g, matrix_gt, oc_block);
+  ret = WinogradFilterTransform(origin_weight_, matrix_g, matrix_gt, oc_block_);
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "winograd filter transform failed.";
     return ret;
   }
 
   // init bias
-  size_t new_bias_size = oc4 * C4NUM * sizeof(float);
+  size_t new_bias_size = UP_ROUND(out_channel, C4NUM) * sizeof(float);
-  bias_data_ = malloc(new_bias_size);
   if (bias_data_ == nullptr) {
+    bias_data_ = reinterpret_cast<float *>(malloc(new_bias_size));
+    if (bias_data_ == nullptr) {
@@ -98,31 +92,30 @@ int ConvolutionWinogradCPUKernel::InitWeightBias() {
       return RET_MEMORY_FAILED;
+    }
   }
-  memset(bias_data_, 0, new_bias_size);
   if (in_tensors_.size() == kInputSize2) {
-    memcpy(bias_data_, origin_bias_, out_channel * sizeof(float));
+    size_t origin_size = out_channel * sizeof(float);
+    memcpy(bias_data_, origin_bias_, origin_size);
+    memset(reinterpret_cast<float *>(bias_data_) + out_channel, 0, new_bias_size - origin_size);
   } else {
     MS_ASSERT(in_tensors_.size() == kInputSize1);
+    memset(bias_data_, 0, new_bias_size);
   }
   return RET_OK;
 }
 
 int ConvolutionWinogradCPUKernel::InitTmpBuffer() {
-  int channel_out = conv_param_->output_channel_;
-  int oc8 = UP_DIV(channel_out, C8NUM);
-  int tile_num = C12NUM;
   MS_ASSERT(ctx_->allocator != nullptr);
 
   size_t tile_buffer_size =
-    thread_count_ * tile_num * input_unit_ * input_unit_ * conv_param_->input_channel_ * sizeof(float);
+    thread_count_ * tile_num_ * input_unit_ * input_unit_ * conv_param_->input_channel_ * sizeof(float);
   trans_input_ = reinterpret_cast<float *>(ctx_->allocator->Malloc(tile_buffer_size));
   if (trans_input_ == nullptr) {
     MS_LOG(ERROR) << "malloc trans_input_ failed.";
     return RET_MEMORY_FAILED;
   }
 
+  int oc8 = UP_ROUND(conv_param_->output_channel_, C8NUM);
   gemm_out_ = reinterpret_cast<float *>(
-    ctx_->allocator->Malloc(thread_count_ * tile_num * input_unit_ * input_unit_ * oc8 * C8NUM * sizeof(float)));
+    ctx_->allocator->Malloc(thread_count_ * tile_num_ * input_unit_ * input_unit_ * oc8 * sizeof(float)));
   if (gemm_out_ == nullptr) {
     MS_LOG(ERROR) << "malloc gemm_out_ failed.";
     return RET_ERROR;
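The bias handling above pads the buffer to a multiple of C4NUM; when a bias tensor is present only out_channel floats are copied and the padded tail is zeroed explicitly, otherwise the whole buffer is cleared. A rough standalone sketch of that pattern, assuming C4NUM is 4 (the helper name and values are hypothetical, not the kernel's API):

    #include <cstring>

    #define UP_ROUND(x, y) (((x) + (y) - 1) / (y) * (y))

    // Hypothetical helper mirroring the bias setup in InitWeightBias().
    void InitPaddedBias(float *bias_data, const float *origin_bias, int out_channel, bool has_bias) {
      const int c4num = 4;  // assumed value of C4NUM
      size_t new_bias_size = UP_ROUND(out_channel, c4num) * sizeof(float);
      if (has_bias) {
        size_t origin_size = out_channel * sizeof(float);
        memcpy(bias_data, origin_bias, origin_size);                      // real bias values
        memset(bias_data + out_channel, 0, new_bias_size - origin_size);  // zero the padded tail
      } else {
        memset(bias_data, 0, new_bias_size);  // no bias tensor: all zeros
      }
    }

    int main() {
      float origin[5] = {1, 2, 3, 4, 5};
      float padded[8] = {};  // UP_ROUND(5, 4) == 8 slots
      InitPaddedBias(padded, origin, 5, true);
      return 0;
    }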
@@ -136,7 +129,7 @@ int ConvolutionWinogradCPUKernel::InitTmpBuffer() {
   }
 
   col_buffer_ = reinterpret_cast<float *>(
-    ctx_->allocator->Malloc(thread_count_ * tile_num * conv_param_->input_channel_ * sizeof(float)));
+    ctx_->allocator->Malloc(thread_count_ * tile_num_ * conv_param_->input_channel_ * sizeof(float)));
   if (col_buffer_ == nullptr) {
     MS_LOG(ERROR) << "malloc col_buffer_ failed.";
     return RET_ERROR;
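For scale, with tile_num_ set to C12NUM the per-thread transform buffer sized above holds 12 tiles of input_unit_ x input_unit_ x input_channel floats. A quick arithmetic sketch with assumed shapes (4 threads, 6x6 tiles, 32 input channels):

    #include <cstdio>

    int main() {
      const size_t thread_count = 4, tile_num = 12, input_unit = 6, input_channel = 32;
      const size_t tile_buffer_size =
        thread_count * tile_num * input_unit * input_unit * input_channel * sizeof(float);
      printf("trans_input_ needs %zu bytes\n", tile_buffer_size);  // 221184 bytes (216 KiB)
      return 0;
    }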
@@ -164,10 +157,17 @@ int ConvolutionWinogradCPUKernel::ConfigInputOutput() {
 }
 
 int ConvolutionWinogradCPUKernel::Init() {
+  tile_num_ = C12NUM;
+#ifdef ENABLE_AVX
+  oc_block_ = C16NUM;
+#else
+  oc_block_ = C8NUM;
+#endif
   kernel_unit_ = conv_param_->kernel_h_;
   input_unit_ = output_unit_ + kernel_unit_ - 1;
   conv_param_->input_unit_ = input_unit_;
   conv_param_->output_unit_ = output_unit_;
 
   auto ret = InitWeightBias();
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "Init weight bias failed.";
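The input_unit_ = output_unit_ + kernel_unit_ - 1 line in Init() is the standard Winograd F(m, r) relation: an m x m output tile computed with an r x r kernel needs an (m + r - 1) x (m + r - 1) transformed input tile. A tiny illustration with assumed values (output_unit_ of 4 and a 3x3 kernel, i.e. F(4, 3)):

    #include <cstdio>

    int main() {
      const int output_unit = 4;  // m, the output tile size
      const int kernel_unit = 3;  // r, taken from conv_param_->kernel_h_
      const int input_unit = output_unit + kernel_unit - 1;  // 6: each output tile reads a 6x6 input tile
      printf("F(%d,%d) uses a %dx%d input tile\n", output_unit, kernel_unit, input_unit, input_unit);
      return 0;
    }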
@@ -197,8 +197,8 @@ int ConvolutionWinogradCPUKernel::ReSize() {
 
 int ConvolutionWinogradCPUKernel::RunImpl(int task_id) {
   auto input_tensor = in_tensors_.at(kInputIndex);
-  auto ori_input_data = reinterpret_cast<float *>(input_tensor->MutableData());
-  auto output_data = reinterpret_cast<float *>(out_tensors_.front()->MutableData());
+  auto ori_input_data = reinterpret_cast<float *>(input_tensor->data_c());
+  auto output_data = reinterpret_cast<float *>(out_tensors_.front()->data_c());
   ConvWinogardFp32(ori_input_data, trans_weight_, reinterpret_cast<const float *>(bias_data_), output_data,
                    tmp_buffer_address_list_, task_id, conv_param_, in_func_, out_func_);
   return RET_OK;