diff --git a/mindspore/lite/src/ops/pooling.cc b/mindspore/lite/src/ops/pooling.cc
index 20745e7cd0..9fa8c16215 100644
--- a/mindspore/lite/src/ops/pooling.cc
+++ b/mindspore/lite/src/ops/pooling.cc
@@ -56,13 +56,11 @@ int Pooling::InferShape(std::vector<tensor::Tensor *> inputs_, std::vector<tensor::Tensor *> outputs_) {
   auto round_mode = (schema::RoundMode)pooling_prim->roundMode();
   if (round_mode == schema::RoundMode_FLOOR) {
-    output_h = std::floor((input_h + pad_u_ + pad_d_ - window_h) / pooling_prim->strideH() + 1);
-    output_w = std::floor((input_w + pad_l_ + pad_r_ - window_w) / pooling_prim->strideW() + 1);
+    output_h = std::floor(static_cast<float>(input_h + pad_u_ + pad_d_ - window_h) / pooling_prim->strideH()) + 1;
+    output_w = std::floor(static_cast<float>(input_w + pad_l_ + pad_r_ - window_w) / pooling_prim->strideW()) + 1;
   } else if (round_mode == schema::RoundMode_CEIL) {
-    output_h =
-      std::ceil((input_h + pooling_prim->padUp() + pooling_prim->padDown() - window_h) / pooling_prim->strideH() + 1);
-    output_w = std::ceil(
-      (input_w + pooling_prim->padLeft() + pooling_prim->padRight() - window_w) / pooling_prim->strideW() + 1);
+    output_h = std::ceil(static_cast<float>(input_h + pad_u_ + pad_d_ - window_h) / pooling_prim->strideH()) + 1;
+    output_w = std::ceil(static_cast<float>(input_w + pad_l_ + pad_r_ - window_w) / pooling_prim->strideW()) + 1;
   } else {
     MS_LOG(ERROR) << "unsupported round mode.";
   }
@@ -80,4 +78,3 @@ int Pooling::InferShape(std::vector<tensor::Tensor *> inputs_, std::vector<tensor::Tensor *> outputs_) {
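Note on the rounding fix above: with integer operands, the old expression divides first with C++ truncating integer division, so std::ceil acts on an already-rounded value, and the "+ 1" is rounded along with the quotient. A minimal standalone sketch of the difference (made-up sizes, not the kernel code):

    #include <cmath>
    #include <cstdio>

    int main() {
      int input_h = 8, pad = 0, window_h = 3, stride_h = 2;
      // Old CEIL branch: (8 + 0 - 3) / 2 truncates to 2, then ceil(2 + 1) = 3.
      int old_h = std::ceil((input_h + 2 * pad - window_h) / stride_h + 1);
      // Fixed: promote to float, round only the quotient, then add 1: ceil(2.5) + 1 = 4.
      int new_h = std::ceil(static_cast<float>(input_h + 2 * pad - window_h) / stride_h) + 1;
      printf("old output_h = %d, fixed output_h = %d\n", old_h, new_h);
      return 0;
    }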
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/batchnorm.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/batchnorm.cc
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/batchnorm.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/batchnorm.cc
@@ ... @@ int BatchnormCPUKernel::Init() {
+  auto input_shapes = inputs_[0]->shape();
+  auto n_dim = input_shapes.size();
+  batchnorm_param_->channel_ = input_shapes[n_dim - 1];
+  batchnorm_param_->unit_ = 1;
+  for (int i = 0; i < n_dim - 1; i++) {
+    batchnorm_param_->unit_ *= input_shapes[i];
+  }
+  batchnorm_param_->op_parameter_.thread_num_ =
+    MSMIN(batchnorm_param_->op_parameter_.thread_num_, batchnorm_param_->unit_);
+  return RET_OK;
+}
 
 int BatchnormCPUKernel::ReSize() { return RET_OK; }
 
-int BatchnormCPUKernel::DoExecute(int tid) {
-  int count = MSMIN(thread_unit_, units_ - tid * thread_unit_);
-  if (count <= 0) {
-    return RET_OK;
-  }
-  int offset = tid * thread_unit_ * channel_;
-  BatchNorm(in_addr_ + offset, mean_addr_, var_addr_, count, channel_, batchnorm_param_->epsilon_, out_addr_ + offset);
+int BatchnormCPUKernel::DoExecute(int task_id) {
+  BatchNorm(out_addr_, in_addr_, mean_addr_, var_addr_, task_id, batchnorm_param_);
   return RET_OK;
 }
 
@@ -62,15 +68,8 @@ int BatchnormCPUKernel::Run() {
   mean_addr_ = reinterpret_cast<float *>(inputs_.at(1)->Data());
   var_addr_ = reinterpret_cast<float *>(inputs_.at(2)->Data());
   out_addr_ = reinterpret_cast<float *>(outputs_.at(0)->Data());
-  auto input_shapes = inputs_[0]->shape();
-  channel_ = input_shapes[3];
-  units_ = 1;
-  for (int i = 0; i < 3; i++) {
-    units_ *= input_shapes[i];
-  }
-  thread_count_ = MSMIN(thread_count_, units_);
-  thread_unit_ = UP_DIV(units_, thread_count_);
-  int ret = LiteBackendParallelLaunch(BatchNormRun, this, thread_count_);
+
+  int ret = LiteBackendParallelLaunch(BatchNormRun, this, batchnorm_param_->op_parameter_.thread_num_);
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "BatchnormRun error error_code[" << ret << "]";
     return ret;
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/batchnorm.h b/mindspore/lite/src/runtime/kernel/arm/fp32/batchnorm.h
index c3532b19ae..4ad0224511 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/batchnorm.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/batchnorm.h
@@ -30,10 +30,11 @@ class BatchnormCPUKernel : public LiteKernel {
   BatchnormCPUKernel(OpParameter *parameter, const std::vector<lite::tensor::Tensor *> &inputs,
                      const std::vector<lite::tensor::Tensor *> &outputs, const Context *ctx,
                      const lite::Primitive *primitive)
-      : LiteKernel(parameter, inputs, outputs, ctx, primitive), ctx_(ctx), thread_count_(ctx->thread_num_) {
+      : LiteKernel(parameter, inputs, outputs, ctx, primitive) {
+    opParameter->thread_num_ = ctx->thread_num_;
     batchnorm_param_ = reinterpret_cast<BatchNormParameter *>(parameter);
   }
-  ~BatchnormCPUKernel() override { delete batchnorm_param_; }
+  ~BatchnormCPUKernel() override = default;
 
   int Init() override;
   int ReSize() override;
@@ -41,15 +42,10 @@ class BatchnormCPUKernel : public LiteKernel {
   int DoExecute(int tid);
 
  private:
-  int thread_count_;
-  int thread_unit_;
-  int units_;
-  int channel_;
   float *in_addr_;
   float *mean_addr_;
   float *var_addr_;
   float *out_addr_;
-  const Context *ctx_;
   BatchNormParameter *batchnorm_param_;
 };
 }  // namespace mindspore::kernel
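The per-task work split moves out of DoExecute and into the nnacl BatchNorm kernel (see the nnacl/fp32/batchnorm.cc hunk further down): instead of giving each thread one contiguous block sized with MSMIN/UP_DIV, every task now strides over the units by thread_num_. A sketch of why that needs no residual bounds check, with a hypothetical serial driver standing in for LiteBackendParallelLaunch:

    #include <cstdio>

    constexpr int kUnit = 10;      // N * H * W, as computed in Init()
    constexpr int kThreadNum = 4;  // op_parameter_.thread_num_

    // Task t handles units t, t + kThreadNum, t + 2 * kThreadNum, ...
    // Each unit is visited exactly once, and a surplus task simply performs
    // zero iterations, so no MSMIN/UP_DIV residual check is needed.
    void DoExecute(int task_id) {
      for (int u = task_id; u < kUnit; u += kThreadNum) {
        printf("task %d -> unit %d\n", task_id, u);
      }
    }

    int main() {
      for (int task_id = 0; task_id < kThreadNum; task_id++) {
        DoExecute(task_id);  // the real thread pool runs these concurrently
      }
      return 0;
    }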
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/nchw2nhwc.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/nchw2nhwc.cc
index 571f06ef22..74a77bf579 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/nchw2nhwc.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/nchw2nhwc.cc
@@ -36,8 +36,12 @@ int Nchw2NhwcCPUKernel::Run() {
   auto input = inputs_[0];
   auto output = outputs_[0];
 
-  PackNCHWToNHWCFp32(input->Data(), output->Data(), output->Batch(), output->Height() * output->Width(),
-                     output->Channel());
+  if (input->shape().size() == 4) {
+    PackNCHWToNHWCFp32(input->Data(), output->Data(), output->Batch(), output->Height() * output->Width(),
+                       output->Channel());
+  } else {
+    memcpy(output->Data(), input->Data(), input->ElementsNum() * sizeof(float));
+  }
   return RET_OK;
 }
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/nhwc2nchw.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/nhwc2nchw.cc
index f511940f9a..634961aa71 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/nhwc2nchw.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/nhwc2nchw.cc
@@ -36,8 +36,12 @@ int Nhwc2NchwCPUKernel::Run() {
   auto input = inputs_[0];
   auto output = outputs_[0];
 
-  PackNHWCToNCHWFp32(input->Data(), output->Data(), output->Batch(), output->Height() * output->Width(),
-                     output->Channel());
+  if (input->shape().size() == 4) {
+    PackNHWCToNCHWFp32(input->Data(), output->Data(), output->Batch(), output->Height() * output->Width(),
+                       output->Channel());
+  } else {
+    memcpy(output->Data(), input->Data(), input->ElementsNum() * sizeof(float));
+  }
   return RET_OK;
 }
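Both layout kernels above now guard on tensor rank: only genuine 4-D tensors are repacked, and anything else is byte-copied, since without distinct N/C/H/W axes there is nothing to transpose. An illustrative self-contained equivalent (assuming dense row-major float buffers; not the actual nnacl Pack implementation):

    #include <cstring>
    #include <vector>

    void Nchw2NhwcOrCopy(const float *in, float *out, const std::vector<int> &shape) {
      if (shape.size() == 4) {
        int batch = shape[0], channel = shape[1], plane = shape[2] * shape[3];
        for (int n = 0; n < batch; n++) {
          for (int c = 0; c < channel; c++) {
            for (int hw = 0; hw < plane; hw++) {
              // NCHW index ((n * C + c) * HW + hw) -> NHWC index ((n * HW + hw) * C + c)
              out[(n * plane + hw) * channel + c] = in[(n * channel + c) * plane + hw];
            }
          }
        }
      } else {
        size_t total = 1;
        for (int d : shape) total *= d;
        memcpy(out, in, total * sizeof(float));
      }
    }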
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/scale.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/scale.cc
index 433310de10..f89da381a7 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/scale.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/scale.cc
@@ -45,12 +45,13 @@ int ScaleCPUKernel::InitScaleOffset() {
   }
 
   if (inputs_.size() == 3) {
-    auto offset_tensor = inputs_.at(1);
+    auto offset_tensor = inputs_.at(2);
     offset_ = reinterpret_cast<float *>(malloc(offset_tensor->ElementsNum() * sizeof(float)));
     if (offset_ == nullptr) {
      MS_LOG(ERROR) << "Malloc buffer failed.";
       return RET_ERROR;
     }
+    memcpy(offset_, offset_tensor->Data(), offset_tensor->ElementsNum() * sizeof(float));
     param->has_offset_ = true;
   } else {
     offset_ = nullptr;
diff --git a/mindspore/lite/src/runtime/kernel/arm/nnacl/fp32/batchnorm.cc b/mindspore/lite/src/runtime/kernel/arm/nnacl/fp32/batchnorm.cc
index e61d5ea596..2a0c1433db 100644
--- a/mindspore/lite/src/runtime/kernel/arm/nnacl/fp32/batchnorm.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/fp32/batchnorm.cc
@@ -16,12 +16,12 @@
 
 #include "src/runtime/kernel/arm/nnacl/fp32/batchnorm.h"
 
-void BatchNorm(const float *input_ptr, const float *mean_ptr, const float *variance_ptr, int units, int channel,
-               float epsilon, float *output_ptr) {
-  for (int u = 0; u < units; u++) {
-    for (int c = 0; c < channel; c++) {
-      auto variance_sqrt = sqrt(variance_ptr[c] + epsilon);
-      output_ptr[u * channel + c] = (input_ptr[u * channel + c] - mean_ptr[c]) / variance_sqrt;
+void BatchNorm(float *output_ptr, const float *input_ptr, const float *mean_ptr, const float *variance_ptr, int task_id,
+               BatchNormParameter *param) {
+  for (int u = task_id; u < param->unit_; u += param->op_parameter_.thread_num_) {
+    for (int c = 0; c < param->channel_; c++) {
+      auto variance_sqrt = sqrt(variance_ptr[c] + param->epsilon_);
+      output_ptr[u * param->channel_ + c] = (input_ptr[u * param->channel_ + c] - mean_ptr[c]) / variance_sqrt;
     }
   }
 }
diff --git a/mindspore/lite/src/runtime/kernel/arm/nnacl/fp32/batchnorm.h b/mindspore/lite/src/runtime/kernel/arm/nnacl/fp32/batchnorm.h
index 135f7a73e0..fb6f025e08 100644
--- a/mindspore/lite/src/runtime/kernel/arm/nnacl/fp32/batchnorm.h
+++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/fp32/batchnorm.h
@@ -22,9 +22,11 @@
 struct BatchNormParameter {
   OpParameter op_parameter_;
   float epsilon_;
+  int unit_;
+  int channel_;
 };
 
-void BatchNorm(const float *input_ptr, const float *mean_ptr, const float *variance_ptr, int count, int channel,
-               float epsilon, float *output_ptr);
+void BatchNorm(float *output_ptr, const float *input_ptr, const float *mean_ptr, const float *variance_ptr, int task_id,
+               BatchNormParameter *param);
 
 #endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_NNACL_FUSED_BATCHNORM_H_
diff --git a/mindspore/lite/src/runtime/thread_pool.cc b/mindspore/lite/src/runtime/thread_pool.cc
index 933a49664b..adcd8b9026 100644
--- a/mindspore/lite/src/runtime/thread_pool.cc
+++ b/mindspore/lite/src/runtime/thread_pool.cc
@@ -245,8 +245,6 @@ bool ThreadPool::SetThreadPool() {
   } else {
     AddRunThread(localMaxThreadNums);
   }
-  MS_LOG(DEBUG) << "configThreadNums=" << configThreadNums << ", curThreadNums=" << curThreadNums
-                << ", curThreadRunNums=" << curThreadRunNums << ", localMaxThreadNums=" << localMaxThreadNums;
   return true;
 }
 
@@ -276,7 +274,6 @@ void ThreadPool::AddNewThread(int newNums) {
   }
   curThreadNums += newNums;
   curThreadRunNums += newNums;
-  MS_LOG(DEBUG) << "add " << newNums << " thread";
 }
 
 bool ThreadPool::SetThreadCpuBind(bool ifBind, int mode, bool master) {
@@ -330,7 +327,6 @@ bool ThreadPool::AddTask(WorkFun &&worker, void *cdata, int numTask) {
 }
 
 bool ThreadPool::DistributeTask(ThreadPoolTask *task, int numTask) {
-  MS_LOG(DEBUG) << "numTask = " << numTask << ", curThreadRunNums = " << curThreadRunNums;
   auto taskOri = *task;
   if (numTask > curThreadRunNums) {
     task->first = [taskOri, numTask, this](int task_id, TvmEnv *penv, void *cdata) -> int {
@@ -370,12 +366,10 @@ bool ThreadPool::DistributeTask(ThreadPoolTask *task, int numTask) {
       }
     }
   }
-  MS_LOG(DEBUG) << "finish " << numTask << " task successful";
   return CheckResult();
 }
 
 void ThreadPool::AddRunThread(int num) {
-  MS_LOG(DEBUG) << "num=" << num << ", curThreadRunNums=" << curThreadRunNums;
   int activeNums = num - curThreadRunNums;
   if (activeNums <= 0 || activateList.size() < activeNums) {
     return;
@@ -389,7 +383,6 @@ void ThreadPool::AddRunThread(int num) {
 }
 
 void ThreadPool::SubRunThread(int num) {
-  MS_LOG(DEBUG) << "num=" << num << ", curThreadRunNums=" << curThreadRunNums;
   int deactiveNums = curThreadRunNums - num;
   if (deactiveNums <= 0) {
     return;
diff --git a/mindspore/lite/tools/converter/parser/caffe/caffe_pooling_parser.cc b/mindspore/lite/tools/converter/parser/caffe/caffe_pooling_parser.cc
index c80dd6b21d..0dbe2927a9 100644
--- a/mindspore/lite/tools/converter/parser/caffe/caffe_pooling_parser.cc
+++ b/mindspore/lite/tools/converter/parser/caffe/caffe_pooling_parser.cc
@@ -56,6 +56,8 @@ STATUS CaffePoolingParser::Parse(const caffe::LayerParameter &proto,
     return RET_ERROR;
   }
 
+  // default roundMode RoundMode_CEIL
+  attr->roundMode = schema::RoundMode_CEIL;
   if (poolingParam.has_round_mode()) {
     if (poolingParam.round_mode() == caffe::PoolingParameter_RoundMode_FLOOR) {
       attr->roundMode = schema::RoundMode_FLOOR;
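The CEIL default matters because Caffe's pooling has historically rounded output sizes up, so a prototxt with no explicit round_mode must still get CEIL; otherwise every downstream tensor in the converted model is mis-sized. A quick check with typical (made-up) dimensions:

    #include <cmath>
    #include <cstdio>

    int main() {
      int input_w = 224, window_w = 3, stride_w = 2;  // no padding
      int ceil_w = std::ceil(static_cast<float>(input_w - window_w) / stride_w) + 1;    // 112
      int floor_w = std::floor(static_cast<float>(input_w - window_w) / stride_w) + 1;  // 111
      printf("CEIL -> %d, FLOOR -> %d\n", ceil_w, floor_w);
      return 0;
    }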