From 3066d4a17cc73490a9498f1b34b29ab755fed884 Mon Sep 17 00:00:00 2001 From: hangangqiang Date: Fri, 26 Mar 2021 10:47:31 +0800 Subject: [PATCH] reduce runtime ram while fp16 is enabled --- mindspore/lite/nnacl/fp16/pack_fp16.c | 34 +++ mindspore/lite/nnacl/fp16/pack_fp16.h | 4 + .../lite/nnacl/infer/quant_dtype_cast_infer.c | 3 - .../lite/nnacl/infer/quant_dtype_cast_infer.h | 2 +- mindspore/lite/src/inner_context.cc | 50 +++-- mindspore/lite/src/inner_context.h | 7 + .../kernel/arm/base/quant_dtype_cast.cc | 50 +---- .../arm/fp16/convolution_depthwise_fp16.cc | 4 +- .../convolution_depthwise_slidewindow_fp16.cc | 4 +- .../arm/fp16/deconvolution_depthwise_fp16.cc | 8 +- .../kernel/arm/fp16/deconvolution_fp16.cc | 20 +- .../arm/fp16/deconvolution_winograd_fp16.cc | 2 +- mindspore/lite/src/scheduler.cc | 200 +++++++++++------- mindspore/lite/src/scheduler.h | 4 + mindspore/lite/tools/benchmark/benchmark.cc | 126 ++++++++--- mindspore/lite/tools/benchmark/benchmark.h | 15 +- .../tools/optimizer/graph/infershape_pass.cc | 2 +- 17 files changed, 350 insertions(+), 185 deletions(-) diff --git a/mindspore/lite/nnacl/fp16/pack_fp16.c b/mindspore/lite/nnacl/fp16/pack_fp16.c index 67fe8d6455..aecc351ec0 100644 --- a/mindspore/lite/nnacl/fp16/pack_fp16.c +++ b/mindspore/lite/nnacl/fp16/pack_fp16.c @@ -474,6 +474,25 @@ void PackNCHWFp32ToNC8HW8Fp16(float *src, float16_t *dst, int batch, int plane, } } +void PackNCHWFp16ToNC8HW8Fp16(float16_t *src, float16_t *dst, int batch, int plane, int channel) { + int c8 = UP_DIV(channel, C8NUM); + for (int b = 0; b < batch; b++) { + int src_offset = b * plane * channel; + int dst_offset = b * plane * c8 * C8NUM; + for (int c = 0; c < channel; c++) { + int c8_block_num = c / C8NUM; + int c8_block_rem = c % C8NUM; + int src_c_offset = src_offset + c * plane; + int dst_c_offset = dst_offset + c8_block_num * plane * C8NUM; + for (int k = 0; k < plane; k++) { + int src_kernel_offset = src_c_offset + k; + int dst_kernel_offset = dst_c_offset + C8NUM * k + c8_block_rem; + (dst + dst_kernel_offset)[0] = (float16_t)(src + src_kernel_offset)[0]; + } + } + } +} + void PackNHWCFp32ToNHWC8Fp16(float *src, float16_t *dst, int batch, int plane, int channel) { int c8_channel = UP_DIV(channel, C8NUM) * C8NUM; for (int b = 0; b < batch; b++) { @@ -504,6 +523,21 @@ void PackNHWCFp32ToC8HWN8Fp16(float *src, float16_t *dst, int batch, int plane, return; } +void PackNHWCFp16ToC8HWN8Fp16(float16_t *src, float16_t *dst, int batch, int plane, int channel) { + for (int n = 0; n < batch; n++) { + for (int hw = 0; hw < plane; hw++) { + for (int c = 0; c < channel; c++) { + int c8div = c / C8NUM; + int c8mod = c % C8NUM; + int src_index = n * plane * channel + hw * channel + c; + int dst_index = c8div * batch * plane * C8NUM + hw * batch * C8NUM + n * C8NUM + c8mod; + dst[dst_index] = src[src_index]; + } + } + } + return; +} + void PackNHWC8Fp16ToNHWCFp32(float16_t *src, float *dst, int batch, int plane, int channel) { int c8_channel = UP_DIV(channel, C8NUM) * C8NUM; for (int b = 0; b < batch; b++) { diff --git a/mindspore/lite/nnacl/fp16/pack_fp16.h b/mindspore/lite/nnacl/fp16/pack_fp16.h index fc82ff66a3..b49a35b479 100644 --- a/mindspore/lite/nnacl/fp16/pack_fp16.h +++ b/mindspore/lite/nnacl/fp16/pack_fp16.h @@ -61,10 +61,14 @@ void PackNC8HW8ToNHWCFp16(const void *src, void *dst, int batch, int plane, int void PackNCHWFp32ToNC8HW8Fp16(float *src, float16_t *dst, int batch, int plane, int channel); +void PackNCHWFp16ToNC8HW8Fp16(float16_t *src, float16_t *dst, int batch, int plane, int 
channel); + void PackNHWCFp32ToNHWC8Fp16(float *src, float16_t *dst, int batch, int plane, int channel); void PackNHWCFp32ToC8HWN8Fp16(float *src, float16_t *dst, int batch, int plane, int channel); +void PackNHWCFp16ToC8HWN8Fp16(float16_t *src, float16_t *dst, int batch, int plane, int channel); + void PackNHWC8Fp16ToNHWCFp32(float16_t *src, float *dst, int batch, int plane, int channel); void PackNHWC8ToNHWCFp16(float16_t *src, float16_t *dst, int batch, int plane, int channel); diff --git a/mindspore/lite/nnacl/infer/quant_dtype_cast_infer.c b/mindspore/lite/nnacl/infer/quant_dtype_cast_infer.c index d0caa00192..5fdc564972 100644 --- a/mindspore/lite/nnacl/infer/quant_dtype_cast_infer.c +++ b/mindspore/lite/nnacl/infer/quant_dtype_cast_infer.c @@ -30,9 +30,6 @@ int QuantDtypeCastInferShape(const TensorC *const *inputs, size_t inputs_size, T TensorC *output = outputs[0]; QuantDtypeCastParameter *param = (QuantDtypeCastParameter *)parameter; - if (input->data_type_ != param->srcT_) { - return NNACL_ERR; - } output->data_type_ = param->dstT_; output->format_ = input->format_; if (!parameter->infer_flag_) { diff --git a/mindspore/lite/nnacl/infer/quant_dtype_cast_infer.h b/mindspore/lite/nnacl/infer/quant_dtype_cast_infer.h index b1fb1ca101..8357fec315 100644 --- a/mindspore/lite/nnacl/infer/quant_dtype_cast_infer.h +++ b/mindspore/lite/nnacl/infer/quant_dtype_cast_infer.h @@ -24,7 +24,7 @@ extern "C" { typedef struct QuantDtypeCastParameter { OpParameter op_parameter_; - int srcT_; + int srcT_; // deprecated int dstT_; } QuantDtypeCastParameter; diff --git a/mindspore/lite/src/inner_context.cc b/mindspore/lite/src/inner_context.cc index 33f01758a4..29dfeab99f 100644 --- a/mindspore/lite/src/inner_context.cc +++ b/mindspore/lite/src/inner_context.cc @@ -17,6 +17,7 @@ #include "src/inner_context.h" #include "include/errorcode.h" #include "src/common/log_adapter.h" +#include "src/common/utils.h" #ifdef SUPPORT_NPU #include "src/runtime/agent/npu/npu_manager.h" #endif @@ -85,18 +86,18 @@ int InnerContext::IsValid() const { MS_LOG(ERROR) << "Device list is empty."; return RET_NOT_SUPPORT; } - if (!IsCpuEnabled()) { - MS_LOG(ERROR) << "CPU is not supported."; + if (!IsUserSetCpu()) { + MS_LOG(ERROR) << "CPU context should be set."; return RET_NOT_SUPPORT; } #ifndef SUPPORT_GPU - if (IsGpuEnabled()) { + if (IsUserSetGpu()) { MS_LOG(ERROR) << "GPU is not supported."; return RET_NOT_SUPPORT; } #endif #ifndef SUPPORT_NPU - if (IsNpuEnabled()) { + if (IsUserSetNpu()) { MS_LOG(ERROR) << "NPU is not supported."; return RET_NOT_SUPPORT; } @@ -108,6 +109,9 @@ bool InnerContext::IsCpuFloat16Enabled() const { if (!IsCpuEnabled()) { return false; } + if (!IsSupportFloat16()) { + return false; + } return GetCpuInfo().enable_float16_; } @@ -115,31 +119,47 @@ bool InnerContext::IsGpuFloat16Enabled() const { if (!IsGpuEnabled()) { return false; } + if (!IsSupportFloat16()) { + return false; + } return GetGpuInfo().enable_float16_; } -bool InnerContext::IsCpuEnabled() const { +bool InnerContext::IsCpuEnabled() const { return IsUserSetCpu(); } + +bool InnerContext::IsGpuEnabled() const { +#ifdef SUPPORT_GPU + return IsUserSetGpu(); +#else + return false; +#endif +} + +bool InnerContext::IsNpuEnabled() const { +#ifdef SUPPORT_NPU + MS_ASSERT(npu_manager_ != nullptr); + return IsUserSetNpu() && npu_manager_->IsSupportNPU(); +#else + return false; +#endif +} + +bool InnerContext::IsUserSetCpu() const { return this->device_list_.end() != std::find_if(this->device_list_.begin(), this->device_list_.end(), [](const 
DeviceContext &device) { return device.device_type_ == DT_CPU; }); } -bool InnerContext::IsGpuEnabled() const { +bool InnerContext::IsUserSetGpu() const { return this->device_list_.end() != std::find_if(this->device_list_.begin(), this->device_list_.end(), [](const DeviceContext &device) { return device.device_type_ == DT_GPU; }); } -bool InnerContext::IsNpuEnabled() const { -#ifdef SUPPORT_NPU - MS_ASSERT(npu_manager_ != nullptr); +bool InnerContext::IsUserSetNpu() const { return this->device_list_.end() != - std::find_if(this->device_list_.begin(), this->device_list_.end(), - [](const DeviceContext &device) { return device.device_type_ == DT_NPU; }) && - npu_manager_->IsSupportNPU(); -#else - return false; -#endif + std::find_if(this->device_list_.begin(), this->device_list_.end(), + [](const DeviceContext &device) { return device.device_type_ == DT_NPU; }); } CpuDeviceInfo InnerContext::GetCpuInfo() const { diff --git a/mindspore/lite/src/inner_context.h b/mindspore/lite/src/inner_context.h index 3a5f18182d..41e4cad9d9 100644 --- a/mindspore/lite/src/inner_context.h +++ b/mindspore/lite/src/inner_context.h @@ -58,6 +58,13 @@ struct InnerContext : public Context { virtual ~InnerContext(); + private: + bool IsUserSetCpu() const; + + bool IsUserSetGpu() const; + + bool IsUserSetNpu() const; + #if SUPPORT_NPU private: diff --git a/mindspore/lite/src/runtime/kernel/arm/base/quant_dtype_cast.cc b/mindspore/lite/src/runtime/kernel/arm/base/quant_dtype_cast.cc index 411ecd8619..0f295434f3 100644 --- a/mindspore/lite/src/runtime/kernel/arm/base/quant_dtype_cast.cc +++ b/mindspore/lite/src/runtime/kernel/arm/base/quant_dtype_cast.cc @@ -44,48 +44,12 @@ int QuantDTypeCastCPUKernel::Init() { MS_ASSERT(out_tensor); auto param = reinterpret_cast<QuantDTypeCastParameter *>(op_parameter_); MS_ASSERT(param); - if (param->srcT == kNumberTypeFloat32 && param->dstT == kNumberTypeInt8) { - if (in_tensor->data_type() != kNumberTypeFloat32 || out_tensor->data_type() != kNumberTypeInt8) { - MS_LOG(ERROR) << "param data type and tensor data type do not match."; - return RET_ERROR; - } - } else if (param->srcT == kNumberTypeInt8 && param->dstT == kNumberTypeFloat32) { - if (in_tensor->data_type() != kNumberTypeInt8 || out_tensor->data_type() != kNumberTypeFloat32) { - MS_LOG(ERROR) << "param data type and tensor data type do not match."; - return RET_ERROR; - } - } else if (param->srcT == kNumberTypeUInt8 && param->dstT == kNumberTypeInt8) { - if (in_tensor->data_type() != kNumberTypeUInt8 || out_tensor->data_type() != kNumberTypeInt8) { - MS_LOG(ERROR) << "param data type and tensor data type do not match."; - return RET_ERROR; - } - } else if (param->srcT == kNumberTypeInt8 && param->dstT == kNumberTypeInt8) { - if (in_tensor->data_type() != kNumberTypeInt8 || out_tensor->data_type() != kNumberTypeInt8) { - MS_LOG(ERROR) << "param data type and tensor data type do not match."; - return RET_ERROR; - } - } else if (param->srcT == kNumberTypeInt8 && param->dstT == kNumberTypeUInt8) { - if (in_tensor->data_type() != kNumberTypeInt8 || out_tensor->data_type() != kNumberTypeUInt8) { - MS_LOG(ERROR) << "param data type and tensor data type do not match."; - return RET_ERROR; - } - } else if (param->srcT == kNumberTypeUInt8 && param->dstT == kNumberTypeFloat32) { - if (in_tensor->data_type() != kNumberTypeUInt8 || out_tensor->data_type() != kNumberTypeFloat32) { - MS_LOG(ERROR) << "param data type and tensor data type do not match."; - return RET_ERROR; - } - } else if (param->srcT == kNumberTypeFloat32 && param->dstT == kNumberTypeUInt8) { -
if (in_tensor->data_type() != kNumberTypeFloat32 || out_tensor->data_type() != kNumberTypeUInt8) { - MS_LOG(ERROR) << "param data type and tensor data type do not match."; - return RET_ERROR; - } - } else { - MS_LOG(ERROR) << "param data type not supported:" - << " src: " << param->srcT << " dst: " << param->dstT; - return RET_PARAM_INVALID; - } - src_dtype = param->srcT; + src_dtype = in_tensor->data_type(); dst_dtype = param->dstT; + if (out_tensor->data_type() != dst_dtype) { + MS_LOG(ERROR) << "param data type and tensor data type do not match."; + return RET_ERROR; + } if (!InferShapeDone()) { return RET_OK; @@ -149,6 +113,10 @@ int QuantDTypeCastCPUKernel::QuantDTypeCast(int task_id) { ret = DoQuantizeFp32ToInt8(float32_ptr_ + thread_offset, int8_out_ptr_ + thread_offset, output_quant_arg.scale, output_quant_arg.zeroPoint, num_unit_thread, from_uint8_src); } + } else { + MS_LOG(ERROR) << "param data type not supported:" + << " src: " << src_dtype << " dst: " << dst_dtype; + return RET_PARAM_INVALID; } if (ret != RET_OK) { diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.cc index 3121e93809..521f5b1501 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.cc @@ -47,7 +47,7 @@ int ConvolutionDepthwiseFp16CPUKernel::InitWeightBias() { MS_LOG(ERROR) << "get execute filter data failed."; return ret; } - PackNCHWToNHWCFp16(fp16_weight_, packed_weight_, 1, weight_tensor->Height() * weight_tensor->Width(), + PackNCHWToNHWCFp16(execute_weight_, packed_weight_, 1, weight_tensor->Height() * weight_tensor->Width(), weight_tensor->Batch()); if (fp16_weight_ != nullptr) { free(fp16_weight_); @@ -64,7 +64,7 @@ int ConvolutionDepthwiseFp16CPUKernel::InitWeightBias() { if (in_tensors_.size() == kInputSize2) { auto bias_tensor = in_tensors_.at(kBiasIndex); MS_ASSERT(origin_bias_); - auto ori_bias = reinterpret_cast<float *>(origin_bias_); + auto ori_bias = reinterpret_cast<float16_t *>(origin_bias_); for (int i = 0; i < bias_tensor->ElementsNum(); i++) { bias_fp16[i] = (float16_t)ori_bias[i]; } diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_slidewindow_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_slidewindow_fp16.cc index 9bc4503cee..02cdf2721c 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_slidewindow_fp16.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_slidewindow_fp16.cc @@ -68,7 +68,7 @@ int ConvolutionDepthwiseSWFp16CPUKernel::InitWeightBias() { MS_LOG(ERROR) << "Malloc buffer failed."; return RET_ERROR; } - PackNCHWFp32ToNC8HW8Fp16(reinterpret_cast<float *>(origin_weight_), packed_weight_, 1, + PackNCHWFp16ToNC8HW8Fp16(reinterpret_cast<float16_t *>(origin_weight_), packed_weight_, 1, weight_tensor->Height() * weight_tensor->Width(), weight_tensor->Batch()); bias_data_ = reinterpret_cast<float16_t *>(malloc(C8NUM * OC8 * sizeof(float16_t))); @@ -81,7 +81,7 @@ int ConvolutionDepthwiseSWFp16CPUKernel::InitWeightBias() { if (in_tensors_.size() == kInputSize2) { auto bias_tensor = in_tensors_.at(kBiasIndex); MS_ASSERT(origin_bias_); - auto ori_bias = reinterpret_cast<float *>(origin_bias_); + auto ori_bias = reinterpret_cast<float16_t *>(origin_bias_); for (int i = 0; i < bias_tensor->ElementsNum(); i++) { bias_fp16[i] = (float16_t)ori_bias[i]; } diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_depthwise_fp16.cc 
b/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_depthwise_fp16.cc index d35f867b8e..3c1200fb97 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_depthwise_fp16.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_depthwise_fp16.cc @@ -73,7 +73,7 @@ int DeconvolutionDepthwiseFp16CPUKernel::InitWeightBias() { // init weight: o, h, w, i; o == group, i == 1 auto weight_tensor = in_tensors_.at(kWeightIndex); int OC8 = UP_DIV(weight_tensor->Batch(), C8NUM); - auto origin_weight = reinterpret_cast<float *>(weight_tensor->MutableData()); + auto origin_weight = reinterpret_cast<float16_t *>(weight_tensor->MutableData()); int pack_weight_size = C8NUM * OC8 * weight_tensor->Height() * weight_tensor->Width(); packed_weight_ = reinterpret_cast<float16_t *>(malloc(pack_weight_size * sizeof(float16_t))); @@ -81,7 +81,7 @@ int DeconvolutionDepthwiseFp16CPUKernel::InitWeightBias() { MS_LOG(ERROR) << "Malloc buffer failed."; return RET_ERROR; } - PackNCHWFp32ToNC8HW8Fp16(origin_weight, packed_weight_, 1, weight_tensor->Height() * weight_tensor->Width(), + PackNCHWFp16ToNC8HW8Fp16(origin_weight, packed_weight_, 1, weight_tensor->Height() * weight_tensor->Width(), weight_tensor->Batch()); bias_data_ = reinterpret_cast<float16_t *>(malloc(C8NUM * OC8 * sizeof(float16_t))); @@ -92,9 +92,9 @@ int DeconvolutionDepthwiseFp16CPUKernel::InitWeightBias() { memset(bias_data_, 0, C8NUM * OC8 * sizeof(float16_t)); if (in_tensors_.size() == kInputSize2) { auto bias_tensor = in_tensors_.at(kBiasIndex); - auto ori_bias = reinterpret_cast<float *>(bias_tensor->MutableData()); + auto ori_bias = reinterpret_cast<float16_t *>(bias_tensor->MutableData()); for (int i = 0; i < bias_tensor->ElementsNum(); i++) { - reinterpret_cast<float16_t *>(bias_data_)[i] = (float16_t)ori_bias[i]; + reinterpret_cast<float16_t *>(bias_data_)[i] = ori_bias[i]; } } diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_fp16.cc index 6f3106ec4c..0a750215c9 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_fp16.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_fp16.cc @@ -57,7 +57,8 @@ int DeConvolutionFp16CPUKernel::InitWeightBias() { auto kernel_h = weight_tensor->Height(); auto kernel_w = weight_tensor->Width(); - bias_data_ = malloc(UP_ROUND(output_channel, C4NUM) * sizeof(float16_t)); + auto bias_size = UP_ROUND(output_channel, C4NUM) * sizeof(float16_t); + bias_data_ = malloc(bias_size); if (bias_data_ == nullptr) { MS_LOG(ERROR) << "deconv malloc bias_data_ error!"; return RET_ERROR; @@ -65,8 +66,15 @@ memset(bias_data_, 0, UP_ROUND(output_channel, C4NUM) * sizeof(float16_t)); if (in_tensors_.size() == 3 && in_tensors_.at(kBiasIndex)->shape().size() == 1 && in_tensors_.at(kBiasIndex)->DimensionSize(0) == output_channel) { - Float32ToFloat16(reinterpret_cast<float *>(in_tensors_.at(2)->MutableData()), - reinterpret_cast<float16_t *>(bias_data_), output_channel); + if (in_tensors_.at(2)->data_type() != kNumberTypeFloat16) { + MS_LOG(ERROR) << "deconv fp16 kernel require fp16 bias"; + return RET_ERROR; + } + if (bias_size != in_tensors_.at(2)->Size()) { + MS_LOG(ERROR) << "input bias size not match : " << bias_size << " vs " << in_tensors_.at(2)->Size(); + return RET_ERROR; + } + memcpy(bias_data_, in_tensors_.at(2)->data_c(), bias_size); } size_t weight_pack_size = input_channel * kernel_w * kernel_h * UP_ROUND(output_channel, C8NUM) * sizeof(float16_t); @@ -76,7 +84,11 @@ return RET_ERROR; } 
memset(execute_weight_, 0, weight_pack_size); - PackNHWCFp32ToC8HWN8Fp16(reinterpret_cast<float *>(in_tensors_.at(1)->MutableData()), execute_weight_, input_channel, + if (in_tensors_.at(1)->data_type() != kNumberTypeFloat16) { + MS_LOG(ERROR) << "deconv fp16 kernel require fp16 weight"; + return RET_ERROR; + } + PackNHWCFp16ToC8HWN8Fp16(reinterpret_cast<float16_t *>(in_tensors_.at(1)->data_c()), execute_weight_, input_channel, kernel_w * kernel_h, output_channel); return RET_OK; } diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_winograd_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_winograd_fp16.cc index c3ad9aa89a..9cbf54369b 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_winograd_fp16.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_winograd_fp16.cc @@ -341,7 +341,7 @@ int DeConvWinogradFp16CPUKernel::InitDataParam() { auto fp16_bias_data = reinterpret_cast<float16_t *>(bias_data_); if (in_tensors_.size() == 3 && in_tensors_.at(kBiasIndex)->shape().size() == 1 && in_tensors_.at(kBiasIndex)->DimensionSize(0) == conv_param_->output_channel_) { - auto src_bias = reinterpret_cast<float *>(in_tensors_.at(kBiasIndex)->MutableData()); + auto src_bias = reinterpret_cast<float16_t *>(in_tensors_.at(kBiasIndex)->MutableData()); MS_ASSERT(src_bias); for (int i = 0; i < conv_param_->output_channel_; ++i) { fp16_bias_data[i] = (float16_t)src_bias[i]; diff --git a/mindspore/lite/src/scheduler.cc b/mindspore/lite/src/scheduler.cc index 2ec491ac8b..0308c1d927 100644 --- a/mindspore/lite/src/scheduler.cc +++ b/mindspore/lite/src/scheduler.cc @@ -239,6 +239,9 @@ int CopyConstTensor(Tensor *tensor, std::map<Tensor *, Tensor *> *restored_origi return RET_ERROR; #endif } else { + if (tensor->own_data()) { + return RET_OK; + } tensor->set_data(nullptr); auto ret = tensor->MallocData(); if (RET_OK != ret) { @@ -253,8 +256,18 @@ } #endif -inline void RestoreTensorData(const std::map<Tensor *, Tensor *> &restored_origin_tensors) { - for (auto &restored_origin_tensor : restored_origin_tensors) { +inline void FreeRestoreTensors(std::map<Tensor *, Tensor *> *restored_origin_tensors) { + MS_ASSERT(restored_origin_tensors != nullptr); + for (auto &restored_origin_tensor : *restored_origin_tensors) { + restored_origin_tensor.second->set_data(nullptr); + delete (restored_origin_tensor.second); + } + restored_origin_tensors->clear(); +} + +inline void RestoreTensorData(std::map<Tensor *, Tensor *> *restored_origin_tensors) { + MS_ASSERT(restored_origin_tensors != nullptr); + for (auto &restored_origin_tensor : *restored_origin_tensors) { auto *origin_tensor = restored_origin_tensor.first; auto *restored_tensor = restored_origin_tensor.second; MS_ASSERT(origin_tensor != nullptr); @@ -264,15 +277,7 @@ inline void RestoreTensorData(const std::map<Tensor *, Tensor *> &restored_origi origin_tensor->set_data(restored_tensor->data_c()); origin_tensor->set_own_data(restored_tensor->own_data()); } -} - -inline void FreeRestoreTensors(std::map<Tensor *, Tensor *> *restored_origin_tensors) { - MS_ASSERT(restored_origin_tensors != nullptr); - for (auto &restored_origin_tensor : *restored_origin_tensors) { - restored_origin_tensor.second->set_data(nullptr); - delete (restored_origin_tensor.second); - } - restored_origin_tensors->clear(); + FreeRestoreTensors(restored_origin_tensors); } inline bool IsChannelFirst(int index, OpParameter *op_parameter) { @@ -297,54 +302,54 @@ kernel::LiteKernel *Scheduler::FindCpuKernel(const std::vector<Tensor *> &in_ten if (!KernelRegistry::GetInstance()->SupportKernel(desc)) { return nullptr; } + kernel::KernelKey cpu_desc = desc; + if 
(kernel_data_type == kNumberTypeFloat16) { + if (!context_->IsCpuFloat16Enabled() || + (cpu_desc.data_type != kNumberTypeFloat32 && cpu_desc.data_type != kNumberTypeFloat16)) { + return nullptr; + } + cpu_desc.data_type = kNumberTypeFloat16; + } std::map<Tensor *, Tensor *> restored_origin_tensors; int index = 0; for (auto &tensor : in_tensors) { auto channel_first = IsChannelFirst(index++, op_parameter); - auto *restore_tensor = DequantUtil::DequantTensor(tensor, desc.data_type, channel_first, kernel_data_type); + auto *restore_tensor = DequantUtil::DequantTensor(tensor, cpu_desc.data_type, channel_first, kernel_data_type); if (restore_tensor != nullptr) { restored_origin_tensors[tensor] = restore_tensor; } else { #ifndef SUPPORT_TRAIN - if (!IsPackedOp(op_type) && !tensor->own_data()) { // && op_type != schema::PrimitiveType_LSTM - auto ret = CopyConstTensor(tensor, &restored_origin_tensors, kernel_data_type); - if (ret != RET_OK) { - MS_LOG(DEBUG) << "CopyConstTensor failed: " << ret; - return nullptr; - } + auto ret = CopyConstTensor(tensor, &restored_origin_tensors, kernel_data_type); + if (ret != RET_OK) { + MS_LOG(DEBUG) << "CopyConstTensor failed: " << ret; + return nullptr; } #endif } } - auto *kernel = KernelRegistry::GetInstance()->GetKernel(in_tensors, out_tensors, context_, desc, op_parameter); + auto *kernel = KernelRegistry::GetInstance()->GetKernel(in_tensors, out_tensors, context_, cpu_desc, op_parameter); if (kernel != nullptr) { - MS_LOG(DEBUG) << "Get TypeId(" << kernel_data_type << ") op success: " << PrimitiveTypeName(op_type); + MS_LOG(DEBUG) << "Get TypeId(" << kernel_data_type << ") op success: " << PrimitiveCurVersionTypeName(op_type); FreeRestoreTensors(&restored_origin_tensors); } else { - RestoreTensorData(restored_origin_tensors); + RestoreTensorData(&restored_origin_tensors); } return kernel; -} +} // namespace mindspore::lite -kernel::LiteKernel *Scheduler::FindBackendKernel(const std::vector<Tensor *> &in_tensors, - const std::vector<Tensor *> &out_tensors, const Model::Node *node, - TypeId prefer_data_type) { - MS_ASSERT(node != nullptr); - bool need_dequant = node->quant_type_ == schema::QuantType_WeightQuant; - TypeId data_type = need_dequant ? 
kNumberTypeFloat32 : GetFirstFp32Fp16OrInt8Type(in_tensors); - OpParameter *op_parameter = op_parameters_[node->output_indices_.at(0)]; - if (op_parameter == nullptr) { - MS_LOG(ERROR) << "Can not find OpParameter!type: " << PrimitiveTypeName(GetPrimitiveType(node->primitive_)); - return nullptr; - } - bool infer_shape_interrupt = !op_parameter->infer_flag_; - kernel::KernelKey desc{kCPU, data_type, static_cast<schema::PrimitiveType>(op_parameter->type_)}; -#if SUPPORT_GPU +kernel::LiteKernel *Scheduler::FindGpuKernel(const std::vector<Tensor *> &in_tensors, + const std::vector<Tensor *> &out_tensors, OpParameter *op_parameter, + const kernel::KernelKey &desc) { + MS_ASSERT(op_parameter != nullptr); if (context_->IsGpuEnabled()) { // support more data type like int32 kernel::KernelKey gpu_desc{kGPU, kNumberTypeFloat32, desc.type}; - if (context_->IsGpuFloat16Enabled()) gpu_desc.data_type = kNumberTypeFloat16; - if (in_tensors.front()->data_type() == kNumberTypeInt8) gpu_desc.data_type = kNumberTypeInt8; + if (context_->IsGpuFloat16Enabled()) { + gpu_desc.data_type = kNumberTypeFloat16; + } + if (in_tensors.front()->data_type() == kNumberTypeInt8) { + gpu_desc.data_type = kNumberTypeInt8; + } // weight quant std::map<Tensor *, Tensor *> restored_origin_tensors; @@ -359,36 +364,32 @@ kernel::LiteKernel *Scheduler::FindBackendKernel(const std::vector<Tensor *> &in auto *kernel = KernelRegistry::GetInstance()->GetKernel(in_tensors, out_tensors, context_, gpu_desc, op_parameter); if (kernel != nullptr) { - MS_LOG(DEBUG) << "Get gpu op success: " << PrimitiveCurVersionTypeName(gpu_desc.type) << " " << node->name_; + MS_LOG(DEBUG) << "Get gpu op success: " << PrimitiveCurVersionTypeName(gpu_desc.type); FreeRestoreTensors(&restored_origin_tensors); - return kernel; } else { - MS_LOG(DEBUG) << "Get gpu op failed, scheduler to cpu: " << PrimitiveCurVersionTypeName(gpu_desc.type) << " " - << node->name_; - auto ret = InferNodeShape(node, &infer_shape_interrupt); - if (ret == RET_INFER_INVALID || ret == RET_OK) { - op_parameter = op_parameters_[node->output_indices_.at(0)]; - } else { - RestoreTensorData(restored_origin_tensors); - MS_LOG(ERROR) << "Try repeat infer fail: " << node->name_; - return nullptr; - } + MS_LOG(DEBUG) << "Get gpu op failed, scheduler to cpu: " << PrimitiveCurVersionTypeName(gpu_desc.type); + RestoreTensorData(&restored_origin_tensors); } + return kernel; + } else { + return nullptr; } -#endif -#if SUPPORT_NPU +} + +kernel::LiteKernel *Scheduler::FindNpuKernel(const std::vector<Tensor *> &in_tensors, + const std::vector<Tensor *> &out_tensors, OpParameter *op_parameter, + const kernel::KernelKey &desc) { + MS_ASSERT(op_parameter != nullptr); + kernel::KernelKey npu_desc{kNPU, desc.data_type, desc.type}; if (context_->IsNpuEnabled()) { - if (desc.data_type == kNumberTypeFloat16) { - desc.data_type = kNumberTypeFloat32; + if (npu_desc.data_type == kNumberTypeFloat16) { + npu_desc.data_type = kNumberTypeFloat32; } for (auto tensor : in_tensors) { if (tensor->data_type() == kNumberTypeFloat16) { tensor->set_data_type(kNumberTypeFloat32); } } - kernel::KernelKey npu_desc{kNPU, desc.data_type, desc.type}; - - // weight quant std::map<Tensor *, Tensor *> restored_origin_tensors; for (auto &tensor : in_tensors) { int index = 0; @@ -400,33 +401,72 @@ kernel::LiteKernel *Scheduler::FindBackendKernel(const std::vector<Tensor *> &in } auto *kernel = KernelRegistry::GetInstance()->GetKernel(in_tensors, out_tensors, context_, npu_desc, op_parameter); if (kernel != nullptr) { - MS_LOG(DEBUG) << "Get npu op success: " << PrimitiveCurVersionTypeName(npu_desc.type) << " " << node->name_; 
FreeRestoreTensors(&restored_origin_tensors); - return kernel; + MS_LOG(DEBUG) << "Get npu op success: " << PrimitiveCurVersionTypeName(npu_desc.type); } else { - MS_LOG(DEBUG) << "Get npu op failed, scheduler to cpu: " << PrimitiveCurVersionTypeName(npu_desc.type) << " " - << node->name_; - RestoreTensorData(restored_origin_tensors); - auto ret = InferNodeShape(node, &infer_shape_interrupt); - if (ret == RET_INFER_INVALID || ret == RET_OK) { - op_parameter = op_parameters_[node->output_indices_.at(0)]; - } else { - MS_LOG(ERROR) << "Try repeat infer fail: " << node->name_; - return nullptr; - } + RestoreTensorData(&restored_origin_tensors); + MS_LOG(DEBUG) << "Get npu op failed, scheduler to cpu: " << PrimitiveCurVersionTypeName(npu_desc.type); + } + return kernel; + } else { + return nullptr; + } +} + +kernel::LiteKernel *Scheduler::FindBackendKernel(const std::vector<Tensor *> &in_tensors, + const std::vector<Tensor *> &out_tensors, const Model::Node *node, + TypeId prefer_data_type) { + MS_ASSERT(node != nullptr); + // why we need this + TypeId data_type = + (node->quant_type_ == schema::QuantType_WeightQuant) ? kNumberTypeFloat32 : GetFirstFp32Fp16OrInt8Type(in_tensors); + OpParameter *op_parameter = op_parameters_[node->output_indices_.at(0)]; + if (op_parameter == nullptr) { + MS_LOG(ERROR) << "Can not find OpParameter!type: " << PrimitiveTypeName(GetPrimitiveType(node->primitive_)); + return nullptr; + } + bool infer_shape_interrupt = !op_parameter->infer_flag_; + kernel::KernelKey desc{kCPU, data_type, static_cast<schema::PrimitiveType>(op_parameter->type_)}; + kernel::LiteKernel *kernel = nullptr; +#ifdef SUPPORT_GPU + kernel = FindGpuKernel(in_tensors, out_tensors, op_parameter, desc); + if (kernel != nullptr) { + return kernel; + } else { + MS_LOG(DEBUG) << "Get gpu op failed, scheduler to cpu: " << PrimitiveCurVersionTypeName(desc.type) << " " + << node->name_; + auto ret = InferNodeShape(node, &infer_shape_interrupt); + if (ret == RET_INFER_INVALID || ret == RET_OK) { + op_parameter = op_parameters_[node->output_indices_.at(0)]; + } else { + MS_LOG(ERROR) << "Try repeat infer fail: " << node->name_; + return nullptr; } } #endif - if ((prefer_data_type == kNumberTypeFloat16 || prefer_data_type == kTypeUnknown) && - mindspore::lite::IsSupportFloat16() && - ((context_->IsCpuFloat16Enabled() && data_type == kNumberTypeFloat32) || data_type == kNumberTypeFloat16)) { - kernel::KernelKey fp16_cpu_desc{desc.arch, kNumberTypeFloat16, desc.type}; - auto kernel = FindCpuKernel(in_tensors, out_tensors, op_parameter, fp16_cpu_desc, kNumberTypeFloat16); +#ifdef SUPPORT_NPU + kernel = FindNpuKernel(in_tensors, out_tensors, op_parameter, desc); + if (kernel != nullptr) { + return kernel; + } else { + MS_LOG(DEBUG) << "Get npu op failed, scheduler to cpu: " << PrimitiveCurVersionTypeName(desc.type) << " " + << node->name_; + auto ret = InferNodeShape(node, &infer_shape_interrupt); + if (ret == RET_INFER_INVALID || ret == RET_OK) { + op_parameter = op_parameters_[node->output_indices_.at(0)]; + } else { + MS_LOG(ERROR) << "Try repeat infer fail: " << node->name_; + return nullptr; + } + } +#endif + if (prefer_data_type == kNumberTypeFloat16 || prefer_data_type == kTypeUnknown) { + kernel = FindCpuKernel(in_tensors, out_tensors, op_parameter, desc, kNumberTypeFloat16); if (kernel != nullptr) { return kernel; } else { - MS_LOG(DEBUG) << "Get fp16 op failed, scheduler to cpu: " << PrimitiveCurVersionTypeName(fp16_cpu_desc.type) - << " " << node->name_; + MS_LOG(DEBUG) << "Get fp16 op failed, scheduler to cpu: " << 
PrimitiveCurVersionTypeName(desc.type) << " " + << node->name_; auto ret = InferNodeShape(node, &infer_shape_interrupt); if (ret == RET_INFER_INVALID || ret == RET_OK) { op_parameter = op_parameters_[node->output_indices_.at(0)]; @@ -441,20 +481,18 @@ kernel::LiteKernel *Scheduler::FindBackendKernel(const std::vector<Tensor *> &in desc.data_type = kNumberTypeFloat32; } if (prefer_data_type == kNumberTypeFloat32 || prefer_data_type == kTypeUnknown) { - auto kernel = FindCpuKernel(in_tensors, out_tensors, op_parameter, desc, kNumberTypeFloat32); + kernel = FindCpuKernel(in_tensors, out_tensors, op_parameter, desc, kNumberTypeFloat32); if (kernel != nullptr) { return kernel; } else { auto ret = InferNodeShape(node, &infer_shape_interrupt); if (!(ret == RET_INFER_INVALID || ret == RET_OK)) { - MS_LOG(ERROR) - - << "Try repeat infer fail: " << node->name_; + MS_LOG(ERROR) << "Try repeat infer fail: " << node->name_; } } } return nullptr; -} // namespace mindspore::lite +} kernel::LiteKernel *Scheduler::SchedulePartialToKernel(const lite::Model::Node *src_node) { MS_ASSERT(src_model_ != nullptr); diff --git a/mindspore/lite/src/scheduler.h b/mindspore/lite/src/scheduler.h index f4fe469520..8c67a4d95d 100644 --- a/mindspore/lite/src/scheduler.h +++ b/mindspore/lite/src/scheduler.h @@ -61,6 +61,10 @@ class Scheduler { TypeId prefer_data_type = kTypeUnknown); kernel::LiteKernel *FindCpuKernel(const std::vector<Tensor *> &in_tensors, const std::vector<Tensor *> &out_tensors, OpParameter *op_parameter, const kernel::KernelKey &desc, TypeId kernel_data_type); + kernel::LiteKernel *FindGpuKernel(const std::vector<Tensor *> &in_tensors, const std::vector<Tensor *> &out_tensors, + OpParameter *op_parameter, const kernel::KernelKey &desc); + kernel::LiteKernel *FindNpuKernel(const std::vector<Tensor *> &in_tensors, const std::vector<Tensor *> &out_tensors, + OpParameter *op_parameter, const kernel::KernelKey &desc); // schedule a partial node to a subgraph_kernel kernel::LiteKernel *SchedulePartialToKernel(const lite::Model::Node *src_node); // schedule a node to a kernel diff --git a/mindspore/lite/tools/benchmark/benchmark.cc b/mindspore/lite/tools/benchmark/benchmark.cc index e9a74debe5..8eabdf1cb4 100644 --- a/mindspore/lite/tools/benchmark/benchmark.cc +++ b/mindspore/lite/tools/benchmark/benchmark.cc @@ -412,9 +412,7 @@ int Benchmark::MarkPerformance() { for (int i = 0; i < flags_->loop_count_; i++) { session_->BindThread(true); auto start = GetTimeUs(); - auto status = (flags_->time_profiling_ || flags_->perf_profiling_) - ? 
session_->RunGraph(before_call_back_, after_call_back_) - : session_->RunGraph(); + auto status = session_->RunGraph(before_call_back_, after_call_back_); if (status != 0) { MS_LOG(ERROR) << "Inference error " << status; std::cerr << "Inference error " << status; @@ -479,7 +477,7 @@ int Benchmark::MarkAccuracy() { std::cerr << "PrintInputData error " << status << std::endl; return status; } - status = session_->RunGraph(); + status = session_->RunGraph(before_call_back_, after_call_back_); if (status != RET_OK) { MS_LOG(ERROR) << "Inference error " << status; std::cerr << "Inference error " << status << std::endl; @@ -615,7 +613,9 @@ int Benchmark::RunBenchmark() { return ret; } } - if (model != nullptr) model->Free(); + if (model != nullptr) { + model->Free(); + } ms_inputs_ = session_->GetInputs(); auto end_prepare_time = GetTimeUs(); @@ -689,18 +689,18 @@ int Benchmark::InitTimeProfilingCallbackParameter() { // before callback before_call_back_ = [&](const std::vector<tensor::MSTensor *> &before_inputs, const std::vector<tensor::MSTensor *> &before_outputs, - const CallBackParam &callParam) { + const CallBackParam &call_param) { if (before_inputs.empty()) { MS_LOG(INFO) << "The num of beforeInputs is empty"; } if (before_outputs.empty()) { MS_LOG(INFO) << "The num of beforeOutputs is empty"; } - if (op_times_by_type_.find(callParam.node_type) == op_times_by_type_.end()) { - op_times_by_type_.insert(std::make_pair(callParam.node_type, std::make_pair(0, 0.0f))); + if (op_times_by_type_.find(call_param.node_type) == op_times_by_type_.end()) { + op_times_by_type_.insert(std::make_pair(call_param.node_type, std::make_pair(0, 0.0f))); } - if (op_times_by_name_.find(callParam.node_name) == op_times_by_name_.end()) { - op_times_by_name_.insert(std::make_pair(callParam.node_name, std::make_pair(0, 0.0f))); + if (op_times_by_name_.find(call_param.node_name) == op_times_by_name_.end()) { + op_times_by_name_.insert(std::make_pair(call_param.node_name, std::make_pair(0, 0.0f))); } op_call_times_total_++; @@ -735,6 +735,7 @@ }; return RET_OK; } + int Benchmark::InitPerfProfilingCallbackParameter() { #ifndef ENABLE_ARM64 MS_LOG(ERROR) << "Only support perf_profiling on arm64."; @@ -781,18 +782,18 @@ int Benchmark::InitPerfProfilingCallbackParameter() { // before callback before_call_back_ = [&](const std::vector<tensor::MSTensor *> &before_inputs, const std::vector<tensor::MSTensor *> &before_outputs, - const CallBackParam &callParam) { + const CallBackParam &call_param) { if (before_inputs.empty()) { MS_LOG(INFO) << "The num of beforeInputs is empty"; } if (before_outputs.empty()) { MS_LOG(INFO) << "The num of beforeOutputs is empty"; } - if (op_perf_by_type_.find(callParam.node_type) == op_perf_by_type_.end()) { - op_perf_by_type_.insert(std::make_pair(callParam.node_type, std::make_pair(0, zero))); + if (op_perf_by_type_.find(call_param.node_type) == op_perf_by_type_.end()) { + op_perf_by_type_.insert(std::make_pair(call_param.node_type, std::make_pair(0, zero))); } - if (op_perf_by_name_.find(callParam.node_name) == op_perf_by_name_.end()) { - op_perf_by_name_.insert(std::make_pair(callParam.node_name, std::make_pair(0, zero))); + if (op_perf_by_name_.find(call_param.node_name) == op_perf_by_name_.end()) { + op_perf_by_name_.insert(std::make_pair(call_param.node_name, std::make_pair(0, zero))); } op_call_times_total_++; @@ -831,12 +832,89 @@ return RET_OK; } +namespace { +template <typename T> +std::string DataToString(void *data, size_t data_number) { + if (data == nullptr) { + 
return "Data of tensor is nullptr"; + } + std::ostringstream oss; + auto casted_data = static_cast(data); + for (size_t i = 0; i < 40 && i < data_number; i++) { + oss << " " << casted_data[i]; + } + return oss.str(); +} + +std::string DumpMSTensor(tensor::MSTensor *tensor) { + if (tensor == nullptr) { + return "Tensor is nullptr"; + } + std::ostringstream oss; + oss << " DataType: " << tensor->data_type(); + oss << " Shape:"; + for (auto &dim : tensor->shape()) { + oss << " " << dim; + } + oss << std::endl << "Data:"; + switch (tensor->data_type()) { + case kNumberTypeFloat32: { + oss << DataToString(tensor->data(), tensor->ElementsNum()); + } break; + case kNumberTypeFloat16: { + oss << DataToString(tensor->data(), tensor->ElementsNum()); + } break; + case kNumberTypeInt32: { + oss << DataToString(tensor->data(), tensor->ElementsNum()); + } break; + case kNumberTypeInt16: { + oss << DataToString(tensor->data(), tensor->ElementsNum()); + } break; + case kNumberTypeInt8: { + oss << DataToString(tensor->data(), tensor->ElementsNum()); + } break; + default: + oss << "Unsupported data type to print"; + break; + } + return oss.str(); +} +} // namespace + +int Benchmark::InitDumpProfilingCallbackParameter() { + // before callback + before_call_back_ = [&](const std::vector &before_inputs, + const std::vector &before_outputs, + const CallBackParam &call_param) { return true; }; + + // after callback + after_call_back_ = [&](const std::vector &after_inputs, + const std::vector &after_outputs, + const CallBackParam &call_param) { + std::cout << "================================================================" << std::endl; + std::cout << call_param.node_name << " inputs : " << std::endl; + for (auto ms_tensor : after_inputs) { + std::cout << DumpMSTensor(ms_tensor) << std::endl; + } + std::cout << "----------------------------------------------------------------" << std::endl; + std::cout << call_param.node_name << " outputs : " << std::endl; + for (const auto ms_tensor : after_outputs) { + std::cout << DumpMSTensor(ms_tensor) << std::endl; + } + std::cout << "================================================================" << std::endl; + return true; + }; + return RET_OK; +} + int Benchmark::InitCallbackParameter() { int ret = RET_OK; if (flags_->time_profiling_) { ret = InitTimeProfilingCallbackParameter(); } else if (flags_->perf_profiling_) { ret = InitPerfProfilingCallbackParameter(); + } else if (flags_->dump_profiling_) { + ret = InitDumpProfilingCallbackParameter(); } return ret; } @@ -917,16 +995,14 @@ int Benchmark::Init() { return RET_ERROR; } - if (flags_->time_profiling_ || flags_->perf_profiling_) { - if (flags_->time_profiling_ && flags_->perf_profiling_) { - MS_LOG(INFO) << "time_profiling is enabled, will not run perf_profiling."; - } - auto status = InitCallbackParameter(); - if (status != RET_OK) { - MS_LOG(ERROR) << "Init callback Parameter failed."; - std::cerr << "Init callback Parameter failed." << std::endl; - return RET_ERROR; - } + if (flags_->time_profiling_ && flags_->perf_profiling_) { + MS_LOG(INFO) << "time_profiling is enabled, will not run perf_profiling."; + } + auto status = InitCallbackParameter(); + if (status != RET_OK) { + MS_LOG(ERROR) << "Init callback Parameter failed."; + std::cerr << "Init callback Parameter failed." 
<< std::endl; + return RET_ERROR; } return RET_OK; diff --git a/mindspore/lite/tools/benchmark/benchmark.h b/mindspore/lite/tools/benchmark/benchmark.h index c62c973d66..a33a367f5e 100644 --- a/mindspore/lite/tools/benchmark/benchmark.h +++ b/mindspore/lite/tools/benchmark/benchmark.h @@ -113,9 +113,6 @@ class MS_API BenchmarkFlags : public virtual FlagParser { int num_threads_ = 2; bool enable_fp16_ = false; int warm_up_loop_count_ = 3; - bool time_profiling_ = false; - bool perf_profiling_ = false; - std::string perf_event_ = "CYCLE"; // MarkAccuracy std::string benchmark_data_file_; std::string benchmark_data_type_ = "FLOAT"; @@ -125,6 +122,10 @@ class MS_API BenchmarkFlags : public virtual FlagParser { std::vector<std::vector<int>> resize_dims_; std::string device_ = "CPU"; + bool time_profiling_ = false; + bool perf_profiling_ = false; + std::string perf_event_ = "CYCLE"; + bool dump_profiling_ = false; }; class MS_API Benchmark { @@ -163,9 +164,13 @@ class MS_API Benchmark { int *total_size); int InitCallbackParameter(); + int InitTimeProfilingCallbackParameter(); + int InitPerfProfilingCallbackParameter(); + int InitDumpProfilingCallbackParameter(); + int PrintResult(const std::vector<std::string> &title, const std::map<std::string, std::pair<int, float>> &result); #ifdef ENABLE_ARM64 @@ -289,8 +294,8 @@ class MS_API Benchmark { std::map> op_perf_by_type_; std::map> op_perf_by_name_; #endif - KernelCallBack before_call_back_; - KernelCallBack after_call_back_; + KernelCallBack before_call_back_ = nullptr; + KernelCallBack after_call_back_ = nullptr; std::mt19937 random_engine_; }; diff --git a/mindspore/lite/tools/optimizer/graph/infershape_pass.cc b/mindspore/lite/tools/optimizer/graph/infershape_pass.cc index da8bece8b0..9d2e067bbc 100644 --- a/mindspore/lite/tools/optimizer/graph/infershape_pass.cc +++ b/mindspore/lite/tools/optimizer/graph/infershape_pass.cc @@ -193,7 +193,7 @@ STATUS InferShapePass::GetCNodeInputTensors(const CNodePtr &cnode, std::vector tensor = nullptr;