From 3066d4a17cc73490a9498f1b34b29ab755fed884 Mon Sep 17 00:00:00 2001 From: hangangqiang Date: Fri, 26 Mar 2021 10:47:31 +0800 Subject: [PATCH] reduce runtime ram while fp16 is enabled --- mindspore/lite/nnacl/fp16/pack_fp16.c | 34 +++ mindspore/lite/nnacl/fp16/pack_fp16.h | 4 + .../lite/nnacl/infer/quant_dtype_cast_infer.c | 3 - .../lite/nnacl/infer/quant_dtype_cast_infer.h | 2 +- mindspore/lite/src/inner_context.cc | 50 +++-- mindspore/lite/src/inner_context.h | 7 + .../kernel/arm/base/quant_dtype_cast.cc | 50 +---- .../arm/fp16/convolution_depthwise_fp16.cc | 4 +- .../convolution_depthwise_slidewindow_fp16.cc | 4 +- .../arm/fp16/deconvolution_depthwise_fp16.cc | 8 +- .../kernel/arm/fp16/deconvolution_fp16.cc | 20 +- .../arm/fp16/deconvolution_winograd_fp16.cc | 2 +- mindspore/lite/src/scheduler.cc | 200 +++++++++++------- mindspore/lite/src/scheduler.h | 4 + mindspore/lite/tools/benchmark/benchmark.cc | 126 ++++++++--- mindspore/lite/tools/benchmark/benchmark.h | 15 +- .../tools/optimizer/graph/infershape_pass.cc | 2 +- 17 files changed, 350 insertions(+), 185 deletions(-) diff --git a/mindspore/lite/nnacl/fp16/pack_fp16.c b/mindspore/lite/nnacl/fp16/pack_fp16.c index 67fe8d6455..aecc351ec0 100644 --- a/mindspore/lite/nnacl/fp16/pack_fp16.c +++ b/mindspore/lite/nnacl/fp16/pack_fp16.c @@ -474,6 +474,25 @@ void PackNCHWFp32ToNC8HW8Fp16(float *src, float16_t *dst, int batch, int plane, } } +void PackNCHWFp16ToNC8HW8Fp16(float16_t *src, float16_t *dst, int batch, int plane, int channel) { + int c8 = UP_DIV(channel, C8NUM); + for (int b = 0; b < batch; b++) { + int src_offset = b * plane * channel; + int dst_offset = b * plane * c8 * C8NUM; + for (int c = 0; c < channel; c++) { + int c8_block_num = c / C8NUM; + int c8_block_rem = c % C8NUM; + int src_c_offset = src_offset + c * plane; + int dst_c_offset = dst_offset + c8_block_num * plane * C8NUM; + for (int k = 0; k < plane; k++) { + int src_kernel_offset = src_c_offset + k; + int dst_kernel_offset = dst_c_offset + C8NUM * k + c8_block_rem; + (dst + dst_kernel_offset)[0] = (float16_t)(src + src_kernel_offset)[0]; + } + } + } +} + void PackNHWCFp32ToNHWC8Fp16(float *src, float16_t *dst, int batch, int plane, int channel) { int c8_channel = UP_DIV(channel, C8NUM) * C8NUM; for (int b = 0; b < batch; b++) { @@ -504,6 +523,21 @@ void PackNHWCFp32ToC8HWN8Fp16(float *src, float16_t *dst, int batch, int plane, return; } +void PackNHWCFp16ToC8HWN8Fp16(float16_t *src, float16_t *dst, int batch, int plane, int channel) { + for (int n = 0; n < batch; n++) { + for (int hw = 0; hw < plane; hw++) { + for (int c = 0; c < channel; c++) { + int c8div = c / C8NUM; + int c8mod = c % C8NUM; + int src_index = n * plane * channel + hw * channel + c; + int dst_index = c8div * batch * plane * C8NUM + hw * batch * C8NUM + n * C8NUM + c8mod; + dst[dst_index] = src[src_index]; + } + } + } + return; +} + void PackNHWC8Fp16ToNHWCFp32(float16_t *src, float *dst, int batch, int plane, int channel) { int c8_channel = UP_DIV(channel, C8NUM) * C8NUM; for (int b = 0; b < batch; b++) { diff --git a/mindspore/lite/nnacl/fp16/pack_fp16.h b/mindspore/lite/nnacl/fp16/pack_fp16.h index fc82ff66a3..b49a35b479 100644 --- a/mindspore/lite/nnacl/fp16/pack_fp16.h +++ b/mindspore/lite/nnacl/fp16/pack_fp16.h @@ -61,10 +61,14 @@ void PackNC8HW8ToNHWCFp16(const void *src, void *dst, int batch, int plane, int void PackNCHWFp32ToNC8HW8Fp16(float *src, float16_t *dst, int batch, int plane, int channel); +void PackNCHWFp16ToNC8HW8Fp16(float16_t *src, float16_t *dst, int batch, int plane, int 
channel); + void PackNHWCFp32ToNHWC8Fp16(float *src, float16_t *dst, int batch, int plane, int channel); void PackNHWCFp32ToC8HWN8Fp16(float *src, float16_t *dst, int batch, int plane, int channel); +void PackNHWCFp16ToC8HWN8Fp16(float16_t *src, float16_t *dst, int batch, int plane, int channel); + void PackNHWC8Fp16ToNHWCFp32(float16_t *src, float *dst, int batch, int plane, int channel); void PackNHWC8ToNHWCFp16(float16_t *src, float16_t *dst, int batch, int plane, int channel); diff --git a/mindspore/lite/nnacl/infer/quant_dtype_cast_infer.c b/mindspore/lite/nnacl/infer/quant_dtype_cast_infer.c index d0caa00192..5fdc564972 100644 --- a/mindspore/lite/nnacl/infer/quant_dtype_cast_infer.c +++ b/mindspore/lite/nnacl/infer/quant_dtype_cast_infer.c @@ -30,9 +30,6 @@ int QuantDtypeCastInferShape(const TensorC *const *inputs, size_t inputs_size, T TensorC *output = outputs[0]; QuantDtypeCastParameter *param = (QuantDtypeCastParameter *)parameter; - if (input->data_type_ != param->srcT_) { - return NNACL_ERR; - } output->data_type_ = param->dstT_; output->format_ = input->format_; if (!parameter->infer_flag_) { diff --git a/mindspore/lite/nnacl/infer/quant_dtype_cast_infer.h b/mindspore/lite/nnacl/infer/quant_dtype_cast_infer.h index b1fb1ca101..8357fec315 100644 --- a/mindspore/lite/nnacl/infer/quant_dtype_cast_infer.h +++ b/mindspore/lite/nnacl/infer/quant_dtype_cast_infer.h @@ -24,7 +24,7 @@ extern "C" { typedef struct QuantDtypeCastParameter { OpParameter op_parameter_; - int srcT_; + int srcT_; // deprecated int dstT_; } QuantDtypeCastParameter; diff --git a/mindspore/lite/src/inner_context.cc b/mindspore/lite/src/inner_context.cc index 33f01758a4..29dfeab99f 100644 --- a/mindspore/lite/src/inner_context.cc +++ b/mindspore/lite/src/inner_context.cc @@ -17,6 +17,7 @@ #include "src/inner_context.h" #include "include/errorcode.h" #include "src/common/log_adapter.h" +#include "src/common/utils.h" #ifdef SUPPORT_NPU #include "src/runtime/agent/npu/npu_manager.h" #endif @@ -85,18 +86,18 @@ int InnerContext::IsValid() const { MS_LOG(ERROR) << "Device list is empty."; return RET_NOT_SUPPORT; } - if (!IsCpuEnabled()) { - MS_LOG(ERROR) << "CPU is not supported."; + if (!IsUserSetCpu()) { + MS_LOG(ERROR) << "CPU context should be set."; return RET_NOT_SUPPORT; } #ifndef SUPPORT_GPU - if (IsGpuEnabled()) { + if (IsUserSetGpu()) { MS_LOG(ERROR) << "GPU is not supported."; return RET_NOT_SUPPORT; } #endif #ifndef SUPPORT_NPU - if (IsNpuEnabled()) { + if (IsUserSetNpu()) { MS_LOG(ERROR) << "NPU is not supported."; return RET_NOT_SUPPORT; } @@ -108,6 +109,9 @@ bool InnerContext::IsCpuFloat16Enabled() const { if (!IsCpuEnabled()) { return false; } + if (!IsSupportFloat16()) { + return false; + } return GetCpuInfo().enable_float16_; } @@ -115,31 +119,47 @@ bool InnerContext::IsGpuFloat16Enabled() const { if (!IsGpuEnabled()) { return false; } + if (!IsSupportFloat16()) { + return false; + } return GetGpuInfo().enable_float16_; } -bool InnerContext::IsCpuEnabled() const { +bool InnerContext::IsCpuEnabled() const { return IsUserSetCpu(); } + +bool InnerContext::IsGpuEnabled() const { +#ifdef SUPPORT_GPU + return IsUserSetGpu(); +#else + return false; +#endif +} + +bool InnerContext::IsNpuEnabled() const { +#ifdef SUPPORT_NPU + MS_ASSERT(npu_manager_ != nullptr); + return IsUserSetNpu() && npu_manager_->IsSupportNPU(); +#else + return false; +#endif +} + +bool InnerContext::IsUserSetCpu() const { return this->device_list_.end() != std::find_if(this->device_list_.begin(), this->device_list_.end(), [](const 
DeviceContext &device) { return device.device_type_ == DT_CPU; }); } -bool InnerContext::IsGpuEnabled() const { +bool InnerContext::IsUserSetGpu() const { return this->device_list_.end() != std::find_if(this->device_list_.begin(), this->device_list_.end(), [](const DeviceContext &device) { return device.device_type_ == DT_GPU; }); } -bool InnerContext::IsNpuEnabled() const { -#ifdef SUPPORT_NPU - MS_ASSERT(npu_manager_ != nullptr); +bool InnerContext::IsUserSetNpu() const { return this->device_list_.end() != - std::find_if(this->device_list_.begin(), this->device_list_.end(), - [](const DeviceContext &device) { return device.device_type_ == DT_NPU; }) && - npu_manager_->IsSupportNPU(); -#else - return false; -#endif + std::find_if(this->device_list_.begin(), this->device_list_.end(), + [](const DeviceContext &device) { return device.device_type_ == DT_NPU; }); } CpuDeviceInfo InnerContext::GetCpuInfo() const { diff --git a/mindspore/lite/src/inner_context.h b/mindspore/lite/src/inner_context.h index 3a5f18182d..41e4cad9d9 100644 --- a/mindspore/lite/src/inner_context.h +++ b/mindspore/lite/src/inner_context.h @@ -58,6 +58,13 @@ struct InnerContext : public Context { virtual ~InnerContext(); + private: + bool IsUserSetCpu() const; + + bool IsUserSetGpu() const; + + bool IsUserSetNpu() const; + #if SUPPORT_NPU private: diff --git a/mindspore/lite/src/runtime/kernel/arm/base/quant_dtype_cast.cc b/mindspore/lite/src/runtime/kernel/arm/base/quant_dtype_cast.cc index 411ecd8619..0f295434f3 100644 --- a/mindspore/lite/src/runtime/kernel/arm/base/quant_dtype_cast.cc +++ b/mindspore/lite/src/runtime/kernel/arm/base/quant_dtype_cast.cc @@ -44,48 +44,12 @@ int QuantDTypeCastCPUKernel::Init() { MS_ASSERT(out_tensor); auto param = reinterpret_cast<QuantDTypeCastParameter *>(op_parameter_); MS_ASSERT(param); - if (param->srcT == kNumberTypeFloat32 && param->dstT == kNumberTypeInt8) { - if (in_tensor->data_type() != kNumberTypeFloat32 || out_tensor->data_type() != kNumberTypeInt8) { - MS_LOG(ERROR) << "param data type and tensor data type do not match."; - return RET_ERROR; - } - } else if (param->srcT == kNumberTypeInt8 && param->dstT == kNumberTypeFloat32) { - if (in_tensor->data_type() != kNumberTypeInt8 || out_tensor->data_type() != kNumberTypeFloat32) { - MS_LOG(ERROR) << "param data type and tensor data type do not match."; - return RET_ERROR; - } - } else if (param->srcT == kNumberTypeUInt8 && param->dstT == kNumberTypeInt8) { - if (in_tensor->data_type() != kNumberTypeUInt8 || out_tensor->data_type() != kNumberTypeInt8) { - MS_LOG(ERROR) << "param data type and tensor data type do not match."; - return RET_ERROR; - } - } else if (param->srcT == kNumberTypeInt8 && param->dstT == kNumberTypeInt8) { - if (in_tensor->data_type() != kNumberTypeInt8 || out_tensor->data_type() != kNumberTypeInt8) { - MS_LOG(ERROR) << "param data type and tensor data type do not match."; - return RET_ERROR; - } - } else if (param->srcT == kNumberTypeInt8 && param->dstT == kNumberTypeUInt8) { - if (in_tensor->data_type() != kNumberTypeInt8 || out_tensor->data_type() != kNumberTypeUInt8) { - MS_LOG(ERROR) << "param data type and tensor data type do not match."; - return RET_ERROR; - } - } else if (param->srcT == kNumberTypeUInt8 && param->dstT == kNumberTypeFloat32) { - if (in_tensor->data_type() != kNumberTypeUInt8 || out_tensor->data_type() != kNumberTypeFloat32) { - MS_LOG(ERROR) << "param data type and tensor data type do not match."; - return RET_ERROR; - } - } else if (param->srcT == kNumberTypeFloat32 && param->dstT == kNumberTypeUInt8) { -
if (in_tensor->data_type() != kNumberTypeFloat32 || out_tensor->data_type() != kNumberTypeUInt8) { - MS_LOG(ERROR) << "param data type and tensor data type do not match."; - return RET_ERROR; - } - } else { - MS_LOG(ERROR) << "param data type not supported:" - << " src: " << param->srcT << " dst: " << param->dstT; - return RET_PARAM_INVALID; - } - src_dtype = param->srcT; + src_dtype = in_tensor->data_type(); dst_dtype = param->dstT; + if (out_tensor->data_type() != dst_dtype) { + MS_LOG(ERROR) << "param data type and tensor data type do not match."; + return RET_ERROR; + } if (!InferShapeDone()) { return RET_OK; @@ -149,6 +113,10 @@ int QuantDTypeCastCPUKernel::QuantDTypeCast(int task_id) { ret = DoQuantizeFp32ToInt8(float32_ptr_ + thread_offset, int8_out_ptr_ + thread_offset, output_quant_arg.scale, output_quant_arg.zeroPoint, num_unit_thread, from_uint8_src); } + } else { + MS_LOG(ERROR) << "param data type not supported:" + << " src: " << src_dtype << " dst: " << dst_dtype; + return RET_PARAM_INVALID; } if (ret != RET_OK) { diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.cc index 3121e93809..521f5b1501 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.cc @@ -47,7 +47,7 @@ int ConvolutionDepthwiseFp16CPUKernel::InitWeightBias() { MS_LOG(ERROR) << "get execute filter data failed."; return ret; } - PackNCHWToNHWCFp16(fp16_weight_, packed_weight_, 1, weight_tensor->Height() * weight_tensor->Width(), + PackNCHWToNHWCFp16(execute_weight_, packed_weight_, 1, weight_tensor->Height() * weight_tensor->Width(), weight_tensor->Batch()); if (fp16_weight_ != nullptr) { free(fp16_weight_); @@ -64,7 +64,7 @@ int ConvolutionDepthwiseFp16CPUKernel::InitWeightBias() { if (in_tensors_.size() == kInputSize2) { auto bias_tensor = in_tensors_.at(kBiasIndex); MS_ASSERT(origin_bias_); - auto ori_bias = reinterpret_cast<float *>(origin_bias_); + auto ori_bias = reinterpret_cast<float16_t *>(origin_bias_); for (int i = 0; i < bias_tensor->ElementsNum(); i++) { bias_fp16[i] = (float16_t)ori_bias[i]; } diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_slidewindow_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_slidewindow_fp16.cc index 9bc4503cee..02cdf2721c 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_slidewindow_fp16.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_slidewindow_fp16.cc @@ -68,7 +68,7 @@ int ConvolutionDepthwiseSWFp16CPUKernel::InitWeightBias() { MS_LOG(ERROR) << "Malloc buffer failed."; return RET_ERROR; } - PackNCHWFp32ToNC8HW8Fp16(reinterpret_cast<float *>(origin_weight_), packed_weight_, 1, + PackNCHWFp16ToNC8HW8Fp16(reinterpret_cast<float16_t *>(origin_weight_), packed_weight_, 1, weight_tensor->Height() * weight_tensor->Width(), weight_tensor->Batch()); bias_data_ = reinterpret_cast<float16_t *>(malloc(C8NUM * OC8 * sizeof(float16_t))); @@ -81,7 +81,7 @@ int ConvolutionDepthwiseSWFp16CPUKernel::InitWeightBias() { if (in_tensors_.size() == kInputSize2) { auto bias_tensor = in_tensors_.at(kBiasIndex); MS_ASSERT(origin_bias_); - auto ori_bias = reinterpret_cast<float *>(origin_bias_); + auto ori_bias = reinterpret_cast<float16_t *>(origin_bias_); for (int i = 0; i < bias_tensor->ElementsNum(); i++) { bias_fp16[i] = (float16_t)ori_bias[i]; } diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_depthwise_fp16.cc 
b/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_depthwise_fp16.cc index d35f867b8e..3c1200fb97 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_depthwise_fp16.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_depthwise_fp16.cc @@ -73,7 +73,7 @@ int DeconvolutionDepthwiseFp16CPUKernel::InitWeightBias() { // init weight: o, h, w, i; o == group, i == 1 auto weight_tensor = in_tensors_.at(kWeightIndex); int OC8 = UP_DIV(weight_tensor->Batch(), C8NUM); - auto origin_weight = reinterpret_cast<float *>(weight_tensor->MutableData()); + auto origin_weight = reinterpret_cast<float16_t *>(weight_tensor->MutableData()); int pack_weight_size = C8NUM * OC8 * weight_tensor->Height() * weight_tensor->Width(); packed_weight_ = reinterpret_cast<float16_t *>(malloc(pack_weight_size * sizeof(float16_t))); @@ -81,7 +81,7 @@ int DeconvolutionDepthwiseFp16CPUKernel::InitWeightBias() { MS_LOG(ERROR) << "Malloc buffer failed."; return RET_ERROR; } - PackNCHWFp32ToNC8HW8Fp16(origin_weight, packed_weight_, 1, weight_tensor->Height() * weight_tensor->Width(), + PackNCHWFp16ToNC8HW8Fp16(origin_weight, packed_weight_, 1, weight_tensor->Height() * weight_tensor->Width(), weight_tensor->Batch()); bias_data_ = reinterpret_cast<float16_t *>(malloc(C8NUM * OC8 * sizeof(float16_t))); @@ -92,9 +92,9 @@ int DeconvolutionDepthwiseFp16CPUKernel::InitWeightBias() { memset(bias_data_, 0, C8NUM * OC8 * sizeof(float16_t)); if (in_tensors_.size() == kInputSize2) { auto bias_tensor = in_tensors_.at(kBiasIndex); - auto ori_bias = reinterpret_cast<float *>(bias_tensor->MutableData()); + auto ori_bias = reinterpret_cast<float16_t *>(bias_tensor->MutableData()); for (int i = 0; i < bias_tensor->ElementsNum(); i++) { - reinterpret_cast<float16_t *>(bias_data_)[i] = (float16_t)ori_bias[i]; + reinterpret_cast<float16_t *>(bias_data_)[i] = ori_bias[i]; } } diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_fp16.cc index 6f3106ec4c..0a750215c9 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_fp16.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_fp16.cc @@ -57,7 +57,8 @@ int DeConvolutionFp16CPUKernel::InitWeightBias() { auto kernel_h = weight_tensor->Height(); auto kernel_w = weight_tensor->Width(); - bias_data_ = malloc(UP_ROUND(output_channel, C4NUM) * sizeof(float16_t)); + auto bias_size = UP_ROUND(output_channel, C4NUM) * sizeof(float16_t); + bias_data_ = malloc(bias_size); if (bias_data_ == nullptr) { MS_LOG(ERROR) << "deconv malloc bias_data_ error!"; return RET_ERROR; @@ -65,8 +66,15 @@ memset(bias_data_, 0, UP_ROUND(output_channel, C4NUM) * sizeof(float16_t)); if (in_tensors_.size() == 3 && in_tensors_.at(kBiasIndex)->shape().size() == 1 && in_tensors_.at(kBiasIndex)->DimensionSize(0) == output_channel) { - Float32ToFloat16(reinterpret_cast<float *>(in_tensors_.at(2)->MutableData()), - reinterpret_cast<float16_t *>(bias_data_), output_channel); + if (in_tensors_.at(2)->data_type() != kNumberTypeFloat16) { + MS_LOG(ERROR) << "deconv fp16 kernel require fp16 bias"; + return RET_ERROR; + } + if (bias_size != in_tensors_.at(2)->Size()) { + MS_LOG(ERROR) << "input bias size not match : " << bias_size << " vs " << in_tensors_.at(2)->Size(); + return RET_ERROR; + } + memcpy(bias_data_, in_tensors_.at(2)->data_c(), bias_size); } size_t weight_pack_size = input_channel * kernel_w * kernel_h * UP_ROUND(output_channel, C8NUM) * sizeof(float16_t); @@ -76,7 +84,11 @@ return RET_ERROR; } 
memset(execute_weight_, 0, weight_pack_size); - PackNHWCFp32ToC8HWN8Fp16(reinterpret_cast<float *>(in_tensors_.at(1)->MutableData()), execute_weight_, input_channel, + if (in_tensors_.at(1)->data_type() != kNumberTypeFloat16) { + MS_LOG(ERROR) << "deconv fp16 kernel require fp16 weight"; + return RET_ERROR; + } + PackNHWCFp16ToC8HWN8Fp16(reinterpret_cast<float16_t *>(in_tensors_.at(1)->data_c()), execute_weight_, input_channel, kernel_w * kernel_h, output_channel); return RET_OK; } diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_winograd_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_winograd_fp16.cc index c3ad9aa89a..9cbf54369b 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_winograd_fp16.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_winograd_fp16.cc @@ -341,7 +341,7 @@ int DeConvWinogradFp16CPUKernel::InitDataParam() { auto fp16_bias_data = reinterpret_cast<float16_t *>(bias_data_); if (in_tensors_.size() == 3 && in_tensors_.at(kBiasIndex)->shape().size() == 1 && in_tensors_.at(kBiasIndex)->DimensionSize(0) == conv_param_->output_channel_) { - auto src_bias = reinterpret_cast<float *>(in_tensors_.at(kBiasIndex)->MutableData()); + auto src_bias = reinterpret_cast<float16_t *>(in_tensors_.at(kBiasIndex)->MutableData()); MS_ASSERT(src_bias); for (int i = 0; i < conv_param_->output_channel_; ++i) { fp16_bias_data[i] = (float16_t)src_bias[i]; diff --git a/mindspore/lite/src/scheduler.cc b/mindspore/lite/src/scheduler.cc index 2ec491ac8b..0308c1d927 100644 --- a/mindspore/lite/src/scheduler.cc +++ b/mindspore/lite/src/scheduler.cc @@ -239,6 +239,9 @@ int CopyConstTensor(Tensor *tensor, std::map<Tensor *, Tensor *> *restored_origi return RET_ERROR; #endif } else { + if (tensor->own_data()) { + return RET_OK; + } tensor->set_data(nullptr); auto ret = tensor->MallocData(); if (RET_OK != ret) { @@ -253,8 +256,18 @@ } #endif -inline void RestoreTensorData(const std::map<Tensor *, Tensor *> &restored_origin_tensors) { - for (auto &restored_origin_tensor : restored_origin_tensors) { +inline void FreeRestoreTensors(std::map<Tensor *, Tensor *> *restored_origin_tensors) { + MS_ASSERT(restored_origin_tensors != nullptr); + for (auto &restored_origin_tensor : *restored_origin_tensors) { + restored_origin_tensor.second->set_data(nullptr); + delete (restored_origin_tensor.second); + } + restored_origin_tensors->clear(); +} + +inline void RestoreTensorData(std::map<Tensor *, Tensor *> *restored_origin_tensors) { + MS_ASSERT(restored_origin_tensors != nullptr); + for (auto &restored_origin_tensor : *restored_origin_tensors) { auto *origin_tensor = restored_origin_tensor.first; auto *restored_tensor = restored_origin_tensor.second; MS_ASSERT(origin_tensor != nullptr); @@ -264,15 +277,7 @@ inline void RestoreTensorData(const std::map<Tensor *, Tensor *> &restored_origi origin_tensor->set_data(restored_tensor->data_c()); origin_tensor->set_own_data(restored_tensor->own_data()); } -} - -inline void FreeRestoreTensors(std::map<Tensor *, Tensor *> *restored_origin_tensors) { - MS_ASSERT(restored_origin_tensors != nullptr); - for (auto &restored_origin_tensor : *restored_origin_tensors) { - restored_origin_tensor.second->set_data(nullptr); - delete (restored_origin_tensor.second); - } - restored_origin_tensors->clear(); + FreeRestoreTensors(restored_origin_tensors); } inline bool IsChannelFirst(int index, OpParameter *op_parameter) { @@ -297,54 +302,54 @@ kernel::LiteKernel *Scheduler::FindCpuKernel(const std::vector<Tensor *> &in_ten if (!KernelRegistry::GetInstance()->SupportKernel(desc)) { return nullptr; } + kernel::KernelKey cpu_desc = desc; + if 
(kernel_data_type == kNumberTypeFloat16) { + if (!context_->IsCpuFloat16Enabled() || + (cpu_desc.data_type != kNumberTypeFloat32 && cpu_desc.data_type != kNumberTypeFloat16)) { + return nullptr; + } + cpu_desc.data_type = kNumberTypeFloat16; + } std::map<Tensor *, Tensor *> restored_origin_tensors; int index = 0; for (auto &tensor : in_tensors) { auto channel_first = IsChannelFirst(index++, op_parameter); - auto *restore_tensor = DequantUtil::DequantTensor(tensor, desc.data_type, channel_first, kernel_data_type); + auto *restore_tensor = DequantUtil::DequantTensor(tensor, cpu_desc.data_type, channel_first, kernel_data_type); if (restore_tensor != nullptr) { restored_origin_tensors[tensor] = restore_tensor; } else { #ifndef SUPPORT_TRAIN - if (!IsPackedOp(op_type) && !tensor->own_data()) { // && op_type != schema::PrimitiveType_LSTM - auto ret = CopyConstTensor(tensor, &restored_origin_tensors, kernel_data_type); - if (ret != RET_OK) { - MS_LOG(DEBUG) << "CopyConstTensor failed: " << ret; - return nullptr; - } + auto ret = CopyConstTensor(tensor, &restored_origin_tensors, kernel_data_type); + if (ret != RET_OK) { + MS_LOG(DEBUG) << "CopyConstTensor failed: " << ret; + return nullptr; } #endif } } - auto *kernel = KernelRegistry::GetInstance()->GetKernel(in_tensors, out_tensors, context_, desc, op_parameter); + auto *kernel = KernelRegistry::GetInstance()->GetKernel(in_tensors, out_tensors, context_, cpu_desc, op_parameter); if (kernel != nullptr) { - MS_LOG(DEBUG) << "Get TypeId(" << kernel_data_type << ") op success: " << PrimitiveTypeName(op_type); + MS_LOG(DEBUG) << "Get TypeId(" << kernel_data_type << ") op success: " << PrimitiveCurVersionTypeName(op_type); FreeRestoreTensors(&restored_origin_tensors); } else { - RestoreTensorData(restored_origin_tensors); + RestoreTensorData(&restored_origin_tensors); } return kernel; -} +} // namespace mindspore::lite -kernel::LiteKernel *Scheduler::FindBackendKernel(const std::vector<Tensor *> &in_tensors, - const std::vector<Tensor *> &out_tensors, const Model::Node *node, - TypeId prefer_data_type) { - MS_ASSERT(node != nullptr); - bool need_dequant = node->quant_type_ == schema::QuantType_WeightQuant; - TypeId data_type = need_dequant ? 
kNumberTypeFloat32 : GetFirstFp32Fp16OrInt8Type(in_tensors); - OpParameter *op_parameter = op_parameters_[node->output_indices_.at(0)]; - if (op_parameter == nullptr) { - MS_LOG(ERROR) << "Can not find OpParameter!type: " << PrimitiveTypeName(GetPrimitiveType(node->primitive_)); - return nullptr; - } - bool infer_shape_interrupt = !op_parameter->infer_flag_; - kernel::KernelKey desc{kCPU, data_type, static_cast<schema::PrimitiveType>(op_parameter->type_)}; -#if SUPPORT_GPU +kernel::LiteKernel *Scheduler::FindGpuKernel(const std::vector<Tensor *> &in_tensors, + const std::vector<Tensor *> &out_tensors, OpParameter *op_parameter, + const kernel::KernelKey &desc) { + MS_ASSERT(op_parameter != nullptr); if (context_->IsGpuEnabled()) { // support more data type like int32 kernel::KernelKey gpu_desc{kGPU, kNumberTypeFloat32, desc.type}; - if (context_->IsGpuFloat16Enabled()) gpu_desc.data_type = kNumberTypeFloat16; - if (in_tensors.front()->data_type() == kNumberTypeInt8) gpu_desc.data_type = kNumberTypeInt8; + if (context_->IsGpuFloat16Enabled()) { + gpu_desc.data_type = kNumberTypeFloat16; + } + if (in_tensors.front()->data_type() == kNumberTypeInt8) { + gpu_desc.data_type = kNumberTypeInt8; + } // weight quant std::map<Tensor *, Tensor *> restored_origin_tensors; @@ -359,36 +364,32 @@ kernel::LiteKernel *Scheduler::FindBackendKernel(const std::vector<Tensor *> &in auto *kernel = KernelRegistry::GetInstance()->GetKernel(in_tensors, out_tensors, context_, gpu_desc, op_parameter); if (kernel != nullptr) { - MS_LOG(DEBUG) << "Get gpu op success: " << PrimitiveCurVersionTypeName(gpu_desc.type) << " " << node->name_; + MS_LOG(DEBUG) << "Get gpu op success: " << PrimitiveCurVersionTypeName(gpu_desc.type); FreeRestoreTensors(&restored_origin_tensors); - return kernel; } else { - MS_LOG(DEBUG) << "Get gpu op failed, scheduler to cpu: " << PrimitiveCurVersionTypeName(gpu_desc.type) << " " - << node->name_; - auto ret = InferNodeShape(node, &infer_shape_interrupt); - if (ret == RET_INFER_INVALID || ret == RET_OK) { - op_parameter = op_parameters_[node->output_indices_.at(0)]; - } else { - RestoreTensorData(restored_origin_tensors); - MS_LOG(ERROR) << "Try repeat infer fail: " << node->name_; - return nullptr; - } + MS_LOG(DEBUG) << "Get gpu op failed, scheduler to cpu: " << PrimitiveCurVersionTypeName(gpu_desc.type); + RestoreTensorData(&restored_origin_tensors); } + return kernel; + } else { + return nullptr; } -#endif -#if SUPPORT_NPU +} + +kernel::LiteKernel *Scheduler::FindNpuKernel(const std::vector<Tensor *> &in_tensors, + const std::vector<Tensor *> &out_tensors, OpParameter *op_parameter, + const kernel::KernelKey &desc) { + MS_ASSERT(op_parameter != nullptr); + kernel::KernelKey npu_desc{kNPU, desc.data_type, desc.type}; if (context_->IsNpuEnabled()) { - if (desc.data_type == kNumberTypeFloat16) { - desc.data_type = kNumberTypeFloat32; + if (npu_desc.data_type == kNumberTypeFloat16) { + npu_desc.data_type = kNumberTypeFloat32; } for (auto tensor : in_tensors) { if (tensor->data_type() == kNumberTypeFloat16) { tensor->set_data_type(kNumberTypeFloat32); } } - kernel::KernelKey npu_desc{kNPU, desc.data_type, desc.type}; - - // weight quant std::map<Tensor *, Tensor *> restored_origin_tensors; for (auto &tensor : in_tensors) { int index = 0; @@ -400,33 +401,72 @@ kernel::LiteKernel *Scheduler::FindBackendKernel(const std::vector<Tensor *> &in } auto *kernel = KernelRegistry::GetInstance()->GetKernel(in_tensors, out_tensors, context_, npu_desc, op_parameter); if (kernel != nullptr) { - MS_LOG(DEBUG) << "Get npu op success: " << PrimitiveCurVersionTypeName(npu_desc.type) << " " << node->name_; 
FreeRestoreTensors(&restored_origin_tensors); - return kernel; + MS_LOG(DEBUG) << "Get npu op success: " << PrimitiveCurVersionTypeName(npu_desc.type); } else { - MS_LOG(DEBUG) << "Get npu op failed, scheduler to cpu: " << PrimitiveCurVersionTypeName(npu_desc.type) << " " - << node->name_; - RestoreTensorData(restored_origin_tensors); - auto ret = InferNodeShape(node, &infer_shape_interrupt); - if (ret == RET_INFER_INVALID || ret == RET_OK) { - op_parameter = op_parameters_[node->output_indices_.at(0)]; - } else { - MS_LOG(ERROR) << "Try repeat infer fail: " << node->name_; - return nullptr; - } + RestoreTensorData(&restored_origin_tensors); + MS_LOG(DEBUG) << "Get npu op failed, scheduler to cpu: " << PrimitiveCurVersionTypeName(npu_desc.type); + } + return kernel; + } else { + return nullptr; + } +} + +kernel::LiteKernel *Scheduler::FindBackendKernel(const std::vector<Tensor *> &in_tensors, + const std::vector<Tensor *> &out_tensors, const Model::Node *node, + TypeId prefer_data_type) { + MS_ASSERT(node != nullptr); + // why we need this + TypeId data_type = + (node->quant_type_ == schema::QuantType_WeightQuant) ? kNumberTypeFloat32 : GetFirstFp32Fp16OrInt8Type(in_tensors); + OpParameter *op_parameter = op_parameters_[node->output_indices_.at(0)]; + if (op_parameter == nullptr) { + MS_LOG(ERROR) << "Can not find OpParameter!type: " << PrimitiveTypeName(GetPrimitiveType(node->primitive_)); + return nullptr; + } + bool infer_shape_interrupt = !op_parameter->infer_flag_; + kernel::KernelKey desc{kCPU, data_type, static_cast<schema::PrimitiveType>(op_parameter->type_)}; + kernel::LiteKernel *kernel = nullptr; +#ifdef SUPPORT_GPU + kernel = FindGpuKernel(in_tensors, out_tensors, op_parameter, desc); + if (kernel != nullptr) { + return kernel; + } else { + MS_LOG(DEBUG) << "Get gpu op failed, scheduler to cpu: " << PrimitiveCurVersionTypeName(desc.type) << " " + << node->name_; + auto ret = InferNodeShape(node, &infer_shape_interrupt); + if (ret == RET_INFER_INVALID || ret == RET_OK) { + op_parameter = op_parameters_[node->output_indices_.at(0)]; + } else { + MS_LOG(ERROR) << "Try repeat infer fail: " << node->name_; + return nullptr; } } #endif - if ((prefer_data_type == kNumberTypeFloat16 || prefer_data_type == kTypeUnknown) && - mindspore::lite::IsSupportFloat16() && - ((context_->IsCpuFloat16Enabled() && data_type == kNumberTypeFloat32) || data_type == kNumberTypeFloat16)) { - kernel::KernelKey fp16_cpu_desc{desc.arch, kNumberTypeFloat16, desc.type}; - auto kernel = FindCpuKernel(in_tensors, out_tensors, op_parameter, fp16_cpu_desc, kNumberTypeFloat16); +#ifdef SUPPORT_NPU + kernel = FindNpuKernel(in_tensors, out_tensors, op_parameter, desc); + if (kernel != nullptr) { + return kernel; + } else { + MS_LOG(DEBUG) << "Get npu op failed, scheduler to cpu: " << PrimitiveCurVersionTypeName(desc.type) << " " + << node->name_; + auto ret = InferNodeShape(node, &infer_shape_interrupt); + if (ret == RET_INFER_INVALID || ret == RET_OK) { + op_parameter = op_parameters_[node->output_indices_.at(0)]; + } else { + MS_LOG(ERROR) << "Try repeat infer fail: " << node->name_; + return nullptr; + } + } +#endif + if (prefer_data_type == kNumberTypeFloat16 || prefer_data_type == kTypeUnknown) { + kernel = FindCpuKernel(in_tensors, out_tensors, op_parameter, desc, kNumberTypeFloat16); if (kernel != nullptr) { return kernel; } else { - MS_LOG(DEBUG) << "Get fp16 op failed, scheduler to cpu: " << PrimitiveCurVersionTypeName(fp16_cpu_desc.type) - << " " << node->name_; + MS_LOG(DEBUG) << "Get fp16 op failed, scheduler to cpu: " << 
PrimitiveCurVersionTypeName(desc.type) << " " + << node->name_; auto ret = InferNodeShape(node, &infer_shape_interrupt); if (ret == RET_INFER_INVALID || ret == RET_OK) { op_parameter = op_parameters_[node->output_indices_.at(0)]; @@ -441,20 +481,18 @@ kernel::LiteKernel *Scheduler::FindBackendKernel(const std::vector<Tensor *> &in desc.data_type = kNumberTypeFloat32; } if (prefer_data_type == kNumberTypeFloat32 || prefer_data_type == kTypeUnknown) { - auto kernel = FindCpuKernel(in_tensors, out_tensors, op_parameter, desc, kNumberTypeFloat32); + kernel = FindCpuKernel(in_tensors, out_tensors, op_parameter, desc, kNumberTypeFloat32); if (kernel != nullptr) { return kernel; } else { auto ret = InferNodeShape(node, &infer_shape_interrupt); if (!(ret == RET_INFER_INVALID || ret == RET_OK)) { - MS_LOG(ERROR) - - << "Try repeat infer fail: " << node->name_; + MS_LOG(ERROR) << "Try repeat infer fail: " << node->name_; } } } return nullptr; -} // namespace mindspore::lite +} kernel::LiteKernel *Scheduler::SchedulePartialToKernel(const lite::Model::Node *src_node) { MS_ASSERT(src_model_ != nullptr); diff --git a/mindspore/lite/src/scheduler.h b/mindspore/lite/src/scheduler.h index f4fe469520..8c67a4d95d 100644 --- a/mindspore/lite/src/scheduler.h +++ b/mindspore/lite/src/scheduler.h @@ -61,6 +61,10 @@ class Scheduler { TypeId prefer_data_type = kTypeUnknown); kernel::LiteKernel *FindCpuKernel(const std::vector<Tensor *> &in_tensors, const std::vector<Tensor *> &out_tensors, OpParameter *op_parameter, const kernel::KernelKey &desc, TypeId kernel_data_type); + kernel::LiteKernel *FindGpuKernel(const std::vector<Tensor *> &in_tensors, const std::vector<Tensor *> &out_tensors, + OpParameter *op_parameter, const kernel::KernelKey &desc); + kernel::LiteKernel *FindNpuKernel(const std::vector<Tensor *> &in_tensors, const std::vector<Tensor *> &out_tensors, + OpParameter *op_parameter, const kernel::KernelKey &desc); // schedule a partial node to a subgraph_kernel kernel::LiteKernel *SchedulePartialToKernel(const lite::Model::Node *src_node); // schedule a node to a kernel diff --git a/mindspore/lite/tools/benchmark/benchmark.cc b/mindspore/lite/tools/benchmark/benchmark.cc index e9a74debe5..8eabdf1cb4 100644 --- a/mindspore/lite/tools/benchmark/benchmark.cc +++ b/mindspore/lite/tools/benchmark/benchmark.cc @@ -412,9 +412,7 @@ int Benchmark::MarkPerformance() { for (int i = 0; i < flags_->loop_count_; i++) { session_->BindThread(true); auto start = GetTimeUs(); - auto status = (flags_->time_profiling_ || flags_->perf_profiling_) - ? 
session_->RunGraph(before_call_back_, after_call_back_) - : session_->RunGraph(); + auto status = session_->RunGraph(before_call_back_, after_call_back_); if (status != 0) { MS_LOG(ERROR) << "Inference error " << status; std::cerr << "Inference error " << status; @@ -479,7 +477,7 @@ int Benchmark::MarkAccuracy() { std::cerr << "PrintInputData error " << status << std::endl; return status; } - status = session_->RunGraph(); + status = session_->RunGraph(before_call_back_, after_call_back_); if (status != RET_OK) { MS_LOG(ERROR) << "Inference error " << status; std::cerr << "Inference error " << status << std::endl; @@ -615,7 +613,9 @@ int Benchmark::RunBenchmark() { return ret; } } - if (model != nullptr) model->Free(); + if (model != nullptr) { + model->Free(); + } ms_inputs_ = session_->GetInputs(); auto end_prepare_time = GetTimeUs(); @@ -689,18 +689,18 @@ int Benchmark::InitTimeProfilingCallbackParameter() { // before callback before_call_back_ = [&](const std::vector<tensor::MSTensor *> &before_inputs, const std::vector<tensor::MSTensor *> &before_outputs, - const CallBackParam &callParam) { + const CallBackParam &call_param) { if (before_inputs.empty()) { MS_LOG(INFO) << "The num of beforeInputs is empty"; } if (before_outputs.empty()) { MS_LOG(INFO) << "The num of beforeOutputs is empty"; } - if (op_times_by_type_.find(callParam.node_type) == op_times_by_type_.end()) { - op_times_by_type_.insert(std::make_pair(callParam.node_type, std::make_pair(0, 0.0f))); + if (op_times_by_type_.find(call_param.node_type) == op_times_by_type_.end()) { + op_times_by_type_.insert(std::make_pair(call_param.node_type, std::make_pair(0, 0.0f))); } - if (op_times_by_name_.find(callParam.node_name) == op_times_by_name_.end()) { - op_times_by_name_.insert(std::make_pair(callParam.node_name, std::make_pair(0, 0.0f))); + if (op_times_by_name_.find(call_param.node_name) == op_times_by_name_.end()) { + op_times_by_name_.insert(std::make_pair(call_param.node_name, std::make_pair(0, 0.0f))); } op_call_times_total_++; @@ -735,6 +735,7 @@ }; return RET_OK; } + int Benchmark::InitPerfProfilingCallbackParameter() { #ifndef ENABLE_ARM64 MS_LOG(ERROR) << "Only support perf_profiling on arm64."; @@ -781,18 +782,18 @@ int Benchmark::InitPerfProfilingCallbackParameter() { // before callback before_call_back_ = [&](const std::vector<tensor::MSTensor *> &before_inputs, const std::vector<tensor::MSTensor *> &before_outputs, - const CallBackParam &callParam) { + const CallBackParam &call_param) { if (before_inputs.empty()) { MS_LOG(INFO) << "The num of beforeInputs is empty"; } if (before_outputs.empty()) { MS_LOG(INFO) << "The num of beforeOutputs is empty"; } - if (op_perf_by_type_.find(callParam.node_type) == op_perf_by_type_.end()) { - op_perf_by_type_.insert(std::make_pair(callParam.node_type, std::make_pair(0, zero))); + if (op_perf_by_type_.find(call_param.node_type) == op_perf_by_type_.end()) { + op_perf_by_type_.insert(std::make_pair(call_param.node_type, std::make_pair(0, zero))); } - if (op_perf_by_name_.find(callParam.node_name) == op_perf_by_name_.end()) { - op_perf_by_name_.insert(std::make_pair(callParam.node_name, std::make_pair(0, zero))); + if (op_perf_by_name_.find(call_param.node_name) == op_perf_by_name_.end()) { + op_perf_by_name_.insert(std::make_pair(call_param.node_name, std::make_pair(0, zero))); } op_call_times_total_++; @@ -831,12 +832,89 @@ return RET_OK; } +namespace { +template <typename T> +std::string DataToString(void *data, size_t data_number) { + if (data == nullptr) { + 
return "Data of tensor is nullptr"; + } + std::ostringstream oss; + auto casted_data = static_cast(data); + for (size_t i = 0; i < 40 && i < data_number; i++) { + oss << " " << casted_data[i]; + } + return oss.str(); +} + +std::string DumpMSTensor(tensor::MSTensor *tensor) { + if (tensor == nullptr) { + return "Tensor is nullptr"; + } + std::ostringstream oss; + oss << " DataType: " << tensor->data_type(); + oss << " Shape:"; + for (auto &dim : tensor->shape()) { + oss << " " << dim; + } + oss << std::endl << "Data:"; + switch (tensor->data_type()) { + case kNumberTypeFloat32: { + oss << DataToString(tensor->data(), tensor->ElementsNum()); + } break; + case kNumberTypeFloat16: { + oss << DataToString(tensor->data(), tensor->ElementsNum()); + } break; + case kNumberTypeInt32: { + oss << DataToString(tensor->data(), tensor->ElementsNum()); + } break; + case kNumberTypeInt16: { + oss << DataToString(tensor->data(), tensor->ElementsNum()); + } break; + case kNumberTypeInt8: { + oss << DataToString(tensor->data(), tensor->ElementsNum()); + } break; + default: + oss << "Unsupported data type to print"; + break; + } + return oss.str(); +} +} // namespace + +int Benchmark::InitDumpProfilingCallbackParameter() { + // before callback + before_call_back_ = [&](const std::vector &before_inputs, + const std::vector &before_outputs, + const CallBackParam &call_param) { return true; }; + + // after callback + after_call_back_ = [&](const std::vector &after_inputs, + const std::vector &after_outputs, + const CallBackParam &call_param) { + std::cout << "================================================================" << std::endl; + std::cout << call_param.node_name << " inputs : " << std::endl; + for (auto ms_tensor : after_inputs) { + std::cout << DumpMSTensor(ms_tensor) << std::endl; + } + std::cout << "----------------------------------------------------------------" << std::endl; + std::cout << call_param.node_name << " outputs : " << std::endl; + for (const auto ms_tensor : after_outputs) { + std::cout << DumpMSTensor(ms_tensor) << std::endl; + } + std::cout << "================================================================" << std::endl; + return true; + }; + return RET_OK; +} + int Benchmark::InitCallbackParameter() { int ret = RET_OK; if (flags_->time_profiling_) { ret = InitTimeProfilingCallbackParameter(); } else if (flags_->perf_profiling_) { ret = InitPerfProfilingCallbackParameter(); + } else if (flags_->dump_profiling_) { + ret = InitDumpProfilingCallbackParameter(); } return ret; } @@ -917,16 +995,14 @@ int Benchmark::Init() { return RET_ERROR; } - if (flags_->time_profiling_ || flags_->perf_profiling_) { - if (flags_->time_profiling_ && flags_->perf_profiling_) { - MS_LOG(INFO) << "time_profiling is enabled, will not run perf_profiling."; - } - auto status = InitCallbackParameter(); - if (status != RET_OK) { - MS_LOG(ERROR) << "Init callback Parameter failed."; - std::cerr << "Init callback Parameter failed." << std::endl; - return RET_ERROR; - } + if (flags_->time_profiling_ && flags_->perf_profiling_) { + MS_LOG(INFO) << "time_profiling is enabled, will not run perf_profiling."; + } + auto status = InitCallbackParameter(); + if (status != RET_OK) { + MS_LOG(ERROR) << "Init callback Parameter failed."; + std::cerr << "Init callback Parameter failed." 
<< std::endl; + return RET_ERROR; } return RET_OK; diff --git a/mindspore/lite/tools/benchmark/benchmark.h b/mindspore/lite/tools/benchmark/benchmark.h index c62c973d66..a33a367f5e 100644 --- a/mindspore/lite/tools/benchmark/benchmark.h +++ b/mindspore/lite/tools/benchmark/benchmark.h @@ -113,9 +113,6 @@ class MS_API BenchmarkFlags : public virtual FlagParser { int num_threads_ = 2; bool enable_fp16_ = false; int warm_up_loop_count_ = 3; - bool time_profiling_ = false; - bool perf_profiling_ = false; - std::string perf_event_ = "CYCLE"; // MarkAccuracy std::string benchmark_data_file_; std::string benchmark_data_type_ = "FLOAT"; @@ -125,6 +122,10 @@ class MS_API BenchmarkFlags : public virtual FlagParser { std::vector<std::vector<int>> resize_dims_; std::string device_ = "CPU"; + bool time_profiling_ = false; + bool perf_profiling_ = false; + std::string perf_event_ = "CYCLE"; + bool dump_profiling_ = false; }; class MS_API Benchmark { @@ -163,9 +164,13 @@ class MS_API Benchmark { int *total_size); int InitCallbackParameter(); + int InitTimeProfilingCallbackParameter(); + int InitPerfProfilingCallbackParameter(); + int InitDumpProfilingCallbackParameter(); + int PrintResult(const std::vector<std::string> &title, const std::map<std::string, std::pair<int, float>> &result); #ifdef ENABLE_ARM64 @@ -289,8 +294,8 @@ class MS_API Benchmark { std::map> op_perf_by_type_; std::map> op_perf_by_name_; #endif - KernelCallBack before_call_back_; - KernelCallBack after_call_back_; + KernelCallBack before_call_back_ = nullptr; + KernelCallBack after_call_back_ = nullptr; std::mt19937 random_engine_; }; diff --git a/mindspore/lite/tools/optimizer/graph/infershape_pass.cc b/mindspore/lite/tools/optimizer/graph/infershape_pass.cc index da8bece8b0..9d2e067bbc 100644 --- a/mindspore/lite/tools/optimizer/graph/infershape_pass.cc +++ b/mindspore/lite/tools/optimizer/graph/infershape_pass.cc @@ -193,7 +193,7 @@ STATUS InferShapePass::GetCNodeInputTensors(const CNodePtr &cnode, std::vector tensor = nullptr;