optimize init func

4 years ago · 66219788cb
parent b9c996484e
commit 66219788cb
19 changed files with 217 additions and 219 deletions
--- a/mindspore/lite/nnacl/fp16/pack_fp16.c
+++ b/mindspore/lite/nnacl/fp16/pack_fp16.c
@ -256,6 +256,7 @@ void PackNHWCToNHWC4Fp16(const void *src, void *dst, int batch, int plane, int c

 void PackNHWCToNHWC8Fp16(const void *src, void *dst, int batch, int plane, int channel) {
  int ic8 = UP_DIV(channel, C8NUM);
+  int c8_channel = ic8 * C8NUM;
  int nhwc8_batch_unit_offset = ic8 * C8NUM * plane;
  int ic_remainder_ = channel % C8NUM;
  if (ic_remainder_ != 0) {
@ -263,8 +264,11 @@ void PackNHWCToNHWC8Fp16(const void *src, void *dst, int batch, int plane, int c
    for (int b = 0; b < batch; b++) {
      int batch_offset = b * channel * plane;
      for (int i = 0; i < plane; i++) {
-        memcpy((float16_t *)dst + nhwc8_batch_offset + i * ic8 * C8NUM, (float16_t *)src + batch_offset + i * channel,
-               channel * sizeof(float16_t));
+        float16_t *dst_per_plane = (float16_t *)dst + nhwc8_batch_offset + i * c8_channel;
+        memcpy(dst_per_plane, (float16_t *)src + batch_offset + i * channel, channel * sizeof(float16_t));
+        for (int j = channel; j < c8_channel; ++j) {
+          dst_per_plane[j] = 0;
+        }
      }
      nhwc8_batch_offset += nhwc8_batch_unit_offset;
    }
--- a/mindspore/lite/nnacl/pack.c
+++ b/mindspore/lite/nnacl/pack.c
@ -641,21 +641,38 @@ void PackInputToC8Int8(const int8_t *input_data, int16_t *packed_input, ConvPara
  int in_h = conv_param->input_h_;
  int in_w = conv_param->input_w_;
  int ic8 = UP_DIV(in_channel, C8NUM);
+  int ic8_minus = ic8 - 1;

  for (int b = 0; b < in_batch; b++) {
    int src_batch_offset = b * in_channel * in_h * in_w;
    int dst_batch_offset = b * ic8 * C8NUM * in_h * in_w;
-    for (int c = 0; c < in_channel; c++) {
-      int ic8_block = c / C8NUM;
-      int ic8_res = c % C8NUM;
-      int src_c_offset = src_batch_offset + c;
-      int dst_c_offset = dst_batch_offset + ic8_block * C8NUM * in_h * in_w + ic8_res;
-      for (int k = 0; k < in_w * in_h; k++) {
-        int src_plane_offset = src_c_offset + k * in_channel;
-        int dst_plane_offset = dst_c_offset + k * C8NUM;
-        (packed_input + dst_plane_offset)[0] = (int16_t)(input_data + src_plane_offset)[0];
-      }
-    }
+    for (int k = 0; k < in_w * in_h; k++) {
+      int src_plane_offset = src_batch_offset + k * in_channel;
+      int dst_plane_offset = dst_batch_offset + k * C8NUM;
+      for (int i = 0; i < ic8_minus; ++i) {
+        int src_c_offset = src_plane_offset + i * C8NUM;
+        int dst_c_offset = dst_plane_offset + i * C8NUM * in_h * in_w;
+#ifdef ENABLE_ARM
+        vst1q_s16(packed_input + dst_c_offset, vmovl_s8(vld1_s8(input_data + src_c_offset)));
+#else
+        for (int j = 0; j < C8NUM; ++j) {
+          (packed_input + dst_c_offset)[j] = (int16_t)(input_data + src_c_offset)[j];
+        }
+#endif
+      }  // ic8_minus loop
+      int tmp_ic = ic8_minus * C8NUM;
+      int res_c = in_channel - tmp_ic;
+      int tmp_ic_offset = tmp_ic * in_h * in_w;
+      for (int l = 0; l < res_c; ++l) {
+        int src_c_offset = src_plane_offset + tmp_ic + l;
+        int dst_c_offset = dst_plane_offset + tmp_ic_offset + l;
+        (packed_input + dst_c_offset)[l] = (int16_t)(input_data + src_c_offset)[l];
+      }  // res ic loop
+      for (int l = res_c; l < C8NUM; ++l) {
+        int dst_c_offset = dst_plane_offset + tmp_ic_offset + l;
+        (packed_input + dst_c_offset)[l] = 0;
+      }  // res ic loop
+    }    // kh * kw loop
  }
 }

@ -692,17 +709,30 @@ void PackWeightToC8Int8(const int8_t *origin_weight_data, int16_t *packed_weight

 void PackNHWCToNC4HW4Fp32(const void *src, void *dst, int batch, int plane, int channel) {
  int c4 = UP_DIV(channel, C4NUM);
+  int c4_minus = c4 - 1;
  for (int b = 0; b < batch; b++) {
    int src_oc_offset = b * plane * channel;
    int dst_oc_offset = b * plane * c4 * C4NUM;
    for (int k = 0; k < plane; k++) {
      int src_kernel_offset = src_oc_offset + k * channel;
      int dst_kernel_offset = dst_oc_offset + k * C4NUM;
-      for (int i = 0; i < channel; i++) {
-        int c4_block_num = i / C4NUM;
-        int c4_block_rem = i % C4NUM;
-        int src_ic_offset = src_kernel_offset + i;
-        int dst_ic_offset = dst_kernel_offset + c4_block_num * plane * C4NUM + c4_block_rem;
+      for (int j = 0; j < c4_minus; ++j) {
+        int src_ic_offset = src_kernel_offset + j * C4NUM;
+        int dst_ic_offset = dst_kernel_offset + j * plane * C4NUM;
+#ifdef ENABLE_ARM
+        vst1q_f32((float *)dst + dst_ic_offset, vld1q_f32((float *)src + src_ic_offset));
+#else
+        for (int i = 0; i < C4NUM; ++i) {
+          ((float *)dst + dst_ic_offset)[i] = ((float *)src + src_ic_offset)[i];
+        }
+#endif
+      }
+      int tmp_c = c4_minus * C4NUM;
+      int tmp_c_offset = tmp_c * plane;
+      int res_c = channel - tmp_c;
+      for (int l = 0; l < res_c; ++l) {
+        int src_ic_offset = src_kernel_offset + tmp_c + l;
+        int dst_ic_offset = dst_kernel_offset + tmp_c_offset + l;
        ((float *)dst + dst_ic_offset)[0] = ((float *)src + src_ic_offset)[0];
      }
    }
@ -956,6 +986,7 @@ void PackNHWCToC8HWN8Fp32(const void *src, void *dst, int batch, int plane, int

 void PackNHWCToNHWC4Int8(const void *src, void *dst, int batch, int plane, int channel) {
  int c4 = UP_DIV(channel, C4NUM);
+  int c4_channel = c4 * C4NUM;
  int nhwc4_batch_unit_offset = c4 * C4NUM * plane;
  int ic_remainder_ = channel % C4NUM;
  if (ic_remainder_ != 0) {
@ -963,8 +994,11 @@ void PackNHWCToNHWC4Int8(const void *src, void *dst, int batch, int plane, int c
    for (int b = 0; b < batch; b++) {
      int batch_offset = b * channel * plane;
      for (int i = 0; i < plane; i++) {
-        memcpy((int8_t *)dst + nhwc4_batch_offset + i * c4 * C4NUM, (int8_t *)src + batch_offset + i * channel,
-               channel);
+        int8_t *dst_per_plane = (int8_t *)dst + nhwc4_batch_offset + i * c4_channel;
+        memcpy(dst_per_plane, (int8_t *)src + batch_offset + i * channel, channel);
+        for (int j = channel; j < c4_channel; ++j) {
+          dst_per_plane[j] = 0;
+        }
      }
      nhwc4_batch_offset += nhwc4_batch_unit_offset;
    }
--- a/mindspore/lite/nnacl/winograd_transform.c
+++ b/mindspore/lite/nnacl/winograd_transform.c
@ -379,50 +379,50 @@ void Conv3x3Fp32FilterTransform(float *weight_data, float *trans_weight, int iC4
      float32x4_t dst01 = g01;
      float32x4_t dst02 = g02;

-      float32x4_t dst10 = vaddq_f32(vmulq_n_f32(g00, 0.5), vmulq_n_f32(g10, 0.5));
-      dst10 = vaddq_f32(dst10, vmulq_n_f32(g20, 0.5));
-      float32x4_t dst11 = vaddq_f32(vmulq_n_f32(g01, 0.5), vmulq_n_f32(g11, 0.5));
-      dst11 = vaddq_f32(dst11, vmulq_n_f32(g21, 0.5));
-      float32x4_t dst12 = vaddq_f32(vmulq_n_f32(g02, 0.5), vmulq_n_f32(g12, 0.5));
-      dst12 = vaddq_f32(dst12, vmulq_n_f32(g22, 0.5));
-
-      float32x4_t dst20 = vsubq_f32(vmulq_n_f32(g00, 0.5), vmulq_n_f32(g10, 0.5));
-      dst20 = vaddq_f32(dst20, vmulq_n_f32(g20, 0.5));
-      float32x4_t dst21 = vsubq_f32(vmulq_n_f32(g01, 0.5), vmulq_n_f32(g11, 0.5));
-      dst21 = vaddq_f32(dst21, vmulq_n_f32(g21, 0.5));
-      float32x4_t dst22 = vsubq_f32(vmulq_n_f32(g02, 0.5), vmulq_n_f32(g12, 0.5));
-      dst22 = vaddq_f32(dst22, vmulq_n_f32(g22, 0.5));
+      float32x4_t dst10 = vaddq_f32(vaddq_f32(g00, g10), g20);
+      dst10 = vmulq_n_f32(dst10, 0.5);
+      float32x4_t dst11 = vaddq_f32(vaddq_f32(g01, g11), g21);
+      dst11 = vmulq_n_f32(dst11, 0.5);
+      float32x4_t dst12 = vaddq_f32(vaddq_f32(g02, g12), g22);
+      dst12 = vmulq_n_f32(dst12, 0.5);
+
+      float32x4_t dst20 = vaddq_f32(vsubq_f32(g00, g10), g20);
+      dst20 = vmulq_n_f32(dst20, 0.5);
+      float32x4_t dst21 = vaddq_f32(vsubq_f32(g01, g11), g21);
+      dst21 = vmulq_n_f32(dst21, 0.5);
+      float32x4_t dst22 = vaddq_f32(vsubq_f32(g02, g12), g22);
+      dst22 = vmulq_n_f32(dst22, 0.5);

      float32x4_t dst30 = g20;
      float32x4_t dst31 = g21;
      float32x4_t dst32 = g22;

      float32x4_t m00 = dst00;
-      float32x4_t m01 = vaddq_f32(vmulq_n_f32(dst00, 0.5), vmulq_n_f32(dst01, 0.5));
-      m01 = vaddq_f32(m01, vmulq_n_f32(dst02, 0.5));
-      float32x4_t m02 = vsubq_f32(vmulq_n_f32(dst00, 0.5), vmulq_n_f32(dst01, 0.5));
-      m02 = vaddq_f32(m02, vmulq_n_f32(dst02, 0.5));
+      float32x4_t m01 = vaddq_f32(vaddq_f32(dst00, dst01), dst02);
+      m01 = vmulq_n_f32(m01, 0.5);
+      float32x4_t m02 = vaddq_f32(vsubq_f32(dst00, dst01), dst02);
+      m02 = vmulq_n_f32(m02, 0.5);
      float32x4_t m03 = dst02;

      float32x4_t m10 = dst10;
-      float32x4_t m11 = vaddq_f32(vmulq_n_f32(dst10, 0.5), vmulq_n_f32(dst11, 0.5));
-      m11 = vaddq_f32(m11, vmulq_n_f32(dst12, 0.5));
-      float32x4_t m12 = vsubq_f32(vmulq_n_f32(dst10, 0.5), vmulq_n_f32(dst11, 0.5));
-      m12 = vaddq_f32(m12, vmulq_n_f32(dst12, 0.5));
+      float32x4_t m11 = vaddq_f32(vaddq_f32(dst10, dst11), dst12);
+      m11 = vmulq_n_f32(m11, 0.5);
+      float32x4_t m12 = vaddq_f32(vsubq_f32(dst10, dst11), dst12);
+      m12 = vmulq_n_f32(m12, 0.5);
      float32x4_t m13 = dst12;

      float32x4_t m20 = dst20;
-      float32x4_t m21 = vaddq_f32(vmulq_n_f32(dst20, 0.5), vmulq_n_f32(dst21, 0.5));
-      m21 = vaddq_f32(m21, vmulq_n_f32(dst22, 0.5));
-      float32x4_t m22 = vsubq_f32(vmulq_n_f32(dst20, 0.5), vmulq_n_f32(dst21, 0.5));
-      m22 = vaddq_f32(m22, vmulq_n_f32(dst22, 0.5));
+      float32x4_t m21 = vaddq_f32(vaddq_f32(dst20, dst21), dst22);
+      m21 = vmulq_n_f32(m21, 0.5);
+      float32x4_t m22 = vaddq_f32(vsubq_f32(dst20, dst21), dst22);
+      m22 = vmulq_n_f32(m22, 0.5);
      float32x4_t m23 = dst22;

      float32x4_t m30 = dst30;
-      float32x4_t m31 = vaddq_f32(vmulq_n_f32(dst30, 0.5), vmulq_n_f32(dst31, 0.5));
-      m31 = vaddq_f32(m31, vmulq_n_f32(dst32, 0.5));
-      float32x4_t m32 = vsubq_f32(vmulq_n_f32(dst30, 0.5), vmulq_n_f32(dst31, 0.5));
-      m32 = vaddq_f32(m32, vmulq_n_f32(dst32, 0.5));
+      float32x4_t m31 = vaddq_f32(vaddq_f32(dst30, dst31), dst32);
+      m31 = vmulq_n_f32(m31, 0.5);
+      float32x4_t m32 = vaddq_f32(vsubq_f32(dst30, dst31), dst32);
+      m32 = vmulq_n_f32(m32, 0.5);
      float32x4_t m33 = dst32;

      dst_ic4_ptr[0] = m00[0];
--- a/mindspore/lite/src/runtime/kernel/arm/base/convolution_base.h
+++ b/mindspore/lite/src/runtime/kernel/arm/base/convolution_base.h
@ -68,7 +68,7 @@ class ConvolutionBaseCPUKernel : public LiteKernel {
  int thread_count_;
  ConvParameter *conv_param_;
  ConvQuantArg *conv_quant_arg_;
-  LayoutConvertor convert_func_;
+  LayoutConvertor convert_func_ = nullptr;
 };
 }  // namespace mindspore::kernel

--- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_3x3_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_3x3_fp16.cc
@ -98,8 +98,16 @@ int Convolution3x3FP16CPUKernel::InitTmpBuffer() {
  int iC8 = UP_DIV(conv_param_->input_channel_, C8NUM);
  MS_ASSERT(ctx_->allocator != nullptr);

+  size_t nhwc8_input_size =
+    iC8 * C8NUM * conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * sizeof(float16_t);
+  nhwc4_input_ = ctx_->allocator->Malloc(nhwc8_input_size);
+  if (nhwc4_input_ == nullptr) {
+    MS_LOG(ERROR) << "malloc nhwc4_input_ failed.";
+    return RET_ERROR;
+  }
+
  size_t tile_buffer_size = thread_count_ * tile_num * k_plane * iC8 * C8NUM * sizeof(float16_t);
-  tile_buffer_ = reinterpret_cast<float16_t *>(malloc(tile_buffer_size));
+  tile_buffer_ = reinterpret_cast<float16_t *>(ctx_->allocator->Malloc(tile_buffer_size));
  if (tile_buffer_ == nullptr) {
    MS_LOG(ERROR) << "malloc tile_buffer_ failed.";
    return RET_ERROR;
@ -160,27 +168,11 @@ int Convolution3x3FP16CPUKernel::ReSize() {
    return ret;
  }

-  if (nhwc4_input_ != nullptr) {
-    free(nhwc4_input_);
-    nhwc4_input_ = nullptr;
-  }
-
  ret = ConvolutionBaseCPUKernel::Init();
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "ConvolutionBase init failed.";
    return ret;
  }
-
-  int iC8 = UP_DIV(conv_param_->input_channel_, C8NUM);
-  size_t nhwc8_input_size =
-    iC8 * C8NUM * conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * sizeof(float16_t);
-  nhwc4_input_ = malloc(nhwc8_input_size);
-  if (nhwc4_input_ == nullptr) {
-    MS_LOG(ERROR) << "malloc nhwc4_input_ failed.";
-    return RET_ERROR;
-  }
-  memset(nhwc4_input_, 0, nhwc8_input_size);
-
  return RET_OK;
 }

--- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_3x3_fp16.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_3x3_fp16.h
@ -52,6 +52,10 @@ class Convolution3x3FP16CPUKernel : public ConvolutionBaseFP16CPUKernel {

 private:
  void FreeTmpBuffer() {
+    if (nhwc4_input_ != nullptr) {
+      ctx_->allocator->Free(nhwc4_input_);
+      nhwc4_input_ = nullptr;
+    }
    if (tile_buffer_ != nullptr) {
      ctx_->allocator->Free(tile_buffer_);
      tile_buffer_ = nullptr;
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_sw_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_sw_fp16.cc
@ -105,6 +105,15 @@ int ConvolutionSWFP16CPUKernel::InitTmpBuffer() {
  int out_channel = conv_param_->output_channel_;
  int oc4 = UP_DIV(out_channel, C4NUM);

+  int ic4 = UP_DIV(conv_param_->input_channel_, C4NUM);
+  size_t nhwc4_input_size =
+    ic4 * C4NUM * conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * sizeof(float16_t);
+  nhwc4_input_ = ctx_->allocator->Malloc(nhwc4_input_size);
+  if (nhwc4_input_ == nullptr) {
+    MS_LOG(ERROR) << "malloc nhwc4_input_ failed.";
+    return RET_ERROR;
+  }
+
  tmp_output_block_ = reinterpret_cast<float16_t *>(ctx_->allocator->Malloc(
    conv_param_->output_batch_ * conv_param_->output_h_ * conv_param_->output_w_ * oc4 * C4NUM * sizeof(float16_t)));
  if (tmp_output_block_ == nullptr) {
@ -145,10 +154,6 @@ int ConvolutionSWFP16CPUKernel::ReSize() {
    return ret;
  }

-  if (nhwc4_input_ != nullptr) {
-    free(nhwc4_input_);
-    nhwc4_input_ = nullptr;
-  }
  if (slidingWindow_param_ != nullptr) {
    delete slidingWindow_param_;
    slidingWindow_param_ = nullptr;
@ -160,16 +165,6 @@ int ConvolutionSWFP16CPUKernel::ReSize() {
    return ret;
  }

-  int ic4 = UP_DIV(conv_param_->input_channel_, C4NUM);
-  size_t nhwc4_input_size =
-    ic4 * C4NUM * conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * sizeof(float16_t);
-  nhwc4_input_ = malloc(nhwc4_input_size);
-  if (nhwc4_input_ == nullptr) {
-    MS_LOG(ERROR) << "malloc nhwc4_input_ failed.";
-    return RET_ERROR;
-  }
-  memset(nhwc4_input_, 0, nhwc4_input_size);
-
  // init sliding window param
  slidingWindow_param_ = new (std::nothrow) SlidingWindowParam;
  if (slidingWindow_param_ == nullptr) {
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_sw_fp16.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_sw_fp16.h
@ -54,6 +54,10 @@ class ConvolutionSWFP16CPUKernel : public ConvolutionBaseFP16CPUKernel {

 private:
  void FreeTmpBuffer() {
+    if (nhwc4_input_ != nullptr) {
+      ctx_->allocator->Free(nhwc4_input_);
+      nhwc4_input_ = nullptr;
+    }
    if (tmp_output_block_ != nullptr) {
      ctx_->allocator->Free(tmp_output_block_);
      tmp_output_block_ = nullptr;
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.cc
@ -236,6 +236,14 @@ int ConvolutionWinogradFP16CPUKernel::InitTmpBuffer() {
  int oc8 = UP_DIV(channel_out, C8NUM);
  int ic8 = UP_DIV(conv_param_->input_channel_, C8NUM);

+  size_t nhwc8_input_size =
+    ic8 * C8NUM * conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * sizeof(float16_t);
+  nhwc4_input_ = ctx_->allocator->Malloc(nhwc8_input_size);
+  if (nhwc4_input_ == nullptr) {
+    MS_LOG(ERROR) << "malloc nhwc4_input_ failed.";
+    return RET_ERROR;
+  }
+
  size_t tile_buffer_size = thread_count_ * cal_num * input_unit_ * input_unit_ * ic8 * C8NUM * sizeof(float16_t);
  trans_input_ = reinterpret_cast<float16_t *>(ctx_->allocator->Malloc(tile_buffer_size));
  if (trans_input_ == nullptr) {
@ -303,11 +311,6 @@ int ConvolutionWinogradFP16CPUKernel::ReSize() {
    return ret;
  }

-  if (nhwc4_input_ != nullptr) {
-    free(nhwc4_input_);
-    nhwc4_input_ = nullptr;
-  }
-
  ret = ConvolutionBaseCPUKernel::Init();
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "ConvolutionBase init failed.";
@ -318,17 +321,6 @@ int ConvolutionWinogradFP16CPUKernel::ReSize() {
  conv_param_->input_unit_ = input_unit_;
  conv_param_->output_unit_ = output_unit_;

-  int channel_in = conv_param_->input_channel_;
-  int ic8 = UP_DIV(channel_in, C8NUM);
-  size_t nhwc8_input_size =
-    ic8 * C8NUM * conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * sizeof(float16_t);
-  nhwc4_input_ = malloc(nhwc8_input_size);
-  if (nhwc4_input_ == nullptr) {
-    MS_LOG(ERROR) << "malloc nhwc4_input_ failed.";
-    return RET_ERROR;
-  }
-  memset(nhwc4_input_, 0, nhwc8_input_size);
-
  ret = ConfigInputOutput();
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "ConfigInputOutput failed.";
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.h
@ -59,6 +59,10 @@ class ConvolutionWinogradFP16CPUKernel : public ConvolutionBaseFP16CPUKernel {

 private:
  void FreeTmpBuffer() {
+    if (nhwc4_input_ != nullptr) {
+      ctx_->allocator->Free(nhwc4_input_);
+      nhwc4_input_ = nullptr;
+    }
    if (trans_input_ != nullptr) {
      ctx_->allocator->Free(trans_input_);
      trans_input_ = nullptr;
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_3x3.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_3x3.cc
@ -100,6 +100,14 @@ int Convolution3x3CPUKernel::InitTmpBuffer() {
 #else
  int tile_num = 12;
 #endif
+  size_t nhwc4_input_size =
+    ic4 * C4NUM * conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * sizeof(float);
+  nhwc4_input_ = ctx_->allocator->Malloc(nhwc4_input_size);
+  if (nhwc4_input_ == nullptr) {
+    MS_LOG(ERROR) << "malloc nhwc4_input_ failed.";
+    return RET_ERROR;
+  }
+
  size_t tile_buffer_size = thread_count_ * tile_num * C16NUM * ic4 * C4NUM * sizeof(float);
  tile_buffer_ = reinterpret_cast<float *>(ctx_->allocator->Malloc(tile_buffer_size));
  if (tile_buffer_ == nullptr) {
@ -174,27 +182,11 @@ int Convolution3x3CPUKernel::ReSize() {
    return ret;
  }

-  if (nhwc4_input_ != nullptr) {
-    free(nhwc4_input_);
-    nhwc4_input_ = nullptr;
-  }
-
  ret = ConvolutionBaseCPUKernel::Init();
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "ConvolutionBase init failed.ret: " << ret;
    return RET_ERROR;
  }
-
-  int iC4 = UP_DIV(conv_param_->input_channel_, C4NUM);
-  size_t nhwc4_input_size =
-    iC4 * C4NUM * conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * sizeof(float);
-  nhwc4_input_ = malloc(nhwc4_input_size);
-  if (nhwc4_input_ == nullptr) {
-    MS_LOG(ERROR) << "malloc nhwc4_input_ failed.";
-    return RET_ERROR;
-  }
-  memset(nhwc4_input_, 0, nhwc4_input_size);
-
  return RET_OK;
 }

--- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_3x3.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_3x3.h
@ -45,6 +45,10 @@ class Convolution3x3CPUKernel : public ConvolutionBaseCPUKernel {

 private:
  void FreeTmpBuffer() {
+    if (nhwc4_input_ != nullptr) {
+      ctx_->allocator->Free(nhwc4_input_);
+      nhwc4_input_ = nullptr;
+    }
    if (tile_buffer_ != nullptr) {
      ctx_->allocator->Free(tile_buffer_);
      tile_buffer_ = nullptr;
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_slidewindow.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_slidewindow.cc
@ -79,6 +79,14 @@ int ConvolutionSWCPUKernel::InitTmpBuffer() {
  int out_channel = conv_param_->output_channel_;
  int oc4 = UP_DIV(out_channel, C4NUM);
  MS_ASSERT(ctx_->allocator != nullptr);
+  int ic4 = UP_DIV(conv_param_->input_channel_, C4NUM);
+  size_t nhwc4_input_size =
+    ic4 * C4NUM * conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * sizeof(float);
+  nhwc4_input_ = ctx_->allocator->Malloc(nhwc4_input_size);
+  if (nhwc4_input_ == nullptr) {
+    MS_LOG(ERROR) << "malloc nhwc4 input failed.";
+    return RET_ERROR;
+  }

  tmp_output_block_ = reinterpret_cast<float *>(ctx_->allocator->Malloc(
    conv_param_->output_batch_ * conv_param_->output_h_ * conv_param_->output_w_ * oc4 * C4NUM * sizeof(float)));
@ -116,10 +124,6 @@ int ConvolutionSWCPUKernel::ReSize() {
    return ret;
  }

-  if (nhwc4_input_ != nullptr) {
-    free(nhwc4_input_);
-    nhwc4_input_ = nullptr;
-  }
  if (slidingWindow_param_ != nullptr) {
    delete slidingWindow_param_;
    slidingWindow_param_ = nullptr;
@ -131,16 +135,6 @@ int ConvolutionSWCPUKernel::ReSize() {
    return RET_ERROR;
  }

-  int ic4 = UP_DIV(conv_param_->input_channel_, C4NUM);
-  size_t nhwc4_input_size =
-    ic4 * C4NUM * conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * sizeof(float);
-  nhwc4_input_ = malloc(nhwc4_input_size);
-  if (nhwc4_input_ == nullptr) {
-    MS_LOG(ERROR) << "malloc nhwc4 input failed.";
-    return RET_ERROR;
-  }
-  memset(nhwc4_input_, 0, nhwc4_input_size);
-
  // init sliding window param
  slidingWindow_param_ = new (std::nothrow) SlidingWindowParam;
  if (slidingWindow_param_ == nullptr) {
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_slidewindow.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_slidewindow.h
@ -53,6 +53,10 @@ class ConvolutionSWCPUKernel : public ConvolutionBaseCPUKernel {

 private:
  void FreeTmpBuffer() {
+    if (nhwc4_input_ != nullptr) {
+      ctx_->allocator->Free(nhwc4_input_);
+      nhwc4_input_ = nullptr;
+    }
    if (tmp_output_block_ != nullptr) {
      ctx_->allocator->Free(tmp_output_block_);
      tmp_output_block_ = nullptr;
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd.cc
@ -157,6 +157,14 @@ int ConvolutionWinogradCPUKernel::InitTmpBuffer() {
 #endif
  MS_ASSERT(ctx_->allocator != nullptr);

+  size_t nhwc4_input_size =
+    ic4 * C4NUM * conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * sizeof(float);
+  nhwc4_input_ = ctx_->allocator->Malloc(nhwc4_input_size);
+  if (nhwc4_input_ == nullptr) {
+    MS_LOG(ERROR) << "malloc nhwc4_input_ failed.";
+    return RET_ERROR;
+  }
+
  size_t tile_buffer_size = thread_count_ * tile_num * input_unit_ * input_unit_ * ic4 * C4NUM * sizeof(float);
  trans_input_ = reinterpret_cast<float *>(ctx_->allocator->Malloc(tile_buffer_size));
  if (trans_input_ == nullptr) {
@ -249,11 +257,6 @@ int ConvolutionWinogradCPUKernel::ReSize() {
    return ret;
  }

-  if (nhwc4_input_ != nullptr) {
-    free(nhwc4_input_);
-    nhwc4_input_ = nullptr;
-  }
-
  ret = ConvolutionBaseCPUKernel::Init();
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "ConvolutionBase init failed.";
@ -265,16 +268,6 @@ int ConvolutionWinogradCPUKernel::ReSize() {
  conv_param_->input_unit_ = input_unit_;
  conv_param_->output_unit_ = output_unit_;

-  int ic4 = UP_DIV(conv_param_->input_channel_, C4NUM);
-  size_t nhwc4_input_size =
-    ic4 * C4NUM * conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * sizeof(float);
-  nhwc4_input_ = malloc(nhwc4_input_size);
-  if (nhwc4_input_ == nullptr) {
-    MS_LOG(ERROR) << "malloc nhwc4_input_ failed.";
-    return RET_ERROR;
-  }
-  memset(nhwc4_input_, 0, nhwc4_input_size);
-
  ret = ConfigInputOutput();
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "ConfigInputOutput failed.";
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd.h
@ -50,6 +50,10 @@ class ConvolutionWinogradCPUKernel : public ConvolutionBaseCPUKernel {

 private:
  void FreeTmpBuffer() {
+    if (nhwc4_input_ != nullptr) {
+      ctx_->allocator->Free(nhwc4_input_);
+      nhwc4_input_ = nullptr;
+    }
    if (trans_input_ != nullptr) {
      ctx_->allocator->Free(trans_input_);
      trans_input_ = nullptr;
--- a/mindspore/lite/src/runtime/kernel/arm/int8/convolution_3x3_int8.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/int8/convolution_3x3_int8.cc
@ -44,6 +44,10 @@ void ProcessFilterUint8(int8_t *origin_weight, int16_t *dst_weight, ConvParamete
 }

 void Convolution3x3Int8CPUKernel::FreeTmpBuffer() {
+  if (input_data_ != nullptr) {
+    ctx_->allocator->Free(input_data_);
+    input_data_ = nullptr;
+  }
  if (tile_buffer_ != nullptr) {
    ctx_->allocator->Free(tile_buffer_);
    tile_buffer_ = nullptr;
@ -67,10 +71,6 @@ Convolution3x3Int8CPUKernel::~Convolution3x3Int8CPUKernel() {
    free(transformed_filter_addr_);
    transformed_filter_addr_ = nullptr;
  }
-  if (input_data_ != nullptr) {
-    free(input_data_);
-    input_data_ = nullptr;
-  }
  FreeQuantParam();
 }

@ -118,6 +118,14 @@ int Convolution3x3Int8CPUKernel::InitTmpBuffer() {
  int ic8 = UP_DIV(conv_param_->input_channel_, C8NUM);
  MS_ASSERT(ctx_->allocator != nullptr);

+  size_t c8_input_size =
+    conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * ic8 * C8NUM * sizeof(int16_t);
+  input_data_ = reinterpret_cast<int16_t *>(ctx_->allocator->Malloc(c8_input_size));
+  if (input_data_ == nullptr) {
+    MS_LOG(ERROR) << "malloc input_data_ failed.";
+    return RET_ERROR;
+  }
+
  size_t tile_buffer_size = thread_count_ * TILE_NUM * C16NUM * ic8 * C8NUM * sizeof(int16_t);
  tile_buffer_ = reinterpret_cast<int16_t *>(ctx_->allocator->Malloc(tile_buffer_size));
  if (tile_buffer_ == nullptr) {
@ -179,27 +187,11 @@ int Convolution3x3Int8CPUKernel::ReSize() {
    return ret;
  }

-  if (input_data_ != nullptr) {
-    free(input_data_);
-    input_data_ = nullptr;
-  }
-
  ret = ConvolutionBaseCPUKernel::Init();
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "ConvolutionBase init failed.";
    return RET_ERROR;
  }
-
-  int ic8 = UP_DIV(conv_param_->input_channel_, C8NUM);
-  size_t c8_input_size =
-    conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * ic8 * C8NUM * sizeof(int16_t);
-  input_data_ = reinterpret_cast<int16_t *>(malloc(c8_input_size));
-  if (input_data_ == nullptr) {
-    MS_LOG(ERROR) << "malloc input_data_ failed.";
-    return RET_ERROR;
-  }
-  memset(input_data_, 0, c8_input_size);
-
  return RET_OK;
 }

--- a/mindspore/lite/src/runtime/kernel/arm/int8/convolution_int8.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/int8/convolution_int8.cc
@ -132,6 +132,26 @@ int ConvolutionInt8CPUKernel::InitWeightBias() {
 int ConvolutionInt8CPUKernel::InitTmpBuffer() {
  MS_ASSERT(ctx_->allocator != nullptr);

+  int ic4 = UP_DIV(conv_param_->input_channel_, C4NUM);
+  int output_count = conv_param_->output_h_ * conv_param_->output_w_;
+  int output_tile_count = UP_DIV(output_count, tile_num_);
+  int kernel_plane = conv_param_->kernel_h_ * conv_param_->kernel_w_;
+  int plane_c4 = UP_DIV(kernel_plane, C4NUM);
+  int unit_size = plane_c4 * C4NUM * ic4 * C4NUM;
+  int packed_input_size = output_tile_count * tile_num_ * unit_size;
+  packed_input_ = reinterpret_cast<int8_t *>(ctx_->allocator->Malloc(conv_param_->input_batch_ * packed_input_size));
+  if (packed_input_ == nullptr) {
+    MS_LOG(ERROR) << "malloc packed_input_ failed.";
+    return RET_ERROR;
+  }
+
+  size_t nhwc4_input_size = ic4 * C4NUM * conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_;
+  nhwc4_input_ = ctx_->allocator->Malloc(nhwc4_input_size);
+  if (nhwc4_input_ == nullptr) {
+    MS_LOG(ERROR) << "malloc nhwc4 input failed.";
+    return RET_ERROR;
+  }
+
  size_t tmp_dst_size = thread_count_ * tile_num_ * conv_param_->output_channel_ * sizeof(int32_t);
  tmp_dst_ = reinterpret_cast<int32_t *>(ctx_->allocator->Malloc(tmp_dst_size));
  if (tmp_dst_ == nullptr) {
@ -219,6 +239,14 @@ int ConvolutionInt8CPUKernel::InitWeightBiasOpt() {
 int ConvolutionInt8CPUKernel::InitTmpBufferOpt() {
  MS_ASSERT(ctx_->allocator != nullptr);

+  int ic4 = UP_DIV(conv_param_->input_channel_, C4NUM);
+  size_t nhwc4_input_size = ic4 * C4NUM * conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_;
+  nhwc4_input_ = ctx_->allocator->Malloc(nhwc4_input_size);
+  if (nhwc4_input_ == nullptr) {
+    MS_LOG(ERROR) << "malloc nhwc4 input failed.";
+    return RET_ERROR;
+  }
+
  size_t tmp_dst_size = thread_count_ * tile_num_ * conv_param_->output_channel_ * sizeof(int32_t);
  tmp_dst_ = reinterpret_cast<int32_t *>(ctx_->allocator->Malloc(tmp_dst_size));
  if (tmp_dst_ == nullptr) {
@ -238,12 +266,6 @@ int ConvolutionInt8CPUKernel::InitTmpBufferOpt() {
 void ConvolutionInt8CPUKernel::ConfigInputOutput() {
  auto output_tensor = out_tensors_.at(kOutputIndex);
  output_tensor->SetFormat(schema::Format::Format_NHWC);
-  auto input_tensor = in_tensors_.at(kInputIndex);
-  auto ret = CheckLayout(input_tensor);
-  if (ret != RET_OK) {
-    MS_LOG(ERROR) << "Check layout failed.";
-    return;
-  }
 }

 int ConvolutionInt8CPUKernel::Init() {
@ -283,43 +305,11 @@ int ConvolutionInt8CPUKernel::ReSize() {
    return ret;
  }

-  if (nhwc4_input_ != nullptr) {
-    free(nhwc4_input_);
-    nhwc4_input_ = nullptr;
-  }
-  if (packed_input_ != nullptr) {
-    free(packed_input_);
-    packed_input_ = nullptr;
-  }
-
  ret = ConvolutionBaseCPUKernel::Init();
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "ConvolutionBase init failed.";
    return RET_ERROR;
  }
-
-  int ic4 = UP_DIV(conv_param_->input_channel_, C4NUM);
-  size_t nhwc4_input_size = ic4 * C4NUM * conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_;
-  nhwc4_input_ = malloc(nhwc4_input_size);
-  if (nhwc4_input_ == nullptr) {
-    MS_LOG(ERROR) << "malloc nhwc4 input failed.";
-    return RET_ERROR;
-  }
-  memset(nhwc4_input_, 0, nhwc4_input_size);
-
-  int output_count = conv_param_->output_h_ * conv_param_->output_w_;
-  int output_tile_count = UP_DIV(output_count, tile_num_);
-  int kernel_plane = conv_param_->kernel_h_ * conv_param_->kernel_w_;
-  int plane_c4 = UP_DIV(kernel_plane, C4NUM);
-  int unit_size = plane_c4 * C4NUM * ic4 * C4NUM;
-  int packed_input_size = output_tile_count * tile_num_ * unit_size;
-  packed_input_ = reinterpret_cast<int8_t *>(malloc(conv_param_->input_batch_ * packed_input_size));
-  if (packed_input_ == nullptr) {
-    MS_LOG(ERROR) << "malloc packed_input_ failed.";
-    return RET_ERROR;
-  }
-  memset(packed_input_, 0, conv_param_->input_batch_ * packed_input_size);
-
  return RET_OK;
 }

@ -353,25 +343,17 @@ int ConvolutionInt8CPUKernel::Run() {
    MS_LOG(ERROR) << "Prepare failed.";
    return RET_ERROR;
  }
-  if (support_optimize_) {
-    ret = InitTmpBufferOpt();
-    if (ret != RET_OK) {
-      MS_LOG(ERROR) << "Init tmp buffer failed.";
-      return RET_ERROR;
-    }
-  } else {
-    // init tmp input, output
-    ret = InitTmpBuffer();
-    if (ret != RET_OK) {
-      MS_LOG(ERROR) << "Init tmp buffer failed.";
-      return RET_ERROR;
-    }
+  // init tmp input, output
+  ret = InitTmpBuffer();
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "Init tmp buffer failed.";
+    return RET_ERROR;
  }

  auto input_tensor = in_tensors_.at(kInputIndex);
  auto ori_input_data = input_tensor->MutableData();
-  convert_func_(ori_input_data, nhwc4_input_, conv_param_->input_batch_, conv_param_->input_h_ * conv_param_->input_w_,
-                conv_param_->input_channel_);
+  PackNHWCToNHWC4Int8(ori_input_data, nhwc4_input_, conv_param_->input_batch_,
+                      conv_param_->input_h_ * conv_param_->input_w_, conv_param_->input_channel_);

  int error_code = ParallelLaunch(THREAD_POOL_DEFAULT, ConvolutionInt8Impl, this, thread_count_);
  if (error_code != RET_OK) {
--- a/mindspore/lite/src/runtime/kernel/arm/int8/convolution_int8.h
+++ b/mindspore/lite/src/runtime/kernel/arm/int8/convolution_int8.h
@ -36,10 +36,6 @@ class ConvolutionInt8CPUKernel : public ConvolutionBaseCPUKernel {
      free(packed_weight_);
      packed_weight_ = nullptr;
    }
-    if (packed_input_ != nullptr) {
-      free(packed_input_);
-      packed_input_ = nullptr;
-    }
    if (input_sum_ != nullptr) {
      free(input_sum_);
      input_sum_ = nullptr;
@ -59,6 +55,14 @@ class ConvolutionInt8CPUKernel : public ConvolutionBaseCPUKernel {

 private:
  void FreeTmpBuffer() {
+    if (nhwc4_input_ != nullptr) {
+      ctx_->allocator->Free(nhwc4_input_);
+      nhwc4_input_ = nullptr;
+    }
+    if (packed_input_ != nullptr) {
+      ctx_->allocator->Free(packed_input_);
+      packed_input_ = nullptr;
+    }
    if (tmp_dst_ != nullptr) {
      ctx_->allocator->Free(tmp_dst_);
      tmp_dst_ = nullptr;