diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.cc index 3c0e46f761..af4283c60f 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.cc @@ -16,6 +16,7 @@ #include "src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.h" #include "src/runtime/kernel/arm/nnacl/fp16/pack_fp16.h" +#include "src/runtime/kernel/arm/nnacl/fp16/cast_fp16.h" #include "schema/model_generated.h" #include "src/kernel_registry.h" #include "include/errorcode.h" @@ -177,10 +178,22 @@ int ConvolutionDepthwiseFp16CPUKernel::Run() { } auto input_tensor = in_tensors_.at(kInputIndex); - auto input_addr = reinterpret_cast<float *>(input_tensor->Data()); + float16_t *input_addr; + if (input_tensor->data_type() == kNumberTypeFloat32) { + input_addr = + reinterpret_cast<float16_t *>(context_->allocator->Malloc(input_tensor->ElementsNum() * sizeof(float16_t))); + if (input_addr == nullptr) { + MS_LOG(ERROR) << "Malloc buffer failed."; + return RET_ERROR; + } + Float32ToFloat16(reinterpret_cast<float *>(input_tensor->Data()), input_addr, input_tensor->ElementsNum()); + } else { + input_addr = reinterpret_cast<float16_t *>(input_tensor->Data()); + } + // pack input: to nhwc8 - PackNHWCFp32ToNHWC8Fp16(input_addr, packed_input_, conv_param_->input_batch_, - conv_param_->input_h_ * conv_param_->input_w_, conv_param_->input_channel_); + PackNHWCToNHWC8Fp16(input_addr, packed_input_, conv_param_->input_batch_, + conv_param_->input_h_ * conv_param_->input_w_, conv_param_->input_channel_); ret = LiteBackendParallelLaunch(ConvDwFp16Run, this, conv_param_->thread_num_); if (ret != RET_OK) { @@ -188,10 +201,13 @@ int ConvolutionDepthwiseFp16CPUKernel::Run() { return RET_ERROR; } - auto output_addr = reinterpret_cast<float *>(out_tensors_.at(kOutputIndex)->Data()); - PackNHWC8Fp16ToNHWCFp32(packed_output_, output_addr, conv_param_->output_batch_, - 
conv_param_->output_h_ * conv_param_->output_w_, conv_param_->output_channel_); + auto output_addr = reinterpret_cast<float16_t *>(out_tensors_.at(kOutputIndex)->Data()); + PackNHWC8ToNHWCFp16(packed_output_, output_addr, conv_param_->output_batch_, + conv_param_->output_h_ * conv_param_->output_w_, conv_param_->output_channel_); + if (input_tensor->data_type() == kNumberTypeFloat32) { + context_->allocator->Free(input_addr); + } return RET_OK; } diff --git a/mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/pack_fp16.c b/mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/pack_fp16.c index 0bbd701d0e..2e3532aaf4 100644 --- a/mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/pack_fp16.c +++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/pack_fp16.c @@ -334,31 +334,57 @@ void PackNCHWFp32ToNC8HW8Fp16(float *src, float16_t *dst, int batch, int plane, } void PackNHWCFp32ToNHWC8Fp16(float *src, float16_t *dst, int batch, int plane, int channel) { - int c8 = UP_DIV(channel, C8NUM); - int nhwc8_batch_unit_offset = c8 * C8NUM * plane; - int nhwc8_batch_offset = 0; + int c8_channel = UP_DIV(channel, C8NUM) * C8NUM; for (int b = 0; b < batch; b++) { - int batch_offset = b * channel * plane; + float16_t *dst_batch = dst + b * plane * c8_channel; + float *src_batch = src + b * plane * channel; for (int i = 0; i < plane; i++) { + float16_t *dst_plane = dst_batch + i * c8_channel; + float *src_plane = src_batch + i * channel; for (int c = 0; c < channel; c++) { - (dst + nhwc8_batch_offset + i * c8 * C8NUM)[c] = (float16_t)(src + batch_offset + i * channel)[c]; + dst_plane[c] = (float16_t)(src_plane[c]); } } - nhwc8_batch_offset += nhwc8_batch_unit_offset; } } void PackNHWC8Fp16ToNHWCFp32(float16_t *src, float *dst, int batch, int plane, int channel) { - int c8 = UP_DIV(channel, C8NUM); - int nhwc_batch_unit_offset = channel * plane; - int nhwc_batch_offset = 0; + int c8_channel = UP_DIV(channel, C8NUM) * C8NUM; for (int b = 0; b < batch; b++) { - int batch_offset = b * c8 * C8NUM * plane; + 
float16_t *src_batch = src + b * plane * c8_channel; + float *dst_batch = dst + b * plane * channel; for (int i = 0; i < plane; i++) { + float16_t *src_plane = src_batch + i * c8_channel; + float *dst_plane = dst_batch + i * channel; for (int c = 0; c < channel; c++) { - (dst + nhwc_batch_offset + i * channel)[c] = (float)(src + batch_offset + i * c8 * C8NUM)[c]; + dst_plane[c] = (float16_t)(src_plane[c]); } } - nhwc_batch_offset += nhwc_batch_unit_offset; + } } + +void PackNHWCToNHWC8Fp16(float16_t *src, float16_t *dst, int batch, int plane, int channel) { + int c8_channel = UP_DIV(channel, C8NUM) * C8NUM; + for (int b = 0; b < batch; b++) { + float16_t *dst_batch = dst + b * plane * c8_channel; + float16_t *src_batch = src + b * plane * channel; + for (int i = 0; i < plane; i++) { + float16_t *dst_plane = dst_batch + i * c8_channel; + float16_t *src_plane = src_batch + i * channel; + memcpy(dst_plane, src_plane, channel * sizeof(float16_t)); + } + } +} + +void PackNHWC8ToNHWCFp16(float16_t *src, float16_t *dst, int batch, int plane, int channel) { + int c8_channel = UP_DIV(channel, C8NUM) * C8NUM; + for (int b = 0; b < batch; b++) { + float16_t *src_batch = src + b * plane * c8_channel; + float16_t *dst_batch = dst + b * plane * channel; + for (int i = 0; i < plane; i++) { + float16_t *src_plane = src_batch + i * c8_channel; + float16_t *dst_plane = dst_batch + i * channel; + memcpy(dst_plane, src_plane, channel * sizeof(float16_t)); + } } } diff --git a/mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/pack_fp16.h b/mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/pack_fp16.h index 349f97b29b..188d9a0465 100644 --- a/mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/pack_fp16.h +++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/pack_fp16.h @@ -58,6 +58,10 @@ void PackNCHWFp32ToNC8HW8Fp16(float *src, float16_t *dst, int batch, int plane, void PackNHWCFp32ToNHWC8Fp16(float *src, float16_t *dst, int batch, int plane, int channel); void 
PackNHWC8Fp16ToNHWCFp32(float16_t *src, float *dst, int batch, int plane, int channel); + +void PackNHWCToNHWC8Fp16(float16_t *src, float16_t *dst, int batch, int plane, int channel); + +void PackNHWC8ToNHWCFp16(float16_t *src, float16_t *dst, int batch, int plane, int channel); #ifdef __cplusplus } #endif