!3751 add lite cpu op: conv_depthwise fp16, deconv_depthwise fp16

Merge pull request !3751 from yangruoqi713/lite
5 years ago · 9257fadf5f
parent 08d8c1031d bf8d3f153f
commit 9257fadf5f
17 changed files with 905 additions and 24 deletions
--- a/mindspore/lite/schema/model.fbs
+++ b/mindspore/lite/schema/model.fbs
@ -172,7 +172,8 @@ union PrimitiveType {
    TupleGetItem,
    Div,
    Where,
-    OneHot
+    OneHot,
+    Lstm
 }

 enum QuantType: int {
--- a/mindspore/lite/schema/ops.fbs
+++ b/mindspore/lite/schema/ops.fbs
@ -718,3 +718,7 @@ table Where{
 table OneHot {
    axis: int;
 }
+
+table Lstm{
+    bidirection: bool = false;  // schema default is false when the field is not set
+}
--- a/mindspore/lite/src/runtime/kernel/arm/base/convolution_base.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/base/convolution_base.cc
@ -25,6 +25,8 @@
 #ifdef ENABLE_FP16
 #include "src/runtime/kernel/arm/fp16/convolution_fp16.h"
 #include "src/runtime/kernel/arm/fp16/convolution_3x3_fp16.h"
+#include "src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.h"
+#include "src/runtime/kernel/arm/fp16/deconvolution_depthwise_fp16.h"
 #endif
 #include "src/runtime/kernel/arm/int8/deconvolution_int8.h"
 #include "src/runtime/kernel/arm/int8/convolution_int8.h"
@ -347,6 +349,19 @@ kernel::LiteKernel *CpuConvDwFp32KernelCreator(const std::vector<lite::tensor::T
  return kernel;
 }

+#ifdef ENABLE_FP16
+kernel::LiteKernel *CpuConvDwFp16KernelCreator(const std::vector<lite::tensor::Tensor *> &inputs,
+                                               const std::vector<lite::tensor::Tensor *> &outputs,
+                                               OpParameter *opParameter, const Context *ctx) {
+  // Allocate the fp16 depthwise-convolution kernel; nothrow new reports failure as nullptr.
+  auto *dw_kernel = new (std::nothrow) ConvolutionDepthwiseFp16CPUKernel(opParameter, inputs, outputs, ctx);
+  if (dw_kernel != nullptr) {
+    return dw_kernel;
+  }
+  MS_LOG(ERROR) << "kernel is nullptr.";
+  return nullptr;
+}
+#endif
+
 kernel::LiteKernel *CpuConvDwInt8KernelCreator(const std::vector<lite::tensor::Tensor *> &inputs,
                                               const std::vector<lite::tensor::Tensor *> &outputs,
                                               OpParameter *opParameter, const Context *ctx) {
@ -372,12 +387,12 @@ kernel::LiteKernel *CpuConvDwKernelCreator(const std::vector<lite::tensor::Tenso
      break;
    case kNumberTypeUInt8:
      break;
-#ifdef ENABLE_FP16
-    case kNumberTypeFloat16:
-      break;
-#endif
    case kNumberTypeFloat32:
+#ifdef ENABLE_FP16
+      kernel = CpuConvDwFp16KernelCreator(inputs, outputs, opParameter, ctx);
+#else
      kernel = CpuConvDwFp32KernelCreator(inputs, outputs, opParameter, ctx);
+#endif
      break;
    default:
      break;
@ -407,6 +422,19 @@ kernel::LiteKernel *CpuDeconvDwFp32KernelCreator(const std::vector<lite::tensor:
  return kernel;
 }

+#ifdef ENABLE_FP16
+kernel::LiteKernel *CpuDeconvDwFp16KernelCreator(const std::vector<lite::tensor::Tensor *> &inputs,
+                                                 const std::vector<lite::tensor::Tensor *> &outputs,
+                                                 OpParameter *opParameter, const lite::Context *ctx) {
+  // Allocate the fp16 depthwise-deconvolution kernel; nothrow new reports failure as nullptr.
+  auto *deconv_dw_kernel = new (std::nothrow) DeconvolutionDepthwiseFp16CPUKernel(opParameter, inputs, outputs, ctx);
+  if (deconv_dw_kernel != nullptr) {
+    return deconv_dw_kernel;
+  }
+  MS_LOG(ERROR) << "kernel is nullptr.";
+  return nullptr;
+}
+#endif
+
 kernel::LiteKernel *CpuDeconvDwInt8KernelCreator(const std::vector<lite::tensor::Tensor *> &inputs,
                                                 const std::vector<lite::tensor::Tensor *> &outputs,
                                                 OpParameter *opParameter, const lite::Context *ctx) {
@ -432,7 +460,11 @@ kernel::LiteKernel *CpuDeconvDwKernelCreator(const std::vector<lite::tensor::Ten
      kernel = CpuDeconvDwInt8KernelCreator(inputs, outputs, opParameter, ctx);
      break;
    case kNumberTypeFloat32:
+#ifdef ENABLE_FP16
+      kernel = CpuDeconvDwFp16KernelCreator(inputs, outputs, opParameter, ctx);
+#else
      kernel = CpuDeconvDwFp32KernelCreator(inputs, outputs, opParameter, ctx);
+#endif
      break;
    default:
      break;
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.cc
@ -0,0 +1,164 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.h"
+#include "schema/model_generated.h"
+#include "src/kernel_registry.h"
+#include "include/errorcode.h"
+#include "src/runtime/runtime_api.h"
+
+using mindspore::kernel::KERNEL_ARCH::kCPU;
+using mindspore::lite::KernelRegistrar;
+using mindspore::lite::RET_ERROR;
+using mindspore::lite::RET_OK;
+using mindspore::schema::PrimitiveType_DepthwiseConv2D;
+
+namespace mindspore::kernel {
+int ConvolutionDepthwiseFp16CPUKernel::InitBuffer() {
+  // Both staging buffers hold C8NUM * UP_DIV(input_channel_, C8NUM) channels per pixel.
+  const int channel_blocks = UP_DIV(conv_param_->input_channel_, C8NUM);
+
+  // Zero-filled fp16 staging buffer for the packed input.
+  const int input_elems =
+    conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * C8NUM * channel_blocks;
+  const size_t input_bytes = input_elems * sizeof(float16_t);
+  packed_input_ = reinterpret_cast<float16_t *>(malloc(input_bytes));
+  if (!packed_input_) {
+    MS_LOG(ERROR) << "Malloc buffer failed.";
+    return RET_ERROR;
+  }
+  memset(packed_input_, 0, input_bytes);
+
+  // Zero-filled fp16 staging buffer for the packed output.
+  const int output_elems =
+    conv_param_->output_batch_ * conv_param_->output_h_ * conv_param_->output_w_ * C8NUM * channel_blocks;
+  const size_t output_bytes = output_elems * sizeof(float16_t);
+  packed_output_ = reinterpret_cast<float16_t *>(malloc(output_bytes));
+  if (!packed_output_) {
+    MS_LOG(ERROR) << "Malloc buffer failed.";
+    return RET_ERROR;
+  }
+  memset(packed_output_, 0, output_bytes);
+  return RET_OK;
+}
+
+int ConvolutionDepthwiseFp16CPUKernel::InitWeightBias() {
+  // Weight layout: o, h, w, i with o == group and i == 1.
+  const int oc_blocks = UP_DIV(conv_param_->output_channel_, C8NUM);
+  auto *fp32_weight = reinterpret_cast<float *>(inputs_[kWeightIndex]->Data());
+  const int weight_elems = C8NUM * oc_blocks * conv_param_->kernel_h_ * conv_param_->kernel_w_;
+  const size_t weight_bytes = weight_elems * sizeof(float16_t);
+
+  // Convert the fp32 weights into a zero-padded fp16 NC8HW8 buffer.
+  packed_weight_ = reinterpret_cast<float16_t *>(malloc(weight_bytes));
+  if (!packed_weight_) {
+    MS_LOG(ERROR) << "Malloc buffer failed.";
+    return RET_ERROR;
+  }
+  memset(packed_weight_, 0, weight_bytes);
+  const int kernel_plane = conv_param_->kernel_h_ * conv_param_->kernel_w_;
+  PackNCHWFp32ToNC8HW8Fp16(fp32_weight, packed_weight_, 1, kernel_plane, conv_param_->output_channel_);
+
+  // Bias: zero-filled fp16 buffer with C8NUM * oc_blocks entries.
+  const size_t bias_bytes = C8NUM * oc_blocks * sizeof(float16_t);
+  bias_data_ = reinterpret_cast<float16_t *>(malloc(bias_bytes));
+  if (bias_data_ == nullptr) {
+    MS_LOG(ERROR) << "Malloc buffer failed.";
+    return RET_ERROR;
+  }
+  memset(bias_data_, 0, bias_bytes);
+  // The fp32 bias is only converted when the input count equals kInputSize2; otherwise the bias stays zero.
+  if (inputs_.size() == kInputSize2) {
+    auto *fp16_bias = reinterpret_cast<float16_t *>(bias_data_);
+    auto *fp32_bias = reinterpret_cast<float *>(inputs_.at(kBiasIndex)->Data());
+    for (int ch = 0; ch < conv_param_->output_channel_; ++ch) {
+      fp16_bias[ch] = (float16_t)fp32_bias[ch];
+    }
+  }
+
+  // Never use more tasks than there are C8 output-channel blocks.
+  conv_param_->thread_num_ = MSMIN(thread_count_, oc_blocks);
+  return RET_OK;
+}
+
+int ConvolutionDepthwiseFp16CPUKernel::Init() {
+  // Run the shared convolution base initialization and stop if it reports a failure,
+  // rather than continuing with a partially initialized conv_param_.
+  auto ret = ConvolutionBaseCPUKernel::Init();
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "Convolution depthwise fp16 base Init failed.";
+    return RET_ERROR;
+  }
+
+  // init sliding_ window param (C8NUM channel blocking)
+  sliding_ = new SlidingWindowParam;
+  InitSlidingParam(sliding_, conv_param_, C8NUM);
+
+  // Pack weights/bias to fp16 first, then allocate the input/output staging buffers.
+  ret = InitWeightBias();
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "Convolution depthwise fp16 InitWeightBias failed.";
+    return RET_ERROR;
+  }
+
+  ret = InitBuffer();
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "Convolution depthwise fp16 InitBuffer failed.";
+    return RET_ERROR;
+  }
+  return RET_OK;
+}
+
+int ConvolutionDepthwiseFp16CPUKernel::ReSize() {
+  // Drop the old staging buffers. The pointers are cleared right away because the
+  // destructor frees them again: a stale value would be double-freed if the
+  // re-allocation in InitBuffer() fails part-way.
+  free(packed_input_);
+  packed_input_ = nullptr;
+  free(packed_output_);
+  packed_output_ = nullptr;
+
+  // Refresh conv_param_ and the sliding-window description for the new shapes.
+  auto ret = ConvolutionBaseCPUKernel::Init();
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "Convolution depthwise fp16 base Init failed.";
+    return RET_ERROR;
+  }
+  InitSlidingParam(sliding_, conv_param_, C8NUM);
+
+  ret = InitBuffer();
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "Convolution depthwise fp16 InitBuffer failed.";
+    return RET_ERROR;
+  }
+  return RET_OK;
+}
+
+int ConvolutionDepthwiseFp16CPUKernel::Execute(int task_id) {
+  // bias_data_ is filled with fp16 values by InitWeightBias().
+  auto *bias_fp16 = reinterpret_cast<float16_t *>(bias_data_);
+  ConvDwC8Fp16(packed_output_, packed_input_, packed_weight_, bias_fp16, conv_param_, sliding_, task_id);
+  return RET_OK;
+}
+
+int ConvDwFp16Run(int task_id, LiteParallelGroupEnv *penv, void *cdata) {
+  // cdata is the kernel instance that Run() hands to LiteBackendParallelLaunch.
+  auto *dw_kernel = reinterpret_cast<ConvolutionDepthwiseFp16CPUKernel *>(cdata);
+  const auto status = dw_kernel->Execute(task_id);
+  if (status == RET_OK) {
+    return RET_OK;
+  }
+  MS_LOG(ERROR) << "ConvolutionDepthwiseFp16Run error task_id[" << task_id << "] error_code[" << status << "]";
+  return RET_ERROR;
+}
+
+int ConvolutionDepthwiseFp16CPUKernel::Run() {
+  if (conv_param_->input_channel_ != conv_param_->output_channel_) {
+    MS_LOG(ERROR) << "Only support input channel equals output channel.";
+    return RET_ERROR;
+  }
+
+  // Step 1: convert the fp32 NHWC input into the fp16 NHWC8 staging buffer.
+  auto *fp32_input = reinterpret_cast<float *>(inputs_.at(kInputIndex)->Data());
+  const int input_plane = conv_param_->input_h_ * conv_param_->input_w_;
+  PackNHWCFp32ToNHWC8Fp16(fp32_input, packed_input_, conv_param_->input_batch_, input_plane,
+                          conv_param_->input_channel_);
+
+  // Step 2: run the depthwise kernel as thread_num_ parallel tasks.
+  const auto launch_ret = LiteBackendParallelLaunch(ConvDwFp16Run, this, conv_param_->thread_num_);
+  if (launch_ret != RET_OK) {
+    MS_LOG(ERROR) << "ConvDwFp16Run error: error_code[" << launch_ret << "]";
+    return RET_ERROR;
+  }
+
+  // Step 3: convert the fp16 NHWC8 result back into the fp32 NHWC output tensor.
+  auto *fp32_output = reinterpret_cast<float *>(outputs_.at(kOutputIndex)->Data());
+  const int output_plane = conv_param_->output_h_ * conv_param_->output_w_;
+  PackNHWC8Fp16ToNHWCFp32(packed_output_, fp32_output, conv_param_->output_batch_, output_plane,
+                          conv_param_->output_channel_);
+
+  return RET_OK;
+}
+}  // namespace mindspore::kernel
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.h
@ -0,0 +1,54 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_CONVOLUTION_DEPTHWISE_FP16_H_
+#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_CONVOLUTION_DEPTHWISE_FP16_H_
+
+#include <vector>
+#include "src/lite_kernel.h"
+#include "src/runtime/kernel/arm/base/convolution_base.h"
+#include "src/runtime/kernel/arm/opclib/fp16/conv_depthwise_fp16.h"
+
+namespace mindspore::kernel {
+// CPU kernel for depthwise convolution that computes in fp16 on C8-packed staging buffers.
+// Owns the sliding-window description and the packed weight/input/output buffers.
+class ConvolutionDepthwiseFp16CPUKernel : public ConvolutionBaseCPUKernel {
+ public:
+  ConvolutionDepthwiseFp16CPUKernel(OpParameter *parameter, const std::vector<lite::tensor::Tensor *> &inputs,
+                                    const std::vector<lite::tensor::Tensor *> &outputs, const Context *ctx)
+      : ConvolutionBaseCPUKernel(parameter, inputs, outputs, ctx) {}
+  ~ConvolutionDepthwiseFp16CPUKernel() override {
+    // Every pointer starts as nullptr, so this is safe even when Init() never ran
+    // or failed before all buffers were allocated (delete/free accept nullptr).
+    delete sliding_;
+    free(packed_weight_);
+    free(packed_input_);
+    free(packed_output_);
+  }
+
+  int Init() override;
+  int ReSize() override;
+  int Run() override;
+
+  // Allocates the zero-filled packed input/output staging buffers.
+  int InitBuffer();
+  // Packs the fp32 weights and bias into fp16 buffers and sets thread_num_.
+  int InitWeightBias();
+  // Runs the depthwise kernel for one parallel task.
+  int Execute(int task_id);
+
+ private:
+  SlidingWindowParam *sliding_ = nullptr;
+  float16_t *packed_weight_ = nullptr;
+  float16_t *packed_input_ = nullptr;
+  float16_t *packed_output_ = nullptr;
+};
+}  // namespace mindspore::kernel
+
+#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_CONVOLUTION_DEPTHWISE_FP16_H_
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_depthwise_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_depthwise_fp16.cc
@ -0,0 +1,174 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "src/runtime/kernel/arm/fp16/deconvolution_depthwise_fp16.h"
+#include "schema/model_generated.h"
+#include "src/kernel_registry.h"
+#include "include/errorcode.h"
+#include "src/runtime/runtime_api.h"
+
+using mindspore::kernel::KERNEL_ARCH::kCPU;
+using mindspore::lite::KernelRegistrar;
+using mindspore::lite::RET_ERROR;
+using mindspore::lite::RET_OK;
+using mindspore::schema::PrimitiveType_DepthwiseConv2D;
+
+namespace mindspore::kernel {
+int DeconvolutionDepthwiseFp16CPUKernel::InitSlideParam() {
+  // Note the swap: conv_param_'s "input" dims are read from the first output tensor
+  // and its "output" dims from the first input tensor.
+  const auto &out_shape = outputs_.front()->shape();
+  const auto &in_shape = inputs_.front()->shape();
+
+  conv_param_->input_batch_ = out_shape.at(kNHWC_N);
+  conv_param_->input_h_ = out_shape.at(kNHWC_H);
+  conv_param_->input_w_ = out_shape.at(kNHWC_W);
+  conv_param_->input_channel_ = out_shape.at(kNHWC_C);
+
+  conv_param_->output_batch_ = in_shape.at(kNHWC_N);
+  conv_param_->output_h_ = in_shape.at(kNHWC_H);
+  conv_param_->output_w_ = in_shape.at(kNHWC_W);
+  conv_param_->output_channel_ = in_shape.at(kNHWC_C);
+
+  // Derive the sliding-window description from the swapped dims (C8NUM channel blocking).
+  InitSlidingParam(sliding_, conv_param_, C8NUM);
+  return RET_OK;
+}
+
+int DeconvolutionDepthwiseFp16CPUKernel::InitBuffer() {
+  // Both staging buffers hold C8NUM * UP_DIV(input_channel_, C8NUM) channels per pixel.
+  const int channel_blocks = UP_DIV(conv_param_->input_channel_, C8NUM);
+
+  // Zero-filled fp16 staging buffer for the packed input.
+  const int in_elems =
+    conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * C8NUM * channel_blocks;
+  const size_t in_bytes = in_elems * sizeof(float16_t);
+  packed_input_ = reinterpret_cast<float16_t *>(malloc(in_bytes));
+  if (!packed_input_) {
+    MS_LOG(ERROR) << "Malloc buffer failed.";
+    return RET_ERROR;
+  }
+  memset(packed_input_, 0, in_bytes);
+
+  // Zero-filled fp16 staging buffer for the packed output.
+  const int out_elems =
+    conv_param_->output_batch_ * conv_param_->output_h_ * conv_param_->output_w_ * C8NUM * channel_blocks;
+  const size_t out_bytes = out_elems * sizeof(float16_t);
+  packed_output_ = reinterpret_cast<float16_t *>(malloc(out_bytes));
+  if (!packed_output_) {
+    MS_LOG(ERROR) << "Malloc buffer failed.";
+    return RET_ERROR;
+  }
+  memset(packed_output_, 0, out_bytes);
+  return RET_OK;
+}
+
+int DeconvolutionDepthwiseFp16CPUKernel::InitWeightBias() {
+  // init weight: o, h, w, i; o == group, i == 1
+  int OC8 = UP_DIV(conv_param_->output_channel_, C8NUM);
+  auto weight_tensor = inputs_[kWeightIndex];
+  auto origin_weight = reinterpret_cast<float *>(weight_tensor->Data());
+  int pack_weight_size = C8NUM * OC8 * conv_param_->kernel_h_ * conv_param_->kernel_w_;
+
+  // Convert the fp32 weights into a zero-padded fp16 NC8HW8 buffer.
+  packed_weight_ = reinterpret_cast<float16_t *>(malloc(pack_weight_size * sizeof(float16_t)));
+  if (packed_weight_ == nullptr) {
+    MS_LOG(ERROR) << "Malloc buffer failed.";
+    return RET_ERROR;
+  }
+  memset(packed_weight_, 0, pack_weight_size * sizeof(float16_t));
+  PackNCHWFp32ToNC8HW8Fp16(origin_weight, packed_weight_, 1, conv_param_->kernel_h_ * conv_param_->kernel_w_,
+                           conv_param_->output_channel_);
+
+  // init bias: zero-filled fp16 buffer with C8NUM * OC8 entries
+  bias_data_ = reinterpret_cast<float16_t *>(malloc(C8NUM * OC8 * sizeof(float16_t)));
+  if (bias_data_ == nullptr) {
+    MS_LOG(ERROR) << "Malloc buffer failed.";
+    return RET_ERROR;
+  }
+  memset(bias_data_, 0, C8NUM * OC8 * sizeof(float16_t));
+  if (inputs_.size() == kInputSize2) {
+    auto ori_bias = reinterpret_cast<float *>(inputs_.at(kBiasIndex)->Data());
+    // The buffer was sized in float16_t elements, so it must be indexed as float16_t.
+    // Indexing it as float would store wider elements, run past the end of the
+    // allocation and leave fp32 bit patterns where the fp16 kernel expects fp16.
+    auto bias_fp16 = reinterpret_cast<float16_t *>(bias_data_);
+    for (int i = 0; i < conv_param_->output_channel_; i++) {
+      bias_fp16[i] = (float16_t)ori_bias[i];
+    }
+  }
+
+  // Never use more tasks than there are C8 output-channel blocks.
+  conv_param_->thread_num_ = MSMIN(thread_count_, OC8);
+  return RET_OK;
+}
+
+int DeconvolutionDepthwiseFp16CPUKernel::Init() {
+  // The sliding-window description is allocated once here and reused by ReSize().
+  sliding_ = new SlidingWindowParam;
+  auto ret = InitSlideParam();
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "Deconvolution depthwise fp16 InitSlideParam failed.";
+    return RET_ERROR;
+  }
+
+  // conv base init; stop if it reports a failure rather than silently continuing
+  ret = ConvolutionBaseCPUKernel::Init();
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "Deconvolution depthwise fp16 base Init failed.";
+    return RET_ERROR;
+  }
+
+  // Pack weights/bias to fp16 first, then allocate the input/output staging buffers.
+  ret = InitWeightBias();
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "Deconvolution depthwise fp16 InitWeightBias failed.";
+    return RET_ERROR;
+  }
+
+  ret = InitBuffer();
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "Deconvolution depthwise fp16 InitBuffer failed.";
+    return RET_ERROR;
+  }
+  return RET_OK;
+}
+
+int DeconvolutionDepthwiseFp16CPUKernel::ReSize() {
+  // Drop the old staging buffers and clear the pointers immediately, so no stale
+  // (already freed) address is left behind if InitBuffer() fails part-way.
+  free(packed_input_);
+  packed_input_ = nullptr;
+  free(packed_output_);
+  packed_output_ = nullptr;
+
+  // Refresh the sliding-window description and conv_param_ for the new shapes.
+  auto ret = InitSlideParam();
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "Deconvolution depthwise fp16 InitSlideParam failed.";
+    return RET_ERROR;
+  }
+  ret = ConvolutionBaseCPUKernel::Init();
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "Deconvolution depthwise fp16 base Init failed.";
+    return RET_ERROR;
+  }
+
+  ret = InitBuffer();
+  if (ret != RET_OK) {
+    // Message names the deconvolution kernel, matching the one logged by Init().
+    MS_LOG(ERROR) << "Deconvolution depthwise fp16 InitBuffer failed.";
+    return RET_ERROR;
+  }
+  return RET_OK;
+}
+
+int DeconvolutionDepthwiseFp16CPUKernel::Execute(int task_id) {
+  // Hand the packed fp16 buffers and the bias buffer (viewed as fp16) to the kernel for this task.
+  auto *bias_fp16 = reinterpret_cast<float16_t *>(bias_data_);
+  DeconvDwC8Fp16(packed_output_, packed_input_, packed_weight_, bias_fp16, conv_param_, sliding_, task_id);
+  return RET_OK;
+}
+
+int DeconvDwFp16Run(int task_id, LiteParallelGroupEnv *penv, void *cdata) {
+  // cdata is the kernel instance that Run() hands to LiteBackendParallelLaunch.
+  auto *deconv_kernel = reinterpret_cast<DeconvolutionDepthwiseFp16CPUKernel *>(cdata);
+  const auto status = deconv_kernel->Execute(task_id);
+  if (status == RET_OK) {
+    return RET_OK;
+  }
+  MS_LOG(ERROR) << "DeconvolutionDepthwiseFp16Run error task_id[" << task_id << "] error_code[" << status << "]";
+  return RET_ERROR;
+}
+
+int DeconvolutionDepthwiseFp16CPUKernel::Run() {
+  if (conv_param_->input_channel_ != conv_param_->output_channel_) {
+    MS_LOG(ERROR) << "Only support input channel equals output channel.";
+    return RET_ERROR;
+  }
+
+  // Step 1: convert the fp32 NHWC input into the fp16 NHWC8 staging buffer.
+  auto *fp32_input = reinterpret_cast<float *>(inputs_.at(kInputIndex)->Data());
+  const int input_plane = conv_param_->input_h_ * conv_param_->input_w_;
+  PackNHWCFp32ToNHWC8Fp16(fp32_input, packed_input_, conv_param_->input_batch_, input_plane,
+                          conv_param_->input_channel_);
+
+  // Step 2: run the depthwise deconvolution as thread_num_ parallel tasks.
+  const auto launch_ret = LiteBackendParallelLaunch(DeconvDwFp16Run, this, conv_param_->thread_num_);
+  if (launch_ret != RET_OK) {
+    MS_LOG(ERROR) << "DeconvDwFp16Run error: error_code[" << launch_ret << "]";
+    return RET_ERROR;
+  }
+
+  // Step 3: convert the fp16 NHWC8 result back into the fp32 NHWC output tensor.
+  auto *fp32_output = reinterpret_cast<float *>(outputs_.at(kOutputIndex)->Data());
+  const int output_plane = conv_param_->output_h_ * conv_param_->output_w_;
+  PackNHWC8Fp16ToNHWCFp32(packed_output_, fp32_output, conv_param_->output_batch_, output_plane,
+                          conv_param_->output_channel_);
+  return RET_OK;
+}
+}  // namespace mindspore::kernel
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_depthwise_fp16.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_depthwise_fp16.h
@ -0,0 +1,58 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_DECONVOLUTION_DEPTHWISE_FP16_H_
+#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_DECONVOLUTION_DEPTHWISE_FP16_H_
+
+#include <vector>
+#include "src/lite_kernel.h"
+#include "src/runtime/kernel/arm/base/convolution_base.h"
+#include "src/runtime/kernel/arm/opclib/fp16/conv_depthwise_fp16.h"
+
+namespace mindspore::kernel {
+// CPU kernel for depthwise deconvolution that computes in fp16 on C8-packed staging buffers.
+// Owns the sliding-window description and the packed weight/input/output buffers.
+class DeconvolutionDepthwiseFp16CPUKernel : public ConvolutionBaseCPUKernel {
+ public:
+  DeconvolutionDepthwiseFp16CPUKernel(OpParameter *parameter, const std::vector<lite::tensor::Tensor *> &inputs,
+                                      const std::vector<lite::tensor::Tensor *> &outputs, const Context *ctx)
+      : ConvolutionBaseCPUKernel(parameter, inputs, outputs, ctx) {}
+  ~DeconvolutionDepthwiseFp16CPUKernel() override {
+    // The staging buffers are always heap-allocated by InitBuffer(), so they are
+    // released unconditionally. The former need_align_ guard was never set to true,
+    // which leaked both buffers. Every pointer starts as nullptr, so this is also
+    // safe when Init() never ran or failed early (delete/free accept nullptr).
+    delete sliding_;
+    free(packed_weight_);
+    free(packed_input_);
+    free(packed_output_);
+  }
+
+  int Init() override;
+  int ReSize() override;
+  int Run() override;
+
+  // Allocates the zero-filled packed input/output staging buffers.
+  int InitBuffer();
+  // Packs the fp32 weights and bias into fp16 buffers and sets thread_num_.
+  int InitWeightBias();
+  // Fills conv_param_ dims from the tensors (input/output swapped) and the sliding-window description.
+  int InitSlideParam();
+  // Runs the depthwise deconvolution kernel for one parallel task.
+  int Execute(int task_id);
+
+ private:
+  SlidingWindowParam *sliding_ = nullptr;
+  float16_t *packed_weight_ = nullptr;
+  float16_t *packed_input_ = nullptr;
+  float16_t *packed_output_ = nullptr;
+};
+}  // namespace mindspore::kernel
+
+#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_DECONVOLUTION_DEPTHWISE_FP16_H_
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise.cc
@ -32,8 +32,8 @@ int ConvolutionDepthwiseCPUKernel::Init() {
  ConvolutionBaseCPUKernel::Init();

  // init sliding window param
-  sliding = new SlidingWindowParam;
-  InitSlidingParam(sliding, conv_param_, C4NUM);
+  sliding_ = new SlidingWindowParam;
+  InitSlidingParam(sliding_, conv_param_, C4NUM);

  // pack input function: convert_func_
  auto input_tensor = inputs_[kInputIndex];
@ -97,7 +97,7 @@ int ConvolutionDepthwiseCPUKernel::ReSize() {

 int ConvolutionDepthwiseCPUKernel::Execute(int task_id) {
  ConvDwC4Fp32(packed_output_, packed_input_, packed_weight_, reinterpret_cast<float *>(bias_data_), conv_param_,
-               sliding, task_id);
+               sliding_, task_id);
  return RET_OK;
 }

--- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise.h
@ -29,7 +29,7 @@ class ConvolutionDepthwiseCPUKernel : public ConvolutionBaseCPUKernel {
                                const std::vector<lite::tensor::Tensor *> &outputs, const Context *ctx)
      : ConvolutionBaseCPUKernel(parameter, inputs, outputs, ctx) {}
  ~ConvolutionDepthwiseCPUKernel() override {
-    delete sliding;
+    delete sliding_;
    free(packed_weight_);
    if (convert_func_ != nullptr) {
      free(packed_input_);
@ -46,7 +46,7 @@ class ConvolutionDepthwiseCPUKernel : public ConvolutionBaseCPUKernel {
  int Execute(int task_id);

 private:
-  SlidingWindowParam *sliding;
+  SlidingWindowParam *sliding_;
  float *packed_weight_;
  float *packed_input_;
  float *packed_output_;
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/deconvolution_depthwise.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/deconvolution_depthwise.cc
@ -38,8 +38,8 @@ int DeconvolutionDepthwiseCPUKernel::InitSlideParam() {
  conv_param_->output_channel_ = inputs_.front()->shape().at(kNHWC_C);

  // init sliding window param
-  sliding = new SlidingWindowParam;
-  InitSlidingParam(sliding, conv_param_, C4NUM);
+  sliding_ = new SlidingWindowParam;
+  InitSlidingParam(sliding_, conv_param_, C4NUM);
  return RET_OK;
 }

@ -110,7 +110,7 @@ int DeconvolutionDepthwiseCPUKernel::ReSize() {

 int DeconvolutionDepthwiseCPUKernel::DoExcute(int task_id) {
  DeconvDwC4Fp32(packed_output_, packed_input_, packed_weight_, reinterpret_cast<float *>(bias_data_), conv_param_,
-                 sliding, task_id);
+                 sliding_, task_id);
  return RET_OK;
 }

--- a/mindspore/lite/src/runtime/kernel/arm/fp32/deconvolution_depthwise.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/deconvolution_depthwise.h
@ -29,7 +29,7 @@ class DeconvolutionDepthwiseCPUKernel : public ConvolutionBaseCPUKernel {
                                  const std::vector<lite::tensor::Tensor *> &outputs, const Context *ctx)
      : ConvolutionBaseCPUKernel(parameter, inputs, outputs, ctx) {}
  ~DeconvolutionDepthwiseCPUKernel() override {
-    delete sliding;
+    delete sliding_;
    free(packed_weight_);
    free(packed_input_);
    free(packed_output_);
@ -43,7 +43,7 @@ class DeconvolutionDepthwiseCPUKernel : public ConvolutionBaseCPUKernel {
  int DoExcute(int task_id);

 private:
-  SlidingWindowParam *sliding;
+  SlidingWindowParam *sliding_;
  float *packed_weight_;
  float *packed_input_;
  float *packed_output_;
--- a/mindspore/lite/src/runtime/kernel/arm/opclib/fp16/conv_depthwise_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/opclib/fp16/conv_depthwise_fp16.cc
--- a/mindspore/lite/src/runtime/kernel/arm/opclib/fp16/conv_depthwise_fp16.h
+++ b/mindspore/lite/src/runtime/kernel/arm/opclib/fp16/conv_depthwise_fp16.h
@ -0,0 +1,33 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_FP16_CONV_DEPTHWISE_FP16_H_
+#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_FP16_CONV_DEPTHWISE_FP16_H_
+
+#include "src/runtime/kernel/arm/opclib/conv_parameter.h"
+#include "src/runtime/kernel/arm/opclib/fp32/conv_depthwise.h"
+
+#ifdef ENABLE_FP16
+// fp16 depthwise convolution over the packed buffers; called once per task_id from
+// ConvolutionDepthwiseFp16CPUKernel::Execute.
+void ConvDwC8Fp16(float16_t *output_data, const float16_t *input_data, const float16_t *weight_data,
+                  const float16_t *bias_data, const ConvParameter *conv_param, const SlidingWindowParam *sliding,
+                  int task_id);
+
+// fp16 depthwise deconvolution over the packed buffers; called once per task_id from
+// DeconvolutionDepthwiseFp16CPUKernel::Execute.
+void DeconvDwC8Fp16(float16_t *output_data, const float16_t *input_data, const float16_t *weight_data,
+                    const float16_t *bias_data, const ConvParameter *conv_param, const SlidingWindowParam *sliding,
+                    int task_id);
+#endif
+
+#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_FP16_CONV_DEPTHWISE_FP16_H_
--- a/mindspore/lite/src/runtime/kernel/arm/opclib/fp32/conv_depthwise.h
+++ b/mindspore/lite/src/runtime/kernel/arm/opclib/fp32/conv_depthwise.h
@ -14,8 +14,8 @@
 * limitations under the License.
 */

-#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_P32_CONV_DEPTHWISE_H_
-#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_P32_CONV_DEPTHWISE_H_
+#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_FP32_CONV_DEPTHWISE_H_
+#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_FP32_CONV_DEPTHWISE_H_

 #include "src/runtime/kernel/arm/opclib/conv_parameter.h"

@ -45,5 +45,5 @@ void ConvDwC4Fp32(float *output_data, const float *input_data, const float *weig
 void DeconvDwC4Fp32(float *output_data, const float *input_data, const float *weight_data, const float *bias_data,
                    const ConvParameter *conv_param, const SlidingWindowParam *sliding, int task_id);

-#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_P32_CONV_DEPTHWISE_H_
+#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_FP32_CONV_DEPTHWISE_H_

--- a/mindspore/lite/src/runtime/kernel/arm/opclib/pack.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/opclib/pack.cc
@ -292,6 +292,55 @@ void PackNC4HW4ToNCHWFp16(const void *src, void *dst, int batch, int plane, int
    }
  }
 }
+
+void PackNCHWFp32ToNC8HW8Fp16(float *src, float16_t *dst, int batch, int plane, int channel) {
+  // Channels are grouped into blocks of C8NUM; inside a block the layout is [plane][C8NUM].
+  // Only real channels are written, so padding lanes keep whatever dst already holds.
+  const int block_count = UP_DIV(channel, C8NUM);
+  for (int n = 0; n < batch; ++n) {
+    const float *src_batch = src + n * plane * channel;
+    float16_t *dst_batch = dst + n * plane * block_count * C8NUM;
+    for (int ch = 0; ch < channel; ++ch) {
+      const float *src_plane = src_batch + ch * plane;
+      // Start of the block that holds this channel, shifted to the channel's lane in the block.
+      float16_t *dst_lane = dst_batch + (ch / C8NUM) * plane * C8NUM + ch % C8NUM;
+      for (int p = 0; p < plane; ++p) {
+        dst_lane[C8NUM * p] = (float16_t)src_plane[p];
+      }
+    }
+  }
+}
+
+void PackNHWCFp32ToNHWC8Fp16(float *src, float16_t *dst, int batch, int plane, int channel) {
+  // Each destination pixel spans UP_DIV(channel, C8NUM) * C8NUM values; only the first
+  // `channel` of them are written, the rest keep whatever dst already holds.
+  const int block_count = UP_DIV(channel, C8NUM);
+  const int dst_pixel_stride = block_count * C8NUM;
+  for (int n = 0; n < batch; ++n) {
+    const float *src_batch = src + n * channel * plane;
+    float16_t *dst_batch = dst + n * (block_count * C8NUM * plane);
+    for (int p = 0; p < plane; ++p) {
+      const float *src_pixel = src_batch + p * channel;
+      float16_t *dst_pixel = dst_batch + p * dst_pixel_stride;
+      for (int ch = 0; ch < channel; ++ch) {
+        dst_pixel[ch] = (float16_t)src_pixel[ch];
+      }
+    }
+  }
+}
+
+void PackNHWC8Fp16ToNHWCFp32(float16_t *src, float *dst, int batch, int plane, int channel) {
+  // Each source pixel spans UP_DIV(channel, C8NUM) * C8NUM values; only the first
+  // `channel` of them are copied into the densely packed fp32 destination.
+  const int block_count = UP_DIV(channel, C8NUM);
+  const int src_pixel_stride = block_count * C8NUM;
+  for (int n = 0; n < batch; ++n) {
+    const float16_t *src_batch = src + n * block_count * C8NUM * plane;
+    float *dst_batch = dst + n * (channel * plane);
+    for (int p = 0; p < plane; ++p) {
+      const float16_t *src_pixel = src_batch + p * src_pixel_stride;
+      float *dst_pixel = dst_batch + p * channel;
+      for (int ch = 0; ch < channel; ++ch) {
+        dst_pixel[ch] = (float)src_pixel[ch];
+      }
+    }
+  }
+}
 #endif

 void PackWeightFp32(float *weight_data, ConvParameter *conv_param, float *packed_weight) {
@ -1070,7 +1119,7 @@ void PackDepthwiseInt8Input(const int8_t *src, int16_t *dst, const ConvParameter
      auto src_k = src_b + k * conv_param->input_channel_;
      auto dst_k = dst_b + k * ic4 * C4NUM;
      for (int c = 0; c < conv_param->input_channel_; c++) {
-        dst_k[c] = (int16_t)((int32_t)(src_k[c]) - input_zp);
+        dst_k[c] = (int16_t)(src_k[c] - input_zp);
      }
    }
  }
@ -1087,7 +1136,7 @@ void PackDepthwiseInt8Weight(const int8_t *origin_weight, int16_t *packed_weight
    for (int k = 0; k < unit; k++) {
      auto src_kernel = src_c + k;
      auto dst_kernel = dst_c + C4NUM * k + c4_block_rem;
-      *dst_kernel = (int16_t)((int32_t)(src_kernel[0]) - weight_zp);
+      *dst_kernel = (int16_t)(src_kernel[0] - weight_zp);
    }
  }
 }
--- a/mindspore/lite/src/runtime/kernel/arm/opclib/pack.h
+++ b/mindspore/lite/src/runtime/kernel/arm/opclib/pack.h
@ -46,6 +46,14 @@ void PackNC4HW4ToNHWC4Fp16(const void *src, void *dst, int batch, int plane, int
 void PackNC4HW4ToNHWCFp16(const void *src, void *dst, int batch, int plane, int channel);

 void PackNC4HW4ToNCHWFp16(const void *src, void *dst, int batch, int plane, int channel);
+
+void PackNC8HW8ToNHWCFp16(const void *src, void *dst, int batch, int plane, int channel);
+
+void PackNCHWFp32ToNC8HW8Fp16(float *src, float16_t *dst, int batch, int plane, int channel);
+
+void PackNHWCFp32ToNHWC8Fp16(float *src, float16_t *dst, int batch, int plane, int channel);
+
+void PackNHWC8Fp16ToNHWCFp32(float16_t *src, float *dst, int batch, int plane, int channel);
 #endif
 void Im2ColPackUnitFp32(const float *input_data, ConvParameter *conv_param, float *packed_input, int real_cal_num,
                        int block_index);
@ -163,4 +171,3 @@ inline void C4UnpackToHwcInt8(int8_t *src_ptr, int8_t *dst_ptr, int channel, int
 }

 #endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_PACK_H_
-
--- a/mindspore/lite/test/CMakeLists.txt
+++ b/mindspore/lite/test/CMakeLists.txt
@ -72,7 +72,7 @@ else()
            )
 endif()
 ### cpu kernel
-file(GLOB_RECURSE KERNEL_OP_SRC
+file(GLOB KERNEL_OP_SRC
        ${LITE_DIR}/src/runtime/kernel/arm/base/*.cc
        ${LITE_DIR}/src/runtime/kernel/arm/fp32/*.cc
        ${LITE_DIR}/src/runtime/kernel/arm/int8/*.cc
@ -103,10 +103,13 @@ if (PLATFORM_ARM32)
            )
 endif()
 if (ENABLE_FP16)
+    file(GLOB KERNEL_OP_FP16_SRC
+            ${LITE_DIR}/src/runtime/kernel/arm/fp16/*.cc
+            ${LITE_DIR}/src/runtime/kernel/arm/opclib/fp16/*.cc
+            )
    set(KERNEL_OP_SRC
            ${KERNEL_OP_SRC}
-            ${LITE_DIR}/src/runtime/kernel/arm/fp16/convolution_fp16.cc
-            ${LITE_DIR}/src/runtime/kernel/arm/fp16/convolution_3x3_fp16.cc
+            ${KERNEL_OP_FP16_SRC}
            )
 endif ()
 ### gpu kernel