optimize op performance

4 years ago · e36e11e095
parent 9d00e30ed0
commit e36e11e095
20 changed files with 271 additions and 299 deletions
--- a/mindspore/lite/nnacl/base/reshape_base.h
+++ b/mindspore/lite/nnacl/base/reshape_base.h
@ -1,35 +0,0 @@
-/**
- * Copyright 2020 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef MINDSPORE_LITE_NNACL_RESHAHPE_BASE_H_
-#define MINDSPORE_LITE_NNACL_RESHAHPE_BASE_H_
-
-#include <string.h>
-#include "nnacl/op_base.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-inline void Reshape(const void *input_ptr, void *output_ptr, size_t data_size) {
-  memcpy(output_ptr, input_ptr, data_size);
-}
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif  // MINDSPORE_LITE_NNACL_RESHAHPE_BASE_H_
--- a/mindspore/lite/nnacl/base/squeeze_base.h
+++ b/mindspore/lite/nnacl/base/squeeze_base.h
@ -1,38 +0,0 @@
-/**
- * Copyright 2020 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef MINDSPORE_LITE_NNACL_SQUEEZE_BASE_H_
-#define MINDSPORE_LITE_NNACL_SQUEEZE_BASE_H_
-
-#include "nnacl/errorcode.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-static inline int DoSqueeze(const void *input_ptr, void *output_ptr, size_t data_size) {
-  if (input_ptr == NULL || output_ptr == NULL) {
-    return NNACL_ERR;
-  }
-  (void)memcpy(output_ptr, input_ptr, data_size);
-  return NNACL_OK;
-}
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif  // MINDSPORE_LITE_NNACL_SQUEEZE_BASE_H_
--- a/mindspore/lite/nnacl/op_base.h
+++ b/mindspore/lite/nnacl/op_base.h
@ -64,6 +64,7 @@

 typedef enum LiteDataType {
  kDataTypeFloat,
+  kDataTypeFloat16,
  kDataTypeInt,
  kDataTypeInt8,
  KDataTypeBool,
--- a/mindspore/lite/nnacl/strided_slice.c
+++ b/mindspore/lite/nnacl/strided_slice.c
@ -108,6 +108,10 @@ int DoStridedSlice(const void *in_data, void *out_data, StridedSliceParameter *p
                *((int8_t *)out_data + out_offset) = *((int8_t *)in_data + in_offset);
              } else if (param->data_type == kDataTypeInt) {
                *((int32_t *)out_data + out_offset) = *((int32_t *)in_data + in_offset);
+#ifdef ENABLE_ARM64
+              } else if (param->data_type == kDataTypeFloat16) {
+                *((float16_t *)out_data + out_offset) = *((float16_t *)in_data + in_offset);
+#endif
              } else {
                return NNACL_ERR;
              }
@ -120,3 +124,15 @@ int DoStridedSlice(const void *in_data, void *out_data, StridedSliceParameter *p
  }
  return NNACL_OK;
 }
+
+/*
+ * Gathers strided chunks of raw bytes into a densely packed output buffer.
+ *
+ * For each of the `outer` blocks, reading starts at `input + block * in_offset`; `split_len` chunks of
+ * `inner_size` bytes are then copied, advancing the read position by `inner_size * stride` bytes after every
+ * chunk while the write position advances by `inner_size` bytes.
+ */
+void FastStride(const uint8_t *input, uint8_t *output, int split_len, int stride, size_t outer, size_t inner_size,
+                size_t in_offset) {
+  const size_t src_step = inner_size * stride;
+  uint8_t *dst = output;
+  for (size_t block = 0; block < outer; ++block) {
+    const uint8_t *src = input + block * in_offset;
+    for (int chunk = 0; chunk < split_len; ++chunk, dst += inner_size, src += src_step) {
+      memcpy(dst, src, inner_size);
+    }
+  }
+}
--- a/mindspore/lite/nnacl/strided_slice.h
+++ b/mindspore/lite/nnacl/strided_slice.h
@ -39,6 +39,9 @@ typedef struct StridedSliceParameter {
 extern "C" {
 #endif
 int DoStridedSlice(const void *inputs, void *output, StridedSliceParameter *param);
+
+void FastStride(const uint8_t *input, uint8_t *output, int split_len, int stride, size_t outer, size_t inner_size,
+                size_t in_offset);
 #ifdef __cplusplus
 }
 #endif
--- a/mindspore/lite/src/ops/populate/expand_dims_populate.cc
+++ b/mindspore/lite/src/ops/populate/expand_dims_populate.cc
@ -26,6 +26,7 @@ OpParameter *PopulateExpandDimsParameter(const mindspore::lite::PrimitiveC *prim
    MS_LOG(ERROR) << "malloc ExpandDimsParameter failed.";
    return nullptr;
  }
  memset(expand_dims_param, 0, sizeof(OpParameter));
+  // Record the primitive type only after the zero-fill above; assigning it before the memset would erase it.
+  expand_dims_param->type_ = primitive->Type();
  return reinterpret_cast<OpParameter *>(expand_dims_param);
 }
--- a/mindspore/lite/src/runtime/kernel/arm/base/reshape_base.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/base/reshape_base.cc
@ -0,0 +1,71 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "src/runtime/kernel/arm/base/reshape_base.h"
+#include "schema/model_generated.h"
+#include "src/kernel_registry.h"
+#include "include/errorcode.h"
+
+using mindspore::kernel::KERNEL_ARCH::kCPU;
+using mindspore::lite::KernelRegistrar;
+using mindspore::lite::RET_ERROR;
+using mindspore::lite::RET_OK;
+using mindspore::schema::PrimitiveType_Reshape;
+
+namespace mindspore::kernel {
+// Initialises the kernel by delegating to ReSize() and forwarding its status code.
+int ReshapeBaseCPUKernel::Init() {
+  const int status = ReSize();
+  return status;
+}
+
+// Recomputes how many bytes each parallel task copies: the input tensor's byte size divided by the
+// context's thread count, rounded up.
+// Returns RET_OK on success, RET_ERROR when the configured thread count is not positive.
+int ReshapeBaseCPUKernel::ReSize() {
+  const int thread_num = context_->thread_num_;
+  if (thread_num <= 0) {
+    // UP_DIV would otherwise divide by zero (or by a negative count).
+    MS_LOG(ERROR) << "Reshape thread_num must be positive, got " << thread_num;
+    return RET_ERROR;
+  }
+  // Keep the byte count in size_t; narrowing Size() to int would corrupt the split for very large buffers.
+  const size_t in_data_size = in_tensors_.front()->Size();
+  cal_max_num_per_thread_ = UP_DIV(in_data_size, static_cast<size_t>(thread_num));
+  return RET_OK;
+}
+
+// Copies the byte range owned by task `task_id` from input_ptr_ to output_ptr_.
+// Each task owns at most cal_max_num_per_thread_ bytes starting at task_id * cal_max_num_per_thread_.
+// Always returns RET_OK; tasks whose range starts at or beyond the end of the tensor copy nothing.
+int ReshapeBaseCPUKernel::RunImpl(int task_id) {
+  const size_t total_size = in_tensors_.front()->Size();
+  const size_t start_index = static_cast<size_t>(task_id) * cal_max_num_per_thread_;
+  // Because the per-task size is rounded up, trailing tasks can start past the end of the buffer.
+  // Bail out before the unsigned subtraction below can wrap around and trigger an out-of-bounds copy.
+  if (start_index >= total_size) {
+    return RET_OK;
+  }
+  const size_t remaining = total_size - start_index;
+  const size_t data_size = remaining > cal_max_num_per_thread_ ? cal_max_num_per_thread_ : remaining;
+  memcpy(output_ptr_ + start_index, input_ptr_ + start_index, data_size);
+  return RET_OK;
+}
+
+// Callback handed to ParallelLaunch: `cdata` is the ReshapeBaseCPUKernel instance and `task_id` selects the
+// slice to copy. Forwards the status of RunImpl(), logging it when it is not RET_OK.
+int ReshapeRun(void *cdata, int task_id) {
+  auto *kernel = reinterpret_cast<ReshapeBaseCPUKernel *>(cdata);
+  const auto status = kernel->RunImpl(task_id);
+  if (status == RET_OK) {
+    return RET_OK;
+  }
+  MS_LOG(ERROR) << "ReshapeRun error task_id[" << task_id << "] error_code[" << status << "]";
+  return status;
+}
+
+// Caches the input and output buffers as byte pointers, then launches ReshapeRun on the context's thread
+// pool with one task per configured thread. Returns RET_OK, or the launch error code after logging it.
+int ReshapeBaseCPUKernel::Run() {
+  auto *in_tensor = in_tensors_.at(kInputIndex);
+  input_ptr_ = reinterpret_cast<uint8_t *>(in_tensor->data_c());
+  auto *out_tensor = out_tensors_.at(kOutputIndex);
+  output_ptr_ = reinterpret_cast<uint8_t *>(out_tensor->data_c());
+
+  const auto status = ParallelLaunch(this->context_->thread_pool_, ReshapeRun, this, context_->thread_num_);
+  if (status == RET_OK) {
+    return RET_OK;
+  }
+  MS_LOG(ERROR) << "Reshape run error error_code[" << status << "]";
+  return status;
+}
+
+REG_KERNEL(kCPU, kNumberTypeInt32, PrimitiveType_Reshape, LiteKernelCreator<ReshapeBaseCPUKernel>)
+REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_Reshape, LiteKernelCreator<ReshapeBaseCPUKernel>)
+REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_Reshape, LiteKernelCreator<ReshapeBaseCPUKernel>)
+}  // namespace mindspore::kernel
--- a/mindspore/lite/src/runtime/kernel/arm/base/reshape_base.h
+++ b/mindspore/lite/src/runtime/kernel/arm/base/reshape_base.h
@ -13,32 +13,33 @@
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
-
-#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_RESHAPE_H_
-#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_RESHAPE_H_
+#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_BASE_RESHAPE_BASE_H_
+#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_BASE_RESHAPE_BASE_H_

 #include <vector>
-#include "nnacl/fp16/cast_fp16.h"
-#include "nnacl/base/reshape_base.h"
 #include "src/lite_kernel.h"
 #include "include/context.h"
-#include "src/runtime/kernel/arm/fp32/reshape_fp32.h"

 using mindspore::lite::InnerContext;
-
 namespace mindspore::kernel {
-class ReshapeFp16CPUKernel : public ReshapeCPUKernel {
+class ReshapeBaseCPUKernel : public LiteKernel {
 public:
-  ReshapeFp16CPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
+  ReshapeBaseCPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
                       const std::vector<lite::Tensor *> &outputs, const InnerContext *ctx,
                       const mindspore::lite::PrimitiveC *primitive)
-      : ReshapeCPUKernel(parameter, inputs, outputs, ctx, primitive) {}
-  ~ReshapeFp16CPUKernel() = default;
+      : LiteKernel(parameter, inputs, outputs, ctx, primitive) {}
+  ~ReshapeBaseCPUKernel() override = default;

+  int Init() override;
+  int ReSize() override;
  int Run() override;
+  int RunImpl(int task_id);

 private:
+  size_t cal_max_num_per_thread_ = 0;
+  uint8_t *input_ptr_ = nullptr;
+  uint8_t *output_ptr_ = nullptr;
 };
 }  // namespace mindspore::kernel

-#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_RESHAPE_H_
+#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_BASE_RESHAPE_BASE_H_
--- a/mindspore/lite/src/runtime/kernel/arm/base/squeeze_base.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/base/squeeze_base.cc
@ -13,34 +13,15 @@
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
-
-#include "src/runtime/kernel/arm/fp32/squeeze_fp32.h"
-#include "schema/model_generated.h"
+#include "src/runtime/kernel/arm/base/squeeze_base.h"
 #include "src/kernel_registry.h"
-#include "include/errorcode.h"
+#include "schema/model_generated.h"

 using mindspore::lite::KernelRegistrar;
-using mindspore::lite::RET_ERROR;
-using mindspore::lite::RET_OK;
 using mindspore::schema::PrimitiveType_Squeeze;
-
 namespace mindspore::kernel {
-int SqueezeCPUKernel::Init() { return RET_OK; }
-
-int SqueezeCPUKernel::ReSize() { return RET_OK; }
-
-int SqueezeCPUKernel::Run() {
-  size_t data_size = in_tensors_.front()->Size();
-  int ret = DoSqueeze(in_tensors_.front()->data_c(), out_tensors_.front()->data_c(), data_size);
-
-  if (ret != RET_OK) {
-    MS_LOG(ERROR) << "Do squeeze fail!ret: " << ret;
-    return RET_ERROR;
-  }
-  return RET_OK;
-}
-
-REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_Squeeze, LiteKernelCreator<SqueezeCPUKernel>)
-REG_KERNEL(kCPU, kNumberTypeInt32, PrimitiveType_Squeeze, LiteKernelCreator<SqueezeCPUKernel>)
-REG_KERNEL(kCPU, kNumberTypeBool, PrimitiveType_Squeeze, LiteKernelCreator<SqueezeCPUKernel>)
+REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_Squeeze, LiteKernelCreator<SqueezeBaseCPUKernel>)
+REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_Squeeze, LiteKernelCreator<SqueezeBaseCPUKernel>)
+REG_KERNEL(kCPU, kNumberTypeInt32, PrimitiveType_Squeeze, LiteKernelCreator<SqueezeBaseCPUKernel>)
+REG_KERNEL(kCPU, kNumberTypeBool, PrimitiveType_Squeeze, LiteKernelCreator<SqueezeBaseCPUKernel>)
 }  // namespace mindspore::kernel
--- a/mindspore/lite/src/runtime/kernel/arm/base/squeeze_base.h
+++ b/mindspore/lite/src/runtime/kernel/arm/base/squeeze_base.h
@ -13,30 +13,22 @@
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
-
-#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_RESHAPE_H_
-#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_RESHAPE_H_
+#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_BASE_SQUEEZE_BASE_H_
+#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_BASE_SQUEEZE_BASE_H_

 #include <vector>
-#include "src/lite_kernel.h"
-#include "include/context.h"
-#include "nnacl/base/reshape_base.h"
+#include "src/runtime/kernel/arm/base/reshape_base.h"

 using mindspore::lite::InnerContext;
-
 namespace mindspore::kernel {
-class ReshapeCPUKernel : public LiteKernel {
+class SqueezeBaseCPUKernel : public ReshapeBaseCPUKernel {
 public:
-  ReshapeCPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
-                   const std::vector<lite::Tensor *> &outputs, const InnerContext *ctx,
-                   const mindspore::lite::PrimitiveC *primitive)
-      : LiteKernel(parameter, inputs, outputs, ctx, primitive) {}
-  ~ReshapeCPUKernel() = default;
-
-  int Init() override;
-  int ReSize() override;
-  int Run() override;
+  SqueezeBaseCPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
+                       const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx,
+                       const mindspore::lite::PrimitiveC *primitive)
+      : ReshapeBaseCPUKernel(parameter, inputs, outputs, ctx, primitive) {}
+  ~SqueezeBaseCPUKernel() override = default;
 };
 }  // namespace mindspore::kernel

-#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_RESHAPE_H_
+#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_BASE_SQUEEZE_BASE_H_
--- a/mindspore/lite/src/runtime/kernel/arm/base/stack_base.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/base/stack_base.cc
@ -27,7 +27,7 @@ using mindspore::lite::RET_OK;
 using mindspore::schema::PrimitiveType_Stack;

 namespace mindspore::kernel {
-static int GetCopyNum(const std::vector<int> &in_shape, int axis, int n_dim) {
+static inline int GetCopyNum(const std::vector<int> &in_shape, int axis, int n_dim) {
  int copy_num = 1;
  if (axis > 0) {
    for (int j = n_dim - 1; j > axis - 1; j--) {
@ -41,12 +41,12 @@ static int GetCopyNum(const std::vector<int> &in_shape, int axis, int n_dim) {
  return copy_num;
 }

-static size_t GetOutterSize(const std::vector<int> &in_shape, int axis) {
-  size_t outter_size = 1;
+static inline size_t GetOuterSize(const std::vector<int> &in_shape, int axis) {
+  size_t outer_size = 1;
  for (int i = 0; i < axis; ++i) {
-    outter_size *= in_shape[i];
+    outer_size *= in_shape[i];
  }
-  return outter_size;
+  return outer_size;
 }

 int StackBaseCPUKernel::ReSize() {
@ -59,14 +59,13 @@ int StackBaseCPUKernel::ReSize() {
  } else {
    MS_ASSERT(input_nums > 1);
    copy_size_ = GetCopyNum(input0_shape, axis_, input0_shape.size()) * data_type_size_;
-    outter_size_ = GetOutterSize(input0_shape, axis_);
+    outer_size_ = GetOuterSize(input0_shape, axis_);
  }
  return RET_OK;
 }

 int StackBaseCPUKernel::Init() {
-  auto input0_tensor = in_tensors_.front();
-  data_type_size_ = input0_tensor->Size() / input0_tensor->ElementsNum();
+  data_type_size_ = sizeof(float);
  if (!InferShapeDone()) {
    return RET_OK;
  }
@ -74,13 +73,21 @@ int StackBaseCPUKernel::Init() {
 }

 int StackBaseCPUKernel::Run() {
+  // malloc temporary memory to store all the inputs
  size_t inputs_num = in_tensors_.size();
  char **all_inputs = static_cast<char **>(context_->allocator->Malloc(inputs_num * sizeof(char *)));
+  if (all_inputs == nullptr) {
+    MS_LOG(ERROR) << "malloc all_inputs failed.";
+    return RET_ERROR;
+  }
  for (size_t j = 0; j < inputs_num; ++j) {
    all_inputs[j] = reinterpret_cast<char *>(in_tensors_.at(j)->data_c());
  }
+  // run stack
  auto output_data = reinterpret_cast<char *>(out_tensors_.at(0)->data_c());
-  Stack(all_inputs, output_data, in_tensors_.size(), copy_size_, outter_size_);
+  Stack(all_inputs, output_data, in_tensors_.size(), copy_size_, outer_size_);
+
+  // free temporary variable all_inputs
  context_->allocator->Free(all_inputs);
  return RET_OK;
 }
--- a/mindspore/lite/src/runtime/kernel/arm/base/stack_base.h
+++ b/mindspore/lite/src/runtime/kernel/arm/base/stack_base.h
@ -38,7 +38,7 @@ class StackBaseCPUKernel : public LiteKernel {
  int axis_ = 0;
  size_t data_type_size_ = 0;
  size_t copy_size_ = 0;
-  size_t outter_size_ = 1;
+  size_t outer_size_ = 1;
 };
 }  // namespace mindspore::kernel
 #endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_BASE_STACK_BASE_H_
--- a/mindspore/lite/src/runtime/kernel/arm/base/strided_slice.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/base/strided_slice.cc
@ -33,11 +33,37 @@ int StridedSliceCPUKernel::Init() {
  if (!InferShapeDone()) {
    return RET_OK;
  }
-
  return ReSize();
 }

+// Derives the parameters used by the fast path from the current input/output shapes:
+//   outer_       - product of the input dimensions before split_axis_
+//   inner_size_  - byte size of one element times the product of the input dimensions after split_axis_
+// and chooses whether tasks are split along the slice axis (outer_ == 1) or along the outer dimension.
+void StridedSliceCPUKernel::InitFastRunParam() {
+  const auto &in_shape = in_tensors_.front()->shape();
+  const auto &out_shape = out_tensors_.front()->shape();
+  // This runs on every ReSize(); start from a clean state so values from a previous shape do not
+  // accumulate in outer_ or leave both strategy flags set at once.
+  outer_ = 1;
+  parallel_on_split_axis_ = false;
+  parallel_on_outer_ = false;
+  // cal inner, outer
+  for (int i = 0; i < split_axis_; ++i) {
+    outer_ *= in_shape[i];
+  }
+  int inner = 1;
+  for (size_t i = split_axis_ + 1; i < in_shape.size(); i++) {
+    inner *= in_shape[i];
+  }
+  // An empty tensor has no elements to derive the element size from; avoid dividing by zero.
+  const auto elements_num = in_tensors_.front()->ElementsNum();
+  inner_size_ = elements_num > 0 ? in_tensors_.front()->Size() / elements_num * inner : 0;
+
+  // decide multi-thread launch strategy
+  if (outer_ == 1) {
+    parallel_on_split_axis_ = true;
+    cal_num_per_thread_ = UP_DIV(out_shape[split_axis_], context_->thread_num_);
+  } else {
+    parallel_on_outer_ = true;
+    cal_num_per_thread_ = UP_DIV(outer_, context_->thread_num_);
+  }
+}
+
 int StridedSliceCPUKernel::ReSize() {
+  fast_run_ = MatchFastPattern();
+  if (fast_run_) {
+    InitFastRunParam();
+  }
  if (op_parameter_ != nullptr) {
    free(op_parameter_);
    op_parameter_ = nullptr;
@ -51,7 +77,82 @@ int StridedSliceCPUKernel::ReSize() {
  return RET_OK;
 }

-int StridedSliceCPUKernel::Run() {
+// Checks whether the input and output shapes have the same rank and differ in exactly one dimension.
+// When they do, that dimension is stored in split_axis_ and true is returned so the fast path can be used.
+// Example 1:
+//   input shape:  [1, 80, 46, 40]
+//   output shape: [1, 80, 20, 40]
+// Example 2:
+//   input shape:  [1, 46, 40]
+//   output shape: [1, 20, 40]
+// NOTE(review): only shapes are compared here; begins/strides of the matching dimensions are not inspected.
+bool StridedSliceCPUKernel::MatchFastPattern() {
+  const auto in_shape = in_tensors_.front()->shape();
+  const auto out_shape = out_tensors_.front()->shape();
+  const size_t rank = in_shape.size();
+  if (rank != out_shape.size()) {
+    return false;
+  }
+  int differing_dims = 0;
+  int first_differing = -1;
+  for (size_t dim = 0; dim < rank; ++dim) {
+    if (in_shape[dim] == out_shape[dim]) {
+      continue;
+    }
+    ++differing_dims;
+    if (differing_dims == 1) {
+      first_differing = static_cast<int>(dim);
+    }
+  }
+  if (differing_dims != 1) {
+    return false;
+  }
+  split_axis_ = first_differing;
+  return true;
+}
+
+// Executes the share of the fast-path slice that belongs to task `task_id`.
+// Depending on the strategy chosen in InitFastRunParam(), the task processes either a range of outer
+// blocks (parallel_on_outer_) or a range of positions along the slice axis. Always returns RET_OK;
+// tasks with no work left return immediately.
+int StridedSliceCPUKernel::FastRunImpl(int task_id) {
+  auto in_shape = in_tensors_.front()->shape();
+  auto out_shape = out_tensors_.front()->shape();
+  int begin_index = param_->begins_[split_axis_];
+  int caled_num = task_id * cal_num_per_thread_;
+  if (parallel_on_outer_) {
+    int cur_outer = outer_ - caled_num;
+    // The per-task count is rounded up, so trailing tasks may have nothing to do. A negative count
+    // must not reach FastStride, where it would be converted to a huge unsigned loop bound.
+    if (cur_outer <= 0) {
+      return RET_OK;
+    }
+    if (cur_outer > cal_num_per_thread_) {
+      cur_outer = cal_num_per_thread_;
+    }
+    uint8_t *cur_in_ptr = input_ptr_ + (caled_num * in_shape[split_axis_] + begin_index) * inner_size_;
+    uint8_t *cur_out_ptr = output_ptr_ + caled_num * out_shape[split_axis_] * inner_size_;
+    FastStride(cur_in_ptr, cur_out_ptr, out_shape[split_axis_], param_->strides_[split_axis_], cur_outer, inner_size_,
+               in_shape[split_axis_] * inner_size_);
+  } else {
+    MS_ASSERT(parallel_on_split_axis_);
+    int cal_axis_num = out_shape[split_axis_] - caled_num;
+    // Same rounding effect as above: skip tasks that start past the end of the slice axis.
+    if (cal_axis_num <= 0) {
+      return RET_OK;
+    }
+    if (cal_axis_num > cal_num_per_thread_) {
+      cal_axis_num = cal_num_per_thread_;
+    }
+    uint8_t *cur_in_ptr = input_ptr_ + (caled_num * param_->strides_[split_axis_] + begin_index) * inner_size_;
+    uint8_t *cur_out_ptr = output_ptr_ + caled_num * inner_size_;
+    FastStride(cur_in_ptr, cur_out_ptr, cal_axis_num, param_->strides_[split_axis_], 1, inner_size_, 0);
+  }
+  return RET_OK;
+}
+
+// Callback handed to ParallelLaunch: `cdata` is the StridedSliceCPUKernel instance and `task_id` selects the
+// share of work. Forwards the status of FastRunImpl(), logging it when it is not RET_OK.
+int StrideRun(void *cdata, int task_id) {
+  auto *kernel = reinterpret_cast<StridedSliceCPUKernel *>(cdata);
+  const auto status = kernel->FastRunImpl(task_id);
+  if (status == RET_OK) {
+    return RET_OK;
+  }
+  MS_LOG(ERROR) << "StrideRun error task_id[" << task_id << "] error_code[" << status << "]";
+  return status;
+}
+
+// Caches the first input and output buffers as byte pointers, then launches StrideRun on the context's
+// thread pool with one task per configured thread. Returns RET_OK, or the launch error code after logging it.
+int StridedSliceCPUKernel::FastRun() {
+  auto *in_tensor = in_tensors_.front();
+  input_ptr_ = reinterpret_cast<uint8_t *>(in_tensor->data_c());
+  auto *out_tensor = out_tensors_.front();
+  output_ptr_ = reinterpret_cast<uint8_t *>(out_tensor->data_c());
+
+  const auto status = ParallelLaunch(this->context_->thread_pool_, StrideRun, this, context_->thread_num_);
+  if (status == RET_OK) {
+    return RET_OK;
+  }
+  MS_LOG(ERROR) << "Stride run error error_code[" << status << "]";
+  return status;
+}
+
+int StridedSliceCPUKernel::NormalRun() {
  auto input = in_tensors_.at(0);
  MS_ASSERT(input);
  switch (input->data_type()) {
@ -61,6 +162,9 @@ int StridedSliceCPUKernel::Run() {
    case kNumberTypeFloat32:
      param_->data_type = kDataTypeFloat;
      break;
+    case kNumberTypeFloat16:
+      param_->data_type = kDataTypeFloat16;
+      break;
    case kNumberTypeInt32:
      param_->data_type = kDataTypeInt;
      break;
@ -78,7 +182,15 @@ int StridedSliceCPUKernel::Run() {
  return RET_OK;
 }

+// Dispatches to FastRun() when the fast pattern was matched (fast_run_ is set), otherwise to NormalRun().
+int StridedSliceCPUKernel::Run() { return fast_run_ ? FastRun() : NormalRun(); }
+
 REG_KERNEL(kCPU, kNumberTypeInt32, PrimitiveType_StridedSlice, LiteKernelCreator<StridedSliceCPUKernel>)
 REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_StridedSlice, LiteKernelCreator<StridedSliceCPUKernel>)
+REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_StridedSlice, LiteKernelCreator<StridedSliceCPUKernel>)
 REG_KERNEL(kCPU, kNumberTypeInt8, PrimitiveType_StridedSlice, LiteKernelCreator<StridedSliceCPUKernel>)
 }  // namespace mindspore::kernel
--- a/mindspore/lite/src/runtime/kernel/arm/base/strided_slice.h
+++ b/mindspore/lite/src/runtime/kernel/arm/base/strided_slice.h
@ -35,9 +35,23 @@ class StridedSliceCPUKernel : public LiteKernel {
  int Init() override;
  int ReSize() override;
  int Run() override;
+  bool MatchFastPattern();
+  void InitFastRunParam();
+  int NormalRun();
+  int FastRun();
+  int FastRunImpl(int task_id);

 private:
  StridedSliceParameter *param_;
+  uint8_t *input_ptr_ = nullptr;
+  uint8_t *output_ptr_ = nullptr;
+  int split_axis_{-1};
+  int outer_{1};
+  int cal_num_per_thread_{1};
+  size_t inner_size_{0};
+  bool fast_run_{false};
+  bool parallel_on_split_axis_{false};
+  bool parallel_on_outer_{false};
 };
 }  // namespace mindspore::kernel

--- a/mindspore/lite/src/runtime/kernel/arm/fp16/reshape_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/reshape_fp16.cc
@ -1,43 +0,0 @@
-/**
- * Copyright 2020 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "src/runtime/kernel/arm/fp16/reshape_fp16.h"
-#include "schema/model_generated.h"
-#include "src/kernel_registry.h"
-#include "include/errorcode.h"
-
-using mindspore::kernel::KERNEL_ARCH::kCPU;
-using mindspore::lite::KernelRegistrar;
-using mindspore::lite::RET_ERROR;
-using mindspore::lite::RET_OK;
-using mindspore::schema::PrimitiveType_Reshape;
-
-namespace mindspore::kernel {
-
-int ReshapeFp16CPUKernel::Run() {
-  auto in_tensor = in_tensors_.at(kInputIndex);
-  auto out_tensor = out_tensors_.at(kOutputIndex);
-
-  float16_t *input_ptr = reinterpret_cast<float16_t *>(in_tensor->data_c());
-  float16_t *output_ptr = reinterpret_cast<float16_t *>(out_tensor->data_c());
-
-  Reshape(input_ptr, output_ptr, out_tensor->Size());
-
-  return RET_OK;
-}
-
-REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_Reshape, LiteKernelCreator<ReshapeFp16CPUKernel>)
-}  // namespace mindspore::kernel
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/stack_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/stack_fp16.cc
@ -82,7 +82,7 @@ int StackFp16CPUKernel::Run() {
    FreeBuffer();
    return ret;
  }
-  Stack(buffers_.data(), reinterpret_cast<char *>(out_buffer_), in_tensors_.size(), copy_size_, outter_size_);
+  Stack(buffers_.data(), reinterpret_cast<char *>(out_buffer_), in_tensors_.size(), copy_size_, outer_size_);
  // if output tensor is fp32, we need to transform
  if (malloc_out_) {
    auto out_tensor = out_tensors_.at(0);
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/expandDims_fp32.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/expandDims_fp32.cc
@ -82,8 +82,8 @@ int ExpandDimsRun(void *cdata, int task_id) {
 }

 int ExpandDimsCPUKernel::Run() {
-  in_ptr_ = in_tensors_.at(0)->MutableData();
-  out_ptr_ = out_tensors_.at(0)->MutableData();
+  in_ptr_ = in_tensors_.at(0)->data_c();
+  out_ptr_ = out_tensors_.at(0)->data_c();
  auto ret = ParallelLaunch(this->context_->thread_pool_, ExpandDimsRun, this, thread_sz_count_);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "ExpandDimsRun error error_code[" << ret << "]";
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/reshape_fp32.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/reshape_fp32.cc
@ -1,45 +0,0 @@
-/**
- * Copyright 2020 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "src/runtime/kernel/arm/fp32/reshape_fp32.h"
-#include "schema/model_generated.h"
-#include "src/kernel_registry.h"
-#include "include/errorcode.h"
-
-using mindspore::kernel::KERNEL_ARCH::kCPU;
-using mindspore::lite::KernelRegistrar;
-using mindspore::lite::RET_ERROR;
-using mindspore::lite::RET_OK;
-using mindspore::schema::PrimitiveType_Reshape;
-
-namespace mindspore::kernel {
-int ReshapeCPUKernel::Init() { return RET_OK; }
-
-int ReshapeCPUKernel::ReSize() { return RET_OK; }
-
-int ReshapeCPUKernel::Run() {
-  auto input_ptr = in_tensors_.at(kInputIndex)->data_c();
-  auto output_ptr = out_tensors_.at(kOutputIndex)->data_c();
-  size_t data_size = in_tensors_.at(kInputIndex)->Size();
-  MS_ASSERT(input_ptr);
-  MS_ASSERT(output_ptr);
-  Reshape(input_ptr, output_ptr, data_size);
-  return RET_OK;
-}
-
-REG_KERNEL(kCPU, kNumberTypeInt32, PrimitiveType_Reshape, LiteKernelCreator<ReshapeCPUKernel>)
-REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_Reshape, LiteKernelCreator<ReshapeCPUKernel>)
-}  // namespace mindspore::kernel
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/split_fp32.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/split_fp32.cc
@ -75,44 +75,18 @@ int SplitRun(void *cdata, int task_id) {

 int SplitCPUKernel::Run() {
  auto in_tensor = in_tensors_.front();
-  input_ptr_ = reinterpret_cast<float *>(in_tensor->MutableData());
+  input_ptr_ = reinterpret_cast<float *>(in_tensor->data_c());
  for (int i = 0; i < param->num_split_; i++) {
-    output_ptr_.at(i) = reinterpret_cast<float *>(out_tensors_.at(i)->MutableData());
+    output_ptr_.at(i) = reinterpret_cast<float *>(out_tensors_.at(i)->data_c());
  }
  auto ret = ParallelLaunch(this->context_->thread_pool_, SplitRun, this, thread_n_num_);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "Scale error error_code[" << ret << "]";
    return RET_ERROR;
  }
-
  return RET_OK;
 }

-kernel::LiteKernel *CpuSplitInt32KernelCreator(const std::vector<lite::Tensor *> &inputs,
-                                               const std::vector<lite::Tensor *> &outputs, OpParameter *opParameter,
-                                               const InnerContext *ctx, const kernel::KernelKey &desc,
-                                               const mindspore::lite::PrimitiveC *primitive) {
-  if (opParameter == nullptr) {
-    MS_LOG(ERROR) << "Input opParameter is nullptr!";
-    return nullptr;
-  }
-  MS_ASSERT(desc.type == schema::PrimitiveType_Split);
-  auto *kernel = new (std::nothrow) SplitCPUKernel(opParameter, inputs, outputs, ctx, primitive);
-  if (kernel == nullptr) {
-    MS_LOG(ERROR) << "new SplitCPUKernel fail!";
-    free(opParameter);
-    return nullptr;
-  }
-  auto ret = kernel->Init();
-  if (ret != RET_OK) {
-    delete kernel;
-    MS_LOG(ERROR) << "Init kernel failed, name: " << opParameter->name_ << ", type: "
-                  << schema::EnumNamePrimitiveType(static_cast<schema::PrimitiveType>(opParameter->type_));
-    return nullptr;
-  }
-  return kernel;
-}
-
 REG_KERNEL(kCPU, kNumberTypeInt32, PrimitiveType_Split, LiteKernelCreator<SplitCPUKernel>)
 REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_Split, LiteKernelCreator<SplitCPUKernel>)
 }  // namespace mindspore::kernel
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/squeeze_fp32.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/squeeze_fp32.h
@ -1,40 +0,0 @@
-/**
- * Copyright 2020 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_SQUEEZE_H_
-#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_SQUEEZE_H_
-
-#include <vector>
-#include "src/lite_kernel.h"
-#include "nnacl/base/squeeze_base.h"
-
-namespace mindspore::kernel {
-
-class SqueezeCPUKernel : public LiteKernel {
- public:
-  explicit SqueezeCPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
-                            const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx,
-                            const mindspore::lite::PrimitiveC *primitive)
-      : LiteKernel(parameter, inputs, outputs, ctx, primitive) {}
-  ~SqueezeCPUKernel() override = default;
-
-  int Init() override;
-  int ReSize() override;
-  int Run() override;
-};
-}  // namespace mindspore::kernel
-
-#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_SQUEEZE_H_