From 0ce8708deeae65a93919c715964df6bf59795758 Mon Sep 17 00:00:00 2001 From: xuanyue Date: Fri, 28 Aug 2020 22:11:31 +0800 Subject: [PATCH] sync lite to r0.7 --- build.bat | 45 +- build.sh | 4 +- cmake/package_lite.cmake | 45 +- mindspore/lite/CMakeLists.txt | 47 +- mindspore/lite/README.md | 56 ++ mindspore/lite/README_CN.md | 66 ++ mindspore/lite/include/context.h | 6 +- mindspore/lite/include/lite_session.h | 23 +- mindspore/lite/include/version.h | 11 +- mindspore/lite/java/build_aar.sh | 6 +- .../java/com/mindspore/lite/LiteSession.java | 39 +- .../java/com/mindspore/lite/MSTensor.java | 54 +- .../main/java/com/mindspore/lite/Model.java | 7 + mindspore/lite/java/native/CMakeLists.txt | 6 +- .../lite/java/native/common/jni_utils.cpp | 3 +- .../lite/java/native/runtime/context.cpp | 7 +- .../lite/java/native/runtime/lite_session.cpp | 91 +- mindspore/lite/java/native/runtime/model.cpp | 43 +- .../lite/java/native/runtime/ms_tensor.cpp | 145 +++- mindspore/lite/nnacl/CMakeLists.txt | 4 +- .../runtime/kernel/arm => }/nnacl/README.md | 0 mindspore/lite/nnacl/arithmetic_common.h | 2 + .../lite/nnacl/assembly/arm64/ConvDwFp32Row.S | 18 +- .../nnacl/assembly/arm64/ConvDwInt8Center.S | 48 +- .../assembly/arm64/ConvDwInt8PostAlign4.S | 169 ++++ .../lite/nnacl/assembly/arm64/ConvDwInt8Row.S | 122 +++ .../lite/nnacl/assembly/arm64/MatmulFp32Opt.S | 812 +++++++++++++++++ .../assembly/arm64/MatmulFp32OptRemain.S | 144 +++ .../lite/nnacl/assembly/arm64/MatmulInt8.S | 141 ++- .../lite/nnacl/assembly/opt/ConvDwFp16Row.S | 117 +++ .../assembly/opt/IndirectGemmInt8_24x4_dp.S | 14 +- .../lite/nnacl/assembly/opt/MatmulDpInt8.S | 820 ++++++++++++++++++ mindspore/lite/nnacl/common_func.c | 16 - mindspore/lite/nnacl/conv_parameter.h | 5 +- mindspore/lite/nnacl/fp16/activation_fp16.c | 98 +++ mindspore/lite/nnacl/fp16/activation_fp16.h | 44 + mindspore/lite/nnacl/fp16/arithmetic_fp16.c | 378 +++++--- mindspore/lite/nnacl/fp16/batchnorm_fp16.c | 17 +- mindspore/lite/nnacl/fp16/batchnorm_fp16.h | 4 +- mindspore/lite/nnacl/fp16/common_func.c | 61 -- mindspore/lite/nnacl/fp16/common_func.h | 51 -- .../lite/nnacl/fp16/conv_depthwise_fp16.c | 93 +- .../lite/nnacl/fp16/conv_depthwise_fp16.h | 20 + mindspore/lite/nnacl/fp16/conv_fp16.c | 26 +- mindspore/lite/nnacl/fp16/deconv_fp16.c | 8 +- mindspore/lite/nnacl/fp16/pack_fp16.c | 57 +- mindspore/lite/nnacl/fp16/pack_fp16.h | 2 + mindspore/lite/nnacl/fp16/softmax_fp16.c | 67 ++ mindspore/lite/nnacl/fp16/softmax_fp16.h | 33 + .../lite/nnacl/fp16/winograd_transform_fp16.c | 10 +- mindspore/lite/nnacl/fp32/activation.c | 22 +- mindspore/lite/nnacl/fp32/arithmetic.c | 442 +++++++++- mindspore/lite/nnacl/fp32/arithmetic.h | 6 + mindspore/lite/nnacl/fp32/batchnorm.c | 1 - mindspore/lite/nnacl/fp32/conv.c | 120 +-- mindspore/lite/nnacl/fp32/conv.h | 9 +- mindspore/lite/nnacl/fp32/conv_depthwise.c | 452 +--------- mindspore/lite/nnacl/fp32/conv_depthwise.h | 5 - mindspore/lite/nnacl/fp32/deconv.c | 60 +- mindspore/lite/nnacl/fp32/deconv.h | 11 +- mindspore/lite/nnacl/fp32/gather.h | 10 +- mindspore/lite/nnacl/fp32/matmul.c | 183 +++- mindspore/lite/nnacl/fp32/matmul.h | 10 +- mindspore/lite/nnacl/fp32/pooling.c | 469 +++++++++- mindspore/lite/nnacl/fp32/pooling.h | 8 + mindspore/lite/nnacl/fp32/resize.c | 172 ++-- mindspore/lite/nnacl/fp32/resize.h | 7 +- mindspore/lite/nnacl/fp32/scale.c | 79 ++ mindspore/lite/nnacl/fp32/scale.h | 30 + mindspore/lite/nnacl/fp32/space_to_batch.c | 181 ++-- mindspore/lite/nnacl/fp32/space_to_batch.h | 21 +- 
mindspore/lite/nnacl/fp32/strassen_matmul.c | 204 ----- mindspore/lite/nnacl/fp32/strassen_matmul.h | 45 - mindspore/lite/nnacl/fp32/topk.c | 22 +- mindspore/lite/nnacl/fp32_grad/pack_ext.c | 8 +- .../{strassen_matmul.h => gather_parameter.h} | 21 +- mindspore/lite/nnacl/int8/common_func.h | 4 + .../lite/nnacl/int8/conv_depthwise_int8.c | 115 ++- .../lite/nnacl/int8/conv_depthwise_int8.h | 8 +- mindspore/lite/nnacl/int8/conv_int8.c | 293 ++++++- mindspore/lite/nnacl/int8/conv_int8.h | 11 + mindspore/lite/nnacl/int8/deconv.c | 76 +- mindspore/lite/nnacl/int8/div_int8.c | 4 +- mindspore/lite/nnacl/int8/gatherNd_int8.c | 34 + mindspore/lite/nnacl/int8/gatherNd_int8.h | 31 + mindspore/lite/nnacl/int8/gather_int8.c | 44 + mindspore/lite/nnacl/int8/gather_int8.h | 32 + mindspore/lite/nnacl/int8/matmul_int8.c | 210 ++++- mindspore/lite/nnacl/int8/matmul_int8.h | 28 +- mindspore/lite/nnacl/int8/pooling_int8.c | 398 ++++++--- mindspore/lite/nnacl/int8/pooling_int8.h | 2 + mindspore/lite/nnacl/int8/resize.c | 72 ++ mindspore/lite/nnacl/int8/resize.h | 19 +- mindspore/lite/nnacl/int8/softmax_int8.c | 2 +- mindspore/lite/nnacl/matmul_parameter.h | 12 +- mindspore/lite/nnacl/op_base.h | 9 +- mindspore/lite/nnacl/opt_op_handler.c | 16 +- mindspore/lite/nnacl/pack.c | 193 ++++- mindspore/lite/nnacl/pack.h | 14 +- mindspore/lite/nnacl/pooling_parameter.h | 14 +- .../lite/nnacl/quantization/fixed_point.c | 200 ++--- .../lite/nnacl/quantization/fixed_point.h | 34 +- mindspore/lite/nnacl/quantization/quantize.h | 6 + mindspore/lite/nnacl/scale.c | 49 -- mindspore/lite/nnacl/scale.h | 10 - mindspore/lite/nnacl/winograd_transform.c | 162 ++-- mindspore/lite/nnacl/winograd_transform.h | 8 +- mindspore/lite/nnacl/winograd_utils.c | 67 +- mindspore/lite/schema/model.fbs | 3 +- mindspore/lite/schema/ops.fbs | 8 +- mindspore/lite/src/CMakeLists.txt | 39 +- mindspore/lite/src/common/graph_util.cc | 57 +- mindspore/lite/src/common/graph_util.h | 209 +---- mindspore/lite/src/kernel_registry.cc | 12 +- mindspore/lite/src/lite_kernel.cc | 24 + mindspore/lite/src/lite_kernel.h | 19 +- mindspore/lite/src/lite_session.cc | 160 +++- mindspore/lite/src/lite_session.h | 25 +- mindspore/lite/src/model.cc | 102 +-- mindspore/lite/src/ops/CMakeLists.txt | 0 mindspore/lite/src/ops/abs.cc | 32 + mindspore/lite/src/ops/abs.h | 10 +- mindspore/lite/src/ops/activation.cc | 36 +- mindspore/lite/src/ops/activation.h | 11 +- mindspore/lite/src/ops/activation_grad.cc | 15 +- mindspore/lite/src/ops/activation_grad.h | 7 +- mindspore/lite/src/ops/add.cc | 17 +- mindspore/lite/src/ops/add.h | 14 +- mindspore/lite/src/ops/addn.cc | 15 +- mindspore/lite/src/ops/addn.h | 7 +- mindspore/lite/src/ops/argmax.cc | 20 +- mindspore/lite/src/ops/argmax.h | 15 +- mindspore/lite/src/ops/argmin.cc | 20 +- mindspore/lite/src/ops/argmin.h | 15 +- mindspore/lite/src/ops/arithmetic.h | 7 +- mindspore/lite/src/ops/arithmetic_self.h | 7 +- mindspore/lite/src/ops/batch_norm.cc | 36 +- mindspore/lite/src/ops/batch_norm.h | 9 +- mindspore/lite/src/ops/batch_to_space.cc | 28 +- mindspore/lite/src/ops/batch_to_space.h | 10 +- mindspore/lite/src/ops/bias_add.cc | 52 +- mindspore/lite/src/ops/bias_add.h | 10 +- mindspore/lite/src/ops/bias_grad.cc | 21 +- mindspore/lite/src/ops/bias_grad.h | 9 +- mindspore/lite/src/ops/bn_grad_input.cc | 16 +- mindspore/lite/src/ops/bn_grad_input.h | 9 +- mindspore/lite/src/ops/broadcast_to.cc | 21 +- mindspore/lite/src/ops/broadcast_to.h | 9 +- mindspore/lite/src/ops/cast.cc | 16 +- mindspore/lite/src/ops/cast.h | 9 +- 
mindspore/lite/src/ops/ceil.h | 14 +- mindspore/lite/src/ops/clip.cc | 16 +- mindspore/lite/src/ops/clip.h | 9 +- mindspore/lite/src/ops/concat.cc | 54 +- mindspore/lite/src/ops/concat.h | 11 +- mindspore/lite/src/ops/constant_of_shape.cc | 15 +- mindspore/lite/src/ops/constant_of_shape.h | 7 +- mindspore/lite/src/ops/conv2d.cc | 64 +- mindspore/lite/src/ops/conv2d.h | 41 +- mindspore/lite/src/ops/conv2d_grad_filter.cc | 34 +- mindspore/lite/src/ops/conv2d_grad_filter.h | 39 +- mindspore/lite/src/ops/conv2d_grad_input.cc | 34 +- mindspore/lite/src/ops/conv2d_grad_input.h | 39 +- mindspore/lite/src/ops/cos.cc | 32 + mindspore/lite/src/ops/cos.h | 6 +- mindspore/lite/src/ops/crop.cc | 22 +- mindspore/lite/src/ops/crop.h | 10 +- mindspore/lite/src/ops/deconv2d.cc | 46 +- mindspore/lite/src/ops/deconv2d.h | 39 +- mindspore/lite/src/ops/dedepthwise_conv2d.cc | 33 +- mindspore/lite/src/ops/dedepthwise_conv2d.h | 37 +- mindspore/lite/src/ops/depth_to_space.cc | 16 +- mindspore/lite/src/ops/depth_to_space.h | 9 +- mindspore/lite/src/ops/depthwise_conv2d.cc | 46 +- mindspore/lite/src/ops/depthwise_conv2d.h | 41 +- mindspore/lite/src/ops/dequant.cc | 28 +- mindspore/lite/src/ops/dequant.h | 5 +- .../lite/src/ops/detection_post_process.cc | 30 +- .../lite/src/ops/detection_post_process.h | 31 +- mindspore/lite/src/ops/div.cc | 15 +- mindspore/lite/src/ops/div.h | 8 +- mindspore/lite/src/ops/dropout.cc | 15 +- mindspore/lite/src/ops/dropout.h | 10 +- mindspore/lite/src/ops/eltwise.cc | 15 +- mindspore/lite/src/ops/eltwise.h | 8 +- mindspore/lite/src/ops/elu.cc | 15 +- mindspore/lite/src/ops/elu.h | 8 +- mindspore/lite/src/ops/embedding_lookup.cc | 15 +- mindspore/lite/src/ops/embedding_lookup.h | 8 +- .../lite/src/ops/embedding_lookup_sparse.cc | 30 +- .../lite/src/ops/embedding_lookup_sparse.h | 12 +- mindspore/lite/src/ops/equal.cc | 33 + mindspore/lite/src/ops/equal.h | 5 +- mindspore/lite/src/ops/exp.cc | 33 + mindspore/lite/src/ops/exp.h | 5 +- mindspore/lite/src/ops/expand_dims.cc | 14 +- mindspore/lite/src/ops/expand_dims.h | 8 +- .../src/ops/fake_quant_with_min_max_vars.cc | 16 +- .../src/ops/fake_quant_with_min_max_vars.h | 9 +- mindspore/lite/src/ops/fill.cc | 21 +- mindspore/lite/src/ops/fill.h | 9 +- mindspore/lite/src/ops/flatten.cc | 37 +- mindspore/lite/src/ops/flatten.h | 8 +- mindspore/lite/src/ops/floor.cc | 34 + mindspore/lite/src/ops/floor.h | 7 +- mindspore/lite/src/ops/floor_div.cc | 34 + mindspore/lite/src/ops/floor_div.h | 5 +- mindspore/lite/src/ops/floor_mod.cc | 34 + mindspore/lite/src/ops/floor_mod.h | 5 +- mindspore/lite/src/ops/full_connection.cc | 18 +- mindspore/lite/src/ops/full_connection.h | 13 +- mindspore/lite/src/ops/fused_batchnorm.cc | 16 +- mindspore/lite/src/ops/fused_batchnorm.h | 11 +- mindspore/lite/src/ops/gather.cc | 19 +- mindspore/lite/src/ops/gather.h | 9 +- mindspore/lite/src/ops/gather_nd.cc | 14 +- mindspore/lite/src/ops/gather_nd.h | 8 +- mindspore/lite/src/ops/greater.cc | 33 + mindspore/lite/src/ops/greater.h | 5 +- mindspore/lite/src/ops/greater_equal.cc | 32 + mindspore/lite/src/ops/greater_equal.h | 5 +- mindspore/lite/src/ops/l2_norm.cc | 21 +- mindspore/lite/src/ops/l2_norm.h | 10 +- mindspore/lite/src/ops/leaky_relu.cc | 14 +- mindspore/lite/src/ops/leaky_relu.h | 8 +- .../lite/src/ops/{caffe_p_relu.cc => less.cc} | 19 +- mindspore/lite/src/ops/less.h | 5 +- mindspore/lite/src/ops/less_equal.cc | 33 + mindspore/lite/src/ops/less_equal.h | 5 +- .../src/ops/local_response_normalization.cc | 20 +- .../src/ops/local_response_normalization.h | 
13 +- mindspore/lite/src/ops/log.cc | 33 + mindspore/lite/src/ops/log.h | 5 +- mindspore/lite/src/ops/logical_and.cc | 33 + mindspore/lite/src/ops/logical_and.h | 7 +- mindspore/lite/src/ops/logical_not.cc | 33 + mindspore/lite/src/ops/logical_not.h | 7 +- mindspore/lite/src/ops/logical_or.cc | 33 + mindspore/lite/src/ops/logical_or.h | 7 +- mindspore/lite/src/ops/lrn.cc | 17 +- mindspore/lite/src/ops/lrn.h | 13 +- mindspore/lite/src/ops/lstm.cc | 15 +- mindspore/lite/src/ops/lstm.h | 8 +- mindspore/lite/src/ops/make_tuple.cc | 37 +- mindspore/lite/src/ops/make_tuple.h | 7 +- mindspore/lite/src/ops/matmul.cc | 50 +- mindspore/lite/src/ops/matmul.h | 12 +- mindspore/lite/src/ops/matrix_diag.cc | 18 +- mindspore/lite/src/ops/matrix_diag.h | 13 +- mindspore/lite/src/ops/maximum.cc | 33 + mindspore/lite/src/ops/maximum.h | 7 +- mindspore/lite/src/ops/mean.cc | 22 +- mindspore/lite/src/ops/mean.h | 10 +- mindspore/lite/src/ops/minimum.cc | 33 + mindspore/lite/src/ops/minimum.h | 7 +- mindspore/lite/src/ops/mul.cc | 43 +- mindspore/lite/src/ops/mul.h | 10 +- mindspore/lite/src/ops/nchw2nhwc.cc | 12 + mindspore/lite/src/ops/nchw2nhwc.h | 5 +- mindspore/lite/src/ops/nhwc2nchw.cc | 13 + mindspore/lite/src/ops/nhwc2nchw.h | 5 +- mindspore/lite/src/ops/not_equal.cc | 33 + mindspore/lite/src/ops/not_equal.h | 7 +- mindspore/lite/src/ops/one_hot.cc | 14 +- mindspore/lite/src/ops/one_hot.h | 8 +- mindspore/lite/src/ops/p_relu.cc | 51 ++ .../lite/src/ops/{caffe_p_relu.h => p_relu.h} | 21 +- mindspore/lite/src/ops/pad.cc | 22 +- mindspore/lite/src/ops/pad.h | 12 +- mindspore/lite/src/ops/permute.cc | 19 + mindspore/lite/src/ops/permute.h | 7 +- mindspore/lite/src/ops/pooling.cc | 133 +-- mindspore/lite/src/ops/pooling.h | 38 +- mindspore/lite/src/ops/pooling_grad.cc | 29 +- mindspore/lite/src/ops/pooling_grad.h | 31 +- mindspore/lite/src/ops/power.cc | 17 +- mindspore/lite/src/ops/power.h | 11 +- mindspore/lite/src/ops/power_grad.cc | 18 +- mindspore/lite/src/ops/power_grad.h | 11 +- mindspore/lite/src/ops/primitive_c.cc | 241 ++--- mindspore/lite/src/ops/primitive_c.h | 74 +- mindspore/lite/src/ops/prior_box.cc | 48 +- mindspore/lite/src/ops/prior_box.h | 28 +- mindspore/lite/src/ops/quant.cc | 29 +- mindspore/lite/src/ops/quant.h | 5 +- mindspore/lite/src/ops/quant_dtype_cast.cc | 16 +- mindspore/lite/src/ops/quant_dtype_cast.h | 10 +- mindspore/lite/src/ops/range.cc | 18 +- mindspore/lite/src/ops/range.h | 13 +- mindspore/lite/src/ops/rank.cc | 12 +- mindspore/lite/src/ops/rank.h | 5 +- mindspore/lite/src/ops/reduce.cc | 87 +- mindspore/lite/src/ops/reduce.h | 14 +- mindspore/lite/src/ops/reshape.cc | 80 +- mindspore/lite/src/ops/reshape.h | 12 +- mindspore/lite/src/ops/resize.cc | 26 +- mindspore/lite/src/ops/resize.h | 17 +- mindspore/lite/src/ops/return.cc | 79 ++ mindspore/lite/src/ops/{prelu.h => return.h} | 22 +- mindspore/lite/src/ops/reverse.cc | 21 +- mindspore/lite/src/ops/reverse.h | 9 +- mindspore/lite/src/ops/reverse_sequence.cc | 22 +- mindspore/lite/src/ops/reverse_sequence.h | 12 +- mindspore/lite/src/ops/roi_pooling.cc | 17 +- mindspore/lite/src/ops/roi_pooling.h | 11 +- mindspore/lite/src/ops/round.cc | 33 + mindspore/lite/src/ops/round.h | 7 +- mindspore/lite/src/ops/{prelu.cc => rsqrt.cc} | 18 +- mindspore/lite/src/ops/rsqrt.h | 7 +- mindspore/lite/src/ops/scale.cc | 15 +- mindspore/lite/src/ops/scale.h | 8 +- mindspore/lite/src/ops/scatter_nd.cc | 12 + mindspore/lite/src/ops/scatter_nd.h | 5 +- mindspore/lite/src/ops/shape.cc | 13 + mindspore/lite/src/ops/shape.h | 7 +- 
mindspore/lite/src/ops/sin.cc | 34 + mindspore/lite/src/ops/sin.h | 7 +- mindspore/lite/src/ops/slice.cc | 67 +- mindspore/lite/src/ops/slice.h | 21 +- mindspore/lite/src/ops/softmax.cc | 15 +- mindspore/lite/src/ops/softmax.h | 8 +- .../lite/src/ops/softmax_cross_entropy.cc | 21 +- .../lite/src/ops/softmax_cross_entropy.h | 9 +- mindspore/lite/src/ops/space_to_batch.cc | 32 +- mindspore/lite/src/ops/space_to_batch.h | 13 +- mindspore/lite/src/ops/space_to_batch_nd.cc | 80 +- mindspore/lite/src/ops/space_to_batch_nd.h | 13 +- mindspore/lite/src/ops/space_to_depth.cc | 16 +- mindspore/lite/src/ops/space_to_depth.h | 9 +- mindspore/lite/src/ops/sparse_to_dense.cc | 36 +- mindspore/lite/src/ops/sparse_to_dense.h | 14 +- mindspore/lite/src/ops/split.cc | 22 +- mindspore/lite/src/ops/split.h | 12 +- mindspore/lite/src/ops/sqrt.cc | 34 + mindspore/lite/src/ops/sqrt.h | 7 +- mindspore/lite/src/ops/square.cc | 34 + mindspore/lite/src/ops/square.h | 7 +- mindspore/lite/src/ops/squared_difference.cc | 34 + mindspore/lite/src/ops/squared_difference.h | 7 +- mindspore/lite/src/ops/squeeze.cc | 21 +- mindspore/lite/src/ops/squeeze.h | 9 +- mindspore/lite/src/ops/stack.cc | 23 +- mindspore/lite/src/ops/stack.h | 12 +- mindspore/lite/src/ops/strided_slice.cc | 49 +- mindspore/lite/src/ops/strided_slice.h | 24 +- mindspore/lite/src/ops/sub.cc | 15 +- mindspore/lite/src/ops/sub.h | 8 +- mindspore/lite/src/ops/tile.cc | 29 +- mindspore/lite/src/ops/tile.h | 11 +- mindspore/lite/src/ops/topk.cc | 16 +- mindspore/lite/src/ops/topk.h | 9 +- mindspore/lite/src/ops/transpose.cc | 75 +- mindspore/lite/src/ops/transpose.h | 12 +- mindspore/lite/src/ops/tuple_get_item.cc | 38 +- mindspore/lite/src/ops/tuple_get_item.h | 6 +- mindspore/lite/src/ops/unique.cc | 15 +- mindspore/lite/src/ops/unique.h | 8 +- mindspore/lite/src/ops/unsqueeze.cc | 23 +- mindspore/lite/src/ops/unsqueeze.h | 9 +- mindspore/lite/src/ops/unstack.cc | 16 +- mindspore/lite/src/ops/unstack.h | 9 +- mindspore/lite/src/ops/upsample.cc | 22 +- mindspore/lite/src/ops/upsample.h | 10 +- mindspore/lite/src/ops/where.cc | 21 +- mindspore/lite/src/ops/where.h | 9 +- mindspore/lite/src/ops/zeros_like.cc | 15 + mindspore/lite/src/ops/zeros_like.h | 4 +- mindspore/lite/src/param_value_lite.h | 4 +- mindspore/lite/src/populate_parameter.cc | 522 ++++++----- mindspore/lite/src/runtime/allocator.h | 2 +- .../kernel/arm/base/convolution_base.cc | 23 +- .../kernel/arm/base/leaky_relu_base.cc | 6 +- .../runtime/kernel/arm/base/matmul_base.cc | 31 +- .../runtime/kernel/arm/base/pooling_base.cc | 6 + .../src/runtime/kernel/arm/base/prior_box.cc | 14 +- .../kernel/arm/base/quant_dtype_cast.cc | 4 +- .../runtime/kernel/arm/base/resize_base.cc | 3 +- .../kernel/arm/fp16/activation_fp16.cc | 156 ++++ .../runtime/kernel/arm/fp16/activation_fp16.h | 52 ++ .../kernel/arm/fp16/arithmetic_fp16.cc | 486 ++++------- .../runtime/kernel/arm/fp16/arithmetic_fp16.h | 36 +- .../runtime/kernel/arm/fp16/batchnorm_fp16.cc | 95 +- .../runtime/kernel/arm/fp16/batchnorm_fp16.h | 11 +- .../src/runtime/kernel/arm/fp16/cast_fp16.cc | 6 +- .../kernel/arm/fp16/convolution_1x1_fp16.cc | 45 +- .../kernel/arm/fp16/convolution_3x3_fp16.cc | 73 +- .../kernel/arm/fp16/convolution_3x3_fp16.h | 9 +- .../arm/fp16/convolution_depthwise_fp16.cc | 111 +-- .../arm/fp16/convolution_depthwise_fp16.h | 12 +- .../convolution_depthwise_slidewindow_fp16.cc | 190 ++++ .../convolution_depthwise_slidewindow_fp16.h | 61 ++ .../kernel/arm/fp16/convolution_fp16.cc | 67 +- .../kernel/arm/fp16/convolution_fp16.h | 26 
+- .../kernel/arm/fp16/convolution_sw_fp16.cc | 10 +- .../arm/fp16/convolution_winograd_fp16.cc | 70 +- .../arm/fp16/convolution_winograd_fp16.h | 11 +- .../arm/fp16/deconvolution_depthwise_fp16.cc | 39 +- .../arm/fp16/deconvolution_depthwise_fp16.h | 1 - .../kernel/arm/fp16/deconvolution_fp16.cc | 4 +- .../runtime/kernel/arm/fp16/pooling_fp16.cc | 61 +- .../runtime/kernel/arm/fp16/pooling_fp16.h | 10 +- .../runtime/kernel/arm/fp16/reduce_fp16.cc | 4 +- .../src/runtime/kernel/arm/fp16/reduce_fp16.h | 1 - .../runtime/kernel/arm/fp16/reshape_fp16.cc | 29 +- .../runtime/kernel/arm/fp16/softmax_fp16.cc | 156 ++++ .../runtime/kernel/arm/fp16/softmax_fp16.h | 47 + .../src/runtime/kernel/arm/fp16/split_fp16.cc | 6 +- .../runtime/kernel/arm/fp16/transpose_fp16.cc | 8 +- .../src/runtime/kernel/arm/fp32/activation.cc | 4 +- .../lite/src/runtime/kernel/arm/fp32/addn.cc | 8 +- .../src/runtime/kernel/arm/fp32/arithmetic.cc | 137 ++- .../src/runtime/kernel/arm/fp32/arithmetic.h | 26 +- .../kernel/arm/fp32/arithmetic_self.cc | 4 +- .../runtime/kernel/arm/fp32/arithmetic_self.h | 3 +- .../src/runtime/kernel/arm/fp32/batchnorm.cc | 4 +- .../src/runtime/kernel/arm/fp32/batchnorm.h | 2 +- .../lite/src/runtime/kernel/arm/fp32/cast.cc | 6 +- .../kernel/arm/fp32/constant_of_shape.cc | 10 +- .../runtime/kernel/arm/fp32/convolution.cc | 83 +- .../src/runtime/kernel/arm/fp32/convolution.h | 12 +- .../kernel/arm/fp32/convolution_1x1.cc | 70 +- .../kernel/arm/fp32/convolution_3x3.cc | 82 +- .../runtime/kernel/arm/fp32/convolution_3x3.h | 16 +- .../kernel/arm/fp32/convolution_depthwise.cc | 28 +- .../arm/fp32/convolution_depthwise_3x3.cc | 218 ----- .../fp32/convolution_depthwise_slidewindow.cc | 43 +- .../fp32/convolution_depthwise_slidewindow.h | 1 - .../arm/fp32/convolution_slidewindow.cc | 10 +- .../kernel/arm/fp32/convolution_winograd.cc | 91 +- .../kernel/arm/fp32/convolution_winograd.h | 14 +- .../lite/src/runtime/kernel/arm/fp32/crop.cc | 6 +- .../runtime/kernel/arm/fp32/deconvolution.cc | 81 +- .../runtime/kernel/arm/fp32/deconvolution.h | 1 - .../arm/fp32/deconvolution_depthwise.cc | 49 +- .../kernel/arm/fp32/deconvolution_depthwise.h | 1 - .../lite/src/runtime/kernel/arm/fp32/elu.cc | 4 +- .../kernel/arm/fp32/embedding_lookup.cc | 4 +- .../src/runtime/kernel/arm/fp32/expandDims.cc | 4 +- .../src/runtime/kernel/arm/fp32/expandDims.h | 3 +- .../lite/src/runtime/kernel/arm/fp32/fill.cc | 10 +- .../lite/src/runtime/kernel/arm/fp32/fill.h | 3 +- .../runtime/kernel/arm/fp32/fullconnection.cc | 73 +- .../runtime/kernel/arm/fp32/fullconnection.h | 4 +- .../src/runtime/kernel/arm/fp32/gather.cc | 74 +- .../lite/src/runtime/kernel/arm/fp32/gather.h | 10 +- .../src/runtime/kernel/arm/fp32/gatherNd.cc | 4 +- .../src/runtime/kernel/arm/fp32/gatherNd.h | 3 +- .../src/runtime/kernel/arm/fp32/leaky_relu.cc | 6 +- .../kernel/arm/fp32/local_response_norm.cc | 4 +- .../src/runtime/kernel/arm/fp32/matmul.cc | 138 ++- .../lite/src/runtime/kernel/arm/fp32/matmul.h | 6 +- .../src/runtime/kernel/arm/fp32/one_hot.cc | 4 +- .../lite/src/runtime/kernel/arm/fp32/pad.cc | 4 +- .../src/runtime/kernel/arm/fp32/pooling.cc | 28 +- .../lite/src/runtime/kernel/arm/fp32/power.cc | 4 +- .../lite/src/runtime/kernel/arm/fp32/power.h | 2 - .../lite/src/runtime/kernel/arm/fp32/prelu.cc | 10 +- .../lite/src/runtime/kernel/arm/fp32/range.cc | 6 - .../lite/src/runtime/kernel/arm/fp32/rank.cc | 6 - .../src/runtime/kernel/arm/fp32/reduce.cc | 6 +- .../src/runtime/kernel/arm/fp32/resize.cc | 104 ++- .../lite/src/runtime/kernel/arm/fp32/resize.h | 14 +- 
.../src/runtime/kernel/arm/fp32/reverse.cc | 4 +- .../src/runtime/kernel/arm/fp32/reverse.h | 3 +- .../runtime/kernel/arm/fp32/roi_pooling.cc | 4 +- .../lite/src/runtime/kernel/arm/fp32/scale.cc | 51 +- .../lite/src/runtime/kernel/arm/fp32/scale.h | 2 +- .../src/runtime/kernel/arm/fp32/scatter_nd.cc | 6 +- .../lite/src/runtime/kernel/arm/fp32/shape.cc | 4 - .../lite/src/runtime/kernel/arm/fp32/slice.cc | 17 +- .../runtime/kernel/arm/fp32/space_to_batch.cc | 136 +-- .../runtime/kernel/arm/fp32/space_to_batch.h | 16 +- .../runtime/kernel/arm/fp32/space_to_depth.cc | 4 +- .../kernel/arm/fp32/sparse_to_dense.cc | 4 +- .../lite/src/runtime/kernel/arm/fp32/split.cc | 4 +- .../src/runtime/kernel/arm/fp32/squeeze.cc | 5 - .../src/runtime/kernel/arm/fp32/transpose.cc | 8 +- .../src/runtime/kernel/arm/fp32/unsqueeze.cc | 4 +- .../lite/src/runtime/kernel/arm/fp32/where.cc | 4 +- .../src/runtime/kernel/arm/fp32/zeroslike.cc | 3 - .../kernel/arm/fp32_grad/activation_grad.cc | 4 +- .../kernel/arm/fp32_grad/arithmetic_grad.cc | 2 +- .../kernel/arm/fp32_grad/pooling_grad.cc | 2 +- .../src/runtime/kernel/arm/int8/add_int8.cc | 8 +- .../src/runtime/kernel/arm/int8/add_int8.h | 2 +- .../kernel/arm/int8/arithmetic_int8.cc | 8 +- .../kernel/arm/int8/arithmetic_self_int8.cc | 4 +- .../kernel/arm/int8/arithmetic_self_int8.h | 3 +- .../runtime/kernel/arm/int8/batchnorm_int8.cc | 4 +- .../runtime/kernel/arm/int8/concat_int8.cc | 4 +- .../src/runtime/kernel/arm/int8/concat_int8.h | 2 +- .../kernel/arm/int8/convolution_1x1_int8.cc | 351 ++++++++ .../kernel/arm/int8/convolution_1x1_int8.h | 76 ++ .../kernel/arm/int8/convolution_3x3_int8.cc | 41 +- .../arm/int8/convolution_depthwise_int8.cc | 134 ++- .../arm/int8/convolution_depthwise_int8.h | 10 +- .../convolution_depthwise_slidewindow_int8.cc | 182 ++++ .../convolution_depthwise_slidewindow_int8.h} | 28 +- .../kernel/arm/int8/convolution_int8.cc | 17 +- .../src/runtime/kernel/arm/int8/crop_int8.cc | 4 +- .../src/runtime/kernel/arm/int8/crop_int8.h | 2 +- .../arm/int8/deconvolution_depthwise_int8.cc | 34 +- .../arm/int8/deconvolution_depthwise_int8.h | 1 - .../kernel/arm/int8/deconvolution_int8.cc | 4 +- .../src/runtime/kernel/arm/int8/div_int8.cc | 6 +- .../kernel/arm/int8/fullconnection_int8.cc | 87 +- .../kernel/arm/int8/fullconnection_int8.h | 36 +- .../runtime/kernel/arm/int8/gatherNd_int8.cc | 166 ++++ .../runtime/kernel/arm/int8/gatherNd_int8.h | 51 ++ .../runtime/kernel/arm/int8/gather_int8.cc | 163 ++++ .../src/runtime/kernel/arm/int8/gather_int8.h | 51 ++ .../runtime/kernel/arm/int8/hswish_int8.cc | 4 +- .../kernel/arm/int8/leaky_relu_int8.cc | 20 +- .../runtime/kernel/arm/int8/leaky_relu_int8.h | 1 - .../runtime/kernel/arm/int8/matmul_int8.cc | 118 +-- .../src/runtime/kernel/arm/int8/matmul_int8.h | 59 +- .../src/runtime/kernel/arm/int8/mul_int8.cc | 8 +- .../src/runtime/kernel/arm/int8/mul_int8.h | 2 +- .../src/runtime/kernel/arm/int8/pad_int8.cc | 4 +- .../runtime/kernel/arm/int8/pooling_int8.cc | 14 +- .../src/runtime/kernel/arm/int8/power_int8.cc | 4 +- .../runtime/kernel/arm/int8/reduce_int8.cc | 6 +- .../src/runtime/kernel/arm/int8/relux_int8.cc | 4 +- .../runtime/kernel/arm/int8/reshape_int8.cc | 4 +- .../runtime/kernel/arm/int8/reshape_int8.h | 2 +- .../runtime/kernel/arm/int8/resize_int8.cc | 48 +- .../runtime/kernel/arm/int8/sigmoid_int8.cc | 4 +- .../src/runtime/kernel/arm/int8/slice_int8.cc | 4 +- .../runtime/kernel/arm/int8/softmax_int8.cc | 4 +- .../src/runtime/kernel/arm/int8/split_int8.cc | 4 +- .../runtime/kernel/arm/int8/squeeze_int8.cc | 
4 +- .../runtime/kernel/arm/int8/squeeze_int8.h | 2 +- .../src/runtime/kernel/arm/int8/sub_int8.cc | 6 +- .../runtime/kernel/arm/int8/unsqueeze_int8.cc | 4 +- .../runtime/kernel/arm/int8/unsqueeze_int8.h | 4 +- .../src/runtime/kernel/opencl/CMakeLists.txt | 18 +- .../src/runtime/kernel/opencl/cl/batchnorm.cl | 13 +- .../src/runtime/kernel/opencl/cl/biasadd.cl | 31 + .../runtime/kernel/opencl/cl/caffe_prelu.cl | 23 - .../src/runtime/kernel/opencl/cl/concat.cl | 22 +- .../kernel/opencl/cl/conv2d_transpose2x2.cl | 1 + .../kernel/opencl/cl/depthwise_conv2d.cl | 171 ++-- .../src/runtime/kernel/opencl/cl/matmul.cl | 1 + .../src/runtime/kernel/opencl/cl/prelu.cl | 30 + .../src/runtime/kernel/opencl/cl/reshape.cl | 1 + .../src/runtime/kernel/opencl/cl/slice.cl | 29 +- .../src/runtime/kernel/opencl/cl/to_format.cl | 131 +-- .../src/runtime/kernel/opencl/cl/transpose.cl | 9 +- .../kernel/opencl/kernel/activation.cc | 91 +- .../runtime/kernel/opencl/kernel/activation.h | 3 + .../kernel/opencl/kernel/arithmetic.cc | 6 +- .../runtime/kernel/opencl/kernel/batchnorm.cc | 10 +- .../runtime/kernel/opencl/kernel/biasadd.cc | 167 ++++ .../kernel/{caffe_prelu.h => biasadd.h} | 21 +- .../kernel/opencl/kernel/caffe_prelu.cc | 152 ---- .../runtime/kernel/opencl/kernel/concat.cc | 10 +- .../kernel/opencl/kernel/conv2d_transpose.cc | 74 +- .../kernel/opencl/kernel/conv2d_transpose.h | 6 +- .../kernel/opencl/kernel/convolution.cc | 202 ++--- .../kernel/opencl/kernel/convolution.h | 6 +- .../kernel/opencl/kernel/depthwise_conv2d.cc | 92 +- .../kernel/opencl/kernel/depthwise_conv2d.h | 5 +- .../runtime/kernel/opencl/kernel/matmul.cc | 69 +- .../src/runtime/kernel/opencl/kernel/matmul.h | 8 +- .../runtime/kernel/opencl/kernel/pooling2d.cc | 6 +- .../src/runtime/kernel/opencl/kernel/prelu.cc | 50 +- .../src/runtime/kernel/opencl/kernel/prelu.h | 2 + .../runtime/kernel/opencl/kernel/reshape.cc | 11 +- .../runtime/kernel/opencl/kernel/reshape.h | 1 + .../src/runtime/kernel/opencl/kernel/slice.cc | 11 +- .../runtime/kernel/opencl/kernel/softmax.cc | 2 +- .../runtime/kernel/opencl/kernel/to_format.cc | 18 +- .../runtime/kernel/opencl/kernel/transpose.cc | 11 +- .../runtime/kernel/opencl/kernel/transpose.h | 3 +- .../kernel/opencl/subgraph_opencl_kernel.cc | 27 +- .../kernel/opencl/subgraph_opencl_kernel.h | 1 - .../lite/src/runtime/kernel/opencl/utils.h | 68 ++ .../src/runtime/opencl/opencl_allocator.cc | 20 +- .../src/runtime/opencl/opencl_allocator.h | 5 +- .../src/runtime/opencl/opencl_executor.cc | 2 +- .../lite/src/runtime/opencl/opencl_runtime.cc | 283 +++--- .../lite/src/runtime/opencl/opencl_runtime.h | 93 +- .../lite/src/runtime/opencl/opencl_wrapper.cc | 21 +- .../lite/src/runtime/opencl/opencl_wrapper.h | 3 +- .../lite/src/runtime/parallel_executor.cc | 64 +- .../lite/src/runtime/parallel_executor.h | 5 - mindspore/lite/src/runtime/runtime_api.cc | 63 +- mindspore/lite/src/runtime/runtime_api.h | 23 +- mindspore/lite/src/runtime/thread_pool.c | 796 +++++++++++++++++ mindspore/lite/src/runtime/thread_pool.cc | 464 ---------- mindspore/lite/src/runtime/thread_pool.h | 150 ++-- mindspore/lite/src/scheduler.cc | 35 +- mindspore/lite/src/scheduler.h | 3 +- mindspore/lite/test/CMakeLists.txt | 9 +- mindspore/lite/test/models_caffe.cfg | 14 +- mindspore/lite/test/models_fp16.cfg | 6 +- mindspore/lite/test/models_tflite.cfg | 10 + .../lite/test/models_tflite_awaretraining.cfg | 6 + mindspore/lite/test/run_benchmark_nets.sh | 308 ++++--- mindspore/lite/test/ut/src/infer_test.cc | 8 +- 
.../runtime/kernel/arm/common/pack_tests.cc | 4 +- .../kernel/arm/fp16/convolution_fp16_tests.cc | 8 +- .../kernel/arm/fp32/conv1x1_fp32_tests.cc | 28 +- .../fp32/convolution_depthwise_fp32_tests.cc | 4 +- .../arm/fp32/deconvolution_fp32_tests.cc | 18 +- .../kernel/arm/fp32/matmul_fp32_tests.cc | 2 - .../arm/fp32/space_to_batch_fp32_tests.cc | 271 +++--- .../kernel/arm/fp32/strassen_fp32_tests.cc | 369 -------- .../kernel/arm/fp32/topk_fp32_tests.cc | 3 +- .../kernel/arm/fp32/transpose_fp32_tests.cc | 6 +- .../fp32_grad/convolution_grad_fp32_tests.cc | 7 +- .../arm/fp32_grad/pooling_grad_fp32_tests.cc | 11 +- .../kernel/arm/int8/conv_1x1_int8_tests.cc | 281 ++++++ .../kernel/arm/int8/deconv_int8_tests.cc | 50 +- .../arm/int8/fullconnection_int8_tests.cc | 191 ++-- .../kernel/arm/int8/gatherNd_int8_test.cc | 101 +++ .../kernel/arm/int8/gather_int8_test.cc | 99 +++ .../kernel/arm/int8/matmul_int8_tests.cc | 335 +++++-- .../kernel/arm/int8/prelu_int8_tests.cc | 4 +- .../arm/int8/resize_bilinear_int8_tests.cc | 71 +- .../runtime/kernel/opencl/activation_tests.cc | 397 +++++++-- .../kernel/opencl/avg_pooling_tests.cc | 4 +- .../runtime/kernel/opencl/batchnorm_tests.cc | 157 +++- ...{caffe_prelu_tests.cc => biasadd_tests.cc} | 116 +-- .../src/runtime/kernel/opencl/concat_tests.cc | 154 +++- .../kernel/opencl/conv2d_transpose_tests.cc | 212 +++-- .../kernel/opencl/convolution_tests.cc | 112 +-- .../kernel/opencl/depthwise_conv2d_tests.cc | 497 ++++------- .../src/runtime/kernel/opencl/matmul_tests.cc | 139 ++- .../kernel/opencl/max_pooling_tests.cc | 5 +- .../src/runtime/kernel/opencl/prelu_tests.cc | 19 +- .../runtime/kernel/opencl/reshape_tests.cc | 73 +- .../src/runtime/kernel/opencl/slice_tests.cc | 127 ++- .../runtime/kernel/opencl/softmax_tests.cc | 2 +- .../runtime/kernel/opencl/transpose_tests.cc | 86 +- .../src/runtime/kernel/opencl/utils_tests.cc | 30 - .../src/runtime/kernel/opencl/utils_tests.h | 31 +- .../tflite/tflite_activation_parser_test.cc | 9 - .../tflite/tflite_parsers_test_utils.cc | 2 +- .../lite/tools/anf_exporter/anf_exporter.cc | 47 +- .../lite/tools/anf_exporter/anf_exporter.h | 4 +- .../anf_importer/import_from_protobuf.cc | 26 +- mindspore/lite/tools/benchmark/CMakeLists.txt | 4 +- mindspore/lite/tools/benchmark/benchmark.cc | 91 +- mindspore/lite/tools/benchmark/benchmark.h | 3 +- mindspore/lite/tools/common/graph_util.cc | 80 +- mindspore/lite/tools/common/graph_util.h | 11 - mindspore/lite/tools/common/node_util.cc | 2 +- mindspore/lite/tools/common/node_util.h | 2 +- .../protobuf_utils.cc} | 11 +- .../protobuf_utils.h} | 7 +- mindspore/lite/tools/common/tensor_util.cc | 10 +- mindspore/lite/tools/converter/CMakeLists.txt | 45 +- .../lite/tools/converter/anf_transform.cc | 41 +- .../lite/tools/converter/anf_transform.h | 12 +- mindspore/lite/tools/converter/converter.cc | 86 +- mindspore/lite/tools/converter/converter.h | 4 - .../lite/tools/converter/converter_flags.cc | 8 +- .../tools/converter/graphdef_transform.cc | 24 +- .../legacy_optimizer/fusion/CMakeLists.txt | 2 +- .../fusion/batchnorm_convert_scale_pass.h | 100 --- .../fusion/matmul_biasadd_fusion_pass.h | 2 +- .../fusion/mul_add_fusion_pass.cc | 148 ++++ .../fusion/mul_add_fusion_pass.h | 52 ++ .../legacy_optimizer/graph/CMakeLists.txt | 1 + .../batchnorm_convert_scale_pass.cc | 200 ++--- .../graph/batchnorm_convert_scale_pass.h | 66 ++ .../graph/dtype_trans_pass.cc | 2 +- .../legacy_optimizer/graph/dtype_trans_pass.h | 2 +- .../graph/eltwise_format_trans_pass.cc | 3 +- 
mindspore/lite/tools/converter/model_parser.h | 22 +- .../converter/parser/caffe/CMakeLists.txt | 4 +- .../converter/parser/caffe/caffe_converter.cc | 1 - .../parser/caffe/caffe_convolution_parser.cc | 10 +- .../parser/caffe/caffe_model_parser.cc | 51 +- .../parser/caffe/caffe_model_parser.h | 2 +- .../parser/caffe/caffe_prelu_parser.cc | 4 +- .../parser/caffe/caffe_relu6_parser.cc | 29 +- .../parser/caffe/caffe_relu6_parser.h | 10 +- .../parser/caffe/caffe_tanh_parser.cc | 22 +- .../parser/caffe/caffe_tanh_parser.h | 10 +- .../parser/onnx/onnx_model_parser.cc | 107 +-- .../converter/parser/onnx/onnx_model_parser.h | 103 +-- .../converter/parser/onnx/onnx_relu_parser.cc | 4 +- .../parser/onnx/onnx_slice_parser.cc | 32 +- .../parser/tflite/tflite_activation_parser.cc | 43 +- .../parser/tflite/tflite_activation_parser.h | 12 - .../parser/tflite/tflite_arithmetic_parser.cc | 38 +- .../parser/tflite/tflite_model_parser.cc | 133 +-- .../parser/tflite/tflite_model_parser.h | 3 +- .../parser/tflite/tflite_pooling_parser.cc | 1 + .../parser/tflite/tflite_slice_parser.cc | 7 +- .../converter/parser/tflite/tflite_util.cc | 1 - .../converter/quantizer/aware_quantizer.cc | 3 +- .../quantizer/post_training_quantizer.cc | 446 +++++----- .../quantizer/post_training_quantizer.h | 68 +- .../tools/converter/quantizer/quant_cast.cc | 20 +- .../converter/quantizer/quantize_util.cc | 28 +- .../tools/converter/quantizer/quantize_util.h | 2 +- .../tools/converter/quantizer/quantizer.h | 2 +- .../lite/tools/optimizer/common/gllo_utils.cc | 15 +- .../lite/tools/optimizer/common/gllo_utils.h | 2 + .../fusion/constant_folding_fusion.cc | 101 +-- .../fusion/constant_folding_fusion.h | 5 +- .../fusion/conv_activation_fusion.cc | 35 +- .../optimizer/fusion/conv_activation_fusion.h | 2 +- .../optimizer/fusion/conv_biasadd_fusion.cc | 21 +- .../tools/optimizer/fusion/conv_bn_fusion.cc | 18 +- .../optimizer/fusion/conv_transform_fusion.cc | 29 +- .../fusion/conv_tuple_activation_fusion.cc | 99 +++ .../fusion/conv_tuple_activation_fusion.h | 40 + .../fusion/pooling_activation_fusion.cc | 79 ++ .../fusion/pooling_activation_fusion.h | 40 + .../lite/tools/time_profile/CMakeLists.txt | 9 +- .../lite/tools/time_profile/time_profile.cc | 25 +- model_zoo/official/lite/.gitignore | 81 ++ 716 files changed, 21003 insertions(+), 10916 deletions(-) create mode 100644 mindspore/lite/README.md create mode 100644 mindspore/lite/README_CN.md rename mindspore/lite/{src/runtime/kernel/arm => }/nnacl/README.md (100%) create mode 100644 mindspore/lite/nnacl/assembly/arm64/ConvDwInt8PostAlign4.S create mode 100644 mindspore/lite/nnacl/assembly/arm64/ConvDwInt8Row.S create mode 100644 mindspore/lite/nnacl/assembly/arm64/MatmulFp32Opt.S create mode 100644 mindspore/lite/nnacl/assembly/arm64/MatmulFp32OptRemain.S create mode 100644 mindspore/lite/nnacl/assembly/opt/ConvDwFp16Row.S create mode 100644 mindspore/lite/nnacl/assembly/opt/MatmulDpInt8.S create mode 100644 mindspore/lite/nnacl/fp16/activation_fp16.c create mode 100644 mindspore/lite/nnacl/fp16/activation_fp16.h delete mode 100644 mindspore/lite/nnacl/fp16/common_func.c delete mode 100644 mindspore/lite/nnacl/fp16/common_func.h create mode 100644 mindspore/lite/nnacl/fp16/softmax_fp16.c create mode 100644 mindspore/lite/nnacl/fp16/softmax_fp16.h create mode 100644 mindspore/lite/nnacl/fp32/scale.c create mode 100644 mindspore/lite/nnacl/fp32/scale.h delete mode 100644 mindspore/lite/nnacl/fp32/strassen_matmul.c delete mode 100644 mindspore/lite/nnacl/fp32/strassen_matmul.h rename 
mindspore/lite/nnacl/{strassen_matmul.h => gather_parameter.h} (56%) create mode 100644 mindspore/lite/nnacl/int8/gatherNd_int8.c create mode 100644 mindspore/lite/nnacl/int8/gatherNd_int8.h create mode 100644 mindspore/lite/nnacl/int8/gather_int8.c create mode 100644 mindspore/lite/nnacl/int8/gather_int8.h delete mode 100644 mindspore/lite/nnacl/scale.c mode change 100755 => 100644 mindspore/lite/src/common/graph_util.cc mode change 100755 => 100644 mindspore/lite/src/common/graph_util.h delete mode 100644 mindspore/lite/src/ops/CMakeLists.txt create mode 100644 mindspore/lite/src/ops/abs.cc create mode 100644 mindspore/lite/src/ops/cos.cc create mode 100644 mindspore/lite/src/ops/equal.cc create mode 100644 mindspore/lite/src/ops/exp.cc create mode 100644 mindspore/lite/src/ops/floor.cc create mode 100644 mindspore/lite/src/ops/floor_div.cc create mode 100644 mindspore/lite/src/ops/floor_mod.cc create mode 100644 mindspore/lite/src/ops/greater.cc create mode 100644 mindspore/lite/src/ops/greater_equal.cc rename mindspore/lite/src/ops/{caffe_p_relu.cc => less.cc} (62%) create mode 100644 mindspore/lite/src/ops/less_equal.cc create mode 100644 mindspore/lite/src/ops/log.cc create mode 100644 mindspore/lite/src/ops/logical_and.cc create mode 100644 mindspore/lite/src/ops/logical_not.cc create mode 100644 mindspore/lite/src/ops/logical_or.cc create mode 100644 mindspore/lite/src/ops/maximum.cc create mode 100644 mindspore/lite/src/ops/minimum.cc create mode 100644 mindspore/lite/src/ops/not_equal.cc create mode 100644 mindspore/lite/src/ops/p_relu.cc rename mindspore/lite/src/ops/{caffe_p_relu.h => p_relu.h} (67%) create mode 100644 mindspore/lite/src/ops/return.cc rename mindspore/lite/src/ops/{prelu.h => return.h} (59%) create mode 100644 mindspore/lite/src/ops/round.cc rename mindspore/lite/src/ops/{prelu.cc => rsqrt.cc} (61%) create mode 100644 mindspore/lite/src/ops/sin.cc create mode 100644 mindspore/lite/src/ops/sqrt.cc create mode 100644 mindspore/lite/src/ops/square.cc create mode 100644 mindspore/lite/src/ops/squared_difference.cc create mode 100644 mindspore/lite/src/runtime/kernel/arm/fp16/activation_fp16.cc create mode 100644 mindspore/lite/src/runtime/kernel/arm/fp16/activation_fp16.h create mode 100644 mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_slidewindow_fp16.cc create mode 100644 mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_slidewindow_fp16.h create mode 100644 mindspore/lite/src/runtime/kernel/arm/fp16/softmax_fp16.cc create mode 100644 mindspore/lite/src/runtime/kernel/arm/fp16/softmax_fp16.h delete mode 100644 mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise_3x3.cc create mode 100644 mindspore/lite/src/runtime/kernel/arm/int8/convolution_1x1_int8.cc create mode 100644 mindspore/lite/src/runtime/kernel/arm/int8/convolution_1x1_int8.h create mode 100644 mindspore/lite/src/runtime/kernel/arm/int8/convolution_depthwise_slidewindow_int8.cc rename mindspore/lite/src/runtime/kernel/arm/{fp32/convolution_depthwise_3x3.h => int8/convolution_depthwise_slidewindow_int8.h} (55%) create mode 100644 mindspore/lite/src/runtime/kernel/arm/int8/gatherNd_int8.cc create mode 100644 mindspore/lite/src/runtime/kernel/arm/int8/gatherNd_int8.h create mode 100644 mindspore/lite/src/runtime/kernel/arm/int8/gather_int8.cc create mode 100644 mindspore/lite/src/runtime/kernel/arm/int8/gather_int8.h create mode 100644 mindspore/lite/src/runtime/kernel/opencl/cl/biasadd.cl delete mode 100644 
mindspore/lite/src/runtime/kernel/opencl/cl/caffe_prelu.cl create mode 100644 mindspore/lite/src/runtime/kernel/opencl/cl/prelu.cl create mode 100644 mindspore/lite/src/runtime/kernel/opencl/kernel/biasadd.cc rename mindspore/lite/src/runtime/kernel/opencl/kernel/{caffe_prelu.h => biasadd.h} (65%) delete mode 100644 mindspore/lite/src/runtime/kernel/opencl/kernel/caffe_prelu.cc create mode 100644 mindspore/lite/src/runtime/thread_pool.c delete mode 100644 mindspore/lite/src/runtime/thread_pool.cc delete mode 100644 mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/strassen_fp32_tests.cc create mode 100644 mindspore/lite/test/ut/src/runtime/kernel/arm/int8/conv_1x1_int8_tests.cc create mode 100644 mindspore/lite/test/ut/src/runtime/kernel/arm/int8/gatherNd_int8_test.cc create mode 100644 mindspore/lite/test/ut/src/runtime/kernel/arm/int8/gather_int8_test.cc rename mindspore/lite/test/ut/src/runtime/kernel/opencl/{caffe_prelu_tests.cc => biasadd_tests.cc} (63%) rename mindspore/lite/tools/{converter/parser/caffe/caffe_parse_utils.cc => common/protobuf_utils.cc} (88%) rename mindspore/lite/tools/{converter/parser/caffe/caffe_parse_utils.h => common/protobuf_utils.h} (85%) delete mode 100644 mindspore/lite/tools/converter/legacy_optimizer/fusion/batchnorm_convert_scale_pass.h create mode 100644 mindspore/lite/tools/converter/legacy_optimizer/fusion/mul_add_fusion_pass.cc create mode 100644 mindspore/lite/tools/converter/legacy_optimizer/fusion/mul_add_fusion_pass.h rename mindspore/lite/tools/converter/legacy_optimizer/{fusion => graph}/batchnorm_convert_scale_pass.cc (62%) create mode 100644 mindspore/lite/tools/converter/legacy_optimizer/graph/batchnorm_convert_scale_pass.h mode change 100755 => 100644 mindspore/lite/tools/converter/parser/onnx/onnx_model_parser.cc create mode 100644 mindspore/lite/tools/optimizer/fusion/conv_tuple_activation_fusion.cc create mode 100644 mindspore/lite/tools/optimizer/fusion/conv_tuple_activation_fusion.h create mode 100644 mindspore/lite/tools/optimizer/fusion/pooling_activation_fusion.cc create mode 100644 mindspore/lite/tools/optimizer/fusion/pooling_activation_fusion.h create mode 100644 model_zoo/official/lite/.gitignore diff --git a/build.bat b/build.bat index 59aaee0778..e26019015b 100644 --- a/build.bat +++ b/build.bat @@ -16,20 +16,20 @@ @title mindspore_build SET BASEPATH=%CD% -IF NOT EXIST %BASEPATH%/build ( +IF NOT EXIST "%BASEPATH%/build" ( md "build" ) -cd %BASEPATH%/build +cd "%BASEPATH%/build" set BUILD_PATH=%CD% -IF NOT EXIST %BUILD_PATH%/mindspore ( +IF NOT EXIST "%BUILD_PATH%/mindspore" ( md "mindspore" ) -cd %CD%/mindspore +cd "%CD%/mindspore" -IF "%2%" == "lite" ( +IF "%1%" == "lite" ( call :gene_gtest call :run_cmake IF errorlevel 1 ( @@ -47,14 +47,17 @@ IF "%2%" == "lite" ( ) cd %BUILD_PATH%/mindspore - IF "%1%" == "" ( - cmake --build . -- -j6 + IF "%2%" == "" ( + cmake --build . --target package -- -j6 ) ELSE ( - cmake --build . -- -j%1% + cmake --build . --target package -- -j%2% ) IF errorlevel 1 ( echo "build fail." 
goto run_fail + ) ELSE ( + cd "%BASEPATH%/output" + rd /s /q _CPack_Packages ) ) ELSE ( cmake -DCMAKE_BUILD_TYPE=Release -DENABLE_CPU=ON -DENABLE_MINDDATA=ON -DUSE_GLOG=ON ^ @@ -75,40 +78,40 @@ IF "%2%" == "lite" ( ) ) -cd %BASEPATH% +cd "%BASEPATH%" goto run_eof :run_cmake - cd %BUILD_PATH%/mindspore + cd "%BUILD_PATH%/mindspore" cmake -DBUILD_DEVICE=on -DBUILD_CONVERTER=on -DPLATFORM_ARM64=off -DSUPPORT_TRAIN=off ^ -DCMAKE_BUILD_TYPE=Release -DSUPPORT_GPU=off -DBUILD_MINDDATA=off -DOFFLINE_COMPILE=off ^ - -G "CodeBlocks - MinGW Makefiles" %BASEPATH%/mindspore/lite + -G "CodeBlocks - MinGW Makefiles" "%BASEPATH%/mindspore/lite" GOTO:EOF :gene_gtest - cd %BASEPATH%/third_party + cd "%BASEPATH%/third_party" IF EXIST googletest rd /s /q googletest git submodule update --init --recursive googletest - cd %BUILD_PATH%/mindspore + cd "%BUILD_PATH%/mindspore" GOTO:EOF :gene_protobuf - SET PROTOC=%BASEPATH%/build/mindspore/_deps/protobuf-src/_build/protoc + SET PROTOC="%BASEPATH%/build/mindspore/_deps/protobuf-src/_build/protoc" - SET PROTO_SRC_DIR=%BASEPATH%/mindspore/lite/tools/converter/parser/caffe + SET PROTO_SRC_DIR="%BASEPATH%/mindspore/lite/tools/converter/parser/caffe" cd %PROTO_SRC_DIR% %PROTOC% *.proto --proto_path=%PROTO_SRC_DIR% --cpp_out=%PROTO_SRC_DIR% - SET PROTO_SRC_DIR=%BASEPATH%/mindspore/lite/tools/converter/parser/onnx + SET PROTO_SRC_DIR="%BASEPATH%/mindspore/lite/tools/converter/parser/onnx" cd %PROTO_SRC_DIR% %PROTOC% *.proto --proto_path=%PROTO_SRC_DIR% --cpp_out=%PROTO_SRC_DIR% cd %BUILD_PATH%/mindspore GOTO:EOF :gene_flatbuffer - SET FLATC=%BASEPATH%/build/mindspore/_deps/flatbuffers-src/_build/flatc - SET FLAT_DIR=%BASEPATH%/mindspore/lite/schema + SET FLATC="%BASEPATH%/build/mindspore/_deps/flatbuffers-src/_build/flatc" + SET FLAT_DIR="%BASEPATH%/mindspore/lite/schema" cd %FLAT_DIR% IF EXIST inner rd /s /q inner md inner @@ -116,14 +119,14 @@ GOTO:EOF %FLATC% -c -b *.fbs %FLATC% -c -b --reflect-types --gen-mutable --reflect-names --gen-object-api -o %FLAT_DIR%/inner *.fbs - SET FLAT_DIR=%BASEPATH%/mindspore/lite/tools/converter/parser/tflite + SET FLAT_DIR="%BASEPATH%/mindspore/lite/tools/converter/parser/tflite" cd %FLAT_DIR% %FLATC% -c -b --reflect-types --gen-mutable --reflect-names --gen-object-api -o %FLAT_DIR% *.fbs - cd %BUILD_PATH%/mindspore + cd "%BUILD_PATH%/mindspore" GOTO:EOF :run_fail - cd %BASEPATH% + cd "%BASEPATH%" set errorlevel=1 :run_eof diff --git a/build.sh b/build.sh index 9e03268835..3ed4cbc461 100755 --- a/build.sh +++ b/build.sh @@ -393,7 +393,7 @@ build_mindspore() CMAKE_VERBOSE="--verbose" fi cmake --build . --target package ${CMAKE_VERBOSE} -j$THREAD_NUM - echo "success to build mindspore project!" + echo "success building mindspore project!" 
} checkndk() { @@ -618,10 +618,12 @@ build_lite() if [[ "${COMPILE_RET}" -ne 0 ]]; then echo "---------------- mindspore lite: build failed ----------------" + exit 1 else mv ${BASEPATH}/output/tmp/*.tar.gz* ${BASEPATH}/output/ rm -rf ${BASEPATH}/output/tmp/ echo "---------------- mindspore lite: build success ----------------" + exit 0 fi } diff --git a/cmake/package_lite.cmake b/cmake/package_lite.cmake index 996ea06dfc..4fb60c021c 100644 --- a/cmake/package_lite.cmake +++ b/cmake/package_lite.cmake @@ -1,12 +1,18 @@ include(CMakePackageConfigHelpers) -set(LIB_DIR ${MAIN_DIR}/lib) -set(INC_DIR ${MAIN_DIR}/include) -set(TURBO_DIR ${MAIN_DIR}/third_party/libjpeg-turbo) -set(OPENCV_DIR ${MAIN_DIR}/third_party/opencv) -set(PROTOBF_DIR ${MAIN_DIR}/third_party/protobuf) -set(FLATBF_DIR ${MAIN_DIR}/third_party/flatbuffers) +set(LIB_DIR ${MAIN_DIR}-${COMPONENT_NAME}/lib) +set(INC_DIR ${MAIN_DIR}-${COMPONENT_NAME}/include) +set(TURBO_DIR ${MAIN_DIR}-${COMPONENT_NAME}/third_party/libjpeg-turbo) +set(OPENCV_DIR ${MAIN_DIR}-${COMPONENT_NAME}/third_party/opencv) +set(PROTOBF_DIR ${MAIN_DIR}-${COMPONENT_NAME}/third_party/protobuf) +set(FLATBF_DIR ${MAIN_DIR}-${COMPONENT_NAME}/third_party/flatbuffers) +set(LIB_DIR_RUN_X86 ${MAIN_DIR}-${RUN_X86_COMPONENT_NAME}/lib) +set(INC_DIR_RUN_X86 ${MAIN_DIR}-${RUN_X86_COMPONENT_NAME}/include) +set(TURBO_DIR_RUN_X86 ${MAIN_DIR}-${RUN_X86_COMPONENT_NAME}/third_party/libjpeg-turbo) +set(OPENCV_DIR_RUN_X86 ${MAIN_DIR}-${RUN_X86_COMPONENT_NAME}/third_party/opencv) +set(PROTOBF_DIR_RUN_X86 ${MAIN_DIR}-${RUN_X86_COMPONENT_NAME}/third_party/protobuf) +set(FLATBF_DIR_RUN_X86 ${MAIN_DIR}-${RUN_X86_COMPONENT_NAME}/third_party/flatbuffers) if (BUILD_MINDDATA) install(DIRECTORY ${TOP_DIR}/mindspore/ccsrc/minddata/dataset/include/ DESTINATION ${INC_DIR} COMPONENT ${COMPONENT_NAME} FILES_MATCHING PATTERN "*.h") install(FILES ${TOP_DIR}/mindspore/lite/build/minddata/libminddata-lite.so DESTINATION ${LIB_DIR} COMPONENT ${COMPONENT_NAME}) @@ -41,19 +47,40 @@ elseif (PLATFORM_ARM32) install(DIRECTORY ${TOP_DIR}/mindspore/lite/include/ DESTINATION ${INC_DIR} COMPONENT ${COMPONENT_NAME} FILES_MATCHING PATTERN "*.h") install(DIRECTORY ${TOP_DIR}/mindspore/lite/schema/ DESTINATION ${INC_DIR}/schema COMPONENT ${COMPONENT_NAME} FILES_MATCHING PATTERN "*.h" PATTERN "inner" EXCLUDE) install(DIRECTORY ${TOP_DIR}/third_party/flatbuffers/include DESTINATION ${FLATBF_DIR} COMPONENT ${COMPONENT_NAME}) +elseif (CMAKE_SYSTEM_NAME MATCHES "Windows") + get_filename_component(CXX_DIR ${CMAKE_CXX_COMPILER} PATH) + file(GLOB LIB_LIST ${CXX_DIR}/libstdc++-6.dll ${CXX_DIR}/libwinpthread-1.dll ${CXX_DIR}/libssp-0.dll ${CXX_DIR}/libgcc_s_seh-1.dll) + install(FILES ${TOP_DIR}/build/mindspore/tools/converter/converter_lite.exe DESTINATION ${TOP_DIR}/build/mindspore/package COMPONENT ${COMPONENT_NAME}) + install(FILES ${LIB_LIST} DESTINATION ${TOP_DIR}/build/mindspore/package COMPONENT ${COMPONENT_NAME}) + install(FILES ${TOP_DIR}/build/mindspore/tools/converter/libconverter_parser.a DESTINATION ${TOP_DIR}/build/mindspore/package COMPONENT ${PARSER_NAME}) else () - install(FILES ${TOP_DIR}/mindspore/lite/build/src/libmindspore-lite.so DESTINATION ${LIB_DIR} COMPONENT ${RUN_X86_COMPONENT_NAME}) + install(DIRECTORY ${TOP_DIR}/mindspore/lite/include/ DESTINATION ${INC_DIR_RUN_X86} COMPONENT ${RUN_X86_COMPONENT_NAME} FILES_MATCHING PATTERN "*.h") + install(DIRECTORY ${TOP_DIR}/mindspore/lite/schema/ DESTINATION ${INC_DIR_RUN_X86}/schema COMPONENT ${RUN_X86_COMPONENT_NAME} FILES_MATCHING PATTERN "*.h" PATTERN 
"inner" EXCLUDE) + install(FILES ${TOP_DIR}/mindspore/core/ir/dtype/type_id.h DESTINATION ${INC_DIR_RUN_X86}/ir/dtype COMPONENT ${RUN_X86_COMPONENT_NAME}) + install(DIRECTORY ${TOP_DIR}/third_party/flatbuffers/include DESTINATION ${FLATBF_DIR_RUN_X86} COMPONENT ${RUN_X86_COMPONENT_NAME}) + install(FILES ${TOP_DIR}/mindspore/lite/build/src/libmindspore-lite.so DESTINATION ${LIB_DIR_RUN_X86} COMPONENT ${RUN_X86_COMPONENT_NAME}) + install(FILES ${TOP_DIR}/third_party/protobuf/build/lib/libprotobuf.so.19.0.0 DESTINATION ${PROTOBF_DIR}/lib RENAME libprotobuf.so.19 COMPONENT ${COMPONENT_NAME}) endif () -set(CPACK_GENERATOR TGZ) +if (CMAKE_SYSTEM_NAME MATCHES "Windows") + set(CPACK_GENERATOR ZIP) +else () + set(CPACK_GENERATOR TGZ) +endif () set(CPACK_ARCHIVE_COMPONENT_INSTALL ON) if (PLATFORM_ARM64 OR PLATFORM_ARM32) set(CPACK_COMPONENTS_ALL ${COMPONENT_NAME}) +elseif (WIN32) + set(CPACK_COMPONENTS_ALL ${COMPONENT_NAME}) else () set(CPACK_COMPONENTS_ALL ${COMPONENT_NAME} ${RUN_X86_COMPONENT_NAME}) endif () set(CPACK_PACKAGE_FILE_NAME ${MAIN_DIR}) -set(CPACK_PACKAGE_DIRECTORY ${TOP_DIR}/output/tmp) +if (WIN32) + set(CPACK_PACKAGE_DIRECTORY ${TOP_DIR}/output) +else () + set(CPACK_PACKAGE_DIRECTORY ${TOP_DIR}/output/tmp) +endif() set(CPACK_PACKAGE_CHECKSUM SHA256) include(CPack) \ No newline at end of file diff --git a/mindspore/lite/CMakeLists.txt b/mindspore/lite/CMakeLists.txt index 8409e6c7c2..62da9d7cae 100644 --- a/mindspore/lite/CMakeLists.txt +++ b/mindspore/lite/CMakeLists.txt @@ -5,15 +5,15 @@ if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND CMAKE_CXX_COMPILER_VERSION VERSION_ message(FATAL_ERROR "GCC vesion ${CMAKE_CXX_COMPILER_VERSION} must not be less than 7.3.0") endif () -set(MS_VERSION_MAJOY 0) +set(MS_VERSION_MAJOR 0) set(MS_VERSION_MINOR 7) set(MS_VERSION_REVISION 0) set(DIR_PREFIX mindspore-lite) -set(MS_VERSION ${MS_VERSION_MAJOY}.${MS_VERSION_MINOR}.${MS_VERSION_REVISION}) +set(MS_VERSION ${MS_VERSION_MAJOR}.${MS_VERSION_MINOR}.${MS_VERSION_REVISION}) set(MAIN_DIR ${DIR_PREFIX}-${MS_VERSION}) -set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DMS_VERSION_MAJOY=${MS_VERSION_MAJOY} -DMS_VERSION_MINOR=${MS_VERSION_MINOR} -DMS_VERSION_REVISION=${MS_VERSION_REVISION}") -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DMS_VERSION_MAJOY=${MS_VERSION_MAJOY} -DMS_VERSION_MINOR=${MS_VERSION_MINOR} -DMS_VERSION_REVISION=${MS_VERSION_REVISION}") +set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DMS_VERSION_MAJOR=${MS_VERSION_MAJOR} -DMS_VERSION_MINOR=${MS_VERSION_MINOR} -DMS_VERSION_REVISION=${MS_VERSION_REVISION}") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DMS_VERSION_MAJOR=${MS_VERSION_MAJOR} -DMS_VERSION_MINOR=${MS_VERSION_MINOR} -DMS_VERSION_REVISION=${MS_VERSION_REVISION}") if (SUPPORT_GPU) set(PROCESS_UNIT gpu) @@ -25,13 +25,16 @@ if (PLATFORM_ARM64) set(COMPONENT_NAME runtime-arm64-${PROCESS_UNIT}) elseif (PLATFORM_ARM32) set(COMPONENT_NAME runtime-arm32-${PROCESS_UNIT}) +elseif (WIN32) + set(PARSER_NAME libconverter-parser-win-${PROCESS_UNIT}) + set(COMPONENT_NAME converter-win-${PROCESS_UNIT}) else () set(COMPONENT_NAME convert-ubuntu) endif() set(RUN_X86_COMPONENT_NAME runtime-x86-${PROCESS_UNIT}) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++17") -set(TOP_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../..) 
+string(REPLACE "/mindspore/lite" "" TOP_DIR ${CMAKE_CURRENT_SOURCE_DIR}) set(CORE_DIR ${TOP_DIR}/mindspore/core) set(CCSRC_DIR ${TOP_DIR}/mindspore/ccsrc) include_directories(${TOP_DIR}) @@ -65,20 +68,20 @@ set(CMAKE_VERBOSE_MAKEFILE on) add_compile_definitions(USE_ANDROID_LOG) add_compile_definitions(NO_DLIB) add_compile_options(-fPIC) -if (NOT PLATFORM_ARM64 AND NOT PLATFORM_ARM32) - if ("${CMAKE_BUILD_TYPE}" STREQUAL "Debug") - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DDebug -g") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDebug -g") - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fvisibility=default") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fvisibility=default") - else () - ## enable for binscope for release - set(CMAKE_C_FLAGS "-fPIC -fPIE -D_FORTIFY_SOURCE=2 -O2 -Wall -Werror -fstack-protector-strong -Wno-attributes -Wno-deprecated-declarations ${CMAKE_C_FLAGS}") - set(CMAKE_CXX_FLAGS "-fPIC -fPIE -D_FORTIFY_SOURCE=2 -O2 -Wall -Werror -fstack-protector-strong -Wno-attributes -Wno-deprecated-declarations ${CMAKE_CXX_FLAGS}") +if ("${CMAKE_BUILD_TYPE}" STREQUAL "Debug") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DDebug -g") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDebug -g") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fvisibility=default") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fvisibility=default") +else () + ## enable for binscope for release + set(CMAKE_C_FLAGS "-fPIC -fPIE -D_FORTIFY_SOURCE=2 -O2 -Wall -Werror -fstack-protector-strong -Wno-attributes -Wno-deprecated-declarations -Wno-missing-braces ${CMAKE_C_FLAGS}") + set(CMAKE_CXX_FLAGS "-fPIC -fPIE -D_FORTIFY_SOURCE=2 -O2 -Wall -Werror -fstack-protector-strong -Wno-attributes -Wno-deprecated-declarations -Wno-missing-braces -Wno-overloaded-virtual ${CMAKE_CXX_FLAGS}") + if (NOT WIN32) set(CMAKE_SHARED_LINKER_FLAGS "-Wl,-z,relro,-z,now -Wl,-z,noexecstack ${CMAKE_SHARED_LINKER_FLAGS}") set(CMAKE_EXE_LINKER_FLAGS "-Wl,-z,relro,-z,now -Wl,-z,noexecstack ${CMAKE_EXE_LINKER_FLAGS}") - string(REPLACE " -g " " " CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") - endif () + endif() + string(REPLACE " -g " " " CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") endif () if (BUILD_DEVICE) @@ -110,12 +113,11 @@ if (WIN32) add_compile_definitions(BUILDING_DLL) endif () -set(ANF_SRC - ${CMAKE_CURRENT_SOURCE_DIR}/../core/ir/meta_tensor.cc +set(CORE_SRC + ${CORE_DIR}/ir/meta_tensor.cc ${CORE_DIR}/gvar/logging_level.cc ${CORE_DIR}/gvar/typeid_manager.cc - ${CMAKE_CURRENT_SOURCE_DIR}/../core/base/base.cc - ${CMAKE_CURRENT_SOURCE_DIR}/src/common/log_adapter.cc + ${CORE_DIR}/base/base.cc ) if (BUILD_CONVERTER) if (PLATFORM_ARM64 OR PLATFORM_ARM32) @@ -163,7 +165,6 @@ if (BUILD_DEVICE) add_compile_definitions(ENABLE_ARM32) endif () if (PLATFORM_ARM64) - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=armv8.2-a+dotprod+fp16") add_compile_definitions(ENABLE_ARM64) if (ENABLE_FP16) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=armv8.2-a+dotprod+fp16") @@ -207,4 +208,4 @@ if (BUILD_DEVICE) endif () endif () -include(${TOP_DIR}/cmake/package_lite.cmake) \ No newline at end of file +include(${TOP_DIR}/cmake/package_lite.cmake)
diff --git a/mindspore/lite/README.md b/mindspore/lite/README.md
new file mode 100644
index 0000000000..25a21b7443
--- /dev/null
+++ b/mindspore/lite/README.md
@@ -0,0 +1,56 @@
+[查看中文](./README_CN.md)
+
+## What Is MindSpore Lite
+
+MindSpore Lite is a high-performance, lightweight open source inference framework that can be used to meet the needs of AI applications on mobile devices. MindSpore Lite focuses on how to deploy AI technology more effectively on devices.
+It has been integrated into HMS (Huawei Mobile Services) to provide inference for applications such as image classification, object detection, and OCR. MindSpore Lite will promote the development and enrichment of the AI software/hardware application ecosystem.
+
+MindSpore Lite Architecture
+
+For more details please check out our [MindSpore Lite Architecture Guide](https://www.mindspore.cn/lite/docs/en/master/architecture.html).
+
+### MindSpore Lite features
+
+1. Cooperative work with MindSpore training
+   - Provides end-to-end training, optimization, and deployment.
+   - A unified IR enables integrated device-cloud AI applications.
+
+2. Lightweight
+   - Provides model compression, which reduces model size and can also improve performance.
+   - Provides the ultra-lightweight inference solution MindSpore Micro to meet deployment requirements in extreme environments such as smart watches and headphones.
+
+3. High performance
+   - The built-in high-performance kernel computing library NNACL supports multiple convolution optimization algorithms such as sliding window, im2col+GEMM, and Winograd.
+   - Assembly-level optimization improves the performance of kernel operators. Supports heterogeneous scheduling across CPU, GPU, and NPU.
+
+4. Versatility
+   - Supports iOS and Android.
+   - Supports LiteOS.
+   - Supports AI applications on mobile devices, smart screens, tablets, and IoT devices.
+   - Supports third-party models such as TensorFlow Lite, Caffe, and ONNX.
+
+## MindSpore Lite AI deployment procedure
+
+1. Model selection and personalized training
+
+   Select a new model, or take an existing model and perform incremental training on it with labeled data. When designing a model for mobile devices, it is necessary to consider the model size, accuracy, and computational cost.
+
+   The MindSpore team provides a series of pre-trained models for image classification and object detection. You can use these pre-trained models in your application.
+
+   The pre-trained models provided by MindSpore include: [Image Classification](https://download.mindspore.cn/model_zoo/official/lite/) and [Object Detection](https://download.mindspore.cn/model_zoo/official/lite/). More models will be provided in the future.
+
+   MindSpore allows you to retrain pre-trained models to perform other tasks. For example, a pre-trained image classification model can be retrained to recognize new image types. See [Retraining](https://www.mindspore.cn/lite/tutorial/zh-CN/master/advanced_use/retraining_of_quantized_network.html).
+
+2. Model conversion and optimization
+
+   If you use a MindSpore or third-party model, you need to use the [MindSpore Lite Model Converter Tool](https://www.mindspore.cn/lite/tutorial/zh-CN/master/use/converter_tool.html) to convert it into a MindSpore Lite model. The converter tool translates TensorFlow Lite, Caffe, and ONNX models into the MindSpore Lite format; operator fusion and quantization can be applied during conversion.
+
+   MindSpore also provides a tool that converts models into C code for running on IoT devices.
+
+3. Model deployment
+
+   This stage covers model deployment, including model management, deployment, and operations monitoring.
+
+4. Inference
+
+   Load the model and perform inference. [Inference](https://www.mindspore.cn/lite/tutorial/zh-CN/master/use/runtime.html) is the process of running input data through the model to get output.
+
+   MindSpore provides a series of pre-trained models that can be deployed on mobile devices; see the [example](#TODO). A minimal C++ sketch of this inference step follows below.
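To make the inference step concrete, here is a minimal C++ sketch of the runtime flow; it is not part of the original patch. Only the output-lookup methods (`GetOutputTensorNames`, `GetOutputByTensorName`) appear in the `lite_session.h` hunk later in this patch; `Model::Import`, `LiteSession::CreateSession`, `CompileGraph`, `GetInputs`, and the `model.ms` path are assumptions based on the r0.7 public headers and may differ in detail.

```cpp
// Hedged sketch of the r0.7 runtime API; entry points other than the output
// getters patched in lite_session.h below are assumptions from the public headers.
#include <fstream>
#include <string>
#include <vector>

#include "include/context.h"
#include "include/lite_session.h"
#include "include/model.h"

int main() {
  // Read a converted .ms model into memory (the path is illustrative).
  std::ifstream ifs("model.ms", std::ios::binary | std::ios::ate);
  if (!ifs.is_open()) return -1;
  auto size = static_cast<size_t>(ifs.tellg());
  std::vector<char> buf(size);
  ifs.seekg(0);
  ifs.read(buf.data(), static_cast<std::streamsize>(size));

  auto *model = mindspore::lite::Model::Import(buf.data(), size);
  mindspore::lite::Context context;  // defaults to DT_CPU; DT_NPU is not supported yet
  auto *session = mindspore::session::LiteSession::CreateSession(&context);
  session->CompileGraph(model);

  // Fill input tensors, then run the graph (the before/after callbacks default to nullptr).
  for (auto *input : session->GetInputs()) {
    (void)input;  // ... copy input data into input->MutableData() ...
  }
  session->RunGraph();

  // Look up outputs by tensor name -- the lookup this patch adds.
  for (const std::string &name : session->GetOutputTensorNames()) {
    mindspore::tensor::MSTensor *out = session->GetOutputByTensorName(name);
    (void)out;  // consume results here
  }

  delete session;
  delete model;
  return 0;
}
```

The by-tensor-name lookup is the addition this patch makes on top of the old node-name interface `GetOutputs`/`GetOutputsByName`, which it renames to `GetOutputMapByNode`/`GetOutputsByNodeName`.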
diff --git a/mindspore/lite/README_CN.md b/mindspore/lite/README_CN.md new file mode 100644 index 0000000000..d2051cae3b --- /dev/null +++ b/mindspore/lite/README_CN.md @@ -0,0 +1,66 @@ + +[View English](./README.md) + +## MindSpore Lite介绍 + +MindSpore Lite是MindSpore推出的端云协同的、轻量化、高性能AI推理框架,用于满足越来越多的端测AI应用需求。MindSpore Lite聚焦AI技术在端侧设备上的部署和运行,已经在华为HMS和智能终端的图像分类、目标识别、人脸识别、文字识别等应用中广泛使用,未来MindSpore Lite将与MindSpore AI社区一起,致力于丰富AI软硬件应用生态。 + + + +MindSpore Lite Architecture + +欲了解更多详情,请查看我们的[MindSpore Lite 总体架构](https://www.mindspore.cn/lite/docs/zh-CN/master/architecture.html)。 + +## MindSpore Lite技术特点 + +1. 端云协同提供一站式训练和推理 + + - 提供模型训练、模型转换优化、部署和推理端到端流程。 + - 统一的IR实现端云AI应用一体化。 + +2. 超轻量 + + - 支持模型量化压缩,模型更小跑得更快。 + - 提供超轻量的推理解决方案MindSpore Micro,满足智能手表、耳机等极限环境下的部署要求。 + +3. 高性能 + + - 自带的高性能内核计算库NNACL,支持Sliding Windows、Im2Col+GEMM、Winograd等多种卷积优化算法。 + - 汇编级优化,支持CPU、GPU、NPU异构调度,最大化发挥硬件算力,最小化推理时延和功耗。 + +4. 广覆盖 + + - 支持iOS、Android等手机操作系统。 + - 支持LiteOS嵌入式操作系统。 + - 支持手机、大屏、平板、IoT等各种智能设备上的AI应用。 + - 支持MindSpore/TensorFlow Lite/Caffe/ONNX模型,方便用户快速部署。 + +## MindSpore Lite AI部署流程 + +1. 模型选择和个性化训练 + + 包括选择新模型或对已有模型,利用标注数据进行增量训练。面向端侧设计模型时,需要考虑模型大小、精度和计算量。 + + MindSpore团队提供了一系列预训练模型,用于解决图像分类、目标检测等场景的学习问题。可以在您的应用程序中使用这些预训练模型对应的终端模型。 + + MindSpore提供的预训练模型包括:[图像分类(Image Classification)](https://download.mindspore.cn/model_zoo/official/lite/)和[目标检测(Object Detection)](https://download.mindspore.cn/model_zoo/official/lite/)。后续MindSpore团队会增加更多的预置模型。 + + MindSpore允许您重新训练预训练模型,以执行其他任务。比如:使用预训练的图像分类模型,可以重新训练来识别新的图像类型。参见[重训练](https://www.mindspore.cn/lite/tutorial/zh-CN/master/advanced_use/retraining_of_quantized_network.html)。 + +2. 模型转换/优化 + + 如果您使用MindSpore或第三方训练的模型,需要使用[MindSpore Lite模型转换工具](https://www.mindspore.cn/lite/tutorial/zh-CN/master/use/converter_tool.html)转换成MindSpore Lite模型格式。MindSpore Lite模型转换工具不仅提供了将TensorFlow Lite、Caffe、ONNX等模型格式转换为MindSpore Lite模型格式,还提供了算子融合、量化等功能。 + + MindSpore还提供了将IoT设备上运行的模型转换成.C代码的生成工具。 + + 经过上述两个部署,您已经得到端侧可以部署的模型。 + +3. 模型部署 + + 这个阶段主要实现模型部署,包括模型管理、部署和运维监控等。 + +4. 模型推理 + + 主要完成模型推理工作,即加载模型,完成模型相关的所有计算。[推理](https://www.mindspore.cn/lite/tutorial/zh-CN/master/use/runtime.html)是通过模型运行输入数据,获取预测的过程。 + + MindSpore提供了一系列预训练模型部署在智能终端的[样例](#TODO)。 diff --git a/mindspore/lite/include/context.h b/mindspore/lite/include/context.h index 7df42a180a..b740039924 100644 --- a/mindspore/lite/include/context.h +++ b/mindspore/lite/include/context.h @@ -28,17 +28,17 @@ namespace mindspore::lite { class Allocator; /// \brief CpuBindMode defined for holding bind cpu strategy argument. -enum CpuBindMode { +typedef enum { MID_CPU = -1, /**< bind middle cpu first */ HIGHER_CPU = 1, /**< bind higher cpu first */ NO_BIND = 0 /**< no bind */ -}; +} CpuBindMode; /// \brief DeviceType defined for holding user's preferred backend. typedef enum { DT_CPU, /**< CPU device type */ DT_GPU, /**< GPU device type */ - DT_NPU /**< NPU device type */ + DT_NPU /**< NPU device type, not supported yet */ } DeviceType; /// \brief DeviceContext defined for holding DeviceType. diff --git a/mindspore/lite/include/lite_session.h b/mindspore/lite/include/lite_session.h index cb6c748ddb..7b26f3b524 100644 --- a/mindspore/lite/include/lite_session.h +++ b/mindspore/lite/include/lite_session.h @@ -86,17 +86,34 @@ class MS_API LiteSession { /// \return STATUS as an error code of running graph, STATUS is defined in errorcode.h. virtual int RunGraph(const KernelCallBack &before = nullptr, const KernelCallBack &after = nullptr) = 0; - /// \brief Get output MindSpore Lite MSTensors of model. 
+ /// \brief Get output MindSpore Lite MSTensors of model mapped by node name. /// /// \return The map of output node name and MindSpore Lite MSTensor. - virtual std::unordered_map<std::string, std::vector<mindspore::tensor::MSTensor *>> GetOutputs() const = 0; + virtual std::unordered_map<std::string, std::vector<mindspore::tensor::MSTensor *>> GetOutputMapByNode() const = 0; /// \brief Get output MindSpore Lite MSTensors of model by node name. /// /// \param[in] node_name Define node name. /// /// \return The vector of MindSpore Lite MSTensor. - virtual std::vector<mindspore::tensor::MSTensor *> GetOutputsByName(const std::string &node_name) const = 0; + virtual std::vector<mindspore::tensor::MSTensor *> GetOutputsByNodeName(const std::string &node_name) const = 0; + + /// \brief Get output MindSpore Lite MSTensors of model mapped by tensor name. + /// + /// \return The map of output tensor name and MindSpore Lite MSTensor. + virtual std::unordered_map<std::string, mindspore::tensor::MSTensor *> GetOutputMapByTensor() const = 0; + + /// \brief Get names of output tensors of model compiled by this session. + /// + /// \return The vector of string as output tensor names in order. + virtual std::vector<std::string> GetOutputTensorNames() const = 0; + + /// \brief Get output MindSpore Lite MSTensors of model by tensor name. + /// + /// \param[in] tensor_name Define tensor name. + /// + /// \return Pointer of MindSpore Lite MSTensor. + virtual mindspore::tensor::MSTensor *GetOutputByTensorName(const std::string &tensor_name) const = 0; /// \brief Resize inputs shape. /// diff --git a/mindspore/lite/include/version.h b/mindspore/lite/include/version.h index c2534e55d1..377d8be972 100644 --- a/mindspore/lite/include/version.h +++ b/mindspore/lite/include/version.h @@ -24,8 +24,17 @@ namespace lite { /// \brief Global method to get a version string. /// /// \return The version string of MindSpore Lite. +#ifndef MS_VERSION_MAJOR +#define MS_VERSION_MAJOR 0 +#endif +#ifndef MS_VERSION_MINOR +#define MS_VERSION_MINOR 7 +#endif +#ifndef MS_VERSION_REVISION +#define MS_VERSION_REVISION 0 +#endif std::string Version() { - return "MindSpore Lite " + std::to_string(MS_VERSION_MAJOY) + "." + std::to_string(MS_VERSION_MINOR) + "." + + return "MindSpore Lite " + std::to_string(MS_VERSION_MAJOR) + "." + std::to_string(MS_VERSION_MINOR) + "."
+ std::to_string(MS_VERSION_REVISION); } } // namespace lite diff --git a/mindspore/lite/java/build_aar.sh b/mindspore/lite/java/build_aar.sh index fd6faa86b4..1f29d0c09a 100644 --- a/mindspore/lite/java/build_aar.sh +++ b/mindspore/lite/java/build_aar.sh @@ -15,10 +15,10 @@ fi # copy arm64 so cd ${TOP_PATH}/output/ -rm -rf mindspore-lite-0.6.0 -tar -zxvf mindspore-lite-0.6.0-runtime-arm64-cpu.tar.gz +rm -rf mindspore-lite-0.7.0 +tar -zxvf mindspore-lite-0.7.0-runtime-arm64-cpu.tar.gz mkdir -p ${BASE_PATH}/lib/ -cp ${TOP_PATH}/output/mindspore-lite-0.6.0/lib/libmindspore-lite.so ${BASE_PATH}/lib/ +cp ${TOP_PATH}/output/mindspore-lite-0.7.0-runtime-arm64-cpu/lib/libmindspore-lite.so ${BASE_PATH}/lib/ cp ${ANDROID_NDK}/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/lib/aarch64-linux-android/libc++_shared.so ${BASE_PATH}/lib/ # build jni so diff --git a/mindspore/lite/java/java/app/src/main/java/com/mindspore/lite/LiteSession.java b/mindspore/lite/java/java/app/src/main/java/com/mindspore/lite/LiteSession.java index a581afd2c9..fb8d965f42 100644 --- a/mindspore/lite/java/java/app/src/main/java/com/mindspore/lite/LiteSession.java +++ b/mindspore/lite/java/java/app/src/main/java/com/mindspore/lite/LiteSession.java @@ -76,8 +76,8 @@ public class LiteSession { return tensors; } - public Map<String, List<MSTensor>> getOutputs() { - Map<String, List<Long>> ret = this.getOutputs(this.sessionPtr); + public Map<String, List<MSTensor>> getOutputMapByNode() { + Map<String, List<Long>> ret = this.getOutputMapByNode(this.sessionPtr); Map<String, List<MSTensor>> tensorMap = new HashMap<>(); Set<Map.Entry<String, List<Long>>> entrySet = ret.entrySet(); for (Map.Entry<String, List<Long>> entry : entrySet) { @@ -93,8 +93,8 @@ public class LiteSession { return tensorMap; } - public List<MSTensor> getOutputsByName(String nodeName) { - List<Long> ret = this.getOutputsByName(this.sessionPtr, nodeName); + public List<MSTensor> getOutputsByNodeName(String nodeName) { + List<Long> ret = this.getOutputsByNodeName(this.sessionPtr, nodeName); ArrayList<MSTensor> tensors = new ArrayList<>(); for (Long msTensorAddr : ret) { MSTensor msTensor = new MSTensor(msTensorAddr); @@ -103,6 +103,27 @@ public class LiteSession { return tensors; } + public Map<String, MSTensor> getOutputMapByTensor() { + Map<String, Long> ret = this.getOutputMapByTensor(this.sessionPtr); + Map<String, MSTensor> tensorMap = new HashMap<>(); + Set<Map.Entry<String, Long>> entrySet = ret.entrySet(); + for (Map.Entry<String, Long> entry : entrySet) { + String name = entry.getKey(); + Long msTensorAddr = entry.getValue(); + tensorMap.put(name, new MSTensor(msTensorAddr)); + } + return tensorMap; + } + + public List<String> getOutputTensorNames() { + return getOutputTensorNames(this.sessionPtr); + } + + public MSTensor getOutputByTensorName(String tensorName) { + Long tensor_addr = getOutputByTensorName(this.sessionPtr, tensorName); + return new MSTensor(tensor_addr); + } + public void free() { this.free(this.sessionPtr); this.sessionPtr = 0; @@ -120,9 +141,15 @@ public class LiteSession { private native List<Long> getInputsByName(long sessionPtr, String nodeName); - private native Map<String, List<Long>> getOutputs(long sessionPtr); + private native Map<String, List<Long>> getOutputMapByNode(long sessionPtr); + + private native List<Long> getOutputsByNodeName(long sessionPtr, String nodeName); + + private native Map<String, Long> getOutputMapByTensor(long sessionPtr); + + private native List<String> getOutputTensorNames(long sessionPtr); - private native List<Long> getOutputsByName(long sessionPtr, String nodeName); + private native Long getOutputByTensorName(long sessionPtr, String tensorName); private native void free(long sessionPtr); } diff --git a/mindspore/lite/java/java/app/src/main/java/com/mindspore/lite/MSTensor.java b/mindspore/lite/java/java/app/src/main/java/com/mindspore/lite/MSTensor.java index
6092ab8ebc..f74178bf37 100644 --- a/mindspore/lite/java/java/app/src/main/java/com/mindspore/lite/MSTensor.java +++ b/mindspore/lite/java/java/app/src/main/java/com/mindspore/lite/MSTensor.java @@ -16,6 +16,10 @@ package com.mindspore.lite; +import android.util.Log; + +import java.nio.ByteBuffer; + public class MSTensor { private long tensorPtr; @@ -27,7 +31,7 @@ public class MSTensor { this.tensorPtr = tensorPtr; } - public boolean init (int dataType, int[] shape) { + public boolean init(int dataType, int[] shape) { this.tensorPtr = createMSTensor(dataType, shape, shape.length); return this.tensorPtr != 0; } @@ -48,14 +52,30 @@ public class MSTensor { this.setDataType(this.tensorPtr, dataType); } - public byte[] getData() { - return this.getData(this.tensorPtr); + public byte[] getByteData() { + return this.getByteData(this.tensorPtr); + } + + public float[] getFloatData() { + return this.getFloatData(this.tensorPtr); + } + + public int[] getIntData() { + return this.getIntData(this.tensorPtr); + } + + public long[] getLongData() { + return this.getLongData(this.tensorPtr); } public void setData(byte[] data) { this.setData(this.tensorPtr, data, data.length); } + public void setData(ByteBuffer data) { + this.setByteBufferData(this.tensorPtr, data); + } + public long size() { return this.size(this.tensorPtr); } @@ -69,6 +89,24 @@ public class MSTensor { this.tensorPtr = 0; } + private float[] decodeBytes(byte[] bytes) { + if (bytes.length % 4 != 0) { + Log.e("MS_LITE", "Length of bytes should be a multiple of 4"); + return null; + } + int size = bytes.length / 4; + float[] ret = new float[size]; + // walk the byte array in 4-byte steps, assembling one little-endian float per step + for (int i = 0; i < bytes.length; i += 4) { + int accNum = 0; + accNum = accNum | (bytes[i] & 0xff) << 0; + accNum = accNum | (bytes[i + 1] & 0xff) << 8; + accNum = accNum | (bytes[i + 2] & 0xff) << 16; + accNum = accNum | (bytes[i + 3] & 0xff) << 24; + ret[i / 4] = Float.intBitsToFloat(accNum); + } + return ret; + } + private native long createMSTensor(int dataType, int[] shape, int shapeLen); private native int[] getShape(long tensorPtr); @@ -79,10 +117,18 @@ public class MSTensor { private native boolean setDataType(long tensorPtr, int dataType); - private native byte[] getData(long tensorPtr); + private native byte[] getByteData(long tensorPtr); + + private native long[] getLongData(long tensorPtr); + + private native int[] getIntData(long tensorPtr); + + private native float[] getFloatData(long tensorPtr); private native boolean setData(long tensorPtr, byte[] data, long dataLen); + private native boolean setByteBufferData(long tensorPtr, ByteBuffer buffer); + private native long size(long tensorPtr); private native int elementsNum(long tensorPtr); diff --git a/mindspore/lite/java/java/app/src/main/java/com/mindspore/lite/Model.java b/mindspore/lite/java/java/app/src/main/java/com/mindspore/lite/Model.java index 6e50043fcd..abc07c7139 100644 --- a/mindspore/lite/java/java/app/src/main/java/com/mindspore/lite/Model.java +++ b/mindspore/lite/java/java/app/src/main/java/com/mindspore/lite/Model.java @@ -80,6 +80,11 @@ public class Model { return ret; } + public boolean loadModel(String modelPath) { + this.modelPtr = loadModelByPath(modelPath); + return this.modelPtr != 0; + } + public void free() { this.free(this.modelPtr); this.modelPtr = 0; @@ -87,5 +92,7 @@ public class Model { private native long loadModel(MappedByteBuffer buffer); + private native long loadModelByPath(String modelPath); + private native void free(long modelPtr); } diff --git a/mindspore/lite/java/native/CMakeLists.txt
b/mindspore/lite/java/native/CMakeLists.txt index e9cb68ef24..38a8b549a4 100644 --- a/mindspore/lite/java/native/CMakeLists.txt +++ b/mindspore/lite/java/native/CMakeLists.txt @@ -1,11 +1,11 @@ cmake_minimum_required(VERSION 3.14) project (Lite-java) -set(MS_VERSION_MAJOY 0) +set(MS_VERSION_MAJOR 0) set(MS_VERSION_MINOR 7) set(MS_VERSION_REVISION 0) -set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DMS_VERSION_MAJOY=${MS_VERSION_MAJOY} -DMS_VERSION_MINOR=${MS_VERSION_MINOR} -DMS_VERSION_REVISION=${MS_VERSION_REVISION}") -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DMS_VERSION_MAJOY=${MS_VERSION_MAJOY} -DMS_VERSION_MINOR=${MS_VERSION_MINOR} -DMS_VERSION_REVISION=${MS_VERSION_REVISION}") +set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DMS_VERSION_MAJOR=${MS_VERSION_MAJOR} -DMS_VERSION_MINOR=${MS_VERSION_MINOR} -DMS_VERSION_REVISION=${MS_VERSION_REVISION}") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DMS_VERSION_MAJOR=${MS_VERSION_MAJOR} -DMS_VERSION_MINOR=${MS_VERSION_MINOR} -DMS_VERSION_REVISION=${MS_VERSION_REVISION}") include_directories(${CMAKE_CURRENT_SOURCE_DIR}/) include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../../../lite/) diff --git a/mindspore/lite/java/native/common/jni_utils.cpp b/mindspore/lite/java/native/common/jni_utils.cpp index 67aeab8b3d..4c8ff22929 100644 --- a/mindspore/lite/java/native/common/jni_utils.cpp +++ b/mindspore/lite/java/native/common/jni_utils.cpp @@ -14,12 +14,11 @@ * limitations under the License. */ - #include "common/jni_utils.h" #include char *JstringToChar(JNIEnv *env, jstring jstr) { - char *rtn = NULL; + char *rtn = nullptr; jclass clsstring = env->FindClass("java/lang/String"); jstring strencode = env->NewStringUTF("GB2312"); jmethodID mid = env->GetMethodID(clsstring, "getBytes", "(Ljava/lang/String;)[B"); diff --git a/mindspore/lite/java/native/runtime/context.cpp b/mindspore/lite/java/native/runtime/context.cpp index eaedc4d26a..7898eacd2f 100644 --- a/mindspore/lite/java/native/runtime/context.cpp +++ b/mindspore/lite/java/native/runtime/context.cpp @@ -18,6 +18,7 @@ #include #include "common/ms_log.h" #include "include/context.h" +#include "include/thread_pool_config.h" extern "C" JNIEXPORT jlong JNICALL Java_com_mindspore_lite_context_Context_createContext(JNIEnv *env, jobject thiz, jint device_type, @@ -44,13 +45,13 @@ extern "C" JNIEXPORT jlong JNICALL Java_com_mindspore_lite_context_Context_creat } switch (cpu_bind_mode) { case -1: - context->cpu_bind_mode_ = mindspore::lite::MID_CPU; + context->cpu_bind_mode_ = MID_CPU; break; case 0: - context->cpu_bind_mode_ = mindspore::lite::NO_BIND; + context->cpu_bind_mode_ = NO_BIND; break; case 1: - context->cpu_bind_mode_ = mindspore::lite::HIGHER_CPU; + context->cpu_bind_mode_ = HIGHER_CPU; break; default: MS_LOGE("Invalid cpu_bind_mode : %d", cpu_bind_mode); diff --git a/mindspore/lite/java/native/runtime/lite_session.cpp b/mindspore/lite/java/native/runtime/lite_session.cpp index 16e4ad1f10..ed17556bc9 100644 --- a/mindspore/lite/java/native/runtime/lite_session.cpp +++ b/mindspore/lite/java/native/runtime/lite_session.cpp @@ -14,7 +14,6 @@ * limitations under the License. 
*/ - #include #include "common/ms_log.h" #include "common/jni_utils.h" @@ -22,7 +21,7 @@ #include "include/errorcode.h" extern "C" JNIEXPORT jlong JNICALL Java_com_mindspore_lite_LiteSession_createSession(JNIEnv *env, jobject thiz, - jlong context_ptr) { + jlong context_ptr) { auto *pointer = reinterpret_cast(context_ptr); if (pointer == nullptr) { MS_LOGE("Context pointer from java is nullptr"); @@ -38,8 +37,8 @@ extern "C" JNIEXPORT jlong JNICALL Java_com_mindspore_lite_LiteSession_createSes } extern "C" JNIEXPORT jboolean JNICALL Java_com_mindspore_lite_LiteSession_compileGraph(JNIEnv *env, jobject thiz, - jlong session_ptr, - jlong model_ptr) { + jlong session_ptr, + jlong model_ptr) { auto *session_pointer = reinterpret_cast(session_ptr); if (session_pointer == nullptr) { MS_LOGE("Session pointer from java is nullptr"); @@ -58,7 +57,7 @@ extern "C" JNIEXPORT jboolean JNICALL Java_com_mindspore_lite_LiteSession_compil } extern "C" JNIEXPORT void JNICALL Java_com_mindspore_lite_LiteSession_bindThread(JNIEnv *env, jobject thiz, - jlong session_ptr, jboolean if_bind) { + jlong session_ptr, jboolean if_bind) { auto *pointer = reinterpret_cast(session_ptr); if (pointer == nullptr) { MS_LOGE("Session pointer from java is nullptr"); @@ -69,7 +68,7 @@ extern "C" JNIEXPORT void JNICALL Java_com_mindspore_lite_LiteSession_bindThread } extern "C" JNIEXPORT jboolean JNICALL Java_com_mindspore_lite_LiteSession_runGraph(JNIEnv *env, jobject thiz, - jlong session_ptr) { + jlong session_ptr) { auto *pointer = reinterpret_cast(session_ptr); if (pointer == nullptr) { MS_LOGE("Session pointer from java is nullptr"); @@ -81,7 +80,7 @@ extern "C" JNIEXPORT jboolean JNICALL Java_com_mindspore_lite_LiteSession_runGra } extern "C" JNIEXPORT jobject JNICALL Java_com_mindspore_lite_LiteSession_getInputs(JNIEnv *env, jobject thiz, - jlong session_ptr) { + jlong session_ptr) { jclass array_list = env->FindClass("java/util/ArrayList"); jmethodID array_list_construct = env->GetMethodID(array_list, "", "()V"); jobject ret = env->NewObject(array_list, array_list_construct); @@ -104,8 +103,8 @@ extern "C" JNIEXPORT jobject JNICALL Java_com_mindspore_lite_LiteSession_getInpu } extern "C" JNIEXPORT jobject JNICALL Java_com_mindspore_lite_LiteSession_getInputsByName(JNIEnv *env, jobject thiz, - jlong session_ptr, - jstring node_name) { + jlong session_ptr, + jstring node_name) { jclass array_list = env->FindClass("java/util/ArrayList"); jmethodID array_list_construct = env->GetMethodID(array_list, "", "()V"); jobject ret = env->NewObject(array_list, array_list_construct); @@ -127,8 +126,8 @@ extern "C" JNIEXPORT jobject JNICALL Java_com_mindspore_lite_LiteSession_getInpu return ret; } -extern "C" JNIEXPORT jobject JNICALL Java_com_mindspore_lite_LiteSession_getOutputs(JNIEnv *env, jobject thiz, - jlong session_ptr) { +extern "C" JNIEXPORT jobject JNICALL Java_com_mindspore_lite_LiteSession_getOutputMapByNode(JNIEnv *env, jobject thiz, + jlong session_ptr) { jclass hash_map_clazz = env->FindClass("java/util/HashMap"); jmethodID hash_map_construct = env->GetMethodID(hash_map_clazz, "", "()V"); jobject hash_map = env->NewObject(hash_map_clazz, hash_map_construct); @@ -140,7 +139,7 @@ extern "C" JNIEXPORT jobject JNICALL Java_com_mindspore_lite_LiteSession_getOutp return hash_map; } auto *lite_session_ptr = static_cast(pointer); - auto outputs = lite_session_ptr->GetOutputs(); + auto outputs = lite_session_ptr->GetOutputMapByNode(); jclass long_object = env->FindClass("java/lang/Long"); jmethodID long_object_construct = 
env->GetMethodID(long_object, "", "(J)V"); jclass array_list = env->FindClass("java/util/ArrayList"); @@ -159,9 +158,9 @@ extern "C" JNIEXPORT jobject JNICALL Java_com_mindspore_lite_LiteSession_getOutp return hash_map; } -extern "C" JNIEXPORT jobject JNICALL Java_com_mindspore_lite_LiteSession_getOutputsByName(JNIEnv *env, jobject thiz, - jlong session_ptr, - jstring node_name) { +extern "C" JNIEXPORT jobject JNICALL Java_com_mindspore_lite_LiteSession_getOutputsByNodeName(JNIEnv *env, jobject thiz, + jlong session_ptr, + jstring node_name) { jclass array_list = env->FindClass("java/util/ArrayList"); jmethodID array_list_construct = env->GetMethodID(array_list, "", "()V"); jobject ret = env->NewObject(array_list, array_list_construct); @@ -175,7 +174,7 @@ extern "C" JNIEXPORT jobject JNICALL Java_com_mindspore_lite_LiteSession_getOutp return ret; } auto *lite_session_ptr = static_cast(pointer); - auto inputs = lite_session_ptr->GetOutputsByName(JstringToChar(env, node_name)); + auto inputs = lite_session_ptr->GetOutputsByNodeName(JstringToChar(env, node_name)); for (auto input : inputs) { jobject tensor_addr = env->NewObject(long_object, long_object_construct, jlong(input)); env->CallBooleanMethod(ret, array_list_add, tensor_addr); @@ -183,8 +182,66 @@ extern "C" JNIEXPORT jobject JNICALL Java_com_mindspore_lite_LiteSession_getOutp return ret; } +extern "C" JNIEXPORT jobject JNICALL Java_com_mindspore_lite_LiteSession_getOutputMapByTensor(JNIEnv *env, jobject thiz, + jlong session_ptr) { + jclass hash_map_clazz = env->FindClass("java/util/HashMap"); + jmethodID hash_map_construct = env->GetMethodID(hash_map_clazz, "", "()V"); + jobject hash_map = env->NewObject(hash_map_clazz, hash_map_construct); + jmethodID hash_map_put = + env->GetMethodID(hash_map_clazz, "put", "(Ljava/lang/Object;Ljava/lang/Object;)Ljava/lang/Object;"); + auto *pointer = reinterpret_cast(session_ptr); + if (pointer == nullptr) { + MS_LOGE("Session pointer from java is nullptr"); + return hash_map; + } + auto *lite_session_ptr = static_cast(pointer); + auto outputs = lite_session_ptr->GetOutputMapByTensor(); + jclass long_object = env->FindClass("java/lang/Long"); + jmethodID long_object_construct = env->GetMethodID(long_object, "", "(J)V"); + for (auto output_iter : outputs) { + auto node_name = output_iter.first; + auto ms_tensor = output_iter.second; + jobject tensor_addr = env->NewObject(long_object, long_object_construct, jlong(ms_tensor)); + env->CallObjectMethod(hash_map, hash_map_put, env->NewStringUTF(node_name.c_str()), tensor_addr); + } + return hash_map; +} + +extern "C" JNIEXPORT jobject JNICALL Java_com_mindspore_lite_LiteSession_getOutputTensorNames(JNIEnv *env, jobject thiz, + jlong session_ptr) { + jclass array_list = env->FindClass("java/util/ArrayList"); + jmethodID array_list_construct = env->GetMethodID(array_list, "", "()V"); + jobject ret = env->NewObject(array_list, array_list_construct); + jmethodID array_list_add = env->GetMethodID(array_list, "add", "(Ljava/lang/Object;)Z"); + + auto *pointer = reinterpret_cast(session_ptr); + if (pointer == nullptr) { + MS_LOGE("Session pointer from java is nullptr"); + return ret; + } + auto *lite_session_ptr = static_cast(pointer); + auto output_names = lite_session_ptr->GetOutputTensorNames(); + for (auto output_name : output_names) { + env->CallBooleanMethod(ret, array_list_add, env->NewStringUTF(output_name.c_str())); + } + return ret; +} + +extern "C" JNIEXPORT jlong JNICALL Java_com_mindspore_lite_LiteSession_getOutputByTensorName(JNIEnv *env, 
jobject thiz, + jlong session_ptr, + jstring tensor_name) { + auto *pointer = reinterpret_cast(session_ptr); + if (pointer == nullptr) { + MS_LOGE("Session pointer from java is nullptr"); + return jlong(nullptr); + } + auto *lite_session_ptr = static_cast(pointer); + auto output = lite_session_ptr->GetOutputByTensorName(JstringToChar(env, tensor_name)); + return jlong(output); +} + extern "C" JNIEXPORT void JNICALL Java_com_mindspore_lite_LiteSession_free(JNIEnv *env, jobject thiz, - jlong session_ptr) { + jlong session_ptr) { auto *pointer = reinterpret_cast(session_ptr); if (pointer == nullptr) { MS_LOGE("Session pointer from java is nullptr"); diff --git a/mindspore/lite/java/native/runtime/model.cpp b/mindspore/lite/java/native/runtime/model.cpp index f2989c48f6..e1dce79fa5 100644 --- a/mindspore/lite/java/native/runtime/model.cpp +++ b/mindspore/lite/java/native/runtime/model.cpp @@ -14,9 +14,10 @@ * limitations under the License. */ - #include +#include #include "common/ms_log.h" +#include "common/jni_utils.h" #include "include/model.h" extern "C" JNIEXPORT jlong JNICALL Java_com_mindspore_lite_Model_loadModel(JNIEnv *env, jobject thiz, jobject buffer) { @@ -38,6 +39,46 @@ extern "C" JNIEXPORT jlong JNICALL Java_com_mindspore_lite_Model_loadModel(JNIEn return reinterpret_cast(model); } +extern "C" JNIEXPORT jlong JNICALL Java_com_mindspore_lite_Model_loadModelByPath(JNIEnv *env, jobject thiz, + jstring model_path) { + auto model_path_char = JstringToChar(env, model_path); + if (nullptr == model_path_char) { + MS_LOGE("model_path_char is nullptr"); + return reinterpret_cast(nullptr); + } + std::ifstream ifs(model_path_char); + if (!ifs.good()) { + MS_LOGE("file: %s is not exist", model_path_char); + return reinterpret_cast(nullptr); + } + + if (!ifs.is_open()) { + MS_LOGE("file: %s open failed", model_path_char); + return reinterpret_cast(nullptr); + } + + ifs.seekg(0, std::ios::end); + auto size = ifs.tellg(); + std::unique_ptr buf(new (std::nothrow) char[size]); + if (buf == nullptr) { + MS_LOGE("malloc buf failed, file: %s", model_path_char); + ifs.close(); + return reinterpret_cast(nullptr); + } + + ifs.seekg(0, std::ios::beg); + ifs.read(buf.get(), size); + ifs.close(); + delete[](model_path_char); + MS_LOGD("Start Loading model"); + auto model = mindspore::lite::Model::Import(buf.get(), size); + if (model == nullptr) { + MS_LOGE("Import model failed"); + return reinterpret_cast(nullptr); + } + return reinterpret_cast(model); +} + extern "C" JNIEXPORT void JNICALL Java_com_mindspore_lite_Model_free(JNIEnv *env, jobject thiz, jlong model_ptr) { auto *pointer = reinterpret_cast(model_ptr); if (pointer == nullptr) { diff --git a/mindspore/lite/java/native/runtime/ms_tensor.cpp b/mindspore/lite/java/native/runtime/ms_tensor.cpp index 976a4ebc2f..3a42f810a1 100644 --- a/mindspore/lite/java/native/runtime/ms_tensor.cpp +++ b/mindspore/lite/java/native/runtime/ms_tensor.cpp @@ -14,15 +14,14 @@ * limitations under the License. 
*/ - #include #include "common/ms_log.h" #include "include/ms_tensor.h" #include "ir/dtype/type_id.h" extern "C" JNIEXPORT jlong JNICALL Java_com_mindspore_lite_MSTensor_createMSTensor(JNIEnv *env, jobject thiz, - jint data_type, jintArray shape, - jint shape_len) { + jint data_type, jintArray shape, + jint shape_len) { jboolean is_copy = false; jint *local_shape_arr = env->GetIntArrayElements(shape, &is_copy); std::vector local_shape(shape_len); @@ -39,7 +38,7 @@ extern "C" JNIEXPORT jlong JNICALL Java_com_mindspore_lite_MSTensor_createMSTens } extern "C" JNIEXPORT jintArray JNICALL Java_com_mindspore_lite_MSTensor_getShape(JNIEnv *env, jobject thiz, - jlong tensor_ptr) { + jlong tensor_ptr) { auto *pointer = reinterpret_cast(tensor_ptr); if (pointer == nullptr) { MS_LOGE("Tensor pointer from java is nullptr"); @@ -59,8 +58,8 @@ extern "C" JNIEXPORT jintArray JNICALL Java_com_mindspore_lite_MSTensor_getShape } extern "C" JNIEXPORT jboolean JNICALL Java_com_mindspore_lite_MSTensor_setShape(JNIEnv *env, jobject thiz, - jlong tensor_ptr, jintArray shape, - jint shape_len) { + jlong tensor_ptr, jintArray shape, + jint shape_len) { jboolean is_copy = false; jint *local_shape_arr = env->GetIntArrayElements(shape, &is_copy); auto *pointer = reinterpret_cast(tensor_ptr); @@ -78,7 +77,7 @@ extern "C" JNIEXPORT jboolean JNICALL Java_com_mindspore_lite_MSTensor_setShape( } extern "C" JNIEXPORT jint JNICALL Java_com_mindspore_lite_MSTensor_getDataType(JNIEnv *env, jobject thiz, - jlong tensor_ptr) { + jlong tensor_ptr) { auto *pointer = reinterpret_cast(tensor_ptr); if (pointer == nullptr) { MS_LOGE("Tensor pointer from java is nullptr"); @@ -89,7 +88,7 @@ extern "C" JNIEXPORT jint JNICALL Java_com_mindspore_lite_MSTensor_getDataType(J } extern "C" JNIEXPORT jboolean JNICALL Java_com_mindspore_lite_MSTensor_setDataType(JNIEnv *env, jobject thiz, - jlong tensor_ptr, jint data_type) { + jlong tensor_ptr, jint data_type) { auto *pointer = reinterpret_cast(tensor_ptr); if (pointer == nullptr) { MS_LOGE("Tensor pointer from java is nullptr"); @@ -100,8 +99,8 @@ extern "C" JNIEXPORT jboolean JNICALL Java_com_mindspore_lite_MSTensor_setDataTy return ret == data_type; } -extern "C" JNIEXPORT jbyteArray JNICALL Java_com_mindspore_lite_MSTensor_getData(JNIEnv *env, jobject thiz, - jlong tensor_ptr) { +extern "C" JNIEXPORT jbyteArray JNICALL Java_com_mindspore_lite_MSTensor_getByteData(JNIEnv *env, jobject thiz, + jlong tensor_ptr) { auto *pointer = reinterpret_cast(tensor_ptr); if (pointer == nullptr) { MS_LOGE("Tensor pointer from java is nullptr"); @@ -113,20 +112,134 @@ extern "C" JNIEXPORT jbyteArray JNICALL Java_com_mindspore_lite_MSTensor_getData MS_LOGD("Tensor has no data"); return env->NewByteArray(0); } - auto local_data_size = ms_tensor_ptr->Size(); - auto ret = env->NewByteArray(local_data_size); - env->SetByteArrayRegion(ret, 0, local_data_size, local_data); + + if (ms_tensor_ptr->data_type() != mindspore::kNumberTypeUInt8) { + MS_LOGE("data type is error : %d", ms_tensor_ptr->data_type()); + return env->NewByteArray(0); + } + + auto local_element_num = ms_tensor_ptr->ElementsNum(); + auto ret = env->NewByteArray(local_element_num); + env->SetByteArrayRegion(ret, 0, local_element_num, local_data); + return ret; +} + +extern "C" JNIEXPORT jlongArray JNICALL Java_com_mindspore_lite_MSTensor_getLongData(JNIEnv *env, jobject thiz, + jlong tensor_ptr) { + auto *pointer = reinterpret_cast(tensor_ptr); + if (pointer == nullptr) { + MS_LOGE("Tensor pointer from java is nullptr"); + return 
env->NewLongArray(0); + } + + auto *ms_tensor_ptr = static_cast(pointer); + + auto *local_data = static_cast(ms_tensor_ptr->MutableData()); + if (local_data == nullptr) { + MS_LOGD("Tensor has no data"); + return env->NewLongArray(0); + } + + if (ms_tensor_ptr->data_type() != mindspore::kNumberTypeInt64) { + MS_LOGE("data type is error : %d", ms_tensor_ptr->data_type()); + return env->NewLongArray(0); + } + auto local_element_num = ms_tensor_ptr->ElementsNum(); + auto ret = env->NewLongArray(local_element_num); + env->SetLongArrayRegion(ret, 0, local_element_num, local_data); + return ret; +} + +extern "C" JNIEXPORT jintArray JNICALL Java_com_mindspore_lite_MSTensor_getIntData(JNIEnv *env, jobject thiz, + jlong tensor_ptr) { + auto *pointer = reinterpret_cast(tensor_ptr); + if (pointer == nullptr) { + MS_LOGE("Tensor pointer from java is nullptr"); + return env->NewIntArray(0); + } + + auto *ms_tensor_ptr = static_cast(pointer); + + auto *local_data = static_cast(ms_tensor_ptr->MutableData()); + if (local_data == nullptr) { + MS_LOGD("Tensor has no data"); + return env->NewIntArray(0); + } + + if (ms_tensor_ptr->data_type() != mindspore::kNumberTypeInt32) { + MS_LOGE("data type is error : %d", ms_tensor_ptr->data_type()); + return env->NewIntArray(0); + } + auto local_element_num = ms_tensor_ptr->ElementsNum(); + auto ret = env->NewIntArray(local_element_num); + env->SetIntArrayRegion(ret, 0, local_element_num, local_data); + return ret; +} + +extern "C" JNIEXPORT jfloatArray JNICALL Java_com_mindspore_lite_MSTensor_getFloatData(JNIEnv *env, jobject thiz, + jlong tensor_ptr) { + auto *pointer = reinterpret_cast(tensor_ptr); + if (pointer == nullptr) { + MS_LOGE("Tensor pointer from java is nullptr"); + return env->NewFloatArray(0); + } + + auto *ms_tensor_ptr = static_cast(pointer); + + auto *local_data = static_cast(ms_tensor_ptr->MutableData()); + if (local_data == nullptr) { + MS_LOGD("Tensor has no data"); + return env->NewFloatArray(0); + } + + if (ms_tensor_ptr->data_type() != mindspore::kNumberTypeFloat32) { + MS_LOGE("data type is error : %d", ms_tensor_ptr->data_type()); + return env->NewFloatArray(0); + } + auto local_element_num = ms_tensor_ptr->ElementsNum(); + auto ret = env->NewFloatArray(local_element_num); + env->SetFloatArrayRegion(ret, 0, local_element_num, local_data); return ret; } extern "C" JNIEXPORT jboolean JNICALL Java_com_mindspore_lite_MSTensor_setData(JNIEnv *env, jobject thiz, - jlong tensor_ptr, jbyteArray data, - jlong data_len) { + jlong tensor_ptr, jbyteArray data, + jlong data_len) { + auto *pointer = reinterpret_cast(tensor_ptr); + if (pointer == nullptr) { + MS_LOGE("Tensor pointer from java is nullptr"); + return static_cast(false); + } + auto *ms_tensor_ptr = static_cast(pointer); + if (data_len != ms_tensor_ptr->Size()) { + MS_LOGE("data_len(%ld) not equal to Size of ms_tensor(%zu)", data_len, ms_tensor_ptr->Size()); + return static_cast(false); + } + jboolean is_copy = false; + auto *data_arr = env->GetByteArrayElements(data, &is_copy); + auto *local_data = ms_tensor_ptr->MutableData(); + memcpy(local_data, data_arr, data_len); + return static_cast(true); +} + +extern "C" JNIEXPORT jboolean JNICALL Java_com_mindspore_lite_MSTensor_setByteBufferData(JNIEnv *env, jobject thiz, + jlong tensor_ptr, + jobject buffer) { + jbyte *p_data = reinterpret_cast(env->GetDirectBufferAddress(buffer)); // get buffer poiter + jlong data_len = env->GetDirectBufferCapacity(buffer); // get buffer capacity + if (!p_data) { + MS_LOGE("GetDirectBufferAddress return null"); 
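+ // Note: GetDirectBufferAddress returns null when the ByteBuffer is heap-backed
+ // (not created via ByteBuffer.allocateDirect), so only direct buffers are accepted here.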
+ return NULL; + } + jbyteArray data = env->NewByteArray(data_len); // create byte[] + env->SetByteArrayRegion(data, 0, data_len, p_data); // copy data to byte[] + auto *pointer = reinterpret_cast(tensor_ptr); if (pointer == nullptr) { MS_LOGE("Tensor pointer from java is nullptr"); return static_cast(false); } + auto *ms_tensor_ptr = static_cast(pointer); if (data_len != ms_tensor_ptr->Size()) { MS_LOGE("data_len(%ld) not equal to Size of ms_tensor(%zu)", data_len, ms_tensor_ptr->Size()); @@ -150,7 +263,7 @@ extern "C" JNIEXPORT jlong JNICALL Java_com_mindspore_lite_MSTensor_size(JNIEnv } extern "C" JNIEXPORT jint JNICALL Java_com_mindspore_lite_MSTensor_elementsNum(JNIEnv *env, jobject thiz, - jlong tensor_ptr) { + jlong tensor_ptr) { auto *pointer = reinterpret_cast(tensor_ptr); if (pointer == nullptr) { MS_LOGE("Tensor pointer from java is nullptr"); diff --git a/mindspore/lite/nnacl/CMakeLists.txt b/mindspore/lite/nnacl/CMakeLists.txt index 675393d9a7..2c8baf93ba 100644 --- a/mindspore/lite/nnacl/CMakeLists.txt +++ b/mindspore/lite/nnacl/CMakeLists.txt @@ -32,9 +32,11 @@ if (PLATFORM_ARM64) ) set_target_properties(optimize PROPERTIES CLEAN_DIRECT_OUTPUT 1) - add_custom_command(TARGET optimize POST_BUILD + if ("${CMAKE_BUILD_TYPE}" STREQUAL "Release") + add_custom_command(TARGET optimize POST_BUILD COMMAND ${ANDROID_NDK}/toolchains/aarch64-linux-android-4.9/prebuilt/linux-x86_64/aarch64-linux-android/bin/strip ${TOP_DIR}/build/nnacl/liboptimize.so) + endif () add_custom_command(TARGET optimize POST_BUILD COMMAND rm -rf ${TOP_DIR}/output/lib/liboptimize.so diff --git a/mindspore/lite/src/runtime/kernel/arm/nnacl/README.md b/mindspore/lite/nnacl/README.md similarity index 100% rename from mindspore/lite/src/runtime/kernel/arm/nnacl/README.md rename to mindspore/lite/nnacl/README.md diff --git a/mindspore/lite/nnacl/arithmetic_common.h b/mindspore/lite/nnacl/arithmetic_common.h index 34aab3e4a5..cedb59cd71 100644 --- a/mindspore/lite/nnacl/arithmetic_common.h +++ b/mindspore/lite/nnacl/arithmetic_common.h @@ -51,6 +51,8 @@ void TileOneDimension(float *inData, float *outData, int dim, size_t ndim, int * int *outStrides, int *multiple); void ComputeStrides(int *shape, int *strides, int ndim); +void CalcMultiplesAndStrides(ArithmeticParameter *param); + void TileDimensions(float *data0, float *data1, float *tile_data0, float *tile_data1, ArithmeticParameter *param); void TileDimensionsUint8(uint8_t *data0, uint8_t *data1, uint8_t *tile_data0, uint8_t *tile_data1, ArithmeticParameter *param); diff --git a/mindspore/lite/nnacl/assembly/arm64/ConvDwFp32Row.S b/mindspore/lite/nnacl/assembly/arm64/ConvDwFp32Row.S index 8323178a2c..3ca68cd60e 100644 --- a/mindspore/lite/nnacl/assembly/arm64/ConvDwFp32Row.S +++ b/mindspore/lite/nnacl/assembly/arm64/ConvDwFp32Row.S @@ -29,7 +29,7 @@ mov x6, x1 mov x7, x2 mov x8, x4 - LoopInputDepth16In: + LoopDepth16In: cmp x8, #16 blt L4 sub x8, x8, #16 @@ -39,8 +39,8 @@ mov x8, x4 ld1 {v16.4s, v17.4s}, [x0], #32 cmp x8, #16 - blt LoopInputDepth16Out - LoopInputDepth16: + blt LoopDepth16Out + LoopDepth16: fmla v16.4s, v0.4s, v2.4s fmla v17.4s, v1.4s, v3.4s @@ -61,9 +61,9 @@ mov x8, x4 sub x8, x8, #16 cmp x8, #16 - bge LoopInputDepth16 + bge LoopDepth16 - LoopInputDepth16Out: + LoopDepth16Out: fmla v16.4s, v0.4s, v2.4s fmla v17.4s, v1.4s, v3.4s st1 {v16.4s, v17.4s}, [x9], #32 @@ -81,7 +81,7 @@ mov x8, x4 cmp x8, #4 blt L0 - LoopInputDepth4: + LoopDepth4: ld1 {v0.4s}, [x6], #16 ld1 {v2.4s}, [x7], #16 ld1 {v16.4s}, [x0], #16 @@ -89,13 +89,13 @@ mov x8, x4 st1 
{v16.4s}, [x9], #16 sub x8, x8, #4 cmp x8, #4 - bge LoopInputDepth4 + bge LoopDepth4 L0: cmp x8, #0 beq Loop16LineEnd - LoopInputDepth0: + LoopDepth0: ldr s0, [x6], #4 ldr s1, [x7], #4 ldr s2, [x0], #4 @@ -103,7 +103,7 @@ mov x8, x4 fadd s2, s2, s0 str s2, [x9], #4 subs x8, x8, #1 - bne LoopInputDepth0 + bne LoopDepth0 Loop16LineEnd: diff --git a/mindspore/lite/nnacl/assembly/arm64/ConvDwInt8Center.S b/mindspore/lite/nnacl/assembly/arm64/ConvDwInt8Center.S index 9363b21f1b..c2705a32c4 100644 --- a/mindspore/lite/nnacl/assembly/arm64/ConvDwInt8Center.S +++ b/mindspore/lite/nnacl/assembly/arm64/ConvDwInt8Center.S @@ -90,36 +90,36 @@ ConvDwInt8Center: LoopKw16: mov x22, x21 ld1 {v25.4h}, [x17], #8 - ld1 {v16.4h}, [x22], x13 - ld1 {v17.4h}, [x22], x13 + ld1 {v16.4h}, [x22], x11 + ld1 {v17.4h}, [x22], x11 smlal v0.4s, v16.4h, v25.4h smlal v1.4s, v17.4h, v25.4h - ld1 {v18.4h}, [x22], x13 - ld1 {v19.4h}, [x22], x13 + ld1 {v18.4h}, [x22], x11 + ld1 {v19.4h}, [x22], x11 smlal v2.4s, v18.4h, v25.4h smlal v3.4s, v19.4h, v25.4h - ld1 {v20.4h}, [x22], x13 - ld1 {v21.4h}, [x22], x13 + ld1 {v20.4h}, [x22], x11 + ld1 {v21.4h}, [x22], x11 smlal v4.4s, v20.4h, v25.4h smlal v5.4s, v21.4h, v25.4h - ld1 {v22.4h}, [x22], x13 - ld1 {v23.4h}, [x22], x13 + ld1 {v22.4h}, [x22], x11 + ld1 {v23.4h}, [x22], x11 smlal v6.4s, v22.4h, v25.4h smlal v7.4s, v23.4h, v25.4h - ld1 {v16.4h}, [x22], x13 - ld1 {v17.4h}, [x22], x13 + ld1 {v16.4h}, [x22], x11 + ld1 {v17.4h}, [x22], x11 smlal v8.4s, v16.4h, v25.4h smlal v9.4s, v17.4h, v25.4h - ld1 {v18.4h}, [x22], x13 - ld1 {v19.4h}, [x22], x13 + ld1 {v18.4h}, [x22], x11 + ld1 {v19.4h}, [x22], x11 smlal v10.4s, v18.4h, v25.4h smlal v11.4s, v19.4h, v25.4h - ld1 {v20.4h}, [x22], x13 - ld1 {v21.4h}, [x22], x13 + ld1 {v20.4h}, [x22], x11 + ld1 {v21.4h}, [x22], x11 smlal v12.4s, v20.4h, v25.4h smlal v13.4s, v21.4h, v25.4h - ld1 {v22.4h}, [x22], x13 - ld1 {v23.4h}, [x22], x13 + ld1 {v22.4h}, [x22], x11 + ld1 {v23.4h}, [x22], x11 smlal v14.4s, v22.4h, v25.4h smlal v15.4s, v23.4h, v25.4h subs x18, x18, #1 @@ -420,20 +420,20 @@ ConvDwInt8Center: LoopKw8: mov x22, x21 ld1 {v25.4h}, [x17], #8 - ld1 {v16.4h}, [x22], x13 - ld1 {v17.4h}, [x22], x13 + ld1 {v16.4h}, [x22], x11 + ld1 {v17.4h}, [x22], x11 smlal v0.4s, v16.4h, v25.4h smlal v1.4s, v17.4h, v25.4h - ld1 {v18.4h}, [x22], x13 - ld1 {v19.4h}, [x22], x13 + ld1 {v18.4h}, [x22], x11 + ld1 {v19.4h}, [x22], x11 smlal v2.4s, v18.4h, v25.4h smlal v3.4s, v19.4h, v25.4h - ld1 {v20.4h}, [x22], x13 - ld1 {v21.4h}, [x22], x13 + ld1 {v20.4h}, [x22], x11 + ld1 {v21.4h}, [x22], x11 smlal v4.4s, v20.4h, v25.4h smlal v5.4s, v21.4h, v25.4h - ld1 {v22.4h}, [x22], x13 - ld1 {v23.4h}, [x22], x13 + ld1 {v22.4h}, [x22], x11 + ld1 {v23.4h}, [x22], x11 smlal v6.4s, v22.4h, v25.4h smlal v7.4s, v23.4h, v25.4h subs x18, x18, #1 diff --git a/mindspore/lite/nnacl/assembly/arm64/ConvDwInt8PostAlign4.S b/mindspore/lite/nnacl/assembly/arm64/ConvDwInt8PostAlign4.S new file mode 100644 index 0000000000..de74f339eb --- /dev/null +++ b/mindspore/lite/nnacl/assembly/arm64/ConvDwInt8PostAlign4.S @@ -0,0 +1,169 @@ +#ifdef __aarch64__ + +.text +.align 5 +.global ConvDwInt8PostAlign4 +#ifndef __APPLE__ +.type ConvDwInt8PostAlign4, %function +#endif + +// void ConvDwInt8PostAlign4(int8_t *dst, int32_t *buffer, int num_pixels, int32_t output_zp, int32_t out_multiplier, +// int32_t left_shift, int32_t right_shift, int32_t acc_min, int32_t acc_max); +// x0: dst, x1: buffer, x2: num_pixels, x3: output_zp, x4: out_multiplier, +// x5: left_shift, x6: right_shift, x7: acc_min, x8: acc_max 
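+// Reference semantics (a hedged C-style sketch in comments, not part of the build):
+// each int32 accumulator is requantized with the gemmlowp-style fixed-point scheme
+// and then narrowed to int8, roughly:
+//   acc = saturating_rounding_doubling_high_mul(acc << left_shift, out_multiplier);  // sqshl + sqrdmulh
+//   acc = rounding_divide_by_pot(acc, right_shift);   // and/sshr/sqadd/srshl (srshl with a negative shift rounds right)
+//   acc = clamp(acc + output_zp, acc_min, acc_max);   // add + smax + smin
+//   *dst++ = (int8_t)acc;                             // sqxtn + sqxtn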
+ +ConvDwInt8PostAlign4: + // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to + // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers + // x19 ~ x29 should be also preserved + // whereas our coding style do not permit such amount of parameters + ldr x8, [sp] + + dup v26.4s, w5 + dup v27.4s, w4 + dup v28.4s, w6 + + dup v29.4s, w3 + dup v30.4s, w7 + dup v31.4s, w8 + + cmp x2, 16 + blt LoopDepth8 + + LoopDepth16: + ld1 {v0.4s}, [x1], #16 + ld1 {v1.4s}, [x1], #16 + ld1 {v2.4s}, [x1], #16 + ld1 {v3.4s}, [x1], #16 + + sqshl v0.4s, v0.4s, v26.4s + sqshl v1.4s, v1.4s, v26.4s + sqshl v2.4s, v2.4s, v26.4s + sqshl v3.4s, v3.4s, v26.4s + + sqrdmulh v0.4s, v0.4s, v27.4s + sqrdmulh v1.4s, v1.4s, v27.4s + sqrdmulh v2.4s, v2.4s, v27.4s + sqrdmulh v3.4s, v3.4s, v27.4s + + and v16.16b, v28.16b, v0.16b + sshr v16.4s, v16.4s, #31 + sqadd v0.4s, v0.4s, v16.4s + srshl v0.4s, v0.4s, v28.4s + and v17.16b, v28.16b, v1.16b + sshr v17.4s, v17.4s, #31 + sqadd v1.4s, v1.4s, v17.4s + srshl v1.4s, v1.4s, v28.4s + and v18.16b, v28.16b, v2.16b + sshr v18.4s, v18.4s, #31 + sqadd v2.4s, v2.4s, v18.4s + srshl v2.4s, v2.4s, v28.4s + and v19.16b, v28.16b, v3.16b + sshr v19.4s, v19.4s, #31 + sqadd v3.4s, v3.4s, v19.4s + srshl v3.4s, v3.4s, v28.4s + + add v0.4s, v0.4s, v29.4s + add v1.4s, v1.4s, v29.4s + add v2.4s, v2.4s, v29.4s + add v3.4s, v3.4s, v29.4s + + smax v0.4s, v0.4s, v30.4s + smax v1.4s, v1.4s, v30.4s + smax v2.4s, v2.4s, v30.4s + smax v3.4s, v3.4s, v30.4s + + smin v0.4s, v0.4s, v31.4s + smin v1.4s, v1.4s, v31.4s + smin v2.4s, v2.4s, v31.4s + smin v3.4s, v3.4s, v31.4s + + sqxtn v0.4h, v0.4s + sqxtn v1.4h, v1.4s + sqxtn v2.4h, v2.4s + sqxtn v3.4h, v3.4s + + sqxtn v0.8b, v0.8h + sqxtn v1.8b, v1.8h + sqxtn v2.8b, v2.8h + sqxtn v3.8b, v3.8h + + st1 {v0.s}[0], [x0], #4 + st1 {v1.s}[0], [x0], #4 + st1 {v2.s}[0], [x0], #4 + st1 {v3.s}[0], [x0], #4 + + sub x2, x2, #16 + cmp x2, #16 + bge LoopDepth16 + + LoopDepth8: + cmp x2, #8 + blt LoopDepth4 + ld1 {v0.4s}, [x1], #16 + ld1 {v1.4s}, [x1], #16 + + sqshl v0.4s, v0.4s, v26.4s + sqshl v1.4s, v1.4s, v26.4s + + sqrdmulh v0.4s, v0.4s, v27.4s + sqrdmulh v1.4s, v1.4s, v27.4s + + and v16.16b, v28.16b, v0.16b + sshr v16.4s, v16.4s, #31 + sqadd v0.4s, v0.4s, v16.4s + srshl v0.4s, v0.4s, v28.4s + and v17.16b, v28.16b, v1.16b + sshr v17.4s, v17.4s, #31 + sqadd v1.4s, v1.4s, v17.4s + srshl v1.4s, v1.4s, v28.4s + + add v0.4s, v0.4s, v29.4s + add v1.4s, v1.4s, v29.4s + + smax v0.4s, v0.4s, v30.4s + smax v1.4s, v1.4s, v30.4s + + smin v0.4s, v0.4s, v31.4s + smin v1.4s, v1.4s, v31.4s + + sqxtn v0.4h, v0.4s + sqxtn v1.4h, v1.4s + + sqxtn v0.8b, v0.8h + sqxtn v1.8b, v1.8h + + st1 {v0.s}[0], [x0], #4 + st1 {v1.s}[0], [x0], #4 + + sub x2, x2, #8 + cmp x2, #8 + bge LoopDepth8 + + LoopDepth4: + cmp x2, #4 + blt End + ld1 {v0.4s}, [x1], #16 + + sqshl v0.4s, v0.4s, v26.4s + sqrdmulh v0.4s, v0.4s, v27.4s + + and v16.16b, v28.16b, v0.16b + sshr v16.4s, v16.4s, #31 + sqadd v0.4s, v0.4s, v16.4s + srshl v0.4s, v0.4s, v28.4s + + add v0.4s, v0.4s, v29.4s + smax v0.4s, v0.4s, v30.4s + smin v0.4s, v0.4s, v31.4s + + sqxtn v0.4h, v0.4s + sqxtn v0.8b, v0.8h + + st1 {v0.s}[0], [x0], #4 + + sub x2, x2, #4 + bge LoopDepth4 + End: + ret +#endif diff --git a/mindspore/lite/nnacl/assembly/arm64/ConvDwInt8Row.S b/mindspore/lite/nnacl/assembly/arm64/ConvDwInt8Row.S new file mode 100644 index 0000000000..7d32ef9bae --- /dev/null +++ b/mindspore/lite/nnacl/assembly/arm64/ConvDwInt8Row.S @@ -0,0 +1,122 @@ +#ifdef __aarch64__ + +.text 
+.align 5 +.global ConvDwInt8Row +#ifndef __APPLE__ +.type ConvDwInt8Row, %function +#endif + +// void ConvDwInt8Row(int32_t *output_ptr, const int8_t *input_ptr, const int16_t *weight_ptr, int num_pixels, +// int output_channel, int input_step, int8_t input_zp) +// x0: output_ptr, x1: input_ptr, x2: weight_ptr, x3: num_pixels, +// x4: output_channel, x5: input_step, x6: input_zp +// +ConvDwInt8Row: + // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to + // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers + // x19 ~ x29 should be also preserved + // whereas our coding style do not permit such amount of parameters +cmp x3, #0 +beq End + +mov x10, x0 + +dup v31.8b, w6 + +LoopOutPixel: +mov x7, x1 +mov x8, x2 +mov x9, x4 + + LoopDepth16In: + cmp x9, #16 + blt L8 + sub x9, x9, #16 + + ld1 {v0.8b, v1.8b}, [x7], #16 + ld1 {v2.8h, v3.8h}, [x8], #32 + ld1 {v16.4s, v17.4s}, [x0], #32 + + ssubl v20.8h, v0.8b, v31.8b + smlal v16.4s, v20.4h, v2.4h + smlal2 v17.4s, v20.8h, v2.8h + + + cmp x9, #16 + blt LoopDepth16Out + LoopDepth16: + + st1 {v16.4s, v17.4s}, [x10], #32 + ld1 {v18.4s, v19.4s}, [x0], #32 + ssubl v21.8h, v1.8b, v31.8b + smlal v18.4s, v21.4h, v3.4h + smlal2 v19.4s, v21.8h, v3.8h + st1 {v18.4s, v19.4s}, [x10], #32 + + ld1 {v0.8b, v1.8b}, [x7], #16 + ld1 {v2.8h, v3.8h}, [x8], #32 + ld1 {v16.4s, v17.4s}, [x0], #32 + + ssubl v20.8h, v0.8b, v31.8b + smlal v16.4s, v20.4h, v2.4h + smlal2 v17.4s, v20.8h, v2.8h + + sub x9, x9, #16 + cmp x9, #16 + bge LoopDepth16 + + LoopDepth16Out: + + st1 {v16.4s, v17.4s}, [x10], #32 + ld1 {v18.4s, v19.4s}, [x0], #32 + ssubl v21.8h, v1.8b, v31.8b + smlal v18.4s, v21.4h, v3.4h + smlal2 v19.4s, v21.8h, v3.8h + st1 {v18.4s, v19.4s}, [x10], #32 + + L8: + cmp x9, #8 + blt L0 + + LoopDepth8: + ld1 {v0.8b}, [x7], #8 + ld1 {v2.8h}, [x8], #16 + ld1 {v16.4s, v17.4s}, [x0], #32 + + ssubl v20.8h, v0.8b, v31.8b + smlal v16.4s, v20.4h, v2.4h + smlal2 v17.4s, v20.8h, v2.8h + st1 {v16.4s, v17.4s}, [x10], #32 + + sub x9, x9, #8 + cmp x9, #8 + bge LoopDepth8 + + L0: + cmp x9, #0 + beq Loop16LineEnd + + LoopDepth0: + ldrsb w14, [x7], #1 + ldrsh w15, [x8], #2 + ldr w16, [x0], #4 + add w14, w14, w6 + + sxth w14, w14 + madd w14, w14, w15, w16 + str w14, [x10], #4 + + subs x9, x9, #1 + bne LoopDepth0 + + Loop16LineEnd: + +subs x3, x3, #1 +add x1, x1, x5 +bne LoopOutPixel + +End: +ret + +#endif diff --git a/mindspore/lite/nnacl/assembly/arm64/MatmulFp32Opt.S b/mindspore/lite/nnacl/assembly/arm64/MatmulFp32Opt.S new file mode 100644 index 0000000000..b5a2ec2e4c --- /dev/null +++ b/mindspore/lite/nnacl/assembly/arm64/MatmulFp32Opt.S @@ -0,0 +1,812 @@ +#ifdef __aarch64__ + .text + .align 5 + .global MatmulFloatNeon64Opt +#ifndef __APPLE__ + .type MatmulFloatNeon64Opt, %function +#endif + +// A: LM [row_8 * depth] col_8_major +// B: RM [depth * col_8] row_8_major +// C: A*B [row_8 * col_8] col_8x8_major +// A * B -> [8 * depth] * [depth * 8] -> [8 * 4] * [4 * 8] or [8 * 1] * [1 * 8] +/////////////////////////////////////////////////////////////////////////////// +//CommLoopMul RM 1x8 block +// /-----------------------------------------\ +// |v2.s[0] ... v2.s[3] v3.s[0] ... v3.s[3]| +// \-----------------------------------------/ +// LM 8x1 block +// /---------------------\ /-----------------------------------------\ +// | v0.s[0] | |v16.s[0]...v16.s[3] v17.s[0]...v17.s[3]| +// | ... | | ... ... 
| +// | v0.s[3] | |v22.s[0]...v22.s[3] v23.s[0]...v23.s[3]| +// | v1.s[0] | |v24.s[0]...v24.s[3] v25.s[0]...v25.s[3]| +// | ... | | ... ... | +// | v1.s[3] | |v30.s[0]...v30.s[3] v31.s[0]...v31.s[3]| +// \---------------------/ \-----------------------------------------/ +// accumulators 8x8 block +// +/////////////////////////////////////////////////////////////////////////////// +//OptLoopMul4 RM 4x8 block +// /--------------------------------------------\ +// |v8.s[0] ... v8.s[3] v9.s[0] ... v9.s[3] | +// |v10.s[0] ... v10.s[3] v11.s[0] ... v11.s[3]| +// |v12.s[0] ... v12.s[3] v13.s[0] ... v13.s[3]| +// |v14.s[0] ... v14.s[3] v15.s[0] ... v15.s[3]| +// \--------------------------------------------/ +// LM 8x4 block +// /---------------------------------\ /--------------------------------------------\ +// | v0.s[0] v2.s[0] v4.s[0] v6.s[0] | |v16.s[0]...v16.s[3] v17.s[0]...v17.s[3] | +// | ... ... ... ... | | ... ... | +// | v0.s[3] v2.s[3] v4.s[3] v6.s[3] | |v22.s[0]...v22.s[3] v23.s[0]...v23.s[3] | +// | v1.s[0] v3.s[0] v5.s[0] v7.s[0] | |v24.s[0]...v24.s[3] v25.s[0]...v25.s[3] | +// | ... ... ... ... | | ... ... | +// | v1.s[3] v3.s[3] v5.s[3] v7.s[3] | |v30.s[0]...v30.s[3] v31.s[0]...v31.s[3] | +// \---------------------------------/ \--------------------------------------------/ +// accumulators 8x8 block +///////////////////////////////////////////////////////////////////////////////// +// +// void MatmulFloatNeon64(const float *a, const float *b, float *c, const float *bias, int act_type, int depth +// int row, int col, size_t stride, size_t writeNhwc, size_t WriteWino) +// x0: a +// x1: b +// x2: c +// x3: bias +// w4: act_type +// w5: depth +// w6: row +// w7: col +// w17: stride +// w13: c8_nhwc_c4 + +MatmulFloatNeon64Opt: + sub sp, sp, #128 + st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 + st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 + + ldr x9, [sp, #8] + ldr x14, [sp, #16] + + mov w18, #32 // sizeof(float) * 8 + mul w15, w5, w18 // block stride of lhs/rhs: sizeof(float) * 8 * depth + mov x18, #4 + ldr x17, [sp] + cbz x14, NoWinoSteps + mul x8, x7, x17 + mov x11, #8 + mul x11, x11, x17 + mul x8, x8, x18 + mul x11, x11, x18 +NoWinoSteps: + mul x17, x17, x18 + +L1: + mov w10, w6 // reload lhs row + mov x12, x0 // reload lhs ptr + mov x18, x2 // reload dst ptr + +L2: + mov x16, x1 // reload rhs ptr + mov w13, w5 // reload depth + dup v8.4s, wzr + dup v9.4s, wzr + dup v10.4s, wzr + dup v11.4s, wzr + dup v12.4s, wzr + dup v13.4s, wzr + dup v14.4s, wzr + dup v15.4s, wzr + dup v16.4s, wzr + dup v17.4s, wzr + dup v18.4s, wzr + dup v19.4s, wzr + dup v20.4s, wzr + dup v21.4s, wzr + dup v22.4s, wzr + dup v23.4s, wzr + dup v24.4s, wzr + dup v25.4s, wzr + dup v26.4s, wzr + dup v27.4s, wzr + dup v28.4s, wzr + dup v29.4s, wzr + dup v30.4s, wzr + dup v31.4s, wzr + +LoopStart: + ld1 {v0.4s, v1.4s, v2.4s}, [x12], #48 + ld1 {v3.4s, v4.4s}, [x16], #32 + fmla v8.4s, v3.4s, v0.s[0] + fmla v10.4s, v3.4s, v0.s[1] + fmla v12.4s, v3.4s, v0.s[2] + fmla v14.4s, v3.4s, v0.s[3] + fmla v9.4s, v4.4s, v0.s[0] + fmla v11.4s, v4.4s, v0.s[1] + fmla v13.4s, v4.4s, v0.s[2] + fmla v15.4s, v4.4s, v0.s[3] + + subs w13, w13, #1 + beq LoopEnd + +Loop: + ld1 {v0.4s}, [x12], #16 + fmla v16.4s, v3.4s, v1.s[0] + fmla v18.4s, v3.4s, v1.s[1] + fmla v20.4s, v3.4s, v1.s[2] + fmla v22.4s, v3.4s, v1.s[3] + fmla v17.4s, v4.4s, v1.s[0] + fmla v19.4s, v4.4s, v1.s[1] + fmla v21.4s, v4.4s, v1.s[2] + fmla v23.4s, v4.4s, v1.s[3] + ld1 {v1.4s}, [x12], #16 + fmla v24.4s, v3.4s, v2.s[0] + fmla v26.4s, v3.4s, v2.s[1] + fmla v28.4s, 
v3.4s, v2.s[2] + fmla v30.4s, v3.4s, v2.s[3] + ld1 {v3.4s}, [x16], #16 + fmla v25.4s, v4.4s, v2.s[0] + fmla v27.4s, v4.4s, v2.s[1] + fmla v29.4s, v4.4s, v2.s[2] + fmla v31.4s, v4.4s, v2.s[3] + ld1 {v4.4s}, [x16], #16 + fmla v8.4s, v3.4s, v0.s[0] + fmla v10.4s, v3.4s, v0.s[1] + fmla v12.4s, v3.4s, v0.s[2] + fmla v14.4s, v3.4s, v0.s[3] + ld1 {v2.4s}, [x12], #16 + fmla v9.4s, v4.4s, v0.s[0] + fmla v11.4s, v4.4s, v0.s[1] + fmla v13.4s, v4.4s, v0.s[2] + fmla v15.4s, v4.4s, v0.s[3] + + subs w13, w13, #1 + bgt Loop + +LoopEnd: + fmla v16.4s, v3.4s, v1.s[0] + fmla v18.4s, v3.4s, v1.s[1] + fmla v20.4s, v3.4s, v1.s[2] + fmla v22.4s, v3.4s, v1.s[3] + fmla v17.4s, v4.4s, v1.s[0] + fmla v19.4s, v4.4s, v1.s[1] + fmla v21.4s, v4.4s, v1.s[2] + fmla v23.4s, v4.4s, v1.s[3] + fmla v24.4s, v3.4s, v2.s[0] + fmla v26.4s, v3.4s, v2.s[1] + fmla v28.4s, v3.4s, v2.s[2] + fmla v30.4s, v3.4s, v2.s[3] + fmla v25.4s, v4.4s, v2.s[0] + fmla v27.4s, v4.4s, v2.s[1] + fmla v29.4s, v4.4s, v2.s[2] + fmla v31.4s, v4.4s, v2.s[3] + +Bias: + cbz x3, Activation + ld1 {v0.4s}, [x3], #16 + ld1 {v1.4s}, [x3] + sub x3, x3, #16 + fadd v8.4s, v8.4s, v0.4s + fadd v9.4s, v9.4s, v1.4s + fadd v10.4s, v10.4s, v0.4s + fadd v11.4s, v11.4s, v1.4s + fadd v12.4s, v12.4s, v0.4s + fadd v13.4s, v13.4s, v1.4s + fadd v14.4s, v14.4s, v0.4s + fadd v15.4s, v15.4s, v1.4s + fadd v16.4s, v16.4s, v0.4s + fadd v17.4s, v17.4s, v1.4s + fadd v18.4s, v18.4s, v0.4s + fadd v19.4s, v19.4s, v1.4s + fadd v20.4s, v20.4s, v0.4s + fadd v21.4s, v21.4s, v1.4s + fadd v22.4s, v22.4s, v0.4s + fadd v23.4s, v23.4s, v1.4s + fadd v24.4s, v24.4s, v0.4s + fadd v25.4s, v25.4s, v1.4s + fadd v26.4s, v26.4s, v0.4s + fadd v27.4s, v27.4s, v1.4s + fadd v28.4s, v28.4s, v0.4s + fadd v29.4s, v29.4s, v1.4s + fadd v30.4s, v30.4s, v0.4s + fadd v31.4s, v31.4s, v1.4s + +Activation: + cmp w4, #2 + beq Relu6 + cmp w4, #1 + beq Relu + b Write + +Relu6: + mov w13, #6 + dup v2.4s, w13 + scvtf v2.4s, v2.4s + fmin v8.4s, v8.4s, v2.4s + fmin v9.4s, v9.4s, v2.4s + fmin v10.4s, v10.4s, v2.4s + fmin v11.4s, v11.4s, v2.4s + fmin v12.4s, v12.4s, v2.4s + fmin v13.4s, v13.4s, v2.4s + fmin v14.4s, v14.4s, v2.4s + fmin v15.4s, v15.4s, v2.4s + fmin v16.4s, v16.4s, v2.4s + fmin v17.4s, v17.4s, v2.4s + fmin v18.4s, v18.4s, v2.4s + fmin v19.4s, v19.4s, v2.4s + fmin v20.4s, v20.4s, v2.4s + fmin v21.4s, v21.4s, v2.4s + fmin v22.4s, v22.4s, v2.4s + fmin v23.4s, v23.4s, v2.4s + fmin v24.4s, v24.4s, v2.4s + fmin v25.4s, v25.4s, v2.4s + fmin v26.4s, v26.4s, v2.4s + fmin v27.4s, v27.4s, v2.4s + fmin v28.4s, v28.4s, v2.4s + fmin v29.4s, v29.4s, v2.4s + fmin v30.4s, v30.4s, v2.4s + fmin v31.4s, v31.4s, v2.4s + +Relu: + dup v3.4s, wzr + fmax v8.4s, v8.4s, v3.4s + fmax v9.4s, v9.4s, v3.4s + fmax v10.4s, v10.4s, v3.4s + fmax v11.4s, v11.4s, v3.4s + fmax v12.4s, v12.4s, v3.4s + fmax v13.4s, v13.4s, v3.4s + fmax v14.4s, v14.4s, v3.4s + fmax v15.4s, v15.4s, v3.4s + fmax v16.4s, v16.4s, v3.4s + fmax v17.4s, v17.4s, v3.4s + fmax v18.4s, v18.4s, v3.4s + fmax v19.4s, v19.4s, v3.4s + fmax v20.4s, v20.4s, v3.4s + fmax v21.4s, v21.4s, v3.4s + fmax v22.4s, v22.4s, v3.4s + fmax v23.4s, v23.4s, v3.4s + fmax v24.4s, v24.4s, v3.4s + fmax v25.4s, v25.4s, v3.4s + fmax v26.4s, v26.4s, v3.4s + fmax v27.4s, v27.4s, v3.4s + fmax v28.4s, v28.4s, v3.4s + fmax v29.4s, v29.4s, v3.4s + fmax v30.4s, v30.4s, v3.4s + fmax v31.4s, v31.4s, v3.4s + +Write: + cbnz x14, WriteWino + cbz x9, WriteC8 + cmp w7, #1 + beq Write1 + cmp w7, #2 + beq Write2 + cmp w7, #3 + beq Write3 + cmp w7, #4 + beq Write4 + cmp w7, #5 + beq Write5 + cmp w7, #6 + beq Write6 + cmp w7, 
#7 + beq Write7 + b Write8 + +Write1: + str s8, [x18] + cmp w10, #1 + beq WriteEnd + add x18, x18, x17 + str s10, [x18] + cmp w10, #2 + beq WriteEnd + add x18, x18, x17 + str s12, [x18] + cmp w10, #3 + beq WriteEnd + add x18, x18, x17 + str s14, [x18] + cmp w10, #4 + beq WriteEnd + add x18, x18, x17 + str s16, [x18] + cmp w10, #5 + beq WriteEnd + add x18, x18, x17 + str s18, [x18] + cmp w10, #6 + beq WriteEnd + add x18, x18, x17 + str s20, [x18] + cmp w10, #7 + beq WriteEnd + add x18, x18, x17 + str s22, [x18] + cmp w10, #8 + beq WriteEnd + add x18, x18, x17 + str s24, [x18] + cmp w10, #9 + beq WriteEnd + add x18, x18, x17 + str s26, [x18] + cmp w10, #10 + beq WriteEnd + add x18, x18, x17 + str s28, [x18] + cmp w10, #11 + beq WriteEnd + add x18, x18, x17 + str s30, [x18] + add x18, x18, x17 + b WriteEnd +Write2: + dup s9, v8.s[1] + stp s8, s9, [x18] + cmp w10, #1 + beq WriteEnd + add x18, x18, x17 + dup s11, v10.s[1] + stp s10, s11, [x18] + cmp w10, #2 + beq WriteEnd + add x18, x18, x17 + dup s13, v12.s[1] + stp s12, s13, [x18] + cmp w10, #3 + beq WriteEnd + add x18, x18, x17 + dup s15, v14.s[1] + stp s14, s15, [x18] + cmp w10, #4 + beq WriteEnd + add x18, x18, x17 + dup s17, v16.s[1] + stp s16, s17, [x18] + cmp w10, #5 + beq WriteEnd + add x18, x18, x17 + dup s19, v18.s[1] + stp s18, s19, [x18] + cmp w10, #6 + beq WriteEnd + add x18, x18, x17 + dup s21, v20.s[1] + stp s20, s21, [x18] + cmp w10, #7 + beq WriteEnd + add x18, x18, x17 + dup s23, v22.s[1] + stp s22, s23, [x18] + cmp w10, #8 + beq WriteEnd + add x18, x18, x17 + dup s25, v24.s[1] + stp s24, s25, [x18] + cmp w10, #9 + beq WriteEnd + add x18, x18, x17 + dup s27, v26.s[1] + stp s26, s27, [x18] + cmp w10, #10 + beq WriteEnd + add x18, x18, x17 + dup s29, v28.s[1] + stp s28, s29, [x18] + cmp w10, #11 + beq WriteEnd + add x18, x18, x17 + dup s31, v30.s[1] + stp s30, s31, [x18] + add x18, x18, x17 + b WriteEnd +Write3: + add x13, x18, #8 + dup s9, v8.s[1] + stp s8, s9, [x18] + add x18, x18, x17 + st1 {v8.s}[2], [x13], x17 + cmp w10, #1 + beq WriteEnd + dup s11, v10.s[1] + stp s10, s11, [x18] + add x18, x18, x17 + st1 {v10.s}[2], [x13], x17 + cmp w10, #2 + beq WriteEnd + dup s13, v12.s[1] + stp s12, s13, [x18] + add x18, x18, x17 + st1 {v12.s}[2], [x13], x17 + cmp w10, #3 + beq WriteEnd + dup s15, v14.s[1] + stp s14, s15, [x18] + add x18, x18, x17 + st1 {v14.s}[2], [x13], x17 + cmp w10, #4 + beq WriteEnd + dup s17, v16.s[1] + stp s16, s17, [x18] + add x18, x18, x17 + st1 {v16.s}[2], [x13], x17 + cmp w10, #5 + beq WriteEnd + dup s19, v18.s[1] + stp s18, s19, [x18] + add x18, x18, x17 + st1 {v18.s}[2], [x13], x17 + cmp w10, #6 + beq WriteEnd + dup s21, v20.s[1] + stp s20, s21, [x18] + add x18, x18, x17 + st1 {v20.s}[2], [x13], x17 + cmp w10, #7 + beq WriteEnd + dup s23, v22.s[1] + stp s22, s23, [x18] + add x18, x18, x17 + st1 {v22.s}[2], [x13], x17 + cmp w10, #8 + beq WriteEnd + dup s25, v24.s[1] + stp s24, s25, [x18] + add x18, x18, x17 + st1 {v24.s}[2], [x13], x17 + cmp w10, #9 + beq WriteEnd + dup s27, v26.s[1] + stp s26, s27, [x18] + add x18, x18, x17 + st1 {v26.s}[2], [x13], x17 + cmp w10, #10 + beq WriteEnd + dup s29, v28.s[1] + stp s28, s29, [x18] + add x18, x18, x17 + st1 {v28.s}[2], [x13], x17 + cmp w10, #11 + beq WriteEnd + dup s31, v30.s[1] + stp s30, s31, [x18] + add x18, x18, x17 + st1 {v30.s}[2], [x13] + b WriteEnd +Write4: + st1 {v8.4s}, [x18], x17 + cmp w10, #1 + beq WriteEnd + st1 {v10.4s}, [x18], x17 + cmp w10, #2 + beq WriteEnd + st1 {v12.4s}, [x18], x17 + cmp w10, #3 + beq WriteEnd + st1 {v14.4s}, [x18], x17 + cmp 
w10, #4 + beq WriteEnd + st1 {v16.4s}, [x18], x17 + cmp w10, #5 + beq WriteEnd + st1 {v18.4s}, [x18], x17 + cmp w10, #6 + beq WriteEnd + st1 {v20.4s}, [x18], x17 + cmp w10, #7 + beq WriteEnd + st1 {v22.4s}, [x18], x17 + cmp w10, #8 + beq WriteEnd + st1 {v24.4s}, [x18], x17 + cmp w10, #9 + beq WriteEnd + st1 {v26.4s}, [x18], x17 + cmp w10, #10 + beq WriteEnd + st1 {v28.4s}, [x18], x17 + cmp w10, #11 + beq WriteEnd + st1 {v30.4s}, [x18], x17 + b WriteEnd +Write5: + add x13, x18, #16 + st1 {v8.4s}, [x18], x17 + str s9, [x13] + cmp w10, #1 + beq WriteEnd + add x13, x13, x17 + st1 {v10.4s}, [x18], x17 + str s11, [x13] + cmp w10, #2 + beq WriteEnd + add x13, x13, x17 + st1 {v12.4s}, [x18], x17 + str s13, [x13] + cmp w10, #3 + beq WriteEnd + add x13, x13, x17 + st1 {v14.4s}, [x18], x17 + str s15, [x13] + cmp w10, #4 + beq WriteEnd + add x13, x13, x17 + st1 {v16.4s}, [x18], x17 + str s17, [x13] + cmp w10, #5 + beq WriteEnd + add x13, x13, x17 + st1 {v18.4s}, [x18], x17 + str s19, [x13] + cmp w10, #6 + beq WriteEnd + add x13, x13, x17 + st1 {v20.4s}, [x18], x17 + str s21, [x13] + cmp w10, #7 + beq WriteEnd + add x13, x13, x17 + st1 {v22.4s}, [x18], x17 + str s23, [x13] + cmp w10, #8 + beq WriteEnd + add x13, x13, x17 + st1 {v24.4s}, [x18], x17 + str s25, [x13] + cmp w10, #9 + beq WriteEnd + add x13, x13, x17 + st1 {v26.4s}, [x18], x17 + str s27, [x13] + cmp w10, #10 + beq WriteEnd + add x13, x13, x17 + st1 {v28.4s}, [x18], x17 + str s29, [x13] + cmp w10, #11 + beq WriteEnd + add x13, x13, x17 + st1 {v30.4s}, [x18], x17 + str s31, [x13] + b WriteEnd +Write6: + add x13, x18, #16 + st1 {v8.4s}, [x18], x17 + dup s8, v9.s[1] + stp s9, s8, [x13] + cmp w10, #1 + beq WriteEnd + add x13, x13, x17 + st1 {v10.4s}, [x18], x17 + dup s10, v11.s[1] + stp s11, s10, [x13] + cmp w10, #2 + beq WriteEnd + add x13, x13, x17 + st1 {v12.4s}, [x18], x17 + dup s12, v13.s[1] + stp s13, s12, [x13] + cmp w10, #3 + beq WriteEnd + add x13, x13, x17 + st1 {v14.4s}, [x18], x17 + dup s14, v15.s[1] + stp s15, s14, [x13] + cmp w10, #4 + beq WriteEnd + add x13, x13, x17 + st1 {v16.4s}, [x18], x17 + dup s16, v17.s[1] + stp s17, s16, [x13] + cmp w10, #5 + beq WriteEnd + add x13, x13, x17 + st1 {v18.4s}, [x18], x17 + dup s18, v19.s[1] + stp s19, s18, [x13] + cmp w10, #6 + beq WriteEnd + add x13, x13, x17 + st1 {v20.4s}, [x18], x17 + dup s20, v21.s[1] + stp s21, s20, [x13] + cmp w10, #7 + beq WriteEnd + add x13, x13, x17 + st1 {v22.4s}, [x18], x17 + dup s22, v23.s[1] + stp s23, s22, [x13] + cmp w10, #8 + beq WriteEnd + add x13, x13, x17 + st1 {v24.4s}, [x18], x17 + dup s24, v25.s[1] + stp s25, s24, [x13] + cmp w10, #9 + beq WriteEnd + add x13, x13, x17 + st1 {v26.4s}, [x18], x17 + dup s26, v27.s[1] + stp s27, s26, [x13] + cmp w10, #10 + beq WriteEnd + add x13, x13, x17 + st1 {v28.4s}, [x18], x17 + dup s28, v29.s[1] + stp s29, s28, [x13] + cmp w10, #11 + beq WriteEnd + add x13, x13, x17 + st1 {v30.4s}, [x18], x17 + dup s30, v31.s[1] + stp s31, s30, [x13] + b WriteEnd +Write7: + add x13, x18, #16 + add x16, x18, #24 + st1 {v8.4s}, [x18], x17 + dup s8, v9.s[1] + stp s9, s8, [x13] + add x13, x13, x17 + st1 {v9.s}[2], [x16], x17 + cmp w10, #1 + beq WriteEnd + st1 {v10.4s}, [x18], x17 + dup s10, v11.s[1] + stp s11, s10, [x13] + add x13, x13, x17 + st1 {v11.s}[2], [x16], x17 + cmp w10, #2 + beq WriteEnd + st1 {v12.4s}, [x18], x17 + dup s12, v13.s[1] + stp s13, s12, [x13] + add x13, x13, x17 + st1 {v13.s}[2], [x16], x17 + cmp w10, #3 + beq WriteEnd + st1 {v14.4s}, [x18], x17 + dup s14, v15.s[1] + stp s15, s14, [x13] + add x13, x13, x17 + st1 
{v15.s}[2], [x16], x17 + cmp w10, #4 + beq WriteEnd + st1 {v16.4s}, [x18], x17 + dup s16, v17.s[1] + stp s17, s16, [x13] + add x13, x13, x17 + st1 {v17.s}[2], [x16], x17 + cmp w10, #5 + beq WriteEnd + st1 {v18.4s}, [x18], x17 + dup s18, v19.s[1] + stp s19, s18, [x13] + add x13, x13, x17 + st1 {v19.s}[2], [x16], x17 + cmp w10, #6 + beq WriteEnd + st1 {v20.4s}, [x18], x17 + dup s20, v21.s[1] + stp s21, s20, [x13] + add x13, x13, x17 + st1 {v21.s}[2], [x16], x17 + cmp w10, #7 + beq WriteEnd + st1 {v22.4s}, [x18], x17 + dup s22, v23.s[1] + stp s23, s22, [x13] + add x13, x13, x17 + st1 {v23.s}[2], [x16], x17 + cmp w10, #8 + beq WriteEnd + st1 {v24.4s}, [x18], x17 + dup s24, v25.s[1] + stp s25, s24, [x13] + add x13, x13, x17 + st1 {v25.s}[2], [x16], x17 + cmp w10, #9 + beq WriteEnd + st1 {v26.4s}, [x18], x17 + dup s26, v27.s[1] + stp s27, s26, [x13] + add x13, x13, x17 + st1 {v27.s}[2], [x16], x17 + cmp w10, #10 + beq WriteEnd + st1 {v28.4s}, [x18], x17 + dup s28, v29.s[1] + stp s29, s28, [x13] + add x13, x13, x17 + st1 {v29.s}[2], [x16], x17 + cmp w10, #11 + beq WriteEnd + st1 {v30.4s}, [x18], x17 + dup s30, v31.s[1] + stp s31, s30, [x13] + add x13, x13, x17 + st1 {v31.s}[2], [x16], x17 + b WriteEnd +WriteC8: + st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [x2], #64 + st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x2], #64 + st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x2], #64 + st1 {v20.4s, v21.4s, v22.4s, v23.4s}, [x2], #64 + st1 {v24.4s, v25.4s, v26.4s, v27.4s}, [x2], #64 + st1 {v28.4s, v29.4s, v30.4s, v31.4s}, [x2], #64 + b WriteEnd +WriteWino: + st1 {v8.4s, v9.4s}, [x18], x8 + st1 {v10.4s, v11.4s}, [x18], x8 + st1 {v12.4s, v13.4s}, [x18], x8 + st1 {v14.4s, v15.4s}, [x18], x8 + st1 {v16.4s, v17.4s}, [x18], x8 + st1 {v18.4s, v19.4s}, [x18], x8 + st1 {v20.4s, v21.4s}, [x18], x8 + st1 {v22.4s, v23.4s}, [x18], x8 + st1 {v24.4s, v25.4s}, [x18], x8 + st1 {v26.4s, v27.4s}, [x18], x8 + st1 {v28.4s, v29.4s}, [x18], x8 + st1 {v30.4s, v31.4s}, [x18], x8 + b WriteEnd +Write8: + st1 {v8.4s, v9.4s}, [x18], x17 + cmp w10, #1 + beq WriteEnd + st1 {v10.4s, v11.4s}, [x18], x17 + cmp w10, #2 + beq WriteEnd + st1 {v12.4s, v13.4s}, [x18], x17 + cmp w10, #3 + beq WriteEnd + st1 {v14.4s, v15.4s}, [x18], x17 + cmp w10, #4 + beq WriteEnd + st1 {v16.4s, v17.4s}, [x18], x17 + cmp w10, #5 + beq WriteEnd + st1 {v18.4s, v19.4s}, [x18], x17 + cmp w10, #6 + beq WriteEnd + st1 {v20.4s, v21.4s}, [x18], x17 + cmp w10, #7 + beq WriteEnd + st1 {v22.4s, v23.4s}, [x18], x17 + cmp w10, #8 + beq WriteEnd + st1 {v24.4s, v25.4s}, [x18], x17 + cmp w10, #9 + beq WriteEnd + st1 {v26.4s, v27.4s}, [x18], x17 + cmp w10, #10 + beq WriteEnd + st1 {v28.4s, v29.4s}, [x18], x17 + cmp w10, #11 + beq WriteEnd + st1 {v30.4s, v31.4s}, [x18], x17 + +WriteEnd: + subs w10, w10, #12 // lhs row - 12 + bgt L2 + +End2: + subs w7, w7, #8 // rhs col - 8 + add x1, x1, x15 // rhs ptr + stride + cbz x3, NoBiasStep + add x3, x3, #32 // bias ptr + stride +NoBiasStep: + cbnz x14, WinoDstStep + cbz x9, NoDstStep + add x2, x2, #32 // dst ptr + stride + b NoDstStep +WinoDstStep: + add x2, x2, x11 +NoDstStep: + bgt L1 + +End1: + sub sp, sp, #128 + ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 + ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 + ret +#endif diff --git a/mindspore/lite/nnacl/assembly/arm64/MatmulFp32OptRemain.S b/mindspore/lite/nnacl/assembly/arm64/MatmulFp32OptRemain.S new file mode 100644 index 0000000000..dd42b49245 --- /dev/null +++ b/mindspore/lite/nnacl/assembly/arm64/MatmulFp32OptRemain.S @@ -0,0 +1,144 @@ +#ifdef __aarch64__ + .text + .align 5 + .global 
MatmulFloatNeon64OptRemain +#ifndef __APPLE__ + .type MatmulFloatNeon64OptRemain, %function +#endif + +// void MatmulFloatNeon64OptRemain(const float *a, const float *b, float *c, int depth, +// int row, int col, size_t stride) +// x0: a +// x1: b +// x2: c +// x3: depth +// x4: row +// x5: col +// x6: stride +// only for winograd +MatmulFloatNeon64OptRemain: + mov x18, #32 // sizeof(float) * 8 + mul x9, x3, x18 // block stride of lhs/rhs: sizeof(float) * 8 * depth + mov x18, #4 + mul x8, x5, x6 + mov x11, #8 + mul x11, x11, x6 + mul x8, x8, x18 + mul x11, x11, x18 + + cmp x4, #4 + ble LoopH4 + + LoopH8: + mov x10, x4 // reload lhs row + mov x12, x0 // reload lhs ptr + mov x18, x2 // reload dst ptr + + LoopW8: + mov x16, x1 // reload rhs ptr + mov x13, x3 // reload depth + dup v16.4s, wzr + dup v17.4s, wzr + dup v18.4s, wzr + dup v19.4s, wzr + dup v20.4s, wzr + dup v21.4s, wzr + dup v22.4s, wzr + dup v23.4s, wzr + dup v24.4s, wzr + dup v25.4s, wzr + dup v26.4s, wzr + dup v27.4s, wzr + dup v28.4s, wzr + dup v29.4s, wzr + dup v30.4s, wzr + dup v31.4s, wzr + + LoopD8: + ld1 {v0.4s, v1.4s, v2.4s}, [x12], #48 + ld1 {v3.4s, v4.4s}, [x16], #32 + fmla v16.4s, v3.4s, v0.s[0] + fmla v18.4s, v3.4s, v0.s[1] + fmla v20.4s, v3.4s, v0.s[2] + fmla v22.4s, v3.4s, v0.s[3] + fmla v17.4s, v4.4s, v0.s[0] + fmla v19.4s, v4.4s, v0.s[1] + fmla v21.4s, v4.4s, v0.s[2] + fmla v23.4s, v4.4s, v0.s[3] + fmla v24.4s, v3.4s, v1.s[0] + fmla v26.4s, v3.4s, v1.s[1] + fmla v28.4s, v3.4s, v1.s[2] + fmla v30.4s, v3.4s, v1.s[3] + fmla v25.4s, v4.4s, v1.s[0] + fmla v27.4s, v4.4s, v1.s[1] + fmla v29.4s, v4.4s, v1.s[2] + fmla v31.4s, v4.4s, v1.s[3] + + subs w13, w13, #1 + bgt LoopD8 + + st1 {v16.4s, v17.4s}, [x18], x8 + st1 {v18.4s, v19.4s}, [x18], x8 + st1 {v20.4s, v21.4s}, [x18], x8 + st1 {v22.4s, v23.4s}, [x18], x8 + st1 {v24.4s, v25.4s}, [x18], x8 + st1 {v26.4s, v27.4s}, [x18], x8 + st1 {v28.4s, v29.4s}, [x18], x8 + st1 {v30.4s, v31.4s}, [x18], x8 + + subs x10, x10, #8 // lhs row - 8 + bgt LoopW8 + + subs x5, x5, #8 // rhs col - 8 + add x1, x1, x9 // rhs ptr + stride + add x2, x2, x11 + bgt LoopH8 + + ret + + LoopH4: + mov x10, x4 // reload lhs row + mov x12, x0 // reload lhs ptr + mov x18, x2 // reload dst ptr + + LoopW4: + mov x16, x1 // reload rhs ptr + mov x13, x3 // reload depth + dup v16.4s, wzr + dup v17.4s, wzr + dup v18.4s, wzr + dup v19.4s, wzr + dup v20.4s, wzr + dup v21.4s, wzr + dup v22.4s, wzr + dup v23.4s, wzr + + LoopD4: + ld1 {v0.4s, v1.4s, v2.4s}, [x12], #48 + ld1 {v3.4s, v4.4s}, [x16], #32 + fmla v16.4s, v3.4s, v0.s[0] + fmla v18.4s, v3.4s, v0.s[1] + fmla v20.4s, v3.4s, v0.s[2] + fmla v22.4s, v3.4s, v0.s[3] + fmla v17.4s, v4.4s, v0.s[0] + fmla v19.4s, v4.4s, v0.s[1] + fmla v21.4s, v4.4s, v0.s[2] + fmla v23.4s, v4.4s, v0.s[3] + + subs x13, x13, #1 + bgt LoopD4 + + st1 {v16.4s, v17.4s}, [x18], x8 + st1 {v18.4s, v19.4s}, [x18], x8 + st1 {v20.4s, v21.4s}, [x18], x8 + st1 {v22.4s, v23.4s}, [x18], x8 + + subs x10, x10, #4 // lhs row - 4 + bgt LoopW4 + + subs x5, x5, #8 // rhs col - 8 + add x1, x1, x9 // rhs ptr + stride + add x2, x2, x11 + bgt LoopH4 + ret +#endif diff --git a/mindspore/lite/nnacl/assembly/arm64/MatmulFp32OptRemain.S b/mindspore/lite/nnacl/assembly/arm64/MatmulInt8.S index 9f1c11a3e9..e92adc1be6 --- a/mindspore/lite/nnacl/assembly/arm64/MatmulInt8.S +++ b/mindspore/lite/nnacl/assembly/arm64/MatmulInt8.S @@ -24,7 +24,7 @@ //void MatmulInt8Neon64(const int8_t *a, const int8_t *b, int8_t *dst, int row4, int col4, int deep16, // const int *a_sums, const int *bias, int act_min, int act_max, int out_zp, -// int 
multiplier, int left_shift, int right_shift); +// int multiplier, int left_shift, int right_shift, int row, int col, int stride); // x0: a(left matrix ptr) // x1: b(right matrix ptr) @@ -40,13 +40,18 @@ // w11: multiplier // w12: left_shift // w13: right_shift +// w14: row +// w15: col +// w24: stride MatmulInt8Neon64: - sub sp, sp, #160 + sub sp, sp, #192 st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 stp x19, x20, [sp], #16 stp x21, x22, [sp], #16 + stp x23, x24, [sp], #16 + stp x25, x26, [sp], #16 ldr w8, [sp] ldr w9, [sp, #8] @@ -54,25 +59,28 @@ MatmulInt8Neon64: ldr w11, [sp, #24] ldr w12, [sp, #32] ldr w13, [sp, #40] + ldr w14, [sp, #48] + ldr w15, [sp, #56] + ldr w24, [sp, #64] - mov w15, #0 // b col index - mov w16, #0 // a row index mov w17, #4 // sizeof(int8)*4 mul w21, w5, w17 // the stride of a/b: sizeof(int8)*4*deep16 - + mov w17, #1 + mov x25, x2 L1: - cmp w15, w4 + cmp w4, #0 // if at the end of col4 beq End1 - mov w16, #0 // reset a row index + mov w16, w3 // reset a row4 counter + mov w23, w14 // reset a row counter mov x17, x0 // reload a ptr mov x22, x6 // reload a_sums ptr L2: - cmp w16, w3 + cmp w16, #0 beq End2 mov x18, x1 // reload b ptr - mov x19, x7 // reload bias ptr + mov x19, x7 // reload bias ptr mov w20, w5 // reload depth dup v16.4s, wzr dup v17.4s, wzr @@ -256,21 +264,128 @@ End3: sqxtn v15.8b, v13.8h sqxtn2 v15.16b, v14.8h - st1 {v15.16b}, [x2], #16 - add w16, w16, #4 // a row index + 4 + cmp w23, #4 + blt Write // if rows < 4 + cmp w15, #4 + blt Write // if cols < 4 + + st1 {v15.s}[0], [x2], x24 + st1 {v15.s}[1], [x2], x24 + st1 {v15.s}[2], [x2], x24 + st1 {v15.s}[3], [x2], x24 + b Endwrite + +Write: + cmp w15, #4 + beq WriteCol4 + cmp w15, #3 + beq WriteCol3 + cmp w15, #2 + beq WriteCol2 + cmp w15, #1 + beq WriteCol1 + +WriteCol4: + st1 {v15.s}[0], [x2], x24 + cmp w23, #1 + beq Endwrite + st1 {v15.s}[1], [x2], x24 + cmp w23, #2 + beq Endwrite + st1 {v15.s}[2], [x2], x24 + cmp w23, #3 + beq Endwrite + st1 {v15.s}[3], [x2], x24 + b Endwrite + +WriteCol3: + mov x26, x2 + st1 {v15.b}[0], [x26], #1 + st1 {v15.b}[1], [x26], #1 + st1 {v15.b}[2], [x26], #1 + add x2, x2, x24 + cmp w23, #1 + beq Endwrite + mov x26, x2 + st1 {v15.b}[4], [x26], #1 + st1 {v15.b}[5], [x26], #1 + st1 {v15.b}[6], [x26], #1 + add x2, x2, x24 + cmp w23, #2 + beq Endwrite + mov x26, x2 + st1 {v15.b}[8], [x26], #1 + st1 {v15.b}[9], [x26], #1 + st1 {v15.b}[10], [x26], #1 + add x2, x2, x24 + cmp w23, #3 + beq Endwrite + mov x26, x2 + st1 {v15.b}[12], [x26], #1 + st1 {v15.b}[13], [x26], #1 + st1 {v15.b}[14], [x26], #1 + add x2, x2, x24 + b Endwrite + +WriteCol2: + mov x26, x2 + st1 {v15.b}[0], [x26], #1 + st1 {v15.b}[1], [x26], #1 + add x2, x2, x24 + cmp w23, #1 + beq Endwrite + mov x26, x2 + st1 {v15.b}[4], [x26], #1 + st1 {v15.b}[5], [x26], #1 + add x2, x2, x24 + cmp w23, #2 + beq Endwrite + mov x26, x2 + st1 {v15.b}[8], [x26], #1 + st1 {v15.b}[9], [x26], #1 + add x2, x2, x24 + cmp w23, #3 + beq Endwrite + mov x26, x2 + st1 {v15.b}[12], [x26], #1 + st1 {v15.b}[13], [x26], #1 + add x2, x2, x24 + b Endwrite + +WriteCol1: + st1 {v15.b}[0], [x2], x24 + cmp w23, #1 + beq Endwrite + st1 {v15.b}[4], [x2], x24 + cmp w23, #2 + beq Endwrite + st1 {v15.b}[8], [x2], x24 + cmp w23, #3 + beq Endwrite + st1 {v15.b}[12], [x2], x24 + b Endwrite + +Endwrite: + sub w16, w16, #4 // a row4 counter - 4 + sub w23, w23, #4 // a row counter - 4 b L2 End2: - add w15, w15, #4 // b col index + 4 + sub w4, w4, #4 // b col4 counter - 4 + sub w15, w15, #4 // b col counter - 4 
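The Write/WriteCol paths above exist because the kernel computes full 4x4 tiles while the logical matrix may end with fewer valid rows (w23) or columns (w15); each valid row of the tile is stored and the destination pointer advanced by the stride held in x24. A C model of that tail handling (a sketch only, assuming a row-major int8 destination):

#include <stdint.h>
static void WriteInt8TileRef(const int8_t tile[4][4], int8_t *dst, int rows_left, int cols_left, int stride) {
  int rows = rows_left < 4 ? rows_left : 4;
  int cols = cols_left < 4 ? cols_left : 4;
  for (int r = 0; r < rows; ++r) {
    for (int c = 0; c < cols; ++c) {
      dst[c] = tile[r][c];  /* WriteColN stores the first N bytes of the row */
    }
    dst += stride;          /* add x2, x2, x24 */
  }
}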
add x1, x1, x21 // b ptr + stride add x7, x7, #16 // bias ptr + stride + add x25, x25, #4 // output + stride(4 * sizeof(int8)) + mov x2, x25 b L1 End1: - sub sp, sp, #160 + sub sp, sp, #192 ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 ldp x19, x20, [sp], #16 ldp x21, x22, [sp], #16 + ldp x23, x24, [sp], #16 + ldp x25, x26, [sp], #16 ret #endif diff --git a/mindspore/lite/nnacl/assembly/opt/ConvDwFp16Row.S b/mindspore/lite/nnacl/assembly/opt/ConvDwFp16Row.S new file mode 100644 index 0000000000..6cc0a2cf40 --- /dev/null +++ b/mindspore/lite/nnacl/assembly/opt/ConvDwFp16Row.S @@ -0,0 +1,117 @@ +#ifdef __aarch64__ + +.text +.align 5 +.global ConvDwFp16Row +#ifndef __APPLE__ +.type ConvDwFp16Row, %function +#endif + +// void ConvDwFp16Row(float16_t* output_ptr, const float16_t* input_ptr, const float16_t* filter_ptr, +// size_t num_pixels, size_t input_channel, size_t input_step) +// x0: output_ptr, x1: input_ptr, x2: filter_ptr, x3: num_pixels, +// x4: input_channel, x5: input_step +// +ConvDwFp16Row: + // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to + // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers + // x19 ~ x29 should also be preserved + // whereas our coding style does not permit such an amount of parameters +cmp x3, #0 +beq End + +mov x9, x0 +mov x12, #2 // sizeof(float16_t) +mul x5, x5, x12 + +LoopOutPixel: +mov x6, x1 +mov x7, x2 +mov x8, x4 + +LoopInputDepth32In: + cmp x8, #32 + blt Loop8 + sub x8, x8, #32 + + ld1 {v0.8h, v1.8h}, [x6], #32 + ld1 {v2.8h, v3.8h}, [x7], #32 + ld1 {v16.8h, v17.8h}, [x0], #32 + + cmp x8, #32 + blt LoopInputDepth32Out + LoopInputDepth32: + fmla v16.8h, v0.8h, v2.8h + fmla v17.8h, v1.8h, v3.8h + + st1 {v16.8h, v17.8h}, [x9], #32 + + ld1 {v4.8h, v5.8h}, [x6], #32 + ld1 {v6.8h, v7.8h}, [x7], #32 + ld1 {v18.8h, v19.8h}, [x0], #32 + + fmla v18.8h, v4.8h, v6.8h + fmla v19.8h, v5.8h, v7.8h + + st1 {v18.8h, v19.8h}, [x9], #32 + + ld1 {v0.8h, v1.8h}, [x6], #32 + ld1 {v2.8h, v3.8h}, [x7], #32 + ld1 {v16.8h, v17.8h}, [x0], #32 + + sub x8, x8, #32 + cmp x8, #32 + bge LoopInputDepth32 + + LoopInputDepth32Out: + fmla v16.8h, v0.8h, v2.8h + fmla v17.8h, v1.8h, v3.8h + st1 {v16.8h, v17.8h}, [x9], #32 + + ld1 {v4.8h, v5.8h}, [x6], #32 + ld1 {v6.8h, v7.8h}, [x7], #32 + ld1 {v18.8h, v19.8h}, [x0], #32 + + fmla v18.8h, v4.8h, v6.8h + fmla v19.8h, v5.8h, v7.8h + + st1 {v18.8h, v19.8h}, [x9], #32 + + Loop8: + cmp x8, #8 + blt L0 + + LoopInputDepth8: + ld1 {v0.8h}, [x6], #16 + ld1 {v2.8h}, [x7], #16 + ld1 {v16.8h}, [x0], #16 + fmla v16.8h, v0.8h, v2.8h + st1 {v16.8h}, [x9], #16 + sub x8, x8, #8 + cmp x8, #8 + bge LoopInputDepth8 + + L0: + cmp x8, #0 + beq Loop8LineEnd + + LoopInputDepth0: + ldr h0, [x6], #2 + ldr h1, [x7], #2 + ldr h2, [x0], #2 + fmul h0, h0, h1 + fadd h2, h2, h0 + str h2, [x9], #2 + subs x8, x8, #1 + bne LoopInputDepth0 + + Loop8LineEnd: + +subs x3, x3, #1 +add x1, x1, x5 +bne LoopOutPixel + +End: +ret + +#endif diff --git a/mindspore/lite/nnacl/assembly/opt/IndirectGemmInt8_24x4_dp.S b/mindspore/lite/nnacl/assembly/opt/IndirectGemmInt8_24x4_dp.S index ae4c07fbeb..be79622f64 100644 --- a/mindspore/lite/nnacl/assembly/opt/IndirectGemmInt8_24x4_dp.S +++ b/mindspore/lite/nnacl/assembly/opt/IndirectGemmInt8_24x4_dp.S @@ -36,7 +36,7 @@ IndirectGemmInt8_24x4_dp: ld1 {v17.4s}, [x22], x23 ld1 {v18.4s}, [x22], x23 ld1 {v19.4s}, [x22], x23 - ld1{v20.4s}, [x22], x23 + ld1 {v20.4s}, [x22], x23 ld1 {v21.4s}, [x22], x23 ld1 {v22.4s}, 
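ConvDwFp16Row in the new file above is a per-channel multiply-accumulate of one depthwise filter row over num_pixels positions; the 32-, 8- and 1-element loops are successively narrower tilings of the same recurrence. A scalar reference (a sketch only: __fp16 assumes an fp16-capable toolchain, and input_step is in elements, which the kernel itself scales to bytes via mul x5, x5, x12):

void ConvDwFp16RowRef(__fp16 *output_ptr, const __fp16 *input_ptr, const __fp16 *filter_ptr,
                      int num_pixels, int input_channel, int input_step) {
  for (int p = 0; p < num_pixels; ++p) {
    const __fp16 *in = input_ptr + p * input_step;
    for (int c = 0; c < input_channel; ++c) {
      *output_ptr = *output_ptr + in[c] * filter_ptr[c];  /* out += in * filter, channel-wise */
      ++output_ptr;
    }
  }
}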
[x22], x23 ld1 {v23.4s}, [x22], x23 @@ -404,7 +404,7 @@ IndirectGemmInt8_24x4_dp: sshr v0.4s, v0.4s, #31 sqadd v8.4s, v8.4s, v0.4s srshl v8.4s, v8.4s, v4.4s - and v0.16b, v4.16b, v9.16b + and v1.16b, v4.16b, v9.16b sshr v1.4s, v1.4s, #31 sqadd v9.4s, v9.4s, v1.4s srshl v9.4s, v9.4s, v4.4s @@ -420,7 +420,7 @@ IndirectGemmInt8_24x4_dp: sshr v0.4s, v0.4s, #31 sqadd v12.4s, v12.4s, v0.4s srshl v12.4s, v12.4s, v4.4s - and v0.16b, v4.16b, v13.16b + and v1.16b, v4.16b, v13.16b sshr v1.4s, v1.4s, #31 sqadd v13.4s, v13.4s, v1.4s srshl v13.4s, v13.4s, v4.4s @@ -436,7 +436,7 @@ IndirectGemmInt8_24x4_dp: sshr v0.4s, v0.4s, #31 sqadd v16.4s, v16.4s, v0.4s srshl v16.4s, v16.4s, v4.4s - and v0.16b, v4.16b, v17.16b + and v1.16b, v4.16b, v17.16b sshr v1.4s, v1.4s, #31 sqadd v17.4s, v17.4s, v1.4s srshl v17.4s, v17.4s, v4.4s @@ -452,7 +452,7 @@ IndirectGemmInt8_24x4_dp: sshr v0.4s, v0.4s, #31 sqadd v20.4s, v20.4s, v0.4s srshl v20.4s, v20.4s, v4.4s - and v0.16b, v4.16b, v21.16b + and v1.16b, v4.16b, v21.16b sshr v1.4s, v1.4s, #31 sqadd v21.4s, v21.4s, v1.4s srshl v21.4s, v21.4s, v4.4s @@ -468,7 +468,7 @@ IndirectGemmInt8_24x4_dp: sshr v0.4s, v0.4s, #31 sqadd v24.4s, v24.4s, v0.4s srshl v24.4s, v24.4s, v4.4s - and v0.16b, v4.16b, v25.16b + and v1.16b, v4.16b, v25.16b sshr v1.4s, v1.4s, #31 sqadd v25.4s, v25.4s, v1.4s srshl v25.4s, v25.4s, v4.4s @@ -484,7 +484,7 @@ IndirectGemmInt8_24x4_dp: sshr v0.4s, v0.4s, #31 sqadd v28.4s, v28.4s, v0.4s srshl v28.4s, v28.4s, v4.4s - and v0.16b, v4.16b, v29.16b + and v1.16b, v4.16b, v29.16b sshr v1.4s, v1.4s, #31 sqadd v29.4s, v29.4s, v1.4s srshl v29.4s, v29.4s, v4.4s diff --git a/mindspore/lite/nnacl/assembly/opt/MatmulDpInt8.S b/mindspore/lite/nnacl/assembly/opt/MatmulDpInt8.S new file mode 100644 index 0000000000..11a27b1b4d --- /dev/null +++ b/mindspore/lite/nnacl/assembly/opt/MatmulDpInt8.S @@ -0,0 +1,820 @@ +#ifdef __aarch64__ + .text + .align 5 + .global MatmulInt8DpNeon64 +#ifndef __APPLE__ + .type MatmulInt8DpNeon64, %function +#endif + +// +// int8 RHS 4x8 block +// /-----------------------------------------| +// |v2.b[0] ... v2.b[12] v3.b[0] ... v3.b[12]| +// | ... ... | +// |v2.b[3] ... v2.b[15] v3.b[3] ... v3.b[15]| +// \-----------------------------------------/ +// int8 LHS 8x4 block +// /---------------------\ /-------------------------------------------| +// |v0.b[0] ... v0.b[3] | |v16.s[0] ... v16.s[3] v17.s[0] ... v17.s[3]| +// |v0.b[4] ... v0.b[7] |v18.s[0] ... v18.s[3] v19.s[0] ... v19.s[3]| +// |v0.b[8] ... v0.b[11] |v20.s[0] ... v20.s[3] v21.s[0] ... v21.s[3]| +// |v0.b[12] ... v0.b[15]| |v22.s[0] ... v22.s[3] v23.s[0] ... v23.s[3]| +// |v1.b[0] ... v1.b[3] | |v24.s[0] ... v24.s[3] v25.s[0] ... v25.s[3]| +// |v1.b[4] ... v1.b[7] | |v26.s[0] ... v26.s[3] v27.s[0] ... v27.s[3]| +// |v1.b[8] ... v1.b[11]| |v28.s[0] ... v28.s[3] v29.s[0] ... v29.s[3]| +// |v1.b[12] ... v1.b[15]| |v30.s[0] ... v30.s[3] v31.s[0] ... 
v31.s[3]| +// \---------------------/ \-------------------------------------------/ +// int32 accumulators 8x8 block + + + +// int8 RHS 16x8 block +// /-------------| +// |v2 v3 | +// |v6 v7 | +// |v10 v11 | +// |v14 v15 | +// \-------------/ +// int8 LHS 8x16 block +// /--------------------\ /-------------| +// |v0 v4 v8 v12| | | +// |v1 v5 v9 v13| | | +// \--------------------/ \-------------/ + + + +//void MatmulInt8DpNeon64(const int8_t *a, const int8_t *b, int8_t *dst, int row8, int col8, int deep4, +// const int *a_sums, const int *bias, int act_min, int act_max, int out_zp, +// int multiplier, int left_shift, int right_shift, int row, int col, int stride); + +// x0: a(left matrix ptr) +// x1: b(right matrix ptr) +// x2: out ptr +// w3: row8 +// w4: col8 +// w5: deep4 +// x6: a_sums +// x7: bias +// w8: act_min +// w9: act_max +// w10: out_zp +// w11: multiplier +// w12: left_shift +// w13: right_shift +// w14: row +// w15: col +// w24: stride + +MatmulInt8DpNeon64: + sub sp, sp, #192 + st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 + st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 + stp x19, x20, [sp], #16 + stp x21, x22, [sp], #16 + stp x23, x24, [sp], #16 + stp x25, x26, [sp], #16 + + ldr w8, [sp] + ldr w9, [sp, #8] + ldr w10, [sp, #16] + ldr w11, [sp, #24] + ldr w12, [sp, #32] + ldr w13, [sp, #40] + ldr w14, [sp, #48] + ldr w15, [sp, #56] + ldr w24, [sp, #64] + + mov w17, #8 // sizeof(int8)*8 + mul w21, w5, w17 // the stride of a/b: sizeof(int8)*8*deep4 + mov x25, x2 +L1: + cmp w4, #0 // if at the end of col8 + beq End1 + + mov w16, w3 // reset a row8 counter + mov w23, w14 // reset a row counter + mov x17, x0 // reload a ptr + mov x22, x6 // reload a_sums ptr +L2: + cmp w16, #0 + beq End2 + + mov x18, x1 // reload b ptr + mov x19, x7 // reload bias ptr + mov w20, w5 // reload depth + dup v16.4s, wzr + dup v17.4s, wzr + dup v18.4s, wzr + dup v19.4s, wzr + dup v20.4s, wzr + dup v21.4s, wzr + dup v22.4s, wzr + dup v23.4s, wzr + dup v24.4s, wzr + dup v25.4s, wzr + dup v26.4s, wzr + dup v27.4s, wzr + dup v28.4s, wzr + dup v29.4s, wzr + dup v30.4s, wzr + dup v31.4s, wzr +L3: + cmp w20, #16 + blt LoopD4 + +LoopD16: + ld1 {v0.16b, v1.16b}, [x17], #32 + ld1 {v2.16b, v3.16b}, [x18], #32 + + sdot v16.4s, v2.16b, v0.4b[0] + sdot v18.4s, v2.16b, v0.4b[1] + sdot v20.4s, v2.16b, v0.4b[2] + sdot v22.4s, v2.16b, v0.4b[3] + + ld1 {v4.16b, v5.16b}, [x17], #32 + sdot v24.4s, v2.16b, v1.4b[0] + sdot v26.4s, v2.16b, v1.4b[1] + sdot v28.4s, v2.16b, v1.4b[2] + sdot v30.4s, v2.16b, v1.4b[3] + + ld1 {v6.16b, v7.16b}, [x18], #32 + sdot v17.4s, v3.16b, v0.4b[0] + sdot v19.4s, v3.16b, v0.4b[1] + sdot v21.4s, v3.16b, v0.4b[2] + sdot v23.4s, v3.16b, v0.4b[3] + + sdot v25.4s, v3.16b, v1.4b[0] + sdot v27.4s, v3.16b, v1.4b[1] + sdot v29.4s, v3.16b, v1.4b[2] + sdot v31.4s, v3.16b, v1.4b[3] + + ld1 {v8.16b, v9.16b}, [x17], #32 + sdot v16.4s, v6.16b, v4.4b[0] + sdot v18.4s, v6.16b, v4.4b[1] + sdot v20.4s, v6.16b, v4.4b[2] + sdot v22.4s, v6.16b, v4.4b[3] + + sdot v24.4s, v6.16b, v5.4b[0] + sdot v26.4s, v6.16b, v5.4b[1] + sdot v28.4s, v6.16b, v5.4b[2] + sdot v30.4s, v6.16b, v5.4b[3] + + ld1 {v10.16b, v11.16b}, [x18], #32 + sdot v17.4s, v7.16b, v4.4b[0] + sdot v19.4s, v7.16b, v4.4b[1] + sdot v21.4s, v7.16b, v4.4b[2] + sdot v23.4s, v7.16b, v4.4b[3] + + sdot v25.4s, v7.16b, v5.4b[0] + sdot v27.4s, v7.16b, v5.4b[1] + sdot v29.4s, v7.16b, v5.4b[2] + sdot v31.4s, v7.16b, v5.4b[3] + + ld1 {v12.16b, v13.16b}, [x17], #32 + sdot v16.4s, v10.16b, v8.4b[0] + sdot v18.4s, v10.16b, v8.4b[1] + sdot v20.4s, v10.16b, v8.4b[2] + sdot 
v22.4s, v10.16b, v8.4b[3] + + sdot v24.4s, v10.16b, v9.4b[0] + sdot v26.4s, v10.16b, v9.4b[1] + sdot v28.4s, v10.16b, v9.4b[2] + sdot v30.4s, v10.16b, v9.4b[3] + + ld1 {v14.16b, v15.16b}, [x18], #32 + sdot v17.4s, v11.16b, v8.4b[0] + sdot v19.4s, v11.16b, v8.4b[1] + sdot v21.4s, v11.16b, v8.4b[2] + sdot v23.4s, v11.16b, v8.4b[3] + + sdot v25.4s, v11.16b, v9.4b[0] + sdot v27.4s, v11.16b, v9.4b[1] + sdot v29.4s, v11.16b, v9.4b[2] + sdot v31.4s, v11.16b, v9.4b[3] + + sdot v16.4s, v14.16b, v12.4b[0] + sdot v18.4s, v14.16b, v12.4b[1] + sdot v20.4s, v14.16b, v12.4b[2] + sdot v22.4s, v14.16b, v12.4b[3] + + sdot v24.4s, v14.16b, v13.4b[0] + sdot v26.4s, v14.16b, v13.4b[1] + sdot v28.4s, v14.16b, v13.4b[2] + sdot v30.4s, v14.16b, v13.4b[3] + + sdot v17.4s, v15.16b, v12.4b[0] + sdot v19.4s, v15.16b, v12.4b[1] + sdot v21.4s, v15.16b, v12.4b[2] + sdot v23.4s, v15.16b, v12.4b[3] + + sdot v25.4s, v15.16b, v13.4b[0] + sdot v27.4s, v15.16b, v13.4b[1] + sdot v29.4s, v15.16b, v13.4b[2] + sdot v31.4s, v15.16b, v13.4b[3] + + subs w20, w20, #16 // depth - 16 + b L3 + +LoopD4: + cmp w20, #0 + beq End3 + + ld1 {v0.16b, v1.16b}, [x17], #32 + ld1 {v2.16b, v3.16b}, [x18], #32 + + sdot v16.4s, v2.16b, v0.4b[0] + sdot v18.4s, v2.16b, v0.4b[1] + sdot v20.4s, v2.16b, v0.4b[2] + sdot v22.4s, v2.16b, v0.4b[3] + sdot v24.4s, v2.16b, v1.4b[0] + sdot v26.4s, v2.16b, v1.4b[1] + sdot v28.4s, v2.16b, v1.4b[2] + sdot v30.4s, v2.16b, v1.4b[3] + sdot v17.4s, v3.16b, v0.4b[0] + sdot v19.4s, v3.16b, v0.4b[1] + sdot v21.4s, v3.16b, v0.4b[2] + sdot v23.4s, v3.16b, v0.4b[3] + sdot v25.4s, v3.16b, v1.4b[0] + sdot v27.4s, v3.16b, v1.4b[1] + sdot v29.4s, v3.16b, v1.4b[2] + sdot v31.4s, v3.16b, v1.4b[3] + + subs w20, w20, #4 // depth - 4 + b LoopD4 + +End3: + // Add (Bias+Depth*Za*Zb-Za*Bsums) + ld1 {v15.4s}, [x19], #16 + ld1 {v14.4s}, [x19], #16 + add v16.4s, v16.4s, v15.4s + add v18.4s, v18.4s, v15.4s + add v20.4s, v20.4s, v15.4s + add v22.4s, v22.4s, v15.4s + add v24.4s, v24.4s, v15.4s + add v26.4s, v26.4s, v15.4s + add v28.4s, v28.4s, v15.4s + add v30.4s, v30.4s, v15.4s + add v17.4s, v17.4s, v14.4s + add v19.4s, v19.4s, v14.4s + add v21.4s, v21.4s, v14.4s + add v23.4s, v23.4s, v14.4s + add v25.4s, v25.4s, v14.4s + add v27.4s, v27.4s, v14.4s + add v29.4s, v29.4s, v14.4s + add v31.4s, v31.4s, v14.4s + + // Subtract (Asums*Zb) + ld1 {v13.4s}, [x22], #16 + ld1 {v12.4s}, [x22], #16 + dup v0.4s, v13.s[0] + dup v1.4s, v13.s[1] + dup v2.4s, v13.s[2] + dup v3.4s, v13.s[3] + dup v4.4s, v12.s[0] + dup v5.4s, v12.s[1] + dup v6.4s, v12.s[2] + dup v7.4s, v12.s[3] + sub v16.4s, v16.4s, v0.4s + sub v17.4s, v17.4s, v0.4s + sub v18.4s, v18.4s, v1.4s + sub v19.4s, v19.4s, v1.4s + sub v20.4s, v20.4s, v2.4s + sub v21.4s, v21.4s, v2.4s + sub v22.4s, v22.4s, v3.4s + sub v23.4s, v23.4s, v3.4s + sub v24.4s, v24.4s, v4.4s + sub v25.4s, v25.4s, v4.4s + sub v26.4s, v26.4s, v5.4s + sub v27.4s, v27.4s, v5.4s + sub v28.4s, v28.4s, v6.4s + sub v29.4s, v29.4s, v6.4s + sub v30.4s, v30.4s, v7.4s + sub v31.4s, v31.4s, v7.4s + + // Apply left shift + dup v11.4s, w12 + sqshl v16.4s, v16.4s, v11.4s + sqshl v17.4s, v17.4s, v11.4s + sqshl v18.4s, v18.4s, v11.4s + sqshl v19.4s, v19.4s, v11.4s + sqshl v20.4s, v20.4s, v11.4s + sqshl v21.4s, v21.4s, v11.4s + sqshl v22.4s, v22.4s, v11.4s + sqshl v23.4s, v23.4s, v11.4s + sqshl v24.4s, v24.4s, v11.4s + sqshl v25.4s, v25.4s, v11.4s + sqshl v26.4s, v26.4s, v11.4s + sqshl v27.4s, v27.4s, v11.4s + sqshl v28.4s, v28.4s, v11.4s + sqshl v29.4s, v29.4s, v11.4s + sqshl v30.4s, v30.4s, v11.4s + sqshl v31.4s, v31.4s, v11.4s + + // Apply the 
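Each `sdot vd.4s, vn.16b, vm.4b[k]` in the loops above accumulates four 4-deep int8 dot products in a single instruction, which is why both operands are pre-packed in groups of four depths. The whole LoopD16/LoopD4 pipeline is equivalent to this scalar reference over the deep4-packed layout shown in the earlier diagrams (illustrative sketch, not the project API):

#include <stdint.h>
static void Int8GemmBlockRef(const int8_t *a, const int8_t *b, int32_t acc[8][8], int deep4) {
  for (int d = 0; d < deep4; d += 4) {   /* depth, packed four at a time */
    const int8_t *ap = a + d * 8;        /* 8 rows x 4 depths per LHS group */
    const int8_t *bp = b + d * 8;        /* 8 cols x 4 depths per RHS group */
    for (int r = 0; r < 8; ++r) {
      for (int c = 0; c < 8; ++c) {
        for (int k = 0; k < 4; ++k) {
          acc[r][c] += (int32_t)ap[r * 4 + k] * (int32_t)bp[c * 4 + k];  /* one sdot lane */
        }
      }
    }
  }
}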
fixed-point part of the multiplier. + dup v10.4s, w11 + sqrdmulh v16.4s, v16.4s, v10.4s + sqrdmulh v17.4s, v17.4s, v10.4s + sqrdmulh v18.4s, v18.4s, v10.4s + sqrdmulh v19.4s, v19.4s, v10.4s + sqrdmulh v20.4s, v20.4s, v10.4s + sqrdmulh v21.4s, v21.4s, v10.4s + sqrdmulh v22.4s, v22.4s, v10.4s + sqrdmulh v23.4s, v23.4s, v10.4s + sqrdmulh v24.4s, v24.4s, v10.4s + sqrdmulh v25.4s, v25.4s, v10.4s + sqrdmulh v26.4s, v26.4s, v10.4s + sqrdmulh v27.4s, v27.4s, v10.4s + sqrdmulh v28.4s, v28.4s, v10.4s + sqrdmulh v29.4s, v29.4s, v10.4s + sqrdmulh v30.4s, v30.4s, v10.4s + sqrdmulh v31.4s, v31.4s, v10.4s + + // Apply right shift + dup v9.4s, w13 + and v0.16b, v9.16b, v16.16b + sshr v0.4s, v0.4s, #31 + sqadd v16.4s, v16.4s, v0.4s + srshl v16.4s, v16.4s, v9.4s + and v1.16b, v9.16b, v17.16b + sshr v1.4s, v1.4s, #31 + sqadd v17.4s, v17.4s, v1.4s + srshl v17.4s, v17.4s, v9.4s + and v2.16b, v9.16b, v18.16b + sshr v2.4s, v2.4s, #31 + sqadd v18.4s, v18.4s, v2.4s + srshl v18.4s, v18.4s, v9.4s + and v3.16b, v9.16b, v19.16b + sshr v3.4s, v3.4s, #31 + sqadd v19.4s, v19.4s, v3.4s + srshl v19.4s, v19.4s, v9.4s + and v0.16b, v9.16b, v20.16b + sshr v0.4s, v0.4s, #31 + sqadd v20.4s, v20.4s, v0.4s + srshl v20.4s, v20.4s, v9.4s + and v1.16b, v9.16b, v21.16b + sshr v1.4s, v1.4s, #31 + sqadd v21.4s, v21.4s, v1.4s + srshl v21.4s, v21.4s, v9.4s + and v2.16b, v9.16b, v22.16b + sshr v2.4s, v2.4s, #31 + sqadd v22.4s, v22.4s, v2.4s + srshl v22.4s, v22.4s, v9.4s + and v3.16b, v9.16b, v23.16b + sshr v3.4s, v3.4s, #31 + sqadd v23.4s, v23.4s, v3.4s + srshl v23.4s, v23.4s, v9.4s + and v0.16b, v9.16b, v24.16b + sshr v0.4s, v0.4s, #31 + sqadd v24.4s, v24.4s, v0.4s + srshl v24.4s, v24.4s, v9.4s + and v1.16b, v9.16b, v25.16b + sshr v1.4s, v1.4s, #31 + sqadd v25.4s, v25.4s, v1.4s + srshl v25.4s, v25.4s, v9.4s + and v2.16b, v9.16b, v26.16b + sshr v2.4s, v2.4s, #31 + sqadd v26.4s, v26.4s, v2.4s + srshl v26.4s, v26.4s, v9.4s + and v3.16b, v9.16b, v27.16b + sshr v3.4s, v3.4s, #31 + sqadd v27.4s, v27.4s, v3.4s + srshl v27.4s, v27.4s, v9.4s + and v0.16b, v9.16b, v28.16b + sshr v0.4s, v0.4s, #31 + sqadd v28.4s, v28.4s, v0.4s + srshl v28.4s, v28.4s, v9.4s + and v1.16b, v9.16b, v29.16b + sshr v1.4s, v1.4s, #31 + sqadd v29.4s, v29.4s, v1.4s + srshl v29.4s, v29.4s, v9.4s + and v2.16b, v9.16b, v30.16b + sshr v2.4s, v2.4s, #31 + sqadd v30.4s, v30.4s, v2.4s + srshl v30.4s, v30.4s, v9.4s + and v3.16b, v9.16b, v31.16b + sshr v3.4s, v3.4s, #31 + sqadd v31.4s, v31.4s, v3.4s + srshl v31.4s, v31.4s, v9.4s + + // Add the destination zero point + dup v8.4s, w10 + add v16.4s, v16.4s, v8.4s + add v17.4s, v17.4s, v8.4s + add v18.4s, v18.4s, v8.4s + add v19.4s, v19.4s, v8.4s + add v20.4s, v20.4s, v8.4s + add v21.4s, v21.4s, v8.4s + add v22.4s, v22.4s, v8.4s + add v23.4s, v23.4s, v8.4s + add v24.4s, v24.4s, v8.4s + add v25.4s, v25.4s, v8.4s + add v26.4s, v26.4s, v8.4s + add v27.4s, v27.4s, v8.4s + add v28.4s, v28.4s, v8.4s + add v29.4s, v29.4s, v8.4s + add v30.4s, v30.4s, v8.4s + add v31.4s, v31.4s, v8.4s + + // Apply the act_min bound + dup v7.4s, w8 + smax v16.4s, v16.4s, v7.4s + smax v17.4s, v17.4s, v7.4s + smax v18.4s, v18.4s, v7.4s + smax v19.4s, v19.4s, v7.4s + smax v20.4s, v20.4s, v7.4s + smax v21.4s, v21.4s, v7.4s + smax v22.4s, v22.4s, v7.4s + smax v23.4s, v23.4s, v7.4s + smax v24.4s, v24.4s, v7.4s + smax v25.4s, v25.4s, v7.4s + smax v26.4s, v26.4s, v7.4s + smax v27.4s, v27.4s, v7.4s + smax v28.4s, v28.4s, v7.4s + smax v29.4s, v29.4s, v7.4s + smax v30.4s, v30.4s, v7.4s + smax v31.4s, v31.4s, v7.4s + + // Apply the act_max bound + dup v6.4s, w9 + smin v16.4s, v16.4s, v6.4s + smin v17.4s, v17.4s, v6.4s + smin v18.4s, v18.4s, v6.4s + smin v19.4s, v19.4s, v6.4s + smin v20.4s, v20.4s, v6.4s + smin v21.4s, v21.4s, v6.4s + smin v22.4s, v22.4s, v6.4s + smin v23.4s, v23.4s, v6.4s + smin v24.4s, v24.4s, v6.4s + smin v25.4s, v25.4s, v6.4s + smin v26.4s, v26.4s, v6.4s + smin v27.4s, v27.4s, v6.4s + smin v28.4s, v28.4s, v6.4s + smin v29.4s, v29.4s, v6.4s + smin v30.4s, v30.4s, v6.4s + smin v31.4s, v31.4s, v6.4s + + // int32 -> int16 + sqxtn v0.4h, v16.4s + sqxtn2 v0.8h, v17.4s + sqxtn v1.4h, v18.4s + sqxtn2 v1.8h, v19.4s + sqxtn v2.4h, v20.4s + sqxtn2 v2.8h, v21.4s + sqxtn v3.4h, v22.4s + sqxtn2 v3.8h, v23.4s + sqxtn v4.4h, v24.4s + sqxtn2 v4.8h, 
v25.4s + sqxtn v5.4h, v26.4s + sqxtn2 v5.8h, v27.4s + sqxtn v6.4h, v28.4s + sqxtn2 v6.8h, v29.4s + sqxtn v7.4h, v30.4s + sqxtn2 v7.8h, v31.4s + + // int16 -> int8 + sqxtn v8.8b, v0.8h + sqxtn2 v8.16b, v1.8h + sqxtn v9.8b, v2.8h + sqxtn2 v9.16b, v3.8h + sqxtn v10.8b, v4.8h + sqxtn2 v10.16b, v5.8h + sqxtn v11.8b, v6.8h + sqxtn2 v11.16b, v7.8h + + cmp w23, #8 + blt Write // if rows < 8 + cmp w15, #8 + blt Write // if cols < 8 + + st1 {v8.d}[0], [x2], x24 + st1 {v8.d}[1], [x2], x24 + st1 {v9.d}[0], [x2], x24 + st1 {v9.d}[1], [x2], x24 + st1 {v10.d}[0], [x2], x24 + st1 {v10.d}[1], [x2], x24 + st1 {v11.d}[0], [x2], x24 + st1 {v11.d}[1], [x2], x24 + b Endwrite + +Write: + cmp w15, #8 + bge WriteCol8 + cmp w15, #7 + beq WriteCol7 + cmp w15, #6 + beq WriteCol6 + cmp w15, #5 + beq WriteCol5 + cmp w15, #4 + beq WriteCol4 + cmp w15, #3 + beq WriteCol3 + cmp w15, #2 + beq WriteCol2 + cmp w15, #1 + beq WriteCol1 + +WriteCol8: + st1 {v8.d}[0], [x2], x24 + cmp w23, #1 + beq Endwrite + st1 {v8.d}[1], [x2], x24 + cmp w23, #2 + beq Endwrite + st1 {v9.d}[0], [x2], x24 + cmp w23, #3 + beq Endwrite + st1 {v9.d}[1], [x2], x24 + cmp w23, #4 + beq Endwrite + st1 {v10.d}[0], [x2], x24 + cmp w23, #5 + beq Endwrite + st1 {v10.d}[1], [x2], x24 + cmp w23, #6 + beq Endwrite + st1 {v11.d}[0], [x2], x24 + cmp w23, #7 + beq Endwrite + st1 {v11.d}[1], [x2], x24 + b Endwrite + +WriteCol7: + mov x26, x2 + st1 {v8.s}[0], [x26], #4 + st1 {v8.h}[2], [x26], #2 + st1 {v8.b}[6], [x26], #1 + add x2, x2, x24 + cmp w23, #1 + beq Endwrite + mov x26, x2 + st1 {v8.s}[2], [x26], #4 + st1 {v8.h}[6], [x26], #2 + st1 {v8.b}[14], [x26], #1 + add x2, x2, x24 + cmp w23, #2 + beq Endwrite + mov x26, x2 + st1 {v9.s}[0], [x26], #4 + st1 {v9.h}[2], [x26], #2 + st1 {v9.b}[6], [x26], #1 + add x2, x2, x24 + cmp w23, #3 + beq Endwrite + mov x26, x2 + st1 {v9.s}[2], [x26], #4 + st1 {v9.h}[6], [x26], #2 + st1 {v9.b}[14], [x26], #1 + add x2, x2, x24 + cmp w23, #4 + beq Endwrite + mov x26, x2 + st1 {v10.s}[0], [x26], #4 + st1 {v10.h}[2], [x26], #2 + st1 {v10.b}[6], [x26], #1 + add x2, x2, x24 + cmp w23, #5 + beq Endwrite + mov x26, x2 + st1 {v10.s}[2], [x26], #4 + st1 {v10.h}[6], [x26], #2 + st1 {v10.b}[14], [x26], #1 + add x2, x2, x24 + cmp w23, #6 + beq Endwrite + mov x26, x2 + st1 {v11.s}[0], [x26], #4 + st1 {v11.h}[2], [x26], #2 + st1 {v11.b}[6], [x26], #1 + add x2, x2, x24 + cmp w23, #7 + beq Endwrite + mov x26, x2 + st1 {v11.s}[2], [x26], #4 + st1 {v11.h}[6], [x26], #2 + st1 {v11.b}[14], [x26], #1 + add x2, x2, x24 + b Endwrite + +WriteCol6: + mov x26, x2 + st1 {v8.s}[0], [x26], #4 + st1 {v8.h}[2], [x26], #2 + add x2, x2, x24 + cmp w23, #1 + beq Endwrite + mov x26, x2 + st1 {v8.s}[2], [x26], #4 + st1 {v8.h}[6], [x26], #2 + add x2, x2, x24 + cmp w23, #2 + beq Endwrite + mov x26, x2 + st1 {v9.s}[0], [x26], #4 + st1 {v9.h}[2], [x26], #2 + add x2, x2, x24 + cmp w23, #3 + beq Endwrite + mov x26, x2 + st1 {v9.s}[2], [x26], #4 + st1 {v9.h}[6], [x26], #2 + add x2, x2, x24 + cmp w23, #4 + beq Endwrite + mov x26, x2 + st1 {v10.s}[0], [x26], #4 + st1 {v10.h}[2], [x26], #2 + add x2, x2, x24 + cmp w23, #5 + beq Endwrite + mov x26, x2 + st1 {v10.s}[2], [x26], #4 + st1 {v10.h}[6], [x26], #2 + add x2, x2, x24 + cmp w23, #6 + beq Endwrite + mov x26, x2 + st1 {v11.s}[0], [x26], #4 + st1 {v11.h}[2], [x26], #2 + add x2, x2, x24 + cmp w23, #7 + beq Endwrite + mov x26, x2 + st1 {v11.s}[2], [x26], #4 + st1 {v11.h}[6], [x26], #2 + add x2, x2, x24 + b Endwrite + +WriteCol5: + mov x26, x2 + st1 {v8.s}[0], [x26], #4 + st1 {v8.b}[4], [x26], #1 + add x2, x2, x24 + cmp w23, #1 + 
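The End3 sequence above (sqshl, sqrdmulh, the and/sshr/sqadd/srshl group, zero-point add, smax/smin, sqxtn) is the standard gemmlowp-style requantization from int32 accumulators back to int8. One output lane of it in portable C (a sketch: the saturation performed by sqshl/sqrdmulh/sqxtn is omitted for clarity, and right_shift is the non-positive srshl amount from the register comments):

#include <stdint.h>
static int8_t RequantizeLaneRef(int32_t acc, int32_t multiplier, int left_shift,
                                int right_shift, int out_zp, int act_min, int act_max) {
  int32_t shifted = acc << left_shift;                       /* sqshl (saturation omitted) */
  int64_t prod = (int64_t)shifted * multiplier;
  int32_t hi = (int32_t)((prod + (1ll << 30)) >> 31);        /* sqrdmulh: rounded doubling high half */
  int e = -right_shift;                                      /* srshl shifts right for negative amounts */
  int32_t mask = (int32_t)((1u << e) - 1u);
  int32_t fixup = (hi & mask) > ((mask >> 1) + (hi < 0 ? 1 : 0)) ? 1 : 0;
  int32_t v = (hi >> e) + fixup;                             /* net effect of and/sshr/sqadd/srshl */
  v += out_zp;                                               /* add the destination zero point */
  v = v < act_min ? act_min : v;                             /* smax */
  v = v > act_max ? act_max : v;                             /* smin */
  return (int8_t)v;                                          /* sqxtn narrowing */
}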
beq Endwrite + mov x26, x2 + st1 {v8.s}[2], [x26], #4 + st1 {v8.b}[12], [x26], #1 + add x2, x2, x24 + cmp w23, #2 + beq Endwrite + mov x26, x2 + st1 {v9.s}[0], [x26], #4 + st1 {v9.b}[4], [x26], #1 + add x2, x2, x24 + cmp w23, #3 + beq Endwrite + mov x26, x2 + st1 {v9.s}[2], [x26], #4 + st1 {v9.b}[12], [x26], #1 + add x2, x2, x24 + cmp w23, #4 + beq Endwrite + mov x26, x2 + st1 {v10.s}[0], [x26], #4 + st1 {v10.b}[4], [x26], #1 + add x2, x2, x24 + cmp w23, #5 + beq Endwrite + mov x26, x2 + st1 {v10.s}[2], [x26], #4 + st1 {v10.b}[12], [x26], #1 + add x2, x2, x24 + cmp w23, #6 + beq Endwrite + mov x26, x2 + st1 {v11.s}[0], [x26], #4 + st1 {v11.b}[4], [x26], #1 + add x2, x2, x24 + cmp w23, #7 + beq Endwrite + mov x26, x2 + st1 {v11.s}[2], [x26], #4 + st1 {v11.b}[12], [x26], #1 + add x2, x2, x24 + b Endwrite + +WriteCol4: + st1 {v8.s}[0], [x2], x24 + cmp w23, #1 + beq Endwrite + st1 {v8.s}[2], [x2], x24 + cmp w23, #2 + beq Endwrite + st1 {v9.s}[0], [x2], x24 + cmp w23, #3 + beq Endwrite + st1 {v9.s}[2], [x2], x24 + cmp w23, #4 + beq Endwrite + st1 {v10.s}[0], [x2], x24 + cmp w23, #5 + beq Endwrite + st1 {v10.s}[2], [x2], x24 + cmp w23, #6 + beq Endwrite + st1 {v11.s}[0], [x2], x24 + cmp w23, #7 + beq Endwrite + st1 {v11.s}[2], [x2], x24 + b Endwrite + +WriteCol3: + mov x26, x2 + st1 {v8.h}[0], [x26], #2 + st1 {v8.b}[2], [x26], #1 + add x2, x2, x24 + cmp w23, #1 + beq Endwrite + mov x26, x2 + st1 {v8.h}[4], [x26], #2 + st1 {v8.b}[10], [x26], #1 + add x2, x2, x24 + cmp w23, #2 + beq Endwrite + mov x26, x2 + st1 {v9.h}[0], [x26], #2 + st1 {v9.b}[2], [x26], #1 + add x2, x2, x24 + cmp w23, #3 + beq Endwrite + mov x26, x2 + st1 {v9.h}[4], [x26], #2 + st1 {v9.b}[10], [x26], #1 + add x2, x2, x24 + cmp w23, #4 + beq Endwrite + mov x26, x2 + st1 {v10.h}[0], [x26], #2 + st1 {v10.b}[2], [x26], #1 + add x2, x2, x24 + cmp w23, #5 + beq Endwrite + mov x26, x2 + st1 {v10.h}[4], [x26], #2 + st1 {v10.b}[10], [x26], #1 + add x2, x2, x24 + cmp w23, #6 + beq Endwrite + mov x26, x2 + st1 {v11.h}[0], [x26], #2 + st1 {v11.b}[2], [x26], #1 + add x2, x2, x24 + cmp w23, #7 + beq Endwrite + mov x26, x2 + st1 {v11.h}[4], [x26], #2 + st1 {v11.b}[10], [x26], #1 + add x2, x2, x24 + b Endwrite + +WriteCol2: + st1 {v8.h}[0], [x2], x24 + cmp w23, #1 + beq Endwrite + st1 {v8.h}[4], [x2], x24 + cmp w23, #2 + beq Endwrite + st1 {v9.h}[0], [x2], x24 + cmp w23, #3 + beq Endwrite + st1 {v9.h}[4], [x2], x24 + cmp w23, #4 + beq Endwrite + st1 {v10.h}[0], [x2], x24 + cmp w23, #5 + beq Endwrite + st1 {v10.h}[4], [x2], x24 + cmp w23, #6 + beq Endwrite + st1 {v11.h}[0], [x2], x24 + cmp w23, #7 + beq Endwrite + st1 {v11.h}[4], [x2], x24 + b Endwrite + +WriteCol1: + st1 {v8.b}[0], [x2], x24 + cmp w23, #1 + beq Endwrite + st1 {v8.b}[8], [x2], x24 + cmp w23, #2 + beq Endwrite + st1 {v9.b}[0], [x2], x24 + cmp w23, #3 + beq Endwrite + st1 {v9.b}[8], [x2], x24 + cmp w23, #4 + beq Endwrite + st1 {v10.b}[0], [x2], x24 + cmp w23, #5 + beq Endwrite + st1 {v10.b}[8], [x2], x24 + cmp w23, #6 + beq Endwrite + st1 {v11.b}[0], [x2], x24 + cmp w23, #7 + beq Endwrite + st1 {v11.b}[8], [x2], x24 + b Endwrite + +Endwrite: + sub w16, w16, #8 // a row8 counter - 8 + sub w23, w23, #8 // a row counter - 8 + b L2 + +End2: + sub w4, w4, #8 // b col8 counter - 8 + sub w15, w15, #8 // b col counter - 8 + add x1, x1, x21 // b ptr + stride + add x7, x7, #32 // bias ptr + stride + add x25, x25, #8 // output + stride(8 * sizeof(int8)) + mov x2, x25 + b L1 + +End1: + sub sp, sp, #192 + ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 + ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, 
[sp], #64 + ldp x19, x20, [sp], #16 + ldp x21, x22, [sp], #16 + ldp x23, x24, [sp], #16 + ldp x25, x26, [sp], #16 + ret +#endif diff --git a/mindspore/lite/nnacl/common_func.c b/mindspore/lite/nnacl/common_func.c index de5b59cddf..326e5e4f36 100644 --- a/mindspore/lite/nnacl/common_func.c +++ b/mindspore/lite/nnacl/common_func.c @@ -228,19 +228,3 @@ void IndirectGemmFp32_Comm(float *output, const float *input, const float *weigh return; } -void SimplePostFuncInt8(const int *in, int8_t *out, int oc, int plane, int plane8, int32_t multiplier, - int32_t left_shift, int32_t right_shift, int32_t zp) { - /* (int32_t)row8x8-major * multiplier => (int8_t)row-major */ - for (int r = 0; r < plane; r++) { - for (int c = 0; c < oc; c++) { - int c8div = c / 8, c8mod = c % 8; - int src_index = c8div * plane8 * 8 + r * 8 + c8mod; - int dst_index = r * oc + c; - int32_t value = in[src_index]; - value = MultiplyByQuantizedMultiplier(value, multiplier, left_shift, right_shift) + zp; - value = MSMIN(CHAR_MAX, value); - value = MSMAX(CHAR_MIN, value); - out[dst_index] = (int8_t)value; - } - } -} diff --git a/mindspore/lite/nnacl/conv_parameter.h b/mindspore/lite/nnacl/conv_parameter.h index 6a7f3c5a85..ef01f73ab2 100644 --- a/mindspore/lite/nnacl/conv_parameter.h +++ b/mindspore/lite/nnacl/conv_parameter.h @@ -32,8 +32,6 @@ typedef struct ConvParameter { int stride_w_; int dilation_h_; int dilation_w_; - int pad_h_; - int pad_w_; int pad_u_; int pad_d_; int pad_l_; @@ -51,8 +49,7 @@ typedef struct ConvParameter { int thread_num_; int input_unit_; int output_unit_; - bool is_relu_; - bool is_relu6_; + ActType act_type_; } ConvParameter; typedef struct SlidingWindowParam { diff --git a/mindspore/lite/nnacl/fp16/activation_fp16.c b/mindspore/lite/nnacl/fp16/activation_fp16.c new file mode 100644 index 0000000000..ff2b34767a --- /dev/null +++ b/mindspore/lite/nnacl/fp16/activation_fp16.c @@ -0,0 +1,98 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "nnacl/fp16/activation_fp16.h" +#include "nnacl/errorcode.h" + +int ReluFp16(const float16_t *src, float16_t *dst, int ele_num) { + int eight_block = UP_DIV(ele_num, C8NUM); + int i; + for (i = 0; i < eight_block - 1; i++) { + int index = i * C8NUM; +#ifdef ENABLE_NEON + float16x8_t relu_src = vld1q_f16(src + index); + float16x8_t zero_src = vdupq_n_f16(0); + relu_src = vmaxq_f16(relu_src, zero_src); + vst1q_f16(dst + index, relu_src); +#else + int j; + for (j = 0; j < C8NUM; j++) { + dst[index + j] = src[index + j] < 0 ? 0 : src[index + j]; + } +#endif + } + for (int j = (eight_block - 1) * C8NUM; j < ele_num; ++j) { + dst[j] = src[j] < 0 ? 
0 : src[j]; + } + return NNACL_OK; +} + +int Relu6Fp16(const float16_t *data, float16_t *dst, int ele_num) { + int eight_block = UP_DIV(ele_num, C8NUM); + int i; + for (i = 0; i < eight_block - 1; i++) { + int index = i * C8NUM; +#ifdef ENABLE_NEON + float16x8_t relu6_data = vld1q_f16(data + index); + float16x8_t zero_data = vdupq_n_f16(0); + float16x8_t six_data = vdupq_n_f16(6); + relu6_data = vmaxq_f16(relu6_data, zero_data); + relu6_data = vminq_f16(relu6_data, six_data); + vst1q_f16(dst + index, relu6_data); +#else + int j; + for (j = 0; j < C8NUM; ++j) { + dst[index + j] = data[index + j] < 0 ? 0 : data[index + j]; + dst[index + j] = dst[index + j] > 6 ? 6 : dst[index + j]; + } +#endif + } + for (int j = (eight_block - 1) * C8NUM; j < ele_num; ++j) { + dst[j] = data[j] < 0 ? 0 : data[j]; + dst[j] = dst[j] > 6 ? 6 : dst[j]; + } + return NNACL_OK; +} + +int LReluFp16(const float16_t *src, float16_t *dst, int ele_num, float16_t alpha) { + for (int i = 0; i < ele_num; ++i) { + dst[i] = src[i] > (float16_t)0.0f ? src[i] : (src[i] * alpha); + } + return NNACL_OK; +} + +int SigmoidFp16(const float16_t *src, float16_t *dst, int ele_num) { + for (int i = 0; i < ele_num; ++i) { + dst[i] = (float16_t)1.0f / (float16_t)(1.0f + exp(-src[i])); + } + return NNACL_OK; +} + +int TanhFp16(const float16_t *src, float16_t *dst, int ele_num) { + for (int i = 0; i < ele_num; ++i) { + dst[i] = (float16_t)1.0f - (float16_t)2.0f / (float16_t)(exp(2 * src[i]) + 1); + } + return NNACL_OK; +} + +int HSwishFp16(const float16_t *src, float16_t *dst, int ele_num) { + for (int i = 0; i < ele_num; ++i) { + float16_t in = src[i]; + float16_t relu6 = MSMIN(MSMAX(in + 3, 0), 6); + dst[i] = in * relu6 / (float16_t)6.0f; + } + return NNACL_OK; +} diff --git a/mindspore/lite/nnacl/fp16/activation_fp16.h b/mindspore/lite/nnacl/fp16/activation_fp16.h new file mode 100644 index 0000000000..eea4b489f8 --- /dev/null +++ b/mindspore/lite/nnacl/fp16/activation_fp16.h @@ -0,0 +1,44 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
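The activation bodies above lean on two identities worth noting: TanhFp16 uses tanh(x) = 1 - 2/(exp(2x) + 1), and HSwishFp16 computes x * relu6(x + 3) / 6. A standalone float check of the same formulas (plain float for portability; not part of nnacl):

#include <math.h>
#include <stdio.h>
int main(void) {
  for (float x = -3.0f; x <= 3.0f; x += 1.5f) {
    float tanh_id = 1.0f - 2.0f / (expf(2.0f * x) + 1.0f);  /* identity used by TanhFp16 */
    float relu6 = fminf(fmaxf(x + 3.0f, 0.0f), 6.0f);
    float hswish = x * relu6 / 6.0f;                        /* formula used by HSwishFp16 */
    printf("x=% .1f tanh=% .6f (ref % .6f) hswish=% .6f\n", x, tanh_id, tanhf(x), hswish);
  }
  return 0;
}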
+ */ +#ifndef MINDSPORE_LITE_NNACL_FP16_ACTIVATION_FP16_H_ +#define MINDSPORE_LITE_NNACL_FP16_ACTIVATION_FP16_H_ + +#ifdef ENABLE_NEON +#include <arm_neon.h> +#endif +#include <math.h> +#include "nnacl/op_base.h" +#include "nnacl/quantization/fixed_point.h" + +typedef struct ActivationParameter { + OpParameter op_parameter_; + int type_; + float alpha_; +} ActivationParameter; + +#ifdef __cplusplus +extern "C" { +#endif +int ReluFp16(const float16_t *src, float16_t *dst, int ele_num); +int Relu6Fp16(const float16_t *data, float16_t *dst, int ele_num); +int LReluFp16(const float16_t *src, float16_t *dst, int ele_num, float16_t alpha); +int SigmoidFp16(const float16_t *src, float16_t *dst, int ele_num); +int TanhFp16(const float16_t *src, float16_t *dst, int ele_num); +int HSwishFp16(const float16_t *src, float16_t *dst, int ele_num); +#ifdef __cplusplus +} +#endif +#endif // MINDSPORE_LITE_NNACL_FP16_ACTIVATION_FP16_H_ diff --git a/mindspore/lite/nnacl/fp16/arithmetic_fp16.c b/mindspore/lite/nnacl/fp16/arithmetic_fp16.c index a801e93621..ef8ae6fd64 100644 --- a/mindspore/lite/nnacl/fp16/arithmetic_fp16.c +++ b/mindspore/lite/nnacl/fp16/arithmetic_fp16.c @@ -74,33 +74,48 @@ int ElementOptMulFp16(float16_t *input0, float16_t *input1, float16_t *output, i ArithmeticParameter *param) { int block_mod = element_size % C8NUM; int block_c8 = element_size - block_mod; + float16_t in0_opt = input0[0]; + float16_t in1_opt = input1[0]; #ifdef ENABLE_NEON float16x8_t vin0_opt = {input0[0], input0[0], input0[0], input0[0], input0[0], input0[0], input0[0], input0[0]}; float16x8_t vin1_opt = {input1[0], input1[0], input1[0], input1[0], input1[0], input1[0], input1[0], input1[0]}; - float16_t in0_opt = input0[0]; - float16_t in1_opt = input1[0]; #endif - for (int index = 0; index < block_c8; index += C8NUM) { + if (param->in_elements_num0_ == 1) { + for (int index = 0; index < block_c8; index += C8NUM) { #ifdef ENABLE_NEON - float16x8_t vin0 = param->in_elements_num0_ == 1 ? vin0_opt : vld1q_f16(input0); - float16x8_t vin1 = param->in_elements_num1_ == 1 ? vin1_opt : vld1q_f16(input1); - float16x8_t vout = vmulq_f16(vin0, vin1); - vst1q_f16(output, vout); + float16x8_t vin0 = vin0_opt; + float16x8_t vin1 = vld1q_f16(input1); + float16x8_t vout = vmulq_f16(vin0, vin1); + vst1q_f16(output, vout); #else - for (int i = 0; i < C8NUM; ++i) { - float16_t in0 = param->in_elements_num0_ == 1 ? in0_opt : input0[i]; - float16_t in1 = param->in_elements_num1_ == 1 ? in1_opt : input1[i]; - output[i] = in0 * in1; + for (int i = 0; i < C8NUM; ++i) { + output[i] = in0_opt * input1[i]; + } +#endif + input1 += C8NUM; + output += C8NUM; + } + for (int index = 0; index < block_mod; ++index) { + output[index] = in0_opt * input1[index]; } + } else { + for (int index = 0; index < block_c8; index += C8NUM) { +#ifdef ENABLE_NEON + float16x8_t vin0 = vld1q_f16(input0); + float16x8_t vin1 = vin1_opt; + float16x8_t vout = vmulq_f16(vin0, vin1); + vst1q_f16(output, vout); +#else + for (int i = 0; i < C8NUM; ++i) { + output[i] = input0[i] * in1_opt; + } #endif - input0 += C8NUM; - input1 += C8NUM; - output += C8NUM; - } - for (int index = 0; index < block_mod; ++index) { - float16_t in0 = param->in_elements_num0_ == 1 ? in0_opt : input0[index]; - float16_t in1 = param->in_elements_num1_ == 1 ? 
in1_opt : input1[index]; - output[index] = in0 * in1; + input0 += C8NUM; + output += C8NUM; + } + for (int index = 0; index < block_mod; ++index) { + output[index] = input0[index] * in1_opt; + } } return NNACL_OK; @@ -113,7 +128,6 @@ int ElementMulReluFp16(float16_t *input0, float16_t *input1, float16_t *output, #ifdef ENABLE_NEON float16x8_t zeros = {0, 0, 0, 0, 0, 0, 0, 0}; #endif - for (int index = 0; index < block_c8; index += C8NUM) { #ifdef ENABLE_NEON float16x8_t vin0 = vld1q_f16(input0); @@ -143,39 +157,58 @@ int ElementOptMulReluFp16(float16_t *input0, float16_t *input1, float16_t *outpu ArithmeticParameter *param) { int block_mod = element_size % C8NUM; int block_c8 = element_size - block_mod; + float16_t in0_opt = input0[0]; + float16_t in1_opt = input1[0]; #ifdef ENABLE_NEON float16x8_t vin0_opt = {input0[0], input0[0], input0[0], input0[0], input0[0], input0[0], input0[0], input0[0]}; float16x8_t vin1_opt = {input1[0], input1[0], input1[0], input1[0], input1[0], input1[0], input1[0], input1[0]}; - float16_t in0_opt = input0[0]; - float16_t in1_opt = input1[0]; float16x8_t zeros = {0, 0, 0, 0, 0, 0, 0, 0}; #endif - for (int index = 0; index < block_c8; index += C8NUM) { -#ifdef ENABLE_NEON - float16x8_t vin0 = param->in_elements_num0_ == 1 ? vin0_opt : vld1q_f16(input0); - float16x8_t vin1 = param->in_elements_num1_ == 1 ? vin1_opt : vld1q_f16(input1); - float16x8_t vout = vmulq_f16(vin0, vin1); - vout = vmaxq_f16(vout, zeros); - vst1q_f16(output, vout); + if (param->in_elements_num0_ == 1) { + for (int index = 0; index < block_c8; index += C8NUM) { +#ifdef ENABLE_NEON + float16x8_t vin0 = vin0_opt; + float16x8_t vin1 = vld1q_f16(input1); + float16x8_t vout = vmulq_f16(vin0, vin1); + vout = vmaxq_f16(vout, zeros); + vst1q_f16(output, vout); #else - float16_t res; - for (int i = 0; i < C8NUM; ++i) { - float16_t in0 = param->in_elements_num0_ == 1 ? in0_opt : input0[i]; - float16_t in1 = param->in_elements_num1_ == 1 ? in1_opt : input1[i]; - res = in0 * in1; - output[i] = res > 0 ? res : 0; + float16_t res; + for (int i = 0; i < C8NUM; ++i) { + res = in0_opt * input1[i]; + output[i] = res > 0 ? res : 0; + } +#endif + input1 += C8NUM; + output += C8NUM; + } + for (int index = 0; index < block_mod; ++index) { + float16_t res = in0_opt * input1[index]; + output[index] = res > 0 ? res : 0; } + } else { + for (int index = 0; index < block_c8; index += C8NUM) { +#ifdef ENABLE_NEON + float16x8_t vin0 = vld1q_f16(input0); + float16x8_t vin1 = vin1_opt; + float16x8_t vout = vmulq_f16(vin0, vin1); + vout = vmaxq_f16(vout, zeros); + vst1q_f16(output, vout); +#else + float16_t res; + for (int i = 0; i < C8NUM; ++i) { + res = input0[i] * in1_opt; + output[i] = res > 0 ? res : 0; + } #endif - input0 += C8NUM; - input1 += C8NUM; - output += C8NUM; - } - for (int index = 0; index < block_mod; ++index) { - float16_t in0 = param->in_elements_num0_ == 1 ? in0_opt : input0[index]; - float16_t in1 = param->in_elements_num1_ == 1 ? in1_opt : input1[index]; - float16_t res = in0 * in1; - output[index] = res > 0 ? res : 0; + input0 += C8NUM; + output += C8NUM; + } + for (int index = 0; index < block_mod; ++index) { + float16_t res = input0[index] * in1_opt; + output[index] = res > 0 ? 
res : 0; + } } return NNACL_OK; @@ -216,37 +249,52 @@ int ElementOptMulRelu6Fp16(float16_t *input0, float16_t *input1, float16_t *outp ArithmeticParameter *param) { int block_mod = element_size % C8NUM; int block_c8 = element_size - block_mod; + float16_t in0_opt = input0[0]; + float16_t in1_opt = input1[0]; #ifdef ENABLE_NEON float16x8_t vin0_opt = {input0[0], input0[0], input0[0], input0[0], input0[0], input0[0], input0[0], input0[0]}; float16x8_t vin1_opt = {input1[0], input1[0], input1[0], input1[0], input1[0], input1[0], input1[0], input1[0]}; - float16_t in0_opt = input0[0]; - float16_t in1_opt = input1[0]; float16x8_t zeros = {0, 0, 0, 0, 0, 0, 0, 0}; float16x8_t bounds = {6, 6, 6, 6, 6, 6, 6, 6}; #endif - - for (int index = 0; index < block_c8; index += C8NUM) { + if (param->in_elements_num0_ == 1) { + for (int index = 0; index < block_c8; index += C8NUM) { #ifdef ENABLE_NEON - float16x8_t vin0 = param->in_elements_num0_ == 1 ? vin0_opt : vld1q_f16(input0); - float16x8_t vin1 = param->in_elements_num1_ == 1 ? vin1_opt : vld1q_f16(input1); - float16x8_t vout = vmulq_f16(vin0, vin1); - vout = vminq_f16(vmaxq_f16(vout, zeros), bounds); - vst1q_f16(output, vout); + float16x8_t vin0 = vin0_opt; + float16x8_t vin1 = vld1q_f16(input1); + float16x8_t vout = vmulq_f16(vin0, vin1); + vout = vminq_f16(vmaxq_f16(vout, zeros), bounds); + vst1q_f16(output, vout); #else - for (int i = 0; i < C8NUM; ++i) { - float16_t in0 = param->in_elements_num0_ == 1 ? in0_opt : input0[i]; - float16_t in1 = param->in_elements_num1_ == 1 ? in1_opt : input1[i]; - output[i] = MSMIN(MSMAX(in0 * in1, 0), 6); + for (int i = 0; i < C8NUM; ++i) { + output[i] = MSMIN(MSMAX(in0_opt * input1[i], 0), 6); + } +#endif + input1 += C8NUM; + output += C8NUM; + } + for (int index = 0; index < block_mod; ++index) { + output[index] = MSMIN(MSMAX(in0_opt * input1[index], 0), 6); } + } else { + for (int index = 0; index < block_c8; index += C8NUM) { +#ifdef ENABLE_NEON + float16x8_t vin0 = vld1q_f16(input0); + float16x8_t vin1 = vin1_opt; + float16x8_t vout = vmulq_f16(vin0, vin1); + vout = vminq_f16(vmaxq_f16(vout, zeros), bounds); + vst1q_f16(output, vout); +#else + for (int i = 0; i < C8NUM; ++i) { + output[i] = MSMIN(MSMAX(input0[i] * in1_opt, 0), 6); + } #endif - input0 += C8NUM; - input1 += C8NUM; - output += C8NUM; - } - for (int index = 0; index < block_mod; ++index) { - float16_t in0 = param->in_elements_num0_ == 1 ? in0_opt : input0[index]; - float16_t in1 = param->in_elements_num1_ == 1 ? 
in1_opt : input1[index]; - output[index] = MSMIN(MSMAX(in0 * in1, 0), 6); + input0 += C8NUM; + output += C8NUM; + } + for (int index = 0; index < block_mod; ++index) { + output[index] = MSMIN(MSMAX(input0[index] * in1_opt, 0), 6); + } } return NNACL_OK; @@ -255,7 +303,6 @@ int ElementOptMulRelu6Fp16(float16_t *input0, float16_t *input1, float16_t *outp int ElementAddFp16(float16_t *input0, float16_t *input1, float16_t *output, int element_size) { int block_mod = element_size % C8NUM; int block_c8 = element_size - block_mod; - for (int index = 0; index < block_c8; index += C8NUM) { #ifdef ENABLE_NEON float16x8_t vin0 = vld1q_f16(input0); @@ -280,34 +327,50 @@ int ElementOptAddFp16(float16_t *input0, float16_t *input1, float16_t *output, i ArithmeticParameter *param) { int block_mod = element_size % C8NUM; int block_c8 = element_size - block_mod; + float16_t in0_opt = input0[0]; + float16_t in1_opt = input1[0]; #ifdef ENABLE_NEON float16x8_t vin0_opt = {input0[0], input0[0], input0[0], input0[0], input0[0], input0[0], input0[0], input0[0]}; float16x8_t vin1_opt = {input1[0], input1[0], input1[0], input1[0], input1[0], input1[0], input1[0], input1[0]}; - float16_t in0_opt = input0[0]; - float16_t in1_opt = input1[0]; #endif - for (int index = 0; index < block_c8; index += C8NUM) { + if (param->in_elements_num0_ == 1) { + for (int index = 0; index < block_c8; index += C8NUM) { #ifdef ENABLE_NEON - float16x8_t vin0 = param->in_elements_num0_ == 1 ? vin0_opt : vld1q_f16(input0); - float16x8_t vin1 = param->in_elements_num1_ == 1 ? vin1_opt : vld1q_f16(input1); - float16x8_t vout = vaddq_f16(vin0, vin1); - vst1q_f16(output, vout); + float16x8_t vin0 = vin0_opt; + float16x8_t vin1 = vld1q_f16(input1); + float16x8_t vout = vaddq_f16(vin0, vin1); + vst1q_f16(output, vout); #else - for (int i = 0; i < C8NUM; ++i) { - float16_t in0 = param->in_elements_num0_ == 1 ? in0_opt : input0[i]; - float16_t in1 = param->in_elements_num1_ == 1 ? in1_opt : input1[i]; - output[i] = in0 + in1; + for (int i = 0; i < C8NUM; ++i) { + output[i] = in0_opt + input1[i]; + } +#endif + input1 += C8NUM; + output += C8NUM; + } + for (int index = 0; index < block_mod; ++index) { + output[index] = in0_opt + input1[index]; } + } else { + for (int index = 0; index < block_c8; index += C8NUM) { +#ifdef ENABLE_NEON + float16x8_t vin0 = vld1q_f16(input0); + float16x8_t vin1 = vin1_opt; + float16x8_t vout = vaddq_f16(vin0, vin1); + vst1q_f16(output, vout); +#else + for (int i = 0; i < C8NUM; ++i) { + output[i] = input0[i] + in1_opt; + } #endif - input0 += C8NUM; - input1 += C8NUM; - output += C8NUM; - } - for (int index = 0; index < block_mod; ++index) { - float16_t in0 = param->in_elements_num0_ == 1 ? in0_opt : input0[index]; - float16_t in1 = param->in_elements_num1_ == 1 ? 
in1_opt : input1[index]; - output[index] = in0 + in1; + input0 += C8NUM; + output += C8NUM; + } + for (int index = 0; index < block_mod; ++index) { + output[index] = input0[index] + in1_opt; + } } + return NNACL_OK; } @@ -345,37 +408,54 @@ int ElementOptAddReluFp16(float16_t *input0, float16_t *input1, float16_t *outpu ArithmeticParameter *param) { int block_mod = element_size % C8NUM; int block_c8 = element_size - block_mod; + float16_t in0_opt = input0[0]; + float16_t in1_opt = input1[0]; #ifdef ENABLE_NEON float16x8_t vin0_opt = {input0[0], input0[0], input0[0], input0[0], input0[0], input0[0], input0[0], input0[0]}; float16x8_t vin1_opt = {input1[0], input1[0], input1[0], input1[0], input1[0], input1[0], input1[0], input1[0]}; - float16_t in0_opt = input0[0]; - float16_t in1_opt = input1[0]; float16x8_t zeros = {0, 0, 0, 0, 0, 0, 0, 0}; #endif - for (int index = 0; index < block_c8; index += C8NUM) { -#ifdef ENABLE_NEON - float16x8_t vin0 = param->in_elements_num0_ == 1 ? vin0_opt : vld1q_f16(input0); - float16x8_t vin1 = param->in_elements_num1_ == 1 ? vin1_opt : vld1q_f16(input1); - float16x8_t vout = vaddq_f16(vin0, vin1); - vout = vmaxq_f16(vout, zeros); - vst1q_f16(output, vout); + if (param->in_elements_num0_ == 1) { + for (int index = 0; index < block_c8; index += C8NUM) { +#ifdef ENABLE_NEON + float16x8_t vin0 = vin0_opt; + float16x8_t vin1 = vld1q_f16(input1); + float16x8_t vout = vaddq_f16(vin0, vin1); + vout = vmaxq_f16(vout, zeros); + vst1q_f16(output, vout); #else - for (int i = 0; i < C8NUM; ++i) { - float16_t in0 = param->in_elements_num0_ == 1 ? in0_opt : input0[i]; - float16_t in1 = param->in_elements_num1_ == 1 ? in1_opt : input1[i]; - output[i] = MSMAX(in0 + in1, 0); + for (int i = 0; i < C8NUM; ++i) { + output[i] = MSMAX(in0_opt + input1[i], 0); + } +#endif + input1 += C8NUM; + output += C8NUM; + } + for (int index = 0; index < block_mod; ++index) { + float16_t res = in0_opt + input1[index]; + output[index] = res > 0 ? res : 0; } + } else { + for (int index = 0; index < block_c8; index += C8NUM) { +#ifdef ENABLE_NEON + float16x8_t vin0 = vld1q_f16(input0); + float16x8_t vin1 = vin1_opt; + float16x8_t vout = vaddq_f16(vin0, vin1); + vout = vmaxq_f16(vout, zeros); + vst1q_f16(output, vout); +#else + for (int i = 0; i < C8NUM; ++i) { + output[i] = MSMAX(input0[i] + in1_opt, 0); + } #endif - input0 += C8NUM; - input1 += C8NUM; - output += C8NUM; - } - for (int index = 0; index < block_mod; ++index) { - float16_t in0 = param->in_elements_num0_ == 1 ? in0_opt : input0[index]; - float16_t in1 = param->in_elements_num1_ == 1 ? in1_opt : input1[index]; - float16_t res = in0 + in1; - output[index] = res > 0 ? res : 0; + input0 += C8NUM; + output += C8NUM; + } + for (int index = 0; index < block_mod; ++index) { + float16_t res = input0[index] + in1_opt; + output[index] = res > 0 ? 
res : 0; + } } return NNACL_OK; } @@ -415,39 +495,54 @@ int ElementOptAddRelu6Fp16(float16_t *input0, float16_t *input1, float16_t *outp ArithmeticParameter *param) { int block_mod = element_size % C8NUM; int block_c8 = element_size - block_mod; + float16_t in0_opt = input0[0]; + float16_t in1_opt = input1[0]; #ifdef ENABLE_NEON float16x8_t vin0_opt = {input0[0], input0[0], input0[0], input0[0], input0[0], input0[0], input0[0], input0[0]}; float16x8_t vin1_opt = {input1[0], input1[0], input1[0], input1[0], input1[0], input1[0], input1[0], input1[0]}; - float16_t in0_opt = input0[0]; - float16_t in1_opt = input1[0]; float16x8_t zeros = {0, 0, 0, 0, 0, 0, 0, 0}; float16x8_t bounds = {6, 6, 6, 6, 6, 6, 6, 6}; #endif - for (int index = 0; index < block_c8; index += C8NUM) { -#ifdef ENABLE_NEON - float16x8_t vin0 = param->in_elements_num0_ == 1 ? vin0_opt : vld1q_f16(input0); - float16x8_t vin1 = param->in_elements_num1_ == 1 ? vin1_opt : vld1q_f16(input1); - float16x8_t vout = vaddq_f16(vin0, vin1); - vout = vminq_f16(vmaxq_f16(vout, zeros), bounds); - vst1q_f16(output, vout); + if (param->in_elements_num0_ == 1) { + for (int index = 0; index < block_c8; index += C8NUM) { +#ifdef ENABLE_NEON + float16x8_t vin0 = vin0_opt; + float16x8_t vin1 = vld1q_f16(input1); + float16x8_t vout = vaddq_f16(vin0, vin1); + vout = vminq_f16(vmaxq_f16(vout, zeros), bounds); + vst1q_f16(output, vout); #else - for (int i = 0; i < C8NUM; ++i) { - float16_t in0 = param->in_elements_num0_ == 1 ? in0_opt : input0[i]; - float16_t in1 = param->in_elements_num1_ == 1 ? in1_opt : input1[i]; - output[i] = MSMIN(MSMAX(in0 + in1, 0), 6); + for (int i = 0; i < C8NUM; ++i) { + output[i] = MSMIN(MSMAX(in0_opt + input1[i], 0), 6); + } +#endif + input1 += C8NUM; + output += C8NUM; } + for (int index = 0; index < block_mod; ++index) { + output[index] = MSMIN(MSMAX(in0_opt + input1[index], 0), 6); + } + } else { + for (int index = 0; index < block_c8; index += C8NUM) { +#ifdef ENABLE_NEON + float16x8_t vin0 = vld1q_f16(input0); + float16x8_t vin1 = vin1_opt; + float16x8_t vout = vaddq_f16(vin0, vin1); + vout = vminq_f16(vmaxq_f16(vout, zeros), bounds); + vst1q_f16(output, vout); +#else + for (int i = 0; i < C8NUM; ++i) { + output[i] = MSMIN(MSMAX(input0[i] + in1_opt, 0), 6); + } #endif - input0 += C8NUM; - input1 += C8NUM; - output += C8NUM; - } - for (int index = 0; index < block_mod; ++index) { - float16_t in0 = param->in_elements_num0_ == 1 ? in0_opt : input0[index]; - float16_t in1 = param->in_elements_num1_ == 1 ? 
in1_opt : input1[index]; - output[index] = MSMIN(MSMAX(in0 + in1, 0), 6); + input0 += C8NUM; + output += C8NUM; + } + for (int index = 0; index < block_mod; ++index) { + output[index] = MSMIN(MSMAX(input0[index] + in1_opt, 0), 6); + } } - return NNACL_OK; } @@ -479,11 +574,11 @@ int ElementOptSubFp16(float16_t *input0, float16_t *input1, float16_t *output, i ArithmeticParameter *param) { int block_mod = element_size % C8NUM; int block_c8 = element_size - block_mod; + float16_t in0_opt = input0[0]; + float16_t in1_opt = input1[0]; #ifdef ENABLE_NEON float16x8_t vin0_opt = {input0[0], input0[0], input0[0], input0[0], input0[0], input0[0], input0[0], input0[0]}; float16x8_t vin1_opt = {input1[0], input1[0], input1[0], input1[0], input1[0], input1[0], input1[0], input1[0]}; - float16_t in0_opt = input0[0]; - float16_t in1_opt = input1[0]; #endif for (int index = 0; index < block_c8; index += C8NUM) { #ifdef ENABLE_NEON @@ -542,11 +637,11 @@ int ElementOptSubReluFp16(float16_t *input0, float16_t *input1, float16_t *outpu ArithmeticParameter *param) { int block_mod = element_size % C8NUM; int block_c8 = element_size - block_mod; + float16_t in0_opt = input0[0]; + float16_t in1_opt = input1[0]; #ifdef ENABLE_NEON float16x8_t vin0_opt = {input0[0], input0[0], input0[0], input0[0], input0[0], input0[0], input0[0], input0[0]}; float16x8_t vin1_opt = {input1[0], input1[0], input1[0], input1[0], input1[0], input1[0], input1[0], input1[0]}; - float16_t in0_opt = input0[0]; - float16_t in1_opt = input1[0]; float16x8_t zeros = {0, 0, 0, 0, 0, 0, 0, 0}; #endif for (int index = 0; index < block_c8; index += C8NUM) { @@ -609,11 +704,11 @@ int ElementOptSubRelu6Fp16(float16_t *input0, float16_t *input1, float16_t *outp ArithmeticParameter *param) { int block_mod = element_size % C8NUM; int block_c8 = element_size - block_mod; + float16_t in0_opt = input0[0]; + float16_t in1_opt = input1[0]; #ifdef ENABLE_NEON float16x8_t vin0_opt = {input0[0], input0[0], input0[0], input0[0], input0[0], input0[0], input0[0], input0[0]}; float16x8_t vin1_opt = {input1[0], input1[0], input1[0], input1[0], input1[0], input1[0], input1[0], input1[0]}; - float16_t in0_opt = input0[0]; - float16_t in1_opt = input1[0]; float16x8_t zeros = {0, 0, 0, 0, 0, 0, 0, 0}; float16x8_t bounds = {6, 6, 6, 6, 6, 6, 6, 6}; #endif @@ -680,11 +775,11 @@ int ElementOptDivFp16(float16_t *input0, float16_t *input1, float16_t *output, i ArithmeticParameter *param) { int block_mod = element_size % C8NUM; int block_c8 = element_size - block_mod; + float16_t in0_opt = input0[0]; + float16_t in1_opt = input1[0]; #ifdef ENABLE_NEON float16x8_t vin0_opt = {input0[0], input0[0], input0[0], input0[0], input0[0], input0[0], input0[0], input0[0]}; float16x8_t vin1_opt = {input1[0], input1[0], input1[0], input1[0], input1[0], input1[0], input1[0], input1[0]}; - float16_t in0_opt = input0[0]; - float16_t in1_opt = input1[0]; #endif for (int index = 0; index < block_c8; index += C8NUM) { if (param->in_elements_num1_ == 1) { @@ -765,12 +860,11 @@ int ElementOptDivReluFp16(float16_t *input0, float16_t *input1, float16_t *outpu ArithmeticParameter *param) { int block_mod = element_size % C8NUM; int block_c8 = element_size - block_mod; - + float16_t in0_opt = input0[0]; + float16_t in1_opt = input1[0]; #ifdef ENABLE_NEON float16x8_t vin0_opt = {input0[0], input0[0], input0[0], input0[0], input0[0], input0[0], input0[0], input0[0]}; float16x8_t vin1_opt = {input1[0], input1[0], input1[0], input1[0], input1[0], input1[0], input1[0], input1[0]}; - float16_t in0_opt = 
input0[0]; - float16_t in1_opt = input1[0]; float16x8_t zeros = {0, 0, 0, 0, 0, 0, 0, 0}; #endif for (int index = 0; index < block_c8; index += C8NUM) { @@ -855,11 +949,11 @@ int ElementOptDivRelu6Fp16(float16_t *input0, float16_t *input1, float16_t *outp int block_mod = element_size % C8NUM; int block_c8 = element_size - block_mod; + float16_t in0_opt = input0[0]; + float16_t in1_opt = input1[0]; #ifdef ENABLE_NEON float16x8_t vin0_opt = {input0[0], input0[0], input0[0], input0[0], input0[0], input0[0], input0[0], input0[0]}; float16x8_t vin1_opt = {input1[0], input1[0], input1[0], input1[0], input1[0], input1[0], input1[0], input1[0]}; - float16_t in0_opt = input0[0]; - float16_t in1_opt = input1[0]; float16x8_t zeros = {0, 0, 0, 0, 0, 0, 0, 0}; float16x8_t bounds = {6, 6, 6, 6, 6, 6, 6, 6}; #endif diff --git a/mindspore/lite/nnacl/fp16/batchnorm_fp16.c b/mindspore/lite/nnacl/fp16/batchnorm_fp16.c index baa9e6dfdf..facae90c3f 100644 --- a/mindspore/lite/nnacl/fp16/batchnorm_fp16.c +++ b/mindspore/lite/nnacl/fp16/batchnorm_fp16.c @@ -17,8 +17,8 @@ #include "nnacl/fp16/batchnorm_fp16.h" #include -void BatchNormFp16(const void *input, const void *mean, const void *variance, - BatchNormParameter *param, int task_id, void *output) { +void BatchNormFp16(const float16_t *input, const void *mean, const void *variance, + BatchNormParameter *param, int task_id, float16_t *output) { int units_per_thread = UP_DIV(param->unit_, param->op_parameter_.thread_num_); int completed_units = task_id * units_per_thread; int cur_unit = MSMIN(units_per_thread, param->unit_ - completed_units); @@ -27,8 +27,9 @@ void BatchNormFp16(const void *input, const void *mean, const void *variance, for (int i = 0; i < cur_unit; i++) { for (int c = 0; c < param->channel_; c++) { float16_t variance_sqrt = sqrt(((const float16_t *)variance)[c] + param->epsilon_); - ((float16_t *)output)[cur_offset + c] = - (((const float16_t *)input)[cur_offset + c] - ((const float16_t *)mean)[c]) / variance_sqrt; + if (variance_sqrt != 0) { + output[cur_offset + c] = (input[cur_offset + c] - ((const float16_t *)mean)[c]) / variance_sqrt; + } } cur_offset += param->channel_; } @@ -44,8 +45,12 @@ void FusedBatchNormFp16(const void *input, const void *scale, const void *offset for (int i = 0; i < cur_unit; i++) { for (int c = 0; c < param->channel_; c++) { float16_t variance_sqrt = sqrt(((const float16_t *)variance)[c] + param->epsilon_); - float16_t norm_val = (((const float16_t *)input)[cur_offset + c] - ((const float16_t *)mean)[c]) / variance_sqrt; - ((float16_t *)output)[cur_offset + c] = norm_val * ((const float16_t *)scale)[c] + ((const float16_t *)offset)[c]; + if (variance_sqrt != 0) { + float16_t norm_val = + (((const float16_t *)input)[cur_offset + c] - ((const float16_t *)mean)[c]) / variance_sqrt; + ((float16_t *)output)[cur_offset + c] = + norm_val * ((const float16_t *)scale)[c] + ((const float16_t *)offset)[c]; + } } cur_offset += param->channel_; } diff --git a/mindspore/lite/nnacl/fp16/batchnorm_fp16.h b/mindspore/lite/nnacl/fp16/batchnorm_fp16.h index 673bcd46fa..8f6d6aa485 100644 --- a/mindspore/lite/nnacl/fp16/batchnorm_fp16.h +++ b/mindspore/lite/nnacl/fp16/batchnorm_fp16.h @@ -25,8 +25,8 @@ extern "C" { #endif -void BatchNormFp16(const void *input, const void *mean, const void *variance, BatchNormParameter *param, int task_id, - void *output); +void BatchNormFp16(const float16_t *input, const void *mean, const void *variance, BatchNormParameter *param, + int task_id, float16_t *output); void FusedBatchNormFp16(const void 
*input, const void *scale, const void *offset, const void *mean, const void *variance, BatchNormParameter *param, int task_id, void *output); diff --git a/mindspore/lite/nnacl/fp16/common_func.c b/mindspore/lite/nnacl/fp16/common_func.c deleted file mode 100644 index 84ddcd8e4b..0000000000 --- a/mindspore/lite/nnacl/fp16/common_func.c +++ /dev/null @@ -1,61 +0,0 @@ -/** - * Copyright 2020 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include "nnacl/fp16/common_func.h" - -void ReluFp16(float16_t *data, float16_t *dst, int ele_num) { - int eight_block = UP_DIV(ele_num, C8NUM); - for (int i = 0; i < eight_block - 1; i++) { - int index = i * C8NUM; -#ifdef ENABLE_NEON - float16x8_t relu_data = vld1q_f16(data + index); - float16x8_t zero_data = vdupq_n_f16(0); - relu_data = vmaxq_f16(relu_data, zero_data); - vst1q_f16(dst + index, relu_data); -#else - data[index] = data[index] < 0 ? 0 : data[index]; - data[index + 1] = data[index + 1] < 0 ? 0 : data[index + 1]; - data[index + 2] = data[index + 2] < 0 ? 0 : data[index + 2]; - data[index + 3] = data[index + 3] < 0 ? 0 : data[index + 3]; -#endif - } - for (int j = (eight_block - 1) * C8NUM; j < ele_num; ++j) { - data[j] = data[j] < 0 ? 0 : data[j]; - } -} - -void Relu6Fp16(float16_t *data, float16_t *dst, int ele_num) { - int eight_block = UP_DIV(ele_num, C8NUM); - for (int i = 0; i < eight_block - 1; i++) { - int index = i * C8NUM; -#ifdef ENABLE_NEON - float16x8_t relu6_data = vld1q_f16(data + index); - float16x8_t zero_data = vdupq_n_f16(0); - float16x8_t six_data = vdupq_n_f16(6); - relu6_data = vmaxq_f16(relu6_data, zero_data); - relu6_data = vminq_f16(relu6_data, six_data); - vst1q_f16(dst + index, relu6_data); -#else - for (int j = 0; j < C8NUM; ++j) { - data[index + j] = data[index + j] < 0 ? 0 : data[index + j]; - data[index + j] = data[index + j] > 6 ? 6 : data[index + j]; - } -#endif - } - for (int j = (eight_block - 1) * C8NUM; j < ele_num; ++j) { - data[j] = data[j] < 0 ? 0 : data[j]; - data[j] = data[j] > 6 ? 6 : data[j]; - } -} diff --git a/mindspore/lite/nnacl/fp16/common_func.h b/mindspore/lite/nnacl/fp16/common_func.h deleted file mode 100644 index 2faaec4bb0..0000000000 --- a/mindspore/lite/nnacl/fp16/common_func.h +++ /dev/null @@ -1,51 +0,0 @@ -/** - * Copyright 2020 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
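Stepping back from the deletions: the BatchNormFp16/FusedBatchNormFp16 hunks above compute the standard inference-time normalization y = (x - mean) / sqrt(var + eps), optionally followed by scale and offset, and with this patch skip the store entirely when the denominator is zero. A scalar fp32 sketch of the guarded fused form (hypothetical names):

#include <math.h>

/* Sketch: guarded fused batch-norm for one element. `prev` is the existing
 * output value, returned unchanged when the denominator is zero, mirroring
 * how the patched kernel skips the store in that case. */
static float fused_bn_sketch(float x, float mean, float var, float eps,
                             float scale, float offset, float prev) {
  float denom = sqrtf(var + eps);
  if (denom == 0.0f) {
    return prev;
  }
  return (x - mean) / denom * scale + offset;
}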
- */ - -#ifndef MINDSPORE_LITE_NNACL_FP16_COMMON_FUNC_H_ -#define MINDSPORE_LITE_NNACL_FP16_COMMON_FUNC_H_ - -#include -#include -#include -#include "nnacl/op_base.h" -#include "nnacl/conv_parameter.h" - -#ifdef __cplusplus -extern "C" { -#endif - -#ifdef ENABLE_ARM64 -void ConvDwFp16Border(float16_t *dst, const float16_t *src, const float16_t *weight, const float16_t *bias, - size_t height, size_t width, size_t in_kh_step, size_t in_kw_step, size_t kernel_w, size_t relu, - size_t relu6); -void ConvDwFp16Center(float16_t *dst, const float16_t *src, const float16_t *weight, const float16_t *bias, - size_t height, size_t width, size_t kernel_h, size_t kernel_w, size_t out_h_step, - size_t block_channel, size_t in_sh_step, size_t in_sw_step, size_t in_kh_step, size_t in_kw_step, - size_t relu, size_t relu6); -void DeconvDwFp16Border(float16_t *dst, const float16_t *src, const float16_t *weight, size_t height, size_t width, - size_t in_kh_step, size_t in_kw_step, size_t kernel_w); -void DeconvDwFp16Center(float16_t *dst, const float16_t *src, const float16_t *weight, size_t height, size_t width, - size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel, size_t in_sh_step, - size_t in_sw_step, size_t in_kh_step, size_t in_kw_step); -#endif -void ReluFp16(float16_t *data, float16_t *dst, int ele_num); -void Relu6Fp16(float16_t *data, float16_t *dst, int ele_num); - -#ifdef __cplusplus -} -#endif - -#endif /* MINDSPORE_LITE_NNACL_FP32_COMMON_FUNC_H_ */ diff --git a/mindspore/lite/nnacl/fp16/conv_depthwise_fp16.c b/mindspore/lite/nnacl/fp16/conv_depthwise_fp16.c index e5870f8f4b..35dfe0f46d 100644 --- a/mindspore/lite/nnacl/fp16/conv_depthwise_fp16.c +++ b/mindspore/lite/nnacl/fp16/conv_depthwise_fp16.c @@ -15,8 +15,62 @@ */ #include "nnacl/fp16/conv_depthwise_fp16.h" -#include -#include "nnacl/fp16/common_func.h" +#include +#include "nnacl/fp16/activation_fp16.h" + +void ConvDwFp16(float16_t *output_data, const float16_t *input_data, const float16_t *weight_data, + const float16_t *bias_data, const ConvParameter *conv_param, int task_id) { + int h_step = UP_DIV(conv_param->output_h_, conv_param->thread_num_); + int h_start = h_step * task_id; + int h_end = MSMIN(h_start + h_step, conv_param->output_h_); + bool relu = conv_param->act_type_ == ActType_Relu; + bool relu6 = conv_param->act_type_ == ActType_Relu6; + for (int b = 0; b < conv_param->output_batch_; b++) { + const float16_t *src = input_data + b * conv_param->input_h_ * conv_param->input_w_ * conv_param->input_channel_; + float16_t *dst = output_data + b * conv_param->output_h_ * conv_param->output_w_ * conv_param->output_channel_; + for (int oh = h_start; oh < h_end; oh++) { + float16_t *dst_data = dst + oh * conv_param->output_w_ * conv_param->output_channel_; + + int ih_origin = oh * conv_param->stride_h_ - conv_param->pad_u_; + int start_kh = MSMAX(0, UP_DIV(-ih_origin, conv_param->dilation_h_)); + int end_kh = MSMIN(conv_param->kernel_h_, UP_DIV(conv_param->input_h_ - ih_origin, conv_param->dilation_h_)); + + for (int ow = 0; ow < conv_param->output_w_; ow++) { + memcpy(dst_data + ow * conv_param->output_channel_, bias_data, conv_param->output_channel_ * sizeof(float16_t)); + } + for (int kh = start_kh; kh < end_kh; kh++) { + int ih = ih_origin + conv_param->dilation_w_ * kh; + + const float16_t *src_kh = src + ih * conv_param->input_w_ * conv_param->input_channel_; + const float16_t *weight_kh = weight_data + kh * conv_param->kernel_w_ * conv_param->output_channel_; + + int in_sw_step = conv_param->stride_w_ * 
conv_param->input_channel_; + for (int kw = 0; kw < conv_param->kernel_w_; kw++) { + int out_w_start = MSMAX( + 0, (conv_param->pad_l_ - conv_param->dilation_w_ * kw + conv_param->stride_w_ - 1) / conv_param->stride_w_); + int out_w_end = MSMIN(conv_param->output_w_, (conv_param->input_w_ + conv_param->pad_l_ - + conv_param->dilation_w_ * kw + conv_param->stride_w_ - 1) / + conv_param->stride_w_); + + float16_t *dst_w = dst_data + out_w_start * conv_param->output_channel_; + int iw_origin = (out_w_start * conv_param->stride_w_) - conv_param->pad_l_ + conv_param->dilation_w_ * kw; + + const float16_t *src_kw = src_kh + iw_origin * conv_param->input_channel_; + int num_pixels = out_w_end - out_w_start; + + ConvDwFp16Row(dst_w, src_kw, weight_kh, num_pixels, conv_param->output_channel_, in_sw_step); + weight_kh += conv_param->output_channel_; + } + } + if (relu) { + ReluFp16(dst_data, dst_data, conv_param->output_w_ * conv_param->output_channel_); + } + if (relu6) { + Relu6Fp16(dst_data, dst_data, conv_param->output_w_ * conv_param->output_channel_); + } + } + } +} /*conv depthwise fp16 begin*/ void DepthwiseBorderPixelFp16(float16_t *dst, const float16_t *src, const float16_t *weight, const float16_t *bias, @@ -53,16 +107,18 @@ void DepthwiseBorderPixelFp16(float16_t *dst, const float16_t *src, const float1 void DepthwiseBorderFp16(float16_t *dst, const float16_t *src, const float16_t *weight, const float16_t *bias, int top, int bottom, int left, int right, const ConvParameter *conv_param, const SlidingWindowParam *sliding) { + bool relu = conv_param->act_type_ == ActType_Relu; + bool relu6 = conv_param->act_type_ == ActType_Relu6; float16_t *dst_h = dst + top * sliding->out_h_step_; for (int oh = top; oh < bottom; oh++) { - int ih = oh * conv_param->stride_h_ - conv_param->pad_h_; + int ih = oh * conv_param->stride_h_ - conv_param->pad_u_; int start_kh = MSMAX(0, UP_DIV(-ih, conv_param->dilation_h_)); int end_kh = MSMIN(conv_param->kernel_h_, UP_DIV(conv_param->input_h_ - ih, conv_param->dilation_h_)); const float16_t *src_h = src + ih * sliding->in_h_step_; float16_t *dst_kernel = dst_h + left * sliding->block_channel_; for (int ow = left; ow < right; ow++) { - int iw = ow * conv_param->stride_w_ - conv_param->pad_w_; + int iw = ow * conv_param->stride_w_ - conv_param->pad_l_; int start_kw = MSMAX(0, UP_DIV(-iw, conv_param->dilation_w_)); int end_kw = MSMIN(conv_param->kernel_w_, UP_DIV(conv_param->input_w_ - iw, conv_param->dilation_w_)); const float16_t *src_w = src_h + iw * sliding->block_channel_; @@ -72,11 +128,10 @@ void DepthwiseBorderFp16(float16_t *dst, const float16_t *src, const float16_t * #ifdef ENABLE_ARM64 ConvDwFp16Border(dst_kernel, src_kernel, weight_kernel, bias, end_kh - start_kh, end_kw - start_kw, sliding->in_kh_step_ * sizeof(float16_t), sliding->in_kw_step_ * sizeof(float16_t), - conv_param->kernel_w_ * C8NUM * sizeof(float16_t), conv_param->is_relu_, conv_param->is_relu6_); + conv_param->kernel_w_ * C8NUM * sizeof(float16_t), relu, relu6); #else DepthwiseBorderPixelFp16(dst_kernel, src_kernel, weight_kernel, bias, end_kh - start_kh, end_kw - start_kw, - sliding->in_kh_step_, sliding->in_kw_step_, conv_param->kernel_w_ * C8NUM, - conv_param->is_relu_, conv_param->is_relu6_); + sliding->in_kh_step_, sliding->in_kw_step_, conv_param->kernel_w_ * C8NUM, relu, relu6); #endif dst_kernel += sliding->block_channel_; } // width loop @@ -139,6 +194,8 @@ void DepthwiseCenterFp16(float16_t *dst, const float16_t *src, const float16_t * void ConvDwC8Fp16(float16_t *output_data, 
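The new ConvDwFp16 path above trades per-pixel bounds checks for precomputed tap ranges: a kernel tap kh touches a valid input row exactly when 0 <= ih_origin + kh * dilation < input_h, which yields the MSMAX/MSMIN-clamped UP_DIV bounds used for start_kh/end_kh and, analogously, out_w_start/out_w_end. (The row offset in ConvDwFp16 appears to be scaled by dilation_w_, which is only equivalent to dilation_h_ when the two dilations match.) A standalone sketch of the bound derivation (hypothetical names):

/* Ceiling division for positive divisors, as UP_DIV does in nnacl. */
static int up_div(int a, int b) { return (a + b - 1) / b; }
static int imax(int a, int b) { return a > b ? a : b; }
static int imin(int a, int b) { return a < b ? a : b; }

/* Sketch: range of kernel taps kh whose input row
 * ih = ih_origin + kh * dilation stays inside [0, input_h). */
static void valid_tap_range(int ih_origin, int dilation, int kernel_h, int input_h,
                            int *start_kh, int *end_kh) {
  *start_kh = imax(0, up_div(-ih_origin, dilation));
  *end_kh = imin(kernel_h, up_div(input_h - ih_origin, dilation));
}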
const float16_t *input_data, const float16_t *weight_data, const float16_t *bias_data, const ConvParameter *conv_param, const SlidingWindowParam *sliding, int task_id) { + bool relu = conv_param->act_type_ == ActType_Relu; + bool relu6 = conv_param->act_type_ == ActType_Relu6; const float16_t *src = input_data; float16_t *dst = output_data; for (int b = 0; b < conv_param->output_batch_; b++) { @@ -157,8 +214,8 @@ void ConvDwC8Fp16(float16_t *output_data, const float16_t *input_data, const flo conv_param->output_w_, conv_param, sliding); if (sliding->right_ > sliding->left_ && sliding->bottom_ > sliding->top_) { - int in_h_start = sliding->top_ * conv_param->stride_h_ - conv_param->pad_h_; - int in_w_start = sliding->left_ * conv_param->stride_w_ - conv_param->pad_w_; + int in_h_start = sliding->top_ * conv_param->stride_h_ - conv_param->pad_u_; + int in_w_start = sliding->left_ * conv_param->stride_w_ - conv_param->pad_l_; const float16_t *in_t = src_data + in_h_start * sliding->in_h_step_ + in_w_start * sliding->block_channel_; float16_t *out_t = dst_data + sliding->top_ * sliding->out_h_step_ + sliding->left_ * sliding->block_channel_; #ifdef ENABLE_ARM64 @@ -166,12 +223,12 @@ void ConvDwC8Fp16(float16_t *output_data, const float16_t *input_data, const flo conv_param->kernel_h_, conv_param->kernel_w_, sliding->out_h_step_ * sizeof(float16_t), sliding->block_channel_ * sizeof(float16_t), sliding->in_sh_step_ * sizeof(float16_t), sliding->in_sw_step_ * sizeof(float16_t), sliding->in_kh_step_ * sizeof(float16_t), - sliding->in_kw_step_ * sizeof(float16_t), conv_param->is_relu_, conv_param->is_relu6_); + sliding->in_kw_step_ * sizeof(float16_t), relu, relu6); #else DepthwiseCenterFp16(out_t, in_t, weight, bias, sliding->bottom_ - sliding->top_, sliding->right_ - sliding->left_, conv_param->kernel_h_, conv_param->kernel_w_, sliding->out_h_step_, sliding->block_channel_, sliding->in_sh_step_, sliding->in_sw_step_, - sliding->in_kh_step_, sliding->in_kw_step_, conv_param->is_relu_, conv_param->is_relu6_); + sliding->in_kh_step_, sliding->in_kw_step_, relu, relu6); #endif } } // output C8 loop @@ -210,14 +267,14 @@ void DeconvDepthwiseBorderFp16(float16_t *dst, const float16_t *src, const float const SlidingWindowParam *sliding) { const float16_t *src_h = src + top * sliding->out_h_step_; for (int ih = top; ih < bottom; ih++) { - int oh = ih * conv_param->stride_h_ - conv_param->pad_h_; + int oh = ih * conv_param->stride_h_ - conv_param->pad_u_; int start_kh = MSMAX(0, UP_DIV(-oh, conv_param->dilation_h_)); int end_kh = MSMIN(conv_param->kernel_h_, UP_DIV(conv_param->output_h_ - oh, conv_param->dilation_h_)); float16_t *dst_h = dst + oh * sliding->in_h_step_; const float16_t *src_kernel = src_h + left * sliding->block_channel_; for (int iw = left; iw < right; iw++) { - int ow = iw * conv_param->stride_w_ - conv_param->pad_w_; + int ow = iw * conv_param->stride_w_ - conv_param->pad_l_; int start_kw = MSMAX(0, UP_DIV(-ow, conv_param->dilation_w_)); int end_kw = MSMIN(conv_param->kernel_w_, UP_DIV(conv_param->output_w_ - ow, conv_param->dilation_w_)); float16_t *dst_w = dst_h + ow * sliding->block_channel_; @@ -282,12 +339,14 @@ void DeconvDepthwiseCenterFp16(float16_t *dst, const float16_t *src, const float void DeconvDepthwisePostFuncFp16(float16_t *dst, const float16_t *bias, int block_channel, const ConvParameter *conv_param) { + bool relu = conv_param->act_type_ == ActType_Relu; + bool relu6 = conv_param->act_type_ == ActType_Relu6; float16_t *dst_k = dst; for (int k = 0; k < 
conv_param->output_h_ * conv_param->output_w_; k++) { for (int c = 0; c < C8NUM; c++) { dst_k[c] += bias[c]; - dst_k[c] = (conv_param->is_relu_) ? (MSMAX(0, dst_k[c])) : (dst_k[c]); - dst_k[c] = (conv_param->is_relu6_) ? (MSMIN(6, MSMAX(0, dst_k[c]))) : (dst_k[c]); + dst_k[c] = (relu) ? (MSMAX(0, dst_k[c])) : (dst_k[c]); + dst_k[c] = (relu6) ? (MSMIN(6, MSMAX(0, dst_k[c]))) : (dst_k[c]); } dst_k += block_channel; } @@ -315,8 +374,8 @@ void DeconvDwC8Fp16(float16_t *output_data, const float16_t *input_data, const f conv_param->input_w_, conv_param, sliding); if (sliding->right_ > sliding->left_ && sliding->bottom_ > sliding->top_) { - int oh_h_start = sliding->top_ * conv_param->stride_h_ - conv_param->pad_h_; - int oh_w_start = sliding->left_ * conv_param->stride_w_ - conv_param->pad_w_; + int oh_h_start = sliding->top_ * conv_param->stride_h_ - conv_param->pad_u_; + int oh_w_start = sliding->left_ * conv_param->stride_w_ - conv_param->pad_l_; float16_t *out_t = dst_data + oh_h_start * sliding->in_h_step_ + oh_w_start * sliding->block_channel_; const float16_t *in_t = src_data + sliding->top_ * sliding->out_h_step_ + sliding->left_ * sliding->block_channel_; diff --git a/mindspore/lite/nnacl/fp16/conv_depthwise_fp16.h b/mindspore/lite/nnacl/fp16/conv_depthwise_fp16.h index e70ebcfaf6..b1ae0a12d9 100644 --- a/mindspore/lite/nnacl/fp16/conv_depthwise_fp16.h +++ b/mindspore/lite/nnacl/fp16/conv_depthwise_fp16.h @@ -23,6 +23,26 @@ #ifdef __cplusplus extern "C" { #endif +#ifdef ENABLE_ARM64 +void ConvDwFp16Row(float16_t *output_ptr, const float16_t *input_ptr, const float16_t *filter_ptr, size_t num_pixels, + size_t input_channel, size_t input_step); +void ConvDwFp16Border(float16_t *dst, const float16_t *src, const float16_t *weight, const float16_t *bias, + size_t height, size_t width, size_t in_kh_step, size_t in_kw_step, size_t kernel_w, size_t relu, + size_t relu6); +void ConvDwFp16Center(float16_t *dst, const float16_t *src, const float16_t *weight, const float16_t *bias, + size_t height, size_t width, size_t kernel_h, size_t kernel_w, size_t out_h_step, + size_t block_channel, size_t in_sh_step, size_t in_sw_step, size_t in_kh_step, size_t in_kw_step, + size_t relu, size_t relu6); +void DeconvDwFp16Border(float16_t *dst, const float16_t *src, const float16_t *weight, size_t height, size_t width, + size_t in_kh_step, size_t in_kw_step, size_t kernel_w); +void DeconvDwFp16Center(float16_t *dst, const float16_t *src, const float16_t *weight, size_t height, size_t width, + size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel, size_t in_sh_step, + size_t in_sw_step, size_t in_kh_step, size_t in_kw_step); +#endif + +void ConvDwFp16(float16_t *output_data, const float16_t *input_data, const float16_t *weight_data, + const float16_t *bias_data, const ConvParameter *conv_param, int task_id); + void ConvDwC8Fp16(float16_t *output_data, const float16_t *input_data, const float16_t *weight_data, const float16_t *bias_data, const ConvParameter *conv_param, const SlidingWindowParam *sliding, int task_id); diff --git a/mindspore/lite/nnacl/fp16/conv_fp16.c b/mindspore/lite/nnacl/fp16/conv_fp16.c index 91bac43931..6237e97f8b 100644 --- a/mindspore/lite/nnacl/fp16/conv_fp16.c +++ b/mindspore/lite/nnacl/fp16/conv_fp16.c @@ -173,16 +173,18 @@ void SWBorderPixel(float16_t *dst, const float16_t *src, const float16_t *weight void SWBorderFp16(float16_t *dst, const float16_t *src, const float16_t *weight, const float16_t *bias, int top, int bottom, int left, int right, const ConvParameter 
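These hunks also show the API shift running through the whole patch: the ConvParameter pair is_relu_/is_relu6_ is replaced by a single act_type_ enum, decoded once into two bools at the top of each kernel and applied in the post-accumulation epilogue, as in DeconvDepthwisePostFuncFp16 above. A sketch of that epilogue style (hypothetical names; the enum values are assumptions modeled on the pattern used here, not the real nnacl ActType definition):

typedef enum { ActTypeSketch_No, ActTypeSketch_Relu, ActTypeSketch_Relu6 } ActTypeSketch;

/* Sketch: add bias, then clamp according to the fused activation. */
static void post_func_sketch(float *dst, const float *bias, int channels,
                             ActTypeSketch act_type) {
  int relu = act_type == ActTypeSketch_Relu;
  int relu6 = act_type == ActTypeSketch_Relu6;
  for (int c = 0; c < channels; ++c) {
    float v = dst[c] + bias[c];
    if (relu) v = v > 0 ? v : 0;
    if (relu6) v = v < 0 ? 0 : (v > 6 ? 6 : v);
    dst[c] = v;
  }
}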
*conv_param, const SlidingWindowParam *sliding) { + bool relu = conv_param->act_type_ == ActType_Relu; + bool relu6 = conv_param->act_type_ == ActType_Relu6; float16_t *dst_h = dst + top * sliding->out_h_step_; for (int oh = top; oh < bottom; oh++) { - int ih = oh * conv_param->stride_h_ - conv_param->pad_h_; + int ih = oh * conv_param->stride_h_ - conv_param->pad_u_; int start_kh = MSMAX(0, UP_DIV(-ih, conv_param->dilation_h_)); int end_kh = MSMIN(conv_param->kernel_h_, UP_DIV(conv_param->input_h_ - ih, conv_param->dilation_h_)); const float16_t *src_h = src + ih * sliding->in_h_step_; float16_t *dst_kernel = dst_h + left * sliding->block_channel_; for (int ow = left; ow < right; ow++) { - int iw = ow * conv_param->stride_w_ - conv_param->pad_w_; + int iw = ow * conv_param->stride_w_ - conv_param->pad_l_; int start_kw = MSMAX(0, UP_DIV(-iw, conv_param->dilation_w_)); int end_kw = MSMIN(conv_param->kernel_w_, UP_DIV(conv_param->input_w_ - iw, conv_param->dilation_w_)); const float16_t *src_w = src_h + iw * sliding->ic4_channel_; @@ -192,7 +194,7 @@ void SWBorderFp16(float16_t *dst, const float16_t *src, const float16_t *weight, SWBorderPixel(dst_kernel, src_kernel, weight_kernel, bias, end_kh - start_kh, end_kw - start_kw, sliding->in_kh_step_, sliding->in_kw_step_, conv_param->kernel_h_, conv_param->kernel_w_, - sliding->ic4_channel_, conv_param->is_relu_, conv_param->is_relu6_); + sliding->ic4_channel_, relu, relu6); dst_kernel += sliding->block_channel_; } // width loop @@ -273,6 +275,8 @@ void SWCenterFp16(float16_t *dst, const float16_t *src, const float16_t *weight, void ConvSWFp16(const float16_t *input_data, const float16_t *packed_weight, const float16_t *bias_data, float16_t *tmp_out_block, float16_t *output_data, int task_id, ConvParameter *conv_param, SlidingWindowParam *slidingWindow_param) { + bool relu = conv_param->act_type_ == ActType_Relu; + bool relu6 = conv_param->act_type_ == ActType_Relu6; int oc4_res = conv_param->output_channel_ % C4NUM; const float16_t *src = input_data; float16_t *dst; @@ -299,8 +303,8 @@ void ConvSWFp16(const float16_t *input_data, const float16_t *packed_weight, con if (slidingWindow_param->right_ > slidingWindow_param->left_ && slidingWindow_param->bottom_ > slidingWindow_param->top_) { - int in_h_start = slidingWindow_param->top_ * conv_param->stride_h_ - conv_param->pad_h_; - int in_w_start = slidingWindow_param->left_ * conv_param->stride_w_ - conv_param->pad_w_; + int in_h_start = slidingWindow_param->top_ * conv_param->stride_h_ - conv_param->pad_u_; + int in_w_start = slidingWindow_param->left_ * conv_param->stride_w_ - conv_param->pad_l_; const float16_t *in_t = src_data + in_h_start * slidingWindow_param->in_h_step_ + in_w_start * slidingWindow_param->ic4_channel_; float16_t *out_t = dst_data + slidingWindow_param->top_ * slidingWindow_param->out_h_step_ + @@ -310,7 +314,7 @@ void ConvSWFp16(const float16_t *input_data, const float16_t *packed_weight, con conv_param->kernel_w_, slidingWindow_param->out_h_step_, slidingWindow_param->block_channel_, slidingWindow_param->ic4_channel_, slidingWindow_param->in_sh_step_, slidingWindow_param->in_sw_step_, slidingWindow_param->in_kh_step_, - slidingWindow_param->in_kw_step_, conv_param->is_relu_, conv_param->is_relu6_); + slidingWindow_param->in_kw_step_, relu, relu6); } } // output C4 loop src += slidingWindow_param->in_step_; @@ -330,8 +334,8 @@ void ConvFp16(float16_t *input_data, float16_t *packed_input, float16_t *packed_ int out_h = conv_param->output_h_; int out_w = 
conv_param->output_w_; int out_channel = conv_param->output_channel_; - bool relu = conv_param->is_relu_; - bool relu6 = conv_param->is_relu6_; + bool relu = conv_param->act_type_ == ActType_Relu; + bool relu6 = conv_param->act_type_ == ActType_Relu6; int thread_count = conv_param->thread_num_; const int tile_n = 16; int output_count = out_h * out_w; @@ -365,9 +369,10 @@ void ConvFp16(float16_t *input_data, float16_t *packed_input, float16_t *packed_ out_channel * sizeof(float16_t), 0, 0, relu, relu6); } else { // res part - IndirectGemmFp16_16x8(tmp_out_block, gemm_input, packed_weight, bias_data, conv_depth, ic4, out_channel, + float16_t *tmp_out_ptr = tmp_out_block + task_id * tile_n * out_channel; + IndirectGemmFp16_16x8(tmp_out_ptr, gemm_input, packed_weight, bias_data, conv_depth, ic4, out_channel, out_channel * sizeof(float16_t), 0, 0, relu, relu6); - memcpy(output_data + out_offset, tmp_out_block, real_cal_num * out_channel * sizeof(float16_t)); + memcpy(output_data + out_offset, tmp_out_ptr, real_cal_num * out_channel * sizeof(float16_t)); } } } @@ -395,7 +400,6 @@ void Conv3x3Fp16(float16_t *input_data, float16_t *transed_weight, const float16 int input_batch = conv_param->input_batch_; for (int batch = 0; batch < input_batch; batch++) { - int in_batch_offset = batch * ic4 * C4NUM * conv_param->input_h_ * conv_param->input_w_; int tmp_out_batch_offset = batch * oc8 * C8NUM * out_w_block * out_h_block * output_unit * output_unit; for (int thread_id = task_id; thread_id < output_tile_count; thread_id += thread_count) { int start_index = thread_id * tile_num; diff --git a/mindspore/lite/nnacl/fp16/deconv_fp16.c b/mindspore/lite/nnacl/fp16/deconv_fp16.c index 2675ecf16c..476c33f457 100644 --- a/mindspore/lite/nnacl/fp16/deconv_fp16.c +++ b/mindspore/lite/nnacl/fp16/deconv_fp16.c @@ -73,8 +73,8 @@ int DeConvPostFp16(const float16_t *src, float16_t *tmp, const float16_t *bias, for (int ih = 0; ih < conv_param->input_h_; ih++) { for (int iw = 0; iw < conv_param->input_w_; iw++) { - int oh = ih * conv_param->stride_h_ - conv_param->pad_h_; - int ow = iw * conv_param->stride_w_ - conv_param->pad_w_; + int oh = ih * conv_param->stride_h_ - conv_param->pad_u_; + int ow = iw * conv_param->stride_w_ - conv_param->pad_l_; int kh_start = MSMAX(0, UP_DIV(-oh, conv_param->dilation_h_)); int kh_end = MSMIN(conv_param->kernel_h_, UP_DIV(conv_param->output_h_ - oh, conv_param->dilation_h_)); @@ -112,7 +112,7 @@ int DeConvPostFp16(const float16_t *src, float16_t *tmp, const float16_t *bias, } /*ih*/ } /*oc8*/ - PostConvFuncFp16C8(tmp, dst, bias, output_channel, output_plane, conv_param->output_channel_, conv_param->is_relu_, - conv_param->is_relu6_); + PostConvFuncFp16C8(tmp, dst, bias, output_channel, output_plane, conv_param->output_channel_, + conv_param->act_type_ == ActType_Relu, conv_param->act_type_ == ActType_Relu6); return NNACL_OK; } diff --git a/mindspore/lite/nnacl/fp16/pack_fp16.c b/mindspore/lite/nnacl/fp16/pack_fp16.c index 6039528fdc..4ae77f72ae 100644 --- a/mindspore/lite/nnacl/fp16/pack_fp16.c +++ b/mindspore/lite/nnacl/fp16/pack_fp16.c @@ -21,14 +21,14 @@ void Conv1x1InputPackFp16(const float16_t *src, float16_t *dst, ConvParameter *conv_param) { /* support nhwc */ for (int dst_h = 0; dst_h < conv_param->output_h_; dst_h++) { - int src_h = dst_h * conv_param->stride_h_ - conv_param->pad_h_; + int src_h = dst_h * conv_param->stride_h_ - conv_param->pad_u_; if (src_h < 0 || src_h >= conv_param->input_h_) { continue; } const float16_t *src_h_ptr = src + src_h * conv_param->input_w_ * 
conv_param->input_channel_; float16_t *dst_h_ptr = dst + dst_h * conv_param->output_w_ * conv_param->input_channel_; for (int dst_w = 0; dst_w < conv_param->output_w_; dst_w++) { - int src_w = dst_w * conv_param->stride_w_ - conv_param->pad_w_; + int src_w = dst_w * conv_param->stride_w_ - conv_param->pad_l_; if (src_w < 0 || src_w >= conv_param->input_w_) { continue; } @@ -46,44 +46,40 @@ void Im2ColPackUnitFp16(float16_t *input_data, ConvParameter *conv_param, float1 int kernel_w = conv_param->kernel_w_; int stride_h = conv_param->stride_h_; int stride_w = conv_param->stride_w_; - int pad_h = conv_param->pad_h_; - int pad_w = conv_param->pad_w_; + int pad_h = conv_param->pad_u_; + int pad_w = conv_param->pad_l_; int dilation_h = conv_param->dilation_h_; int dilation_w = conv_param->dilation_w_; int in_channel = conv_param->input_channel_; int in_h = conv_param->input_h_; int in_w = conv_param->input_w_; int out_w = conv_param->output_w_; - int channel_block = UP_DIV(in_channel, 4); - int kernel_plane = kernel_h * kernel_w; + int ic4 = UP_DIV(in_channel, 4); + memset(packed_input, 0, kernel_w * kernel_h * ic4 * C4NUM * 16 * sizeof(float16_t)); for (int i = 0; i < real_cal_num; i++) { int block_start = block_index + i; int input_h = block_start / out_w * stride_h - pad_h; int input_w = block_start % out_w * stride_w - pad_w; - for (int j = 0; j < kernel_h; j++) { - int input_y = input_h + j * dilation_h; - if (input_y < 0 || input_y >= in_h) { - continue; - } - int input_y_stride = input_y * in_w * channel_block * C4NUM; - for (int n = 0; n < kernel_w; n++) { - int input_x = input_w + n * dilation_w; - if (input_x < 0 || input_x >= in_w) { - continue; - } - int input_x_stride = input_y_stride + input_x * channel_block * C4NUM; - int input_plane_offset = (j * kernel_w + n) * 16 * C4NUM * channel_block + i * C4NUM; - for (int m = 0; m < channel_block; m++) { + int input_stride = input_h * in_w * ic4 * C4NUM + input_w * ic4 * C4NUM; + int kh_s = MSMAX(0, UP_DIV(-input_h, dilation_h)); + int kh_e = MSMIN(kernel_h, UP_DIV(in_h - input_h, dilation_h)); + int kw_s = MSMAX(0, UP_DIV(-input_w, dilation_w)); + int kw_e = MSMIN(kernel_w, UP_DIV(in_w - input_w, dilation_w)); + for (int j = kh_s; j < kh_e; j++) { + int input_y_stride = j * dilation_h * in_w * ic4 * C4NUM + input_stride; + for (int n = kw_s; n < kw_e; n++) { + int input_x_stride = input_y_stride + n * dilation_w * ic4 * C4NUM; + int input_plane_offset = (j * kernel_w + n) * 16 * C4NUM * ic4 + i * C4NUM; + for (int m = 0; m < ic4; m++) { int channel_block_stride = input_x_stride + m * C4NUM; int channel_block_offset = input_plane_offset + m * 16 * C4NUM; #ifdef ENABLE_ARM64 vst1_f16(packed_input + channel_block_offset, vld1_f16(input_data + channel_block_stride)); #else - (packed_input + channel_block_offset)[0] = (input_data + channel_block_stride)[0]; - (packed_input + channel_block_offset)[1] = (input_data + channel_block_stride)[1]; - (packed_input + channel_block_offset)[2] = (input_data + channel_block_stride)[2]; - (packed_input + channel_block_offset)[3] = (input_data + channel_block_stride)[3]; + for (int l = 0; l < C4NUM; ++l) { + (packed_input + channel_block_offset)[l] = (input_data + channel_block_stride)[l]; + } #endif } // channel_block loop } // kernel_w loop @@ -221,6 +217,19 @@ void PackNCHWToNC4HW4Fp16(const void *src, void *dst, int batch, int plane, int } } +void PackNCHWToNHWCFp16(const void *src, void *dst, int batch, int plane, int channel) { + for (int n = 0; n < batch; n++) { + for (int c = 0; c < channel; c++) 
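PackNCHWToNHWCFp16, added here, is a pure layout transpose; the index arithmetic is easier to check in a standalone sketch (fp32 for portability, hypothetical name):

/* Sketch: repack NCHW -> NHWC. For element (n, c, hw) with hw = h*W + w:
 *   nchw index = (n * C + c) * HW + hw
 *   nhwc index = (n * HW + hw) * C + c                                  */
static void pack_nchw_to_nhwc_sketch(const float *src, float *dst,
                                     int batch, int plane, int channel) {
  for (int n = 0; n < batch; ++n) {
    for (int c = 0; c < channel; ++c) {
      for (int hw = 0; hw < plane; ++hw) {
        dst[(n * plane + hw) * channel + c] = src[(n * channel + c) * plane + hw];
      }
    }
  }
}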
{ + for (int hw = 0; hw < plane; hw++) { + int nhwc_index = n * channel * plane + hw * channel + c; + int nchw_index = n * channel * plane + c * plane + hw; + ((float16_t *)(dst))[nhwc_index] = ((const float16_t *)(src))[nchw_index]; + } + } + } + return; +} + void PackNHWCToNHWC4Fp16(const void *src, void *dst, int batch, int plane, int channel) { int ic4 = UP_DIV(channel, C4NUM); int nhwc4_batch_unit_offset = ic4 * C4NUM * plane; diff --git a/mindspore/lite/nnacl/fp16/pack_fp16.h b/mindspore/lite/nnacl/fp16/pack_fp16.h index 493672b05d..afc55f2447 100644 --- a/mindspore/lite/nnacl/fp16/pack_fp16.h +++ b/mindspore/lite/nnacl/fp16/pack_fp16.h @@ -41,6 +41,8 @@ void PackNHWCToNC4HW4Fp16(const void *src, void *dst, int batch, int plane, int void PackNCHWToNC4HW4Fp16(const void *src, void *dst, int batch, int plane, int channel); +void PackNCHWToNHWCFp16(const void *src, void *dst, int batch, int plane, int channel); + void PackNHWCToNHWC4Fp16(const void *src, void *dst, int batch, int plane, int channel); void PackNHWCToNHWC8Fp16(const void *src, void *dst, int batch, int plane, int channel); diff --git a/mindspore/lite/nnacl/fp16/softmax_fp16.c b/mindspore/lite/nnacl/fp16/softmax_fp16.c new file mode 100644 index 0000000000..b0df45db6b --- /dev/null +++ b/mindspore/lite/nnacl/fp16/softmax_fp16.c @@ -0,0 +1,67 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "nnacl/fp16/softmax_fp16.h" +#include +#include + +// output = exp(input) / reduce_sum(exp(input), axis) +void SoftmaxFp16(const float16_t *input_ptr, float16_t *output_ptr, float16_t *sum_data, SoftmaxParameter *parameter) { + int32_t axis = parameter->axis_; + int n_dim = parameter->n_dim_; + int ele_size = parameter->element_size_; + int *input_shape = parameter->input_shape_; + + float16_t max_data = input_ptr[0]; + for (int i = 0; i < ele_size; i++) { + max_data = max_data > input_ptr[i] ? 
max_data : input_ptr[i]; + } + + for (int i = 0; i < ele_size; i++) { + output_ptr[i] = exp(input_ptr[i] - max_data); + } + int inner_size = 1, outter_size = 1; + for (int i = 0; i < axis; i++) { + outter_size *= input_shape[i]; + } + for (int i = axis + 1; i < n_dim; i++) { + inner_size *= input_shape[i]; + } + + for (int i = 0; i < outter_size; i++) { + int outter_offset = i * input_shape[axis] * inner_size; + int sum_outter_offset = i * inner_size; + for (int k = 0; k < inner_size; k++) { + int inner_offset = outter_offset + k; + for (int j = 0; j < input_shape[axis]; j++) { + int axis_offset = inner_offset + j * inner_size; + sum_data[k + sum_outter_offset] += output_ptr[axis_offset]; + } + } + } + + for (int i = 0; i < outter_size; i++) { + int outter_offset = i * input_shape[axis] * inner_size; + int sum_outter_offset = i * inner_size; + for (int j = 0; j < input_shape[axis]; j++) { + int axis_offset = outter_offset + j * inner_size; + for (int k = 0; k < inner_size; k++) { + int inner_offset = axis_offset + k; + output_ptr[inner_offset] = output_ptr[inner_offset] / sum_data[k + sum_outter_offset]; + } + } + } +} diff --git a/mindspore/lite/nnacl/fp16/softmax_fp16.h b/mindspore/lite/nnacl/fp16/softmax_fp16.h new file mode 100644 index 0000000000..7e4127fe0e --- /dev/null +++ b/mindspore/lite/nnacl/fp16/softmax_fp16.h @@ -0,0 +1,33 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
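SoftmaxFp16 above uses the standard max-subtraction trick: softmax(x)_i = exp(x_i - max(x)) / sum_j exp(x_j - max(x)) is algebraically identical to the naive form but keeps every exponent non-positive, so the fp16 exp cannot overflow. Note that it accumulates into sum_data with +=, so the caller appears to be responsible for zero-initializing that buffer. A 1-D fp32 sketch of the same computation (hypothetical name):

#include <math.h>

/* Sketch: numerically stable softmax over a single axis. */
static void softmax1d_sketch(const float *in, float *out, int n) {
  float max_val = in[0];
  for (int i = 1; i < n; ++i) {
    if (in[i] > max_val) max_val = in[i];
  }
  float sum = 0.0f;
  for (int i = 0; i < n; ++i) {
    out[i] = expf(in[i] - max_val);  /* exponent <= 0: no overflow */
    sum += out[i];
  }
  for (int i = 0; i < n; ++i) {
    out[i] /= sum;
  }
}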
+ */
+
+#ifndef MINDSPORE_LITE_NNACL_FP16_SOFTMAX_FP16_H_
+#define MINDSPORE_LITE_NNACL_FP16_SOFTMAX_FP16_H_
+
+#include "nnacl/op_base.h"
+#include "nnacl/softmax_parameter.h"
+#ifdef ENABLE_NEON
+#include <arm_neon.h>
+#endif
+#ifdef __cplusplus
+extern "C" {
+#endif
+void SoftmaxFp16(const float16_t *input_ptr, float16_t *output_ptr, float16_t *sum_data, SoftmaxParameter *parameter);
+#ifdef __cplusplus
+}
+#endif
+
+#endif // MINDSPORE_LITE_NNACL_FP16_SOFTMAX_FP16_H_
diff --git a/mindspore/lite/nnacl/fp16/winograd_transform_fp16.c b/mindspore/lite/nnacl/fp16/winograd_transform_fp16.c
index 4e683efe0c..d683f07ab0 100644
--- a/mindspore/lite/nnacl/fp16/winograd_transform_fp16.c
+++ b/mindspore/lite/nnacl/fp16/winograd_transform_fp16.c
@@ -230,8 +230,8 @@ void Conv3x3Fp16InputTransform(const float16_t *input_data, float16_t *trans_inp
   int input_channel = conv_param->input_channel_;
   int input_width = conv_param->input_w_;
   int input_height = conv_param->input_h_;
-  int pad_w = conv_param->pad_w_;
-  int pad_h = conv_param->pad_h_;
+  int pad_w = conv_param->pad_l_;
+  int pad_h = conv_param->pad_u_;
   int ic8 = UP_DIV(input_channel, C8NUM);
   if (out_w_block == 0) {
     return;
@@ -576,8 +576,8 @@ void WinogradInputTransformFp16(const float16_t *input_data, float16_t *trans_in
   int output_unit = conv_param->output_unit_;
   int in_channel = conv_param->input_channel_;
   int ic8 = UP_DIV(in_channel, C8NUM);
-  int pad_h = conv_param->pad_h_;
-  int pad_w = conv_param->pad_w_;
+  int pad_h = conv_param->pad_u_;
+  int pad_w = conv_param->pad_l_;
   int input_h = conv_param->input_h_;
   int input_w = conv_param->input_w_;
   if (out_w_block_num == 0) {
@@ -607,7 +607,7 @@
       for (int j = 0; j < (interval_x_e - interval_x_s); j++) {
         int src_x_offset = src_y_offset + j * ic8 * C8NUM;
         int dst_x_offset = dst_y_offset + j * C8NUM;
-        float16_t *src_addr = input_data + src_x_offset;
+        const float16_t *src_addr = input_data + src_x_offset;
         float16_t *dst_addr = tmp_data + dst_x_offset;
 #ifdef ENABLE_NEON
         vst1q_f16(dst_addr, vld1q_f16(src_addr));
diff --git a/mindspore/lite/nnacl/fp32/activation.c b/mindspore/lite/nnacl/fp32/activation.c
index 17e340751e..3cb34cd9ae 100644
--- a/mindspore/lite/nnacl/fp32/activation.c
+++ b/mindspore/lite/nnacl/fp32/activation.c
@@ -43,15 +43,33 @@ int LRelu(const float *src, int length, float *dst, float alpha) {
 }

 int Sigmoid(const float *src, int length, float *dst) {
+  const float upper_bound = 16.619047164916992188f;
+  const float lower_bound = -9.0f;
   for (int i = 0; i < length; ++i) {
-    dst[i] = 1.0f / (1.0f + exp(-src[i]));
+    float input_val = src[i];
+    float result;
+    if (input_val > upper_bound) {
+      result = 1.0f;
+    } else if (input_val < lower_bound) {
+      result = exp(input_val);
+    } else {
+      result = 1.0f / (1.0f + exp(-input_val));
+    }
+    dst[i] = result;
   }
   return NNACL_OK;
 }

 int Tanh(const float *src, int length, float *dst) {
   for (int i = 0; i < length; ++i) {
-    dst[i] = 1.0f - 2.0f / (exp(2 * src[i]) + 1);
+    float tmp_in = src[i];
+    if (tmp_in > 5.0) {
+      dst[i] = 1.0f;
+    } else if (tmp_in < -5.0) {
+      dst[i] = -1.0f;
+    } else {
+      dst[i] = 1.0f - 2.0f / (exp(2 * tmp_in) + 1);
+    }
   }
   return NNACL_OK;
 }
diff --git a/mindspore/lite/nnacl/fp32/arithmetic.c b/mindspore/lite/nnacl/fp32/arithmetic.c
index 0bfc8e6164..d08ba5c92c 100644
--- a/mindspore/lite/nnacl/fp32/arithmetic.c
+++ b/mindspore/lite/nnacl/fp32/arithmetic.c
@@ -20,55 +20,455 @@
 #define ACCURACY_DATA 0.00000001

 int ElementOptMul(float *input0, float *input1, float
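The Sigmoid/Tanh changes in activation.c clamp the argument before calling exp: above roughly 16.62 the fp32 sigmoid is within one ulp of 1.0f anyway, and below -9 the patch substitutes exp(x), which agrees with 1/(1+exp(-x)) to a relative error of about exp(-9) ~ 1.2e-4 while avoiding a huge exp(-x). Tanh similarly saturates to +/-1.0f beyond |x| = 5, accepting about 1e-4 absolute error there. A sketch of the piecewise sigmoid (bounds copied from the patch):

#include <math.h>

/* Sketch: saturating sigmoid as in the patched fp32 kernel. */
static float sigmoid_sketch(float x) {
  const float upper = 16.619047164916992188f; /* above: within one ulp of 1.0f */
  const float lower = -9.0f;                  /* below: 1/(1+exp(-x)) ~= exp(x) */
  if (x > upper) return 1.0f;
  if (x < lower) return expf(x);
  return 1.0f / (1.0f + expf(-x));
}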
*output, int element_size, ArithmeticParameter *param) { + int block_mod = element_size % C4NUM; + int block_c4 = element_size - block_mod; + float in0_opt = input0[0]; + float in1_opt = input1[0]; +#ifdef ENABLE_NEON + float32x4_t vin0_opt = {input0[0], input0[0], input0[0], input0[0]}; + float32x4_t vin1_opt = {input1[0], input1[0], input1[0], input1[0]}; +#endif + if (param->in_elements_num0_ == 1) { + for (int index = 0; index < block_c4; index += C4NUM) { +#ifdef ENABLE_NEON + float32x4_t vin0 = vin0_opt; + float32x4_t vin1 = vld1q_f32(input1); + float32x4_t vout = vmulq_f32(vin0, vin1); + vst1q_f32(output, vout); +#else + for (int i = 0; i < C4NUM; ++i) { + output[i] = in0_opt * input1[i]; + } +#endif + input1 += C4NUM; + output += C4NUM; + } + for (int index = 0; index < block_mod; ++index) { + output[index] = in0_opt * input1[index]; + } + } else { + for (int index = 0; index < block_c4; index += C4NUM) { +#ifdef ENABLE_NEON + float32x4_t vin0 = vld1q_f32(input0); + float32x4_t vin1 = vin1_opt; + float32x4_t vout = vmulq_f32(vin0, vin1); + vst1q_f32(output, vout); +#else + for (int i = 0; i < C4NUM; ++i) { + output[i] = input0[i] * in1_opt; + } +#endif + input0 += C4NUM; + output += C4NUM; + } + for (int index = 0; index < block_mod; ++index) { + output[index] = input0[index] * in1_opt; + } + } + + return NNACL_OK; +} +int ElementOptMulRelu(float *input0, float *input1, float *output, int element_size, ArithmeticParameter *param) { + int block_mod = element_size % C4NUM; + int block_c4 = element_size - block_mod; + float in0_opt = input0[0]; + float in1_opt = input1[0]; +#ifdef ENABLE_NEON + float32x4_t vin0_opt = {input0[0], input0[0], input0[0], input0[0]}; + float32x4_t vin1_opt = {input1[0], input1[0], input1[0], input1[0]}; + float32x4_t zeros = {0, 0, 0, 0}; +#endif + if (param->in_elements_num0_ == 1) { + for (int index = 0; index < block_c4; index += C4NUM) { +#ifdef ENABLE_NEON + float32x4_t vin0 = vin0_opt; + float32x4_t vin1 = vld1q_f32(input1); + float32x4_t vout = vmaxq_f32(vmulq_f32(vin0, vin1), zeros); + vst1q_f32(output, vout); +#else + for (int i = 0; i < C4NUM; ++i) { + output[i] = MSMAX(in0_opt * input1[i], 0); + } +#endif + input1 += C4NUM; + output += C4NUM; + } + for (int index = 0; index < block_mod; ++index) { + output[index] = MSMAX(in0_opt * input1[index], 0); + } + } else { + for (int index = 0; index < block_c4; index += C4NUM) { +#ifdef ENABLE_NEON + float32x4_t vin0 = vld1q_f32(input0); + float32x4_t vin1 = vin1_opt; + float32x4_t vout = vmaxq_f32(vmulq_f32(vin0, vin1), zeros); + vst1q_f32(output, vout); +#else + for (int i = 0; i < C4NUM; ++i) { + output[i] = MSMAX(input0[i] * in1_opt, 0); + } +#endif + input0 += C4NUM; + output += C4NUM; + } + for (int index = 0; index < block_mod; ++index) { + output[index] = MSMAX(input0[index] * in1_opt, 0); + } + } + + return NNACL_OK; +} +int ElementOptMulRelu6(float *input0, float *input1, float *output, int element_size, ArithmeticParameter *param) { + int block_mod = element_size % C4NUM; + int block_c4 = element_size - block_mod; + float in0_opt = input0[0]; + float in1_opt = input1[0]; +#ifdef ENABLE_NEON + float32x4_t vin0_opt = {input0[0], input0[0], input0[0], input0[0]}; + float32x4_t vin1_opt = {input1[0], input1[0], input1[0], input1[0]}; + float32x4_t zeros = {0, 0, 0, 0}; + float32x4_t bounds = {6, 6, 6, 6}; +#endif if (param->in_elements_num0_ == 1) { - for (int i = 0; i < element_size; ++i) { - output[i] = input0[0] * input1[i]; + for (int index = 0; index < block_c4; index += C4NUM) { +#ifdef 
ENABLE_NEON + float32x4_t vin0 = vin0_opt; + float32x4_t vin1 = vld1q_f32(input1); + float32x4_t vout = vminq_f32(vmaxq_f32(vmulq_f32(vin0, vin1), zeros), bounds); + vst1q_f32(output, vout); +#else + for (int i = 0; i < C4NUM; ++i) { + output[i] = MSMIN(MSMAX(in0_opt * input1[i], 0), 6); + } +#endif + input1 += C4NUM; + output += C4NUM; } - } else if (param->in_elements_num1_ == 1) { - for (int i = 0; i < element_size; ++i) { - output[i] = input0[i] * input1[0]; + for (int index = 0; index < block_mod; ++index) { + output[index] = MSMIN(MSMAX(in0_opt * input1[index], 0), 6); } } else { - for (int i = 0; i < element_size; ++i) { - output[i] = input0[i] * input1[i]; + for (int index = 0; index < block_c4; index += C4NUM) { +#ifdef ENABLE_NEON + float32x4_t vin0 = vld1q_f32(input0); + float32x4_t vin1 = vin1_opt; + float32x4_t vout = vminq_f32(vmaxq_f32(vmulq_f32(vin0, vin1), zeros), bounds); + vst1q_f32(output, vout); +#else + for (int i = 0; i < C4NUM; ++i) { + output[i] = MSMIN(MSMAX(input0[i] * in1_opt, 0), 6); + } +#endif + input0 += C4NUM; + output += C4NUM; + } + for (int index = 0; index < block_mod; ++index) { + output[index] = MSMIN(MSMAX(input0[index] * in1_opt, 0), 6); } } + return NNACL_OK; } int ElementOptSub(float *input0, float *input1, float *output, int element_size, ArithmeticParameter *param) { + int block_mod = element_size % C4NUM; + int block_c4 = element_size - block_mod; + float in0_opt = input0[0]; + float in1_opt = input1[0]; +#ifdef ENABLE_NEON + float32x4_t vin0_opt = {input0[0], input0[0], input0[0], input0[0]}; + float32x4_t vin1_opt = {input1[0], input1[0], input1[0], input1[0]}; +#endif if (param->in_elements_num0_ == 1) { - for (int i = 0; i < element_size; ++i) { - output[i] = input0[0] - input1[i]; + for (int index = 0; index < block_c4; index += C4NUM) { +#ifdef ENABLE_NEON + float32x4_t vin0 = vin0_opt; + float32x4_t vin1 = vld1q_f32(input1); + float32x4_t vout = vsubq_f32(vin0, vin1); + vst1q_f32(output, vout); +#else + for (int i = 0; i < C4NUM; ++i) { + output[i] = in0_opt - input1[i]; + } +#endif + input1 += C4NUM; + output += C4NUM; } - } else if (param->in_elements_num1_ == 1) { - for (int i = 0; i < element_size; ++i) { - output[i] = input0[i] - input1[0]; + for (int index = 0; index < block_mod; ++index) { + output[index] = in0_opt - input1[index]; } } else { - for (int i = 0; i < element_size; ++i) { - output[i] = input0[i] - input1[i]; + for (int index = 0; index < block_c4; index += C4NUM) { +#ifdef ENABLE_NEON + float32x4_t vin0 = vld1q_f32(input0); + float32x4_t vin1 = vin1_opt; + float32x4_t vout = vsubq_f32(vin0, vin1); + vst1q_f32(output, vout); +#else + for (int i = 0; i < C4NUM; ++i) { + output[i] = input0[i] - in1_opt; + } +#endif + input0 += C4NUM; + output += C4NUM; + } + for (int index = 0; index < block_mod; ++index) { + output[index] = input0[index] - in1_opt; } } return NNACL_OK; } +int ElementOptSubRelu(float *input0, float *input1, float *output, int element_size, ArithmeticParameter *param) { + int block_mod = element_size % C4NUM; + int block_c4 = element_size - block_mod; + float in0_opt = input0[0]; + float in1_opt = input1[0]; +#ifdef ENABLE_NEON + float32x4_t vin0_opt = {input0[0], input0[0], input0[0], input0[0]}; + float32x4_t vin1_opt = {input1[0], input1[0], input1[0], input1[0]}; + float32x4_t zeros = {0, 0, 0, 0}; +#endif + if (param->in_elements_num0_ == 1) { + for (int index = 0; index < block_c4; index += C4NUM) { +#ifdef ENABLE_NEON + float32x4_t vin0 = vin0_opt; + float32x4_t vin1 = vld1q_f32(input1); + 
float32x4_t vout = vmaxq_f32(vsubq_f32(vin0, vin1), zeros); + vst1q_f32(output, vout); +#else + for (int i = 0; i < C4NUM; ++i) { + output[i] = MSMAX(in0_opt - input1[i], 0); + } +#endif + input1 += C4NUM; + output += C4NUM; + } + for (int index = 0; index < block_mod; ++index) { + output[index] = MSMAX(in0_opt - input1[index], 0); + } + } else { + for (int index = 0; index < block_c4; index += C4NUM) { +#ifdef ENABLE_NEON + float32x4_t vin0 = vld1q_f32(input0); + float32x4_t vin1 = vin1_opt; + float32x4_t vout = vmaxq_f32(vsubq_f32(vin0, vin1), zeros); + vst1q_f32(output, vout); +#else + for (int i = 0; i < C4NUM; ++i) { + output[i] = MSMAX(input0[i] - in1_opt, 0); + } +#endif + input0 += C4NUM; + output += C4NUM; + } + for (int index = 0; index < block_mod; ++index) { + output[index] = MSMAX(input0[index] - in1_opt, 0); + } + } + + return NNACL_OK; +} +int ElementOptSubRelu6(float *input0, float *input1, float *output, int element_size, ArithmeticParameter *param) { + int block_mod = element_size % C4NUM; + int block_c4 = element_size - block_mod; + float in0_opt = input0[0]; + float in1_opt = input1[0]; +#ifdef ENABLE_NEON + float32x4_t vin0_opt = {input0[0], input0[0], input0[0], input0[0]}; + float32x4_t vin1_opt = {input1[0], input1[0], input1[0], input1[0]}; + float32x4_t zeros = {0, 0, 0, 0}; + float32x4_t bounds = {6, 6, 6, 6}; +#endif + if (param->in_elements_num0_ == 1) { + for (int index = 0; index < block_c4; index += C4NUM) { +#ifdef ENABLE_NEON + float32x4_t vin0 = vin0_opt; + float32x4_t vin1 = vld1q_f32(input1); + float32x4_t vout = vminq_f32(vmaxq_f32(vsubq_f32(vin0, vin1), zeros), bounds); + vst1q_f32(output, vout); +#else + for (int i = 0; i < C4NUM; ++i) { + output[i] = MSMIN(MSMAX(in0_opt - input1[i], 0), 6); + } +#endif + input1 += C4NUM; + output += C4NUM; + } + for (int index = 0; index < block_mod; ++index) { + output[index] = MSMIN(MSMAX(in0_opt - input1[index], 0), 6); + } + } else { + for (int index = 0; index < block_c4; index += C4NUM) { +#ifdef ENABLE_NEON + float32x4_t vin0 = vld1q_f32(input0); + float32x4_t vin1 = vin1_opt; + float32x4_t vout = vminq_f32(vmaxq_f32(vsubq_f32(vin0, vin1), zeros), bounds); + vst1q_f32(output, vout); +#else + for (int i = 0; i < C4NUM; ++i) { + output[i] = MSMIN(MSMAX(input0[i] - in1_opt, 0), 6); + } +#endif + input0 += C4NUM; + output += C4NUM; + } + for (int index = 0; index < block_mod; ++index) { + output[index] = MSMIN(MSMAX(input0[index] - in1_opt, 0), 6); + } + } + + return NNACL_OK; +} int ElementOptAdd(float *input0, float *input1, float *output, int element_size, ArithmeticParameter *param) { + int block_mod = element_size % C4NUM; + int block_c4 = element_size - block_mod; + float in0_opt = input0[0]; + float in1_opt = input1[0]; +#ifdef ENABLE_NEON + float32x4_t vin0_opt = {input0[0], input0[0], input0[0], input0[0]}; + float32x4_t vin1_opt = {input1[0], input1[0], input1[0], input1[0]}; +#endif if (param->in_elements_num0_ == 1) { - for (int i = 0; i < element_size; ++i) { - output[i] = input0[0] + input1[i]; + for (int index = 0; index < block_c4; index += C4NUM) { +#ifdef ENABLE_NEON + float32x4_t vin0 = vin0_opt; + float32x4_t vin1 = vld1q_f32(input1); + float32x4_t vout = vaddq_f32(vin0, vin1); + vst1q_f32(output, vout); +#else + for (int i = 0; i < C4NUM; ++i) { + output[i] = in0_opt + input1[i]; + } +#endif + input1 += C4NUM; + output += C4NUM; } - } else if (param->in_elements_num1_ == 1) { - for (int i = 0; i < element_size; ++i) { - output[i] = input0[i] + input1[0]; + for (int index = 0; index < 
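The fp32 ElementOpt* bodies mirror the fp16 ones: the broadcast operand is splatted into a vector register once (vin0_opt/vin1_opt) outside the loop, the main loop processes C4NUM lanes per iteration, and the element_size % C4NUM tail runs scalar. A compact sketch of that structure (hypothetical name; nnacl gates on ENABLE_NEON, the sketch uses the compiler's __ARM_NEON so it also builds without NEON):

#ifdef __ARM_NEON
#include <arm_neon.h>
#endif

/* Sketch: out[i] = max(c + in[i], 0) with a broadcast scalar lhs. */
static void opt_add_relu_sketch(float c, const float *in, float *out, int n) {
  int i = 0;
#ifdef __ARM_NEON
  float32x4_t vc = vdupq_n_f32(c);      /* splat once, outside the loop */
  float32x4_t vzero = vdupq_n_f32(0.0f);
  for (; i + 4 <= n; i += 4) {
    float32x4_t v = vmaxq_f32(vaddq_f32(vc, vld1q_f32(in + i)), vzero);
    vst1q_f32(out + i, v);
  }
#endif
  for (; i < n; ++i) {                  /* scalar tail: n % 4 leftovers */
    float v = c + in[i];
    out[i] = v > 0.0f ? v : 0.0f;
  }
}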
block_mod; ++index) { + output[index] = in0_opt + input1[index]; } } else { - for (int i = 0; i < element_size; ++i) { - output[i] = input0[i] + input1[i]; + for (int index = 0; index < block_c4; index += C4NUM) { +#ifdef ENABLE_NEON + float32x4_t vin0 = vld1q_f32(input0); + float32x4_t vin1 = vin1_opt; + float32x4_t vout = vaddq_f32(vin0, vin1); + vst1q_f32(output, vout); +#else + for (int i = 0; i < C4NUM; ++i) { + output[i] = input0[i] + in1_opt; + } +#endif + input0 += C4NUM; + output += C4NUM; + } + for (int index = 0; index < block_mod; ++index) { + output[index] = input0[index] + in1_opt; } } return NNACL_OK; } +int ElementOptAddRelu(float *input0, float *input1, float *output, int element_size, ArithmeticParameter *param) { + int block_mod = element_size % C4NUM; + int block_c4 = element_size - block_mod; + float in0_opt = input0[0]; + float in1_opt = input1[0]; +#ifdef ENABLE_NEON + float32x4_t vin0_opt = {input0[0], input0[0], input0[0], input0[0]}; + float32x4_t vin1_opt = {input1[0], input1[0], input1[0], input1[0]}; + float32x4_t zeros = {0, 0, 0, 0}; +#endif + if (param->in_elements_num0_ == 1) { + for (int index = 0; index < block_c4; index += C4NUM) { +#ifdef ENABLE_NEON + float32x4_t vin0 = vin0_opt; + float32x4_t vin1 = vld1q_f32(input1); + float32x4_t vout = vmaxq_f32(vaddq_f32(vin0, vin1), zeros); + vst1q_f32(output, vout); +#else + for (int i = 0; i < C4NUM; ++i) { + output[i] = MSMAX(in0_opt + input1[i], 0); + } +#endif + input1 += C4NUM; + output += C4NUM; + } + for (int index = 0; index < block_mod; ++index) { + output[index] = MSMAX(in0_opt + input1[index], 0); + } + } else { + for (int index = 0; index < block_c4; index += C4NUM) { +#ifdef ENABLE_NEON + float32x4_t vin0 = vld1q_f32(input0); + float32x4_t vin1 = vin1_opt; + float32x4_t vout = vmaxq_f32(vaddq_f32(vin0, vin1), zeros); + vst1q_f32(output, vout); +#else + for (int i = 0; i < C4NUM; ++i) { + output[i] = MSMAX(input0[i] + in1_opt, 0); + } +#endif + input0 += C4NUM; + output += C4NUM; + } + for (int index = 0; index < block_mod; ++index) { + output[index] = MSMAX(input0[index] + in1_opt, 0); + } + } + + return NNACL_OK; +} +int ElementOptAddRelu6(float *input0, float *input1, float *output, int element_size, ArithmeticParameter *param) { + int block_mod = element_size % C4NUM; + int block_c4 = element_size - block_mod; + float in0_opt = input0[0]; + float in1_opt = input1[0]; +#ifdef ENABLE_NEON + float32x4_t vin0_opt = {input0[0], input0[0], input0[0], input0[0]}; + float32x4_t vin1_opt = {input1[0], input1[0], input1[0], input1[0]}; + float32x4_t zeros = {0, 0, 0, 0}; + float32x4_t bounds = {6, 6, 6, 6}; +#endif + if (param->in_elements_num0_ == 1) { + for (int index = 0; index < block_c4; index += C4NUM) { +#ifdef ENABLE_NEON + float32x4_t vin0 = vin0_opt; + float32x4_t vin1 = vld1q_f32(input1); + float32x4_t vout = vminq_f32(vmaxq_f32(vaddq_f32(vin0, vin1), zeros), bounds); + vst1q_f32(output, vout); +#else + for (int i = 0; i < C4NUM; ++i) { + output[i] = MSMIN(MSMAX(in0_opt + input1[i], 0), 6); + } +#endif + input1 += C4NUM; + output += C4NUM; + } + for (int index = 0; index < block_mod; ++index) { + output[index] = MSMIN(MSMAX(in0_opt + input1[index], 0), 6); + } + } else { + for (int index = 0; index < block_c4; index += C4NUM) { +#ifdef ENABLE_NEON + float32x4_t vin0 = vld1q_f32(input0); + float32x4_t vin1 = vin1_opt; + float32x4_t vout = vminq_f32(vmaxq_f32(vaddq_f32(vin0, vin1), zeros), bounds); + vst1q_f32(output, vout); +#else + for (int i = 0; i < C4NUM; ++i) { + output[i] = 
MSMIN(MSMAX(input0[i] + in1_opt, 0), 6); + } +#endif + input0 += C4NUM; + output += C4NUM; + } + for (int index = 0; index < block_mod; ++index) { + output[index] = MSMIN(MSMAX(input0[index] + in1_opt, 0), 6); + } + } + + return NNACL_OK; +} int ElementMul(float *input0, float *input1, float *output, int element_size) { int block_mod = element_size % C4NUM; diff --git a/mindspore/lite/nnacl/fp32/arithmetic.h b/mindspore/lite/nnacl/fp32/arithmetic.h index 5d3303ca0b..ab0e0d0297 100644 --- a/mindspore/lite/nnacl/fp32/arithmetic.h +++ b/mindspore/lite/nnacl/fp32/arithmetic.h @@ -27,8 +27,14 @@ extern "C" { #endif int ElementOptAdd(float *input0, float *input1, float *output, int element_size, ArithmeticParameter *param); +int ElementOptAddRelu(float *input0, float *input1, float *output, int element_size, ArithmeticParameter *param); +int ElementOptAddRelu6(float *input0, float *input1, float *output, int element_size, ArithmeticParameter *param); int ElementOptSub(float *input0, float *input1, float *output, int element_size, ArithmeticParameter *param); +int ElementOptSubRelu(float *input0, float *input1, float *output, int element_size, ArithmeticParameter *param); +int ElementOptSubRelu6(float *input0, float *input1, float *output, int element_size, ArithmeticParameter *param); int ElementOptMul(float *input0, float *input1, float *output, int element_size, ArithmeticParameter *param); +int ElementOptMulRelu(float *input0, float *input1, float *output, int element_size, ArithmeticParameter *param); +int ElementOptMulRelu6(float *input0, float *input1, float *output, int element_size, ArithmeticParameter *param); int ElementMul(float *input0, float *input1, float *output, int element_size); int ElementMulRelu(float *input0, float *input1, float *output, int element_size); int ElementMulRelu6(float *input0, float *input1, float *output, int element_size); diff --git a/mindspore/lite/nnacl/fp32/batchnorm.c b/mindspore/lite/nnacl/fp32/batchnorm.c index 5efde546ce..49926d1c4a 100644 --- a/mindspore/lite/nnacl/fp32/batchnorm.c +++ b/mindspore/lite/nnacl/fp32/batchnorm.c @@ -15,7 +15,6 @@ */ #include "nnacl/fp32/batchnorm.h" -#include "nnacl/fp16/batchnorm_fp16.h" #include #include "nnacl/batchnorm_parameter.h" #include "nnacl/op_base.h" diff --git a/mindspore/lite/nnacl/fp32/conv.c b/mindspore/lite/nnacl/fp32/conv.c index 78c9e495b6..3b390858be 100644 --- a/mindspore/lite/nnacl/fp32/conv.c +++ b/mindspore/lite/nnacl/fp32/conv.c @@ -18,6 +18,7 @@ #include #include "nnacl/fp32/common_func.h" #include "nnacl/winograd_transform.h" +#include "nnacl/fp32/matmul.h" void SWBorderPixel(float *dst, const float *src, const float *weight, const float *bias, int height, int width, int in_kh_step, int in_kw_step, int kernel_h, int kernel_w, int ic4, bool is_relu, bool is_relu6) { @@ -57,16 +58,18 @@ void SWBorderPixel(float *dst, const float *src, const float *weight, const floa void SWBorder(float *dst, const float *src, const float *weight, const float *bias, int top, int bottom, int left, int right, const ConvParameter *conv_param, const SlidingWindowParam *sliding) { int ic4 = sliding->ic4_channel_ / C4NUM; + bool relu = conv_param->act_type_ == ActType_Relu; + bool relu6 = conv_param->act_type_ == ActType_Relu6; float *dst_h = dst + top * sliding->out_h_step_; for (int oh = top; oh < bottom; oh++) { - int ih = oh * conv_param->stride_h_ - conv_param->pad_h_; + int ih = oh * conv_param->stride_h_ - conv_param->pad_u_; int start_kh = MSMAX(0, UP_DIV(-ih, conv_param->dilation_h_)); int end_kh = 
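Note on the ElementOpt* family added above: every variant follows one template. One operand is a scalar that gets broadcast (param->in_elements_num0_ == 1 picks which side), the main loop walks C4NUM-wide blocks that the NEON build services with vld1q_f32/vst1q_f32, a scalar tail covers the remaining element_size % C4NUM elements, and ReLU/ReLU6 are fused as max(x, 0) and min(max(x, 0), 6). A minimal scalar sketch of that contract for the add-relu6 case; C4NUM and the MSMAX/MSMIN macros are redefined locally so the snippet stands alone:

```c
#include <stdio.h>

#define C4NUM 4
#define MSMAX(a, b) ((a) > (b) ? (a) : (b))
#define MSMIN(a, b) ((a) < (b) ? (a) : (b))

/* Scalar reference for the broadcast-add-relu6 contract: input0 is a
 * single scalar broadcast against input1 (the in_elements_num0_ == 1
 * branch). The main loop walks C4NUM-wide blocks, the slots the NEON
 * path fills with vld1q_f32/vst1q_f32, and the tail loop handles the
 * remaining element_size % C4NUM elements. */
static void element_opt_add_relu6_ref(const float *input0, const float *input1,
                                      float *output, int element_size) {
  int block_mod = element_size % C4NUM;
  int block_c4 = element_size - block_mod;
  float in0 = input0[0];
  for (int index = 0; index < block_c4; index += C4NUM) {
    for (int i = 0; i < C4NUM; ++i) {
      output[i] = MSMIN(MSMAX(in0 + input1[i], 0), 6);
    }
    input1 += C4NUM;
    output += C4NUM;
  }
  for (int index = 0; index < block_mod; ++index) {
    output[index] = MSMIN(MSMAX(in0 + input1[index], 0), 6);
  }
}

int main(void) {
  const float in0[1] = {5.0f};
  const float in1[6] = {-7.0f, -1.0f, 0.0f, 0.5f, 2.0f, 9.0f};
  float out[6];
  element_opt_add_relu6_ref(in0, in1, out, 6);
  for (int i = 0; i < 6; ++i) printf("%.1f ", out[i]); /* 0.0 4.0 5.0 5.5 6.0 6.0 */
  printf("\n");
  return 0;
}
```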
MSMIN(conv_param->kernel_h_, UP_DIV(conv_param->input_h_ - ih, conv_param->dilation_h_)); const float *src_h = src + ih * sliding->in_h_step_; float *dst_kernel = dst_h + left * sliding->block_channel_; for (int ow = left; ow < right; ow++) { - int iw = ow * conv_param->stride_w_ - conv_param->pad_w_; + int iw = ow * conv_param->stride_w_ - conv_param->pad_l_; int start_kw = MSMAX(0, UP_DIV(-iw, conv_param->dilation_w_)); int end_kw = MSMIN(conv_param->kernel_w_, UP_DIV(conv_param->input_w_ - iw, conv_param->dilation_w_)); const float *src_w = src_h + iw * sliding->ic4_channel_; @@ -75,8 +78,8 @@ void SWBorder(float *dst, const float *src, const float *weight, const float *bi const float *weight_kernel = weight + (start_kh * conv_param->kernel_w_ + start_kw) * sliding->ic4_channel_; SWBorderPixel(dst_kernel, src_kernel, weight_kernel, bias, end_kh - start_kh, end_kw - start_kw, - sliding->in_kh_step_, sliding->in_kw_step_, conv_param->kernel_h_, conv_param->kernel_w_, ic4, - conv_param->is_relu_, conv_param->is_relu6_); + sliding->in_kh_step_, sliding->in_kw_step_, conv_param->kernel_h_, conv_param->kernel_w_, ic4, relu, + relu6); dst_kernel += sliding->block_channel_; } // width loop @@ -144,6 +147,8 @@ void ConvSWFp32(const float *input_data, const float *packed_weight, const float float *output_data, int task_id, ConvParameter *conv_param, SlidingWindowParam *slidingWindow_param) { int ic4 = slidingWindow_param->ic4_channel_ / C4NUM; int oc4_res = conv_param->output_channel_ % C4NUM; + bool relu = conv_param->act_type_ == ActType_Relu; + bool relu6 = conv_param->act_type_ == ActType_Relu6; const float *src = input_data; float *dst = NULL; if (oc4_res == 0) { @@ -169,28 +174,26 @@ void ConvSWFp32(const float *input_data, const float *packed_weight, const float if (slidingWindow_param->right_ > slidingWindow_param->left_ && slidingWindow_param->bottom_ > slidingWindow_param->top_) { - int in_h_start = slidingWindow_param->top_ * conv_param->stride_h_ - conv_param->pad_h_; - int in_w_start = slidingWindow_param->left_ * conv_param->stride_w_ - conv_param->pad_w_; + int in_h_start = slidingWindow_param->top_ * conv_param->stride_h_ - conv_param->pad_u_; + int in_w_start = slidingWindow_param->left_ * conv_param->stride_w_ - conv_param->pad_l_; const float *in_t = src_data + in_h_start * slidingWindow_param->in_h_step_ + in_w_start * slidingWindow_param->ic4_channel_; float *out_t = dst_data + slidingWindow_param->top_ * slidingWindow_param->out_h_step_ + slidingWindow_param->left_ * slidingWindow_param->block_channel_; #ifdef ENABLE_ARM64 - ConvSwFp32Center(out_t, in_t, weight, bias, slidingWindow_param->bottom_ - slidingWindow_param->top_, - slidingWindow_param->right_ - slidingWindow_param->left_, conv_param->kernel_h_, - conv_param->kernel_w_, slidingWindow_param->out_h_step_ * sizeof(float), - slidingWindow_param->block_channel_ * sizeof(float), ic4, - slidingWindow_param->in_sh_step_ * sizeof(float), - slidingWindow_param->in_sw_step_ * sizeof(float), - slidingWindow_param->in_kh_step_ * sizeof(float), - slidingWindow_param->in_kw_step_ * sizeof(float), - conv_param->is_relu_, conv_param->is_relu6_); + ConvSwFp32Center( + out_t, in_t, weight, bias, slidingWindow_param->bottom_ - slidingWindow_param->top_, + slidingWindow_param->right_ - slidingWindow_param->left_, conv_param->kernel_h_, conv_param->kernel_w_, + slidingWindow_param->out_h_step_ * sizeof(float), slidingWindow_param->block_channel_ * sizeof(float), ic4, + slidingWindow_param->in_sh_step_ * sizeof(float), 
slidingWindow_param->in_sw_step_ * sizeof(float), + slidingWindow_param->in_kh_step_ * sizeof(float), slidingWindow_param->in_kw_step_ * sizeof(float), relu, + relu6); #else SWCenter(out_t, in_t, weight, bias, slidingWindow_param->bottom_ - slidingWindow_param->top_, - slidingWindow_param->right_ - slidingWindow_param->left_, conv_param->kernel_h_, - conv_param->kernel_w_, slidingWindow_param->out_h_step_, slidingWindow_param->block_channel_, ic4, + slidingWindow_param->right_ - slidingWindow_param->left_, conv_param->kernel_h_, conv_param->kernel_w_, + slidingWindow_param->out_h_step_, slidingWindow_param->block_channel_, ic4, slidingWindow_param->in_sh_step_, slidingWindow_param->in_sw_step_, slidingWindow_param->in_kh_step_, - slidingWindow_param->in_kw_step_, conv_param->is_relu_, conv_param->is_relu6_); + slidingWindow_param->in_kw_step_, relu, relu6); #endif } } // output C4 loop @@ -219,6 +222,8 @@ void ConvFp32(float *input_data, float *packed_input, float *packed_weight, cons int kernel_plane = kernel_h * kernel_w; int unit_size = kernel_plane * ic4 * C4NUM; int packed_input_size = output_tile_count * TILE_NUM * unit_size; + bool relu = conv_param->act_type_ == ActType_Relu; + bool relu6 = conv_param->act_type_ == ActType_Relu6; // we accumulate 4 channels per time for input blocks int conv_depth = kernel_h * kernel_w; @@ -240,23 +245,18 @@ void ConvFp32(float *input_data, float *packed_input, float *packed_weight, cons if (real_cal_num == TILE_NUM) { float *gemm_output = output_data + out_offset; gemm_func(gemm_output, gemm_input, packed_weight, bias_data, conv_depth, ic4, out_channel, output_offset, 0, 0, - conv_param->is_relu_, conv_param->is_relu6_); + relu, relu6); } else { // res part - gemm_func(tmp_out_block, gemm_input, packed_weight, bias_data, conv_depth, ic4, out_channel, output_offset, 0, - 0, conv_param->is_relu_, conv_param->is_relu6_); - memcpy(output_data + out_offset, tmp_out_block, real_cal_num * out_channel * sizeof(float)); + float *tmp_out_ptr = tmp_out_block + task_id * TILE_NUM * out_channel; + gemm_func(tmp_out_ptr, gemm_input, packed_weight, bias_data, conv_depth, ic4, out_channel, output_offset, 0, 0, + relu, relu6); + memcpy(output_data + out_offset, tmp_out_ptr, real_cal_num * out_channel * sizeof(float)); } } } } -// fp32 conv1x1 strassen matmul -int Conv1x1Fp32(const float *input_data, const float *weight_data, float *output_data, float *tmp_ptr, - StrassenMatMulParameter matmul_param) { - return StrassenMatmul(input_data, weight_data, output_data, &matmul_param, FP32_STRASSEN_MAX_RECURSION, 0, tmp_ptr); -} - // fp32 conv winograd void ConvWinogardFp32(float *input_data, float *trans_weight, const float *bias_data, TmpBufferAddress *buffer_list, int task_id, ConvParameter *conv_param, InputTransformUnitFunc input_trans_func, @@ -270,38 +270,46 @@ void ConvWinogardFp32(float *input_data, float *trans_weight, const float *bias_ int out_w_block = UP_DIV(conv_param->output_w_, out_unit); int out_h_block = UP_DIV(conv_param->output_h_, out_unit); int output_count = out_w_block * out_h_block; - int output_tile_count = UP_DIV(output_count, TILE_NUM); + int output_tile_count = UP_DIV(output_count, C12NUM); int out_channel = conv_param->output_channel_; int oc4 = UP_DIV(out_channel, C4NUM); + int oc8 = UP_DIV(out_channel, C8NUM); int input_unit_square = input_unit * input_unit; - size_t output_offset = oc4 * C4NUM * input_unit_square * sizeof(float); float *trans_input = buffer_list[0]; float *gemm_out = buffer_list[1]; float *tmp_out_data = buffer_list[2]; 
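One subtle fix in the ConvFp32 hunk above: the residual-tile branch now writes to tmp_out_block + task_id * TILE_NUM * out_channel instead of the shared base pointer, so tasks running in parallel get disjoint scratch regions. A small sketch of that partitioning idea; the TILE_NUM and thread_num values here are illustrative only:

```c
#include <stdlib.h>

/* Illustration of per-task scratch partitioning: one allocation sized
 * thread_num * TILE_NUM * out_channel, and each task writes only to its
 * own slice, so no two threads touch the same bytes. In the kernel, each
 * worker passes its slice to gemm_func and memcpy's from it afterwards. */
enum { TILE_NUM = 8 }; /* example value */

static float *task_scratch(float *tmp_out_block, int task_id, int out_channel) {
  return tmp_out_block + (size_t)task_id * TILE_NUM * out_channel;
}

int main(void) {
  int thread_num = 4, out_channel = 16;
  float *tmp = (float *)calloc((size_t)thread_num * TILE_NUM * out_channel,
                               sizeof(float));
  if (tmp == NULL) return 1;
  for (int task_id = 0; task_id < thread_num; ++task_id) {
    float *slice = task_scratch(tmp, task_id, out_channel);
    slice[0] = (float)task_id; /* disjoint writes, no data race */
  }
  free(tmp);
  return 0;
}
```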
float *tmp_data = buffer_list[3]; - int trans_input_offset = TILE_NUM * input_unit_square * ic4 * C4NUM; - int gemm_out_offset = TILE_NUM * input_unit_square * oc4 * C4NUM; + float *col_buffer = buffer_list[4]; + int trans_input_offset = C12NUM * input_unit_square * ic4 * C4NUM; + int gemm_out_offset = C12NUM * input_unit_square * oc8 * C8NUM; int tmp_data_offset = input_unit_square * C4NUM; + int col_buffer_offset = C12NUM * ic4 * C4NUM; // step 1 : filter transform (pre-processed offline) // step 2 : input transform (online) for (int b = 0; b < in_batch; b++) { int in_batch_offset = b * ic4 * C4NUM * conv_param->input_h_ * conv_param->input_w_; int tmp_out_batch_offset = b * out_w_block * out_h_block * out_unit * out_unit * oc4 * C4NUM; for (int thread_id = task_id; thread_id < output_tile_count; thread_id += thread_num) { - int out_tile_index = thread_id * TILE_NUM; - int cal_num = output_count - thread_id * TILE_NUM; - cal_num = cal_num > TILE_NUM ? TILE_NUM : cal_num; + int out_tile_index = thread_id * C12NUM; + int cal_num = output_count - thread_id * C12NUM; + cal_num = cal_num > C12NUM ? C12NUM : cal_num; WinogradInputTransform(input_data + in_batch_offset, trans_input + task_id * trans_input_offset, tmp_data + task_id * tmp_data_offset, cal_num, out_tile_index, out_w_block, conv_param, input_trans_func); // step 3 : gemm - gemm_func(gemm_out + task_id * gemm_out_offset, trans_input + task_id * trans_input_offset, trans_weight, NULL, - input_unit_square, ic4, oc4 * C4NUM, output_offset, 1, 1, 0, 0); + float *src_ptr = trans_input + task_id * trans_input_offset; + float *dst_ptr = gemm_out + task_id * gemm_out_offset; + float *tmp_col_ptr = col_buffer + task_id * col_buffer_offset; + for (int i = 0; i < input_unit_square; ++i) { + RowMajor2Col12Major(src_ptr + i * C12NUM * ic4 * C4NUM, tmp_col_ptr, C12NUM, ic4 * C4NUM); + MatMulOpt(tmp_col_ptr, trans_weight + i * ic4 * C4NUM * oc8 * C8NUM, dst_ptr + i * C8NUM, NULL, 0, ic4 * C4NUM, + cal_num, oc8 * C8NUM, input_unit_square, 2); + } // step 4 : output transform - WinogradOutputTransform(gemm_out + task_id * gemm_out_offset, tmp_out_data + tmp_out_batch_offset, bias_data, - cal_num, out_tile_index, out_w_block, conv_param, output_trans_func); + WinogradOutputTransform(dst_ptr, tmp_out_data + tmp_out_batch_offset, bias_data, cal_num, out_tile_index, + out_w_block, conv_param, output_trans_func); } } } @@ -442,24 +450,28 @@ void UnPackWinogradRelu6Output(const float *src, float *dst, int batch, int heig } // fp32 conv3x3 -void Conv3x3Fp32(float *input_data, float *transed_weight, const float *bias_data, float *output_data, - TmpBufferAddress *buffer_list, int task_id, ConvParameter *conv_param, GEMM_FUNC_FP32 gemm_func) { +void Conv3x3Fp32(float *input_data, float *transed_weight, const float *bias_data, TmpBufferAddress *buffer_list, + int task_id, ConvParameter *conv_param, GEMM_FUNC_FP32 gemm_func) { int thread_count = conv_param->thread_num_; int ic4 = UP_DIV(conv_param->input_channel_, C4NUM); int output_channel = conv_param->output_channel_; int oc4 = UP_DIV(output_channel, C4NUM); + int oc8 = UP_DIV(output_channel, C8NUM); int out_w_block = UP_DIV(conv_param->output_w_, OUPUT_UNIT); int out_h_block = UP_DIV(conv_param->output_h_, OUPUT_UNIT); int output_count = out_w_block * out_h_block; - int output_tile_count = UP_DIV(output_count, TILE_NUM); + int output_tile_count = UP_DIV(output_count, C12NUM); const int input_unit_square = 4 * 4; + float *tile_buffer = buffer_list[0]; float *block_unit_buffer = buffer_list[1]; float 
*tmp_dst_buffer = buffer_list[2]; float *nc4hw4_out = buffer_list[3]; - int tile_buffer_offset = TILE_NUM * input_unit_square * ic4 * C4NUM; + float *col_buffer = buffer_list[4]; + int tile_buffer_offset = C12NUM * input_unit_square * ic4 * C4NUM; int block_unit_buffer_offset = input_unit_square * C4NUM; - int tmp_dst_buffer_offset = TILE_NUM * input_unit_square * oc4 * C4NUM; + int tmp_dst_buffer_offset = C12NUM * input_unit_square * oc8 * C8NUM; + int col_buffer_offset = C12NUM * ic4 * C4NUM; int input_batch = conv_param->input_batch_; for (int batch = 0; batch < input_batch; batch++) { @@ -467,18 +479,22 @@ void Conv3x3Fp32(float *input_data, float *transed_weight, const float *bias_dat int nc4hw4_buffer_offset = batch * oc4 * C4NUM * conv_param->output_h_ * conv_param->output_w_; for (int thread_id = task_id; thread_id < output_tile_count; thread_id += thread_count) { - int start_index = thread_id * TILE_NUM; - int real_cal_num = (output_count - start_index) < TILE_NUM ? (output_count - start_index) : TILE_NUM; + int start_index = thread_id * C12NUM; + int real_cal_num = (output_count - start_index) < C12NUM ? (output_count - start_index) : C12NUM; Conv3x3Fp32InputTransform(input_data + in_batch_offset, tile_buffer + task_id * tile_buffer_offset, block_unit_buffer + task_id * block_unit_buffer_offset, start_index, real_cal_num, out_w_block, conv_param); - gemm_func(tmp_dst_buffer + task_id * tmp_dst_buffer_offset, tile_buffer + task_id * tile_buffer_offset, - transed_weight, NULL, input_unit_square, ic4, oc4 * C4NUM, - oc4 * C4NUM * input_unit_square * sizeof(float), 1, 1, 0, 0); - - Conv3x3Fp32OutputTransform(tmp_dst_buffer + task_id * tmp_dst_buffer_offset, nc4hw4_out + nc4hw4_buffer_offset, - bias_data, start_index, real_cal_num, out_w_block, conv_param); + float *src_ptr = tile_buffer + task_id * tile_buffer_offset; + float *tmp_col_ptr = col_buffer + task_id * col_buffer_offset; + float *dst_ptr = tmp_dst_buffer + task_id * tmp_dst_buffer_offset; + for (int i = 0; i < input_unit_square; ++i) { + RowMajor2Col12Major(src_ptr + i * C12NUM * ic4 * C4NUM, tmp_col_ptr, C12NUM, ic4 * C4NUM); + MatMulOpt(tmp_col_ptr, transed_weight + i * ic4 * C4NUM * oc8 * C8NUM, dst_ptr + i * C8NUM, NULL, 0, + ic4 * C4NUM, real_cal_num, oc8 * C8NUM, input_unit_square, 2); + } + Conv3x3Fp32OutputTransform(dst_ptr, nc4hw4_out + nc4hw4_buffer_offset, bias_data, start_index, real_cal_num, + out_w_block, conv_param); } } } diff --git a/mindspore/lite/nnacl/fp32/conv.h b/mindspore/lite/nnacl/fp32/conv.h index f2d20c3b52..3ea865d6b2 100644 --- a/mindspore/lite/nnacl/fp32/conv.h +++ b/mindspore/lite/nnacl/fp32/conv.h @@ -24,7 +24,6 @@ #include "nnacl/op_base.h" #include "nnacl/common_func.h" #include "nnacl/conv_parameter.h" -#include "nnacl/fp32/strassen_matmul.h" #include "nnacl/winograd_utils.h" #include "nnacl/fp32/conv_depthwise.h" @@ -52,10 +51,6 @@ void ConvFp32(float *input_data, float *packed_input, float *packed_weight, cons float *tmp_out_block, float *output_data, int task_id, ConvParameter *conv_param, GEMM_FUNC_FP32 gemm_func); -// fp32 conv1x1 strassen matmul -int Conv1x1Fp32(const float *input_data, const float *weight_data, float *output_data, float *tmp_ptr, - StrassenMatMulParameter matmul_param); - // fp32 convolution winograd void ConvWinogardFp32(float *input_data, float *trans_weight, const float *bias_data, TmpBufferAddress *buffer_list, int task_id, ConvParameter *conv_param, InputTransformUnitFunc input_trans_func, @@ -70,8 +65,8 @@ void UnPackWinogradRelu6Output(const float *src, 
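ConvWinogardFp32 and Conv3x3Fp32 now share one inner pattern: per input-transform unit, repack a C12NUM-row tile with RowMajor2Col12Major into col_buffer, then run a single MatMulOpt into a C8-major destination. That is why the per-thread scratch strides moved from TILE_NUM/oc4 multiples to C12NUM/oc8 multiples. A sketch that just evaluates those strides for a hypothetical shape, to make the arithmetic concrete:

```c
#include <stdio.h>

#define C4NUM 4
#define C8NUM 8
#define C12NUM 12
#define UP_DIV(x, y) (((x) + (y) - 1) / (y))

/* Per-thread buffer strides behind the new Conv3x3Fp32/winograd path,
 * computed for a hypothetical 4x4 input unit with ic = 17, oc = 21.
 * Each thread owns one stride-sized slice of every buffer (offset =
 * stride * task_id), mirroring the *_offset values in the diff. */
int main(void) {
  int input_unit_square = 4 * 4;
  int ic = 17, oc = 21;
  int ic4 = UP_DIV(ic, C4NUM);
  int oc8 = UP_DIV(oc, C8NUM);
  int tile_buffer_stride = C12NUM * input_unit_square * ic4 * C4NUM;
  int tmp_dst_stride = C12NUM * input_unit_square * oc8 * C8NUM;
  int col_buffer_stride = C12NUM * ic4 * C4NUM; /* one 12-row panel */
  printf("tile=%d dst=%d col=%d\n", tile_buffer_stride, tmp_dst_stride,
         col_buffer_stride);
  /* Inner loop shape: for each of the 16 units, pack 12 rows of
   * ic4 * C4NUM columns into col_buffer, then one MatMulOpt into
   * dst + i * C8NUM. */
  return 0;
}
```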
float *dst, int batch, int heig int output_unit); // fp32 conv3x3 -void Conv3x3Fp32(float *input_data, float *transed_weight, const float *bias_data, float *output_data, - TmpBufferAddress *buffer_list, int task_id, ConvParameter *conv_param, GEMM_FUNC_FP32 gemm_func); +void Conv3x3Fp32(float *input_data, float *transed_weight, const float *bias_data, TmpBufferAddress *buffer_list, + int task_id, ConvParameter *conv_param, GEMM_FUNC_FP32 gemm_func); #ifdef __cplusplus } #endif diff --git a/mindspore/lite/nnacl/fp32/conv_depthwise.c b/mindspore/lite/nnacl/fp32/conv_depthwise.c index c71e135ba7..b80166190d 100644 --- a/mindspore/lite/nnacl/fp32/conv_depthwise.c +++ b/mindspore/lite/nnacl/fp32/conv_depthwise.c @@ -38,13 +38,15 @@ void ConvDw(float *output_data, const float *input_data, const float *weight_dat int h_step = UP_DIV(conv_param->output_h_, conv_param->thread_num_); int h_start = h_step * task_id; int h_end = MSMIN(h_start + h_step, conv_param->output_h_); + bool relu = conv_param->act_type_ == ActType_Relu; + bool relu6 = conv_param->act_type_ == ActType_Relu6; for (int b = 0; b < conv_param->output_batch_; b++) { const float *src = input_data + b * conv_param->input_h_ * conv_param->input_w_ * conv_param->input_channel_; float *dst = output_data + b * conv_param->output_h_ * conv_param->output_w_ * conv_param->output_channel_; for (int oh = h_start; oh < h_end; oh++) { float *dst_data = dst + oh * conv_param->output_w_ * conv_param->output_channel_; - int ih_origin = oh * conv_param->stride_h_ - conv_param->pad_h_; + int ih_origin = oh * conv_param->stride_h_ - conv_param->pad_u_; int start_kh = MSMAX(0, UP_DIV(-ih_origin, conv_param->dilation_h_)); int end_kh = MSMIN(conv_param->kernel_h_, UP_DIV(conv_param->input_h_ - ih_origin, conv_param->dilation_h_)); @@ -60,13 +62,13 @@ void ConvDw(float *output_data, const float *input_data, const float *weight_dat int in_sw_step = conv_param->stride_w_ * conv_param->input_channel_; for (int kw = 0; kw < conv_param->kernel_w_; kw++) { int out_w_start = MSMAX( - 0, (conv_param->pad_w_ - conv_param->dilation_w_ * kw + conv_param->stride_w_ - 1) / conv_param->stride_w_); - int out_w_end = MSMIN(conv_param->output_w_, (conv_param->input_w_ + conv_param->pad_w_ - + 0, (conv_param->pad_l_ - conv_param->dilation_w_ * kw + conv_param->stride_w_ - 1) / conv_param->stride_w_); + int out_w_end = MSMIN(conv_param->output_w_, (conv_param->input_w_ + conv_param->pad_l_ - conv_param->dilation_w_ * kw + conv_param->stride_w_ - 1) / conv_param->stride_w_); float *dst_w = dst_data + out_w_start * conv_param->output_channel_; - int iw_origin = (out_w_start * conv_param->stride_w_) - conv_param->pad_w_ + conv_param->dilation_w_ * kw; + int iw_origin = (out_w_start * conv_param->stride_w_) - conv_param->pad_l_ + conv_param->dilation_w_ * kw; const float *src_kw = src_kh + iw_origin * conv_param->input_channel_; int num_pixels = out_w_end - out_w_start; @@ -75,10 +77,10 @@ void ConvDw(float *output_data, const float *input_data, const float *weight_dat weight_kh += conv_param->output_channel_; } } - if (conv_param->is_relu_) { + if (relu) { ReluFp32(dst_data, dst_data, conv_param->output_w_ * conv_param->output_channel_); } - if (conv_param->is_relu6_) { + if (relu6) { Relu6Fp32(dst_data, dst_data, conv_param->output_w_ * conv_param->output_channel_); } } @@ -91,16 +93,16 @@ void InitSlidingParam(SlidingWindowParam *sliding, const ConvParameter *conv_par int top = 0; int bottom = conv_param->output_h_; - for (; left * conv_param->stride_w_ < 
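The clamping idiom repeated throughout these kernels, start_kh = MSMAX(0, UP_DIV(-ih, dilation_h)) and end_kh = MSMIN(kernel_h, UP_DIV(input_h - ih, dilation_h)), selects exactly the kernel taps with 0 <= ih + kh * dilation_h < input_h. A brute-force check of that equivalence, relying (as the kernels do) on C's truncation toward zero when UP_DIV sees a negative numerator:

```c
#include <assert.h>
#include <stdio.h>

#define MSMAX(a, b) ((a) > (b) ? (a) : (b))
#define MSMIN(a, b) ((a) < (b) ? (a) : (b))
#define UP_DIV(x, y) (((x) + (y) - 1) / (y))

/* Given the top input coordinate ih (already shifted by -pad_u_), keep
 * only kernel taps kh whose sample 0 <= ih + kh * dilation < input_h.
 * Verify the closed form against an explicit validity test. */
int main(void) {
  int kernel_h = 5, input_h = 9;
  for (int dilation = 1; dilation <= 3; ++dilation) {
    for (int ih = -8; ih <= 12; ++ih) {
      int start_kh = MSMAX(0, UP_DIV(-ih, dilation));
      int end_kh = MSMIN(kernel_h, UP_DIV(input_h - ih, dilation));
      for (int kh = 0; kh < kernel_h; ++kh) {
        int in = ih + kh * dilation;
        int valid = (in >= 0 && in < input_h);
        int in_range = (kh >= start_kh && kh < end_kh);
        assert(valid == in_range);
      }
    }
  }
  printf("clamp formula matches brute force\n");
  return 0;
}
```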
conv_param->pad_w_; left++) { + for (; left * conv_param->stride_w_ < conv_param->pad_l_; left++) { } - for (; (right - 1) * conv_param->stride_w_ - conv_param->pad_w_ + conv_param->kernel_w_ * conv_param->dilation_w_ > + for (; (right - 1) * conv_param->stride_w_ - conv_param->pad_l_ + conv_param->kernel_w_ * conv_param->dilation_w_ > conv_param->input_w_ && right > left; right--) { } - for (; top * conv_param->stride_h_ < conv_param->pad_h_; top++) { + for (; top * conv_param->stride_h_ < conv_param->pad_u_; top++) { } - for (; (bottom - 1) * conv_param->stride_h_ - conv_param->pad_h_ + conv_param->kernel_h_ * conv_param->dilation_h_ > + for (; (bottom - 1) * conv_param->stride_h_ - conv_param->pad_u_ + conv_param->kernel_h_ * conv_param->dilation_h_ > conv_param->input_h_ && bottom > top; bottom--) { @@ -181,16 +183,18 @@ void DepthwiseBorderPixel(float *dst, const float *src, const float *weight, con void DepthwiseBorder(float *dst, const float *src, const float *weight, const float *bias, int top, int bottom, int left, int right, const ConvParameter *conv_param, const SlidingWindowParam *sliding) { + bool relu = conv_param->act_type_ == ActType_Relu; + bool relu6 = conv_param->act_type_ == ActType_Relu6; float *dst_h = dst + top * sliding->out_h_step_; for (int oh = top; oh < bottom; oh++) { - int ih = oh * conv_param->stride_h_ - conv_param->pad_h_; + int ih = oh * conv_param->stride_h_ - conv_param->pad_u_; int start_kh = MSMAX(0, UP_DIV(-ih, conv_param->dilation_h_)); int end_kh = MSMIN(conv_param->kernel_h_, UP_DIV(conv_param->input_h_ - ih, conv_param->dilation_h_)); const float *src_h = src + ih * sliding->in_h_step_; float *dst_kernel = dst_h + left * sliding->block_channel_; for (int ow = left; ow < right; ow++) { - int iw = ow * conv_param->stride_w_ - conv_param->pad_w_; + int iw = ow * conv_param->stride_w_ - conv_param->pad_l_; int start_kw = MSMAX(0, UP_DIV(-iw, conv_param->dilation_w_)); int end_kw = MSMIN(conv_param->kernel_w_, UP_DIV(conv_param->input_w_ - iw, conv_param->dilation_w_)); const float *src_w = src_h + iw * sliding->block_channel_; @@ -201,11 +205,10 @@ void DepthwiseBorder(float *dst, const float *src, const float *weight, const fl #ifdef ENABLE_ARM64 ConvDwFp32Border(dst_kernel, src_kernel, weight_kernel, bias, end_kh - start_kh, end_kw - start_kw, sliding->in_kh_step_ * sizeof(float), sliding->in_kw_step_ * sizeof(float), - conv_param->kernel_w_ * C4NUM * sizeof(float), conv_param->is_relu_, conv_param->is_relu6_); + conv_param->kernel_w_ * C4NUM * sizeof(float), relu, relu6); #else DepthwiseBorderPixel(dst_kernel, src_kernel, weight_kernel, bias, end_kh - start_kh, end_kw - start_kw, - sliding->in_kh_step_, sliding->in_kw_step_, conv_param->kernel_w_ * C4NUM, - conv_param->is_relu_, conv_param->is_relu6_); + sliding->in_kh_step_, sliding->in_kw_step_, conv_param->kernel_w_ * C4NUM, relu, relu6); #endif dst_kernel += sliding->block_channel_; } // width loop @@ -259,6 +262,8 @@ void DepthwiseCenter(float *dst, const float *src, const float *weight, const fl // conv depthwise fp32: sliding window void ConvDwC4Fp32(float *output_data, const float *input_data, const float *weight_data, const float *bias_data, const ConvParameter *conv_param, const SlidingWindowParam *sliding, int task_id) { + bool relu = conv_param->act_type_ == ActType_Relu; + bool relu6 = conv_param->act_type_ == ActType_Relu6; const float *src = input_data; float *dst = output_data; for (int b = 0; b < conv_param->output_batch_; b++) { @@ -277,8 +282,8 @@ void ConvDwC4Fp32(float 
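InitSlidingParam still derives the border columns by counting loops; for the left border the loop is equivalent to the closed form UP_DIV(pad_l_, stride_w_), and top is the analogous expression in h. A quick check of that equivalence:

```c
#include <assert.h>
#include <stdio.h>

#define UP_DIV(x, y) (((x) + (y) - 1) / (y))

/* InitSlidingParam finds the first output column whose window no longer
 * pokes into the left padding by counting upward. For pad_l >= 0 that
 * is just ceil(pad_l / stride_w). */
int main(void) {
  for (int stride_w = 1; stride_w <= 4; ++stride_w) {
    for (int pad_l = 0; pad_l <= 10; ++pad_l) {
      int left = 0;
      for (; left * stride_w < pad_l; left++) {
      }
      assert(left == UP_DIV(pad_l, stride_w));
    }
  }
  printf("left border == UP_DIV(pad_l, stride_w)\n");
  return 0;
}
```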
*output_data, const float *input_data, const float *weig conv_param->output_w_, conv_param, sliding); if (sliding->right_ > sliding->left_ && sliding->bottom_ > sliding->top_) { - int in_h_start = sliding->top_ * conv_param->stride_h_ - conv_param->pad_h_; - int in_w_start = sliding->left_ * conv_param->stride_w_ - conv_param->pad_w_; + int in_h_start = sliding->top_ * conv_param->stride_h_ - conv_param->pad_u_; + int in_w_start = sliding->left_ * conv_param->stride_w_ - conv_param->pad_l_; const float *in_t = src_data + in_h_start * sliding->in_h_step_ + in_w_start * sliding->block_channel_; float *out_t = dst_data + sliding->top_ * sliding->out_h_step_ + sliding->left_ * sliding->block_channel_; #ifdef ENABLE_ARM64 @@ -286,12 +291,12 @@ void ConvDwC4Fp32(float *output_data, const float *input_data, const float *weig conv_param->kernel_h_, conv_param->kernel_w_, sliding->out_h_step_ * sizeof(float), sliding->block_channel_ * sizeof(float), sliding->in_sh_step_ * sizeof(float), sliding->in_sw_step_ * sizeof(float), sliding->in_kh_step_ * sizeof(float), - sliding->in_kw_step_ * sizeof(float), conv_param->is_relu_, conv_param->is_relu6_); + sliding->in_kw_step_ * sizeof(float), relu, relu6); #else DepthwiseCenter(out_t, in_t, weight, bias, sliding->bottom_ - sliding->top_, sliding->right_ - sliding->left_, conv_param->kernel_h_, conv_param->kernel_w_, sliding->out_h_step_, sliding->block_channel_, - sliding->in_sh_step_, sliding->in_sw_step_, sliding->in_kh_step_, sliding->in_kw_step_, - conv_param->is_relu_, conv_param->is_relu6_); + sliding->in_sh_step_, sliding->in_sw_step_, sliding->in_kh_step_, sliding->in_kw_step_, relu, + relu6); #endif } } // output C4 loop @@ -302,399 +307,6 @@ void ConvDwC4Fp32(float *output_data, const float *input_data, const float *weig } /*conv depthwise fp32 end*/ -/*conv depthwise 3x3 fp32 begin*/ -void ConvDw3x3Fp32FilterTrans(float *trans_weight, float *weight, int oc4) { - for (int c = 0; c < oc4; c++) { - float *src = weight + c * C4NUM * 9; - float *dst = trans_weight + c * C4NUM * 16; -#ifdef ENABLE_ARM - float32x4_t g00 = vld1q_f32(src); - float32x4_t g01 = vld1q_f32(src + 4); - float32x4_t g02 = vld1q_f32(src + 2 * 4); - float32x4_t g10 = vld1q_f32(src + 3 * 4); - float32x4_t g11 = vld1q_f32(src + 4 * 4); - float32x4_t g12 = vld1q_f32(src + 5 * 4); - float32x4_t g20 = vld1q_f32(src + 6 * 4); - float32x4_t g21 = vld1q_f32(src + 7 * 4); - float32x4_t g22 = vld1q_f32(src + 8 * 4); - - float32x4_t dst00 = g00; - float32x4_t dst01 = g01; - float32x4_t dst02 = g02; - - float32x4_t dst10 = vaddq_f32(vmulq_n_f32(g00, 0.5), vmulq_n_f32(g10, 0.5)); - dst10 = vaddq_f32(dst10, vmulq_n_f32(g20, 0.5)); - float32x4_t dst11 = vaddq_f32(vmulq_n_f32(g01, 0.5), vmulq_n_f32(g11, 0.5)); - dst11 = vaddq_f32(dst11, vmulq_n_f32(g21, 0.5)); - float32x4_t dst12 = vaddq_f32(vmulq_n_f32(g02, 0.5), vmulq_n_f32(g12, 0.5)); - dst12 = vaddq_f32(dst12, vmulq_n_f32(g22, 0.5)); - - float32x4_t dst20 = vsubq_f32(vmulq_n_f32(g00, 0.5), vmulq_n_f32(g10, 0.5)); - dst20 = vaddq_f32(dst20, vmulq_n_f32(g20, 0.5)); - float32x4_t dst21 = vsubq_f32(vmulq_n_f32(g01, 0.5), vmulq_n_f32(g11, 0.5)); - dst21 = vaddq_f32(dst21, vmulq_n_f32(g21, 0.5)); - float32x4_t dst22 = vsubq_f32(vmulq_n_f32(g02, 0.5), vmulq_n_f32(g12, 0.5)); - dst22 = vaddq_f32(dst22, vmulq_n_f32(g22, 0.5)); - - float32x4_t dst30 = g20; - float32x4_t dst31 = g21; - float32x4_t dst32 = g22; - - float32x4_t m00 = dst00; - float32x4_t m01 = vaddq_f32(vmulq_n_f32(dst00, 0.5), vmulq_n_f32(dst01, 0.5)); - m01 = vaddq_f32(m01, 
vmulq_n_f32(dst02, 0.5)); - float32x4_t m02 = vsubq_f32(vmulq_n_f32(dst00, 0.5), vmulq_n_f32(dst01, 0.5)); - m02 = vaddq_f32(m02, vmulq_n_f32(dst02, 0.5)); - float32x4_t m03 = dst02; - - float32x4_t m10 = dst10; - float32x4_t m11 = vaddq_f32(vmulq_n_f32(dst10, 0.5), vmulq_n_f32(dst11, 0.5)); - m11 = vaddq_f32(m11, vmulq_n_f32(dst12, 0.5)); - float32x4_t m12 = vsubq_f32(vmulq_n_f32(dst10, 0.5), vmulq_n_f32(dst11, 0.5)); - m12 = vaddq_f32(m12, vmulq_n_f32(dst12, 0.5)); - float32x4_t m13 = dst12; - - float32x4_t m20 = dst20; - float32x4_t m21 = vaddq_f32(vmulq_n_f32(dst20, 0.5), vmulq_n_f32(dst21, 0.5)); - m21 = vaddq_f32(m21, vmulq_n_f32(dst22, 0.5)); - float32x4_t m22 = vsubq_f32(vmulq_n_f32(dst20, 0.5), vmulq_n_f32(dst21, 0.5)); - m22 = vaddq_f32(m22, vmulq_n_f32(dst22, 0.5)); - float32x4_t m23 = dst22; - - float32x4_t m30 = dst30; - float32x4_t m31 = vaddq_f32(vmulq_n_f32(dst30, 0.5), vmulq_n_f32(dst31, 0.5)); - m31 = vaddq_f32(m31, vmulq_n_f32(dst32, 0.5)); - float32x4_t m32 = vsubq_f32(vmulq_n_f32(dst30, 0.5), vmulq_n_f32(dst31, 0.5)); - m32 = vaddq_f32(m32, vmulq_n_f32(dst32, 0.5)); - float32x4_t m33 = dst32; - - vst1q_f32(dst, m00); - vst1q_f32(dst + 4, m01); - vst1q_f32(dst + 8, m02); - vst1q_f32(dst + 12, m03); - vst1q_f32(dst + 16, m10); - vst1q_f32(dst + 20, m11); - vst1q_f32(dst + 24, m12); - vst1q_f32(dst + 28, m13); - vst1q_f32(dst + 32, m20); - vst1q_f32(dst + 36, m21); - vst1q_f32(dst + 40, m22); - vst1q_f32(dst + 44, m23); - vst1q_f32(dst + 48, m30); - vst1q_f32(dst + 52, m31); - vst1q_f32(dst + 56, m32); - vst1q_f32(dst + 60, m33); -#else - for (int j = 0; j < C4NUM; j++) { - float *local_ptr = src + j; - float dst00 = local_ptr[0]; - float dst01 = (local_ptr + 4)[0]; - float dst02 = (local_ptr + 8)[0]; - - const float dst10 = 0.5f * local_ptr[0] + 0.5f * (local_ptr + 12)[0] + 0.5f * (local_ptr + 24)[0]; - const float dst11 = 0.5f * (local_ptr + 4)[0] + 0.5f * (local_ptr + 16)[0] + 0.5f * (local_ptr + 28)[0]; - const float dst12 = 0.5f * (local_ptr + 8)[0] + 0.5f * (local_ptr + 20)[0] + 0.5f * (local_ptr + 32)[0]; - - const float dst20 = 0.5f * local_ptr[0] - 0.5f * (local_ptr + 12)[0] + 0.5f * (local_ptr + 24)[0]; - const float dst21 = 0.5f * (local_ptr + 4)[0] - 0.5f * (local_ptr + 16)[0] + 0.5f * (local_ptr + 28)[0]; - const float dst22 = 0.5f * (local_ptr + 8)[0] - 0.5f * (local_ptr + 20)[0] + 0.5f * (local_ptr + 32)[0]; - - float dst30 = (local_ptr + 24)[0]; - float dst31 = (local_ptr + 28)[0]; - float dst32 = (local_ptr + 32)[0]; - - float m00 = dst00; - const float m01 = 0.5f * dst00 + 0.5f * dst01 + 0.5f * dst02; - const float m02 = 0.5f * dst00 - 0.5f * dst01 + 0.5f * dst02; - float m03 = dst02; - - float m10 = dst10; - const float m11 = 0.5f * dst10 + 0.5f * dst11 + 0.5f * dst12; - const float m12 = 0.5f * dst10 - 0.5f * dst11 + 0.5f * dst12; - float m13 = dst12; - - float m20 = dst20; - const float m21 = 0.5f * dst20 + 0.5f * dst21 + 0.5f * dst22; - const float m22 = 0.5f * dst20 - 0.5f * dst21 + 0.5f * dst22; - float m23 = dst22; - - float m30 = dst30; - const float m31 = 0.5f * dst30 + 0.5f * dst31 + 0.5f * dst32; - const float m32 = 0.5f * dst30 - 0.5f * dst31 + 0.5f * dst32; - float m33 = dst32; - - *(dst + j) = m00; - *(dst + j + 4) = m01; - *(dst + j + 8) = m02; - *(dst + j + 12) = m03; - - *(dst + j + 16) = m10; - *(dst + j + 20) = m11; - *(dst + j + 24) = m12; - *(dst + j + 28) = m13; - - *(dst + j + 32) = m20; - *(dst + j + 36) = m21; - *(dst + j + 40) = m22; - *(dst + j + 44) = m23; - - *(dst + j + 48) = m30; - *(dst + j + 52) = m31; - *(dst + j + 56) = 
m32; - *(dst + j + 60) = m33; - } -#endif - } -} - -void ConvDw3x3Fp32InputTrans(const float *input_data, float *trans_input, float *block_buffer, int out_h_block, - int out_w_block, const ConvParameter *conv_param) { - int ic4 = UP_DIV(conv_param->input_channel_, C4NUM); - const int input_unit = 4; - memset(trans_input, 0, out_h_block * out_h_block * 16 * C4NUM * sizeof(float)); - - for (int oh = 0; oh < out_h_block; oh++) { - int ih = oh * 2 - conv_param->pad_h_; - int real_h_start = ih > 0 ? 0 : -ih; - int real_h_end = (ih + input_unit) < conv_param->input_h_ ? input_unit : (conv_param->input_h_ - ih); - for (int ow = 0; ow < out_w_block; ow++) { - int iw = ow * 2 - conv_param->pad_w_; - int real_w_start = iw > 0 ? 0 : -iw; - int real_w_end = (iw + input_unit) < conv_param->input_w_ ? input_unit : (conv_param->input_w_ - iw); - - memset(block_buffer, 0, 16 * C4NUM * sizeof(float)); - int src_plane_offset = ic4 * C4NUM * (ih * conv_param->input_w_ + iw); - for (int h = real_h_start; h < real_h_end; h++) { - int src_h_offset = src_plane_offset + (h * conv_param->input_w_) * ic4 * C4NUM; - int dst_h_offset = (h * input_unit) * C4NUM; - for (int w = real_w_start; w < real_w_end; w++) { - int src_w_offset = src_h_offset + w * ic4 * C4NUM; - int dst_w_offset = dst_h_offset + w * C4NUM; - float *src_addr = (float *)(input_data) + src_w_offset; - float *dst_addr = block_buffer + dst_w_offset; -#ifdef ENABLE_NEON - vst1q_f32(dst_addr, vld1q_f32(src_addr)); -#else - for (int k = 0; k < C4NUM; k++) { - (dst_addr + k)[0] = (src_addr + k)[0]; - } -#endif - } - } - int trans_offset = (oh * out_w_block + ow) * 16 * C4NUM; - Conv3x3Fp32InputUnit(block_buffer, trans_input + trans_offset, C4NUM); - } - } -} - -void ConvDw3x3Fp32Winograd(float *trans_buffer, const float *weight, int out_h_block, int out_w_block) { - const int unit = 4; - for (int oh = 0; oh < out_h_block; oh++) { - float *buf_oh = trans_buffer + oh * out_w_block * 16 * C4NUM; - for (int ow = 0; ow < out_w_block; ow++) { - float *buf_ow = buf_oh + ow * 16 * C4NUM; - for (int kh = 0; kh < unit; kh++) { - float *buf_kh = buf_ow + kh * unit * C4NUM; - const float *weight_kh = weight + kh * unit * C4NUM; - for (int kw = 0; kw < unit; kw++) { - float *buf_kw = buf_kh + kw * C4NUM; - const float *weight_kw = weight_kh + kw * C4NUM; - for (int c = 0; c < C4NUM; c++) { - buf_kw[c] = buf_kw[c] * weight_kw[c]; - } - } - } - } - } -} - -void ConvDw3x3Fp32OutputUnit(float *src_buf, float *dst_output, const float *bias, int channel, int output_w, - bool h_in_range, bool w_in_range, bool is_relu, bool is_relu6) { -#ifdef ENABLE_ARM - float32x4_t bias_ptr = vld1q_f32(bias); - - float32x4_t s00 = vld1q_f32(src_buf); - float32x4_t s01 = vld1q_f32(src_buf + 4); - float32x4_t s02 = vld1q_f32(src_buf + 8); - float32x4_t s03 = vld1q_f32(src_buf + 12); - - float32x4_t s10 = vld1q_f32(src_buf + 16); - float32x4_t s11 = vld1q_f32(src_buf + 20); - float32x4_t s12 = vld1q_f32(src_buf + 24); - float32x4_t s13 = vld1q_f32(src_buf + 28); - - float32x4_t s20 = vld1q_f32(src_buf + 32); - float32x4_t s21 = vld1q_f32(src_buf + 36); - float32x4_t s22 = vld1q_f32(src_buf + 40); - float32x4_t s23 = vld1q_f32(src_buf + 44); - - float32x4_t s30 = vld1q_f32(src_buf + 48); - float32x4_t s31 = vld1q_f32(src_buf + 52); - float32x4_t s32 = vld1q_f32(src_buf + 56); - float32x4_t s33 = vld1q_f32(src_buf + 60); - - float32x4_t t00 = vaddq_f32(vaddq_f32(s00, s10), s20); - float32x4_t t01 = vaddq_f32(vaddq_f32(s01, s11), s21); - float32x4_t t02 = vaddq_f32(vaddq_f32(s02, s12), s22); - 
float32x4_t t03 = vaddq_f32(vaddq_f32(s03, s13), s23); - - float32x4_t t10 = vsubq_f32(vsubq_f32(s10, s20), s30); - float32x4_t t11 = vsubq_f32(vsubq_f32(s11, s21), s31); - float32x4_t t12 = vsubq_f32(vsubq_f32(s12, s22), s32); - float32x4_t t13 = vsubq_f32(vsubq_f32(s13, s23), s33); - - float32x4_t d00 = vaddq_f32(vaddq_f32(vaddq_f32(t00, t01), t02), bias_ptr); - float32x4_t d01 = vaddq_f32(vsubq_f32(vsubq_f32(t01, t02), t03), bias_ptr); - float32x4_t d10 = vaddq_f32(vaddq_f32(vaddq_f32(t10, t11), t12), bias_ptr); - float32x4_t d11 = vaddq_f32(vsubq_f32(vsubq_f32(t11, t12), t13), bias_ptr); - - float32x4_t zeros = {0, 0, 0, 0}; - float32x4_t bounds = {6, 6, 6, 6}; - if (is_relu) { - d00 = vmaxq_f32(d00, zeros); - d01 = vmaxq_f32(d01, zeros); - d10 = vmaxq_f32(d10, zeros); - d11 = vmaxq_f32(d11, zeros); - } - if (is_relu6) { - d00 = vminq_f32(vmaxq_f32(d00, zeros), bounds); - d01 = vminq_f32(vmaxq_f32(d01, zeros), bounds); - d10 = vminq_f32(vmaxq_f32(d10, zeros), bounds); - d11 = vminq_f32(vmaxq_f32(d11, zeros), bounds); - } - - vst1q_f32(dst_output, d00); - if (w_in_range) { - vst1q_f32(dst_output + channel, d01); - } - if (h_in_range) { - vst1q_f32(dst_output + output_w * channel, d10); - if (w_in_range) { - vst1q_f32(dst_output + output_w * channel + channel, d11); - } - } -#else - for (int i = 0; i < C4NUM; i++) { - const float *local_ptr = src_buf + i; - const float *bias_ptr = bias + i; - - float s00 = local_ptr[0]; - float s01 = (local_ptr + 4)[0]; - float s02 = (local_ptr + 8)[0]; - float s03 = (local_ptr + 12)[0]; - - float s10 = (local_ptr + 16)[0]; - float s11 = (local_ptr + 20)[0]; - float s12 = (local_ptr + 24)[0]; - float s13 = (local_ptr + 28)[0]; - - float s20 = (local_ptr + 32)[0]; - float s21 = (local_ptr + 36)[0]; - float s22 = (local_ptr + 40)[0]; - float s23 = (local_ptr + 44)[0]; - - float s30 = (local_ptr + 48)[0]; - float s31 = (local_ptr + 52)[0]; - float s32 = (local_ptr + 56)[0]; - float s33 = (local_ptr + 60)[0]; - - float t00 = s00 + s10 + s20; - float t01 = s01 + s11 + s21; - float t02 = s02 + s12 + s22; - float t03 = s03 + s13 + s23; - - float t10 = s10 - s20 - s30; - float t11 = s11 - s21 - s31; - float t12 = s12 - s22 - s32; - float t13 = s13 - s23 - s33; - - float d00 = t00 + t01 + t02 + bias_ptr[0]; - float d01 = t01 - t02 - t03 + bias_ptr[0]; - float d10 = t10 + t11 + t12 + bias_ptr[0]; - float d11 = t11 - t12 - t13 + bias_ptr[0]; - - if (is_relu) { - d00 = MSMAX(d00, 0); - d01 = MSMAX(d01, 0); - d10 = MSMAX(d10, 0); - d11 = MSMAX(d11, 0); - } - if (is_relu6) { - d00 = MSMIN(MSMAX(d00, 0), 6); - d01 = MSMIN(MSMAX(d01, 0), 6); - d10 = MSMIN(MSMAX(d10, 0), 6); - d11 = MSMIN(MSMAX(d11, 0), 6); - } - - (dst_output + i)[0] = d00; - if (w_in_range) { - (dst_output + i + channel)[0] = d01; - } - if (h_in_range) { - (dst_output + i + output_w * channel)[0] = d10; - if (w_in_range) { - (dst_output + i + output_w * channel + channel)[0] = d11; - } - } - } -#endif -} - -void ConvDw3x3Fp32OutputTrans(float *trans_buffer, float *output_data, const float *bias, int out_h_block, - int out_w_block, const ConvParameter *conv_param) { - int oc4 = UP_DIV(conv_param->output_channel_, C4NUM); - bool h_in_range = true; - for (int oh = 0; oh < out_h_block; oh++) { - const int real_oh = 2 * oh; - if ((oh + 1) * 2 > conv_param->output_h_) { - h_in_range = false; - } - bool w_in_range = true; - float *buf_oh = trans_buffer + oh * out_w_block * 16 * C4NUM; - float *output_oh = output_data + real_oh * conv_param->output_w_ * oc4 * C4NUM; - - for (int ow = 0; ow < out_w_block; ow++) 
{ - const int real_ow = 2 * ow; - if ((ow + 1) * 2 > conv_param->output_w_) { - w_in_range = false; - } - float *buf_ow = buf_oh + ow * 16 * C4NUM; - float *output_ow = output_oh + real_ow * oc4 * C4NUM; - - ConvDw3x3Fp32OutputUnit(buf_ow, output_ow, bias, oc4 * C4NUM, conv_param->output_w_, h_in_range, w_in_range, - conv_param->is_relu_, conv_param->is_relu6_); - } - } -} - -void ConvDw3x3Fp32(float *output_data, const float *input_data, const float *weight_data, const float *bias_data, - float *trans_buffer, float *block_buffer, const ConvParameter *conv_param, int task_id) { - int thread_count = conv_param->thread_num_; - int output_channel = conv_param->output_channel_; - int oc4 = UP_DIV(output_channel, C4NUM); - int out_h_block = UP_DIV(conv_param->output_h_, 2); - int out_w_block = UP_DIV(conv_param->output_w_, 2); - - int input_batch = conv_param->input_batch_; - for (int batch = 0; batch < input_batch; batch++) { - const float *input = input_data + batch * conv_param->input_h_ * conv_param->input_w_ * - UP_DIV(conv_param->input_channel_, C4NUM) * C4NUM; - float *output = output_data + batch * conv_param->output_h_ * conv_param->output_w_ * - UP_DIV(conv_param->output_channel_, C4NUM) * C4NUM; - for (int oc = task_id; oc < oc4; oc += thread_count) { - const float *weight = weight_data + oc * 16 * C4NUM; - const float *bias = bias_data + oc * C4NUM; - - ConvDw3x3Fp32InputTrans(input + oc * C4NUM, trans_buffer, block_buffer, out_h_block, out_w_block, conv_param); - - ConvDw3x3Fp32Winograd(trans_buffer, weight, out_h_block, out_w_block); - - ConvDw3x3Fp32OutputTrans(trans_buffer, output + oc * C4NUM, bias, out_h_block, out_w_block, conv_param); - } - } -} -/*conv depthwise 3x3 fp32 end*/ - /*deconv depthwise fp32 begin*/ void DeconvDepthwiseBorderPixel(float *dst, const float *src, const float *weight, int height, int width, int in_kh_step, int in_kw_step, int kernel_w_step) { @@ -727,14 +339,14 @@ void DeconvDepthwiseBorder(float *dst, const float *src, const float *weight, in const ConvParameter *conv_param, const SlidingWindowParam *sliding) { const float *src_h = src + top * sliding->out_h_step_; for (int ih = top; ih < bottom; ih++) { - int oh = ih * conv_param->stride_h_ - conv_param->pad_h_; + int oh = ih * conv_param->stride_h_ - conv_param->pad_u_; int start_kh = MSMAX(0, UP_DIV(-oh, conv_param->dilation_h_)); int end_kh = MSMIN(conv_param->kernel_h_, UP_DIV(conv_param->output_h_ - oh, conv_param->dilation_h_)); float *dst_h = dst + oh * sliding->in_h_step_; const float *src_kernel = src_h + left * sliding->block_channel_; for (int iw = left; iw < right; iw++) { - int ow = iw * conv_param->stride_w_ - conv_param->pad_w_; + int ow = iw * conv_param->stride_w_ - conv_param->pad_l_; int start_kw = MSMAX(0, UP_DIV(-ow, conv_param->dilation_w_)); int end_kw = MSMIN(conv_param->kernel_w_, UP_DIV(conv_param->output_w_ - ow, conv_param->dilation_w_)); float *dst_w = dst_h + ow * sliding->block_channel_; @@ -790,12 +402,14 @@ void DeconvDepthwiseCenter(float *dst, const float *src, const float *weight, in #endif void DeconvDepthwisePostFunc(float *dst, const float *bias, int block_channel, const ConvParameter *conv_param) { + bool relu = conv_param->act_type_ == ActType_Relu; + bool relu6 = conv_param->act_type_ == ActType_Relu6; float *dst_k = dst; for (int k = 0; k < conv_param->output_h_ * conv_param->output_w_; k++) { for (int c = 0; c < C4NUM; c++) { dst_k[c] += bias[c]; - dst_k[c] = (conv_param->is_relu_) ? 
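The ConvDw3x3Fp32 path deleted above was a Winograd F(2x2,3x3) implementation: the 0.5f coefficients in the filter transform and the s0+s1+s2 / s1-s2-s3 combinations in the output transform are the 1-D G and A^T matrices applied along each axis. A 1-D F(2,3) demo showing where those constants come from, checked against direct convolution:

```c
#include <assert.h>
#include <math.h>
#include <stdio.h>

/* 1-D Winograd F(2,3): 2 outputs of a 3-tap convolution from 4 inputs
 * using 4 multiplies. The removed ConvDw3x3Fp32 code applied exactly
 * these transforms along both axes, hence the 0.5f filter coefficients
 * and the t0 = s0+s1+s2 / t1 = s1-s2-s3 output combine. */
int main(void) {
  const float d[4] = {1.f, -2.f, 3.f, 0.5f}; /* input tile */
  const float g[3] = {0.25f, -1.f, 2.f};     /* filter taps */

  /* filter transform G * g */
  float u0 = g[0];
  float u1 = 0.5f * (g[0] + g[1] + g[2]);
  float u2 = 0.5f * (g[0] - g[1] + g[2]);
  float u3 = g[2];

  /* input transform B^T * d */
  float v0 = d[0] - d[2];
  float v1 = d[1] + d[2];
  float v2 = d[2] - d[1];
  float v3 = d[1] - d[3];

  /* elementwise product, then output transform A^T * m */
  float m0 = u0 * v0, m1 = u1 * v1, m2 = u2 * v2, m3 = u3 * v3;
  float y0 = m0 + m1 + m2;
  float y1 = m1 - m2 - m3;

  /* reference: plain 3-tap convolution */
  float r0 = d[0] * g[0] + d[1] * g[1] + d[2] * g[2];
  float r1 = d[1] * g[0] + d[2] * g[1] + d[3] * g[2];
  assert(fabsf(y0 - r0) < 1e-5f && fabsf(y1 - r1) < 1e-5f);
  printf("F(2,3): y = {%g, %g}\n", y0, y1);
  return 0;
}
```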
(MSMAX(0, dst_k[c])) : (dst_k[c]); - dst_k[c] = (conv_param->is_relu6_) ? (MSMIN(6, MSMAX(0, dst_k[c]))) : (dst_k[c]); + dst_k[c] = (relu) ? (MSMAX(0, dst_k[c])) : (dst_k[c]); + dst_k[c] = (relu6) ? (MSMIN(6, MSMAX(0, dst_k[c]))) : (dst_k[c]); } dst_k += block_channel; } @@ -821,8 +435,8 @@ void DeconvDwC4Fp32(float *output_data, const float *input_data, const float *we conv_param->input_w_, conv_param, sliding); if (sliding->right_ > sliding->left_ && sliding->bottom_ > sliding->top_) { - int oh_h_start = sliding->top_ * conv_param->stride_h_ - conv_param->pad_h_; - int oh_w_start = sliding->left_ * conv_param->stride_w_ - conv_param->pad_w_; + int oh_h_start = sliding->top_ * conv_param->stride_h_ - conv_param->pad_u_; + int oh_w_start = sliding->left_ * conv_param->stride_w_ - conv_param->pad_l_; float *out_t = dst_data + oh_h_start * sliding->in_h_step_ + oh_w_start * sliding->block_channel_; const float *in_t = src_data + sliding->top_ * sliding->out_h_step_ + sliding->left_ * sliding->block_channel_; diff --git a/mindspore/lite/nnacl/fp32/conv_depthwise.h b/mindspore/lite/nnacl/fp32/conv_depthwise.h index 7dd0c96c3c..4edf2105fb 100644 --- a/mindspore/lite/nnacl/fp32/conv_depthwise.h +++ b/mindspore/lite/nnacl/fp32/conv_depthwise.h @@ -48,11 +48,6 @@ void DepthwiseBorder(float *dst, const float *src, const float *weight, const fl void ConvDwC4Fp32(float *output_data, const float *input_data, const float *weight_data, const float *bias_data, const ConvParameter *conv_param, const SlidingWindowParam *sliding, int task_id); -void ConvDw3x3Fp32FilterTrans(float *trans_weight, float *weight, int oc4); - -void ConvDw3x3Fp32(float *output_data, const float *input_data, const float *weight_data, const float *bias_data, - float *trans_buffer, float *block_buffer, const ConvParameter *conv_param, int task_id); - void DeconvDwC4Fp32(float *output_data, const float *input_data, const float *weight_data, const float *bias_data, const ConvParameter *conv_param, const SlidingWindowParam *sliding, int task_id); diff --git a/mindspore/lite/nnacl/fp32/deconv.c b/mindspore/lite/nnacl/fp32/deconv.c index d380a28f7e..b9a78a622c 100644 --- a/mindspore/lite/nnacl/fp32/deconv.c +++ b/mindspore/lite/nnacl/fp32/deconv.c @@ -33,18 +33,18 @@ void PackDeConvWeightFp32(const float *weight, float *dst, int input_channel, in return; } -int DeConvPostFp32C8x8(const float *src, float *tmp, const float *bias, float *dst, int output_channel, - ConvParameter *conv_param) { - /* row8x8-major(ih*iw x oc*kh*kw) -> row8-major(oh*ow x oc) */ +int DeConvPostFp32C12x8(const float *src, float *tmp, const float *bias, float *dst, int output_channel, + ConvParameter *conv_param) { + /* row12x8-major(ih*iw x oc*kh*kw) -> row8-major(oh*ow x oc) */ size_t input_plane = conv_param->input_w_ * conv_param->input_h_; size_t kernel_plane = conv_param->kernel_w_ * conv_param->kernel_h_; size_t output_plane = conv_param->output_w_ * conv_param->output_h_; int oc8 = UP_ROUND(output_channel, C8NUM); - int in_plane8 = UP_ROUND(input_plane, C8NUM); + int in_plane12 = UP_ROUND(input_plane, C12NUM); int src_iw_stride = C8NUM; int src_ih_stride = conv_param->input_w_ * C8NUM; - int src_kw_stride = in_plane8 * C8NUM; - int src_kh_stride = in_plane8 * conv_param->kernel_w_ * C8NUM; + int src_kw_stride = in_plane12 * C8NUM; + int src_kh_stride = in_plane12 * conv_param->kernel_w_ * C8NUM; int dst_oh_stride = conv_param->output_w_ * C8NUM; int dst_ow_stride = C8NUM; int dst_kh_stride = conv_param->dilation_h_ * conv_param->output_w_ * C8NUM; @@ -52,13 
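DeConvPostFp32C12x8 walks input pixels and scatter-adds each one into a kernel-sized output window anchored at oh = ih * stride_h - pad_u, with the same UP_DIV clamps as above. A 1-D model of that accumulation, cross-checked against the gather definition of transposed convolution:

```c
#include <assert.h>
#include <stdio.h>
#include <string.h>

#define MSMAX(a, b) ((a) > (b) ? (a) : (b))
#define MSMIN(a, b) ((a) < (b) ? (a) : (b))
#define UP_DIV(x, y) (((x) + (y) - 1) / (y))

/* Every input position writes into a kernel-sized window of the output
 * anchored at i * stride - pad, with out-of-range taps clipped by the
 * usual UP_DIV start/end formulas. */
int main(void) {
  /* OUT = (IN - 1) * STRIDE - 2 * PAD + DILATION * (K - 1) + 1 */
  enum { IN = 5, K = 3, STRIDE = 2, PAD = 1, DILATION = 1, OUT = 9 };
  const float in[IN] = {1, 2, 3, 4, 5};
  const float w[K] = {0.5f, -1.f, 2.f};
  float out[OUT];
  memset(out, 0, sizeof(out));

  for (int i = 0; i < IN; ++i) {
    int o = i * STRIDE - PAD;
    int k_start = MSMAX(0, UP_DIV(-o, DILATION));
    int k_end = MSMIN(K, UP_DIV(OUT - o, DILATION));
    for (int k = k_start; k < k_end; ++k) {
      out[o + k * DILATION] += in[i] * w[k];
    }
  }

  /* cross-check against the gather form of transposed convolution */
  for (int o = 0; o < OUT; ++o) {
    float ref = 0;
    for (int i = 0; i < IN; ++i) {
      for (int k = 0; k < K; ++k) {
        if (i * STRIDE - PAD + k * DILATION == o) ref += in[i] * w[k];
      }
    }
    assert(out[o] == ref);
  }
  printf("scatter-add matches transposed-conv definition\n");
  return 0;
}
```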
+52,13 @@ int DeConvPostFp32C8x8(const float *src, float *tmp, const float *bias, float *d for (int c = 0; c < oc8; c += 8) { float *dst_ptr = tmp + c * output_plane; - const float *src_ptr = src + c * in_plane8 * kernel_plane; + const float *src_ptr = src + c * in_plane12 * kernel_plane; memset(dst_ptr, 0, output_plane * C8NUM * sizeof(float)); for (int ih = 0; ih < conv_param->input_h_; ih++) { for (int iw = 0; iw < conv_param->input_w_; iw++) { - int oh = ih * conv_param->stride_h_ - conv_param->pad_h_; - int ow = iw * conv_param->stride_w_ - conv_param->pad_w_; + int oh = ih * conv_param->stride_h_ - conv_param->pad_u_; + int ow = iw * conv_param->stride_w_ - conv_param->pad_l_; int kh_start = MSMAX(0, UP_DIV(-oh, conv_param->dilation_h_)); int kh_end = MSMIN(conv_param->kernel_h_, UP_DIV(conv_param->output_h_ - oh, conv_param->dilation_h_)); @@ -97,45 +97,7 @@ int DeConvPostFp32C8x8(const float *src, float *tmp, const float *bias, float *d } /*ih*/ } /*oc8*/ - PostConvFuncFp32C8(tmp, dst, bias, output_channel, output_plane, conv_param->output_channel_, conv_param->is_relu_, - conv_param->is_relu6_); - return NNACL_OK; -} - -int DeConvPostFp32C4(const float *src, float *tmp_c4, float *dst, const float *bias, int output_channel, - int input_plane, int kernel_plane, int output_plane, ConvParameter *conv_param) { - int oc4 = UP_DIV(output_channel, C4NUM); - for (int c = 0; c < oc4; c++) { - float *dst_ptr = tmp_c4 + c * output_plane * C4NUM; - const float *src_ptr = src + c * input_plane * kernel_plane * C4NUM; - memset(dst_ptr, 0, output_plane * C4NUM * sizeof(float)); - - for (int ih = 0; ih < conv_param->input_h_; ih++) { - for (int iw = 0; iw < conv_param->input_w_; iw++) { - int oh = ih * conv_param->stride_h_ - conv_param->pad_h_; - int ow = iw * conv_param->stride_w_ - conv_param->pad_w_; - - int kh_start = MSMAX(0, UP_DIV(-oh, conv_param->dilation_h_)); - int kh_end = MSMIN(conv_param->kernel_h_, UP_DIV(conv_param->output_h_ - oh, conv_param->dilation_h_)); - int kw_start = MSMAX(0, UP_DIV(-ow, conv_param->dilation_w_)); - int kw_end = MSMIN(conv_param->kernel_w_, UP_DIV(conv_param->output_w_ - ow, conv_param->dilation_w_)); - for (int kh = kh_start; kh < kh_end; kh++) { - for (int kw = kw_start; kw < kw_end; kw++) { - int src_index = ih * conv_param->input_w_ * C4NUM + iw * C4NUM + - kh * input_plane * conv_param->kernel_w_ * C4NUM + kw * input_plane * C4NUM; - int dst_index = oh * conv_param->output_w_ * C4NUM + ow * C4NUM + - kh * conv_param->dilation_h_ * conv_param->output_w_ * C4NUM + - kw * conv_param->dilation_w_ * C4NUM; - for (int i = 0; i < C4NUM; i++) { - dst_ptr[dst_index + i] += src_ptr[src_index + i]; - } - } /*kw*/ - } /*kh*/ - } /*iw*/ - } /*ih*/ - } /*oc4*/ - - PostConvFuncFp32C4(tmp_c4, dst, bias, output_channel, output_plane, conv_param->output_channel_, conv_param->is_relu_, - conv_param->is_relu6_); + PostConvFuncFp32C8(tmp, dst, bias, output_channel, output_plane, conv_param->output_channel_, + conv_param->act_type_ == ActType_Relu, conv_param->act_type_ == ActType_Relu6); return NNACL_OK; } diff --git a/mindspore/lite/nnacl/fp32/deconv.h b/mindspore/lite/nnacl/fp32/deconv.h index 601e91e605..65b9b41fbe 100644 --- a/mindspore/lite/nnacl/fp32/deconv.h +++ b/mindspore/lite/nnacl/fp32/deconv.h @@ -16,20 +16,19 @@ #ifndef MINDSPORE_LITE_NNACL_FP32_DECONV_H_ #define MINDSPORE_LITE_NNACL_FP32_DECONV_H_ +#include #include "nnacl/pack.h" #include "nnacl/op_base.h" #include "nnacl/conv_parameter.h" -#include "nnacl/fp32/strassen_matmul.h" +#include 
"nnacl/errorcode.h" +#include "nnacl/fp32/common_func.h" #ifdef __cplusplus extern "C" { #endif void PackDeConvWeightFp32(const float *weight, float *dst, int input_channel, int output_channel, int plane); - -int DeConvPostFp32C4(const float *src, float *tmp_c4, float *dst, const float *bias, int output_channel, - int input_plane, int kernel_plane, int output_plane, ConvParameter *conv_param); -int DeConvPostFp32C8x8(const float *src, float *tmp_out, const float *bias, float *dst, int output_channel, - ConvParameter *conv_param); +int DeConvPostFp32C12x8(const float *src, float *tmp_out, const float *bias, float *dst, int output_channel, + ConvParameter *conv_param); #ifdef __cplusplus } #endif diff --git a/mindspore/lite/nnacl/fp32/gather.h b/mindspore/lite/nnacl/fp32/gather.h index 25438f6bf4..c2ded7f99e 100644 --- a/mindspore/lite/nnacl/fp32/gather.h +++ b/mindspore/lite/nnacl/fp32/gather.h @@ -19,19 +19,13 @@ #include "nnacl/op_base.h" -typedef struct GatherParameter { - OpParameter op_parameter_; - int axis_; - int batchDims_; -} GatherParameter; - #ifdef __cplusplus extern "C" { #endif int Gather(float *input, int outer_size, int inner_size, int limit, int *indices, int indices_element_size, float *output); -int GatherInt32(const int32_t *input, int outer_size, int inner_size, int limit, int *indices, - int indices_element_size, int32_t *output); +int GatherInt32(const int32_t *input, int outer_size, int inner_size, int limit, int *indices, int indices_element_size, + int32_t *output); #ifdef __cplusplus } #endif diff --git a/mindspore/lite/nnacl/fp32/matmul.c b/mindspore/lite/nnacl/fp32/matmul.c index 03c65ef23f..ee9dec656f 100644 --- a/mindspore/lite/nnacl/fp32/matmul.c +++ b/mindspore/lite/nnacl/fp32/matmul.c @@ -28,6 +28,129 @@ void RowMajor2Row8Major(float *src_ptr, float *dst_ptr, int row, int col) { return; } +void RowMajor2Row12Major(float *src_ptr, float *dst_ptr, int row, int col) { + for (int r = 0; r < row; r++) { + float *src = src_ptr + r * col; + for (int c = 0; c < col; c++) { + int cd8 = c / C12NUM; + int cm8 = c % C12NUM; + dst_ptr[cd8 * C12NUM * row + r * C12NUM + cm8] = src[c]; + } + } + return; +} + +void RowMajor2Col12Major(float *src_ptr, float *dst_ptr, size_t row, size_t col) { + size_t row_up_12 = UP_ROUND(row, C12NUM); + size_t row12 = row / C12NUM * C12NUM; + size_t col4 = col / C4NUM * C4NUM; + float *src_r = src_ptr; + float *dst_r = dst_ptr; + + size_t ri = 0; + for (; ri < row12; ri += C12NUM) { + size_t ci = 0; + for (; ci < col4; ci += C4NUM) { + float *src_c = src_r + ci; + float *dst_c = dst_r + ci * C12NUM; + + /* 12x4 row-major to col-major */ +#ifdef ENABLE_ARM64 + size_t stride = col * sizeof(float); + asm volatile( + "mov x10, %[src_c]\n" + "mov x11, %[dst_c]\n" + + "ld1 {v0.4s}, [x10], %[stride]\n" + "ld1 {v1.4s}, [x10], %[stride]\n" + "ld1 {v2.4s}, [x10], %[stride]\n" + "ld1 {v3.4s}, [x10], %[stride]\n" + + "ld1 {v4.4s}, [x10], %[stride]\n" + "ld1 {v5.4s}, [x10], %[stride]\n" + "ld1 {v6.4s}, [x10], %[stride]\n" + "ld1 {v7.4s}, [x10], %[stride]\n" + + "zip1 v12.4s, v0.4s, v1.4s\n" + "zip2 v13.4s, v0.4s, v1.4s\n" + "zip1 v14.4s, v2.4s, v3.4s\n" + "zip2 v15.4s, v2.4s, v3.4s\n" + + "ld1 {v8.4s}, [x10], %[stride]\n" + "ld1 {v9.4s}, [x10], %[stride]\n" + "ld1 {v10.4s}, [x10], %[stride]\n" + "ld1 {v11.4s}, [x10], %[stride]\n" + + "zip1 v16.4s, v4.4s, v5.4s\n" + "zip2 v17.4s, v4.4s, v5.4s\n" + "zip1 v18.4s, v6.4s, v7.4s\n" + "zip2 v19.4s, v6.4s, v7.4s\n" + + "trn1 v20.2d, v12.2d, v14.2d\n" + "trn2 v23.2d, v12.2d, v14.2d\n" + "trn1 v26.2d, v13.2d, 
v15.2d\n" + "trn2 v29.2d, v13.2d, v15.2d\n" + + "trn1 v21.2d, v16.2d, v18.2d\n" + "trn2 v24.2d, v16.2d, v18.2d\n" + "trn1 v27.2d, v17.2d, v19.2d\n" + "trn2 v30.2d, v17.2d, v19.2d\n" + + "zip1 v12.4s, v8.4s, v9.4s\n" + "zip2 v13.4s, v8.4s, v9.4s\n" + "zip1 v14.4s, v10.4s, v11.4s\n" + "zip2 v15.4s, v10.4s, v11.4s\n" + + "trn1 v22.2d, v12.2d, v14.2d\n" + "trn2 v25.2d, v12.2d, v14.2d\n" + "trn1 v28.2d, v13.2d, v15.2d\n" + "trn2 v31.2d, v13.2d, v15.2d\n" + + "st1 {v20.4s, v21.4s, v22.4s, v23.4s}, [x11], #64\n" + "st1 {v24.4s, v25.4s, v26.4s, v27.4s}, [x11], #64\n" + "st1 {v28.4s, v29.4s, v30.4s, v31.4s}, [x11], #64\n" + + : + : [ dst_c ] "r"(dst_c), [ src_c ] "r"(src_c), [ stride ] "r"(stride) + : "x10", "x11", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", + "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", + "v30", "v31"); +#else + for (int tr = 0; tr < C12NUM; tr++) { + for (int tc = 0; tc < C4NUM; tc++) { + dst_c[tc * C12NUM + tr] = src_c[tr * col + tc]; + } + } +#endif + } + for (; ci < col; ci++) { + float *src_c = src_r + ci; + float *dst_c = dst_r + ci * C12NUM; + for (size_t i = 0; i < C12NUM; i++) { + dst_c[i] = src_c[i * col]; + } + } + src_r += C12NUM * col; + dst_r += C12NUM * col; + } + + for (; ri < row; ri++) { + for (size_t i = 0; i < col; i++) { + dst_r[i * C12NUM] = src_r[i]; + } + src_r += col; + dst_r += 1; + } + + for (; ri < row_up_12; ri++) { + for (size_t i = 0; i < col; i++) { + dst_r[i * C12NUM] = 0; + } + dst_r += 1; + } + return; +} + void RowMajor2Col8Major(float *src_ptr, float *dst_ptr, size_t row, size_t col) { size_t row8 = row / C8NUM * C8NUM; size_t col4 = col / C4NUM * C4NUM; @@ -221,18 +344,18 @@ void Row8x8Major2RowMajor(float *src_ptr, float *dst_ptr, size_t row, size_t col return; } -void MatMul8x8(const float *a, const float *b, float *dst, const float *bias, ActType act_type, int deep, int row, - int col, int stride, bool write_nhwc) { - if (write_nhwc) { +void MatMul12x8(const float *a, const float *b, float *dst, const float *bias, ActType act_type, int deep, int row, + int col, int stride, int out_type) { + if (out_type == OutType_Nhwc) { /* col8-major * row8-major => col-major */ for (int r = 0; r < row; r++) { for (int c = 0; c < col; c++) { - int r8div = r / 8, r8mod = r % 8; + int r12div = r / 12, r12mod = r % 12; int c8div = c / 8, c8mod = c % 8; size_t ci = r * stride + c; float value = 0; for (int d = 0; d < deep; d++) { - size_t ai = r8div * deep * 8 + d * 8 + r8mod; + size_t ai = r12div * deep * 12 + d * 12 + r12mod; size_t bi = c8div * deep * 8 + d * 8 + c8mod; value = value + a[ai] * b[bi]; } @@ -242,19 +365,19 @@ void MatMul8x8(const float *a, const float *b, float *dst, const float *bias, Ac dst[ci] = value; } } - } else { - /* col8-major * row8-major => col8x8-major */ + } else if (out_type == OutType_C8) { + /* col8-major * row8-major => col12x8-major */ int col_8 = UP_ROUND(col, C8NUM); - int row_8 = UP_ROUND(row, C8NUM); - for (int r = 0; r < row_8; r++) { + int row_12 = UP_ROUND(row, C12NUM); + for (int r = 0; r < row_12; r++) { for (int c = 0; c < col_8; c++) { - int r8div = r / 8, r8mod = r % 8; - int c8div = c / 8, c8mod = c % 8; - size_t ci = c8div * row_8 * 8 + r * 8 + c8mod; + int r12div = r / C12NUM, r12mod = r % C12NUM; + int c8div = c / C8NUM, c8mod = c % C8NUM; + size_t ci = (c8div * C8NUM * row_12 + r * C8NUM + c8mod); float value = 0; for (int d = 0; d < deep; d++) { - size_t ai = r8div * deep * 8 + d * 8 + r8mod; - 
size_t bi = c8div * deep * 8 + d * 8 + c8mod; + size_t ai = r12div * deep * C12NUM + d * C12NUM + r12mod; + size_t bi = c8div * deep * C8NUM + d * C8NUM + c8mod; value = value + a[ai] * b[bi]; } if (bias != NULL) value += bias[c]; @@ -263,15 +386,39 @@ void MatMul8x8(const float *a, const float *b, float *dst, const float *bias, Ac dst[ci] = value; } } + } else { + for (int i = 0; i < row; ++i) { + int src_r_offset = i; + int dst_r_offset = i * col * stride; + for (int j = 0; j < col; ++j) { + int c8div = j / 8, c8mod = j % 8; + size_t ci = dst_r_offset + c8div * 8 * stride + c8mod; + float value = 0; + for (int d = 0; d < deep; ++d) { + size_t ai = src_r_offset + d * C12NUM; + size_t bi = c8div * deep * 8 + d * 8 + c8mod; + value = value + a[ai] * b[bi]; + } + if (bias != NULL) value += bias[j]; + if (act_type == ActType_Relu6) value = MSMIN(6.0f, value); + if (act_type != ActType_No) value = MSMAX(0.0f, value); + dst[ci] = value; + } + } } return; } -void MatMul(const float *a, const float *b, float *c, const float *bias, ActType act_type, int deep, int row, int col, - int stride, bool write_nhwc) { +void MatMulOpt(const float *a, const float *b, float *c, const float *bias, ActType act_type, int deep, int row, + int col, size_t stride, int out_type) { #ifdef ENABLE_ARM64 - MatmulFloatNeon64(a, b, c, bias, (int)act_type, deep, row, col, stride, write_nhwc); + if (out_type == 2 && row <= 8) { + MatmulFloatNeon64OptRemain(a, b, c, deep, row, col, stride); + } else { + MatmulFloatNeon64Opt(a, b, c, bias, (int)act_type, deep, row, col, stride, (int)(out_type == OutType_Nhwc), + (int)(out_type == OutType_TileC8)); + } #else - MatMul8x8(a, b, c, bias, act_type, deep, row, col, stride, write_nhwc); + MatMul12x8(a, b, c, bias, act_type, deep, row, col, stride, out_type); #endif } diff --git a/mindspore/lite/nnacl/fp32/matmul.h b/mindspore/lite/nnacl/fp32/matmul.h index 7459e426ea..61759b60be 100644 --- a/mindspore/lite/nnacl/fp32/matmul.h +++ b/mindspore/lite/nnacl/fp32/matmul.h @@ -26,14 +26,20 @@ #ifdef __cplusplus extern "C" { #endif -void MatMul(const float *a, const float *b, float *c, const float *bias, ActType act_type, int depth, int row, int col, - int stride, bool write_nhwc); +void MatMulOpt(const float *a, const float *b, float *c, const float *bias, ActType act_type, int deep, int row, + int col, size_t stride, int out_type); + void RowMajor2Row8Major(float *src_ptr, float *dst_ptr, int row, int col); +void RowMajor2Row12Major(float *src_ptr, float *dst_ptr, int row, int col); void RowMajor2Col8Major(float *src_ptr, float *dst_ptr, size_t row, size_t col); +void RowMajor2Col12Major(float *src_ptr, float *dst_ptr, size_t row, size_t col); void Row8x8Major2RowMajor(float *src_ptr, float *dst_ptr, size_t row, size_t col, size_t stride); #ifdef ENABLE_ARM64 void MatmulFloatNeon64(const float *a, const float *b, float *c, const float *bias, int act_type, int depth, int row, int col, size_t stride, bool write_nhwc); +void MatmulFloatNeon64Opt(const float *a, const float *b, float *c, const float *bias, int act_type, int depth, int row, + int col, size_t stride, size_t write_nhwc, size_t write_c4); +void MatmulFloatNeon64OptRemain(const float *a, const float *b, float *c, int depth, int row, int col, size_t stride); #endif #ifdef __cplusplus } diff --git a/mindspore/lite/nnacl/fp32/pooling.c b/mindspore/lite/nnacl/fp32/pooling.c index 543ccfb132..54bc578815 100644 --- a/mindspore/lite/nnacl/fp32/pooling.c +++ b/mindspore/lite/nnacl/fp32/pooling.c @@ -130,8 +130,103 @@ void MaxPooling(const 
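MatMulOpt replaces the old write_nhwc flag with an out_type selector; the diff names OutType_Nhwc, OutType_C8 and OutType_TileC8 but only pins down OutType_TileC8 == 2 (the literal passed from the winograd path and tested in the ARM64 remain-path branch). The sketch below fills in the other values as assumptions:

```c
/* Sketch only: the out_type dispatch implied by MatMulOpt. The enum
 * presumably lives in nnacl/matmul_parameter.h; this diff only fixes
 * OutType_TileC8 == 2, so the other values here are assumptions. */
typedef enum OutType {
  OutType_C8 = 0,     /* assumed: col8x12-major scratch layout */
  OutType_Nhwc = 1,   /* assumed: plain row-major output */
  OutType_TileC8 = 2, /* matches the literal 2 used by the winograd path */
} OutType;
```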
float *input_ptr, float *output_ptr, PoolingParameter *poo int out_plane = output_w * output_h; int out_tile_count = UP_DIV(out_plane, TILE_NUM); int thread_num = pooling_param->thread_num_; + int c4 = UP_DIV(channel, C4NUM); /* oc && ic */ + + for (int batch = 0; batch < output_batch; batch++) { + const float *src_b_ptr = input_ptr + batch * in_h * in_w * channel; + float *dst_b_ptr = output_ptr + batch * output_h * output_w * channel; + for (int thread_id = task_id; thread_id < out_tile_count; thread_id += thread_num) { + int cal_start_index = thread_id * TILE_NUM; + int real_cal_num = (out_plane - cal_start_index) > TILE_NUM ? TILE_NUM : (out_plane - cal_start_index); + for (int i = 0; i < real_cal_num; i++) { + int index = cal_start_index + i; + int out_w_index = index % output_w; + int out_h_index = index / output_w; + int in_w_index = out_w_index * stride_w - pad_w; + int in_h_index = out_h_index * stride_h - pad_h; + + const float *src_plane_ptr = src_b_ptr; + float *dst_plane_ptr = dst_b_ptr + index * channel; + + int real_win_h_start = MSMAX(0, -in_h_index); + int real_win_h_end = MSMIN(win_h, in_h - in_h_index); + int resl_win_w_start = MSMAX(0, -in_w_index); + int real_win_w_end = MSMIN(win_w, in_w - in_w_index); + + for (int ci = 0; ci < c4 - 1; ci++) { + const float *src_c_ptr = src_plane_ptr + ci * C4NUM; + float *dst_c_ptr = dst_plane_ptr + ci * C4NUM; +#ifdef ENABLE_NEON + float32x4_t tmp_max = vdupq_n_f32(-FLT_MAX); +#else + float tmp_max1 = -FLT_MAX; + float tmp_max2 = -FLT_MAX; + float tmp_max3 = -FLT_MAX; + float tmp_max4 = -FLT_MAX; +#endif + + for (int kh = real_win_h_start; kh < real_win_h_end; kh++) { + for (int kw = resl_win_w_start; kw < real_win_w_end; kw++) { + const float *src_win_ptr = src_c_ptr + ((in_h_index + kh) * in_w + in_w_index + kw) * channel; +#ifdef ENABLE_NEON + tmp_max = vmaxq_f32(tmp_max, vld1q_f32(src_win_ptr)); +#else + tmp_max1 = fmax(tmp_max1, src_win_ptr[0]); + tmp_max2 = fmax(tmp_max2, src_win_ptr[1]); + tmp_max3 = fmax(tmp_max3, src_win_ptr[2]); + tmp_max4 = fmax(tmp_max4, src_win_ptr[3]); + +#endif + } // win_w loop + } // win_h loop +#ifdef ENABLE_NEON + vst1q_f32(dst_c_ptr, tmp_max); +#else + dst_c_ptr[0] = tmp_max1; + dst_c_ptr[1] = tmp_max2; + dst_c_ptr[2] = tmp_max3; + dst_c_ptr[3] = tmp_max4; +#endif + } // ic4-1 loop + int channel_s = (c4 - 1) * C4NUM; + for (int ci = channel_s; ci < channel; ci++) { + float *dst_c_ptr = dst_plane_ptr + ci; + const float *src_c_ptr = src_plane_ptr + ci; + float tmp_max = -FLT_MAX; + + for (int kh = real_win_h_start; kh < real_win_h_end; kh++) { + for (int kw = resl_win_w_start; kw < real_win_w_end; kw++) { + const float *src_win_ptr = src_c_ptr + ((in_h_index + kh) * in_w + in_w_index + kw) * channel; + tmp_max = fmax(tmp_max, src_win_ptr[0]); + } // win_w loop + } // win_h loop + dst_c_ptr[0] = tmp_max; + } // channel_res loop + } // real_cal_num loop + } // out_plane loop + } // out_batch loop +} + +void AvgPoolingRelu(const float *input_ptr, float *output_ptr, PoolingParameter *pooling_param, int task_id) { + int stride_w = pooling_param->stride_w_; + int stride_h = pooling_param->stride_h_; + int pad_w = pooling_param->pad_l_; + int pad_h = pooling_param->pad_u_; + int win_w = pooling_param->window_w_; + int win_h = pooling_param->window_h_; + int channel = pooling_param->input_channel_; int c4 = UP_DIV(channel, C4NUM); - // input channel is equal to output channel + int in_w = pooling_param->input_w_; + int in_h = pooling_param->input_h_; + int output_w = pooling_param->output_w_; + int 
output_h = pooling_param->output_h_; + int output_batch = pooling_param->output_batch_; + int out_plane = output_w * output_h; + int out_tile_count = UP_DIV(out_plane, TILE_NUM); + int thread_num = pooling_param->thread_num_; +#ifdef ENABLE_NEON + float32x4_t zeros = vdupq_n_f32(0); +#endif for (int batch = 0; batch < output_batch; batch++) { int in_batch_offset = batch * in_h * in_w * channel; @@ -149,6 +244,121 @@ void MaxPooling(const float *input_ptr, float *output_ptr, PoolingParameter *poo for (int j = 0; j < c4 - 1; j++) { int in_channel_offset = in_batch_offset + j * C4NUM; int out_channel_offset = out_plane_offset + j * C4NUM; +#ifdef ENABLE_NEON + float32x4_t tmp_avg = vdupq_n_f32(0); +#else + float tmp_avg1 = 0; + float tmp_avg2 = 0; + float tmp_avg3 = 0; + float tmp_avg4 = 0; +#endif + int real_count = 0; + for (int h = 0; h < win_h; h++) { + for (int w = 0; w < win_w; w++) { + if ((in_h_index + h) < 0 || (in_h_index + h) >= in_h || (in_w_index + w) < 0 || + (in_w_index + w) >= in_w) { + continue; + } else { + int in_offset = in_channel_offset + ((in_h_index + h) * in_w + in_w_index + w) * channel; +#ifdef ENABLE_NEON + tmp_avg = vaddq_f32(tmp_avg, vld1q_f32(input_ptr + in_offset)); +#else + tmp_avg1 += *(input_ptr + in_offset); + tmp_avg2 += *(input_ptr + in_offset + 1); + tmp_avg3 += *(input_ptr + in_offset + 2); + tmp_avg4 += *(input_ptr + in_offset + 3); +#endif + ++real_count; + } + } // win_w loop + } // win_h loop +#ifdef ENABLE_NEON + tmp_avg = vmaxq_f32(tmp_avg, zeros); + vst1q_f32(output_ptr + out_channel_offset, tmp_avg / vdupq_n_f32(real_count)); +#else + tmp_avg1 = fmax(tmp_avg1, 0); + tmp_avg2 = fmax(tmp_avg2, 0); + tmp_avg3 = fmax(tmp_avg3, 0); + tmp_avg4 = fmax(tmp_avg4, 0); + + *(output_ptr + out_channel_offset) = tmp_avg1 / (float)real_count; + *(output_ptr + out_channel_offset + 1) = tmp_avg2 / (float)real_count; + *(output_ptr + out_channel_offset + 2) = tmp_avg3 / (float)real_count; + *(output_ptr + out_channel_offset + 3) = tmp_avg4 / (float)real_count; +#endif + } // ic4-1 loop + int channel_s = (c4 - 1) * C4NUM; + for (int k = channel_s; k < channel; k++) { + int in_channel_offset = in_batch_offset + k; + int out_channel_offset = out_plane_offset + k; + float tmp_avg = 0; + int real_count = 0; + for (int h = 0; h < win_h; h++) { + for (int w = 0; w < win_w; w++) { + if ((in_h_index + h) < 0 || (in_h_index + h) >= in_h || (in_w_index + w) < 0 || + (in_w_index + w) >= in_w) { + continue; + } else { + int in_offset = in_channel_offset + ((in_h_index + h) * in_w + in_w_index + w) * channel; + tmp_avg += *(input_ptr + in_offset); + ++real_count; + } + } // win_w loop + } // win_h loop + tmp_avg = fmax(tmp_avg, 0); + *(output_ptr + out_channel_offset) = tmp_avg / (float)real_count; + } // channel_res loop + } // real_cal_num loop + } // out_plane loop + } // out_batch loop +} + +void MaxPoolingRelu(const float *input_ptr, float *output_ptr, PoolingParameter *pooling_param, int task_id) { + int stride_w = pooling_param->stride_w_; + int stride_h = pooling_param->stride_h_; + int pad_w = pooling_param->pad_l_; + int pad_h = pooling_param->pad_u_; + int win_w = pooling_param->window_w_; + int win_h = pooling_param->window_h_; + int channel = pooling_param->input_channel_; + int in_w = pooling_param->input_w_; + int in_h = pooling_param->input_h_; + int output_w = pooling_param->output_w_; + int output_h = pooling_param->output_h_; + int output_batch = pooling_param->output_batch_; + int out_plane = output_w * output_h; + int out_tile_count = UP_DIV(out_plane, 
TILE_NUM); + int thread_num = pooling_param->thread_num_; + int c4 = UP_DIV(channel, C4NUM); + +#ifdef ENABLE_NEON + float32x4_t zeros = vdupq_n_f32(0); +#endif + + for (int batch = 0; batch < output_batch; batch++) { + const float *src_b_ptr = input_ptr + batch * in_h * in_w * channel; + float *dst_b_ptr = output_ptr + batch * output_h * output_w * channel; + for (int thread_id = task_id; thread_id < out_tile_count; thread_id += thread_num) { + int cal_start_index = thread_id * TILE_NUM; + int real_cal_num = (out_plane - cal_start_index) > TILE_NUM ? TILE_NUM : (out_plane - cal_start_index); + for (int i = 0; i < real_cal_num; i++) { + int index = cal_start_index + i; + int out_w_index = index % output_w; + int out_h_index = index / output_w; + int in_w_index = out_w_index * stride_w - pad_w; + int in_h_index = out_h_index * stride_h - pad_h; + + const float *src_plane_ptr = src_b_ptr; + float *dst_plane_ptr = dst_b_ptr + index * channel; + + int real_win_h_start = MSMAX(0, -in_h_index); + int real_win_h_end = MSMIN(win_h, in_h - in_h_index); + int resl_win_w_start = MSMAX(0, -in_w_index); + int real_win_w_end = MSMIN(win_w, in_w - in_w_index); + + for (int ci = 0; ci < c4 - 1; ci++) { + const float *src_c_ptr = src_plane_ptr + ci * C4NUM; + float *dst_c_ptr = dst_plane_ptr + ci * C4NUM; #ifdef ENABLE_NEON float32x4_t tmp_max = vdupq_n_f32(-FLT_MAX); #else @@ -157,6 +367,105 @@ void MaxPooling(const float *input_ptr, float *output_ptr, PoolingParameter *poo float tmp_max3 = -FLT_MAX; float tmp_max4 = -FLT_MAX; #endif + + for (int kh = real_win_h_start; kh < real_win_h_end; kh++) { + for (int kw = resl_win_w_start; kw < real_win_w_end; kw++) { + const float *src_win_ptr = src_c_ptr + ((in_h_index + kh) * in_w + in_w_index + kw) * channel; +#ifdef ENABLE_NEON + tmp_max = vmaxq_f32(tmp_max, vld1q_f32(src_win_ptr)); +#else + tmp_max1 = fmax(tmp_max1, src_win_ptr[0]); + tmp_max2 = fmax(tmp_max2, src_win_ptr[1]); + tmp_max3 = fmax(tmp_max3, src_win_ptr[2]); + tmp_max4 = fmax(tmp_max4, src_win_ptr[3]); + +#endif + } // win_w loop + } // win_h loop +#ifdef ENABLE_NEON + tmp_max = vmaxq_f32(tmp_max, zeros); + vst1q_f32(dst_c_ptr, tmp_max); +#else + // relu: + tmp_max1 = fmax(tmp_max1, 0); + tmp_max2 = fmax(tmp_max2, 0); + tmp_max3 = fmax(tmp_max3, 0); + tmp_max4 = fmax(tmp_max4, 0); + + dst_c_ptr[0] = tmp_max1; + dst_c_ptr[1] = tmp_max2; + dst_c_ptr[2] = tmp_max3; + dst_c_ptr[3] = tmp_max4; +#endif + } // ic4-1 loop + int channel_s = (c4 - 1) * C4NUM; + for (int ci = channel_s; ci < channel; ci++) { + float *dst_c_ptr = dst_plane_ptr + ci; + const float *src_c_ptr = src_plane_ptr + ci; + float tmp_max = -FLT_MAX; + + for (int kh = real_win_h_start; kh < real_win_h_end; kh++) { + for (int kw = resl_win_w_start; kw < real_win_w_end; kw++) { + const float *src_win_ptr = src_c_ptr + ((in_h_index + kh) * in_w + in_w_index + kw) * channel; + tmp_max = fmax(tmp_max, src_win_ptr[0]); + } // win_w loop + } // win_h loop + dst_c_ptr[0] = tmp_max; + } // channel_res loop + } // real_cal_num loop + } // out_plane loop + } // out_batch loop +} + +void AvgPoolingRelu6(const float *input_ptr, float *output_ptr, PoolingParameter *pooling_param, int task_id) { + int stride_w = pooling_param->stride_w_; + int stride_h = pooling_param->stride_h_; + int pad_w = pooling_param->pad_l_; + int pad_h = pooling_param->pad_u_; + int win_w = pooling_param->window_w_; + int win_h = pooling_param->window_h_; + int channel = pooling_param->input_channel_; + int c4 = UP_DIV(channel, C4NUM); + int in_w = pooling_param->input_w_; 
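+  // c4 groups the channels into blocks of C4NUM floats: all but the last group
+  // take the 4-wide (NEON) path in the loops below; the remaining channels use
+  // the scalar tail loop.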
+ int in_h = pooling_param->input_h_; + int output_w = pooling_param->output_w_; + int output_h = pooling_param->output_h_; + int output_batch = pooling_param->output_batch_; + int out_plane = output_w * output_h; + int out_tile_count = UP_DIV(out_plane, TILE_NUM); + int thread_num = pooling_param->thread_num_; + // input channel is equal to output channel + +#ifdef ENABLE_NEON + float32x4_t zeros = vdupq_n_f32(0); + float32x4_t bounds = vdupq_n_f32(6); +#endif + + for (int batch = 0; batch < output_batch; batch++) { + int in_batch_offset = batch * in_h * in_w * channel; + int out_batch_offset = batch * output_h * output_w * channel; + for (int thread_id = task_id; thread_id < out_tile_count; thread_id += thread_num) { + int cal_start_index = thread_id * TILE_NUM; + int real_cal_num = (out_plane - cal_start_index) > TILE_NUM ? TILE_NUM : (out_plane - cal_start_index); + for (int i = 0; i < real_cal_num; i++) { + int index = cal_start_index + i; + int out_w_index = index % output_w; + int out_h_index = index / output_w; + int in_w_index = out_w_index * stride_w - pad_w; + int in_h_index = out_h_index * stride_h - pad_h; + int out_plane_offset = out_batch_offset + index * channel; + for (int j = 0; j < c4 - 1; j++) { + int in_channel_offset = in_batch_offset + j * C4NUM; + int out_channel_offset = out_plane_offset + j * C4NUM; +#ifdef ENABLE_NEON + float32x4_t tmp_avg = vdupq_n_f32(0); +#else + float tmp_avg1 = 0; + float tmp_avg2 = 0; + float tmp_avg3 = 0; + float tmp_avg4 = 0; +#endif + int real_count = 0; for (int h = 0; h < win_h; h++) { for (int w = 0; w < win_w; w++) { if ((in_h_index + h) < 0 || (in_h_index + h) >= in_h || (in_w_index + w) < 0 || @@ -165,30 +474,48 @@ void MaxPooling(const float *input_ptr, float *output_ptr, PoolingParameter *poo } else { int in_offset = in_channel_offset + ((in_h_index + h) * in_w + in_w_index + w) * channel; #ifdef ENABLE_NEON - tmp_max = vmaxq_f32(tmp_max, vld1q_f32(input_ptr + in_offset)); + tmp_avg = vaddq_f32(tmp_avg, vld1q_f32(input_ptr + in_offset)); #else - tmp_max1 = fmax(tmp_max1, *(input_ptr + in_offset)); - tmp_max2 = fmax(tmp_max2, *(input_ptr + in_offset + 1)); - tmp_max3 = fmax(tmp_max3, *(input_ptr + in_offset + 2)); - tmp_max4 = fmax(tmp_max4, *(input_ptr + in_offset + 3)); + tmp_avg1 += *(input_ptr + in_offset); + tmp_avg2 += *(input_ptr + in_offset + 1); + tmp_avg3 += *(input_ptr + in_offset + 2); + tmp_avg4 += *(input_ptr + in_offset + 3); #endif + ++real_count; } } // win_w loop } // win_h loop #ifdef ENABLE_NEON - vst1q_f32(output_ptr + out_channel_offset, tmp_max); + tmp_avg = tmp_avg / vdupq_n_f32(real_count); + tmp_avg = vmaxq_f32(tmp_avg, zeros); + tmp_avg = vminq_f32(tmp_avg, bounds); + vst1q_f32(output_ptr + out_channel_offset, tmp_avg); #else - *(output_ptr + out_channel_offset) = tmp_max1; - *(output_ptr + out_channel_offset + 1) = tmp_max2; - *(output_ptr + out_channel_offset + 2) = tmp_max3; - *(output_ptr + out_channel_offset + 3) = tmp_max4; + tmp_avg1 /= (float)real_count; + tmp_avg2 /= (float)real_count; + tmp_avg3 /= (float)real_count; + tmp_avg4 /= (float)real_count; + tmp_avg1 = fmax(tmp_avg1, 0); + tmp_avg2 = fmax(tmp_avg2, 0); + tmp_avg3 = fmax(tmp_avg3, 0); + tmp_avg4 = fmax(tmp_avg4, 0); + tmp_avg1 = fmin(tmp_avg1, 6); + tmp_avg2 = fmin(tmp_avg2, 6); + tmp_avg3 = fmin(tmp_avg3, 6); + tmp_avg4 = fmin(tmp_avg4, 6); + + *(output_ptr + out_channel_offset) = tmp_avg1; + *(output_ptr + out_channel_offset + 1) = tmp_avg2; + *(output_ptr + out_channel_offset + 2) = tmp_avg3; + *(output_ptr + out_channel_offset + 3) 
= tmp_avg4; #endif } // ic4-1 loop int channel_s = (c4 - 1) * C4NUM; for (int k = channel_s; k < channel; k++) { int in_channel_offset = in_batch_offset + k; int out_channel_offset = out_plane_offset + k; - float tmp_max = -FLT_MAX; + float tmp_avg = 0; + int real_count = 0; for (int h = 0; h < win_h; h++) { for (int w = 0; w < win_w; w++) { if ((in_h_index + h) < 0 || (in_h_index + h) >= in_h || (in_w_index + w) < 0 || @@ -196,11 +523,125 @@ void MaxPooling(const float *input_ptr, float *output_ptr, PoolingParameter *poo continue; } else { int in_offset = in_channel_offset + ((in_h_index + h) * in_w + in_w_index + w) * channel; - tmp_max = fmax(tmp_max, *(input_ptr + in_offset)); + tmp_avg += *(input_ptr + in_offset); + ++real_count; } } // win_w loop } // win_h loop - *(output_ptr + out_channel_offset) = tmp_max; + tmp_avg /= (float)real_count; + tmp_avg = fmax(tmp_avg, 0); + tmp_avg = fmin(tmp_avg, 6); + *(output_ptr + out_channel_offset) = tmp_avg; + } // channel_res loop + } // real_cal_num loop + } // out_plane loop + } // out_batch loop +} + +void MaxPoolingRelu6(const float *input_ptr, float *output_ptr, PoolingParameter *pooling_param, int task_id) { + int stride_w = pooling_param->stride_w_; + int stride_h = pooling_param->stride_h_; + int pad_w = pooling_param->pad_l_; + int pad_h = pooling_param->pad_u_; + int win_w = pooling_param->window_w_; + int win_h = pooling_param->window_h_; + int channel = pooling_param->input_channel_; + int in_w = pooling_param->input_w_; + int in_h = pooling_param->input_h_; + int output_w = pooling_param->output_w_; + int output_h = pooling_param->output_h_; + int output_batch = pooling_param->output_batch_; + int out_plane = output_w * output_h; + int out_tile_count = UP_DIV(out_plane, TILE_NUM); + int thread_num = pooling_param->thread_num_; + int c4 = UP_DIV(channel, C4NUM); + +#ifdef ENABLE_NEON + float32x4_t zeros = vdupq_n_f32(0); + float32x4_t bounds = vdupq_n_f32(6); +#endif + + for (int batch = 0; batch < output_batch; batch++) { + const float *src_b_ptr = input_ptr + batch * in_h * in_w * channel; + float *dst_b_ptr = output_ptr + batch * output_h * output_w * channel; + for (int thread_id = task_id; thread_id < out_tile_count; thread_id += thread_num) { + int cal_start_index = thread_id * TILE_NUM; + int real_cal_num = (out_plane - cal_start_index) > TILE_NUM ? 
TILE_NUM : (out_plane - cal_start_index); + for (int i = 0; i < real_cal_num; i++) { + int index = cal_start_index + i; + int out_w_index = index % output_w; + int out_h_index = index / output_w; + int in_w_index = out_w_index * stride_w - pad_w; + int in_h_index = out_h_index * stride_h - pad_h; + + const float *src_plane_ptr = src_b_ptr; + float *dst_plane_ptr = dst_b_ptr + index * channel; + + int real_win_h_start = MSMAX(0, -in_h_index); + int real_win_h_end = MSMIN(win_h, in_h - in_h_index); + int resl_win_w_start = MSMAX(0, -in_w_index); + int real_win_w_end = MSMIN(win_w, in_w - in_w_index); + + for (int ci = 0; ci < c4 - 1; ci++) { + const float *src_c_ptr = src_plane_ptr + ci * C4NUM; + float *dst_c_ptr = dst_plane_ptr + ci * C4NUM; +#ifdef ENABLE_NEON + float32x4_t tmp_max = vdupq_n_f32(-FLT_MAX); +#else + float tmp_max1 = -FLT_MAX; + float tmp_max2 = -FLT_MAX; + float tmp_max3 = -FLT_MAX; + float tmp_max4 = -FLT_MAX; +#endif + + for (int kh = real_win_h_start; kh < real_win_h_end; kh++) { + for (int kw = resl_win_w_start; kw < real_win_w_end; kw++) { + const float *src_win_ptr = src_c_ptr + ((in_h_index + kh) * in_w + in_w_index + kw) * channel; +#ifdef ENABLE_NEON + tmp_max = vmaxq_f32(tmp_max, vld1q_f32(src_win_ptr)); +#else + tmp_max1 = fmax(tmp_max1, src_win_ptr[0]); + tmp_max2 = fmax(tmp_max2, src_win_ptr[1]); + tmp_max3 = fmax(tmp_max3, src_win_ptr[2]); + tmp_max4 = fmax(tmp_max4, src_win_ptr[3]); + +#endif + } // win_w loop + } // win_h loop +#ifdef ENABLE_NEON + tmp_max = vmaxq_f32(tmp_max, zeros); + tmp_max = vminq_f32(tmp_max, bounds); + vst1q_f32(dst_c_ptr, tmp_max); +#else + // relu: + tmp_max1 = fmax(tmp_max1, 0); + tmp_max2 = fmax(tmp_max2, 0); + tmp_max3 = fmax(tmp_max3, 0); + tmp_max4 = fmax(tmp_max4, 0); + tmp_max1 = fmin(tmp_max1, 6); + tmp_max2 = fmin(tmp_max2, 6); + tmp_max3 = fmin(tmp_max3, 6); + tmp_max4 = fmin(tmp_max4, 6); + + dst_c_ptr[0] = tmp_max1; + dst_c_ptr[1] = tmp_max2; + dst_c_ptr[2] = tmp_max3; + dst_c_ptr[3] = tmp_max4; +#endif + } // ic4-1 loop + int channel_s = (c4 - 1) * C4NUM; + for (int ci = channel_s; ci < channel; ci++) { + float *dst_c_ptr = dst_plane_ptr + ci; + const float *src_c_ptr = src_plane_ptr + ci; + float tmp_max = -FLT_MAX; + + for (int kh = real_win_h_start; kh < real_win_h_end; kh++) { + for (int kw = resl_win_w_start; kw < real_win_w_end; kw++) { + const float *src_win_ptr = src_c_ptr + ((in_h_index + kh) * in_w + in_w_index + kw) * channel; + tmp_max = fmax(tmp_max, src_win_ptr[0]); + } // win_w loop + } // win_h loop + dst_c_ptr[0] = tmp_max; } // channel_res loop } // real_cal_num loop } // out_plane loop diff --git a/mindspore/lite/nnacl/fp32/pooling.h b/mindspore/lite/nnacl/fp32/pooling.h index c8c90fca64..ae62f97390 100644 --- a/mindspore/lite/nnacl/fp32/pooling.h +++ b/mindspore/lite/nnacl/fp32/pooling.h @@ -30,6 +30,14 @@ extern "C" { void AvgPooling(const float *input_ptr, float *output_ptr, PoolingParameter *pooling_param, int task_id); void MaxPooling(const float *input_ptr, float *output_ptr, PoolingParameter *pooling_param, int task_id); + +void AvgPoolingRelu(const float *input_ptr, float *output_ptr, PoolingParameter *pooling_param, int task_id); + +void MaxPoolingRelu(const float *input_ptr, float *output_ptr, PoolingParameter *pooling_param, int task_id); + +void AvgPoolingRelu6(const float *input_ptr, float *output_ptr, PoolingParameter *pooling_param, int task_id); + +void MaxPoolingRelu6(const float *input_ptr, float *output_ptr, PoolingParameter *pooling_param, int task_id); #ifdef __cplusplus } #endif 
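Note: each fused variant above runs pooling and activation in a single pass, so the runtime does not need a separate activation kernel (and an extra trip through memory) after the pooling op. A minimal caller-side dispatch sketch follows; the helper is hypothetical and not part of this patch, and ActType/ActType_Relu/ActType_Relu6 are the nnacl activation tags already used by the matmul changes above:

/* Sketch: choose the fused fp32 max-pooling kernel for a given activation. */
static void MaxPoolingWithAct(const float *input, float *output, PoolingParameter *param, int task_id, ActType act) {
  if (act == ActType_Relu) {
    MaxPoolingRelu(input, output, param, task_id);  /* result clamped at 0 */
  } else if (act == ActType_Relu6) {
    MaxPoolingRelu6(input, output, param, task_id); /* result clamped to [0, 6] */
  } else {
    MaxPooling(input, output, param, task_id);      /* no activation */
  }
}

The average-pooling variants dispatch the same way via AvgPooling/AvgPoolingRelu/AvgPoolingRelu6.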
diff --git a/mindspore/lite/nnacl/fp32/resize.c b/mindspore/lite/nnacl/fp32/resize.c index 2dce46ed62..80ec9a8317 100644 --- a/mindspore/lite/nnacl/fp32/resize.c +++ b/mindspore/lite/nnacl/fp32/resize.c @@ -17,17 +17,15 @@ #include "nnacl/fp32/resize.h" #include "nnacl/common_func.h" #include "nnacl/errorcode.h" - -int ResizeBilinear(const float *input_data, float *output_data, const int *input_shape, const int *output_shape, - bool align_corners, int tid, int thread_num) { - if (input_data == NULL || output_data == NULL || input_shape == NULL || output_shape == NULL) { +int PrepareResizeBilinear(const int *input_shape, const int *output_shape, bool align_corners, int *y_bottoms, + int *y_tops, int *x_lefts, int *x_rights, float *y_bottom_weights, float *x_left_weights) { + if (input_shape == NULL || output_shape == NULL || y_bottoms == NULL || y_tops == NULL || x_lefts == NULL || + x_rights == NULL || y_bottom_weights == NULL || x_left_weights == NULL) { return NNACL_NULL_PTR; } - int in_n = input_shape[0]; int in_h = input_shape[1]; int in_w = input_shape[2]; - int in_c = input_shape[3]; int new_height = output_shape[1]; int new_width = output_shape[2]; @@ -40,65 +38,119 @@ int ResizeBilinear(const float *input_data, float *output_data, const int *input width_scale = (float)(in_w - 1) / (new_width - 1); } - int n, h, w, c; - for (n = 0; n < in_n; n++) { - for (h = tid; h < new_height; h += thread_num) { - float actual_y = (float)h * height_scale; - int y_bottom = (int)(floor(actual_y)); - int y_top = y_bottom + 1 < in_h ? (y_bottom + 1) : (in_h - 1); - float y_top_weight = actual_y - (float)(y_bottom); - const float y_bottom_weight = 1.0f - y_top_weight; - for (w = 0; w < new_width; w++) { - float actual_x = (float)(w)*width_scale; - int x_left = (int)(floor(actual_x)); - int x_right = x_left + 1 < in_w ? (x_left + 1) : (in_w - 1); - float x_right_weight = actual_x - (float)(x_left); - const float x_left_weight = 1.0f - x_right_weight; - c = 0; + int h, w; + for (h = 0; h < new_height; h++) { + float actual_y = (float)h * height_scale; + int y_bottom = (int)(floor(actual_y)); + int y_top = y_bottom + 1 < in_h ? (y_bottom + 1) : (in_h - 1); + float y_top_weight = actual_y - (float)(y_bottom); + const float y_bottom_weight = 1.0f - y_top_weight; + + y_bottoms[h] = y_bottom; + y_tops[h] = y_top; + y_bottom_weights[h] = y_bottom_weight; + } + for (w = 0; w < new_width; w++) { + float actual_x = (float)(w)*width_scale; + int x_left = (int)(floor(actual_x)); + int x_right = x_left + 1 < in_w ? 
(x_left + 1) : (in_w - 1); + float x_right_weight = actual_x - (float)(x_left); + const float x_left_weight = 1.0f - x_right_weight; + + x_lefts[w] = x_left; + x_rights[w] = x_right; + x_left_weights[w] = x_left_weight; + } + return NNACL_OK; +} + +int ResizeBilinear(const float *input_data, float *output_data, const int *input_shape, const int *output_shape, + int *y_bottoms, int *y_tops, int *x_lefts, int *x_rights, float *y_bottom_weights, + float *x_left_weights, int n_h_begin, int n_h_end) { + if (input_data == NULL || output_data == NULL || input_shape == NULL || output_shape == NULL || y_bottoms == NULL || + y_tops == NULL || x_lefts == NULL || x_rights == NULL || y_bottom_weights == NULL || x_left_weights == NULL) { + return NNACL_NULL_PTR; + } + + int in_w = input_shape[2]; + int in_c = input_shape[3]; + + int new_height = output_shape[1]; + int new_width = output_shape[2]; + + int n_h, n, h, w, c; + n = n_h_begin / new_height; + h = n_h_begin % new_height; + int n_h_stride = new_width * in_c; + int out_offset = n_h_begin * n_h_stride; + for (n_h = n_h_begin; n_h < n_h_end; n_h++, h++) { + if (h == new_height) { + h = 0; + n++; + } + int y_bottom = y_bottoms[h]; + int y_top = y_tops[h]; + float y_bottom_weight = y_bottom_weights[h]; + float y_top_weight = 1.0f - y_bottom_weight; + + for (w = 0; w < new_width; w++) { + int x_left = x_lefts[w]; + int x_right = x_rights[w]; + float x_left_weight = x_left_weights[w]; + float x_right_weight = 1.0f - x_left_weight; + float top_left_weight = y_top_weight * x_left_weight; + float top_right_weight = y_top_weight * x_right_weight; + float bottom_left_weight = y_bottom_weight * x_left_weight; + float bottom_right_weight = y_bottom_weight * x_right_weight; + + c = 0; + int in_bottom_left_offset = offset(input_shape, n, y_bottom, x_left, c); + int in_bottom_right_offset = in_bottom_left_offset + (x_right - x_left) * in_c; + int in_top_left_offset = in_bottom_left_offset + (y_top - y_bottom) * in_w * in_c; + int in_top_right_offset = in_bottom_right_offset + (y_top - y_bottom) * in_w * in_c; + #ifdef ENABLE_NEON - for (; c <= in_c - 4; c += 4) { - float32x4_t bottom_left = vld1q_f32(input_data + offset(input_shape, n, y_bottom, x_left, c)); - float32x4_t bottom_right = vld1q_f32(input_data + offset(input_shape, n, y_bottom, x_right, c)); - float32x4_t top_left = vld1q_f32(input_data + offset(input_shape, n, y_top, x_left, c)); - float32x4_t top_right = vld1q_f32(input_data + offset(input_shape, n, y_top, x_right, c)); - - float32x4_t y_top_w = vdupq_n_f32(y_top_weight); - float32x4_t y_bottom_w = vdupq_n_f32(y_bottom_weight); - float32x4_t x_left_w = vdupq_n_f32(x_left_weight); - float32x4_t x_right_w = vdupq_n_f32(x_right_weight); - - float32x4_t interp_value = vdupq_n_f32(0.0); - float32x4_t tmp = vmulq_f32(bottom_left, y_bottom_w); - tmp = vmulq_f32(tmp, x_left_w); - interp_value = vaddq_f32(interp_value, tmp); - - tmp = vmulq_f32(bottom_right, y_bottom_w); - tmp = vmulq_f32(tmp, x_right_w); - interp_value = vaddq_f32(interp_value, tmp); - - tmp = vmulq_f32(top_left, y_top_w); - tmp = vmulq_f32(tmp, x_left_w); - interp_value = vaddq_f32(interp_value, tmp); - - tmp = vmulq_f32(top_right, y_top_w); - tmp = vmulq_f32(tmp, x_right_w); - interp_value = vaddq_f32(interp_value, tmp); - vst1q_f32(output_data + offset(output_shape, n, h, w, c), interp_value); - } + float32x4_t top_left_w = vdupq_n_f32(top_left_weight); + float32x4_t top_right_w = vdupq_n_f32(top_right_weight); + float32x4_t bottom_left_w = vdupq_n_f32(bottom_left_weight); + 
float32x4_t bottom_right_w = vdupq_n_f32(bottom_right_weight); + + for (; c <= in_c - 4; c += 4) { + float32x4_t bottom_left = vld1q_f32(input_data + in_bottom_left_offset + c); + float32x4_t bottom_right = vld1q_f32(input_data + in_bottom_right_offset + c); + float32x4_t top_left = vld1q_f32(input_data + in_top_left_offset + c); + float32x4_t top_right = vld1q_f32(input_data + in_top_right_offset + c); + + float32x4_t interp_value = vdupq_n_f32(0.0); + + float32x4_t tmp = vmulq_f32(bottom_left, bottom_left_w); + interp_value = vaddq_f32(interp_value, tmp); + + tmp = vmulq_f32(bottom_right, bottom_right_w); + interp_value = vaddq_f32(interp_value, tmp); + + tmp = vmulq_f32(top_left, top_left_w); + interp_value = vaddq_f32(interp_value, tmp); + + tmp = vmulq_f32(top_right, top_right_w); + interp_value = vaddq_f32(interp_value, tmp); + vst1q_f32(output_data + out_offset, interp_value); + out_offset += 4; + } #endif - for (; c < in_c; c++) { - float bottom_left = input_data[offset(input_shape, n, y_bottom, x_left, c)]; - float bottom_right = input_data[offset(input_shape, n, y_bottom, x_right, c)]; - float top_left = input_data[offset(input_shape, n, y_top, x_left, c)]; - float top_right = input_data[offset(input_shape, n, y_top, x_right, c)]; - float interp_value = bottom_left * y_bottom_weight * x_left_weight + - bottom_right * y_bottom_weight * x_right_weight + - top_left * y_top_weight * x_left_weight + top_right * y_top_weight * x_right_weight; - output_data[offset(output_shape, n, h, w, c)] = interp_value; - } + for (; c < in_c; c++) { + float bottom_left = input_data[in_bottom_left_offset + c]; + float bottom_right = input_data[in_bottom_right_offset + c]; + float top_left = input_data[in_top_left_offset + c]; + float top_right = input_data[in_top_right_offset + c]; + float interp_value = bottom_left * bottom_left_weight + bottom_right * bottom_right_weight + + top_left * top_left_weight + top_right * top_right_weight; + output_data[out_offset] = interp_value; + out_offset++; } } } + return NNACL_OK; } diff --git a/mindspore/lite/nnacl/fp32/resize.h b/mindspore/lite/nnacl/fp32/resize.h index a6b76cbdcf..afa9888355 100644 --- a/mindspore/lite/nnacl/fp32/resize.h +++ b/mindspore/lite/nnacl/fp32/resize.h @@ -25,9 +25,12 @@ #ifdef __cplusplus extern "C" { #endif -int ResizeBilinear(const float *input_data, float *output_data, const int *input_shape, const int *output_shape, - bool align_corners, int tid, int thread_num); +int PrepareResizeBilinear(const int *input_shape, const int *output_shape, bool align_corners, int *y_bottoms, + int *y_tops, int *x_lefts, int *x_rights, float *y_bottom_weights, float *x_left_weights); +int ResizeBilinear(const float *input_data, float *output_data, const int *input_shape, const int *output_shape, + int *y_bottoms, int *y_tops, int *x_lefts, int *x_rights, float *y_bottom_weights, + float *x_left_weights, int n_h_begin, int n_h_end); int ResizeNearestNeighbor(const float *input_data, float *output_data, const int *input_shape, const int *output_shape, int tid, int thread_num); #ifdef __cplusplus diff --git a/mindspore/lite/nnacl/fp32/scale.c b/mindspore/lite/nnacl/fp32/scale.c new file mode 100644 index 0000000000..0806999f80 --- /dev/null +++ b/mindspore/lite/nnacl/fp32/scale.c @@ -0,0 +1,79 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "nnacl/fp32/scale.h"
+#ifdef ENABLE_ARM
+#include <arm_neon.h>
+#endif
+// Scale op: out = in * scale + offset, with scale/offset indexed along the axis dimension.
+void ScaleInner(float *in_data, float *out_data, float *scale, float *offset, int outer_start, int outer_end,
+                int axis_size, int inner_size) {
+  for (int out = outer_start; out < outer_end; out++) {
+    int out_offset = out * axis_size * inner_size;
+    for (int i = 0; i < axis_size; i++) {
+      int axis_offset = out_offset + i * inner_size;
+      int in_index = 0;
+#ifdef ENABLE_ARM64
+      // vectorized body: 4 floats per step via fused multiply-add
+      for (; in_index < inner_size - 4; in_index += 4) {
+        int in_offset = axis_offset + in_index;
+        float32x4_t data = vld1q_f32(in_data + in_offset);
+        float32x4_t scale_4 = vdupq_n_f32(scale[i]);
+        float32x4_t offset_4 = vdupq_n_f32(offset[i]);
+        float32x4_t result = vfmaq_f32(offset_4, data, scale_4);
+        vst1q_f32(out_data + in_offset, result);
+      }
+#endif
+      // scalar tail
+      for (; in_index < inner_size; in_index++) {
+        int in_offset = axis_offset + in_index;
+        out_data[in_offset] = in_data[in_offset] * scale[i] + offset[i];
+      }
+    }
+  }
+}
+
+void ScaleAxis(float *in_data, float *out_data, float *scale, float *offset, int outer_start, int outer_end,
+               int axis_size) {
+  for (int out = outer_start; out < outer_end; out++) {
+    int out_offset = out * axis_size;
+    int index = 0;
+#ifdef ENABLE_ARM64
+    for (; index < axis_size - 4; index += 4) {
+      int in_offset = out_offset + index;
+      float32x4_t data = vld1q_f32(in_data + in_offset);
+      float32x4_t scale_4 = vld1q_f32(scale + index);
+      float32x4_t offset_4 = vld1q_f32(offset + index);
+      float32x4_t result = vfmaq_f32(offset_4, data, scale_4);
+      vst1q_f32(out_data + in_offset, result);
+    }
+#endif
+    for (; index < axis_size; index++) {
+      int in_offset = out_offset + index;
+      out_data[in_offset] = in_data[in_offset] * scale[index] + offset[index];
+    }
+  }
+}
+
+void DoScale(float *in_data, float *out_data, float *scale, float *offset, int task_id, ScaleParameter *scale_param) {
+  int outer_step = UP_DIV(scale_param->outer_size_, scale_param->op_parameter_.thread_num_);
+  int outer_start = task_id * outer_step;
+  int outer_end = MSMIN(outer_start + outer_step, scale_param->outer_size_);
+
+  if (scale_param->inner_size_ == 1) {
+    ScaleAxis(in_data, out_data, scale, offset, outer_start, outer_end, scale_param->axis_size_);
+  } else {
+    ScaleInner(in_data, out_data, scale, offset, outer_start, outer_end, scale_param->axis_size_,
+               scale_param->inner_size_);
+  }
+}
diff --git a/mindspore/lite/nnacl/fp32/scale.h b/mindspore/lite/nnacl/fp32/scale.h
new file mode 100644
index 0000000000..63ba1dd200
--- /dev/null
+++ b/mindspore/lite/nnacl/fp32/scale.h
@@ -0,0 +1,30 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_LITE_NNACL_SCALE_FP32_H_ +#define MINDSPORE_LITE_NNACL_SCALE_FP32_H_ + +#include "nnacl/op_base.h" +#include "nnacl/scale.h" +#ifdef __cplusplus +extern "C" { +#endif +void DoScale(float *in_data, float *out_data, float *scale, float *offset, int task_id, ScaleParameter *scale_param); +#ifdef __cplusplus +} +#endif + +#endif // MINDSPORE_LITE_NNACL_SCALE_FP32_H_ diff --git a/mindspore/lite/nnacl/fp32/space_to_batch.c b/mindspore/lite/nnacl/fp32/space_to_batch.c index bc64d0663d..0cd665de02 100644 --- a/mindspore/lite/nnacl/fp32/space_to_batch.c +++ b/mindspore/lite/nnacl/fp32/space_to_batch.c @@ -16,132 +16,79 @@ #include "nnacl/fp32/space_to_batch.h" #include "nnacl/arithmetic_common.h" #include "nnacl/errorcode.h" -#include "nnacl/fp32/concat.h" #include "nnacl/op_base.h" -int EnumElement(int *shape, int n_dims) { - int total = 1; - for (int i = 0; i < n_dims; i++) { - total *= shape[i]; - } - return total; -} - -void TransposeForNHWC(const float *in_data, float *out_data, int *strides, int *out_strides, int *perm, - int *output_shape, int h_start, int h_end) { - const int stride0 = strides[perm[0]]; - const int stride1 = strides[perm[1]]; - const int stride2 = strides[perm[2]]; - const int stride3 = strides[perm[3]]; - const int stride4 = strides[perm[4]]; - const int out_stride0 = out_strides[0]; - const int out_stride1 = out_strides[1]; - const int out_stride2 = out_strides[2]; - const int out_stride3 = out_strides[3]; - const int out_stride4 = out_strides[4]; - const int output0 = output_shape[0]; - const int output2 = output_shape[2]; - const int output3 = output_shape[3]; - const int output4 = output_shape[4]; - - for (int i = 0; i < output0; ++i) { - int out_stride0_i = i * out_stride0; - int stride0_i = i * stride0; - for (int j = h_start; j < h_end; ++j) { - int out_stride1_j = j * out_stride1; - int stride1_j = j * stride1; - for (int k = 0; k < output2; ++k) { - int out_stride2_k = k * out_stride2; - int stride2_k = k * stride2; - for (int m = 0; m < output3; ++m) { - int out_stride3_m = m * out_stride3; - int stride3_m = m * stride3; - for (int n = 0; n < output4; ++n) { - int out_stride4_n = n * out_stride4; - int stride4_n = n * stride4; - memcpy(out_data + out_stride0_i + out_stride1_j + out_stride2_k + out_stride3_m + out_stride4_n, - in_data + stride0_i + stride1_j + stride2_k + stride3_m + stride4_n, stride4 * sizeof(float)); - } - } +void DoSpaceToBatchNHWC(const float *input, float *output, SpaceToBatchParameter *param, int *in_shape, + int *out_shape) { + int out_dim0 = out_shape[0]; + int out_dim1 = out_shape[1]; + int out_dim2 = out_shape[2]; + int copy_num = out_shape[3]; + int block_w = param->block_sizes_[1]; + int block_h = param->block_sizes_[0]; + int in_strides[4]; + ComputeStrides(in_shape, in_strides, 4); + int out_strides[4]; + ComputeStrides(out_shape, out_strides, 4); + size_t copy_size = copy_num * sizeof(float); + size_t out_offset = 0; + for (int n = 0; n < out_dim0; ++n) { + int in_n = n % in_shape[0]; + int32_t stride_w = (n / in_shape[0]) % block_w; + int32_t stride_h = (n / in_shape[0]) / block_w; + size_t in_offset0 = in_n * in_strides[0]; + for (int h = 0; h < out_dim1; ++h) { + size_t in_offset1 = in_offset0 + (h * block_h + stride_h) * in_strides[1]; + for (int w = 0; w < out_dim2; ++w) { + size_t in_offset2 = in_offset1 + (w * block_w + stride_w) * in_strides[2]; + memcpy(output + out_offset, input + 
in_offset2, copy_size); + out_offset += copy_num; } } } } -int SpaceToBatchForNHWC(const float *input, float *output, int *in_shape, int shape_size, int *block_sizes, int h_start, - int h_end) { - int trans_in_shape[6] = {in_shape[0], in_shape[1] / block_sizes[0], - block_sizes[0], in_shape[2] / block_sizes[1], - block_sizes[1], in_shape[3]}; - int trans_out_shape[6] = { - in_shape[0], block_sizes[0], block_sizes[1], in_shape[1] / block_sizes[0], in_shape[2] / block_sizes[1], - in_shape[3]}; - int in_strides[C4NUM + 2]; - ComputeStrides(trans_in_shape, in_strides, shape_size + 2); - int out_strides[C4NUM + 2]; - ComputeStrides(trans_out_shape, out_strides, shape_size + 2); - - int perm[6] = {0, 2, 4, 1, 3, 5}; - TransposeForNHWC(input, output, in_strides, out_strides, perm, trans_out_shape, h_start, h_end); - return NNACL_OK; -} - -void DoPadding(const float *input, float *padded_input, SpaceToBatchParameter param, float *tmp_space[]) { - float *tmp = padded_input; - (void)memcpy(tmp, input, param.num_elements_ * sizeof(float)); - float *target = tmp_space[0]; - float *tmp_zeros = tmp_space[1]; - float *tmp2 = NULL; - int cur_shape[param.n_dims_], cur_start_shape[param.n_dims_], cur_end_shape[param.n_dims_], - cur_target_shape[param.n_dims_]; - float *concat_inputs[3]; - int *concat_shapes[4]; - - for (int i = 0; i < param.n_dims_; i++) { - cur_shape[i] = param.in_shape_[i]; - cur_start_shape[i] = param.in_shape_[i]; - cur_end_shape[i] = param.in_shape_[i]; - cur_target_shape[i] = param.in_shape_[i]; - } - for (int i = 0; i < param.n_space_dims_; ++i) { - if (param.padded_in_shape_[i + 1] > param.in_shape_[i + 1]) { - int concat_idx = 0; - cur_target_shape[i + 1] = 0; - if (param.paddings_[2 * i] != 0) { - cur_start_shape[i + 1] = param.paddings_[2 * i]; - concat_inputs[concat_idx] = tmp_zeros; - concat_shapes[concat_idx++] = cur_start_shape; - cur_target_shape[i + 1] += cur_start_shape[i + 1]; +void DoSpaceToBatchPaddingNHWC(const float *input, float *output, int *in_shape, int *padding, int *out_shape, + const float *pedding_h_data, const float *pedding_w_data) { + int in_h = in_shape[1]; + int in_w = in_shape[2]; + int in_c = in_shape[3]; + int out_w = out_shape[2]; + int out_c = out_shape[3]; + size_t ped_h_num = out_w * out_c; + size_t ped_h_size = ped_h_num * sizeof(float); + size_t ped_w_size = out_c * sizeof(float); + size_t out_offset = 0; + int in_strides[4]; + ComputeStrides(in_shape, in_strides, 4); + int out_strides[4]; + ComputeStrides(out_shape, out_strides, 4); + size_t copy_size = in_c * sizeof(float); + for (int i = 0; i < in_shape[0]; ++i) { + size_t in_offset0 = i * in_strides[0]; + for (int pad_h_top = 0; pad_h_top < padding[0]; ++pad_h_top) { + memcpy(output + out_offset, pedding_h_data, ped_h_size); + out_offset += ped_h_num; + } + for (int j = 0; j < in_h; ++j) { + size_t in_offset1 = in_offset0 + j * in_strides[1]; + for (int pad_w_left = 0; pad_w_left < padding[2]; ++pad_w_left) { + memcpy(output + out_offset, pedding_w_data, ped_w_size); + out_offset += out_c; } - - concat_inputs[concat_idx] = tmp; - concat_shapes[concat_idx++] = cur_shape; - cur_target_shape[i + 1] += cur_shape[i + 1]; - if (param.paddings_[2 * i + 1] != 0) { - cur_end_shape[i + 1] = param.paddings_[2 * i + 1]; - concat_inputs[concat_idx] = tmp_zeros; - concat_shapes[concat_idx++] = cur_end_shape; - cur_target_shape[i + 1] += cur_end_shape[i + 1]; + for (int k = 0; k < in_w; ++k) { + size_t in_offset2 = in_offset1 + k * in_strides[2]; + memcpy(output + out_offset, input + in_offset2, copy_size); 
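+        // one real input pixel copied (copy_size == in_c floats); advance the write cursor past it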
+ out_offset += in_c; + } + for (int pad_w_right = 0; pad_w_right < padding[3]; ++pad_w_right) { + memcpy(output + out_offset, pedding_w_data, ped_w_size); + out_offset += out_c; } - concat_shapes[concat_idx] = cur_target_shape; - Concat((void **)concat_inputs, concat_idx, i + 1, concat_shapes, param.n_dims_, target); - - tmp2 = tmp; - tmp = target; - target = tmp2; - cur_start_shape[i + 1] = cur_end_shape[i + 1] = cur_shape[i + 1] = concat_shapes[concat_idx][i + 1]; + } + for (int pad_h_bottom = 0; pad_h_bottom < padding[1]; ++pad_h_bottom) { + memcpy(output + out_offset, pedding_h_data, ped_h_size); + out_offset += ped_h_num; } } - if (padded_input != tmp) { - memcpy(padded_input, tmp, param.num_elements_padded_ * sizeof(float)); - } -} - -int SpaceToBatch(const float *input, float *output, SpaceToBatchParameter param, int h_start, int h_end) { - if (input == NULL || output == NULL) { - return NNACL_NULL_PTR; - } - int ret = - SpaceToBatchForNHWC(input, output, param.padded_in_shape_, param.n_dims_, param.block_sizes_, h_start, h_end); - return ret; } diff --git a/mindspore/lite/nnacl/fp32/space_to_batch.h b/mindspore/lite/nnacl/fp32/space_to_batch.h index 5b19e7dfca..5406408022 100644 --- a/mindspore/lite/nnacl/fp32/space_to_batch.h +++ b/mindspore/lite/nnacl/fp32/space_to_batch.h @@ -22,26 +22,17 @@ typedef struct SpaceToBatchParameter { OpParameter op_parameter_; - int block_sizes_[8]; - int paddings_[8]; - int n_dims_; - int num_elements_; - int num_elements_padded_; - int n_space_dims_; - int in_shape_[8]; - int padded_in_shape_[8]; bool need_paddings_; + int block_sizes_[4]; + int paddings_[4]; } SpaceToBatchParameter; #ifdef __cplusplus extern "C" { #endif -int SpaceToBatch(const float *input, float *output, SpaceToBatchParameter param, int h_start, int h_end); -int SpaceToBatchForNHWC(const float *input, float *output, int *in_shape, int shape_size, int *block_size, int h_start, - int h_end); -void TransposeForNHWC(const float *in_data, float *out_data, int *strides, int *out_strides, int *perm, - int *output_shape, int h_start, int h_end); -void DoPadding(const float *input, float *padded_input, SpaceToBatchParameter param, float *tmp_space[]); -int EnumElement(int *shape, int n_dims); +void DoSpaceToBatchNHWC(const float *input, float *output, SpaceToBatchParameter *param, int *in_shape, + int *out_shape); +void DoSpaceToBatchPaddingNHWC(const float *input, float *output, int *in_shape, int *padding, int *out_shape, + const float *pedding_h_data, const float *pedding_w_data); #ifdef __cplusplus } #endif diff --git a/mindspore/lite/nnacl/fp32/strassen_matmul.c b/mindspore/lite/nnacl/fp32/strassen_matmul.c deleted file mode 100644 index 3f88fd8d5f..0000000000 --- a/mindspore/lite/nnacl/fp32/strassen_matmul.c +++ /dev/null @@ -1,204 +0,0 @@ -/** - * Copyright 2020 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "nnacl/fp32/strassen_matmul.h" - -bool CheckRecursion(int row, int col, int deep, int max_recursion, int cur_recursion) { - if (cur_recursion >= max_recursion) { - return false; - } - - if (row % 2 != 0 || col % 2 != 0 || deep % 2 != 0) { - return false; - } - - int row2 = row / 2; - int col2 = col / 2; - int deep2 = deep / 2; - - float save_cost = row * col * 4 * deep * 4 * 2 + row * col * 4 - - 7 * (row2 * col2 * 4 * deep2 * 4 * 2 - row2 * col2 * 4) - 4 * (row2 * deep2 * 4 * 3) - - 4 * (deep2 * 4 * col2 * 4 * 3) - 7 * (row2 * col2 * 4 * 3); - - return (save_cost > 0.f); -} - -void GemmMatMulComm(const float *a_ptr, const float *b_ptr, float *dst_ptr, int row, int col, int deep, int b_stride, - int c_stride) { - int row4mod = row % 4; - int row4div = row / 4; - for (int r = 0; r < row; r++) { - int r4mod = r % 4; - int r4div = r / 4; - for (int c = 0; c < col * 4; c++) { - float value = 0; - int ic = c / 4 * c_stride + r * 4 + c % 4; - for (int d = 0; d < deep * 4; d++) { - int d4mod = d % 4; - int d4div = d / 4; - int a_stride = (r < (row4div * 4)) ? 4 : row4mod; - int ai = r4div * 4 * deep * 4 + d4div * a_stride * 4 + r4mod * 4 + d4mod; - int bi = c / 4 * b_stride + d * 4 + c % 4; - value = value + a_ptr[ai] * b_ptr[bi]; - } - dst_ptr[ic] = value; - } - } - return; -} - -void GemmMatMul(const float *a_ptr, const float *b_ptr, float *dst_ptr, int row, int col, int deep, int b_stride, - int c_stride) { - int row4mod = row % 4; - int row4div = row / 4; - - if (row4div > 0) { - GemmMatMulComm(a_ptr, b_ptr, dst_ptr, row4div * 4, col, deep, b_stride, c_stride); - } - - if (row4mod != 0) { - GemmMatMulComm(a_ptr + row4div * deep * 4 * 4, b_ptr, dst_ptr + row4div * 4 * 4, row4mod, col, deep, b_stride, - c_stride); - } - return; -} - -int RecursionMatmul(const float *a_ptr, const float *b_ptr, float *c_ptr, StrassenMatMulParameter *matmul_param, - int max_recursion, int cur_recursion, float *tmp_a_ptr) { - size_t row2 = matmul_param->row_ / 2; - size_t deep2 = matmul_param->deep_ / 2; - size_t col2 = matmul_param->col_ / 2; - size_t a_stride = matmul_param->a_stride_; - size_t b_stride = matmul_param->b_stride_; - size_t c_stride = matmul_param->c_stride_; - - StrassenMatMulParameter rec_matmul; - rec_matmul.row_ = row2; - rec_matmul.deep_ = deep2; - rec_matmul.col_ = col2; - - float *x_ptr = (float *)(malloc(row2 * MSMAX(deep2, col2) * FP32_STRASSEN_UINT * sizeof(float))); - if (x_ptr == NULL) { - return NNACL_ERRCODE_STRASSEN_RECURSION_MALLOC; - } - float *y_ptr = (float *)(malloc(col2 * deep2 * FP32_STRASSEN_WEIGHT_UINT * sizeof(float))); - if (y_ptr == NULL) { - free(x_ptr); - return NNACL_ERRCODE_STRASSEN_RECURSION_MALLOC; - } - size_t x_stride = row2 * FP32_STRASSEN_UINT; - size_t y_stride = deep2 * FP32_STRASSEN_WEIGHT_UINT; - - const float *a11 = a_ptr; - const float *a12 = a_ptr + deep2 * a_stride; - const float *a21 = a_ptr + row2 * FP32_STRASSEN_UINT; - const float *a22 = a_ptr + deep2 * a_stride + row2 * FP32_STRASSEN_UINT; - const float *b11 = b_ptr; - const float *b12 = b_ptr + col2 * b_stride; - const float *b21 = b_ptr + deep2 * FP32_STRASSEN_WEIGHT_UINT; - const float *b22 = b_ptr + col2 * b_stride + deep2 * FP32_STRASSEN_WEIGHT_UINT; - float *c11 = c_ptr; - float *c12 = c_ptr + col2 * c_stride; - float *c21 = c_ptr + row2 * FP32_STRASSEN_UINT; - float *c22 = c_ptr + col2 * c_stride + row2 * FP32_STRASSEN_UINT; - - /* S3 = A11 - A21 */ - MatrixSub(a11, a21, x_ptr, a_stride, a_stride, x_stride, row2, deep2); - - /* T3 = B22 - B12 */ - MatrixSub(b22, b12, y_ptr, 
b_stride, b_stride, y_stride, deep2 * 4, col2); - - /* P7 = S3T3 */ - rec_matmul.a_stride_ = x_stride; - rec_matmul.b_stride_ = y_stride; - rec_matmul.c_stride_ = c_stride; - StrassenMatmul(x_ptr, y_ptr, c21, &rec_matmul, max_recursion, cur_recursion + 1, tmp_a_ptr); - - /* S1 = A21 + A22 */ - MatrixAdd(a21, a22, x_ptr, a_stride, a_stride, x_stride, row2, deep2); - - /* T1 = B12 - B11 */ - MatrixSub(b12, b11, y_ptr, b_stride, b_stride, y_stride, deep2 * 4, col2); - - /* P5 = S1T1 */ - StrassenMatmul(x_ptr, y_ptr, c22, &rec_matmul, max_recursion, cur_recursion + 1, tmp_a_ptr); - - /* S2 = S1 - A11 */ - MatrixSub(x_ptr, a11, x_ptr, x_stride, a_stride, x_stride, row2, deep2); - - /* T2 = B22 - T1 */ - MatrixSub(b22, y_ptr, y_ptr, b_stride, y_stride, y_stride, deep2 * 4, col2); - - /* P6 = S2T2 */ - StrassenMatmul(x_ptr, y_ptr, c12, &rec_matmul, max_recursion, cur_recursion + 1, tmp_a_ptr); - - /* S4 = A12 - S2 */ - MatrixSub(a12, x_ptr, x_ptr, a_stride, x_stride, x_stride, row2, deep2); - - /* P3 = S4B22 */ - rec_matmul.b_stride_ = b_stride; - StrassenMatmul(x_ptr, b22, c11, &rec_matmul, max_recursion, cur_recursion + 1, tmp_a_ptr); - - /* P1 = A11B11 */ - rec_matmul.a_stride_ = a_stride; - rec_matmul.c_stride_ = row2 * FP32_STRASSEN_UINT; - StrassenMatmul(a11, b11, x_ptr, &rec_matmul, max_recursion, cur_recursion + 1, tmp_a_ptr); - - /* U2 = P1 + P6 - U3 = U2 + P7 - U4 = U2 + P5 - U7 = U3 + P5 - U5 = U4 + P3 */ - MatrixMultiAdd(c11, c12, c21, c22, x_ptr, row2, col2, c_stride, x_stride); - - /* T4 = T2 - B21 */ - MatrixSub(y_ptr, b21, y_ptr, y_stride, b_stride, y_stride, deep2 * 4, col2); - - /* P4 = A22T4 */ - rec_matmul.b_stride_ = y_stride; - rec_matmul.c_stride_ = c_stride; - StrassenMatmul(a22, y_ptr, c11, &rec_matmul, max_recursion, cur_recursion + 1, tmp_a_ptr); - - /* U6 = U3 - P4 */ - MatrixSub(c21, c11, c21, c_stride, c_stride, c_stride, row2, col2); - - /* P2 = A12B21 */ - rec_matmul.b_stride_ = b_stride; - StrassenMatmul(a12, b21, c11, &rec_matmul, max_recursion, cur_recursion + 1, tmp_a_ptr); - - /* U1 = P1 + P2 */ - MatrixAdd(x_ptr, c11, c11, x_stride, c_stride, c_stride, row2, col2); - - free(x_ptr); - free(y_ptr); - return NNACL_OK; -} - -int CommonMatMul(const float *a_ptr, const float *b_ptr, float *c_ptr, StrassenMatMulParameter *matmul_param, - float *tmp_a_ptr) { - MatrixPack(a_ptr, tmp_a_ptr, matmul_param->row_, matmul_param->deep_, matmul_param->a_stride_); - GemmMatMul(tmp_a_ptr, b_ptr, c_ptr, matmul_param->row_, matmul_param->col_, matmul_param->deep_, - matmul_param->b_stride_, matmul_param->c_stride_); - return NNACL_OK; -} - -int StrassenMatmul(const float *a_ptr, const float *b_ptr, float *c_ptr, StrassenMatMulParameter *matmul_param, - int max_recursion, int cur_recursion, float *tmp_a_ptr) { - if (CheckRecursion(matmul_param->row_, matmul_param->col_, matmul_param->deep_, cur_recursion, max_recursion)) { - return RecursionMatmul(a_ptr, b_ptr, c_ptr, matmul_param, max_recursion, cur_recursion, tmp_a_ptr); - } - return CommonMatMul(a_ptr, b_ptr, c_ptr, matmul_param, tmp_a_ptr); -} diff --git a/mindspore/lite/nnacl/fp32/strassen_matmul.h b/mindspore/lite/nnacl/fp32/strassen_matmul.h deleted file mode 100644 index cd0dde8b83..0000000000 --- a/mindspore/lite/nnacl/fp32/strassen_matmul.h +++ /dev/null @@ -1,45 +0,0 @@ -/** - * Copyright 2020 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef MINDSPORE_LITE_NNACL_FP32_STRASSEN_MATMUL_H_ -#define MINDSPORE_LITE_NNACL_FP32_STRASSEN_MATMUL_H_ - -#include -#include "nnacl/pack.h" -#include "nnacl/op_base.h" -#include "nnacl/errorcode.h" -#include "nnacl/strassen_matmul.h" -#include "nnacl/fp32/common_func.h" - -#define FP32_STRASSEN_UINT C4NUM -#define FP32_STRASSEN_WEIGHT_UINT (C4NUM * C4NUM) -#define FP32_STRASSEN_MAX_RECURSION 5 - -#ifdef __cplusplus -extern "C" { -#endif -int RecursionMatmul(const float *a_ptr, const float *b_ptr, float *c_ptr, StrassenMatMulParameter *matmul_param, - int max_recursion, int, float *tmp_a_ptr); -int CommonMatMul(const float *a_ptr, const float *b_ptr, float *c_ptr, StrassenMatMulParameter *Matmul_param, - float *tmp_a_ptr); - -int StrassenMatmul(const float *a_ptr, const float *b_ptr, float *c_ptr, StrassenMatMulParameter *matmul_param, - int max_recursion, int cur_recursion, float *tmp_a_ptr); -#ifdef __cplusplus -} -#endif - -#endif // MINDSPORE_LITE_NNACL_FP32_STRASSEN_MATMUL_H_ diff --git a/mindspore/lite/nnacl/fp32/topk.c b/mindspore/lite/nnacl/fp32/topk.c index a8b13f3e38..96b3b05e46 100644 --- a/mindspore/lite/nnacl/fp32/topk.c +++ b/mindspore/lite/nnacl/fp32/topk.c @@ -16,9 +16,27 @@ #include "nnacl/fp32/topk.h" -int DescendCmp(const void *a, const void *b) { return ((const TopkNode *)b)->element - ((const TopkNode *)a)->element; } +int DescendCmp(const void *a, const void *b) { + float sub = ((const TopkNode *)b)->element - ((const TopkNode *)a)->element; + if (sub > 0) { + return 1; + } else if (sub < 0) { + return -1; + } else { + return 0; + } +} -int AscendCmp(const void *a, const void *b) { return ((const TopkNode *)a)->element - ((const TopkNode *)b)->element; } +int AscendCmp(const void *a, const void *b) { + float sub = ((const TopkNode *)a)->element - ((const TopkNode *)b)->element; + if (sub > 0) { + return 1; + } else if (sub < 0) { + return -1; + } else { + return 0; + } +} void Topk(float *input_data, float *output_data, int32_t *output_index, TopkParameter *parameter) { int last_dim_size = parameter->last_dim_size_; diff --git a/mindspore/lite/nnacl/fp32_grad/pack_ext.c b/mindspore/lite/nnacl/fp32_grad/pack_ext.c index d95cc6daf9..48665e83f2 100644 --- a/mindspore/lite/nnacl/fp32_grad/pack_ext.c +++ b/mindspore/lite/nnacl/fp32_grad/pack_ext.c @@ -20,9 +20,9 @@ static int is_a_ge_zero_and_a_lt_b(int a, int b) { return (unsigned)(a) < (unsigned)(b); } void im2col_hwc(const float *in_data, float *data_col, ConvParameter *conv_param) { - const int pad_left = /*conv_param->pad_l_*/ conv_param->pad_w_; + const int pad_left = /*conv_param->pad_l_*/ conv_param->pad_l_; // const int pad_right = /*conv_param->pad_r_*/conv_param->pad_w_; - const int pad_up = /*conv_param->pad_u_*/ conv_param->pad_h_; + const int pad_up = /*conv_param->pad_u_*/ conv_param->pad_u_; // const int pad_down = /*conv_param->pad_d/*/conv_param->pad_h_; const int stride_h = conv_param->stride_h_; @@ -72,9 +72,9 @@ void im2col_hwc(const float *in_data, float *data_col, ConvParameter *conv_param // output matrix is (kernel_h*kernel_w*channels)X(output_h*output_w) void 
im2row_hwc(const float *in_data, float *data_row, ConvParameter *conv_param) { - const int pad_left = /*conv_param->pad_l_*/ conv_param->pad_w_; + const int pad_left = /*conv_param->pad_l_*/ conv_param->pad_l_; // const int pad_right = /*conv_param->pad_r_*/conv_param->pad_w_; - const int pad_up = /*conv_param->pad_u_*/ conv_param->pad_h_; + const int pad_up = /*conv_param->pad_u_*/ conv_param->pad_u_; // const int pad_down = /*conv_param->pad_d/*/conv_param->pad_h_; const int stride_h = conv_param->stride_h_; diff --git a/mindspore/lite/nnacl/strassen_matmul.h b/mindspore/lite/nnacl/gather_parameter.h similarity index 56% rename from mindspore/lite/nnacl/strassen_matmul.h rename to mindspore/lite/nnacl/gather_parameter.h index 4fff48219d..3b606476b6 100644 --- a/mindspore/lite/nnacl/strassen_matmul.h +++ b/mindspore/lite/nnacl/gather_parameter.h @@ -14,20 +14,15 @@ * limitations under the License. */ -#ifndef MINDSPORE_LITE_NNACL_STRASSEN_MATMUL_H_ -#define MINDSPORE_LITE_NNACL_STRASSEN_MATMUL_H_ +#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_NNACL_GATHER_PARAMETER_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_NNACL_GATHER_PARAMETER_H_ #include "nnacl/op_base.h" -/* hw*inc4 X inc4*oc4 */ -typedef struct StrassenMatMulParameter { - OpParameter op_parameter; - int row_; /* h * w */ - int col_; /* oc4 / 4 */ - int deep_; /* inc4 / 4 */ - int a_stride_; /* h * w * 4 */ - int b_stride_; /* inc4 * 4 */ - int c_stride_; /* h * w * 4 */ -} StrassenMatMulParameter; +typedef struct GatherParameter { + OpParameter op_parameter_; + int axis_; + int batchDims_; +} GatherParameter; -#endif // MINDSPORE_LITE_NNACL_STRASSEN_MATMUL_H_ +#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_NNACL_GATHER_PARAMETER_H_ diff --git a/mindspore/lite/nnacl/int8/common_func.h b/mindspore/lite/nnacl/int8/common_func.h index bc8b35a0b8..1e1b965d34 100644 --- a/mindspore/lite/nnacl/int8/common_func.h +++ b/mindspore/lite/nnacl/int8/common_func.h @@ -49,6 +49,10 @@ void ConvDwInt8Center(int8_t *dst, const int16_t *src, const int16_t *weight, co size_t width, size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel, size_t in_sh_step, size_t in_sw_step, size_t in_kh_step, size_t in_kw_step, int out_multiplier, int left_shift, int right_shift, int32_t out_zp, int32_t acc_min, int32_t acc_max); +void ConvDwInt8Row(int32_t *output_ptr, const int8_t *input_ptr, const int16_t *weight_ptr, int num_pixels, + int output_channel, int input_step, int8_t input_zp); +void ConvDwInt8PostAlign4(int8_t *dst, int32_t *buffer, int num_pixels, int32_t output_zp, int32_t out_multiplier, + int32_t left_shift, int32_t right_shift, int32_t acc_min, int32_t acc_max); #endif #ifdef __cplusplus diff --git a/mindspore/lite/nnacl/int8/conv_depthwise_int8.c b/mindspore/lite/nnacl/int8/conv_depthwise_int8.c index fc17397387..7e7d9d4067 100644 --- a/mindspore/lite/nnacl/int8/conv_depthwise_int8.c +++ b/mindspore/lite/nnacl/int8/conv_depthwise_int8.c @@ -20,6 +20,99 @@ #include "nnacl/int8/common_func.h" /*conv depthwise int8 begin*/ +// only support perlayer +#ifndef ENABLE_ARM64 +void ConvDwInt8Row(int32_t *output_ptr, const int8_t *input_ptr, const int16_t *weight_ptr, int num_pixels, + int output_channel, int input_step, int8_t input_zp) { + for (int i = 0; i < num_pixels; i++) { + for (int c = 0; c < output_channel; c++) { + const int16_t input = input_ptr[c] - input_zp; + *output_ptr++ += input * weight_ptr[c]; + } + input_ptr += input_step; + } +} +#endif + +void ConvDwInt8Post(int8_t *dst, int32_t *buffer, int num_pixels, 
int32_t output_zp, int32_t out_multiplier,
+                    int32_t left_shift, int32_t right_shift, int32_t acc_min, int32_t acc_max) {
+  int align_num = 0;
+#ifdef ENABLE_ARM64
+  align_num = num_pixels / 4 * 4;
+  ConvDwInt8PostAlign4(dst, buffer, align_num, output_zp, out_multiplier, left_shift, right_shift, acc_min, acc_max);
+#endif
+  for (int i = align_num; i < num_pixels; i++) {
+    buffer[i] = RoundingDivideByPOT(
+      SaturatingRoundingDoublingHighMul(buffer[i] * (1 << (unsigned int)left_shift), out_multiplier), -right_shift);
+    buffer[i] += output_zp;
+    buffer[i] = MSMAX(buffer[i], acc_min);
+    buffer[i] = MSMIN(buffer[i], acc_max);
+    dst[i] = (int8_t)buffer[i];
+  }
+}
+
+void ConvDwInt8(int8_t *output_data, int32_t *row_buffer, const int8_t *input_data, const int16_t *weight_data,
+                const int32_t *bias_data, const ConvParameter *conv_param, int task_id) {
+  int h_step = UP_DIV(conv_param->output_h_, conv_param->thread_num_);
+  int h_start = h_step * task_id;
+  int h_end = MSMIN(h_start + h_step, conv_param->output_h_);
+
+  int out_multiplier = conv_param->conv_quant_arg_.quant_multiplier_[0];
+  int left_shift = conv_param->conv_quant_arg_.left_shift_[0];
+  int right_shift = conv_param->conv_quant_arg_.right_shift_[0];
+
+  int input_zp = conv_param->conv_quant_arg_.input_quant_args_[0].zp_;
+  int output_zp = conv_param->conv_quant_arg_.output_quant_args_[0].zp_;
+  int acc_min = conv_param->conv_quant_arg_.out_act_min_[0];
+  int acc_max = conv_param->conv_quant_arg_.out_act_max_[0];
+
+  for (int b = 0; b < conv_param->output_batch_; b++) {
+    const int8_t *src = input_data + b * conv_param->input_h_ * conv_param->input_w_ * conv_param->input_channel_;
+    int8_t *dst = output_data + b * conv_param->output_h_ * conv_param->output_w_ * conv_param->output_channel_;
+    for (int oh = h_start; oh < h_end; oh++) {
+      int8_t *dst_data = dst + oh * conv_param->output_w_ * conv_param->output_channel_;
+
+      int ih_origin = oh * conv_param->stride_h_ - conv_param->pad_u_;
+      int start_kh = MSMAX(0, UP_DIV(-ih_origin, conv_param->dilation_h_));
+      int end_kh = MSMIN(conv_param->kernel_h_, UP_DIV(conv_param->input_h_ - ih_origin, conv_param->dilation_h_));
+
+      // init acc
+      for (int ow = 0; ow < conv_param->output_w_; ow++) {
+        memcpy(row_buffer + ow * conv_param->output_channel_, bias_data,
+               conv_param->output_channel_ * sizeof(int32_t));
+      }
+      for (int kh = start_kh; kh < end_kh; kh++) {
+        int ih = ih_origin + conv_param->dilation_h_ * kh;
+
+        const int8_t *src_kh = src + ih * conv_param->input_w_ * conv_param->input_channel_;
+        const int16_t *weight_kh = weight_data + kh * conv_param->kernel_w_ * conv_param->output_channel_;
+
+        int in_sw_step = conv_param->stride_w_ * conv_param->input_channel_;
+        for (int kw = 0; kw < conv_param->kernel_w_; kw++) {
+          int out_w_start = MSMAX(
+            0, (conv_param->pad_l_ - conv_param->dilation_w_ * kw + conv_param->stride_w_ - 1) / conv_param->stride_w_);
+          int out_w_end = MSMIN(conv_param->output_w_, (conv_param->input_w_ + conv_param->pad_l_ -
+                                                        conv_param->dilation_w_ * kw + conv_param->stride_w_ - 1) /
+                                                         conv_param->stride_w_);
+
+          int32_t *acc_w = row_buffer + out_w_start * conv_param->output_channel_;
+          int iw_origin = (out_w_start * conv_param->stride_w_) - conv_param->pad_l_ + conv_param->dilation_w_ * kw;
+
+          const int8_t *src_kw = src_kh + iw_origin * conv_param->input_channel_;
+          int num_pixels = out_w_end - out_w_start;
+
+          ConvDwInt8Row(acc_w, src_kw, weight_kh, num_pixels, conv_param->output_channel_, in_sw_step, input_zp);
+          weight_kh += conv_param->output_channel_;
+        }
+      }
+      // 
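/* A note on the out_w_start / out_w_end bounds above: for output column ow the input
 * column is iw = ow * stride_w - pad_l + dilation_w * kw. Requiring iw >= 0 gives
 * ow >= (pad_l - dilation_w * kw) / stride_w, whose ceiling is written as
 * (pad_l - dilation_w * kw + stride_w - 1) / stride_w; requiring iw < input_w gives
 * the matching out_w_end expression. ConvDwInt8Row therefore never touches padding,
 * which is what lets it skip any per-pixel bounds test. Example: pad_l = 1,
 * stride_w = 2, dilation_w = 1, kw = 0 gives out_w_start = (1 + 1) / 2 = 1. */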
post func, acc int32 -> dst int8 + ConvDwInt8Post(dst_data, row_buffer, conv_param->output_w_ * conv_param->output_channel_, output_zp, + out_multiplier, left_shift, right_shift, acc_min, acc_max); + } + } +} +/*conv depthwise int8 end*/ + +/*conv depthwise sliding window int8 begin*/ void DepthwiseBorderPixelInt8(int8_t *dst, const int16_t *src, const int16_t *weight, const int32_t *bias, int height, int width, int in_kh_step, int in_kw_step, int kernel_w, int *out_multiplier, int *left_shift, int *right_shift, int32_t out_zp, int32_t acc_min, int32_t acc_max, @@ -68,14 +161,14 @@ void DepthwiseBorderInt8(int8_t *dst, const int16_t *src, const int16_t *weight, bool per_channel) { int8_t *dst_h = dst + top * sliding->out_h_step_; for (int oh = top; oh < bottom; oh++) { - int ih = oh * conv_param->stride_h_ - conv_param->pad_h_; + int ih = oh * conv_param->stride_h_ - conv_param->pad_u_; int start_kh = MSMAX(0, UP_DIV(-ih, conv_param->dilation_h_)); int end_kh = MSMIN(conv_param->kernel_h_, UP_DIV(conv_param->input_h_ - ih, conv_param->dilation_h_)); const int16_t *src_h = src + ih * sliding->in_h_step_; int8_t *dst_kernel = dst_h + left * sliding->block_channel_; for (int ow = left; ow < right; ow++) { - int iw = ow * conv_param->stride_w_ - conv_param->pad_w_; + int iw = ow * conv_param->stride_w_ - conv_param->pad_l_; int start_kw = MSMAX(0, UP_DIV(-iw, conv_param->dilation_w_)); int end_kw = MSMIN(conv_param->kernel_w_, UP_DIV(conv_param->input_w_ - iw, conv_param->dilation_w_)); const int16_t *src_w = src_h + iw * sliding->block_channel_; @@ -153,8 +246,8 @@ void DepthwiseCenterInt8(int8_t *dst, const int16_t *src, const int16_t *weight, } #endif -void ConvDwInt8(int8_t *output_data, const int16_t *input_data, const int16_t *weight_data, const int32_t *bias_data, - const ConvParameter *conv_param, const SlidingWindowParam *sliding, int task_id) { +void ConvDwSWInt8(int8_t *output_data, const int16_t *input_data, const int16_t *weight_data, const int32_t *bias_data, + const ConvParameter *conv_param, const SlidingWindowParam *sliding, int task_id) { const int16_t *src = input_data; int8_t *dst = output_data; bool per_channel = conv_param->conv_quant_arg_.per_channel_ & FILTER_PER_CHANNEL; @@ -186,8 +279,8 @@ void ConvDwInt8(int8_t *output_data, const int16_t *input_data, const int16_t *w per_channel); if (sliding->right_ > sliding->left_ && sliding->bottom_ > sliding->top_) { - int in_h_start = sliding->top_ * conv_param->stride_h_ - conv_param->pad_h_; - int in_w_start = sliding->left_ * conv_param->stride_w_ - conv_param->pad_w_; + int in_h_start = sliding->top_ * conv_param->stride_h_ - conv_param->pad_u_; + int in_w_start = sliding->left_ * conv_param->stride_w_ - conv_param->pad_l_; const int16_t *in_t = src_data + in_h_start * sliding->in_h_step_ + in_w_start * sliding->block_channel_; int8_t *out_t = dst_data + sliding->top_ * sliding->out_h_step_ + sliding->left_ * sliding->block_channel_; #ifdef ENABLE_ARM64 @@ -215,7 +308,7 @@ void ConvDwInt8(int8_t *output_data, const int16_t *input_data, const int16_t *w } // batch loop // output nhwc4 } -/*conv depthwise int8 end*/ +/*conv depthwise sliding window int8 end*/ /*deconv depthwise int8 begin*/ void DeconvDepthwiseBorderPixelInt8(int32_t *dst, const int16_t *src, const int16_t *weight, int height, int width, @@ -241,14 +334,14 @@ void DeconvDepthwiseBorderInt8(int32_t *dst, const int16_t *src, const int16_t * int right, const ConvParameter *conv_param, const SlidingWindowParam *sliding) { const int16_t *src_h = src + top * 
sliding->out_h_step_; for (int ih = top; ih < bottom; ih++) { - int oh = ih * conv_param->stride_h_ - conv_param->pad_h_; + int oh = ih * conv_param->stride_h_ - conv_param->pad_u_; int start_kh = MSMAX(0, UP_DIV(-oh, conv_param->dilation_h_)); int end_kh = MSMIN(conv_param->kernel_h_, UP_DIV(conv_param->output_h_ - oh, conv_param->dilation_h_)); int32_t *dst_h = dst + oh * sliding->in_h_step_; const int16_t *src_kernel = src_h + left * sliding->block_channel_; for (int iw = left; iw < right; iw++) { - int ow = iw * conv_param->stride_w_ - conv_param->pad_w_; + int ow = iw * conv_param->stride_w_ - conv_param->pad_l_; int start_kw = MSMAX(0, UP_DIV(-ow, conv_param->dilation_w_)); int end_kw = MSMIN(conv_param->kernel_w_, UP_DIV(conv_param->output_w_ - ow, conv_param->dilation_w_)); int32_t *dst_w = dst_h + ow * C4NUM; @@ -341,8 +434,8 @@ void DeconvDwInt8(int8_t *output_data, int32_t *output_buffer, const int16_t *in conv_param->input_w_, conv_param, sliding); if (sliding->right_ > sliding->left_ && sliding->bottom_ > sliding->top_) { - int oh_h_start = sliding->top_ * conv_param->stride_h_ - conv_param->pad_h_; - int oh_w_start = sliding->left_ * conv_param->stride_w_ - conv_param->pad_w_; + int oh_h_start = sliding->top_ * conv_param->stride_h_ - conv_param->pad_u_; + int oh_w_start = sliding->left_ * conv_param->stride_w_ - conv_param->pad_l_; int32_t *out_t = output_buffer + oh_h_start * sliding->in_h_step_ + oh_w_start * sliding->block_channel_; const int16_t *in_t = src_data + sliding->top_ * sliding->out_h_step_ + sliding->left_ * sliding->block_channel_; diff --git a/mindspore/lite/nnacl/int8/conv_depthwise_int8.h b/mindspore/lite/nnacl/int8/conv_depthwise_int8.h index 19a4ad0fd8..004b9dff27 100644 --- a/mindspore/lite/nnacl/int8/conv_depthwise_int8.h +++ b/mindspore/lite/nnacl/int8/conv_depthwise_int8.h @@ -23,8 +23,12 @@ #ifdef __cplusplus extern "C" { #endif -void ConvDwInt8(int8_t *output_data, const int16_t *input_data, const int16_t *weight_data, const int32_t *bias_data, - const ConvParameter *conv_param, const SlidingWindowParam *sliding, int task_id); + +void ConvDwInt8(int8_t *output_data, int32_t *output_row, const int8_t *input_data, const int16_t *weight_data, + const int32_t *bias_data, const ConvParameter *conv_param, int task_id); + +void ConvDwSWInt8(int8_t *output_data, const int16_t *input_data, const int16_t *weight_data, const int32_t *bias_data, + const ConvParameter *conv_param, const SlidingWindowParam *sliding, int task_id); void DeconvDwInt8(int8_t *output_data, int32_t *output_buffer, const int16_t *input_data, const int16_t *weight_data, const int32_t *bias_data, const ConvParameter *conv_param, const SlidingWindowParam *sliding, diff --git a/mindspore/lite/nnacl/int8/conv_int8.c b/mindspore/lite/nnacl/int8/conv_int8.c index 90d3ef2318..756bd13850 100644 --- a/mindspore/lite/nnacl/int8/conv_int8.c +++ b/mindspore/lite/nnacl/int8/conv_int8.c @@ -28,7 +28,7 @@ void IndirectGemmInt8(int8_t *dst, int32_t *tmp_dst, const int8_t *src, const in int32_t out_zp = conv_param->conv_quant_arg_.output_quant_args_[0].zp_; int32_t act_min = conv_param->conv_quant_arg_.out_act_min_[0]; int32_t act_max = conv_param->conv_quant_arg_.out_act_max_[0]; - int oc4 = UP_DIV(output_channel, C4NUM); + #ifdef ENABLE_ARM64 size_t asymmetric = conv_param->conv_quant_arg_.asymmetric_ & FILTER_ASYMMETRIC; size_t per_channel = conv_param->conv_quant_arg_.per_channel_ & FILTER_PER_CHANNEL; @@ -36,6 +36,7 @@ void IndirectGemmInt8(int8_t *dst, int32_t *tmp_dst, const int8_t *src, const in 
output_channel * sizeof(int8_t), input_sum, act_min, act_max, out_zp, out_multiplier, shift_before, shift_after, asymmetric, per_channel); #else + int oc4 = UP_DIV(output_channel, C4NUM); int tile_num = conv_param->tile_num_; int plane_c4 = UP_DIV(kernel_plane, C4NUM); for (int oc = 0; oc < output_channel; oc++) { @@ -198,7 +199,7 @@ void IndirectGemmInt8Opt(int8_t *dst, int32_t *tmp_dst, const int8_t *src, const } } -void Conv3x3Uint8Gemm(int32_t *dst, const int16_t *src, const int16_t *weight, int oc, int ic8, size_t real_cal_num) { +void Conv3x3Int8Gemm(int32_t *dst, const int16_t *src, const int16_t *weight, int oc, int ic8, size_t real_cal_num) { int oc4 = UP_DIV(oc, C4NUM); #ifdef ENABLE_ARM64 IndirectGemmInt16to32_8x4(dst, src, weight, 16, ic8, oc4, oc4 * 4 * 16 * sizeof(int32_t)); @@ -263,7 +264,8 @@ void ConvInt8(int8_t *input_data, int8_t *packed_input, int8_t *packed_weight, c int output_tile_count = UP_DIV(output_count, tile_n); int ic4 = UP_DIV(in_channel, C4NUM); int kernel_plane = kernel_h * kernel_w; - int unit_size = kernel_plane * ic4 * C4NUM; + int plane_block = UP_DIV(kernel_plane, C4NUM); + int unit_size = plane_block * C4NUM * ic4 * C4NUM; int packed_input_size = output_tile_count * tile_n * unit_size; int input_sum_offset; if (conv_param->conv_quant_arg_.per_channel_ & FILTER_PER_CHANNEL) { @@ -297,9 +299,10 @@ void ConvInt8(int8_t *input_data, int8_t *packed_input, int8_t *packed_weight, c out_channel, tmp_input_sum, conv_param); } else { // res part - IndirectGemmInt8(tmp_out, tmp_dst + tmp_dst_offset, gemm_input, packed_weight, bias_data, ic4, kernel_plane, + int8_t *tmp_out_ptr = tmp_out + task_id * tile_n * out_channel; + IndirectGemmInt8(tmp_out_ptr, tmp_dst + tmp_dst_offset, gemm_input, packed_weight, bias_data, ic4, kernel_plane, out_channel, tmp_input_sum, conv_param); - memcpy(output_data + out_offset, tmp_out, real_cal_num * out_channel); + memcpy(output_data + out_offset, tmp_out_ptr, real_cal_num * out_channel); } } } @@ -359,12 +362,272 @@ void ConvInt8Opt(int8_t *input_data, int8_t *packed_input, int8_t *packed_weight kernel_plane, out_channel, tmp_input_sum, conv_param, gemm_func); } else { // res part - IndirectGemmInt8Opt(tmp_out, tmp_dst + tmp_dst_offset, gemm_input, packed_weight, bias_data, ic4, kernel_plane, - out_channel, tmp_input_sum, conv_param, gemm_func); - memcpy(output_data + out_offset, tmp_out, real_cal_num * out_channel); + int8_t *tmp_out_ptr = tmp_out + task_id * tile_n * out_channel; + IndirectGemmInt8Opt(tmp_out_ptr, tmp_dst + tmp_dst_offset, gemm_input, packed_weight, bias_data, ic4, + kernel_plane, out_channel, tmp_input_sum, conv_param, gemm_func); + memcpy(output_data + out_offset, tmp_out_ptr, real_cal_num * out_channel); + } + } + } +} + +void Conv1x1PreOpt(const int8_t *src_input, int8_t *packed_input, int32_t *input_sum, size_t input_channel, + size_t output_channel, size_t plane_size, ConvParameter *conv_param) { + int ic4 = UP_ROUND(input_channel, C4NUM); + size_t hw_8div = plane_size / C8NUM * C8NUM; + size_t hw_8res = plane_size - hw_8div; + size_t ic_4div = input_channel / C4NUM * C4NUM; + int32_t filter_zp = conv_param->conv_quant_arg_.filter_quant_args_[0].zp_; + + if (conv_param->conv_quant_arg_.filter_arg_num_ == 1) { + const int8_t *src_r = src_input; + int8_t *pack_r = packed_input; + /* per layer */ + for (int hwi = 0; hwi < hw_8div; hwi += C8NUM) { + const int8_t *src_ic = src_r; + int8_t *pack_ic = pack_r; + int32_t *input_sum_r = input_sum + hwi; +#ifdef ENABLE_ARM64 + size_t src_stride = input_channel; + 
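/* The aarch64 block below is the vectorized twin of the scalar #else branch further
 * down: it packs an 8-pixel x 4-channel tile of the row-major input into the blocked
 * C8NUM x C4NUM layout (the st1 stores of v0/v1), while the saddlp chains pairwise
 * widen and accumulate the same bytes so that v10/v11 end up holding the eight
 * per-pixel channel sums; labels 3, 4 and 5 handle a residual 1, 2 or 3 input
 * channels, and label 6 multiplies the sums by filter_zp before storing them to
 * input_sum. */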
size_t ic_4res = input_channel - ic_4div; + asm volatile( + "dup v10.4s, wzr \n" + "dup v11.4s, wzr \n" + "mov x20, %[input_sum_r] \n" + "dup v20.4s, %w[filter_zp] \n" + + "mov x10, %[src_ic] \n" + "mov x11, %[pack_ic] \n" + + "mov x0, #0 \n" + "1: \n" + "cmp x0, %[ic_4div] \n" + "add x0, x0, #4\n" + "mov x12, x10 \n" + "add x10, x10, #4\n" + "blt 2f \n" + "cmp %[ic_4res], #0\n" + "beq 6f \n" + "cmp %[ic_4res], #1\n" + "beq 3f \n" + "cmp %[ic_4res], #2\n" + "beq 4f \n" + "cmp %[ic_4res], #3\n" + "beq 5f \n" + + "2: \n" + "ld1 {v0.s}[0], [x12], %[src_stride]\n" + "ld1 {v0.s}[1], [x12], %[src_stride]\n" + "ld1 {v0.s}[2], [x12], %[src_stride]\n" + "ld1 {v0.s}[3], [x12], %[src_stride]\n" + "ld1 {v1.s}[0], [x12], %[src_stride]\n" + "ld1 {v1.s}[1], [x12], %[src_stride]\n" + "ld1 {v1.s}[2], [x12], %[src_stride]\n" + "ld1 {v1.s}[3], [x12], %[src_stride]\n" + + "st1 {v0.16b}, [x11], #16\n" + "st1 {v1.16b}, [x11], #16\n" + + "saddlp v4.8h, v0.16b \n" + "saddlp v5.8h, v1.16b \n" + + "saddlp v0.4s, v4.8h \n" + "saddlp v1.4s, v5.8h \n" + + "add v10.4s, v10.4s, v0.4s \n" + "add v11.4s, v11.4s, v1.4s \n" + "b 1b \n" + + "3: \n" /* col res 1 */ + "dup v0.4s, wzr \n" + "dup v1.4s, wzr \n" + + "ld1 {v0.b}[0], [x12], %[src_stride]\n" + "ld1 {v0.b}[4], [x12], %[src_stride]\n" + "ld1 {v0.b}[8], [x12], %[src_stride]\n" + "ld1 {v0.b}[12], [x12], %[src_stride]\n" + "ld1 {v1.b}[0], [x12], %[src_stride]\n" + "ld1 {v1.b}[4], [x12], %[src_stride]\n" + "ld1 {v1.b}[8], [x12], %[src_stride]\n" + "ld1 {v1.b}[12], [x12], %[src_stride]\n" + + "st1 {v0.16b}, [x11], #16\n" + "st1 {v1.16b}, [x11], #16\n" + "saddlp v4.8h, v0.16b \n" + "saddlp v5.8h, v1.16b \n" + "saddlp v0.4s, v4.8h \n" + "saddlp v1.4s, v5.8h \n" + "add v10.4s, v10.4s, v0.4s \n" + "add v11.4s, v11.4s, v1.4s \n" + "b 6f \n" + + "4: \n" /* col res 2 */ + "dup v0.4s, wzr \n" + "dup v1.4s, wzr \n" + + "ld1 {v0.h}[0], [x12], %[src_stride]\n" + "ld1 {v0.h}[2], [x12], %[src_stride]\n" + "ld1 {v0.h}[4], [x12], %[src_stride]\n" + "ld1 {v0.h}[6], [x12], %[src_stride]\n" + "ld1 {v1.h}[0], [x12], %[src_stride]\n" + "ld1 {v1.h}[2], [x12], %[src_stride]\n" + "ld1 {v1.h}[4], [x12], %[src_stride]\n" + "ld1 {v1.h}[6], [x12], %[src_stride]\n" + + "st1 {v0.16b}, [x11], #16\n" + "st1 {v1.16b}, [x11], #16\n" + "saddlp v4.8h, v0.16b \n" + "saddlp v5.8h, v1.16b \n" + "saddlp v0.4s, v4.8h \n" + "saddlp v1.4s, v5.8h \n" + "add v10.4s, v10.4s, v0.4s \n" + "add v11.4s, v11.4s, v1.4s \n" + "b 6f \n" + + "5: \n" /* col res 3 */ + "dup v0.4s, wzr \n" + "dup v1.4s, wzr \n" + "add x13, x12, #2 \n" + + "ld1 {v0.h}[0], [x12], %[src_stride]\n" + "ld1 {v0.b}[2], [x13], %[src_stride]\n" + "ld1 {v0.h}[2], [x12], %[src_stride]\n" + "ld1 {v0.b}[6], [x13], %[src_stride]\n" + "ld1 {v0.h}[4], [x12], %[src_stride]\n" + "ld1 {v0.b}[10], [x13], %[src_stride]\n" + "ld1 {v0.h}[6], [x12], %[src_stride]\n" + "ld1 {v0.b}[14], [x13], %[src_stride]\n" + "ld1 {v1.h}[0], [x12], %[src_stride]\n" + "ld1 {v1.b}[2], [x13], %[src_stride]\n" + "ld1 {v1.h}[2], [x12], %[src_stride]\n" + "ld1 {v1.b}[6], [x13], %[src_stride]\n" + "ld1 {v1.h}[4], [x12], %[src_stride]\n" + "ld1 {v1.b}[10], [x13], %[src_stride]\n" + "ld1 {v1.h}[6], [x12], %[src_stride]\n" + "ld1 {v1.b}[14], [x13], %[src_stride]\n" + + "st1 {v0.16b}, [x11], #16\n" + "st1 {v1.16b}, [x11], #16\n" + "saddlp v4.8h, v0.16b \n" + "saddlp v5.8h, v1.16b \n" + "saddlp v0.4s, v4.8h \n" + "saddlp v1.4s, v5.8h \n" + "add v10.4s, v10.4s, v0.4s \n" + "add v11.4s, v11.4s, v1.4s \n" + "b 6f \n" + + "6: \n" + "mul v10.4s, v10.4s, v20.4s \n" + "mul v11.4s, v11.4s, v20.4s \n" + + 
"st1 {v10.4s}, [x20], #16 \n" + "st1 {v11.4s}, [x20], #16 \n" + + : + : [ src_ic ] "r"(src_ic), [ pack_ic ] "r"(pack_ic), [ input_sum_r ] "r"(input_sum_r), + [ src_stride ] "r"(src_stride), [ ic_4div ] "r"(ic_4div), [ ic_4res ] "r"(ic_4res), + [ filter_zp ] "r"(filter_zp) + : "x0", "x1", "x10", "x11", "x12", "x13", "x20", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v10", "v11", + "v20"); +#else + int32_t tmp_sum_value[8] = {0}; + for (int ici = 0; ici < ic_4div; ici += C4NUM) { + for (int i = 0; i < C8NUM; i++) { + tmp_sum_value[i] += src_ic[0 + i * input_channel]; + tmp_sum_value[i] += src_ic[1 + i * input_channel]; + tmp_sum_value[i] += src_ic[2 + i * input_channel]; + tmp_sum_value[i] += src_ic[3 + i * input_channel]; + pack_ic[0 + i * C4NUM] = src_ic[0 + i * input_channel]; + pack_ic[1 + i * C4NUM] = src_ic[1 + i * input_channel]; + pack_ic[2 + i * C4NUM] = src_ic[2 + i * input_channel]; + pack_ic[3 + i * C4NUM] = src_ic[3 + i * input_channel]; + } + src_ic += C4NUM; + pack_ic += C4NUM * C8NUM; + } + for (int ici = ic_4div; ici < input_channel; ici += 1) { + for (int i = 0; i < C8NUM; i++) { + tmp_sum_value[i] += src_ic[i * input_channel]; + pack_ic[i * C4NUM] = src_ic[i * input_channel]; + } + src_ic += 1; + pack_ic += 1; + } + + for (int i = 0; i < C8NUM; i++) { + input_sum_r[i] = tmp_sum_value[i] * filter_zp; + } +#endif + src_r += input_channel * C8NUM; + pack_r += ic4 * C8NUM; + } + + if (hw_8div != plane_size) { + memset(pack_r, 0, C8NUM * ic4); + for (int hwi = hw_8div; hwi < plane_size; hwi += 1) { + int32_t tmp_sum_value = 0; + const int8_t *src_ic = src_r; + int8_t *pack_ic = pack_r; + for (int ici = 0; ici < ic_4div; ici += C4NUM) { + tmp_sum_value += src_ic[0]; + tmp_sum_value += src_ic[1]; + tmp_sum_value += src_ic[2]; + tmp_sum_value += src_ic[3]; + pack_ic[0] = src_ic[0]; + pack_ic[1] = src_ic[1]; + pack_ic[2] = src_ic[2]; + pack_ic[3] = src_ic[3]; + src_ic += C4NUM; + pack_ic += C4NUM * C8NUM; + } + for (int ici = ic_4div; ici < input_channel; ici += 1) { + tmp_sum_value += src_ic[0]; + pack_ic[0] = src_ic[0]; + src_ic += 1; + pack_ic += 1; + } + input_sum[hwi] = tmp_sum_value * filter_zp; + src_r += input_channel; + pack_r += C4NUM; + } + for (int hwi = plane_size; hwi < plane_size + hw_8res; hwi++) { + input_sum[hwi] = 0; } } + } else { + /* per channel */ + RowMajor2Row4x8MajorInt8(src_input, packed_input, plane_size, input_channel); + PackInputSum8x4Int8(packed_input, input_sum, input_channel, output_channel, plane_size, conv_param); } + return; +} + +void Conv1x1Int8Opt(const int8_t *packed_input, const int8_t *packed_weight, int8_t *dst, const int32_t *input_sum, + const int32_t *bias, int row, int col, int deep4, ConvParameter *conv_param, + MATMUL_OPT_R_FUNC matmul_func) { + matmul_func(packed_input, packed_weight, dst, row, col, deep4, conv_param->output_channel_, input_sum, bias, + conv_param->conv_quant_arg_.left_shift_, conv_param->conv_quant_arg_.right_shift_, + conv_param->conv_quant_arg_.quant_multiplier_, conv_param->conv_quant_arg_.output_quant_args_[0].zp_, + conv_param->conv_quant_arg_.out_act_min_[0], conv_param->conv_quant_arg_.out_act_max_[0], false); + return; +} + +void Conv1x1Int8(const int8_t *packed_input, const int8_t *packed_weight, int8_t *dst, const int32_t *input_sum, + const int32_t *bias, int row, int col, int deep16, ConvParameter *conv_param) { +#ifdef ENABLE_ARM64 + MatmulInt8Neon64(packed_input, packed_weight, dst, UP_ROUND(row, C4NUM), UP_ROUND(col, C4NUM), deep16, input_sum, + bias, 
conv_param->conv_quant_arg_.out_act_min_[0], conv_param->conv_quant_arg_.out_act_max_[0], + conv_param->conv_quant_arg_.output_quant_args_[0].zp_, + conv_param->conv_quant_arg_.quant_multiplier_[0], conv_param->conv_quant_arg_.left_shift_[0], + conv_param->conv_quant_arg_.right_shift_[0], row, col, conv_param->output_channel_); +#else + MatMulInt8_16x4_r(packed_input, packed_weight, dst, row, col, deep16, conv_param->output_channel_, input_sum, bias, + conv_param->conv_quant_arg_.left_shift_, conv_param->conv_quant_arg_.right_shift_, + conv_param->conv_quant_arg_.quant_multiplier_, + conv_param->conv_quant_arg_.output_quant_args_[0].zp_, conv_param->conv_quant_arg_.out_act_min_[0], + conv_param->conv_quant_arg_.out_act_max_[0], false); +#endif + return; } // int8 convolution 3x3 @@ -391,15 +654,15 @@ void Conv3x3Int8(int16_t *input_data, int16_t *transed_weight, const int32_t *bi int start_index = thread_id * TILE_NUM; int real_cal_num = (output_count - start_index) < TILE_NUM ? (output_count - start_index) : TILE_NUM; - Conv3x3Uint8InputTransform(input_data + in_batch_offset, tile_buffer + task_id * tile_buffer_offset, - block_unit_buffer + task_id * block_unit_buffer_offset, start_index, real_cal_num, - out_w_block, conv_param); + Conv3x3Int8InputTransform(input_data + in_batch_offset, tile_buffer + task_id * tile_buffer_offset, + block_unit_buffer + task_id * block_unit_buffer_offset, start_index, real_cal_num, + out_w_block, conv_param); - Conv3x3Uint8Gemm(tmp_dst_buffer + task_id * tmp_dst_buffer_offset, tile_buffer + task_id * tile_buffer_offset, - transed_weight, output_channel, ic8, real_cal_num); + Conv3x3Int8Gemm(tmp_dst_buffer + task_id * tmp_dst_buffer_offset, tile_buffer + task_id * tile_buffer_offset, + transed_weight, output_channel, ic8, real_cal_num); - Conv3x3Uint8OutputTransform(tmp_dst_buffer + task_id * tmp_dst_buffer_offset, tmp_out + tmp_out_batch_offset, - bias_data, start_index, real_cal_num, out_w_block, conv_param); + Conv3x3Int8OutputTransform(tmp_dst_buffer + task_id * tmp_dst_buffer_offset, tmp_out + tmp_out_batch_offset, + bias_data, start_index, real_cal_num, out_w_block, conv_param); } } } diff --git a/mindspore/lite/nnacl/int8/conv_int8.h b/mindspore/lite/nnacl/int8/conv_int8.h index 730b031cef..5741ee3117 100644 --- a/mindspore/lite/nnacl/int8/conv_int8.h +++ b/mindspore/lite/nnacl/int8/conv_int8.h @@ -25,6 +25,8 @@ #include "nnacl/conv_parameter.h" #include "nnacl/winograd_utils.h" #include "nnacl/quantization/quantize.h" +#include "nnacl/matmul_parameter.h" +#include "nnacl/int8/matmul_int8.h" typedef void (*GEMM_FUNC)(int8_t *dst, const int8_t *src, const int8_t *weight, const int32_t *bias, size_t ksize, size_t ic4, size_t output_channel, size_t offset, const int32_t *input_sum, size_t act_min, @@ -51,6 +53,15 @@ void ConvInt8Opt(int8_t *input_data, int8_t *packed_input, int8_t *packed_weight int32_t *tmp_dst, int8_t *tmp_out, int8_t *output_data, int32_t *input_sum, int task_id, ConvParameter *conv_param, GEMM_FUNC gemm_func); +// int8 convolution 1x1 +void Conv1x1PreOpt(const int8_t *src_input, int8_t *packed_input, int32_t *input_sum, size_t input_channel, + size_t output_channel, size_t plane_size, ConvParameter *conv_param); +void Conv1x1Int8(const int8_t *packed_input, const int8_t *packed_weight, int8_t *dst, const int32_t *input_sum, + const int32_t *bias, int row, int col, int deep16, ConvParameter *conv_param); +void Conv1x1Int8Opt(const int8_t *packed_input, const int8_t *packed_weight, int8_t *dst, const int32_t *input_sum, + const int32_t 
*bias, int row, int col, int deep4, ConvParameter *conv_param, + MATMUL_OPT_R_FUNC matmul_func); + // int8 convolution 3x3 void Conv3x3Int8(int16_t *input_data, int16_t *transed_weight, const int32_t *bias_data, int8_t *output_data, int16_t *tile_buffer, int16_t *block_unit_buffer, int32_t *tmp_dst_buffer, int8_t *tmp_out, diff --git a/mindspore/lite/nnacl/int8/deconv.c b/mindspore/lite/nnacl/int8/deconv.c index b1389f1953..2195f2728b 100644 --- a/mindspore/lite/nnacl/int8/deconv.c +++ b/mindspore/lite/nnacl/int8/deconv.c @@ -33,8 +33,8 @@ int DeConvPostInt8C8(const int32_t *src, const int32_t *bias, int32_t *tmp, int8 for (int ih = 0; ih < conv_param->input_h_; ih++) { for (int iw = 0; iw < conv_param->input_w_; iw++) { - int oh = ih * conv_param->stride_h_ - conv_param->pad_h_; - int ow = iw * conv_param->stride_w_ - conv_param->pad_w_; + int oh = ih * conv_param->stride_h_ - conv_param->pad_u_; + int ow = iw * conv_param->stride_w_ - conv_param->pad_l_; int kh_start = MSMAX(0, UP_DIV(-oh, conv_param->dilation_h_)); int kh_end = MSMIN(conv_param->kernel_h_, UP_DIV(conv_param->output_h_ - oh, conv_param->dilation_h_)); @@ -88,8 +88,8 @@ int DeConvPostInt8C4(const int32_t *src, const int32_t *bias, int32_t *tmp, int8 for (int ih = 0; ih < conv_param->input_h_; ih++) { for (int iw = 0; iw < conv_param->input_w_; iw++) { - int oh = ih * conv_param->stride_h_ - conv_param->pad_h_; - int ow = iw * conv_param->stride_w_ - conv_param->pad_w_; + int oh = ih * conv_param->stride_h_ - conv_param->pad_u_; + int ow = iw * conv_param->stride_w_ - conv_param->pad_l_; int kh_start = MSMAX(0, UP_DIV(-oh, conv_param->dilation_h_)); int kh_end = MSMIN(conv_param->kernel_h_, UP_DIV(conv_param->output_h_ - oh, conv_param->dilation_h_)); @@ -172,73 +172,7 @@ void DeConvPackWeightSum(int8_t *weight, int32_t *weight_sum, int32_t input_zp, void DeConvPackInputSum(const int8_t *src, int32_t *dst, int32_t filter_zp, size_t row4, size_t col16, bool suppport_opt) { /* optimize normal -> same layout */ -#ifdef ENABLE_ARM64 - asm volatile( - "mov x10, %[src] \n" - "mov x11, %[dst] \n" - "dup v15.4s, %w[filter_zp] \n" - - "mov x0, #0 \n" - "1: \n" - "cmp x0, %[row4] \n" - "beq 4f \n" - "add x0, x0, #4\n" - "dup v10.4s, wzr \n" - "mov x2, #0 \n" - - "2: \n" - "cmp x2, %[col16] \n" - "beq 3f \n" - "add x2, x2, #16\n" - - "ld1 {v0.16b}, [x10], #16\n" - "ld1 {v1.16b}, [x10], #16\n" - "ld1 {v2.16b}, [x10], #16\n" - "ld1 {v3.16b}, [x10], #16\n" - - "saddlp v4.8h, v0.16b \n" - "saddlp v5.8h, v1.16b \n" - "saddlp v6.8h, v2.16b \n" - "saddlp v7.8h, v3.16b \n" - - "saddlp v0.4S, v4.8h \n" - "saddlp v1.4S, v5.8h \n" - "saddlp v2.4S, v6.8h \n" - "saddlp v3.4S, v7.8h \n" - - "addv s4, v0.4S \n" - "addv s5, v1.4S \n" - "addv s6, v2.4S \n" - "addv s7, v3.4S \n" - - "mov v0.s[0], v4.s[0] \n" - "mov v0.s[1], v5.s[0] \n" - "mov v0.s[2], v6.s[0] \n" - "mov v0.s[3], v7.s[0] \n" - - "add v10.4s, v10.4s, v0.4s \n" - "b 2b\n" - - "3: \n" - "mul v10.4s, v10.4s, v15.4s \n" - "st1 {v10.4s}, [x11], #16 \n" - "beq 1b \n" - - "4: \n" - - : - : [ dst ] "r"(dst), [ src ] "r"(src), [ row4 ] "r"(row4), [ col16 ] "r"(col16), [ filter_zp ] "r"(filter_zp) - : "x0", "x1", "x2", "x3", "x10", "x11", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v10", "v15"); -#else - for (int r = 0; r < row4; r++) { - int32_t tmp_value = 0; - for (int c = 0; c < col16; c++) { - int r4div = r / C4NUM, r4mod = r % C4NUM, c16div = c / C16NUM, c16mod = c % C16NUM; - int src_index = r4div * C4NUM * col16 + c16div * C16NUM * C4NUM + r4mod * C16NUM + c16mod; - tmp_value += 
src[src_index];
-    }
-  }
-#endif
+  PackInputSum16x4PerLayer(src, dst, filter_zp, row4, col16);
   return;
 }
diff --git a/mindspore/lite/nnacl/int8/div_int8.c b/mindspore/lite/nnacl/int8/div_int8.c
index f3b8d86b66..1f852cbb39 100644
--- a/mindspore/lite/nnacl/int8/div_int8.c
+++ b/mindspore/lite/nnacl/int8/div_int8.c
@@ -29,8 +29,8 @@ int DivInt8(int8_t *input0_data, int8_t *input1_data, int8_t *output_data, int64
   }
 
   int recip_shift;
-  const int32_t input1_inv = (input1_val > 0) ? ComputerReciproal(input1_val, 31, &recip_shift)
-                                              : -ComputerReciproal(-input1_val, 31, &recip_shift);
+  const int32_t input1_inv = (input1_val > 0) ? ComputerReciprocal(input1_val, 31, &recip_shift)
+                                              : -ComputerReciprocal(-input1_val, 31, &recip_shift);
   const int leading_bits = CountLeadingSignBits(input0_val);
   const int32_t raw_data =
     SaturatingRoundingDoublingHighMul(input0_val * (1 << (unsigned int)leading_bits), input1_inv);
diff --git a/mindspore/lite/nnacl/int8/gatherNd_int8.c b/mindspore/lite/nnacl/int8/gatherNd_int8.c
new file mode 100644
index 0000000000..02141cf856
--- /dev/null
+++ b/mindspore/lite/nnacl/int8/gatherNd_int8.c
@@ -0,0 +1,34 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "nnacl/int8/gatherNd_int8.h"
+#include <math.h>
+#include "nnacl/errorcode.h"
+
+int GatherNdInt8(int8_t *input, int8_t *output, int *in_offset, int area, int count, GatherQuantArg param) {
+  double alpha = param.alpha_;
+  int z1 = param.zp_in_;
+  int z2 = param.zp_out_;
+  for (int i = 0; i < count; ++i) {
+    for (int j = 0; j < area; ++j) {
+      int32_t tmp = round(alpha * (input[in_offset[i] + j] - z1)) + z2;
+      tmp = tmp > 127 ? 127 : tmp;
+      tmp = tmp < -128 ? -128 : tmp;
+      output[area * i + j] = (int8_t)tmp;
+    }
+  }
+  return NNACL_OK;
+}
diff --git a/mindspore/lite/nnacl/int8/gatherNd_int8.h b/mindspore/lite/nnacl/int8/gatherNd_int8.h
new file mode 100644
index 0000000000..0ad07795fe
--- /dev/null
+++ b/mindspore/lite/nnacl/int8/gatherNd_int8.h
@@ -0,0 +1,31 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License. 
+ */
+
+#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_NNACL_INT8_GATHERND_INT8_H_
+#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_NNACL_INT8_GATHERND_INT8_H_
+
+#include "nnacl/op_base.h"
+#include "nnacl/quantization/quantize.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+int GatherNdInt8(int8_t *in_data, int8_t *out_data, int *in_offset, int area, int count, GatherQuantArg param);
+#ifdef __cplusplus
+}
+#endif
+
+#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_NNACL_INT8_GATHERND_INT8_H_
diff --git a/mindspore/lite/nnacl/int8/gather_int8.c b/mindspore/lite/nnacl/int8/gather_int8.c
new file mode 100644
index 0000000000..042e24b5f8
--- /dev/null
+++ b/mindspore/lite/nnacl/int8/gather_int8.c
@@ -0,0 +1,45 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+#include "nnacl/int8/gather_int8.h"
+#include <math.h>
+#include "nnacl/op_base.h"
+#include "nnacl/quantization/quantize.h"
+#include "nnacl/errorcode.h"
+
+int GatherInt8(int8_t *in_data, int8_t *out_data, int outer_size, int inner_size, int limit, int *indices,
+               int indices_element_size, GatherQuantArg para) {
+  double alpha = para.alpha_;
+  int z1 = para.zp_in_;
+  int z2 = para.zp_out_;
+  int i, m, j;
+  for (m = 0; m < outer_size; ++m) {
+    const int8_t *inputm = in_data + inner_size * m * limit;
+    int8_t *outputm = out_data + inner_size * m * indices_element_size;
+    for (i = 0; i < indices_element_size; ++i) {
+      if (indices[i] < 0 || indices[i] >= limit) {
+        return NNACL_ERR;
+      }
+      for (j = 0; j < inner_size; ++j) {
+        int32_t tmp = round(alpha * (inputm[indices[i] * inner_size + j] - z1)) + z2;
+        tmp = tmp > 127 ? 127 : tmp;
+        tmp = tmp < -128 ? -128 : tmp;
+        outputm[i * inner_size + j] = (int8_t)tmp;
+      }
+    }
+  }
+  return NNACL_OK;
+}
diff --git a/mindspore/lite/nnacl/int8/gather_int8.h b/mindspore/lite/nnacl/int8/gather_int8.h
new file mode 100644
index 0000000000..4a06e08d6c
--- /dev/null
+++ b/mindspore/lite/nnacl/int8/gather_int8.h
@@ -0,0 +1,32 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License. 
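Both GatherInt8 above and GatherNdInt8 earlier requantize elementwise as out = round(alpha * (in - zp_in)) + zp_out with alpha = input_scale / output_scale, then clamp to [-128, 127]. A self-contained sketch of that scalar step (the numbers are illustrative only):

#include <math.h>
#include <stdint.h>
#include <stdio.h>

static int8_t RequantInt8(int8_t in, double alpha, int zp_in, int zp_out) {
  int32_t tmp = (int32_t)round(alpha * (in - zp_in)) + zp_out;
  if (tmp > 127) tmp = 127;
  if (tmp < -128) tmp = -128;
  return (int8_t)tmp;
}

int main(void) {
  /* input_scale 0.5, output_scale 0.25 -> alpha 2.0; zero points 0 (made up) */
  printf("%d\n", RequantInt8(10, 2.0, 0, 0));  /* prints 20 */
  printf("%d\n", RequantInt8(100, 2.0, 0, 0)); /* saturates to 127 */
  return 0;
}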
+ */ + +#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_NNACL_INT8_GATHER_INT8_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_NNACL_INT8_GATHER_INT8_H_ + +#include "nnacl/op_base.h" +#include "nnacl/quantization/quantize.h" + +#ifdef __cplusplus +extern "C" { +#endif +int GatherInt8(int8_t *in_data, int8_t *out_data, int outer_size, int inner_size, int limit, int *indices, + int indices_element_size, GatherQuantArg para); +#ifdef __cplusplus +} +#endif + +#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_NNACL_INT8_GATHER_INT8_H_ diff --git a/mindspore/lite/nnacl/int8/matmul_int8.c b/mindspore/lite/nnacl/int8/matmul_int8.c index aa93dacf82..1135cc5e09 100644 --- a/mindspore/lite/nnacl/int8/matmul_int8.c +++ b/mindspore/lite/nnacl/int8/matmul_int8.c @@ -28,6 +28,36 @@ void RowMajor2Row8MajorInt8(int8_t *src_ptr, int8_t *dst_ptr, int row, int col) } } +void RowMajor2Row4x16MajorInt8(int8_t *src_ptr, int8_t *dst_ptr, int row, int col) { + int col16 = UP_ROUND(col, C16NUM); + for (int r = 0; r < row; r++) { + int rd4 = r / C4NUM; + int rm4 = r % C4NUM; + for (int c = 0; c < col; c++) { + int cd16 = c / C16NUM; + int cm16 = c % C16NUM; + int dst_index = rd4 * col16 * C4NUM + cd16 * C4NUM * C16NUM + rm4 * C16NUM + cm16; + int src_index = r * col + c; + dst_ptr[dst_index] = src_ptr[src_index]; + } + } +} + +void RowMajor2Row8x4MajorInt8(const int8_t *src_ptr, int8_t *dst_ptr, int row, int col) { + int col4 = UP_ROUND(col, C4NUM); + for (int r = 0; r < row; r++) { + int rd8 = r / C8NUM; + int rm8 = r % C8NUM; + for (int c = 0; c < col; c++) { + int cd4 = c / C4NUM; + int cm4 = c % C4NUM; + int dst_index = rd8 * col4 * C8NUM + cd4 * C8NUM * C4NUM + rm8 * C4NUM + cm4; + int src_index = r * col + c; + dst_ptr[dst_index] = src_ptr[src_index]; + } + } +} + void MatrixPack4x16UnitInt8(int8_t *src, int8_t *dst, int row, int col, int stride) { for (int r = 0; r < row; r++) { int8_t *src_r = src + r * stride; @@ -37,6 +67,29 @@ void MatrixPack4x16UnitInt8(int8_t *src, int8_t *dst, int row, int col, int stri return; } +void MatrixEmptyInt8(int8_t *dst, int row, int col) { + for (int r = 0; r < row; r++) { + int8_t *dst_r = dst + r * C16NUM; + memset(dst_r, 0, col * sizeof(int8_t)); + } + return; +} + +void RowMajor2Row4x8MajorInt8(const int8_t *src_ptr, int8_t *dst_ptr, int row, int col) { + /* Row-major to row16x4-major (block row-major) */ + int col4 = UP_ROUND(col, C4NUM); + for (int r = 0; r < row; r++) { + int rd8 = r / C8NUM, rm8 = r % C8NUM; + for (int c = 0; c < col; c++) { + int cd4 = c / C4NUM, cm4 = c % C4NUM; + int src_index = r * col + c; + int dst_index = rd8 * col4 * C8NUM + cd4 * C4NUM * C8NUM + rm8 * C4NUM + cm4; + dst_ptr[dst_index] = src_ptr[src_index]; + } + } + return; +} + void RowMajor2Row16x4MajorInt8(void *src_ptr, void *dst_ptr, int row, int col) { /* Row-major to row16x4-major (block row-major) */ int col16 = UP_ROUND(col, C16NUM); @@ -50,16 +103,17 @@ void RowMajor2Row16x4MajorInt8(void *src_ptr, void *dst_ptr, int row, int col) { for (int ri = 0; ri < row_4div; ri += C4NUM) { for (int ci = 0; ci < col_16div; ci += C16NUM) { #ifdef ENABLE_ARM64 + size_t col_offset = col; int8_t *src_c = src_r + ci; int8_t *dst_c = dst_r + ci * C4NUM; asm volatile( "mov x10, %[src_c] \n" "mov x11, %[dst_c] \n" - "ld1 {v0.16b}, [x10], %[col]\n" - "ld1 {v1.16b}, [x10], %[col]\n" - "ld1 {v2.16b}, [x10], %[col]\n" - "ld1 {v3.16b}, [x10], %[col]\n" + "ld1 {v0.16b}, [x10], %[col_offset]\n" + "ld1 {v1.16b}, [x10], %[col_offset]\n" + "ld1 {v2.16b}, [x10], %[col_offset]\n" + "ld1 {v3.16b}, [x10], 
%[col_offset]\n" "st1 {v0.16b}, [x11], #16\n" "st1 {v1.16b}, [x11], #16\n" @@ -67,7 +121,7 @@ void RowMajor2Row16x4MajorInt8(void *src_ptr, void *dst_ptr, int row, int col) { "st1 {v3.16b}, [x11], #16\n" : - : [ dst_c ] "r"(dst_c), [ src_c ] "r"(src_c), [ col ] "r"(col) + : [ dst_c ] "r"(dst_c), [ src_c ] "r"(src_c), [ col_offset ] "r"(col_offset) : "x10", "x11", "v0", "v1", "v2", "v3"); #else MatrixPack4x16UnitInt8(src_r + ci, dst_r + ci * C4NUM, C4NUM, C16NUM, col); @@ -76,12 +130,15 @@ void RowMajor2Row16x4MajorInt8(void *src_ptr, void *dst_ptr, int row, int col) { if (col != col_16div) { MatrixPack4x16UnitInt8(src_r + col_16div, dst_r + col_16div * C4NUM, C4NUM, col_16res, col); + MatrixEmptyInt8(dst_r + col_16div * C4NUM + col_16res, C4NUM, C16NUM - col_16res); } src_r += C4NUM * col; dst_r += C4NUM * col16; } if (row != row_4div) { + memset(dst_r, 0, C4NUM * col16); + for (int ci = 0; ci < col_16div; ci += C16NUM) { MatrixPack4x16UnitInt8(src_r + ci, dst_r + ci * C4NUM, row_4res, C16NUM, col); } @@ -103,25 +160,6 @@ void RowMajor2Col8MajorInt8(int8_t *src_ptr, int8_t *dst_ptr, int row, int col) } } -void MatMulInt8(const int8_t *a, const int8_t *b, int32_t *c, const int row8, const int col8, const int deep, - const int32_t a_zp, const int32_t b_zp) { - /* col8-major * row8-major => row8x8-major */ - for (int row = 0; row < row8; row++) { - for (int col = 0; col < col8; col++) { - int r8div = row / 8, r8mod = row % 8; - int c8div = col / 8, c8mod = col % 8; - size_t ci = c8div * row8 * 8 + row * 8 + c8mod; - int32_t value = 0; - for (int d = 0; d < deep; d++) { - size_t ai = r8div * deep * 8 + d * 8 + r8mod; - size_t bi = c8div * deep * 8 + d * 8 + c8mod; - value = value + ((int32_t)a[ai] - a_zp) * ((int32_t)b[bi] - b_zp); - } - c[ci] = value; - } - } -} - void MatMulInt8_16x4(const int8_t *a, const int8_t *b, int *dst, int row_4, int col_4, int deep_16, const int *input_sum, const int *bias) { /* row4x16-major * row16x4-major => row4x4-major */ @@ -145,7 +183,100 @@ void MatMulInt8_16x4(const int8_t *a, const int8_t *b, int *dst, int row_4, int return; } -#ifdef ENABLE_ARM64 +void MatMulInt8_16x4_r(const int8_t *a, const int8_t *b, int8_t *dst, size_t row, size_t col, size_t deep_16, + size_t stride, const int32_t *input_sum, const int32_t *bias, int32_t *left_shift, + int32_t *right_shift, int32_t *multiplier, int32_t output_zp, int32_t mini, int32_t maxi, + bool per_channel) { + /* row4x16-major * row16x4-major => (int8)row-major : per-channel */ + for (int r = 0; r < row; r++) { + for (int c = 0; c < col; c++) { + int r4div = r / C4NUM, r4mod = r % C4NUM; + int c4div = c / C4NUM, c4mod = c % C4NUM; + size_t ci = r * stride + c; + int32_t value = 0; + for (int d = 0; d < deep_16; d++) { + int d16div = d / C16NUM, d16mod = d % C16NUM; + size_t ai = r4div * deep_16 * C4NUM + d16div * C4NUM * C16NUM + r4mod * C16NUM + d16mod; + size_t bi = c4div * deep_16 * C4NUM + d16div * C4NUM * C16NUM + c4mod * C16NUM + d16mod; + value = value + a[ai] * b[bi]; + } + int32_t cur_input_sum = per_channel ? input_sum[c4div * UP_ROUND(row, C4NUM) + r * C4NUM + c4mod] : input_sum[r]; + value -= cur_input_sum; + value += bias[c]; + int32_t cur_left_shift = per_channel ? left_shift[c] : left_shift[0]; + int32_t cur_right_shift = per_channel ? right_shift[c] : right_shift[0]; + int32_t cur_multiplier = per_channel ? 
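/* Layout reminder for the ai/bi indexing in MatMulInt8_16x4_r: RowMajor2Row16x4MajorInt8
 * stores element (r, d) of the row-major input at r4div * deep16 * C4NUM +
 * d16div * C4NUM * C16NUM + r4mod * C16NUM + d16mod, i.e. 4-row x 16-depth tiles laid
 * out tile-row-major, so the inner product reads 16 contiguous bytes of each operand
 * per depth block. For example, with deep16 = 32, element (r = 5, d = 17) lives at
 * 1*32*4 + 1*4*16 + 1*16 + 1 = 209. */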
multiplier[c] : multiplier[0]; + value = MultiplyByQuantizedMultiplier(value, cur_multiplier, cur_left_shift, cur_right_shift) + output_zp; + value = MSMIN(maxi, value); + value = MSMAX(mini, value); + dst[ci] = (int8_t)value; + } + } + return; +} + +void MatMulInt8_8x8_r(const int8_t *a, const int8_t *b, int8_t *dst, size_t row, size_t col, size_t deep_4, + size_t stride, const int32_t *input_sum, const int32_t *bias, int32_t *left_shift, + int32_t *right_shift, int32_t *multiplier, int32_t output_zp, int32_t mini, int32_t maxi, + bool per_channel) { + /* row8x4-major * row4x8-major => (int8)row-major */ + for (int r = 0; r < row; r++) { + for (int c = 0; c < col; c++) { + int r8div = r / C8NUM, r8mod = r % C8NUM; + int c8div = c / C8NUM, c8mod = c % C8NUM; + size_t ci = r * stride + c; + int32_t value = 0; + for (int d = 0; d < deep_4; d++) { + int d4div = d / C4NUM, d4mod = d % C4NUM; + size_t ai = r8div * deep_4 * C8NUM + d4div * C8NUM * C4NUM + r8mod * C4NUM + d4mod; + size_t bi = c8div * deep_4 * C8NUM + d4div * C8NUM * C4NUM + c8mod * C4NUM + d4mod; + value = value + a[ai] * b[bi]; + } + int32_t cur_input_sum = per_channel ? input_sum[c8div * UP_ROUND(row, C8NUM) + r * C8NUM + c8mod] : input_sum[r]; + value -= cur_input_sum; + value += bias[c]; + int32_t cur_left_shift = per_channel ? left_shift[c] : left_shift[0]; + int32_t cur_right_shift = per_channel ? right_shift[c] : right_shift[0]; + int32_t cur_multiplier = per_channel ? multiplier[c] : multiplier[0]; + value = MultiplyByQuantizedMultiplier(value, cur_multiplier, cur_left_shift, cur_right_shift) + output_zp; + value = MSMIN(maxi, value); + value = MSMAX(mini, value); + dst[ci] = (int8_t)value; + } + } + return; +} + +/* row4x16-major * col16x4-major => row4x4-major */ +void MatmulInt8(const int8_t *a, const int8_t *b, int8_t *dst, const int *a_sums, const int *bias, int act_min, + int act_max, int out_zp, int multiplier, int left_shift, int right_shift, int row, int col, int deep16, + int stride) { + int8_t *output = dst; + for (int r = 0; r < row; r++) { + for (int c = 0; c < col; c++) { + int r4div = r / C4NUM; + int r4mod = r % C4NUM; + int c4div = c / C4NUM; + int c4mod = c % C4NUM; + int value = 0; + for (int d = 0; d < deep16; d++) { + int d16div = d / C16NUM; + int d16mod = d % C16NUM; + size_t ai = r4div * deep16 * C4NUM + d16div * C4NUM * C16NUM + r4mod * C16NUM + d16mod; + size_t bi = c4div * deep16 * C4NUM + d16div * C4NUM * C16NUM + c4mod * C16NUM + d16mod; + value += a[ai] * b[bi]; + } + value -= a_sums[r]; + value += bias[c]; + value = MultiplyByQuantizedMultiplier(value, multiplier, left_shift, right_shift) + out_zp; + value = MSMIN(INT8_MAX, value); + value = MSMAX(INT8_MIN, value); + output[c] = (int8_t)value; + } + output += stride; + } +} + void RowMajor2Row4x16Major(int8_t *src, int row, int col, int8_t *dst, int col_16) { int stride = sizeof(int8_t) * 16 * 4; for (int r = 0; r < row; ++r) { @@ -168,23 +299,35 @@ void RowMajor2Col16x4Major(int8_t *src, int row, int col, int8_t *dst, int row_1 } } -void RowMajor2Asums(int8_t *a, int row, int col, int b_zp, int *dst) { +// dst: weight_zp * input_row_sums +void CalcInputSums(int8_t *input, int row, int col, int weight_zp, int *dst, DataOrder order) { for (int r = 0; r < row; ++r) { + int sum = 0; for (int c = 0; c < col; ++c) { - int src_idx = r * col + c; - dst[r] += a[src_idx]; + if (order == RowMajor) { + sum += input[r * col + c]; + } else { + sum += input[c * row + r]; + } } - dst[r] *= b_zp; + sum *= weight_zp; + dst[r] = sum; } } -void 
RowMajor2Bbias(int8_t *b, int row, int col, int a_zp, int b_zp, int *bias, int *dst) { +// dst: bias + depth*input_zp*weight_zp - input_zp*weight_col_sums +void CalcWeightBiasSums(int8_t *weight, int row, int col, int input_zp, int weight_zp, int *bias, int *dst, + DataOrder order) { for (int c = 0; c < col; ++c) { + int sum = 0; for (int r = 0; r < row; ++r) { - int src_idx = r * col + c; - dst[c] += b[src_idx]; + if (order == RowMajor) { + sum += weight[r * col + c]; + } else { + sum += weight[c * row + r]; + } } - dst[c] = row * a_zp * b_zp - a_zp * dst[c]; + dst[c] = row * input_zp * weight_zp - input_zp * sum; if (bias) { dst[c] += bias[c]; } @@ -201,4 +344,3 @@ void Row4x4Major2RowMajor(int8_t *src, int row4, int8_t *dst, int row, int cow) } } } -#endif diff --git a/mindspore/lite/nnacl/int8/matmul_int8.h b/mindspore/lite/nnacl/int8/matmul_int8.h index bf6ab900a9..03028c49ec 100644 --- a/mindspore/lite/nnacl/int8/matmul_int8.h +++ b/mindspore/lite/nnacl/int8/matmul_int8.h @@ -24,25 +24,37 @@ #ifdef __cplusplus extern "C" { #endif -void MatMulInt8(const int8_t *a, const int8_t *b, int *c, const int row8, const int col8, const int deep, - const int a_zp, const int b_zp); void MatMulInt8_16x4(const int8_t *a, const int8_t *b, int *dst, int row_4, int col_4, int deep_16, const int *input_sum, const int *bias); +void MatMulInt8_16x4_r(const int8_t *a, const int8_t *b, int8_t *dst, size_t row, size_t col, size_t deep_16, + size_t stride, const int32_t *input_sum, const int32_t *bias, int32_t *left_shift, + int32_t *right_shift, int32_t *multiplier, int32_t output_zp, int32_t mini, int32_t maxi, + bool per_channel); void RowMajor2Row8MajorInt8(int8_t *src_ptr, int8_t *dst_ptr, int row, int col); +void RowMajor2Row4x16MajorInt8(int8_t *src_ptr, int8_t *dst_ptr, int row, int col); void RowMajor2Col8MajorInt8(int8_t *src_ptr, int8_t *dst_ptr, int row, int col); void RowMajor2Row16x4MajorInt8(void *src_ptr, void *dst_ptr, int row, int col); -#ifdef ENABLE_ARM64 +void MatMulInt8_8x8_r(const int8_t *a, const int8_t *b, int8_t *dst, size_t row, size_t col, size_t deep_4, + size_t stride, const int32_t *input_sum, const int32_t *bias, int32_t *left_shift, + int32_t *right_shift, int32_t *multiplier, int32_t output_zp, int32_t mini, int32_t maxi, + bool per_channel); +void RowMajor2Row8x4MajorInt8(const int8_t *src_ptr, int8_t *dst_ptr, int row, int col); +void RowMajor2Row4x8MajorInt8(const int8_t *src_ptr, int8_t *dst_ptr, int row, int col); + void RowMajor2Row4x16Major(int8_t *src, int row, int col, int8_t *dst, int col_16); void RowMajor2Col16x4Major(int8_t *src, int row, int col, int8_t *dst, int row_16); -void RowMajor2Asums(int8_t *a, int row, int col, int b_zp, int *dst); -void RowMajor2Bbias(int8_t *b, int row, int col, int a_zp, int b_zp, int *bias, int *dst); -void Row4x4Major2RowMajor(int8_t *src, int row4, int8_t *dst, int row, int cow); +void CalcInputSums(int8_t *input, int row, int col, int weight_zp, int *dst, DataOrder order); +void CalcWeightBiasSums(int8_t *weight, int row, int col, int input_zp, int weight_zp, int *bias, int *dst, + DataOrder order); +void MatmulInt8(const int8_t *a, const int8_t *b, int8_t *dst, const int *a_sums, const int *bias, int act_min, + int act_max, int out_zp, int multiplier, int left_shift, int right_shift, int row, int col, int deep16, + int stride); -// bias = bias + depth * a_zp * b_zp - a_zp * b_sums +#ifdef ENABLE_ARM64 void MatmulInt8Neon64(const int8_t *a, const int8_t *b, int8_t *dst, int row4, int col4, int deep16, const int *a_sums, const 
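CalcInputSums and CalcWeightBiasSums above implement the usual zero-point factorization of an int8 GEMM: sum_k (a_k - za) * (w_k - zw) = sum_k a_k * w_k - zw * sum_k a_k - za * sum_k w_k + K * za * zw. The first helper precomputes zw * sum(a) per row, the second folds bias + K * za * zw - za * sum(w) per column, so the hot loop only accumulates raw a * w products (as MatMulInt8_16x4_r does). A scalar check of the identity, with names local to this sketch:

#include <assert.h>
#include <stdint.h>

int main(void) {
  enum { K = 4 };
  int8_t a[K] = {1, -2, 3, 4}, w[K] = {5, 6, -7, 8};
  int32_t za = 1, zw = 2, bias = 10;

  int32_t ref = bias, asum = 0, wsum = 0, raw = 0;
  for (int k = 0; k < K; ++k) {
    ref += (a[k] - za) * (w[k] - zw);
    asum += a[k];
    wsum += w[k];
    raw += a[k] * w[k];
  }
  int32_t input_sum = zw * asum;                        /* CalcInputSums-style term */
  int32_t folded_bias = bias + K * za * zw - za * wsum; /* CalcWeightBiasSums-style term */
  assert(raw - input_sum + folded_bias == ref);         /* the identity holds */
  return 0;
}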
int *bias, int act_min, int act_max, int out_zp, int multiplier, int left_shift, - int right_shift); + int right_shift, int row, int col, int stride); void MatMulR4Int8Neon64(const int8_t *a, const int8_t *b, int32_t *dst, int row4, int col4, int deep16, const int *input_sum, const int *bias); diff --git a/mindspore/lite/nnacl/int8/pooling_int8.c b/mindspore/lite/nnacl/int8/pooling_int8.c index 540d43deed..f27332776a 100644 --- a/mindspore/lite/nnacl/int8/pooling_int8.c +++ b/mindspore/lite/nnacl/int8/pooling_int8.c @@ -89,8 +89,13 @@ void AvgPoolingOptInt8(const int8_t *input_ptr, int8_t *output_ptr, PoolingParam int output_batch = pooling_param->output_batch_; int out_plane = output_w * output_h; int out_tile_count = UP_DIV(out_plane, TILE_NUM); - int thread_num = pooling_param->thread_num_; - int c8 = UP_DIV(channel, C8NUM); + int thread_num = out_tile_count < pooling_param->thread_num_ ? out_tile_count : pooling_param->thread_num_; + float input_scale = pooling_param->quant_args_[0][0].scale_; + int input_zp = pooling_param->quant_args_[0][0].zp_; + float output_scale = pooling_param->quant_args_[1][0].scale_; + int output_zp = pooling_param->quant_args_[1][0].zp_; + double real_multiplier = input_scale / output_scale; + int c16 = channel / C16NUM; const int8_t out_min = INT8_MIN; const int8_t out_max = INT8_MAX; @@ -107,89 +112,159 @@ void AvgPoolingOptInt8(const int8_t *input_ptr, int8_t *output_ptr, PoolingParam int in_w_index = out_w_index * stride_w - pad_w; int in_h_index = out_h_index * stride_h - pad_h; int out_plane_offset = out_batch_offset + index * channel; - for (int j = 0; j < c8 - 1; j++) { - int in_channel_offset = in_batch_offset + j * C8NUM; - int out_channel_offset = out_plane_offset + j * C8NUM; - int16_t tmp_avg1 = 0; - int16_t tmp_avg2 = 0; - int16_t tmp_avg3 = 0; - int16_t tmp_avg4 = 0; - int16_t tmp_avg5 = 0; - int16_t tmp_avg6 = 0; - int16_t tmp_avg7 = 0; - int16_t tmp_avg8 = 0; - int real_count = 0; - for (int h = 0; h < win_h; h++) { - for (int w = 0; w < win_w; w++) { - if ((in_h_index + h) < 0 || (in_h_index + h) >= in_h || (in_w_index + w) < 0 || - (in_w_index + w) >= in_w) { - continue; - } else { - int in_offset = in_channel_offset + ((in_h_index + h) * in_w + in_w_index + w) * channel; - tmp_avg1 += *(input_ptr + in_offset); - tmp_avg2 += *(input_ptr + in_offset + 1); - tmp_avg3 += *(input_ptr + in_offset + 2); - tmp_avg4 += *(input_ptr + in_offset + 3); - tmp_avg5 += *(input_ptr + in_offset + 4); - tmp_avg6 += *(input_ptr + in_offset + 5); - tmp_avg7 += *(input_ptr + in_offset + 6); - tmp_avg8 += *(input_ptr + in_offset + 7); - ++real_count; + int input_stride = (in_h_index * in_w + in_w_index) * channel; + int kw_s = MSMAX(0, -in_w_index); + int kw_e = MSMIN(win_w, in_w - in_w_index); + int kh_s = MSMAX(0, -in_h_index); + int kh_e = MSMIN(win_h, in_h - in_h_index); + int real_count = (kw_e - kw_s) * (kh_e - kh_s); + + // 16 channels + for (int j = 0; j < c16; j++) { +#ifdef ENABLE_NEON + int16x8_t tmp_avg[2]; + tmp_avg[0] = vmovq_n_s16(0); + tmp_avg[1] = vmovq_n_s16(0); +#else + int16_t tmp_avg[16]; + int16_t real_out[16]; + for (int m = 0; m < C16NUM; ++m) { + tmp_avg[m] = 0; + } +#endif + int in_channel_offset = in_batch_offset + j * C16NUM; + int out_channel_offset = out_plane_offset + j * C16NUM; + + for (int h = kh_s; h < kh_e; h++) { + for (int w = kw_s; w < kw_e; w++) { + int in_offset = in_channel_offset + input_stride + (h * in_w + w) * channel; +#ifdef ENABLE_NEON + int8x16_t in_ptr = vld1q_s8(input_ptr + in_offset); + int8x8_t in_data1 = 
vget_low_s8(in_ptr); + int8x8_t in_data2 = vget_high_s8(in_ptr); + int16x8_t data1 = vmovl_s8(in_data1); + int16x8_t data2 = vmovl_s8(in_data2); + tmp_avg[0] = vaddq_s16(tmp_avg[0], data1); + tmp_avg[1] = vaddq_s16(tmp_avg[1], data2); +#else + for (int k = 0; k < C16NUM; ++k) { + tmp_avg[k] += input_ptr[in_offset + k]; } +#endif } // win_w loop } // win_h loop - int16_t tmp_out1 = round((float)tmp_avg1 / (float)real_count); - int16_t tmp_out2 = round((float)tmp_avg2 / (float)real_count); - int16_t tmp_out3 = round((float)tmp_avg3 / (float)real_count); - int16_t tmp_out4 = round((float)tmp_avg4 / (float)real_count); - int16_t tmp_out5 = round((float)tmp_avg5 / (float)real_count); - int16_t tmp_out6 = round((float)tmp_avg6 / (float)real_count); - int16_t tmp_out7 = round((float)tmp_avg7 / (float)real_count); - int16_t tmp_out8 = round((float)tmp_avg8 / (float)real_count); - int16_t real_out1 = tmp_out1 < out_min ? out_min : tmp_out1; - int16_t real_out2 = tmp_out2 < out_min ? out_min : tmp_out2; - int16_t real_out3 = tmp_out3 < out_min ? out_min : tmp_out3; - int16_t real_out4 = tmp_out4 < out_min ? out_min : tmp_out4; - int16_t real_out5 = tmp_out5 < out_min ? out_min : tmp_out5; - int16_t real_out6 = tmp_out6 < out_min ? out_min : tmp_out6; - int16_t real_out7 = tmp_out7 < out_min ? out_min : tmp_out7; - int16_t real_out8 = tmp_out8 < out_min ? out_min : tmp_out8; - real_out1 = real_out1 > out_max ? out_max : real_out1; - real_out2 = real_out2 > out_max ? out_max : real_out2; - real_out3 = real_out3 > out_max ? out_max : real_out3; - real_out4 = real_out4 > out_max ? out_max : real_out4; - real_out5 = real_out5 > out_max ? out_max : real_out5; - real_out6 = real_out6 > out_max ? out_max : real_out6; - real_out7 = real_out7 > out_max ? out_max : real_out7; - real_out8 = real_out8 > out_max ? 
out_max : real_out8; - *(output_ptr + out_channel_offset) = (int8_t)real_out1; - *(output_ptr + out_channel_offset + 1) = (int8_t)real_out2; - *(output_ptr + out_channel_offset + 2) = (int8_t)real_out3; - *(output_ptr + out_channel_offset + 3) = (int8_t)real_out4; - *(output_ptr + out_channel_offset + 4) = (int8_t)real_out5; - *(output_ptr + out_channel_offset + 5) = (int8_t)real_out6; - *(output_ptr + out_channel_offset + 6) = (int8_t)real_out7; - *(output_ptr + out_channel_offset + 7) = (int8_t)real_out8; - } // in_channel loop - int channel_s = (c8 - 1) * C8NUM; - for (int k = channel_s; k < channel; k++) { - int in_channel_offset = in_batch_offset + k; - int out_channel_offset = out_plane_offset + k; - int16_t tmp_avg = 0; - int real_count = 0; - for (int h = 0; h < win_h; h++) { - for (int w = 0; w < win_w; w++) { - if ((in_h_index + h) < 0 || (in_h_index + h) >= in_h || (in_w_index + w) < 0 || - (in_w_index + w) >= in_w) { - continue; - } else { - int in_offset = in_channel_offset + ((in_h_index + h) * in_w + in_w_index + w) * channel; - tmp_avg += *(input_ptr + in_offset); - ++real_count; +#ifdef ENABLE_NEON + int16_t tmp_data[8]; + int16_t tmp_out[8]; + int16_t tmp_data1[8]; + int16_t tmp_out1[8]; + for (int l = 0; l < C8NUM; l++) { + tmp_data[l] = tmp_avg[0][l] + 128 * real_count; + tmp_out[l] = (tmp_data[l] + real_count / 2) / real_count; + tmp_out[l] -= 128; + tmp_out[l] = round((tmp_out[l] - input_zp) * real_multiplier) + output_zp; + } + for (int l = 0; l < C8NUM; l++) { + tmp_data1[l] = tmp_avg[1][l] + 128 * real_count; + tmp_out1[l] = (tmp_data1[l] + real_count / 2) / real_count; + tmp_out1[l] -= 128; + tmp_out1[l] = round((tmp_out1[l] - input_zp) * real_multiplier) + output_zp; + } + int8x8_t real_out[2]; + int8x8_t output_min = vdup_n_s8(out_min); + int8x8_t output_max = vdup_n_s8(out_max); + real_out[0] = vqmovn_s16(vld1q_s16(tmp_out)); + real_out[0] = vmin_s8(real_out[0], output_max); + real_out[0] = vmax_s8(real_out[0], output_min); + vst1_s8(output_ptr + out_channel_offset, real_out[0]); + real_out[1] = vqmovn_s16(vld1q_s16(tmp_out1)); + real_out[1] = vmin_s8(real_out[1], output_max); + real_out[1] = vmax_s8(real_out[1], output_min); + vst1_s8(output_ptr + out_channel_offset + 8, real_out[1]); +#else + for (int l = 0; l < C16NUM; ++l) { + int16_t tmp_data = tmp_avg[l] + 128 * real_count; + real_out[l] = (tmp_data + real_count / 2) / real_count - 128; + real_out[l] = (int8_t)(round((real_out[l] - input_zp) * real_multiplier) + output_zp); + real_out[l] = real_out[l] < out_min ? out_min : real_out[l]; + real_out[l] = real_out[l] > out_max ? 
out_max : real_out[l]; + *(output_ptr + out_channel_offset + l) = (int8_t)real_out[l]; + } +#endif + } + + // 8 channels + int channel_16_res = channel - c16 * C16NUM; + int c8 = channel_16_res / C8NUM; + int in_c16_offset = in_batch_offset + c16 * C16NUM; + int out_c16_offset = out_plane_offset + c16 * C16NUM; + for (int j = 0; j < c8; j++) { +#ifdef ENABLE_NEON + int16x8_t tmp_avg = vmovq_n_s16(0); +#else + int16_t tmp_avg[8] = {0, 0, 0, 0, 0, 0, 0, 0}; + int16_t real_out[8]; +#endif + int in_channel_offset = in_c16_offset + j * C8NUM; + int out_channel_offset = out_c16_offset + j * C8NUM; + for (int h = kh_s; h < kh_e; h++) { + for (int w = kw_s; w < kw_e; w++) { + int in_offset = in_channel_offset + input_stride + (h * in_w + w) * channel; +#ifdef ENABLE_NEON + int8x8_t in_ptr = vld1_s8(input_ptr + in_offset); + int16x8_t data = vmovl_s8(in_ptr); + tmp_avg = vaddq_s16(tmp_avg, data); +#else + for (int k = 0; k < C8NUM; ++k) { + tmp_avg[k] += input_ptr[in_offset + k]; } +#endif } // win_w loop } // win_h loop - int16_t tmp_out = round((float)tmp_avg / (float)real_count); +#ifdef ENABLE_NEON + int16_t tmp_data[8]; + int16_t tmp_out[8]; + for (int l = 0; l < C8NUM; l++) { + tmp_data[l] = tmp_avg[l] + 128 * real_count; + tmp_out[l] = (tmp_data[l] + real_count / 2) / real_count; + tmp_out[l] -= 128; + tmp_out[l] = round((tmp_out[l] - input_zp) * real_multiplier) + output_zp; + } + int8x8_t real_out; + int8x8_t output_min = vdup_n_s8(out_min); + int8x8_t output_max = vdup_n_s8(out_max); + real_out = vqmovn_s16(vld1q_s16(tmp_out)); + real_out = vmin_s8(real_out, output_max); + real_out = vmax_s8(real_out, output_min); + vst1_s8(output_ptr + out_channel_offset, real_out); +#else + for (int l = 0; l < C8NUM; ++l) { + int16_t tmp_data = tmp_avg[l] + 128 * real_count; + real_out[l] = (tmp_data + real_count / 2) / real_count - 128; + real_out[l] = (int8_t)(round((real_out[l] - input_zp) * real_multiplier) + output_zp); + real_out[l] = real_out[l] < out_min ? out_min : real_out[l]; + real_out[l] = real_out[l] > out_max ? out_max : real_out[l]; + *(output_ptr + out_channel_offset + l) = (int8_t)real_out[l]; + } +#endif + } + + // less than 8 channel + int channel_8_res = channel_16_res - c8 * C8NUM; + int in_c8_offset = in_c16_offset + c8 * C8NUM; + int out_c8_offset = out_c16_offset + c8 * C8NUM; + for (int k = 0; k < channel_8_res; k++) { + int in_channel_offset = in_c8_offset + k; + int out_channel_offset = out_c8_offset + k; + int16_t tmp_avg = 0; + for (int h = kh_s; h < kh_e; h++) { + for (int w = kw_s; w < kw_e; w++) { + int in_offset = in_channel_offset + input_stride + (h * in_w + w) * channel; + tmp_avg += input_ptr[in_offset]; + } // win_w loop + } // win_h loop + int16_t tmp_out = round((float)tmp_avg / (float)real_count + 128) - 128; + tmp_out = (int8_t)(round((tmp_out - input_zp) * real_multiplier) + output_zp); int16_t real_out = tmp_out < out_min ? out_min : tmp_out; real_out = real_out > out_max ? 
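/* Rounding note for the averaging above: C integer division truncates toward zero, so
 * (sum + count / 2) / count only rounds to nearest for non-negative sums. Adding
 * 128 * real_count first shifts every int8 lane into non-negative range, the biased
 * average is rounded, and 128 is subtracted back. Example: sum = -3 over a 2x2 window:
 * (-3 + 2) / 4 truncates to 0, while ((-3 + 512) + 2) / 4 - 128 = -1, the correctly
 * rounded value. */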
out_max : real_out; *(output_ptr + out_channel_offset) = (int8_t)real_out; @@ -249,6 +324,109 @@ void MaxPoolingInt8(const int8_t *input_ptr, int8_t *output_ptr, PoolingParamete } // out_batch loop } +void MaxPoolingWithQuantInt8(const int8_t *input_ptr, int8_t *output_ptr, PoolingParameter *pooling_param, + int task_id) { + int stride_w = pooling_param->stride_w_; + int stride_h = pooling_param->stride_h_; + int pad_w = pooling_param->pad_l_; + int pad_h = pooling_param->pad_u_; + int win_w = pooling_param->window_w_; + int win_h = pooling_param->window_h_; + int channel = pooling_param->input_channel_; + int in_w = pooling_param->input_w_; + int in_h = pooling_param->input_h_; + int output_w = pooling_param->output_w_; + int output_h = pooling_param->output_h_; + int output_batch = pooling_param->output_batch_; + int out_plane = output_w * output_h; + int out_tile_count = UP_DIV(out_plane, TILE_NUM); + int thread_num = out_tile_count < pooling_param->thread_num_ ? out_tile_count : pooling_param->thread_num_; + int c16 = UP_DIV(channel, 16); + // input channel is equal to output channel + float input_scale = pooling_param->quant_args_[0][0].scale_; + int input_zp = pooling_param->quant_args_[0][0].zp_; + float output_scale = pooling_param->quant_args_[1][0].scale_; + int output_zp = pooling_param->quant_args_[1][0].zp_; + double real_multiplier = input_scale / output_scale; + + for (int batch = 0; batch < output_batch; batch++) { + int in_batch_offset = batch * in_h * in_w * channel; + int out_batch_offset = batch * output_h * output_w * channel; + for (int thread_id = task_id; thread_id < out_tile_count; thread_id += thread_num) { + int cal_start_index = thread_id * TILE_NUM; + int real_cal_num = (out_plane - cal_start_index) > TILE_NUM ? TILE_NUM : (out_plane - cal_start_index); + for (int i = 0; i < real_cal_num; i++) { + int index = cal_start_index + i; + int out_w_index = index % output_w; + int out_h_index = index / output_w; + int in_w_index = out_w_index * stride_w - pad_w; + int in_h_index = out_h_index * stride_h - pad_h; + int out_plane_offset = out_batch_offset + index * channel; + for (int j = 0; j < c16 - 1; j++) { + int in_channel_offset = in_batch_offset + j * 16; + int out_channel_offset = out_plane_offset + j * 16; +#ifdef ENABLE_NEON + int8x16_t tmp_max = vdupq_n_s8(INT8_MIN); +#else + int8_t tmp_max[16]; + for (int m = 0; m < C16NUM; ++m) { + tmp_max[m] = INT8_MIN; + } +#endif + for (int h = 0; h < win_h; h++) { + for (int w = 0; w < win_w; w++) { + if ((in_h_index + h) < 0 || (in_h_index + h) >= in_h || (in_w_index + w) < 0 || + (in_w_index + w) >= in_w) { + continue; + } else { + int in_offset = in_channel_offset + ((in_h_index + h) * in_w + in_w_index + w) * channel; +#ifdef ENABLE_NEON + tmp_max = vmaxq_s8(tmp_max, vld1q_s8(input_ptr + in_offset)); +#else + for (int k = 0; k < C16NUM; ++k) { + tmp_max[k] = MaxInt8(tmp_max[k], *(input_ptr + in_offset + k)); + } +#endif + } + } // win_w loop + } // win_h loop +#ifdef ENABLE_NEON + for (int l = 0; l < C16NUM; ++l) { + tmp_max[l] = (int8_t)(round((tmp_max[l] - input_zp) * real_multiplier) + output_zp); + } + vst1q_s8(output_ptr + out_channel_offset, tmp_max); +#else + for (int l = 0; l < C16NUM; ++l) { + *(output_ptr + out_channel_offset + l) = + (int8_t)(round((tmp_max[l] - input_zp) * real_multiplier) + output_zp); + } +#endif + } // in_channel loop + + // res channel + int channel_s = (c16 - 1) * 16; + for (int k = channel_s; k < channel; k++) { + int in_channel_offset = in_batch_offset + k; + int out_channel_offset 
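/* The real_multiplier above follows from the affine quantization model: a stored int8
   value x represents the real number input_scale * (x - input_zp), so re-expressing it in
   the output quantization gives y = round((x - input_zp) * input_scale / output_scale) +
   output_zp. A minimal scalar sketch of that requantization step (hypothetical helper,
   not part of the patch):

     #include <math.h>
     #include <stdint.h>
     static inline int8_t RequantizeInt8(int32_t x, int32_t zp_in, int32_t zp_out, double multiplier) {
       int32_t y = (int32_t)round((x - zp_in) * multiplier) + zp_out;
       y = y < INT8_MIN ? INT8_MIN : y;  // clamp in 32-bit before narrowing
       y = y > INT8_MAX ? INT8_MAX : y;
       return (int8_t)y;
     }
*/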
= out_plane_offset + k; + int8_t tmp_max = INT8_MIN; + for (int h = 0; h < win_h; h++) { + for (int w = 0; w < win_w; w++) { + if ((in_h_index + h) < 0 || (in_h_index + h) >= in_h || (in_w_index + w) < 0 || + (in_w_index + w) >= in_w) { + continue; + } else { + int in_offset = in_channel_offset + ((in_h_index + h) * in_w + in_w_index + w) * channel; + tmp_max = MaxInt8(tmp_max, *(input_ptr + in_offset)); + } + } // win_w loop + } // win_h loop + *(output_ptr + out_channel_offset) = (int8_t)(round((tmp_max - input_zp) * real_multiplier) + output_zp); + } // channel_res loop + } // out_plane loop + } // out_batch loop + } +} + void MaxPoolingOptInt8(const int8_t *input_ptr, int8_t *output_ptr, PoolingParameter *pooling_param, int task_id) { int stride_w = pooling_param->stride_w_; int stride_h = pooling_param->stride_h_; @@ -264,7 +442,7 @@ void MaxPoolingOptInt8(const int8_t *input_ptr, int8_t *output_ptr, PoolingParam int output_batch = pooling_param->output_batch_; int out_plane = output_w * output_h; int out_tile_count = UP_DIV(out_plane, TILE_NUM); - int thread_num = pooling_param->thread_num_; + int thread_num = out_tile_count < pooling_param->thread_num_ ? out_tile_count : pooling_param->thread_num_; int c16 = UP_DIV(channel, 16); for (int batch = 0; batch < output_batch; batch++) { @@ -286,22 +464,10 @@ void MaxPoolingOptInt8(const int8_t *input_ptr, int8_t *output_ptr, PoolingParam #ifdef ENABLE_NEON int8x16_t tmp_max = vdupq_n_s8(INT8_MIN); #else - int8_t tmp_max1 = INT8_MIN; - int8_t tmp_max2 = INT8_MIN; - int8_t tmp_max3 = INT8_MIN; - int8_t tmp_max4 = INT8_MIN; - int8_t tmp_max5 = INT8_MIN; - int8_t tmp_max6 = INT8_MIN; - int8_t tmp_max7 = INT8_MIN; - int8_t tmp_max8 = INT8_MIN; - int8_t tmp_max9 = INT8_MIN; - int8_t tmp_max10 = INT8_MIN; - int8_t tmp_max11 = INT8_MIN; - int8_t tmp_max12 = INT8_MIN; - int8_t tmp_max13 = INT8_MIN; - int8_t tmp_max14 = INT8_MIN; - int8_t tmp_max15 = INT8_MIN; - int8_t tmp_max16 = INT8_MIN; + int8_t tmp_max[16]; + for (int m = 0; m < C16NUM; ++m) { + tmp_max[m] = INT8_MIN; + } #endif for (int h = 0; h < win_h; h++) { for (int w = 0; w < win_w; w++) { @@ -313,22 +479,9 @@ void MaxPoolingOptInt8(const int8_t *input_ptr, int8_t *output_ptr, PoolingParam #ifdef ENABLE_NEON tmp_max = vmaxq_s8(tmp_max, vld1q_s8(input_ptr + in_offset)); #else - tmp_max1 = MaxInt8(tmp_max1, *(input_ptr + in_offset)); - tmp_max2 = MaxInt8(tmp_max2, *(input_ptr + in_offset + 1)); - tmp_max3 = MaxInt8(tmp_max3, *(input_ptr + in_offset + 2)); - tmp_max4 = MaxInt8(tmp_max4, *(input_ptr + in_offset + 3)); - tmp_max5 = MaxInt8(tmp_max5, *(input_ptr + in_offset + 4)); - tmp_max6 = MaxInt8(tmp_max6, *(input_ptr + in_offset + 5)); - tmp_max7 = MaxInt8(tmp_max7, *(input_ptr + in_offset + 6)); - tmp_max8 = MaxInt8(tmp_max8, *(input_ptr + in_offset + 7)); - tmp_max9 = MaxInt8(tmp_max9, *(input_ptr + in_offset + 8)); - tmp_max10 = MaxInt8(tmp_max10, *(input_ptr + in_offset + 9)); - tmp_max11 = MaxInt8(tmp_max11, *(input_ptr + in_offset + 10)); - tmp_max12 = MaxInt8(tmp_max12, *(input_ptr + in_offset + 11)); - tmp_max13 = MaxInt8(tmp_max13, *(input_ptr + in_offset + 12)); - tmp_max14 = MaxInt8(tmp_max14, *(input_ptr + in_offset + 13)); - tmp_max15 = MaxInt8(tmp_max15, *(input_ptr + in_offset + 14)); - tmp_max16 = MaxInt8(tmp_max16, *(input_ptr + in_offset + 15)); + for (int k = 0; k < C16NUM; ++k) { + tmp_max[k] = MaxInt8(tmp_max[k], *(input_ptr + in_offset + k)); + } #endif } } // win_w loop @@ -336,24 +489,13 @@ void MaxPoolingOptInt8(const int8_t *input_ptr, int8_t *output_ptr, 
PoolingParam #ifdef ENABLE_NEON vst1q_s8(output_ptr + out_channel_offset, tmp_max); #else - *(output_ptr + out_channel_offset) = tmp_max1; - *(output_ptr + out_channel_offset + 1) = tmp_max2; - *(output_ptr + out_channel_offset + 2) = tmp_max3; - *(output_ptr + out_channel_offset + 3) = tmp_max4; - *(output_ptr + out_channel_offset + 4) = tmp_max5; - *(output_ptr + out_channel_offset + 5) = tmp_max6; - *(output_ptr + out_channel_offset + 6) = tmp_max7; - *(output_ptr + out_channel_offset + 7) = tmp_max8; - *(output_ptr + out_channel_offset + 8) = tmp_max9; - *(output_ptr + out_channel_offset + 9) = tmp_max10; - *(output_ptr + out_channel_offset + 10) = tmp_max11; - *(output_ptr + out_channel_offset + 11) = tmp_max12; - *(output_ptr + out_channel_offset + 12) = tmp_max13; - *(output_ptr + out_channel_offset + 13) = tmp_max14; - *(output_ptr + out_channel_offset + 14) = tmp_max15; - *(output_ptr + out_channel_offset + 15) = tmp_max16; + for (int l = 0; l < C16NUM; ++l) { + *(output_ptr + out_channel_offset + l) = tmp_max[l]; + } #endif } // in_channel loop + + // res channel int channel_s = (c16 - 1) * 16; for (int k = channel_s; k < channel; k++) { int in_channel_offset = in_batch_offset + k; diff --git a/mindspore/lite/nnacl/int8/pooling_int8.h b/mindspore/lite/nnacl/int8/pooling_int8.h index 3926f6e682..498ad36774 100644 --- a/mindspore/lite/nnacl/int8/pooling_int8.h +++ b/mindspore/lite/nnacl/int8/pooling_int8.h @@ -32,6 +32,8 @@ void AvgPoolingOptInt8(const int8_t *input_ptr, int8_t *output_ptr, PoolingParam void MaxPoolingInt8(const int8_t *input_ptr, int8_t *output_ptr, PoolingParameter *pooling_param, int task_id); +void MaxPoolingWithQuantInt8(const int8_t *input_ptr, int8_t *output_ptr, PoolingParameter *pooling_param, int task_id); + void MaxPoolingOptInt8(const int8_t *input_ptr, int8_t *output_ptr, PoolingParameter *pooling_param, int task_id); #ifdef __cplusplus } diff --git a/mindspore/lite/nnacl/int8/resize.c b/mindspore/lite/nnacl/int8/resize.c index 8512aa5960..9c7bcdee1b 100644 --- a/mindspore/lite/nnacl/int8/resize.c +++ b/mindspore/lite/nnacl/int8/resize.c @@ -86,6 +86,62 @@ int ResizeBilinearInt8(const int8_t *input_data, int8_t *output_data, const int return NNACL_OK; } +int ResizeBilinearInt8WithFloatWeight(const int8_t *input_data, int8_t *output_data, const int *input_shape, + const int *output_shape, const bool align_corners, QuantArg *quant_in, + QuantArg *quant_out, const QuantMulArg *mul_arg, int tid, int thread_num) { + if (input_data == NULL || output_data == NULL || input_shape == NULL || output_shape == NULL) { + return NNACL_NULL_PTR; + } + + int32_t in_n = input_shape[0]; + int32_t in_h = input_shape[1]; + int32_t in_w = input_shape[2]; + int32_t in_c = input_shape[3]; + + int32_t new_height = output_shape[1]; + int32_t new_width = output_shape[2]; + float height_scale, width_scale; + ComputeScaleFloat(in_h, new_height, align_corners, &height_scale); + ComputeScaleFloat(in_w, new_width, align_corners, &width_scale); + + int n, h, w, c; + for (n = 0; n < in_n; n++) { + for (h = tid; h < new_height; h += thread_num) { + float actual_y; + int bottom, top; + float bottom_weight, top_weight; + ComputeInterpolationArgsFloatWeight(h, height_scale, in_h, &actual_y, &bottom, &bottom_weight, &top, &top_weight); + for (w = 0; w < new_width; w++) { + float actual_x; + int left, right; + float left_weight, right_weight; + ComputeInterpolationArgsFloatWeight(w, width_scale, in_w, &actual_x, &left, &left_weight, &right, + &right_weight); + for (c = 0; c < in_c; c++) { + 
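/* The four taps computed below implement standard bilinear interpolation: with
   dx = actual_x - left and dy = actual_y - bottom, the weights (1-dy)(1-dx), (1-dy)dx,
   dy(1-dx) and dy*dx sum to 1, so interp_value stays in the input's zero-point-centered
   range and a single fixed-point multiplier (mul_arg) can requantize it. A float sketch
   of the weight math (illustrative only):

     static void BilinearWeights(float dx, float dy, float w[4]) {
       w[0] = (1.0f - dy) * (1.0f - dx);  // bottom-left
       w[1] = (1.0f - dy) * dx;           // bottom-right
       w[2] = dy * (1.0f - dx);           // top-left
       w[3] = dy * dx;                    // top-right
     }
*/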
float bottom_left_value = ((int32_t)input_data[offset(input_shape, n, bottom, left, c)] - quant_in->zp_) * + bottom_weight * left_weight; + float bottom_right_value = ((int32_t)input_data[offset(input_shape, n, bottom, right, c)] - quant_in->zp_) * + bottom_weight * right_weight; + float top_left_value = + ((int32_t)input_data[offset(input_shape, n, top, left, c)] - quant_in->zp_) * top_weight * left_weight; + float top_right_value = + ((int32_t)input_data[offset(input_shape, n, top, right, c)] - quant_in->zp_) * top_weight * right_weight; + float interp_value = bottom_left_value + bottom_right_value + top_left_value + top_right_value; + + const int out_interp_value = MultiplyByQuantizedMultiplier((int32_t)interp_value, mul_arg->multiplier_, + mul_arg->left_shift_, mul_arg->right_shift_) + + quant_out->zp_; + int32_t out_value = out_interp_value > INT8_MAX ? INT8_MAX : out_interp_value; + out_value = out_value < INT8_MIN ? INT8_MIN : out_value; + output_data[offset(output_shape, n, h, w, c)] = (int8_t)out_value; + } + } + } + } + return NNACL_OK; +} + int ResizeNearestNeighborInt8Simple(const int8_t *input_data, int8_t *output_data, const int *input_shape, const int *output_shape, const bool align_corners, int tid, int thread_num) { int batch, y, x, c; @@ -133,6 +189,22 @@ void ComputeInterpolationArgs(const int32_t pos, const int32_t scale, const int3 *scaled_high_weight = *scaled_pos - (1 << 10) * (*low); } +void ComputeScaleFloat(const int32_t in_value, const int32_t out_value, const bool align_corners, float *scale) { + *scale = (float)in_value / out_value; + if (align_corners && out_value > 1) { + *scale = (float)(in_value - 1) / (out_value - 1); + } +} + +void ComputeInterpolationArgsFloatWeight(const int32_t pos, const float scale, const int32_t size, float *actual_pos, + int32_t *low, float *low_weight, int32_t *high, float *high_weight) { + *actual_pos = pos * scale; + *low = *actual_pos > 0 ? floor(*actual_pos) : 0; + *low_weight = 1.0 - (*actual_pos - *low); + *high = *low + 1 < size ?
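/* ComputeScaleFloat above encodes the two usual resize conventions: by default
   scale = in / out, while align_corners maps both endpoints exactly with
   scale = (in - 1) / (out - 1). Worked example: for in_h = 4, out_h = 8 the scales are
   0.5 and 3/7; with align_corners the last output row samples y = 7 * 3/7 = 3, i.e.
   exactly the last input row. */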
*low + 1 : size - 1; + *high_weight = *actual_pos - (*low); +} + void ComputeNearestNeighborInt(const int32_t pos, const int in_size, const int32_t new_size, const bool align_corners, int32_t *nearest) { if (new_size == 0) { diff --git a/mindspore/lite/nnacl/int8/resize.h b/mindspore/lite/nnacl/int8/resize.h index 48f13e27e3..67a56da8c0 100644 --- a/mindspore/lite/nnacl/int8/resize.h +++ b/mindspore/lite/nnacl/int8/resize.h @@ -31,6 +31,20 @@ int ResizeBilinearInt8(const int8_t *input_data, int8_t *output_data, const int const bool align_corners, QuantArg *quant_in, QuantArg *quant_out, const QuantMulArg *mul_arg, int tid, int thread_num); +int ResizeBilinearInt8WithFloatWeight(const int8_t *input_data, int8_t *output_data, const int *input_shape, + const int *output_shape, const bool align_corners, QuantArg *quant_in, + QuantArg *quant_out, const QuantMulArg *mul_arg, int tid, int thread_num); + +void ComputeScale(const int32_t in_value, const int32_t out_value, const bool align_corners, int32_t *scale); + +void ComputeInterpolationArgs(const int32_t pos, const int32_t scale, const int32_t size, int32_t *scaled_pos, + int32_t *low, int32_t *scaled_low_weight, int32_t *high, int32_t *scaled_high_weight); + +void ComputeScaleFloat(const int32_t in_value, const int32_t out_value, const bool align_corners, float *scale); + +void ComputeInterpolationArgsFloatWeight(const int32_t pos, const float scale, const int32_t size, float *actual_pos, + int32_t *low, float *low_weight, int32_t *high, float *high_weight); + int ResizeNearestNeighborInt8Simple(const int8_t *input_data, int8_t *output_data, const int *input_shape, const int *output_shape, const bool align_corners, int tid, int thread_num); @@ -38,11 +52,6 @@ int ResizeNearestNeighborInt8(const int8_t *input_data, int8_t *output_data, con const int *output_shape, const bool align_corners, const QuantMulArg *multiplier, QuantArg *quant_in, QuantArg *quant_out, int tid, int thread_num); -void ComputeScale(const int32_t in_value, const int32_t out_value, const bool align_corners, int32_t *scale); - -void ComputeInterpolationArgs(const int32_t pos, const int32_t scale, const int32_t size, int32_t *scaled_pos, - int32_t *low, int32_t *scaled_low_weight, int32_t *high, int32_t *scaled_high_weight); - void ComputeNearestNeighborInt(const int32_t pos, const int in_size, const int32_t new_size, const bool align_corners, int32_t *nearest); #ifdef __cplusplus diff --git a/mindspore/lite/nnacl/int8/softmax_int8.c b/mindspore/lite/nnacl/int8/softmax_int8.c index 0ffa437d8b..7979cf09e6 100644 --- a/mindspore/lite/nnacl/int8/softmax_int8.c +++ b/mindspore/lite/nnacl/int8/softmax_int8.c @@ -58,7 +58,7 @@ int SoftmaxInt8(const int8_t *input_ptr, int8_t *output_ptr, int count, int *exp int axis_offset = outter_offset + i * inner_size; for (int c = 0; c < inner_size; ++c) { int num_bits_over_unit; - int shifted_scale = ComputerReciproal(sum_data[c], 12, &num_bits_over_unit); + int shifted_scale = ComputerReciprocal(sum_data[c], 12, &num_bits_over_unit); int unsat_output = RoundingDivideByPOT( SaturatingRoundingDoublingHighMul(shifted_scale, exp_data[axis_offset + c]), num_bits_over_unit + 31 - 8); diff --git a/mindspore/lite/nnacl/matmul_parameter.h b/mindspore/lite/nnacl/matmul_parameter.h index 9e290e7841..7be90402c8 100644 --- a/mindspore/lite/nnacl/matmul_parameter.h +++ b/mindspore/lite/nnacl/matmul_parameter.h @@ -22,18 +22,28 @@ typedef void (*MATMUL_OPT_R4_FUNC)(const int8_t *a, const int8_t *b, int *dst, int row_4, int col_4, int deep_16, const int 
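/* ComputerReciprocal (the renamed ComputerReciproal, used by the softmax hunk above) is
   the gemmlowp-style Newton-Raphson reciprocal: x is normalized so its leading bit reaches
   bit 31, the result approximates 1/x in Q31, and *recip_shift records the normalization
   amount the caller still has to divide out. SoftmaxInt8 thereby turns the division by the
   exp-sum into a multiply, as in the hunk above:

     int num_bits_over_unit;
     int shifted_scale = ComputerReciprocal(sum_data[c], 12, &num_bits_over_unit);
     int unsat_output = RoundingDivideByPOT(
         SaturatingRoundingDoublingHighMul(shifted_scale, exp_data[axis_offset + c]),
         num_bits_over_unit + 31 - 8);
*/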
*input_sum, const int *bias); +typedef void (*MATMUL_OPT_R_FUNC)(const int8_t *a, const int8_t *b, int8_t *dst, size_t row, size_t col, size_t deep_4, + size_t stride, const int32_t *input_sum, const int32_t *bias, int32_t *left_shift, + int32_t *right_shift, int32_t *multiplier, int32_t output_zp, int32_t mini, + int32_t maxi, bool per_channel); + typedef void (*MAT_TRANS_FUNC)(void *dst, void *a, int row, int col); -typedef enum ActType { ActType_No, ActType_Relu, ActType_Relu6 } ActType; +typedef enum OutType { OutType_C8 = 0, OutType_Nhwc = 1, OutType_TileC8 = 2 } OutType; typedef struct MatMulParameter { OpParameter op_parameter_; int row_; int col_; + int row_4_; int row_8_; + int row_12_; int row_16_; + int col_4_; int col_8_; int deep_; + int deep_4_; + int deep_16_; bool has_bias_; int batch; bool a_transpose_; /* false : row-major */ diff --git a/mindspore/lite/nnacl/op_base.h b/mindspore/lite/nnacl/op_base.h index f7c90bce49..e5bf293ed3 100644 --- a/mindspore/lite/nnacl/op_base.h +++ b/mindspore/lite/nnacl/op_base.h @@ -23,8 +23,8 @@ #define C4NUM 4 #define C8NUM 8 +#define C12NUM 12 #define C16NUM 16 -#define BLOCK 4 #define TILE_NUM 8 #define MSMIN(x, y) ((x) < (y) ? (x) : (y)) @@ -55,10 +55,17 @@ typedef enum LiteDataType { kDataTypeInt8, } LiteDataType; +typedef enum DataOrder { + RowMajor, + ColMajor, +} DataOrder; + typedef struct OpParameter { char name_[100]; int type_; int thread_num_; } OpParameter; +typedef enum ActType { ActType_No, ActType_Relu, ActType_Relu6, ActType_Prelu } ActType; + #endif // MINDSPORE_LITE_NNACL_OP_BASE_H_ diff --git a/mindspore/lite/nnacl/opt_op_handler.c b/mindspore/lite/nnacl/opt_op_handler.c index 14d6309f17..a3fc07a1d8 100644 --- a/mindspore/lite/nnacl/opt_op_handler.c +++ b/mindspore/lite/nnacl/opt_op_handler.c @@ -15,6 +15,8 @@ */ #include +#include +#include "nnacl/op_base.h" #ifdef __cplusplus extern "C" { @@ -27,6 +29,10 @@ extern void IndirectGemmInt8_24x4_dp(int8_t *dst, const int8_t *src, const int8_ extern void MatMulOptR4Int8Neon64(const int8_t *a, const int8_t *b, int *dst, int row4, int col4, int deep16, const int *input_sum, const int *bias); +extern void MatmulInt8DpNeon64(const int8_t *a, const int8_t *b, int8_t *dst, int row8, int col8, int deep4, + const int *a_sums, const int *bias, int act_min, int act_max, int out_zp, int multiplier, + int left_shift, int right_shift, int row, int col, int stride); + #ifdef __cplusplus } #endif @@ -36,7 +42,7 @@ void IndirectGemmInt8_optimize_handler(int8_t *dst, const int8_t *src, const int size_t ksize, size_t ic4, size_t output_channel, size_t offset, const int32_t *input_sum, size_t act_min, size_t act_max, size_t out_zp, int32_t *out_multiplier, int32_t *shift_before, int32_t *shift_after, - size_t asymmetric, size_t per_channel) { + size_t asymmetric, size_t per_channel) { return IndirectGemmInt8_24x4_dp(dst, src, weight, bias, ksize, ic4, output_channel, offset, input_sum, act_min, act_max, out_zp, out_multiplier, shift_before, shift_after, asymmetric, per_channel); } @@ -45,4 +51,12 @@ void MatMulR4Int8_optimize_handler(const int8_t *a, const int8_t *b, int *dst, i const int *input_sum, const int *bias) { return MatMulOptR4Int8Neon64(a, b, dst, row4, col4, deep16, input_sum, bias); } + +void MatMulRInt8_optimize_handler(const int8_t *a, const int8_t *b, int8_t *dst, size_t row, size_t col, size_t deep_4, + size_t stride, const int32_t *input_sum, const int32_t *bias, int32_t *left_shift, + int32_t *right_shift, int32_t *multiplier, int32_t output_zp, int32_t mini, + int32_t 
maxi, bool per_channel) { + return MatmulInt8DpNeon64(a, b, dst, UP_ROUND(row, 8), UP_ROUND(col, 8), deep_4, input_sum, bias, mini, maxi, + output_zp, multiplier[0], left_shift[0], right_shift[0], row, col, col); +} #endif diff --git a/mindspore/lite/nnacl/pack.c b/mindspore/lite/nnacl/pack.c index 327ae04676..4a94118da4 100644 --- a/mindspore/lite/nnacl/pack.c +++ b/mindspore/lite/nnacl/pack.c @@ -62,6 +62,10 @@ void PackWeightFp32(float *weight_data, ConvParameter *conv_param, float *packed } // kernel plane loop } +void PackWeightKHWToHWKFp32(const void *src, void *dst, int plane, int channel) { + return PackNCHWToNHWCFp32(src, dst, 1, plane, channel); +} + void PackWeightInt8(int8_t *weight_data, ConvParameter *conv_param, int8_t *packed_weight, int32_t *weight_sum) { // original weight format : ohwi int kernel_h = conv_param->kernel_h_; @@ -153,22 +157,24 @@ void PackWeightInt8Opt(int8_t *weight_data, ConvParameter *conv_param, int8_t *p } // kernel plane loop } -void Conv1x1InputPackFp32(const float *src, float *dst, ConvParameter *conv_param) { +void Conv1x1InputPack(const void *src_ptr, void *dst_ptr, ConvParameter *conv_param, int data_size) { /* support nhwc */ + char *src = (char *)src_ptr; + char *dst = (char *)dst_ptr; for (int dst_h = 0; dst_h < conv_param->output_h_; dst_h++) { - int src_h = dst_h * conv_param->stride_h_ - conv_param->pad_h_; + int src_h = dst_h * conv_param->stride_h_ - conv_param->pad_u_; if (src_h < 0 || src_h >= conv_param->input_h_) { continue; } - const float *src_h_ptr = src + src_h * conv_param->input_w_ * conv_param->input_channel_; - float *dst_h_ptr = dst + dst_h * conv_param->output_w_ * conv_param->input_channel_; + const char *src_h_ptr = src + src_h * conv_param->input_w_ * conv_param->input_channel_ * data_size; + char *dst_h_ptr = dst + dst_h * conv_param->output_w_ * conv_param->input_channel_ * data_size; for (int dst_w = 0; dst_w < conv_param->output_w_; dst_w++) { - int src_w = dst_w * conv_param->stride_w_ - conv_param->pad_w_; + int src_w = dst_w * conv_param->stride_w_ - conv_param->pad_l_; if (src_w < 0 || src_w >= conv_param->input_w_) { continue; } - memcpy(dst_h_ptr + dst_w * conv_param->input_channel_, src_h_ptr + src_w * conv_param->input_channel_, - conv_param->input_channel_ * sizeof(float)); + memcpy(dst_h_ptr + dst_w * conv_param->input_channel_ * data_size, + src_h_ptr + src_w * conv_param->input_channel_ * data_size, conv_param->input_channel_ * data_size); } } return; @@ -188,6 +194,139 @@ void Pack1x1WeightFp32(const float *weight_data, float *packed_weight, ConvParam return; } +void PackInputSum16x4PerLayer(const int8_t *src, int32_t *dst, int32_t filter_zp, size_t row4, size_t col16) { + /* optimize normal -> same layout */ +#ifdef ENABLE_ARM64 + asm volatile( + "mov x10, %[src] \n" + "mov x11, %[dst] \n" + "dup v15.4s, %w[filter_zp] \n" + + "mov x0, #0 \n" + "1: \n" + "cmp x0, %[row4] \n" + "beq 4f \n" + "add x0, x0, #4\n" + "dup v10.4s, wzr \n" + "mov x2, #0 \n" + + "2: \n" + "cmp x2, %[col16] \n" + "beq 3f \n" + "add x2, x2, #16\n" + + "ld1 {v0.16b}, [x10], #16\n" + "ld1 {v1.16b}, [x10], #16\n" + "ld1 {v2.16b}, [x10], #16\n" + "ld1 {v3.16b}, [x10], #16\n" + + "saddlp v4.8h, v0.16b \n" + "saddlp v5.8h, v1.16b \n" + "saddlp v6.8h, v2.16b \n" + "saddlp v7.8h, v3.16b \n" + + "saddlp v0.4S, v4.8h \n" + "saddlp v1.4S, v5.8h \n" + "saddlp v2.4S, v6.8h \n" + "saddlp v3.4S, v7.8h \n" + + "addv s4, v0.4S \n" + "addv s5, v1.4S \n" + "addv s6, v2.4S \n" + "addv s7, v3.4S \n" + + "mov v0.s[0], v4.s[0] \n" + "mov v0.s[1], v5.s[0] 
\n" + "mov v0.s[2], v6.s[0] \n" + "mov v0.s[3], v7.s[0] \n" + + "add v10.4s, v10.4s, v0.4s \n" + "b 2b\n" + + "3: \n" + "mul v10.4s, v10.4s, v15.4s \n" + "st1 {v10.4s}, [x11], #16 \n" + "beq 1b \n" + + "4: \n" + + : + : [ dst ] "r"(dst), [ src ] "r"(src), [ row4 ] "r"(row4), [ col16 ] "r"(col16), [ filter_zp ] "r"(filter_zp) + : "x0", "x1", "x2", "x3", "x10", "x11", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v10", "v15"); +#else + for (int r = 0; r < row4; r++) { + int32_t tmp_value = 0; + for (int c = 0; c < col16; c++) { + int r4div = r / C4NUM, r4mod = r % C4NUM, c16div = c / C16NUM, c16mod = c % C16NUM; + int src_index = r4div * C4NUM * col16 + c16div * C16NUM * C4NUM + r4mod * C16NUM + c16mod; + tmp_value += src[src_index]; + } + dst[r] = tmp_value * filter_zp; + } +#endif + return; +} + +void PackInputSum16x4Int8(const int8_t *input_value, int32_t *input_sum, size_t input_channel, size_t output_channel, + size_t plane_size, ConvParameter *conv_param) { + size_t hw4 = UP_ROUND(plane_size, C4NUM); + size_t ic16 = UP_ROUND(input_channel, C16NUM); + if (conv_param->conv_quant_arg_.filter_arg_num_ == 1) { + PackInputSum16x4PerLayer(input_value, input_sum, conv_param->conv_quant_arg_.filter_quant_args_[0].zp_, hw4, ic16); + } else { + for (int ri = 0; ri < plane_size; ri++) { + int ri4div = ri / C4NUM, ri4mod = ri % C4NUM; + for (int ci = 0; ci < output_channel; ci++) { + int32_t tmp_sum_value = 0; + int ci4div = ci / C4NUM, ci4mod = ci % C4NUM; + int32_t filter_zp = conv_param->conv_quant_arg_.filter_quant_args_[ci].zp_; + for (int di = 0; di < input_channel; di++) { + size_t di16div = di / C16NUM, di16mod = di % C16NUM; + int src_index = ri4div * C4NUM * ic16 + di16div * C16NUM * C4NUM + ri4mod * C16NUM + di16mod; + tmp_sum_value += input_value[src_index]; + } + int dst_index = ci4div * C4NUM * hw4 + ri * C4NUM + ci4mod; + input_sum[dst_index] = tmp_sum_value * filter_zp; + } + } + } + return; +} + +void PackInputSum8x4Int8(const int8_t *input_value, int32_t *input_sum, size_t input_channel, size_t output_channel, + size_t plane_size, ConvParameter *conv_param) { + size_t hw8 = UP_ROUND(plane_size, C8NUM); + size_t ic4 = UP_ROUND(input_channel, C4NUM); + if (conv_param->conv_quant_arg_.filter_arg_num_ == 1) { + for (int r = 0; r < hw8; r++) { + int32_t tmp_value = 0; + for (int c = 0; c < ic4; c++) { + int r8div = r / C8NUM, r8mod = r % C8NUM, c4div = c / C4NUM, c4mod = c % C4NUM; + int src_index = r8div * C8NUM * ic4 + c4div * C8NUM * C4NUM + r8mod * C4NUM + c4mod; + tmp_value += input_value[src_index]; + } + input_sum[r] = tmp_value * conv_param->conv_quant_arg_.filter_quant_args_[0].zp_; + } + } else { + for (int ri = 0; ri < plane_size; ri++) { + int ri8div = ri / C8NUM, ri8mod = ri % C8NUM; + for (int ci = 0; ci < output_channel; ci++) { + int32_t tmp_sum_value = 0; + int ci8div = ci / C8NUM, ci8mod = ci % C8NUM; + int32_t filter_zp = conv_param->conv_quant_arg_.filter_quant_args_[ci].zp_; + for (int di = 0; di < input_channel; di++) { + size_t di4div = di / C4NUM, di4mod = di % C4NUM; + int src_index = ri8div * C8NUM * ic4 + di4div * C8NUM * C4NUM + ri8mod * C4NUM + di4mod; + tmp_sum_value += input_value[src_index]; + } + int dst_index = ci8div * C8NUM * hw8 + ri * C8NUM + ci8mod; + input_sum[dst_index] = tmp_sum_value * filter_zp; + } + } + } + return; +} + void Im2ColPackUnitFp32(const float *input_data, ConvParameter *conv_param, float *packed_input, int real_cal_num, int block_index) { // input format : nhwc @@ -195,8 +334,8 @@ void Im2ColPackUnitFp32(const float 
*input_data, ConvParameter *conv_param, floa int kernel_w = conv_param->kernel_w_; int stride_h = conv_param->stride_h_; int stride_w = conv_param->stride_w_; - int pad_h = conv_param->pad_h_; - int pad_w = conv_param->pad_w_; + int pad_h = conv_param->pad_u_; + int pad_w = conv_param->pad_l_; int dilation_h = conv_param->dilation_h_; int dilation_w = conv_param->dilation_w_; int in_channel = conv_param->input_channel_; @@ -204,23 +343,21 @@ void Im2ColPackUnitFp32(const float *input_data, ConvParameter *conv_param, floa int in_w = conv_param->input_w_; int out_w = conv_param->output_w_; int ic4 = UP_DIV(in_channel, C4NUM); + memset(packed_input, 0, kernel_h * kernel_w * ic4 * C4NUM * TILE_NUM * sizeof(float)); for (int i = 0; i < real_cal_num; i++) { int block_start = block_index + i; int input_h = block_start / out_w * stride_h - pad_h; int input_w = block_start % out_w * stride_w - pad_w; - for (int j = 0; j < kernel_h; j++) { - int input_y = input_h + j * dilation_h; - if (input_y < 0 || input_y >= in_h) { - continue; - } - int input_y_stride = input_y * in_w * ic4 * C4NUM; - for (int n = 0; n < kernel_w; n++) { - int input_x = input_w + n * dilation_w; - if (input_x < 0 || input_x >= in_w) { - continue; - } - int input_x_stride = input_y_stride + input_x * ic4 * C4NUM; + int input_stride = input_h * in_w * ic4 * C4NUM + input_w * ic4 * C4NUM; + int kh_s = MSMAX(0, UP_DIV(-input_h, dilation_h)); + int kh_e = MSMIN(kernel_h, UP_DIV(in_h - input_h, dilation_h)); + int kw_s = MSMAX(0, UP_DIV(-input_w, dilation_w)); + int kw_e = MSMIN(kernel_w, UP_DIV(in_w - input_w, dilation_w)); + for (int j = kh_s; j < kh_e; j++) { + int input_y_stride = j * dilation_h * in_w * ic4 * C4NUM + input_stride; + for (int n = kw_s; n < kw_e; n++) { + int input_x_stride = input_y_stride + n * dilation_w * ic4 * C4NUM; int input_plane_offset = (j * kernel_w + n) * C8NUM * C4NUM * ic4 + i * C4NUM; for (int m = 0; m < ic4; m++) { int channel_block_stride = input_x_stride + m * C4NUM; @@ -247,8 +384,8 @@ void Im2ColPackUnitInt8(const int8_t *input_data, int8_t *packed_input, int real int kernel_w = conv_param->kernel_w_; int stride_h = conv_param->stride_h_; int stride_w = conv_param->stride_w_; - int pad_h = conv_param->pad_h_; - int pad_w = conv_param->pad_w_; + int pad_h = conv_param->pad_u_; + int pad_w = conv_param->pad_l_; int dilation_h = conv_param->dilation_h_; int dilation_w = conv_param->dilation_w_; int in_channel = conv_param->input_channel_; @@ -318,8 +455,8 @@ void Im2ColPackUnitInt8Opt(const int8_t *input_data, int8_t *packed_input, int r int kernel_w = conv_param->kernel_w_; int stride_h = conv_param->stride_h_; int stride_w = conv_param->stride_w_; - int pad_h = conv_param->pad_h_; - int pad_w = conv_param->pad_w_; + int pad_h = conv_param->pad_u_; + int pad_w = conv_param->pad_l_; int dilation_h = conv_param->dilation_h_; int dilation_w = conv_param->dilation_w_; int in_channel = conv_param->input_channel_; @@ -350,9 +487,7 @@ void Im2ColPackUnitInt8Opt(const int8_t *input_data, int8_t *packed_input, int r for (int m = 0; m < ic4; m++) { int channel_block_stride = input_x_stride + m * C4NUM; int channel_block_offset = input_plane_offset + m * tile_num * C4NUM; - for (int k = 0; k < C4NUM; k++) { - (packed_input + channel_block_offset)[k] = (input_data + channel_block_stride)[k]; - } + memcpy(packed_input + channel_block_offset, input_data + channel_block_stride, 4); } // channel_block loop } // kernel_w loop } // kernel_h loop @@ -660,6 +795,8 @@ void PackNC4HW4ToNHWCRelu6Fp32(const void *src, 
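/* The kh_s/kh_e and kw_s/kw_e bounds in the Im2ColPackUnitFp32 hunk above replace the old
   per-tap boundary checks. A kernel row j reads input row input_h + j * dilation_h, so the
   in-bounds range is

     j >= ceil(-input_h / dilation_h)          (first row inside the top edge)
     j <  ceil((in_h - input_h) / dilation_h)  (first row past the bottom edge)

   computed once per tile with UP_DIV and clamped with MSMAX/MSMIN; the new memset
   pre-zeroes the packed buffer so the skipped padding taps contribute zeros. */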
void *dst, int batch, int plane, } } +void PackNC4HW4ToNHWCPreluFp32(const void *src, void *dst, const void *slope, int batch, int plane, int channel) {} + void PackNC4HW4ToNCHWFp32(const void *src, void *dst, int batch, int plane, int channel) { int c4 = UP_DIV(channel, C4NUM); for (int b = 0; b < batch; b++) { diff --git a/mindspore/lite/nnacl/pack.h b/mindspore/lite/nnacl/pack.h index 3567732fe9..b05083c52d 100644 --- a/mindspore/lite/nnacl/pack.h +++ b/mindspore/lite/nnacl/pack.h @@ -35,10 +35,18 @@ void Im2ColPackUnitInt8(const int8_t *input_data, int8_t *packed_input, int real void Im2ColPackUnitInt8Opt(const int8_t *input_data, int8_t *packed_input, int real_cal_num, int block_index, int32_t *input_sum, ConvParameter *conv_param); -void Conv1x1InputPackFp32(const float *src, float *dst, ConvParameter *conv_param); +void PackInputSum16x4PerLayer(const int8_t *src, int32_t *dst, int32_t filter_zp, size_t row4, size_t col16); + +void Conv1x1InputPack(const void *src_ptr, void *dst_ptr, ConvParameter *conv_param, int data_size); void Pack1x1WeightFp32(const float *weight_data, float *packed_weight, ConvParameter *conv_param); +void PackInputSum16x4Int8(const int8_t *input_value, int32_t *input_sum, size_t input_channel, size_t output_channel, + size_t plane_size, ConvParameter *conv_param); + +void PackInputSum8x4Int8(const int8_t *input_value, int32_t *input_sum, size_t input_channel, size_t output_channel, + size_t plane_size, ConvParameter *conv_param); + void MatrixPack(const float *src, float *dst, int row, int ic4, int stride); void PackInputToC8Int8(const int8_t *input_data, int16_t *packed_input, ConvParameter *conv_param); @@ -46,6 +54,8 @@ void PackInputToC8Int8(const int8_t *input_data, int16_t *packed_input, ConvPara void PackWeightFp32(float *weight_data, ConvParameter *conv_param, float *packed_weight, int oc_block, int oc_block_num); +void PackWeightKHWToHWKFp32(const void *src, void *dst, int plane, int channel); + void PackWeightInt8(int8_t *weight_data, ConvParameter *conv_param, int8_t *packed_weight, int32_t *weight_sum); void PackWeightInt8Opt(int8_t *weight_data, ConvParameter *conv_param, int8_t *packed_weight, int32_t *weight_sum); @@ -76,6 +86,8 @@ void PackNC4HW4ToNHWCReluFp32(const void *src, void *dst, int batch, int plane, void PackNC4HW4ToNHWCRelu6Fp32(const void *src, void *dst, int batch, int plane, int channel); +void PackNC4HW4ToNHWCPreluFp32(const void *src, void *dst, const void *slope, int batch, int plane, int channel); + void PackNC4HW4ToNCHWFp32(const void *src, void *dst, int batch, int plane, int channel); void PackNHWCToC8HWN8Fp32(const void *src, void *dst, int batch, int plane, int channel); diff --git a/mindspore/lite/nnacl/pooling_parameter.h b/mindspore/lite/nnacl/pooling_parameter.h index 42205af006..e3d7a239db 100644 --- a/mindspore/lite/nnacl/pooling_parameter.h +++ b/mindspore/lite/nnacl/pooling_parameter.h @@ -19,14 +19,16 @@ #include "nnacl/op_base.h" #include "nnacl/quantization/quantize.h" +typedef enum PoolMode { PoolMode_No, PoolMode_MaxPool, PoolMode_AvgPool } PoolMode; + +typedef enum RoundMode { RoundMode_No, RoundMode_Ceil, RoundMode_Floor } RoundMode; + typedef struct PoolingParameter { OpParameter op_parameter_; + PoolMode pool_mode_; + RoundMode round_mode_; + ActType act_type_; QuantArg **quant_args_; - bool global_; - bool max_pooling_; - bool avg_pooling_; - bool round_ceil_; - bool round_floor_; int window_w_; int window_h_; int input_w_; @@ -44,6 +46,8 @@ typedef struct PoolingParameter { int stride_w_; int stride_h_; 
int thread_num_; + bool global_; + bool quantize_; } PoolingParameter; #endif // MINDSPORE_LITE_NNACL_POOLING_PARAMETER_H_ diff --git a/mindspore/lite/nnacl/quantization/fixed_point.c b/mindspore/lite/nnacl/quantization/fixed_point.c index c12bac9111..7733976454 100644 --- a/mindspore/lite/nnacl/quantization/fixed_point.c +++ b/mindspore/lite/nnacl/quantization/fixed_point.c @@ -54,76 +54,34 @@ int MultiplyByQuantizedMultiplier(int32_t value, int32_t multiplier, int32_t lef return RoundingDivideByPOT(SaturatingRoundingDoublingHighMul(value * (1 << left_shift), multiplier), -right_shift); } -int FractionsBits(int kIntegerBits) { - const int totalBits = 8 * sizeof(int32_t) - 1; - return totalBits - kIntegerBits; -} +inline int FractionsBits(int integer_bits) { return 8 * sizeof(int32_t) - 1 - integer_bits; } -int FixedPoint_One(int kIntegerBits, int kFractionsBits) { - return (kIntegerBits == 0 ? INT32_MAX : ((1) << (uint32_t)(kIntegerBits == 0 ? 0 : kFractionsBits))); +inline int FixedPoint_One(int integer_bits, int fractions_bits) { + return (integer_bits == 0 ? INT32_MAX : ((1) << (uint32_t)(integer_bits == 0 ? 0 : fractions_bits))); } -int RoundingHalfSum(int a, int b) { - int64_t a64 = a; - int64_t b64 = b; - int64_t sum = a64 + b64; - int64_t sign = sum > 0 ? 1 : -1; - return (int32_t)((sum + sign) / 2); +int RoundingHalfSum(int32_t a, int32_t b) { + int64_t sum = (int64_t)a + (int64_t)b; + return (int32_t)((sum + (sum > 0 ? 1 : -1)) / 2); } -int32_t BitAnd(int32_t a, int32_t b) { return (uint32_t)a & (uint32_t)b; } +inline int32_t BitAnd(int32_t a, int32_t b) { return (uint32_t)a & (uint32_t)b; } -int32_t BitOr(int32_t a, int32_t b) { return (uint32_t)a | (uint32_t)b; } +inline int32_t BitOr(int32_t a, int32_t b) { return (uint32_t)a | (uint32_t)b; } -int32_t BitXor(int32_t a, int32_t b) { return (uint32_t)a ^ (uint32_t)b; } +inline int32_t BitXor(int32_t a, int32_t b) { return (uint32_t)a ^ (uint32_t)b; } -int32_t BitNot(int32_t a) { return ~(uint32_t)a; } +inline int32_t BitNot(int32_t a) { return ~(uint32_t)a; } -int SelectUsingMask(int mask, int bound, int val) { return BitXor(BitAnd(mask, bound), BitAnd(BitNot(mask), val)); } +inline int BitsSelect(int mask, int bound, int val) { return BitXor(BitAnd(mask, bound), BitAnd(BitNot(mask), val)); } -int32_t MaskNonZero(int32_t a) { - const int32_t zreo = 0; - return a ? BitNot(zreo) : zreo; -} +inline int ConstantPOT(int fractional_bits, int exponent) { return (1 << (uint32_t)(fractional_bits + exponent)); } -static inline int SaturatingRoundingMultiplyByPOT(int32_t x, int Exponent) { - if (Exponent > 0) { - const int min = INT32_MIN; - const int max = INT32_MAX; - const int scalar_int_bits = 8 * sizeof(int32_t); - const int thresold = ((1 << (uint32_t)(scalar_int_bits - 1 - Exponent)) - 1); - const int postive_mask = MaskNonZero(x > thresold); - const int negative_mask = MaskNonZero(x < -thresold); - int result = x * ((int32_t)(1) << (uint32_t)Exponent); - result = SelectUsingMask(postive_mask, max, result); - result = SelectUsingMask(negative_mask, min, result); - return result; - } else if (Exponent < 0) { - return RoundingDivideByPOT(x, -Exponent); - } else { - return x; - } -} +inline int32_t MaskIfNonZero(int32_t a) { return a ? 
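/* The mask helpers here enable gemmlowp-style branchless selection: each mask is either
   all ones (-1) or all zeros, so BitsSelect(mask, bound, val) picks "bound" when the mask
   is set and "val" otherwise; XOR and OR coincide because the two masked operands never
   share set bits. A standalone sketch (illustrative only):

     #include <stdint.h>
     static inline int32_t Mask(int cond) { return cond ? ~(int32_t)0 : 0; }
     static inline int32_t Select(int32_t m, int32_t a, int32_t b) { return (m & a) | (~m & b); }
     // Select(Mask(x > 7), 7, x) clamps x to at most 7 without a branch.
*/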
BitNot(0) : 0; } -int32_t Rescale(int x, int kIntegerBitsSrc, int kIntegerBitsDst) { - int kExponent = kIntegerBitsSrc - kIntegerBitsDst; - int result = SaturatingRoundingMultiplyByPOT(x, kExponent); - return result; -} +inline int32_t MaskIfZero(int32_t a) { return MaskIfNonZero(!a); } -int32_t one_over_one_plus_x_for_x_in_0_1(int32_t a) { - int one = FixedPoint_One(0, FractionsBits(0)); - int half_denominator = RoundingHalfSum(a, one); - const int constant_48_over_17 = 1515870810; - const int constant_neg_32_over_17 = -1010580540; - int x = constant_48_over_17 + SaturatingRoundingDoublingHighMul(half_denominator, constant_neg_32_over_17); - for (int i = 0; i < 3; i++) { - int half_denominator_times_x = SaturatingRoundingDoublingHighMul(half_denominator, x); - int one_minus_half_denominator_times_x = FixedPoint_One(2, FractionsBits(2)) - half_denominator_times_x; - x = x + Rescale(SaturatingRoundingDoublingHighMul(x, one_minus_half_denominator_times_x), 2 + 2, 2); - } - return Rescale(x, 2 - 1, 0); -} +inline int32_t MaskIfLessThan(int32_t a, int32_t b) { return MaskIfNonZero((a < b)); } int CountLeadingZeroBits(uint32_t x) { #if defined(__GUNC__) @@ -150,75 +108,97 @@ int CountLeadingSignBits(int32_t x) { #endif } -int32_t ComputerReciproal(int32_t x, int x_digits, int *recip_shift) { +int SaturatingRoundingMultiplyByPOT(int32_t x, int exponent) { + if (exponent > 0) { + const int min = INT32_MIN; + const int max = INT32_MAX; + const int scalar_int_bits = 8 * sizeof(int32_t); + const int threshold = ((1 << (uint32_t)(scalar_int_bits - 1 - exponent)) - 1); + const int positive_mask = x > threshold ? BitNot(0) : 0; + const int negative_mask = x < -threshold ? BitNot(0) : 0; + int result = x * ((int32_t)(1) << (uint32_t)exponent); + result = BitsSelect(positive_mask, max, result); + result = BitsSelect(negative_mask, min, result); + return result; + } else if (exponent < 0) { + return RoundingDivideByPOT(x, -exponent); + } else { + return x; + } +} + +int32_t Rescale(int x, int integer_bits_src, int integer_bits_dst) { + int exponent = integer_bits_src - integer_bits_dst; + return SaturatingRoundingMultiplyByPOT(x, exponent); +} + +int32_t reciprocal_on_interval_between_0_1(int32_t a) { + int one = FixedPoint_One(0, FractionsBits(0)); + int half_sum = RoundingHalfSum(a, one); + const int constant_48_over_17 = 1515870810; + const int constant_neg_32_over_17 = -1010580540; + int x = constant_48_over_17 + SaturatingRoundingDoublingHighMul(half_sum, constant_neg_32_over_17); + for (int i = 0; i < 3; i++) { + int half_sum_times_x = SaturatingRoundingDoublingHighMul(half_sum, x); + int one_minus_half_sum_times_x = FixedPoint_One(2, FractionsBits(2)) - half_sum_times_x; + x = x + Rescale(SaturatingRoundingDoublingHighMul(x, one_minus_half_sum_times_x), 2 + 2, 2); + } + return Rescale(x, 2 - 1, 0); +} + +int32_t ComputerReciprocal(int32_t x, int x_digits, int *recip_shift) { int leading_zreos_plus_one = CountLeadingZeroBits((uint32_t)x); *recip_shift = x_digits - leading_zreos_plus_one; const int32_t shifted_minus_one = (int32_t)(((uint32_t)x << leading_zreos_plus_one) - ((uint32_t)(1) << 31)); - const int32_t shifted_scaled = one_over_one_plus_x_for_x_in_0_1(shifted_minus_one); + const int32_t shifted_scaled = reciprocal_on_interval_between_0_1(shifted_minus_one); return shifted_scaled; } -int ConstantPOT(int fractional_bits, int exponent) { - int offset = fractional_bits + exponent; - return (1 << (uint32_t)offset); -} - -int32_t MaskIfNonZero(int32_t a) { return a ?
BitNot(0) : 0; } -int32_t MaskIfZero(int32_t a) { return MaskIfNonZero(!a); } - -int32_t MaskIfLessThan(int32_t a, int32_t b) { return MaskIfNonZero((a < b)); } - -int exp_on_interval_between_negative_one_quarter_and_0_excl(int a) { - const int constant_term = 1895147668; +int exp_on_interval_values(int a) { + const int constant_neg_1_over_8 = 1895147668; const int constant_1_over_3 = 715827883; - // We're evaluating a Taylor expansion around -1/8, so we do the change of - // variable: x = a + 1/8. - // In fixed-point with 0 integer bits, 1/8 is represented by 1 << 28. - int kFractionalBits = FractionsBits(0); - int x = a + ConstantPOT(kFractionalBits, -3); + int fractional_bits = FractionsBits(0); + int x = a + ConstantPOT(fractional_bits, -3); int x2 = SaturatingRoundingDoublingHighMul(x, x); int x3 = SaturatingRoundingDoublingHighMul(x2, x); int x4 = SaturatingRoundingDoublingHighMul(x2, x2); int x4_over_4 = SaturatingRoundingMultiplyByPOT(x4, -2); int x4_over_24_plus_x3_over_6_plus_x2_over_2 = SaturatingRoundingMultiplyByPOT((SaturatingRoundingDoublingHighMul((x4_over_4 + x3), constant_1_over_3) + x2), -1); - return constant_term + - SaturatingRoundingDoublingHighMul(constant_term, (x + x4_over_24_plus_x3_over_6_plus_x2_over_2)); -} - -int exp_on_negative_values(int a, const int tIntegerBits) { - int kIntegerBits = tIntegerBits; - int kFractionalBits = FractionsBits(tIntegerBits); - const int kOneQuarter = ConstantPOT(kFractionalBits, -2); - int mask = kOneQuarter - 1; - int a_mod_quarter_minus_one_quarter = ((unsigned)(a)&mask) - kOneQuarter; - int result = - exp_on_interval_between_negative_one_quarter_and_0_excl(Rescale(a_mod_quarter_minus_one_quarter, tIntegerBits, 0)); - int remainder = a_mod_quarter_minus_one_quarter - a; + return constant_neg_1_over_8 + + SaturatingRoundingDoublingHighMul(constant_neg_1_over_8, (x + x4_over_24_plus_x3_over_6_plus_x2_over_2)); +} -#define GEMMLOWP_EXP_BARREL_SHIFTER(Exponent, FixedPointMultiplier) \ - if (kIntegerBits > Exponent) { \ - const int kMultiplier = FixedPointMultiplier; \ - int kShiftAmount = kIntegerBits > Exponent ? kFractionalBits + Exponent : 0; \ - result = SelectUsingMask(MaskIfNonZero(BitAnd(remainder, (1 << (uint32_t)kShiftAmount))), \ - SaturatingRoundingDoublingHighMul(result, kMultiplier), result); \ - } - GEMMLOWP_EXP_BARREL_SHIFTER(-2, 1672461947); - GEMMLOWP_EXP_BARREL_SHIFTER(-1, 1302514674); - GEMMLOWP_EXP_BARREL_SHIFTER(+0, 790015084); - GEMMLOWP_EXP_BARREL_SHIFTER(+1, 290630308); - GEMMLOWP_EXP_BARREL_SHIFTER(+2, 39332535); - GEMMLOWP_EXP_BARREL_SHIFTER(+3, 720401); - GEMMLOWP_EXP_BARREL_SHIFTER(+4, 242); -#undef GEMMLOWP_EXP_BARREL_SHIFTER - - int clampB = kIntegerBits > 5 ? 36 - kIntegerBits : 0; - if (kIntegerBits > 5) { - const int clamp = -(1 << (uint32_t)clampB); - result = SelectUsingMask(MaskIfLessThan(a, clamp), 0, result); +void exp_barrel_shifter(int exponent, int multiplier, int integer_bits, int fractional_bits, int remainder, + int *result) { + if (integer_bits > exponent) { + int total_shift = integer_bits > exponent ?
fractional_bits + exponent : 0; + *result = BitsSelect(MaskIfNonZero(BitAnd(remainder, (1 << (uint32_t)total_shift))), + SaturatingRoundingDoublingHighMul(*result, multiplier), *result); + } +} - result = SelectUsingMask(MaskIfZero(a), FixedPoint_One(0, kFractionalBits), result); +int exp_on_negative_values(int a, const int integer_bits) { + int fractional_bits = FractionsBits(integer_bits); + const int one_quarter = ConstantPOT(fractional_bits, -2); + int a_mod_quarter_minus_one_quarter = ((unsigned)(a) & (one_quarter - 1)) - one_quarter; + int result = exp_on_interval_values(Rescale(a_mod_quarter_minus_one_quarter, integer_bits, 0)); + int remainder = a_mod_quarter_minus_one_quarter - a; + + exp_barrel_shifter(-2, 1672461947, integer_bits, fractional_bits, remainder, &result); + exp_barrel_shifter(-1, 1302514674, integer_bits, fractional_bits, remainder, &result); + exp_barrel_shifter(+0, 790015084, integer_bits, fractional_bits, remainder, &result); + exp_barrel_shifter(+1, 290630308, integer_bits, fractional_bits, remainder, &result); + exp_barrel_shifter(+2, 39332535, integer_bits, fractional_bits, remainder, &result); + exp_barrel_shifter(+3, 720401, integer_bits, fractional_bits, remainder, &result); + exp_barrel_shifter(+4, 242, integer_bits, fractional_bits, remainder, &result); + + int clamp_bits = integer_bits > 5 ? 36 - integer_bits : 0; + if (integer_bits > 5) { + const int clamp = -(1 << (uint32_t)clamp_bits); + result = BitsSelect(MaskIfLessThan(a, clamp), 0, result); + } + result = BitsSelect(MaskIfZero(a), FixedPoint_One(0, fractional_bits), result); return result; } diff --git a/mindspore/lite/nnacl/quantization/fixed_point.h b/mindspore/lite/nnacl/quantization/fixed_point.h index e64d76c08f..8a2fe1602d 100644 --- a/mindspore/lite/nnacl/quantization/fixed_point.h +++ b/mindspore/lite/nnacl/quantization/fixed_point.h @@ -42,46 +42,14 @@ int RoundingDivideByPOT(int x, int exponent); int MultiplyByQuantizedMultiplier(int32_t value, int32_t multiplier, int32_t left_shift, int32_t right_shift); -int FractionsBits(int kIntegerBits); - -int FixedPoint_One(int kIntegerBits, int kFractionsBits); - -int RoundingHalfSum(int a, int b); - -int32_t BitAnd(int32_t a, int32_t b); - -int32_t BitOr(int32_t a, int32_t b); - -int32_t BitXor(int32_t a, int32_t b); - -int32_t BitNot(int32_t a); - -int SelectUsingMask(int mask, int bound, int val); - -int32_t MaskNonZero(int32_t a); - int32_t Rescale(int x, int kIntegerBitsSrc, int kIntegerBitsDst); -int32_t one_over_one_plus_x_for_x_in_0_1(int32_t a); - -int CountLeadingZeroBits(uint32_t x); - int CountLeadingSignBits(int32_t x); -int32_t ComputerReciproal(int32_t x, int x_digits, int *recip_shift); +int32_t ComputerReciprocal(int32_t x, int x_digits, int *recip_shift); int exp_on_negative_values(int a, const int tIntegerBits); -int ConstantPOT(int fractional_bits, int exponent); - -int32_t MaskIfNonZero(int32_t a); - -int32_t MaskIfZero(int32_t a); - -int32_t MaskIfLessThan(int32_t a, int32_t b); - -int exp_on_interval_between_negative_one_quarter_and_0_excl(int a); - #ifdef __cplusplus } #endif diff --git a/mindspore/lite/nnacl/quantization/quantize.h b/mindspore/lite/nnacl/quantization/quantize.h index 3a2cec217a..5ea3733678 100644 --- a/mindspore/lite/nnacl/quantization/quantize.h +++ b/mindspore/lite/nnacl/quantization/quantize.h @@ -159,6 +159,12 @@ typedef struct ArithSelfQuantArg { int shift_right_; } ArithSelfQuantArg; +typedef struct GatherQuantArg { + double alpha_; + int zp_in_; + int zp_out_; +} GatherQuantArg; + +typedef struct
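/* On exp_on_negative_values in the fixed_point.c hunk above: the non-positive input is
   reduced modulo 1/4, the Taylor-series core exp_on_interval_values handles the fractional
   remainder on (-1/4, 0], and each exp_barrel_shifter call multiplies in a precomputed
   constant exp(-2^k) whenever the matching bit of the remainder is set, exactly like
   assembling a power by binary exponentiation. Check on the first constant:
   2^31 * exp(-1/4) = 2147483648 * 0.7788008 rounds to 1672461947, the multiplier passed
   for exponent -2. */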
SplitQuantArg { QuantArg in_args_; QuantArg out_args_[20]; diff --git a/mindspore/lite/nnacl/scale.c b/mindspore/lite/nnacl/scale.c deleted file mode 100644 index c52e7d773c..0000000000 --- a/mindspore/lite/nnacl/scale.c +++ /dev/null @@ -1,49 +0,0 @@ -/** - * Copyright 2020 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "nnacl/scale.h" -#include "nnacl/errorcode.h" - -int DoScale(float *in_data, float *out_data, float *scale, float *offset, int task_id, ScaleParameter *scale_param) { - if (in_data == NULL || out_data == NULL || scale == NULL || offset == NULL || scale_param == NULL) { - return NNACL_ERR; - } - - if (scale_param->has_offset_) { - for (int out = task_id; out < scale_param->outer_size_; out += scale_param->op_parameter_.thread_num_) { - int out_offset = out * scale_param->axis_size_ * scale_param->inner_size_; - for (int i = 0; i < scale_param->axis_size_; i++) { - int axis_offset = out_offset + i * scale_param->inner_size_; - for (int in = 0; in < scale_param->inner_size_; in++) { - int in_offset = axis_offset + in; - out_data[in_offset] = in_data[in_offset] * scale[i] + offset[i]; - } - } - } - } else { - for (int out = task_id; out < scale_param->outer_size_; out += scale_param->op_parameter_.thread_num_) { - int out_offset = out * scale_param->axis_size_ * scale_param->inner_size_; - for (int i = 0; i < scale_param->axis_size_; i++) { - int axis_offset = out_offset + i * scale_param->inner_size_; - for (int in = 0; in < scale_param->inner_size_; in++) { - int in_offset = axis_offset + in; - out_data[in_offset] = in_data[in_offset] * scale[i]; - } - } - } - } - return NNACL_OK; -} diff --git a/mindspore/lite/nnacl/scale.h b/mindspore/lite/nnacl/scale.h index fd0156f389..83244eb146 100644 --- a/mindspore/lite/nnacl/scale.h +++ b/mindspore/lite/nnacl/scale.h @@ -18,7 +18,6 @@ #define MINDSPORE_LITE_NNACL_SCALE_H_ #include "nnacl/op_base.h" - typedef struct ScaleParameter { OpParameter op_parameter_; int outer_size_; @@ -26,15 +25,6 @@ typedef struct ScaleParameter { int inner_size_; int axis_; bool const_scale_; - bool has_offset_; } ScaleParameter; -#ifdef __cplusplus -extern "C" { -#endif -int DoScale(float *in_data, float *out_data, float *scale, float *offset, int task_id, ScaleParameter *scale_param); -#ifdef __cplusplus -} -#endif - #endif // MINDSPORE_LITE_NNACL_SCALE_H_ diff --git a/mindspore/lite/nnacl/winograd_transform.c b/mindspore/lite/nnacl/winograd_transform.c index dbadf3155a..cc8f814d70 100644 --- a/mindspore/lite/nnacl/winograd_transform.c +++ b/mindspore/lite/nnacl/winograd_transform.c @@ -24,8 +24,8 @@ void WinogradInputTransform(const float *input_data, float *trans_input, float * int output_unit = conv_param->output_unit_; int in_channel = conv_param->input_channel_; int ic4 = UP_DIV(in_channel, C4NUM); - int pad_h = conv_param->pad_h_; - int pad_w = conv_param->pad_w_; + int pad_h = conv_param->pad_u_; + int pad_w = conv_param->pad_l_; int input_h = conv_param->input_h_; int input_w = 
conv_param->input_w_; if (out_w_block_num == 0) { @@ -42,7 +42,7 @@ void WinogradInputTransform(const float *input_data, float *trans_input, float * int interval_y_e = src_y_e < input_h ? input_unit : (input_h - src_y_s); int src_plane_offset = ic4 * C4NUM * (src_y_s * input_w + src_x_s); - int dst_plane_offset = c * C4NUM; + int dst_plane_offset = c * C4NUM * ic4; for (int ic = 0; ic < ic4; ic++) { // clear tmp buffer memset(tmp_data, 0, input_unit * input_unit * C4NUM * sizeof(float)); @@ -67,8 +67,8 @@ void WinogradInputTransform(const float *input_data, float *trans_input, float * } } // input transform - int dst_ic4_offset = dst_plane_offset + ic * TILE_NUM * C4NUM; - size_t dst_step = ic4 * C4NUM * TILE_NUM; + int dst_ic4_offset = dst_plane_offset + ic * C4NUM; + size_t dst_step = C12NUM * ic4 * C4NUM; float *trans_input_ptr = trans_input + dst_ic4_offset; input_trans_func(tmp_data, trans_input_ptr, C4NUM, dst_step); } @@ -86,6 +86,7 @@ void WinogradOutputTransform(const float *gemm_out, float *tmp_out_data, const f int output_h_unit_block = UP_DIV(output_h, output_unit); int output_channel = conv_param->output_channel_; int oc4 = UP_DIV(output_channel, C4NUM); + int oc8 = UP_DIV(output_channel, C8NUM); int input_unit = conv_param->input_unit_; if (output_unit_num == 0) { return; @@ -93,17 +94,19 @@ void WinogradOutputTransform(const float *gemm_out, float *tmp_out_data, const f for (int i = 0; i < cal_num; i++) { int dst_x_s = out_tile_index % output_unit_num; int dst_y_s = out_tile_index / output_unit_num; - int src_tile_offset = i * oc4 * C4NUM * input_unit * input_unit; + int src_tile_offset = i * oc8 * C8NUM * input_unit * input_unit; int dst_tile_offset = C4NUM * output_unit * (dst_x_s + dst_y_s * output_w_unit_block * output_unit); for (int j = 0; j < oc4; j++) { - int src_oc4_offset = src_tile_offset + j * input_unit * input_unit * C4NUM; + int c8_block = j / 2; + int c8_res = j % 2; + int src_oc4_offset = src_tile_offset + c8_block * input_unit * input_unit * C8NUM + c8_res * C4NUM; int dst_oc4_offset = dst_tile_offset + j * C4NUM * output_h_unit_block * output_w_unit_block * output_unit * output_unit; const float *src_ptr = gemm_out + src_oc4_offset; const float *bias_ptr = bias_data + j * C4NUM; float *dst_ptr = tmp_out_data + dst_oc4_offset; - output_trans_func(src_ptr, dst_ptr, bias_ptr, C4NUM, output_w_unit_block * output_unit); + output_trans_func(src_ptr, dst_ptr, bias_ptr, C8NUM, output_w_unit_block * output_unit); } out_tile_index++; } @@ -283,8 +286,8 @@ void Conv3x3Fp32InputTransform(const float *input_data, float *trans_input, floa int input_channel = conv_param->input_channel_; int input_width = conv_param->input_w_; int input_height = conv_param->input_h_; - int pad_w = conv_param->pad_w_; - int pad_h = conv_param->pad_h_; + int pad_w = conv_param->pad_l_; + int pad_h = conv_param->pad_u_; int ic4 = UP_DIV(input_channel, C4NUM); const int input_unit = 4; if (out_w_block == 0) { @@ -300,7 +303,7 @@ void Conv3x3Fp32InputTransform(const float *input_data, float *trans_input, floa int real_y_end = (origin_y + input_unit) < input_height ? 
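/* Layout note, inferred from the strides rather than stated in the patch: dst_plane_offset
   is now scaled by ic4 and dst_step is C12NUM * ic4 * C4NUM, so each Winograd coefficient
   plane stores a panel of 12 tiles (C12NUM) of ic4 four-channel blocks contiguously. That
   matches the 12-row GEMM path added elsewhere in this patch (row_12_ in MatMulParameter,
   MatmulFp32Opt.S), replacing the previous TILE_NUM = 8 interleaving. */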
input_unit : (input_height - origin_y); int src_plane_offset = ic4 * C4NUM * (origin_y * input_width + origin_x); - int dst_plane_offset = cal_id * C4NUM; + int dst_plane_offset = cal_id * C4NUM * ic4; for (int ic = 0; ic < ic4; ic++) { // clear tmp buffer memset(tmp_data, 0, input_unit * input_unit * C4NUM * sizeof(float)); @@ -326,8 +329,8 @@ void Conv3x3Fp32InputTransform(const float *input_data, float *trans_input, floa } // input transform - int dst_ic4_offset = dst_plane_offset + ic * TILE_NUM * C4NUM; - size_t dst_step = ic4 * C4NUM * TILE_NUM; + int dst_ic4_offset = dst_plane_offset + ic * C4NUM; + size_t dst_step = C12NUM * ic4 * C4NUM; float *trans_input_ptr = trans_input + dst_ic4_offset; Conv3x3Fp32InputUnit(tmp_data, trans_input_ptr, dst_step); } @@ -336,8 +339,8 @@ void Conv3x3Fp32InputTransform(const float *input_data, float *trans_input, floa void Conv3x3Fp32FilterTransform(float *weight_data, float *trans_weight, int iC4, int output_channel, int kernel_plane, int oc_block) { - const int input_unit = 4; - int dst_step = iC4 * C4NUM * oc_block; + int oc_plane_block = UP_DIV(output_channel, oc_block); + int dst_step = iC4 * C4NUM * oc_block * oc_plane_block; if (oc_block == 0) { return; } @@ -345,7 +348,7 @@ void Conv3x3Fp32FilterTransform(float *weight_data, float *trans_weight, int iC4 int oc_block_num = o / oc_block; int oc_block_rem = o % oc_block; int src_oc_offset = o * iC4 * C4NUM * kernel_plane; - int dst_oc_offset = oc_block_num * oc_block * iC4 * C4NUM * input_unit * input_unit + oc_block_rem; + int dst_oc_offset = oc_block_num * oc_block * iC4 * C4NUM + oc_block_rem; for (int i = 0; i < iC4; i++) { float *src_ic4_ptr = weight_data + src_oc_offset + i * kernel_plane * C4NUM; float *dst_ic4_ptr = trans_weight + dst_oc_offset + i * oc_block * C4NUM; @@ -559,24 +562,24 @@ void Conv3x3Fp32OutputUnit(const float *gemm_out, const float *bias_data, float float32x4_t bias_ptr = vld1q_f32(bias_data); float32x4_t s00 = vld1q_f32(gemm_out); - float32x4_t s01 = vld1q_f32(gemm_out + 4); - float32x4_t s02 = vld1q_f32(gemm_out + 8); - float32x4_t s03 = vld1q_f32(gemm_out + 12); + float32x4_t s01 = vld1q_f32(gemm_out + 8); + float32x4_t s02 = vld1q_f32(gemm_out + 16); + float32x4_t s03 = vld1q_f32(gemm_out + 24); - float32x4_t s10 = vld1q_f32(gemm_out + 16); - float32x4_t s11 = vld1q_f32(gemm_out + 20); - float32x4_t s12 = vld1q_f32(gemm_out + 24); - float32x4_t s13 = vld1q_f32(gemm_out + 28); + float32x4_t s10 = vld1q_f32(gemm_out + 32); + float32x4_t s11 = vld1q_f32(gemm_out + 40); + float32x4_t s12 = vld1q_f32(gemm_out + 48); + float32x4_t s13 = vld1q_f32(gemm_out + 56); - float32x4_t s20 = vld1q_f32(gemm_out + 32); - float32x4_t s21 = vld1q_f32(gemm_out + 36); - float32x4_t s22 = vld1q_f32(gemm_out + 40); - float32x4_t s23 = vld1q_f32(gemm_out + 44); + float32x4_t s20 = vld1q_f32(gemm_out + 64); + float32x4_t s21 = vld1q_f32(gemm_out + 72); + float32x4_t s22 = vld1q_f32(gemm_out + 80); + float32x4_t s23 = vld1q_f32(gemm_out + 88); - float32x4_t s30 = vld1q_f32(gemm_out + 48); - float32x4_t s31 = vld1q_f32(gemm_out + 52); - float32x4_t s32 = vld1q_f32(gemm_out + 56); - float32x4_t s33 = vld1q_f32(gemm_out + 60); + float32x4_t s30 = vld1q_f32(gemm_out + 96); + float32x4_t s31 = vld1q_f32(gemm_out + 104); + float32x4_t s32 = vld1q_f32(gemm_out + 112); + float32x4_t s33 = vld1q_f32(gemm_out + 120); float32x4_t t00 = vaddq_f32(vaddq_f32(s00, s10), s20); float32x4_t t01 = vaddq_f32(vaddq_f32(s01, s11), s21); @@ -609,24 +612,24 @@ void Conv3x3Fp32OutputUnit(const float *gemm_out, 
const float *bias_data, float const float *bias_ptr = bias_data + i; float s00 = local_ptr[0]; - float s01 = (local_ptr + 4)[0]; - float s02 = (local_ptr + 8)[0]; - float s03 = (local_ptr + 12)[0]; + float s01 = (local_ptr + 8)[0]; + float s02 = (local_ptr + 16)[0]; + float s03 = (local_ptr + 24)[0]; - float s10 = (local_ptr + 16)[0]; - float s11 = (local_ptr + 20)[0]; - float s12 = (local_ptr + 24)[0]; - float s13 = (local_ptr + 28)[0]; + float s10 = (local_ptr + 32)[0]; + float s11 = (local_ptr + 40)[0]; + float s12 = (local_ptr + 48)[0]; + float s13 = (local_ptr + 56)[0]; - float s20 = (local_ptr + 32)[0]; - float s21 = (local_ptr + 36)[0]; - float s22 = (local_ptr + 40)[0]; - float s23 = (local_ptr + 44)[0]; + float s20 = (local_ptr + 64)[0]; + float s21 = (local_ptr + 72)[0]; + float s22 = (local_ptr + 80)[0]; + float s23 = (local_ptr + 88)[0]; - float s30 = (local_ptr + 48)[0]; - float s31 = (local_ptr + 52)[0]; - float s32 = (local_ptr + 56)[0]; - float s33 = (local_ptr + 60)[0]; + float s30 = (local_ptr + 96)[0]; + float s31 = (local_ptr + 104)[0]; + float s32 = (local_ptr + 112)[0]; + float s33 = (local_ptr + 120)[0]; float t00 = s00 + s10 + s20; float t01 = s01 + s11 + s21; @@ -663,6 +666,7 @@ void Conv3x3Fp32OutputTransform(const float *gemm_out, float *out_data, const fl int output_w = conv_param->output_w_; int output_h = conv_param->output_h_; int oc4 = UP_DIV(output_channel, C4NUM); + int oc8 = UP_DIV(output_channel, C8NUM); const int input_unit = 4; if (out_w_block == 0) { return; @@ -670,11 +674,13 @@ void Conv3x3Fp32OutputTransform(const float *gemm_out, float *out_data, const fl for (int i = 0; i < real_cal_num; i++) { int out_w_index = (start_index + i) % out_w_block; int out_h_index = (start_index + i) / out_w_block; - int src_tile_offset = i * oc4 * C4NUM * input_unit * input_unit; + int src_tile_offset = i * oc8 * C8NUM * input_unit * input_unit; int dst_tile_offset = C4NUM * (out_w_index * OUPUT_UNIT + out_h_index * OUPUT_UNIT * output_w); for (int j = 0; j < oc4; j++) { - int src_oc4_offset = src_tile_offset + j * input_unit * input_unit * C4NUM; + int c8_block = j / 2; + int c8_res = j % 2; + int src_oc4_offset = src_tile_offset + c8_block * input_unit * input_unit * C8NUM + c8_res * C4NUM; int dst_oc4_offset = dst_tile_offset + j * C4NUM * output_h * output_w; const float *src_ptr = gemm_out + src_oc4_offset; const float *bias_ptr = bias_data + j * C4NUM; @@ -689,7 +695,7 @@ void Conv3x3Fp32OutputTransform(const float *gemm_out, float *out_data, const fl } // int8 conv3x3 -void Conv3x3Uint8InputUnit(int16_t *tmp_data, int16_t *trans_input_data, size_t step, int input_zp) { +void Conv3x3Int8InputUnit(int16_t *tmp_data, int16_t *trans_input_data, size_t step, int input_zp) { #ifdef ENABLE_ARM int16x8_t zp = vdupq_n_s16(input_zp); @@ -858,14 +864,14 @@ void Conv3x3Uint8InputUnit(int16_t *tmp_data, int16_t *trans_input_data, size_t #endif } -void Conv3x3Uint8InputTransform(const int16_t *input_data, int16_t *trans_input, int16_t *tmp_data, int start_index, +void Conv3x3Int8InputTransform(const int16_t *input_data, int16_t *trans_input, int16_t *tmp_data, int start_index, int real_cal_num, int out_w_block, ConvParameter *conv_param) { // input data format : nhwc int input_channel = conv_param->input_channel_; int input_width = conv_param->input_w_; int input_height = conv_param->input_h_; - int pad_w = conv_param->pad_w_; - int pad_h = conv_param->pad_h_; + int pad_w = conv_param->pad_l_; + int pad_h = conv_param->pad_u_; ConvQuantArg quant_arg = 
conv_param->conv_quant_arg_; int input_zp = quant_arg.input_quant_args_[0].zp_; const int ic8 = UP_DIV(input_channel, C8NUM); @@ -898,7 +904,7 @@ void Conv3x3Uint8InputTransform(const int16_t *input_data, int16_t *trans_input, int dst_ic8_offset = dst_plane_offset + ic * TILE_NUM * C8NUM; size_t dst_step = ic8 * C8NUM * TILE_NUM; int16_t *trans_input_ptr = trans_input + dst_ic8_offset; - Conv3x3Uint8InputUnit(tmp_data, trans_input_ptr, dst_step, input_zp); + Conv3x3Int8InputUnit(tmp_data, trans_input_ptr, dst_step, input_zp); } } } @@ -1169,7 +1175,7 @@ void Conv3x3Int8FilterTransform(const int16_t *weight_data, int16_t *trans_weigh } } -void Conv3x3Uint8OutputUnit(const int32_t *gemm_out, const int32_t *bias_data, int8_t *output_data, bool h_not_bound, +void Conv3x3Int8OutputUnit(const int32_t *gemm_out, const int32_t *bias_data, int8_t *output_data, bool h_not_bound, bool w_not_bound, int output_w, int real_num, int oc_start, ConvParameter *conv_param) { int32_t *left_shift = conv_param->conv_quant_arg_.left_shift_; int32_t *right_shift = conv_param->conv_quant_arg_.right_shift_; @@ -1221,13 +1227,13 @@ void Conv3x3Uint8OutputUnit(const int32_t *gemm_out, const int32_t *bias_data, i int32x4_t ls; int32x4_t rs; if ((conv_param->conv_quant_arg_.per_channel_ & FILTER_PER_CHANNEL)) { - out_multiplier = vld1q_s32(quant_multiplier); - ls = vld1q_s32(left_shift); - rs = vld1q_s32(right_shift); + out_multiplier = vld1q_s32(quant_multiplier + oc_start); + ls = vld1q_s32(left_shift + oc_start); + rs = vld1q_s32(right_shift + oc_start); } else { - out_multiplier = vdupq_n_s32(quant_multiplier); - ls = vdupq_n_s32(left_shift); - rs = vdupq_n_s32(right_shift); + out_multiplier = vdupq_n_s32(quant_multiplier[0]); + ls = vdupq_n_s32(left_shift[0]); + rs = vdupq_n_s32(right_shift[0]); } int32x4_t out_zp = vdupq_n_s32(output_zp); int32x4_t output_min = vdupq_n_s32(out_min); @@ -1261,27 +1267,27 @@ void Conv3x3Uint8OutputUnit(const int32_t *gemm_out, const int32_t *bias_data, i d11 = vmaxq_s32(d11, output_min); d11 = vminq_s32(d11, output_max); - (output_data)[0] = (uint8_t)d00[0]; - (output_data + 1)[0] = (uint8_t)d00[1]; - (output_data + 2)[0] = (uint8_t)d00[2]; - (output_data + 3)[0] = (uint8_t)d00[3]; + (output_data)[0] = (int8_t)d00[0]; + (output_data + 1)[0] = (int8_t)d00[1]; + (output_data + 2)[0] = (int8_t)d00[2]; + (output_data + 3)[0] = (int8_t)d00[3]; if (w_not_bound) { - *(output_data + 4) = (uint8_t)d01[0]; - *(output_data + 5) = (uint8_t)d01[1]; - *(output_data + 6) = (uint8_t)d01[2]; - *(output_data + 7) = (uint8_t)d01[3]; + *(output_data + 4) = (int8_t)d01[0]; + *(output_data + 5) = (int8_t)d01[1]; + *(output_data + 6) = (int8_t)d01[2]; + *(output_data + 7) = (int8_t)d01[3]; } if (h_not_bound) { - *(output_data + output_w * 4) = (uint8_t)d10[0]; - *(output_data + output_w * 4 + 1) = (uint8_t)d10[1]; - *(output_data + output_w * 4 + 2) = (uint8_t)d10[2]; - *(output_data + output_w * 4 + 3) = (uint8_t)d10[3]; + *(output_data + output_w * 4) = (int8_t)d10[0]; + *(output_data + output_w * 4 + 1) = (int8_t)d10[1]; + *(output_data + output_w * 4 + 2) = (int8_t)d10[2]; + *(output_data + output_w * 4 + 3) = (int8_t)d10[3]; if (w_not_bound) { - *(output_data + output_w * 4 + 4) = (uint8_t)d11[0]; - *(output_data + output_w * 4 + 5) = (uint8_t)d11[1]; - *(output_data + output_w * 4 + 6) = (uint8_t)d11[2]; - *(output_data + output_w * 4 + 7) = (uint8_t)d11[3]; + *(output_data + output_w * 4 + 4) = (int8_t)d11[0]; + *(output_data + output_w * 4 + 5) = (int8_t)d11[1]; + *(output_data + output_w * 4 + 
6) = (int8_t)d11[2]; + *(output_data + output_w * 4 + 7) = (int8_t)d11[3]; } } #else @@ -1450,7 +1456,7 @@ void Conv3x3Uint8OutputUnit(const int32_t *gemm_out, const int32_t *bias_data, i #endif } -void Conv3x3Uint8OutputTransform(const int32_t *gemm_out, int8_t *out_data, const int32_t *bias_data, int start_index, +void Conv3x3Int8OutputTransform(const int32_t *gemm_out, int8_t *out_data, const int32_t *bias_data, int start_index, int real_cal_num, int out_w_block, ConvParameter *conv_param) { int output_channel = conv_param->output_channel_; int output_w = conv_param->output_w_; @@ -1477,7 +1483,7 @@ void Conv3x3Uint8OutputTransform(const int32_t *gemm_out, int8_t *out_data, cons int real_num = (output_channel - j * C4NUM) < C4NUM ? (output_channel - j * C4NUM) : C4NUM; bool w_not_bound = out_w_index * OUPUT_UNIT + 1 < output_w; bool h_not_bound = out_h_index * OUPUT_UNIT + 1 < output_h; - Conv3x3Uint8OutputUnit(src_ptr, bias_ptr, dst_ptr, h_not_bound, w_not_bound, output_w, real_num, j * C4NUM, + Conv3x3Int8OutputUnit(src_ptr, bias_ptr, dst_ptr, h_not_bound, w_not_bound, output_w, real_num, j * C4NUM, conv_param); } } diff --git a/mindspore/lite/nnacl/winograd_transform.h b/mindspore/lite/nnacl/winograd_transform.h index 4edbb75db2..0cea9da41b 100644 --- a/mindspore/lite/nnacl/winograd_transform.h +++ b/mindspore/lite/nnacl/winograd_transform.h @@ -56,18 +56,18 @@ void Conv3x3Fp32OutputTransform(const float *gemm_out, float *out_data, const fl int real_cal_num, int out_w_block, ConvParameter *conv_param); // for int8 convolution 3x3 filter/input/output transform -void Conv3x3Uint8InputUnit(int16_t *tmp_data, int16_t *trans_input_data, size_t step, int input_zp); +void Conv3x3Int8InputUnit(int16_t *tmp_data, int16_t *trans_input_data, size_t step, int input_zp); -void Conv3x3Uint8InputTransform(const int16_t *input_data, int16_t *trans_input, int16_t *tmp_data, int start_index, +void Conv3x3Int8InputTransform(const int16_t *input_data, int16_t *trans_input, int16_t *tmp_data, int start_index, int real_cal_num, int out_w_block, ConvParameter *conv_param); void Conv3x3Int8FilterTransform(const int16_t *weight_data, int16_t *trans_weight, int iC8, int output_channel, int kernel_plane); -void Conv3x3Uint8OutputUnit(const int32_t *gemm_out, const int32_t *bias_data, int8_t *output_data, bool h_not_bound, +void Conv3x3Int8OutputUnit(const int32_t *gemm_out, const int32_t *bias_data, int8_t *output_data, bool h_not_bound, bool w_not_bound, int output_w, int real_num, int oc_start, ConvParameter *conv_param); -void Conv3x3Uint8OutputTransform(const int32_t *gemm_out, int8_t *out_data, const int32_t *bias_data, int start_index, +void Conv3x3Int8OutputTransform(const int32_t *gemm_out, int8_t *out_data, const int32_t *bias_data, int start_index, int real_cal_num, int out_w_block, ConvParameter *conv_param); #ifdef __cplusplus } diff --git a/mindspore/lite/nnacl/winograd_utils.c b/mindspore/lite/nnacl/winograd_utils.c index 6a63d9a844..f53c919715 100644 --- a/mindspore/lite/nnacl/winograd_utils.c +++ b/mindspore/lite/nnacl/winograd_utils.c @@ -4649,43 +4649,41 @@ void OutputTransform8x7Unit(const float *src_data, float *dst_data, const float // Utilize cost model to compute performance gain. // If the gain is greater than got from Im2col, winograd algorithm will be chosen. 
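For orientation before the rewritten heuristic: SelectOutputUnit below now compares the multiply count of plain im2col convolution against the per-tile transform-plus-GEMM work of Winograd for every candidate output unit, minus a penalty that grows with the tile size. A minimal standalone sketch of that arithmetic (the helper name, the local UP_DIV copy, and the 56x56 / 64-channel example are illustrative assumptions, not part of the patch):

    // Sketch of the cost model SelectOutputUnit evaluates for one candidate unit.
    #define UP_DIV(x, y) (((x) + (y) - 1) / (y))  // round-up division, as in nnacl/op_base.h

    static float WinogradReduceRate(int out_h, int out_w, int in_c, int out_c, int kernel, int unit) {
      int input_unit = unit + kernel - 1;  // tile edge after the input transform
      // multiplies of conventional (im2col) convolution over the whole output
      float common_cost = (float)out_h * out_w * in_c * out_c * kernel * kernel;
      // per-tile transform + GEMM + output-transform work, times the number of tiles
      float wino_cost = ((2 + out_c) * (float)input_unit * input_unit * in_c +
                         ((float)input_unit + unit) * unit * out_c) *
                        UP_DIV(out_w, unit) * UP_DIV(out_h, unit);
      // larger tiles pay a fixed overhead penalty scaled by tile/kernel area
      float penalty = ((float)input_unit * input_unit) / ((float)kernel * kernel) * 0.12f;
      return common_cost / wino_cost - penalty;  // rates below 1.0f fall back to im2col
    }
    // e.g. a 56x56 output with 64 in/out channels and a 3x3 kernel at unit 4:
    // WinogradReduceRate(56, 56, 64, 64, 3, 4) ~= 3.8 - 0.48 ~= 3.3, so unit 4 wins.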
 int SelectOutputUnit(ConvParameter *conv_param) {
-  int input_batch = conv_param->input_batch_;
   int kernel_h = conv_param->kernel_h_;
   int kernel_w = conv_param->kernel_w_;
-  int in_channel = conv_param->input_channel_;
-  int out_h = conv_param->output_h_;
+  int in_c = conv_param->input_channel_;
   int out_w = conv_param->output_w_;
-  int out_channel = conv_param->output_channel_;
-  int out_plane = out_h * out_w;
-
-  int max_unit = sqrt((float)(out_plane));
-  max_unit = max_unit > MIN_UNIT ? max_unit : MIN_UNIT;
-  max_unit = max_unit < MAX_UNIT ? max_unit : MAX_UNIT;
-  int output_unit = 1;
-  float ratio = 0.0f;
-  // cost of conventional convolution multiplications
-  float ori_cost = out_plane * out_channel * in_channel * kernel_h * kernel_w;
-
-  for (int u = MIN_UNIT; u < max_unit; u++) {
-    int input_unit = u + kernel_h - 1;
-    if (input_unit != 4 && input_unit != 8) {
+  int out_h = conv_param->output_h_;
+  int out_c = conv_param->output_channel_;
+  int unit2 = UP_DIV(out_w * out_h, C12NUM * conv_param->op_parameter_.thread_num_);
+  int max_out_unit = (int)(sqrtf((float)unit2));
+  max_out_unit = max_out_unit < MAX_UNIT ? MAX_UNIT : max_out_unit;
+  max_out_unit = max_out_unit > MIN_UNIT ? max_out_unit : MIN_UNIT;
+
+  int unit = 0;
+  float max_rate = 0.0f;
+  float common_cost = (float)out_h * out_w * in_c * out_c * kernel_h * kernel_w;
+
+  for (int i = MIN_UNIT; i <= max_out_unit; ++i) {
+    int input_unit = i + kernel_w - 1;
+    OutputTransformUnitFunc output_trans_func = GetOutputTransFunc(input_unit, i);
+    if (output_trans_func == NULL) {
       continue;
     }
-    // don't count filter transform cost, because it can be processed once offline.
-    const float input_trans_unit_cost = 2 * input_unit * input_unit * input_unit * in_channel;
-    float gemm_unit_cost = input_unit * input_unit * in_channel * out_channel;
-    float output_trans_unit_cost = input_unit * u * (u + input_unit) * out_channel;
-    // equation (23) in papar
-    float winograd_cost = (input_trans_unit_cost + gemm_unit_cost + output_trans_unit_cost) *
-                          (UP_DIV(out_w, u) * (UP_DIV(out_h, u))) * input_batch;
-    float reduce_rate = ori_cost / winograd_cost;
-    if (reduce_rate > ratio && reduce_rate > 1) {
-      ratio = reduce_rate;
-      output_unit = u;
+    float penalty = ((float)input_unit * input_unit) / ((float)kernel_h * kernel_w) * 0.12f;
+    float wino_cost = ((2 + out_c) * (float)input_unit * input_unit * in_c + ((float)input_unit + i) * i * out_c) *
+                      UP_DIV(out_w, i) * UP_DIV(out_h, i);
+    float reduce_rate = common_cost / wino_cost - penalty;
+    if (reduce_rate > max_rate) {
+      max_rate = reduce_rate;
+      unit = i;
     }
   }
+  if (max_rate < 1.0f) {
+    return 1;
+  }
   // If output_unit is 1, then it is conventional convolution
-  return output_unit;
+  return unit;
 }
 
 InputTransformUnitFunc GetInputTransFunc(int input_unit) {
@@ -4719,17 +4717,6 @@ void CheckIfUseWinograd(bool *use_winograd, int *output_unit, ConvParameter *con
     *output_unit = SelectOutputUnit(conv_param);
     if (*output_unit > 1) {
       *use_winograd = true;
-      int input_unit = conv_param->kernel_h_ + *output_unit - 1;
-      input_trans_func = GetInputTransFunc(input_unit);
-      if (input_trans_func == NULL) {
-        *use_winograd = false;
-      }
-      output_trans_func = GetOutputTransFunc(input_unit, *output_unit);
-      if (output_trans_func == NULL) {
-        *use_winograd = false;
-      }
-    } else {
-      *use_winograd = false;
     }
   } else {
     *use_winograd = false;
diff --git a/mindspore/lite/schema/model.fbs b/mindspore/lite/schema/model.fbs
index e78fe29d1d..4031af06f5 100644
--- a/mindspore/lite/schema/model.fbs
+++
b/mindspore/lite/schema/model.fbs @@ -80,7 +80,7 @@ union PrimitiveType { Pad, Maximum, Minimum, - CaffePReLU, + PReLU, LeakyReLU, ArgMax, ArgMin, @@ -126,7 +126,6 @@ union PrimitiveType { Broadcast, BroadcastTo, Lrn, - Prelu, ZerosLike, TopK, SpaceToDepth, diff --git a/mindspore/lite/schema/ops.fbs b/mindspore/lite/schema/ops.fbs index 78c6140e43..592b23f964 100644 --- a/mindspore/lite/schema/ops.fbs +++ b/mindspore/lite/schema/ops.fbs @@ -290,6 +290,7 @@ table Pooling { padLeft: int; padRight: int; roundMode: RoundMode; + activationType: ActivationType = 0; } table DepthwiseConv2D { @@ -456,6 +457,7 @@ table Min { table Slice { format: Format = 0; + axes: [int]; begin: [int]; size: [int]; } @@ -538,7 +540,7 @@ table MatMul { transposeB : bool = false; } -table CaffePReLU { +table PReLU { channelShared : bool = false; slope: [float]; } @@ -648,10 +650,6 @@ table Reduce { mode: ReduceMode; } -table Prelu { - slope: [float]; -} - table Transpose { perm: [int]; conjugate: bool = false; diff --git a/mindspore/lite/src/CMakeLists.txt b/mindspore/lite/src/CMakeLists.txt index 6aa0150206..0dbf379c4f 100644 --- a/mindspore/lite/src/CMakeLists.txt +++ b/mindspore/lite/src/CMakeLists.txt @@ -1,11 +1,21 @@ +if (PLATFORM_ARM32 OR PLATFORM_ARM64) + # for performance + if ("${CMAKE_BUILD_TYPE}" STREQUAL "Release") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fomit-frame-pointer -fstrict-aliasing -ffunction-sections -fdata-sections -ffast-math") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fomit-frame-pointer -fstrict-aliasing -ffunction-sections -fdata-sections -ffast-math") + endif() +endif () + set(LITE_SRC ${CMAKE_CURRENT_SOURCE_DIR}/common/graph_util.cc ${CMAKE_CURRENT_SOURCE_DIR}/common/ms_tensor_utils.cc + ${CMAKE_CURRENT_SOURCE_DIR}/common/log_adapter.cc ${CMAKE_CURRENT_SOURCE_DIR}/runtime/allocator.cc ${CMAKE_CURRENT_SOURCE_DIR}/runtime/runtime_api.cc - ${CMAKE_CURRENT_SOURCE_DIR}/runtime/thread_pool.cc + ${CMAKE_CURRENT_SOURCE_DIR}/runtime/thread_pool.c ${CMAKE_CURRENT_SOURCE_DIR}/runtime/workspace_pool.cc ${CMAKE_CURRENT_SOURCE_DIR}/ir/tensor.cc + ${CMAKE_CURRENT_SOURCE_DIR}/ir/meta_tensor_extends.cc ${CMAKE_CURRENT_SOURCE_DIR}/context.cc ${CMAKE_CURRENT_SOURCE_DIR}/executor.cc ${CMAKE_CURRENT_SOURCE_DIR}/kernel_registry.cc @@ -28,16 +38,23 @@ if (SUPPORT_GPU) ) endif () -set(ANF_SRC - ${ANF_SRC} - ${CMAKE_CURRENT_SOURCE_DIR}/ir/meta_tensor_extends.cc - ) file(GLOB_RECURSE C_OPS_SRC ${CMAKE_CURRENT_SOURCE_DIR}/ops/*.cc) -add_library(mindspore-lite SHARED ${LITE_SRC} ${ANF_SRC} ${C_OPS_SRC}) -target_link_libraries(mindspore-lite - cpu_kernel_mid_ - ) - +add_library(mindspore-lite SHARED ${LITE_SRC} ${C_OPS_SRC}) +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-private-field") +add_library(core_mid_ OBJECT ${CORE_SRC}) +if (SUPPORT_GPU) + add_subdirectory(runtime/kernel/opencl) + target_link_libraries(mindspore-lite +core_mid_ + cpu_kernel_mid_ + opencl_kernel_lib_ + ) +else () + target_link_libraries(mindspore-lite + core_mid_ + cpu_kernel_mid_ + ) +endif () add_subdirectory(runtime/kernel/arm) if (PLATFORM_ARM32 OR PLATFORM_ARM64) target_link_libraries(mindspore-lite log) @@ -46,8 +63,6 @@ if (BUILD_MINDDATA) target_link_libraries(mindspore-lite minddata-eager minddata-lite) endif () -add_subdirectory(ops) - if ("${CMAKE_BUILD_TYPE}" STREQUAL "Release" AND (PLATFORM_ARM64 OR PLATFORM_ARM32)) add_custom_command(TARGET mindspore-lite POST_BUILD COMMAND ${ANDROID_NDK}/toolchains/aarch64-linux-android-4.9/prebuilt/linux-x86_64/aarch64-linux-android/bin/strip diff --git 
a/mindspore/lite/src/common/graph_util.cc b/mindspore/lite/src/common/graph_util.cc old mode 100755 new mode 100644 index 1ff45db63d..835f9b9e22 --- a/mindspore/lite/src/common/graph_util.cc +++ b/mindspore/lite/src/common/graph_util.cc @@ -27,55 +27,56 @@ namespace lite { std::vector GetGraphInputNodes(const schema::MetaGraph *meta_graph) { MS_ASSERT(nullptr != meta_graph); std::vector ret; - for (size_t i = 0; i < meta_graph->inputIndex()->size(); i++) { - auto input_index = meta_graph->inputIndex()->GetAs(i); + for (auto graph_in_index : *(meta_graph->inputIndex())) { for (size_t j = 0; j < meta_graph->nodes()->size(); j++) { auto *cNode = meta_graph->nodes()->GetAs(j); MS_ASSERT(nullptr != cNode); MS_ASSERT(nullptr != cNode->inputIndex()); - for (size_t k = 0; k < cNode->inputIndex()->size(); k++) { - if (cNode->inputIndex()->GetAs(k) == input_index) { - if (!IsContain(ret, j)) { - ret.emplace_back(j); - } - break; + if (std::any_of(cNode->inputIndex()->begin(), cNode->inputIndex()->end(), + [&](const uint32_t &node_in_index) { return node_in_index == graph_in_index; })) { + if (!IsContain(ret, j)) { + ret.emplace_back(j); } } } } - return std::move(ret); + return ret; } std::vector GetGraphOutputNodes(const schema::MetaGraph *meta_graph) { MS_ASSERT(nullptr != meta_graph); std::vector ret; - for (size_t i = 0; i < meta_graph->outputIndex()->size(); i++) { - auto output_index = meta_graph->outputIndex()->GetAs(i); + for (auto graph_out_index : *(meta_graph->outputIndex())) { for (size_t j = 0; j < meta_graph->nodes()->size(); j++) { auto *cNode = meta_graph->nodes()->GetAs(j); MS_ASSERT(nullptr != cNode); - for (size_t k = 0; k < cNode->outputIndex()->size(); k++) { - if (cNode->outputIndex()->GetAs(k) == output_index) { - if (!IsContain(ret, j)) { - ret.emplace_back(j); - } - break; + MS_ASSERT(nullptr != cNode->outputIndex()); + if (std::any_of(cNode->outputIndex()->begin(), cNode->outputIndex()->end(), + [&](const uint32_t &node_out_index) { return node_out_index == graph_out_index; })) { + if (!IsContain(ret, j)) { + ret.emplace_back(j); } } } } - return std::move(ret); + return ret; } -// NODE_ID OpNode::ID() { return id; } -// -// void OpNode::AddInEdge(NODE_ID nodeId) { inEdges.insert(nodeId); } -// -// void OpNode::AddOutEdge(NODE_ID nodeId) { outEdges.insert(nodeId); } -// -// std::unordered_set OpNode::GetAllInEdges() { return inEdges; } -// -// std::unordered_set OpNode::GetAllOutEdges() { return outEdges; } +std::vector GetLinkedPostNodeIdx(const schema::MetaGraph &graph, const size_t &tensor_idx) { + std::vector post_node_idxes; + for (size_t i = 0; i < graph.nodes()->size(); i++) { + auto node = graph.nodes()->GetAs(i); + if (node == nullptr) { + continue; + } + auto node_input_idxes = node->inputIndex(); + auto is_contain = std::any_of(node_input_idxes->begin(), node_input_idxes->end(), + [&](const uint32_t &node_input_idx) { return node_input_idx == tensor_idx; }); + if (is_contain) { + post_node_idxes.emplace_back(i); + } + } + return post_node_idxes; +} } // namespace lite } // namespace mindspore - diff --git a/mindspore/lite/src/common/graph_util.h b/mindspore/lite/src/common/graph_util.h old mode 100755 new mode 100644 index 7b1abf36b7..5f494aa202 --- a/mindspore/lite/src/common/graph_util.h +++ b/mindspore/lite/src/common/graph_util.h @@ -34,215 +34,8 @@ std::vector GetGraphInputNodes(const schema::MetaGraph *meta_graph); std::vector GetGraphOutputNodes(const schema::MetaGraph *meta_graph); -class OpNode { - public: - explicit OpNode(const NODE_ID &nodeId) : 
id(nodeId) {} - NODE_ID ID() { return id; }; - void AddInEdge(NODE_ID nodeId) { inEdges.insert(nodeId); } - void AddOutEdge(NODE_ID nodeId) { outEdges.insert(nodeId); } - std::unordered_set GetAllInEdges() { return inEdges; } - std::unordered_set GetAllOutEdges() { return outEdges; } - - protected: - NODE_ID id; - std::unordered_set inEdges; - std::unordered_set outEdges; -}; - - -template -class OpGraph { - public: - OpGraph() {} - - ~OpGraph(); - - int Build(const schema::MetaGraph *subGraphDef); - NODE_T *GetNode(NODE_ID nodeId); - NODE_T *AddNode(NODE_ID nodeId); - std::unordered_set GetInputNode(); - std::unordered_set GetOutputNode(); - - void AddNodes(std::vector addNodes); - void DeleteNodes(std::vector deleteNodes); - - void AddEdge(NODE_ID nodeId); - int AddEdge(NODE_ID srcId, NODE_ID dstId); - int AddEdge(const schema::CNode *srcNodeDef, const flatbuffers::Vector> *opDefs); - std::unordered_map> GetDepends(); - - protected: - std::unordered_map nodes; -}; - -template -int OpGraph::Build(const schema::MetaGraph *subGraphDef) { - if (subGraphDef == nullptr) { - // MS_LOGE("subGraphDef is nullptr"); - return RET_ERROR; - } - - auto opDefs = subGraphDef->nodes(); - - uint32_t opCount = opDefs->size(); - for (uint32_t i = 0; i < opCount; i++) { - auto opDef = opDefs->GetAs(i); - auto node = AddNode(std::string(opDef->name()->c_str())); - if (node == nullptr) { - // MS_LOGE("add srcNode failed,name %s", opDef->name()->c_str()); - return RET_ERROR; - } - auto ret = AddEdge(opDef, opDefs); - if (ret != RET_OK) { - // MS_LOGE("%s add edge failed. ret:%d", opDef->name()->c_str(), ret); - return RET_ERROR; - } - } - - return RET_OK; -} -template -int OpGraph::AddEdge(const schema::CNode *srcNodeDef, - const flatbuffers::Vector> *nodeDefs) { - MS_ASSERT(srcNodeDef != nullptr); - MS_ASSERT(nodeDefs != nullptr); - NODE_ID srcId = std::string(srcNodeDef->name()->c_str()); - uint32_t opCount = nodeDefs->size(); - // for single op condition - AddNode(srcId); - for (auto index : *(srcNodeDef->outputIndex())) { - for (uint32_t i = 0; i < opCount; i++) { - auto dstNodeDef = nodeDefs->GetAs(i); - bool find = false; - auto inputIndex = dstNodeDef->inputIndex(); - if (std::any_of(inputIndex->begin(), inputIndex->end(), [&index](int i) { return i == index; })) { - find = true; - } - - if (!find) { - continue; - } - NODE_ID dstId = std::string(dstNodeDef->name()->c_str()); - auto ret = AddEdge(srcId, dstId); - if (ret != RET_OK) { - return ret; - } - } - } - - return RET_OK; -} - -template -int OpGraph::AddEdge(NODE_ID srcId, NODE_ID dstId) { - auto srcNode = AddNode(srcId); - if (srcNode == nullptr) { - // MS_LOGE("add srcNode failed"); - return RET_ERROR; - } - auto dstNode = AddNode(dstId); - if (dstNode == nullptr) { - // MS_LOGE("add dstNode failed"); - return RET_ERROR; - } - - srcNode->AddOutEdge(dstNode); - - dstNode->AddInEdge(srcNode); - return RET_OK; -} - -template -NODE_T *OpGraph::GetNode(NODE_ID nodeId) { - auto node = nodes.find(nodeId); - if (node == nodes.end()) { - return nullptr; - } - return node->second; -} - -template -NODE_T *OpGraph::AddNode(NODE_ID nodeId) { - auto node = GetNode(nodeId); - if (node != nullptr) { - return node; - } - node = new (std::nothrow) NODE_T(nodeId); - if (node == nullptr) { - // MS_LOGE("new node failed"); - return nullptr; - } - nodes[nodeId] = node; - return node; -} - -template -void OpGraph::AddNodes(std::vector addNodes) { - for (auto node : addNodes) { - if (node == nullptr) { - return; - } - - nodes[node->ID()] = node; - } -} - -template -void 
OpGraph::DeleteNodes(std::vector deleteNodes) { - for (auto deletenode : deleteNodes) { - if (deletenode == nullptr) { - continue; - } - auto node = GetNode(deletenode->ID()); - if (node == nullptr) { - continue; - } - nodes.erase(deletenode->ID()); - } -} - -template -std::unordered_set OpGraph::GetInputNode() { - std::unordered_set inputNodes; - for (const auto &iter : nodes) { - auto node = iter.second; - if (node->GetAllInEdges().empty()) { - inputNodes.insert(node); - } - } - return inputNodes; -} - -template -std::unordered_set OpGraph::GetOutputNode() { - std::unordered_set outputNodes; - for (const auto &iter : nodes) { - auto node = iter.second; - if (node->GetAllOutEdges().empty()) { - outputNodes.insert(node); - } - } - return outputNodes; -} - -template -std::unordered_map> OpGraph::GetDepends() { - std::unordered_map> depends; - for (auto nodeIter : nodes) { - depends[nodeIter.second] = nodeIter.second->GetAllInEdges(); - } - return depends; -} - -template -OpGraph::~OpGraph() { - for (auto iter : nodes) { - delete iter.second; - } - nodes.clear(); -} +std::vector GetLinkedPostNodeIdx(const schema::MetaGraph &graph, const size_t &tensor_idx); } // namespace lite } // namespace mindspore #endif // MINDSPORE_LITE_COMMON_GRAPH_UTIL_H_ - diff --git a/mindspore/lite/src/kernel_registry.cc b/mindspore/lite/src/kernel_registry.cc index 7b4d8eb9cf..d283877516 100644 --- a/mindspore/lite/src/kernel_registry.cc +++ b/mindspore/lite/src/kernel_registry.cc @@ -100,15 +100,14 @@ kernel::LiteKernel *KernelRegistry::GetKernel(const std::vector &out_tensors, const PrimitiveC *primitive, const Context *ctx, const kernel::KernelKey &key) { - MS_EXCEPTION_IF_NULL(primitive); - MS_EXCEPTION_IF_NULL(ctx); + MS_ASSERT(nullptr != primitive); + MS_ASSERT(nullptr != ctx); auto parameter = kernel::PopulateParameter(primitive); if (parameter == nullptr) { MS_LOG(ERROR) << "PopulateParameter return nullptr, type: " << schema::EnumNamePrimitiveType((schema::PrimitiveType)primitive->Type()); return nullptr; } - this->op_parameters_.emplace_back(parameter); auto creator = GetCreator(key); if (creator != nullptr) { auto kernel = creator(in_tensors, out_tensors, parameter, ctx, key, primitive); @@ -117,10 +116,5 @@ kernel::LiteKernel *KernelRegistry::GetKernel(const std::vector(primitive_))->SetInferFlag(true); + auto ret = (const_cast(primitive_))->InferShape(in_tensors_, out_tensors_); + if (ret != 0) { + (const_cast(primitive_))->SetInferFlag(false); + MS_LOG(ERROR) << "InferShape fail!"; + return ret; + } + ret = ReSize(); + if (ret != 0) { + MS_LOG(ERROR) << "ReSize fail!ret: " << ret; + return ret; + } + } + + auto &outputs = this->out_tensors(); + for (auto *output : outputs) { + MS_ASSERT(output != nullptr); + output->MallocData(); + } + return RET_OK; +} + std::vector LiteKernelUtil::SubgraphInputKernels( const std::vector &kernels) { std::vector input_kernels; diff --git a/mindspore/lite/src/lite_kernel.h b/mindspore/lite/src/lite_kernel.h index 64682c3e73..d646135267 100644 --- a/mindspore/lite/src/lite_kernel.h +++ b/mindspore/lite/src/lite_kernel.h @@ -73,22 +73,15 @@ class LiteKernel { this->out_kernels_.clear(); } - virtual ~LiteKernel() = default; - - virtual int Prepare() { - if (!InferShapeDone()) { - (const_cast(primitive_))->InferShape(in_tensors_, out_tensors_); - ReSize(); - } - - auto &outputs = this->out_tensors(); - for (auto *output : outputs) { - MS_ASSERT(output != nullptr); - output->MallocData(); + virtual ~LiteKernel() { + if (op_parameter_ != nullptr) { + free(op_parameter_); + 
op_parameter_ = nullptr; } - return RET_OK; } + virtual int Prepare(); + virtual int Init() { return -1; } virtual int ReSize() { return -1; } diff --git a/mindspore/lite/src/lite_session.cc b/mindspore/lite/src/lite_session.cc index bae52d7093..2f3b1a7951 100644 --- a/mindspore/lite/src/lite_session.cc +++ b/mindspore/lite/src/lite_session.cc @@ -16,6 +16,7 @@ #include "src/lite_session.h" #include +#include #include "include/errorcode.h" #include "utils/log_adapter.h" #include "src/scheduler.h" @@ -31,10 +32,29 @@ namespace mindspore { namespace lite { +static std::vector packed_op = { + schema::PrimitiveType_Conv2D, schema::PrimitiveType_DeConv2D, + schema::PrimitiveType_DepthwiseConv2D, schema::PrimitiveType_DeDepthwiseConv2D, + schema::PrimitiveType_MatMul}; + +// this method will not check whether tensor_idx is a weight tensor index, caller should ensure this. +static bool WeightTensorNeedCopy(const lite::Model *model, const uint32_t tensor_idx) { + MS_ASSERT(nullptr != model); + auto meta_graph = model->GetMetaGraph(); + MS_ASSERT(nullptr != meta_graph); + auto post_node_idxes = GetLinkedPostNodeIdx(*meta_graph, tensor_idx); + return std::none_of(post_node_idxes.begin(), post_node_idxes.end(), [&](const size_t &post_node_idx) { + auto cNode = meta_graph->nodes()->GetAs(post_node_idx); + MS_ASSERT(cNode != nullptr); + return IsContain(packed_op, cNode->primitive()->value_type()); + }); +} + int LiteSession::ConvertTensors(const lite::Model *model) { - MS_EXCEPTION_IF_NULL(model); + MS_ASSERT(nullptr != model); auto meta_graph = model->GetMetaGraph(); - MS_EXCEPTION_IF_NULL(meta_graph); + MS_ASSERT(nullptr != meta_graph); + copyed_tensor_idxes_.clear(); uint32_t tensorCount = meta_graph->allTensors()->size(); for (uint32_t i = 0; i < tensorCount; i++) { auto *srcTensor = meta_graph->allTensors()->GetAs(i); @@ -53,16 +73,30 @@ int LiteSession::ConvertTensors(const lite::Model *model) { } } int dataType = srcTensor->dataType(); - auto *dstTensor = new tensor::Tensor(TypeId(dataType), shape, srcTensor->format(), srcTensor->nodeType()); + auto *dstTensor = + new (std::nothrow) tensor::Tensor(TypeId(dataType), shape, srcTensor->format(), srcTensor->nodeType()); + if (dstTensor == nullptr) { + MS_LOG(ERROR) << "new " << i << "th tensor failed"; + return RET_NULL_PTR; + } if (srcTensor->nodeType() == schema::NodeType_ValueNode && srcTensor->data() != nullptr && srcTensor->data()->size() > 0) { if (shape.empty()) { shape.push_back(1); + dstTensor->set_shape(shape); } - MS_ASSERT(dstTensor != nullptr); MS_ASSERT(dstTensor->Size() == srcTensor->data()->size()); - // no copy data, do copy when call LiteKernel::Init - dstTensor->SetData(const_cast(srcTensor->data()->data())); + if (WeightTensorNeedCopy(model, i)) { + auto ret = dstTensor->MallocData(); + if (ret != RET_OK) { + MS_LOG(ERROR) << "Malloc data for " << i << "th tensor failed"; + return RET_ERROR; + } + memcpy(dstTensor->Data(), srcTensor->data()->data(), dstTensor->Size()); + copyed_tensor_idxes_.emplace_back(i); + } else { + dstTensor->SetData(const_cast(srcTensor->data()->data())); + } } auto quant_params = srcTensor->quantParams(); if (quant_params != nullptr) { @@ -73,7 +107,6 @@ int LiteSession::ConvertTensors(const lite::Model *model) { dstTensor->AddQuantParam(quant_arg); } } - this->tensors_.emplace_back(dstTensor); } @@ -81,6 +114,7 @@ int LiteSession::ConvertTensors(const lite::Model *model) { } void LiteSession::InitGraphInputTensors(const lite::Model *model) { + MS_ASSERT(model != nullptr); auto meta_graph = 
model->GetMetaGraph(); MS_ASSERT(this->inputs_.empty()); MS_ASSERT(meta_graph != nullptr); @@ -93,7 +127,7 @@ void LiteSession::InitGraphInputTensors(const lite::Model *model) { } } -void LiteSession::InitGraphInputMSTensors(const lite::Model *model) { +void LiteSession::InitGraphInputMSTensors() { MS_ASSERT(this->input_vec_.empty()); for (auto &input_tensor : this->inputs_) { MS_ASSERT(input_tensor != nullptr); @@ -102,6 +136,7 @@ void LiteSession::InitGraphInputMSTensors(const lite::Model *model) { } void LiteSession::InitGraphOutputTensors(const lite::Model *model) { + MS_ASSERT(model != nullptr); auto meta_graph = model->GetMetaGraph(); MS_ASSERT(this->outputs_.empty()); MS_ASSERT(meta_graph != nullptr); @@ -115,6 +150,7 @@ void LiteSession::InitGraphOutputTensors(const lite::Model *model) { } void LiteSession::InitGraphInputMap(const lite::Model *model) { + MS_ASSERT(model != nullptr); auto meta_graph = model->GetMetaGraph(); MS_ASSERT(this->input_map_.empty()); MS_ASSERT(meta_graph != nullptr); @@ -145,9 +181,10 @@ void LiteSession::InitGraphInputMap(const lite::Model *model) { } } -void LiteSession::InitGraphOutputMap(const lite::Model *model) { +void LiteSession::InitGraphOutputNodeMap(const lite::Model *model) { + MS_ASSERT(model != nullptr); auto meta_graph = model->GetMetaGraph(); - MS_ASSERT(this->output_map_.empty()); + MS_ASSERT(this->output_node_map_.empty()); MS_ASSERT(meta_graph != nullptr); auto graph_output_node_indexes = GetGraphOutputNodes(meta_graph); for (auto out_node_index : graph_output_node_indexes) { @@ -171,17 +208,44 @@ void LiteSession::InitGraphOutputMap(const lite::Model *model) { MS_ASSERT(out_tensor != nullptr); auto *ms_tensor = new tensor::LiteTensor(out_tensor); MS_ASSERT(nullptr != ms_tensor); - this->output_map_[out_node->name()->str()].emplace_back(ms_tensor); + this->output_node_map_[out_node->name()->str()].emplace_back(ms_tensor); } } } +void LiteSession::InitGraphOutputTensorNames(const lite::Model *model) { + MS_ASSERT(model != nullptr); + auto meta_graph = model->GetMetaGraph(); + MS_ASSERT(this->output_tensor_names_.empty()); + MS_ASSERT(meta_graph != nullptr); + for (auto output_index : *meta_graph->outputIndex()) { + this->output_tensor_names_.emplace_back(std::to_string(output_index)); + } +} + +void LiteSession::InitGraphOutputTensorMap(const lite::Model *model) { + MS_ASSERT(model != nullptr); + auto meta_graph = model->GetMetaGraph(); + MS_ASSERT(this->output_tensor_map_.empty()); + MS_ASSERT(meta_graph != nullptr); + for (auto graph_out_index : *(meta_graph->outputIndex())) { + MS_ASSERT(graph_out_index < this->tensors_.size()); + auto *out_tensor = this->tensors_.at(graph_out_index); + MS_ASSERT(out_tensor != nullptr); + auto *ms_tensor = new tensor::LiteTensor(out_tensor); + MS_ASSERT(nullptr != ms_tensor); + this->output_tensor_map_.insert(std::make_pair(std::to_string(graph_out_index), ms_tensor)); + } +} + void LiteSession::InitGraphInOutTensors(const lite::Model *model) { InitGraphInputTensors(model); - InitGraphInputMSTensors(model); + InitGraphInputMSTensors(); InitGraphOutputTensors(model); InitGraphInputMap(model); - InitGraphOutputMap(model); + InitGraphOutputNodeMap(model); + InitGraphOutputTensorNames(model); + InitGraphOutputTensorMap(model); } int LiteSession::CompileGraph(Model *model) { @@ -208,14 +272,14 @@ int LiteSession::CompileGraph(Model *model) { } executor->Prepare(this->kernels_); + model->FreeMetaGraph(); return RET_OK; } std::vector LiteSession::GetInputs() const { return this->input_vec_; } int 
LiteSession::RunGraph(const session::KernelCallBack &before, const session::KernelCallBack &after) { - MS_EXCEPTION_IF_NULL(this->context_); - SetMaxWokerNum(context_->thread_num_); + MS_ASSERT(this->context_); if (before == nullptr && after == nullptr) { return executor->Run(this->inputs_, this->outputs_, this->kernels_, this->context_->allocator.get()); } else { @@ -223,12 +287,8 @@ int LiteSession::RunGraph(const session::KernelCallBack &before, const session:: } } -std::unordered_map> LiteSession::GetOutputs() const { - return this->output_map_; -} - int LiteSession::Init(Context *context) { - MS_EXCEPTION_IF_NULL(context); + MS_ASSERT(nullptr != context); this->context_ = new (std::nothrow) Context(context->thread_num_, context->allocator, context->device_ctx_); if (this->context_ == nullptr) { MS_LOG(ERROR) << "new context failed"; @@ -236,7 +296,7 @@ int LiteSession::Init(Context *context) { } this->context_->float16_priority = context->float16_priority; this->context_->cpu_bind_mode_ = context->cpu_bind_mode_; - ConfigThreadPool(context->cpu_bind_mode_, context->thread_num_); + ConfigThreadPool(THREAD_POOL_DEFAULT, context->thread_num_, context->cpu_bind_mode_); auto ret = KernelRegistry::GetInstance()->Init(); if (ret != RET_OK) { MS_LOG(ERROR) << "KernelRegistry Init Failed."; @@ -246,23 +306,30 @@ int LiteSession::Init(Context *context) { if (context_->device_ctx_.type == DT_GPU) { auto opencl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); opencl_runtime->Init(); + MS_LOG(INFO) << "Init OpenCL runtime."; } #endif executor = new Executor(); - MS_EXCEPTION_IF_NULL(executor); + if (nullptr == executor) { + MS_LOG(ERROR) << "new Executor failed"; + return RET_ERROR; + } return RET_OK; } void LiteSession::BindThread(bool if_bind) { if (this->context_->cpu_bind_mode_ != NO_BIND) { - DoAllThreadBind(if_bind, static_cast(this->context_->cpu_bind_mode_)); + BindThreads(THREAD_POOL_DEFAULT, if_bind, this->context_->cpu_bind_mode_); } } LiteSession::~LiteSession() { - for (auto *tensor : tensors_) { - // weight data can not be to free, we will free weight data when freeing meta_graph - if (tensor->TensorType() == schema::NodeType_ValueNode && !IsContain(this->inputs_, tensor)) { + for (size_t i = 0; i < tensors_.size(); i++) { + auto *tensor = tensors_.at(i); + MS_ASSERT(tensor != nullptr); + // data of weight tensor of node in packed_op can not be to free, we will free weight data when freeing meta_graph + if (tensor->TensorType() == schema::NodeType_ValueNode && !IsContain(this->inputs_, tensor) && + !IsContain(copyed_tensor_idxes_, i)) { tensor->SetData(nullptr); } delete tensor; @@ -276,14 +343,19 @@ LiteSession::~LiteSession() { iter.second.clear(); } input_map_.clear(); - for (auto iter : this->output_map_) { + for (auto iter : this->output_node_map_) { for (auto *ms_tensor : iter.second) { ((tensor::LiteTensor *)ms_tensor)->SetTensorImpl(nullptr); delete ms_tensor; } iter.second.clear(); } - output_map_.clear(); + output_node_map_.clear(); + for (auto iter : this->output_tensor_map_) { + ((tensor::LiteTensor *)(iter.second))->SetTensorImpl(nullptr); + delete (iter.second); + } + output_tensor_map_.clear(); for (auto *kernel : kernels_) { delete kernel; } @@ -294,6 +366,11 @@ LiteSession::~LiteSession() { } } input_vec_.clear(); +#if SUPPORT_GPU + if (context_->device_ctx_.type == DT_GPU) { + lite::opencl::OpenCLRuntime::DeleteInstance(); + } +#endif delete this->context_; delete this->executor; this->executor = nullptr; @@ -309,16 +386,35 @@ std::vector 
LiteSession::GetInputsByName(const st return ret->second; } -std::vector LiteSession::GetOutputsByName(const std::string &name) const { - auto ret = output_map_.find(name); - if (ret == output_map_.end()) { - MS_LOG(WARNING) << "Node " << name << " is not an output node"; +std::unordered_map> LiteSession::GetOutputMapByNode() const { + return this->output_node_map_; +} + +std::vector LiteSession::GetOutputsByNodeName(const std::string &node_name) const { + auto ret = output_node_map_.find(node_name); + if (ret == output_node_map_.end()) { + MS_LOG(WARNING) << "Node " << node_name << " is not an output node"; std::vector empty_ret; return empty_ret; } return ret->second; } +std::vector LiteSession::GetOutputTensorNames() const { return this->output_tensor_names_; } + +mindspore::tensor::MSTensor *LiteSession::GetOutputByTensorName(const std::string &tensor_name) const { + auto ret = output_tensor_map_.find(tensor_name); + if (ret == output_tensor_map_.end()) { + MS_LOG(WARNING) << "Tensor " << tensor_name << " is not an output node"; + return nullptr; + } + return ret->second; +} + +std::unordered_map LiteSession::GetOutputMapByTensor() const { + return this->output_tensor_map_; +} + int LiteSession::ResizeInputs(const std::vector &inputs) { if (inputs.size() != inputs_.size()) { MS_LOG(ERROR) << "Inputs size " << inputs.size() << " is not equal to " << inputs_.size(); diff --git a/mindspore/lite/src/lite_session.h b/mindspore/lite/src/lite_session.h index c034ade083..53a205cbe5 100644 --- a/mindspore/lite/src/lite_session.h +++ b/mindspore/lite/src/lite_session.h @@ -50,9 +50,15 @@ class LiteSession : public session::LiteSession { int RunGraph(const session::KernelCallBack &before = nullptr, const session::KernelCallBack &after = nullptr) override; - std::unordered_map> GetOutputs() const override; + std::unordered_map> GetOutputMapByNode() const override; - std::vector GetOutputsByName(const std::string &name) const override; + std::vector GetOutputsByNodeName(const std::string &node_name) const override; + + std::vector GetOutputTensorNames() const override; + + mindspore::tensor::MSTensor *GetOutputByTensorName(const std::string &tensor_name) const override; + + std::unordered_map GetOutputMapByTensor() const override; int Resize(const std::vector &inputs) override; @@ -63,13 +69,17 @@ class LiteSession : public session::LiteSession { void InitGraphInputTensors(const lite::Model *model); - void InitGraphInputMSTensors(const lite::Model *model); + void InitGraphInputMSTensors(); void InitGraphOutputTensors(const lite::Model *model); void InitGraphInputMap(const lite::Model *model); - void InitGraphOutputMap(const lite::Model *model); + void InitGraphOutputNodeMap(const lite::Model *model); + + void InitGraphOutputTensorNames(const lite::Model *model); + + void InitGraphOutputTensorMap(const lite::Model *model); int ResizeInputs(const std::vector &inputs); @@ -77,6 +87,7 @@ class LiteSession : public session::LiteSession { Context *context_ = nullptr; std::vector kernels_; std::vector tensors_; + std::vector copyed_tensor_idxes_; // graph input tensors std::vector inputs_; // graph output tensors @@ -86,7 +97,11 @@ class LiteSession : public session::LiteSession { // graph input node name -- input tensors std::unordered_map> input_map_; // graph output node name -- output tensors - std::unordered_map> output_map_; + std::unordered_map> output_node_map_; + + std::vector output_tensor_names_; + // graph output tensor name -- output tensor + std::unordered_map output_tensor_map_; Executor 
*executor = nullptr; }; } // namespace lite diff --git a/mindspore/lite/src/model.cc b/mindspore/lite/src/model.cc index 7704c2c856..cdc7c5f1c8 100644 --- a/mindspore/lite/src/model.cc +++ b/mindspore/lite/src/model.cc @@ -16,98 +16,6 @@ #include "src/ops/primitive_c.h" #include "include/model.h" -#include "src/ops/unique.h" -#include "src/ops/space_to_batch.h" -#include "src/ops/conv2d.h" -#include "src/ops/roi_pooling.h" -#include "src/ops/topk.h" -#include "src/ops/broadcast_to.h" -#include "src/ops/unsqueeze.h" -#include "src/ops/unstack.h" -#include "src/ops/depth_to_space.h" -#include "src/ops/batch_to_space.h" -#include "src/ops/prior_box.h" -#include "src/ops/lstm.h" -#include "src/ops/softmax.h" -#include "src/ops/activation.h" -#include "src/ops/deconv2d.h" -#include "src/ops/reduce.h" -#include "src/ops/pooling.h" -#include "src/ops/fused_batchnorm.h" -#include "src/ops/batch_norm.h" -#include "src/ops/power.h" -#include "src/ops/range.h" -#include "src/ops/add.h" -#include "src/ops/sub.h" -#include "src/ops/div.h" -#include "src/ops/bias_add.h" -#include "src/ops/expand_dims.h" -#include "src/ops/full_connection.h" -#include "src/ops/shape.h" -#include "src/ops/elu.h" -#include "src/ops/embedding_lookup.h" -#include "src/ops/quant_dtype_cast.h" -#include "src/ops/matmul.h" -#include "src/ops/resize.h" -#include "src/ops/tile.h" -#include "src/ops/one_hot.h" -#include "src/ops/space_to_depth.h" -#include "src/ops/split.h" -#include "src/ops/argmax.h" -#include "src/ops/argmin.h" -#include "src/ops/cast.h" -#include "src/ops/reshape.h" -#include "src/ops/scale.h" -#include "src/ops/concat.h" -#include "src/ops/nchw2nhwc.h" -#include "src/ops/slice.h" -#include "src/ops/squeeze.h" -#include "src/ops/flatten.h" -#include "src/ops/mean.h" -#include "src/ops/nhwc2nchw.h" -#include "src/ops/stack.h" -#include "src/ops/crop.h" -#include "src/ops/addn.h" -#include "src/ops/gather.h" -#include "src/ops/gather_nd.h" -#include "src/ops/local_response_normalization.h" -#include "src/ops/pad.h" -#include "src/ops/prelu.h" -#include "src/ops/caffe_p_relu.h" -#include "src/ops/reverse_sequence.h" -#include "src/ops/dedepthwise_conv2d.h" -#include "src/ops/depthwise_conv2d.h" -#include "src/ops/mul.h" -#include "src/ops/eltwise.h" -#include "src/ops/fill.h" -#include "src/ops/transpose.h" -#include "src/ops/log.h" -#include "src/ops/abs.h" -#include "src/ops/sin.h" -#include "src/ops/cos.h" -#include "src/ops/sqrt.h" -#include "src/ops/square.h" -#include "src/ops/exp.h" -#include "src/ops/rsqrt.h" -#include "src/ops/maximum.h" -#include "src/ops/minimum.h" -#include "src/ops/strided_slice.h" -#include "src/ops/reverse.h" -#include "src/ops/logical_and.h" -#include "src/ops/logical_or.h" -#include "src/ops/logical_not.h" -#include "src/ops/floor_div.h" -#include "src/ops/floor_mod.h" -#include "src/ops/equal.h" -#include "src/ops/not_equal.h" -#include "src/ops/less.h" -#include "src/ops/less_equal.h" -#include "src/ops/greater_equal.h" -#include "src/ops/greater.h" -#include "src/ops/floor.h" -#include "src/ops/squared_difference.h" -#include "src/ops/ceil.h" -#include "src/ops/round.h" #include "utils/log_adapter.h" namespace mindspore::lite { @@ -193,7 +101,7 @@ int ModelImpl::BuildOps() { MS_LOG(ERROR) << "mete_graph is nullptr"; return -1; } - MS_EXCEPTION_IF_NULL(meta_graph_->nodes()); + MS_ASSERT(nullptr != meta_graph_->nodes()); for (size_t i = 0; i < meta_graph_->nodes()->size(); i++) { auto cNode = meta_graph_->nodes()->GetAs(i); auto name = cNode->name()->str(); @@ -221,17 +129,17 @@ 
Model *Model::Import(const char *model_buf, size_t size) { Model::~Model() { delete (this->model_impl_); } mindspore::lite::PrimitiveC *Model::GetOp(const std::string &name) const { - MS_EXCEPTION_IF_NULL(model_impl_); + MS_ASSERT(nullptr != model_impl_); return const_cast(model_impl_->GetOp(name)); } void Model::FreeMetaGraph() { - MS_EXCEPTION_IF_NULL(model_impl_); - return model_impl_->FreeMetaGraph(); + MS_ASSERT(nullptr != model_impl_); + model_impl_->FreeMetaGraph(); } const schema::MetaGraph *Model::GetMetaGraph() const { - MS_EXCEPTION_IF_NULL(model_impl_); + MS_ASSERT(nullptr != model_impl_); return model_impl_->meta_graph(); } diff --git a/mindspore/lite/src/ops/CMakeLists.txt b/mindspore/lite/src/ops/CMakeLists.txt deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/mindspore/lite/src/ops/abs.cc b/mindspore/lite/src/ops/abs.cc new file mode 100644 index 0000000000..1416513b06 --- /dev/null +++ b/mindspore/lite/src/ops/abs.cc @@ -0,0 +1,32 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "src/ops/abs.h" + +namespace mindspore { +namespace lite { +#ifndef PRIMITIVE_WRITEABLE +int Abs::UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) { + MS_ASSERT(nullptr != primitive); + MS_ASSERT(nullptr != fbb); + auto val_offset = schema::CreateAbs(*fbb); + auto prim_offset = schema::CreatePrimitive(*fbb, schema::PrimitiveType_Abs, val_offset.o); + fbb->Finish(prim_offset); + return RET_OK; +} +#endif +} // namespace lite +} // namespace mindspore diff --git a/mindspore/lite/src/ops/abs.h b/mindspore/lite/src/ops/abs.h index 9214628204..7b4fdc1e45 100644 --- a/mindspore/lite/src/ops/abs.h +++ b/mindspore/lite/src/ops/abs.h @@ -19,11 +19,6 @@ #include #include "ir/dtype/type_id.h" #include "src/ops/arithmetic_self.h" -#ifdef PRIMITIVE_WRITEABLE -#include "schema/inner/model_generated.h" -#else -#include "schema/model_generated.h" -#endif #ifndef LITE_MINDSPORE_LITE_C_OPS_ABS_H_ #define LITE_MINDSPORE_LITE_C_OPS_ABS_H_ @@ -33,10 +28,13 @@ namespace lite { class Abs : public ArithmeticSelf { public: #ifdef PRIMITIVE_WRITEABLE + MS_DECLARE_PARENT(Abs, ArithmeticSelf); Abs() = default; explicit Abs(schema::PrimitiveT *primitive) : ArithmeticSelf(primitive) {} #else - explicit Abs(schema::Primitive *primitive) : ArithmeticSelf(primitive) {} + Abs() = default; + + int UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) override; #endif }; } // namespace lite diff --git a/mindspore/lite/src/ops/activation.cc b/mindspore/lite/src/ops/activation.cc index 861ac72905..b4af4c053c 100644 --- a/mindspore/lite/src/ops/activation.cc +++ b/mindspore/lite/src/ops/activation.cc @@ -27,7 +27,18 @@ void Activation::SetType(int type) { this->primitive_->value.AsActivation()->typ void Activation::SetAlpha(float alpha) { this->primitive_->value.AsActivation()->alpha = alpha; } int Activation::UnPackAttr(const Primitive &prim, const 
std::vector &inputs) { - this->primitive_ = new (schema::PrimitiveT); + if (this->primitive_ == nullptr) { + this->primitive_ = new (std::nothrow) schema::PrimitiveT; + if (this->primitive_ == nullptr) { + MS_LOG(ERROR) << "new primitiveT failed"; + return RET_ERROR; + } + this->primitive_->value.type = schema::PrimitiveType_Activation; + } + if (this->primitive_->value.type != schema::PrimitiveType_Activation) { + MS_LOG(ERROR) << "Primitive type is error :" << this->primitive_->value.type; + return RET_ERROR; + } auto attr = std::make_unique(); if (prim.name() == "ReLU") { attr->type = schema::ActivationType_RELU; @@ -36,18 +47,29 @@ int Activation::UnPackAttr(const Primitive &prim, const std::vector } else if (prim.name() == "ReLU6") { attr->type = schema::ActivationType_RELU6; } - this->primitive_->value.type = schema::PrimitiveType_Activation; this->primitive_->value.value = attr.release(); - + if (this->primitive_->value.value == nullptr) { + MS_LOG(ERROR) << "new primitiveT value failed"; + return RET_ERROR; + } return RET_OK; } #else - +int Activation::UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) { + MS_ASSERT(nullptr != primitive); + MS_ASSERT(nullptr != fbb); + auto attr = primitive->value_as_Activation(); + if (attr == nullptr) { + MS_LOG(ERROR) << "value_as_Activation return nullptr"; + return RET_ERROR; + } + auto val_offset = schema::CreateActivation(*fbb, attr->type(), attr->alpha()); + auto prim_offset = schema::CreatePrimitive(*fbb, schema::PrimitiveType_Activation, val_offset.o); + fbb->Finish(prim_offset); + return RET_OK; +} int Activation::GetType() const { return this->primitive_->value_as_Activation()->type(); } float Activation::GetAlpha() const { return this->primitive_->value_as_Activation()->alpha(); } - -void Activation::SetType(int type) {} -void Activation::SetAlpha(float alpha) {} #endif } // namespace lite } // namespace mindspore diff --git a/mindspore/lite/src/ops/activation.h b/mindspore/lite/src/ops/activation.h index fead25192e..3934572b9c 100644 --- a/mindspore/lite/src/ops/activation.h +++ b/mindspore/lite/src/ops/activation.h @@ -27,16 +27,19 @@ namespace lite { class Activation : public PrimitiveC { public: #ifdef PRIMITIVE_WRITEABLE + MS_DECLARE_PARENT(Activation, PrimitiveC); Activation() = default; explicit Activation(schema::PrimitiveT *primitive) : PrimitiveC(primitive) {} - int UnPackAttr(const Primitive &prim, const std::vector &inputs); + int UnPackAttr(const Primitive &prim, const std::vector &inputs) override; + void SetType(int type); + void SetAlpha(float alpha); #else - explicit Activation(schema::Primitive *primitive) : PrimitiveC(primitive) {} + Activation() = default; + + int UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) override; #endif int GetType() const; float GetAlpha() const; - void SetType(int type); - void SetAlpha(float alpha); }; } // namespace lite } // namespace mindspore diff --git a/mindspore/lite/src/ops/activation_grad.cc b/mindspore/lite/src/ops/activation_grad.cc index 6ac7d9181e..a82479a6a6 100644 --- a/mindspore/lite/src/ops/activation_grad.cc +++ b/mindspore/lite/src/ops/activation_grad.cc @@ -26,10 +26,21 @@ void ActivationGrad::SetType(int type) { } #else - +int ActivationGrad::UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) { + MS_ASSERT(nullptr != primitive); + MS_ASSERT(nullptr != fbb); + auto attr = primitive->value_as_ActivationGrad(); + if (attr == nullptr) { + 
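+    // value_as_ActivationGrad() returns nullptr when the primitive's value union holds a different op type, so bail out here before dereferencing attr.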
MS_LOG(ERROR) << "value_as_ActivationGrad return nullptr"; + return RET_ERROR; + } + auto val_offset = schema::CreateActivationGrad(*fbb, attr->type()); + auto prim_offset = schema::CreatePrimitive(*fbb, schema::PrimitiveType_ActivationGrad, val_offset.o); + fbb->Finish(prim_offset); + return RET_OK; +} int ActivationGrad::GetType() const { return this->primitive_->value_as_ActivationGrad()->type(); } -void ActivationGrad::SetType(int type) {} #endif } // namespace lite } // namespace mindspore diff --git a/mindspore/lite/src/ops/activation_grad.h b/mindspore/lite/src/ops/activation_grad.h index c6199442f8..f4461d30c2 100644 --- a/mindspore/lite/src/ops/activation_grad.h +++ b/mindspore/lite/src/ops/activation_grad.h @@ -28,13 +28,16 @@ namespace lite { class ActivationGrad : public PrimitiveC { public: #ifdef PRIMITIVE_WRITEABLE + MS_DECLARE_PARENT(ActivationGrad, PrimitiveC); ActivationGrad() = default; explicit ActivationGrad(schema::PrimitiveT *primitive) : PrimitiveC(primitive) {} + void SetType(int type); #else - explicit ActivationGrad(schema::Primitive *primitive) : PrimitiveC(primitive) {} + ActivationGrad() = default; + + int UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) override; #endif int GetType() const; - void SetType(int type); }; } // namespace lite } // namespace mindspore diff --git a/mindspore/lite/src/ops/add.cc b/mindspore/lite/src/ops/add.cc index 7f93442870..d01d1e16e3 100644 --- a/mindspore/lite/src/ops/add.cc +++ b/mindspore/lite/src/ops/add.cc @@ -36,7 +36,7 @@ int Add::UnPackAttr(const Primitive &prim, const std::vector &inputs this->primitive_->value.type = schema::PrimitiveType_Add; } if (this->primitive_->value.type != schema::PrimitiveType_Add) { - MS_LOG(ERROR) << "Primitive type should be add"; + MS_LOG(ERROR) << "Primitive type is error :" << this->primitive_->value.type; return RET_ERROR; } if (this->primitive_->value.value == nullptr) { @@ -50,10 +50,21 @@ int Add::UnPackAttr(const Primitive &prim, const std::vector &inputs } #else - +int Add::UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) { + MS_ASSERT(nullptr != primitive); + MS_ASSERT(nullptr != fbb); + auto attr = primitive->value_as_Add(); + if (attr == nullptr) { + MS_LOG(ERROR) << "value_as_Add return nullptr"; + return RET_ERROR; + } + auto val_offset = schema::CreateAdd(*fbb, attr->activationType()); + auto prim_offset = schema::CreatePrimitive(*fbb, schema::PrimitiveType_Add, val_offset.o); + fbb->Finish(prim_offset); + return RET_OK; +} int Add::GetActivationType() const { return this->primitive_->value_as_Add()->activationType(); } -void Add::SetActivationType(int activation_type) {} #endif } // namespace lite } // namespace mindspore diff --git a/mindspore/lite/src/ops/add.h b/mindspore/lite/src/ops/add.h index 83f58a8ec8..78583b909b 100644 --- a/mindspore/lite/src/ops/add.h +++ b/mindspore/lite/src/ops/add.h @@ -22,25 +22,23 @@ #include #include "ir/dtype/type_id.h" #include "src/ops/arithmetic.h" -#ifdef PRIMITIVE_WRITEABLE -#include "schema/inner/model_generated.h" -#else -#include "schema/model_generated.h" -#endif namespace mindspore { namespace lite { class Add : public Arithmetic { public: #ifdef PRIMITIVE_WRITEABLE + MS_DECLARE_PARENT(Add, Arithmetic); Add() = default; explicit Add(schema::PrimitiveT *primitive) : Arithmetic(primitive) {} - int UnPackAttr(const Primitive &prim, const std::vector &inputs); + int UnPackAttr(const Primitive &prim, const std::vector &inputs) override; + void 
SetActivationType(int activation_type); #else - explicit Add(schema::Primitive *primitive) : Arithmetic(primitive) {} + Add() = default; + + int UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) override; #endif int GetActivationType() const; - void SetActivationType(int activation_type); }; } // namespace lite } // namespace mindspore diff --git a/mindspore/lite/src/ops/addn.cc b/mindspore/lite/src/ops/addn.cc index 6562f03df0..9c82f8c271 100644 --- a/mindspore/lite/src/ops/addn.cc +++ b/mindspore/lite/src/ops/addn.cc @@ -24,10 +24,21 @@ int AddN::GetN() const { return this->primitive_->value.AsAddN()->N; } void AddN::SetN(int n) { this->primitive_->value.AsAddN()->N = n; } #else - +int AddN::UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) { + MS_ASSERT(nullptr != primitive); + MS_ASSERT(nullptr != fbb); + auto attr = primitive->value_as_AddN(); + if (attr == nullptr) { + MS_LOG(ERROR) << "value_as_AddN return nullptr"; + return RET_ERROR; + } + auto val_offset = schema::CreateAddN(*fbb, attr->N()); + auto prim_offset = schema::CreatePrimitive(*fbb, schema::PrimitiveType_AddN, val_offset.o); + fbb->Finish(prim_offset); + return RET_OK; +} int AddN::GetN() const { return this->primitive_->value_as_AddN()->N(); } -void AddN::SetN(int n) {} #endif namespace { diff --git a/mindspore/lite/src/ops/addn.h b/mindspore/lite/src/ops/addn.h index bf09104b17..8b5c61d060 100644 --- a/mindspore/lite/src/ops/addn.h +++ b/mindspore/lite/src/ops/addn.h @@ -28,14 +28,17 @@ namespace lite { class AddN : public PrimitiveC { public: #ifdef PRIMITIVE_WRITEABLE + MS_DECLARE_PARENT(AddN, PrimitiveC); AddN() = default; explicit AddN(schema::PrimitiveT *primitive) : PrimitiveC(primitive) {} + void SetN(int n); #else - explicit AddN(schema::Primitive *primitive) : PrimitiveC(primitive) {} + AddN() = default; + + int UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) override; #endif int InferShape(std::vector inputs_, std::vector outputs_) override; int GetN() const; - void SetN(int n); }; } // namespace lite } // namespace mindspore diff --git a/mindspore/lite/src/ops/argmax.cc b/mindspore/lite/src/ops/argmax.cc index 3bdb91ef67..3005409cc3 100644 --- a/mindspore/lite/src/ops/argmax.cc +++ b/mindspore/lite/src/ops/argmax.cc @@ -32,18 +32,26 @@ void ArgMax::SetKeepDims(bool keep_dims) { this->primitive_->value.AsArgMax()->k void ArgMax::SetAxisType(int axis_type) { this->primitive_->value.AsArgMax()->axisType = axis_type; } #else - +int ArgMax::UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) { + MS_ASSERT(nullptr != primitive); + MS_ASSERT(nullptr != fbb); + auto attr = primitive->value_as_ArgMax(); + if (attr == nullptr) { + MS_LOG(ERROR) << "value_as_ArgMax return nullptr"; + return RET_ERROR; + } + auto val_offset = + schema::CreateArgMax(*fbb, attr->axis(), attr->outMaxValue(), attr->topK(), attr->keepDims(), attr->axisType()); + auto prim_offset = schema::CreatePrimitive(*fbb, schema::PrimitiveType_ArgMax, val_offset.o); + fbb->Finish(prim_offset); + return RET_OK; +} int ArgMax::GetAxis() const { return this->primitive_->value_as_ArgMax()->axis(); } bool ArgMax::GetOutMaxValue() const { return this->primitive_->value_as_ArgMax()->outMaxValue(); } int ArgMax::GetTopK() const { return this->primitive_->value_as_ArgMax()->topK(); } bool ArgMax::GetKeepDims() const { return this->primitive_->value_as_ArgMax()->keepDims(); } int 
ArgMax::GetAxisType() const { return this->primitive_->value_as_ArgMax()->axisType(); } -void ArgMax::SetAxis(int axis) {} -void ArgMax::SetOutMaxValue(bool out_max_value) {} -void ArgMax::SetTopK(int top_k) {} -void ArgMax::SetKeepDims(bool keep_dims) {} -void ArgMax::SetAxisType(int axis_type) {} #endif int ArgMax::InferShape(std::vector inputs_, std::vector outputs_) { diff --git a/mindspore/lite/src/ops/argmax.h b/mindspore/lite/src/ops/argmax.h index dabca0b333..4b58916abc 100644 --- a/mindspore/lite/src/ops/argmax.h +++ b/mindspore/lite/src/ops/argmax.h @@ -28,10 +28,18 @@ namespace lite { class ArgMax : public PrimitiveC { public: #ifdef PRIMITIVE_WRITEABLE + MS_DECLARE_PARENT(ArgMax, PrimitiveC); ArgMax() = default; explicit ArgMax(schema::PrimitiveT *primitive) : PrimitiveC(primitive) {} + void SetAxis(int axis); + void SetOutMaxValue(bool out_max_value); + void SetTopK(int top_k); + void SetKeepDims(bool keep_dims); + void SetAxisType(int axis_type); #else - explicit ArgMax(schema::Primitive *primitive) : PrimitiveC(primitive) {} + ArgMax() = default; + + int UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) override; #endif int InferShape(std::vector inputs_, std::vector outputs_) override; int GetAxis() const; @@ -39,11 +47,6 @@ class ArgMax : public PrimitiveC { int GetTopK() const; bool GetKeepDims() const; int GetAxisType() const; - void SetAxis(int axis); - void SetOutMaxValue(bool out_max_value); - void SetTopK(int top_k); - void SetKeepDims(bool keep_dims); - void SetAxisType(int axis_type); }; } // namespace lite } // namespace mindspore diff --git a/mindspore/lite/src/ops/argmin.cc b/mindspore/lite/src/ops/argmin.cc index b95042a128..c3e300130d 100644 --- a/mindspore/lite/src/ops/argmin.cc +++ b/mindspore/lite/src/ops/argmin.cc @@ -32,18 +32,26 @@ void ArgMin::SetKeepDims(bool keep_dims) { this->primitive_->value.AsArgMin()->k void ArgMin::SetAxisType(int axis_type) { this->primitive_->value.AsArgMin()->axisType = axis_type; } #else - +int ArgMin::UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) { + MS_ASSERT(nullptr != primitive); + MS_ASSERT(nullptr != fbb); + auto attr = primitive->value_as_ArgMin(); + if (attr == nullptr) { + MS_LOG(ERROR) << "value_as_ArgMin return nullptr"; + return RET_ERROR; + } + auto val_offset = + schema::CreateArgMin(*fbb, attr->axis(), attr->outMaxValue(), attr->topK(), attr->keepDims(), attr->axisType()); + auto prim_offset = schema::CreatePrimitive(*fbb, schema::PrimitiveType_ArgMin, val_offset.o); + fbb->Finish(prim_offset); + return RET_OK; +} int ArgMin::GetAxis() const { return this->primitive_->value_as_ArgMin()->axis(); } bool ArgMin::GetOutMaxValue() const { return this->primitive_->value_as_ArgMin()->outMaxValue(); } int ArgMin::GetTopK() const { return this->primitive_->value_as_ArgMin()->topK(); } bool ArgMin::GetKeepDims() const { return this->primitive_->value_as_ArgMin()->keepDims(); } int ArgMin::GetAxisType() const { return this->primitive_->value_as_ArgMin()->axisType(); } -void ArgMin::SetAxis(int axis) {} -void ArgMin::SetOutMaxValue(bool out_max_value) {} -void ArgMin::SetTopK(int top_k) {} -void ArgMin::SetKeepDims(bool keep_dims) {} -void ArgMin::SetAxisType(int axis_type) {} #endif int ArgMin::InferShape(std::vector inputs_, std::vector outputs_) { diff --git a/mindspore/lite/src/ops/argmin.h b/mindspore/lite/src/ops/argmin.h index 4d4ae653ef..a62fff3917 100644 --- a/mindspore/lite/src/ops/argmin.h +++ 
b/mindspore/lite/src/ops/argmin.h
@@ -28,10 +28,18 @@ namespace lite {
 class ArgMin : public PrimitiveC {
  public:
 #ifdef PRIMITIVE_WRITEABLE
+  MS_DECLARE_PARENT(ArgMin, PrimitiveC);
   ArgMin() = default;
   explicit ArgMin(schema::PrimitiveT *primitive) : PrimitiveC(primitive) {}
+  void SetAxis(int axis);
+  void SetOutMaxValue(bool out_max_value);
+  void SetTopK(int top_k);
+  void SetKeepDims(bool keep_dims);
+  void SetAxisType(int axis_type);
 #else
-  explicit ArgMin(schema::Primitive *primitive) : PrimitiveC(primitive) {}
+  ArgMin() = default;
+
+  int UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) override;
 #endif
   int InferShape(std::vector<lite::tensor::Tensor *> inputs_, std::vector<lite::tensor::Tensor *> outputs_) override;
   int GetAxis() const;
@@ -39,11 +47,6 @@ class ArgMin : public PrimitiveC {
   int GetTopK() const;
   bool GetKeepDims() const;
   int GetAxisType() const;
-  void SetAxis(int axis);
-  void SetOutMaxValue(bool out_max_value);
-  void SetTopK(int top_k);
-  void SetKeepDims(bool keep_dims);
-  void SetAxisType(int axis_type);
 };
 }  // namespace lite
 }  // namespace mindspore
diff --git a/mindspore/lite/src/ops/arithmetic.h b/mindspore/lite/src/ops/arithmetic.h
index fcc0cda3ad..bcc516c1fe 100644
--- a/mindspore/lite/src/ops/arithmetic.h
+++ b/mindspore/lite/src/ops/arithmetic.h
@@ -28,10 +28,14 @@ namespace lite {
 class Arithmetic : public PrimitiveC {
  public:
 #ifdef PRIMITIVE_WRITEABLE
+  MS_DECLARE_PARENT(Arithmetic, PrimitiveC);
   Arithmetic() = default;
   explicit Arithmetic(schema::PrimitiveT *primitive) : PrimitiveC(primitive) {}
 #else
-  explicit Arithmetic(schema::Primitive *primitive) : PrimitiveC(primitive) {}
+  Arithmetic() = default;
+  int UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) override {
+    return RET_ERROR;
+  }
 #endif
   int InferShape(std::vector<lite::tensor::Tensor *> inputs_, std::vector<lite::tensor::Tensor *> outputs_) override;
   bool Broadcasting() { return this->broadcasting_; }
diff --git a/mindspore/lite/src/ops/arithmetic_self.h b/mindspore/lite/src/ops/arithmetic_self.h
index d7df543a82..57e8a108ef 100644
--- a/mindspore/lite/src/ops/arithmetic_self.h
+++ b/mindspore/lite/src/ops/arithmetic_self.h
@@ -25,10 +25,14 @@ namespace lite {
 class ArithmeticSelf : public PrimitiveC {
  public:
 #ifdef PRIMITIVE_WRITEABLE
+  MS_DECLARE_PARENT(ArithmeticSelf, PrimitiveC);
   ArithmeticSelf() = default;
   explicit ArithmeticSelf(schema::PrimitiveT *primitive) : PrimitiveC(primitive) {}
 #else
-  explicit ArithmeticSelf(schema::Primitive *primitive) : PrimitiveC(primitive) {}
+  ArithmeticSelf() = default;
+  int UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) override {
+    return RET_ERROR;
+  }
 #endif
   int InferShape(std::vector<lite::tensor::Tensor *> inputs_, std::vector<lite::tensor::Tensor *> outputs_) override;
 };
diff --git a/mindspore/lite/src/ops/batch_norm.cc b/mindspore/lite/src/ops/batch_norm.cc
index 3b68b19353..736e6a9441 100644
--- a/mindspore/lite/src/ops/batch_norm.cc
+++ b/mindspore/lite/src/ops/batch_norm.cc
@@ -24,19 +24,41 @@ float BatchNorm::GetEpsilon() const { return this->primitive_->value.AsBatchNorm()->epsilon; }
 void BatchNorm::SetEpsilon(float epsilon) { this->primitive_->value.AsBatchNorm()->epsilon = epsilon; }
 
 int BatchNorm::UnPackAttr(const Primitive &prim, const std::vector<AnfNodePtr> &inputs) {
-  this->primitive_ = new (schema::PrimitiveT);
-  auto attr = std::make_unique<schema::FusedBatchNormT>();
-  attr->epsilon = GetValue<float>(prim.GetAttr("epsilon"));
GetValue(prim.GetAttr("epsilon")); - this->primitive_->value.type = schema::PrimitiveType_FusedBatchNorm; - this->primitive_->value.value = attr.release(); + if (this->primitive_ == nullptr) { + this->primitive_ = new (std::nothrow) schema::PrimitiveT; + if (this->primitive_ == nullptr) { + MS_LOG(ERROR) << "new primitiveT failed"; + return RET_ERROR; + } + this->primitive_->value.type = schema::PrimitiveType_FusedBatchNorm; + } + if (this->primitive_->value.type != schema::PrimitiveType_FusedBatchNorm) { + MS_LOG(ERROR) << "Primitive type is error :" << this->primitive_->value.type; + return RET_ERROR; + } + if (this->primitive_->value.value == nullptr) { + auto attr = new (std::nothrow) schema::FusedBatchNormT(); + attr->epsilon = GetValue(prim.GetAttr("epsilon")); + this->primitive_->value.value = attr; + if (this->primitive_->value.value == nullptr) { + MS_LOG(ERROR) << "new primitiveT value failed"; + return RET_ERROR; + } + } return RET_OK; } #else - +int BatchNorm::UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) { + MS_ASSERT(nullptr != primitive); + MS_ASSERT(nullptr != fbb); + auto val_offset = schema::CreateBatchNorm(*fbb); + auto prim_offset = schema::CreatePrimitive(*fbb, schema::PrimitiveType_BatchNorm, val_offset.o); + fbb->Finish(prim_offset); + return RET_OK; +} float BatchNorm::GetEpsilon() const { return this->primitive_->value_as_BatchNorm()->epsilon(); } -void BatchNorm::SetEpsilon(float epsilon) {} #endif } // namespace lite } // namespace mindspore diff --git a/mindspore/lite/src/ops/batch_norm.h b/mindspore/lite/src/ops/batch_norm.h index 2567dddc91..03dac3a71e 100644 --- a/mindspore/lite/src/ops/batch_norm.h +++ b/mindspore/lite/src/ops/batch_norm.h @@ -28,14 +28,17 @@ namespace lite { class BatchNorm : public PrimitiveC { public: #ifdef PRIMITIVE_WRITEABLE + MS_DECLARE_PARENT(BatchNorm, PrimitiveC); BatchNorm() = default; explicit BatchNorm(schema::PrimitiveT *primitive) : PrimitiveC(primitive) {} - int UnPackAttr(const Primitive &prim, const std::vector &inputs); + int UnPackAttr(const Primitive &prim, const std::vector &inputs) override; + void SetEpsilon(float epsilon); #else - explicit BatchNorm(schema::Primitive *primitive) : PrimitiveC(primitive) {} + BatchNorm() = default; + + int UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) override; #endif float GetEpsilon() const; - void SetEpsilon(float epsilon); }; } // namespace lite } // namespace mindspore diff --git a/mindspore/lite/src/ops/batch_to_space.cc b/mindspore/lite/src/ops/batch_to_space.cc index c11a5ffd20..683508b628 100644 --- a/mindspore/lite/src/ops/batch_to_space.cc +++ b/mindspore/lite/src/ops/batch_to_space.cc @@ -32,7 +32,31 @@ void BatchToSpace::SetBlockShape(const std::vector &block_shape) { void BatchToSpace::SetCrops(const std::vector &crops) { this->primitive_->value.AsBatchToSpace()->crops = crops; } #else - +int BatchToSpace::UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) { + MS_ASSERT(nullptr != primitive); + MS_ASSERT(nullptr != fbb); + auto attr = primitive->value_as_BatchToSpace(); + if (attr == nullptr) { + MS_LOG(ERROR) << "value_as_BatchToSpace return nullptr"; + return RET_ERROR; + } + std::vector blockShape; + if (attr->blockShape() != nullptr) { + for (int i = 0; i < static_cast(attr->blockShape()->size()); i++) { + blockShape.push_back(attr->blockShape()->data()[i]); + } + } + std::vector crops; + if (attr->crops() != nullptr) { + for (int i = 
diff --git a/mindspore/lite/src/ops/batch_to_space.cc b/mindspore/lite/src/ops/batch_to_space.cc
index c11a5ffd20..683508b628 100644
--- a/mindspore/lite/src/ops/batch_to_space.cc
+++ b/mindspore/lite/src/ops/batch_to_space.cc
@@ -32,7 +32,31 @@ void BatchToSpace::SetBlockShape(const std::vector<int> &block_shape) {
 void BatchToSpace::SetCrops(const std::vector<int> &crops) { this->primitive_->value.AsBatchToSpace()->crops = crops; }
 
 #else
-
+int BatchToSpace::UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) {
+  MS_ASSERT(nullptr != primitive);
+  MS_ASSERT(nullptr != fbb);
+  auto attr = primitive->value_as_BatchToSpace();
+  if (attr == nullptr) {
+    MS_LOG(ERROR) << "value_as_BatchToSpace return nullptr";
+    return RET_ERROR;
+  }
+  std::vector<int32_t> blockShape;
+  if (attr->blockShape() != nullptr) {
+    for (int i = 0; i < static_cast<int>(attr->blockShape()->size()); i++) {
+      blockShape.push_back(attr->blockShape()->data()[i]);
+    }
+  }
+  std::vector<int32_t> crops;
+  if (attr->crops() != nullptr) {
+    for (int i = 0; i < static_cast<int>(attr->crops()->size()); i++) {
+      crops.push_back(attr->crops()->data()[i]);
+    }
+  }
+  auto val_offset = schema::CreateBatchToSpaceDirect(*fbb, &blockShape, &crops);
+  auto prim_offset = schema::CreatePrimitive(*fbb, schema::PrimitiveType_BatchToSpace, val_offset.o);
+  fbb->Finish(prim_offset);
+  return RET_OK;
+}
 std::vector<int> BatchToSpace::GetBlockShape() const {
   auto fb_vector = this->primitive_->value_as_BatchToSpace()->blockShape();
   return std::vector<int>(fb_vector->begin(), fb_vector->end());
@@ -42,8 +66,6 @@ std::vector<int> BatchToSpace::GetCrops() const {
   return std::vector<int>(fb_vector->begin(), fb_vector->end());
 }
 
-void BatchToSpace::SetBlockShape(const std::vector<int> &block_shape) {}
-void BatchToSpace::SetCrops(const std::vector<int> &crops) {}
 #endif
 namespace {
 constexpr int kBatchToSpaceOutputNum = 1;
diff --git a/mindspore/lite/src/ops/batch_to_space.h b/mindspore/lite/src/ops/batch_to_space.h
index f63205a3df..9c9632fc37 100644
--- a/mindspore/lite/src/ops/batch_to_space.h
+++ b/mindspore/lite/src/ops/batch_to_space.h
@@ -20,6 +20,7 @@
 #include <vector>
 #include <set>
 #include <cmath>
+#include <memory>
 #include "ir/dtype/type_id.h"
 #include "src/ops/primitive_c.h"
 
@@ -28,16 +29,19 @@ namespace lite {
 class BatchToSpace : public PrimitiveC {
  public:
 #ifdef PRIMITIVE_WRITEABLE
+  MS_DECLARE_PARENT(BatchToSpace, PrimitiveC);
   BatchToSpace() = default;
   explicit BatchToSpace(schema::PrimitiveT *primitive) : PrimitiveC(primitive) {}
+  void SetBlockShape(const std::vector<int> &block_shape);
+  void SetCrops(const std::vector<int> &crops);
 #else
-  explicit BatchToSpace(schema::Primitive *primitive) : PrimitiveC(primitive) {}
+  BatchToSpace() = default;
+
+  int UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) override;
 #endif
   int InferShape(std::vector<lite::tensor::Tensor *> inputs_, std::vector<lite::tensor::Tensor *> outputs_) override;
   std::vector<int> GetBlockShape() const;
   std::vector<int> GetCrops() const;
-  void SetBlockShape(const std::vector<int> &block_shape);
-  void SetCrops(const std::vector<int> &crops);
 };
 }  // namespace lite
 }  // namespace mindspore
diff --git a/mindspore/lite/src/ops/bias_add.cc b/mindspore/lite/src/ops/bias_add.cc
index 6966fd78c7..bb7059e1ab 100644
--- a/mindspore/lite/src/ops/bias_add.cc
+++ b/mindspore/lite/src/ops/bias_add.cc
@@ -25,23 +25,59 @@ std::vector<int> BiasAdd::GetAxis() const { return this->primitive_->value.AsBiasAdd()->axis; }
 void BiasAdd::SetAxis(const std::vector<int> &axis) { this->primitive_->value.AsBiasAdd()->axis = axis; }
 
 int BiasAdd::UnPackAttr(const Primitive &prim, const std::vector<AnfNodePtr> &inputs) {
-  this->primitive_ = new (schema::PrimitiveT);
-  auto attr = std::make_unique<schema::BiasAddT>();
-  attr->axis = {0};
-  this->primitive_->value.type = schema::PrimitiveType_BiasAdd;
-  this->primitive_->value.value = attr.release();
-
+  if (this->primitive_ == nullptr) {
+    this->primitive_ = new (std::nothrow) schema::PrimitiveT;
+    if (this->primitive_ == nullptr) {
+      MS_LOG(ERROR) << "new primitiveT failed";
+      return RET_ERROR;
+    }
+    this->primitive_->value.type = schema::PrimitiveType_BiasAdd;
+  }
+  if (this->primitive_->value.type != schema::PrimitiveType_BiasAdd) {
+    MS_LOG(ERROR) << "Primitive type is error: " << this->primitive_->value.type;
+    return RET_ERROR;
+  }
+  if (this->primitive_->value.value == nullptr) {
+    auto attr = new (std::nothrow) schema::BiasAddT();
+    if (attr == nullptr) {
+      MS_LOG(ERROR) << "new primitiveT value failed";
+      return RET_ERROR;
+    }
+    attr->axis = {0};
+    this->primitive_->value.value = attr;
+    if (this->primitive_->value.value == nullptr) {
+      MS_LOG(ERROR) << "primitive value is nullptr";
+      return RET_ERROR;
    }
+  }
   return RET_OK;
 }
 
 #else
-
+int BiasAdd::UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) {
+  MS_ASSERT(nullptr != primitive);
+  MS_ASSERT(nullptr != fbb);
+  auto attr = primitive->value_as_BiasAdd();
+  if (attr == nullptr) {
+    MS_LOG(ERROR) << "value_as_BiasAdd return nullptr";
+    return RET_ERROR;
+  }
+  std::vector<int32_t> axis;
+  if (attr->axis() != nullptr) {
+    for (int i = 0; i < static_cast<int>(attr->axis()->size()); i++) {
+      axis.push_back(attr->axis()->data()[i]);
+    }
+  }
+  auto val_offset = schema::CreateBiasAddDirect(*fbb, &axis);
+  auto prim_offset = schema::CreatePrimitive(*fbb, schema::PrimitiveType_BiasAdd, val_offset.o);
+  fbb->Finish(prim_offset);
+  return RET_OK;
+}
 std::vector<int> BiasAdd::GetAxis() const {
   auto fb_vector = this->primitive_->value_as_BiasAdd()->axis();
   return std::vector<int>(fb_vector->begin(), fb_vector->end());
 }
-void BiasAdd::SetAxis(const std::vector<int> &axis) {}
 #endif
 }  // namespace lite
 }  // namespace mindspore
diff --git a/mindspore/lite/src/ops/bias_add.h b/mindspore/lite/src/ops/bias_add.h
index 19918fe19c..1298cf28f1 100644
--- a/mindspore/lite/src/ops/bias_add.h
+++ b/mindspore/lite/src/ops/bias_add.h
@@ -20,6 +20,7 @@
 #include <vector>
 #include <set>
 #include <cmath>
+#include <memory>
 #include "ir/dtype/type_id.h"
 #include "src/ops/primitive_c.h"
 
@@ -28,14 +29,17 @@ namespace lite {
 class BiasAdd : public PrimitiveC {
  public:
 #ifdef PRIMITIVE_WRITEABLE
+  MS_DECLARE_PARENT(BiasAdd, PrimitiveC);
   BiasAdd() = default;
   explicit BiasAdd(schema::PrimitiveT *primitive) : PrimitiveC(primitive) {}
-  int UnPackAttr(const Primitive &prim, const std::vector<AnfNodePtr> &inputs);
+  int UnPackAttr(const Primitive &prim, const std::vector<AnfNodePtr> &inputs) override;
+  void SetAxis(const std::vector<int> &axis);
 #else
-  explicit BiasAdd(schema::Primitive *primitive) : PrimitiveC(primitive) {}
+  BiasAdd() = default;
+
+  int UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) override;
 #endif
   std::vector<int> GetAxis() const;
-  void SetAxis(const std::vector<int> &axis);
 };
 }  // namespace lite
 }  // namespace mindspore
diff --git a/mindspore/lite/src/ops/bias_grad.cc b/mindspore/lite/src/ops/bias_grad.cc
index 6fc1caa616..23c01adc3e 100644
--- a/mindspore/lite/src/ops/bias_grad.cc
+++ b/mindspore/lite/src/ops/bias_grad.cc
@@ -24,13 +24,30 @@ std::vector<int> BiasGrad::GetAxis() const { return this->primitive_->value.AsBiasGrad()->axis; }
 void BiasGrad::SetAxis(const std::vector<int> &axis) { this->primitive_->value.AsBiasGrad()->axis = axis; }
 
 #else
-
+int BiasGrad::UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) {
+  MS_ASSERT(nullptr != primitive);
+  MS_ASSERT(nullptr != fbb);
+  auto attr = primitive->value_as_BiasGrad();
+  if (attr == nullptr) {
+    MS_LOG(ERROR) << "value_as_BiasGrad return nullptr";
+    return RET_ERROR;
+  }
+  std::vector<int32_t> axis;
+  if (attr->axis() != nullptr) {
+    for (int i = 0; i < static_cast<int>(attr->axis()->size()); i++) {
+      axis.push_back(attr->axis()->data()[i]);
+    }
+  }
+  auto val_offset = schema::CreateBiasGradDirect(*fbb, &axis);
+  auto prim_offset = schema::CreatePrimitive(*fbb, schema::PrimitiveType_BiasGrad, val_offset.o);
+  fbb->Finish(prim_offset);
+  return RET_OK;
+}
 std::vector<int> BiasGrad::GetAxis() const {
   auto fb_vector = this->primitive_->value_as_BiasGrad()->axis();
   return std::vector<int>(fb_vector->begin(), fb_vector->end());
 }
-void BiasGrad::SetAxis(const std::vector<int> &axis) {}
 #endif
 }  // namespace lite
 }  // namespace mindspore
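BiasAdd and BiasGrad show the vector-attribute variant of the repacking pattern: a flatbuffers vector owned by the source buffer cannot be referenced from the new buffer, so its elements are copied into owned storage and re-serialized through the generated Create*Direct helper, which writes the nested vector before the table that points at it. The index loop can be written more compactly with iterators; a sketch (CopyAxis is an illustrative name, not part of the patch):

// Copy a flatbuffers int vector into owned storage before re-serializing.
std::vector<int32_t> CopyAxis(const schema::BiasAdd *attr) {
  std::vector<int32_t> axis;
  if (attr->axis() != nullptr) {
    axis.assign(attr->axis()->begin(), attr->axis()->end());  // same effect as the index loop above
  }
  return axis;
}

schema::CreateBiasAddDirect(*fbb, &axis) then embeds the owned copy in the new buffer.

diff --git a/mindspore/lite/src/ops/bias_grad.h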
b/mindspore/lite/src/ops/bias_grad.h index d1525cea1f..c3729764c1 100644 --- a/mindspore/lite/src/ops/bias_grad.h +++ b/mindspore/lite/src/ops/bias_grad.h @@ -20,6 +20,7 @@ #include #include #include +#include #include "ir/dtype/type_id.h" #include "src/ops/primitive_c.h" @@ -28,13 +29,17 @@ namespace lite { class BiasGrad : public PrimitiveC { public: #ifdef PRIMITIVE_WRITEABLE + MS_DECLARE_PARENT(BiasGrad, PrimitiveC); BiasGrad() = default; explicit BiasGrad(schema::PrimitiveT *primitive) : PrimitiveC(primitive) {} + void SetAxis(const std::vector &axis); + #else - explicit BiasGrad(schema::Primitive *primitive) : PrimitiveC(primitive) {} + BiasGrad() = default; + + int UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) override; #endif std::vector GetAxis() const; - void SetAxis(const std::vector &axis); }; } // namespace lite } // namespace mindspore diff --git a/mindspore/lite/src/ops/bn_grad_input.cc b/mindspore/lite/src/ops/bn_grad_input.cc index 1736e1fe9c..9aee03f81d 100644 --- a/mindspore/lite/src/ops/bn_grad_input.cc +++ b/mindspore/lite/src/ops/bn_grad_input.cc @@ -26,12 +26,22 @@ void BNGradInput::SetEps(float eps) { this->primitive_->value.AsBNGradInput()->e void BNGradInput::SetChannels(int channels) { this->primitive_->value.AsBNGradInput()->channels = channels; } #else - +int BNGradInput::UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) { + MS_ASSERT(nullptr != primitive); + MS_ASSERT(nullptr != fbb); + auto attr = primitive->value_as_BNGradInput(); + if (attr == nullptr) { + MS_LOG(ERROR) << "value_as_BNGradInput return nullptr"; + return RET_ERROR; + } + auto val_offset = schema::CreateBNGradInput(*fbb, attr->eps(), attr->channels()); + auto prim_offset = schema::CreatePrimitive(*fbb, schema::PrimitiveType_BNGradInput, val_offset.o); + fbb->Finish(prim_offset); + return RET_OK; +} float BNGradInput::GetEps() const { return this->primitive_->value_as_BNGradInput()->eps(); } int BNGradInput::GetChannels() const { return this->primitive_->value_as_BNGradInput()->channels(); } -void BNGradInput::SetEps(float eps) {} -void BNGradInput::SetChannels(int channels) {} #endif } // namespace lite } // namespace mindspore diff --git a/mindspore/lite/src/ops/bn_grad_input.h b/mindspore/lite/src/ops/bn_grad_input.h index 3e6f0550f3..aa22933f8a 100644 --- a/mindspore/lite/src/ops/bn_grad_input.h +++ b/mindspore/lite/src/ops/bn_grad_input.h @@ -28,15 +28,18 @@ namespace lite { class BNGradInput : public PrimitiveC { public: #ifdef PRIMITIVE_WRITEABLE + MS_DECLARE_PARENT(BNGradInput, PrimitiveC); BNGradInput() = default; explicit BNGradInput(schema::PrimitiveT *primitive) : PrimitiveC(primitive) {} + void SetEps(float eps); + void SetChannels(int channels); #else - explicit BNGradInput(schema::Primitive *primitive) : PrimitiveC(primitive) {} + BNGradInput() = default; + + int UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) override; #endif float GetEps() const; int GetChannels() const; - void SetEps(float eps); - void SetChannels(int channels); }; } // namespace lite } // namespace mindspore diff --git a/mindspore/lite/src/ops/broadcast_to.cc b/mindspore/lite/src/ops/broadcast_to.cc index ca2d71607d..1c4e5875cd 100644 --- a/mindspore/lite/src/ops/broadcast_to.cc +++ b/mindspore/lite/src/ops/broadcast_to.cc @@ -26,13 +26,30 @@ void BroadcastTo::SetDstShape(const std::vector &dst_shape) { } #else - +int BroadcastTo::UnPackToFlatBuilder(const schema::Primitive 
*primitive, flatbuffers::FlatBufferBuilder *fbb) { + MS_ASSERT(nullptr != primitive); + MS_ASSERT(nullptr != fbb); + auto attr = primitive->value_as_BroadcastTo(); + if (attr == nullptr) { + MS_LOG(ERROR) << "value_as_BroadcastTo return nullptr"; + return RET_ERROR; + } + std::vector dst_shape; + if (attr->dst_shape() != nullptr) { + for (int i = 0; i < static_cast(attr->dst_shape()->size()); i++) { + dst_shape.push_back(attr->dst_shape()->data()[i]); + } + } + auto val_offset = schema::CreateBroadcastToDirect(*fbb, &dst_shape); + auto prim_offset = schema::CreatePrimitive(*fbb, schema::PrimitiveType_BroadcastTo, val_offset.o); + fbb->Finish(prim_offset); + return RET_OK; +} std::vector BroadcastTo::GetDstShape() const { auto fb_vector = this->primitive_->value_as_BroadcastTo()->dst_shape(); return std::vector(fb_vector->begin(), fb_vector->end()); } -void BroadcastTo::SetDstShape(const std::vector &dst_shape) {} #endif namespace { constexpr int kBroadcastToInputNum = 1; diff --git a/mindspore/lite/src/ops/broadcast_to.h b/mindspore/lite/src/ops/broadcast_to.h index 9b3cdaca10..d0181da165 100644 --- a/mindspore/lite/src/ops/broadcast_to.h +++ b/mindspore/lite/src/ops/broadcast_to.h @@ -20,6 +20,7 @@ #include #include #include +#include #include "ir/dtype/type_id.h" #include "src/ops/primitive_c.h" @@ -28,14 +29,18 @@ namespace lite { class BroadcastTo : public PrimitiveC { public: #ifdef PRIMITIVE_WRITEABLE + MS_DECLARE_PARENT(BroadcastTo, PrimitiveC); BroadcastTo() = default; explicit BroadcastTo(schema::PrimitiveT *primitive) : PrimitiveC(primitive) {} + void SetDstShape(const std::vector &dst_shape); + #else - explicit BroadcastTo(schema::Primitive *primitive) : PrimitiveC(primitive) {} + BroadcastTo() = default; + + int UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) override; #endif int InferShape(std::vector inputs_, std::vector outputs_) override; std::vector GetDstShape() const; - void SetDstShape(const std::vector &dst_shape); }; } // namespace lite } // namespace mindspore diff --git a/mindspore/lite/src/ops/cast.cc b/mindspore/lite/src/ops/cast.cc index d7ba94ee00..10cf0f63c2 100644 --- a/mindspore/lite/src/ops/cast.cc +++ b/mindspore/lite/src/ops/cast.cc @@ -26,12 +26,22 @@ void Cast::SetSrcT(int src_t) { this->primitive_->value.AsCast()->srcT = src_t; void Cast::SetDstT(int dst_t) { this->primitive_->value.AsCast()->dstT = dst_t; } #else - +int Cast::UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) { + MS_ASSERT(nullptr != primitive); + MS_ASSERT(nullptr != fbb); + auto attr = primitive->value_as_Cast(); + if (attr == nullptr) { + MS_LOG(ERROR) << "value_as_Cast return nullptr"; + return RET_ERROR; + } + auto val_offset = schema::CreateCast(*fbb, attr->srcT(), attr->dstT()); + auto prim_offset = schema::CreatePrimitive(*fbb, schema::PrimitiveType_Cast, val_offset.o); + fbb->Finish(prim_offset); + return RET_OK; +} int Cast::GetSrcT() const { return this->primitive_->value_as_Cast()->srcT(); } int Cast::GetDstT() const { return this->primitive_->value_as_Cast()->dstT(); } -void Cast::SetSrcT(int src_t) {} -void Cast::SetDstT(int dst_t) {} #endif int Cast::InferShape(std::vector inputs_, std::vector outputs_) { diff --git a/mindspore/lite/src/ops/cast.h b/mindspore/lite/src/ops/cast.h index 6a244611d0..973e405a18 100644 --- a/mindspore/lite/src/ops/cast.h +++ b/mindspore/lite/src/ops/cast.h @@ -28,16 +28,19 @@ namespace lite { class Cast : public PrimitiveC { public: #ifdef 
PRIMITIVE_WRITEABLE + MS_DECLARE_PARENT(Cast, PrimitiveC); Cast() = default; explicit Cast(schema::PrimitiveT *primitive) : PrimitiveC(primitive) {} + void SetSrcT(int src_t); + void SetDstT(int dst_t); #else - explicit Cast(schema::Primitive *primitive) : PrimitiveC(primitive) {} + Cast() = default; + + int UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) override; #endif int InferShape(std::vector inputs_, std::vector outputs_) override; int GetSrcT() const; int GetDstT() const; - void SetSrcT(int src_t); - void SetDstT(int dst_t); }; } // namespace lite } // namespace mindspore diff --git a/mindspore/lite/src/ops/ceil.h b/mindspore/lite/src/ops/ceil.h index 5ce5276fb7..6af9ef2910 100644 --- a/mindspore/lite/src/ops/ceil.h +++ b/mindspore/lite/src/ops/ceil.h @@ -20,18 +20,28 @@ #include #include #include +#include "src/ops/arithmetic_self.h" #include "ir/dtype/type_id.h" -#include "src/ops/primitive_c.h" namespace mindspore { namespace lite { class Ceil : public ArithmeticSelf { public: #ifdef PRIMITIVE_WRITEABLE + MS_DECLARE_PARENT(Ceil, ArithmeticSelf); Ceil() = default; explicit Ceil(schema::PrimitiveT *primitive) : ArithmeticSelf(primitive) {} #else - explicit Ceil(schema::Primitive *primitive) : ArithmeticSelf(primitive) {} + Ceil() = default; + + int UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) override { + MS_ASSERT(nullptr != primitive); + MS_ASSERT(nullptr != fbb); + auto val_offset = schema::CreateCeil(*fbb); + auto prim_offset = schema::CreatePrimitive(*fbb, schema::PrimitiveType_Ceil, val_offset.o); + fbb->Finish(prim_offset); + return RET_OK; + } #endif }; } // namespace lite diff --git a/mindspore/lite/src/ops/clip.cc b/mindspore/lite/src/ops/clip.cc index 656bd5c0f7..08e654337a 100644 --- a/mindspore/lite/src/ops/clip.cc +++ b/mindspore/lite/src/ops/clip.cc @@ -26,12 +26,22 @@ void Clip::SetMax(float max) { this->primitive_->value.AsClip()->max = max; } void Clip::SetMin(float min) { this->primitive_->value.AsClip()->min = min; } #else - +int Clip::UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) { + MS_ASSERT(nullptr != primitive); + MS_ASSERT(nullptr != fbb); + auto attr = primitive->value_as_Clip(); + if (attr == nullptr) { + MS_LOG(ERROR) << "value_as_Clip return nullptr"; + return RET_ERROR; + } + auto val_offset = schema::CreateClip(*fbb, attr->max(), attr->min()); + auto prim_offset = schema::CreatePrimitive(*fbb, schema::PrimitiveType_Clip, val_offset.o); + fbb->Finish(prim_offset); + return RET_OK; +} float Clip::GetMax() const { return this->primitive_->value_as_Clip()->max(); } float Clip::GetMin() const { return this->primitive_->value_as_Clip()->min(); } -void Clip::SetMax(float max) {} -void Clip::SetMin(float min) {} #endif } // namespace lite } // namespace mindspore diff --git a/mindspore/lite/src/ops/clip.h b/mindspore/lite/src/ops/clip.h index 3f8289840e..7cd343dbad 100644 --- a/mindspore/lite/src/ops/clip.h +++ b/mindspore/lite/src/ops/clip.h @@ -28,15 +28,18 @@ namespace lite { class Clip : public PrimitiveC { public: #ifdef PRIMITIVE_WRITEABLE + MS_DECLARE_PARENT(Clip, PrimitiveC); Clip() = default; explicit Clip(schema::PrimitiveT *primitive) : PrimitiveC(primitive) {} + void SetMax(float max); + void SetMin(float min); #else - explicit Clip(schema::Primitive *primitive) : PrimitiveC(primitive) {} + Clip() = default; + + int UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) 
override; #endif float GetMax() const; float GetMin() const; - void SetMax(float max); - void SetMin(float min); }; } // namespace lite } // namespace mindspore diff --git a/mindspore/lite/src/ops/concat.cc b/mindspore/lite/src/ops/concat.cc index 7b53023410..db73bac434 100644 --- a/mindspore/lite/src/ops/concat.cc +++ b/mindspore/lite/src/ops/concat.cc @@ -30,22 +30,52 @@ void Concat::SetAxis(int axis) { this->primitive_->value.AsConcat()->axis = axis void Concat::SetN(int n) { this->primitive_->value.AsConcat()->n = n; } int Concat::UnPackAttr(const Primitive &prim, const std::vector &inputs) { - this->primitive_ = new (schema::PrimitiveT); - auto attr = std::make_unique(); - auto prim_axis = GetValue(prim.GetAttr("axis")); - attr->axis = prim_axis; - this->primitive_->value.type = schema::PrimitiveType_Concat; - this->primitive_->value.value = attr.release(); + if (this->primitive_ == nullptr) { + this->primitive_ = new (std::nothrow) schema::PrimitiveT; + if (this->primitive_ == nullptr) { + MS_LOG(ERROR) << "new primitiveT failed"; + return RET_ERROR; + } + this->primitive_->value.type = schema::PrimitiveType_Concat; + } + if (this->primitive_->value.type != schema::PrimitiveType_Concat) { + MS_LOG(ERROR) << "Primitive type is error :" << this->primitive_->value.type; + return RET_ERROR; + } + if (this->primitive_->value.value == nullptr) { + auto attr = new (std::nothrow) schema::ConcatT(); + if (attr == nullptr) { + MS_LOG(ERROR) << "new primitiveT value failed"; + return RET_ERROR; + } + auto prim_axis = GetValue(prim.GetAttr("axis")); + attr->axis = prim_axis; + this->primitive_->value.value = attr; + if (this->primitive_->value.value == nullptr) { + MS_LOG(ERROR) << "primitive value is nullptr"; + return RET_ERROR; + } + } return RET_OK; } #else - +int Concat::UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) { + MS_ASSERT(nullptr != primitive); + MS_ASSERT(nullptr != fbb); + auto attr = primitive->value_as_Concat(); + if (attr == nullptr) { + MS_LOG(ERROR) << "value_as_Concat return nullptr"; + return RET_ERROR; + } + auto val_offset = schema::CreateConcat(*fbb, attr->axis(), attr->n()); + auto prim_offset = schema::CreatePrimitive(*fbb, schema::PrimitiveType_Concat, val_offset.o); + fbb->Finish(prim_offset); + return RET_OK; +} int Concat::GetAxis() const { return this->primitive_->value_as_Concat()->axis(); } int Concat::GetN() const { return this->primitive_->value_as_Concat()->n(); } -void Concat::SetAxis(int axis) {} -void Concat::SetN(int n) {} #endif namespace { @@ -78,17 +108,13 @@ int Concat::InferShape(std::vector inputs_, std::vectordata_type(); - schema::Format input0_format = inputs_[0]->GetFormat(); int output_axis_dim = input0_shape.at(axis); for (size_t i = 1; i < inputs_.size(); ++i) { if (inputs_.at(i)->data_type() != input0_data_type) { MS_LOG(ERROR) << "All inputs should have the same data type!"; return RET_PARAM_INVALID; } - if (inputs_.at(i)->GetFormat() != input0_format) { - MS_LOG(ERROR) << "All input format should be the same!"; - return RET_PARAM_INVALID; - } + auto shape_tmp = inputs_.at(i)->shape(); if (shape_tmp.size() != input0_shape.size()) { MS_LOG(ERROR) << "All inputs should have the same dim num!"; diff --git a/mindspore/lite/src/ops/concat.h b/mindspore/lite/src/ops/concat.h index 5f2099e653..c12d98fb0f 100644 --- a/mindspore/lite/src/ops/concat.h +++ b/mindspore/lite/src/ops/concat.h @@ -28,17 +28,20 @@ namespace lite { class Concat : public PrimitiveC { public: #ifdef PRIMITIVE_WRITEABLE + 
MS_DECLARE_PARENT(Concat, PrimitiveC); Concat() = default; explicit Concat(schema::PrimitiveT *primitive) : PrimitiveC(primitive) {} - int UnPackAttr(const Primitive &prim, const std::vector &inputs); + int UnPackAttr(const Primitive &prim, const std::vector &inputs) override; + void SetAxis(int axis); + void SetN(int n); #else - explicit Concat(schema::Primitive *primitive) : PrimitiveC(primitive) {} + Concat() = default; + + int UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) override; #endif int InferShape(std::vector inputs_, std::vector outputs_) override; int GetAxis() const; int GetN() const; - void SetAxis(int axis); - void SetN(int n); }; } // namespace lite } // namespace mindspore diff --git a/mindspore/lite/src/ops/constant_of_shape.cc b/mindspore/lite/src/ops/constant_of_shape.cc index aa1dfb822a..587bc17278 100644 --- a/mindspore/lite/src/ops/constant_of_shape.cc +++ b/mindspore/lite/src/ops/constant_of_shape.cc @@ -30,10 +30,21 @@ float ConstantOfShape::GetValue() const { return this->primitive_->value.AsConst void ConstantOfShape::SetValue(float value) { this->primitive_->value.AsConstantOfShape()->value = value; } #else - +int ConstantOfShape::UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) { + MS_ASSERT(nullptr != primitive); + MS_ASSERT(nullptr != fbb); + auto attr = primitive->value_as_ConstantOfShape(); + if (attr == nullptr) { + MS_LOG(ERROR) << "value_as_ConstantOfShape return nullptr"; + return RET_ERROR; + } + auto val_offset = schema::CreateConstantOfShape(*fbb, attr->value()); + auto prim_offset = schema::CreatePrimitive(*fbb, schema::PrimitiveType_ConstantOfShape, val_offset.o); + fbb->Finish(prim_offset); + return RET_OK; +} float ConstantOfShape::GetValue() const { return this->primitive_->value_as_ConstantOfShape()->value(); } -void ConstantOfShape::SetValue(float value) {} #endif int ConstantOfShape::InferShape(std::vector inputs_, std::vector outputs_) { diff --git a/mindspore/lite/src/ops/constant_of_shape.h b/mindspore/lite/src/ops/constant_of_shape.h index f9bb1d6581..ab96e088b3 100644 --- a/mindspore/lite/src/ops/constant_of_shape.h +++ b/mindspore/lite/src/ops/constant_of_shape.h @@ -28,14 +28,17 @@ namespace lite { class ConstantOfShape : public PrimitiveC { public: #ifdef PRIMITIVE_WRITEABLE + MS_DECLARE_PARENT(ConstantOfShape, PrimitiveC); ConstantOfShape() = default; explicit ConstantOfShape(schema::PrimitiveT *primitive) : PrimitiveC(primitive) {} + void SetValue(float value); #else - explicit ConstantOfShape(schema::Primitive *primitive) : PrimitiveC(primitive) {} + ConstantOfShape() = default; + + int UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) override; #endif int InferShape(std::vector inputs_, std::vector outputs_) override; float GetValue() const; - void SetValue(float value); }; } // namespace lite } // namespace mindspore diff --git a/mindspore/lite/src/ops/conv2d.cc b/mindspore/lite/src/ops/conv2d.cc index 268c15f907..38151eef0f 100644 --- a/mindspore/lite/src/ops/conv2d.cc +++ b/mindspore/lite/src/ops/conv2d.cc @@ -19,7 +19,6 @@ #include #include "include/errorcode.h" #include "utils/log_adapter.h" -#include "src/ir/tensor.h" #ifdef PRIMITIVE_WRITEABLE #include "tools/converter/quantizer/quantize_util.h" #endif @@ -309,8 +308,18 @@ void Conv2D::PopulaterQuantParam(const Primitive &prim, } int Conv2D::UnPackAttr(const Primitive &prim, const std::vector &inputs) { - this->primitive_ = new 
(schema::PrimitiveT); - + if (this->primitive_ == nullptr) { + this->primitive_ = new (std::nothrow) schema::PrimitiveT; + if (this->primitive_ == nullptr) { + MS_LOG(ERROR) << "new primitiveT failed"; + return RET_ERROR; + } + this->primitive_->value.type = schema::PrimitiveType_Conv2D; + } + if (this->primitive_->value.type != schema::PrimitiveType_Conv2D) { + MS_LOG(ERROR) << "primitive_ type is error:" << this->primitive_->value.type; + return RET_ERROR; + } int group = GetValue(prim.GetAttr("group")); if (group > 1) { PopulaterConv2DMultiGroup(prim, this->primitive_, group, inputs); @@ -329,7 +338,23 @@ int Conv2D::UnPackAttr(const Primitive &prim, const std::vector &inp } #else +int Conv2D::UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) { + MS_ASSERT(nullptr != primitive); + MS_ASSERT(nullptr != fbb); + auto attr = primitive->value_as_Conv2D(); + if (attr == nullptr) { + MS_LOG(ERROR) << "value_as_Conv2D return nullptr"; + return RET_ERROR; + } + auto val_offset = schema::CreateConv2D( + *fbb, attr->format(), attr->group(), attr->channelIn(), attr->channelOut(), attr->kernelW(), attr->kernelH(), + attr->strideW(), attr->strideH(), attr->padMode(), attr->padUp(), attr->padDown(), attr->padLeft(), + attr->padRight(), attr->dilateW(), attr->dilateH(), attr->hasBias(), attr->activationType()); + auto prim_offset = schema::CreatePrimitive(*fbb, schema::PrimitiveType_Conv2D, val_offset.o); + fbb->Finish(prim_offset); + return RET_OK; +} int Conv2D::GetFormat() const { return this->primitive_->value_as_Conv2D()->format(); } int Conv2D::GetGroup() const { return this->primitive_->value_as_Conv2D()->group(); } int Conv2D::GetChannelIn() const { return this->primitive_->value_as_Conv2D()->channelIn(); } @@ -348,23 +373,6 @@ int Conv2D::GetDilateH() const { return this->primitive_->value_as_Conv2D()->dil bool Conv2D::GetHasBias() const { return this->primitive_->value_as_Conv2D()->hasBias(); } int Conv2D::GetActivationType() const { return this->primitive_->value_as_Conv2D()->activationType(); } -void Conv2D::SetFormat(int format) {} -void Conv2D::SetGroup(int group) {} -void Conv2D::SetChannelIn(int channel_in) {} -void Conv2D::SetChannelOut(int channel_out) {} -void Conv2D::SetKernelW(int kernel_w) {} -void Conv2D::SetKernelH(int kernel_h) {} -void Conv2D::SetStrideW(int stride_w) {} -void Conv2D::SetStrideH(int stride_h) {} -void Conv2D::SetPadMode(int pad_mode) {} -void Conv2D::SetPadUp(int pad_up) {} -void Conv2D::SetPadDown(int pad_down) {} -void Conv2D::SetPadLeft(int pad_left) {} -void Conv2D::SetPadRight(int pad_right) {} -void Conv2D::SetDilateW(int dilate_w) {} -void Conv2D::SetDilateH(int dilate_h) {} -void Conv2D::SetHasBias(bool has_bias) {} -void Conv2D::SetActivationType(int activation_type) {} #endif void Conv2D::ConvInferShape(int input_h, int input_w, int *output_h, int *output_w) { MS_ASSERT(this->primitive_ != nullptr); @@ -384,10 +392,18 @@ void Conv2D::ConvInferShape(int input_h, int input_w, int *output_h, int *output *output_h = std::ceil(static_cast(input_h) / static_cast(stride_h)); auto pad_h_all = ((*output_h - 1) * stride_h + (kernel_h - 1) * dilate_h + 1 - input_h); auto pad_w_all = ((*output_w - 1) * stride_w + (kernel_w - 1) * dilate_w + 1 - input_w); - pad_u_ = pad_h_all / 2; - pad_d_ = pad_h_all - pad_u_; - pad_l_ = pad_w_all / 2; - pad_r_ = pad_w_all - pad_l_; + if (pad_h_all < 0) { + pad_u_ = pad_d_ = 0; + } else { + pad_u_ = pad_h_all / 2; + pad_d_ = pad_h_all - pad_u_; + } + if (pad_w_all < 0) { + pad_l_ = 
pad_r_ = 0; + } else { + pad_l_ = pad_w_all / 2; + pad_r_ = pad_w_all - pad_l_; + } } else { *output_w = std::ceil((static_cast(input_w) + pad_l_ + pad_r_ - (static_cast(kernel_w) - 1) * static_cast(dilate_w)) / diff --git a/mindspore/lite/src/ops/conv2d.h b/mindspore/lite/src/ops/conv2d.h index 4c769a952b..21367dcdc5 100644 --- a/mindspore/lite/src/ops/conv2d.h +++ b/mindspore/lite/src/ops/conv2d.h @@ -28,12 +28,30 @@ namespace mindspore { namespace lite { class Conv2D : public PrimitiveC { #ifdef PRIMITIVE_WRITEABLE + MS_DECLARE_PARENT(Conv2D, PrimitiveC); public: Conv2D() = default; explicit Conv2D(schema::PrimitiveT *primitive) : PrimitiveC(primitive) {} - int UnPackAttr(const Primitive &prim, const std::vector &inputs); + int UnPackAttr(const Primitive &prim, const std::vector &inputs) override; + void SetFormat(int format); + void SetGroup(int group); + void SetChannelIn(int channel_in); + void SetChannelOut(int channel_out); + void SetKernelW(int kernel_w); + void SetKernelH(int kernel_h); + void SetStrideW(int stride_w); + void SetStrideH(int stride_h); + void SetPadMode(int pad_mode); + void SetPadUp(int pad_up); + void SetPadDown(int pad_down); + void SetPadLeft(int pad_left); + void SetPadRight(int pad_right); + void SetDilateW(int dilate_w); + void SetDilateH(int dilate_h); + void SetHasBias(bool has_bias); + void SetActivationType(int activation_type); private: void PopulaterConv2DMultiGroup(const Primitive &prim, schema::PrimitiveT *primitive, const int &group, @@ -45,7 +63,9 @@ class Conv2D : public PrimitiveC { #else public: - explicit Conv2D(schema::Primitive *primitive) : PrimitiveC(primitive) {} + Conv2D() = default; + + int UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) override; #endif public: @@ -72,23 +92,6 @@ class Conv2D : public PrimitiveC { int GetDilateH() const; bool GetHasBias() const; int GetActivationType() const; - void SetFormat(int format); - void SetGroup(int group); - void SetChannelIn(int channel_in); - void SetChannelOut(int channel_out); - void SetKernelW(int kernel_w); - void SetKernelH(int kernel_h); - void SetStrideW(int stride_w); - void SetStrideH(int stride_h); - void SetPadMode(int pad_mode); - void SetPadUp(int pad_up); - void SetPadDown(int pad_down); - void SetPadLeft(int pad_left); - void SetPadRight(int pad_right); - void SetDilateW(int dilate_w); - void SetDilateH(int dilate_h); - void SetHasBias(bool has_bias); - void SetActivationType(int activation_type); protected: void ConvInferShape(int input_h, int input_w, int *output_h, int *output_w); diff --git a/mindspore/lite/src/ops/conv2d_grad_filter.cc b/mindspore/lite/src/ops/conv2d_grad_filter.cc index 1fcd9ced90..f3ef4d36e1 100644 --- a/mindspore/lite/src/ops/conv2d_grad_filter.cc +++ b/mindspore/lite/src/ops/conv2d_grad_filter.cc @@ -68,7 +68,22 @@ void Conv2DGradFilter::SetActivationType(int activation_type) { } #else - +int Conv2DGradFilter::UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) { + MS_ASSERT(nullptr != primitive); + MS_ASSERT(nullptr != fbb); + auto attr = primitive->value_as_Conv2DGradFilter(); + if (attr == nullptr) { + MS_LOG(ERROR) << "value_as_Conv2DGradFilter return nullptr"; + return RET_ERROR; + } + auto val_offset = schema::CreateConv2DGradFilter( + *fbb, attr->format(), attr->group(), attr->channelIn(), attr->channelOut(), attr->kernelW(), attr->kernelH(), + attr->strideW(), attr->strideH(), attr->padMode(), attr->padUp(), attr->padDown(), attr->padLeft(), + 
attr->padRight(), attr->dilateW(), attr->dilateH(), attr->hasBias(), attr->activationType()); + auto prim_offset = schema::CreatePrimitive(*fbb, schema::PrimitiveType_Conv2DGradFilter, val_offset.o); + fbb->Finish(prim_offset); + return RET_OK; +} int Conv2DGradFilter::GetFormat() const { return this->primitive_->value_as_Conv2DGradFilter()->format(); } int Conv2DGradFilter::GetGroup() const { return this->primitive_->value_as_Conv2DGradFilter()->group(); } int Conv2DGradFilter::GetChannelIn() const { return this->primitive_->value_as_Conv2DGradFilter()->channelIn(); } @@ -89,23 +104,6 @@ int Conv2DGradFilter::GetActivationType() const { return this->primitive_->value_as_Conv2DGradFilter()->activationType(); } -void Conv2DGradFilter::SetFormat(int format) {} -void Conv2DGradFilter::SetGroup(int group) {} -void Conv2DGradFilter::SetChannelIn(int channel_in) {} -void Conv2DGradFilter::SetChannelOut(int channel_out) {} -void Conv2DGradFilter::SetKernelW(int kernel_w) {} -void Conv2DGradFilter::SetKernelH(int kernel_h) {} -void Conv2DGradFilter::SetStrideW(int stride_w) {} -void Conv2DGradFilter::SetStrideH(int stride_h) {} -void Conv2DGradFilter::SetPadMode(int pad_mode) {} -void Conv2DGradFilter::SetPadUp(int pad_up) {} -void Conv2DGradFilter::SetPadDown(int pad_down) {} -void Conv2DGradFilter::SetPadLeft(int pad_left) {} -void Conv2DGradFilter::SetPadRight(int pad_right) {} -void Conv2DGradFilter::SetDilateW(int dilate_w) {} -void Conv2DGradFilter::SetDilateH(int dilate_h) {} -void Conv2DGradFilter::SetHasBias(bool has_bias) {} -void Conv2DGradFilter::SetActivationType(int activation_type) {} #endif } // namespace lite } // namespace mindspore diff --git a/mindspore/lite/src/ops/conv2d_grad_filter.h b/mindspore/lite/src/ops/conv2d_grad_filter.h index 5e342e6ee4..54fd9a3bf0 100644 --- a/mindspore/lite/src/ops/conv2d_grad_filter.h +++ b/mindspore/lite/src/ops/conv2d_grad_filter.h @@ -28,10 +28,30 @@ namespace lite { class Conv2DGradFilter : public PrimitiveC { public: #ifdef PRIMITIVE_WRITEABLE + MS_DECLARE_PARENT(Conv2DGradFilter, PrimitiveC); Conv2DGradFilter() = default; explicit Conv2DGradFilter(schema::PrimitiveT *primitive) : PrimitiveC(primitive) {} + void SetFormat(int format); + void SetGroup(int group); + void SetChannelIn(int channel_in); + void SetChannelOut(int channel_out); + void SetKernelW(int kernel_w); + void SetKernelH(int kernel_h); + void SetStrideW(int stride_w); + void SetStrideH(int stride_h); + void SetPadMode(int pad_mode); + void SetPadUp(int pad_up); + void SetPadDown(int pad_down); + void SetPadLeft(int pad_left); + void SetPadRight(int pad_right); + void SetDilateW(int dilate_w); + void SetDilateH(int dilate_h); + void SetHasBias(bool has_bias); + void SetActivationType(int activation_type); #else - explicit Conv2DGradFilter(schema::Primitive *primitive) : PrimitiveC(primitive) {} + Conv2DGradFilter() = default; + + int UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) override; #endif int GetFormat() const; int GetGroup() const; @@ -50,23 +70,6 @@ class Conv2DGradFilter : public PrimitiveC { int GetDilateH() const; bool GetHasBias() const; int GetActivationType() const; - void SetFormat(int format); - void SetGroup(int group); - void SetChannelIn(int channel_in); - void SetChannelOut(int channel_out); - void SetKernelW(int kernel_w); - void SetKernelH(int kernel_h); - void SetStrideW(int stride_w); - void SetStrideH(int stride_h); - void SetPadMode(int pad_mode); - void SetPadUp(int pad_up); - void SetPadDown(int 
pad_down); - void SetPadLeft(int pad_left); - void SetPadRight(int pad_right); - void SetDilateW(int dilate_w); - void SetDilateH(int dilate_h); - void SetHasBias(bool has_bias); - void SetActivationType(int activation_type); }; } // namespace lite } // namespace mindspore diff --git a/mindspore/lite/src/ops/conv2d_grad_input.cc b/mindspore/lite/src/ops/conv2d_grad_input.cc index 28a66d2e3c..a8a26d2bc2 100644 --- a/mindspore/lite/src/ops/conv2d_grad_input.cc +++ b/mindspore/lite/src/ops/conv2d_grad_input.cc @@ -66,7 +66,22 @@ void Conv2DGradInput::SetActivationType(int activation_type) { } #else - +int Conv2DGradInput::UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) { + MS_ASSERT(nullptr != primitive); + MS_ASSERT(nullptr != fbb); + auto attr = primitive->value_as_Conv2DGradInput(); + if (attr == nullptr) { + MS_LOG(ERROR) << "value_as_Conv2DGradInput return nullptr"; + return RET_ERROR; + } + auto val_offset = schema::CreateConv2DGradInput( + *fbb, attr->format(), attr->group(), attr->channelIn(), attr->channelOut(), attr->kernelW(), attr->kernelH(), + attr->strideW(), attr->strideH(), attr->padMode(), attr->padUp(), attr->padDown(), attr->padLeft(), + attr->padRight(), attr->dilateW(), attr->dilateH(), attr->hasBias(), attr->activationType()); + auto prim_offset = schema::CreatePrimitive(*fbb, schema::PrimitiveType_Conv2DGradInput, val_offset.o); + fbb->Finish(prim_offset); + return RET_OK; +} int Conv2DGradInput::GetFormat() const { return this->primitive_->value_as_Conv2DGradInput()->format(); } int Conv2DGradInput::GetGroup() const { return this->primitive_->value_as_Conv2DGradInput()->group(); } int Conv2DGradInput::GetChannelIn() const { return this->primitive_->value_as_Conv2DGradInput()->channelIn(); } @@ -87,23 +102,6 @@ int Conv2DGradInput::GetActivationType() const { return this->primitive_->value_as_Conv2DGradInput()->activationType(); } -void Conv2DGradInput::SetFormat(int format) {} -void Conv2DGradInput::SetGroup(int group) {} -void Conv2DGradInput::SetChannelIn(int channel_in) {} -void Conv2DGradInput::SetChannelOut(int channel_out) {} -void Conv2DGradInput::SetKernelW(int kernel_w) {} -void Conv2DGradInput::SetKernelH(int kernel_h) {} -void Conv2DGradInput::SetStrideW(int stride_w) {} -void Conv2DGradInput::SetStrideH(int stride_h) {} -void Conv2DGradInput::SetPadMode(int pad_mode) {} -void Conv2DGradInput::SetPadUp(int pad_up) {} -void Conv2DGradInput::SetPadDown(int pad_down) {} -void Conv2DGradInput::SetPadLeft(int pad_left) {} -void Conv2DGradInput::SetPadRight(int pad_right) {} -void Conv2DGradInput::SetDilateW(int dilate_w) {} -void Conv2DGradInput::SetDilateH(int dilate_h) {} -void Conv2DGradInput::SetHasBias(bool has_bias) {} -void Conv2DGradInput::SetActivationType(int activation_type) {} #endif } // namespace lite } // namespace mindspore diff --git a/mindspore/lite/src/ops/conv2d_grad_input.h b/mindspore/lite/src/ops/conv2d_grad_input.h index ce92332318..7d8cd2582a 100644 --- a/mindspore/lite/src/ops/conv2d_grad_input.h +++ b/mindspore/lite/src/ops/conv2d_grad_input.h @@ -28,10 +28,30 @@ namespace lite { class Conv2DGradInput : public PrimitiveC { public: #ifdef PRIMITIVE_WRITEABLE + MS_DECLARE_PARENT(Conv2DGradInput, PrimitiveC); Conv2DGradInput() = default; explicit Conv2DGradInput(schema::PrimitiveT *primitive) : PrimitiveC(primitive) {} + void SetFormat(int format); + void SetGroup(int group); + void SetChannelIn(int channel_in); + void SetChannelOut(int channel_out); + void SetKernelW(int kernel_w); + void 
SetKernelH(int kernel_h); + void SetStrideW(int stride_w); + void SetStrideH(int stride_h); + void SetPadMode(int pad_mode); + void SetPadUp(int pad_up); + void SetPadDown(int pad_down); + void SetPadLeft(int pad_left); + void SetPadRight(int pad_right); + void SetDilateW(int dilate_w); + void SetDilateH(int dilate_h); + void SetHasBias(bool has_bias); + void SetActivationType(int activation_type); #else - explicit Conv2DGradInput(schema::Primitive *primitive) : PrimitiveC(primitive) {} + Conv2DGradInput() = default; + + int UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) override; #endif int GetFormat() const; int GetGroup() const; @@ -50,23 +70,6 @@ class Conv2DGradInput : public PrimitiveC { int GetDilateH() const; bool GetHasBias() const; int GetActivationType() const; - void SetFormat(int format); - void SetGroup(int group); - void SetChannelIn(int channel_in); - void SetChannelOut(int channel_out); - void SetKernelW(int kernel_w); - void SetKernelH(int kernel_h); - void SetStrideW(int stride_w); - void SetStrideH(int stride_h); - void SetPadMode(int pad_mode); - void SetPadUp(int pad_up); - void SetPadDown(int pad_down); - void SetPadLeft(int pad_left); - void SetPadRight(int pad_right); - void SetDilateW(int dilate_w); - void SetDilateH(int dilate_h); - void SetHasBias(bool has_bias); - void SetActivationType(int activation_type); }; } // namespace lite } // namespace mindspore diff --git a/mindspore/lite/src/ops/cos.cc b/mindspore/lite/src/ops/cos.cc new file mode 100644 index 0000000000..373b121d97 --- /dev/null +++ b/mindspore/lite/src/ops/cos.cc @@ -0,0 +1,32 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "src/ops/cos.h" + +namespace mindspore { +namespace lite { +#ifndef PRIMITIVE_WRITEABLE +int Cos::UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) { + MS_ASSERT(nullptr != primitive); + MS_ASSERT(nullptr != fbb); + auto val_offset = schema::CreateCos(*fbb); + auto prim_offset = schema::CreatePrimitive(*fbb, schema::PrimitiveType_Cos, val_offset.o); + fbb->Finish(prim_offset); + return RET_OK; +} +#endif +} // namespace lite +} // namespace mindspore diff --git a/mindspore/lite/src/ops/cos.h b/mindspore/lite/src/ops/cos.h index 4675c92eb5..b88a41f13b 100644 --- a/mindspore/lite/src/ops/cos.h +++ b/mindspore/lite/src/ops/cos.h @@ -21,7 +21,7 @@ #include #include #include "ir/dtype/type_id.h" -#include "src/ops/primitive_c.h" +#include "src/ops/arithmetic_self.h" namespace mindspore { namespace lite { @@ -31,7 +31,9 @@ class Cos : public ArithmeticSelf { Cos() = default; explicit Cos(schema::PrimitiveT *primitive) : ArithmeticSelf(primitive) {} #else - explicit Cos(schema::Primitive *primitive) : ArithmeticSelf(primitive) {} + Cos() = default; + + int UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) override; #endif }; } // namespace lite diff --git a/mindspore/lite/src/ops/crop.cc b/mindspore/lite/src/ops/crop.cc index 99bc5df434..514a5975d2 100644 --- a/mindspore/lite/src/ops/crop.cc +++ b/mindspore/lite/src/ops/crop.cc @@ -26,15 +26,31 @@ void Crop::SetAxis(int64_t axis) { this->primitive_->value.AsCrop()->axis = axis void Crop::SetOffsets(const std::vector &offsets) { this->primitive_->value.AsCrop()->offsets = offsets; } #else - +int Crop::UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) { + MS_ASSERT(nullptr != primitive); + MS_ASSERT(nullptr != fbb); + auto attr = primitive->value_as_Crop(); + if (attr == nullptr) { + MS_LOG(ERROR) << "value_as_Crop return nullptr"; + return RET_ERROR; + } + std::vector offsets; + if (attr->offsets() != nullptr) { + for (int i = 0; i < static_cast(attr->offsets()->size()); i++) { + offsets.push_back(attr->offsets()->data()[i]); + } + } + auto val_offset = schema::CreateCropDirect(*fbb, attr->axis(), &offsets); + auto prim_offset = schema::CreatePrimitive(*fbb, schema::PrimitiveType_Crop, val_offset.o); + fbb->Finish(prim_offset); + return RET_OK; +} int64_t Crop::GetAxis() const { return this->primitive_->value_as_Crop()->axis(); } std::vector Crop::GetOffsets() const { auto fb_vector = this->primitive_->value_as_Crop()->offsets(); return std::vector(fb_vector->begin(), fb_vector->end()); } -void Crop::SetAxis(int64_t axis) {} -void Crop::SetOffsets(const std::vector &offsets) {} #endif namespace { constexpr int kCropOutputNum = 1; diff --git a/mindspore/lite/src/ops/crop.h b/mindspore/lite/src/ops/crop.h index 152f5e48a7..0650f7925f 100644 --- a/mindspore/lite/src/ops/crop.h +++ b/mindspore/lite/src/ops/crop.h @@ -20,6 +20,7 @@ #include #include #include +#include #include "ir/dtype/type_id.h" #include "src/ops/primitive_c.h" @@ -28,16 +29,19 @@ namespace lite { class Crop : public PrimitiveC { public: #ifdef PRIMITIVE_WRITEABLE + MS_DECLARE_PARENT(Crop, PrimitiveC); Crop() = default; explicit Crop(schema::PrimitiveT *primitive) : PrimitiveC(primitive) {} + void SetAxis(int64_t axis); + void SetOffsets(const std::vector &offsets); #else - explicit Crop(schema::Primitive *primitive) : PrimitiveC(primitive) {} + Crop() = default; + + int UnPackToFlatBuilder(const schema::Primitive *primitive, 
flatbuffers::FlatBufferBuilder *fbb) override;
 #endif
   int InferShape(std::vector<lite::tensor::Tensor *> inputs_, std::vector<lite::tensor::Tensor *> outputs_) override;
   int64_t GetAxis() const;
   std::vector<int> GetOffsets() const;
-  void SetAxis(int64_t axis);
-  void SetOffsets(const std::vector<int> &offsets);
 };
 }  // namespace lite
 }  // namespace mindspore
diff --git a/mindspore/lite/src/ops/deconv2d.cc b/mindspore/lite/src/ops/deconv2d.cc
index 1e93029074..892207c8cc 100644
--- a/mindspore/lite/src/ops/deconv2d.cc
+++ b/mindspore/lite/src/ops/deconv2d.cc
@@ -58,7 +58,22 @@ void DeConv2D::SetActivationType(int activation_type) {
 }
 
 #else
-
+int DeConv2D::UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) {
+  MS_ASSERT(nullptr != primitive);
+  MS_ASSERT(nullptr != fbb);
+  auto attr = primitive->value_as_DeConv2D();
+  if (attr == nullptr) {
+    MS_LOG(ERROR) << "value_as_DeConv2D return nullptr";
+    return RET_ERROR;
+  }
+  auto val_offset = schema::CreateDeConv2D(
+    *fbb, attr->format(), attr->group(), attr->channelIn(), attr->channelOut(), attr->kernelW(), attr->kernelH(),
+    attr->strideW(), attr->strideH(), attr->padMode(), attr->padUp(), attr->padDown(), attr->padLeft(),
+    attr->padRight(), attr->dilateW(), attr->dilateH(), attr->hasBias(), attr->activationType());
+  auto prim_offset = schema::CreatePrimitive(*fbb, schema::PrimitiveType_DeConv2D, val_offset.o);
+  fbb->Finish(prim_offset);
+  return RET_OK;
+}
 int DeConv2D::GetFormat() const { return this->primitive_->value_as_DeConv2D()->format(); }
 int DeConv2D::GetGroup() const { return this->primitive_->value_as_DeConv2D()->group(); }
 int DeConv2D::GetChannelIn() const { return this->primitive_->value_as_DeConv2D()->channelIn(); }
@@ -77,23 +92,6 @@ int DeConv2D::GetDilateH() const { return this->primitive_->value_as_DeConv2D()->dilateH(); }
 bool DeConv2D::GetHasBias() const { return this->primitive_->value_as_DeConv2D()->hasBias(); }
 int DeConv2D::GetActivationType() const { return this->primitive_->value_as_DeConv2D()->activationType(); }
 
-void DeConv2D::SetFormat(int format) {}
-void DeConv2D::SetGroup(int group) {}
-void DeConv2D::SetChannelIn(int channel_in) {}
-void DeConv2D::SetChannelOut(int channel_out) {}
-void DeConv2D::SetKernelW(int kernel_w) {}
-void DeConv2D::SetKernelH(int kernel_h) {}
-void DeConv2D::SetStrideW(int stride_w) {}
-void DeConv2D::SetStrideH(int stride_h) {}
-void DeConv2D::SetPadMode(int pad_mode) {}
-void DeConv2D::SetPadUp(int pad_up) {}
-void DeConv2D::SetPadDown(int pad_down) {}
-void DeConv2D::SetPadLeft(int pad_left) {}
-void DeConv2D::SetPadRight(int pad_right) {}
-void DeConv2D::SetDilateW(int dilate_w) {}
-void DeConv2D::SetDilateH(int dilate_h) {}
-void DeConv2D::SetHasBias(bool has_bias) {}
-void DeConv2D::SetActivationType(int activation_type) {}
 #endif
 int DeConv2D::InferShape(std::vector<lite::tensor::Tensor *> inputs_, std::vector<lite::tensor::Tensor *> outputs_) {
   MS_ASSERT(this->primitive_ != nullptr);
@@ -141,6 +139,19 @@ int DeConv2D::InferShape(std::vector<lite::tensor::Tensor *> inputs_, std::vector<lite::tensor::Tensor *> outputs_) {
   }
   std::vector<int> out_shape = {output_n, output_h, output_w, output_c};
   output->set_shape(out_shape);
+
+  if (pad_mode == schema::PadMode_SAME) {
+    pad_u_ = ((input_h - 1) * stride_h + (kernel_h - 1) * dilate_h + 1 - output_h) / 2;
+    pad_l_ = ((input_w - 1) * stride_w + (kernel_w - 1) * dilate_w + 1 - output_w) / 2;
+  } else if (pad_mode == schema::PadMode_VALID) {
+    pad_u_ = 0;
+    pad_l_ = 0;
+  } else if (pad_mode == schema::PadMode_CAFFE) {
+    // pads come from the model's explicit pad attributes; nothing to derive
+  } else {
+    MS_LOG(ERROR) << "unsupported pad mode for deconv";
+  }
+
   return 0;
 }
 
 }  // namespace lite
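The new SAME branch in DeConv2D::InferShape derives the top/left pads from the already-computed output size. A worked instance of the arithmetic, under the usual transposed-convolution SAME convention output = input * stride (values chosen for illustration only):

// input_h = 5, stride_h = 2, kernel_h = 3, dilate_h = 1, output_h = 5 * 2 = 10:
//   pad_needed_h = (5 - 1) * 2 + (3 - 1) * 1 + 1 - 10 = 1
//   pad_u_       = pad_needed_h / 2 = 0
// (integer division: any odd leftover row implicitly falls on the bottom side)
int pad_needed_h = (input_h - 1) * stride_h + (kernel_h - 1) * dilate_h + 1 - output_h;
int pad_u = pad_needed_h / 2;

diff --git a/mindspore/lite/src/ops/deconv2d.h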
b/mindspore/lite/src/ops/deconv2d.h index 5fbf25ab43..020b5a95d4 100644 --- a/mindspore/lite/src/ops/deconv2d.h +++ b/mindspore/lite/src/ops/deconv2d.h @@ -28,10 +28,30 @@ namespace lite { class DeConv2D : public PrimitiveC { public: #ifdef PRIMITIVE_WRITEABLE + MS_DECLARE_PARENT(DeConv2D, PrimitiveC); DeConv2D() = default; explicit DeConv2D(schema::PrimitiveT *primitive) : PrimitiveC(primitive) {} + void SetFormat(int format); + void SetGroup(int group); + void SetChannelIn(int channel_in); + void SetChannelOut(int channel_out); + void SetKernelW(int kernel_w); + void SetKernelH(int kernel_h); + void SetStrideW(int stride_w); + void SetStrideH(int stride_h); + void SetPadMode(int pad_mode); + void SetPadUp(int pad_up); + void SetPadDown(int pad_down); + void SetPadLeft(int pad_left); + void SetPadRight(int pad_right); + void SetDilateW(int dilate_w); + void SetDilateH(int dilate_h); + void SetHasBias(bool has_bias); + void SetActivationType(int activation_type); #else - explicit DeConv2D(schema::Primitive *primitive) : PrimitiveC(primitive) {} + DeConv2D() = default; + + int UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) override; #endif int InferShape(std::vector inputs_, std::vector outputs_) override; int GetFormat() const; @@ -51,23 +71,6 @@ class DeConv2D : public PrimitiveC { int GetDilateH() const; bool GetHasBias() const; int GetActivationType() const; - void SetFormat(int format); - void SetGroup(int group); - void SetChannelIn(int channel_in); - void SetChannelOut(int channel_out); - void SetKernelW(int kernel_w); - void SetKernelH(int kernel_h); - void SetStrideW(int stride_w); - void SetStrideH(int stride_h); - void SetPadMode(int pad_mode); - void SetPadUp(int pad_up); - void SetPadDown(int pad_down); - void SetPadLeft(int pad_left); - void SetPadRight(int pad_right); - void SetDilateW(int dilate_w); - void SetDilateH(int dilate_h); - void SetHasBias(bool has_bias); - void SetActivationType(int activation_type); int PadUp() const { return this->pad_u_; } int PadDown() const { return this->pad_d_; } diff --git a/mindspore/lite/src/ops/dedepthwise_conv2d.cc b/mindspore/lite/src/ops/dedepthwise_conv2d.cc index 7fdcf54d51..b2ac622907 100644 --- a/mindspore/lite/src/ops/dedepthwise_conv2d.cc +++ b/mindspore/lite/src/ops/dedepthwise_conv2d.cc @@ -70,7 +70,24 @@ void DeDepthwiseConv2D::SetActivationType(int activation_type) { } #else +int DeDepthwiseConv2D::UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) { + MS_ASSERT(nullptr != primitive); + MS_ASSERT(nullptr != fbb); + auto attr = primitive->value_as_DeDepthwiseConv2D(); + if (attr == nullptr) { + MS_LOG(ERROR) << "value_as_DeDepthwiseConv2D return nullptr"; + return RET_ERROR; + } + + auto val_offset = schema::CreateDeDepthwiseConv2D( + *fbb, attr->format(), attr->channelIn(), attr->channelMultiplier(), attr->kernelW(), attr->kernelH(), + attr->strideW(), attr->strideH(), attr->padMode(), attr->padUp(), attr->padDown(), attr->padLeft(), + attr->padRight(), attr->dilateW(), attr->dilateH(), attr->hasBias(), attr->activationType()); + auto prim_offset = schema::CreatePrimitive(*fbb, schema::PrimitiveType_DeDepthwiseConv2D, val_offset.o); + fbb->Finish(prim_offset); + return RET_OK; +} int DeDepthwiseConv2D::GetFormat() const { return this->primitive_->value_as_DeDepthwiseConv2D()->format(); } int DeDepthwiseConv2D::GetChannelIn() const { return this->primitive_->value_as_DeDepthwiseConv2D()->channelIn(); } int 
DeDepthwiseConv2D::GetChannelMultiplier() const { @@ -92,22 +109,6 @@ int DeDepthwiseConv2D::GetActivationType() const { return this->primitive_->value_as_DeDepthwiseConv2D()->activationType(); } -void DeDepthwiseConv2D::SetFormat(int format) {} -void DeDepthwiseConv2D::SetChannelIn(int channel_in) {} -void DeDepthwiseConv2D::SetChannelMultiplier(int channel_multiplier) {} -void DeDepthwiseConv2D::SetKernelW(int kernel_w) {} -void DeDepthwiseConv2D::SetKernelH(int kernel_h) {} -void DeDepthwiseConv2D::SetStrideW(int stride_w) {} -void DeDepthwiseConv2D::SetStrideH(int stride_h) {} -void DeDepthwiseConv2D::SetPadMode(int pad_mode) {} -void DeDepthwiseConv2D::SetPadUp(int pad_up) {} -void DeDepthwiseConv2D::SetPadDown(int pad_down) {} -void DeDepthwiseConv2D::SetPadLeft(int pad_left) {} -void DeDepthwiseConv2D::SetPadRight(int pad_right) {} -void DeDepthwiseConv2D::SetDilateW(int dilate_w) {} -void DeDepthwiseConv2D::SetDilateH(int dilate_h) {} -void DeDepthwiseConv2D::SetHasBias(bool has_bias) {} -void DeDepthwiseConv2D::SetActivationType(int activation_type) {} #endif int DeDepthwiseConv2D::InferShape(std::vector inputs_, std::vector outputs_) { diff --git a/mindspore/lite/src/ops/dedepthwise_conv2d.h b/mindspore/lite/src/ops/dedepthwise_conv2d.h index f285b76964..142ce5b1f4 100644 --- a/mindspore/lite/src/ops/dedepthwise_conv2d.h +++ b/mindspore/lite/src/ops/dedepthwise_conv2d.h @@ -28,10 +28,29 @@ namespace lite { class DeDepthwiseConv2D : public PrimitiveC { public: #ifdef PRIMITIVE_WRITEABLE + MS_DECLARE_PARENT(DeDepthwiseConv2D, PrimitiveC); DeDepthwiseConv2D() = default; explicit DeDepthwiseConv2D(schema::PrimitiveT *primitive) : PrimitiveC(primitive) {} + void SetFormat(int format); + void SetChannelIn(int channel_in); + void SetChannelMultiplier(int channel_multiplier); + void SetKernelW(int kernel_w); + void SetKernelH(int kernel_h); + void SetStrideW(int stride_w); + void SetStrideH(int stride_h); + void SetPadMode(int pad_mode); + void SetPadUp(int pad_up); + void SetPadDown(int pad_down); + void SetPadLeft(int pad_left); + void SetPadRight(int pad_right); + void SetDilateW(int dilate_w); + void SetDilateH(int dilate_h); + void SetHasBias(bool has_bias); + void SetActivationType(int activation_type); #else - explicit DeDepthwiseConv2D(schema::Primitive *primitive) : PrimitiveC(primitive) {} + DeDepthwiseConv2D() = default; + + int UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) override; #endif int InferShape(std::vector inputs_, std::vector outputs_) override; int GetFormat() const; @@ -50,22 +69,6 @@ class DeDepthwiseConv2D : public PrimitiveC { int GetDilateH() const; bool GetHasBias() const; int GetActivationType() const; - void SetFormat(int format); - void SetChannelIn(int channel_in); - void SetChannelMultiplier(int channel_multiplier); - void SetKernelW(int kernel_w); - void SetKernelH(int kernel_h); - void SetStrideW(int stride_w); - void SetStrideH(int stride_h); - void SetPadMode(int pad_mode); - void SetPadUp(int pad_up); - void SetPadDown(int pad_down); - void SetPadLeft(int pad_left); - void SetPadRight(int pad_right); - void SetDilateW(int dilate_w); - void SetDilateH(int dilate_h); - void SetHasBias(bool has_bias); - void SetActivationType(int activation_type); int PadUp() const { return this->pad_u_; } int PadDown() const { return this->pad_d_; } diff --git a/mindspore/lite/src/ops/depth_to_space.cc b/mindspore/lite/src/ops/depth_to_space.cc index b1b3e2d026..e090fe0dc5 100644 --- 
a/mindspore/lite/src/ops/depth_to_space.cc +++ b/mindspore/lite/src/ops/depth_to_space.cc @@ -26,12 +26,22 @@ void DepthToSpace::SetBlockSize(int block_size) { this->primitive_->value.AsDept void DepthToSpace::SetFormat(int format) { this->primitive_->value.AsDepthToSpace()->format = (schema::Format)format; } #else - +int DepthToSpace::UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) { + MS_ASSERT(nullptr != primitive); + MS_ASSERT(nullptr != fbb); + auto attr = primitive->value_as_DepthToSpace(); + if (attr == nullptr) { + MS_LOG(ERROR) << "value_as_DepthToSpace return nullptr"; + return RET_ERROR; + } + auto val_offset = schema::CreateDepthToSpace(*fbb, attr->blockSize(), attr->format()); + auto prim_offset = schema::CreatePrimitive(*fbb, schema::PrimitiveType_DepthToSpace, val_offset.o); + fbb->Finish(prim_offset); + return RET_OK; +} int DepthToSpace::GetBlockSize() const { return this->primitive_->value_as_DepthToSpace()->blockSize(); } int DepthToSpace::GetFormat() const { return this->primitive_->value_as_DepthToSpace()->format(); } -void DepthToSpace::SetBlockSize(int block_size) {} -void DepthToSpace::SetFormat(int format) {} #endif namespace { constexpr int kDepthToSpaceOutputNum = 1; diff --git a/mindspore/lite/src/ops/depth_to_space.h b/mindspore/lite/src/ops/depth_to_space.h index 4b02e6e948..5320b27c4a 100644 --- a/mindspore/lite/src/ops/depth_to_space.h +++ b/mindspore/lite/src/ops/depth_to_space.h @@ -28,16 +28,19 @@ namespace lite { class DepthToSpace : public PrimitiveC { public: #ifdef PRIMITIVE_WRITEABLE + MS_DECLARE_PARENT(DepthToSpace, PrimitiveC); DepthToSpace() = default; explicit DepthToSpace(schema::PrimitiveT *primitive) : PrimitiveC(primitive) {} + void SetBlockSize(int block_size); + void SetFormat(int format); #else - explicit DepthToSpace(schema::Primitive *primitive) : PrimitiveC(primitive) {} + DepthToSpace() = default; + + int UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) override; #endif int InferShape(std::vector inputs_, std::vector outputs_) override; int GetBlockSize() const; int GetFormat() const; - void SetBlockSize(int block_size); - void SetFormat(int format); }; } // namespace lite } // namespace mindspore diff --git a/mindspore/lite/src/ops/depthwise_conv2d.cc b/mindspore/lite/src/ops/depthwise_conv2d.cc index 5c1e8d4f24..a66e33f82a 100644 --- a/mindspore/lite/src/ops/depthwise_conv2d.cc +++ b/mindspore/lite/src/ops/depthwise_conv2d.cc @@ -232,7 +232,22 @@ int DepthwiseConv2D::UnPackAttr(const Primitive &prim, const std::vectorvalue_as_DepthwiseConv2D(); + if (attr == nullptr) { + MS_LOG(ERROR) << "value_as_DepthwiseConv2D return nullptr"; + return RET_ERROR; + } + auto val_offset = schema::CreateDepthwiseConv2D( + *fbb, attr->format(), attr->channelIn(), attr->channelMultiplier(), attr->kernelW(), attr->kernelH(), + attr->strideW(), attr->strideH(), attr->padMode(), attr->padUp(), attr->padDown(), attr->padLeft(), + attr->padRight(), attr->dilateW(), attr->dilateH(), attr->hasBias(), attr->activationType()); + auto prim_offset = schema::CreatePrimitive(*fbb, schema::PrimitiveType_DepthwiseConv2D, val_offset.o); + fbb->Finish(prim_offset); + return RET_OK; +} int DepthwiseConv2D::GetFormat() const { return this->primitive_->value_as_DepthwiseConv2D()->format(); } int DepthwiseConv2D::GetChannelIn() const { return this->primitive_->value_as_DepthwiseConv2D()->channelIn(); } int DepthwiseConv2D::GetChannelMultiplier() const { @@ -254,22 +269,6 @@ int 
DepthwiseConv2D::GetActivationType() const { return this->primitive_->value_as_DepthwiseConv2D()->activationType(); } -void DepthwiseConv2D::SetFormat(int format) {} -void DepthwiseConv2D::SetChannelIn(int channel_in) {} -void DepthwiseConv2D::SetChannelMultiplier(int channel_multiplier) {} -void DepthwiseConv2D::SetKernelW(int kernel_w) {} -void DepthwiseConv2D::SetKernelH(int kernel_h) {} -void DepthwiseConv2D::SetStrideW(int stride_w) {} -void DepthwiseConv2D::SetStrideH(int stride_h) {} -void DepthwiseConv2D::SetPadMode(int pad_mode) {} -void DepthwiseConv2D::SetPadUp(int pad_up) {} -void DepthwiseConv2D::SetPadDown(int pad_down) {} -void DepthwiseConv2D::SetPadLeft(int pad_left) {} -void DepthwiseConv2D::SetPadRight(int pad_right) {} -void DepthwiseConv2D::SetDilateW(int dilate_w) {} -void DepthwiseConv2D::SetDilateH(int dilate_h) {} -void DepthwiseConv2D::SetHasBias(bool has_bias) {} -void DepthwiseConv2D::SetActivationType(int activation_type) {} #endif int DepthwiseConv2D::InferShape(std::vector inputs_, std::vector outputs_) { @@ -299,6 +298,7 @@ int DepthwiseConv2D::InferShape(std::vector inputs_, int input_channel = in_shape.at(3); int output_w = 0, output_h = 0; + input_channel_ = input_channel; pad_l_ = GetPadLeft(); pad_u_ = GetPadUp(); pad_d_ = GetPadDown(); @@ -308,10 +308,14 @@ int DepthwiseConv2D::InferShape(std::vector inputs_, output_w = std::ceil(static_cast(input_w) / static_cast(GetStrideW())); auto pad_h_all = ((output_h - 1) * GetStrideH() + (GetKernelH() - 1) * GetDilateH() + 1 - input_h); auto pad_w_all = ((output_w - 1) * GetStrideW() + (GetKernelW() - 1) * GetDilateW() + 1 - input_w); - pad_u_ = pad_h_all / 2; - pad_d_ = pad_h_all - pad_u_; - pad_l_ = pad_w_all / 2; - pad_r_ = pad_w_all - pad_l_; + if (pad_h_all > 0) { + pad_u_ = pad_h_all / 2; + pad_d_ = pad_h_all - pad_u_; + } + if (pad_w_all > 0) { + pad_l_ = pad_w_all / 2; + pad_r_ = pad_w_all - pad_l_; + } } else { output_h = std::ceil((static_cast(input_h) + pad_u_ + pad_d_ - (static_cast(GetKernelH()) - 1) * static_cast(GetDilateH())) / diff --git a/mindspore/lite/src/ops/depthwise_conv2d.h b/mindspore/lite/src/ops/depthwise_conv2d.h index ea92d565af..aada41f542 100644 --- a/mindspore/lite/src/ops/depthwise_conv2d.h +++ b/mindspore/lite/src/ops/depthwise_conv2d.h @@ -27,12 +27,29 @@ namespace mindspore { namespace lite { class DepthwiseConv2D : public PrimitiveC { #ifdef PRIMITIVE_WRITEABLE + MS_DECLARE_PARENT(DepthwiseConv2D, PrimitiveC); public: DepthwiseConv2D() = default; explicit DepthwiseConv2D(schema::PrimitiveT *primitive) : PrimitiveC(primitive) {} - int UnPackAttr(const Primitive &prim, const std::vector &inputs); + int UnPackAttr(const Primitive &prim, const std::vector &inputs) override; + void SetFormat(int format); + void SetChannelIn(int channel_in); + void SetChannelMultiplier(int channel_multiplier); + void SetKernelW(int kernel_w); + void SetKernelH(int kernel_h); + void SetStrideW(int stride_w); + void SetStrideH(int stride_h); + void SetPadMode(int pad_mode); + void SetPadUp(int pad_up); + void SetPadDown(int pad_down); + void SetPadLeft(int pad_left); + void SetPadRight(int pad_right); + void SetDilateW(int dilate_w); + void SetDilateH(int dilate_h); + void SetHasBias(bool has_bias); + void SetActivationType(int activation_type); private: void PopulaterQuantParam(const Primitive &prim, std::vector> *vecInputQuantParam, @@ -41,7 +58,9 @@ class DepthwiseConv2D : public PrimitiveC { #else public: - explicit DepthwiseConv2D(schema::Primitive *primitive) : PrimitiveC(primitive) {} + 
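Note: the guards added to DepthwiseConv2D::InferShape above fix a corner case of SAME padding. With ceil-mode output, pad_all = (output - 1) * stride + (kernel - 1) * dilate + 1 - input can come out negative when the stride overshoots the effective kernel, and the old unconditional assignments would install negative pads. A sketch of the guarded computation with both cases spelled out; the free function is this note's own wrapper around the hunk's logic:

#include <cmath>

static void DepthwiseSamePad(int input, int stride, int kernel, int dilate,
                             int *pad_head, int *pad_tail) {
  int output = static_cast<int>(std::ceil(static_cast<float>(input) / stride));
  int pad_all = (output - 1) * stride + (kernel - 1) * dilate + 1 - input;
  if (pad_all > 0) {  // input 10, stride 3, kernel 2: pad_all = 1 -> split 0 / 1
    *pad_head = pad_all / 2;
    *pad_tail = pad_all - *pad_head;
  }
  // input 9, stride 3, kernel 2: pad_all = -1 -> the zero defaults stay untouched
}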
DepthwiseConv2D() = default; + + int UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) override; #endif public: @@ -62,33 +81,19 @@ class DepthwiseConv2D : public PrimitiveC { int GetDilateH() const; bool GetHasBias() const; int GetActivationType() const; - void SetFormat(int format); - void SetChannelIn(int channel_in); - void SetChannelMultiplier(int channel_multiplier); - void SetKernelW(int kernel_w); - void SetKernelH(int kernel_h); - void SetStrideW(int stride_w); - void SetStrideH(int stride_h); - void SetPadMode(int pad_mode); - void SetPadUp(int pad_up); - void SetPadDown(int pad_down); - void SetPadLeft(int pad_left); - void SetPadRight(int pad_right); - void SetDilateW(int dilate_w); - void SetDilateH(int dilate_h); - void SetHasBias(bool has_bias); - void SetActivationType(int activation_type); int PadUp() const { return this->pad_u_; } int PadDown() const { return this->pad_d_; } int PadLeft() const { return this->pad_l_; } int PadRight() const { return this->pad_r_; } + int GetInputChannel() const { return this->input_channel_; } protected: int pad_u_ = 0; int pad_d_ = 0; int pad_l_ = 0; int pad_r_ = 0; + int input_channel_ = 0; }; } // namespace lite } // namespace mindspore diff --git a/mindspore/lite/src/ops/dequant.cc b/mindspore/lite/src/ops/dequant.cc index eabb28124c..e9ab1bf36c 100644 --- a/mindspore/lite/src/ops/dequant.cc +++ b/mindspore/lite/src/ops/dequant.cc @@ -21,10 +21,30 @@ namespace mindspore { namespace lite { #ifdef PRIMITIVE_WRITEABLE int Dequant::UnPackAttr(const Primitive &prim, const std::vector &inputs) { - this->primitive_ = new (schema::PrimitiveT); - auto attr = std::make_unique(); - this->primitive_->value.type = schema::PrimitiveType_OnnxInt8Dequantize; - this->primitive_->value.value = attr.release(); + if (this->primitive_ == nullptr) { + this->primitive_ = new (std::nothrow) schema::PrimitiveT; + if (this->primitive_ == nullptr) { + MS_LOG(ERROR) << "new primitiveT failed"; + return RET_ERROR; + } + this->primitive_->value.type = schema::PrimitiveType_OnnxInt8Dequantize; + } + if (this->primitive_->value.type != schema::PrimitiveType_OnnxInt8Dequantize) { + MS_LOG(ERROR) << "primitive_ type is error:" << this->primitive_->value.type; + return RET_ERROR; + } + if (this->primitive_->value.value == nullptr) { + auto attr = new (std::nothrow)(schema::OnnxInt8DequantizeT); + if (attr == nullptr) { + MS_LOG(ERROR) << "attr is nullptr"; + return RET_ERROR; + } + this->primitive_->value.value = attr; + if (this->primitive_->value.value == nullptr) { + MS_LOG(ERROR) << "primitive value is nullptr"; + return RET_ERROR; + } + } return RET_OK; } #endif diff --git a/mindspore/lite/src/ops/dequant.h b/mindspore/lite/src/ops/dequant.h index d9553177a2..73fd1391f3 100644 --- a/mindspore/lite/src/ops/dequant.h +++ b/mindspore/lite/src/ops/dequant.h @@ -25,11 +25,12 @@ namespace lite { class Dequant : public PrimitiveC { public: #ifdef PRIMITIVE_WRITEABLE + MS_DECLARE_PARENT(Dequant, PrimitiveC); Dequant() = default; explicit Dequant(schema::PrimitiveT *primitive) : PrimitiveC(primitive) {} - int UnPackAttr(const Primitive &prim, const std::vector &inputs); + int UnPackAttr(const Primitive &prim, const std::vector &inputs) override; #else - explicit Dequant(schema::Primitive *primitive) : PrimitiveC(primitive) {} + Dequant() = default; #endif }; } // namespace lite diff --git a/mindspore/lite/src/ops/detection_post_process.cc b/mindspore/lite/src/ops/detection_post_process.cc index bbe1a1e5b5..5e4754f077 100644 --- 
a/mindspore/lite/src/ops/detection_post_process.cc +++ b/mindspore/lite/src/ops/detection_post_process.cc @@ -88,7 +88,22 @@ void DetectionPostProcess::SetUseRegularNms(bool use_regular_nms) { } #else - +int DetectionPostProcess::UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) { + MS_ASSERT(nullptr != primitive); + MS_ASSERT(nullptr != fbb); + auto attr = primitive->value_as_DetectionPostProcess(); + if (attr == nullptr) { + MS_LOG(ERROR) << "value_as_DetectionPostProcess return nullptr"; + return RET_ERROR; + } + auto val_offset = schema::CreateDetectionPostProcess( + *fbb, attr->format(), attr->inputSize(), attr->hScale(), attr->wScale(), attr->xScale(), attr->yScale(), + attr->NmsIouThreshold(), attr->NmsScoreThreshold(), attr->MaxDetections(), attr->DetectionsPreClass(), + attr->MaxClassesPreDetection(), attr->NumClasses(), attr->UseRegularNms()); + auto prim_offset = schema::CreatePrimitive(*fbb, schema::PrimitiveType_DetectionPostProcess, val_offset.o); + fbb->Finish(prim_offset); + return RET_OK; +} int DetectionPostProcess::GetFormat() const { return this->primitive_->value_as_DetectionPostProcess()->format(); } int DetectionPostProcess::GetInputSize() const { return this->primitive_->value_as_DetectionPostProcess()->inputSize(); @@ -119,19 +134,6 @@ bool DetectionPostProcess::GetUseRegularNms() const { return this->primitive_->value_as_DetectionPostProcess()->UseRegularNms(); } -void DetectionPostProcess::SetFormat(int format) {} -void DetectionPostProcess::SetInputSize(int input_size) {} -void DetectionPostProcess::SetHScale(float h_scale) {} -void DetectionPostProcess::SetWScale(float w_scale) {} -void DetectionPostProcess::SetXScale(float x_scale) {} -void DetectionPostProcess::SetYScale(float y_scale) {} -void DetectionPostProcess::SetNmsIouThreshold(float nms_iou_threshold) {} -void DetectionPostProcess::SetNmsScoreThreshold(float nms_score_threshold) {} -void DetectionPostProcess::SetMaxDetections(int64_t max_detections) {} -void DetectionPostProcess::SetDetectionsPreClass(int64_t detections_pre_class) {} -void DetectionPostProcess::SetMaxClassesPreDetection(int64_t max_classes_pre_detection) {} -void DetectionPostProcess::SetNumClasses(int64_t num_classes) {} -void DetectionPostProcess::SetUseRegularNms(bool use_regular_nms) {} #endif } // namespace lite } // namespace mindspore diff --git a/mindspore/lite/src/ops/detection_post_process.h b/mindspore/lite/src/ops/detection_post_process.h index f4d687ab89..4fb9dea282 100644 --- a/mindspore/lite/src/ops/detection_post_process.h +++ b/mindspore/lite/src/ops/detection_post_process.h @@ -28,10 +28,26 @@ namespace lite { class DetectionPostProcess : public PrimitiveC { public: #ifdef PRIMITIVE_WRITEABLE + MS_DECLARE_PARENT(DetectionPostProcess, PrimitiveC); DetectionPostProcess() = default; explicit DetectionPostProcess(schema::PrimitiveT *primitive) : PrimitiveC(primitive) {} + void SetFormat(int format); + void SetInputSize(int input_size); + void SetHScale(float h_scale); + void SetWScale(float w_scale); + void SetXScale(float x_scale); + void SetYScale(float y_scale); + void SetNmsIouThreshold(float nms_iou_threshold); + void SetNmsScoreThreshold(float nms_score_threshold); + void SetMaxDetections(int64_t max_detections); + void SetDetectionsPreClass(int64_t detections_pre_class); + void SetMaxClassesPreDetection(int64_t max_classes_pre_detection); + void SetNumClasses(int64_t num_classes); + void SetUseRegularNms(bool use_regular_nms); #else - explicit 
DetectionPostProcess(schema::Primitive *primitive) : PrimitiveC(primitive) {} + DetectionPostProcess() = default; + + int UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) override; #endif int GetFormat() const; int GetInputSize() const; @@ -46,19 +62,6 @@ class DetectionPostProcess : public PrimitiveC { int64_t GetMaxClassesPreDetection() const; int64_t GetNumClasses() const; bool GetUseRegularNms() const; - void SetFormat(int format); - void SetInputSize(int input_size); - void SetHScale(float h_scale); - void SetWScale(float w_scale); - void SetXScale(float x_scale); - void SetYScale(float y_scale); - void SetNmsIouThreshold(float nms_iou_threshold); - void SetNmsScoreThreshold(float nms_score_threshold); - void SetMaxDetections(int64_t max_detections); - void SetDetectionsPreClass(int64_t detections_pre_class); - void SetMaxClassesPreDetection(int64_t max_classes_pre_detection); - void SetNumClasses(int64_t num_classes); - void SetUseRegularNms(bool use_regular_nms); }; } // namespace lite } // namespace mindspore diff --git a/mindspore/lite/src/ops/div.cc b/mindspore/lite/src/ops/div.cc index 93da12cc7e..12300fe9ec 100644 --- a/mindspore/lite/src/ops/div.cc +++ b/mindspore/lite/src/ops/div.cc @@ -26,10 +26,21 @@ void Div::SetActivationType(int activation_type) { } #else - +int Div::UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) { + MS_ASSERT(nullptr != primitive); + MS_ASSERT(nullptr != fbb); + auto attr = primitive->value_as_Div(); + if (attr == nullptr) { + MS_LOG(ERROR) << "value_as_Div return nullptr"; + return RET_ERROR; + } + auto val_offset = schema::CreateDiv(*fbb, attr->activationType()); + auto prim_offset = schema::CreatePrimitive(*fbb, schema::PrimitiveType_Div, val_offset.o); + fbb->Finish(prim_offset); + return RET_OK; +} int Div::GetActivationType() const { return this->primitive_->value_as_Div()->activationType(); } -void Div::SetActivationType(int activation_type) {} #endif } // namespace lite } // namespace mindspore diff --git a/mindspore/lite/src/ops/div.h b/mindspore/lite/src/ops/div.h index 5f10b2bc23..6ca390b091 100644 --- a/mindspore/lite/src/ops/div.h +++ b/mindspore/lite/src/ops/div.h @@ -28,13 +28,17 @@ namespace lite { class Div : public Arithmetic { public: #ifdef PRIMITIVE_WRITEABLE + MS_DECLARE_PARENT(Div, Arithmetic); Div() = default; explicit Div(schema::PrimitiveT *primitive) : Arithmetic(primitive) {} + void SetActivationType(int activation_type); + #else - explicit Div(schema::Primitive *primitive) : Arithmetic(primitive) {} + Div() = default; + + int UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) override; #endif int GetActivationType() const; - void SetActivationType(int activation_type); }; } // namespace lite } // namespace mindspore diff --git a/mindspore/lite/src/ops/dropout.cc b/mindspore/lite/src/ops/dropout.cc index b381e8f031..7cce7ad1cc 100644 --- a/mindspore/lite/src/ops/dropout.cc +++ b/mindspore/lite/src/ops/dropout.cc @@ -24,10 +24,21 @@ float Dropout::GetRatio() const { return this->primitive_->value.AsDropout()->ra void Dropout::SetRatio(float ratio) { this->primitive_->value.AsDropout()->ratio = ratio; } #else - +int Dropout::UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) { + MS_ASSERT(nullptr != primitive); + MS_ASSERT(nullptr != fbb); + auto attr = primitive->value_as_Dropout(); + if (attr == nullptr) { + MS_LOG(ERROR) << "value_as_Dropout return 
nullptr"; + return RET_ERROR; + } + auto val_offset = schema::CreateDropout(*fbb, attr->ratio()); + auto prim_offset = schema::CreatePrimitive(*fbb, schema::PrimitiveType_Dropout, val_offset.o); + fbb->Finish(prim_offset); + return RET_OK; +} float Dropout::GetRatio() const { return this->primitive_->value_as_Dropout()->ratio(); } -void Dropout::SetRatio(float ratio) {} #endif } // namespace lite } // namespace mindspore diff --git a/mindspore/lite/src/ops/dropout.h b/mindspore/lite/src/ops/dropout.h index 4a99d07a81..f2a756eddd 100644 --- a/mindspore/lite/src/ops/dropout.h +++ b/mindspore/lite/src/ops/dropout.h @@ -20,21 +20,25 @@ #include #include #include -#include "ir/dtype/type_id.h" #include "src/ops/primitive_c.h" +#include "ir/dtype/type_id.h" namespace mindspore { namespace lite { class Dropout : public PrimitiveC { public: #ifdef PRIMITIVE_WRITEABLE + MS_DECLARE_PARENT(Dropout, PrimitiveC); Dropout() = default; explicit Dropout(schema::PrimitiveT *primitive) : PrimitiveC(primitive) {} + void SetRatio(float ratio); + #else - explicit Dropout(schema::Primitive *primitive) : PrimitiveC(primitive) {} + Dropout() = default; + + int UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) override; #endif float GetRatio() const; - void SetRatio(float ratio); }; } // namespace lite } // namespace mindspore diff --git a/mindspore/lite/src/ops/eltwise.cc b/mindspore/lite/src/ops/eltwise.cc index 3c5e536512..3760cd9f85 100644 --- a/mindspore/lite/src/ops/eltwise.cc +++ b/mindspore/lite/src/ops/eltwise.cc @@ -24,10 +24,21 @@ int Eltwise::GetMode() const { return this->primitive_->value.AsEltwise()->mode; void Eltwise::SetMode(int mode) { this->primitive_->value.AsEltwise()->mode = (schema::EltwiseMode)mode; } #else - +int Eltwise::UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) { + MS_ASSERT(nullptr != primitive); + MS_ASSERT(nullptr != fbb); + auto attr = primitive->value_as_Eltwise(); + if (attr == nullptr) { + MS_LOG(ERROR) << "value_as_Eltwise return nullptr"; + return RET_ERROR; + } + auto val_offset = schema::CreateEltwise(*fbb, attr->mode()); + auto prim_offset = schema::CreatePrimitive(*fbb, schema::PrimitiveType_Eltwise, val_offset.o); + fbb->Finish(prim_offset); + return RET_OK; +} int Eltwise::GetMode() const { return this->primitive_->value_as_Eltwise()->mode(); } -void Eltwise::SetMode(int mode) {} #endif } // namespace lite } // namespace mindspore diff --git a/mindspore/lite/src/ops/eltwise.h b/mindspore/lite/src/ops/eltwise.h index ba28b1f533..720724b94f 100644 --- a/mindspore/lite/src/ops/eltwise.h +++ b/mindspore/lite/src/ops/eltwise.h @@ -28,13 +28,17 @@ namespace lite { class Eltwise : public PrimitiveC { public: #ifdef PRIMITIVE_WRITEABLE + MS_DECLARE_PARENT(Eltwise, PrimitiveC); Eltwise() = default; explicit Eltwise(schema::PrimitiveT *primitive) : PrimitiveC(primitive) {} + void SetMode(int mode); + #else - explicit Eltwise(schema::Primitive *primitive) : PrimitiveC(primitive) {} + Eltwise() = default; + + int UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) override; #endif int GetMode() const; - void SetMode(int mode); }; } // namespace lite } // namespace mindspore diff --git a/mindspore/lite/src/ops/elu.cc b/mindspore/lite/src/ops/elu.cc index 6c164e45ad..9a1e16991f 100644 --- a/mindspore/lite/src/ops/elu.cc +++ b/mindspore/lite/src/ops/elu.cc @@ -24,10 +24,21 @@ float Elu::GetAlpha() const { return this->primitive_->value.AsElu()->alpha; } 
void Elu::SetAlpha(float alpha) { this->primitive_->value.AsElu()->alpha = alpha; } #else - +int Elu::UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) { + MS_ASSERT(nullptr != primitive); + MS_ASSERT(nullptr != fbb); + auto attr = primitive->value_as_Elu(); + if (attr == nullptr) { + MS_LOG(ERROR) << "value_as_Elu return nullptr"; + return RET_ERROR; + } + auto val_offset = schema::CreateElu(*fbb, attr->alpha()); + auto prim_offset = schema::CreatePrimitive(*fbb, schema::PrimitiveType_Elu, val_offset.o); + fbb->Finish(prim_offset); + return RET_OK; +} float Elu::GetAlpha() const { return this->primitive_->value_as_Elu()->alpha(); } -void Elu::SetAlpha(float alpha) {} #endif } // namespace lite } // namespace mindspore diff --git a/mindspore/lite/src/ops/elu.h b/mindspore/lite/src/ops/elu.h index 2c08ea817d..e0f3a5f576 100644 --- a/mindspore/lite/src/ops/elu.h +++ b/mindspore/lite/src/ops/elu.h @@ -28,13 +28,17 @@ namespace lite { class Elu : public PrimitiveC { public: #ifdef PRIMITIVE_WRITEABLE + MS_DECLARE_PARENT(Elu, PrimitiveC); Elu() = default; explicit Elu(schema::PrimitiveT *primitive) : PrimitiveC(primitive) {} + void SetAlpha(float alpha); + #else - explicit Elu(schema::Primitive *primitive) : PrimitiveC(primitive) {} + Elu() = default; + + int UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) override; #endif float GetAlpha() const; - void SetAlpha(float alpha); }; } // namespace lite } // namespace mindspore diff --git a/mindspore/lite/src/ops/embedding_lookup.cc b/mindspore/lite/src/ops/embedding_lookup.cc index 4a4b391529..270f22f310 100644 --- a/mindspore/lite/src/ops/embedding_lookup.cc +++ b/mindspore/lite/src/ops/embedding_lookup.cc @@ -24,10 +24,23 @@ float EmbeddingLookup::GetMaxNorm() const { return this->primitive_->value.AsEmb void EmbeddingLookup::SetMaxNorm(float max_norm) { this->primitive_->value.AsEmbeddingLookup()->maxNorm = max_norm; } #else +int EmbeddingLookup::UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) { + MS_ASSERT(nullptr != primitive); + MS_ASSERT(nullptr != fbb); + auto attr = primitive->value_as_EmbeddingLookup(); + if (attr == nullptr) { + MS_LOG(ERROR) << "value_as_EmbeddingLookup return nullptr"; + return RET_ERROR; + } + + auto val_offset = schema::CreateEmbeddingLookup(*fbb, attr->maxNorm()); + auto prim_offset = schema::CreatePrimitive(*fbb, schema::PrimitiveType_EmbeddingLookup, val_offset.o); + fbb->Finish(prim_offset); + return RET_OK; +} float EmbeddingLookup::GetMaxNorm() const { return this->primitive_->value_as_EmbeddingLookup()->maxNorm(); } -void EmbeddingLookup::SetMaxNorm(float max_norm) {} #endif int EmbeddingLookup::InferShape(std::vector inputs_, std::vector outputs_) { diff --git a/mindspore/lite/src/ops/embedding_lookup.h b/mindspore/lite/src/ops/embedding_lookup.h index 82aa70f12e..c51441b9b0 100644 --- a/mindspore/lite/src/ops/embedding_lookup.h +++ b/mindspore/lite/src/ops/embedding_lookup.h @@ -28,14 +28,18 @@ namespace lite { class EmbeddingLookup : public PrimitiveC { public: #ifdef PRIMITIVE_WRITEABLE + MS_DECLARE_PARENT(EmbeddingLookup, PrimitiveC); EmbeddingLookup() = default; explicit EmbeddingLookup(schema::PrimitiveT *primitive) : PrimitiveC(primitive) {} + void SetMaxNorm(float max_norm); + #else - explicit EmbeddingLookup(schema::Primitive *primitive) : PrimitiveC(primitive) {} + EmbeddingLookup() = default; + + int UnPackToFlatBuilder(const schema::Primitive *primitive, 
flatbuffers::FlatBufferBuilder *fbb) override; #endif int InferShape(std::vector inputs_, std::vector outputs_) override; float GetMaxNorm() const; - void SetMaxNorm(float max_norm); }; } // namespace lite } // namespace mindspore diff --git a/mindspore/lite/src/ops/embedding_lookup_sparse.cc b/mindspore/lite/src/ops/embedding_lookup_sparse.cc index eb37231f9b..b981defde2 100644 --- a/mindspore/lite/src/ops/embedding_lookup_sparse.cc +++ b/mindspore/lite/src/ops/embedding_lookup_sparse.cc @@ -38,7 +38,32 @@ void EmbeddingLookupSparse::SetMaxNortm(float max_nortm) { } #else - +int EmbeddingLookupSparse::UnPackToFlatBuilder(const schema::Primitive *primitive, + flatbuffers::FlatBufferBuilder *fbb) { + MS_ASSERT(nullptr != primitive); + MS_ASSERT(nullptr != fbb); + auto attr = primitive->value_as_EmbeddingLookupSparse(); + if (attr == nullptr) { + MS_LOG(ERROR) << "value_as_EmbeddingLookupSparse return nullptr"; + return RET_ERROR; + } + std::vector spIds; + if (attr->spIds() != nullptr) { + for (int i = 0; i < static_cast(attr->spIds()->size()); i++) { + spIds.push_back(attr->spIds()->data()[i]); + } + } + std::vector spWeights; + if (attr->spWeights() != nullptr) { + for (int i = 0; i < static_cast(attr->spWeights()->size()); i++) { + spWeights.push_back(attr->spWeights()->data()[i]); + } + } + auto val_offset = schema::CreateEmbeddingLookupSparseDirect(*fbb, &spIds, &spWeights); + auto prim_offset = schema::CreatePrimitive(*fbb, schema::PrimitiveType_EmbeddingLookupSparse, val_offset.o); + fbb->Finish(prim_offset); + return RET_OK; +} std::vector EmbeddingLookupSparse::GetSpIds() const { auto fb_vector = this->primitive_->value_as_EmbeddingLookupSparse()->spIds(); return std::vector(fb_vector->begin(), fb_vector->end()); @@ -51,9 +76,6 @@ float EmbeddingLookupSparse::GetMaxNortm() const { return this->primitive_->value_as_EmbeddingLookupSparse()->maxNortm(); } -void EmbeddingLookupSparse::SetSpIds(const std::vector &sp_ids) {} -void EmbeddingLookupSparse::SetSpWeights(const std::vector &sp_weights) {} -void EmbeddingLookupSparse::SetMaxNortm(float max_nortm) {} #endif } // namespace lite } // namespace mindspore diff --git a/mindspore/lite/src/ops/embedding_lookup_sparse.h b/mindspore/lite/src/ops/embedding_lookup_sparse.h index a07a01c991..8ca4cd1177 100644 --- a/mindspore/lite/src/ops/embedding_lookup_sparse.h +++ b/mindspore/lite/src/ops/embedding_lookup_sparse.h @@ -20,6 +20,7 @@ #include #include #include +#include #include "ir/dtype/type_id.h" #include "src/ops/primitive_c.h" @@ -28,17 +29,20 @@ namespace lite { class EmbeddingLookupSparse : public PrimitiveC { public: #ifdef PRIMITIVE_WRITEABLE + MS_DECLARE_PARENT(EmbeddingLookupSparse, PrimitiveC); EmbeddingLookupSparse() = default; explicit EmbeddingLookupSparse(schema::PrimitiveT *primitive) : PrimitiveC(primitive) {} + void SetSpIds(const std::vector &sp_ids); + void SetSpWeights(const std::vector &sp_weights); + void SetMaxNortm(float max_nortm); #else - explicit EmbeddingLookupSparse(schema::Primitive *primitive) : PrimitiveC(primitive) {} + EmbeddingLookupSparse() = default; + + int UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) override; #endif std::vector GetSpIds() const; std::vector GetSpWeights() const; float GetMaxNortm() const; - void SetSpIds(const std::vector &sp_ids); - void SetSpWeights(const std::vector &sp_weights); - void SetMaxNortm(float max_nortm); }; } // namespace lite } // namespace mindspore diff --git a/mindspore/lite/src/ops/equal.cc 
b/mindspore/lite/src/ops/equal.cc
new file mode 100644
index 0000000000..9732211061
--- /dev/null
+++ b/mindspore/lite/src/ops/equal.cc
@@ -0,0 +1,33 @@
+/**
+ * Copyright 2019-2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "src/ops/equal.h"
+
+namespace mindspore {
+namespace lite {
+#ifndef PRIMITIVE_WRITEABLE
+int Equal::UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) {
+  MS_ASSERT(nullptr != primitive);
+  MS_ASSERT(nullptr != fbb);
+  auto val_offset = schema::CreateEqual(*fbb);
+  auto prim_offset = schema::CreatePrimitive(*fbb, schema::PrimitiveType_Equal, val_offset.o);
+  fbb->Finish(prim_offset);
+  return RET_OK;
+}
+
+#endif
+}  // namespace lite
+}  // namespace mindspore
diff --git a/mindspore/lite/src/ops/equal.h b/mindspore/lite/src/ops/equal.h
index 82b0cf362c..69194b5528 100644
--- a/mindspore/lite/src/ops/equal.h
+++ b/mindspore/lite/src/ops/equal.h
@@ -28,10 +28,13 @@ namespace lite {
 class Equal : public Arithmetic {
  public:
 #ifdef PRIMITIVE_WRITEABLE
+  MS_DECLARE_PARENT(Equal, Arithmetic);
   Equal() = default;
   explicit Equal(schema::PrimitiveT *primitive) : Arithmetic(primitive) {}
 #else
-  explicit Equal(schema::Primitive *primitive) : Arithmetic(primitive) {}
+  Equal() = default;
+
+  int UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) override;
 #endif
 };
 }  // namespace lite
diff --git a/mindspore/lite/src/ops/exp.cc b/mindspore/lite/src/ops/exp.cc
new file mode 100644
index 0000000000..1c5acbba01
--- /dev/null
+++ b/mindspore/lite/src/ops/exp.cc
@@ -0,0 +1,33 @@
+/**
+ * Copyright 2019-2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +#include "src/ops/exp.h" + +namespace mindspore { +namespace lite { +#ifndef PRIMITIVE_WRITEABLE +int Exp::UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) { + MS_ASSERT(nullptr != primitive); + MS_ASSERT(nullptr != fbb); + auto val_offset = schema::CreateExp(*fbb); + auto prim_offset = schema::CreatePrimitive(*fbb, schema::PrimitiveType_Exp, val_offset.o); + fbb->Finish(prim_offset); + return RET_OK; +} + +#endif +} // namespace lite +} // namespace mindspore diff --git a/mindspore/lite/src/ops/exp.h b/mindspore/lite/src/ops/exp.h index a1f7eaa23b..c79648725e 100644 --- a/mindspore/lite/src/ops/exp.h +++ b/mindspore/lite/src/ops/exp.h @@ -28,10 +28,13 @@ namespace lite { class Exp : public ArithmeticSelf { public: #ifdef PRIMITIVE_WRITEABLE + MS_DECLARE_PARENT(Exp, ArithmeticSelf); Exp() = default; explicit Exp(schema::PrimitiveT *primitive) : ArithmeticSelf(primitive) {} #else - explicit Exp(schema::Primitive *primitive) : ArithmeticSelf(primitive) {} + Exp() = default; + + int UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) override; #endif }; } // namespace lite diff --git a/mindspore/lite/src/ops/expand_dims.cc b/mindspore/lite/src/ops/expand_dims.cc index 38b584d681..7961a6385b 100644 --- a/mindspore/lite/src/ops/expand_dims.cc +++ b/mindspore/lite/src/ops/expand_dims.cc @@ -24,10 +24,22 @@ int ExpandDims::GetDim() const { return this->primitive_->value.AsExpandDims()-> void ExpandDims::SetDim(int dim) { this->primitive_->value.AsExpandDims()->dim = dim; } #else +int ExpandDims::UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) { + MS_ASSERT(nullptr != primitive); + MS_ASSERT(nullptr != fbb); + auto attr = primitive->value_as_ExpandDims(); + if (attr == nullptr) { + MS_LOG(ERROR) << "value_as_ExpandDims return nullptr"; + return RET_ERROR; + } + auto val_offset = schema::CreateExpandDims(*fbb, attr->dim()); + auto prim_offset = schema::CreatePrimitive(*fbb, schema::PrimitiveType_ExpandDims, val_offset.o); + fbb->Finish(prim_offset); + return RET_OK; +} int ExpandDims::GetDim() const { return this->primitive_->value_as_ExpandDims()->dim(); } -void ExpandDims::SetDim(int dim) {} #endif int ExpandDims::InferShape(std::vector inputs_, std::vector outputs_) { diff --git a/mindspore/lite/src/ops/expand_dims.h b/mindspore/lite/src/ops/expand_dims.h index 36404f1b25..52007b874d 100644 --- a/mindspore/lite/src/ops/expand_dims.h +++ b/mindspore/lite/src/ops/expand_dims.h @@ -28,14 +28,18 @@ namespace lite { class ExpandDims : public PrimitiveC { public: #ifdef PRIMITIVE_WRITEABLE + MS_DECLARE_PARENT(ExpandDims, PrimitiveC); ExpandDims() = default; explicit ExpandDims(schema::PrimitiveT *primitive) : PrimitiveC(primitive) {} + void SetDim(int dim); + #else - explicit ExpandDims(schema::Primitive *primitive) : PrimitiveC(primitive) {} + ExpandDims() = default; + + int UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) override; #endif int InferShape(std::vector inputs_, std::vector outputs_) override; int GetDim() const; - void SetDim(int dim); }; } // namespace lite } // namespace mindspore diff --git a/mindspore/lite/src/ops/fake_quant_with_min_max_vars.cc b/mindspore/lite/src/ops/fake_quant_with_min_max_vars.cc index f027b42753..5bc8b3eb26 100644 --- a/mindspore/lite/src/ops/fake_quant_with_min_max_vars.cc +++ b/mindspore/lite/src/ops/fake_quant_with_min_max_vars.cc @@ -32,7 +32,21 @@ void 
FakeQuantWithMinMaxVars::SetNumBits(int num_bits) { } #else +int FakeQuantWithMinMaxVars::UnPackToFlatBuilder(const schema::Primitive *primitive, + flatbuffers::FlatBufferBuilder *fbb) { + MS_ASSERT(nullptr != primitive); + MS_ASSERT(nullptr != fbb); + auto attr = primitive->value_as_FakeQuantWithMinMaxVars(); + if (attr == nullptr) { + MS_LOG(ERROR) << "value_as_FakeQuantWithMinMaxVars return nullptr"; + return RET_ERROR; + } + auto val_offset = schema::CreateFakeQuantWithMinMaxVars(*fbb, attr->narrowRange(), attr->numBits()); + auto prim_offset = schema::CreatePrimitive(*fbb, schema::PrimitiveType_FakeQuantWithMinMaxVars, val_offset.o); + fbb->Finish(prim_offset); + return RET_OK; +} bool FakeQuantWithMinMaxVars::GetNarrowRange() const { return this->primitive_->value_as_FakeQuantWithMinMaxVars()->narrowRange(); } @@ -40,8 +54,6 @@ int FakeQuantWithMinMaxVars::GetNumBits() const { return this->primitive_->value_as_FakeQuantWithMinMaxVars()->numBits(); } -void FakeQuantWithMinMaxVars::SetNarrowRange(bool narrow_range) {} -void FakeQuantWithMinMaxVars::SetNumBits(int num_bits) {} #endif } // namespace lite } // namespace mindspore diff --git a/mindspore/lite/src/ops/fake_quant_with_min_max_vars.h b/mindspore/lite/src/ops/fake_quant_with_min_max_vars.h index ecc89074db..a6f85f7b92 100644 --- a/mindspore/lite/src/ops/fake_quant_with_min_max_vars.h +++ b/mindspore/lite/src/ops/fake_quant_with_min_max_vars.h @@ -28,15 +28,18 @@ namespace lite { class FakeQuantWithMinMaxVars : public PrimitiveC { public: #ifdef PRIMITIVE_WRITEABLE + MS_DECLARE_PARENT(FakeQuantWithMinMaxVars, PrimitiveC); FakeQuantWithMinMaxVars() = default; explicit FakeQuantWithMinMaxVars(schema::PrimitiveT *primitive) : PrimitiveC(primitive) {} + void SetNarrowRange(bool narrow_range); + void SetNumBits(int num_bits); #else - explicit FakeQuantWithMinMaxVars(schema::Primitive *primitive) : PrimitiveC(primitive) {} + FakeQuantWithMinMaxVars() = default; + + int UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) override; #endif bool GetNarrowRange() const; int GetNumBits() const; - void SetNarrowRange(bool narrow_range); - void SetNumBits(int num_bits); }; } // namespace lite } // namespace mindspore diff --git a/mindspore/lite/src/ops/fill.cc b/mindspore/lite/src/ops/fill.cc index 35682a0430..9ae1fb4305 100644 --- a/mindspore/lite/src/ops/fill.cc +++ b/mindspore/lite/src/ops/fill.cc @@ -24,13 +24,30 @@ std::vector Fill::GetDims() const { return this->primitive_->value.AsFill() void Fill::SetDims(const std::vector &dims) { this->primitive_->value.AsFill()->dims = dims; } #else - +int Fill::UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) { + MS_ASSERT(nullptr != primitive); + MS_ASSERT(nullptr != fbb); + auto attr = primitive->value_as_Fill(); + if (attr == nullptr) { + MS_LOG(ERROR) << "value_as_Fill return nullptr"; + return RET_ERROR; + } + std::vector dims; + if (attr->dims() != nullptr) { + for (int i = 0; i < static_cast(attr->dims()->size()); i++) { + dims.push_back(attr->dims()->data()[i]); + } + } + auto val_offset = schema::CreateFillDirect(*fbb, &dims); + auto prim_offset = schema::CreatePrimitive(*fbb, schema::PrimitiveType_Fill, val_offset.o); + fbb->Finish(prim_offset); + return RET_OK; +} std::vector Fill::GetDims() const { auto fb_vector = this->primitive_->value_as_Fill()->dims(); return std::vector(fb_vector->begin(), fb_vector->end()); } -void Fill::SetDims(const std::vector &dims) {} #endif int Fill::InferShape(std::vector 
inputs_, std::vector outputs_) { diff --git a/mindspore/lite/src/ops/fill.h b/mindspore/lite/src/ops/fill.h index 388e11bea7..f95d22542c 100644 --- a/mindspore/lite/src/ops/fill.h +++ b/mindspore/lite/src/ops/fill.h @@ -20,6 +20,7 @@ #include #include #include +#include #include "ir/dtype/type_id.h" #include "src/ops/primitive_c.h" @@ -28,14 +29,18 @@ namespace lite { class Fill : public PrimitiveC { public: #ifdef PRIMITIVE_WRITEABLE + MS_DECLARE_PARENT(Fill, PrimitiveC); Fill() = default; explicit Fill(schema::PrimitiveT *primitive) : PrimitiveC(primitive) {} + void SetDims(const std::vector &dims); + #else - explicit Fill(schema::Primitive *primitive) : PrimitiveC(primitive) {} + Fill() = default; + + int UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) override; #endif int InferShape(std::vector inputs_, std::vector outputs_) override; std::vector GetDims() const; - void SetDims(const std::vector &dims); }; } // namespace lite } // namespace mindspore diff --git a/mindspore/lite/src/ops/flatten.cc b/mindspore/lite/src/ops/flatten.cc index 333351922e..949bf3d92d 100644 --- a/mindspore/lite/src/ops/flatten.cc +++ b/mindspore/lite/src/ops/flatten.cc @@ -51,10 +51,39 @@ int Flatten::InferShape(std::vector inputs_, std::vector &inputs) { - this->primitive_ = new (schema::PrimitiveT); - auto attr = std::make_unique(); - this->primitive_->value.type = schema::PrimitiveType_Flatten; - this->primitive_->value.value = attr.release(); + if (this->primitive_ == nullptr) { + this->primitive_ = new (std::nothrow) schema::PrimitiveT; + if (this->primitive_ == nullptr) { + MS_LOG(ERROR) << "new primitiveT failed"; + return RET_ERROR; + } + this->primitive_->value.type = schema::PrimitiveType_Flatten; + } + if (this->primitive_->value.type != schema::PrimitiveType_Flatten) { + MS_LOG(ERROR) << "Primitive type is error :" << this->primitive_->value.type; + return RET_ERROR; + } + if (this->primitive_->value.value == nullptr) { + auto attr = new (std::nothrow) schema::FlattenT(); + if (attr == nullptr) { + MS_LOG(ERROR) << "new primitiveT value failed"; + return RET_ERROR; + } + this->primitive_->value.value = attr; + if (this->primitive_->value.value == nullptr) { + MS_LOG(ERROR) << "primitive value is nullptr"; + return RET_ERROR; + } + } + return RET_OK; +} +#else +int Flatten::UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) { + MS_ASSERT(nullptr != primitive); + MS_ASSERT(nullptr != fbb); + auto val_offset = schema::CreateFlatten(*fbb); + auto prim_offset = schema::CreatePrimitive(*fbb, schema::PrimitiveType_Flatten, val_offset.o); + fbb->Finish(prim_offset); return RET_OK; } #endif diff --git a/mindspore/lite/src/ops/flatten.h b/mindspore/lite/src/ops/flatten.h index d7ab3baa75..ce60608cff 100644 --- a/mindspore/lite/src/ops/flatten.h +++ b/mindspore/lite/src/ops/flatten.h @@ -28,14 +28,16 @@ namespace lite { class Flatten : public PrimitiveC { public: #ifdef PRIMITIVE_WRITEABLE + MS_DECLARE_PARENT(Flatten, PrimitiveC); Flatten() = default; explicit Flatten(schema::PrimitiveT *primitive) : PrimitiveC(primitive) {} + int UnPackAttr(const Primitive &prim, const std::vector &inputs) override; #else - explicit Flatten(schema::Primitive *primitive) : PrimitiveC(primitive) {} + Flatten() = default; + + int UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) override; #endif int InferShape(std::vector inputs_, std::vector outputs_) override; - - int UnPackAttr(const Primitive 
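Note: Flatten::UnPackAttr above (like Dequant earlier in this patch) replaces the old unconditional "new (schema::PrimitiveT)" with a lazy, failure-checked initialization: allocate the PrimitiveT only if absent, verify the type tag, then allocate the value table only if absent. A condensed sketch of that shape, using a hypothetical schema::FooT; the post-assignment null re-check seen in the Dequant and Flatten hunks is redundant once attr has been verified non-null, so it is dropped here:

int Foo::UnPackAttr(const Primitive &prim, const std::vector<AnfNodePtr> &inputs) {
  if (this->primitive_ == nullptr) {
    this->primitive_ = new (std::nothrow) schema::PrimitiveT;
    if (this->primitive_ == nullptr) {
      MS_LOG(ERROR) << "new primitiveT failed";
      return RET_ERROR;
    }
    this->primitive_->value.type = schema::PrimitiveType_Foo;
  }
  if (this->primitive_->value.type != schema::PrimitiveType_Foo) {
    MS_LOG(ERROR) << "primitive_ type is error:" << this->primitive_->value.type;
    return RET_ERROR;
  }
  if (this->primitive_->value.value == nullptr) {
    auto attr = new (std::nothrow) schema::FooT();
    if (attr == nullptr) {
      MS_LOG(ERROR) << "new FooT failed";
      return RET_ERROR;
    }
    this->primitive_->value.value = attr;
  }
  return RET_OK;
}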
&prim, const std::vector &inputs); }; } // namespace lite } // namespace mindspore diff --git a/mindspore/lite/src/ops/floor.cc b/mindspore/lite/src/ops/floor.cc new file mode 100644 index 0000000000..d284b102d9 --- /dev/null +++ b/mindspore/lite/src/ops/floor.cc @@ -0,0 +1,34 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "src/ops/floor.h" + +namespace mindspore { +namespace lite { +#ifndef PRIMITIVE_WRITEABLE + +int Floor::UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) { + MS_ASSERT(nullptr != primitive); + MS_ASSERT(nullptr != fbb); + auto val_offset = schema::CreateFloor(*fbb); + auto prim_offset = schema::CreatePrimitive(*fbb, schema::PrimitiveType_Floor, val_offset.o); + fbb->Finish(prim_offset); + return RET_OK; +} + +#endif +} // namespace lite +} // namespace mindspore diff --git a/mindspore/lite/src/ops/floor.h b/mindspore/lite/src/ops/floor.h index 5fc010249f..f970218cc4 100644 --- a/mindspore/lite/src/ops/floor.h +++ b/mindspore/lite/src/ops/floor.h @@ -21,17 +21,20 @@ #include #include #include "ir/dtype/type_id.h" -#include "src/ops/primitive_c.h" +#include "src/ops/arithmetic_self.h" namespace mindspore { namespace lite { class Floor : public ArithmeticSelf { public: #ifdef PRIMITIVE_WRITEABLE + MS_DECLARE_PARENT(Floor, ArithmeticSelf); Floor() = default; explicit Floor(schema::PrimitiveT *primitive) : ArithmeticSelf(primitive) {} #else - explicit Floor(schema::Primitive *primitive) : ArithmeticSelf(primitive) {} + Floor() = default; + + int UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) override; #endif }; } // namespace lite diff --git a/mindspore/lite/src/ops/floor_div.cc b/mindspore/lite/src/ops/floor_div.cc new file mode 100644 index 0000000000..0aa4610d3e --- /dev/null +++ b/mindspore/lite/src/ops/floor_div.cc @@ -0,0 +1,34 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#include "src/ops/floor_div.h"
+
+namespace mindspore {
+namespace lite {
+#ifndef PRIMITIVE_WRITEABLE
+
+int FloorDiv::UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) {
+  MS_ASSERT(nullptr != primitive);
+  MS_ASSERT(nullptr != fbb);
+  auto val_offset = schema::CreateFloorDiv(*fbb);
+  auto prim_offset = schema::CreatePrimitive(*fbb, schema::PrimitiveType_FloorDiv, val_offset.o);
+  fbb->Finish(prim_offset);
+  return RET_OK;
+}
+
+#endif
+}  // namespace lite
+}  // namespace mindspore
diff --git a/mindspore/lite/src/ops/floor_div.h b/mindspore/lite/src/ops/floor_div.h
index a95c1f709b..5525218708 100644
--- a/mindspore/lite/src/ops/floor_div.h
+++ b/mindspore/lite/src/ops/floor_div.h
@@ -28,10 +28,13 @@ namespace lite {
 class FloorDiv : public Arithmetic {
  public:
 #ifdef PRIMITIVE_WRITEABLE
+  MS_DECLARE_PARENT(FloorDiv, Arithmetic);
   FloorDiv() = default;
   explicit FloorDiv(schema::PrimitiveT *primitive) : Arithmetic(primitive) {}
 #else
-  explicit FloorDiv(schema::Primitive *primitive) : Arithmetic(primitive) {}
+  FloorDiv() = default;
+
+  int UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) override;
 #endif
 };
 }  // namespace lite
diff --git a/mindspore/lite/src/ops/floor_mod.cc b/mindspore/lite/src/ops/floor_mod.cc
new file mode 100644
index 0000000000..f903620655
--- /dev/null
+++ b/mindspore/lite/src/ops/floor_mod.cc
@@ -0,0 +1,34 @@
+/**
+ * Copyright 2019-2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +#include "src/ops/floor_mod.h" + +namespace mindspore { +namespace lite { +#ifndef PRIMITIVE_WRITEABLE + +int FloorMod::UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) { + MS_ASSERT(nullptr != primitive); + MS_ASSERT(nullptr != fbb); + auto val_offset = schema::CreateFloorMod(*fbb); + auto prim_offset = schema::CreatePrimitive(*fbb, schema::PrimitiveType_FloorMod, val_offset.o); + fbb->Finish(prim_offset); + return RET_OK; +} + +#endif +} // namespace lite +} // namespace mindspore diff --git a/mindspore/lite/src/ops/floor_mod.h b/mindspore/lite/src/ops/floor_mod.h index f20eb6cc49..adbca5e52c 100644 --- a/mindspore/lite/src/ops/floor_mod.h +++ b/mindspore/lite/src/ops/floor_mod.h @@ -28,10 +28,13 @@ namespace lite { class FloorMod : public Arithmetic { public: #ifdef PRIMITIVE_WRITEABLE + MS_DECLARE_PARENT(FloorMod, Arithmetic); FloorMod() = default; explicit FloorMod(schema::PrimitiveT *primitive) : Arithmetic(primitive) {} #else - explicit FloorMod(schema::Primitive *primitive) : Arithmetic(primitive) {} + FloorMod() = default; + + int UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) override; #endif }; } // namespace lite diff --git a/mindspore/lite/src/ops/full_connection.cc b/mindspore/lite/src/ops/full_connection.cc index 35d4c505fa..4be4e45b4b 100644 --- a/mindspore/lite/src/ops/full_connection.cc +++ b/mindspore/lite/src/ops/full_connection.cc @@ -31,16 +31,26 @@ void FullConnection::SetActivationType(int activationType) { this->primitive_->value.AsFullConnection()->activationType = (schema::ActivationType)activationType; } #else +int FullConnection::UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) { + MS_ASSERT(nullptr != primitive); + MS_ASSERT(nullptr != fbb); + auto attr = primitive->value_as_FullConnection(); + if (attr == nullptr) { + MS_LOG(ERROR) << "value_as_FullConnection return nullptr"; + return RET_ERROR; + } + auto val_offset = + schema::CreateFullConnection(*fbb, attr->hasBias(), attr->axis(), attr->useAxis(), attr->activationType()); + auto prim_offset = schema::CreatePrimitive(*fbb, schema::PrimitiveType_FullConnection, val_offset.o); + fbb->Finish(prim_offset); + return RET_OK; +} bool FullConnection::GetHasBias() const { return this->primitive_->value_as_FullConnection()->hasBias(); } int FullConnection::GetAxis() const { return this->primitive_->value_as_FullConnection()->axis(); } bool FullConnection::GetUseAxis() const { return this->primitive_->value_as_FullConnection()->useAxis(); } int FullConnection::GetActivationType() const { return this->primitive_->value_as_FullConnection()->activationType(); } -void FullConnection::SetHasBias(bool has_bias) {} -void FullConnection::SetAxis(int axis) {} -void FullConnection::SetUseAxis(bool use_axis) {} -void FullConnection::SetActivationType(int activationType) {} #endif int FullConnection::InferShape(std::vector inputs_, std::vector outputs_) { diff --git a/mindspore/lite/src/ops/full_connection.h b/mindspore/lite/src/ops/full_connection.h index 7bcb9b1166..c4d5e980ff 100644 --- a/mindspore/lite/src/ops/full_connection.h +++ b/mindspore/lite/src/ops/full_connection.h @@ -28,20 +28,23 @@ namespace lite { class FullConnection : public PrimitiveC { public: #ifdef PRIMITIVE_WRITEABLE + MS_DECLARE_PARENT(FullConnection, PrimitiveC); FullConnection() = default; explicit FullConnection(schema::PrimitiveT *primitive) : PrimitiveC(primitive) {} + void SetHasBias(bool has_bias); + void 
SetAxis(int axis); + void SetUseAxis(bool use_axis); + void SetActivationType(int activationType); #else - explicit FullConnection(schema::Primitive *primitive) : PrimitiveC(primitive) {} + FullConnection() = default; + + int UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) override; #endif int InferShape(std::vector inputs_, std::vector outputs_) override; bool GetHasBias() const; int GetAxis() const; bool GetUseAxis() const; int GetActivationType() const; - void SetHasBias(bool has_bias); - void SetAxis(int axis); - void SetUseAxis(bool use_axis); - void SetActivationType(int activationType); }; } // namespace lite } // namespace mindspore diff --git a/mindspore/lite/src/ops/fused_batchnorm.cc b/mindspore/lite/src/ops/fused_batchnorm.cc index ecb7e0c1fe..a5d3a31929 100644 --- a/mindspore/lite/src/ops/fused_batchnorm.cc +++ b/mindspore/lite/src/ops/fused_batchnorm.cc @@ -28,14 +28,24 @@ void FusedBatchNorm::SetMomentum(float momentum) { this->primitive_->value.AsFus void FusedBatchNorm::SetSpatial(int spatial) { this->primitive_->value.AsFusedBatchNorm()->spatial = spatial; } #else +int FusedBatchNorm::UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) { + MS_ASSERT(nullptr != primitive); + MS_ASSERT(nullptr != fbb); + auto attr = primitive->value_as_FusedBatchNorm(); + if (attr == nullptr) { + MS_LOG(ERROR) << "value_as_FusedBatchNorm return nullptr"; + return RET_ERROR; + } + auto val_offset = schema::CreateFusedBatchNorm(*fbb, attr->epsilon(), attr->momentum(), attr->spatial()); + auto prim_offset = schema::CreatePrimitive(*fbb, schema::PrimitiveType_FusedBatchNorm, val_offset.o); + fbb->Finish(prim_offset); + return RET_OK; +} float FusedBatchNorm::GetEpsilon() const { return this->primitive_->value_as_FusedBatchNorm()->epsilon(); } float FusedBatchNorm::GetMomentum() const { return this->primitive_->value_as_FusedBatchNorm()->momentum(); } int FusedBatchNorm::GetSpatial() const { return this->primitive_->value_as_FusedBatchNorm()->spatial(); } -void FusedBatchNorm::SetEpsilon(float epsilon) {} -void FusedBatchNorm::SetMomentum(float momentum) {} -void FusedBatchNorm::SetSpatial(int spatial) {} #endif } // namespace lite } // namespace mindspore diff --git a/mindspore/lite/src/ops/fused_batchnorm.h b/mindspore/lite/src/ops/fused_batchnorm.h index 729cc934c6..b95314def8 100644 --- a/mindspore/lite/src/ops/fused_batchnorm.h +++ b/mindspore/lite/src/ops/fused_batchnorm.h @@ -28,17 +28,20 @@ namespace lite { class FusedBatchNorm : public PrimitiveC { public: #ifdef PRIMITIVE_WRITEABLE + MS_DECLARE_PARENT(FusedBatchNorm, PrimitiveC); FusedBatchNorm() = default; explicit FusedBatchNorm(schema::PrimitiveT *primitive) : PrimitiveC(primitive) {} + void SetEpsilon(float epsilon); + void SetMomentum(float momentum); + void SetSpatial(int spatial); #else - explicit FusedBatchNorm(schema::Primitive *primitive) : PrimitiveC(primitive) {} + FusedBatchNorm() = default; + + int UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) override; #endif float GetEpsilon() const; float GetMomentum() const; int GetSpatial() const; - void SetEpsilon(float epsilon); - void SetMomentum(float momentum); - void SetSpatial(int spatial); }; } // namespace lite } // namespace mindspore diff --git a/mindspore/lite/src/ops/gather.cc b/mindspore/lite/src/ops/gather.cc index d4546da058..4d48933d19 100644 --- a/mindspore/lite/src/ops/gather.cc +++ b/mindspore/lite/src/ops/gather.cc @@ -29,12 +29,23 
@@ void Gather::SetAxis(int axis) { this->primitive_->value.AsGather()->axis = axis void Gather::SetBatchDims(int batch_dims) { this->primitive_->value.AsGather()->batchDims = batch_dims; } #else +int Gather::UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) { + MS_ASSERT(nullptr != primitive); + MS_ASSERT(nullptr != fbb); + auto attr = primitive->value_as_Gather(); + if (attr == nullptr) { + MS_LOG(ERROR) << "value_as_Gather return nullptr"; + return RET_ERROR; + } + auto val_offset = schema::CreateGather(*fbb, attr->axis(), attr->batchDims()); + auto prim_offset = schema::CreatePrimitive(*fbb, schema::PrimitiveType_Gather, val_offset.o); + fbb->Finish(prim_offset); + return RET_OK; +} int Gather::GetAxis() const { return this->primitive_->value_as_Gather()->axis(); } int Gather::GetBatchDims() const { return this->primitive_->value_as_Gather()->batchDims(); } -void Gather::SetAxis(int axis) {} -void Gather::SetBatchDims(int batch_dims) {} #endif int Gather::InferShape(std::vector inputs_, std::vector outputs_) { @@ -67,10 +78,6 @@ int Gather::InferShape(std::vector inputs_, std::vectorshape(); int indices_rank = indices_shape.size(); - if (indices_rank < batch_dims + 1) { - MS_LOG(ERROR) << "input[1]'s rank is less than batchDim + 1"; - return RET_ERROR; - } if (batch_dims != 0) { MS_LOG(ERROR) << "batchDims " << batch_dims << " != 0, which is not support"; return RET_ERROR; diff --git a/mindspore/lite/src/ops/gather.h b/mindspore/lite/src/ops/gather.h index 0006b190eb..088a736efa 100644 --- a/mindspore/lite/src/ops/gather.h +++ b/mindspore/lite/src/ops/gather.h @@ -28,16 +28,19 @@ namespace lite { class Gather : public PrimitiveC { public: #ifdef PRIMITIVE_WRITEABLE + MS_DECLARE_PARENT(Gather, PrimitiveC); Gather() = default; explicit Gather(schema::PrimitiveT *primitive) : PrimitiveC(primitive) {} + void SetAxis(int axis); + void SetBatchDims(int batch_dims); #else - explicit Gather(schema::Primitive *primitive) : PrimitiveC(primitive) {} + Gather() = default; + + int UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) override; #endif int InferShape(std::vector inputs_, std::vector outputs_) override; int GetAxis() const; int GetBatchDims() const; - void SetAxis(int axis); - void SetBatchDims(int batch_dims); }; } // namespace lite } // namespace mindspore diff --git a/mindspore/lite/src/ops/gather_nd.cc b/mindspore/lite/src/ops/gather_nd.cc index e5da4346cb..e88e913339 100644 --- a/mindspore/lite/src/ops/gather_nd.cc +++ b/mindspore/lite/src/ops/gather_nd.cc @@ -24,10 +24,22 @@ int GatherNd::GetBatchDims() const { return this->primitive_->value.AsGatherNd() void GatherNd::SetBatchDims(int batch_dims) { this->primitive_->value.AsGatherNd()->batchDims = batch_dims; } #else +int GatherNd::UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) { + MS_ASSERT(nullptr != primitive); + MS_ASSERT(nullptr != fbb); + auto attr = primitive->value_as_GatherNd(); + if (attr == nullptr) { + MS_LOG(ERROR) << "value_as_GatherNd return nullptr"; + return RET_ERROR; + } + auto val_offset = schema::CreateGatherNd(*fbb, attr->batchDims()); + auto prim_offset = schema::CreatePrimitive(*fbb, schema::PrimitiveType_GatherNd, val_offset.o); + fbb->Finish(prim_offset); + return RET_OK; +} int GatherNd::GetBatchDims() const { return this->primitive_->value_as_GatherNd()->batchDims(); } -void GatherNd::SetBatchDims(int batch_dims) {} #endif int GatherNd::InferShape(std::vector inputs_, 
std::vector outputs_) { diff --git a/mindspore/lite/src/ops/gather_nd.h b/mindspore/lite/src/ops/gather_nd.h index 7f0b1a7937..f578b55ae4 100644 --- a/mindspore/lite/src/ops/gather_nd.h +++ b/mindspore/lite/src/ops/gather_nd.h @@ -28,14 +28,18 @@ namespace lite { class GatherNd : public PrimitiveC { public: #ifdef PRIMITIVE_WRITEABLE + MS_DECLARE_PARENT(GatherNd, PrimitiveC); GatherNd() = default; explicit GatherNd(schema::PrimitiveT *primitive) : PrimitiveC(primitive) {} + void SetBatchDims(int batch_dims); + #else - explicit GatherNd(schema::Primitive *primitive) : PrimitiveC(primitive) {} + GatherNd() = default; + + int UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) override; #endif int InferShape(std::vector inputs_, std::vector outputs_) override; int GetBatchDims() const; - void SetBatchDims(int batch_dims); }; } // namespace lite } // namespace mindspore diff --git a/mindspore/lite/src/ops/greater.cc b/mindspore/lite/src/ops/greater.cc new file mode 100644 index 0000000000..bd92f1a1b1 --- /dev/null +++ b/mindspore/lite/src/ops/greater.cc @@ -0,0 +1,33 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "src/ops/greater.h" + +namespace mindspore { +namespace lite { +#ifndef PRIMITIVE_WRITEABLE + +int Greater::UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) { + MS_ASSERT(nullptr != primitive); + MS_ASSERT(nullptr != fbb); + auto val_offset = schema::CreateGreater(*fbb); + auto prim_offset = schema::CreatePrimitive(*fbb, schema::PrimitiveType_Greater, val_offset.o); + fbb->Finish(prim_offset); + return RET_OK; +} +#endif +} // namespace lite +} // namespace mindspore diff --git a/mindspore/lite/src/ops/greater.h b/mindspore/lite/src/ops/greater.h index 3547efbe74..611025152d 100644 --- a/mindspore/lite/src/ops/greater.h +++ b/mindspore/lite/src/ops/greater.h @@ -27,10 +27,13 @@ namespace lite { class Greater : public Arithmetic { public: #ifdef PRIMITIVE_WRITEABLE + MS_DECLARE_PARENT(Greater, Arithmetic); Greater() = default; explicit Greater(schema::PrimitiveT *primitive) : Arithmetic(primitive) {} #else - explicit Greater(schema::Primitive *primitive) : Arithmetic(primitive) {} + Greater() = default; + + int UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) override; #endif }; } // namespace lite diff --git a/mindspore/lite/src/ops/greater_equal.cc b/mindspore/lite/src/ops/greater_equal.cc new file mode 100644 index 0000000000..bd2e5b1c45 --- /dev/null +++ b/mindspore/lite/src/ops/greater_equal.cc @@ -0,0 +1,32 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "src/ops/greater_equal.h"
+
+namespace mindspore {
+namespace lite {
+#ifndef PRIMITIVE_WRITEABLE
+int GreaterEqual::UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) {
+  MS_ASSERT(nullptr != primitive);
+  MS_ASSERT(nullptr != fbb);
+  auto val_offset = schema::CreateGreaterEqual(*fbb);
+  auto prim_offset = schema::CreatePrimitive(*fbb, schema::PrimitiveType_GreaterEqual, val_offset.o);
+  fbb->Finish(prim_offset);
+  return RET_OK;
+}
+#endif
+}  // namespace lite
+}  // namespace mindspore
diff --git a/mindspore/lite/src/ops/greater_equal.h b/mindspore/lite/src/ops/greater_equal.h
index 5b97a43c9b..c6a6001764 100644
--- a/mindspore/lite/src/ops/greater_equal.h
+++ b/mindspore/lite/src/ops/greater_equal.h
@@ -28,10 +28,13 @@ namespace lite {
 class GreaterEqual : public Arithmetic {
  public:
 #ifdef PRIMITIVE_WRITEABLE
+  MS_DECLARE_PARENT(GreaterEqual, Arithmetic);
   GreaterEqual() = default;
   explicit GreaterEqual(schema::PrimitiveT *primitive) : Arithmetic(primitive) {}
 #else
-  explicit GreaterEqual(schema::Primitive *primitive) : Arithmetic(primitive) {}
+  GreaterEqual() = default;
+
+  int UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) override;
 #endif
 };
 }  // namespace lite
diff --git a/mindspore/lite/src/ops/l2_norm.cc b/mindspore/lite/src/ops/l2_norm.cc
index 15bbb6713a..4c8431ae68 100644
--- a/mindspore/lite/src/ops/l2_norm.cc
+++ b/mindspore/lite/src/ops/l2_norm.cc
@@ -26,15 +26,32 @@ void L2Norm::SetAxis(const std::vector &axis) { this->primitive_->value.AsL
 void L2Norm::SetEpsilon(float epsilon) { this->primitive_->value.AsL2Norm()->epsilon = epsilon; }
 #else
+int L2Norm::UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) {
+  MS_ASSERT(nullptr != primitive);
+  MS_ASSERT(nullptr != fbb);
+  auto attr = primitive->value_as_L2Norm();
+  if (attr == nullptr) {
+    MS_LOG(ERROR) << "value_as_L2Norm return nullptr";
+    return RET_ERROR;
+  }
+  std::vector axis;
+  if (attr->axis() != nullptr) {
+    for (int i = 0; i < static_cast(attr->axis()->size()); i++) {
+      axis.push_back(attr->axis()->data()[i]);
+    }
+  }
+  auto val_offset = schema::CreateL2NormDirect(*fbb, &axis, attr->epsilon());
+  auto prim_offset = schema::CreatePrimitive(*fbb, schema::PrimitiveType_L2Norm, val_offset.o);
+  fbb->Finish(prim_offset);
+  return RET_OK;
+}
 std::vector L2Norm::GetAxis() const {
   auto fb_vector = this->primitive_->value_as_L2Norm()->axis();
   return std::vector(fb_vector->begin(), fb_vector->end());
 }
 float L2Norm::GetEpsilon() const { return this->primitive_->value_as_L2Norm()->epsilon(); }
-void L2Norm::SetAxis(const std::vector &axis) {}
-void L2Norm::SetEpsilon(float epsilon) {}
 #endif
 }  // namespace lite
 }  // namespace mindspore
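L2Norm above (and Mean later in this patch) carries a repeated attribute, so its override uses the generated *Direct helper: the axis values are first copied out of the read-only buffer into a std::vector, and CreateL2NormDirect serializes that vector into the new buffer in one call. A condensed sketch of the same copy, assuming int32_t for the element type that the extracted text has lost:

// Sketch only: flatbuffers::Vector exposes begin()/end(), so the loop above
// can be written as a single assign; CreateL2NormDirect serializes the copy.
std::vector<int32_t> axis;
if (attr->axis() != nullptr) {
  axis.assign(attr->axis()->begin(), attr->axis()->end());
}
auto val_offset = schema::CreateL2NormDirect(*fbb, &axis, attr->epsilon());
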
diff --git a/mindspore/lite/src/ops/l2_norm.h b/mindspore/lite/src/ops/l2_norm.h
index a54b0e91b3..e44579d574 100644
--- a/mindspore/lite/src/ops/l2_norm.h
+++ b/mindspore/lite/src/ops/l2_norm.h
@@ -20,6 +20,7 @@
 #include
 #include
 #include
+#include
 #include "ir/dtype/type_id.h"
 #include "src/ops/primitive_c.h"
@@ -28,15 +29,18 @@ namespace lite {
 class L2Norm : public PrimitiveC {
  public:
 #ifdef PRIMITIVE_WRITEABLE
+  MS_DECLARE_PARENT(L2Norm, PrimitiveC);
   L2Norm() = default;
   explicit L2Norm(schema::PrimitiveT *primitive) : PrimitiveC(primitive) {}
+  void SetAxis(const std::vector &axis);
+  void SetEpsilon(float epsilon);
 #else
-  explicit L2Norm(schema::Primitive *primitive) : PrimitiveC(primitive) {}
+  L2Norm() = default;
+
+  int UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) override;
 #endif
   std::vector GetAxis() const;
   float GetEpsilon() const;
-  void SetAxis(const std::vector &axis);
-  void SetEpsilon(float epsilon);
 };
 }  // namespace lite
 }  // namespace mindspore
diff --git a/mindspore/lite/src/ops/leaky_relu.cc b/mindspore/lite/src/ops/leaky_relu.cc
index f7d2a4bf11..7164372f6b 100644
--- a/mindspore/lite/src/ops/leaky_relu.cc
+++ b/mindspore/lite/src/ops/leaky_relu.cc
@@ -29,7 +29,19 @@ void LeakyReLU::SetNegativeSlope(float negative_slope) {
 float LeakyReLU::GetNegativeSlope() const { return this->primitive_->value_as_LeakyReLU()->negativeSlope(); }
-void LeakyReLU::SetNegativeSlope(float negative_slope) {}
+int LeakyReLU::UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) {
+  MS_ASSERT(nullptr != primitive);
+  MS_ASSERT(nullptr != fbb);
+  auto attr = primitive->value_as_LeakyReLU();
+  if (attr == nullptr) {
+    MS_LOG(ERROR) << "value_as_LeakyReLU return nullptr";
+    return RET_ERROR;
+  }
+  auto val_offset = schema::CreateLeakyReLU(*fbb, attr->negativeSlope());
+  auto prim_offset = schema::CreatePrimitive(*fbb, schema::PrimitiveType_LeakyReLU, val_offset.o);
+  fbb->Finish(prim_offset);
+  return RET_OK;
+}
 #endif
 }  // namespace lite
 }  // namespace mindspore
diff --git a/mindspore/lite/src/ops/leaky_relu.h b/mindspore/lite/src/ops/leaky_relu.h
index 1e021f35ba..f72b75516a 100644
--- a/mindspore/lite/src/ops/leaky_relu.h
+++ b/mindspore/lite/src/ops/leaky_relu.h
@@ -28,13 +28,17 @@ namespace lite {
 class LeakyReLU : public PrimitiveC {
  public:
 #ifdef PRIMITIVE_WRITEABLE
+  MS_DECLARE_PARENT(LeakyReLU, PrimitiveC);
   LeakyReLU() = default;
   explicit LeakyReLU(schema::PrimitiveT *primitive) : PrimitiveC(primitive) {}
+  void SetNegativeSlope(float negative_slope);
+
 #else
-  explicit LeakyReLU(schema::Primitive *primitive) : PrimitiveC(primitive) {}
+  LeakyReLU() = default;
+
+  int UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) override;
 #endif
   float GetNegativeSlope() const;
-  void SetNegativeSlope(float negative_slope);
 };
 }  // namespace lite
 }  // namespace mindspore
diff --git a/mindspore/lite/src/ops/caffe_p_relu.cc b/mindspore/lite/src/ops/less.cc
similarity index 62%
rename from mindspore/lite/src/ops/caffe_p_relu.cc
rename to mindspore/lite/src/ops/less.cc
index c7a74cc18e..57a98d87a9 100644
--- a/mindspore/lite/src/ops/caffe_p_relu.cc
+++ b/mindspore/lite/src/ops/less.cc
@@ -14,22 +14,21 @@
  * limitations under the License.
*/ -#include "src/ops/caffe_p_relu.h" +#include "src/ops/less.h" namespace mindspore { namespace lite { #ifdef PRIMITIVE_WRITEABLE -bool CaffePReLU::GetChannelShared() const { return this->primitive_->value.AsCaffePReLU()->channelShared; } - -void CaffePReLU::SetChannelShared(bool channel_shared) { - this->primitive_->value.AsCaffePReLU()->channelShared = channel_shared; -} - #else +int Less::UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) { + MS_ASSERT(nullptr != primitive); + MS_ASSERT(nullptr != fbb); -bool CaffePReLU::GetChannelShared() const { return this->primitive_->value_as_CaffePReLU()->channelShared(); } - -void CaffePReLU::SetChannelShared(bool channel_shared) {} + auto val_offset = schema::CreateLess(*fbb); + auto prim_offset = schema::CreatePrimitive(*fbb, schema::PrimitiveType_Less, val_offset.o); + fbb->Finish(prim_offset); + return RET_OK; +} #endif } // namespace lite } // namespace mindspore diff --git a/mindspore/lite/src/ops/less.h b/mindspore/lite/src/ops/less.h index d0205905e4..e230f89412 100644 --- a/mindspore/lite/src/ops/less.h +++ b/mindspore/lite/src/ops/less.h @@ -28,10 +28,13 @@ namespace lite { class Less : public Arithmetic { public: #ifdef PRIMITIVE_WRITEABLE + MS_DECLARE_PARENT(Less, Arithmetic); Less() = default; explicit Less(schema::PrimitiveT *primitive) : Arithmetic(primitive) {} #else - explicit Less(schema::Primitive *primitive) : Arithmetic(primitive) {} + Less() = default; + + int UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) override; #endif }; } // namespace lite diff --git a/mindspore/lite/src/ops/less_equal.cc b/mindspore/lite/src/ops/less_equal.cc new file mode 100644 index 0000000000..7274f8cc22 --- /dev/null +++ b/mindspore/lite/src/ops/less_equal.cc @@ -0,0 +1,33 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "src/ops/less_equal.h" + +namespace mindspore { +namespace lite { +#ifdef PRIMITIVE_WRITEABLE +#else +int LessEqual::UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) { + MS_ASSERT(nullptr != primitive); + MS_ASSERT(nullptr != fbb); + auto val_offset = schema::CreateLessEqual(*fbb); + auto prim_offset = schema::CreatePrimitive(*fbb, schema::PrimitiveType_LessEqual, val_offset.o); + fbb->Finish(prim_offset); + return RET_OK; +} +#endif +} // namespace lite +} // namespace mindspore diff --git a/mindspore/lite/src/ops/less_equal.h b/mindspore/lite/src/ops/less_equal.h index c1e96ecb99..43a906713d 100644 --- a/mindspore/lite/src/ops/less_equal.h +++ b/mindspore/lite/src/ops/less_equal.h @@ -28,10 +28,13 @@ namespace lite { class LessEqual : public Arithmetic { public: #ifdef PRIMITIVE_WRITEABLE + MS_DECLARE_PARENT(LessEqual, Arithmetic); LessEqual() = default; explicit LessEqual(schema::PrimitiveT *primitive) : Arithmetic(primitive) {} #else - explicit LessEqual(schema::Primitive *primitive) : Arithmetic(primitive) {} + LessEqual() = default; + + int UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) override; #endif }; } // namespace lite diff --git a/mindspore/lite/src/ops/local_response_normalization.cc b/mindspore/lite/src/ops/local_response_normalization.cc index 891f165e6e..518567d785 100644 --- a/mindspore/lite/src/ops/local_response_normalization.cc +++ b/mindspore/lite/src/ops/local_response_normalization.cc @@ -60,10 +60,22 @@ float LocalResponseNormalization::GetBeta() const { return this->primitive_->value_as_LocalResponseNormalization()->beta(); } -void LocalResponseNormalization::SetDepthRadius(int depth_radius) {} -void LocalResponseNormalization::SetBias(float bias) {} -void LocalResponseNormalization::SetAlpha(float alpha) {} -void LocalResponseNormalization::SetBeta(float beta) {} +int LocalResponseNormalization::UnPackToFlatBuilder(const schema::Primitive *primitive, + flatbuffers::FlatBufferBuilder *fbb) { + MS_ASSERT(nullptr != primitive); + MS_ASSERT(nullptr != fbb); + auto attr = primitive->value_as_LocalResponseNormalization(); + if (attr == nullptr) { + MS_LOG(ERROR) << "value_as_LocalResponseNormalization return nullptr"; + return RET_ERROR; + } + auto val_offset = + schema::CreateLocalResponseNormalization(*fbb, attr->depth_radius(), attr->bias(), attr->alpha(), attr->beta()); + auto prim_offset = schema::CreatePrimitive(*fbb, schema::PrimitiveType_LocalResponseNormalization, val_offset.o); + fbb->Finish(prim_offset); + return RET_OK; +} + #endif } // namespace lite } // namespace mindspore diff --git a/mindspore/lite/src/ops/local_response_normalization.h b/mindspore/lite/src/ops/local_response_normalization.h index 67557c5147..108e0c8c83 100644 --- a/mindspore/lite/src/ops/local_response_normalization.h +++ b/mindspore/lite/src/ops/local_response_normalization.h @@ -28,19 +28,22 @@ namespace lite { class LocalResponseNormalization : public PrimitiveC { public: #ifdef PRIMITIVE_WRITEABLE + MS_DECLARE_PARENT(LocalResponseNormalization, PrimitiveC); LocalResponseNormalization() = default; explicit LocalResponseNormalization(schema::PrimitiveT *primitive) : PrimitiveC(primitive) {} + void SetDepthRadius(int depth_radius); + void SetBias(float bias); + void SetAlpha(float alpha); + void SetBeta(float beta); #else - explicit LocalResponseNormalization(schema::Primitive *primitive) : PrimitiveC(primitive) {} + LocalResponseNormalization() = default; + + int 
UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) override; #endif int GetDepthRadius() const; float GetBias() const; float GetAlpha() const; float GetBeta() const; - void SetDepthRadius(int depth_radius); - void SetBias(float bias); - void SetAlpha(float alpha); - void SetBeta(float beta); }; } // namespace lite } // namespace mindspore diff --git a/mindspore/lite/src/ops/log.cc b/mindspore/lite/src/ops/log.cc new file mode 100644 index 0000000000..f35ec426a4 --- /dev/null +++ b/mindspore/lite/src/ops/log.cc @@ -0,0 +1,33 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "src/ops/log.h" + +namespace mindspore { +namespace lite { +#ifdef PRIMITIVE_WRITEABLE +#else +int Log::UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) { + MS_ASSERT(nullptr != primitive); + MS_ASSERT(nullptr != fbb); + auto val_offset = schema::CreateLog(*fbb); + auto prim_offset = schema::CreatePrimitive(*fbb, schema::PrimitiveType_Log, val_offset.o); + fbb->Finish(prim_offset); + return RET_OK; +} +#endif +} // namespace lite +} // namespace mindspore diff --git a/mindspore/lite/src/ops/log.h b/mindspore/lite/src/ops/log.h index 0243af016a..1bbac2eba5 100644 --- a/mindspore/lite/src/ops/log.h +++ b/mindspore/lite/src/ops/log.h @@ -28,10 +28,13 @@ namespace lite { class Log : public ArithmeticSelf { public: #ifdef PRIMITIVE_WRITEABLE + MS_DECLARE_PARENT(Log, ArithmeticSelf); Log() = default; explicit Log(schema::PrimitiveT *primitive) : ArithmeticSelf(primitive) {} #else - explicit Log(schema::Primitive *primitive) : ArithmeticSelf(primitive) {} + Log() = default; + + int UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) override; #endif }; } // namespace lite diff --git a/mindspore/lite/src/ops/logical_and.cc b/mindspore/lite/src/ops/logical_and.cc new file mode 100644 index 0000000000..8cc73dfe50 --- /dev/null +++ b/mindspore/lite/src/ops/logical_and.cc @@ -0,0 +1,33 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "src/ops/logical_and.h" + +namespace mindspore { +namespace lite { +#ifdef PRIMITIVE_WRITEABLE +#else +int LogicalAnd::UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) { + MS_ASSERT(nullptr != primitive); + MS_ASSERT(nullptr != fbb); + auto val_offset = schema::CreateLogicalAnd(*fbb); + auto prim_offset = schema::CreatePrimitive(*fbb, schema::PrimitiveType_LogicalAnd, val_offset.o); + fbb->Finish(prim_offset); + return RET_OK; +} +#endif +} // namespace lite +} // namespace mindspore diff --git a/mindspore/lite/src/ops/logical_and.h b/mindspore/lite/src/ops/logical_and.h index a9a6bda890..e323b7e9e8 100644 --- a/mindspore/lite/src/ops/logical_and.h +++ b/mindspore/lite/src/ops/logical_and.h @@ -21,17 +21,20 @@ #include #include #include "ir/dtype/type_id.h" -#include "src/ops/primitive_c.h" +#include "src/ops/arithmetic.h" namespace mindspore { namespace lite { class LogicalAnd : public Arithmetic { public: #ifdef PRIMITIVE_WRITEABLE + MS_DECLARE_PARENT(LogicalAnd, Arithmetic); LogicalAnd() = default; explicit LogicalAnd(schema::PrimitiveT *primitive) : Arithmetic(primitive) {} #else - explicit LogicalAnd(schema::Primitive *primitive) : Arithmetic(primitive) {} + LogicalAnd() = default; + + int UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) override; #endif }; } // namespace lite diff --git a/mindspore/lite/src/ops/logical_not.cc b/mindspore/lite/src/ops/logical_not.cc new file mode 100644 index 0000000000..c67869f932 --- /dev/null +++ b/mindspore/lite/src/ops/logical_not.cc @@ -0,0 +1,33 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "src/ops/logical_not.h" + +namespace mindspore { +namespace lite { +#ifdef PRIMITIVE_WRITEABLE +#else +int LogicalNot::UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) { + MS_ASSERT(nullptr != primitive); + MS_ASSERT(nullptr != fbb); + auto val_offset = schema::CreateLogicalNot(*fbb); + auto prim_offset = schema::CreatePrimitive(*fbb, schema::PrimitiveType_LogicalNot, val_offset.o); + fbb->Finish(prim_offset); + return RET_OK; +} +#endif +} // namespace lite +} // namespace mindspore diff --git a/mindspore/lite/src/ops/logical_not.h b/mindspore/lite/src/ops/logical_not.h index b6fc369f58..69555551b8 100644 --- a/mindspore/lite/src/ops/logical_not.h +++ b/mindspore/lite/src/ops/logical_not.h @@ -21,17 +21,20 @@ #include #include #include "ir/dtype/type_id.h" -#include "src/ops/primitive_c.h" +#include "src/ops/arithmetic_self.h" namespace mindspore { namespace lite { class LogicalNot : public ArithmeticSelf { public: #ifdef PRIMITIVE_WRITEABLE + MS_DECLARE_PARENT(LogicalNot, ArithmeticSelf); LogicalNot() = default; explicit LogicalNot(schema::PrimitiveT *primitive) : ArithmeticSelf(primitive) {} #else - explicit LogicalNot(schema::Primitive *primitive) : ArithmeticSelf(primitive) {} + LogicalNot() = default; + + int UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) override; #endif }; } // namespace lite diff --git a/mindspore/lite/src/ops/logical_or.cc b/mindspore/lite/src/ops/logical_or.cc new file mode 100644 index 0000000000..2d8f73f040 --- /dev/null +++ b/mindspore/lite/src/ops/logical_or.cc @@ -0,0 +1,33 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "src/ops/logical_or.h" + +namespace mindspore { +namespace lite { +#ifdef PRIMITIVE_WRITEABLE +#else +int LogicalOr::UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) { + MS_ASSERT(nullptr != primitive); + MS_ASSERT(nullptr != fbb); + auto val_offset = schema::CreateLogicalOr(*fbb); + auto prim_offset = schema::CreatePrimitive(*fbb, schema::PrimitiveType_LogicalOr, val_offset.o); + fbb->Finish(prim_offset); + return RET_OK; +} +#endif +} // namespace lite +} // namespace mindspore diff --git a/mindspore/lite/src/ops/logical_or.h b/mindspore/lite/src/ops/logical_or.h index 3571dd7086..5afc583e48 100644 --- a/mindspore/lite/src/ops/logical_or.h +++ b/mindspore/lite/src/ops/logical_or.h @@ -21,17 +21,20 @@ #include #include #include "ir/dtype/type_id.h" -#include "src/ops/primitive_c.h" +#include "src/ops/arithmetic.h" namespace mindspore { namespace lite { class LogicalOr : public Arithmetic { public: #ifdef PRIMITIVE_WRITEABLE + MS_DECLARE_PARENT(LogicalOr, Arithmetic); LogicalOr() = default; explicit LogicalOr(schema::PrimitiveT *primitive) : Arithmetic(primitive) {} #else - explicit LogicalOr(schema::Primitive *primitive) : Arithmetic(primitive) {} + LogicalOr() = default; + + int UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) override; #endif }; } // namespace lite diff --git a/mindspore/lite/src/ops/lrn.cc b/mindspore/lite/src/ops/lrn.cc index 859b8728ed..55d7745c9f 100644 --- a/mindspore/lite/src/ops/lrn.cc +++ b/mindspore/lite/src/ops/lrn.cc @@ -36,10 +36,19 @@ float Lrn::GetBeta() const { return this->primitive_->value_as_Lrn()->beta(); } float Lrn::GetBias() const { return this->primitive_->value_as_Lrn()->bias(); } int Lrn::GetSize() const { return this->primitive_->value_as_Lrn()->size(); } -void Lrn::SetAlpha(float alpha) {} -void Lrn::SetBeta(float beta) {} -void Lrn::SetBias(float bias) {} -void Lrn::SetSize(int size) {} +int Lrn::UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) { + MS_ASSERT(nullptr != primitive); + MS_ASSERT(nullptr != fbb); + auto attr = primitive->value_as_Lrn(); + if (attr == nullptr) { + MS_LOG(ERROR) << "value_as_Lrn return nullptr"; + return RET_ERROR; + } + auto val_offset = schema::CreateLrn(*fbb, attr->alpha(), attr->beta(), attr->bias(), attr->size()); + auto prim_offset = schema::CreatePrimitive(*fbb, schema::PrimitiveType_Lrn, val_offset.o); + fbb->Finish(prim_offset); + return RET_OK; +} #endif } // namespace lite } // namespace mindspore diff --git a/mindspore/lite/src/ops/lrn.h b/mindspore/lite/src/ops/lrn.h index 0dd7b17347..d1ccb69cb4 100644 --- a/mindspore/lite/src/ops/lrn.h +++ b/mindspore/lite/src/ops/lrn.h @@ -28,19 +28,22 @@ namespace lite { class Lrn : public PrimitiveC { public: #ifdef PRIMITIVE_WRITEABLE + MS_DECLARE_PARENT(Lrn, PrimitiveC); Lrn() = default; explicit Lrn(schema::PrimitiveT *primitive) : PrimitiveC(primitive) {} + void SetAlpha(float alpha); + void SetBeta(float beta); + void SetBias(float bias); + void SetSize(int size); #else - explicit Lrn(schema::Primitive *primitive) : PrimitiveC(primitive) {} + Lrn() = default; + + int UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) override; #endif float GetAlpha() const; float GetBeta() const; float GetBias() const; int GetSize() const; - void SetAlpha(float alpha); - void SetBeta(float beta); - void SetBias(float bias); - void SetSize(int size); }; } // namespace lite } // namespace 
mindspore diff --git a/mindspore/lite/src/ops/lstm.cc b/mindspore/lite/src/ops/lstm.cc index 7e997c2fef..8548bc846e 100644 --- a/mindspore/lite/src/ops/lstm.cc +++ b/mindspore/lite/src/ops/lstm.cc @@ -26,8 +26,19 @@ void Lstm::SetBidirection(bool bidirection) { this->primitive_->value.AsLstm()-> #else bool Lstm::GetBidirection() const { return this->primitive_->value_as_Lstm()->bidirection(); } - -void Lstm::SetBidirection(bool bidirection) {} +int Lstm::UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) { + MS_ASSERT(nullptr != primitive); + MS_ASSERT(nullptr != fbb); + auto attr = primitive->value_as_Lstm(); + if (attr == nullptr) { + MS_LOG(ERROR) << "value_as_Lstm return nullptr"; + return RET_ERROR; + } + auto val_offset = schema::CreateLstm(*fbb, attr->bidirection()); + auto prim_offset = schema::CreatePrimitive(*fbb, schema::PrimitiveType_Lstm, val_offset.o); + fbb->Finish(prim_offset); + return RET_OK; +} #endif const int kLstmInputNum = 6; diff --git a/mindspore/lite/src/ops/lstm.h b/mindspore/lite/src/ops/lstm.h index 5260bed3f8..f30dbd0aed 100644 --- a/mindspore/lite/src/ops/lstm.h +++ b/mindspore/lite/src/ops/lstm.h @@ -28,14 +28,18 @@ namespace lite { class Lstm : public PrimitiveC { public: #ifdef PRIMITIVE_WRITEABLE + MS_DECLARE_PARENT(Lstm, PrimitiveC); Lstm() = default; explicit Lstm(schema::PrimitiveT *primitive) : PrimitiveC(primitive) {} + void SetBidirection(bool bidirection); + #else - explicit Lstm(schema::Primitive *primitive) : PrimitiveC(primitive) {} + Lstm() = default; + + int UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) override; #endif int InferShape(std::vector inputs_, std::vector outputs_) override; bool GetBidirection() const; - void SetBidirection(bool bidirection); }; } // namespace lite } // namespace mindspore diff --git a/mindspore/lite/src/ops/make_tuple.cc b/mindspore/lite/src/ops/make_tuple.cc index ff102f13b3..78ca0b1084 100644 --- a/mindspore/lite/src/ops/make_tuple.cc +++ b/mindspore/lite/src/ops/make_tuple.cc @@ -22,10 +22,39 @@ namespace mindspore { namespace lite { #ifdef PRIMITIVE_WRITEABLE int MakeTuple::UnPackAttr(const Primitive &prim, const std::vector &inputs) { - this->primitive_ = new (schema::PrimitiveT); - auto attr = std::make_unique(); - this->primitive_->value.type = schema::PrimitiveType_MakeTuple; - this->primitive_->value.value = attr.release(); + if (this->primitive_ == nullptr) { + this->primitive_ = new (std::nothrow) schema::PrimitiveT; + if (this->primitive_ == nullptr) { + MS_LOG(ERROR) << "new primitiveT failed"; + return RET_ERROR; + } + this->primitive_->value.type = schema::PrimitiveType_MakeTuple; + } + if (this->primitive_->value.type != schema::PrimitiveType_MakeTuple) { + MS_LOG(ERROR) << "Primitive type is error :" << this->primitive_->value.type; + return RET_ERROR; + } + if (this->primitive_->value.value == nullptr) { + auto attr = new (std::nothrow) schema::MakeTupleT(); + if (attr == nullptr) { + MS_LOG(ERROR) << "new primitiveT value failed"; + return RET_ERROR; + } + this->primitive_->value.value = attr; + if (this->primitive_->value.value == nullptr) { + MS_LOG(ERROR) << "primitive value is nullptr"; + return RET_ERROR; + } + } + return RET_OK; +} +#else +int MakeTuple::UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) { + MS_ASSERT(nullptr != primitive); + MS_ASSERT(nullptr != fbb); + auto val_offset = schema::CreateMakeTuple(*fbb); + auto prim_offset = 
schema::CreatePrimitive(*fbb, schema::PrimitiveType_MakeTuple, val_offset.o); + fbb->Finish(prim_offset); return RET_OK; } #endif diff --git a/mindspore/lite/src/ops/make_tuple.h b/mindspore/lite/src/ops/make_tuple.h index 2559644997..04c621b587 100644 --- a/mindspore/lite/src/ops/make_tuple.h +++ b/mindspore/lite/src/ops/make_tuple.h @@ -24,11 +24,14 @@ namespace lite { class MakeTuple : public PrimitiveC { public: #ifdef PRIMITIVE_WRITEABLE + MS_DECLARE_PARENT(MakeTuple, PrimitiveC); MakeTuple() = default; explicit MakeTuple(schema::PrimitiveT *primitive) : PrimitiveC(primitive) {} - int UnPackAttr(const Primitive &prim, const std::vector &inputs); + int UnPackAttr(const Primitive &prim, const std::vector &inputs) override; #else - explicit MakeTuple(schema::Primitive *primitive) : PrimitiveC(primitive) {} + MakeTuple() = default; + + int UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) override; #endif }; } // namespace lite diff --git a/mindspore/lite/src/ops/matmul.cc b/mindspore/lite/src/ops/matmul.cc index 3eba5716cf..815eaa24a5 100644 --- a/mindspore/lite/src/ops/matmul.cc +++ b/mindspore/lite/src/ops/matmul.cc @@ -108,14 +108,32 @@ void MatMul::PopulaterQuantParam(const Primitive &prim, } int MatMul::UnPackAttr(const Primitive &prim, const std::vector &inputs) { - this->primitive_ = new (schema::PrimitiveT); - - auto attr = std::make_unique(); - attr->transposeA = GetValue(prim.GetAttr("transpose_a")); - attr->transposeB = GetValue(prim.GetAttr("transpose_b")); - - this->primitive_->value.type = schema::PrimitiveType_MatMul; - this->primitive_->value.value = attr.release(); + if (this->primitive_ == nullptr) { + this->primitive_ = new (std::nothrow) schema::PrimitiveT; + if (this->primitive_ == nullptr) { + MS_LOG(ERROR) << "new primitiveT failed"; + return RET_ERROR; + } + this->primitive_->value.type = schema::PrimitiveType_MatMul; + } + if (this->primitive_->value.type != schema::PrimitiveType_MatMul) { + MS_LOG(ERROR) << "Primitive type is error :" << this->primitive_->value.type; + return RET_ERROR; + } + if (this->primitive_->value.value == nullptr) { + auto attr = new (std::nothrow) schema::MatMulT(); + if (attr == nullptr) { + MS_LOG(ERROR) << "new primitiveT value failed"; + return RET_ERROR; + } + attr->transposeA = GetValue(prim.GetAttr("transpose_a")); + attr->transposeB = GetValue(prim.GetAttr("transpose_b")); + this->primitive_->value.value = attr; + if (this->primitive_->value.value == nullptr) { + MS_LOG(ERROR) << "primitive value is nullptr"; + return RET_ERROR; + } + } if (GetQuantType() == schema::QuantType_AwareTraining) { std::vector> vecInputQuantParam; std::vector> vecOutputQuantParam; @@ -131,8 +149,20 @@ int MatMul::UnPackAttr(const Primitive &prim, const std::vector &inp bool MatMul::GetTransposeA() const { return this->primitive_->value_as_MatMul()->transposeA(); } bool MatMul::GetTransposeB() const { return this->primitive_->value_as_MatMul()->transposeB(); } -void MatMul::SetTransposeA(bool transpose_a) {} -void MatMul::SetTransposeB(bool transpose_b) {} +int MatMul::UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) { + MS_ASSERT(nullptr != primitive); + MS_ASSERT(nullptr != fbb); + auto attr = primitive->value_as_MatMul(); + if (attr == nullptr) { + MS_LOG(ERROR) << "value_as_MatMul return nullptr"; + return RET_ERROR; + } + auto val_offset = schema::CreateMatMul(*fbb, attr->transposeA(), attr->transposeB()); + auto prim_offset = schema::CreatePrimitive(*fbb, 
schema::PrimitiveType_MatMul, val_offset.o);
+  fbb->Finish(prim_offset);
+  return RET_OK;
+}
+
 #endif
 int MatMul::InferShape(std::vector inputs_, std::vector outputs_) {
diff --git a/mindspore/lite/src/ops/matmul.h b/mindspore/lite/src/ops/matmul.h
index 2295aa1d39..edbd61b252 100644
--- a/mindspore/lite/src/ops/matmul.h
+++ b/mindspore/lite/src/ops/matmul.h
@@ -27,10 +27,14 @@ namespace mindspore {
 namespace lite {
 class MatMul : public PrimitiveC {
 #ifdef PRIMITIVE_WRITEABLE
+  MS_DECLARE_PARENT(MatMul, PrimitiveC);
+
  public:
   MatMul() = default;
   explicit MatMul(schema::PrimitiveT *primitive) : PrimitiveC(primitive) {}
-  int UnPackAttr(const Primitive &prim, const std::vector &inputs);
+  int UnPackAttr(const Primitive &prim, const std::vector &inputs) override;
+  void SetTransposeA(bool transpose_a);
+  void SetTransposeB(bool transpose_b);

  private:
   void PopulaterQuantParam(const Primitive &prim, std::vector> *vecInputQuantParam,
                            std::vector> *vecOutputQuantParam);
@@ -39,15 +43,15 @@ class MatMul : public PrimitiveC {
 #else
  public:
-  explicit MatMul(schema::Primitive *primitive) : PrimitiveC(primitive) {}
+  MatMul() = default;
+
+  int UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) override;
 #endif
  public:
   int InferShape(std::vector inputs_, std::vector outputs_) override;
   bool GetTransposeA() const;
   bool GetTransposeB() const;
-  void SetTransposeA(bool transpose_a);
-  void SetTransposeB(bool transpose_b);
 };
 }  // namespace lite
 }  // namespace mindspore
diff --git a/mindspore/lite/src/ops/matrix_diag.cc b/mindspore/lite/src/ops/matrix_diag.cc
index dbc21ce835..b92094c85c 100644
--- a/mindspore/lite/src/ops/matrix_diag.cc
+++ b/mindspore/lite/src/ops/matrix_diag.cc
@@ -38,10 +38,20 @@ int MatrixDiag::GetNumRows() const { return this->primitive_->value_as_MatrixDia
 int MatrixDiag::GetNumCols() const { return this->primitive_->value_as_MatrixDiag()->numCols(); }
 float MatrixDiag::GetPaddingValue() const { return this->primitive_->value_as_MatrixDiag()->paddingValue(); }
-void MatrixDiag::SetK(int k) {}
-void MatrixDiag::SetNumRows(int num_rows) {}
-void MatrixDiag::SetNumCols(int num_cols) {}
-void MatrixDiag::SetPaddingValue(float padding_value) {}
+int MatrixDiag::UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) {
+  MS_ASSERT(nullptr != primitive);
+  MS_ASSERT(nullptr != fbb);
+  auto attr = primitive->value_as_MatrixDiag();
+  if (attr == nullptr) {
+    MS_LOG(ERROR) << "value_as_MatrixDiag return nullptr";
+    return RET_ERROR;
+  }
+  auto val_offset = schema::CreateMatrixDiag(*fbb, attr->k(), attr->numRows(), attr->numCols(), attr->paddingValue());
+  auto prim_offset = schema::CreatePrimitive(*fbb, schema::PrimitiveType_MatrixDiag, val_offset.o);
+  fbb->Finish(prim_offset);
+  return RET_OK;
+}
+
 #endif
 }  // namespace lite
 }  // namespace mindspore
diff --git a/mindspore/lite/src/ops/matrix_diag.h b/mindspore/lite/src/ops/matrix_diag.h
index 3b54632543..fe36b2558d 100644
--- a/mindspore/lite/src/ops/matrix_diag.h
+++ b/mindspore/lite/src/ops/matrix_diag.h
@@ -28,19 +28,22 @@ namespace lite {
 class MatrixDiag : public PrimitiveC {
  public:
 #ifdef PRIMITIVE_WRITEABLE
+  MS_DECLARE_PARENT(MatrixDiag, PrimitiveC);
   MatrixDiag() = default;
   explicit MatrixDiag(schema::PrimitiveT *primitive) : PrimitiveC(primitive) {}
+  void SetK(int k);
+  void SetNumRows(int num_rows);
+  void SetNumCols(int num_cols);
+  void SetPaddingValue(float padding_value);
 #else
-  explicit MatrixDiag(schema::Primitive *primitive) : PrimitiveC(primitive) {}
+  MatrixDiag() = default;
+
+  int UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) override;
 #endif
   int GetK() const;
   int GetNumRows() const;
   int GetNumCols() const;
   float GetPaddingValue() const;
-  void SetK(int k);
-  void SetNumRows(int num_rows);
-  void SetNumCols(int num_cols);
-  void SetPaddingValue(float padding_value);
 };
 }  // namespace lite
 }  // namespace mindspore
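A recurring detail in these overrides is the trailing .o: each generated CreateX helper returns a typed flatbuffers::Offset<schema::X>, while the value field of schema::Primitive is a union, so CreatePrimitive takes the payload as an untyped offset and records the concrete type separately in the enum argument. A hedged sketch of the mechanics, reusing MatrixDiag from above with assumed local variables:

// Illustrative only: fbb, k, num_rows, num_cols, and padding are assumed locals.
flatbuffers::FlatBufferBuilder fbb;
flatbuffers::Offset<schema::MatrixDiag> val_offset =
    schema::CreateMatrixDiag(fbb, k, num_rows, num_cols, padding);
// Only the raw table offset (val_offset.o) is stored for the union payload;
// the PrimitiveType enum tells readers how to reinterpret it later.
auto prim_offset = schema::CreatePrimitive(fbb, schema::PrimitiveType_MatrixDiag, val_offset.o);
fbb.Finish(prim_offset);

diff --git a/mindspore/lite/src/ops/maximum.cc b/mindspore/lite/src/ops/maximum.cc
new file mode 100644
index 0000000000..39223ee4b5
--- /dev/null
+++ b/mindspore/lite/src/ops/maximum.cc
@@ -0,0 +1,33 @@
+/**
+ * Copyright 2019-2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "src/ops/maximum.h"
+
+namespace mindspore {
+namespace lite {
+#ifdef PRIMITIVE_WRITEABLE
+#else
+int Maximum::UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) {
+  MS_ASSERT(nullptr != primitive);
+  MS_ASSERT(nullptr != fbb);
+  auto val_offset = schema::CreateMaximum(*fbb);
+  auto prim_offset = schema::CreatePrimitive(*fbb, schema::PrimitiveType_Maximum, val_offset.o);
+  fbb->Finish(prim_offset);
+  return RET_OK;
+}
+#endif
+}  // namespace lite
+}  // namespace mindspore
diff --git a/mindspore/lite/src/ops/maximum.h b/mindspore/lite/src/ops/maximum.h
index d123c55967..88fa03aa82 100644
--- a/mindspore/lite/src/ops/maximum.h
+++ b/mindspore/lite/src/ops/maximum.h
@@ -21,17 +21,20 @@
 #include
 #include
 #include "ir/dtype/type_id.h"
-#include "src/ops/primitive_c.h"
+#include "src/ops/arithmetic.h"

 namespace mindspore {
 namespace lite {
 class Maximum : public Arithmetic {
  public:
 #ifdef PRIMITIVE_WRITEABLE
+  MS_DECLARE_PARENT(Maximum, Arithmetic);
   Maximum() = default;
   explicit Maximum(schema::PrimitiveT *primitive) : Arithmetic(primitive) {}
 #else
-  explicit Maximum(schema::Primitive *primitive) : Arithmetic(primitive) {}
+  Maximum() = default;
+
+  int UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) override;
 #endif
 };
 }  // namespace lite
diff --git a/mindspore/lite/src/ops/mean.cc b/mindspore/lite/src/ops/mean.cc
index d6cfb5e204..2e7324796b 100644
--- a/mindspore/lite/src/ops/mean.cc
+++ b/mindspore/lite/src/ops/mean.cc
@@ -33,8 +33,26 @@ std::vector Mean::GetAxis() const {
 }
 bool Mean::GetKeepDims() const { return this->primitive_->value_as_Mean()->keepDims(); }
-void Mean::SetAxis(const std::vector &axis) {}
-void Mean::SetKeepDims(bool keep_dims) {}
+int Mean::UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) {
+  MS_ASSERT(nullptr != primitive);
+  MS_ASSERT(nullptr != fbb);
+  auto attr = primitive->value_as_Mean();
+  if (attr == nullptr) {
+    MS_LOG(ERROR) << "value_as_Mean return nullptr";
+    return RET_ERROR;
+  }
+  std::vector axis;
+  if (attr->axis() != nullptr) {
+    for (int i = 0; i < static_cast(attr->axis()->size()); i++) {
+      axis.push_back(attr->axis()->data()[i]);
+    }
+  }
+  auto val_offset = schema::CreateMeanDirect(*fbb,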
&axis, attr->keepDims()); + auto prim_offset = schema::CreatePrimitive(*fbb, schema::PrimitiveType_Mean, val_offset.o); + fbb->Finish(prim_offset); + return RET_OK; +} + #endif namespace { diff --git a/mindspore/lite/src/ops/mean.h b/mindspore/lite/src/ops/mean.h index 0c87275ec1..fd9a42e8a3 100644 --- a/mindspore/lite/src/ops/mean.h +++ b/mindspore/lite/src/ops/mean.h @@ -20,6 +20,7 @@ #include #include #include +#include #include "ir/dtype/type_id.h" #include "src/ops/primitive_c.h" @@ -28,16 +29,19 @@ namespace lite { class Mean : public PrimitiveC { public: #ifdef PRIMITIVE_WRITEABLE + MS_DECLARE_PARENT(Mean, PrimitiveC); Mean() = default; explicit Mean(schema::PrimitiveT *primitive) : PrimitiveC(primitive) {} + void SetAxis(const std::vector &axis); + void SetKeepDims(bool keep_dims); #else - explicit Mean(schema::Primitive *primitive) : PrimitiveC(primitive) {} + Mean() = default; + + int UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) override; #endif int InferShape(std::vector inputs_, std::vector outputs_) override; std::vector GetAxis() const; bool GetKeepDims() const; - void SetAxis(const std::vector &axis); - void SetKeepDims(bool keep_dims); }; } // namespace lite } // namespace mindspore diff --git a/mindspore/lite/src/ops/minimum.cc b/mindspore/lite/src/ops/minimum.cc new file mode 100644 index 0000000000..c2c8c8fd56 --- /dev/null +++ b/mindspore/lite/src/ops/minimum.cc @@ -0,0 +1,33 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#include "src/ops/minimum.h"
+
+namespace mindspore {
+namespace lite {
+#ifdef PRIMITIVE_WRITEABLE
+#else
+int Minimum::UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) {
+  MS_ASSERT(nullptr != primitive);
+  MS_ASSERT(nullptr != fbb);
+  auto val_offset = schema::CreateMinimum(*fbb);
+  auto prim_offset = schema::CreatePrimitive(*fbb, schema::PrimitiveType_Minimum, val_offset.o);
+  fbb->Finish(prim_offset);
+  return RET_OK;
+}
+#endif
+}  // namespace lite
+}  // namespace mindspore
diff --git a/mindspore/lite/src/ops/minimum.h b/mindspore/lite/src/ops/minimum.h
index 9606ab5c21..6c4a097952 100644
--- a/mindspore/lite/src/ops/minimum.h
+++ b/mindspore/lite/src/ops/minimum.h
@@ -21,17 +21,20 @@
 #include
 #include
 #include "ir/dtype/type_id.h"
-#include "src/ops/primitive_c.h"
+#include "src/ops/arithmetic.h"

 namespace mindspore {
 namespace lite {
 class Minimum : public Arithmetic {
  public:
 #ifdef PRIMITIVE_WRITEABLE
+  MS_DECLARE_PARENT(Minimum, Arithmetic);
   Minimum() = default;
   explicit Minimum(schema::PrimitiveT *primitive) : Arithmetic(primitive) {}
 #else
-  explicit Minimum(schema::Primitive *primitive) : Arithmetic(primitive) {}
+  Minimum() = default;
+
+  int UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) override;
 #endif
 };
 }  // namespace lite
diff --git a/mindspore/lite/src/ops/mul.cc b/mindspore/lite/src/ops/mul.cc
index ffe61d852f..f6d2fab12a 100644
--- a/mindspore/lite/src/ops/mul.cc
+++ b/mindspore/lite/src/ops/mul.cc
@@ -26,10 +26,30 @@ void Mul::SetActivationType(int activation_type) {
   this->primitive_->value.AsMul()->activationType = (schema::ActivationType)activation_type;
 }
 int Mul::UnPackAttr(const Primitive &prim, const std::vector &inputs) {
-  this->primitive_ = new (schema::PrimitiveT);
-  auto attr = std::make_unique();
-  this->primitive_->value.type = schema::PrimitiveType_Mul;
-  this->primitive_->value.value = attr.release();
+  if (this->primitive_ == nullptr) {
+    this->primitive_ = new (std::nothrow) schema::PrimitiveT;
+    if (this->primitive_ == nullptr) {
+      MS_LOG(ERROR) << "new primitiveT failed";
+      return RET_ERROR;
+    }
+    this->primitive_->value.type = schema::PrimitiveType_Mul;
+  }
+  if (this->primitive_->value.type != schema::PrimitiveType_Mul) {
+    MS_LOG(ERROR) << "Primitive type is error :" << this->primitive_->value.type;
+    return RET_ERROR;
+  }
+  if (this->primitive_->value.value == nullptr) {
+    auto attr = new (std::nothrow) schema::MulT();
+    if (attr == nullptr) {
+      MS_LOG(ERROR) << "new primitiveT value failed";
+      return RET_ERROR;
+    }
+    this->primitive_->value.value = attr;
+    if (this->primitive_->value.value == nullptr) {
+      MS_LOG(ERROR) << "primitive value is nullptr";
+      return RET_ERROR;
+    }
+  }
   return RET_OK;
 }
@@ -38,7 +58,20 @@ int Mul::UnPackAttr(const Primitive &prim, const std::vector &inputs
 int Mul::GetActivationType() const { return this->primitive_->value_as_Mul()->activationType(); }
-void Mul::SetActivationType(int activation_type) {}
+int Mul::UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) {
+  MS_ASSERT(nullptr != primitive);
+  MS_ASSERT(nullptr != fbb);
+  auto attr = primitive->value_as_Mul();
+  if (attr == nullptr) {
+    MS_LOG(ERROR) << "value_as_Mul return nullptr";
+    return RET_ERROR;
+  }
+  auto val_offset = schema::CreateMul(*fbb, attr->activationType());
+  auto prim_offset = schema::CreatePrimitive(*fbb, schema::PrimitiveType_Mul, val_offset.o);
+  fbb->Finish(prim_offset);
+  return RET_OK;
+}
+
 #endif
 }  // namespace lite
 }  // namespace mindspore
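The rewritten UnPackAttr bodies (MakeTuple and MatMul earlier, Mul above) all replace the old unconditional allocation with the same lazy, failure-checked sequence: allocate primitive_ only if it is still null, verify that an existing container holds the expected op type, and materialize the attribute struct at most once, checking every no-throw allocation. A condensed sketch of that shared control flow; OpT and PrimitiveType_Op are stand-ins for the concrete generated types, not names from this patch:

// Sketch of the shared construction logic; error logging elided.
int UnpackInto(schema::PrimitiveT **primitive) {
  if (*primitive == nullptr) {                      // first call: allocate the container
    *primitive = new (std::nothrow) schema::PrimitiveT;
    if (*primitive == nullptr) return RET_ERROR;    // no-throw allocation can fail
    (*primitive)->value.type = schema::PrimitiveType_Op;
  }
  if ((*primitive)->value.type != schema::PrimitiveType_Op) {
    return RET_ERROR;                               // container already holds another op
  }
  if ((*primitive)->value.value == nullptr) {       // attributes not yet materialized
    auto attr = new (std::nothrow) schema::OpT();
    if (attr == nullptr) return RET_ERROR;
    // ... fill attr fields from the IR primitive here ...
    (*primitive)->value.value = attr;
  }
  return RET_OK;                                    // idempotent across repeated calls
}
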
diff --git a/mindspore/lite/src/ops/mul.h b/mindspore/lite/src/ops/mul.h
index 1962ae600f..ba582db115 100644
--- a/mindspore/lite/src/ops/mul.h
+++ b/mindspore/lite/src/ops/mul.h
@@ -28,15 +28,17 @@ namespace lite {
 class Mul : public Arithmetic {
  public:
 #ifdef PRIMITIVE_WRITEABLE
+  MS_DECLARE_PARENT(Mul, Arithmetic);
   Mul() = default;
   explicit Mul(schema::PrimitiveT *primitive) : Arithmetic(primitive) {}
+  void SetActivationType(int activation_type);
+  int UnPackAttr(const Primitive &prim, const std::vector &inputs) override;
 #else
-  explicit Mul(schema::Primitive *primitive) : Arithmetic(primitive) {}
+  Mul() = default;
+
+  int UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) override;
 #endif
   int GetActivationType() const;
-  void SetActivationType(int activation_type);
-
-  int UnPackAttr(const Primitive &prim, const std::vector &inputs);
 };
 }  // namespace lite
 }  // namespace mindspore
diff --git a/mindspore/lite/src/ops/nchw2nhwc.cc b/mindspore/lite/src/ops/nchw2nhwc.cc
index 170f04020f..0ac5c25639 100644
--- a/mindspore/lite/src/ops/nchw2nhwc.cc
+++ b/mindspore/lite/src/ops/nchw2nhwc.cc
@@ -19,6 +19,18 @@
 namespace mindspore {
 namespace lite {
+#ifdef PRIMITIVE_WRITEABLE
+#else
+int Nchw2Nhwc::UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) {
+  MS_ASSERT(nullptr != primitive);
+  MS_ASSERT(nullptr != fbb);
+  auto val_offset = schema::CreateNchw2Nhwc(*fbb);
+  auto prim_offset = schema::CreatePrimitive(*fbb, schema::PrimitiveType_Nchw2Nhwc, val_offset.o);
+  fbb->Finish(prim_offset);
+  return RET_OK;
+}
+#endif
+
 int Nchw2Nhwc::InferShape(std::vector inputs_, std::vector outputs_) {
   MS_ASSERT(this->primitive_ != nullptr);
   auto input = inputs_.front();
diff --git a/mindspore/lite/src/ops/nchw2nhwc.h b/mindspore/lite/src/ops/nchw2nhwc.h
index f47ff6afd4..8f7ddd0ef1 100644
--- a/mindspore/lite/src/ops/nchw2nhwc.h
+++ b/mindspore/lite/src/ops/nchw2nhwc.h
@@ -28,10 +28,13 @@ namespace lite {
 class Nchw2Nhwc : public PrimitiveC {
  public:
 #ifdef PRIMITIVE_WRITEABLE
+  MS_DECLARE_PARENT(Nchw2Nhwc, PrimitiveC);
   Nchw2Nhwc() = default;
   explicit Nchw2Nhwc(schema::PrimitiveT *primitive) : PrimitiveC(primitive) {}
 #else
-  explicit Nchw2Nhwc(schema::Primitive *primitive) : PrimitiveC(primitive) {}
+  Nchw2Nhwc() = default;
+
+  int UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) override;
 #endif
   int InferShape(std::vector inputs_, std::vector outputs_) override;
 };
diff --git a/mindspore/lite/src/ops/nhwc2nchw.cc b/mindspore/lite/src/ops/nhwc2nchw.cc
index 9bff02ab2e..a5f73bcfe0 100644
--- a/mindspore/lite/src/ops/nhwc2nchw.cc
+++ b/mindspore/lite/src/ops/nhwc2nchw.cc
@@ -19,6 +19,19 @@
 namespace mindspore {
 namespace lite {
+
+#ifdef PRIMITIVE_WRITEABLE
+#else
+int Nhwc2Nchw::UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) {
+  MS_ASSERT(nullptr != primitive);
+  MS_ASSERT(nullptr != fbb);
+  auto val_offset = schema::CreateNhwc2Nchw(*fbb);
+  auto prim_offset = schema::CreatePrimitive(*fbb, schema::PrimitiveType_Nhwc2Nchw, val_offset.o);
+  fbb->Finish(prim_offset);
+  return RET_OK;
+}
+#endif
+
 int Nhwc2Nchw::InferShape(std::vector inputs_, std::vector outputs_) {
   MS_ASSERT(this->primitive_ != nullptr);
   auto input = inputs_.front();
diff --git a/mindspore/lite/src/ops/nhwc2nchw.h b/mindspore/lite/src/ops/nhwc2nchw.h
index 232d7ab387..479769cc19 100644
---
a/mindspore/lite/src/ops/nhwc2nchw.h
+++ b/mindspore/lite/src/ops/nhwc2nchw.h
@@ -28,10 +28,13 @@ namespace lite {
 class Nhwc2Nchw : public PrimitiveC {
  public:
 #ifdef PRIMITIVE_WRITEABLE
+  MS_DECLARE_PARENT(Nhwc2Nchw, PrimitiveC);
   Nhwc2Nchw() = default;
   explicit Nhwc2Nchw(schema::PrimitiveT *primitive) : PrimitiveC(primitive) {}
 #else
-  explicit Nhwc2Nchw(schema::Primitive *primitive) : PrimitiveC(primitive) {}
+  Nhwc2Nchw() = default;
+
+  int UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) override;
 #endif
   int InferShape(std::vector<lite::tensor::Tensor *> inputs_, std::vector<lite::tensor::Tensor *> outputs_) override;
 };
diff --git a/mindspore/lite/src/ops/not_equal.cc b/mindspore/lite/src/ops/not_equal.cc
new file mode 100644
index 0000000000..b768d74293
--- /dev/null
+++ b/mindspore/lite/src/ops/not_equal.cc
@@ -0,0 +1,33 @@
+/**
+ * Copyright 2019-2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "src/ops/not_equal.h"
+
+namespace mindspore {
+namespace lite {
+#ifdef PRIMITIVE_WRITEABLE
+#else
+int NotEqual::UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) {
+  MS_ASSERT(nullptr != primitive);
+  MS_ASSERT(nullptr != fbb);
+  auto val_offset = schema::CreateNotEqual(*fbb);
+  auto prim_offset = schema::CreatePrimitive(*fbb, schema::PrimitiveType_NotEqual, val_offset.o);
+  fbb->Finish(prim_offset);
+  return RET_OK;
+}
+#endif
+}  // namespace lite
+}  // namespace mindspore
diff --git a/mindspore/lite/src/ops/not_equal.h b/mindspore/lite/src/ops/not_equal.h
index 2ec16ada79..2c73caaa5f 100644
--- a/mindspore/lite/src/ops/not_equal.h
+++ b/mindspore/lite/src/ops/not_equal.h
@@ -21,17 +21,20 @@
 #include <vector>
 #include <set>
 #include "ir/dtype/type_id.h"
-#include "src/ops/primitive_c.h"
+#include "src/ops/arithmetic.h"
 
 namespace mindspore {
 namespace lite {
 class NotEqual : public Arithmetic {
  public:
 #ifdef PRIMITIVE_WRITEABLE
+  MS_DECLARE_PARENT(NotEqual, Arithmetic);
   NotEqual() = default;
   explicit NotEqual(schema::PrimitiveT *primitive) : Arithmetic(primitive) {}
 #else
-  explicit NotEqual(schema::Primitive *primitive) : Arithmetic(primitive) {}
+  NotEqual() = default;
+
+  int UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) override;
 #endif
 };
 }  // namespace lite
diff --git a/mindspore/lite/src/ops/one_hot.cc b/mindspore/lite/src/ops/one_hot.cc
index da1ef94278..41b2040088 100644
--- a/mindspore/lite/src/ops/one_hot.cc
+++ b/mindspore/lite/src/ops/one_hot.cc
@@ -27,7 +27,19 @@ void OneHot::SetAxis(int axis) { this->primitive_->value.AsOneHot()->axis = axis; }
 
 int OneHot::GetAxis() const { return this->primitive_->value_as_OneHot()->axis(); }
 
-void OneHot::SetAxis(int axis) {}
+int OneHot::UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) {
+  MS_ASSERT(nullptr != primitive);
+  MS_ASSERT(nullptr != fbb);
+  auto attr = primitive->value_as_OneHot();
+  if (attr == nullptr) {
+    MS_LOG(ERROR) << "value_as_OneHot return nullptr";
+    return
RET_ERROR; + } + auto val_offset = schema::CreateOneHot(*fbb, attr->axis()); + auto prim_offset = schema::CreatePrimitive(*fbb, schema::PrimitiveType_OneHot, val_offset.o); + fbb->Finish(prim_offset); + return RET_OK; +} #endif namespace { diff --git a/mindspore/lite/src/ops/one_hot.h b/mindspore/lite/src/ops/one_hot.h index 49e494b3dd..deaa9ab1f1 100644 --- a/mindspore/lite/src/ops/one_hot.h +++ b/mindspore/lite/src/ops/one_hot.h @@ -28,14 +28,18 @@ namespace lite { class OneHot : public PrimitiveC { public: #ifdef PRIMITIVE_WRITEABLE + MS_DECLARE_PARENT(OneHot, PrimitiveC); OneHot() = default; explicit OneHot(schema::PrimitiveT *primitive) : PrimitiveC(primitive) {} + void SetAxis(int axis); + #else - explicit OneHot(schema::Primitive *primitive) : PrimitiveC(primitive) {} + OneHot() = default; + + int UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) override; #endif int InferShape(std::vector inputs_, std::vector outputs_) override; int GetAxis() const; - void SetAxis(int axis); }; } // namespace lite } // namespace mindspore diff --git a/mindspore/lite/src/ops/p_relu.cc b/mindspore/lite/src/ops/p_relu.cc new file mode 100644 index 0000000000..2174e80baa --- /dev/null +++ b/mindspore/lite/src/ops/p_relu.cc @@ -0,0 +1,51 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "src/ops/p_relu.h" + +namespace mindspore { +namespace lite { +#ifdef PRIMITIVE_WRITEABLE +bool PReLU::GetChannelShared() const { return this->primitive_->value.AsPReLU()->channelShared; } + +void PReLU::SetChannelShared(bool channel_shared) { this->primitive_->value.AsPReLU()->channelShared = channel_shared; } + +#else +bool PReLU::GetChannelShared() const { return this->primitive_->value_as_PReLU()->channelShared(); } + +int PReLU::UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) { + MS_ASSERT(nullptr != primitive); + MS_ASSERT(nullptr != fbb); + auto attr = primitive->value_as_PReLU(); + if (attr == nullptr) { + MS_LOG(ERROR) << "value_as_PReLU return nullptr"; + return RET_ERROR; + } + std::vector slope; + if (attr->slope() != nullptr) { + for (int i = 0; i < static_cast(attr->slope()->size()); i++) { + slope.push_back(attr->slope()->data()[i]); + } + } + auto val_offset = schema::CreatePReLUDirect(*fbb, attr->channelShared(), &slope); + auto prim_offset = schema::CreatePrimitive(*fbb, schema::PrimitiveType_PReLU, val_offset.o); + fbb->Finish(prim_offset); + return RET_OK; +} + +#endif +} // namespace lite +} // namespace mindspore diff --git a/mindspore/lite/src/ops/caffe_p_relu.h b/mindspore/lite/src/ops/p_relu.h similarity index 67% rename from mindspore/lite/src/ops/caffe_p_relu.h rename to mindspore/lite/src/ops/p_relu.h index d78c456338..f18f168154 100644 --- a/mindspore/lite/src/ops/caffe_p_relu.h +++ b/mindspore/lite/src/ops/p_relu.h @@ -14,29 +14,34 @@ * limitations under the License. 
*/ -#ifndef LITE_MINDSPORE_LITE_C_OPS_CAFFE_P_RE_L_U_H_ -#define LITE_MINDSPORE_LITE_C_OPS_CAFFE_P_RE_L_U_H_ +#ifndef LITE_MINDSPORE_LITE_C_OPS_P_RELU_H_ +#define LITE_MINDSPORE_LITE_C_OPS_P_RELU_H_ #include #include #include +#include #include "ir/dtype/type_id.h" #include "src/ops/activation.h" namespace mindspore { namespace lite { -class CaffePReLU : public Activation { +class PReLU : public Activation { public: #ifdef PRIMITIVE_WRITEABLE - CaffePReLU() = default; - explicit CaffePReLU(schema::PrimitiveT *primitive) : Activation(primitive) {} + MS_DECLARE_PARENT(PReLU, Activation); + PReLU() = default; + explicit PReLU(schema::PrimitiveT *primitive) : Activation(primitive) {} + void SetChannelShared(bool channel_shared); + #else - explicit CaffePReLU(schema::Primitive *primitive) : Activation(primitive) {} + PReLU() = default; + + int UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) override; #endif bool GetChannelShared() const; - void SetChannelShared(bool channel_shared); }; } // namespace lite } // namespace mindspore -#endif // LITE_MINDSPORE_LITE_C_OPS_CAFFE_P_RE_L_U_H_ +#endif // LITE_MINDSPORE_LITE_C_OPS_P_RELU_H_ diff --git a/mindspore/lite/src/ops/pad.cc b/mindspore/lite/src/ops/pad.cc index eb116d789d..852686972c 100644 --- a/mindspore/lite/src/ops/pad.cc +++ b/mindspore/lite/src/ops/pad.cc @@ -38,9 +38,25 @@ std::vector Pad::GetPaddings() const { int Pad::GetPaddingMode() const { return this->primitive_->value_as_Pad()->paddingMode(); } float Pad::GetConstantValue() const { return this->primitive_->value_as_Pad()->constantValue(); } -void Pad::SetPaddings(const std::vector &paddings) {} -void Pad::SetPaddingMode(int padding_mode) {} -void Pad::SetConstantValue(float constant_value) {} +int Pad::UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) { + MS_ASSERT(nullptr != primitive); + MS_ASSERT(nullptr != fbb); + auto attr = primitive->value_as_Pad(); + if (attr == nullptr) { + MS_LOG(ERROR) << "value_as_Pad return nullptr"; + return RET_ERROR; + } + std::vector paddings; + if (attr->paddings() != nullptr) { + for (int i = 0; i < static_cast(attr->paddings()->size()); i++) { + paddings.push_back(attr->paddings()->data()[i]); + } + } + auto val_offset = schema::CreatePadDirect(*fbb, &paddings, attr->paddingMode(), attr->constantValue()); + auto prim_offset = schema::CreatePrimitive(*fbb, schema::PrimitiveType_Pad, val_offset.o); + fbb->Finish(prim_offset); + return RET_OK; +} #endif namespace { const size_t kInputRank = 4; diff --git a/mindspore/lite/src/ops/pad.h b/mindspore/lite/src/ops/pad.h index 8770133662..695a1c1742 100644 --- a/mindspore/lite/src/ops/pad.h +++ b/mindspore/lite/src/ops/pad.h @@ -20,6 +20,7 @@ #include #include #include +#include #include "ir/dtype/type_id.h" #include "src/ops/primitive_c.h" @@ -28,18 +29,21 @@ namespace lite { class Pad : public PrimitiveC { public: #ifdef PRIMITIVE_WRITEABLE + MS_DECLARE_PARENT(Pad, PrimitiveC); Pad() = default; explicit Pad(schema::PrimitiveT *primitive) : PrimitiveC(primitive) {} + void SetPaddings(const std::vector &paddings); + void SetPaddingMode(int padding_mode); + void SetConstantValue(float constant_value); #else - explicit Pad(schema::Primitive *primitive) : PrimitiveC(primitive) {} + Pad() = default; + + int UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) override; #endif int InferShape(std::vector inputs_, std::vector outputs_) override; std::vector GetPaddings() const; int 
GetPaddingMode() const; float GetConstantValue() const; - void SetPaddings(const std::vector &paddings); - void SetPaddingMode(int padding_mode); - void SetConstantValue(float constant_value); }; } // namespace lite } // namespace mindspore diff --git a/mindspore/lite/src/ops/permute.cc b/mindspore/lite/src/ops/permute.cc index 74c49b53ba..d51c99ebe8 100644 --- a/mindspore/lite/src/ops/permute.cc +++ b/mindspore/lite/src/ops/permute.cc @@ -31,6 +31,25 @@ std::vector Permute::GetOrder() const { } void Permute::SetOrder(const std::vector &order) {} +int Permute::UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) { + MS_ASSERT(nullptr != primitive); + MS_ASSERT(nullptr != fbb); + auto attr = primitive->value_as_Permute(); + if (attr == nullptr) { + MS_LOG(ERROR) << "value_as_Permute return nullptr"; + return RET_ERROR; + } + std::vector order; + if (attr->order() != nullptr) { + for (int i = 0; i < static_cast(attr->order()->size()); i++) { + order.push_back(attr->order()->data()[i]); + } + } + auto val_offset = schema::CreatePermuteDirect(*fbb, &order); + auto prim_offset = schema::CreatePrimitive(*fbb, schema::PrimitiveType_Permute, val_offset.o); + fbb->Finish(prim_offset); + return RET_OK; +} #endif } // namespace lite } // namespace mindspore diff --git a/mindspore/lite/src/ops/permute.h b/mindspore/lite/src/ops/permute.h index 08599aba1e..f2c082ab30 100644 --- a/mindspore/lite/src/ops/permute.h +++ b/mindspore/lite/src/ops/permute.h @@ -20,6 +20,7 @@ #include #include #include +#include #include "ir/dtype/type_id.h" #include "src/ops/primitive_c.h" @@ -28,10 +29,12 @@ namespace lite { class Permute : public PrimitiveC { public: #ifdef PRIMITIVE_WRITEABLE - Permute() = default; + MS_DECLARE_PARENT(Permute, PrimitiveC); explicit Permute(schema::PrimitiveT *primitive) : PrimitiveC(primitive) {} #else - explicit Permute(schema::Primitive *primitive) : PrimitiveC(primitive) {} + Permute() = default; + + int UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) override; #endif std::vector GetOrder() const; void SetOrder(const std::vector &order); diff --git a/mindspore/lite/src/ops/pooling.cc b/mindspore/lite/src/ops/pooling.cc index d71dc33053..2b409622ca 100644 --- a/mindspore/lite/src/ops/pooling.cc +++ b/mindspore/lite/src/ops/pooling.cc @@ -36,6 +36,7 @@ int Pooling::GetPadDown() const { return this->primitive_->value.AsPooling()->pa int Pooling::GetPadLeft() const { return this->primitive_->value.AsPooling()->padLeft; } int Pooling::GetPadRight() const { return this->primitive_->value.AsPooling()->padRight; } int Pooling::GetRoundMode() const { return this->primitive_->value.AsPooling()->roundMode; } +int Pooling::GetActivationType() const { return this->primitive_->value.AsPooling()->activationType; } void Pooling::SetFormat(int format) { this->primitive_->value.AsPooling()->format = (schema::Format)format; } void Pooling::SetPoolingMode(int pooling_mode) { @@ -54,44 +55,66 @@ void Pooling::SetPadRight(int pad_right) { this->primitive_->value.AsPooling()-> void Pooling::SetRoundMode(int round_mode) { this->primitive_->value.AsPooling()->roundMode = (schema::RoundMode)round_mode; } +void Pooling::SetActivationType(int activation_type) { + this->primitive_->value.AsPooling()->activationType = (schema::ActivationType)activation_type; +} int Pooling::UnPackAttr(const Primitive &prim, const std::vector &inputs) { - this->primitive_ = new (schema::PrimitiveT); - auto attr = std::make_unique(); - if 
(prim.instance_name() == "MaxPool") { - attr->poolingMode = schema::PoolMode_MAX_POOLING; - } else if (prim.instance_name() == "MeanPool") { - attr->poolingMode = schema::PoolMode_MEAN_POOLING; - } - - auto format = GetValue(prim.GetAttr("data_format")); - if (format == "NCHW") { - attr->format = schema::Format_NCHW; - } else if (format == "NHWC") { - attr->format = schema::Format_NHWC; - } else { - attr->format = schema::Format_NUM_OF_FORMAT; + if (this->primitive_ == nullptr) { + this->primitive_ = new (std::nothrow) schema::PrimitiveT; + if (this->primitive_ == nullptr) { + MS_LOG(ERROR) << "new primitiveT failed"; + return RET_ERROR; + } + this->primitive_->value.type = schema::PrimitiveType_Pooling; } - - auto pad_mode = GetValue(prim.GetAttr("padding")); - if (pad_mode == "VALID") { - attr->padMode = schema::PadMode_VALID; - } else if (pad_mode == "SAME") { - attr->padMode = schema::PadMode_SAME; - } else { - attr->padMode = schema::PadMode_NOTSET; + if (this->primitive_->value.type != schema::PrimitiveType_Pooling) { + MS_LOG(ERROR) << "Primitive type is error :" << this->primitive_->value.type; + return RET_ERROR; } + if (this->primitive_->value.value == nullptr) { + auto attr = new (std::nothrow) schema::PoolingT(); + if (attr == nullptr) { + MS_LOG(ERROR) << "new primitiveT value failed"; + return RET_ERROR; + } + if (prim.instance_name() == "MaxPool") { + attr->poolingMode = schema::PoolMode_MAX_POOLING; + } else if (prim.instance_name() == "MeanPool") { + attr->poolingMode = schema::PoolMode_MEAN_POOLING; + } - auto kernel_size = GetValue>(prim.GetAttr("ksize")); - attr->windowH = kernel_size[2]; - attr->windowW = kernel_size[3]; + auto format = GetValue(prim.GetAttr("data_format")); + if (format == "NCHW") { + attr->format = schema::Format_NCHW; + } else if (format == "NHWC") { + attr->format = schema::Format_NHWC; + } else { + attr->format = schema::Format_NUM_OF_FORMAT; + } - auto stride = GetValue>(prim.GetAttr("strides")); - attr->strideH = stride[2]; - attr->strideW = stride[3]; + auto pad_mode = GetValue(prim.GetAttr("padding")); + if (pad_mode == "VALID") { + attr->padMode = schema::PadMode_VALID; + } else if (pad_mode == "SAME") { + attr->padMode = schema::PadMode_SAME; + } else { + attr->padMode = schema::PadMode_NOTSET; + } - this->primitive_->value.type = schema::PrimitiveType_Pooling; - this->primitive_->value.value = attr.release(); + auto kernel_size = GetValue>(prim.GetAttr("ksize")); + attr->windowH = kernel_size[2]; + attr->windowW = kernel_size[3]; + + auto stride = GetValue>(prim.GetAttr("strides")); + attr->strideH = stride[2]; + attr->strideW = stride[3]; + this->primitive_->value.value = attr; + if (this->primitive_->value.value == nullptr) { + MS_LOG(ERROR) << "primitive value is nullptr"; + return RET_ERROR; + } + } return RET_OK; } @@ -111,20 +134,24 @@ int Pooling::GetPadDown() const { return this->primitive_->value_as_Pooling()->p int Pooling::GetPadLeft() const { return this->primitive_->value_as_Pooling()->padLeft(); } int Pooling::GetPadRight() const { return this->primitive_->value_as_Pooling()->padRight(); } int Pooling::GetRoundMode() const { return this->primitive_->value_as_Pooling()->roundMode(); } - -void Pooling::SetFormat(int format) {} -void Pooling::SetPoolingMode(int pooling_mode) {} -void Pooling::SetGlobal(bool global) {} -void Pooling::SetWindowW(int window_w) {} -void Pooling::SetWindowH(int window_h) {} -void Pooling::SetStrideW(int stride_w) {} -void Pooling::SetStrideH(int stride_h) {} -void Pooling::SetPadMode(int pad_mode) {} 
-void Pooling::SetPadUp(int pad_up) {} -void Pooling::SetPadDown(int pad_down) {} -void Pooling::SetPadLeft(int pad_left) {} -void Pooling::SetPadRight(int pad_right) {} -void Pooling::SetRoundMode(int round_mode) {} +int Pooling::GetActivationType() const { return this->primitive_->value_as_Pooling()->activationType(); } + +int Pooling::UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) { + MS_ASSERT(nullptr != primitive); + MS_ASSERT(nullptr != fbb); + auto attr = primitive->value_as_Pooling(); + if (attr == nullptr) { + MS_LOG(ERROR) << "value_as_Pooling return nullptr"; + return RET_ERROR; + } + auto val_offset = + schema::CreatePooling(*fbb, attr->format(), attr->poolingMode(), attr->global(), attr->windowW(), attr->windowH(), + attr->strideW(), attr->strideH(), attr->padMode(), attr->padUp(), attr->padDown(), + attr->padLeft(), attr->padRight(), attr->roundMode()); + auto prim_offset = schema::CreatePrimitive(*fbb, schema::PrimitiveType_Pooling, val_offset.o); + fbb->Finish(prim_offset); + return RET_OK; +} #endif @@ -165,10 +192,18 @@ int Pooling::InferShape(std::vector inputs_, std::vector(input_h) / static_cast(GetStrideH())); auto pad_h_all = ((output_h - 1) * GetStrideH() + (window_h - 1) + 1 - input_h); auto pad_w_all = ((output_w - 1) * GetStrideW() + (window_w - 1) + 1 - input_w); - pad_u_ = pad_h_all / 2; - pad_d_ = pad_h_all - pad_u_; - pad_l_ = pad_w_all / 2; - pad_r_ = pad_w_all - pad_l_; + if (pad_h_all < 0) { + pad_u_ = pad_d_ = 0; + } else { + pad_u_ = pad_h_all / 2; + pad_d_ = pad_h_all - pad_u_; + } + if (pad_w_all < 0) { + pad_l_ = pad_r_ = 0; + } else { + pad_l_ = pad_w_all / 2; + pad_r_ = pad_w_all - pad_l_; + } } else { auto round_mode = (schema::RoundMode)GetRoundMode(); if (round_mode == schema::RoundMode_FLOOR) { diff --git a/mindspore/lite/src/ops/pooling.h b/mindspore/lite/src/ops/pooling.h index eeac0e3b96..6892d5b95e 100644 --- a/mindspore/lite/src/ops/pooling.h +++ b/mindspore/lite/src/ops/pooling.h @@ -28,10 +28,28 @@ namespace lite { class Pooling : public PrimitiveC { public: #ifdef PRIMITIVE_WRITEABLE + MS_DECLARE_PARENT(Pooling, PrimitiveC); Pooling() = default; explicit Pooling(schema::PrimitiveT *primitive) : PrimitiveC(primitive) {} + void SetFormat(int format); + void SetPoolingMode(int pooling_mode); + void SetGlobal(bool global); + void SetWindowW(int window_w); + void SetWindowH(int window_h); + void SetStrideW(int stride_w); + void SetStrideH(int stride_h); + void SetPadMode(int pad_mode); + void SetPadUp(int pad_up); + void SetPadDown(int pad_down); + void SetPadLeft(int pad_left); + void SetPadRight(int pad_right); + void SetRoundMode(int round_mode); + void SetActivationType(int activation_type); + int UnPackAttr(const Primitive &prim, const std::vector &inputs) override; #else - explicit Pooling(schema::Primitive *primitive) : PrimitiveC(primitive) {} + Pooling() = default; + + int UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) override; #endif int InferShape(std::vector inputs_, std::vector outputs_) override; int GetFormat() const; @@ -47,33 +65,19 @@ class Pooling : public PrimitiveC { int GetPadLeft() const; int GetPadRight() const; int GetRoundMode() const; - void SetFormat(int format); - void SetPoolingMode(int pooling_mode); - void SetGlobal(bool global); - void SetWindowW(int window_w); - void SetWindowH(int window_h); - void SetStrideW(int stride_w); - void SetStrideH(int stride_h); - void SetPadMode(int pad_mode); - void SetPadUp(int pad_up); 
- void SetPadDown(int pad_down); - void SetPadLeft(int pad_left); - void SetPadRight(int pad_right); - void SetRoundMode(int round_mode); + int GetActivationType() const; int PadUp() const; int PadDown() const; int PadLeft() const; int PadRight() const; - int UnPackAttr(const Primitive &prim, const std::vector &inputs); - protected: int pad_u_ = 0; int pad_d_ = 0; int pad_l_ = 0; int pad_r_ = 0; -}; +}; // namespace lite } // namespace lite } // namespace mindspore diff --git a/mindspore/lite/src/ops/pooling_grad.cc b/mindspore/lite/src/ops/pooling_grad.cc index f4e28d9f9a..654a3cb047 100644 --- a/mindspore/lite/src/ops/pooling_grad.cc +++ b/mindspore/lite/src/ops/pooling_grad.cc @@ -69,19 +69,22 @@ int PoolingGrad::GetPadLeft() const { return this->primitive_->value_as_PoolingG int PoolingGrad::GetPadRight() const { return this->primitive_->value_as_PoolingGrad()->padRight(); } int PoolingGrad::GetRoundMode() const { return this->primitive_->value_as_PoolingGrad()->roundMode(); } -void PoolingGrad::SetFormat(int format) {} -void PoolingGrad::SetPoolingMode(int pooling_mode) {} -void PoolingGrad::SetGlobal(bool global) {} -void PoolingGrad::SetWindowW(int window_w) {} -void PoolingGrad::SetWindowH(int window_h) {} -void PoolingGrad::SetStrideW(int stride_w) {} -void PoolingGrad::SetStrideH(int stride_h) {} -void PoolingGrad::SetPadMode(int pad_mode) {} -void PoolingGrad::SetPadUp(int pad_up) {} -void PoolingGrad::SetPadDown(int pad_down) {} -void PoolingGrad::SetPadLeft(int pad_left) {} -void PoolingGrad::SetPadRight(int pad_right) {} -void PoolingGrad::SetRoundMode(int round_mode) {} +int PoolingGrad::UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) { + MS_ASSERT(nullptr != primitive); + MS_ASSERT(nullptr != fbb); + auto attr = primitive->value_as_PoolingGrad(); + if (attr == nullptr) { + MS_LOG(ERROR) << "value_as_PoolingGrad return nullptr"; + return RET_ERROR; + } + auto val_offset = + schema::CreatePoolingGrad(*fbb, attr->format(), attr->poolingMode(), attr->global(), attr->windowW(), + attr->windowH(), attr->strideW(), attr->strideH(), attr->padMode(), attr->padUp(), + attr->padDown(), attr->padLeft(), attr->padRight(), attr->roundMode()); + auto prim_offset = schema::CreatePrimitive(*fbb, schema::PrimitiveType_PoolingGrad, val_offset.o); + fbb->Finish(prim_offset); + return RET_OK; +} #endif } // namespace lite } // namespace mindspore diff --git a/mindspore/lite/src/ops/pooling_grad.h b/mindspore/lite/src/ops/pooling_grad.h index 490bd7ddcb..c42c5d72d6 100644 --- a/mindspore/lite/src/ops/pooling_grad.h +++ b/mindspore/lite/src/ops/pooling_grad.h @@ -28,10 +28,26 @@ namespace lite { class PoolingGrad : public PrimitiveC { public: #ifdef PRIMITIVE_WRITEABLE + MS_DECLARE_PARENT(PoolingGrad, PrimitiveC); PoolingGrad() = default; explicit PoolingGrad(schema::PrimitiveT *primitive) : PrimitiveC(primitive) {} + void SetFormat(int format); + void SetPoolingMode(int pooling_mode); + void SetGlobal(bool global); + void SetWindowW(int window_w); + void SetWindowH(int window_h); + void SetStrideW(int stride_w); + void SetStrideH(int stride_h); + void SetPadMode(int pad_mode); + void SetPadUp(int pad_up); + void SetPadDown(int pad_down); + void SetPadLeft(int pad_left); + void SetPadRight(int pad_right); + void SetRoundMode(int round_mode); #else - explicit PoolingGrad(schema::Primitive *primitive) : PrimitiveC(primitive) {} + PoolingGrad() = default; + + int UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) 
override; #endif int GetFormat() const; int GetPoolingMode() const; @@ -46,19 +62,6 @@ class PoolingGrad : public PrimitiveC { int GetPadLeft() const; int GetPadRight() const; int GetRoundMode() const; - void SetFormat(int format); - void SetPoolingMode(int pooling_mode); - void SetGlobal(bool global); - void SetWindowW(int window_w); - void SetWindowH(int window_h); - void SetStrideW(int stride_w); - void SetStrideH(int stride_h); - void SetPadMode(int pad_mode); - void SetPadUp(int pad_up); - void SetPadDown(int pad_down); - void SetPadLeft(int pad_left); - void SetPadRight(int pad_right); - void SetRoundMode(int round_mode); }; } // namespace lite } // namespace mindspore diff --git a/mindspore/lite/src/ops/power.cc b/mindspore/lite/src/ops/power.cc index ff485c0a71..d812ed1d1f 100644 --- a/mindspore/lite/src/ops/power.cc +++ b/mindspore/lite/src/ops/power.cc @@ -32,10 +32,19 @@ void Power::SetShift(float shift) { this->primitive_->value.AsPower()->shift = s float Power::GetPower() const { return this->primitive_->value_as_Power()->power(); } float Power::GetScale() const { return this->primitive_->value_as_Power()->scale(); } float Power::GetShift() const { return this->primitive_->value_as_Power()->shift(); } - -void Power::SetPower(float power) {} -void Power::SetScale(float scale) {} -void Power::SetShift(float shift) {} +int Power::UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) { + MS_ASSERT(nullptr != primitive); + MS_ASSERT(nullptr != fbb); + auto attr = primitive->value_as_Power(); + if (attr == nullptr) { + MS_LOG(ERROR) << "value_as_Power return nullptr"; + return RET_ERROR; + } + auto val_offset = schema::CreatePower(*fbb, attr->power(), attr->scale(), attr->shift()); + auto prim_offset = schema::CreatePrimitive(*fbb, schema::PrimitiveType_Power, val_offset.o); + fbb->Finish(prim_offset); + return RET_OK; +} #endif int Power::InferShape(std::vector inputs, std::vector outputs) { diff --git a/mindspore/lite/src/ops/power.h b/mindspore/lite/src/ops/power.h index 764da4028c..b38dce1bdd 100644 --- a/mindspore/lite/src/ops/power.h +++ b/mindspore/lite/src/ops/power.h @@ -28,18 +28,21 @@ namespace lite { class Power : public PrimitiveC { public: #ifdef PRIMITIVE_WRITEABLE + MS_DECLARE_PARENT(Power, PrimitiveC); Power() = default; explicit Power(schema::PrimitiveT *primitive) : PrimitiveC(primitive) {} + void SetPower(float power); + void SetScale(float scale); + void SetShift(float shift); #else - explicit Power(schema::Primitive *primitive) : PrimitiveC(primitive) {} + Power() = default; + + int UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) override; #endif int InferShape(std::vector inputs_, std::vector outputs_) override; float GetPower() const; float GetScale() const; float GetShift() const; - void SetPower(float power); - void SetScale(float scale); - void SetShift(float shift); }; } // namespace lite } // namespace mindspore diff --git a/mindspore/lite/src/ops/power_grad.cc b/mindspore/lite/src/ops/power_grad.cc index 0e9056f458..ba10623dec 100644 --- a/mindspore/lite/src/ops/power_grad.cc +++ b/mindspore/lite/src/ops/power_grad.cc @@ -33,9 +33,21 @@ float PowerGrad::GetPower() const { return this->primitive_->value_as_PowerGrad( float PowerGrad::GetScale() const { return this->primitive_->value_as_PowerGrad()->scale(); } float PowerGrad::GetShift() const { return this->primitive_->value_as_PowerGrad()->shift(); } -void PowerGrad::SetPower(float power) {} -void 
PowerGrad::SetScale(float scale) {} -void PowerGrad::SetShift(float shift) {} +int PowerGrad::UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) { + MS_ASSERT(nullptr != primitive); + MS_ASSERT(nullptr != fbb); + + auto attr = primitive->value_as_PowerGrad(); + if (attr == nullptr) { + MS_LOG(ERROR) << "value_as_PowerGrad return nullptr"; + return RET_ERROR; + } + + auto val_offset = schema::CreatePowerGrad(*fbb, attr->power(), attr->scale(), attr->shift()); + auto prim_offset = schema::CreatePrimitive(*fbb, schema::PrimitiveType_PowerGrad, val_offset.o); + fbb->Finish(prim_offset); + return RET_OK; +} #endif } // namespace lite } // namespace mindspore diff --git a/mindspore/lite/src/ops/power_grad.h b/mindspore/lite/src/ops/power_grad.h index 3969197bfb..9cb95d696f 100644 --- a/mindspore/lite/src/ops/power_grad.h +++ b/mindspore/lite/src/ops/power_grad.h @@ -28,17 +28,20 @@ namespace lite { class PowerGrad : public PrimitiveC { public: #ifdef PRIMITIVE_WRITEABLE + MS_DECLARE_PARENT(PowerGrad, PrimitiveC); PowerGrad() = default; explicit PowerGrad(schema::PrimitiveT *primitive) : PrimitiveC(primitive) {} + void SetPower(float power); + void SetScale(float scale); + void SetShift(float shift); #else - explicit PowerGrad(schema::Primitive *primitive) : PrimitiveC(primitive) {} + PowerGrad() = default; + + int UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) override; #endif float GetPower() const; float GetScale() const; float GetShift() const; - void SetPower(float power); - void SetScale(float scale); - void SetShift(float shift); }; } // namespace lite } // namespace mindspore diff --git a/mindspore/lite/src/ops/primitive_c.cc b/mindspore/lite/src/ops/primitive_c.cc index 92079026fd..7bc61f531a 100644 --- a/mindspore/lite/src/ops/primitive_c.cc +++ b/mindspore/lite/src/ops/primitive_c.cc @@ -17,6 +17,7 @@ #include "src/ops/primitive_c.h" #include #include "src/ops/space_to_batch.h" +#include "src/ops/space_to_batch_nd.h" #include "src/ops/conv2d.h" #include "src/ops/roi_pooling.h" #include "src/ops/topk.h" @@ -71,8 +72,8 @@ #include "src/ops/gather_nd.h" #include "src/ops/local_response_normalization.h" #include "src/ops/pad.h" -#include "src/ops/prelu.h" -#include "src/ops/caffe_p_relu.h" +#include "src/ops/p_relu.h" +#include "src/ops/leaky_relu.h" #include "src/ops/reverse_sequence.h" #include "src/ops/dedepthwise_conv2d.h" #include "src/ops/depthwise_conv2d.h" @@ -109,6 +110,7 @@ #include "src/ops/round.h" #include "src/ops/unique.h" #include "src/ops/zeros_like.h" +#include "src/ops/return.h" #include "src/ops/where.h" #include "src/ops/scatter_nd.h" #include "src/ops/constant_of_shape.h" @@ -116,13 +118,15 @@ #include "src/ops/make_tuple.h" #include "src/ops/quant.h" #include "src/ops/tuple_get_item.h" +#include "src/ops/l2_norm.h" +#include "src/ops/sparse_to_dense.h" namespace mindspore { namespace lite { #ifdef PRIMITIVE_WRITEABLE schema::PrimitiveT *PrimitiveC::GetPrimitiveT() const { return this->primitive_; } -void PrimitiveC::SetPrimitiveT(schema::PrimitiveT *prim) { this->primitive_ = prim; } +void PrimitiveC::ClearPrimitiveT() { this->primitive_ = nullptr; } void PrimitiveC::SetInputQuantParam(const std::vector> &input_quant_param) { this->input_quant_param_ = input_quant_param; @@ -155,21 +159,21 @@ std::shared_ptr GetReturnPrim() { auto return_primitiveT = new schema::PrimitiveT; return_primitiveT->value.type = schema::PrimitiveType_Return; return_primitiveT->value.value = new 
schema::ReturnT; - return std::make_shared(return_primitiveT); + return std::make_shared(return_primitiveT); } std::shared_ptr GetMakeTuplePrim() { auto make_tuple_primitiveT = new schema::PrimitiveT; make_tuple_primitiveT->value.type = schema::PrimitiveType_MakeTuple; make_tuple_primitiveT->value.value = new schema::MakeTupleT; - return std::make_shared(make_tuple_primitiveT); + return std::make_shared(make_tuple_primitiveT); } std::shared_ptr GetTupleGetItemPrim() { auto tuple_get_item_primitiveT = new schema::PrimitiveT(); tuple_get_item_primitiveT->value.type = schema::PrimitiveType_TupleGetItem; tuple_get_item_primitiveT->value.value = new schema::TupleGetItemT; - return std::make_shared(tuple_get_item_primitiveT); + return std::make_shared(tuple_get_item_primitiveT); } template ::value>> @@ -344,10 +348,10 @@ PrimitiveC *PrimitiveC::UnPackFromSchemaPrimitiveT(mindspore::schema::PrimitiveT return new Minimum(primitive); case schema::PrimitiveType_StridedSlice: return new StridedSlice(primitive); - case schema::PrimitiveType_Prelu: - return new Prelu(primitive); - case schema::PrimitiveType_CaffePReLU: - return new CaffePReLU(primitive); + case schema::PrimitiveType_LeakyReLU: + return new (std::nothrow) LeakyReLU(primitive); + case schema::PrimitiveType_PReLU: + return new (std::nothrow) PReLU(primitive); case schema::PrimitiveType_Round: return new Round(primitive); case schema::PrimitiveType_Reverse: @@ -414,6 +418,8 @@ PrimitiveC *PrimitiveC::UnPackFromSchemaPrimitiveT(mindspore::schema::PrimitiveT return new BatchToSpace(primitive); case schema::PrimitiveType_SpaceToBatch: return new SpaceToBatch(primitive); + case schema::PrimitiveType_SpaceToBatchND: + return new SpaceToBatchND(primitive); case schema::PrimitiveType_BroadcastTo: return new BroadcastTo(primitive); case schema::PrimitiveType_DepthToSpace: @@ -430,6 +436,10 @@ PrimitiveC *PrimitiveC::UnPackFromSchemaPrimitiveT(mindspore::schema::PrimitiveT return new ScatterND(primitive); case schema::PrimitiveType_ConstantOfShape: return new ConstantOfShape(primitive); + case schema::PrimitiveType_L2Norm: + return new L2Norm(primitive); + case schema::PrimitiveType_SparseToDense: + return new SparseToDense(primitive); default: MS_LOG(ERROR) << "Unsupported primitive type in UnPackFromSchemaPrimitiveT : " << schema::EnumNamePrimitiveType(op_type); @@ -438,204 +448,210 @@ PrimitiveC *PrimitiveC::UnPackFromSchemaPrimitiveT(mindspore::schema::PrimitiveT return nullptr; } #else -PrimitiveC *PrimitiveC::UnPackFromSchemaPrimitive(mindspore::schema::Primitive *primitive) { - MS_EXCEPTION_IF_NULL(primitive); +PrimitiveC *PrimitiveC::UnPackFromSchemaPrimitive(const schema::Primitive *primitive) { + MS_ASSERT(primitive); auto op_type = primitive->value_type(); switch (op_type) { case schema::PrimitiveType_SoftMax: - return new SoftMax(const_cast(primitive)); + return NewPrimitiveC(primitive); case schema::PrimitiveType_Activation: - return new Activation(const_cast(primitive)); + return NewPrimitiveC(primitive); case schema::PrimitiveType_Conv2D: - return new Conv2D(const_cast(primitive)); + return NewPrimitiveC(primitive); case schema::PrimitiveType_DeConv2D: - return new DeConv2D(const_cast(primitive)); + return NewPrimitiveC(primitive); case schema::PrimitiveType_Reduce: - return new Reduce(const_cast(primitive)); + return NewPrimitiveC(primitive); case schema::PrimitiveType_Pooling: - return new Pooling(const_cast(primitive)); + return NewPrimitiveC(primitive); case schema::PrimitiveType_ROIPooling: - return new 
ROIPooling(const_cast(primitive)); + return NewPrimitiveC(primitive); case schema::PrimitiveType_DepthwiseConv2D: - return new DepthwiseConv2D(const_cast(primitive)); + return NewPrimitiveC(primitive); case schema::PrimitiveType_FusedBatchNorm: - return new FusedBatchNorm(const_cast(primitive)); + return NewPrimitiveC(primitive); case schema::PrimitiveType_BatchNorm: - return new BatchNorm(const_cast(primitive)); + return NewPrimitiveC(primitive); case schema::PrimitiveType_FullConnection: - return new FullConnection(const_cast(primitive)); + return NewPrimitiveC(primitive); case schema::PrimitiveType_Power: - return new Power(const_cast(primitive)); + return NewPrimitiveC(primitive); case schema::PrimitiveType_Pad: - return new Pad(const_cast(primitive)); + return NewPrimitiveC(primitive); case schema::PrimitiveType_Range: - return new Range(const_cast(primitive)); + return NewPrimitiveC(primitive); case schema::PrimitiveType_Mul: - return new Mul(const_cast(primitive)); + return NewPrimitiveC(primitive); case schema::PrimitiveType_Add: - return new Add(const_cast(primitive)); + return NewPrimitiveC(primitive); case schema::PrimitiveType_Sub: - return new Sub(const_cast(primitive)); + return NewPrimitiveC(primitive); case schema::PrimitiveType_Div: - return new Div(const_cast(primitive)); + return NewPrimitiveC
(primitive); case schema::PrimitiveType_BiasAdd: - return new BiasAdd(const_cast(primitive)); + return NewPrimitiveC(primitive); case schema::PrimitiveType_ExpandDims: - return new ExpandDims(const_cast(primitive)); + return NewPrimitiveC(primitive); case schema::PrimitiveType_ArgMax: - return new ArgMax(const_cast(primitive)); + return NewPrimitiveC(primitive); case schema::PrimitiveType_ArgMin: - return new ArgMin(const_cast(primitive)); + return NewPrimitiveC(primitive); case schema::PrimitiveType_Cast: - return new Cast(const_cast(primitive)); + return NewPrimitiveC(primitive); case schema::PrimitiveType_Reshape: - return new Reshape(const_cast(primitive)); + return NewPrimitiveC(primitive); case schema::PrimitiveType_Scale: - return new Scale(const_cast(primitive)); + return NewPrimitiveC(primitive); case schema::PrimitiveType_Eltwise: - return new Eltwise(const_cast(primitive)); + return NewPrimitiveC(primitive); case schema::PrimitiveType_Ceil: - return new Ceil(const_cast(primitive)); + return NewPrimitiveC(primitive); case schema::PrimitiveType_Concat: - return new Concat(const_cast(primitive)); + return NewPrimitiveC(primitive); case schema::PrimitiveType_Fill: - return new Fill(const_cast(primitive)); + return NewPrimitiveC(primitive); case schema::PrimitiveType_Nhwc2Nchw: - return new Nhwc2Nchw(const_cast(primitive)); + return NewPrimitiveC(primitive); case schema::PrimitiveType_Nchw2Nhwc: - return new Nchw2Nhwc(const_cast(primitive)); + return NewPrimitiveC(primitive); case schema::PrimitiveType_Transpose: - return new Transpose(const_cast(primitive)); + return NewPrimitiveC(primitive); case schema::PrimitiveType_Slice: - return new Slice(const_cast(primitive)); + return NewPrimitiveC(primitive); case schema::PrimitiveType_Squeeze: - return new Squeeze(const_cast(primitive)); + return NewPrimitiveC(primitive); case schema::PrimitiveType_Flatten: - return new Flatten(const_cast(primitive)); + return NewPrimitiveC(primitive); case schema::PrimitiveType_Mean: - return new Mean(const_cast(primitive)); + return NewPrimitiveC(primitive); case schema::PrimitiveType_Stack: - return new Stack(const_cast(primitive)); + return NewPrimitiveC(primitive); case schema::PrimitiveType_Crop: - return new Crop(const_cast(primitive)); + return NewPrimitiveC(primitive); case schema::PrimitiveType_SquaredDifference: - return new SquaredDifference(const_cast(primitive)); + return NewPrimitiveC(primitive); case schema::PrimitiveType_AddN: - return new AddN(const_cast(primitive)); + return NewPrimitiveC(primitive); case schema::PrimitiveType_Abs: - return new Abs(const_cast(primitive)); + return NewPrimitiveC(primitive); case schema::PrimitiveType_Sin: - return new Sin(const_cast(primitive)); + return NewPrimitiveC(primitive); case schema::PrimitiveType_Cos: - return new Cos(const_cast(primitive)); + return NewPrimitiveC(primitive); case schema::PrimitiveType_Log: - return new Log(const_cast(primitive)); + return NewPrimitiveC(primitive); case schema::PrimitiveType_Sqrt: - return new Sqrt(const_cast(primitive)); + return NewPrimitiveC(primitive); case schema::PrimitiveType_Rsqrt: - return new Rsqrt(const_cast(primitive)); + return NewPrimitiveC(primitive); case schema::PrimitiveType_Square: - return new Square(const_cast(primitive)); + return NewPrimitiveC(primitive); case schema::PrimitiveType_Exp: - return new Exp(const_cast(primitive)); + return NewPrimitiveC(primitive); case schema::PrimitiveType_Gather: - return new Gather(const_cast(primitive)); + return NewPrimitiveC(primitive); case 
schema::PrimitiveType_GatherNd: - return new GatherNd(const_cast(primitive)); + return NewPrimitiveC(primitive); case schema::PrimitiveType_LocalResponseNormalization: - return new LocalResponseNormalization(const_cast(primitive)); + return NewPrimitiveC(primitive); case schema::PrimitiveType_Maximum: - return new Maximum(const_cast(primitive)); + return NewPrimitiveC(primitive); case schema::PrimitiveType_Minimum: - return new Minimum(const_cast(primitive)); + return NewPrimitiveC(primitive); case schema::PrimitiveType_StridedSlice: - return new StridedSlice(const_cast(primitive)); - case schema::PrimitiveType_Prelu: - return new Prelu(const_cast(primitive)); - case schema::PrimitiveType_CaffePReLU: - return new CaffePReLU(const_cast(primitive)); + return NewPrimitiveC(primitive); + case schema::PrimitiveType_LeakyReLU: + return NewPrimitiveC(primitive); + case schema::PrimitiveType_PReLU: + return NewPrimitiveC(primitive); case schema::PrimitiveType_Round: - return new Round(const_cast(primitive)); + return NewPrimitiveC(primitive); case schema::PrimitiveType_Reverse: - return new Reverse(const_cast(primitive)); + return NewPrimitiveC(primitive); case schema::PrimitiveType_ReverseSequence: - return new ReverseSequence(const_cast(primitive)); + return NewPrimitiveC(primitive); case schema::PrimitiveType_LogicalAnd: - return new LogicalAnd(const_cast(primitive)); + return NewPrimitiveC(primitive); case schema::PrimitiveType_LogicalOr: - return new LogicalOr(const_cast(primitive)); + return NewPrimitiveC(primitive); case schema::PrimitiveType_LogicalNot: - return new LogicalNot(const_cast(primitive)); + return NewPrimitiveC(primitive); case schema::PrimitiveType_FloorDiv: - return new FloorDiv(const_cast(primitive)); + return NewPrimitiveC(primitive); case schema::PrimitiveType_FloorMod: - return new FloorMod(const_cast(primitive)); + return NewPrimitiveC(primitive); case schema::PrimitiveType_Equal: - return new Equal(const_cast(primitive)); + return NewPrimitiveC(primitive); case schema::PrimitiveType_NotEqual: - return new NotEqual(const_cast(primitive)); + return NewPrimitiveC(primitive); case schema::PrimitiveType_Less: - return new Less(const_cast(primitive)); + return NewPrimitiveC(primitive); case schema::PrimitiveType_LessEqual: - return new LessEqual(const_cast(primitive)); + return NewPrimitiveC(primitive); case schema::PrimitiveType_Greater: - return new Greater(const_cast(primitive)); + return NewPrimitiveC(primitive); case schema::PrimitiveType_GreaterEqual: - return new GreaterEqual(const_cast(primitive)); + return NewPrimitiveC(primitive); case schema::PrimitiveType_Floor: - return new Floor(const_cast(primitive)); + return NewPrimitiveC(primitive); case schema::PrimitiveType_Split: - return new Split(const_cast(primitive)); + return NewPrimitiveC(primitive); case schema::PrimitiveType_OneHot: - return new OneHot(const_cast(primitive)); + return NewPrimitiveC(primitive); case schema::PrimitiveType_PriorBox: - return new PriorBox(const_cast(primitive)); + return NewPrimitiveC(primitive); case schema::PrimitiveType_SpaceToDepth: - return new SpaceToDepth(const_cast(primitive)); + return NewPrimitiveC(primitive); case schema::PrimitiveType_Tile: - return new Tile(const_cast(primitive)); + return NewPrimitiveC(primitive); case schema::PrimitiveType_Resize: - return new Resize(const_cast(primitive)); + return NewPrimitiveC(primitive); case schema::PrimitiveType_Unstack: - return new Unstack(const_cast(primitive)); + return NewPrimitiveC(primitive); case 
schema::PrimitiveType_Unique: - return new Unique(const_cast(primitive)); + return NewPrimitiveC(primitive); case schema::PrimitiveType_TopK: - return new TopK(const_cast(primitive)); + return NewPrimitiveC(primitive); case schema::PrimitiveType_MatMul: - return new MatMul(const_cast(primitive)); + return NewPrimitiveC(primitive); case schema::PrimitiveType_QuantDTypeCast: - return new QuantDTypeCast(const_cast(primitive)); + return NewPrimitiveC(primitive); case schema::PrimitiveType_EmbeddingLookup: - return new EmbeddingLookup(const_cast(primitive)); + return NewPrimitiveC(primitive); case schema::PrimitiveType_Elu: - return new Elu(const_cast(primitive)); + return NewPrimitiveC(primitive); case schema::PrimitiveType_DeDepthwiseConv2D: - return new DeDepthwiseConv2D(const_cast(primitive)); + return NewPrimitiveC(primitive); case schema::PrimitiveType_Shape: - return new Shape(const_cast(primitive)); + return NewPrimitiveC(primitive); case schema::PrimitiveType_Unsqueeze: - return new Unsqueeze(const_cast(primitive)); + return NewPrimitiveC(primitive); case schema::PrimitiveType_BatchToSpace: - return new BatchToSpace(const_cast(primitive)); + return NewPrimitiveC(primitive); case schema::PrimitiveType_SpaceToBatch: - return new SpaceToBatch(const_cast(primitive)); + return NewPrimitiveC(primitive); + case schema::PrimitiveType_SpaceToBatchND: + return NewPrimitiveC(primitive); case schema::PrimitiveType_BroadcastTo: - return new BroadcastTo(const_cast(primitive)); + return NewPrimitiveC(primitive); case schema::PrimitiveType_DepthToSpace: - return new DepthToSpace(const_cast(primitive)); + return NewPrimitiveC(primitive); case schema::PrimitiveType_Lstm: - return new Lstm(const_cast(primitive)); + return NewPrimitiveC(primitive); case schema::PrimitiveType_ZerosLike: - return new ZerosLike(const_cast(primitive)); + return NewPrimitiveC(primitive); case schema::PrimitiveType_MakeTuple: - return new MakeTuple(const_cast(primitive)); + return NewPrimitiveC(primitive); case schema::PrimitiveType_Where: - return new Where(const_cast(primitive)); + return NewPrimitiveC(primitive); case schema::PrimitiveType_ScatterND: - return new ScatterND(const_cast(primitive)); + return NewPrimitiveC(primitive); case schema::PrimitiveType_ConstantOfShape: - return new ConstantOfShape(const_cast(primitive)); + return NewPrimitiveC(primitive); + case schema::PrimitiveType_L2Norm: + return NewPrimitiveC(primitive); + case schema::PrimitiveType_SparseToDense: + return NewPrimitiveC(primitive); default: MS_LOG(ERROR) << "Unsupported primitive type in UnPackFromSchemaPrimitive : " << schema::EnumNamePrimitiveType(op_type); @@ -646,6 +662,9 @@ PrimitiveC *PrimitiveC::UnPackFromSchemaPrimitive(mindspore::schema::Primitive * #endif int PrimitiveC::Type() const { + if (this->primitive_ == nullptr) { + return schema::PrimitiveType_NONE; + } #ifdef PRIMITIVE_WRITEABLE return this->primitive_->value.type; #else diff --git a/mindspore/lite/src/ops/primitive_c.h b/mindspore/lite/src/ops/primitive_c.h index 86f7c3302f..d13f5c31a2 100644 --- a/mindspore/lite/src/ops/primitive_c.h +++ b/mindspore/lite/src/ops/primitive_c.h @@ -46,14 +46,14 @@ constexpr int kAnfPopulaterTwo = 2; constexpr int kAnfPopulaterThree = 3; class PrimitiveC : public mindspore::Primitive { public: - // Argument primitive is delived into PrimitiveC and will be deleted in ~PrimitiveC(). Caller should not delete - // primitive + // Argument primitive is deliverd into PrimitiveC and will be deleted in ~PrimitiveC(). 
+  // Caller should not delete primitive.
   explicit PrimitiveC(schema::PrimitiveT *primitive) : Primitive(""), primitive_(primitive) {}
 
   explicit PrimitiveC(const Primitive &prim) : Primitive(prim) {}
 
-  // Argument primitive is delived into PrimitiveC and will be deleted in ~PrimitiveC(). Caller should not delete
-  // primitive
+  // Argument primitive is delivered into PrimitiveC and will be deleted in ~PrimitiveC().
+  // Caller should not delete primitive.
   explicit PrimitiveC(const std::string &name, schema::PrimitiveT *primitive)
       : Primitive(name), primitive_(primitive) {}
 
@@ -61,15 +61,13 @@ class PrimitiveC : public mindspore::Primitive {
 
   MS_DECLARE_PARENT(PrimitiveC, Primitive);
 
-  ~PrimitiveC() override {
-    // delete this->primitive_;
-  }
+  ~PrimitiveC() override { delete this->primitive_; }
 
   int Type() const;
 
   schema::PrimitiveT *GetPrimitiveT() const;
 
-  void SetPrimitiveT(schema::PrimitiveT *prim);
+  void ClearPrimitiveT();
 
   bool operator==(const Value &rhs) const {
     if (rhs.isa<PrimitiveC>()) {
@@ -115,7 +113,7 @@
   static std::shared_ptr<PrimitiveC> UnPackFromPrimitive(const Primitive &prim, const std::vector<AnfNodePtr> &inputs);
 
  protected:
-  virtual int UnPackAttr(const Primitive &prim) { return RET_ERROR; }
+  virtual int UnPackAttr(const Primitive &prim, const std::vector<AnfNodePtr> &inputs) { return RET_ERROR; }
 
  protected:
   schema::PrimitiveT *primitive_ = nullptr;
@@ -135,15 +133,9 @@
 class PrimitiveC {
  public:
   PrimitiveC() = default;
 
-  // Argument primitive is delived into PrimitiveC and will be deleted in ~PrimitiveC(). Caller should not delete
-  // primitive
-  explicit PrimitiveC(schema::Primitive *primitive) : primitive_(primitive) {}
-
-  virtual ~PrimitiveC() {
-    // delete this->primitive_;
-  }
+  virtual ~PrimitiveC() { free(this->primitive_buf_); }
 
-  static PrimitiveC *UnPackFromSchemaPrimitive(mindspore::schema::Primitive *primitive);
+  static PrimitiveC *UnPackFromSchemaPrimitive(const schema::Primitive *primitive);
 
   bool GetInferFlag() const;
 
@@ -154,7 +146,53 @@
   int Type() const;
 
  protected:
-  schema::Primitive *primitive_ = nullptr;
+  template <typename T, typename = std::enable_if_t<std::is_base_of<PrimitiveC, T>::value>>
+  static PrimitiveC *NewPrimitiveC(const schema::Primitive *primitive) {
+    auto primc = new (std::nothrow) T();
+    if (primc == nullptr) {
+      MS_LOG(ERROR) << "new PrimitiveC failed";
+      return nullptr;
+    }
+    auto ret = primc->UnPackSchemaPrimitive(primitive);
+    if (ret != RET_OK) {
+      MS_LOG(ERROR) << "UnPackSchemaPrimitive failed";
+      delete primc;
+      return nullptr;
+    }
+    return primc;
+  }
+
+  int UnPackSchemaPrimitive(const schema::Primitive *primitive) {
+    flatbuffers::FlatBufferBuilder fbb(1024);
+    if (UnPackToFlatBuilder(primitive, &fbb) != RET_OK) {
+      MS_LOG(ERROR) << "UnPackToFlatBuilder failed";
+      fbb.Clear();
+      return RET_ERROR;
+    }
+    auto buf = fbb.GetBufferPointer();
+    if (buf == nullptr) {
+      MS_LOG(ERROR) << "GetBufferPointer return nullptr";
+      fbb.Clear();
+      return RET_ERROR;
+    }
+    primitive_buf_ = reinterpret_cast<char *>(malloc(fbb.GetSize()));
+    if (primitive_buf_ == nullptr) {
+      MS_LOG(ERROR) << "malloc primitive_buf_ failed";
+      fbb.Clear();
+      return RET_ERROR;
+    }
+    memcpy(primitive_buf_, buf, fbb.GetSize());
+    this->primitive_ = flatbuffers::GetRoot<schema::Primitive>(primitive_buf_);
+    fbb.Clear();
+    return RET_OK;
+  }
+
+  virtual int UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) {
+    return RET_ERROR;
+  }
+
+ protected:
+  const schema::Primitive *primitive_ = nullptr;
+  char *primitive_buf_ = nullptr;
   bool infer_flag_ = true;
 };
 #endif
diff --git a/mindspore/lite/src/ops/prior_box.cc
b/mindspore/lite/src/ops/prior_box.cc index 3bad25e0d8..1bd60c151e 100644 --- a/mindspore/lite/src/ops/prior_box.cc +++ b/mindspore/lite/src/ops/prior_box.cc @@ -77,17 +77,43 @@ bool PriorBox::GetClip() const { return this->primitive_->value_as_PriorBox()->c bool PriorBox::GetFlip() const { return this->primitive_->value_as_PriorBox()->flip(); } float PriorBox::GetOffset() const { return this->primitive_->value_as_PriorBox()->offset(); } -void PriorBox::SetMinSizes(const std::vector &min_sizes) {} -void PriorBox::SetMaxSizes(const std::vector &max_sizes) {} -void PriorBox::SetAspectRatios(const std::vector &aspect_ratios) {} -void PriorBox::SetVariances(const std::vector &variances) {} -void PriorBox::SetImageSizeW(int image_size_w) {} -void PriorBox::SetImageSizeH(int image_size_h) {} -void PriorBox::SetStepW(float step_w) {} -void PriorBox::SetStepH(float step_h) {} -void PriorBox::SetClip(bool clip) {} -void PriorBox::SetFlip(bool flip) {} -void PriorBox::SetOffset(float offset) {} +int PriorBox::UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) { + MS_ASSERT(nullptr != primitive); + MS_ASSERT(nullptr != fbb); + auto attr = primitive->value_as_PriorBox(); + if (attr == nullptr) { + MS_LOG(ERROR) << "value_as_PriorBox return nullptr"; + return RET_ERROR; + } + std::vector min_sizes; + if (attr->min_sizes() != nullptr) { + for (int i = 0; i < static_cast(attr->min_sizes()->size()); i++) { + min_sizes.push_back(attr->min_sizes()->data()[i]); + } + } + std::vector max_sizes; + if (attr->max_sizes() != nullptr) { + for (int i = 0; i < static_cast(attr->max_sizes()->size()); i++) { + max_sizes.push_back(attr->max_sizes()->data()[i]); + } + } + std::vector aspect_ratios; + if (attr->aspect_ratios() != nullptr) { + for (int i = 0; i < static_cast(attr->aspect_ratios()->size()); i++) { + aspect_ratios.push_back(attr->aspect_ratios()->data()[i]); + } + } + std::vector variances; + if (attr->variances() != nullptr) { + for (int i = 0; i < static_cast(attr->variances()->size()); i++) { + variances.push_back(attr->variances()->data()[i]); + } + } + auto val_offset = schema::CreatePriorBoxDirect(*fbb, &min_sizes, &max_sizes, &aspect_ratios, &variances); + auto prim_offset = schema::CreatePrimitive(*fbb, schema::PrimitiveType_PriorBox, val_offset.o); + fbb->Finish(prim_offset); + return RET_OK; +} #endif namespace { diff --git a/mindspore/lite/src/ops/prior_box.h b/mindspore/lite/src/ops/prior_box.h index 6802a74479..d6f105a31c 100644 --- a/mindspore/lite/src/ops/prior_box.h +++ b/mindspore/lite/src/ops/prior_box.h @@ -20,6 +20,7 @@ #include #include #include +#include #include "ir/dtype/type_id.h" #include "src/ops/primitive_c.h" @@ -28,10 +29,24 @@ namespace lite { class PriorBox : public PrimitiveC { public: #ifdef PRIMITIVE_WRITEABLE + MS_DECLARE_PARENT(PriorBox, PrimitiveC); PriorBox() = default; explicit PriorBox(schema::PrimitiveT *primitive) : PrimitiveC(primitive) {} + void SetMinSizes(const std::vector &min_sizes); + void SetMaxSizes(const std::vector &max_sizes); + void SetAspectRatios(const std::vector &aspect_ratios); + void SetVariances(const std::vector &variances); + void SetImageSizeW(int image_size_w); + void SetImageSizeH(int image_size_h); + void SetStepW(float step_w); + void SetStepH(float step_h); + void SetClip(bool clip); + void SetFlip(bool flip); + void SetOffset(float offset); #else - explicit PriorBox(schema::Primitive *primitive) : PrimitiveC(primitive) {} + PriorBox() = default; + + int UnPackToFlatBuilder(const 
schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) override; #endif int InferShape(std::vector inputs_, std::vector outputs_) override; std::vector GetMinSizes() const; @@ -45,17 +60,6 @@ class PriorBox : public PrimitiveC { bool GetClip() const; bool GetFlip() const; float GetOffset() const; - void SetMinSizes(const std::vector &min_sizes); - void SetMaxSizes(const std::vector &max_sizes); - void SetAspectRatios(const std::vector &aspect_ratios); - void SetVariances(const std::vector &variances); - void SetImageSizeW(int image_size_w); - void SetImageSizeH(int image_size_h); - void SetStepW(float step_w); - void SetStepH(float step_h); - void SetClip(bool clip); - void SetFlip(bool flip); - void SetOffset(float offset); }; } // namespace lite } // namespace mindspore diff --git a/mindspore/lite/src/ops/quant.cc b/mindspore/lite/src/ops/quant.cc index ff824ad605..3e9500ce1a 100644 --- a/mindspore/lite/src/ops/quant.cc +++ b/mindspore/lite/src/ops/quant.cc @@ -22,11 +22,30 @@ namespace mindspore { namespace lite { #ifdef PRIMITIVE_WRITEABLE int Quant::UnPackAttr(const Primitive &prim, const std::vector &inputs) { - this->primitive_ = new (schema::PrimitiveT); - auto attr = std::make_unique(); - this->primitive_->value.type = schema::PrimitiveType_OnnxInt8Quantize; - this->primitive_->value.value = attr.release(); - + if (this->primitive_ == nullptr) { + this->primitive_ = new (std::nothrow) schema::PrimitiveT; + if (this->primitive_ == nullptr) { + MS_LOG(ERROR) << "new primitiveT failed"; + return RET_ERROR; + } + this->primitive_->value.type = schema::PrimitiveType_OnnxInt8Quantize; + } + if (this->primitive_->value.type != schema::PrimitiveType_OnnxInt8Quantize) { + MS_LOG(ERROR) << "Primitive type is error :" << this->primitive_->value.type; + return RET_ERROR; + } + if (this->primitive_->value.value == nullptr) { + auto attr = new (std::nothrow) schema::OnnxInt8QuantizeT(); + if (attr == nullptr) { + MS_LOG(ERROR) << "new primitiveT value failed"; + return RET_ERROR; + } + this->primitive_->value.value = attr; + if (this->primitive_->value.value == nullptr) { + MS_LOG(ERROR) << "primitive value is nullptr"; + return RET_ERROR; + } + } return RET_OK; } #endif diff --git a/mindspore/lite/src/ops/quant.h b/mindspore/lite/src/ops/quant.h index 6ba178e9fa..8717585aab 100644 --- a/mindspore/lite/src/ops/quant.h +++ b/mindspore/lite/src/ops/quant.h @@ -24,11 +24,12 @@ namespace lite { class Quant : public PrimitiveC { public: #ifdef PRIMITIVE_WRITEABLE + MS_DECLARE_PARENT(Quant, PrimitiveC); Quant() = default; explicit Quant(schema::PrimitiveT *primitive) : PrimitiveC(primitive) {} - int UnPackAttr(const Primitive &prim, const std::vector &inputs); + int UnPackAttr(const Primitive &prim, const std::vector &inputs) override; #else - explicit Quant(schema::Primitive *primitive) : PrimitiveC(primitive) {} + Quant() = default; #endif }; } // namespace lite diff --git a/mindspore/lite/src/ops/quant_dtype_cast.cc b/mindspore/lite/src/ops/quant_dtype_cast.cc index c29ee9a95f..a3adb9a5c4 100644 --- a/mindspore/lite/src/ops/quant_dtype_cast.cc +++ b/mindspore/lite/src/ops/quant_dtype_cast.cc @@ -29,9 +29,19 @@ void QuantDTypeCast::SetDstT(int dst_t) { this->primitive_->value.AsQuantDTypeCa int QuantDTypeCast::GetSrcT() const { return this->primitive_->value_as_QuantDTypeCast()->srcT(); } int QuantDTypeCast::GetDstT() const { return this->primitive_->value_as_QuantDTypeCast()->dstT(); } - -void QuantDTypeCast::SetSrcT(int src_t) {} -void QuantDTypeCast::SetDstT(int dst_t) {} +int 
QuantDTypeCast::UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) { + MS_ASSERT(nullptr != primitive); + MS_ASSERT(nullptr != fbb); + auto attr = primitive->value_as_QuantDTypeCast(); + if (attr == nullptr) { + MS_LOG(ERROR) << "value_as_QuantDTypeCast return nullptr"; + return RET_ERROR; + } + auto val_offset = schema::CreateQuantDTypeCast(*fbb, attr->srcT(), attr->dstT()); + auto prim_offset = schema::CreatePrimitive(*fbb, schema::PrimitiveType_QuantDTypeCast, val_offset.o); + fbb->Finish(prim_offset); + return RET_OK; +} #endif int QuantDTypeCast::InferShape(std::vector inputs_, std::vector outputs_) { diff --git a/mindspore/lite/src/ops/quant_dtype_cast.h b/mindspore/lite/src/ops/quant_dtype_cast.h index 718019d1e3..0523272982 100644 --- a/mindspore/lite/src/ops/quant_dtype_cast.h +++ b/mindspore/lite/src/ops/quant_dtype_cast.h @@ -28,17 +28,19 @@ namespace lite { class QuantDTypeCast : public PrimitiveC { public: #ifdef PRIMITIVE_WRITEABLE + MS_DECLARE_PARENT(QuantDTypeCast, PrimitiveC); QuantDTypeCast() = default; explicit QuantDTypeCast(schema::PrimitiveT *primitive) : PrimitiveC(primitive) {} - MS_DECLARE_PARENT(QuantDTypeCast, PrimitiveC); + void SetSrcT(int src_t); + void SetDstT(int dst_t); #else - explicit QuantDTypeCast(schema::Primitive *primitive) : PrimitiveC(primitive) {} + QuantDTypeCast() = default; + + int UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) override; #endif int InferShape(std::vector inputs_, std::vector outputs_) override; int GetSrcT() const; int GetDstT() const; - void SetSrcT(int src_t); - void SetDstT(int dst_t); }; } // namespace lite } // namespace mindspore diff --git a/mindspore/lite/src/ops/range.cc b/mindspore/lite/src/ops/range.cc index 9c5816b337..75afe4efb2 100644 --- a/mindspore/lite/src/ops/range.cc +++ b/mindspore/lite/src/ops/range.cc @@ -35,11 +35,19 @@ int Range::GetDType() const { return this->primitive_->value_as_Range()->dType() int Range::GetStart() const { return this->primitive_->value_as_Range()->start(); } int Range::GetLimit() const { return this->primitive_->value_as_Range()->limit(); } int Range::GetDelta() const { return this->primitive_->value_as_Range()->delta(); } - -void Range::SetDType(int d_type) {} -void Range::SetStart(int start) {} -void Range::SetLimit(int limit) {} -void Range::SetDelta(int delta) {} +int Range::UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) { + MS_ASSERT(nullptr != primitive); + MS_ASSERT(nullptr != fbb); + auto attr = primitive->value_as_Range(); + if (attr == nullptr) { + MS_LOG(ERROR) << "value_as_Range return nullptr"; + return RET_ERROR; + } + auto val_offset = schema::CreateRange(*fbb, attr->dType(), attr->start(), attr->limit(), attr->delta()); + auto prim_offset = schema::CreatePrimitive(*fbb, schema::PrimitiveType_Range, val_offset.o); + fbb->Finish(prim_offset); + return RET_OK; +} #endif int Range::InferShape(std::vector inputs_, std::vector outputs_) { diff --git a/mindspore/lite/src/ops/range.h b/mindspore/lite/src/ops/range.h index d1e5a13c1e..4f0a432462 100644 --- a/mindspore/lite/src/ops/range.h +++ b/mindspore/lite/src/ops/range.h @@ -28,20 +28,23 @@ namespace lite { class Range : public PrimitiveC { public: #ifdef PRIMITIVE_WRITEABLE + MS_DECLARE_PARENT(Range, PrimitiveC); Range() = default; explicit Range(schema::PrimitiveT *primitive) : PrimitiveC(primitive) {} + void SetDType(int d_type); + void SetStart(int start); + void SetLimit(int limit); + 
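For ops whose attributes are plain scalars, such as QuantDTypeCast above and Range, Scale, SoftMax, and SpaceToDepth later in this patch, the rebuild needs no intermediate vectors: the generated Create call simply forwards each getter. A stand-in round trip (plain structs, not the flatbuffers-generated API):

#include <cassert>

// Stand-in for the read-only table and the object it is rebuilt as.
struct QuantDTypeCastAttr { int srcT; int dstT; };

// Mirrors CreateQuantDTypeCast(*fbb, attr->srcT(), attr->dstT()): scalar
// fields are forwarded one-to-one, so the round trip preserves them exactly.
QuantDTypeCastAttr Repack(const QuantDTypeCastAttr &attr) {
  return QuantDTypeCastAttr{attr.srcT, attr.dstT};
}

int main() {
  QuantDTypeCastAttr attr{/*srcT=*/43, /*dstT=*/32};  // assumed TypeId values
  QuantDTypeCastAttr copy = Repack(attr);
  assert(copy.srcT == attr.srcT && copy.dstT == attr.dstT);
  return 0;
}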
void SetDelta(int delta); #else - explicit Range(schema::Primitive *primitive) : PrimitiveC(primitive) {} + Range() = default; + + int UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) override; #endif int InferShape(std::vector inputs_, std::vector outputs_) override; int GetDType() const; int GetStart() const; int GetLimit() const; int GetDelta() const; - void SetDType(int d_type); - void SetStart(int start); - void SetLimit(int limit); - void SetDelta(int delta); }; } // namespace lite } // namespace mindspore diff --git a/mindspore/lite/src/ops/rank.cc b/mindspore/lite/src/ops/rank.cc index 5a89c68178..1c95012d95 100644 --- a/mindspore/lite/src/ops/rank.cc +++ b/mindspore/lite/src/ops/rank.cc @@ -18,7 +18,17 @@ namespace mindspore { namespace lite { - +#ifdef PRIMITIVE_WRITEABLE +#else +int Rank::UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) { + MS_ASSERT(nullptr != primitive); + MS_ASSERT(nullptr != fbb); + auto val_offset = schema::CreateRank(*fbb); + auto prim_offset = schema::CreatePrimitive(*fbb, schema::PrimitiveType_Rank, val_offset.o); + fbb->Finish(prim_offset); + return RET_OK; +} +#endif int Rank::InferShape(std::vector inputs_, std::vector outputs_) { MS_ASSERT(this->primitive_ != nullptr); auto input = inputs_.front(); diff --git a/mindspore/lite/src/ops/rank.h b/mindspore/lite/src/ops/rank.h index f2f39c2598..5251247e1f 100644 --- a/mindspore/lite/src/ops/rank.h +++ b/mindspore/lite/src/ops/rank.h @@ -28,10 +28,13 @@ namespace lite { class Rank : public PrimitiveC { public: #ifdef PRIMITIVE_WRITEABLE + MS_DECLARE_PARENT(Rank, PrimitiveC); Rank() = default; explicit Rank(schema::PrimitiveT *primitive) : PrimitiveC(primitive) {} #else - explicit Rank(schema::Primitive *primitive) : PrimitiveC(primitive) {} + Rank() = default; + + int UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) override; #endif int InferShape(std::vector inputs_, std::vector outputs_) override; }; diff --git a/mindspore/lite/src/ops/reduce.cc b/mindspore/lite/src/ops/reduce.cc index a0def40139..7ec73d67f1 100644 --- a/mindspore/lite/src/ops/reduce.cc +++ b/mindspore/lite/src/ops/reduce.cc @@ -29,34 +29,52 @@ void Reduce::SetKeepDims(int keep_dims) { this->primitive_->value.AsReduce()->ke void Reduce::SetMode(int mode) { this->primitive_->value.AsReduce()->mode = (schema::ReduceMode)mode; } int Reduce::UnPackAttr(const Primitive &prim, const std::vector &inputs) { - this->primitive_ = new (schema::PrimitiveT); - auto attr = std::make_unique(); - attr->mode = schema::ReduceMode_ReduceMean; + if (this->primitive_ == nullptr) { + this->primitive_ = new (std::nothrow) schema::PrimitiveT; + if (this->primitive_ == nullptr) { + MS_LOG(ERROR) << "new primitiveT failed"; + return RET_ERROR; + } + this->primitive_->value.type = schema::PrimitiveType_Reduce; + } + if (this->primitive_->value.type != schema::PrimitiveType_Reduce) { + MS_LOG(ERROR) << "Primitive type is error :" << this->primitive_->value.type; + return RET_ERROR; + } + if (this->primitive_->value.value == nullptr) { + auto attr = new (std::nothrow) schema::ReduceT(); + if (attr == nullptr) { + MS_LOG(ERROR) << "new primitiveT value failed"; + return RET_ERROR; + } + attr->mode = schema::ReduceMode_ReduceMean; - attr->keepDims = GetValue(prim.GetAttr("keep_dims")); - if (inputs.size() == kAnfPopulaterTwo) { - auto inputNode = inputs[kAnfPopulaterOne]; - MS_ASSERT(inputNode != nullptr); - if (inputNode->isa()) { - auto 
valueNode = inputNode->cast(); - MS_ASSERT(valueNode != nullptr); - auto value = valueNode->value(); - MS_ASSERT(value != nullptr); - if (value->isa()) { - auto valTuplPtr = dyn_cast(value); - MS_ASSERT(valTuplPtr != nullptr); - for (size_t i = 0; i < valTuplPtr->size(); i++) { - auto elem = dyn_cast((*valTuplPtr)[i]); - MS_ASSERT(elem != nullptr); - attr->axes.emplace_back(elem->value()); + attr->keepDims = GetValue(prim.GetAttr("keep_dims")); + if (inputs.size() == kAnfPopulaterTwo) { + auto inputNode = inputs[kAnfPopulaterOne]; + MS_ASSERT(inputNode != nullptr); + if (inputNode->isa()) { + auto valueNode = inputNode->cast(); + MS_ASSERT(valueNode != nullptr); + auto value = valueNode->value(); + MS_ASSERT(value != nullptr); + if (value->isa()) { + auto valTuplPtr = dyn_cast(value); + MS_ASSERT(valTuplPtr != nullptr); + for (size_t i = 0; i < valTuplPtr->size(); i++) { + auto elem = dyn_cast((*valTuplPtr)[i]); + MS_ASSERT(elem != nullptr); + attr->axes.emplace_back(elem->value()); + } } } } + this->primitive_->value.value = attr; + if (this->primitive_->value.value == nullptr) { + MS_LOG(ERROR) << "primitive value is nullptr"; + return RET_ERROR; + } } - - this->primitive_->value.type = schema::PrimitiveType_Reduce; - this->primitive_->value.value = attr.release(); - return RET_OK; } @@ -68,10 +86,25 @@ std::vector Reduce::GetAxes() const { } int Reduce::GetKeepDims() const { return this->primitive_->value_as_Reduce()->keepDims(); } int Reduce::GetMode() const { return this->primitive_->value_as_Reduce()->mode(); } - -void Reduce::SetAxes(const std::vector &axes) {} -void Reduce::SetKeepDims(int keep_dims) {} -void Reduce::SetMode(int mode) {} +int Reduce::UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) { + MS_ASSERT(nullptr != primitive); + MS_ASSERT(nullptr != fbb); + auto attr = primitive->value_as_Reduce(); + if (attr == nullptr) { + MS_LOG(ERROR) << "value_as_Reduce return nullptr"; + return RET_ERROR; + } + std::vector axes; + if (attr->axes() != nullptr) { + for (int i = 0; i < static_cast(attr->axes()->size()); i++) { + axes.push_back(attr->axes()->data()[i]); + } + } + auto val_offset = schema::CreateReduceDirect(*fbb, &axes, attr->keepDims(), attr->mode()); + auto prim_offset = schema::CreatePrimitive(*fbb, schema::PrimitiveType_Reduce, val_offset.o); + fbb->Finish(prim_offset); + return RET_OK; +} #endif namespace { diff --git a/mindspore/lite/src/ops/reduce.h b/mindspore/lite/src/ops/reduce.h index afccbf9cd8..bb8458d09a 100644 --- a/mindspore/lite/src/ops/reduce.h +++ b/mindspore/lite/src/ops/reduce.h @@ -20,6 +20,7 @@ #include #include #include +#include #include "ir/dtype/type_id.h" #include "src/ops/primitive_c.h" @@ -28,19 +29,22 @@ namespace lite { class Reduce : public PrimitiveC { public: #ifdef PRIMITIVE_WRITEABLE + MS_DECLARE_PARENT(Reduce, PrimitiveC); Reduce() = default; explicit Reduce(schema::PrimitiveT *primitive) : PrimitiveC(primitive) {} - int UnPackAttr(const Primitive &prim, const std::vector &inputs); + int UnPackAttr(const Primitive &prim, const std::vector &inputs) override; + void SetAxes(const std::vector &axes); + void SetKeepDims(int keep_dims); + void SetMode(int mode); #else - explicit Reduce(schema::Primitive *primitive) : PrimitiveC(primitive) {} + Reduce() = default; + + int UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) override; #endif int InferShape(std::vector inputs_, std::vector outputs_) override; std::vector GetAxes() const; int GetKeepDims() 
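Reduce::UnPackAttr above walks the second input when it is a constant tuple, collecting each integer immediate into attr->axes through the dyn_cast chain. The ANF IR types cannot be reproduced standalone, but the traversal reduces to the sketch below, with std::variant standing in for the Value hierarchy (hypothetical stand-in types, not the real IR API; requires C++17):

#include <cassert>
#include <memory>
#include <variant>
#include <vector>

// Hypothetical stand-in for the ANF value hierarchy: a value is either an
// integer immediate or a tuple of nested values.
struct Value;
using ValuePtr = std::shared_ptr<Value>;
struct Value {
  std::variant<int, std::vector<ValuePtr>> v;
};

// Mirrors the ValueTuple/Int32Imm walk: collect immediates as reduce axes.
std::vector<int> CollectAxes(const ValuePtr &val) {
  std::vector<int> axes;
  if (auto *tuple = std::get_if<std::vector<ValuePtr>>(&val->v)) {
    for (const auto &elem : *tuple) {
      if (auto *imm = std::get_if<int>(&elem->v)) {
        axes.emplace_back(*imm);
      }
    }
  }
  return axes;
}

int main() {
  auto axis0 = std::make_shared<Value>(Value{1});
  auto axis1 = std::make_shared<Value>(Value{2});
  auto tuple = std::make_shared<Value>(Value{std::vector<ValuePtr>{axis0, axis1}});
  assert(CollectAxes(tuple) == (std::vector<int>{1, 2}));
  return 0;
}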
const; int GetMode() const;
-  void SetAxes(const std::vector<int> &axes);
-  void SetKeepDims(int keep_dims);
-  void SetMode(int mode);
 };
 }  // namespace lite
 }  // namespace mindspore
diff --git a/mindspore/lite/src/ops/reshape.cc b/mindspore/lite/src/ops/reshape.cc
index b35638f454..8928a89b94 100644
--- a/mindspore/lite/src/ops/reshape.cc
+++ b/mindspore/lite/src/ops/reshape.cc
@@ -30,29 +30,47 @@ std::vector<int> Reshape::GetShape() const { return this->primitive_->value.
 void Reshape::SetFormat(int format) { this->primitive_->value.AsReshape()->format = (schema::Format)format; }
 void Reshape::SetShape(const std::vector<int> &shape) { this->primitive_->value.AsReshape()->shape = shape; }
 int Reshape::UnPackAttr(const Primitive &prim, const std::vector<AnfNodePtr> &inputs) {
-  this->primitive_ = new (schema::PrimitiveT);
-  auto attr = std::make_unique<schema::ReshapeT>();
-  MS_ASSERT(inputs.size() == kAnfPopulaterThree - 1);
-  auto inputNode = inputs[kAnfPopulaterTwo - 1];
-  if (inputNode->isa<ValueNode>()) {
-    auto valueNode = inputNode->cast<ValueNodePtr>();
-    MS_ASSERT(valueNode != nullptr);
-    auto val = valueNode->value();
-    MS_ASSERT(val != nullptr);
-    if (val->isa<ValueTuple>()) {
-      auto tuple = val->cast<ValueTuplePtr>();
-      MS_ASSERT(tuple != nullptr);
-      for (size_t i = 0; i < tuple->size(); ++i) {
-        auto elem = tuple->value()[i]->cast<Int32ImmPtr>();
-        MS_ASSERT(elem != nullptr);
-        attr->shape.emplace_back(static_cast<int>(elem->value()));
+  if (this->primitive_ == nullptr) {
+    this->primitive_ = new (std::nothrow) schema::PrimitiveT;
+    if (this->primitive_ == nullptr) {
+      MS_LOG(ERROR) << "new primitiveT failed";
+      return RET_ERROR;
+    }
+    this->primitive_->value.type = schema::PrimitiveType_Reshape;
+  }
+  if (this->primitive_->value.type != schema::PrimitiveType_Reshape) {
+    MS_LOG(ERROR) << "Primitive type is error :" << this->primitive_->value.type;
+    return RET_ERROR;
+  }
+  if (this->primitive_->value.value == nullptr) {
+    auto attr = new (std::nothrow) schema::ReshapeT();
+    if (attr == nullptr) {
+      MS_LOG(ERROR) << "new primitiveT value failed";
+      return RET_ERROR;
+    }
+    MS_ASSERT(inputs.size() == kAnfPopulaterThree - 1);
+    auto inputNode = inputs[kAnfPopulaterTwo - 1];
+    if (inputNode->isa<ValueNode>()) {
+      auto valueNode = inputNode->cast<ValueNodePtr>();
+      MS_ASSERT(valueNode != nullptr);
+      auto val = valueNode->value();
+      MS_ASSERT(val != nullptr);
+      if (val->isa<ValueTuple>()) {
+        auto tuple = val->cast<ValueTuplePtr>();
+        MS_ASSERT(tuple != nullptr);
+        for (size_t i = 0; i < tuple->size(); ++i) {
+          auto elem = tuple->value()[i]->cast<Int32ImmPtr>();
+          MS_ASSERT(elem != nullptr);
+          attr->shape.emplace_back(static_cast<int>(elem->value()));
+        }
       }
     }
+    this->primitive_->value.value = attr;
+    if (this->primitive_->value.value == nullptr) {
+      MS_LOG(ERROR) << "primitive value is nullptr";
+      return RET_ERROR;
+    }
   }
-
-  this->primitive_->value.type = schema::PrimitiveType_Reshape;
-  this->primitive_->value.value = attr.release();
-
   return RET_OK;
 }
@@ -63,9 +81,25 @@ std::vector<int> Reshape::GetShape() const {
   auto fb_vector = this->primitive_->value_as_Reshape()->shape();
   return std::vector<int>(fb_vector->begin(), fb_vector->end());
 }
-
-void Reshape::SetFormat(int format) {}
-void Reshape::SetShape(const std::vector<int> &shape) {}
+int Reshape::UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) {
+  MS_ASSERT(nullptr != primitive);
+  MS_ASSERT(nullptr != fbb);
+  auto attr = primitive->value_as_Reshape();
+  if (attr == nullptr) {
+    MS_LOG(ERROR) << "value_as_Reshape return nullptr";
+    return RET_ERROR;
+  }
+  std::vector<int> shape;
+  if (attr->shape() != nullptr) {
+    for (int i = 0; i < static_cast<int>(attr->shape()->size()); i++) {
+
shape.push_back(attr->shape()->data()[i]); + } + } + auto val_offset = schema::CreateReshapeDirect(*fbb, attr->format(), &shape); + auto prim_offset = schema::CreatePrimitive(*fbb, schema::PrimitiveType_Reshape, val_offset.o); + fbb->Finish(prim_offset); + return RET_OK; +} #endif int Reshape::CalNewShape(const tensor::Tensor *in_tensor, std::vector *out_shape) const { diff --git a/mindspore/lite/src/ops/reshape.h b/mindspore/lite/src/ops/reshape.h index 63854aeea7..c81187636b 100644 --- a/mindspore/lite/src/ops/reshape.h +++ b/mindspore/lite/src/ops/reshape.h @@ -20,6 +20,7 @@ #include #include #include +#include #include "ir/dtype/type_id.h" #include "src/ops/primitive_c.h" @@ -28,17 +29,20 @@ namespace lite { class Reshape : public PrimitiveC { public: #ifdef PRIMITIVE_WRITEABLE + MS_DECLARE_PARENT(Reshape, PrimitiveC); Reshape() = default; explicit Reshape(schema::PrimitiveT *primitive) : PrimitiveC(primitive) {} - int UnPackAttr(const Primitive &prim, const std::vector &inputs); + int UnPackAttr(const Primitive &prim, const std::vector &inputs) override; + void SetFormat(int format); + void SetShape(const std::vector &shape); #else - explicit Reshape(schema::Primitive *primitive) : PrimitiveC(primitive) {} + Reshape() = default; + + int UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) override; #endif int InferShape(std::vector inputs_, std::vector outputs_) override; int GetFormat() const; std::vector GetShape() const; - void SetFormat(int format); - void SetShape(const std::vector &shape); private: int CalNewShape(const lite::tensor::Tensor *in_tensor, std::vector *out_shape) const; diff --git a/mindspore/lite/src/ops/resize.cc b/mindspore/lite/src/ops/resize.cc index 28f9e11762..aa0dd10648 100644 --- a/mindspore/lite/src/ops/resize.cc +++ b/mindspore/lite/src/ops/resize.cc @@ -43,13 +43,20 @@ int64_t Resize::GetNewHeight() const { return this->primitive_->value_as_Resize( int64_t Resize::GetNewWidth() const { return this->primitive_->value_as_Resize()->newWidth(); } bool Resize::GetAlignCorners() const { return this->primitive_->value_as_Resize()->alignCorners(); } bool Resize::GetPreserveAspectRatio() const { return this->primitive_->value_as_Resize()->preserveAspectRatio(); } - -void Resize::SetFormat(int format) {} -void Resize::SetMethod(int method) {} -void Resize::SetNewHeight(int64_t new_height) {} -void Resize::SetNewWidth(int64_t new_width) {} -void Resize::SetAlignCorners(bool align_corners) {} -void Resize::SetPreserveAspectRatio(bool preserve_aspect_ratio) {} +int Resize::UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) { + MS_ASSERT(nullptr != primitive); + MS_ASSERT(nullptr != fbb); + auto attr = primitive->value_as_Resize(); + if (attr == nullptr) { + MS_LOG(ERROR) << "value_as_Resize return nullptr"; + return RET_ERROR; + } + auto val_offset = schema::CreateResize(*fbb, attr->format(), attr->method(), attr->newHeight(), attr->newWidth(), + attr->alignCorners(), attr->preserveAspectRatio()); + auto prim_offset = schema::CreatePrimitive(*fbb, schema::PrimitiveType_Resize, val_offset.o); + fbb->Finish(prim_offset); + return RET_OK; +} #endif namespace { constexpr int kInputRank = 4; @@ -60,7 +67,10 @@ int Resize::InferShape(std::vector inputs_, std::vector< if (input == nullptr) { return 1; } - MS_ASSERT(input->shape().size() == kInputRank); + if (input->shape().size() != kInputRank) { + MS_LOG(ERROR) << "Size of input shape is wrong."; + return RET_ERROR; + } auto output = 
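Resize::InferShape above also swaps a debug-only MS_ASSERT for a logged error return, so a malformed model is rejected in release builds instead of reading a bad shape. A minimal sketch of that pattern (the RET_* values are assumed numeric stand-ins for the lite error codes):

#include <cstdio>
#include <vector>

constexpr int RET_OK = 0;
constexpr int RET_ERROR = -1;  // assumed stand-in for the lite error code
constexpr size_t kInputRank = 4;

// MS_ASSERT compiles away in release builds; an explicit check with a logged
// error keeps the guard active everywhere and lets the caller recover.
int ValidateInputRank(const std::vector<int> &shape) {
  if (shape.size() != kInputRank) {
    std::fprintf(stderr, "Size of input shape is wrong.\n");
    return RET_ERROR;
  }
  return RET_OK;
}

int main() {
  std::printf("%d %d\n", ValidateInputRank({1, 32, 32, 3}), ValidateInputRank({32, 32}));  // 0 -1
  return 0;
}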
outputs_.front(); if (output == nullptr) { diff --git a/mindspore/lite/src/ops/resize.h b/mindspore/lite/src/ops/resize.h index a60147fea2..3e1d71a484 100644 --- a/mindspore/lite/src/ops/resize.h +++ b/mindspore/lite/src/ops/resize.h @@ -28,10 +28,19 @@ namespace lite { class Resize : public PrimitiveC { public: #ifdef PRIMITIVE_WRITEABLE + MS_DECLARE_PARENT(Resize, PrimitiveC); Resize() = default; explicit Resize(schema::PrimitiveT *primitive) : PrimitiveC(primitive) {} + void SetFormat(int format); + void SetMethod(int method); + void SetNewHeight(int64_t new_height); + void SetNewWidth(int64_t new_width); + void SetAlignCorners(bool align_corners); + void SetPreserveAspectRatio(bool preserve_aspect_ratio); #else - explicit Resize(schema::Primitive *primitive) : PrimitiveC(primitive) {} + Resize() = default; + + int UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) override; #endif int InferShape(std::vector inputs_, std::vector outputs_) override; int GetFormat() const; @@ -40,12 +49,6 @@ class Resize : public PrimitiveC { int64_t GetNewWidth() const; bool GetAlignCorners() const; bool GetPreserveAspectRatio() const; - void SetFormat(int format); - void SetMethod(int method); - void SetNewHeight(int64_t new_height); - void SetNewWidth(int64_t new_width); - void SetAlignCorners(bool align_corners); - void SetPreserveAspectRatio(bool preserve_aspect_ratio); }; } // namespace lite } // namespace mindspore diff --git a/mindspore/lite/src/ops/return.cc b/mindspore/lite/src/ops/return.cc new file mode 100644 index 0000000000..c4b2b4b7ec --- /dev/null +++ b/mindspore/lite/src/ops/return.cc @@ -0,0 +1,79 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "src/ops/return.h" +#include + +namespace mindspore { +namespace lite { +#ifdef PRIMITIVE_WRITEABLE +int Return::UnPackAttr(const Primitive &prim, const std::vector &inputs) { + if (this->primitive_ == nullptr) { + this->primitive_ = new (std::nothrow) schema::PrimitiveT; + if (this->primitive_ == nullptr) { + MS_LOG(ERROR) << "new primitiveT failed"; + return RET_ERROR; + } + this->primitive_->value.type = schema::PrimitiveType_Return; + } + if (this->primitive_->value.type != schema::PrimitiveType_Return) { + MS_LOG(ERROR) << "Primitive type is error :" << this->primitive_->value.type; + return RET_ERROR; + } + if (this->primitive_->value.value == nullptr) { + auto attr = new (std::nothrow) schema::ReturnT(); + if (attr == nullptr) { + MS_LOG(ERROR) << "new primitiveT value failed"; + return RET_ERROR; + } + this->primitive_->value.value = attr; + if (this->primitive_->value.value == nullptr) { + MS_LOG(ERROR) << "primitive value is nullptr"; + return RET_ERROR; + } + } + return RET_OK; +} +#endif + +namespace { +constexpr size_t kInputSize = 1; +constexpr size_t kOutputSize = 1; +} // namespace +int Return::InferShape(std::vector inputs_, std::vector outputs_) { + if (inputs_.size() != kInputSize || outputs_.size() != kOutputSize) { + return RET_ERROR; + } + auto input = inputs_.front(); + auto output = outputs_.front(); + if (input == nullptr || output == nullptr) { + return RET_NULL_PTR; + } + output->set_data_type(input->data_type()); + output->SetFormat(input->GetFormat()); + if (!GetInferFlag()) { + return RET_OK; + } + if (this->primitive_ == nullptr) { + return RET_NULL_PTR; + } + output->set_data_type(input->data_type()); + output->set_shape(input->shape()); + output->SetFormat(input->GetFormat()); + return RET_OK; +} +} // namespace lite +} // namespace mindspore diff --git a/mindspore/lite/src/ops/prelu.h b/mindspore/lite/src/ops/return.h similarity index 59% rename from mindspore/lite/src/ops/prelu.h rename to mindspore/lite/src/ops/return.h index 7dd71e9dc3..ec6af73b67 100644 --- a/mindspore/lite/src/ops/prelu.h +++ b/mindspore/lite/src/ops/return.h @@ -14,28 +14,30 @@ * limitations under the License. 
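The new Return::InferShape simply forwards dtype and format from its single input and copies the shape only when inference is enabled. A condensed stand-in with the tensor reduced to the three forwarded fields:

#include <cassert>
#include <vector>

// Minimal stand-in for lite::tensor::Tensor, keeping only the forwarded fields.
struct Tensor {
  int data_type = 0;
  int format = 0;
  std::vector<int> shape;
};

// Identity inference: Return's output mirrors its input; the shape copy is
// gated on the infer flag, just like the GetInferFlag() gate above.
void InferReturn(const Tensor &in, Tensor *out, bool infer_shape) {
  out->data_type = in.data_type;
  out->format = in.format;
  if (infer_shape) {
    out->shape = in.shape;
  }
}

int main() {
  Tensor in{43 /* assumed kNumberTypeFloat32 id */, 0, {1, 224, 224, 3}};
  Tensor out;
  InferReturn(in, &out, true);
  assert(out.shape == in.shape && out.data_type == in.data_type);
  return 0;
}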
*/ -#ifndef LITE_MINDSPORE_LITE_C_OPS_PRELU_H_ -#define LITE_MINDSPORE_LITE_C_OPS_PRELU_H_ +#ifndef LITE_MINDSPORE_LITE_C_OPS_RETURN_H_ +#define LITE_MINDSPORE_LITE_C_OPS_RETURN_H_ #include #include #include #include "ir/dtype/type_id.h" -#include "src/ops/activation.h" +#include "src/ops/primitive_c.h" namespace mindspore { namespace lite { -class Prelu : public Activation { +class Return : public PrimitiveC { public: #ifdef PRIMITIVE_WRITEABLE - Prelu() = default; - explicit Prelu(schema::PrimitiveT *primitive) : Activation(primitive) {} + MS_DECLARE_PARENT(Return, PrimitiveC); + Return() = default; + explicit Return(schema::PrimitiveT *primitive) : PrimitiveC(primitive) {} + int UnPackAttr(const Primitive &prim, const std::vector &inputs) override; #else - explicit Prelu(schema::Primitive *primitive) : Activation(primitive) {} + Return() = default; #endif - std::vector GetSlope() const; - void SetSlope(const std::vector &slope); + int InferShape(std::vector inputs_, std::vector outputs_) override; }; } // namespace lite } // namespace mindspore -#endif // LITE_MINDSPORE_LITE_C_OPS_PRELU_H_ + +#endif // LITE_MINDSPORE_LITE_C_OPS_RETURN_H_ diff --git a/mindspore/lite/src/ops/reverse.cc b/mindspore/lite/src/ops/reverse.cc index b4a56286c4..11bb4388d5 100644 --- a/mindspore/lite/src/ops/reverse.cc +++ b/mindspore/lite/src/ops/reverse.cc @@ -29,8 +29,25 @@ std::vector Reverse::GetAxis() const { auto fb_vector = this->primitive_->value_as_Reverse()->axis(); return std::vector(fb_vector->begin(), fb_vector->end()); } - -void Reverse::SetAxis(const std::vector &axis) {} +int Reverse::UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) { + MS_ASSERT(nullptr != primitive); + MS_ASSERT(nullptr != fbb); + auto attr = primitive->value_as_Reverse(); + if (attr == nullptr) { + MS_LOG(ERROR) << "value_as_Reverse return nullptr"; + return RET_ERROR; + } + std::vector axis; + if (attr->axis() != nullptr) { + for (int i = 0; i < static_cast(attr->axis()->size()); i++) { + axis.push_back(attr->axis()->data()[i]); + } + } + auto val_offset = schema::CreateReverseDirect(*fbb, &axis); + auto prim_offset = schema::CreatePrimitive(*fbb, schema::PrimitiveType_Reverse, val_offset.o); + fbb->Finish(prim_offset); + return RET_OK; +} #endif } // namespace lite } // namespace mindspore diff --git a/mindspore/lite/src/ops/reverse.h b/mindspore/lite/src/ops/reverse.h index 2b202e112d..0a95a18413 100644 --- a/mindspore/lite/src/ops/reverse.h +++ b/mindspore/lite/src/ops/reverse.h @@ -20,6 +20,7 @@ #include #include #include +#include #include "ir/dtype/type_id.h" #include "src/ops/primitive_c.h" @@ -28,13 +29,17 @@ namespace lite { class Reverse : public PrimitiveC { public: #ifdef PRIMITIVE_WRITEABLE + MS_DECLARE_PARENT(Reverse, PrimitiveC); Reverse() = default; explicit Reverse(schema::PrimitiveT *primitive) : PrimitiveC(primitive) {} + void SetAxis(const std::vector &axis); + #else - explicit Reverse(schema::Primitive *primitive) : PrimitiveC(primitive) {} + Reverse() = default; + + int UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) override; #endif std::vector GetAxis() const; - void SetAxis(const std::vector &axis); }; } // namespace lite } // namespace mindspore diff --git a/mindspore/lite/src/ops/reverse_sequence.cc b/mindspore/lite/src/ops/reverse_sequence.cc index e362b98c24..c89477832c 100644 --- a/mindspore/lite/src/ops/reverse_sequence.cc +++ b/mindspore/lite/src/ops/reverse_sequence.cc @@ -41,10 +41,26 @@ std::vector 
ReverseSequence::GetSeqLengths() const { auto fb_vector = this->primitive_->value_as_ReverseSequence()->seqLengths(); return std::vector(fb_vector->begin(), fb_vector->end()); } +int ReverseSequence::UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) { + MS_ASSERT(nullptr != primitive); + MS_ASSERT(nullptr != fbb); -void ReverseSequence::SetSeqAxis(int seq_axis) {} -void ReverseSequence::SetBatchAxis(int batch_axis) {} -void ReverseSequence::SetSeqLengths(const std::vector &seq_lengths) {} + auto attr = primitive->value_as_ReverseSequence(); + if (attr == nullptr) { + MS_LOG(ERROR) << "value_as_ReverseSequence return nullptr"; + return RET_ERROR; + } + std::vector seqLengths; + if (attr->seqLengths() != nullptr) { + for (int i = 0; i < static_cast(attr->seqLengths()->size()); i++) { + seqLengths.push_back(attr->seqLengths()->data()[i]); + } + } + auto val_offset = schema::CreateReverseSequenceDirect(*fbb, attr->seqAxis(), attr->batchAxis(), &seqLengths); + auto prim_offset = schema::CreatePrimitive(*fbb, schema::PrimitiveType_ReverseSequence, val_offset.o); + fbb->Finish(prim_offset); + return RET_OK; +} #endif int ReverseSequence::InferShape(std::vector inputs, std::vector outputs) { diff --git a/mindspore/lite/src/ops/reverse_sequence.h b/mindspore/lite/src/ops/reverse_sequence.h index 66624bf8a5..6b0c59d384 100644 --- a/mindspore/lite/src/ops/reverse_sequence.h +++ b/mindspore/lite/src/ops/reverse_sequence.h @@ -20,6 +20,7 @@ #include #include #include +#include #include "ir/dtype/type_id.h" #include "src/ops/primitive_c.h" @@ -28,18 +29,21 @@ namespace lite { class ReverseSequence : public PrimitiveC { public: #ifdef PRIMITIVE_WRITEABLE + MS_DECLARE_PARENT(ReverseSequence, PrimitiveC); ReverseSequence() = default; explicit ReverseSequence(schema::PrimitiveT *primitive) : PrimitiveC(primitive) {} + void SetSeqAxis(int seq_axis); + void SetBatchAxis(int batch_axis); + void SetSeqLengths(const std::vector &seq_lengths); #else - explicit ReverseSequence(schema::Primitive *primitive) : PrimitiveC(primitive) {} + ReverseSequence() = default; + + int UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) override; #endif int InferShape(std::vector inputs_, std::vector outputs_) override; int GetSeqAxis() const; int GetBatchAxis() const; std::vector GetSeqLengths() const; - void SetSeqAxis(int seq_axis); - void SetBatchAxis(int batch_axis); - void SetSeqLengths(const std::vector &seq_lengths); }; } // namespace lite } // namespace mindspore diff --git a/mindspore/lite/src/ops/roi_pooling.cc b/mindspore/lite/src/ops/roi_pooling.cc index 03edeb2639..4d1270e35a 100644 --- a/mindspore/lite/src/ops/roi_pooling.cc +++ b/mindspore/lite/src/ops/roi_pooling.cc @@ -32,10 +32,21 @@ void ROIPooling::SetScale(float scale) { this->primitive_->value.AsROIPooling()- int ROIPooling::GetPooledH() const { return this->primitive_->value_as_ROIPooling()->pooledH(); } int ROIPooling::GetPooledW() const { return this->primitive_->value_as_ROIPooling()->pooledW(); } float ROIPooling::GetScale() const { return this->primitive_->value_as_ROIPooling()->scale(); } +int ROIPooling::UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) { + MS_ASSERT(nullptr != primitive); + MS_ASSERT(nullptr != fbb); -void ROIPooling::SetPooledH(int pooled_h) {} -void ROIPooling::SetPooledW(int pooled_w) {} -void ROIPooling::SetScale(float scale) {} + auto attr = primitive->value_as_ROIPooling(); + if (attr == nullptr) { + 
MS_LOG(ERROR) << "value_as_ROIPooling return nullptr"; + return RET_ERROR; + } + + auto val_offset = schema::CreateROIPooling(*fbb, attr->pooledH(), attr->pooledW(), attr->scale()); + auto prim_offset = schema::CreatePrimitive(*fbb, schema::PrimitiveType_ROIPooling, val_offset.o); + fbb->Finish(prim_offset); + return RET_OK; +} #endif int ROIPooling::InferShape(std::vector inputs_, std::vector outputs_) { diff --git a/mindspore/lite/src/ops/roi_pooling.h b/mindspore/lite/src/ops/roi_pooling.h index d02720394a..4dabb95f90 100644 --- a/mindspore/lite/src/ops/roi_pooling.h +++ b/mindspore/lite/src/ops/roi_pooling.h @@ -28,18 +28,21 @@ namespace lite { class ROIPooling : public PrimitiveC { public: #ifdef PRIMITIVE_WRITEABLE + MS_DECLARE_PARENT(ROIPooling, PrimitiveC); ROIPooling() = default; explicit ROIPooling(schema::PrimitiveT *primitive) : PrimitiveC(primitive) {} + void SetPooledH(int pooled_h); + void SetPooledW(int pooled_w); + void SetScale(float scale); #else - explicit ROIPooling(schema::Primitive *primitive) : PrimitiveC(primitive) {} + ROIPooling() = default; + + int UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) override; #endif int InferShape(std::vector inputs_, std::vector outputs_) override; int GetPooledH() const; int GetPooledW() const; float GetScale() const; - void SetPooledH(int pooled_h); - void SetPooledW(int pooled_w); - void SetScale(float scale); }; } // namespace lite } // namespace mindspore diff --git a/mindspore/lite/src/ops/round.cc b/mindspore/lite/src/ops/round.cc new file mode 100644 index 0000000000..ae3167597c --- /dev/null +++ b/mindspore/lite/src/ops/round.cc @@ -0,0 +1,33 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "src/ops/round.h" + +namespace mindspore { +namespace lite { +#ifdef PRIMITIVE_WRITEABLE +#else +int Round::UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) { + MS_ASSERT(nullptr != primitive); + MS_ASSERT(nullptr != fbb); + auto val_offset = schema::CreateRound(*fbb); + auto prim_offset = schema::CreatePrimitive(*fbb, schema::PrimitiveType_Round, val_offset.o); + fbb->Finish(prim_offset); + return RET_OK; +} +#endif +} // namespace lite +} // namespace mindspore diff --git a/mindspore/lite/src/ops/round.h b/mindspore/lite/src/ops/round.h index 3e6496555e..606324243c 100644 --- a/mindspore/lite/src/ops/round.h +++ b/mindspore/lite/src/ops/round.h @@ -21,17 +21,20 @@ #include #include #include "ir/dtype/type_id.h" -#include "src/ops/primitive_c.h" +#include "src/ops/arithmetic_self.h" namespace mindspore { namespace lite { class Round : public ArithmeticSelf { public: #ifdef PRIMITIVE_WRITEABLE + MS_DECLARE_PARENT(Round, ArithmeticSelf); Round() = default; explicit Round(schema::PrimitiveT *primitive) : ArithmeticSelf(primitive) {} #else - explicit Round(schema::Primitive *primitive) : ArithmeticSelf(primitive) {} + Round() = default; + + int UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) override; #endif }; } // namespace lite diff --git a/mindspore/lite/src/ops/prelu.cc b/mindspore/lite/src/ops/rsqrt.cc similarity index 61% rename from mindspore/lite/src/ops/prelu.cc rename to mindspore/lite/src/ops/rsqrt.cc index 1bca56a3b6..742aed2953 100644 --- a/mindspore/lite/src/ops/prelu.cc +++ b/mindspore/lite/src/ops/rsqrt.cc @@ -14,23 +14,21 @@ * limitations under the License. */ -#include "src/ops/prelu.h" +#include "src/ops/rsqrt.h" namespace mindspore { namespace lite { #ifdef PRIMITIVE_WRITEABLE -std::vector Prelu::GetSlope() const { return this->primitive_->value.AsPrelu()->slope; } - -void Prelu::SetSlope(const std::vector &slope) { this->primitive_->value.AsPrelu()->slope = slope; } - #else +int Rsqrt::UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) { + MS_ASSERT(nullptr != primitive); + MS_ASSERT(nullptr != fbb); -std::vector Prelu::GetSlope() const { - auto fb_vector = this->primitive_->value_as_Prelu()->slope(); - return std::vector(fb_vector->begin(), fb_vector->end()); + auto val_offset = schema::CreateRsqrt(*fbb); + auto prim_offset = schema::CreatePrimitive(*fbb, schema::PrimitiveType_Rsqrt, val_offset.o); + fbb->Finish(prim_offset); + return RET_OK; } - -void Prelu::SetSlope(const std::vector &slope) {} #endif } // namespace lite } // namespace mindspore diff --git a/mindspore/lite/src/ops/rsqrt.h b/mindspore/lite/src/ops/rsqrt.h index 58a32ffd9b..17c48c4413 100644 --- a/mindspore/lite/src/ops/rsqrt.h +++ b/mindspore/lite/src/ops/rsqrt.h @@ -21,17 +21,20 @@ #include #include #include "ir/dtype/type_id.h" -#include "src/ops/primitive_c.h" +#include "src/ops/arithmetic_self.h" namespace mindspore { namespace lite { class Rsqrt : public ArithmeticSelf { public: #ifdef PRIMITIVE_WRITEABLE + MS_DECLARE_PARENT(Rsqrt, ArithmeticSelf); Rsqrt() = default; explicit Rsqrt(schema::PrimitiveT *primitive) : ArithmeticSelf(primitive) {} #else - explicit Rsqrt(schema::Primitive *primitive) : ArithmeticSelf(primitive) {} + Rsqrt() = default; + + int UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) override; #endif }; } // namespace lite diff --git a/mindspore/lite/src/ops/scale.cc 
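round.h above (and rsqrt.h, sin.h below) now includes arithmetic_self.h to match the base class it already derives from: element-wise unary ops share ArithmeticSelf's shape inference, where the output shape equals the input shape, and each subclass contributes only its own serialization. A condensed model of that hierarchy:

#include <cassert>
#include <vector>

// Condensed model of the op hierarchy: PrimitiveC at the root, ArithmeticSelf
// supplying the shared unary shape rule, leaf ops adding nothing shape-related.
struct PrimitiveC {
  virtual ~PrimitiveC() = default;
  virtual std::vector<int> InferShape(const std::vector<int> &in) = 0;
};

struct ArithmeticSelf : PrimitiveC {
  // Element-wise unary ops never change the shape.
  std::vector<int> InferShape(const std::vector<int> &in) override { return in; }
};

// Round, Rsqrt, and Sin only override serialization in the real code, so the
// class body stays empty in this sketch.
struct Round : ArithmeticSelf {};

int main() {
  Round op;
  assert(op.InferShape({2, 3}) == (std::vector<int>{2, 3}));
  return 0;
}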
b/mindspore/lite/src/ops/scale.cc index 23abc731ad..53ed368783 100644 --- a/mindspore/lite/src/ops/scale.cc +++ b/mindspore/lite/src/ops/scale.cc @@ -26,8 +26,19 @@ void Scale::SetAxis(int axis) { this->primitive_->value.AsScale()->axis = axis; #else int Scale::GetAxis() const { return this->primitive_->value_as_Scale()->axis(); } - -void Scale::SetAxis(int axis) {} +int Scale::UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) { + MS_ASSERT(nullptr != primitive); + MS_ASSERT(nullptr != fbb); + auto attr = primitive->value_as_Scale(); + if (attr == nullptr) { + MS_LOG(ERROR) << "value_as_Scale return nullptr"; + return RET_ERROR; + } + auto val_offset = schema::CreateScale(*fbb, attr->axis()); + auto prim_offset = schema::CreatePrimitive(*fbb, schema::PrimitiveType_Scale, val_offset.o); + fbb->Finish(prim_offset); + return RET_OK; +} #endif } // namespace lite } // namespace mindspore diff --git a/mindspore/lite/src/ops/scale.h b/mindspore/lite/src/ops/scale.h index df1ebc5c86..c9f3d653b6 100644 --- a/mindspore/lite/src/ops/scale.h +++ b/mindspore/lite/src/ops/scale.h @@ -28,13 +28,17 @@ namespace lite { class Scale : public PrimitiveC { public: #ifdef PRIMITIVE_WRITEABLE + MS_DECLARE_PARENT(Scale, PrimitiveC); Scale() = default; explicit Scale(schema::PrimitiveT *primitive) : PrimitiveC(primitive) {} + void SetAxis(int axis); + #else - explicit Scale(schema::Primitive *primitive) : PrimitiveC(primitive) {} + Scale() = default; + + int UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) override; #endif int GetAxis() const; - void SetAxis(int axis); }; } // namespace lite } // namespace mindspore diff --git a/mindspore/lite/src/ops/scatter_nd.cc b/mindspore/lite/src/ops/scatter_nd.cc index a033192708..6cd425ac75 100644 --- a/mindspore/lite/src/ops/scatter_nd.cc +++ b/mindspore/lite/src/ops/scatter_nd.cc @@ -61,5 +61,17 @@ int ScatterND::InferShape(std::vector inputs_, std::vectorset_shape(out_shape); return RET_OK; } +#ifdef PRIMITIVE_WRITEABLE +#else +int ScatterND::UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) { + MS_ASSERT(nullptr != primitive); + MS_ASSERT(nullptr != fbb); + + auto val_offset = schema::CreateScatterND(*fbb); + auto prim_offset = schema::CreatePrimitive(*fbb, schema::PrimitiveType_ScatterND, val_offset.o); + fbb->Finish(prim_offset); + return RET_OK; +} +#endif } // namespace lite } // namespace mindspore diff --git a/mindspore/lite/src/ops/scatter_nd.h b/mindspore/lite/src/ops/scatter_nd.h index 69ec5dc301..ad7bc2c887 100644 --- a/mindspore/lite/src/ops/scatter_nd.h +++ b/mindspore/lite/src/ops/scatter_nd.h @@ -28,10 +28,13 @@ namespace lite { class ScatterND : public PrimitiveC { public: #ifdef PRIMITIVE_WRITEABLE + MS_DECLARE_PARENT(ScatterND, PrimitiveC); ScatterND() = default; explicit ScatterND(schema::PrimitiveT *primitive) : PrimitiveC(primitive) {} #else - explicit ScatterND(schema::Primitive *primitive) : PrimitiveC(primitive) {} + ScatterND() = default; + + int UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) override; #endif int InferShape(std::vector inputs_, std::vector outputs_) override; }; diff --git a/mindspore/lite/src/ops/shape.cc b/mindspore/lite/src/ops/shape.cc index 2680cf03e3..349fb8cb0d 100644 --- a/mindspore/lite/src/ops/shape.cc +++ b/mindspore/lite/src/ops/shape.cc @@ -38,6 +38,7 @@ int Shape::InferShape(std::vector inputs_, std::vectorset_data_type(kNumberTypeInt32); + 
out_tensor->SetFormat(schema::Format_NHWC); if (!GetInferFlag()) { return RET_OK; } @@ -50,5 +51,17 @@ int Shape::InferShape(std::vector inputs_, std::vectorFinish(prim_offset); + return RET_OK; +} +#endif } // namespace lite } // namespace mindspore diff --git a/mindspore/lite/src/ops/shape.h b/mindspore/lite/src/ops/shape.h index ae6e1ceec9..7dc856eca5 100644 --- a/mindspore/lite/src/ops/shape.h +++ b/mindspore/lite/src/ops/shape.h @@ -20,18 +20,21 @@ #include #include #include -#include "ir/dtype/type_id.h" #include "src/ops/primitive_c.h" +#include "ir/dtype/type_id.h" namespace mindspore { namespace lite { class Shape : public PrimitiveC { public: #ifdef PRIMITIVE_WRITEABLE + MS_DECLARE_PARENT(Shape, PrimitiveC); Shape() = default; explicit Shape(schema::PrimitiveT *primitive) : PrimitiveC(primitive) {} #else - explicit Shape(schema::Primitive *primitive) : PrimitiveC(primitive) {} + Shape() = default; + + int UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) override; #endif int InferShape(std::vector inputs_, std::vector outputs_) override; }; diff --git a/mindspore/lite/src/ops/sin.cc b/mindspore/lite/src/ops/sin.cc new file mode 100644 index 0000000000..eb363cf531 --- /dev/null +++ b/mindspore/lite/src/ops/sin.cc @@ -0,0 +1,34 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
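Shape::InferShape above produces a rank-1 int32 tensor whose single dimension is the input rank; the patch additionally pins the output format to NHWC. The dimension data itself is filled in by the kernel at run time. A worked stand-in of the shape computation:

#include <cassert>
#include <vector>

// The Shape operator's output: a rank-1 int32 tensor of length equal to the
// input rank, holding the input dimensions as data.
struct ShapeResult {
  std::vector<int> out_shape;  // shape of the output tensor itself
  std::vector<int> out_data;   // the dimensions reported at run time
};

ShapeResult InferShapeOp(const std::vector<int> &input_shape) {
  ShapeResult r;
  r.out_shape = {static_cast<int>(input_shape.size())};
  r.out_data = input_shape;
  return r;
}

int main() {
  auto r = InferShapeOp({1, 224, 224, 3});
  assert(r.out_shape == (std::vector<int>{4}));
  assert(r.out_data[1] == 224);
  return 0;
}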
+ */ + +#include "src/ops/sin.h" + +namespace mindspore { +namespace lite { +#ifdef PRIMITIVE_WRITEABLE +#else +int Sin::UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) { + MS_ASSERT(nullptr != primitive); + MS_ASSERT(nullptr != fbb); + + auto val_offset = schema::CreateSin(*fbb); + auto prim_offset = schema::CreatePrimitive(*fbb, schema::PrimitiveType_Sin, val_offset.o); + fbb->Finish(prim_offset); + return RET_OK; +} +#endif +} // namespace lite +} // namespace mindspore diff --git a/mindspore/lite/src/ops/sin.h b/mindspore/lite/src/ops/sin.h index ae410da36c..82383c166d 100644 --- a/mindspore/lite/src/ops/sin.h +++ b/mindspore/lite/src/ops/sin.h @@ -21,17 +21,20 @@ #include #include #include "ir/dtype/type_id.h" -#include "src/ops/primitive_c.h" +#include "src/ops/arithmetic_self.h" namespace mindspore { namespace lite { class Sin : public ArithmeticSelf { public: #ifdef PRIMITIVE_WRITEABLE + MS_DECLARE_PARENT(Sin, ArithmeticSelf); Sin() = default; explicit Sin(schema::PrimitiveT *primitive) : ArithmeticSelf(primitive) {} #else - explicit Sin(schema::Primitive *primitive) : ArithmeticSelf(primitive) {} + Sin() = default; + + int UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) override; #endif }; } // namespace lite diff --git a/mindspore/lite/src/ops/slice.cc b/mindspore/lite/src/ops/slice.cc index 11b984fc94..bfdd2ac039 100644 --- a/mindspore/lite/src/ops/slice.cc +++ b/mindspore/lite/src/ops/slice.cc @@ -29,6 +29,7 @@ constexpr int kSliceOutputNum = 1; int Slice::GetFormat() const { return this->primitive_->value.AsSlice()->format; } std::vector Slice::GetBegin() const { return this->primitive_->value.AsSlice()->begin; } std::vector Slice::GetSize() const { return this->primitive_->value.AsSlice()->size; } +std::vector Slice::GetAxes() const { return this->primitive_->value.AsSlice()->axes; } void Slice::SetFormat(int format) { this->primitive_->value.AsSlice()->format = (schema::Format)format; } void Slice::SetBegin(const std::vector &begin) { this->primitive_->value.AsSlice()->begin = begin; } @@ -45,12 +46,42 @@ std::vector Slice::GetSize() const { auto fb_vector = this->primitive_->value_as_Slice()->size(); return std::vector(fb_vector->begin(), fb_vector->end()); } +std::vector Slice::GetAxes() const { + auto fb_vector = this->primitive_->value_as_Slice()->axes(); + return std::vector(fb_vector->begin(), fb_vector->end()); +} +int Slice::UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) { + MS_ASSERT(nullptr != primitive); + MS_ASSERT(nullptr != fbb); + + auto attr = primitive->value_as_Slice(); + if (attr == nullptr) { + MS_LOG(ERROR) << "value_as_Slice return nullptr"; + return RET_ERROR; + } + + std::vector begin; + if (attr->begin() != nullptr) { + for (int i = 0; i < static_cast(attr->begin()->size()); i++) { + begin.push_back(attr->begin()->data()[i]); + } + } + std::vector size; + if (attr->size() != nullptr) { + for (int i = 0; i < static_cast(attr->size()->size()); i++) { + size.push_back(attr->size()->data()[i]); + } + } -void Slice::SetFormat(int format) {} -void Slice::SetBegin(const std::vector &begin) {} -void Slice::SetSize(const std::vector &size) {} + auto val_offset = schema::CreateSliceDirect(*fbb, attr->format(), &begin, &size); + auto prim_offset = schema::CreatePrimitive(*fbb, schema::PrimitiveType_Slice, val_offset.o); + fbb->Finish(prim_offset); + return RET_OK; +} #endif +std::vector Slice::GetPostProcessBegin() const { 
return this->begin; } +std::vector Slice::GetPostProcessSize() const { return this->size; } int Slice::InferShape(std::vector inputs, std::vector outputs) { MS_ASSERT(this->primitive_ != nullptr); if (inputs.size() != kSliceInputNum || outputs.size() != kSliceOutputNum) { @@ -64,30 +95,36 @@ int Slice::InferShape(std::vector inputs, std::vector
auto input_shape = input->shape();
-  std::vector<int32_t> slice_begin(GetBegin().begin(), GetBegin().end());
-  std::vector<int32_t> slice_size(GetSize().begin(), GetSize().end());
+  std::vector<int32_t> slice_begin(GetBegin());
+  std::vector<int32_t> slice_size(GetSize());
+  std::vector<int32_t> slice_axes(GetAxes());
   std::vector<int32_t> output_shape(input_shape.size());
+  begin.assign(input_shape.size(), 0);
+  size.assign(input_shape.size(), -1);
+  for (size_t i = 0; i < slice_axes.size(); ++i) {
+    begin[slice_axes[i]] = slice_begin[i];
+    size[slice_axes[i]] = slice_size[i];
+  }
   for (size_t i = 0; i < input_shape.size(); ++i) {
-    if (slice_size[i] < 0 && slice_size[i] != -1) {
-      MS_LOG(ERROR) << "Invalid size input!size[" << i << "]=" << slice_size[i];
+    if (size[i] < 0 && size[i] != -1) {
+      MS_LOG(ERROR) << "Invalid size input!size[" << i << "]=" << size[i];
       return RET_PARAM_INVALID;
     }
-    if (slice_begin[i] < 0) {
-      MS_LOG(ERROR) << "Invalid begin input " << slice_begin[i] << " which should be >= 0";
+    if (begin[i] < 0) {
+      MS_LOG(ERROR) << "Invalid begin input " << begin[i] << " which should be >= 0";
       return RET_PARAM_INVALID;
     }
-    if (input_shape[i] <= slice_begin[i]) {
-      MS_LOG(ERROR) << "Invalid begin input!begin[" << i << "]=" << slice_begin[i]
+    if (input_shape[i] <= begin[i]) {
+      MS_LOG(ERROR) << "Invalid begin input!begin[" << i << "]=" << begin[i]
                     << " which should be <= " << input_shape[i];
       return RET_PARAM_INVALID;
     }
-    if (slice_size[i] > (input_shape[i] - slice_begin[i])) {
-      MS_LOG(ERROR) << "Invalid size input " << slice_size[i]
-                    << " which should be <= " << input_shape[i] - slice_begin[i];
+    if (size[i] > (input_shape[i] - begin[i])) {
+      MS_LOG(ERROR) << "Invalid size input " << size[i] << " which should be <= " << input_shape[i] - begin[i];
       return RET_PARAM_INVALID;
     }
-    output_shape[i] = slice_size[i] < 0 ? input_shape[i] - slice_begin[i] : slice_size[i];
+    output_shape[i] = size[i] < 0 ? input_shape[i] - begin[i] : size[i];
   }
   outputs[0]->set_shape(output_shape);
diff --git a/mindspore/lite/src/ops/slice.h b/mindspore/lite/src/ops/slice.h
index 71b4dc5f3e..b5fa281e4b 100644
--- a/mindspore/lite/src/ops/slice.h
+++ b/mindspore/lite/src/ops/slice.h
@@ -20,6 +20,7 @@
 #include
 #include
 #include
+#include
 #include "ir/dtype/type_id.h"
 #include "src/ops/primitive_c.h"
@@ -28,18 +29,30 @@ namespace lite {
 class Slice : public PrimitiveC {
  public:
 #ifdef PRIMITIVE_WRITEABLE
+  MS_DECLARE_PARENT(Slice, PrimitiveC);
   Slice() = default;
   explicit Slice(schema::PrimitiveT *primitive) : PrimitiveC(primitive) {}
+  void SetFormat(int format);
+  void SetBegin(const std::vector<int> &begin);
+  void SetSize(const std::vector<int> &size);
 #else
-  explicit Slice(schema::Primitive *primitive) : PrimitiveC(primitive) {}
+  Slice() = default;
+
+  int UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) override;
 #endif
   int InferShape(std::vector<lite::tensor::Tensor *> inputs_, std::vector<lite::tensor::Tensor *> outputs_) override;
   int GetFormat() const;
   std::vector<int> GetBegin() const;
   std::vector<int> GetSize() const;
-  void SetFormat(int format);
-  void SetBegin(const std::vector<int> &begin);
-  void SetSize(const std::vector<int> &size);
+  std::vector<int> GetAxes() const;
+  // Due to differences between TFLite and ONNX, InferShape constructs new begin and size parameters.
+  // When running the graph, obtain the new begins and sizes through the two functions below.
+ std::vector GetPostProcessBegin() const; + std::vector GetPostProcessSize() const; + + protected: + std::vector begin = {0}; + std::vector size = {-1}; }; } // namespace lite } // namespace mindspore diff --git a/mindspore/lite/src/ops/softmax.cc b/mindspore/lite/src/ops/softmax.cc index 6d56cefe5c..2640d3edbc 100644 --- a/mindspore/lite/src/ops/softmax.cc +++ b/mindspore/lite/src/ops/softmax.cc @@ -26,8 +26,19 @@ void SoftMax::SetAxis(int axis) { this->primitive_->value.AsSoftMax()->axis = ax #else int SoftMax::GetAxis() const { return this->primitive_->value_as_SoftMax()->axis(); } - -void SoftMax::SetAxis(int axis) {} +int SoftMax::UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) { + MS_ASSERT(nullptr != primitive); + MS_ASSERT(nullptr != fbb); + auto attr = primitive->value_as_SoftMax(); + if (attr == nullptr) { + MS_LOG(ERROR) << "value_as_SoftMax return nullptr"; + return RET_ERROR; + } + auto val_offset = schema::CreateSoftMax(*fbb, attr->axis()); + auto prim_offset = schema::CreatePrimitive(*fbb, schema::PrimitiveType_SoftMax, val_offset.o); + fbb->Finish(prim_offset); + return RET_OK; +} #endif int SoftMax::InferShape(std::vector inputs_, std::vector outputs_) { diff --git a/mindspore/lite/src/ops/softmax.h b/mindspore/lite/src/ops/softmax.h index 41e11ae044..aa7dc5db88 100644 --- a/mindspore/lite/src/ops/softmax.h +++ b/mindspore/lite/src/ops/softmax.h @@ -28,14 +28,18 @@ namespace lite { class SoftMax : public PrimitiveC { public: #ifdef PRIMITIVE_WRITEABLE + MS_DECLARE_PARENT(SoftMax, PrimitiveC); SoftMax() = default; explicit SoftMax(schema::PrimitiveT *primitive) : PrimitiveC(primitive) {} + void SetAxis(int axis); + #else - explicit SoftMax(schema::Primitive *primitive) : PrimitiveC(primitive) {} + SoftMax() = default; + + int UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) override; #endif int InferShape(std::vector inputs_, std::vector outputs_) override; int GetAxis() const; - void SetAxis(int axis); }; } // namespace lite } // namespace mindspore diff --git a/mindspore/lite/src/ops/softmax_cross_entropy.cc b/mindspore/lite/src/ops/softmax_cross_entropy.cc index 8e863ba30f..8be8ca1d88 100644 --- a/mindspore/lite/src/ops/softmax_cross_entropy.cc +++ b/mindspore/lite/src/ops/softmax_cross_entropy.cc @@ -31,8 +31,25 @@ std::vector SoftmaxCrossEntropy::GetAxis() const { auto fb_vector = this->primitive_->value_as_SoftmaxCrossEntropy()->axis(); return std::vector(fb_vector->begin(), fb_vector->end()); } - -void SoftmaxCrossEntropy::SetAxis(const std::vector &axis) {} +int SoftmaxCrossEntropy::UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) { + MS_ASSERT(nullptr != primitive); + MS_ASSERT(nullptr != fbb); + auto attr = primitive->value_as_SoftmaxCrossEntropy(); + if (attr == nullptr) { + MS_LOG(ERROR) << "value_as_SoftmaxCrossEntropy return nullptr"; + return RET_ERROR; + } + std::vector axis; + if (attr->axis() != nullptr) { + for (int i = 0; i < static_cast(attr->axis()->size()); i++) { + axis.push_back(attr->axis()->data()[i]); + } + } + auto val_offset = schema::CreateSoftmaxCrossEntropyDirect(*fbb, &axis); + auto prim_offset = schema::CreatePrimitive(*fbb, schema::PrimitiveType_SoftmaxCrossEntropy, val_offset.o); + fbb->Finish(prim_offset); + return RET_OK; +} #endif } // namespace lite } // namespace mindspore diff --git a/mindspore/lite/src/ops/softmax_cross_entropy.h b/mindspore/lite/src/ops/softmax_cross_entropy.h index 
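Returning to the Slice changes above: the new axes handling first normalizes the possibly partial (begin, size, axes) triple to full-rank begin and size vectors, and only then runs the bounds checks; GetPostProcessBegin and GetPostProcessSize expose the normalized vectors to the runtime kernel. A standalone version of that normalization:

#include <cassert>
#include <vector>

// Expand per-axis (begin, size, axes) into full-rank begin/size, defaulting
// untouched axes to begin 0 and size -1 ("to the end"), as InferShape does.
struct NormalizedSlice {
  std::vector<int> begin;
  std::vector<int> size;
};

NormalizedSlice Normalize(size_t rank, const std::vector<int> &slice_begin,
                          const std::vector<int> &slice_size,
                          const std::vector<int> &slice_axes) {
  NormalizedSlice n;
  n.begin.assign(rank, 0);
  n.size.assign(rank, -1);
  for (size_t i = 0; i < slice_axes.size(); ++i) {
    n.begin[slice_axes[i]] = slice_begin[i];
    n.size[slice_axes[i]] = slice_size[i];
  }
  return n;
}

int main() {
  // Slice only axis 2 of a 4-D tensor: begin at 5, take 7 elements.
  auto n = Normalize(4, {5}, {7}, {2});
  assert(n.begin == (std::vector<int>{0, 0, 5, 0}));
  assert(n.size == (std::vector<int>{-1, -1, 7, -1}));
  return 0;
}

Each output dimension then becomes size[i] < 0 ? input_shape[i] - begin[i] : size[i], exactly as in the loop above.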
169b966cd9..44449d0bfd 100644 --- a/mindspore/lite/src/ops/softmax_cross_entropy.h +++ b/mindspore/lite/src/ops/softmax_cross_entropy.h @@ -20,6 +20,7 @@ #include #include #include +#include #include "ir/dtype/type_id.h" #include "src/ops/primitive_c.h" @@ -28,13 +29,17 @@ namespace lite { class SoftmaxCrossEntropy : public PrimitiveC { public: #ifdef PRIMITIVE_WRITEABLE + MS_DECLARE_PARENT(SoftmaxCrossEntropy, PrimitiveC); SoftmaxCrossEntropy() = default; explicit SoftmaxCrossEntropy(schema::PrimitiveT *primitive) : PrimitiveC(primitive) {} + void SetAxis(const std::vector &axis); + #else - explicit SoftmaxCrossEntropy(schema::Primitive *primitive) : PrimitiveC(primitive) {} + SoftmaxCrossEntropy() = default; + + int UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) override; #endif std::vector GetAxis() const; - void SetAxis(const std::vector &axis); }; } // namespace lite } // namespace mindspore diff --git a/mindspore/lite/src/ops/space_to_batch.cc b/mindspore/lite/src/ops/space_to_batch.cc index 2b92c0b863..ac2902a307 100644 --- a/mindspore/lite/src/ops/space_to_batch.cc +++ b/mindspore/lite/src/ops/space_to_batch.cc @@ -40,9 +40,31 @@ std::vector SpaceToBatch::GetPaddings() const { auto fb_vector = this->primitive_->value_as_SpaceToBatch()->paddings(); return std::vector(fb_vector->begin(), fb_vector->end()); } - -void SpaceToBatch::SetBlockShape(const std::vector &block_shape) {} -void SpaceToBatch::SetPaddings(const std::vector &paddings) {} +int SpaceToBatch::UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) { + MS_ASSERT(nullptr != primitive); + MS_ASSERT(nullptr != fbb); + auto attr = primitive->value_as_SpaceToBatch(); + if (attr == nullptr) { + MS_LOG(ERROR) << "value_as_SpaceToBatch return nullptr"; + return RET_ERROR; + } + std::vector blockShape; + if (attr->blockShape() != nullptr) { + for (int i = 0; i < static_cast(attr->blockShape()->size()); i++) { + blockShape.push_back(attr->blockShape()->data()[i]); + } + } + std::vector paddings; + if (attr->paddings() != nullptr) { + for (int i = 0; i < static_cast(attr->paddings()->size()); i++) { + paddings.push_back(attr->paddings()->data()[i]); + } + } + auto val_offset = schema::CreateSpaceToBatchDirect(*fbb, &blockShape, &paddings); + auto prim_offset = schema::CreatePrimitive(*fbb, schema::PrimitiveType_SpaceToBatch, val_offset.o); + fbb->Finish(prim_offset); + return RET_OK; +} #endif namespace { constexpr int kSpaceToBatchNDOutputNum = 1; @@ -107,8 +129,8 @@ int SpaceToBatch::InferShape(std::vector inputs, std::ve std::vector output_shape(input_shape.size()); output_shape[NHWC_N] = input_shape[NHWC_N] * (block_sizes_[NHWC_N] * block_sizes_[NHWC_H]); - output_shape[NHWC_H] = input_shape[NHWC_H] / block_sizes_[NHWC_N]; - output_shape[NHWC_W] = input_shape[NHWC_W] / block_sizes_[NHWC_H]; + output_shape[NHWC_H] = (input_shape[NHWC_H] + paddings_[0] + paddings_[1]) / block_sizes_[NHWC_N]; + output_shape[NHWC_W] = (input_shape[NHWC_W] + paddings_[2] + paddings_[3]) / block_sizes_[NHWC_H]; output_shape[NHWC_C] = input_shape[NHWC_C]; outputs[0]->set_shape(output_shape); return RET_OK; diff --git a/mindspore/lite/src/ops/space_to_batch.h b/mindspore/lite/src/ops/space_to_batch.h index 3fb81398af..3c3888bcb7 100644 --- a/mindspore/lite/src/ops/space_to_batch.h +++ b/mindspore/lite/src/ops/space_to_batch.h @@ -20,6 +20,7 @@ #include #include #include +#include #include "ir/dtype/type_id.h" #include "src/ops/primitive_c.h" @@ -28,16 +29,20 @@ 
namespace lite {
 class SpaceToBatch : public PrimitiveC {
  public:
 #ifdef PRIMITIVE_WRITEABLE
+  MS_DECLARE_PARENT(SpaceToBatch, PrimitiveC);
   SpaceToBatch() = default;
   explicit SpaceToBatch(schema::PrimitiveT *primitive) : PrimitiveC(primitive) {}
+  void SetBlockShape(const std::vector<int> &block_shape);
+  void SetPaddings(const std::vector<int> &paddings);
 #else
-  explicit SpaceToBatch(schema::Primitive *primitive) : PrimitiveC(primitive) {}
+  SpaceToBatch() = default;
+
+  int UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) override;
 #endif
-  int InferShape(std::vector<lite::tensor::Tensor *> inputs_, std::vector<lite::tensor::Tensor *> outputs_) override;
+  int InferShape(std::vector<lite::tensor::Tensor *> inputs, std::vector<lite::tensor::Tensor *> outputs) override;
+
+  std::vector<int> GetBlockShape() const;
   std::vector<int> GetPaddings() const;
-  void SetBlockShape(const std::vector<int> &block_shape);
-  void SetPaddings(const std::vector<int> &paddings);
   std::vector<int> BlockSizes() { return block_sizes_; }
   std::vector<int> Paddings() { return block_sizes_; }
diff --git a/mindspore/lite/src/ops/space_to_batch_nd.cc b/mindspore/lite/src/ops/space_to_batch_nd.cc
index ff9a32ff5f..23d5bfd18e 100644
--- a/mindspore/lite/src/ops/space_to_batch_nd.cc
+++ b/mindspore/lite/src/ops/space_to_batch_nd.cc
@@ -15,9 +15,17 @@
  */
 #include "src/ops/space_to_batch_nd.h"
+#include "src/common/common.h"
 namespace mindspore {
 namespace lite {
+namespace {
+constexpr int kSpaceToBatchNDOutputNum = 1;
+constexpr int kSpaceToBatchNDInputNum = 1;
+constexpr int kBlockSizesSize = 2;
+constexpr int kPaddingsSize = 4;
+}  // namespace
+
 #ifdef PRIMITIVE_WRITEABLE
 std::vector<int> SpaceToBatchND::GetBlockShape() const {
   return this->primitive_->value.AsSpaceToBatchND()->blockShape;
@@ -42,8 +50,74 @@ std::vector<int> SpaceToBatchND::GetPaddings() const {
   return std::vector<int>(fb_vector->begin(), fb_vector->end());
 }
-void SpaceToBatchND::SetBlockShape(const std::vector<int> &block_shape) {}
-void SpaceToBatchND::SetPaddings(const std::vector<int> &paddings) {}
-#endif
+int SpaceToBatchND::UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) {
+  MS_ASSERT(nullptr != primitive);
+  MS_ASSERT(nullptr != fbb);
+  auto attr = primitive->value_as_SpaceToBatchND();
+  if (attr == nullptr) {
+    MS_LOG(ERROR) << "value_as_SpaceToBatchND return nullptr";
+    return RET_ERROR;
+  }
+  std::vector<int> blockShape;
+  if (attr->blockShape() != nullptr) {
+    for (int i = 0; i < static_cast<int>(attr->blockShape()->size()); i++) {
+      blockShape.push_back(attr->blockShape()->data()[i]);
+    }
+  }
+  std::vector<int> paddings;
+  if (attr->paddings() != nullptr) {
+    for (int i = 0; i < static_cast<int>(attr->paddings()->size()); i++) {
+      paddings.push_back(attr->paddings()->data()[i]);
+    }
+  }
+  auto val_offset = schema::CreateSpaceToBatchNDDirect(*fbb, &blockShape, &paddings);
+  auto prim_offset = schema::CreatePrimitive(*fbb, schema::PrimitiveType_SpaceToBatchND, val_offset.o);
+  fbb->Finish(prim_offset);
+  return RET_OK;
+}
+
+#endif  // PRIMITIVE_WRITEABLE
+
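Both SpaceToBatch::InferShape (fixed above to include the paddings) and the new SpaceToBatchND::InferShape below compute the output shape the same way: padded height and width are divided by the block sizes, and the block factor moves into the batch dimension. A worked check of that arithmetic:

#include <cassert>
#include <vector>

// NHWC output shape for space-to-batch: the padded H/W must divide evenly by
// the block sizes, and every block of padded pixels moves into the batch dim.
std::vector<int> SpaceToBatchShape(const std::vector<int> &in,     // {N,H,W,C}
                                   const std::vector<int> &block,  // {bH,bW}
                                   const std::vector<int> &pad) {  // {top,bottom,left,right}
  return {in[0] * block[0] * block[1],
          (in[1] + pad[0] + pad[1]) / block[0],
          (in[2] + pad[2] + pad[3]) / block[1],
          in[3]};
}

int main() {
  // 1x5x5x3 input, 2x2 blocks, H and W padded up to 6: output is 4x3x3x3.
  auto out = SpaceToBatchShape({1, 5, 5, 3}, {2, 2}, {0, 1, 0, 1});
  assert(out == (std::vector<int>{4, 3, 3, 3}));
  return 0;
}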
output size: " << outputs.size() << ",input size: " << inputs.size(); + return 1; + } + + auto input = inputs.at(0); + if (input->GetFormat() != schema::Format_NHWC) { + MS_LOG(ERROR) << "space_to_batch_nd only support NHWC now!"; + return RET_ERROR; + } + outputs[0]->set_data_type(input->data_type()); + outputs[0]->SetFormat(input->GetFormat()); + if (!GetInferFlag()) { + return RET_OK; + } + auto input_shape = input->shape(); + if (input_shape.size() != kDimension_4d) { + MS_LOG(ERROR) << "input shape dimension size only support " << kDimension_4d << " now!"; + return RET_ERROR; + } + auto block_shape = GetBlockShape(); + if (block_shape.size() != kBlockSizesSize) { + MS_LOG(ERROR) << "blockShape size != " << kBlockSizesSize; + return RET_ERROR; + } + auto pedding = GetPaddings(); + if (pedding.size() != kPaddingsSize) { + MS_LOG(ERROR) << "pedding size should be " << kPaddingsSize; + return RET_ERROR; + } + + std::vector output_shape(input_shape.size()); + output_shape[NHWC_N] = input_shape[NHWC_N] * block_shape[0] * block_shape[1]; + output_shape[NHWC_H] = (input_shape[NHWC_H] + pedding[0] + pedding[1]) / block_shape[0]; + output_shape[NHWC_W] = (input_shape[NHWC_W] + pedding[2] + pedding[3]) / block_shape[1]; + output_shape[NHWC_C] = input_shape[NHWC_C]; + outputs[0]->set_shape(output_shape); + return RET_OK; +} } // namespace lite } // namespace mindspore diff --git a/mindspore/lite/src/ops/space_to_batch_nd.h b/mindspore/lite/src/ops/space_to_batch_nd.h index 4cccdaeb6a..f308efd509 100644 --- a/mindspore/lite/src/ops/space_to_batch_nd.h +++ b/mindspore/lite/src/ops/space_to_batch_nd.h @@ -18,8 +18,7 @@ #define LITE_MINDSPORE_LITE_C_OPS_SPACE_TO_BATCH_N_D_H_ #include -#include -#include +#include #include "ir/dtype/type_id.h" #include "src/ops/primitive_c.h" @@ -28,15 +27,19 @@ namespace lite { class SpaceToBatchND : public PrimitiveC { public: #ifdef PRIMITIVE_WRITEABLE + MS_DECLARE_PARENT(SpaceToBatchND, PrimitiveC); SpaceToBatchND() = default; explicit SpaceToBatchND(schema::PrimitiveT *primitive) : PrimitiveC(primitive) {} + void SetBlockShape(const std::vector &block_shape); + void SetPaddings(const std::vector &paddings); #else - explicit SpaceToBatchND(schema::Primitive *primitive) : PrimitiveC(primitive) {} + SpaceToBatchND() = default; + + int UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) override; #endif std::vector GetBlockShape() const; std::vector GetPaddings() const; - void SetBlockShape(const std::vector &block_shape); - void SetPaddings(const std::vector &paddings); + int InferShape(std::vector inputs, std::vector outputs) override; }; } // namespace lite } // namespace mindspore diff --git a/mindspore/lite/src/ops/space_to_depth.cc b/mindspore/lite/src/ops/space_to_depth.cc index 22c35a5da0..f98956d089 100644 --- a/mindspore/lite/src/ops/space_to_depth.cc +++ b/mindspore/lite/src/ops/space_to_depth.cc @@ -30,9 +30,19 @@ void SpaceToDepth::SetFormat(int format) { this->primitive_->value.AsSpaceToDept int SpaceToDepth::GetBlockSize() const { return this->primitive_->value_as_SpaceToDepth()->blockSize(); } int SpaceToDepth::GetFormat() const { return this->primitive_->value_as_SpaceToDepth()->format(); } - -void SpaceToDepth::SetBlockSize(int block_size) {} -void SpaceToDepth::SetFormat(int format) {} +int SpaceToDepth::UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) { + MS_ASSERT(nullptr != primitive); + MS_ASSERT(nullptr != fbb); + auto attr = 
primitive->value_as_SpaceToDepth(); + if (attr == nullptr) { + MS_LOG(ERROR) << "value_as_SpaceToDepth return nullptr"; + return RET_ERROR; + } + auto val_offset = schema::CreateSpaceToDepth(*fbb, attr->blockSize(), attr->format()); + auto prim_offset = schema::CreatePrimitive(*fbb, schema::PrimitiveType_SpaceToDepth, val_offset.o); + fbb->Finish(prim_offset); + return RET_OK; +} #endif namespace { constexpr int kSpaceToDepthOutputNum = 1; diff --git a/mindspore/lite/src/ops/space_to_depth.h b/mindspore/lite/src/ops/space_to_depth.h index cd888825d6..8edeb3ea0f 100644 --- a/mindspore/lite/src/ops/space_to_depth.h +++ b/mindspore/lite/src/ops/space_to_depth.h @@ -28,16 +28,19 @@ namespace lite { class SpaceToDepth : public PrimitiveC { public: #ifdef PRIMITIVE_WRITEABLE + MS_DECLARE_PARENT(SpaceToDepth, PrimitiveC); SpaceToDepth() = default; explicit SpaceToDepth(schema::PrimitiveT *primitive) : PrimitiveC(primitive) {} + void SetBlockSize(int block_size); + void SetFormat(int format); #else - explicit SpaceToDepth(schema::Primitive *primitive) : PrimitiveC(primitive) {} + SpaceToDepth() = default; + + int UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) override; #endif int InferShape(std::vector inputs_, std::vector outputs_) override; int GetBlockSize() const; int GetFormat() const; - void SetBlockSize(int block_size); - void SetFormat(int format); }; } // namespace lite } // namespace mindspore diff --git a/mindspore/lite/src/ops/sparse_to_dense.cc b/mindspore/lite/src/ops/sparse_to_dense.cc index 85a978a2f2..c59ce46473 100644 --- a/mindspore/lite/src/ops/sparse_to_dense.cc +++ b/mindspore/lite/src/ops/sparse_to_dense.cc @@ -58,11 +58,37 @@ std::vector SparseToDense::GetDefaultValue() const { return std::vector(fb_vector->begin(), fb_vector->end()); } bool SparseToDense::GetValidateIndices() const { return this->primitive_->value_as_SparseToDense()->validateIndices(); } - -void SparseToDense::SetOutputShape(const std::vector &output_shape) {} -void SparseToDense::SetSparseValue(const std::vector &sparse_value) {} -void SparseToDense::SetDefaultValue(const std::vector &default_value) {} -void SparseToDense::SetValidateIndices(bool validate_indices) {} +int SparseToDense::UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) { + MS_ASSERT(nullptr != primitive); + MS_ASSERT(nullptr != fbb); + auto attr = primitive->value_as_SparseToDense(); + if (attr == nullptr) { + MS_LOG(ERROR) << "value_as_SparseToDense return nullptr"; + return RET_ERROR; + } + std::vector outputShape; + if (attr->outputShape() != nullptr) { + for (int i = 0; i < static_cast(attr->outputShape()->size()); i++) { + outputShape.push_back(attr->outputShape()->data()[i]); + } + } + std::vector sparseValue; + if (attr->sparseValue() != nullptr) { + for (int i = 0; i < static_cast(attr->sparseValue()->size()); i++) { + sparseValue.push_back(attr->sparseValue()->data()[i]); + } + } + std::vector defaultValue; + if (attr->defaultValue() != nullptr) { + for (int i = 0; i < static_cast(attr->defaultValue()->size()); i++) { + defaultValue.push_back(attr->defaultValue()->data()[i]); + } + } + auto val_offset = schema::CreateSparseToDenseDirect(*fbb, &outputShape, &sparseValue, &defaultValue); + auto prim_offset = schema::CreatePrimitive(*fbb, schema::PrimitiveType_SparseToDense, val_offset.o); + fbb->Finish(prim_offset); + return RET_OK; +} #endif } // namespace lite } // namespace mindspore diff --git a/mindspore/lite/src/ops/sparse_to_dense.h 
b/mindspore/lite/src/ops/sparse_to_dense.h index 40c8798bd7..d98a843975 100644 --- a/mindspore/lite/src/ops/sparse_to_dense.h +++ b/mindspore/lite/src/ops/sparse_to_dense.h @@ -20,6 +20,7 @@ #include #include #include +#include #include "ir/dtype/type_id.h" #include "src/ops/primitive_c.h" @@ -28,19 +29,22 @@ namespace lite { class SparseToDense : public PrimitiveC { public: #ifdef PRIMITIVE_WRITEABLE + MS_DECLARE_PARENT(SparseToDense, PrimitiveC); SparseToDense() = default; explicit SparseToDense(schema::PrimitiveT *primitive) : PrimitiveC(primitive) {} + void SetOutputShape(const std::vector &output_shape); + void SetSparseValue(const std::vector &sparse_value); + void SetDefaultValue(const std::vector &default_value); + void SetValidateIndices(bool validate_indices); #else - explicit SparseToDense(schema::Primitive *primitive) : PrimitiveC(primitive) {} + SparseToDense() = default; + + int UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) override; #endif std::vector GetOutputShape() const; std::vector GetSparseValue() const; std::vector GetDefaultValue() const; bool GetValidateIndices() const; - void SetOutputShape(const std::vector &output_shape); - void SetSparseValue(const std::vector &sparse_value); - void SetDefaultValue(const std::vector &default_value); - void SetValidateIndices(bool validate_indices); }; } // namespace lite } // namespace mindspore diff --git a/mindspore/lite/src/ops/split.cc b/mindspore/lite/src/ops/split.cc index 6df48138a0..a7bde44996 100644 --- a/mindspore/lite/src/ops/split.cc +++ b/mindspore/lite/src/ops/split.cc @@ -38,9 +38,25 @@ std::vector Split::GetSizeSplits() const { } int Split::GetSplitDim() const { return this->primitive_->value_as_Split()->splitDim(); } -void Split::SetNumberSplit(int number_split) {} -void Split::SetSizeSplits(const std::vector &size_splits) {} -void Split::SetSplitDim(int split_dim) {} +int Split::UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) { + MS_ASSERT(nullptr != primitive); + MS_ASSERT(nullptr != fbb); + auto attr = primitive->value_as_Split(); + if (attr == nullptr) { + MS_LOG(ERROR) << "value_as_Split return nullptr"; + return RET_ERROR; + } + std::vector sizeSplits; + if (attr->sizeSplits() != nullptr) { + for (int i = 0; i < static_cast(attr->sizeSplits()->size()); i++) { + sizeSplits.push_back(attr->sizeSplits()->data()[i]); + } + } + auto val_offset = schema::CreateSplitDirect(*fbb, attr->numberSplit(), &sizeSplits, attr->splitDim()); + auto prim_offset = schema::CreatePrimitive(*fbb, schema::PrimitiveType_Split, val_offset.o); + fbb->Finish(prim_offset); + return RET_OK; +} #endif namespace { diff --git a/mindspore/lite/src/ops/split.h b/mindspore/lite/src/ops/split.h index d8521329c3..86c9fe3594 100644 --- a/mindspore/lite/src/ops/split.h +++ b/mindspore/lite/src/ops/split.h @@ -20,6 +20,7 @@ #include #include #include +#include #include "ir/dtype/type_id.h" #include "src/ops/primitive_c.h" @@ -28,18 +29,21 @@ namespace lite { class Split : public PrimitiveC { public: #ifdef PRIMITIVE_WRITEABLE + MS_DECLARE_PARENT(Split, PrimitiveC); Split() = default; explicit Split(schema::PrimitiveT *primitive) : PrimitiveC(primitive) {} + void SetNumberSplit(int number_split); + void SetSizeSplits(const std::vector &size_splits); + void SetSplitDim(int split_dim); #else - explicit Split(schema::Primitive *primitive) : PrimitiveC(primitive) {} + Split() = default; + + int UnPackToFlatBuilder(const schema::Primitive *primitive, 
flatbuffers::FlatBufferBuilder *fbb) override; #endif int InferShape(std::vector inputs_, std::vector outputs_) override; int GetNumberSplit() const; std::vector GetSizeSplits() const; int GetSplitDim() const; - void SetNumberSplit(int number_split); - void SetSizeSplits(const std::vector &size_splits); - void SetSplitDim(int split_dim); }; } // namespace lite } // namespace mindspore diff --git a/mindspore/lite/src/ops/sqrt.cc b/mindspore/lite/src/ops/sqrt.cc new file mode 100644 index 0000000000..6035b5db8c --- /dev/null +++ b/mindspore/lite/src/ops/sqrt.cc @@ -0,0 +1,34 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "src/ops/sqrt.h" + +namespace mindspore { +namespace lite { +#ifdef PRIMITIVE_WRITEABLE +#else +int Sqrt::UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) { + MS_ASSERT(nullptr != primitive); + MS_ASSERT(nullptr != fbb); + + auto val_offset = schema::CreateSqrt(*fbb); + auto prim_offset = schema::CreatePrimitive(*fbb, schema::PrimitiveType_Sqrt, val_offset.o); + fbb->Finish(prim_offset); + return RET_OK; +} +#endif +} // namespace lite +} // namespace mindspore diff --git a/mindspore/lite/src/ops/sqrt.h b/mindspore/lite/src/ops/sqrt.h index 75202fd253..68b82eee60 100644 --- a/mindspore/lite/src/ops/sqrt.h +++ b/mindspore/lite/src/ops/sqrt.h @@ -21,17 +21,20 @@ #include #include #include "ir/dtype/type_id.h" -#include "src/ops/primitive_c.h" +#include "src/ops/arithmetic_self.h" namespace mindspore { namespace lite { class Sqrt : public ArithmeticSelf { public: #ifdef PRIMITIVE_WRITEABLE + MS_DECLARE_PARENT(Sqrt, ArithmeticSelf); Sqrt() = default; explicit Sqrt(schema::PrimitiveT *primitive) : ArithmeticSelf(primitive) {} #else - explicit Sqrt(schema::Primitive *primitive) : ArithmeticSelf(primitive) {} + Sqrt() = default; + + int UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) override; #endif }; } // namespace lite diff --git a/mindspore/lite/src/ops/square.cc b/mindspore/lite/src/ops/square.cc new file mode 100644 index 0000000000..89f5ba8dbf --- /dev/null +++ b/mindspore/lite/src/ops/square.cc @@ -0,0 +1,34 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "src/ops/square.h" + +namespace mindspore { +namespace lite { +#ifdef PRIMITIVE_WRITEABLE +#else +int Square::UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) { + MS_ASSERT(nullptr != primitive); + MS_ASSERT(nullptr != fbb); + + auto val_offset = schema::CreateSquare(*fbb); + auto prim_offset = schema::CreatePrimitive(*fbb, schema::PrimitiveType_Square, val_offset.o); + fbb->Finish(prim_offset); + return RET_OK; +} +#endif +} // namespace lite +} // namespace mindspore diff --git a/mindspore/lite/src/ops/square.h b/mindspore/lite/src/ops/square.h index 52a8fa00af..5ab29590ff 100644 --- a/mindspore/lite/src/ops/square.h +++ b/mindspore/lite/src/ops/square.h @@ -20,17 +20,20 @@ #include #include #include "ir/dtype/type_id.h" -#include "src/ops/primitive_c.h" +#include "src/ops/arithmetic_self.h" namespace mindspore { namespace lite { class Square : public ArithmeticSelf { public: #ifdef PRIMITIVE_WRITEABLE + MS_DECLARE_PARENT(Square, ArithmeticSelf); Square() = default; explicit Square(schema::PrimitiveT *primitive) : ArithmeticSelf(primitive) {} #else - explicit Square(schema::Primitive *primitive) : ArithmeticSelf(primitive) {} + Square() = default; + + int UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) override; #endif }; } // namespace lite diff --git a/mindspore/lite/src/ops/squared_difference.cc b/mindspore/lite/src/ops/squared_difference.cc new file mode 100644 index 0000000000..b602b29600 --- /dev/null +++ b/mindspore/lite/src/ops/squared_difference.cc @@ -0,0 +1,34 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "src/ops/squared_difference.h" + +namespace mindspore { +namespace lite { +#ifdef PRIMITIVE_WRITEABLE +#else +int SquaredDifference::UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) { + MS_ASSERT(nullptr != primitive); + MS_ASSERT(nullptr != fbb); + + auto val_offset = schema::CreateSquaredDifference(*fbb); + auto prim_offset = schema::CreatePrimitive(*fbb, schema::PrimitiveType_SquaredDifference, val_offset.o); + fbb->Finish(prim_offset); + return RET_OK; +} +#endif +} // namespace lite +} // namespace mindspore diff --git a/mindspore/lite/src/ops/squared_difference.h b/mindspore/lite/src/ops/squared_difference.h index e625f8a2fb..7b7a6a412f 100644 --- a/mindspore/lite/src/ops/squared_difference.h +++ b/mindspore/lite/src/ops/squared_difference.h @@ -21,17 +21,20 @@ #include #include #include "ir/dtype/type_id.h" -#include "src/ops/primitive_c.h" +#include "src/ops/arithmetic.h" namespace mindspore { namespace lite { class SquaredDifference : public Arithmetic { public: #ifdef PRIMITIVE_WRITEABLE + MS_DECLARE_PARENT(SquaredDifference, Arithmetic); SquaredDifference() = default; explicit SquaredDifference(schema::PrimitiveT *primitive) : Arithmetic(primitive) {} #else - explicit SquaredDifference(schema::Primitive *primitive) : Arithmetic(primitive) {} + SquaredDifference() = default; + + int UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) override; #endif }; } // namespace lite diff --git a/mindspore/lite/src/ops/squeeze.cc b/mindspore/lite/src/ops/squeeze.cc index 684300e38a..d9dbd6c734 100644 --- a/mindspore/lite/src/ops/squeeze.cc +++ b/mindspore/lite/src/ops/squeeze.cc @@ -29,8 +29,25 @@ std::vector Squeeze::GetAxis() const { auto fb_vector = this->primitive_->value_as_Squeeze()->axis(); return std::vector(fb_vector->begin(), fb_vector->end()); } - -void Squeeze::SetAxis(const std::vector &axis) {} +int Squeeze::UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) { + MS_ASSERT(nullptr != primitive); + MS_ASSERT(nullptr != fbb); + auto attr = primitive->value_as_Squeeze(); + if (attr == nullptr) { + MS_LOG(ERROR) << "value_as_Squeeze return nullptr"; + return RET_ERROR; + } + std::vector axis; + if (attr->axis() != nullptr) { + for (int i = 0; i < static_cast(attr->axis()->size()); i++) { + axis.push_back(attr->axis()->data()[i]); + } + } + auto val_offset = schema::CreateSqueezeDirect(*fbb, &axis); + auto prim_offset = schema::CreatePrimitive(*fbb, schema::PrimitiveType_Squeeze, val_offset.o); + fbb->Finish(prim_offset); + return RET_OK; +} #endif namespace { diff --git a/mindspore/lite/src/ops/squeeze.h b/mindspore/lite/src/ops/squeeze.h index 30b8968b8f..aec38d7d68 100644 --- a/mindspore/lite/src/ops/squeeze.h +++ b/mindspore/lite/src/ops/squeeze.h @@ -20,6 +20,7 @@ #include #include #include +#include #include "ir/dtype/type_id.h" #include "src/ops/primitive_c.h" @@ -28,14 +29,18 @@ namespace lite { class Squeeze : public PrimitiveC { public: #ifdef PRIMITIVE_WRITEABLE + MS_DECLARE_PARENT(Squeeze, PrimitiveC); Squeeze() = default; explicit Squeeze(schema::PrimitiveT *primitive) : PrimitiveC(primitive) {} + void SetAxis(const std::vector &axis); + #else - explicit Squeeze(schema::Primitive *primitive) : PrimitiveC(primitive) {} + Squeeze() = default; + + int UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) override; #endif int InferShape(std::vector inputs_, std::vector outputs_) override; 
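/* For reference: every UnPackToFlatBuilder in this patch follows the same round trip, shown just above for Squeeze: fetch the op's attribute table from the source schema::Primitive, deep-copy its flatbuffers vectors into std::vector, rebuild the table with the flatc-generated Create*Direct helper, and finish the buffer with a fresh Primitive. A condensed sketch of that pattern as a hypothetical free function (assumes the generated schema API and the RET_OK/RET_ERROR codes used throughout this patch): */
int RepackSqueeze(const schema::Primitive *src, flatbuffers::FlatBufferBuilder *fbb) {
  auto attr = src->value_as_Squeeze();
  if (attr == nullptr) {
    return RET_ERROR;  // wrong union member: this primitive is not a Squeeze
  }
  std::vector<int32_t> axis;  // deep copy so the new buffer owns its data
  if (attr->axis() != nullptr) {
    axis.assign(attr->axis()->begin(), attr->axis()->end());  // tighter form of the index loop
  }
  auto val_offset = schema::CreateSqueezeDirect(*fbb, &axis);
  auto prim_offset = schema::CreatePrimitive(*fbb, schema::PrimitiveType_Squeeze, val_offset.o);
  fbb->Finish(prim_offset);  // root the rebuilt primitive in the new buffer
  return RET_OK;
}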
std::vector GetAxis() const; - void SetAxis(const std::vector &axis); }; } // namespace lite } // namespace mindspore diff --git a/mindspore/lite/src/ops/stack.cc b/mindspore/lite/src/ops/stack.cc index c7e13175a2..f5f0c56df6 100644 --- a/mindspore/lite/src/ops/stack.cc +++ b/mindspore/lite/src/ops/stack.cc @@ -35,10 +35,25 @@ std::vector Stack::GetIsScale() const { auto fb_vector = this->primitive_->value_as_Stack()->isScale(); return std::vector(fb_vector->begin(), fb_vector->end()); } - -void Stack::SetAxis(int axis) {} -void Stack::SetN(int n) {} -void Stack::SetIsScale(const std::vector &is_scale) {} +int Stack::UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) { + MS_ASSERT(nullptr != primitive); + MS_ASSERT(nullptr != fbb); + auto attr = primitive->value_as_Stack(); + if (attr == nullptr) { + MS_LOG(ERROR) << "value_as_Stack return nullptr"; + return RET_ERROR; + } + std::vector isScale; + if (attr->isScale() != nullptr) { + for (int i = 0; i < static_cast(attr->isScale()->size()); i++) { + isScale.push_back(attr->isScale()->data()[i]); + } + } + auto val_offset = schema::CreateStackDirect(*fbb, attr->axis(), attr->n(), &isScale); + auto prim_offset = schema::CreatePrimitive(*fbb, schema::PrimitiveType_Stack, val_offset.o); + fbb->Finish(prim_offset); + return RET_OK; +} #endif namespace { diff --git a/mindspore/lite/src/ops/stack.h b/mindspore/lite/src/ops/stack.h index 37930cb434..b1e480349d 100644 --- a/mindspore/lite/src/ops/stack.h +++ b/mindspore/lite/src/ops/stack.h @@ -20,6 +20,7 @@ #include #include #include +#include #include "ir/dtype/type_id.h" #include "src/ops/primitive_c.h" @@ -28,18 +29,21 @@ namespace lite { class Stack : public PrimitiveC { public: #ifdef PRIMITIVE_WRITEABLE + MS_DECLARE_PARENT(Stack, PrimitiveC); Stack() = default; explicit Stack(schema::PrimitiveT *primitive) : PrimitiveC(primitive) {} + void SetAxis(int axis); + void SetN(int n); + void SetIsScale(const std::vector &is_scale); #else - explicit Stack(schema::Primitive *primitive) : PrimitiveC(primitive) {} + Stack() = default; + + int UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) override; #endif int InferShape(std::vector inputs_, std::vector outputs_) override; int GetAxis() const; int GetN() const; std::vector GetIsScale() const; - void SetAxis(int axis); - void SetN(int n); - void SetIsScale(const std::vector &is_scale); }; } // namespace lite } // namespace mindspore diff --git a/mindspore/lite/src/ops/strided_slice.cc b/mindspore/lite/src/ops/strided_slice.cc index 1f2b2e4e4e..892229b866 100644 --- a/mindspore/lite/src/ops/strided_slice.cc +++ b/mindspore/lite/src/ops/strided_slice.cc @@ -72,16 +72,45 @@ std::vector StridedSlice::GetIsScale() const { auto fb_vector = this->primitive_->value_as_StridedSlice()->isScale(); return std::vector(fb_vector->begin(), fb_vector->end()); } - -void StridedSlice::SetBeginMask(int begin_mask) {} -void StridedSlice::SetEndMask(int end_mask) {} -void StridedSlice::SetEllipsisMask(int ellipsis_mask) {} -void StridedSlice::SetNewAxisMask(int new_axis_mask) {} -void StridedSlice::SetShrinkAxisMask(int shrink_axis_mask) {} -void StridedSlice::SetBegin(const std::vector &begin) {} -void StridedSlice::SetEnd(const std::vector &end) {} -void StridedSlice::SetStride(const std::vector &stride) {} -void StridedSlice::SetIsScale(const std::vector &is_scale) {} +int StridedSlice::UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) { + 
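/* For reference: the beginMask, endMask, ellipsisMask, newAxisMask, and shrinkAxisMask attributes handled here follow the TensorFlow strided-slice convention: each mask is an int bit field with one bit per input dimension. A small illustration, kept inside this comment because it is not part of the patch:

     // true when dimension `dim` has its flag set in `mask`
     bool DimFlagged(int mask, int dim) { return ((mask >> dim) & 1) != 0; }

     // begin_mask = 0b101 means dimensions 0 and 2 ignore their `begin`
     // values and start from the beginning of the axis instead. */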
MS_ASSERT(nullptr != primitive); + MS_ASSERT(nullptr != fbb); + auto attr = primitive->value_as_StridedSlice(); + if (attr == nullptr) { + MS_LOG(ERROR) << "value_as_StridedSlice return nullptr"; + return RET_ERROR; + } + std::vector begin; + if (attr->begin() != nullptr) { + for (int i = 0; i < static_cast(attr->begin()->size()); i++) { + begin.push_back(attr->begin()->data()[i]); + } + } + std::vector end; + if (attr->end() != nullptr) { + for (int i = 0; i < static_cast(attr->end()->size()); i++) { + end.push_back(attr->end()->data()[i]); + } + } + std::vector stride; + if (attr->stride() != nullptr) { + for (int i = 0; i < static_cast(attr->stride()->size()); i++) { + stride.push_back(attr->stride()->data()[i]); + } + } + std::vector isScale; + if (attr->isScale() != nullptr) { + for (int i = 0; i < static_cast(attr->isScale()->size()); i++) { + isScale.push_back(attr->isScale()->data()[i]); + } + } + auto val_offset = + schema::CreateStridedSliceDirect(*fbb, attr->beginMask(), attr->endMask(), attr->ellipsisMask(), + attr->newAxisMask(), attr->shrinkAxisMask(), &begin, &end, &stride, &isScale); + auto prim_offset = schema::CreatePrimitive(*fbb, schema::PrimitiveType_StridedSlice, val_offset.o); + fbb->Finish(prim_offset); + return RET_OK; +} #endif namespace { constexpr int kStridedSliceOutputNum = 1; diff --git a/mindspore/lite/src/ops/strided_slice.h b/mindspore/lite/src/ops/strided_slice.h index 811d8bac15..13d8d9151a 100644 --- a/mindspore/lite/src/ops/strided_slice.h +++ b/mindspore/lite/src/ops/strided_slice.h @@ -20,6 +20,7 @@ #include #include #include +#include #include "ir/dtype/type_id.h" #include "src/ops/primitive_c.h" @@ -28,10 +29,22 @@ namespace lite { class StridedSlice : public PrimitiveC { public: #ifdef PRIMITIVE_WRITEABLE + MS_DECLARE_PARENT(StridedSlice, PrimitiveC); StridedSlice() = default; explicit StridedSlice(schema::PrimitiveT *primitive) : PrimitiveC(primitive) {} + void SetBeginMask(int begin_mask); + void SetEndMask(int end_mask); + void SetEllipsisMask(int ellipsis_mask); + void SetNewAxisMask(int new_axis_mask); + void SetShrinkAxisMask(int shrink_axis_mask); + void SetBegin(const std::vector &begin); + void SetEnd(const std::vector &end); + void SetStride(const std::vector &stride); + void SetIsScale(const std::vector &is_scale); #else - explicit StridedSlice(schema::Primitive *primitive) : PrimitiveC(primitive) {} + StridedSlice() = default; + + int UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) override; #endif int InferShape(std::vector inputs_, std::vector outputs_) override; int GetBeginMask() const; @@ -43,15 +56,6 @@ class StridedSlice : public PrimitiveC { std::vector GetEnd() const; std::vector GetStride() const; std::vector GetIsScale() const; - void SetBeginMask(int begin_mask); - void SetEndMask(int end_mask); - void SetEllipsisMask(int ellipsis_mask); - void SetNewAxisMask(int new_axis_mask); - void SetShrinkAxisMask(int shrink_axis_mask); - void SetBegin(const std::vector &begin); - void SetEnd(const std::vector &end); - void SetStride(const std::vector &stride); - void SetIsScale(const std::vector &is_scale); int NDims() { return this->ndim_; } void ApplyNewAxisMask(); diff --git a/mindspore/lite/src/ops/sub.cc b/mindspore/lite/src/ops/sub.cc index 45b188a2cf..bee2131df8 100644 --- a/mindspore/lite/src/ops/sub.cc +++ b/mindspore/lite/src/ops/sub.cc @@ -28,8 +28,19 @@ void Sub::SetActivationType(int activation_type) { #else int Sub::GetActivationType() const { return 
this->primitive_->value_as_Sub()->activationType(); } - -void Sub::SetActivationType(int activation_type) {} +int Sub::UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) { + MS_ASSERT(nullptr != primitive); + MS_ASSERT(nullptr != fbb); + auto attr = primitive->value_as_Sub(); + if (attr == nullptr) { + MS_LOG(ERROR) << "value_as_Sub return nullptr"; + return RET_ERROR; + } + auto val_offset = schema::CreateSub(*fbb, attr->activationType()); + auto prim_offset = schema::CreatePrimitive(*fbb, schema::PrimitiveType_Sub, val_offset.o); + fbb->Finish(prim_offset); + return RET_OK; +} #endif } // namespace lite } // namespace mindspore diff --git a/mindspore/lite/src/ops/sub.h b/mindspore/lite/src/ops/sub.h index 1f6d90c9fa..6b4058c368 100644 --- a/mindspore/lite/src/ops/sub.h +++ b/mindspore/lite/src/ops/sub.h @@ -28,13 +28,17 @@ namespace lite { class Sub : public Arithmetic { public: #ifdef PRIMITIVE_WRITEABLE + MS_DECLARE_PARENT(Sub, Arithmetic); Sub() = default; explicit Sub(schema::PrimitiveT *primitive) : Arithmetic(primitive) {} + void SetActivationType(int activation_type); + #else - explicit Sub(schema::Primitive *primitive) : Arithmetic(primitive) {} + Sub() = default; + + int UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) override; #endif int GetActivationType() const; - void SetActivationType(int activation_type); }; } // namespace lite } // namespace mindspore diff --git a/mindspore/lite/src/ops/tile.cc b/mindspore/lite/src/ops/tile.cc index 7714a30106..cf7058ceaf 100644 --- a/mindspore/lite/src/ops/tile.cc +++ b/mindspore/lite/src/ops/tile.cc @@ -35,14 +35,35 @@ std::vector Tile::GetMultiples() const { return std::vector(fb_vector->begin(), fb_vector->end()); } -void Tile::SetMultiples(const std::vector &multiples) {} - std::vector Tile::GetDims() const { auto fb_vector = this->primitive_->value_as_Tile()->dims(); return std::vector(fb_vector->begin(), fb_vector->end()); } - -void Tile::SetDims(const std::vector &dims) {} +int Tile::UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) { + MS_ASSERT(nullptr != primitive); + MS_ASSERT(nullptr != fbb); + auto attr = primitive->value_as_Tile(); + if (attr == nullptr) { + MS_LOG(ERROR) << "value_as_Tile return nullptr"; + return RET_ERROR; + } + std::vector multiples; + if (attr->multiples() != nullptr) { + for (int i = 0; i < static_cast(attr->multiples()->size()); i++) { + multiples.push_back(attr->multiples()->data()[i]); + } + } + std::vector dims; + if (attr->dims() != nullptr) { + for (int i = 0; i < static_cast(attr->dims()->size()); i++) { + dims.push_back(attr->dims()->data()[i]); + } + } + auto val_offset = schema::CreateTileDirect(*fbb, &multiples, &dims); + auto prim_offset = schema::CreatePrimitive(*fbb, schema::PrimitiveType_Tile, val_offset.o); + fbb->Finish(prim_offset); + return RET_OK; +} #endif int Tile::InferShape(std::vector inputs_, std::vector outputs_) { diff --git a/mindspore/lite/src/ops/tile.h b/mindspore/lite/src/ops/tile.h index 187129485e..f46685a605 100644 --- a/mindspore/lite/src/ops/tile.h +++ b/mindspore/lite/src/ops/tile.h @@ -20,6 +20,7 @@ #include #include #include +#include #include "ir/dtype/type_id.h" #include "src/ops/primitive_c.h" @@ -28,16 +29,20 @@ namespace lite { class Tile : public PrimitiveC { public: #ifdef PRIMITIVE_WRITEABLE + MS_DECLARE_PARENT(Tile, PrimitiveC); Tile() = default; explicit Tile(schema::PrimitiveT *primitive) : PrimitiveC(primitive) {} + void 
SetMultiples(const std::vector &multiples); + void SetDims(const std::vector &dims); + #else - explicit Tile(schema::Primitive *primitive) : PrimitiveC(primitive) {} + Tile() = default; + + int UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) override; #endif int InferShape(std::vector inputs_, std::vector outputs_) override; std::vector GetMultiples() const; - void SetMultiples(const std::vector &multiples); std::vector GetDims() const; - void SetDims(const std::vector &dims); }; } // namespace lite } // namespace mindspore diff --git a/mindspore/lite/src/ops/topk.cc b/mindspore/lite/src/ops/topk.cc index 38cb1e89ff..fb3a8a47e9 100644 --- a/mindspore/lite/src/ops/topk.cc +++ b/mindspore/lite/src/ops/topk.cc @@ -29,9 +29,19 @@ void TopK::SetSorted(bool sorted) { this->primitive_->value.AsTopK()->sorted = s int TopK::GetK() const { return this->primitive_->value_as_TopK()->k(); } bool TopK::GetSorted() const { return this->primitive_->value_as_TopK()->sorted(); } - -void TopK::SetK(int k) {} -void TopK::SetSorted(bool sorted) {} +int TopK::UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) { + MS_ASSERT(nullptr != primitive); + MS_ASSERT(nullptr != fbb); + auto attr = primitive->value_as_TopK(); + if (attr == nullptr) { + MS_LOG(ERROR) << "value_as_TopK return nullptr"; + return RET_ERROR; + } + auto val_offset = schema::CreateTopK(*fbb, attr->k(), attr->sorted()); + auto prim_offset = schema::CreatePrimitive(*fbb, schema::PrimitiveType_TopK, val_offset.o); + fbb->Finish(prim_offset); + return RET_OK; +} #endif int TopK::InferShape(std::vector inputs_, std::vector outputs_) { diff --git a/mindspore/lite/src/ops/topk.h b/mindspore/lite/src/ops/topk.h index 1c23040537..082f83fd7e 100644 --- a/mindspore/lite/src/ops/topk.h +++ b/mindspore/lite/src/ops/topk.h @@ -28,16 +28,19 @@ namespace lite { class TopK : public PrimitiveC { public: #ifdef PRIMITIVE_WRITEABLE + MS_DECLARE_PARENT(TopK, PrimitiveC); TopK() = default; explicit TopK(schema::PrimitiveT *primitive) : PrimitiveC(primitive) {} + void SetK(int k); + void SetSorted(bool sorted); #else - explicit TopK(schema::Primitive *primitive) : PrimitiveC(primitive) {} + TopK() = default; + + int UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) override; #endif int InferShape(std::vector inputs_, std::vector outputs_) override; int GetK() const; bool GetSorted() const; - void SetK(int k); - void SetSorted(bool sorted); }; } // namespace lite } // namespace mindspore diff --git a/mindspore/lite/src/ops/transpose.cc b/mindspore/lite/src/ops/transpose.cc index f69e5c37b7..11c057a6e1 100644 --- a/mindspore/lite/src/ops/transpose.cc +++ b/mindspore/lite/src/ops/transpose.cc @@ -29,28 +29,43 @@ void Transpose::SetPerm(const std::vector &perm) { this->primitive_->value. 
void Transpose::SetConjugate(bool conjugate) { this->primitive_->value.AsTranspose()->conjugate = conjugate; } int Transpose::UnPackAttr(const Primitive &prim, const std::vector<AnfNodePtr> &inputs) { - this->primitive_ = new (schema::PrimitiveT); - auto attr = std::make_unique<schema::TransposeT>(); - MS_ASSERT(inputs.size() == kAnfPopulaterTwo); - auto inputNode = inputs[kAnfPopulaterOne]; - if (inputNode->isa<ValueNode>()) { - auto valNode = inputNode->cast<ValueNodePtr>(); - MS_ASSERT(valNode != nullptr); - auto val = valNode->value(); - MS_ASSERT(val != nullptr); - if (val->isa<ValueTuple>()) { - auto tuple = val->cast<ValueTuplePtr>(); - MS_ASSERT(tuple != nullptr); - for (size_t i = 0; i < tuple->size(); i++) { - auto elem = tuple->value()[i]->cast<Int32ImmPtr>(); - MS_ASSERT(elem != nullptr); - attr->perm.emplace_back(static_cast<int>(elem->value())); + if (this->primitive_ == nullptr) { + this->primitive_ = new (std::nothrow) schema::PrimitiveT; + if (this->primitive_ == nullptr) { + MS_LOG(ERROR) << "new primitiveT failed"; + return RET_ERROR; + } + this->primitive_->value.type = schema::PrimitiveType_Transpose; + } + if (this->primitive_->value.type != schema::PrimitiveType_Transpose) { + MS_LOG(ERROR) << "Primitive type is error :" << this->primitive_->value.type; + return RET_ERROR; + } + if (this->primitive_->value.value == nullptr) { + auto attr = new (std::nothrow) schema::TransposeT(); + if (attr == nullptr) { + MS_LOG(ERROR) << "new TransposeT failed"; + return RET_ERROR; + } + MS_ASSERT(inputs.size() == kAnfPopulaterTwo); + auto inputNode = inputs[kAnfPopulaterOne]; + if (inputNode->isa<ValueNode>()) { + auto valNode = inputNode->cast<ValueNodePtr>(); + MS_ASSERT(valNode != nullptr); + auto val = valNode->value(); + MS_ASSERT(val != nullptr); + if (val->isa<ValueTuple>()) { + auto tuple = val->cast<ValueTuplePtr>(); + MS_ASSERT(tuple != nullptr); + for (size_t i = 0; i < tuple->size(); i++) { + auto elem = tuple->value()[i]->cast<Int32ImmPtr>(); + MS_ASSERT(elem != nullptr); + attr->perm.emplace_back(static_cast<int>(elem->value())); + } } } + this->primitive_->value.value = attr; + if (this->primitive_->value.value == nullptr) { + MS_LOG(ERROR) << "new primitiveT value failed"; + return RET_ERROR; + } } - - this->primitive_->value.type = schema::PrimitiveType_Transpose; - this->primitive_->value.value = attr.release(); return RET_OK; } @@ -62,8 +77,26 @@ std::vector<int> Transpose::GetPerm() const { } bool Transpose::GetConjugate() const { return this->primitive_->value_as_Transpose()->conjugate(); } -void Transpose::SetPerm(const std::vector<int> &perm) {} -void Transpose::SetConjugate(bool conjugate) {} +int Transpose::UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) { + MS_ASSERT(nullptr != primitive); + MS_ASSERT(nullptr != fbb); + auto attr = primitive->value_as_Transpose(); + if (attr == nullptr) { + MS_LOG(ERROR) << "value_as_Transpose return nullptr"; + return RET_ERROR; + } + std::vector<int32_t> perm; + if (attr->perm() != nullptr) { + for (int i = 0; i < static_cast<int>(attr->perm()->size()); i++) { + perm.push_back(attr->perm()->data()[i]); + } + } + + auto val_offset = schema::CreateTransposeDirect(*fbb, &perm, attr->conjugate()); + auto prim_offset = schema::CreatePrimitive(*fbb, schema::PrimitiveType_Transpose, val_offset.o); + fbb->Finish(prim_offset); + return RET_OK; +} #endif int Transpose::InferShape(std::vector<lite::tensor::Tensor *> inputs_, std::vector<lite::tensor::Tensor *> outputs_) { diff --git a/mindspore/lite/src/ops/transpose.h b/mindspore/lite/src/ops/transpose.h index 4cde724030..b12507993d 100644 --- a/mindspore/lite/src/ops/transpose.h +++ b/mindspore/lite/src/ops/transpose.h @@ -20,6 +20,7 @@ #include #include #include +#include #include "ir/dtype/type_id.h" #include "src/ops/primitive_c.h" @@ -28,17 +29,20 @@ namespace
lite { class Transpose : public PrimitiveC { public: #ifdef PRIMITIVE_WRITEABLE + MS_DECLARE_PARENT(Transpose, PrimitiveC); Transpose() = default; explicit Transpose(schema::PrimitiveT *primitive) : PrimitiveC(primitive) {} - int UnPackAttr(const Primitive &prim, const std::vector &inputs); + int UnPackAttr(const Primitive &prim, const std::vector &inputs) override; + void SetPerm(const std::vector &perm); + void SetConjugate(bool conjugate); #else - explicit Transpose(schema::Primitive *primitive) : PrimitiveC(primitive) {} + Transpose() = default; + + int UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) override; #endif int InferShape(std::vector inputs_, std::vector outputs_) override; std::vector GetPerm() const; bool GetConjugate() const; - void SetPerm(const std::vector &perm); - void SetConjugate(bool conjugate); }; } // namespace lite } // namespace mindspore diff --git a/mindspore/lite/src/ops/tuple_get_item.cc b/mindspore/lite/src/ops/tuple_get_item.cc index 1abf6b18a2..ac35bb6100 100644 --- a/mindspore/lite/src/ops/tuple_get_item.cc +++ b/mindspore/lite/src/ops/tuple_get_item.cc @@ -22,11 +22,39 @@ namespace mindspore { namespace lite { #ifdef PRIMITIVE_WRITEABLE int TupleGetItem::UnPackAttr(const Primitive &prim, const std::vector &inputs) { - this->primitive_ = new (schema::PrimitiveT); - auto attr = std::make_unique(); - this->primitive_->value.type = schema::PrimitiveType_TupleGetItem; - this->primitive_->value.value = attr.release(); - + if (this->primitive_ == nullptr) { + this->primitive_ = new (std::nothrow) schema::PrimitiveT; + if (this->primitive_ == nullptr) { + MS_LOG(ERROR) << "new primitiveT failed"; + return RET_ERROR; + } + this->primitive_->value.type = schema::PrimitiveType_TupleGetItem; + } + if (this->primitive_->value.type != schema::PrimitiveType_TupleGetItem) { + MS_LOG(ERROR) << "Primitive type is error :" << this->primitive_->value.type; + return RET_ERROR; + } + if (this->primitive_->value.value == nullptr) { + auto attr = new (std::nothrow) schema::TupleGetItemT(); + if (attr == nullptr) { + MS_LOG(ERROR) << "new primitiveT value failed"; + return RET_ERROR; + } + this->primitive_->value.value = attr; + if (this->primitive_->value.value == nullptr) { + MS_LOG(ERROR) << "primitive value is nullptr"; + return RET_ERROR; + } + } + return RET_OK; +} +#else +int TupleGetItem::UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) { + MS_ASSERT(nullptr != primitive); + MS_ASSERT(nullptr != fbb); + auto val_offset = schema::CreateTupleGetItem(*fbb); + auto prim_offset = schema::CreatePrimitive(*fbb, schema::PrimitiveType_TupleGetItem, val_offset.o); + fbb->Finish(prim_offset); return RET_OK; } #endif diff --git a/mindspore/lite/src/ops/tuple_get_item.h b/mindspore/lite/src/ops/tuple_get_item.h index 729a1cfc9b..e816c122b6 100644 --- a/mindspore/lite/src/ops/tuple_get_item.h +++ b/mindspore/lite/src/ops/tuple_get_item.h @@ -25,11 +25,13 @@ namespace lite { class TupleGetItem : public PrimitiveC { public: #ifdef PRIMITIVE_WRITEABLE + MS_DECLARE_PARENT(TupleGetItem, PrimitiveC); TupleGetItem() = default; explicit TupleGetItem(schema::PrimitiveT *primitive) : PrimitiveC(primitive) {} - int UnPackAttr(const Primitive &prim, const std::vector &inputs); + int UnPackAttr(const Primitive &prim, const std::vector &inputs) override; #else - explicit TupleGetItem(schema::Primitive *primitive) : PrimitiveC(primitive) {} + TupleGetItem() = default; + int UnPackToFlatBuilder(const 
schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) override; #endif }; } // namespace lite diff --git a/mindspore/lite/src/ops/unique.cc b/mindspore/lite/src/ops/unique.cc index 652114cafa..8f0a626bcc 100644 --- a/mindspore/lite/src/ops/unique.cc +++ b/mindspore/lite/src/ops/unique.cc @@ -26,8 +26,19 @@ void Unique::SetOutType(int out_type) { this->primitive_->value.AsUnique()->outT #else int Unique::GetOutType() const { return this->primitive_->value_as_Unique()->outType(); } - -void Unique::SetOutType(int out_type) {} +int Unique::UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) { + MS_ASSERT(nullptr != primitive); + MS_ASSERT(nullptr != fbb); + auto attr = primitive->value_as_Unique(); + if (attr == nullptr) { + MS_LOG(ERROR) << "value_as_Unique return nullptr"; + return RET_ERROR; + } + auto val_offset = schema::CreateUnique(*fbb, attr->outType()); + auto prim_offset = schema::CreatePrimitive(*fbb, schema::PrimitiveType_Unique, val_offset.o); + fbb->Finish(prim_offset); + return RET_OK; +} #endif int Unique::InferShape(std::vector inputs_, std::vector outputs_) { diff --git a/mindspore/lite/src/ops/unique.h b/mindspore/lite/src/ops/unique.h index c8ca722abf..4904a1e813 100644 --- a/mindspore/lite/src/ops/unique.h +++ b/mindspore/lite/src/ops/unique.h @@ -28,14 +28,18 @@ namespace lite { class Unique : public PrimitiveC { public: #ifdef PRIMITIVE_WRITEABLE + MS_DECLARE_PARENT(Unique, PrimitiveC); Unique() = default; explicit Unique(schema::PrimitiveT *primitive) : PrimitiveC(primitive) {} + void SetOutType(int out_type); + #else - explicit Unique(schema::Primitive *primitive) : PrimitiveC(primitive) {} + Unique() = default; + int UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) override; + #endif int InferShape(std::vector inputs_, std::vector outputs_) override; int GetOutType() const; - void SetOutType(int out_type); }; } // namespace lite } // namespace mindspore diff --git a/mindspore/lite/src/ops/unsqueeze.cc b/mindspore/lite/src/ops/unsqueeze.cc index f515bc9567..5d55cd19f2 100644 --- a/mindspore/lite/src/ops/unsqueeze.cc +++ b/mindspore/lite/src/ops/unsqueeze.cc @@ -32,8 +32,25 @@ std::vector Unsqueeze::GetAxis() const { auto fb_vector = this->primitive_->value_as_Unsqueeze()->axis(); return std::vector(fb_vector->begin(), fb_vector->end()); } - -void Unsqueeze::SetAxis(const std::vector &axis) {} +int Unsqueeze::UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) { + MS_ASSERT(nullptr != primitive); + MS_ASSERT(nullptr != fbb); + auto attr = primitive->value_as_Unsqueeze(); + if (attr == nullptr) { + MS_LOG(ERROR) << "value_as_Unsqueeze return nullptr"; + return RET_ERROR; + } + std::vector axis; + if (attr->axis() != nullptr) { + for (int i = 0; i < static_cast(attr->axis()->size()); i++) { + axis.push_back(attr->axis()->data()[i]); + } + } + auto val_offset = schema::CreateUnsqueezeDirect(*fbb, &axis); + auto prim_offset = schema::CreatePrimitive(*fbb, schema::PrimitiveType_Unsqueeze, val_offset.o); + fbb->Finish(prim_offset); + return RET_OK; +} #endif int Unsqueeze::InferShape(std::vector inputs_, std::vector outputs_) { @@ -54,7 +71,7 @@ int Unsqueeze::InferShape(std::vector inputs_, std::vectorshape(); auto in_rank = in_shape.size(); auto dim_rank = GetAxis().size(); diff --git a/mindspore/lite/src/ops/unsqueeze.h b/mindspore/lite/src/ops/unsqueeze.h index bea5831fcd..36bc2b261c 100644 --- a/mindspore/lite/src/ops/unsqueeze.h +++ 
b/mindspore/lite/src/ops/unsqueeze.h @@ -20,6 +20,7 @@ #include #include #include +#include #include "ir/dtype/type_id.h" #include "src/ops/primitive_c.h" @@ -28,14 +29,18 @@ namespace lite { class Unsqueeze : public PrimitiveC { public: #ifdef PRIMITIVE_WRITEABLE + MS_DECLARE_PARENT(Unsqueeze, PrimitiveC); Unsqueeze() = default; explicit Unsqueeze(schema::PrimitiveT *primitive) : PrimitiveC(primitive) {} + void SetAxis(const std::vector &axis); + #else - explicit Unsqueeze(schema::Primitive *primitive) : PrimitiveC(primitive) {} + Unsqueeze() = default; + + int UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) override; #endif int InferShape(std::vector inputs_, std::vector outputs_) override; std::vector GetAxis() const; - void SetAxis(const std::vector &axis); }; } // namespace lite } // namespace mindspore diff --git a/mindspore/lite/src/ops/unstack.cc b/mindspore/lite/src/ops/unstack.cc index 6d3a6ff03d..24da3faab6 100644 --- a/mindspore/lite/src/ops/unstack.cc +++ b/mindspore/lite/src/ops/unstack.cc @@ -29,9 +29,19 @@ void Unstack::SetAxis(int axis) { this->primitive_->value.AsUnstack()->axis = ax int Unstack::GetNum() const { return this->primitive_->value_as_Unstack()->num(); } int Unstack::GetAxis() const { return this->primitive_->value_as_Unstack()->axis(); } - -void Unstack::SetNum(int num) {} -void Unstack::SetAxis(int axis) {} +int Unstack::UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) { + MS_ASSERT(nullptr != primitive); + MS_ASSERT(nullptr != fbb); + auto attr = primitive->value_as_Unstack(); + if (attr == nullptr) { + MS_LOG(ERROR) << "value_as_Unstack return nullptr"; + return RET_ERROR; + } + auto val_offset = schema::CreateUnstack(*fbb, attr->num(), attr->axis()); + auto prim_offset = schema::CreatePrimitive(*fbb, schema::PrimitiveType_Unstack, val_offset.o); + fbb->Finish(prim_offset); + return RET_OK; +} #endif int Unstack::InferShape(std::vector inputs, std::vector outputs) { diff --git a/mindspore/lite/src/ops/unstack.h b/mindspore/lite/src/ops/unstack.h index 337f74ab7f..b0a24b672e 100644 --- a/mindspore/lite/src/ops/unstack.h +++ b/mindspore/lite/src/ops/unstack.h @@ -28,16 +28,19 @@ namespace lite { class Unstack : public PrimitiveC { public: #ifdef PRIMITIVE_WRITEABLE + MS_DECLARE_PARENT(Unstack, PrimitiveC); Unstack() = default; explicit Unstack(schema::PrimitiveT *primitive) : PrimitiveC(primitive) {} + void SetNum(int num); + void SetAxis(int axis); #else - explicit Unstack(schema::Primitive *primitive) : PrimitiveC(primitive) {} + Unstack() = default; + + int UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) override; #endif int InferShape(std::vector inputs_, std::vector outputs_) override; int GetNum() const; int GetAxis() const; - void SetNum(int num); - void SetAxis(int axis); }; } // namespace lite } // namespace mindspore diff --git a/mindspore/lite/src/ops/upsample.cc b/mindspore/lite/src/ops/upsample.cc index 9f0623bc29..10c9af70d8 100644 --- a/mindspore/lite/src/ops/upsample.cc +++ b/mindspore/lite/src/ops/upsample.cc @@ -33,9 +33,25 @@ std::vector Upsample::GetScales() const { auto fb_vector = this->primitive_->value_as_Upsample()->scales(); return std::vector(fb_vector->begin(), fb_vector->end()); } - -void Upsample::SetMode(std::string mode) {} -void Upsample::SetScales(const std::vector &scales) {} +int Upsample::UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) { + 
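/* For reference: the reworked UnPackAttr methods earlier in this patch (Transpose, TupleGetItem) replace the old `new (schema::PrimitiveT)` plus unique_ptr::release sequence with an idempotent lazy initialization. The control flow, condensed into a sketch with placeholder types, kept inside this comment because it is not part of the patch:

     #include <new>
     enum class PrimType { None, Transpose, TupleGetItem };
     struct PrimitiveT { PrimType type = PrimType::None; void *value = nullptr; };

     int LazyInit(PrimitiveT *&prim, PrimType want) {
       if (prim == nullptr) {
         prim = new (std::nothrow) PrimitiveT;  // no-throw new: must check the result
         if (prim == nullptr) return -1;        // RET_ERROR
         prim->type = want;
       }
       if (prim->type != want) return -1;       // object already holds another op
       return 0;                                // RET_OK: safe to fill prim->value
     }

   Calling it twice is harmless: the second call sees the object already allocated, verifies the type tag, and skips re-allocation. */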
MS_ASSERT(nullptr != primitive); + MS_ASSERT(nullptr != fbb); + auto attr = primitive->value_as_Upsample(); + if (attr == nullptr) { + MS_LOG(ERROR) << "value_as_Upsample return nullptr"; + return RET_ERROR; + } + std::vector scales; + if (attr->scales() != nullptr) { + for (int i = 0; i < static_cast(attr->scales()->size()); i++) { + scales.push_back(attr->scales()->data()[i]); + } + } + auto val_offset = schema::CreateUpsampleDirect(*fbb, attr->mode()->c_str(), &scales); + auto prim_offset = schema::CreatePrimitive(*fbb, schema::PrimitiveType_Upsample, val_offset.o); + fbb->Finish(prim_offset); + return RET_OK; +} #endif } // namespace lite } // namespace mindspore diff --git a/mindspore/lite/src/ops/upsample.h b/mindspore/lite/src/ops/upsample.h index 26df8d7604..3d9ace6abc 100644 --- a/mindspore/lite/src/ops/upsample.h +++ b/mindspore/lite/src/ops/upsample.h @@ -21,6 +21,7 @@ #include #include #include +#include #include "ir/dtype/type_id.h" #include "src/ops/primitive_c.h" @@ -29,15 +30,18 @@ namespace lite { class Upsample : public PrimitiveC { public: #ifdef PRIMITIVE_WRITEABLE + MS_DECLARE_PARENT(Upsample, PrimitiveC); Upsample() = default; explicit Upsample(schema::PrimitiveT *primitive) : PrimitiveC(primitive) {} + void SetMode(std::string mode); + void SetScales(const std::vector &scales); #else - explicit Upsample(schema::Primitive *primitive) : PrimitiveC(primitive) {} + Upsample() = default; + int UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) override; + #endif std::string GetMode() const; std::vector GetScales() const; - void SetMode(std::string mode); - void SetScales(const std::vector &scales); }; } // namespace lite } // namespace mindspore diff --git a/mindspore/lite/src/ops/where.cc b/mindspore/lite/src/ops/where.cc index 1641fd5e2c..39ca552670 100644 --- a/mindspore/lite/src/ops/where.cc +++ b/mindspore/lite/src/ops/where.cc @@ -31,8 +31,25 @@ std::vector Where::GetCondition() const { auto fb_vector = this->primitive_->value_as_Where()->condition(); return std::vector(fb_vector->begin(), fb_vector->end()); } - -void Where::SetCondition(const std::vector &condition) {} +int Where::UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) { + MS_ASSERT(nullptr != primitive); + MS_ASSERT(nullptr != fbb); + auto attr = primitive->value_as_Where(); + if (attr == nullptr) { + MS_LOG(ERROR) << "value_as_Where return nullptr"; + return RET_ERROR; + } + std::vector condition; + if (attr->condition() != nullptr) { + for (int i = 0; i < static_cast(attr->condition()->size()); i++) { + condition.push_back(attr->condition()->data()[i]); + } + } + auto val_offset = schema::CreateWhereDirect(*fbb, &condition); + auto prim_offset = schema::CreatePrimitive(*fbb, schema::PrimitiveType_Where, val_offset.o); + fbb->Finish(prim_offset); + return RET_OK; +} #endif int Where::InferShape(std::vector inputs_, std::vector outputs_) { diff --git a/mindspore/lite/src/ops/where.h b/mindspore/lite/src/ops/where.h index 7db38f9a5d..9597c813e2 100644 --- a/mindspore/lite/src/ops/where.h +++ b/mindspore/lite/src/ops/where.h @@ -20,6 +20,7 @@ #include #include #include +#include #include "ir/dtype/type_id.h" #include "src/ops/primitive_c.h" @@ -28,14 +29,18 @@ namespace lite { class Where : public PrimitiveC { public: #ifdef PRIMITIVE_WRITEABLE + MS_DECLARE_PARENT(Where, PrimitiveC); Where() = default; explicit Where(schema::PrimitiveT *primitive) : PrimitiveC(primitive) {} + void SetCondition(const std::vector 
&condition); + #else - explicit Where(schema::Primitive *primitive) : PrimitiveC(primitive) {} + Where() = default; + + int UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) override; #endif int InferShape(std::vector inputs_, std::vector outputs_) override; std::vector GetCondition() const; - void SetCondition(const std::vector &condition); }; } // namespace lite } // namespace mindspore diff --git a/mindspore/lite/src/ops/zeros_like.cc b/mindspore/lite/src/ops/zeros_like.cc index 23e674617d..f4562e38fc 100644 --- a/mindspore/lite/src/ops/zeros_like.cc +++ b/mindspore/lite/src/ops/zeros_like.cc @@ -18,6 +18,20 @@ namespace mindspore { namespace lite { + +#ifdef PRIMITIVE_WRITEABLE +#else +int ZerosLike::UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) { + MS_ASSERT(nullptr != primitive); + MS_ASSERT(nullptr != fbb); + + auto val_offset = schema::CreateZerosLike(*fbb); + auto prim_offset = schema::CreatePrimitive(*fbb, schema::PrimitiveType_ZerosLike, val_offset.o); + fbb->Finish(prim_offset); + return RET_OK; +} +#endif + int ZerosLike::InferShape(std::vector inputs_, std::vector outputs_) { MS_ASSERT(this->primitive_ != nullptr); auto input = inputs_.front(); @@ -37,5 +51,6 @@ int ZerosLike::InferShape(std::vector inputs_, std::vect output->set_shape(input->shape()); return RET_OK; } + } // namespace lite } // namespace mindspore diff --git a/mindspore/lite/src/ops/zeros_like.h b/mindspore/lite/src/ops/zeros_like.h index 36524220df..08a0325d11 100644 --- a/mindspore/lite/src/ops/zeros_like.h +++ b/mindspore/lite/src/ops/zeros_like.h @@ -28,10 +28,12 @@ namespace lite { class ZerosLike : public PrimitiveC { public: #ifdef PRIMITIVE_WRITEABLE + MS_DECLARE_PARENT(ZerosLike, PrimitiveC); ZerosLike() = default; explicit ZerosLike(schema::PrimitiveT *primitive) : PrimitiveC(primitive) {} #else - explicit ZerosLike(schema::Primitive *primitive) : PrimitiveC(primitive) {} + ZerosLike() = default; + int UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) override; #endif int InferShape(std::vector inputs_, std::vector outputs_) override; }; diff --git a/mindspore/lite/src/param_value_lite.h b/mindspore/lite/src/param_value_lite.h index f747042575..a57c4b8140 100644 --- a/mindspore/lite/src/param_value_lite.h +++ b/mindspore/lite/src/param_value_lite.h @@ -31,8 +31,8 @@ class ParamValueLite : public Value { ParamValueLite() : tensor_addr_(nullptr), tensor_size_(0) {} virtual ~ParamValueLite() { if (tensor_addr_ != nullptr) { - auto tensor_mem = reinterpret_cast(tensor_addr_); - delete tensor_mem; + auto tensor_mem = reinterpret_cast(tensor_addr_); + delete[](tensor_mem); tensor_addr_ = nullptr; tensor_size_ = 0; } diff --git a/mindspore/lite/src/populate_parameter.cc b/mindspore/lite/src/populate_parameter.cc index 33621ea615..b1bb2e7e61 100644 --- a/mindspore/lite/src/populate_parameter.cc +++ b/mindspore/lite/src/populate_parameter.cc @@ -20,6 +20,7 @@ #include "schema/ops_generated.h" #include "src/ops/constant_of_shape.h" #include "src/ops/space_to_batch.h" +#include "src/ops/space_to_batch_nd.h" #include "src/ops/conv2d.h" #include "src/ops/roi_pooling.h" #include "src/ops/topk.h" @@ -74,8 +75,8 @@ #include "src/ops/gather_nd.h" #include "src/ops/local_response_normalization.h" #include "src/ops/pad.h" -#include "src/ops/prelu.h" -#include "src/ops/caffe_p_relu.h" +#include "src/ops/leaky_relu.h" +#include "src/ops/p_relu.h" #include "src/ops/reverse_sequence.h" 
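/* For reference: the param_value_lite.h hunk above changes plain `delete` to `delete[]` for the buffer behind tensor_addr_. Memory obtained with array new must be released with the array form; mixing the two forms is undefined behavior. A minimal illustration (illustrative names, not part of this patch): */
#include <cstdint>
void OwnAndRelease() {
  uint8_t *buf = new uint8_t[64];  // array allocation...
  delete[] buf;                    // ...requires array delete
  // `delete buf;` here would be undefined behavior (mismatched new/delete forms)
}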
#include "src/ops/dedepthwise_conv2d.h" #include "src/ops/depthwise_conv2d.h" @@ -144,7 +145,7 @@ #include "nnacl/transpose.h" #include "nnacl/split_parameter.h" #include "nnacl/squeeze.h" -#include "nnacl/fp32/gather.h" +#include "nnacl/gather_parameter.h" #include "nnacl/fp32/reverse.h" #include "nnacl/reverse_sequence.h" #include "nnacl/fp32/unique.h" @@ -172,11 +173,12 @@ namespace mindspore::kernel { OpParameter *PopulateROIPoolingParameter(const mindspore::lite::PrimitiveC *primitive) { const auto param = reinterpret_cast(const_cast(primitive)); - auto *roi_pooling_param = new (std::nothrow) ROIPoolingParameter(); - if (param == nullptr) { - MS_LOG(ERROR) << "new PoolingParameter failed."; + ROIPoolingParameter *roi_pooling_param = reinterpret_cast(malloc(sizeof(ROIPoolingParameter))); + if (roi_pooling_param == nullptr) { + MS_LOG(ERROR) << "malloc ROIPoolingParameter failed."; return nullptr; } + memset(roi_pooling_param, 0, sizeof(ROIPoolingParameter)); roi_pooling_param->op_parameter_.type_ = primitive->Type(); roi_pooling_param->pooledH_ = param->GetPooledW(); roi_pooling_param->pooledW_ = param->GetPooledW(); @@ -187,11 +189,12 @@ OpParameter *PopulateROIPoolingParameter(const mindspore::lite::PrimitiveC *prim OpParameter *PopulateBatchNorm(const mindspore::lite::PrimitiveC *primitive) { const auto param = reinterpret_cast(const_cast(primitive)); - auto *batch_norm_param = new (std::nothrow) BatchNormParameter(); + BatchNormParameter *batch_norm_param = reinterpret_cast(malloc(sizeof(BatchNormParameter))); if (batch_norm_param == nullptr) { - MS_LOG(ERROR) << "new BatchNormParameter failed."; + MS_LOG(ERROR) << "malloc BatchNormParameter failed."; return nullptr; } + memset(batch_norm_param, 0, sizeof(BatchNormParameter)); batch_norm_param->op_parameter_.type_ = primitive->Type(); batch_norm_param->epsilon_ = param->GetEpsilon(); batch_norm_param->fused_ = false; @@ -200,11 +203,12 @@ OpParameter *PopulateBatchNorm(const mindspore::lite::PrimitiveC *primitive) { OpParameter *PopulateFillParameter(const mindspore::lite::PrimitiveC *primitive) { const auto param = reinterpret_cast(const_cast(primitive)); - auto *fill_param = new (std::nothrow) FillParameter(); + FillParameter *fill_param = reinterpret_cast(malloc(sizeof(FillParameter))); if (fill_param == nullptr) { - MS_LOG(ERROR) << "new FillParameter failed."; + MS_LOG(ERROR) << "malloc FillParameter failed."; return nullptr; } + memset(fill_param, 0, sizeof(FillParameter)); fill_param->op_parameter_.type_ = primitive->Type(); auto flatDims = param->GetDims(); fill_param->num_dims_ = flatDims.size(); @@ -217,63 +221,64 @@ OpParameter *PopulateFillParameter(const mindspore::lite::PrimitiveC *primitive) OpParameter *PopulateExpandDimsParameter(const mindspore::lite::PrimitiveC *primitive) { auto param = reinterpret_cast(const_cast(primitive)); - auto *expand_dims_param = new (std::nothrow) ExpandDimsParameter(); + ExpandDimsParameter *expand_dims_param = reinterpret_cast(malloc(sizeof(ExpandDimsParameter))); if (expand_dims_param == nullptr) { - MS_LOG(ERROR) << "new ExpandDimsParameter failed."; + MS_LOG(ERROR) << "malloc ExpandDimsParameter failed."; return nullptr; } + memset(expand_dims_param, 0, sizeof(ExpandDimsParameter)); expand_dims_param->op_parameter_.type_ = primitive->Type(); expand_dims_param->dim_ = param->GetDim(); return reinterpret_cast(expand_dims_param); } OpParameter *PopulatePReLUParameter(const mindspore::lite::PrimitiveC *primitive) { - auto param = dynamic_cast(primitive); - auto *prelu_param = new 
(std::nothrow) PReluParameter(); + auto param = dynamic_cast(primitive); + PReluParameter *prelu_param = reinterpret_cast(malloc(sizeof(PReluParameter))); if (prelu_param == nullptr) { - MS_LOG(ERROR) << "new caffePReluParameter failed."; + MS_LOG(ERROR) << "malloc PReluParameter failed."; return nullptr; } + memset(prelu_param, 0, sizeof(PReluParameter)); prelu_param->op_parameter_.type_ = primitive->Type(); prelu_param->channelShared = param->GetChannelShared(); return reinterpret_cast(prelu_param); } OpParameter *PopulateLeakyReluParameter(const mindspore::lite::PrimitiveC *primitive) { - auto param = dynamic_cast(primitive); - LeakyReluParameter *leaky_relu_param = new (std::nothrow) LeakyReluParameter(); + auto param = dynamic_cast(primitive); + LeakyReluParameter *leaky_relu_param = reinterpret_cast(malloc(sizeof(LeakyReluParameter))); if (leaky_relu_param == nullptr) { - MS_LOG(ERROR) << "new LeakyReluParameter failed."; + MS_LOG(ERROR) << "malloc LeakyReluParameter failed."; return nullptr; } + memset(leaky_relu_param, 0, sizeof(LeakyReluParameter)); leaky_relu_param->op_parameter_.type_ = primitive->Type(); - auto temp = param->GetSlope(); - leaky_relu_param->slope_ = reinterpret_cast(malloc(temp.size() * sizeof(float))); + leaky_relu_param->slope_ = reinterpret_cast(malloc(sizeof(float))); if (leaky_relu_param->slope_ == nullptr) { MS_LOG(ERROR) << "malloc relu slope fail!"; + free(leaky_relu_param); return nullptr; } - for (size_t i = 0; i < temp.size(); i++) { - leaky_relu_param->slope_[i] = temp[i]; - } - leaky_relu_param->slope_num_ = temp.size(); + leaky_relu_param->slope_[0] = param->GetNegativeSlope(); + leaky_relu_param->slope_num_ = 1; return reinterpret_cast(leaky_relu_param); } OpParameter *PopulatePoolingParameter(const mindspore::lite::PrimitiveC *primitive) { auto pooling_primitive = reinterpret_cast(const_cast(primitive)); - auto *pooling_param = new (std::nothrow) PoolingParameter(); + PoolingParameter *pooling_param = reinterpret_cast(malloc(sizeof(PoolingParameter))); if (pooling_param == nullptr) { - MS_LOG(ERROR) << "new PoolingParameter failed."; + MS_LOG(ERROR) << "malloc PoolingParameter failed."; return nullptr; } + memset(pooling_param, 0, sizeof(PoolingParameter)); pooling_param->op_parameter_.type_ = primitive->Type(); pooling_param->global_ = pooling_primitive->GetGlobal(); pooling_param->window_w_ = pooling_primitive->GetWindowW(); pooling_param->window_h_ = pooling_primitive->GetWindowH(); auto pooling_lite_primitive = (lite::Pooling *)primitive; - MS_ASSERT(nullptr != pooling_lite_primitive); pooling_param->pad_u_ = pooling_lite_primitive->PadUp(); pooling_param->pad_d_ = pooling_lite_primitive->PadDown(); pooling_param->pad_l_ = pooling_lite_primitive->PadLeft(); @@ -286,45 +291,48 @@ OpParameter *PopulatePoolingParameter(const mindspore::lite::PrimitiveC *primiti auto pool_mode = pooling_primitive->GetPoolingMode(); switch (pool_mode) { case schema::PoolMode_MAX_POOLING: - pooling_param->max_pooling_ = true; - pooling_param->avg_pooling_ = false; + pooling_param->pool_mode_ = PoolMode_MaxPool; break; case schema::PoolMode_MEAN_POOLING: - pooling_param->max_pooling_ = false; - pooling_param->avg_pooling_ = true; + pooling_param->pool_mode_ = PoolMode_AvgPool; break; default: - pooling_param->max_pooling_ = false; - pooling_param->avg_pooling_ = false; + pooling_param->pool_mode_ = PoolMode_No; break; } auto round_mode = pooling_primitive->GetRoundMode(); switch (round_mode) { case schema::RoundMode_FLOOR: - pooling_param->round_floor_ = true; - 
pooling_param->round_ceil_ = false; + pooling_param->round_mode_ = RoundMode_Floor; break; case schema::RoundMode_CEIL: - pooling_param->round_floor_ = false; - pooling_param->round_ceil_ = true; + pooling_param->round_mode_ = RoundMode_Ceil; break; default: - pooling_param->round_floor_ = false; - pooling_param->round_ceil_ = false; + pooling_param->round_mode_ = RoundMode_No; break; } + + if (pooling_primitive->GetActivationType() == schema::ActivationType_RELU) { + pooling_param->act_type_ = ActType_Relu; + } else if (pooling_primitive->GetActivationType() == schema::ActivationType_RELU6) { + pooling_param->act_type_ = ActType_Relu6; + } else { + pooling_param->act_type_ = ActType_No; + } return reinterpret_cast(pooling_param); } OpParameter *PopulateFullconnectionParameter(const mindspore::lite::PrimitiveC *primitive) { auto param = reinterpret_cast(const_cast(primitive)); - auto *matmul_param = new (std::nothrow) MatMulParameter(); + MatMulParameter *matmul_param = reinterpret_cast(malloc(sizeof(MatMulParameter))); if (matmul_param == nullptr) { - MS_LOG(ERROR) << "new FullconnectionParameter failed."; + MS_LOG(ERROR) << "malloc MatMulParameter failed."; return nullptr; } + memset(matmul_param, 0, sizeof(MatMulParameter)); matmul_param->op_parameter_.type_ = primitive->Type(); matmul_param->b_transpose_ = true; matmul_param->a_transpose_ = false; @@ -342,11 +350,12 @@ OpParameter *PopulateFullconnectionParameter(const mindspore::lite::PrimitiveC * OpParameter *PopulateMatMulParameter(const mindspore::lite::PrimitiveC *primitive) { auto param = reinterpret_cast(const_cast(primitive)); - auto *matmul_param = new (std::nothrow) MatMulParameter(); + MatMulParameter *matmul_param = reinterpret_cast(malloc(sizeof(MatMulParameter))); if (matmul_param == nullptr) { - MS_LOG(ERROR) << "new FullconnectionParameter failed."; + MS_LOG(ERROR) << "malloc MatMulParameter failed."; return nullptr; } + memset(matmul_param, 0, sizeof(MatMulParameter)); matmul_param->op_parameter_.type_ = primitive->Type(); matmul_param->b_transpose_ = param->GetTransposeB(); matmul_param->a_transpose_ = param->GetTransposeA(); @@ -356,11 +365,12 @@ OpParameter *PopulateMatMulParameter(const mindspore::lite::PrimitiveC *primitiv } OpParameter *PopulateConvParameter(const mindspore::lite::PrimitiveC *primitive) { - auto *conv_param = new (std::nothrow) ConvParameter(); + ConvParameter *conv_param = reinterpret_cast(malloc(sizeof(ConvParameter))); if (conv_param == nullptr) { - MS_LOG(ERROR) << "new ConvParameter failed."; + MS_LOG(ERROR) << "malloc ConvParameter failed."; return nullptr; } + memset(conv_param, 0, sizeof(ConvParameter)); conv_param->op_parameter_.type_ = primitive->Type(); auto conv_primitive = reinterpret_cast(const_cast(primitive)); @@ -371,13 +381,10 @@ OpParameter *PopulateConvParameter(const mindspore::lite::PrimitiveC *primitive) conv_param->stride_w_ = conv_primitive->GetStrideW(); auto conv2d_lite_primitive = (lite::Conv2D *)primitive; - MS_ASSERT(nullptr != conv2d_lite_primitive); conv_param->pad_u_ = conv2d_lite_primitive->PadUp(); conv_param->pad_d_ = conv2d_lite_primitive->PadDown(); conv_param->pad_l_ = conv2d_lite_primitive->PadLeft(); conv_param->pad_r_ = conv2d_lite_primitive->PadRight(); - conv_param->pad_h_ = conv2d_lite_primitive->PadUp(); - conv_param->pad_w_ = conv2d_lite_primitive->PadLeft(); conv_param->dilation_h_ = conv_primitive->GetDilateH(); conv_param->dilation_w_ = conv_primitive->GetDilateW(); conv_param->input_channel_ = conv_primitive->GetChannelIn(); @@ -386,27 +393,25 @@ 
OpParameter *PopulateConvParameter(const mindspore::lite::PrimitiveC *primitive) auto act_type = conv_primitive->GetActivationType(); switch (act_type) { case schema::ActivationType_RELU: - conv_param->is_relu_ = true; - conv_param->is_relu6_ = false; + conv_param->act_type_ = ActType_Relu; break; case schema::ActivationType_RELU6: - conv_param->is_relu_ = false; - conv_param->is_relu6_ = true; + conv_param->act_type_ = ActType_Relu6; break; default: - conv_param->is_relu_ = false; - conv_param->is_relu6_ = false; + conv_param->act_type_ = ActType_No; break; } return reinterpret_cast(conv_param); } OpParameter *PopulateConvDwParameter(const mindspore::lite::PrimitiveC *primitive) { - auto *conv_param = new (std::nothrow) ConvParameter(); + ConvParameter *conv_param = reinterpret_cast(malloc(sizeof(ConvParameter))); if (conv_param == nullptr) { - MS_LOG(ERROR) << "new ConvParameter failed."; + MS_LOG(ERROR) << "malloc ConvParameter failed."; return nullptr; } + memset(conv_param, 0, sizeof(ConvParameter)); conv_param->op_parameter_.type_ = primitive->Type(); auto conv_primitive = @@ -417,39 +422,35 @@ OpParameter *PopulateConvDwParameter(const mindspore::lite::PrimitiveC *primitiv conv_param->stride_w_ = conv_primitive->GetStrideW(); auto convdw_lite_primitive = (lite::DepthwiseConv2D *)primitive; - MS_ASSERT(nullptr != convdw_lite_primitive); conv_param->pad_u_ = convdw_lite_primitive->PadUp(); conv_param->pad_d_ = convdw_lite_primitive->PadDown(); conv_param->pad_l_ = convdw_lite_primitive->PadLeft(); conv_param->pad_r_ = convdw_lite_primitive->PadRight(); - conv_param->pad_h_ = convdw_lite_primitive->PadUp(); - conv_param->pad_w_ = convdw_lite_primitive->PadLeft(); + conv_param->input_channel_ = convdw_lite_primitive->GetInputChannel(); conv_param->dilation_h_ = conv_primitive->GetDilateH(); conv_param->dilation_w_ = conv_primitive->GetDilateW(); auto act_type = conv_primitive->GetActivationType(); switch (act_type) { case schema::ActivationType_RELU: - conv_param->is_relu_ = true; - conv_param->is_relu6_ = false; + conv_param->act_type_ = ActType_Relu; break; case schema::ActivationType_RELU6: - conv_param->is_relu_ = false; - conv_param->is_relu6_ = true; + conv_param->act_type_ = ActType_Relu6; break; default: - conv_param->is_relu_ = false; - conv_param->is_relu6_ = false; + conv_param->act_type_ = ActType_No; break; } return reinterpret_cast(conv_param); } OpParameter *PopulateDeconvDwParameter(const mindspore::lite::PrimitiveC *primitive) { - auto *conv_param = new ConvParameter(); + ConvParameter *conv_param = reinterpret_cast(malloc(sizeof(ConvParameter))); if (conv_param == nullptr) { - MS_LOG(ERROR) << "new ConvParameter failed."; + MS_LOG(ERROR) << "malloc ConvParameter failed."; return nullptr; } + memset(conv_param, 0, sizeof(ConvParameter)); conv_param->op_parameter_.type_ = primitive->Type(); auto conv_primitive = reinterpret_cast(const_cast(primitive)); @@ -459,39 +460,34 @@ OpParameter *PopulateDeconvDwParameter(const mindspore::lite::PrimitiveC *primit conv_param->stride_w_ = conv_primitive->GetStrideW(); auto deconvdw_lite_primitive = (mindspore::lite::DeDepthwiseConv2D *)primitive; - MS_ASSERT(nullptr != deconvdw_lite_primitive); conv_param->pad_u_ = deconvdw_lite_primitive->PadUp(); conv_param->pad_d_ = deconvdw_lite_primitive->PadDown(); conv_param->pad_l_ = deconvdw_lite_primitive->PadLeft(); conv_param->pad_r_ = deconvdw_lite_primitive->PadRight(); - conv_param->pad_h_ = deconvdw_lite_primitive->PadUp(); - conv_param->pad_w_ = deconvdw_lite_primitive->PadLeft(); 
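// A minimal sketch of the activation mapping these conv hunks repeat. The
// helper name ActTypeFromSchema is hypothetical, not something this patch
// defines; it only illustrates the schema::ActivationType -> ActType switch
// being introduced above and below:
//
//   ActType ActTypeFromSchema(schema::ActivationType act_type) {
//     switch (act_type) {
//       case schema::ActivationType_RELU:
//         return ActType_Relu;
//       case schema::ActivationType_RELU6:
//         return ActType_Relu6;
//       default:
//         return ActType_No;
//     }
//   }
//
// With such a helper, each call site would collapse to a single assignment:
//
//   conv_param->act_type_ = ActTypeFromSchema(conv_primitive->GetActivationType());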
conv_param->dilation_h_ = conv_primitive->GetDilateH(); conv_param->dilation_w_ = conv_primitive->GetDilateW(); auto act_type = conv_primitive->GetActivationType(); switch (act_type) { case schema::ActivationType_RELU: - conv_param->is_relu_ = true; - conv_param->is_relu6_ = false; + conv_param->act_type_ = ActType_Relu; break; case schema::ActivationType_RELU6: - conv_param->is_relu_ = false; - conv_param->is_relu6_ = true; + conv_param->act_type_ = ActType_Relu6; break; default: - conv_param->is_relu_ = false; - conv_param->is_relu6_ = false; + conv_param->act_type_ = ActType_No; break; } return reinterpret_cast(conv_param); } OpParameter *PopulateDeconvParameter(const mindspore::lite::PrimitiveC *primitive) { - auto *conv_param = new ConvParameter(); + ConvParameter *conv_param = reinterpret_cast(malloc(sizeof(ConvParameter))); if (conv_param == nullptr) { - MS_LOG(ERROR) << "new ConvParameter failed."; + MS_LOG(ERROR) << "malloc ConvParameter failed."; return nullptr; } + memset(conv_param, 0, sizeof(ConvParameter)); conv_param->op_parameter_.type_ = primitive->Type(); auto conv_primitive = reinterpret_cast(const_cast(primitive)); @@ -501,7 +497,6 @@ OpParameter *PopulateDeconvParameter(const mindspore::lite::PrimitiveC *primitiv conv_param->stride_w_ = conv_primitive->GetStrideW(); auto deconv_lite_primitive = (lite::DeConv2D *)primitive; - MS_ASSERT(nullptr != deconvdw_lite_primitive); conv_param->pad_u_ = deconv_lite_primitive->PadUp(); conv_param->pad_d_ = deconv_lite_primitive->PadDown(); conv_param->pad_l_ = deconv_lite_primitive->PadLeft(); @@ -511,67 +506,46 @@ OpParameter *PopulateDeconvParameter(const mindspore::lite::PrimitiveC *primitiv auto act_type = conv_primitive->GetActivationType(); switch (act_type) { case schema::ActivationType_RELU: - conv_param->is_relu_ = true; - conv_param->is_relu6_ = false; + conv_param->act_type_ = ActType_Relu; break; case schema::ActivationType_RELU6: - conv_param->is_relu_ = false; - conv_param->is_relu6_ = true; + conv_param->act_type_ = ActType_Relu6; break; default: - conv_param->is_relu_ = false; - conv_param->is_relu6_ = false; - break; - } - - auto pad_mode = conv_primitive->GetPadMode(); - switch (pad_mode) { - case schema::PadMode_SAME: - conv_param->pad_h_ = (conv_param->kernel_h_ - 1) / 2; - conv_param->pad_w_ = (conv_param->kernel_w_ - 1) / 2; - break; - case schema::PadMode_VALID: - conv_param->pad_h_ = 0; - conv_param->pad_w_ = 0; + conv_param->act_type_ = ActType_No; break; - case schema::PadMode_CAFFE: - conv_param->pad_h_ = conv_param->pad_u_; - conv_param->pad_w_ = conv_param->pad_l_; - break; - default: - MS_LOG(ERROR) << "invalid pad mode!"; - return nullptr; } - return reinterpret_cast(conv_param); } OpParameter *PopulateSoftmaxParameter(const mindspore::lite::PrimitiveC *primitive) { auto softmax_primitive = reinterpret_cast(const_cast(primitive)); - auto *softmax_param = new (std::nothrow) SoftmaxParameter(); + SoftmaxParameter *softmax_param = reinterpret_cast(malloc(sizeof(SoftmaxParameter))); if (softmax_param == nullptr) { - MS_LOG(ERROR) << "new SoftmaxParameter failed."; + MS_LOG(ERROR) << "malloc SoftmaxParameter failed."; return nullptr; } + memset(softmax_param, 0, sizeof(SoftmaxParameter)); softmax_param->op_parameter_.type_ = primitive->Type(); softmax_param->axis_ = softmax_primitive->GetAxis(); return reinterpret_cast(softmax_param); } OpParameter *PopulateReduceParameter(const mindspore::lite::PrimitiveC *primitive) { - auto *reduce_param = new (std::nothrow) ReduceParameter(); + ReduceParameter 
*reduce_param = reinterpret_cast(malloc(sizeof(ReduceParameter))); if (reduce_param == nullptr) { - MS_LOG(ERROR) << "new ReduceParameter failed."; + MS_LOG(ERROR) << "malloc ReduceParameter failed."; return nullptr; } + memset(reduce_param, 0, sizeof(ReduceParameter)); reduce_param->op_parameter_.type_ = primitive->Type(); auto reduce = reinterpret_cast(const_cast(primitive)); reduce_param->keep_dims_ = reduce->GetKeepDims(); auto axisVector = reduce->GetAxes(); if (axisVector.size() > REDUCE_MAX_AXES_NUM) { MS_LOG(ERROR) << "Reduce axes size " << axisVector.size() << " exceed limit " << REDUCE_MAX_AXES_NUM; - delete (reduce_param); + free(reduce_param); return nullptr; } reduce_param->num_axes_ = static_cast(axisVector.size()); @@ -584,18 +558,19 @@ OpParameter *PopulateReduceParameter(const mindspore::lite::PrimitiveC *primitiv } OpParameter *PopulateMeanParameter(const mindspore::lite::PrimitiveC *primitive) { - auto *mean_param = new (std::nothrow) ReduceParameter(); + ReduceParameter *mean_param = reinterpret_cast(malloc(sizeof(ReduceParameter))); if (mean_param == nullptr) { - MS_LOG(ERROR) << "new ReduceParameter failed."; + MS_LOG(ERROR) << "malloc ReduceParameter failed."; return nullptr; } + memset(mean_param, 0, sizeof(ReduceParameter)); mean_param->op_parameter_.type_ = primitive->Type(); auto mean = reinterpret_cast(const_cast(primitive)); mean_param->keep_dims_ = mean->GetKeepDims(); auto axisVector = mean->GetAxis(); if (axisVector.size() > REDUCE_MAX_AXES_NUM) { MS_LOG(ERROR) << "Reduce axes size " << axisVector.size() << " exceed limit " << REDUCE_MAX_AXES_NUM; - delete (mean_param); + free(mean_param); return nullptr; } mean_param->num_axes_ = static_cast(axisVector.size()); @@ -608,11 +583,12 @@ OpParameter *PopulateMeanParameter(const mindspore::lite::PrimitiveC *primitive) } OpParameter *PopulatePadParameter(const mindspore::lite::PrimitiveC *primitive) { - auto *pad_param = new (std::nothrow) PadParameter(); + PadParameter *pad_param = reinterpret_cast(malloc(sizeof(PadParameter))); if (pad_param == nullptr) { - MS_LOG(ERROR) << "new PadParameter failed."; + MS_LOG(ERROR) << "malloc PadParameter failed."; return nullptr; } + memset(pad_param, 0, sizeof(PadParameter)); pad_param->op_parameter_.type_ = primitive->Type(); auto pad_node = reinterpret_cast(const_cast(primitive)); pad_param->pad_mode_ = pad_node->GetPaddingMode(); @@ -620,14 +596,14 @@ OpParameter *PopulatePadParameter(const mindspore::lite::PrimitiveC *primitive) pad_param->constant_value_ = pad_node->GetConstantValue(); } else { MS_LOG(ERROR) << "Invalid padding mode: " << pad_param->pad_mode_; - delete (pad_param); + free(pad_param); return nullptr; } auto size = pad_node->GetPaddings().size(); if (size > MAX_PAD_SIZE) { MS_LOG(ERROR) << "Invalid padding size: " << size; - delete (pad_param); + free(pad_param); return nullptr; } @@ -638,11 +614,12 @@ OpParameter *PopulatePadParameter(const mindspore::lite::PrimitiveC *primitive) } OpParameter *PopulateActivationParameter(const mindspore::lite::PrimitiveC *primitive) { - auto *act_param = new (std::nothrow) ActivationParameter(); + ActivationParameter *act_param = reinterpret_cast(malloc(sizeof(ActivationParameter))); if (act_param == nullptr) { - MS_LOG(ERROR) << "new ActivationParameter failed."; + MS_LOG(ERROR) << "malloc ActivationParameter failed."; return nullptr; } + memset(act_param, 0, sizeof(ActivationParameter)); auto activation = reinterpret_cast(const_cast(primitive)); act_param->type_ = static_cast(activation->GetType()); @@ -651,11 +628,12 
@@ OpParameter *PopulateActivationParameter(const mindspore::lite::PrimitiveC *prim } OpParameter *PopulateFusedBatchNorm(const mindspore::lite::PrimitiveC *primitive) { - auto *batch_norm_param = new (std::nothrow) BatchNormParameter(); + BatchNormParameter *batch_norm_param = reinterpret_cast(malloc(sizeof(BatchNormParameter))); if (batch_norm_param == nullptr) { - MS_LOG(ERROR) << "new FusedBatchNormParameter failed."; + MS_LOG(ERROR) << "malloc BatchNormParameter failed."; return nullptr; } + memset(batch_norm_param, 0, sizeof(BatchNormParameter)); batch_norm_param->op_parameter_.type_ = primitive->Type(); auto param = reinterpret_cast(const_cast(primitive)); @@ -665,11 +643,12 @@ OpParameter *PopulateFusedBatchNorm(const mindspore::lite::PrimitiveC *primitive } OpParameter *PopulateArithmetic(const mindspore::lite::PrimitiveC *primitive) { - auto *arithmetic_param = new (std::nothrow) ArithmeticParameter(); + ArithmeticParameter *arithmetic_param = reinterpret_cast(malloc(sizeof(ArithmeticParameter))); if (arithmetic_param == nullptr) { - MS_LOG(ERROR) << "new ArithmeticParameter failed."; + MS_LOG(ERROR) << "malloc ArithmeticParameter failed."; return nullptr; } + memset(arithmetic_param, 0, sizeof(ArithmeticParameter)); arithmetic_param->op_parameter_.type_ = primitive->Type(); arithmetic_param->broadcasting_ = ((lite::Arithmetic *)primitive)->Broadcasting(); arithmetic_param->ndim_ = ((lite::Arithmetic *)primitive)->NDims(); @@ -708,11 +687,12 @@ OpParameter *PopulateArithmetic(const mindspore::lite::PrimitiveC *primitive) { } OpParameter *PopulateEltwiseParameter(const mindspore::lite::PrimitiveC *primitive) { - auto *arithmetic_param = new (std::nothrow) ArithmeticParameter(); + ArithmeticParameter *arithmetic_param = reinterpret_cast(malloc(sizeof(ArithmeticParameter))); if (arithmetic_param == nullptr) { - MS_LOG(ERROR) << "new ArithmeticParameter failed."; + MS_LOG(ERROR) << "malloc ArithmeticParameter failed."; return nullptr; } + memset(arithmetic_param, 0, sizeof(ArithmeticParameter)); auto eltwise = reinterpret_cast(const_cast(primitive)); switch (eltwise->GetMode()) { case schema::EltwiseMode_PROD: @@ -725,28 +705,31 @@ OpParameter *PopulateEltwiseParameter(const mindspore::lite::PrimitiveC *primiti arithmetic_param->op_parameter_.type_ = schema::PrimitiveType_Maximum; break; default: - delete arithmetic_param; + free(arithmetic_param); return nullptr; } return reinterpret_cast(arithmetic_param); } OpParameter *PopulateArithmeticSelf(const mindspore::lite::PrimitiveC *primitive) { - auto *arithmetic_self_param = new (std::nothrow) ArithmeticSelfParameter(); + ArithmeticSelfParameter *arithmetic_self_param = + reinterpret_cast(malloc(sizeof(ArithmeticSelfParameter))); if (arithmetic_self_param == nullptr) { - MS_LOG(ERROR) << "new ArithmeticParameter failed."; + MS_LOG(ERROR) << "malloc ArithmeticSelfParameter failed."; return nullptr; } + memset(arithmetic_self_param, 0, sizeof(ArithmeticSelfParameter)); arithmetic_self_param->op_parameter_.type_ = primitive->Type(); return reinterpret_cast(arithmetic_self_param); } OpParameter *PopulatePowerParameter(const mindspore::lite::PrimitiveC *primitive) { - auto *power_param = new (std::nothrow) PowerParameter(); + PowerParameter *power_param = reinterpret_cast(malloc(sizeof(PowerParameter))); if (power_param == nullptr) { - MS_LOG(ERROR) << "new PowerParameter failed."; + MS_LOG(ERROR) << "malloc PowerParameter failed."; return nullptr; } + memset(power_param, 0, sizeof(PowerParameter)); power_param->op_parameter_.type_ = 
primitive->Type(); auto power = reinterpret_cast(const_cast(primitive)); power_param->power_ = power->GetPower(); @@ -756,11 +739,12 @@ OpParameter *PopulatePowerParameter(const mindspore::lite::PrimitiveC *primitive } OpParameter *PopulateArgMaxParameter(const mindspore::lite::PrimitiveC *primitive) { - auto *arg_param = new (std::nothrow) ArgMinMaxParameter(); + ArgMinMaxParameter *arg_param = reinterpret_cast(malloc(sizeof(ArgMinMaxParameter))); if (arg_param == nullptr) { - MS_LOG(ERROR) << "new ArgMinMaxParameter failed."; + MS_LOG(ERROR) << "malloc ArgMinMaxParameter failed."; return nullptr; } + memset(arg_param, 0, sizeof(ArgMinMaxParameter)); arg_param->op_parameter_.type_ = primitive->Type(); auto param = reinterpret_cast(const_cast(primitive)); arg_param->axis_ = param->GetAxis(); @@ -772,11 +756,12 @@ OpParameter *PopulateArgMaxParameter(const mindspore::lite::PrimitiveC *primitiv } OpParameter *PopulateArgMinParameter(const mindspore::lite::PrimitiveC *primitive) { - auto *arg_param = new (std::nothrow) ArgMinMaxParameter(); + ArgMinMaxParameter *arg_param = reinterpret_cast(malloc(sizeof(ArgMinMaxParameter))); if (arg_param == nullptr) { - MS_LOG(ERROR) << "new ArgMinMaxParameter failed."; + MS_LOG(ERROR) << "malloc ArgMinMaxParameter failed."; return nullptr; } + memset(arg_param, 0, sizeof(ArgMinMaxParameter)); arg_param->op_parameter_.type_ = primitive->Type(); auto param = reinterpret_cast(const_cast(primitive)); arg_param->axis_ = param->GetAxis(); @@ -788,11 +773,12 @@ OpParameter *PopulateArgMinParameter(const mindspore::lite::PrimitiveC *primitiv } OpParameter *PopulateCastParameter(const mindspore::lite::PrimitiveC *primitive) { - auto *cast_param = new (std::nothrow) CastParameter(); + CastParameter *cast_param = reinterpret_cast(malloc(sizeof(CastParameter))); if (cast_param == nullptr) { - MS_LOG(ERROR) << "new CastParameter failed."; + MS_LOG(ERROR) << "malloc CastParameter failed."; return nullptr; } + memset(cast_param, 0, sizeof(CastParameter)); cast_param->op_parameter_.type_ = primitive->Type(); auto param = reinterpret_cast(const_cast(primitive)); cast_param->src_type_ = param->GetSrcT(); @@ -803,11 +789,13 @@ OpParameter *PopulateCastParameter(const mindspore::lite::PrimitiveC *primitive) OpParameter *PopulateLocalResponseNormParameter(const mindspore::lite::PrimitiveC *primitive) { auto local_response_norm_attr = reinterpret_cast( const_cast(primitive)); - auto *lrn_param = new (std::nothrow) LocalResponseNormParameter(); + LocalResponseNormParameter *lrn_param = + reinterpret_cast(malloc(sizeof(LocalResponseNormParameter))); if (lrn_param == nullptr) { - MS_LOG(ERROR) << "new LocalResponseNormParameter failed."; + MS_LOG(ERROR) << "malloc LocalResponseNormParameter failed."; return nullptr; } + memset(lrn_param, 0, sizeof(LocalResponseNormParameter)); lrn_param->op_parameter_.type_ = primitive->Type(); lrn_param->depth_radius_ = local_response_norm_attr->GetDepthRadius(); lrn_param->bias_ = local_response_norm_attr->GetBias(); @@ -818,11 +806,12 @@ OpParameter *PopulateLocalResponseNormParameter(const mindspore::lite::Primitive OpParameter *PopulateRangeParameter(const mindspore::lite::PrimitiveC *primitive) { auto range_attr = reinterpret_cast(const_cast(primitive)); - auto *range_param = new (std::nothrow) RangeParameter(); + RangeParameter *range_param = reinterpret_cast(malloc(sizeof(RangeParameter))); if (range_param == nullptr) { - MS_LOG(ERROR) << "new RangeParameter failed."; + MS_LOG(ERROR) << "malloc RangeParameter failed."; return nullptr; } + 
memset(range_param, 0, sizeof(RangeParameter)); range_param->op_parameter_.type_ = primitive->Type(); range_param->start_ = range_attr->GetStart(); range_param->limit_ = range_attr->GetLimit(); @@ -832,11 +821,12 @@ OpParameter *PopulateRangeParameter(const mindspore::lite::PrimitiveC *primitive } OpParameter *PopulateConcatParameter(const mindspore::lite::PrimitiveC *primitive) { - auto *concat_param = new (std::nothrow) ConcatParameter(); + ConcatParameter *concat_param = reinterpret_cast(malloc(sizeof(ConcatParameter))); if (concat_param == nullptr) { - MS_LOG(ERROR) << "new ConcatParameter failed."; + MS_LOG(ERROR) << "malloc ConcatParameter failed."; return nullptr; } + memset(concat_param, 0, sizeof(ConcatParameter)); concat_param->op_parameter_.type_ = primitive->Type(); auto param = reinterpret_cast(const_cast(primitive)); concat_param->axis_ = param->GetAxis(); @@ -844,11 +834,12 @@ OpParameter *PopulateConcatParameter(const mindspore::lite::PrimitiveC *primitiv } OpParameter *PopulateTileParameter(const mindspore::lite::PrimitiveC *primitive) { - auto *tile_param = new (std::nothrow) TileParameter(); + TileParameter *tile_param = reinterpret_cast(malloc(sizeof(TileParameter))); if (tile_param == nullptr) { - MS_LOG(ERROR) << "new TileParameter failed."; + MS_LOG(ERROR) << "malloc TileParameter failed."; return nullptr; } + memset(tile_param, 0, sizeof(TileParameter)); tile_param->op_parameter_.type_ = primitive->Type(); auto param = reinterpret_cast(const_cast(primitive)); auto multiples = param->GetMultiples(); @@ -860,11 +851,12 @@ OpParameter *PopulateTileParameter(const mindspore::lite::PrimitiveC *primitive) } OpParameter *PopulateTopKParameter(const mindspore::lite::PrimitiveC *primitive) { - auto *topk_param = new (std::nothrow) TopkParameter(); + TopkParameter *topk_param = reinterpret_cast(malloc(sizeof(TopkParameter))); if (topk_param == nullptr) { - MS_LOG(ERROR) << "new TopkParameter failed."; + MS_LOG(ERROR) << "malloc TopkParameter failed."; return nullptr; } + memset(topk_param, 0, sizeof(TopkParameter)); topk_param->op_parameter_.type_ = primitive->Type(); auto param = reinterpret_cast(const_cast(primitive)); topk_param->k_ = param->GetK(); @@ -873,31 +865,34 @@ OpParameter *PopulateTopKParameter(const mindspore::lite::PrimitiveC *primitive) } OpParameter *PopulateNhwc2NchwParameter(const mindspore::lite::PrimitiveC *primitive) { - auto *parameter = new (std::nothrow) OpParameter(); + OpParameter *parameter = reinterpret_cast(malloc(sizeof(OpParameter))); if (parameter == nullptr) { - MS_LOG(ERROR) << "new Nhwc2NchwParameter failed."; + MS_LOG(ERROR) << "malloc OpParameter failed."; return nullptr; } + memset(parameter, 0, sizeof(OpParameter)); parameter->type_ = primitive->Type(); return parameter; } OpParameter *PopulateNchw2NhwcParameter(const mindspore::lite::PrimitiveC *primitive) { - auto *parameter = new (std::nothrow) OpParameter(); + OpParameter *parameter = reinterpret_cast(malloc(sizeof(OpParameter))); if (parameter == nullptr) { - MS_LOG(ERROR) << "new Nchw2NhwcParameter failed."; + MS_LOG(ERROR) << "malloc OpParameter failed."; return nullptr; } + memset(parameter, 0, sizeof(OpParameter)); parameter->type_ = primitive->Type(); return parameter; } OpParameter *PopulateTransposeParameter(const mindspore::lite::PrimitiveC *primitive) { - auto *transpose_param = new (std::nothrow) TransposeParameter(); + TransposeParameter *transpose_param = reinterpret_cast(malloc(sizeof(TransposeParameter))); if (transpose_param == nullptr) { - MS_LOG(ERROR) << "new 
TransposeParameter failed."; + MS_LOG(ERROR) << "malloc TransposeParameter failed."; return nullptr; } + memset(transpose_param, 0, sizeof(TransposeParameter)); auto param = reinterpret_cast(const_cast(primitive)); transpose_param->op_parameter_.type_ = primitive->Type(); auto perm_vector_ = param->GetPerm(); @@ -911,11 +906,12 @@ OpParameter *PopulateTransposeParameter(const mindspore::lite::PrimitiveC *primi } OpParameter *PopulateSplitParameter(const mindspore::lite::PrimitiveC *primitive) { - auto *split_param = new (std::nothrow) SplitParameter(); + SplitParameter *split_param = reinterpret_cast(malloc(sizeof(SplitParameter))); if (split_param == nullptr) { - MS_LOG(ERROR) << "new SplitParameter failed."; + MS_LOG(ERROR) << "malloc SplitParameter failed."; return nullptr; } + memset(split_param, 0, sizeof(SplitParameter)); auto param = reinterpret_cast(const_cast(primitive)); split_param->op_parameter_.type_ = primitive->Type(); split_param->num_split_ = param->GetNumberSplit(); @@ -930,11 +926,12 @@ OpParameter *PopulateSplitParameter(const mindspore::lite::PrimitiveC *primitive } OpParameter *PopulateSqueezeParameter(const mindspore::lite::PrimitiveC *primitive) { - auto *squeeze_param = new (std::nothrow) SqueezeParameter(); + SqueezeParameter *squeeze_param = reinterpret_cast(malloc(sizeof(SqueezeParameter))); if (squeeze_param == nullptr) { - MS_LOG(ERROR) << "new SqueezeParameter failed."; + MS_LOG(ERROR) << "malloc SqueezeParameter failed."; return nullptr; } + memset(squeeze_param, 0, sizeof(SqueezeParameter)); squeeze_param->op_parameter_.type_ = primitive->Type(); return reinterpret_cast(squeeze_param); } @@ -944,11 +941,12 @@ OpParameter *PopulateScaleParameter(const mindspore::lite::PrimitiveC *primitive MS_LOG(ERROR) << "input primitive is nullptr"; return nullptr; } - auto *scale_param = new (std::nothrow) ScaleParameter(); + ScaleParameter *scale_param = reinterpret_cast(malloc(sizeof(ScaleParameter))); if (scale_param == nullptr) { - MS_LOG(ERROR) << "new ScaleParameter failed."; + MS_LOG(ERROR) << "malloc ScaleParameter failed."; return nullptr; } + memset(scale_param, 0, sizeof(ScaleParameter)); scale_param->op_parameter_.type_ = primitive->Type(); auto param = reinterpret_cast(const_cast(primitive)); scale_param->axis_ = param->GetAxis(); @@ -957,11 +955,12 @@ OpParameter *PopulateScaleParameter(const mindspore::lite::PrimitiveC *primitive OpParameter *PopulateGatherParameter(const mindspore::lite::PrimitiveC *primitive) { auto gather_attr = reinterpret_cast(const_cast(primitive)); - auto *gather_param = new (std::nothrow) GatherParameter(); + GatherParameter *gather_param = reinterpret_cast(malloc(sizeof(GatherParameter))); if (gather_param == nullptr) { - MS_LOG(ERROR) << "new GatherParameter failed."; + MS_LOG(ERROR) << "malloc GatherParameter failed."; return nullptr; } + memset(gather_param, 0, sizeof(GatherParameter)); gather_param->op_parameter_.type_ = primitive->Type(); gather_param->axis_ = gather_attr->GetAxis(); gather_param->batchDims_ = gather_attr->GetBatchDims(); @@ -969,11 +968,12 @@ OpParameter *PopulateGatherParameter(const mindspore::lite::PrimitiveC *primitiv } OpParameter *PopulateGatherNdParameter(const mindspore::lite::PrimitiveC *primitive) { - auto *gather_nd_param = new (std::nothrow) GatherNdParameter(); + GatherNdParameter *gather_nd_param = reinterpret_cast(malloc(sizeof(GatherNdParameter))); if (gather_nd_param == nullptr) { - MS_LOG(ERROR) << "new GatherNDParameter failed."; + MS_LOG(ERROR) << "malloc GatherNdParameter failed."; 
return nullptr; } + memset(gather_nd_param, 0, sizeof(GatherNdParameter)); gather_nd_param->op_parameter_.type_ = primitive->Type(); auto gatherNd_attr = reinterpret_cast(const_cast(primitive)); @@ -982,28 +982,29 @@ OpParameter *PopulateGatherNdParameter(const mindspore::lite::PrimitiveC *primit } OpParameter *PopulateScatterNDParameter(const mindspore::lite::PrimitiveC *primitive) { - auto *scatter_nd_param = new (std::nothrow) ScatterNDParameter(); + ScatterNDParameter *scatter_nd_param = reinterpret_cast(malloc(sizeof(ScatterNDParameter))); if (scatter_nd_param == nullptr) { - MS_LOG(ERROR) << "new ScatterNDParameter failed."; + MS_LOG(ERROR) << "malloc ScatterNDParameter failed."; return nullptr; } + memset(scatter_nd_param, 0, sizeof(ScatterNDParameter)); scatter_nd_param->op_parameter_.type_ = primitive->Type(); - MS_ASSERT(paramter != nullptr); return reinterpret_cast(scatter_nd_param); } OpParameter *PopulateSliceParameter(const mindspore::lite::PrimitiveC *primitive) { - auto *slice_param = new (std::nothrow) SliceParameter(); + SliceParameter *slice_param = reinterpret_cast(malloc(sizeof(SliceParameter))); if (slice_param == nullptr) { - MS_LOG(ERROR) << "new SliceParameter failed."; + MS_LOG(ERROR) << "malloc SliceParameter failed."; return nullptr; } + memset(slice_param, 0, sizeof(SliceParameter)); auto param = reinterpret_cast(const_cast(primitive)); slice_param->op_parameter_.type_ = primitive->Type(); - auto param_begin = param->GetBegin(); - auto param_size = param->GetSize(); + auto param_begin = param->GetPostProcessBegin(); + auto param_size = param->GetPostProcessSize(); if (param_begin.size() != param_size.size()) { - delete slice_param; + free(slice_param); return nullptr; } slice_param->param_length_ = static_cast(param_begin.size()); @@ -1015,11 +1016,13 @@ OpParameter *PopulateSliceParameter(const mindspore::lite::PrimitiveC *primitive } OpParameter *PopulateBroadcastToParameter(const mindspore::lite::PrimitiveC *primitive) { - auto *broadcast_param = new (std::nothrow) BroadcastToParameter(); + BroadcastToParameter *broadcast_param = + reinterpret_cast(malloc(sizeof(BroadcastToParameter))); if (broadcast_param == nullptr) { - MS_LOG(ERROR) << "new BroadcastToParameter failed."; + MS_LOG(ERROR) << "malloc BroadcastToParameter failed."; return nullptr; } + memset(broadcast_param, 0, sizeof(BroadcastToParameter)); auto param = reinterpret_cast(const_cast(primitive)); broadcast_param->op_parameter_.type_ = primitive->Type(); auto dst_shape = param->GetDstShape(); @@ -1031,21 +1034,23 @@ OpParameter *PopulateBroadcastToParameter(const mindspore::lite::PrimitiveC *pri } OpParameter *PopulateReshapeParameter(const mindspore::lite::PrimitiveC *primitive) { - auto *reshape_param = new (std::nothrow) ReshapeParameter(); + ReshapeParameter *reshape_param = reinterpret_cast(malloc(sizeof(ReshapeParameter))); if (reshape_param == nullptr) { - MS_LOG(ERROR) << "new ReshapeParameter failed."; + MS_LOG(ERROR) << "malloc ReshapeParameter failed."; return nullptr; } + memset(reshape_param, 0, sizeof(ReshapeParameter)); reshape_param->op_parameter_.type_ = primitive->Type(); return reinterpret_cast(reshape_param); } OpParameter *PopulateShapeParameter(const mindspore::lite::PrimitiveC *primitive) { - auto *shape_param = new (std::nothrow) ShapeParameter(); + ShapeParameter *shape_param = reinterpret_cast(malloc(sizeof(ShapeParameter))); if (shape_param == nullptr) { - MS_LOG(ERROR) << "new ShapeParameter failed."; + MS_LOG(ERROR) << "malloc ShapeParameter failed."; return nullptr; 
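// A minimal sketch of the malloc-then-zero pattern that every Populate*
// function in this file now follows. PopulateZeroedParam is a hypothetical
// helper, shown only to document the recurring sequence; it assumes T embeds
// an OpParameter member named op_parameter_, as the nnacl parameter structs
// here do (the bare-OpParameter cases such as AddN are the exception):
//
//   template <typename T>
//   T *PopulateZeroedParam(const mindspore::lite::PrimitiveC *primitive) {
//     T *param = reinterpret_cast<T *>(malloc(sizeof(T)));
//     if (param == nullptr) {
//       MS_LOG(ERROR) << "malloc parameter failed.";
//       return nullptr;
//     }
//     memset(param, 0, sizeof(T));
//     param->op_parameter_.type_ = primitive->Type();
//     return param;
//   }
//
// Callers fill op-specific fields afterwards and, on any later failure, release
// the block with free() rather than delete, matching the cleanup used here.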
} + memset(shape_param, 0, sizeof(ShapeParameter)); shape_param->op_parameter_.type_ = primitive->Type(); return reinterpret_cast(shape_param); } @@ -1053,11 +1058,13 @@ OpParameter *PopulateShapeParameter(const mindspore::lite::PrimitiveC *primitive OpParameter *PopulateConstantOfShapeParameter(const mindspore::lite::PrimitiveC *primitive) { auto attr = reinterpret_cast(const_cast(primitive)); - ConstantOfShapeParameter *param = new (std::nothrow) ConstantOfShapeParameter(); + ConstantOfShapeParameter *param = + reinterpret_cast(malloc(sizeof(ConstantOfShapeParameter))); if (param == nullptr) { - MS_LOG(ERROR) << "new ConstantOfShapeParameter failed."; + MS_LOG(ERROR) << "malloc ConstantOfShapeParameter failed."; return nullptr; } + memset(param, 0, sizeof(ConstantOfShapeParameter)); param->op_parameter_.type_ = primitive->Type(); param->value_ = attr->GetValue(); return reinterpret_cast(param); @@ -1066,11 +1073,12 @@ OpParameter *PopulateConstantOfShapeParameter(const mindspore::lite::PrimitiveC OpParameter *PopulateReverseParameter(const mindspore::lite::PrimitiveC *primitive) { auto reverse_attr = reinterpret_cast(const_cast(primitive)); - ReverseParameter *reverse_param = new (std::nothrow) ReverseParameter(); + ReverseParameter *reverse_param = reinterpret_cast(malloc(sizeof(ReverseParameter))); if (reverse_param == nullptr) { - MS_LOG(ERROR) << "new ReverseParameter failed."; + MS_LOG(ERROR) << "malloc ReverseParameter failed."; return nullptr; } + memset(reverse_param, 0, sizeof(ReverseParameter)); reverse_param->op_parameter_.type_ = primitive->Type(); auto flatAxis = reverse_attr->GetAxis(); reverse_param->num_axis_ = flatAxis.size(); @@ -1084,11 +1092,12 @@ OpParameter *PopulateReverseParameter(const mindspore::lite::PrimitiveC *primiti OpParameter *PopulateUnsqueezeParameter(const mindspore::lite::PrimitiveC *primitive) { auto unsqueeze_attr = reinterpret_cast(const_cast(primitive)); - auto *unsqueeze_param = new (std::nothrow) UnsqueezeParameter(); + UnsqueezeParameter *unsqueeze_param = reinterpret_cast(malloc(sizeof(UnsqueezeParameter))); if (unsqueeze_param == nullptr) { - MS_LOG(ERROR) << "new ReverseParameter failed."; + MS_LOG(ERROR) << "malloc UnsqueezeParameter failed."; return nullptr; } + memset(unsqueeze_param, 0, sizeof(UnsqueezeParameter)); unsqueeze_param->op_parameter_.type_ = primitive->Type(); auto flatAxis = unsqueeze_attr->GetAxis(); unsqueeze_param->num_dim_ = flatAxis.size(); @@ -1100,11 +1109,12 @@ OpParameter *PopulateUnsqueezeParameter(const mindspore::lite::PrimitiveC *primi } OpParameter *PopulateStackParameter(const mindspore::lite::PrimitiveC *primitive) { - auto *stack_param = new (std::nothrow) StackParameter(); + StackParameter *stack_param = reinterpret_cast(malloc(sizeof(StackParameter))); if (stack_param == nullptr) { - MS_LOG(ERROR) << "new StackParameter failed."; + MS_LOG(ERROR) << "malloc StackParameter failed."; return nullptr; } + memset(stack_param, 0, sizeof(StackParameter)); auto param = reinterpret_cast(const_cast(primitive)); stack_param->op_parameter_.type_ = primitive->Type(); stack_param->axis_ = param->GetAxis(); @@ -1112,11 +1122,12 @@ OpParameter *PopulateStackParameter(const mindspore::lite::PrimitiveC *primitive } OpParameter *PopulateUnstackParameter(const mindspore::lite::PrimitiveC *primitive) { - auto *unstack_param = new (std::nothrow) UnstackParameter(); + UnstackParameter *unstack_param = reinterpret_cast(malloc(sizeof(UnstackParameter))); if (unstack_param == nullptr) { - MS_LOG(ERROR) << "new UnstackParameter 
failed."; + MS_LOG(ERROR) << "malloc UnstackParameter failed."; return nullptr; } + memset(unstack_param, 0, sizeof(UnstackParameter)); auto param = reinterpret_cast(const_cast(primitive)); unstack_param->op_parameter_.type_ = primitive->Type(); unstack_param->num_ = param->GetNum(); @@ -1125,11 +1136,13 @@ OpParameter *PopulateUnstackParameter(const mindspore::lite::PrimitiveC *primiti } OpParameter *PopulateReverseSequenceParameter(const mindspore::lite::PrimitiveC *primitive) { - auto *reverse_sequence_param = new (std::nothrow) ReverseSequenceParameter(); + ReverseSequenceParameter *reverse_sequence_param = + reinterpret_cast(malloc(sizeof(ReverseSequenceParameter))); if (reverse_sequence_param == nullptr) { - MS_LOG(ERROR) << "new ReverseSequenceParameter failed."; + MS_LOG(ERROR) << "malloc ReverseSequenceParameter failed."; return nullptr; } + memset(reverse_sequence_param, 0, sizeof(ReverseSequenceParameter)); auto param = reinterpret_cast(const_cast(primitive)); reverse_sequence_param->op_parameter_.type_ = primitive->Type(); @@ -1139,21 +1152,24 @@ OpParameter *PopulateReverseSequenceParameter(const mindspore::lite::PrimitiveC } OpParameter *PopulateUniqueParameter(const mindspore::lite::PrimitiveC *primitive) { - auto *unique_param = new (std::nothrow) UniqueParameter(); + UniqueParameter *unique_param = reinterpret_cast(malloc(sizeof(UniqueParameter))); if (unique_param == nullptr) { - MS_LOG(ERROR) << "new PopulateUniqueParam failed."; + MS_LOG(ERROR) << "malloc UniqueParameter failed."; return nullptr; } + memset(unique_param, 0, sizeof(UniqueParameter)); unique_param->op_parameter_.type_ = primitive->Type(); return reinterpret_cast(unique_param); } OpParameter *PopulateDepthToSpaceParameter(const mindspore::lite::PrimitiveC *primitive) { - auto *depth_space_param = new (std::nothrow) DepthToSpaceParameter(); + DepthToSpaceParameter *depth_space_param = + reinterpret_cast(malloc(sizeof(DepthToSpaceParameter))); if (depth_space_param == nullptr) { - MS_LOG(ERROR) << "new DepthToSpaceParameter failed."; + MS_LOG(ERROR) << "malloc DepthToSpaceParameter failed."; return nullptr; } + memset(depth_space_param, 0, sizeof(DepthToSpaceParameter)); auto param = reinterpret_cast(const_cast(primitive)); depth_space_param->op_parameter_.type_ = primitive->Type(); depth_space_param->block_size_ = param->GetBlockSize(); @@ -1161,47 +1177,65 @@ OpParameter *PopulateDepthToSpaceParameter(const mindspore::lite::PrimitiveC *pr } OpParameter *PopulateSpaceToDepthParameter(const mindspore::lite::PrimitiveC *primitive) { - auto *space_depth_param = new (std::nothrow) SpaceToDepthParameter(); + SpaceToDepthParameter *space_depth_param = + reinterpret_cast(malloc(sizeof(SpaceToDepthParameter))); if (space_depth_param == nullptr) { - MS_LOG(ERROR) << "new SpaceToDepthspace_depth_param failed."; + MS_LOG(ERROR) << "malloc SpaceToDepthParameter failed."; return nullptr; } + memset(space_depth_param, 0, sizeof(SpaceToDepthParameter)); space_depth_param->op_parameter_.type_ = primitive->Type(); auto param = reinterpret_cast(const_cast(primitive)); space_depth_param->op_parameter_.type_ = primitive->Type(); space_depth_param->block_size_ = param->GetBlockSize(); if (param->GetFormat() != schema::Format_NHWC) { MS_LOG(ERROR) << "Currently only NHWC format is supported."; + free(space_depth_param); return nullptr; } return reinterpret_cast(space_depth_param); } OpParameter *PopulateSpaceToBatchParameter(const mindspore::lite::PrimitiveC *primitive) { - auto *space_batch_param = new (std::nothrow) 
SpaceToBatchParameter(); + SpaceToBatchParameter *space_batch_param = + reinterpret_cast(malloc(sizeof(SpaceToBatchParameter))); if (space_batch_param == nullptr) { - MS_LOG(ERROR) << "new SpaceToBatchParameter failed."; + MS_LOG(ERROR) << "malloc SpaceToBatchParameter failed."; return nullptr; } + memset(space_batch_param, 0, sizeof(SpaceToBatchParameter)); space_batch_param->op_parameter_.type_ = primitive->Type(); space_batch_param->op_parameter_.type_ = primitive->Type(); auto block_sizes = ((mindspore::lite::SpaceToBatch *)primitive)->BlockSizes(); (void)memcpy(space_batch_param->block_sizes_, (block_sizes.data()), block_sizes.size() * sizeof(int)); auto paddings = ((mindspore::lite::SpaceToBatch *)primitive)->Paddings(); (void)memcpy(space_batch_param->paddings_, (paddings.data()), paddings.size() * sizeof(int)); - auto in_shape = ((mindspore::lite::SpaceToBatch *)primitive)->InShape(); - (void)memcpy(space_batch_param->in_shape_, (in_shape.data()), in_shape.size() * sizeof(int)); - auto padded_in_shape = ((mindspore::lite::SpaceToBatch *)primitive)->PaddedInShape(); - (void)memcpy(space_batch_param->padded_in_shape_, (padded_in_shape.data()), padded_in_shape.size() * sizeof(int)); + return reinterpret_cast(space_batch_param); +} + +OpParameter *PopulateSpaceToBatchParameterND(const mindspore::lite::PrimitiveC *primitivec) { + auto *space_batch_param = new (std::nothrow) SpaceToBatchParameter(); + if (space_batch_param == nullptr) { + MS_LOG(ERROR) << "new SpaceToBatchParameter failed."; + return nullptr; + } + + mindspore::lite::SpaceToBatchND *primitive = (mindspore::lite::SpaceToBatchND *)primitivec; + space_batch_param->op_parameter_.type_ = primitive->Type(); + auto block_sizes = primitive->GetBlockShape(); + (void)memcpy(space_batch_param->block_sizes_, (block_sizes.data()), block_sizes.size() * sizeof(int)); + auto paddings = primitive->GetPaddings(); + (void)memcpy(space_batch_param->paddings_, (paddings.data()), paddings.size() * sizeof(int)); return reinterpret_cast(space_batch_param); } OpParameter *PopulateResizeParameter(const mindspore::lite::PrimitiveC *primitive) { - auto *resize_param = new (std::nothrow) ResizeParameter(); + ResizeParameter *resize_param = reinterpret_cast(malloc(sizeof(ResizeParameter))); if (resize_param == nullptr) { - MS_LOG(ERROR) << "new ResizeParameter failed."; + MS_LOG(ERROR) << "malloc ResizeParameter failed."; return nullptr; } + memset(resize_param, 0, sizeof(ResizeParameter)); resize_param->op_parameter_.type_ = primitive->Type(); auto param = reinterpret_cast(const_cast(primitive)); resize_param->method_ = static_cast(param->GetMethod()); @@ -1213,22 +1247,26 @@ OpParameter *PopulateResizeParameter(const mindspore::lite::PrimitiveC *primitiv } OpParameter *PopulateBatchToSpaceParameter(const mindspore::lite::PrimitiveC *primitive) { - auto *batch_space_param = new (std::nothrow) BatchToSpaceParameter(); + BatchToSpaceParameter *batch_space_param = + reinterpret_cast(malloc(sizeof(BatchToSpaceParameter))); if (batch_space_param == nullptr) { - MS_LOG(ERROR) << "New BatchToSpaceParameter fail!"; + MS_LOG(ERROR) << "malloc BatchToSpaceParameter failed."; return nullptr; } + memset(batch_space_param, 0, sizeof(BatchToSpaceParameter)); batch_space_param->op_parameter_.type_ = primitive->Type(); auto param = reinterpret_cast(const_cast(primitive)); auto block_shape = param->GetBlockShape(); if (block_shape.size() != BATCH_TO_SPACE_BLOCK_SHAPE_SIZE) { MS_LOG(ERROR) << "batch_to_space blockShape size should be " << 
BATCH_TO_SPACE_BLOCK_SHAPE_SIZE; + free(batch_space_param); return nullptr; } auto crops = param->GetCrops(); if (crops.size() != BATCH_TO_SPACE_CROPS_SIZE) { MS_LOG(ERROR) << "batch_to_space crops size should be " << BATCH_TO_SPACE_CROPS_SIZE; + free(batch_space_param); return nullptr; } @@ -1249,11 +1287,12 @@ OpParameter *PopulateCropParameter(const mindspore::lite::PrimitiveC *primitive) MS_LOG(ERROR) << "crop_param offset size(" << param_offset.size() << ") should <= " << CROP_OFFSET_MAX_SIZE; return nullptr; } - auto *crop_param = new (std::nothrow) CropParameter(); + CropParameter *crop_param = reinterpret_cast(malloc(sizeof(CropParameter))); if (crop_param == nullptr) { - MS_LOG(ERROR) << "new CropParameter fail!"; + MS_LOG(ERROR) << "malloc CropParameter failed."; return nullptr; } + memset(crop_param, 0, sizeof(CropParameter)); crop_param->op_parameter_.type_ = primitive->Type(); crop_param->axis_ = param->GetAxis(); crop_param->offset_size_ = param_offset.size(); @@ -1264,15 +1303,16 @@ OpParameter *PopulateCropParameter(const mindspore::lite::PrimitiveC *primitive) } OpParameter *PopulateOneHotParameter(const mindspore::lite::PrimitiveC *primitive) { - auto *one_hot_param = new (std::nothrow) OneHotParameter(); + OneHotParameter *one_hot_param = reinterpret_cast(malloc(sizeof(OneHotParameter))); if (one_hot_param == nullptr) { - MS_LOG(ERROR) << "new OneHotParameter fail!"; + MS_LOG(ERROR) << "malloc OneHotParameter failed."; return nullptr; } + memset(one_hot_param, 0, sizeof(OneHotParameter)); one_hot_param->op_parameter_.type_ = primitive->Type(); auto param = reinterpret_cast(const_cast(primitive)); if (param == nullptr) { - delete (one_hot_param); + free(one_hot_param); MS_LOG(ERROR) << "get OneHot param nullptr."; return nullptr; } @@ -1281,21 +1321,24 @@ OpParameter *PopulateOneHotParameter(const mindspore::lite::PrimitiveC *primitiv } OpParameter *PopulateFlattenParameter(const mindspore::lite::PrimitiveC *primitive) { - auto *flatten_param = new (std::nothrow) FlattenParameter(); + FlattenParameter *flatten_param = reinterpret_cast(malloc(sizeof(FlattenParameter))); if (flatten_param == nullptr) { - MS_LOG(ERROR) << "new FlattenParameter fail!"; + MS_LOG(ERROR) << "malloc FlattenParameter failed."; return nullptr; } + memset(flatten_param, 0, sizeof(FlattenParameter)); flatten_param->op_parameter_.type_ = primitive->Type(); return reinterpret_cast(flatten_param); } OpParameter *PopulateQuantDTypeCastParameter(const mindspore::lite::PrimitiveC *primitive) { - auto *parameter = new (std::nothrow) QuantDTypeCastParameter(); + QuantDTypeCastParameter *parameter = + reinterpret_cast(malloc(sizeof(QuantDTypeCastParameter))); if (parameter == nullptr) { - MS_LOG(ERROR) << "new QuantDTypeCastParameter fail!"; + MS_LOG(ERROR) << "malloc QuantDTypeCastParameter failed."; return nullptr; } + memset(parameter, 0, sizeof(QuantDTypeCastParameter)); parameter->op_parameter_.type_ = primitive->Type(); auto quant_dtype_cast_param = reinterpret_cast(const_cast(primitive)); @@ -1305,11 +1348,13 @@ OpParameter *PopulateQuantDTypeCastParameter(const mindspore::lite::PrimitiveC * } OpParameter *PopulateStridedSliceParameter(const mindspore::lite::PrimitiveC *primitive) { - auto *strided_slice_param = new (std::nothrow) StridedSliceParameter(); + StridedSliceParameter *strided_slice_param = + reinterpret_cast(malloc(sizeof(StridedSliceParameter))); if (strided_slice_param == nullptr) { - MS_LOG(ERROR) << "new StridedSliceParameter failed."; + MS_LOG(ERROR) << "malloc StridedSliceParameter 
failed."; return nullptr; } + memset(strided_slice_param, 0, sizeof(StridedSliceParameter)); strided_slice_param->op_parameter_.type_ = primitive->Type(); auto n_dims = ((lite::StridedSlice *)primitive)->NDims(); strided_slice_param->num_axes_ = n_dims; @@ -1325,21 +1370,23 @@ OpParameter *PopulateStridedSliceParameter(const mindspore::lite::PrimitiveC *pr } OpParameter *PopulateAddNParameter(const mindspore::lite::PrimitiveC *primitive) { - auto addn_param = new (std::nothrow) OpParameter(); + OpParameter *addn_param = reinterpret_cast(malloc(sizeof(OpParameter))); if (addn_param == nullptr) { - MS_LOG(ERROR) << "new OpParameter fail!"; + MS_LOG(ERROR) << "malloc OpParameter failed."; return nullptr; } + memset(addn_param, 0, sizeof(OpParameter)); addn_param->type_ = primitive->Type(); return reinterpret_cast(addn_param); } OpParameter *PopulatePriorBoxParameter(const mindspore::lite::PrimitiveC *primitive) { - auto *prior_box_param = new (std::nothrow) PriorBoxParameter(); + PriorBoxParameter *prior_box_param = reinterpret_cast(malloc(sizeof(PriorBoxParameter))); if (prior_box_param == nullptr) { - MS_LOG(ERROR) << "new PriorBoxParameter failed."; + MS_LOG(ERROR) << "malloc PriorBoxParameter failed."; return nullptr; } + memset(prior_box_param, 0, sizeof(PriorBoxParameter)); prior_box_param->op_parameter_.type_ = primitive->Type(); auto prior_box_attr = reinterpret_cast(const_cast(primitive)); @@ -1347,14 +1394,14 @@ OpParameter *PopulatePriorBoxParameter(const mindspore::lite::PrimitiveC *primit if (prior_box_attr->GetMinSizes().size() > PRIOR_BOX_MAX_NUM) { MS_LOG(ERROR) << "PriorBox min_sizes size exceeds max num " << PRIOR_BOX_MAX_NUM << ", got " << prior_box_attr->GetMinSizes(); - delete (prior_box_param); + free(prior_box_param); return nullptr; } prior_box_param->min_sizes_size = prior_box_attr->GetMinSizes().size(); if (prior_box_attr->GetMaxSizes().size() > PRIOR_BOX_MAX_NUM) { MS_LOG(ERROR) << "PriorBox max_sizes size exceeds max num " << PRIOR_BOX_MAX_NUM << ", got " << prior_box_attr->GetMaxSizes(); - delete (prior_box_param); + free(prior_box_param); return nullptr; } prior_box_param->max_sizes_size = prior_box_attr->GetMaxSizes().size(); @@ -1366,7 +1413,7 @@ OpParameter *PopulatePriorBoxParameter(const mindspore::lite::PrimitiveC *primit if (prior_box_attr->GetAspectRatios().size() > PRIOR_BOX_MAX_NUM) { MS_LOG(ERROR) << "PriorBox aspect_ratios size exceeds max num " << PRIOR_BOX_MAX_NUM << ", got " << prior_box_attr->GetAspectRatios(); - delete (prior_box_param); + free(prior_box_param); return nullptr; } prior_box_param->aspect_ratios_size = prior_box_attr->GetAspectRatios().size(); @@ -1375,7 +1422,7 @@ OpParameter *PopulatePriorBoxParameter(const mindspore::lite::PrimitiveC *primit if (prior_box_attr->GetVariances().size() != PRIOR_BOX_VAR_NUM) { MS_LOG(ERROR) << "PriorBox variances size should be " << PRIOR_BOX_VAR_NUM << ", got " << prior_box_attr->GetVariances().size(); - delete (prior_box_param); + free(prior_box_param); return nullptr; } (void)memcpy(prior_box_param->variances, prior_box_attr->GetVariances().data(), PRIOR_BOX_VAR_NUM * sizeof(float)); @@ -1390,15 +1437,16 @@ OpParameter *PopulatePriorBoxParameter(const mindspore::lite::PrimitiveC *primit } OpParameter *PopulateLstmParameter(const mindspore::lite::PrimitiveC *primitive) { - auto *lstm_param = new (std::nothrow) LstmParameter(); + LstmParameter *lstm_param = reinterpret_cast(malloc(sizeof(LstmParameter))); if (lstm_param == nullptr) { - MS_LOG(ERROR) << "new LstmParameter fail!"; + MS_LOG(ERROR) << 
"malloc LstmParameter failed."; return nullptr; } + memset(lstm_param, 0, sizeof(LstmParameter)); lstm_param->op_parameter_.type_ = primitive->Type(); auto param = reinterpret_cast(const_cast(primitive)); if (param == nullptr) { - delete (lstm_param); + free(lstm_param); MS_LOG(ERROR) << "get Lstm param nullptr."; return nullptr; } @@ -1407,11 +1455,13 @@ OpParameter *PopulateLstmParameter(const mindspore::lite::PrimitiveC *primitive) } OpParameter *PopulateEmbeddingLookupParameter(const mindspore::lite::PrimitiveC *primitive) { - auto *embedding_lookup_parameter = new (std::nothrow) EmbeddingLookupParameter(); + EmbeddingLookupParameter *embedding_lookup_parameter = + reinterpret_cast(malloc(sizeof(EmbeddingLookupParameter))); if (embedding_lookup_parameter == nullptr) { - MS_LOG(ERROR) << "new EmbeddingLookupParameter failed"; + MS_LOG(ERROR) << "malloc EmbeddingLookupParameter failed."; return nullptr; } + memset(embedding_lookup_parameter, 0, sizeof(EmbeddingLookupParameter)); embedding_lookup_parameter->op_parameter_.type_ = primitive->Type(); auto param = reinterpret_cast(const_cast(primitive)); @@ -1419,28 +1469,31 @@ OpParameter *PopulateEmbeddingLookupParameter(const mindspore::lite::PrimitiveC if (embedding_lookup_parameter->max_norm_ < 0) { MS_LOG(ERROR) << "Embedding lookup max norm should be positive number, got " << embedding_lookup_parameter->max_norm_; + free(embedding_lookup_parameter); return nullptr; } return reinterpret_cast(embedding_lookup_parameter); } OpParameter *PopulateBiasAddParameter(const mindspore::lite::PrimitiveC *primitive) { - auto *arithmetic_param = new (std::nothrow) ArithmeticParameter(); + ArithmeticParameter *arithmetic_param = reinterpret_cast(malloc(sizeof(ArithmeticParameter))); if (arithmetic_param == nullptr) { - MS_LOG(ERROR) << "new Bias Add Parameter failed"; + MS_LOG(ERROR) << "malloc ArithmeticParameter failed."; return nullptr; } + memset(arithmetic_param, 0, sizeof(ArithmeticParameter)); arithmetic_param->op_parameter_.type_ = primitive->Type(); return reinterpret_cast(arithmetic_param); } OpParameter *PopulateEluParameter(const mindspore::lite::PrimitiveC *primitive) { - auto *elu_parameter = new (std::nothrow) EluParameter(); + EluParameter *elu_parameter = reinterpret_cast(malloc(sizeof(EluParameter))); if (elu_parameter == nullptr) { - MS_LOG(ERROR) << "new EluParameter failed"; + MS_LOG(ERROR) << "malloc EluParameter failed."; return nullptr; } + memset(elu_parameter, 0, sizeof(EluParameter)); elu_parameter->op_parameter_.type_ = primitive->Type(); auto param = reinterpret_cast(const_cast(primitive)); elu_parameter->alpha_ = param->GetAlpha(); @@ -1525,6 +1578,7 @@ PopulateParameterRegistry::PopulateParameterRegistry() { populate_parameter_funcs_[schema::PrimitiveType_BatchToSpace] = PopulateBatchToSpaceParameter; populate_parameter_funcs_[schema::PrimitiveType_SpaceToDepth] = PopulateSpaceToDepthParameter; populate_parameter_funcs_[schema::PrimitiveType_SpaceToBatch] = PopulateSpaceToBatchParameter; + populate_parameter_funcs_[schema::PrimitiveType_SpaceToBatchND] = PopulateSpaceToBatchParameterND; populate_parameter_funcs_[schema::PrimitiveType_Crop] = PopulateCropParameter; populate_parameter_funcs_[schema::PrimitiveType_Unsqueeze] = PopulateUnsqueezeParameter; populate_parameter_funcs_[schema::PrimitiveType_Flatten] = PopulateFlattenParameter; @@ -1535,8 +1589,8 @@ PopulateParameterRegistry::PopulateParameterRegistry() { populate_parameter_funcs_[schema::PrimitiveType_ScatterND] = PopulateScatterNDParameter; 
populate_parameter_funcs_[schema::PrimitiveType_Squeeze] = PopulateSqueezeParameter; populate_parameter_funcs_[schema::PrimitiveType_Split] = PopulateSplitParameter; - populate_parameter_funcs_[schema::PrimitiveType_CaffePReLU] = PopulatePReLUParameter; - populate_parameter_funcs_[schema::PrimitiveType_Prelu] = PopulateLeakyReluParameter; + populate_parameter_funcs_[schema::PrimitiveType_PReLU] = PopulatePReLUParameter; + populate_parameter_funcs_[schema::PrimitiveType_LeakyReLU] = PopulateLeakyReluParameter; populate_parameter_funcs_[schema::PrimitiveType_PriorBox] = PopulatePriorBoxParameter; populate_parameter_funcs_[schema::PrimitiveType_QuantDTypeCast] = PopulateQuantDTypeCastParameter; populate_parameter_funcs_[schema::PrimitiveType_Lstm] = PopulateLstmParameter; diff --git a/mindspore/lite/src/runtime/allocator.h b/mindspore/lite/src/runtime/allocator.h index b5acf0fdfd..4c868c1e22 100644 --- a/mindspore/lite/src/runtime/allocator.h +++ b/mindspore/lite/src/runtime/allocator.h @@ -72,7 +72,7 @@ class DefaultAllocator : public Allocator { bool lockFlag = false; }; -#define MAX_MALLOC_SIZE 500 * 1024 * 1024 +#define MAX_MALLOC_SIZE (2000 * 1024 * 1024) } // namespace mindspore::lite diff --git a/mindspore/lite/src/runtime/kernel/arm/base/convolution_base.cc b/mindspore/lite/src/runtime/kernel/arm/base/convolution_base.cc index 51a0d07154..6e7388801b 100644 --- a/mindspore/lite/src/runtime/kernel/arm/base/convolution_base.cc +++ b/mindspore/lite/src/runtime/kernel/arm/base/convolution_base.cc @@ -118,10 +118,13 @@ int ConvolutionBaseCPUKernel::CheckLayout(lite::tensor::Tensor *input_tensor) { } int ConvolutionBaseCPUKernel::SetIfPerChannel() { + auto filter_tensor = in_tensors_.at(kWeightIndex); + auto input_channel = filter_tensor->Channel(); + auto output_channel = filter_tensor->Batch(); + uint8_t per_channel = 0b0; if (conv_quant_arg_->input_arg_num_ != kPerTensor) { - int in_channel = conv_param_->input_channel_; - if (static_cast(conv_quant_arg_->input_arg_num_) != in_channel) { + if (static_cast(conv_quant_arg_->input_arg_num_) != input_channel) { MS_LOG(ERROR) << "input per channel quant param length is not equal to input channel."; return RET_ERROR; } @@ -129,8 +132,7 @@ int ConvolutionBaseCPUKernel::SetIfPerChannel() { } if (conv_quant_arg_->filter_arg_num_ != kPerTensor) { - int filter_num = conv_param_->output_channel_; - if (static_cast(conv_quant_arg_->filter_arg_num_) != filter_num) { + if (static_cast(conv_quant_arg_->filter_arg_num_) != output_channel) { MS_LOG(ERROR) << "weight per channel quant param length is not equal to filter num."; return RET_ERROR; } @@ -138,8 +140,7 @@ int ConvolutionBaseCPUKernel::SetIfPerChannel() { } if (conv_quant_arg_->output_arg_num_ != kPerTensor) { - int out_channel = conv_param_->output_channel_; - if (static_cast(conv_quant_arg_->output_arg_num_) != out_channel) { + if (static_cast(conv_quant_arg_->output_arg_num_) != output_channel) { MS_LOG(ERROR) << "output per channel quant param length is not equal to output channel."; return RET_ERROR; } @@ -321,10 +322,12 @@ int ConvolutionBaseCPUKernel::SetQuantParam() { return ret; } // now only consider per tensor for output - CalculateActivationRangeQuantized( - conv_param_->is_relu_, conv_param_->is_relu6_, conv_param_->conv_quant_arg_.output_quant_args_[0].zp_, - conv_param_->conv_quant_arg_.output_quant_args_[0].scale_, &conv_param_->conv_quant_arg_.out_act_min_[0], - &conv_param_->conv_quant_arg_.out_act_max_[0]); + bool relu = conv_param_->act_type_ == ActType_Relu; + bool relu6 = 
conv_param_->act_type_ == ActType_Relu6; + CalculateActivationRangeQuantized(relu, relu6, conv_param_->conv_quant_arg_.output_quant_args_[0].zp_, + conv_param_->conv_quant_arg_.output_quant_args_[0].scale_, + &conv_param_->conv_quant_arg_.out_act_min_[0], + &conv_param_->conv_quant_arg_.out_act_max_[0]); return RET_OK; } diff --git a/mindspore/lite/src/runtime/kernel/arm/base/leaky_relu_base.cc b/mindspore/lite/src/runtime/kernel/arm/base/leaky_relu_base.cc index 5c9311308c..a13739bc6d 100644 --- a/mindspore/lite/src/runtime/kernel/arm/base/leaky_relu_base.cc +++ b/mindspore/lite/src/runtime/kernel/arm/base/leaky_relu_base.cc @@ -29,7 +29,7 @@ using mindspore::schema::PrimitiveType_LeakyReLU; namespace mindspore::kernel { int LeakyReluBaseCPUKernel::Init() { return RET_OK; } -kernel::LiteKernel *CpuPreluInt8KernelCreator(const std::vector &inputs, +kernel::LiteKernel *CpuLeakyReluInt8KernelCreator(const std::vector &inputs, const std::vector &outputs, OpParameter *opParameter, const Context *ctx, const kernel::KernelKey &desc, @@ -41,7 +41,7 @@ kernel::LiteKernel *CpuPreluInt8KernelCreator(const std::vectorInit(); @@ -54,5 +54,5 @@ kernel::LiteKernel *CpuPreluInt8KernelCreator(const std::vectordata_type(); kernel::LiteKernel *kernel = nullptr; - switch (data_type) { - case kNumberTypeInt8: - case kNumberTypeUInt8: { - kernel = new (std::nothrow) MatmulInt8CPUKernel(opParameter, inputs, outputs, ctx, primitive); - if (kernel == nullptr) { - MS_LOG(ERROR) << "kernel is nullptr."; - return nullptr; - } - break; - } - - case kNumberTypeFloat32: { - kernel = new (std::nothrow) MatmulCPUKernel(opParameter, inputs, outputs, ctx, primitive); - if (kernel == nullptr) { - MS_LOG(ERROR) << "kernel is nullptr."; - return nullptr; - } - break; - } - - default: - break; + if (data_type == kNumberTypeInt8 || data_type == kNumberTypeUInt8) { + kernel = new (std::nothrow) MatmulInt8CPUKernel(opParameter, inputs, outputs, ctx, primitive); + } else { + kernel = new (std::nothrow) MatmulCPUKernel(opParameter, inputs, outputs, ctx, primitive); + } + if (kernel == nullptr) { + MS_LOG(ERROR) << "kernel is nullptr."; + return nullptr; } - auto ret = kernel->Init(); if (ret != RET_OK) { delete kernel; diff --git a/mindspore/lite/src/runtime/kernel/arm/base/pooling_base.cc b/mindspore/lite/src/runtime/kernel/arm/base/pooling_base.cc index c16eea8dc4..472a86a574 100644 --- a/mindspore/lite/src/runtime/kernel/arm/base/pooling_base.cc +++ b/mindspore/lite/src/runtime/kernel/arm/base/pooling_base.cc @@ -42,6 +42,12 @@ int PoolingBaseCPUKernel::SetQuantParam() { pooling_quant_arg_[1][0].scale_ = out_quant_arg.front().scale; pooling_quant_arg_[1][0].zp_ = out_quant_arg.front().zeroPoint; pooling_param_->quant_args_ = pooling_quant_arg_; + if (pooling_quant_arg_[0][0].scale_ == pooling_quant_arg_[1][0].scale_ && + pooling_quant_arg_[0][0].zp_ == pooling_quant_arg_[1][0].zp_) { + pooling_param_->quantize_ = false; + } else { + pooling_param_->quantize_ = true; + } return RET_OK; } diff --git a/mindspore/lite/src/runtime/kernel/arm/base/prior_box.cc b/mindspore/lite/src/runtime/kernel/arm/base/prior_box.cc index e354183e5b..9cf795c862 100644 --- a/mindspore/lite/src/runtime/kernel/arm/base/prior_box.cc +++ b/mindspore/lite/src/runtime/kernel/arm/base/prior_box.cc @@ -40,8 +40,14 @@ int PriorBoxCPUKernel::Init() { return RET_NULL_PTR; } - MS_ASSERT(in_tensors_.size() == kInputNum); - MS_ASSERT(out_tensors_.size() == kOutputNum); + if (in_tensors_.size() != kInputNum) { + MS_LOG(ERROR) << "Size of input tensors is wrong."; + 
return RET_ERROR; + } + if (out_tensors_.size() != kOutputNum) { + MS_LOG(ERROR) << "Size of output tensors is wrong."; + return RET_ERROR; + } if (!InferShapeDone()) { return RET_OK; @@ -147,7 +153,7 @@ int PriorBoxCPUKernel::PriorBoxImpl(int task_id) { return ret; } -int RunPriorBox(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +int RunPriorBox(void *cdata, int task_id) { auto prior_box = reinterpret_cast(cdata); auto error_code = prior_box->PriorBoxImpl(task_id); @@ -164,7 +170,7 @@ int PriorBoxCPUKernel::Run() { MS_LOG(ERROR) << "Prepare fail! Ret error code[" << prepare_ret << "]"; return prepare_ret; } - int error_code = LiteBackendParallelLaunch(RunPriorBox, this, thread_count_); + int error_code = ParallelLaunch(THREAD_POOL_DEFAULT, RunPriorBox, this, thread_count_); if (error_code != RET_OK) { MS_LOG(ERROR) << "PriorBox run error, error_code[" << error_code << "]"; return RET_ERROR; diff --git a/mindspore/lite/src/runtime/kernel/arm/base/quant_dtype_cast.cc b/mindspore/lite/src/runtime/kernel/arm/base/quant_dtype_cast.cc index 60287713e2..a1ba123cf5 100644 --- a/mindspore/lite/src/runtime/kernel/arm/base/quant_dtype_cast.cc +++ b/mindspore/lite/src/runtime/kernel/arm/base/quant_dtype_cast.cc @@ -95,7 +95,7 @@ int QuantDTypeCastCPUKernel::QuantDTypeCast(int task_id) { return RET_OK; } -int QuantDTypeCastRun(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +int QuantDTypeCastRun(void *cdata, int task_id) { auto g_kernel = reinterpret_cast(cdata); auto ret = g_kernel->QuantDTypeCast(task_id); if (ret != RET_OK) { @@ -119,7 +119,7 @@ int QuantDTypeCastCPUKernel::Run() { int8_ptr_ = reinterpret_cast(out_tensors_[0]->Data()); } - auto ret = LiteBackendParallelLaunch(QuantDTypeCastRun, this, thread_n_num_); + auto ret = ParallelLaunch(THREAD_POOL_DEFAULT, QuantDTypeCastRun, this, thread_n_num_); if (ret != RET_OK) { MS_LOG(ERROR) << "Scale error error_code[" << ret << "]"; return RET_ERROR; diff --git a/mindspore/lite/src/runtime/kernel/arm/base/resize_base.cc b/mindspore/lite/src/runtime/kernel/arm/base/resize_base.cc index 7dcc4b1828..62a17607ef 100644 --- a/mindspore/lite/src/runtime/kernel/arm/base/resize_base.cc +++ b/mindspore/lite/src/runtime/kernel/arm/base/resize_base.cc @@ -41,7 +41,8 @@ int ResizeBaseCPUKernel::CheckParameters() { return RET_NULL_PTR; } method_ = parameter->method_; - if (method_ != schema::ResizeMethod_BILINEAR && method_ != schema::ResizeMethod_NEAREST_NEIGHBOR) { + if (method_ != static_cast(schema::ResizeMethod_BILINEAR) && + method_ != static_cast(schema::ResizeMethod_NEAREST_NEIGHBOR)) { MS_LOG(ERROR) << "Resize method should be bilinear or nearest_neighbor, but got " << method_; return RET_INVALID_OP_ATTR; } diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/activation_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/activation_fp16.cc new file mode 100644 index 0000000000..46dc6dc63e --- /dev/null +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/activation_fp16.cc @@ -0,0 +1,156 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
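
The LiteBackendParallelLaunch to ParallelLaunch migration repeated across these hunks changes the task callback from (int task_id, LiteParallelGroupEnv *penv, void *cdata) to (void *cdata, int task_id). A minimal model of that contract, with a sequential stand-in for the thread pool and the UP_DIV/MSMIN slicing the fp16 kernels use to split a flat buffer across tasks; everything here is illustrative, only the callback shape mirrors the patch:

#include <cstdio>

#define UP_DIV(x, y) (((x) + (y) - 1) / (y))
#define MSMIN(a, b) ((a) < (b) ? (a) : (b))

typedef int (*TaskFunc)(void *cdata, int task_id);

// Sequential stand-in for ParallelLaunch(THREAD_POOL_DEFAULT, ...): the real
// pool dispatches the task ids concurrently, but the contract is the same.
static int ParallelLaunchSketch(TaskFunc func, void *cdata, int task_num) {
  for (int id = 0; id < task_num; ++id) {
    int ret = func(cdata, id);
    if (ret != 0) return ret;  // 0 stands in for RET_OK
  }
  return 0;
}

struct DemoKernel {
  int length = 10;
  int thread_count = 4;
  int DoWork(int task_id) {
    int stride = UP_DIV(length, thread_count);             // elements per task
    int count = MSMIN(stride, length - stride * task_id);  // last task may be short
    if (count <= 0) return 0;                              // nothing left for this id
    printf("task %d handles [%d, %d)\n", task_id, stride * task_id, stride * task_id + count);
    return 0;
  }
};

static int DemoRun(void *cdata, int task_id) {  // new-style trampoline
  return reinterpret_cast<DemoKernel *>(cdata)->DoWork(task_id);
}

int main() {
  DemoKernel kernel;
  return ParallelLaunchSketch(DemoRun, &kernel, kernel.thread_count);
}
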
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "src/runtime/kernel/arm/fp16/activation_fp16.h" +#include "schema/model_generated.h" +#include "src/kernel_registry.h" +#include "src/runtime/runtime_api.h" +#include "include/errorcode.h" +#include "src/runtime/kernel/arm/fp16/common_fp16.h" +#include "nnacl/fp16/cast_fp16.h" + +using mindspore::kernel::KERNEL_ARCH::kCPU; +using mindspore::lite::KernelRegistrar; +using mindspore::lite::RET_ERROR; +using mindspore::lite::RET_OK; +using mindspore::schema::ActivationType_HSWISH; +using mindspore::schema::ActivationType_LEAKY_RELU; +using mindspore::schema::ActivationType_RELU; +using mindspore::schema::ActivationType_RELU6; +using mindspore::schema::PrimitiveType_Activation; + +namespace mindspore::kernel { +int ActivationFp16CPUKernel::Init() { return RET_OK; } + +int ActivationFp16CPUKernel::ReSize() { return RET_OK; } + +int ActivationFp16CPUKernel::MallocTmpBuffer() { + fp16_input_ = ConvertInputFp32toFp16(in_tensors_.at(0), context_); + if (fp16_input_ == nullptr) { + MS_LOG(ERROR) << "malloc data failed"; + return RET_ERROR; + } + fp16_output_ = MallocOutputFp16(out_tensors_.at(0), context_); + if (fp16_output_ == nullptr) { + MS_LOG(ERROR) << "malloc data failed"; + return RET_ERROR; + } + return RET_OK; +} + +void ActivationFp16CPUKernel::FreeTmpBuffer() { + if (in_tensors_.at(0)->data_type() == kNumberTypeFloat32) { + if (fp16_input_ != nullptr) { + context_->allocator->Free(fp16_input_); + fp16_input_ = nullptr; + } + } + if (out_tensors_.at(0)->data_type() == kNumberTypeFloat32) { + if (fp16_output_ != nullptr) { + context_->allocator->Free(fp16_output_); + fp16_output_ = nullptr; + } + } +} + +int ActivationFp16CPUKernel::DoActivation(int task_id) { + auto length = in_tensors_.at(0)->ElementsNum(); + + int stride = UP_DIV(length, thread_count_); + int count = MSMIN(stride, length - stride * task_id); + + int error_code; + if (type_ == schema::ActivationType_RELU) { + error_code = ReluFp16(fp16_input_ + stride * task_id, fp16_output_ + stride * task_id, count); + } else if (type_ == schema::ActivationType_RELU6) { + error_code = Relu6Fp16(fp16_input_ + stride * task_id, fp16_output_ + stride * task_id, count); + } else if (type_ == schema::ActivationType_LEAKY_RELU) { + error_code = LReluFp16(fp16_input_ + stride * task_id, fp16_output_ + stride * task_id, count, alpha_); + } else if (type_ == schema::ActivationType_SIGMOID) { + error_code = SigmoidFp16(fp16_input_ + stride * task_id, fp16_output_ + stride * task_id, count); + } else if (type_ == schema::ActivationType_TANH) { + error_code = TanhFp16(fp16_input_ + stride * task_id, fp16_output_ + stride * task_id, count); + } else if (type_ == schema::ActivationType_HSWISH) { + error_code = HSwishFp16(fp16_input_ + stride * task_id, fp16_output_ + stride * task_id, count); + } else { + MS_LOG(ERROR) << "Activation fp16 not support type: " << type_; + return RET_ERROR; + } + return error_code; +} + +int ActivationRun(void *cdata, int task_id) { + auto activation_kernel = reinterpret_cast(cdata); + auto error_code = activation_kernel->DoActivation(task_id); + if (error_code != RET_OK) { + MS_LOG(ERROR) << "ActivationRun error task_id[" << task_id << "] error_code[" << error_code << "]"; + return RET_ERROR; + } + return RET_OK; +} + +int ActivationFp16CPUKernel::Run() { + auto ret = Prepare(); + if (ret != RET_OK) { + MS_LOG(ERROR) << "Prepare failed."; + return ret; + } + + ret = MallocTmpBuffer(); + if (ret != 
RET_OK) { + FreeTmpBuffer(); + return ret; + } + + int error_code = ParallelLaunch(THREAD_POOL_DEFAULT, ActivationRun, this, thread_count_); + if (error_code != RET_OK) { + MS_LOG(ERROR) << "Activation function error error_code[" << error_code << "]"; + FreeTmpBuffer(); + return RET_ERROR; + } + + auto out_tensor = out_tensors_.at(0); + if (out_tensor->data_type() == kNumberTypeFloat32) { + Float16ToFloat32(fp16_output_, reinterpret_cast(out_tensor->Data()), out_tensor->ElementsNum()); + } + FreeTmpBuffer(); + return RET_OK; +} + +kernel::LiteKernel *CpuActivationFp16KernelCreator(const std::vector &inputs, + const std::vector &outputs, + OpParameter *opParameter, const lite::Context *ctx, + const kernel::KernelKey &desc, + const mindspore::lite::PrimitiveC *primitive) { + MS_ASSERT(opParameter != nullptr); + MS_ASSERT(desc.type == schema::PrimitiveType_Activation); + auto *kernel = new (std::nothrow) ActivationFp16CPUKernel(opParameter, inputs, outputs, ctx, primitive); + if (kernel == nullptr) { + MS_LOG(ERROR) << "kernel is nullptr."; + return nullptr; + } + auto ret = kernel->Init(); + if (ret != RET_OK) { + delete kernel; + MS_LOG(ERROR) << "Init kernel failed, name: " << opParameter->name_ << ", type: " + << schema::EnumNamePrimitiveType(static_cast(opParameter->type_)); + return nullptr; + } + return kernel; +} + +REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_Activation, CpuActivationFp16KernelCreator) +} // namespace mindspore::kernel diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/activation_fp16.h b/mindspore/lite/src/runtime/kernel/arm/fp16/activation_fp16.h new file mode 100644 index 0000000000..8cdfe18ef4 --- /dev/null +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/activation_fp16.h @@ -0,0 +1,52 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_ACTIVATION_FP16_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_ACTIVATION_FP16_H_ + +#include +#include "src/lite_kernel.h" +#include "nnacl/fp16/activation_fp16.h" + +namespace mindspore::kernel { +class ActivationFp16CPUKernel : public LiteKernel { + public: + ActivationFp16CPUKernel(OpParameter *param, const std::vector &inputs, + const std::vector &outputs, const lite::Context *ctx, + const mindspore::lite::PrimitiveC *primitive) + : LiteKernel(param, inputs, outputs, ctx, primitive), thread_count_(ctx->thread_num_) { + type_ = (reinterpret_cast(param))->type_; + alpha_ = (float16_t)((reinterpret_cast(param))->alpha_); + } + ~ActivationFp16CPUKernel() override = default; + + int Init() override; + int ReSize() override; + int Run() override; + int DoActivation(int task_id); + int MallocTmpBuffer(); + void FreeTmpBuffer(); + + private: + int thread_count_; + int type_; + float16_t alpha_; + float16_t *fp16_input_ = nullptr; + float16_t *fp16_output_ = nullptr; +}; +} // namespace mindspore::kernel + +#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_ACTIVATION_FP16_H_ diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/arithmetic_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/arithmetic_fp16.cc index 3bb3554999..4c0e7e0f1b 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp16/arithmetic_fp16.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/arithmetic_fp16.cc @@ -15,6 +15,7 @@ */ #include "src/runtime/kernel/arm/fp16/arithmetic_fp16.h" +#include "src/runtime/kernel/arm/fp16/common_fp16.h" #include "nnacl/fp16/arithmetic_fp16.h" #include "nnacl/fp16/cast_fp16.h" #include "schema/model_generated.h" @@ -29,7 +30,6 @@ using mindspore::lite::RET_OK; using mindspore::schema::PrimitiveType_Add; using mindspore::schema::PrimitiveType_Div; -using mindspore::schema::PrimitiveType_Eltwise; using mindspore::schema::PrimitiveType_Equal; using mindspore::schema::PrimitiveType_FloorDiv; using mindspore::schema::PrimitiveType_FloorMod; @@ -47,114 +47,57 @@ using mindspore::schema::PrimitiveType_SquaredDifference; using mindspore::schema::PrimitiveType_Sub; namespace mindspore::kernel { -void ArithmeticFP16CPUKernel::FreeTmpBuffer() { - if (tile_data0_ != nullptr) { - free(tile_data0_); - tile_data0_ = nullptr; - } - if (tile_data1_ != nullptr) { - free(tile_data1_); - tile_data1_ = nullptr; - } - if (input0_fp16_ != nullptr) { - context_->allocator->Free(input0_fp16_); - input0_fp16_ = nullptr; - } - if (input1_fp16_ != nullptr) { - context_->allocator->Free(input1_fp16_); - input1_fp16_ = nullptr; - } - if (output_fp16_ != nullptr) { - context_->allocator->Free(output_fp16_); - output_fp16_ = nullptr; +ARITHMETIC_FUNC_INFO_FP16 arithmetic_fun_table_fp16[] = { + {PrimitiveType_Mul, schema::ActivationType_RELU, ElementMulReluFp16, ElementOptMulReluFp16}, + {PrimitiveType_Mul, schema::ActivationType_RELU6, ElementMulRelu6Fp16, ElementOptMulRelu6Fp16}, + {PrimitiveType_Mul, schema::ActivationType_NO_ACTIVATION, ElementMulFp16, ElementOptMulFp16}, + {PrimitiveType_Add, schema::ActivationType_RELU, ElementAddReluFp16, ElementOptAddReluFp16}, + {PrimitiveType_Add, schema::ActivationType_RELU6, ElementAddRelu6Fp16, ElementOptAddRelu6Fp16}, + {PrimitiveType_Add, schema::ActivationType_NO_ACTIVATION, ElementAddFp16, ElementOptAddFp16}, + {PrimitiveType_Sub, schema::ActivationType_RELU, ElementSubReluFp16, ElementOptSubReluFp16}, + {PrimitiveType_Sub, schema::ActivationType_RELU6, ElementSubRelu6Fp16, ElementOptSubRelu6Fp16}, + 
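
The function table being introduced here (continued below) replaces two nested switch statements with data: lookup becomes a linear scan keyed on (primitive type, activation type). The one subtlety is the loop bound, which must be the entry count rather than sizeof of the array; see the corrected lookup functions after the table. A minimal standalone version with stand-in names:

#include <cstddef>

// Illustrative stand-in for the real fp16 element-wise function pointer type.
typedef int (*ArithmeticFuncFp16Sketch)(const void *in0, const void *in1, void *out, int count);

struct FuncEntry {
  int primitive_type_;
  int activation_type_;
  ArithmeticFuncFp16Sketch func_;
};

static int DemoAdd(const void *, const void *, void *, int) { return 0; }
static FuncEntry table[] = {
    {/*PrimitiveType_Add*/ 1, /*NO_ACTIVATION*/ 0, DemoAdd},
};

// Linear scan keyed on (primitive, activation). The element count must be
// sizeof(table) / sizeof(table[0]); plain sizeof(table) would count bytes and
// walk past the end of the array.
ArithmeticFuncFp16Sketch Lookup(int primitive_type, int activation_type) {
  for (size_t i = 0; i < sizeof(table) / sizeof(table[0]); ++i) {
    if (table[i].primitive_type_ == primitive_type && table[i].activation_type_ == activation_type) {
      return table[i].func_;
    }
  }
  return nullptr;  // caller treats nullptr as "unsupported combination"
}
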
{PrimitiveType_Sub, schema::ActivationType_NO_ACTIVATION, ElementSubFp16, ElementOptSubFp16}, + {PrimitiveType_Div, schema::ActivationType_RELU, ElementDivReluFp16, ElementOptDivReluFp16}, + {PrimitiveType_Div, schema::ActivationType_RELU6, ElementDivRelu6Fp16, ElementOptDivRelu6Fp16}, + {PrimitiveType_Div, schema::ActivationType_NO_ACTIVATION, ElementDivFp16, ElementOptDivFp16}, + {PrimitiveType_FloorMod, schema::ActivationType_NO_ACTIVATION, ElementFloorModFp16, ElementOptFloorModFp16}, + {PrimitiveType_FloorDiv, schema::ActivationType_NO_ACTIVATION, ElementFloorDivFp16, ElementOptFloorDivFp16}, + {PrimitiveType_LogicalAnd, schema::ActivationType_NO_ACTIVATION, ElementLogicalAndFp16, ElementOptLogicalAndFp16}, + {PrimitiveType_LogicalOr, schema::ActivationType_NO_ACTIVATION, ElementLogicalOrFp16, ElementOptLogicalOrFp16}, + {PrimitiveType_SquaredDifference, schema::ActivationType_NO_ACTIVATION, ElementSquaredDifferenceFp16, + ElementOptSquaredDifferenceFp16}, + {PrimitiveType_Maximum, schema::ActivationType_NO_ACTIVATION, ElementMaximumFp16, ElementOptMaximumFp16}, + {PrimitiveType_Minimum, schema::ActivationType_NO_ACTIVATION, ElementMinimumFp16, ElementOptMinimumFp16}, + {PrimitiveType_NotEqual, schema::ActivationType_NO_ACTIVATION, ElementNotEqualFp16, ElementOptNotEqualFp16}, + {PrimitiveType_Equal, schema::ActivationType_NO_ACTIVATION, ElementEqualFp16, ElementOptEqualFp16}, + {PrimitiveType_Less, schema::ActivationType_NO_ACTIVATION, ElementLessFp16, ElementOptLessFp16}, + {PrimitiveType_LessEqual, schema::ActivationType_NO_ACTIVATION, ElementLessEqual, ElementOptLessEqualFp16}, + {PrimitiveType_Greater, schema::ActivationType_NO_ACTIVATION, ElementGreaterFp16, ElementOptGreaterFp16}, + {PrimitiveType_GreaterEqual, schema::ActivationType_NO_ACTIVATION, ElementGreaterEqualFp16, + ElementOptGreaterEqualFp16}, +}; + +ArithmeticFuncFp16 GetArithmeticFun(int primitive_type, int activation_type) { + for (size_t i = 0; i < sizeof(arithmetic_fun_table_fp16) / sizeof(ARITHMETIC_FUNC_INFO_FP16); i++) { + if (arithmetic_fun_table_fp16[i].primitive_type_ == primitive_type && + arithmetic_fun_table_fp16[i].activation_type_ == activation_type) { + return arithmetic_fun_table_fp16[i].func_; + } } + return nullptr; } -ArithmeticFP16CPUKernel::~ArithmeticFP16CPUKernel() { FreeTmpBuffer(); } +ArithmeticOptFuncFp16 GetOptimizedArithmeticFun(int primitive_type, int activation_type) { + for (size_t i = 0; i < sizeof(arithmetic_fun_table_fp16) / sizeof(ARITHMETIC_FUNC_INFO_FP16); i++) { + if (arithmetic_fun_table_fp16[i].primitive_type_ == primitive_type && + arithmetic_fun_table_fp16[i].activation_type_ == activation_type) { + return arithmetic_fun_table_fp16[i].opt_func_; + } + } + return nullptr; +} int ArithmeticFP16CPUKernel::Init() { - switch (op_parameter_->type_) { - case PrimitiveType_Mul: - switch (arithmeticParameter_->activation_type_) { - case schema::ActivationType_RELU: - arithmetic_run_ = ElementMulReluFp16; - break; - case schema::ActivationType_RELU6: - arithmetic_run_ = ElementMulRelu6Fp16; - break; - default: - arithmetic_run_ = ElementMulFp16; - break; - } - break; - case PrimitiveType_Add: - switch (arithmeticParameter_->activation_type_) { - case schema::ActivationType_RELU: - arithmetic_run_ = ElementAddReluFp16; - break; - case schema::ActivationType_RELU6: - arithmetic_run_ = ElementAddRelu6Fp16; - break; - default: - arithmetic_run_ = ElementAddFp16; - break; - } - break; - case PrimitiveType_Sub: - switch (arithmeticParameter_->activation_type_) { - case schema::ActivationType_RELU: - arithmetic_run_ = ElementSubReluFp16; - break; - case 
schema::ActivationType_RELU6: - arithmetic_run_ = ElementSubRelu6Fp16; - break; - default: - arithmetic_run_ = ElementSubFp16; - break; - } - case PrimitiveType_Div: - switch (arithmeticParameter_->activation_type_) { - case schema::ActivationType_RELU: - arithmetic_run_ = ElementDivReluFp16; - break; - case schema::ActivationType_RELU6: - arithmetic_run_ = ElementDivRelu6Fp16; - break; - default: - arithmetic_run_ = ElementDivFp16; - break; - } - case PrimitiveType_FloorMod: - arithmetic_run_ = ElementFloorModFp16; - case PrimitiveType_FloorDiv: - arithmetic_run_ = ElementFloorDivFp16; - case PrimitiveType_LogicalAnd: - arithmetic_run_ = ElementLogicalAndFp16; - case PrimitiveType_LogicalOr: - arithmetic_run_ = ElementLogicalOrFp16; - case PrimitiveType_SquaredDifference: - arithmetic_run_ = ElementSquaredDifferenceFp16; - case PrimitiveType_Maximum: - arithmetic_run_ = ElementMaximumFp16; - case PrimitiveType_Minimum: - arithmetic_run_ = ElementMinimumFp16; - case PrimitiveType_NotEqual: - arithmetic_run_ = ElementNotEqualFp16; - case PrimitiveType_Equal: - arithmetic_run_ = ElementEqualFp16; - case PrimitiveType_Less: - arithmetic_run_ = ElementLessFp16; - case PrimitiveType_LessEqual: - arithmetic_run_ = ElementLessEqual; - case PrimitiveType_Greater: - arithmetic_run_ = ElementGreaterFp16; - case PrimitiveType_GreaterEqual: - arithmetic_run_ = ElementGreaterEqualFp16; - default: - MS_LOG(ERROR) << "Error Operator type " << op_parameter_->type_; - arithmetic_run_ = nullptr; - break; - } if (!InferShapeDone()) { return RET_OK; } @@ -162,185 +105,81 @@ int ArithmeticFP16CPUKernel::Init() { } int ArithmeticFP16CPUKernel::ReSize() { - arithmeticParameter_->in_elements_num0_ = in_tensors_[0]->ElementsNum(); - arithmeticParameter_->in_elements_num1_ = in_tensors_[1]->ElementsNum(); - arithmeticParameter_->out_elements_num_ = out_tensors_[0]->ElementsNum(); + param_->in_elements_num0_ = in_tensors_[0]->ElementsNum(); + param_->in_elements_num1_ = in_tensors_[1]->ElementsNum(); + param_->out_elements_num_ = out_tensors_[0]->ElementsNum(); - if (arithmeticParameter_->in_elements_num0_ == 1 || arithmeticParameter_->in_elements_num1_ == 1) { - switch (arithmeticParameter_->op_parameter_.type_) { - case PrimitiveType_Mul: - arithmeticParameter_->broadcasting_ = false; - switch (arithmeticParameter_->activation_type_) { - case schema::ActivationType_RELU: - arithmetic_opt_run_ = ElementOptMulReluFp16; - break; - case schema::ActivationType_RELU6: - arithmetic_opt_run_ = ElementOptDivRelu6Fp16; - break; - default: - arithmetic_opt_run_ = ElementOptDivFp16; - break; - } - break; - case PrimitiveType_Add: - arithmeticParameter_->broadcasting_ = false; - switch (arithmeticParameter_->activation_type_) { - case schema::ActivationType_RELU: - arithmetic_opt_run_ = ElementOptAddReluFp16; - break; - case schema::ActivationType_RELU6: - arithmetic_opt_run_ = ElementOptAddRelu6Fp16; - break; - default: - arithmetic_opt_run_ = ElementOptAddFp16; - break; - } - break; - case PrimitiveType_Sub: - arithmeticParameter_->broadcasting_ = false; - switch (arithmeticParameter_->activation_type_) { - case schema::ActivationType_RELU: - arithmetic_opt_run_ = ElementOptSubReluFp16; - break; - case schema::ActivationType_RELU6: - arithmetic_opt_run_ = ElementOptSubRelu6Fp16; - break; - default: - arithmetic_opt_run_ = ElementOptSubFp16; - break; - } - break; - case PrimitiveType_Div: - arithmeticParameter_->broadcasting_ = false; - switch (arithmeticParameter_->activation_type_) { - case schema::ActivationType_RELU: - 
arithmetic_opt_run_ = ElementOptDivReluFp16; - break; - case schema::ActivationType_RELU6: - arithmetic_opt_run_ = ElementOptDivRelu6Fp16; - break; - default: - arithmetic_opt_run_ = ElementOptDivFp16; - break; - } - break; - case PrimitiveType_FloorMod: - arithmeticParameter_->broadcasting_ = false; - arithmetic_opt_run_ = ElementOptFloorModFp16; - case PrimitiveType_FloorDiv: - arithmeticParameter_->broadcasting_ = false; - arithmetic_opt_run_ = ElementOptFloorDivFp16; - case PrimitiveType_LogicalAnd: - arithmeticParameter_->broadcasting_ = false; - arithmetic_opt_run_ = ElementOptLogicalAndFp16; - case PrimitiveType_LogicalOr: - arithmeticParameter_->broadcasting_ = false; - arithmetic_opt_run_ = ElementOptLogicalOrFp16; - case PrimitiveType_SquaredDifference: - arithmeticParameter_->broadcasting_ = false; - arithmetic_opt_run_ = ElementOptSquaredDifferenceFp16; - case PrimitiveType_Maximum: - arithmeticParameter_->broadcasting_ = false; - arithmetic_opt_run_ = ElementOptMaximumFp16; - case PrimitiveType_Minimum: - arithmeticParameter_->broadcasting_ = false; - arithmetic_opt_run_ = ElementOptMinimumFp16; - case PrimitiveType_NotEqual: - arithmeticParameter_->broadcasting_ = false; - arithmetic_opt_run_ = ElementOptNotEqualFp16; - case PrimitiveType_Equal: - arithmeticParameter_->broadcasting_ = false; - arithmetic_opt_run_ = ElementOptEqualFp16; - case PrimitiveType_Less: - arithmeticParameter_->broadcasting_ = false; - arithmetic_opt_run_ = ElementOptLessFp16; - case PrimitiveType_LessEqual: - arithmeticParameter_->broadcasting_ = false; - arithmetic_opt_run_ = ElementOptLessEqualFp16; - case PrimitiveType_Greater: - arithmeticParameter_->broadcasting_ = false; - arithmetic_opt_run_ = ElementOptGreaterFp16; - case PrimitiveType_GreaterEqual: - arithmeticParameter_->broadcasting_ = false; - arithmetic_opt_run_ = ElementOptGreaterEqualFp16; - default: + if (param_->in_elements_num0_ == 1 || param_->in_elements_num1_ == 1) { + param_->broadcasting_ = false; + arithmetic_opt_func_ = GetOptimizedArithmeticFun(param_->op_parameter_.type_, param_->activation_type_); + } else { + arithmetic_func_ = GetArithmeticFun(param_->op_parameter_.type_, param_->activation_type_); + } + if (arithmetic_opt_func_ == nullptr && arithmetic_func_ == nullptr) { + MS_LOG(ERROR) << "arithmetic_opt_func_ and arithmetic_func_ function is nullptr!"; + return RET_ERROR; + } + if (param_->broadcasting_) { + outside_ = 1; + for (int i = param_->ndim_ - 1; i >= 0; --i) { + if (param_->in_shape0_[i] != param_->in_shape1_[i]) { + break_pos_ = i; break; + } + outside_ *= param_->out_shape_[i]; } + ComputeStrides(param_->in_shape0_, param_->in_strides0_, param_->ndim_); + ComputeStrides(param_->in_shape1_, param_->in_strides1_, param_->ndim_); + ComputeStrides(param_->out_shape_, param_->out_strides_, param_->ndim_); } return RET_OK; } -int ArithmeticFP16CPUKernel::broadcast_run_(float16_t *input0, float16_t *input1, float16_t *output, int dim) { +int ArithmeticFP16CPUKernel::BroadcastRun(float16_t *input0, float16_t *input1, float16_t *output, int dim, + int out_count, int cur_offset) { if (dim > break_pos_) { - return arithmetic_run_(input0 + out_thread_stride_, input1 + out_thread_stride_, output + out_thread_stride_, - out_count_); + return arithmetic_func_(input0 + cur_offset, input1 + cur_offset, output + cur_offset, out_count); } - for (int i = 0; i < arithmeticParameter_->out_shape_[dim]; ++i) { - int pos0_ = arithmeticParameter_->in_shape0_[0] == 1 ? 0 : i; - int pos1_ = arithmeticParameter_->in_shape1_[0] == 1 ? 
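
The new broadcast path above has two pieces: ReSize folds matching trailing axes into outside_, records the first mismatching axis (scanning from the innermost) as break_pos_, and precomputes row-major strides; BroadcastRun then recurses over the output axes, pinning an input index to 0 wherever that input's axis has extent 1. The self-contained sketch below recurses all the way to single elements for clarity, whereas the real kernel stops at break_pos_ and hands a contiguous inner block to the vectorized arithmetic_func_; the stride helper mirrors what the nnacl ComputeStrides is assumed to do:

#include <cstdio>

// Fixed 2-D example: (1,4) + (3,1) -> (3,4).
static const int kNDim = 2;
static const int in_shape0[] = {1, 4}, in_shape1[] = {3, 1}, out_shape[] = {3, 4};
static int in_strides0[kNDim], in_strides1[kNDim], out_strides[kNDim];

// Row-major strides: stride[i] = product of the dims to the right of axis i.
static void ComputeStridesSketch(const int *shape, int *strides, int ndim) {
  int stride = 1;
  for (int i = ndim - 1; i >= 0; --i) {
    strides[i] = stride;
    stride *= shape[i];
  }
}

static void BroadcastAdd(const float *in0, const float *in1, float *out, int dim) {
  if (dim == kNDim) {  // leaf: one element (the real code stops at break_pos_ instead)
    *out = *in0 + *in1;
    return;
  }
  for (int i = 0; i < out_shape[dim]; ++i) {
    int pos0 = in_shape0[dim] == 1 ? 0 : i;  // broadcast: reuse index 0
    int pos1 = in_shape1[dim] == 1 ? 0 : i;
    BroadcastAdd(in0 + pos0 * in_strides0[dim], in1 + pos1 * in_strides1[dim],
                 out + i * out_strides[dim], dim + 1);
  }
}

int main() {
  ComputeStridesSketch(in_shape0, in_strides0, kNDim);
  ComputeStridesSketch(in_shape1, in_strides1, kNDim);
  ComputeStridesSketch(out_shape, out_strides, kNDim);
  float a[] = {0, 1, 2, 3}, b[] = {10, 20, 30}, out[12];
  BroadcastAdd(a, b, out, 0);
  for (int i = 0; i < 12; ++i) printf("%.0f ", out[i]);  // 10 11 12 13 20 21 22 23 30 31 32 33
  printf("\n");
  return 0;
}
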
0 : i; - return broadcast_run_(input0 + pos0_ * arithmeticParameter_->in_strides0_[dim], - input1 + pos1_ * arithmeticParameter_->in_strides1_[dim], - output + i * arithmeticParameter_->out_strides_[dim], dim + 1); + for (int i = 0; i < param_->out_shape_[dim]; ++i) { + int pos0 = param_->in_shape0_[dim] == 1 ? 0 : i; + int pos1 = param_->in_shape1_[dim] == 1 ? 0 : i; + int ret = BroadcastRun(input0 + pos0 * param_->in_strides0_[dim], input1 + pos1 * param_->in_strides1_[dim], + output + i * param_->out_strides_[dim], dim + 1, out_count, cur_offset); + if (ret != RET_OK) { + return RET_ERROR; + } } return RET_OK; } int ArithmeticFP16CPUKernel::DoArithmetic(int task_id) { - auto input0 = reinterpret_cast(in_tensors_[0]->Data()); - auto input1 = reinterpret_cast(in_tensors_[1]->Data()); - auto output = reinterpret_cast(out_tensors_[0]->Data()); - auto element_num = out_tensors_[0]->ElementsNum(); + int stride_per_thread = UP_DIV(param_->broadcasting_ ? outside_ : param_->out_elements_num_, context_->thread_num_); + int cur_offset = stride_per_thread * task_id; + int cur_count = MSMIN(stride_per_thread, param_->out_elements_num_ - cur_offset); - float16_t *input0_data = input0_fp16_ == nullptr ? input0 : input0_fp16_; - float16_t *input1_data1 = input1_fp16_ == nullptr ? input1 : input1_fp16_; - auto output_data = output_fp16_ == nullptr ? output : output_fp16_; - int stride = UP_DIV(element_num, context_->thread_num_); - int count = MSMIN(stride, element_num - stride * task_id); - auto thread_stride = stride * task_id; - - if (arithmetic_run_ == nullptr) { - MS_LOG(ERROR) << "arithmetic_run function is nullptr!"; - return RET_ERROR; - } - - int error_code = RET_OK; - if (arithmeticParameter_->broadcasting_) { - error_code = - arithmetic_run_(tile_data0_ + thread_stride, tile_data1_ + thread_stride, output_data + thread_stride, count); - } else if (arithmetic_opt_run_ != nullptr) { - if (arithmeticParameter_->in_elements_num0_ == 1) { - error_code = arithmetic_opt_run_(input0_data, input1_data1 + thread_stride, output_data + thread_stride, count, - arithmeticParameter_); - } else if (arithmeticParameter_->in_elements_num1_ == 1) { - error_code = arithmetic_opt_run_(input0_data + thread_stride, input1_data1, output_data + thread_stride, count, - arithmeticParameter_); - } else { - error_code = arithmetic_opt_run_(input0_data + thread_stride, input1_data1 + thread_stride, - output_data + thread_stride, count, arithmeticParameter_); - } + int ret = RET_OK; + if (param_->broadcasting_) { + ret = BroadcastRun(input0_fp16_, input1_fp16_, output_fp16_, 0, cur_count, cur_offset); + } else if (param_->in_elements_num0_ == 1) { + ret = arithmetic_opt_func_(input0_fp16_, input1_fp16_ + cur_offset, output_fp16_ + cur_offset, cur_count, param_); + } else if (param_->in_elements_num1_ == 1) { + ret = arithmetic_opt_func_(input0_fp16_ + cur_offset, input1_fp16_, output_fp16_ + cur_offset, cur_count, param_); } else { - error_code = - arithmetic_run_(input0_data + thread_stride, input1_data1 + thread_stride, output_data + thread_stride, count); + ret = arithmetic_func_(input0_fp16_ + cur_offset, input1_fp16_ + cur_offset, output_fp16_ + cur_offset, cur_count); } - if (error_code != RET_OK) { - FreeTmpBuffer(); - return RET_ERROR; - } - if (output_fp16_ != nullptr) { - auto output_fp32 = reinterpret_cast(out_tensors_[0]->Data()); - Float16ToFloat32(output_data + thread_stride, output_fp32 + thread_stride, count); + if (ret != RET_OK) { + MS_LOG(ERROR) << "DoArithmetic failed, ret = " << ret; } - return 
RET_OK; + return ret; } -static int ArithmeticsRun(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +static int ArithmeticsRunFp16(void *cdata, int task_id) { auto arithmetic_kernel = reinterpret_cast(cdata); - auto error_code = arithmetic_kernel->DoArithmetic(task_id); - if (error_code != RET_OK) { - MS_LOG(ERROR) << "ArithmeticsRun error task_id[" << task_id << "] error_code[" << error_code << "]"; - return RET_ERROR; + auto ret = arithmetic_kernel->DoArithmetic(task_id); + if (ret != RET_OK) { + MS_LOG(ERROR) << "ArithmeticsRunFp16 error task_id[" << task_id << "] ret[" << ret << "]"; } - return RET_OK; + return ret; } int ArithmeticFP16CPUKernel::Run() { @@ -349,77 +188,43 @@ int ArithmeticFP16CPUKernel::Run() { MS_LOG(ERROR) << "Prepare fail!ret: " << ret; return ret; } + auto output_tensor = out_tensors_.at(0); + is_input0_fp32_ = in_tensors_.at(0)->data_type() == kNumberTypeFloat32; + is_input1_fp32_ = in_tensors_.at(1)->data_type() == kNumberTypeFloat32; + is_output_fp32_ = output_tensor->data_type() == kNumberTypeFloat32; - arithmeticParameter_->in_elements_num0_ = in_tensors_[0]->ElementsNum(); - arithmeticParameter_->in_elements_num1_ = in_tensors_[1]->ElementsNum(); - arithmeticParameter_->out_elements_num_ = out_tensors_[0]->ElementsNum(); - if (in_tensors_[0]->data_type() == kNumberTypeFloat32 || in_tensors_[0]->data_type() == kNumberTypeFloat) { - input0_fp16_ = reinterpret_cast( - context_->allocator->Malloc(arithmeticParameter_->in_elements_num0_ * sizeof(float16_t))); - if (input0_fp16_ == nullptr) { - MS_LOG(ERROR) << "malloc data fail!"; - FreeTmpBuffer(); - return RET_ERROR; - } + input0_fp16_ = ConvertInputFp32toFp16(in_tensors_.at(0), context_); + input1_fp16_ = ConvertInputFp32toFp16(in_tensors_.at(1), context_); + output_fp16_ = MallocOutputFp16(output_tensor, context_); + if (input0_fp16_ == nullptr || input1_fp16_ == nullptr || output_fp16_ == nullptr) { + MS_LOG(ERROR) << "Memory allocation failed"; + FreeTmpBuffer(); + return RET_ERROR; } - if (in_tensors_[1]->data_type() == kNumberTypeFloat32 || in_tensors_[1]->data_type() == kNumberTypeFloat) { - input1_fp16_ = reinterpret_cast( - context_->allocator->Malloc(arithmeticParameter_->in_elements_num1_ * sizeof(float16_t))); - if (input0_fp16_ == nullptr) { - MS_LOG(ERROR) << "malloc data fail!"; - FreeTmpBuffer(); - return RET_ERROR; - } + ret = ParallelLaunch(THREAD_POOL_DEFAULT, ArithmeticsRunFp16, this, context_->thread_num_); + if (ret != RET_OK) { + MS_LOG(ERROR) << "ArithmeticsRunFp16 run error error_code[" << ret << "]"; } - if (out_tensors_[0]->data_type() == kNumberTypeFloat32 || out_tensors_[0]->data_type() == kNumberTypeFloat) { - output_fp16_ = reinterpret_cast( - context_->allocator->Malloc(arithmeticParameter_->out_elements_num_ * sizeof(float16_t))); - if (output_fp16_ == nullptr) { - MS_LOG(ERROR) << "malloc data fail!"; - FreeTmpBuffer(); - return RET_ERROR; - } + if (is_output_fp32_) { + Float16ToFloat32(output_fp16_, reinterpret_cast(output_tensor->Data()), output_tensor->ElementsNum()); } + FreeTmpBuffer(); + return ret; +} - if (in_tensors_[0]->data_type() == kNumberTypeFloat32 || in_tensors_[0]->data_type() == kNumberTypeFloat) { - Float32ToFloat16(reinterpret_cast(in_tensors_[0]->Data()), input0_fp16_, - arithmeticParameter_->in_elements_num0_); - } - if (in_tensors_[1]->data_type() == kNumberTypeFloat32 || in_tensors_[1]->data_type() == kNumberTypeFloat) { - Float32ToFloat16(reinterpret_cast(in_tensors_[1]->Data()), input1_fp16_, - arithmeticParameter_->in_elements_num1_); +void 
ArithmeticFP16CPUKernel::FreeTmpBuffer() { + if (is_input0_fp32_) { + context_->allocator->Free(input0_fp16_); + input0_fp16_ = nullptr; } - - if (arithmeticParameter_->broadcasting_) { - auto tile_size = arithmeticParameter_->out_elements_num_ * sizeof(float16_t); - tile_data0_ = reinterpret_cast(malloc(tile_size)); - if (tile_data0_ == nullptr) { - MS_LOG(ERROR) << "malloc data fail!"; - FreeTmpBuffer(); - return RET_ERROR; - } - tile_data1_ = reinterpret_cast(malloc(tile_size)); - if (tile_data1_ == nullptr) { - MS_LOG(ERROR) << "malloc data fail!"; - FreeTmpBuffer(); - return RET_ERROR; - } - auto input0 = reinterpret_cast(in_tensors_[0]->Data()); - auto input1 = reinterpret_cast(in_tensors_[1]->Data()); - - float16_t *input0_data = input0_fp16_ == nullptr ? input0 : input0_fp16_; - float16_t *input1_data1 = input1_fp16_ == nullptr ? input1 : input1_fp16_; - - TileDimensionsFp16(input0_data, input1_data1, tile_data0_, tile_data1_, arithmeticParameter_); + if (is_input1_fp32_) { + context_->allocator->Free(input1_fp16_); + input1_fp16_ = nullptr; } - - ret = LiteBackendParallelLaunch(ArithmeticsRun, this, context_->thread_num_); - if (ret != RET_OK) { - MS_LOG(ERROR) << "Arithmetic function fail!ret: " << ret; - FreeTmpBuffer(); - return ret; + if (is_output_fp32_) { + context_->allocator->Free(output_fp16_); + output_fp16_ = nullptr; } - return RET_OK; } kernel::LiteKernel *CpuArithmeticFp16KernelCreator(const std::vector &inputs, @@ -446,21 +251,20 @@ kernel::LiteKernel *CpuArithmeticFp16KernelCreator(const std::vector &inputs, const std::vector &outputs, const lite::Context *ctx, const mindspore::lite::PrimitiveC *primitive) : LiteKernel(parameter, inputs, outputs, ctx, primitive) { - arithmeticParameter_ = reinterpret_cast(parameter); + param_ = reinterpret_cast(parameter); } - ~ArithmeticFP16CPUKernel() override; + ~ArithmeticFP16CPUKernel() = default; int Init() override; int ReSize() override; int Run() override; int DoArithmetic(int task_id); - int broadcast_run_(float16_t *input0, float16_t *input1, float16_t *output, int dim); + int BroadcastRun(float16_t *input0, float16_t *input1, float16_t *output, int dim, int out_count, + int out_thread_stride); private: void FreeTmpBuffer(); - int break_pos_; int outside_; - int out_thread_stride_; - int out_count_; - float16_t *tile_data0_ = nullptr; - float16_t *tile_data1_ = nullptr; + int break_pos_; + bool is_input0_fp32_ = false; + bool is_input1_fp32_ = false; + bool is_output_fp32_ = false; float16_t *input0_fp16_ = nullptr; float16_t *input1_fp16_ = nullptr; float16_t *output_fp16_ = nullptr; - ArithmeticParameter *arithmeticParameter_ = nullptr; - ArithmeticRun arithmetic_run_ = nullptr; - ArithmeticOptRun arithmetic_opt_run_ = nullptr; + ArithmeticParameter *param_ = nullptr; + ArithmeticFuncFp16 arithmetic_func_ = nullptr; + ArithmeticOptFuncFp16 arithmetic_opt_func_ = nullptr; }; } // namespace mindspore::kernel #endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_ARITHMETIC_FP16_H_ diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/batchnorm_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/batchnorm_fp16.cc index 6de61af026..c3735673d3 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp16/batchnorm_fp16.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/batchnorm_fp16.cc @@ -15,6 +15,7 @@ */ #include "src/runtime/kernel/arm/fp16/batchnorm_fp16.h" +#include "src/runtime/kernel/arm/fp16/common_fp16.h" #include "nnacl/fp16/batchnorm_fp16.h" #include "nnacl/fp16/cast_fp16.h" #include "src/kernel_registry.h" @@ 
-23,46 +24,72 @@ using mindspore::lite::KernelRegistrar; using mindspore::schema::PrimitiveType_BatchNorm; namespace mindspore::kernel { -int BatchnormFp16CPUKernel::DoExecute(int task_id) { - auto param = reinterpret_cast(op_parameter_); - - if (in_tensors_.at(0)->data_type() == kNumberTypeFloat32) { - auto input = in_tensors_.at(0); - auto mean = in_tensors_.at(1); - auto variance = in_tensors_.at(2); - auto output = out_tensors_.at(0); - - auto input_fp16 = context_->allocator->Malloc(input->ElementsNum() * sizeof(float16_t)); - auto mean_fp16 = context_->allocator->Malloc(mean->ElementsNum() * sizeof(float16_t)); - auto variance_fp16 = context_->allocator->Malloc(variance->ElementsNum() * sizeof(float16_t)); - auto output_fp16 = context_->allocator->Malloc(output->ElementsNum() * sizeof(float16_t)); - if (input_fp16 == nullptr || mean_fp16 == nullptr || variance_fp16 == nullptr || output_fp16 == nullptr) { - context_->allocator->Free(input_fp16); - context_->allocator->Free(mean_fp16); - context_->allocator->Free(variance_fp16); - context_->allocator->Free(output_fp16); +int BatchnormFp16CPUKernel::InitConstTensor() { + is_input_fp32_ = in_tensors_.at(0)->data_type() == kNumberTypeFloat32; + is_output_fp32_ = out_tensors_.at(0)->data_type() == kNumberTypeFloat32; + if (is_input_fp32_) { + auto mean_fp32 = in_tensors_.at(1); + auto variance_fp32 = in_tensors_.at(2); + mean_ = malloc(mean_fp32->ElementsNum() * sizeof(float16_t)); + variance_ = malloc(variance_fp32->ElementsNum() * sizeof(float16_t)); + if (mean_ == nullptr || variance_ == nullptr) { + FreeMeanAndVariance(); + return RET_ERROR; } - Float32ToFloat16(reinterpret_cast(input->Data()), - reinterpret_cast(input_fp16), input->ElementsNum()); - Float32ToFloat16(reinterpret_cast(mean->Data()), - reinterpret_cast(mean_fp16), mean->ElementsNum()); - Float32ToFloat16(reinterpret_cast(variance->Data()), - reinterpret_cast(variance_fp16), variance->ElementsNum()); + Float32ToFloat16(reinterpret_cast(mean_fp32->Data()), + reinterpret_cast(mean_), mean_fp32->ElementsNum()); + Float32ToFloat16(reinterpret_cast(variance_fp32->Data()), + reinterpret_cast(variance_), variance_fp32->ElementsNum()); + } else { + BatchnormCPUKernel::InitConstTensor(); + } + return RET_OK; +} - BatchNormFp16(input_fp16, mean_fp16, variance_fp16, param, task_id, output_fp16); +int BatchnormFp16CPUKernel::Run() { + auto ret = Prepare(); + if (ret != RET_OK) { + MS_LOG(ERROR) << "Prepare fail! 
Ret error code: " << ret; + return ret; + } + auto input_tensor = in_tensors_.at(0); + auto output_tensor = out_tensors_.at(0); + input_ = ConvertInputFp32toFp16(input_tensor, context_); + output_ = MallocOutputFp16(output_tensor, context_); + if (input_ == nullptr || output_ == nullptr) { + FreeInputAndOutput(); + MS_LOG(ERROR) << "input or output is nullptr"; + return RET_ERROR; + } - Float16ToFloat32(reinterpret_cast(output_fp16), reinterpret_cast(output), - output->ElementsNum()); - context_->allocator->Free(input_fp16); - context_->allocator->Free(mean_fp16); - context_->allocator->Free(variance_fp16); - context_->allocator->Free(output_fp16); - return mindspore::lite::RET_OK; + ret = ParallelLaunch(THREAD_POOL_DEFAULT, BatchNormRun, this, op_parameter_->thread_num_); + if (ret != RET_OK) { + MS_LOG(ERROR) << "BatchnormRun error error_code[" << ret << "]"; } - BatchNormFp16(in_tensors_.at(0)->Data(), mean_, variance_, param, task_id, out_tensors_.at(0)->Data()); + if (is_output_fp32_) { + Float16ToFloat32(output_, reinterpret_cast(output_tensor->Data()), output_tensor->ElementsNum()); + } + FreeInputAndOutput(); + return ret; +} + +int BatchnormFp16CPUKernel::DoExecute(int task_id) { + auto param = reinterpret_cast(op_parameter_); + BatchNormFp16(input_, mean_, variance_, param, task_id, output_); return mindspore::lite::RET_OK; } +void BatchnormFp16CPUKernel::FreeInputAndOutput() { + if (is_input_fp32_) { + context_->allocator->Free(input_); + input_ = nullptr; + } + if (is_output_fp32_) { + context_->allocator->Free(output_); + output_ = nullptr; + } +} + kernel::LiteKernel *CpuBatchnormFp16KernelCreator(const std::vector &inputs, const std::vector &outputs, OpParameter *opParameter, const lite::Context *ctx, @@ -83,5 +110,5 @@ kernel::LiteKernel *CpuBatchnormFp16KernelCreator(const std::vector(cdata)->DoCast(thread_id); + return reinterpret_cast(cdata)->DoCast(task_id); } } // namespace @@ -91,7 +91,7 @@ int CastFp16CPUKernel::Run() { if (data_num_ == 0) { return RET_OK; } - return LiteBackendParallelLaunch(CastRun, this, op_parameter_->thread_num_); + return ParallelLaunch(THREAD_POOL_DEFAULT, CastRun, this, op_parameter_->thread_num_); } kernel::LiteKernel *CpuCastFp16KernelCreator(const std::vector &inputs, diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_1x1_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_1x1_fp16.cc index 1bd9c81f36..7c0e35c706 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_1x1_fp16.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_1x1_fp16.cc @@ -38,8 +38,7 @@ int Convolution1x1FP16CPUKernel::InitMatmulParam() { matmul_param_->deep_ = conv_param_->input_channel_; matmul_param_->row_16_ = UP_ROUND(matmul_param_->row_, C16NUM); matmul_param_->col_8_ = UP_ROUND(matmul_param_->col_, C8NUM); - matmul_param_->act_type_ = (conv_param_->is_relu6_) ? ActType_Relu6 : ActType_No; - matmul_param_->act_type_ = (conv_param_->is_relu_) ? 
ActType_Relu : matmul_param_->act_type_; + matmul_param_->act_type_ = conv_param_->act_type_; return RET_OK; } @@ -57,7 +56,7 @@ Convolution1x1FP16CPUKernel::~Convolution1x1FP16CPUKernel() { } int Convolution1x1FP16CPUKernel::InitConv1x1Param() { - pre_trans_input_ = (conv_param_->pad_h_ != 0 || conv_param_->pad_w_ != 0 || conv_param_->stride_h_ != 1 || + pre_trans_input_ = (conv_param_->pad_u_ != 0 || conv_param_->pad_l_ != 0 || conv_param_->stride_h_ != 1 || conv_param_->stride_w_ != 1); thread_count_ = MSMIN(op_parameter_->thread_num_, UP_DIV(matmul_param_->col_, C8NUM)); @@ -70,11 +69,19 @@ int Convolution1x1FP16CPUKernel::InitConv1x1Param() { return RET_MEMORY_FAILED; } memset(pack_input_, 0, matmul_param_->row_16_ * matmul_param_->deep_ * sizeof(float16_t)); + + if (pre_trans_input_) { + input_ptr_ = reinterpret_cast(malloc(matmul_param_->row_ * matmul_param_->deep_ * sizeof(float16_t))); + if (input_ptr_ == nullptr) { + MS_LOG(ERROR) << "Conv1x1 Malloc input_ptr_ error!"; + return RET_MEMORY_FAILED; + } + memset(input_ptr_, 0, matmul_param_->row_ * matmul_param_->deep_ * sizeof(float16_t)); + } return RET_OK; } int Convolution1x1FP16CPUKernel::InitWeightBias() { - auto bias_tensor = in_tensors_.at(kBiasIndex); auto weight_tensor = in_tensors_.at(kWeightIndex); auto input_channel = weight_tensor->Channel(); auto output_channel = weight_tensor->Batch(); @@ -87,6 +94,7 @@ int Convolution1x1FP16CPUKernel::InitWeightBias() { } memset(bias_data_, 0, size); if (in_tensors_.size() == 3) { + auto bias_tensor = in_tensors_.at(kBiasIndex); if (bias_tensor->data_type() == kNumberTypeFloat16) { memcpy(bias_data_, bias_tensor->Data(), output_channel * sizeof(float16_t)); } else { @@ -108,21 +116,19 @@ int Convolution1x1FP16CPUKernel::InitWeightBias() { } int Convolution1x1FP16CPUKernel::Init() { - if (!InferShapeDone()) { - return RET_OK; - } - matmul_param_ = new (std::nothrow) MatMulParameter(); if (matmul_param_ == nullptr) { MS_LOG(ERROR) << "Init matmul_param_ failed."; return RET_ERROR; } - int ret = InitWeightBias(); if (ret != RET_OK) { MS_LOG(ERROR) << "Init weight bias failed."; return ret; } + if (!InferShapeDone()) { + return RET_OK; + } return ReSize(); } @@ -131,6 +137,10 @@ void Convolution1x1FP16CPUKernel::FreeTmpBuffer() { free(pack_input_); pack_input_ = nullptr; } + if (pre_trans_input_ && input_ptr_ != nullptr) { + free(input_ptr_); + input_ptr_ = nullptr; + } return; } @@ -182,7 +192,7 @@ int Convolution1x1FP16CPUKernel::RunImpl(int task_id) { return RET_OK; } -static int Convolution1x1Fp16Impl(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +static int Convolution1x1Fp16Impl(void *cdata, int task_id) { auto conv = reinterpret_cast(cdata); auto error_code = conv->RunImpl(task_id); if (error_code != RET_OK) { @@ -205,21 +215,12 @@ int Convolution1x1FP16CPUKernel::Run() { return ret; } - if (pre_trans_input_) { - input_ptr_ = reinterpret_cast( - ctx_->allocator->Malloc(matmul_param_->row_ * matmul_param_->deep_ * sizeof(float16_t))); - if (input_ptr_ == nullptr) { - MS_LOG(ERROR) << "Conv1x1 Malloc input_ptr_ error!"; - return RET_MEMORY_FAILED; - } - } - for (int batch_index = 0; batch_index < conv_param_->input_batch_; batch_index++) { Pre1x1Trans( execute_input_ + batch_index * conv_param_->input_h_ * conv_param_->input_w_ * conv_param_->input_channel_, execute_output_ + batch_index * matmul_param_->row_ * matmul_param_->col_); - int error_code = LiteBackendParallelLaunch(Convolution1x1Fp16Impl, this, thread_count_); + int error_code = 
ParallelLaunch(THREAD_POOL_DEFAULT, Convolution1x1Fp16Impl, this, thread_count_); if (error_code != RET_OK) { MS_LOG(ERROR) << "conv1x1 fp16 error error_code[" << error_code << "]"; return RET_ERROR; @@ -229,10 +230,6 @@ int Convolution1x1FP16CPUKernel::Run() { ConvolutionBaseFP16CPUKernel::IfCastOutput(); ConvolutionBaseFP16CPUKernel::FreeTmpBuffer(); - if (pre_trans_input_ && input_ptr_ != nullptr) { - ctx_->allocator->Free(input_ptr_); - input_ptr_ = nullptr; - } return RET_OK; } } // namespace mindspore::kernel diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_3x3_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_3x3_fp16.cc index 883d7bca0b..99eef1760e 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_3x3_fp16.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_3x3_fp16.cc @@ -95,8 +95,16 @@ int Convolution3x3FP16CPUKernel::InitTmpBuffer() { const int tile_num = 16; const int k_plane = 36; int oC8 = UP_DIV(conv_param_->output_channel_, C8NUM); + int iC8 = UP_DIV(conv_param_->input_channel_, C8NUM); MS_ASSERT(ctx_->allocator != nullptr); + size_t tile_buffer_size = thread_count_ * tile_num * k_plane * iC8 * C8NUM * sizeof(float16_t); + tile_buffer_ = reinterpret_cast(malloc(tile_buffer_size)); + if (tile_buffer_ == nullptr) { + MS_LOG(ERROR) << "malloc tile_buffer_ failed."; + return RET_ERROR; + } + size_t block_unit_buffer_size = thread_count_ * k_plane * C8NUM * sizeof(float16_t); block_unit_buffer_ = reinterpret_cast(ctx_->allocator->Malloc(block_unit_buffer_size)); if (block_unit_buffer_ == nullptr) { @@ -134,14 +142,14 @@ void Convolution3x3FP16CPUKernel::ConfigInputOutput() { } int Convolution3x3FP16CPUKernel::Init() { - if (!InferShapeDone()) { - return RET_OK; - } auto ret = InitWeightBias(); if (ret != RET_OK) { MS_LOG(ERROR) << "Init weight bias failed."; return RET_ERROR; } + if (!InferShapeDone()) { + return RET_OK; + } return ReSize(); } @@ -152,10 +160,6 @@ int Convolution3x3FP16CPUKernel::ReSize() { return ret; } - if (tile_buffer_ != nullptr) { - free(tile_buffer_); - tile_buffer_ = nullptr; - } if (nhwc4_input_ != nullptr) { free(nhwc4_input_); nhwc4_input_ = nullptr; @@ -166,10 +170,8 @@ int Convolution3x3FP16CPUKernel::ReSize() { MS_LOG(ERROR) << "ConvolutionBase init failed."; return ret; } - const int tile_num = 16; - const int k_plane = 36; - int iC8 = UP_DIV(conv_param_->input_channel_, C8NUM); + int iC8 = UP_DIV(conv_param_->input_channel_, C8NUM); size_t nhwc8_input_size = iC8 * C8NUM * conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * sizeof(float16_t); nhwc4_input_ = malloc(nhwc8_input_size); @@ -179,14 +181,6 @@ int Convolution3x3FP16CPUKernel::ReSize() { } memset(nhwc4_input_, 0, nhwc8_input_size); - size_t tile_buffer_size = thread_count_ * tile_num * k_plane * iC8 * C8NUM * sizeof(float16_t); - tile_buffer_ = reinterpret_cast(malloc(tile_buffer_size)); - if (tile_buffer_ == nullptr) { - MS_LOG(ERROR) << "malloc tile_buffer_ failed."; - return RET_ERROR; - } - memset(tile_buffer_, 0, tile_buffer_size); - return RET_OK; } @@ -197,7 +191,7 @@ int Convolution3x3FP16CPUKernel::RunImpl(int task_id) { return RET_OK; } -static int Convolution3x3Fp16Impl(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +static int Convolution3x3Fp16Impl(void *cdata, int task_id) { auto conv = reinterpret_cast(cdata); auto error_code = conv->RunImpl(task_id); if (error_code != RET_OK) { @@ -207,6 +201,28 @@ static int Convolution3x3Fp16Impl(int task_id, LiteParallelGroupEnv *penv, void 
return RET_OK; } +int Convolution3x3FP16CPUKernel::PostProcess() { + auto act_type = conv_param_->act_type_; + switch (act_type) { + case ActType_No: + UnPack3x3OutputFp16(tmp_out_, execute_output_, conv_param_->output_batch_, conv_param_->output_h_, + conv_param_->output_w_, conv_param_->output_channel_); + break; + case ActType_Relu: + UnPack3x3ReluOutputFp16(tmp_out_, execute_output_, conv_param_->output_batch_, conv_param_->output_h_, + conv_param_->output_w_, conv_param_->output_channel_); + break; + case ActType_Relu6: + UnPack3x3Relu6OutputFp16(tmp_out_, execute_output_, conv_param_->output_batch_, conv_param_->output_h_, + conv_param_->output_w_, conv_param_->output_channel_); + break; + default: + MS_LOG(ERROR) << "Unsupport activation type."; + return RET_ERROR; + } + return RET_OK; +} + int Convolution3x3FP16CPUKernel::Run() { auto ret = Prepare(); if (ret != RET_OK) { @@ -229,27 +245,18 @@ int Convolution3x3FP16CPUKernel::Run() { int in_channel = conv_param_->input_channel_; PackNHWCToNHWC8Fp16(reinterpret_cast(execute_input_), nhwc4_input_, in_batch, in_h * in_w, in_channel); - int error_code = LiteBackendParallelLaunch(Convolution3x3Fp16Impl, this, thread_count_); + int error_code = ParallelLaunch(THREAD_POOL_DEFAULT, Convolution3x3Fp16Impl, this, thread_count_); if (error_code != RET_OK) { MS_LOG(ERROR) << "conv3x3 fp16 error error_code[" << error_code << "]"; FreeTmpBuffer(); return RET_ERROR; } - // get real output - bool relu = conv_param_->is_relu_; - bool relu6 = conv_param_->is_relu6_; - if (relu) { - UnPack3x3ReluOutputFp16(tmp_out_, execute_output_, conv_param_->output_batch_, conv_param_->output_h_, - conv_param_->output_w_, conv_param_->output_channel_); - } else if (relu6) { - UnPack3x3Relu6OutputFp16(tmp_out_, execute_output_, conv_param_->output_batch_, conv_param_->output_h_, - conv_param_->output_w_, conv_param_->output_channel_); - } else { - UnPack3x3OutputFp16(tmp_out_, execute_output_, conv_param_->output_batch_, conv_param_->output_h_, - conv_param_->output_w_, conv_param_->output_channel_); + ret = PostProcess(); + if (ret != RET_OK) { + MS_LOG(ERROR) << "Post process failed."; + return ret; } - ConvolutionBaseFP16CPUKernel::IfCastOutput(); ConvolutionBaseFP16CPUKernel::FreeTmpBuffer(); FreeTmpBuffer(); diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_3x3_fp16.h b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_3x3_fp16.h index e14ebc6f99..507c6ae206 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_3x3_fp16.h +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_3x3_fp16.h @@ -39,10 +39,6 @@ class Convolution3x3FP16CPUKernel : public ConvolutionBaseFP16CPUKernel { free(transformed_filter_addr_); transformed_filter_addr_ = nullptr; } - if (tile_buffer_ != nullptr) { - free(tile_buffer_); - tile_buffer_ = nullptr; - } } int Init() override; @@ -52,9 +48,14 @@ class Convolution3x3FP16CPUKernel : public ConvolutionBaseFP16CPUKernel { int InitWeightBias(); int InitTmpBuffer(); void ConfigInputOutput(); + int PostProcess(); private: void FreeTmpBuffer() { + if (tile_buffer_ != nullptr) { + ctx_->allocator->Free(tile_buffer_); + tile_buffer_ = nullptr; + } if (block_unit_buffer_ != nullptr) { ctx_->allocator->Free(block_unit_buffer_); block_unit_buffer_ = nullptr; diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.cc index cf88b2be25..fadaa906a5 100644 --- 
a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.cc @@ -15,6 +15,7 @@ */ #include "src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.h" +#include "src/runtime/kernel/arm/fp16/convolution_depthwise_slidewindow_fp16.h" #include "nnacl/fp16/pack_fp16.h" #include "nnacl/fp16/cast_fp16.h" #include "schema/model_generated.h" @@ -30,72 +31,34 @@ using mindspore::schema::PrimitiveType_DepthwiseConv2D; namespace mindspore::kernel { ConvolutionDepthwiseFp16CPUKernel::~ConvolutionDepthwiseFp16CPUKernel() { - if (sliding_ != nullptr) { - delete sliding_; - sliding_ = nullptr; - } if (packed_weight_ != nullptr) { delete packed_weight_; packed_weight_ = nullptr; } - FreeTmpBuffer(); -} - -void ConvolutionDepthwiseFp16CPUKernel::FreeTmpBuffer() { - if (need_align_) { - if (packed_input_ != nullptr) { - delete packed_input_; - packed_input_ = nullptr; - } - if (packed_output_ != nullptr) { - delete packed_output_; - packed_output_ = nullptr; - } - } -} - -int ConvolutionDepthwiseFp16CPUKernel::InitBuffer() { - if (conv_param_->input_channel_ % C4NUM != 0) { - need_align_ = true; - int C8 = UP_DIV(conv_param_->input_channel_, C8NUM); - int pack_input_size = conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * C8NUM * C8; - packed_input_ = reinterpret_cast(malloc(pack_input_size * sizeof(float16_t))); - if (packed_input_ == nullptr) { - MS_LOG(ERROR) << "Malloc buffer failed."; - return RET_ERROR; - } - - int pack_output_size = conv_param_->output_batch_ * conv_param_->output_h_ * conv_param_->output_w_ * C8NUM * C8; - packed_output_ = reinterpret_cast(malloc(pack_output_size * sizeof(float16_t))); - if (packed_output_ == nullptr) { - MS_LOG(ERROR) << "Malloc buffer failed."; - return RET_ERROR; - } - } - return RET_OK; } int ConvolutionDepthwiseFp16CPUKernel::InitWeightBias() { // init weight: o, h, w, i; o == group, i == 1 + ConvolutionBaseFP16CPUKernel::GetExecuteFilter(); + auto weight_tensor = in_tensors_[kWeightIndex]; - int OC8 = UP_DIV(weight_tensor->Batch(), C8NUM); - auto origin_weight = reinterpret_cast(weight_tensor->Data()); - int pack_weight_size = C8NUM * OC8 * weight_tensor->Height() * weight_tensor->Width(); + int channel = weight_tensor->Batch(); + int pack_weight_size = channel * weight_tensor->Height() * weight_tensor->Width(); packed_weight_ = reinterpret_cast(malloc(pack_weight_size * sizeof(float16_t))); if (packed_weight_ == nullptr) { MS_LOG(ERROR) << "Malloc buffer failed."; return RET_ERROR; } - PackNCHWFp32ToNC8HW8Fp16(origin_weight, packed_weight_, 1, weight_tensor->Height() * weight_tensor->Width(), - weight_tensor->Batch()); + PackNCHWToNHWCFp16(fp16_weight_, packed_weight_, 1, weight_tensor->Height() * weight_tensor->Width(), + weight_tensor->Batch()); - bias_data_ = reinterpret_cast(malloc(C8NUM * OC8 * sizeof(float16_t))); + bias_data_ = reinterpret_cast(malloc(channel * sizeof(float16_t))); if (bias_data_ == nullptr) { MS_LOG(ERROR) << "Malloc buffer failed."; return RET_ERROR; } - memset(bias_data_, 0, C8NUM * OC8 * sizeof(float16_t)); + memset(bias_data_, 0, channel * sizeof(float16_t)); auto bias_fp16 = reinterpret_cast(bias_data_); if (in_tensors_.size() == kInputSize2) { auto bias_tensor = in_tensors_.at(kBiasIndex); @@ -104,18 +67,10 @@ int ConvolutionDepthwiseFp16CPUKernel::InitWeightBias() { bias_fp16[i] = (float16_t)ori_bias[i]; } } - - conv_param_->thread_num_ = MSMIN(thread_count_, OC8); return RET_OK; } int 
ConvolutionDepthwiseFp16CPUKernel::Init() { - sliding_ = new (std::nothrow) SlidingWindowParam; - if (sliding_ == nullptr) { - MS_LOG(ERROR) << "new sliding window param failed."; - return RET_ERROR; - } - auto ret = InitWeightBias(); if (ret != 0) { MS_LOG(ERROR) << "Convolution depthwise fp16 InitWeightBias failed."; @@ -129,28 +84,21 @@ int ConvolutionDepthwiseFp16CPUKernel::Init() { } int ConvolutionDepthwiseFp16CPUKernel::ReSize() { - FreeTmpBuffer(); auto ret = ConvolutionBaseCPUKernel::Init(); if (ret != RET_OK) { return ret; } - InitSlidingParamConvDw(sliding_, conv_param_, C8NUM); - - ret = InitBuffer(); - if (ret != 0) { - MS_LOG(ERROR) << "Convolution depthwise fp16 InitBuffer failed."; - return RET_ERROR; - } + conv_param_->thread_num_ = MSMIN(thread_count_, conv_param_->output_h_); return RET_OK; } int ConvolutionDepthwiseFp16CPUKernel::Execute(int task_id) { - ConvDwC8Fp16(packed_output_, packed_input_, packed_weight_, reinterpret_cast(bias_data_), conv_param_, - sliding_, task_id); + ConvDwFp16(execute_output_, execute_input_, packed_weight_, reinterpret_cast(bias_data_), conv_param_, + task_id); return RET_OK; } -static int ConvDwFp16Run(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +static int ConvDwFp16Run(void *cdata, int task_id) { auto conv_dw_fp16 = reinterpret_cast(cdata); auto ret = conv_dw_fp16->Execute(task_id); if (ret != RET_OK) { @@ -161,40 +109,28 @@ static int ConvDwFp16Run(int task_id, LiteParallelGroupEnv *penv, void *cdata) { } int ConvolutionDepthwiseFp16CPUKernel::Run() { + if (conv_param_->input_channel_ != conv_param_->output_channel_) { + MS_LOG(ERROR) << "Only support input channel equals output channel."; + return RET_ERROR; + } auto ret = Prepare(); if (ret != RET_OK) { MS_LOG(ERROR) << "Prepare failed."; return RET_ERROR; } - if (conv_param_->input_channel_ != conv_param_->output_channel_) { - MS_LOG(ERROR) << "Only support input channel equals output channel."; - return RET_ERROR; - } ret = ConvolutionBaseFP16CPUKernel::GetExecuteTensor(); if (ret != RET_OK) { MS_LOG(ERROR) << "Get Execute tensor failed."; return ret; } - if (need_align_) { - PackNHWCToNHWC8Fp16(execute_input_, packed_input_, conv_param_->input_batch_, - conv_param_->input_h_ * conv_param_->input_w_, conv_param_->input_channel_); - } else { - packed_input_ = execute_input_; - } - if (!need_align_) { - packed_output_ = execute_output_; - } - ret = LiteBackendParallelLaunch(ConvDwFp16Run, this, conv_param_->thread_num_); + ret = ParallelLaunch(THREAD_POOL_DEFAULT, ConvDwFp16Run, this, conv_param_->thread_num_); if (ret != RET_OK) { MS_LOG(ERROR) << "ConvDwFp16Run error: error_code[" << ret << "]"; return RET_ERROR; } - if (need_align_) { - PackNHWC8ToNHWCFp16(packed_output_, execute_output_, conv_param_->output_batch_, - conv_param_->output_h_ * conv_param_->output_w_, conv_param_->output_channel_); - } + ConvolutionBaseFP16CPUKernel::IfCastOutput(); ConvolutionBaseFP16CPUKernel::FreeTmpBuffer(); return RET_OK; @@ -207,7 +143,14 @@ kernel::LiteKernel *CpuConvDwFp16KernelCreator(const std::vector(opParameter); + kernel::LiteKernel *kernel; + if (conv_param->input_channel_ < 32) { + kernel = + new (std::nothrow) kernel::ConvolutionDepthwiseSWFp16CPUKernel(opParameter, inputs, outputs, ctx, primitive); + } else { + kernel = new (std::nothrow) kernel::ConvolutionDepthwiseFp16CPUKernel(opParameter, inputs, outputs, ctx, primitive); + } if (kernel == nullptr) { MS_LOG(ERROR) << "kernel is nullptr."; return nullptr; diff --git 
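Every Run() in this patch makes the same threading swap: LiteBackendParallelLaunch gives way to ParallelLaunch(THREAD_POOL_DEFAULT, ...), and the worker callback changes from (int task_id, LiteParallelGroupEnv *penv, void *cdata) to (void *cdata, int task_id), dropping the unused environment pointer. A toy model of the new contract, assuming nothing about the real thread pool in src/runtime/runtime_api.h beyond the call shape:

#include <thread>
#include <vector>

using KernelCallback = int (*)(void *cdata, int task_id);

// Toy stand-in for ParallelLaunch(THREAD_POOL_DEFAULT, cb, cdata, task_num):
// runs the callback once per task id and returns the first non-zero code.
int ToyParallelLaunch(KernelCallback cb, void *cdata, int task_num) {
  std::vector<std::thread> workers;
  std::vector<int> codes(static_cast<size_t>(task_num), 0);
  for (int t = 0; t < task_num; ++t) {
    workers.emplace_back([cb, cdata, &codes, t] { codes[static_cast<size_t>(t)] = cb(cdata, t); });
  }
  for (auto &w : workers) {
    w.join();
  }
  for (int code : codes) {
    if (code != 0) return code;
  }
  return 0;
}

Note also that ReSize() above now caps conv_param_->thread_num_ at output_h_, since the row-based ConvDwFp16 splits work across output rows.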
a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.h b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.h index 2355fe8199..ff0a3d0314 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.h +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.h @@ -25,14 +25,12 @@ #ifdef __cplusplus extern "C" { #endif -void ConvDwC8Fp16(float16_t *output_data, const float16_t *input_data, const float16_t *weight_data, - const float16_t *bias_data, const ConvParameter *conv_param, const SlidingWindowParam *sliding, - int task_id); +void ConvDwFp16(float16_t *output_data, const float16_t *input_data, const float16_t *weight_data, + const float16_t *bias_data, const ConvParameter *conv_param, int task_id); #ifdef __cplusplus } #endif - namespace mindspore::kernel { class ConvolutionDepthwiseFp16CPUKernel : public ConvolutionBaseFP16CPUKernel { public: @@ -46,17 +44,11 @@ class ConvolutionDepthwiseFp16CPUKernel : public ConvolutionBaseFP16CPUKernel { int ReSize() override; int Run() override; - int InitBuffer(); int InitWeightBias(); int Execute(int task_id); private: - void FreeTmpBuffer(); - SlidingWindowParam *sliding_ = nullptr; float16_t *packed_weight_ = nullptr; - float16_t *packed_input_ = nullptr; - float16_t *packed_output_ = nullptr; - bool need_align_ = false; }; } // namespace mindspore::kernel diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_slidewindow_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_slidewindow_fp16.cc new file mode 100644 index 0000000000..4e8aa956f8 --- /dev/null +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_slidewindow_fp16.cc @@ -0,0 +1,190 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "src/runtime/kernel/arm/fp16/convolution_depthwise_slidewindow_fp16.h" +#include "nnacl/fp16/pack_fp16.h" +#include "nnacl/fp16/cast_fp16.h" +#include "schema/model_generated.h" +#include "src/kernel_registry.h" +#include "include/errorcode.h" +#include "src/runtime/runtime_api.h" + +using mindspore::kernel::KERNEL_ARCH::kCPU; +using mindspore::lite::KernelRegistrar; +using mindspore::lite::RET_ERROR; +using mindspore::lite::RET_OK; +using mindspore::schema::PrimitiveType_DepthwiseConv2D; + +namespace mindspore::kernel { +ConvolutionDepthwiseSWFp16CPUKernel::~ConvolutionDepthwiseSWFp16CPUKernel() { + if (sliding_ != nullptr) { + delete sliding_; + sliding_ = nullptr; + } + if (packed_weight_ != nullptr) { + delete packed_weight_; + packed_weight_ = nullptr; + } +} + +int ConvolutionDepthwiseSWFp16CPUKernel::InitBuffer() { + if (conv_param_->input_channel_ % C4NUM != 0) { + need_align_ = true; + int C8 = UP_DIV(conv_param_->input_channel_, C8NUM); + int pack_input_size = conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * C8NUM * C8; + packed_input_ = reinterpret_cast(context_->allocator->Malloc(pack_input_size * sizeof(float16_t))); + if (packed_input_ == nullptr) { + MS_LOG(ERROR) << "Malloc buffer failed."; + return RET_ERROR; + } + + int pack_output_size = conv_param_->output_batch_ * conv_param_->output_h_ * conv_param_->output_w_ * C8NUM * C8; + packed_output_ = reinterpret_cast(context_->allocator->Malloc(pack_output_size * sizeof(float16_t))); + if (packed_output_ == nullptr) { + MS_LOG(ERROR) << "Malloc buffer failed."; + return RET_ERROR; + } + } + return RET_OK; +} + +int ConvolutionDepthwiseSWFp16CPUKernel::InitWeightBias() { + // init weight: o, h, w, i; o == group, i == 1 + auto weight_tensor = in_tensors_[kWeightIndex]; + int OC8 = UP_DIV(weight_tensor->Batch(), C8NUM); + auto origin_weight = reinterpret_cast(weight_tensor->Data()); + int pack_weight_size = C8NUM * OC8 * weight_tensor->Height() * weight_tensor->Width(); + + packed_weight_ = reinterpret_cast(malloc(pack_weight_size * sizeof(float16_t))); + if (packed_weight_ == nullptr) { + MS_LOG(ERROR) << "Malloc buffer failed."; + return RET_ERROR; + } + PackNCHWFp32ToNC8HW8Fp16(origin_weight, packed_weight_, 1, weight_tensor->Height() * weight_tensor->Width(), + weight_tensor->Batch()); + + bias_data_ = reinterpret_cast(malloc(C8NUM * OC8 * sizeof(float16_t))); + if (bias_data_ == nullptr) { + MS_LOG(ERROR) << "Malloc buffer failed."; + return RET_ERROR; + } + memset(bias_data_, 0, C8NUM * OC8 * sizeof(float16_t)); + auto bias_fp16 = reinterpret_cast(bias_data_); + if (in_tensors_.size() == kInputSize2) { + auto bias_tensor = in_tensors_.at(kBiasIndex); + auto ori_bias = reinterpret_cast(bias_tensor->Data()); + for (int i = 0; i < bias_tensor->ElementsNum(); i++) { + bias_fp16[i] = (float16_t)ori_bias[i]; + } + } + + conv_param_->thread_num_ = MSMIN(thread_count_, OC8); + return RET_OK; +} + +int ConvolutionDepthwiseSWFp16CPUKernel::Init() { + sliding_ = new (std::nothrow) SlidingWindowParam; + if (sliding_ == nullptr) { + MS_LOG(ERROR) << "new sliding window param failed."; + return RET_ERROR; + } + + auto ret = InitWeightBias(); + if (ret != 0) { + MS_LOG(ERROR) << "Convolution depthwise fp16 InitWeightBias failed."; + return RET_ERROR; + } + + if (!InferShapeDone()) { + return RET_OK; + } + return ReSize(); +} + +int ConvolutionDepthwiseSWFp16CPUKernel::ReSize() { + auto ret = ConvolutionBaseCPUKernel::Init(); + if (ret != RET_OK) { + return ret; + } + 
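ConvolutionDepthwiseSWFp16CPUKernel::InitBuffer() above only packs when the channel count is unaligned (the alignment test in the patch is input_channel_ % C4NUM, while the packed layout itself is 8-lane), and the padded buffer size follows nnacl's usual UP_DIV arithmetic. The size computation in isolation:

#include <cstddef>

constexpr int kC8Num = 8;  // C8NUM in nnacl

constexpr int UpDiv(int x, int y) { return (x + y - 1) / y; }

// Element count of an NHWC8-packed buffer: the channel dimension is padded up
// to a multiple of 8 so the fp16 kernels can always consume full 8-lane vectors.
size_t PackedNhwc8Elems(int batch, int height, int width, int channel) {
  int c8 = UpDiv(channel, kC8Num);
  return static_cast<size_t>(batch) * height * width * kC8Num * c8;
}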
InitSlidingParamConvDw(sliding_, conv_param_, C8NUM); + return RET_OK; +} + +int ConvolutionDepthwiseSWFp16CPUKernel::Execute(int task_id) { + ConvDwC8Fp16(packed_output_, packed_input_, packed_weight_, reinterpret_cast(bias_data_), conv_param_, + sliding_, task_id); + return RET_OK; +} + +static int ConvDwSWFp16Run(void *cdata, int task_id) { + auto conv_dw_fp16 = reinterpret_cast(cdata); + auto ret = conv_dw_fp16->Execute(task_id); + if (ret != RET_OK) { + MS_LOG(ERROR) << "ConvolutionDepthwiseSWFp16Run error task_id[" << task_id << "] error_code[" << ret << "]"; + return RET_ERROR; + } + return RET_OK; +} + +int ConvolutionDepthwiseSWFp16CPUKernel::Run() { + if (conv_param_->input_channel_ != conv_param_->output_channel_) { + MS_LOG(ERROR) << "Only support input channel equals output channel."; + return RET_ERROR; + } + + auto ret = Prepare(); + if (ret != RET_OK) { + MS_LOG(ERROR) << "Prepare failed."; + return RET_ERROR; + } + ret = InitBuffer(); + if (ret != 0) { + MS_LOG(ERROR) << "Convolution depthwise fp16 InitBuffer failed."; + return RET_ERROR; + } + + ret = ConvolutionBaseFP16CPUKernel::GetExecuteTensor(); + if (ret != RET_OK) { + MS_LOG(ERROR) << "Get Execute tensor failed."; + return ret; + } + if (need_align_) { + PackNHWCToNHWC8Fp16(execute_input_, packed_input_, conv_param_->input_batch_, + conv_param_->input_h_ * conv_param_->input_w_, conv_param_->input_channel_); + } else { + packed_input_ = execute_input_; + } + if (!need_align_) { + packed_output_ = execute_output_; + } + + ret = ParallelLaunch(THREAD_POOL_DEFAULT, ConvDwSWFp16Run, this, conv_param_->thread_num_); + if (ret != RET_OK) { + MS_LOG(ERROR) << "ConvDwSWFp16Run error: error_code[" << ret << "]"; + return RET_ERROR; + } + if (need_align_) { + PackNHWC8ToNHWCFp16(packed_output_, execute_output_, conv_param_->output_batch_, + conv_param_->output_h_ * conv_param_->output_w_, conv_param_->output_channel_); + context_->allocator->Free(packed_input_); + context_->allocator->Free(packed_output_); + } + ConvolutionBaseFP16CPUKernel::IfCastOutput(); + ConvolutionBaseFP16CPUKernel::FreeTmpBuffer(); + return RET_OK; +} + +} // namespace mindspore::kernel diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_slidewindow_fp16.h b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_slidewindow_fp16.h new file mode 100644 index 0000000000..582fde8eec --- /dev/null +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_slidewindow_fp16.h @@ -0,0 +1,61 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
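This sliding-window variant exists alongside the reworked row-based kernel; CpuConvDwFp16KernelCreator, changed earlier in this patch, picks between them by channel count. The rule reduced to a sketch (the 32-channel threshold is the patch's own; the kernel types here are toy stand-ins for the real classes):

#include <memory>

struct ConvParam { int input_channel = 0; };
struct Kernel { virtual ~Kernel() = default; };
struct SlidingWindowKernel : Kernel {};  // plays ConvolutionDepthwiseSWFp16CPUKernel
struct RowKernel : Kernel {};            // plays ConvolutionDepthwiseFp16CPUKernel

std::unique_ptr<Kernel> PickDepthwiseKernel(const ConvParam &param) {
  if (param.input_channel < 32) {
    return std::make_unique<SlidingWindowKernel>();  // narrow inputs
  }
  return std::make_unique<RowKernel>();  // wide inputs
}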
+ */ + +#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_CONVOLUTION_DEPTHWISE_SW_FP16_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_CONVOLUTION_DEPTHWISE_SW_FP16_H_ + +#include +#include "src/lite_kernel.h" +#include "src/runtime/kernel/arm/fp16/convolution_base_fp16.h" +#include "nnacl/fp16/conv_depthwise_fp16.h" + +#ifdef __cplusplus +extern "C" { +#endif +void ConvDwC8Fp16(float16_t *output_data, const float16_t *input_data, const float16_t *weight_data, + const float16_t *bias_data, const ConvParameter *conv_param, const SlidingWindowParam *sliding, + int task_id); +#ifdef __cplusplus +} +#endif + +namespace mindspore::kernel { +class ConvolutionDepthwiseSWFp16CPUKernel : public ConvolutionBaseFP16CPUKernel { + public: + ConvolutionDepthwiseSWFp16CPUKernel(OpParameter *parameter, const std::vector &inputs, + const std::vector &outputs, const Context *ctx, + const mindspore::lite::PrimitiveC *primitive) + : ConvolutionBaseFP16CPUKernel(parameter, inputs, outputs, ctx, primitive) {} + ~ConvolutionDepthwiseSWFp16CPUKernel() override; + + int Init() override; + int ReSize() override; + int Run() override; + + int InitBuffer(); + int InitWeightBias(); + int Execute(int task_id); + + private: + SlidingWindowParam *sliding_ = nullptr; + float16_t *packed_weight_ = nullptr; + float16_t *packed_input_ = nullptr; + float16_t *packed_output_ = nullptr; + bool need_align_ = false; +}; +} // namespace mindspore::kernel + +#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_CONVOLUTION_DEPTHWISE_SW_FP16_H_ diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_fp16.cc index 8a6f3baf05..e229074f67 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_fp16.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_fp16.cc @@ -38,10 +38,13 @@ using mindspore::schema::PrimitiveType_Conv2D; namespace mindspore::kernel { int ConvolutionFP16CPUKernel::InitWeightBias() { - int kernel_h = conv_param_->kernel_h_; - int kernel_w = conv_param_->kernel_w_; - int in_channel = conv_param_->input_channel_; - int out_channel = conv_param_->output_channel_; + auto filter_tensor = in_tensors_.at(kWeightIndex); + int kernel_h = filter_tensor->Height(); + int kernel_w = filter_tensor->Width(); + int in_channel = filter_tensor->Channel(); + int out_channel = filter_tensor->Batch(); + conv_param_->input_channel_ = in_channel; + conv_param_->output_channel_ = out_channel; int oc8 = UP_DIV(out_channel, C8NUM); int ic4 = UP_DIV(in_channel, C4NUM); int kernel_plane = kernel_h * kernel_w; @@ -81,38 +84,34 @@ int ConvolutionFP16CPUKernel::InitWeightBias() { } int ConvolutionFP16CPUKernel::InitTmpBuffer() { - int kernel_h = conv_param_->kernel_h_; - int kernel_w = conv_param_->kernel_w_; int in_batch = conv_param_->input_batch_; int in_channel = conv_param_->input_channel_; int out_channel = conv_param_->output_channel_; int channel_block = UP_DIV(in_channel, C4NUM); - int kernel_plane = kernel_h * kernel_w; - - // malloc packed_inputs int cal_num = 16; int output_count = conv_param_->output_h_ * conv_param_->output_w_; int output_tile_count = UP_DIV(output_count, cal_num); + int kernel_plane = conv_param_->kernel_h_ * conv_param_->kernel_w_; int unit_size = kernel_plane * channel_block * C4NUM; int packed_input_size = output_tile_count * cal_num * unit_size; - packed_input_ = reinterpret_cast(malloc(in_batch * packed_input_size * sizeof(float16_t))); + packed_input_ = + reinterpret_cast(ctx_->allocator->Malloc(in_batch 
* packed_input_size * sizeof(float16_t))); if (packed_input_ == nullptr) { MS_LOG(ERROR) << "malloc packed_input_ failed."; return RET_ERROR; } - memset(packed_input_, 0, in_batch * packed_input_size * sizeof(float16_t)); - size_t nhwc4_input_size = channel_block * C4NUM * conv_param_->input_batch_ * conv_param_->input_h_ * - conv_param_->input_w_ * sizeof(float16_t); - nhwc4_input_ = malloc(nhwc4_input_size); + size_t nhwc4_input_size = + channel_block * C4NUM * in_batch * conv_param_->input_h_ * conv_param_->input_w_ * sizeof(float16_t); + nhwc4_input_ = ctx_->allocator->Malloc(nhwc4_input_size); if (nhwc4_input_ == nullptr) { MS_LOG(ERROR) << "malloc nhwc4_input_ failed."; return RET_ERROR; } - memset(nhwc4_input_, 0, nhwc4_input_size); - tmp_output_block_ = reinterpret_cast(malloc(cal_num * out_channel * sizeof(float16_t))); + tmp_output_block_ = + reinterpret_cast(ctx_->allocator->Malloc(thread_count_ * cal_num * out_channel * sizeof(float16_t))); if (tmp_output_block_ == nullptr) { MS_LOG(ERROR) << "malloc tmp_output_block_ failed."; return RET_ERROR; @@ -133,9 +132,15 @@ void ConvolutionFP16CPUKernel::ConfigInputOutput() { } int ConvolutionFP16CPUKernel::Init() { + auto ret = InitWeightBias(); + if (ret != RET_OK) { + MS_LOG(ERROR) << "Init weight bias failed."; + return RET_ERROR; + } if (!InferShapeDone()) { return RET_OK; } + ConfigInputOutput(); return ReSize(); } @@ -146,28 +151,11 @@ int ConvolutionFP16CPUKernel::ReSize() { return ret; } - FreeTmpBuffer(); - if (nhwc4_input_ != nullptr) { - free(nhwc4_input_); - nhwc4_input_ = nullptr; - } - ret = ConvolutionBaseCPUKernel::Init(); if (ret != RET_OK) { MS_LOG(ERROR) << "ConvolutionBase init fail!ret: " << ret; return ret; } - ret = InitWeightBias(); - if (ret != RET_OK) { - MS_LOG(ERROR) << "Init weight bias failed."; - return RET_ERROR; - } - ret = InitTmpBuffer(); - if (ret != RET_OK) { - MS_LOG(ERROR) << "Init tmp buffer failed."; - return RET_ERROR; - } - ConfigInputOutput(); return RET_OK; } @@ -177,7 +165,7 @@ int ConvolutionFP16CPUKernel::RunImpl(int task_id) { return RET_OK; } -static int ConvolutionFp16Impl(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +static int ConvolutionFp16Impl(void *cdata, int task_id) { auto conv = reinterpret_cast(cdata); auto error_code = conv->RunImpl(task_id); if (error_code != RET_OK) { @@ -200,18 +188,25 @@ int ConvolutionFP16CPUKernel::Run() { return ret; } + ret = InitTmpBuffer(); + if (ret != RET_OK) { + MS_LOG(ERROR) << "Init tmp buffer failed."; + return RET_ERROR; + } + int in_batch = conv_param_->input_batch_; int in_h = conv_param_->input_h_; int in_w = conv_param_->input_w_; int in_channel = conv_param_->input_channel_; convert_func_(reinterpret_cast(execute_input_), nhwc4_input_, in_batch, in_h * in_w, in_channel); - int error_code = LiteBackendParallelLaunch(ConvolutionFp16Impl, this, thread_count_); + int error_code = ParallelLaunch(THREAD_POOL_DEFAULT, ConvolutionFp16Impl, this, thread_count_); if (error_code != RET_OK) { MS_LOG(ERROR) << "conv fp16 error error_code[" << error_code << "]"; + FreeTmpBuffer(); return RET_ERROR; } - + FreeTmpBuffer(); ConvolutionBaseFP16CPUKernel::IfCastOutput(); ConvolutionBaseFP16CPUKernel::FreeTmpBuffer(); return RET_OK; diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_fp16.h b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_fp16.h index 7ea1052b17..41c4aada5c 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_fp16.h +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_fp16.h @@ 
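The buffer changes in ConvolutionFP16CPUKernel::InitTmpBuffer() above follow one pattern used throughout this patch: scratch memory (packed_input_, nhwc4_input_, tmp_output_block_) is taken from the context allocator during Run() and handed back on every exit path, rather than malloc()'d at ReSize() time and held until destruction. The flow, condensed (Allocator is a stand-in for the lite context allocator):

#include <cstddef>
#include <cstdlib>

struct Allocator {
  void *Malloc(size_t size) { return std::malloc(size); }
  void Free(void *ptr) { std::free(ptr); }
};

int RunWithScratch(Allocator *alloc, size_t scratch_size) {
  void *scratch = alloc->Malloc(scratch_size);
  if (scratch == nullptr) {
    return 1;  // RET_ERROR: allocation failed
  }
  int ret = 0;  // ... pack inputs, ParallelLaunch(...), unpack outputs ...
  alloc->Free(scratch);  // released on success and on failure alike
  return ret;
}

With run-scoped buffers the kernel holds no scratch memory between invocations, which is why the memset calls on the old long-lived buffers could be dropped as well.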
-29,7 +29,16 @@ class ConvolutionFP16CPUKernel : public ConvolutionBaseFP16CPUKernel { const std::vector &outputs, const Context *ctx, const mindspore::lite::PrimitiveC *primitive) : ConvolutionBaseFP16CPUKernel(parameter, inputs, outputs, ctx, primitive) {} - ~ConvolutionFP16CPUKernel() override { FreeTmpBuffer(); } + ~ConvolutionFP16CPUKernel() override { + if (fp16_weight_ != nullptr) { + free(fp16_weight_); + fp16_weight_ = nullptr; + } + if (packed_weight_ != nullptr) { + free(packed_weight_); + packed_weight_ = nullptr; + } + } int Init() override; int ReSize() override; @@ -41,21 +50,16 @@ class ConvolutionFP16CPUKernel : public ConvolutionBaseFP16CPUKernel { private: void FreeTmpBuffer() { - if (fp16_weight_ != nullptr) { - free(fp16_weight_); - fp16_weight_ = nullptr; + if (nhwc4_input_ != nullptr) { + ctx_->allocator->Free(nhwc4_input_); + nhwc4_input_ = nullptr; } - if (packed_input_ != nullptr) { - free(packed_input_); + ctx_->allocator->Free(packed_input_); packed_input_ = nullptr; } - if (packed_weight_ != nullptr) { - free(packed_weight_); - packed_weight_ = nullptr; - } if (tmp_output_block_ != nullptr) { - free(tmp_output_block_); + ctx_->allocator->Free(tmp_output_block_); tmp_output_block_ = nullptr; } } diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_sw_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_sw_fp16.cc index 7b7d1e17b3..eb06a05c56 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_sw_fp16.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_sw_fp16.cc @@ -126,14 +126,14 @@ void ConvolutionSWFP16CPUKernel::ConfigInputOutput() { } int ConvolutionSWFP16CPUKernel::Init() { - if (!InferShapeDone()) { - return RET_OK; - } auto ret = InitWeightBias(); if (ret != RET_OK) { MS_LOG(ERROR) << "Init weight bias failed."; return RET_ERROR; } + if (!InferShapeDone()) { + return RET_OK; + } ConfigInputOutput(); return ReSize(); } @@ -186,7 +186,7 @@ int ConvolutionSWFP16CPUKernel::RunImpl(int task_id) { return RET_OK; } -static int ConvolutionSWFp16Impl(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +static int ConvolutionSWFp16Impl(void *cdata, int task_id) { auto conv = reinterpret_cast(cdata); auto error_code = conv->RunImpl(task_id); if (error_code != RET_OK) { @@ -219,7 +219,7 @@ int ConvolutionSWFP16CPUKernel::Run() { int in_channel = conv_param_->input_channel_; convert_func_(reinterpret_cast(execute_input_), nhwc4_input_, in_batch, in_h * in_w, in_channel); - int error_code = LiteBackendParallelLaunch(ConvolutionSWFp16Impl, this, thread_count_); + int error_code = ParallelLaunch(THREAD_POOL_DEFAULT, ConvolutionSWFp16Impl, this, thread_count_); if (error_code != RET_OK) { MS_LOG(ERROR) << "conv fp16 error error_code[" << error_code << "]"; FreeTmpBuffer(); diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.cc index 0f55fe633b..6ec07a0b7f 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.cc @@ -17,7 +17,6 @@ #include "src/runtime/kernel/arm/fp16/convolution_winograd_fp16.h" #include "src/runtime/kernel/arm/fp16/matrix_fp16.h" #include "nnacl/fp16/conv_fp16.h" -#include "nnacl/fp16/common_func.h" #include "nnacl/fp16/cast_fp16.h" #include "nnacl/fp16/pack_fp16.h" #include "nnacl/fp16/winograd_transform_fp16.h" @@ -219,6 +218,14 @@ int ConvolutionWinogradFP16CPUKernel::InitTmpBuffer() { 
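The Init() reordering applied here to ConvolutionSWFP16CPUKernel (and just above to ConvolutionFP16CPUKernel, and below to the Winograd kernel) encodes a simple contract: weight packing depends only on the constant filter tensor, so it always runs in Init(), while anything derived from inferred shapes waits for ReSize() behind the InferShapeDone() gate. The skeleton, with stub bodies:

struct Fp16KernelSkeleton {
  bool shapes_known = false;

  int InitWeightBias() { return 0; }  // packs filters; needs only the weight tensor
  int ReSize() { return 0; }          // shape-dependent: thread counts, sliding params
  bool InferShapeDone() const { return shapes_known; }

  int Init() {
    if (InitWeightBias() != 0) {
      return 1;  // RET_ERROR
    }
    if (!InferShapeDone()) {
      return 0;  // shapes not known yet; ReSize() will run later
    }
    return ReSize();
  }
};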
int output_h = conv_param_->output_h_; int output_w = conv_param_->output_w_; int oc8 = UP_DIV(channel_out, C8NUM); + int ic8 = UP_DIV(conv_param_->input_channel_, C8NUM); + + size_t tile_buffer_size = thread_count_ * cal_num * input_unit_ * input_unit_ * ic8 * C8NUM * sizeof(float16_t); + trans_input_ = reinterpret_cast(ctx_->allocator->Malloc(tile_buffer_size)); + if (trans_input_ == nullptr) { + MS_LOG(ERROR) << "malloc trans_input_ failed."; + return RET_ERROR; + } gemm_out_ = reinterpret_cast( ctx_->allocator->Malloc(thread_count_ * cal_num * input_unit_ * input_unit_ * oc8 * C8NUM * sizeof(float16_t))); @@ -270,19 +277,18 @@ int ConvolutionWinogradFP16CPUKernel::ConfigInputOutput() { } int ConvolutionWinogradFP16CPUKernel::Init() { - if (!InferShapeDone()) { - return RET_OK; - } kernel_unit_ = conv_param_->kernel_h_; input_unit_ = output_unit_ + kernel_unit_ - 1; conv_param_->input_unit_ = input_unit_; conv_param_->output_unit_ = output_unit_; - auto ret = InitWeightBias(); if (ret != RET_OK) { MS_LOG(ERROR) << "Init weight bias failed."; return RET_ERROR; } + if (!InferShapeDone()) { + return RET_OK; + } return ReSize(); } @@ -297,10 +303,6 @@ int ConvolutionWinogradFP16CPUKernel::ReSize() { free(nhwc4_input_); nhwc4_input_ = nullptr; } - if (trans_input_ != nullptr) { - free(trans_input_); - trans_input_ = nullptr; - } ret = ConvolutionBaseCPUKernel::Init(); if (ret != RET_OK) { @@ -312,10 +314,8 @@ int ConvolutionWinogradFP16CPUKernel::ReSize() { conv_param_->input_unit_ = input_unit_; conv_param_->output_unit_ = output_unit_; - int cal_num = 16; int channel_in = conv_param_->input_channel_; int ic8 = UP_DIV(channel_in, C8NUM); - size_t nhwc8_input_size = ic8 * C8NUM * conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * sizeof(float16_t); nhwc4_input_ = malloc(nhwc8_input_size); @@ -325,14 +325,6 @@ int ConvolutionWinogradFP16CPUKernel::ReSize() { } memset(nhwc4_input_, 0, nhwc8_input_size); - size_t tile_buffer_size = thread_count_ * cal_num * input_unit_ * input_unit_ * ic8 * C8NUM * sizeof(float16_t); - trans_input_ = reinterpret_cast(malloc(tile_buffer_size)); - if (trans_input_ == nullptr) { - MS_LOG(ERROR) << "malloc trans_input_ failed."; - return RET_ERROR; - } - memset(trans_input_, 0, tile_buffer_size); - ret = ConfigInputOutput(); if (ret != RET_OK) { MS_LOG(ERROR) << "ConfigInputOutput failed."; @@ -348,7 +340,7 @@ int ConvolutionWinogradFP16CPUKernel::RunImpl(int task_id) { return RET_OK; } -static int ConvolutionWinogradFp16Impl(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +static int ConvolutionWinogradFp16Impl(void *cdata, int task_id) { auto conv = reinterpret_cast(cdata); auto error_code = conv->RunImpl(task_id); if (error_code != RET_OK) { @@ -358,6 +350,28 @@ static int ConvolutionWinogradFp16Impl(int task_id, LiteParallelGroupEnv *penv, return RET_OK; } +int ConvolutionWinogradFP16CPUKernel::PostProcess() { + auto act_type = conv_param_->act_type_; + switch (act_type) { + case ActType_No: + UnPackWinogradOutputFp16(tmp_out_data_, execute_output_, conv_param_->output_batch_, conv_param_->output_h_, + conv_param_->output_w_, conv_param_->output_channel_, output_unit_); + break; + case ActType_Relu: + UnPackWinogradReluOutputFp16(tmp_out_data_, execute_output_, conv_param_->output_batch_, conv_param_->output_h_, + conv_param_->output_w_, conv_param_->output_channel_, output_unit_); + break; + case ActType_Relu6: + UnPackWinogradRelu6OutputFp16(tmp_out_data_, execute_output_, conv_param_->output_batch_, conv_param_->output_h_, + 
conv_param_->output_w_, conv_param_->output_channel_, output_unit_); + break; + default: + MS_LOG(ERROR) << "Unsupported activation type."; + return RET_ERROR; + } + return RET_OK; +} + int ConvolutionWinogradFP16CPUKernel::Run() { auto prepare_ret = Prepare(); if (prepare_ret != RET_OK) { @@ -383,23 +397,17 @@ int ConvolutionWinogradFP16CPUKernel::Run() { int in_channel = conv_param_->input_channel_; PackNHWCToNHWC8Fp16(execute_input_, nhwc4_input_, in_batch, in_h * in_w, in_channel); - int error_code = LiteBackendParallelLaunch(ConvolutionWinogradFp16Impl, this, thread_count_); + int error_code = ParallelLaunch(THREAD_POOL_DEFAULT, ConvolutionWinogradFp16Impl, this, thread_count_); if (error_code != RET_OK) { MS_LOG(ERROR) << "conv winograd error error_code[" << error_code << "]"; FreeTmpBuffer(); return RET_ERROR; } - // get real output - if (conv_param_->is_relu_) { - UnPackWinogradReluOutputFp16(tmp_out_data_, execute_output_, conv_param_->output_batch_, conv_param_->output_h_, - conv_param_->output_w_, conv_param_->output_channel_, output_unit_); - } else if (conv_param_->is_relu6_) { - UnPackWinogradRelu6OutputFp16(tmp_out_data_, execute_output_, conv_param_->output_batch_, conv_param_->output_h_, - conv_param_->output_w_, conv_param_->output_channel_, output_unit_); - } else { - UnPackWinogradOutputFp16(tmp_out_data_, execute_output_, conv_param_->output_batch_, conv_param_->output_h_, - conv_param_->output_w_, conv_param_->output_channel_, output_unit_); + ret = PostProcess(); + if (ret != RET_OK) { + MS_LOG(ERROR) << "Post process failed."; + return ret; } ConvolutionBaseFP16CPUKernel::IfCastOutput(); ConvolutionBaseFP16CPUKernel::FreeTmpBuffer(); diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.h b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.h index d9dc03611e..f8a63de2d4 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.h +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.h @@ -38,10 +38,6 @@ class ConvolutionWinogradFP16CPUKernel : public ConvolutionBaseFP16CPUKernel { free(fp16_weight_); fp16_weight_ = nullptr; } - if (trans_input_ != nullptr) { - free(trans_input_); - trans_input_ = nullptr; - } if (trans_weight_ != nullptr) { delete trans_weight_; trans_weight_ = nullptr; @@ -56,9 +52,14 @@ class ConvolutionWinogradFP16CPUKernel : public ConvolutionBaseFP16CPUKernel { int MallocFilterMatrix(int oc_block, int oc_block_num); int InitTmpBuffer(); int ConfigInputOutput(); + int PostProcess(); private: void FreeTmpBuffer() { + if (trans_input_ != nullptr) { + ctx_->allocator->Free(trans_input_); + trans_input_ = nullptr; + } if (tmp_data_ != nullptr) { ctx_->allocator->Free(tmp_data_); tmp_data_ = nullptr; @@ -85,7 +86,7 @@ class ConvolutionWinogradFP16CPUKernel : public ConvolutionBaseFP16CPUKernel { TmpBufferAddressFp16 tmp_buffer_address_list_[4]; }; int WinogradFilterTransformFp16(const float16_t *weight_data, Matrix *trans_weight, int kernel_unit, int input_unit, - ConvParameter *conv_param, int oc_block); + ConvParameter *conv_param, int oc_block); } // namespace mindspore::kernel #endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_CONVOLUTION_WINOGRAD_FP16_H_ diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_depthwise_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_depthwise_fp16.cc index 3a8ca200f2..8018f43f63 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_depthwise_fp16.cc +++
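The Winograd kernel gets the same buffer treatment: trans_input_ moves out of ReSize() into InitTmpBuffer(), comes from the context allocator, and is sized per worker thread. Its element count as a standalone helper (cal_num is the 16-tile step size visible in the removed ReSize() code; names otherwise mirror the patch):

#include <cstddef>

constexpr int kC8Num = 8;    // C8NUM
constexpr int kCalNum = 16;  // output tiles processed per step

constexpr int UpDiv(int x, int y) { return (x + y - 1) / y; }

// Elements in trans_input_: one (input_unit x input_unit) transformed tile with
// 8-padded channels, per tile slot, per thread.
size_t TransInputElems(int thread_count, int input_unit, int input_channel) {
  int ic8 = UpDiv(input_channel, kC8Num);
  return static_cast<size_t>(thread_count) * kCalNum * input_unit * input_unit * ic8 * kC8Num;
}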
b/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_depthwise_fp16.cc @@ -37,20 +37,6 @@ DeconvolutionDepthwiseFp16CPUKernel::~DeconvolutionDepthwiseFp16CPUKernel() { delete packed_weight_; packed_weight_ = nullptr; } - FreeTmpBuffer(); -} - -void DeconvolutionDepthwiseFp16CPUKernel::FreeTmpBuffer() { - if (need_align_) { - if (packed_input_ != nullptr) { - delete packed_input_; - packed_input_ = nullptr; - } - if (packed_output_ != nullptr) { - delete packed_output_; - packed_output_ = nullptr; - } - } } int DeconvolutionDepthwiseFp16CPUKernel::InitSlideParam() { @@ -69,14 +55,14 @@ int DeconvolutionDepthwiseFp16CPUKernel::InitSlideParam() { int DeconvolutionDepthwiseFp16CPUKernel::InitBuffer() { int C8 = UP_DIV(conv_param_->input_channel_, C8NUM); int pack_input_size = conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * C8NUM * C8; - packed_input_ = reinterpret_cast(malloc(pack_input_size * sizeof(float16_t))); + packed_input_ = reinterpret_cast(context_->allocator->Malloc(pack_input_size * sizeof(float16_t))); if (packed_input_ == nullptr) { MS_LOG(ERROR) << "Malloc buffer failed."; return RET_ERROR; } int pack_output_size = conv_param_->output_batch_ * conv_param_->output_h_ * conv_param_->output_w_ * C8NUM * C8; - packed_output_ = reinterpret_cast(malloc(pack_output_size * sizeof(float16_t))); + packed_output_ = reinterpret_cast(context_->allocator->Malloc(pack_output_size * sizeof(float16_t))); if (packed_output_ == nullptr) { MS_LOG(ERROR) << "Malloc buffer failed."; return RET_ERROR; @@ -137,17 +123,11 @@ int DeconvolutionDepthwiseFp16CPUKernel::Init() { } int DeconvolutionDepthwiseFp16CPUKernel::ReSize() { - FreeTmpBuffer(); InitSlideParam(); auto ret = ConvolutionBaseCPUKernel::Init(); if (ret != RET_OK) { return ret; } - ret = InitBuffer(); - if (ret != 0) { - MS_LOG(ERROR) << "Deconvolution depthwise fp16 InitBuffer failed."; - return RET_ERROR; - } return RET_OK; } @@ -157,7 +137,7 @@ int DeconvolutionDepthwiseFp16CPUKernel::Execute(int task_id) { return RET_OK; } -static int DeconvDwFp16Run(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +static int DeconvDwFp16Run(void *cdata, int task_id) { auto deconv_dw_fp16 = reinterpret_cast(cdata); auto ret = deconv_dw_fp16->Execute(task_id); if (ret != RET_OK) { @@ -168,13 +148,18 @@ static int DeconvDwFp16Run(int task_id, LiteParallelGroupEnv *penv, void *cdata) } int DeconvolutionDepthwiseFp16CPUKernel::Run() { + if (conv_param_->input_channel_ != conv_param_->output_channel_) { + MS_LOG(ERROR) << "Only support input channel equals output channel."; + return RET_ERROR; + } auto ret = Prepare(); if (ret != RET_OK) { MS_LOG(ERROR) << "Prepare failed."; return RET_ERROR; } - if (conv_param_->input_channel_ != conv_param_->output_channel_) { - MS_LOG(ERROR) << "Only support input channel equals output channel."; + ret = InitBuffer(); + if (ret != 0) { + MS_LOG(ERROR) << "Deconvolution depthwise fp16 InitBuffer failed."; return RET_ERROR; } @@ -193,7 +178,7 @@ int DeconvolutionDepthwiseFp16CPUKernel::Run() { if (!need_align_) { packed_output_ = execute_output_; } - ret = LiteBackendParallelLaunch(DeconvDwFp16Run, this, conv_param_->thread_num_); + ret = ParallelLaunch(THREAD_POOL_DEFAULT, DeconvDwFp16Run, this, conv_param_->thread_num_); if (ret != RET_OK) { MS_LOG(ERROR) << "DeconvDwFp16Run error: error_code[" << ret << "]"; return RET_ERROR; @@ -202,6 +187,8 @@ int DeconvolutionDepthwiseFp16CPUKernel::Run() { if (need_align_) { PackNHWC8ToNHWCFp16(packed_output_, execute_output_, 
conv_param_->output_batch_, conv_param_->output_h_ * conv_param_->output_w_, conv_param_->output_channel_); + context_->allocator->Free(packed_input_); + context_->allocator->Free(packed_output_); } ConvolutionBaseFP16CPUKernel::IfCastOutput(); ConvolutionBaseFP16CPUKernel::FreeTmpBuffer(); diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_depthwise_fp16.h b/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_depthwise_fp16.h index 984c19731e..539d129664 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_depthwise_fp16.h +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_depthwise_fp16.h @@ -52,7 +52,6 @@ class DeconvolutionDepthwiseFp16CPUKernel : public ConvolutionBaseFP16CPUKernel int Execute(int task_id); private: - void FreeTmpBuffer(); SlidingWindowParam *sliding_ = nullptr; float16_t *packed_weight_ = nullptr; float16_t *packed_input_ = nullptr; diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_fp16.cc index 0deb852cf9..817bb91497 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_fp16.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_fp16.cc @@ -137,7 +137,7 @@ void DeConvolutionFp16CPUKernel::FreeRunBuf() { return; } -static int DeConvFp16Run(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +static int DeConvFp16Run(void *cdata, int task_id) { auto deconv = reinterpret_cast(cdata); auto error_code = deconv->DoDeconv(task_id); if (error_code != RET_OK) { @@ -188,7 +188,7 @@ int DeConvolutionFp16CPUKernel::Run() { for (int batch_index = 0; batch_index < conv_param_->input_batch_; batch_index++) { RowMajor2Col16MajorFp16(execute_input_, pack_input_, input_plane_, conv_param_->input_channel_); - error_code = LiteBackendParallelLaunch(DeConvFp16Run, this, thread_count_); + error_code = ParallelLaunch(THREAD_POOL_DEFAULT, DeConvFp16Run, this, thread_count_); if (error_code != RET_OK) { MS_LOG(ERROR) << "deconv fp32 run error! 
error_code[" << error_code << "]"; return RET_ERROR; diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/pooling_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/pooling_fp16.cc index dddad9ae9b..80116acb65 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp16/pooling_fp16.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/pooling_fp16.cc @@ -21,6 +21,7 @@ #include "include/errorcode.h" #include "nnacl/op_base.h" #include "nnacl/fp16/cast_fp16.h" +#include "src/runtime/kernel/arm/fp16/common_fp16.h" using mindspore::kernel::KERNEL_ARCH::kCPU; using mindspore::lite::KernelRegistrar; @@ -29,29 +30,6 @@ using mindspore::lite::RET_OK; using mindspore::schema::PrimitiveType_Pooling; namespace mindspore::kernel { -int PoolingFp16CPUKernel::InitBuffer() { - int in_batch = pooling_param_->input_batch_; - int in_h = pooling_param_->input_h_; - int in_w = pooling_param_->input_w_; - int in_channel = pooling_param_->input_channel_; - fp16_input_ = reinterpret_cast(malloc(in_batch * in_h * in_w * in_channel * sizeof(float16_t))); - if (fp16_input_ == nullptr) { - MS_LOG(ERROR) << "malloc fp16_input_ failed."; - return RET_ERROR; - } - - int out_batch = pooling_param_->output_batch_; - int out_h = pooling_param_->output_h_; - int out_w = pooling_param_->output_w_; - int out_channel = pooling_param_->output_channel_; - fp16_output_ = reinterpret_cast(malloc(out_batch * out_h * out_w * out_channel * sizeof(float16_t))); - if (fp16_output_ == nullptr) { - MS_LOG(ERROR) << "fp16_out malloc failed."; - return RET_ERROR; - } - return RET_OK; -} - int PoolingFp16CPUKernel::Init() { auto ret = PoolingBaseCPUKernel::Init(); if (ret != RET_OK) { @@ -71,17 +49,11 @@ int PoolingFp16CPUKernel::ReSize() { MS_LOG(ERROR) << "PoolingBase ReSize fai1!ret: " << ret; return ret; } - - ret = InitBuffer(); - if (ret != RET_OK) { - MS_LOG(ERROR) << "Init Buffer fail!ret: " << ret; - return ret; - } return RET_OK; } int PoolingFp16CPUKernel::RunImpl(int task_id) { - if (pooling_param_->max_pooling_) { + if (pooling_param_->pool_mode_ == PoolMode_MaxPool) { MaxPoolingFp16(fp16_input_, fp16_output_, pooling_param_, task_id); } else { AvgPoolingFp16(fp16_input_, fp16_output_, pooling_param_, task_id); @@ -89,7 +61,7 @@ int PoolingFp16CPUKernel::RunImpl(int task_id) { return RET_OK; } -static int PoolingFp16Impl(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +static int PoolingFp16Impl(void *cdata, int task_id) { auto pooling = reinterpret_cast(cdata); auto error_code = pooling->RunImpl(task_id); if (error_code != RET_OK) { @@ -105,19 +77,32 @@ int PoolingFp16CPUKernel::Run() { MS_LOG(ERROR) << "Prepare fail!ret: " << prepare_ret; return prepare_ret; } - auto ele_num = in_tensors_.front()->ElementsNum(); - auto input_ptr = reinterpret_cast(in_tensors_.at(kInputIndex)->Data()); - Float32ToFloat16(input_ptr, fp16_input_, ele_num); - int error_code = LiteBackendParallelLaunch(PoolingFp16Impl, this, thread_count_); + auto input_tensor = in_tensors_.at(kInputIndex); + auto in_data_type_ = input_tensor->data_type(); + MS_ASSERT(in_data_type_ == kNumberTypeFloat32 || in_data_type_ == kNumberTypeFloat16); + fp16_input_ = ConvertInputFp32toFp16(input_tensor, context_); + + auto out_tensor = out_tensors_.at(kOutputIndex); + auto out_data_type_ = out_tensor->data_type(); + MS_ASSERT(out_data_type_ == kNumberTypeFloat32 || out_data_type_ == kNumberTypeFloat16); + fp16_output_ = MallocOutputFp16(out_tensor, context_); + + int error_code = ParallelLaunch(THREAD_POOL_DEFAULT, PoolingFp16Impl, this, thread_count_); if 
(error_code != RET_OK) { MS_LOG(ERROR) << "pooling error error_code[" << error_code << "]"; return RET_ERROR; } - auto out_ele_num = out_tensors_.front()->ElementsNum(); - auto output_ptr = reinterpret_cast(out_tensors_.at(kOutputIndex)->Data()); - Float16ToFloat32(fp16_output_, output_ptr, out_ele_num); + if (in_data_type_ == kNumberTypeFloat32) { + context_->allocator->Free(fp16_input_); + } + if (out_data_type_ == kNumberTypeFloat32) { + auto out_ele_num = out_tensor->ElementsNum(); + auto output_addr = reinterpret_cast(out_tensor->Data()); + Float16ToFloat32(fp16_output_, output_addr, out_ele_num); + context_->allocator->Free(fp16_output_); + } return RET_OK; } diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/pooling_fp16.h b/mindspore/lite/src/runtime/kernel/arm/fp16/pooling_fp16.h index 2424955aeb..adf2145571 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp16/pooling_fp16.h +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/pooling_fp16.h @@ -28,17 +28,9 @@ class PoolingFp16CPUKernel : public PoolingBaseCPUKernel { const std::vector &outputs, const Context *ctx, const mindspore::lite::PrimitiveC *primitive) : PoolingBaseCPUKernel(parameter, inputs, outputs, ctx, primitive) {} - ~PoolingFp16CPUKernel() override { - if (fp16_input_ != nullptr) { - free(fp16_input_); - } - if (fp16_output_ != nullptr) { - free(fp16_output_); - } - }; + ~PoolingFp16CPUKernel() override = default; int Init() override; - int InitBuffer(); int ReSize() override; int Run() override; int RunImpl(int task_id); diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/reduce_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/reduce_fp16.cc index 3bc9d21ada..5b689b0595 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp16/reduce_fp16.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/reduce_fp16.cc @@ -67,7 +67,7 @@ int ReduceFp16CPUKernel::CallReduceUnit(int task_id) { return ret; } -static int ReduceImpl(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +static int ReduceImpl(void *cdata, int task_id) { auto reduce = reinterpret_cast(cdata); auto error_code = reduce->CallReduceUnit(task_id); if (error_code != RET_OK) { @@ -112,7 +112,7 @@ int ReduceFp16CPUKernel::Run() { inner_size_ *= tmp_shape_[k]; } axis_size_ = tmp_shape_[axis]; - auto error_code = LiteBackendParallelLaunch(ReduceImpl, this, context_->thread_num_); + auto error_code = ParallelLaunch(THREAD_POOL_DEFAULT, ReduceImpl, this, context_->thread_num_); if (error_code != RET_OK) { FreeTmpBuffer(); MS_LOG(ERROR) << "Reduce run error, error_code[" << error_code << "]"; diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/reduce_fp16.h b/mindspore/lite/src/runtime/kernel/arm/fp16/reduce_fp16.h index 9c824947b1..f94102f5ae 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp16/reduce_fp16.h +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/reduce_fp16.h @@ -44,7 +44,6 @@ class ReduceFp16CPUKernel : public ReduceBaseCPUKernel { private: Reducer reducer_ = nullptr; std::vector data_buffers_; - const float *src_data_ = nullptr; float *dst_data_ = nullptr; float16_t *fp16_input_ = nullptr; const float16_t *fp16_src_data_ = nullptr; diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/reshape_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/reshape_fp16.cc index a751ae528f..140779d5ad 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp16/reshape_fp16.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/reshape_fp16.cc @@ -30,7 +30,7 @@ using mindspore::schema::PrimitiveType_Reshape; namespace mindspore::kernel { -int 
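PoolingFp16CPUKernel::Run() above switches to the shared helpers from common_fp16.h: ConvertInputFp32toFp16 hands back the tensor's own buffer when it is already fp16 and a freshly converted allocator buffer otherwise, so the caller frees only what was converted (mirrored on the output side by MallocOutputFp16 plus the Float16ToFloat32 copy-back). A sketch of the input-side contract with simplified types (the real helpers take a lite tensor and the context):

#include <cstddef>
#include <cstdint>
#include <cstdlib>

using float16 = uint16_t;  // storage stand-in for float16_t

struct Allocator {
  void *Malloc(size_t n) { return std::malloc(n); }
  void Free(void *p) { std::free(p); }
};

// Returns a usable fp16 input buffer; *converted tells the caller whether the
// buffer must be returned to the allocator once the kernel has run.
float16 *GetFp16Input(bool input_is_fp16, void *data, size_t elems,
                      Allocator *alloc, bool *converted) {
  if (input_is_fp16) {
    *converted = false;
    return static_cast<float16 *>(data);
  }
  auto *buf = static_cast<float16 *>(alloc->Malloc(elems * sizeof(float16)));
  // Float32ToFloat16(static_cast<float *>(data), buf, elems) would fill buf here.
  *converted = (buf != nullptr);
  return buf;
}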
ReshapeCPUKernel::Run() { +int ReshapeFp16CPUKernel::Run() { auto ret = Prepare(); if (ret != RET_OK) { MS_LOG(ERROR) << "Prepare fail!ret: " << ret; @@ -73,4 +73,31 @@ int ReshapeCPUKernel::Run() { } return RET_OK; } + +kernel::LiteKernel *CpuReshapeFp16KernelCreator(const std::vector &inputs, + const std::vector &outputs, + OpParameter *opParameter, const Context *ctx, + const kernel::KernelKey &desc, + const mindspore::lite::PrimitiveC *primitive) { + if (opParameter == nullptr) { + MS_LOG(ERROR) << "Input opParameter is nullptr!"; + return nullptr; + } + MS_ASSERT(desc.type == schema::PrimitiveType_Reshape); + auto *kernel = new (std::nothrow) ReshapeFp16CPUKernel(opParameter, inputs, outputs, ctx, primitive); + if (kernel == nullptr) { + MS_LOG(ERROR) << "new ReshapeFp16CPUKernel fail!"; + return nullptr; + } + auto ret = kernel->Init(); + if (ret != RET_OK) { + delete kernel; + MS_LOG(ERROR) << "Init kernel failed, name: " << opParameter->name_ << ", type: " + << schema::EnumNamePrimitiveType(static_cast(opParameter->type_)); + return nullptr; + } + return kernel; +} + +REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_Reshape, CpuReshapeFp16KernelCreator) } // namespace mindspore::kernel diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/softmax_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/softmax_fp16.cc new file mode 100644 index 0000000000..edfe40b321 --- /dev/null +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/softmax_fp16.cc @@ -0,0 +1,156 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
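The new CpuReshapeFp16KernelCreator follows the standard lite creator recipe that the Softmax creator below repeats: null-check the parameter, construct with new (std::nothrow), Init(), delete on failure, then register through REG_KERNEL. The recipe in skeleton form (toy types; the real signature also takes tensors, context, desc, and primitive):

#include <cstdio>
#include <new>

struct OpParameter { const char *name_ = ""; };
struct Kernel {
  virtual ~Kernel() = default;
  virtual int Init() { return 0; }
};
struct ReshapeKernel : Kernel {};  // plays ReshapeFp16CPUKernel

Kernel *CreateKernel(OpParameter *param) {
  if (param == nullptr) {
    std::fprintf(stderr, "Input opParameter is nullptr!\n");
    return nullptr;
  }
  auto *kernel = new (std::nothrow) ReshapeKernel();
  if (kernel == nullptr) {
    return nullptr;
  }
  if (kernel->Init() != 0) {
    delete kernel;  // the creator owns the kernel until Init() succeeds
    return nullptr;
  }
  return kernel;
}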
+ */ + +#include +#include +#include "src/runtime/kernel/arm/fp16/softmax_fp16.h" +#include "src/runtime/kernel/arm/fp16/common_fp16.h" +#include "nnacl/fp16/softmax_fp16.h" +#include "nnacl/fp16/cast_fp16.h" +#include "schema/model_generated.h" +#include "src/kernel_registry.h" +#include "include/errorcode.h" + +using mindspore::kernel::KERNEL_ARCH::kCPU; +using mindspore::lite::KernelRegistrar; +using mindspore::lite::RET_ERROR; +using mindspore::lite::RET_OK; +using mindspore::schema::PrimitiveType_SoftMax; + +namespace mindspore::kernel { +int SoftmaxFp16CPUKernel::Init() { + auto ret = SoftmaxBaseCPUKernel::Init(); + if (ret != RET_OK) { + return ret; + } + + if (!InferShapeDone()) { + return RET_OK; + } + return ReSize(); +} + +int SoftmaxFp16CPUKernel::ReSize() { + return SoftmaxBaseCPUKernel::ReSize(); +} + +int SoftmaxFp16CPUKernel::MallocTmpBuffer() { + auto n_dim = softmax_param_->n_dim_; + auto axis = softmax_param_->axis_; + if (axis == -1) { + softmax_param_->axis_ += n_dim; + axis = softmax_param_->axis_; + } + auto in_shape = in_tensors_.front()->shape(); + int out_plane_size = 1; + for (int i = 0; i < axis; ++i) { + out_plane_size *= in_shape[i]; + } + int in_plane_size = 1; + for (int i = axis + 1; i < n_dim; i++) { + in_plane_size *= in_shape[i]; + } + + sum_data_ = + reinterpret_cast(context_->allocator->Malloc(out_plane_size * in_plane_size * sizeof(float16_t))); + if (sum_data_ == nullptr) { + MS_LOG(ERROR) << "malloc data for softmax fail!"; + return RET_ERROR; + } + memset(sum_data_, 0, out_plane_size * in_plane_size * sizeof(float16_t)); + + input_fp16_ = ConvertInputFp32toFp16(in_tensors_.at(kInputIndex), context_); + if (input_fp16_ == nullptr) { + MS_LOG(ERROR) << "malloc data failed"; + return RET_ERROR; + } + output_fp16_ = MallocOutputFp16(out_tensors_.at(kOutputIndex), context_); + if (output_fp16_ == nullptr) { + MS_LOG(ERROR) << "malloc data failed"; + return RET_ERROR; + } + return RET_OK; +} + +void SoftmaxFp16CPUKernel::FreeTmpBuffer() { + if (sum_data_ != nullptr) { + context_->allocator->Free(sum_data_); + sum_data_ = nullptr; + } + if (in_tensors_.at(kInputIndex)->data_type() == kNumberTypeFloat32) { + if (input_fp16_ != nullptr) { + context_->allocator->Free(input_fp16_); + input_fp16_ = nullptr; + } + } + + if (out_tensors_.at(kOutputIndex)->data_type() == kNumberTypeFloat32) { + if (output_fp16_ != nullptr) { + context_->allocator->Free(output_fp16_); + output_fp16_ = nullptr; + } + } +} + +int SoftmaxFp16CPUKernel::Run() { + auto ret = Prepare(); + if (ret != RET_OK) { + MS_LOG(ERROR) << "Prepare fail!ret: " << ret; + return RET_ERROR; + } + ret = MallocTmpBuffer(); + if (ret != RET_OK) { + FreeTmpBuffer(); + MS_LOG(ERROR) << "MallocTmpBuffer failed"; + return RET_ERROR; + } + SoftmaxFp16(input_fp16_, output_fp16_, sum_data_, softmax_param_); + auto out_tensor = out_tensors_.at(kOutputIndex); + if (out_tensor->data_type() == kNumberTypeFloat32) { + Float16ToFloat32(output_fp16_, reinterpret_cast(out_tensor->Data()), out_tensor->ElementsNum()); + } + FreeTmpBuffer(); + return RET_OK; +} + +kernel::LiteKernel *CpuSoftmaxFp16KernelCreator(const std::vector &inputs, + const std::vector &outputs, + OpParameter *opParameter, const lite::Context *ctx, + const kernel::KernelKey &desc, + const mindspore::lite::PrimitiveC *primitive) { + if (opParameter == nullptr) { + MS_LOG(ERROR) << "Input opParameter is nullptr!"; + return nullptr; + } + MS_ASSERT(desc.type == schema::PrimitiveType_SoftMax); + auto *kernel = new (std::nothrow) 
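SoftmaxFp16CPUKernel::MallocTmpBuffer() above sizes sum_data_ as the product of the dimensions on either side of the softmax axis, after normalizing a -1 axis. The same computation in isolation (one running sum per independent softmax row):

#include <cstddef>
#include <vector>

size_t SoftmaxSumElems(const std::vector<int> &shape, int axis) {
  int n_dim = static_cast<int>(shape.size());
  if (axis == -1) {
    axis += n_dim;  // same normalization the kernel applies
  }
  size_t out_plane = 1;
  for (int i = 0; i < axis; ++i) out_plane *= static_cast<size_t>(shape[i]);
  size_t in_plane = 1;
  for (int i = axis + 1; i < n_dim; ++i) in_plane *= static_cast<size_t>(shape[i]);
  return out_plane * in_plane;
}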
SoftmaxFp16CPUKernel(opParameter, inputs, outputs, ctx, primitive); + if (kernel == nullptr) { + MS_LOG(ERROR) << "new SoftmaxFp16CPUKernel fail!"; + return nullptr; + } + auto ret = kernel->Init(); + if (ret != RET_OK) { + delete kernel; + MS_LOG(ERROR) << "Init kernel failed, name: " << opParameter->name_ << ", type: " + << schema::EnumNamePrimitiveType(static_cast(opParameter->type_)); + return nullptr; + } + return kernel; +} + +REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_SoftMax, CpuSoftmaxFp16KernelCreator) + +} // namespace mindspore::kernel diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/softmax_fp16.h b/mindspore/lite/src/runtime/kernel/arm/fp16/softmax_fp16.h new file mode 100644 index 0000000000..669a595c2d --- /dev/null +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/softmax_fp16.h @@ -0,0 +1,47 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_SOFTMAX_FP16_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_SOFTMAX_FP16_H_ + +#include +#include +#include "src/lite_kernel.h" +#include "src/runtime/kernel/arm/base/softmax_base.h" + +namespace mindspore::kernel { +class SoftmaxFp16CPUKernel : public SoftmaxBaseCPUKernel { + public: + SoftmaxFp16CPUKernel(OpParameter *parameter, const std::vector &inputs, + const std::vector &outputs, const lite::Context *ctx, + const mindspore::lite::PrimitiveC *primitive) + : SoftmaxBaseCPUKernel(parameter, inputs, outputs, ctx, primitive), sum_data_(nullptr) {} + ~SoftmaxFp16CPUKernel() = default; + + int Init() override; + int ReSize() override; + int Run() override; + int MallocTmpBuffer(); + void FreeTmpBuffer(); + + private: + float16_t *sum_data_ = nullptr; + float16_t *input_fp16_ = nullptr; + float16_t *output_fp16_ = nullptr; +}; +} // namespace mindspore::kernel + +#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_SOFTMAX_FP16_H_ diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/split_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/split_fp16.cc index 5174e10baf..3a4e9f41cc 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp16/split_fp16.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/split_fp16.cc @@ -63,7 +63,7 @@ int SplitFp16CPUKernel::Split(int task_id) { return RET_OK; } -static int SplitRun(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +static int SplitRun(void *cdata, int task_id) { auto g_kernel = reinterpret_cast(cdata); auto ret = g_kernel->Split(task_id); if (ret != RET_OK) { @@ -97,7 +97,7 @@ int SplitFp16CPUKernel::Run() { output_ptr_[i] = reinterpret_cast(out_tensors_.at(i)->Data()); } } - ret = LiteBackendParallelLaunch(SplitRun, this, thread_n_num_); + ret = ParallelLaunch(THREAD_POOL_DEFAULT, SplitRun, this, thread_n_num_); if (ret != RET_OK) { MS_LOG(ERROR) << "split error error_code[" << ret << "]"; return RET_ERROR; @@ -111,8 +111,8 @@ int SplitFp16CPUKernel::Run() { context_->allocator->Free(output_ptr_[i]); output_ptr_[i] = nullptr; } - return 
RET_OK; } + return RET_OK; } kernel::LiteKernel *CpuSplitFp16KernelCreator(const std::vector &inputs, diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/transpose_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/transpose_fp16.cc index 0333c36870..eca0714e18 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp16/transpose_fp16.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/transpose_fp16.cc @@ -30,10 +30,6 @@ using mindspore::lite::RET_OP_EXECUTE_FAILURE; using mindspore::schema::PrimitiveType_Transpose; namespace mindspore::kernel { -namespace { -constexpr int kTransposeInputNum = 1; -constexpr int kTransposeOutputNum = 1; -} // namespace int TransposeFp16CPUKernel::Init() { TransposeParameter *param = reinterpret_cast(this->op_parameter_); num_unit_ = static_cast(in_tensors_[kInputIndex]->shape().at(param->perm_[kNHWC_H])); @@ -121,7 +117,7 @@ int TransposeFp16CPUKernel::TransposeParallel(int task_id) { return RET_OK; } -static int TransposeRun(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +static int TransposeRun(void *cdata, int task_id) { auto g_kernel = reinterpret_cast(cdata); auto ret = g_kernel->TransposeParallel(task_id); if (ret != RET_OK) { @@ -166,7 +162,7 @@ int TransposeFp16CPUKernel::Run() { in_shape_ = const_cast(in_tensor->shape().data()); out_shape_ = const_cast(out_tensor->shape().data()); - ret = LiteBackendParallelLaunch(TransposeRun, this, thread_h_num_); + ret = ParallelLaunch(THREAD_POOL_DEFAULT, TransposeRun, this, thread_h_num_); if (ret != RET_OK) { MS_LOG(ERROR) << "Tranpose error error_code[" << ret << "]"; FreeFp16Buffer(); diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/activation.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/activation.cc index 41b76206a9..3b61a0c7ca 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/activation.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/activation.cc @@ -67,7 +67,7 @@ int ActivationCPUKernel::DoActivation(int task_id) { return RET_OK; } -int ActivationRun(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +int ActivationRun(void *cdata, int task_id) { auto activation_kernel = reinterpret_cast(cdata); auto error_code = activation_kernel->DoActivation(task_id); if (error_code != RET_OK) { @@ -83,7 +83,7 @@ int ActivationCPUKernel::Run() { MS_LOG(ERROR) << "Prepare failed."; return ret; } - int error_code = LiteBackendParallelLaunch(ActivationRun, this, thread_count_); + int error_code = ParallelLaunch(THREAD_POOL_DEFAULT, ActivationRun, this, thread_count_); if (error_code != RET_OK) { MS_LOG(ERROR) << "Activation function error error_code[" << error_code << "]"; return RET_ERROR; diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/addn.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/addn.cc index 67e1b24697..c5cb1b6d07 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/addn.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/addn.cc @@ -28,13 +28,13 @@ using mindspore::schema::PrimitiveType_AddN; namespace mindspore::kernel { namespace { -int AddNLaunch(int thread_id, LiteParallelGroupEnv *penv, void *cdata) { +int AddNLaunch(void *cdata, int task_id) { if (cdata == nullptr) { MS_LOG(ERROR) << "Input cdata is nullptr!"; return RET_NULL_PTR; } auto kernel = reinterpret_cast(cdata); - return kernel->AddNParallelRun(thread_id); + return kernel->AddNParallelRun(task_id); } } // namespace @@ -74,7 +74,7 @@ int AddNCPUKernel::Run() { in1_addr_ = input0_data; in2_addr_ = input1_data; out_addr_ = output_data; - ret = LiteBackendParallelLaunch(AddNLaunch, this, 
op_parameter_->thread_num_); + ret = ParallelLaunch(THREAD_POOL_DEFAULT, AddNLaunch, this, op_parameter_->thread_num_); if (ret != RET_OK) { MS_LOG(ERROR) << "addn launch fail!ret: " << ret; return RET_ERROR; @@ -82,7 +82,7 @@ int AddNCPUKernel::Run() { for (size_t i = 2; i < in_tensors_.size(); ++i) { in1_addr_ = reinterpret_cast(in_tensors_[i]->Data()); in2_addr_ = output_data; - ret = LiteBackendParallelLaunch(AddNLaunch, this, op_parameter_->thread_num_); + ret = ParallelLaunch(THREAD_POOL_DEFAULT, AddNLaunch, this, op_parameter_->thread_num_); if (ret != RET_OK) { MS_LOG(ERROR) << "addn launch fail!ret: " << ret << ", input index: " << i; return RET_ERROR; diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/arithmetic.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/arithmetic.cc index 2f56c5d9e9..6a72842ce5 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/arithmetic.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/arithmetic.cc @@ -29,6 +29,9 @@ using mindspore::lite::RET_OK; using mindspore::schema::PrimitiveType_Eltwise; namespace mindspore::kernel { + +ArithmeticCPUKernel::~ArithmeticCPUKernel() {} + int ArithmeticCPUKernel::Init() { if (!InferShapeDone()) { return RET_OK; @@ -42,23 +45,77 @@ int ArithmeticCPUKernel::ReSize() { arithmeticParameter_->out_elements_num_ = out_tensors_[0]->ElementsNum(); if (arithmeticParameter_->in_elements_num0_ == 1 || arithmeticParameter_->in_elements_num1_ == 1) { - if (arithmeticParameter_->activation_type_ == schema::ActivationType_NO_ACTIVATION) { - switch (arithmeticParameter_->op_parameter_.type_) { - case PrimitiveType_Mul: - arithmeticParameter_->broadcasting_ = false; - arithmetic_opt_run_ = ElementOptMul; - break; - case PrimitiveType_Add: - arithmeticParameter_->broadcasting_ = false; - arithmetic_opt_run_ = ElementOptAdd; - break; - case PrimitiveType_Sub: - arithmeticParameter_->broadcasting_ = false; - arithmetic_opt_run_ = ElementOptSub; - break; - default: - break; - } + switch (arithmeticParameter_->op_parameter_.type_) { + case PrimitiveType_Mul: + switch (arithmeticParameter_->activation_type_) { + case schema::ActivationType_RELU: + arithmeticParameter_->broadcasting_ = false; + arithmetic_opt_run_ = ElementOptMulRelu; + break; + case schema::ActivationType_RELU6: + arithmeticParameter_->broadcasting_ = false; + arithmetic_opt_run_ = ElementOptMulRelu6; + break; + default: + arithmeticParameter_->broadcasting_ = false; + arithmetic_opt_run_ = ElementOptMul; + break; + } + break; + case PrimitiveType_Add: + switch (arithmeticParameter_->activation_type_) { + case schema::ActivationType_RELU: + arithmeticParameter_->broadcasting_ = false; + arithmetic_opt_run_ = ElementOptAddRelu; + break; + case schema::ActivationType_RELU6: + arithmeticParameter_->broadcasting_ = false; + arithmetic_opt_run_ = ElementOptAddRelu6; + break; + default: + arithmeticParameter_->broadcasting_ = false; + arithmetic_opt_run_ = ElementOptAdd; + break; + } + break; + case PrimitiveType_Sub: + switch (arithmeticParameter_->activation_type_) { + case schema::ActivationType_RELU: + arithmeticParameter_->broadcasting_ = false; + arithmetic_opt_run_ = ElementOptSubRelu; + break; + case schema::ActivationType_RELU6: + arithmeticParameter_->broadcasting_ = false; + arithmetic_opt_run_ = ElementOptSubRelu6; + break; + default: + arithmeticParameter_->broadcasting_ = false; + arithmetic_opt_run_ = ElementOptSub; + break; + } + break; + default: + break; + } + } + return RET_OK; +} + +int ArithmeticCPUKernel::BroadcastRun(float *input0, float *input1, 
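The widened ReSize() switch above is a two-level dispatch, operator type crossed with fused activation, landing on one of nine ElementOpt* kernels. One row of that table, sketched with plain scalar loops standing in for the vectorized nnacl routines (whose real signatures also carry an ArithmeticParameter):

using OptRun = int (*)(const float *, const float *, float *, int);

static int AddNone(const float *a, const float *b, float *out, int n) {
  for (int i = 0; i < n; ++i) out[i] = a[i] + b[i];
  return 0;
}
static int AddRelu(const float *a, const float *b, float *out, int n) {
  for (int i = 0; i < n; ++i) {
    float v = a[i] + b[i];
    out[i] = v > 0.0f ? v : 0.0f;
  }
  return 0;
}
static int AddRelu6(const float *a, const float *b, float *out, int n) {
  for (int i = 0; i < n; ++i) {
    float v = a[i] + b[i];
    out[i] = v < 0.0f ? 0.0f : (v > 6.0f ? 6.0f : v);
  }
  return 0;
}

enum FusedAct { kActNone, kActRelu, kActRelu6 };

// Mirrors the PrimitiveType_Add arm of the switch; the Mul and Sub arms have
// the same shape with their own kernel triples.
OptRun PickAddKernel(FusedAct act) {
  switch (act) {
    case kActRelu:  return AddRelu;   // ElementOptAddRelu
    case kActRelu6: return AddRelu6;  // ElementOptAddRelu6
    default:        return AddNone;   // ElementOptAdd
  }
}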
float *output, int dim, int out_count, + int out_thread_stride) { + if (dim > break_pos_) { + return arithmetic_run_(input0 + out_thread_stride, input1 + out_thread_stride, output + out_thread_stride, + out_count); + } + for (int i = 0; i < arithmeticParameter_->out_shape_[dim]; ++i) { + int pos0_ = arithmeticParameter_->in_shape0_[dim] == 1 ? 0 : i; + int pos1_ = arithmeticParameter_->in_shape1_[dim] == 1 ? 0 : i; + int error_code = + BroadcastRun(input0 + pos0_ * arithmeticParameter_->in_strides0_[dim], + input1 + pos1_ * arithmeticParameter_->in_strides1_[dim], + output + i * arithmeticParameter_->out_strides_[dim], dim + 1, out_count, out_thread_stride); + if (error_code != RET_OK) { + return error_code; } } return RET_OK; @@ -81,8 +138,10 @@ int ArithmeticCPUKernel::DoArithmetic(int task_id) { int error_code = RET_OK; if (arithmeticParameter_->broadcasting_) { - error_code = arithmetic_run_(tile_data0_ + stride * task_id, tile_data1_ + stride * task_id, - output_data + stride * task_id, count); + stride = UP_DIV(outside_, thread_count_); + out_count_ = MSMIN(stride, outside_ - stride * task_id); + out_thread_stride_ = stride * task_id; + error_code = BroadcastRun(input0_data, input1_data1, output_data, 0, out_count_, out_thread_stride_); } else if (arithmetic_opt_run_ != nullptr) { if (arithmeticParameter_->in_elements_num0_ == 1) { error_code = arithmetic_opt_run_(input0_data, input1_data1 + stride * task_id, output_data + stride * task_id, @@ -104,7 +163,7 @@ int ArithmeticCPUKernel::DoArithmetic(int task_id) { return RET_OK; } -int ArithmeticsRun(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +int ArithmeticsRun(void *cdata, int task_id) { auto arithmetic_kernel = reinterpret_cast(cdata); auto error_code = arithmetic_kernel->DoArithmetic(task_id); if (error_code != RET_OK) { @@ -120,31 +179,27 @@ int ArithmeticCPUKernel::Run() { MS_LOG(ERROR) << "Prepare fail!ret: " << ret; return ret; } - if (arithmeticParameter_->broadcasting_) { - auto input_data0 = reinterpret_cast(in_tensors_[0]->Data()); - auto input_data1 = reinterpret_cast(in_tensors_[1]->Data()); - auto length = arithmeticParameter_->out_elements_num_ * sizeof(float); - MS_ASSERT(context_->allocator != nullptr); - tile_data0_ = reinterpret_cast(context_->allocator->Malloc(length)); - tile_data1_ = reinterpret_cast(context_->allocator->Malloc(length)); - if (tile_data0_ == nullptr || tile_data1_ == nullptr) { - MS_LOG(ERROR) << "Memory allocation failed"; - context_->allocator->Free(tile_data0_); - context_->allocator->Free(tile_data1_); - return RET_ERROR; + outside_ = 1; + for (auto i = arithmeticParameter_->ndim_ - 1; i >= 0; --i) { + if (arithmeticParameter_->in_shape0_[i] != arithmeticParameter_->in_shape1_[i]) { + break_pos_ = i; + break; + } + outside_ *= arithmeticParameter_->out_shape_[i]; } - TileDimensions(input_data0, input_data1, tile_data0_, tile_data1_, arithmeticParameter_); + ComputeStrides(arithmeticParameter_->in_shape0_, arithmeticParameter_->in_strides0_, arithmeticParameter_->ndim_); + ComputeStrides(arithmeticParameter_->in_shape1_, arithmeticParameter_->in_strides1_, arithmeticParameter_->ndim_); + ComputeStrides(arithmeticParameter_->out_shape_, arithmeticParameter_->out_strides_, arithmeticParameter_->ndim_); } - ret = LiteBackendParallelLaunch(ArithmeticsRun, this, thread_count_); - if (arithmeticParameter_->broadcasting_) { - context_->allocator->Free(tile_data0_); - context_->allocator->Free(tile_data1_); - } - if (ret != RET_OK) { - MS_LOG(ERROR) << "Arithmetic function error 
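The stride tables BroadcastRun indexes are filled by the ComputeStrides calls above. Under the usual row-major convention, where each stride is the product of all dimensions to its right, a reference version looks like this (sketch; the NNACL helper is assumed to match):

    void ComputeStridesSketch(const int *shape, int *strides, int ndim) {
      int stride = 1;
      for (int i = ndim - 1; i >= 0; --i) {
        strides[i] = stride;   // strides[i] = product of shape[i+1..ndim-1]
        stride *= shape[i];
      }
    }

For in_shape0_ = {2, 3, 4} this yields strides {12, 4, 1}, so pos0_ * in_strides0_[dim] jumps straight to the sub-tensor the recursion descends into.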
error_code[" << ret << "]"; + + int error_code = ParallelLaunch(THREAD_POOL_DEFAULT, ArithmeticsRun, this, thread_count_); + + if (error_code != RET_OK) { + MS_LOG(ERROR) << "Arithmetic function error error_code[" << error_code << "]"; + return RET_ERROR; } - return ret; + return RET_OK; } kernel::LiteKernel *CpuArithmeticFp32KernelCreator(const std::vector &inputs, diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/arithmetic.h b/mindspore/lite/src/runtime/kernel/arm/fp32/arithmetic.h index f4b42a25a1..c55bf35bfa 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/arithmetic.h +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/arithmetic.h @@ -45,8 +45,6 @@ class ArithmeticCPUKernel : public LiteKernel { typedef int (*ArithmeticRun)(float *input0, float *input1, float *output, int element_size); typedef int (*ArithmeticOptRun)(float *input0, float *input1, float *output, int element_size, ArithmeticParameter *param); - typedef int (*ArithmeticBroadcastRun)(float *input0, float *input1, float *tile_input0, float *tile_input1, - float *output, int element_size, ArithmeticParameter *param); public: ArithmeticCPUKernel(OpParameter *parameter, const std::vector &inputs, @@ -109,64 +107,50 @@ class ArithmeticCPUKernel : public LiteKernel { break; case PrimitiveType_LogicalAnd: arithmetic_run_ = ElementLogicalAnd; - arithmetic_broadcast_run_ = BroadcastLogicalAnd; break; case PrimitiveType_LogicalOr: arithmetic_run_ = ElementLogicalOr; - arithmetic_broadcast_run_ = BroadcastLogicalOr; break; case PrimitiveType_Maximum: arithmetic_run_ = ElementMaximum; - arithmetic_broadcast_run_ = BroadcastMaximum; break; case PrimitiveType_Minimum: arithmetic_run_ = ElementMinimum; - arithmetic_broadcast_run_ = BroadcastMinimum; break; case PrimitiveType_FloorDiv: arithmetic_run_ = ElementFloorDiv; - arithmetic_broadcast_run_ = BroadcastFloorDiv; break; case PrimitiveType_FloorMod: arithmetic_run_ = ElementFloorMod; - arithmetic_broadcast_run_ = BroadcastFloorMod; break; case PrimitiveType_Equal: arithmetic_run_ = ElementEqual; - arithmetic_broadcast_run_ = BroadcastEqual; break; case PrimitiveType_NotEqual: arithmetic_run_ = ElementNotEqual; - arithmetic_broadcast_run_ = BroadcastNotEqual; break; case PrimitiveType_Less: arithmetic_run_ = ElementLess; - arithmetic_broadcast_run_ = BroadcastLess; break; case PrimitiveType_LessEqual: arithmetic_run_ = ElementLessEqual; - arithmetic_broadcast_run_ = BroadcastLessEqual; break; case PrimitiveType_Greater: arithmetic_run_ = ElementGreater; - arithmetic_broadcast_run_ = BroadcastGreater; break; case PrimitiveType_GreaterEqual: arithmetic_run_ = ElementGreaterEqual; - arithmetic_broadcast_run_ = BroadcastGreaterEqual; break; case PrimitiveType_SquaredDifference: arithmetic_run_ = ElementSquaredDifference; - arithmetic_broadcast_run_ = BroadcastSquaredDifference; break; default: MS_LOG(ERROR) << "Error Operator type " << parameter->type_; arithmetic_run_ = nullptr; - arithmetic_broadcast_run_ = nullptr; break; } } - ~ArithmeticCPUKernel() = default; + ~ArithmeticCPUKernel() override; int Init() override; int ReSize() override; @@ -174,12 +158,14 @@ class ArithmeticCPUKernel : public LiteKernel { int DoArithmetic(int task_id); private: + int BroadcastRun(float *input0, float *input1, float *output, int dim, int out_count, int out_thread_stride); + int break_pos_; + int outside_; + int out_thread_stride_; + int out_count_; int thread_count_; - float *tile_data0_ = nullptr; - float *tile_data1_ = nullptr; ArithmeticParameter *arithmeticParameter_; ArithmeticRun 
arithmetic_run_ = nullptr; - ArithmeticBroadcastRun arithmetic_broadcast_run_ = nullptr; ArithmeticOptRun arithmetic_opt_run_ = nullptr; }; } // namespace mindspore::kernel diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/arithmetic_self.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/arithmetic_self.cc index 57fd294072..75d568b609 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/arithmetic_self.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/arithmetic_self.cc @@ -41,7 +41,7 @@ int ArithmeticSelfCPUKernel::ReSize() { return RET_OK; } -int ArithmeticSelfRuns(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +int ArithmeticSelfRuns(void *cdata, int task_id) { auto g_kernel = reinterpret_cast(cdata); auto ret = g_kernel->DoArithmeticSelf(task_id); if (ret != RET_OK) { @@ -80,7 +80,7 @@ int ArithmeticSelfCPUKernel::Run() { auto out_tensor = out_tensors_.at(0); in_ptr_ = reinterpret_cast(input_tensor->Data()); out_ptr_ = reinterpret_cast(out_tensor->Data()); - ret = LiteBackendParallelLaunch(ArithmeticSelfRuns, this, thread_sz_count_); + ret = ParallelLaunch(THREAD_POOL_DEFAULT, ArithmeticSelfRuns, this, thread_sz_count_); if (ret != RET_OK) { MS_LOG(ERROR) << "ArithmeticSelfRun error error_code[" << ret << "]"; return ret; diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/arithmetic_self.h b/mindspore/lite/src/runtime/kernel/arm/fp32/arithmetic_self.h index 4fb1b5ff97..18fbd93d6d 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/arithmetic_self.h +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/arithmetic_self.h @@ -46,7 +46,7 @@ class ArithmeticSelfCPUKernel : public LiteKernel { explicit ArithmeticSelfCPUKernel(OpParameter *parameter, const std::vector &inputs, const std::vector &outputs, const lite::Context *ctx, const mindspore::lite::PrimitiveC *primitive) - : LiteKernel(parameter, inputs, outputs, ctx, primitive), ctx_(ctx), thread_count_(ctx->thread_num_) { + : LiteKernel(parameter, inputs, outputs, ctx, primitive), thread_count_(ctx->thread_num_) { switch (parameter->type_) { case PrimitiveType_Abs: arithmeticSelf_run_ = ElementAbs; @@ -102,7 +102,6 @@ class ArithmeticSelfCPUKernel : public LiteKernel { size_t data_size_; ArithmeticSelfParameter *arithmeticSelfParameter_; ArithmeticSelfRun arithmeticSelf_run_; - const Context *ctx_; int thread_count_; float *in_ptr_; float *out_ptr_; diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/batchnorm.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/batchnorm.cc index 6bfa90c763..050b868d63 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/batchnorm.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/batchnorm.cc @@ -75,7 +75,7 @@ int BatchnormCPUKernel::Run() { MS_LOG(ERROR) << "Prepare fail! 
Ret error code: " << ret; return ret; } - ret = LiteBackendParallelLaunch(BatchNormRun, this, op_parameter_->thread_num_); + ret = ParallelLaunch(THREAD_POOL_DEFAULT, BatchNormRun, this, op_parameter_->thread_num_); if (ret != RET_OK) { MS_LOG(ERROR) << "BatchnormRun error error_code[" << ret << "]"; } @@ -88,7 +88,7 @@ int BatchnormCPUKernel::DoExecute(int task_id) { return mindspore::lite::RET_OK; } -int BatchNormRun(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +int BatchNormRun(void *cdata, int task_id) { auto kernel = reinterpret_cast(cdata); auto ret = kernel->DoExecute(task_id); if (ret != RET_OK) { diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/batchnorm.h b/mindspore/lite/src/runtime/kernel/arm/fp32/batchnorm.h index 3261f4a06f..e759058618 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/batchnorm.h +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/batchnorm.h @@ -48,7 +48,7 @@ class BatchnormCPUKernel : public LiteKernel { void *variance_ = nullptr; }; -int BatchNormRun(int task_id, LiteParallelGroupEnv *penv, void *cdata); +int BatchNormRun(void *cdata, int task_id); } // namespace mindspore::kernel #endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_BATCHNORM_H_ diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/cast.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/cast.cc index 2e984644bf..4d10d0fb81 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/cast.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/cast.cc @@ -30,13 +30,13 @@ using mindspore::schema::PrimitiveType_Cast; namespace mindspore::kernel { namespace { -int CastRun(int thread_id, LiteParallelGroupEnv *penv, void *cdata) { +int CastRun(void *cdata, int task_id) { if (cdata == nullptr) { MS_LOG(ERROR) << "input cdata is nullptr!"; return RET_ERROR; } - return reinterpret_cast(cdata)->DoCast(thread_id); + return reinterpret_cast(cdata)->DoCast(task_id); } } // namespace @@ -111,7 +111,7 @@ int CastCPUKernel::Run() { if (data_num_ == 0) { return RET_OK; } - return LiteBackendParallelLaunch(CastRun, this, op_parameter_->thread_num_); + return ParallelLaunch(THREAD_POOL_DEFAULT, CastRun, this, op_parameter_->thread_num_); } kernel::LiteKernel *CpuCastFp32KernelCreator(const std::vector &inputs, diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/constant_of_shape.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/constant_of_shape.cc index 5e353cb7a8..b3330d9479 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/constant_of_shape.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/constant_of_shape.cc @@ -28,12 +28,6 @@ using mindspore::lite::RET_OK; using mindspore::schema::PrimitiveType_ConstantOfShape; namespace mindspore::kernel { - -namespace { -constexpr int kInputNum = 1; -constexpr int kOutputNum = 1; -} // namespace - int ConstantOfShapeCPUKernel::Init() { return RET_OK; } int ConstantOfShapeCPUKernel::ReSize() { return RET_OK; } @@ -47,7 +41,7 @@ int ConstantOfShapeCPUKernel::DoExecute(int task_id) { return RET_OK; } -int ConstantOfShapeRun(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +int ConstantOfShapeRun(void *cdata, int task_id) { auto g_kernel = reinterpret_cast(cdata); auto ret = g_kernel->DoExecute(task_id); if (ret != RET_OK) { @@ -68,7 +62,7 @@ int ConstantOfShapeCPUKernel::Run() { param_->unit_ = UP_DIV(param_->element_sz_, thread_num); param_->op_parameter_.thread_num_ = thread_num; out_ptr_ = reinterpret_cast(out_tensors_.front()->Data()); - auto ret = LiteBackendParallelLaunch(ConstantOfShapeRun, this, thread_num); + auto ret = 
ParallelLaunch(THREAD_POOL_DEFAULT, ConstantOfShapeRun, this, thread_num); if (ret != RET_OK) { MS_LOG(ERROR) << "ConstantOfShapeRun error error_code[" << ret << "]"; return ret; diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution.cc index 7d83c4fba1..c5252fdc47 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution.cc @@ -83,7 +83,28 @@ int ConvolutionCPUKernel::InitTmpBuffer() { int out_channel = conv_param_->output_channel_; MS_ASSERT(ctx_->allocator != nullptr); - tmp_output_block_ = reinterpret_cast(ctx_->allocator->Malloc(TILE_NUM * out_channel * sizeof(float))); + int ic4 = UP_DIV(conv_param_->input_channel_, C4NUM); + size_t nhwc4_input_size = + ic4 * C4NUM * conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * sizeof(float); + nhwc4_input_ = ctx_->allocator->Malloc(nhwc4_input_size); + if (nhwc4_input_ == nullptr) { + MS_LOG(ERROR) << "malloc nhwc4 input failed."; + return RET_ERROR; + } + + int output_count = conv_param_->output_h_ * conv_param_->output_w_; + int output_tile_count = UP_DIV(output_count, TILE_NUM); + int unit_size = conv_param_->kernel_h_ * conv_param_->kernel_w_ * ic4 * C4NUM; + int packed_input_size = output_tile_count * TILE_NUM * unit_size; + packed_input_ = + reinterpret_cast(ctx_->allocator->Malloc(conv_param_->input_batch_ * packed_input_size * sizeof(float))); + if (packed_input_ == nullptr) { + MS_LOG(ERROR) << "malloc packed input failed."; + return RET_ERROR; + } + + tmp_output_block_ = + reinterpret_cast(ctx_->allocator->Malloc(thread_count_ * TILE_NUM * out_channel * sizeof(float))); if (tmp_output_block_ == nullptr) { MS_LOG(ERROR) << "malloc tmp output block failed."; return RET_ERROR; @@ -104,14 +125,14 @@ void ConvolutionCPUKernel::ConfigInputOutput() { } int ConvolutionCPUKernel::Init() { - if (!InferShapeDone()) { - return RET_OK; - } auto ret = InitWeightBias(); if (ret != RET_OK) { MS_LOG(ERROR) << "Init weight bias failed."; return RET_ERROR; } + if (!InferShapeDone()) { + return RET_OK; + } ConfigInputOutput(); return ReSize(); } @@ -123,40 +144,11 @@ int ConvolutionCPUKernel::ReSize() { return ret; } - if (nhwc4_input_ != nullptr) { - free(nhwc4_input_); - nhwc4_input_ = nullptr; - } - if (packed_input_ != nullptr) { - free(packed_input_); - packed_input_ = nullptr; - } ret = ConvolutionBaseCPUKernel::Init(); if (ret != RET_OK) { MS_LOG(ERROR) << "ConvolutionBase init failed."; return RET_ERROR; } - - int ic4 = UP_DIV(conv_param_->input_channel_, C4NUM); - size_t nhwc4_input_size = - ic4 * C4NUM * conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * sizeof(float); - nhwc4_input_ = malloc(nhwc4_input_size); - if (nhwc4_input_ == nullptr) { - MS_LOG(ERROR) << "malloc nhwc4 input failed."; - return RET_ERROR; - } - memset(nhwc4_input_, 0, nhwc4_input_size); - - int output_count = conv_param_->output_h_ * conv_param_->output_w_; - int output_tile_count = UP_DIV(output_count, TILE_NUM); - int unit_size = conv_param_->kernel_h_ * conv_param_->kernel_w_ * ic4 * C4NUM; - int packed_input_size = output_tile_count * TILE_NUM * unit_size; - packed_input_ = reinterpret_cast(malloc(conv_param_->input_batch_ * packed_input_size * sizeof(float))); - if (packed_input_ == nullptr) { - MS_LOG(ERROR) << "malloc packed input failed."; - return RET_ERROR; - } - memset(packed_input_, 0, conv_param_->input_batch_ * packed_input_size * sizeof(float)); return RET_OK; } 
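The buffer sizes above lean on the channel rounding macros; assuming the usual definition from nnacl/op_base.h, a quick worked example of the NHWC4 footprint:

    #define UP_DIV(x, y) (((x) + (y) - 1) / (y))   // assumed to match nnacl/op_base.h
    // input_channel_ = 10 -> ic4 = UP_DIV(10, 4) = 3 channel blocks, so each pixel
    // stores ic4 * C4NUM = 12 floats (2 of them zero padding), and
    // nhwc4_input_size = 12 * input_batch_ * input_h_ * input_w_ * sizeof(float).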
@@ -171,7 +163,7 @@ int ConvolutionCPUKernel::RunImpl(int task_id) { return RET_OK; } -int ConvolutionImpl(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +int ConvolutionImpl(void *cdata, int task_id) { auto conv = reinterpret_cast(cdata); auto error_code = conv->RunImpl(task_id); if (error_code != RET_OK) { @@ -199,7 +191,7 @@ int ConvolutionCPUKernel::Run() { PackNHWCToNHWC4Fp32(ori_input_data, nhwc4_input_, conv_param_->input_batch_, conv_param_->input_h_ * conv_param_->input_w_, conv_param_->input_channel_); - int error_code = LiteBackendParallelLaunch(ConvolutionImpl, this, thread_count_); + int error_code = ParallelLaunch(THREAD_POOL_DEFAULT, ConvolutionImpl, this, thread_count_); if (error_code != RET_OK) { MS_LOG(ERROR) << "conv error error_code[" << error_code << "]"; FreeTmpBuffer(); @@ -232,34 +224,31 @@ kernel::LiteKernel *CpuConvFp32KernelCreator(const std::vector(op_parameter); int kernel_h = conv_param->kernel_h_; int kernel_w = conv_param->kernel_w_; - int stride_h = conv_param->stride_h_; - int stride_w = conv_param->stride_w_; - int dilation_h = conv_param->dilation_h_; - int dilation_w = conv_param->dilation_w_; conv_param->input_h_ = inputs.front()->Height(); conv_param->input_w_ = inputs.front()->Width(); + conv_param->input_channel_ = inputs.front()->Channel(); conv_param->output_h_ = outputs.front()->Height(); conv_param->output_w_ = outputs.front()->Width(); + conv_param->output_channel_ = outputs.front()->Channel(); + conv_param->op_parameter_.thread_num_ = ctx->thread_num_; bool use_winograd = false; - bool use_sw = false; int out_unit; InputTransformUnitFunc input_trans_func = nullptr; OutputTransformUnitFunc output_trans_func = nullptr; if (primitive != nullptr && primitive->GetInferFlag()) { CheckIfUseWinograd(&use_winograd, &out_unit, conv_param, input_trans_func, output_trans_func); - use_sw = CheckIfUseSlideWindow(conv_param); } kernel::LiteKernel *kernel; if (kernel_h == 1 && kernel_w == 1) { kernel = new (std::nothrow) kernel::Convolution1x1CPUKernel(op_parameter, inputs, outputs, ctx, primitive); - } else if (kernel_h == 3 && kernel_w == 3 && stride_h == 1 && stride_w == 1 && dilation_h == 1 && dilation_w == 1) { - kernel = new (std::nothrow) kernel::Convolution3x3CPUKernel(op_parameter, inputs, outputs, ctx, primitive); } else if (use_winograd) { - kernel = - new (std::nothrow) kernel::ConvolutionWinogradCPUKernel(op_parameter, inputs, outputs, ctx, primitive, out_unit); - } else if (use_sw) { - kernel = new (std::nothrow) kernel::ConvolutionSWCPUKernel(op_parameter, inputs, outputs, ctx, primitive); + if (kernel_h == 3 && kernel_w == 3 && out_unit == 2) { + kernel = new (std::nothrow) kernel::Convolution3x3CPUKernel(op_parameter, inputs, outputs, ctx, primitive); + } else { + kernel = new (std::nothrow) + kernel::ConvolutionWinogradCPUKernel(op_parameter, inputs, outputs, ctx, primitive, out_unit); + } } else { kernel = new (std::nothrow) kernel::ConvolutionCPUKernel(op_parameter, inputs, outputs, ctx, primitive); } diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution.h b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution.h index 8b8fb3aca5..58af23b2c6 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution.h +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution.h @@ -35,10 +35,6 @@ class ConvolutionCPUKernel : public ConvolutionBaseCPUKernel { free(packed_weight_); packed_weight_ = nullptr; } - if (packed_input_ != nullptr) { - free(packed_input_); - packed_input_ = nullptr; - } } int Init() override; @@ 
-55,6 +51,14 @@ class ConvolutionCPUKernel : public ConvolutionBaseCPUKernel { ctx_->allocator->Free(tmp_output_block_); tmp_output_block_ = nullptr; } + if (nhwc4_input_ != nullptr) { + ctx_->allocator->Free(nhwc4_input_); + nhwc4_input_ = nullptr; + } + if (packed_input_ != nullptr) { + ctx_->allocator->Free(packed_input_); + packed_input_ = nullptr; + } } float *packed_input_ = nullptr; float *packed_weight_ = nullptr; diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_1x1.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_1x1.cc index 30b2b6a1e3..983cdddd1b 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_1x1.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_1x1.cc @@ -35,9 +35,9 @@ Convolution1x1CPUKernel::~Convolution1x1CPUKernel() { } void Convolution1x1CPUKernel::FreeTmpBuffer() { - if (pack_input_ != nullptr) { - free(pack_input_); - pack_input_ = nullptr; + if (pre_trans_input_ && input_ptr_ != nullptr) { + free(input_ptr_); + input_ptr_ = nullptr; } return; } @@ -59,10 +59,9 @@ void Convolution1x1CPUKernel::InitConv1x1MatmulParam() { matmul_param_->row_ = conv_param_->output_h_ * conv_param_->output_w_; matmul_param_->col_ = conv_param_->output_channel_; matmul_param_->deep_ = conv_param_->input_channel_; - matmul_param_->row_8_ = UP_ROUND(matmul_param_->row_, C8NUM); + matmul_param_->row_12_ = UP_ROUND(matmul_param_->row_, C12NUM); matmul_param_->col_8_ = UP_ROUND(matmul_param_->col_, C8NUM); - matmul_param_->act_type_ = (conv_param_->is_relu6_) ? ActType_Relu6 : ActType_No; - matmul_param_->act_type_ = (conv_param_->is_relu_) ? ActType_Relu : matmul_param_->act_type_; + matmul_param_->act_type_ = conv_param_->act_type_; return; } @@ -94,18 +93,21 @@ int Convolution1x1CPUKernel::InitConv1x1BiasWeight() { } int Convolution1x1CPUKernel::InitConv1x1Param() { - pre_trans_input_ = (conv_param_->pad_h_ != 0 || conv_param_->pad_w_ != 0 || conv_param_->stride_h_ != 1 || + pre_trans_input_ = (conv_param_->pad_u_ != 0 || conv_param_->pad_l_ != 0 || conv_param_->stride_h_ != 1 || conv_param_->stride_w_ != 1); thread_count_ = MSMIN(op_parameter_->thread_num_, UP_DIV(matmul_param_->col_, C8NUM)); thread_stride_ = UP_DIV(UP_DIV(matmul_param_->col_, C8NUM), thread_count_) * C8NUM; - pack_input_ = reinterpret_cast(malloc(matmul_param_->row_8_ * matmul_param_->deep_ * sizeof(float))); - if (pack_input_ == nullptr) { - MS_LOG(ERROR) << "Conv1x1 Malloc pack_input_ error!"; - return RET_MEMORY_FAILED; + if (pre_trans_input_) { + input_ptr_ = reinterpret_cast(malloc(matmul_param_->row_ * matmul_param_->deep_ * sizeof(float))); + if (input_ptr_ == nullptr) { + MS_LOG(ERROR) << "Conv1x1 Malloc input_ptr_ error!"; + return RET_MEMORY_FAILED; + } + memset(input_ptr_, 0, matmul_param_->row_ * matmul_param_->deep_ * sizeof(float)); } - memset(pack_input_, 0, matmul_param_->row_8_ * matmul_param_->deep_ * sizeof(float)); + return RET_OK; } @@ -113,25 +115,24 @@ void Convolution1x1CPUKernel::Pre1x1Trans(float *src_input, float *src_output) { output_ptr_ = src_output; if (pre_trans_input_) { - Conv1x1InputPackFp32(src_input, input_ptr_, conv_param_); + Conv1x1InputPack(src_input, input_ptr_, conv_param_, sizeof(float)); } else { input_ptr_ = src_input; } - RowMajor2Col8Major(input_ptr_, pack_input_, matmul_param_->row_, matmul_param_->deep_); + RowMajor2Col12Major(input_ptr_, pack_input_, matmul_param_->row_, matmul_param_->deep_); return; } int Convolution1x1CPUKernel::Init() { - if (!InferShapeDone()) { - return RET_OK; - } - int error_code = 
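row_12_ above pads the GEMM row count to the 12-row panels that RowMajor2Col12Major packs and MatMulOpt consumes. With the usual UP_ROUND definition, a worked example:

    #define UP_ROUND(x, y) (((x) + (y) - 1) / (y) * (y))  // assumed, as in nnacl/op_base.h
    // A 14x14 output plane gives row_ = 196; row_12_ = UP_ROUND(196, 12) = 204,
    // i.e. 17 panels of 12 rows, the last panel partially zero-filled by the packer.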
InitConv1x1BiasWeight(); if (error_code != RET_OK) { MS_LOG(ERROR) << "Convolution base init failed."; return error_code; } + if (!InferShapeDone()) { + return RET_OK; + } return ReSize(); } @@ -140,17 +141,14 @@ int Convolution1x1CPUKernel::DoConv1x1(int task_id) { if (cur_oc <= 0) { return RET_OK; } - - auto bias = (bias_data_ == nullptr) ? nullptr : reinterpret_cast(bias_data_) + thread_stride_ * task_id; - - MatMul(pack_input_, weight_ptr_ + task_id * thread_stride_ * matmul_param_->deep_, - output_ptr_ + task_id * thread_stride_, bias, matmul_param_->act_type_, matmul_param_->deep_, - matmul_param_->row_, cur_oc, matmul_param_->col_, true); - + MatMulOpt(pack_input_, weight_ptr_ + task_id * thread_stride_ * matmul_param_->deep_, + output_ptr_ + task_id * thread_stride_, reinterpret_cast(bias_data_) + thread_stride_ * task_id, + matmul_param_->act_type_, matmul_param_->deep_, matmul_param_->row_, cur_oc, matmul_param_->col_, + OutType_Nhwc); return RET_OK; } -int Convolution1x1Run(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +int Convolution1x1Run(void *cdata, int task_id) { auto conv1x1 = reinterpret_cast(cdata); auto error_code = conv1x1->DoConv1x1(task_id); if (error_code != RET_OK) { @@ -169,29 +167,27 @@ int Convolution1x1CPUKernel::Run() { auto src_in = reinterpret_cast(in_tensors_[0]->Data()); auto src_out = reinterpret_cast(out_tensors_[0]->Data()); - if (pre_trans_input_) { - input_ptr_ = - reinterpret_cast(ctx_->allocator->Malloc(matmul_param_->row_ * matmul_param_->deep_ * sizeof(float))); - if (input_ptr_ == nullptr) { - MS_LOG(ERROR) << "Conv1x1 Malloc input_ptr_ error!"; - return RET_MEMORY_FAILED; - } + pack_input_ = + reinterpret_cast(ctx_->allocator->Malloc(matmul_param_->row_12_ * matmul_param_->deep_ * sizeof(float))); + if (pack_input_ == nullptr) { + MS_LOG(ERROR) << "Conv1x1 Malloc pack_input_ error!"; + return RET_MEMORY_FAILED; } for (int batch_index = 0; batch_index < conv_param_->input_batch_; batch_index++) { Pre1x1Trans(src_in + batch_index * conv_param_->input_h_ * conv_param_->input_w_ * conv_param_->input_channel_, src_out + batch_index * matmul_param_->row_ * matmul_param_->col_); - int error_code = LiteBackendParallelLaunch(Convolution1x1Run, this, thread_count_); + int error_code = ParallelLaunch(THREAD_POOL_DEFAULT, Convolution1x1Run, this, thread_count_); if (error_code != RET_OK) { MS_LOG(ERROR) << "conv1x1 strassen error error_code[" << error_code << "]"; return RET_ERROR; } } - if (pre_trans_input_) { - ctx_->allocator->Free(input_ptr_); - input_ptr_ = nullptr; + if (pack_input_ != nullptr) { + ctx_->allocator->Free(pack_input_); + pack_input_ = nullptr; } return RET_OK; } diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_3x3.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_3x3.cc index d30a9c4402..a17a0853b0 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_3x3.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_3x3.cc @@ -94,10 +94,19 @@ int Convolution3x3CPUKernel::InitWeightBias() { } int Convolution3x3CPUKernel::InitTmpBuffer() { + int ic4 = UP_DIV(conv_param_->input_channel_, C4NUM); int oC4 = UP_DIV(conv_param_->output_channel_, C4NUM); + int oC8 = UP_DIV(conv_param_->output_channel_, C8NUM); const int k_plane = 16; MS_ASSERT(ctx_->allocator != nullptr); + size_t tile_buffer_size = thread_count_ * C12NUM * C16NUM * ic4 * C4NUM * sizeof(float); + tile_buffer_ = reinterpret_cast(ctx_->allocator->Malloc(tile_buffer_size)); + if (tile_buffer_ == nullptr) { + MS_LOG(ERROR) << 
"malloc tile buffer failed."; + return RET_ERROR; + } + size_t block_unit_buffer_size = thread_count_ * k_plane * C4NUM * sizeof(float); block_unit_buffer_ = reinterpret_cast(ctx_->allocator->Malloc(block_unit_buffer_size)); if (block_unit_buffer_ == nullptr) { @@ -105,13 +114,20 @@ int Convolution3x3CPUKernel::InitTmpBuffer() { return RET_ERROR; } - size_t tmp_dst_buffer_size = thread_count_ * TILE_NUM * k_plane * oC4 * C4NUM * sizeof(float); + size_t tmp_dst_buffer_size = thread_count_ * C12NUM * k_plane * oC8 * C8NUM * sizeof(float); tmp_dst_buffer_ = reinterpret_cast(ctx_->allocator->Malloc(tmp_dst_buffer_size)); if (tmp_dst_buffer_ == nullptr) { MS_LOG(ERROR) << "malloc tmp_dst_buffer_ failed."; return RET_ERROR; } + size_t col_buffer_size = thread_count_ * C12NUM * C4NUM * ic4 * sizeof(float); + col_buffer_ = reinterpret_cast(ctx_->allocator->Malloc(col_buffer_size)); + if (col_buffer_ == nullptr) { + MS_LOG(ERROR) << "malloc col_buffer_ failed."; + return RET_ERROR; + } + size_t nc4hw4_out_size = oC4 * C4NUM * conv_param_->output_batch_ * conv_param_->output_h_ * conv_param_->output_w_ * sizeof(float); nc4hw4_out_ = reinterpret_cast(ctx_->allocator->Malloc(nc4hw4_out_size)); @@ -124,6 +140,7 @@ int Convolution3x3CPUKernel::InitTmpBuffer() { tmp_buffer_address_list_[1] = block_unit_buffer_; tmp_buffer_address_list_[2] = tmp_dst_buffer_; tmp_buffer_address_list_[3] = nc4hw4_out_; + tmp_buffer_address_list_[4] = col_buffer_; return RET_OK; } @@ -138,14 +155,14 @@ void Convolution3x3CPUKernel::ConfigInputOutput() { } int Convolution3x3CPUKernel::Init() { - if (!InferShapeDone()) { - return RET_OK; - } auto ret = InitWeightBias(); if (ret != RET_OK) { MS_LOG(ERROR) << "Init weight bias failed.ret: " << ret; return RET_ERROR; } + if (!InferShapeDone()) { + return RET_OK; + } ConfigInputOutput(); return ReSize(); } @@ -161,10 +178,6 @@ int Convolution3x3CPUKernel::ReSize() { free(nhwc4_input_); nhwc4_input_ = nullptr; } - if (tile_buffer_ != nullptr) { - free(tile_buffer_); - tile_buffer_ = nullptr; - } ret = ConvolutionBaseCPUKernel::Init(); if (ret != RET_OK) { @@ -182,13 +195,6 @@ int Convolution3x3CPUKernel::ReSize() { } memset(nhwc4_input_, 0, nhwc4_input_size); - size_t tile_buffer_size = thread_count_ * TILE_NUM * C16NUM * iC4 * C4NUM * sizeof(float); - tile_buffer_ = reinterpret_cast(malloc(tile_buffer_size)); - if (tile_buffer_ == nullptr) { - MS_LOG(ERROR) << "malloc tile buffer failed."; - return RET_ERROR; - } - memset(tile_buffer_, 0, tile_buffer_size); return RET_OK; } @@ -197,13 +203,12 @@ int Convolution3x3CPUKernel::RunImpl(int task_id) { MS_LOG(ERROR) << "gemm_func is nullptr."; return RET_ERROR; } - auto output_addr = reinterpret_cast(out_tensors_.at(kOutputIndex)->Data()); Conv3x3Fp32(reinterpret_cast(nhwc4_input_), transformed_filter_addr_, reinterpret_cast(bias_data_), - output_addr, tmp_buffer_address_list_, task_id, conv_param_, gemm_func_); + tmp_buffer_address_list_, task_id, conv_param_, gemm_func_); return RET_OK; } -int Convolution3x3Impl(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +int Convolution3x3Impl(void *cdata, int task_id) { auto conv3x3 = reinterpret_cast(cdata); auto error_code = conv3x3->RunImpl(task_id); if (error_code != RET_OK) { @@ -213,6 +218,29 @@ int Convolution3x3Impl(int task_id, LiteParallelGroupEnv *penv, void *cdata) { return RET_OK; } +int Convolution3x3CPUKernel::PostProcess() { + auto output_addr = reinterpret_cast(out_tensors_.at(kOutputIndex)->Data()); + auto act_type = conv_param_->act_type_; + switch (act_type) { + case 
ActType_No: + PackNC4HW4ToNHWCFp32(nc4hw4_out_, output_addr, conv_param_->output_batch_, + conv_param_->output_h_ * conv_param_->output_w_, conv_param_->output_channel_); + break; + case ActType_Relu: + PackNC4HW4ToNHWCReluFp32(nc4hw4_out_, output_addr, conv_param_->output_batch_, + conv_param_->output_h_ * conv_param_->output_w_, conv_param_->output_channel_); + break; + case ActType_Relu6: + PackNC4HW4ToNHWCRelu6Fp32(nc4hw4_out_, output_addr, conv_param_->output_batch_, + conv_param_->output_h_ * conv_param_->output_w_, conv_param_->output_channel_); + break; + default: + MS_LOG(ERROR) << "Unsupport activation type."; + return RET_ERROR; + } + return RET_OK; +} + int Convolution3x3CPUKernel::Run() { auto prepare_ret = Prepare(); if (prepare_ret != RET_OK) { @@ -230,25 +258,17 @@ int Convolution3x3CPUKernel::Run() { PackNHWCToNHWC4Fp32(ori_input_data, nhwc4_input_, conv_param_->input_batch_, conv_param_->input_h_ * conv_param_->input_w_, conv_param_->input_channel_); - int error_code = LiteBackendParallelLaunch(Convolution3x3Impl, this, thread_count_); + int error_code = ParallelLaunch(THREAD_POOL_DEFAULT, Convolution3x3Impl, this, thread_count_); if (error_code != RET_OK) { MS_LOG(ERROR) << "conv3x3 error error_code[" << error_code << "]"; FreeTmpBuffer(); return RET_ERROR; } - auto is_relu = conv_param_->is_relu_; - auto is_relu6 = conv_param_->is_relu6_; - auto output_addr = reinterpret_cast(out_tensors_.at(kOutputIndex)->Data()); - if (is_relu) { - PackNC4HW4ToNHWCReluFp32(nc4hw4_out_, output_addr, conv_param_->output_batch_, - conv_param_->output_h_ * conv_param_->output_w_, conv_param_->output_channel_); - } else if (is_relu6) { - PackNC4HW4ToNHWCRelu6Fp32(nc4hw4_out_, output_addr, conv_param_->output_batch_, - conv_param_->output_h_ * conv_param_->output_w_, conv_param_->output_channel_); - } else { - PackNC4HW4ToNHWCFp32(nc4hw4_out_, output_addr, conv_param_->output_batch_, - conv_param_->output_h_ * conv_param_->output_w_, conv_param_->output_channel_); + ret = PostProcess(); + if (ret != RET_OK) { + MS_LOG(ERROR) << "Post process failed."; + return ret; } FreeTmpBuffer(); return RET_OK; diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_3x3.h b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_3x3.h index 383c8f1bc6..53003839ec 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_3x3.h +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_3x3.h @@ -33,10 +33,6 @@ class Convolution3x3CPUKernel : public ConvolutionBaseCPUKernel { if (transformed_filter_addr_ != nullptr) { free(transformed_filter_addr_); } - if (tile_buffer_ != nullptr) { - free(tile_buffer_); - tile_buffer_ = nullptr; - } } int Init() override; int ReSize() override; @@ -45,9 +41,14 @@ class Convolution3x3CPUKernel : public ConvolutionBaseCPUKernel { int InitWeightBias(); int InitTmpBuffer(); void ConfigInputOutput(); + int PostProcess(); private: void FreeTmpBuffer() { + if (tile_buffer_ != nullptr) { + ctx_->allocator->Free(tile_buffer_); + tile_buffer_ = nullptr; + } if (block_unit_buffer_ != nullptr) { ctx_->allocator->Free(block_unit_buffer_); block_unit_buffer_ = nullptr; @@ -60,14 +61,19 @@ class Convolution3x3CPUKernel : public ConvolutionBaseCPUKernel { ctx_->allocator->Free(nc4hw4_out_); nc4hw4_out_ = nullptr; } + if (col_buffer_ != nullptr) { + ctx_->allocator->Free(col_buffer_); + col_buffer_ = nullptr; + } } float *transformed_filter_addr_ = nullptr; float *tile_buffer_ = nullptr; float *block_unit_buffer_ = nullptr; float *tmp_dst_buffer_ = nullptr; + float 
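PostProcess above fuses the activation into the layout conversion instead of branching in Run(). What the NC4HW4-to-NHWC copy with ReLU amounts to, as a scalar reference (sketch; the production routines in nnacl/pack.c are vectorized):

    // NC4HW4: channels grouped in blocks of 4, each block stored plane-contiguously.
    void UnpackNc4hw4ToNhwcRelu(const float *src, float *dst, int batch, int plane, int channel) {
      int c4 = UP_DIV(channel, C4NUM);
      for (int b = 0; b < batch; ++b) {
        for (int p = 0; p < plane; ++p) {
          for (int c = 0; c < channel; ++c) {
            int blk = c / C4NUM, res = c % C4NUM;
            float v = src[((b * c4 + blk) * plane + p) * C4NUM + res];
            dst[(b * plane + p) * channel + c] = v > 0.0f ? v : 0.0f;  // ReLU fused into the copy
          }
        }
      }
    }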
*col_buffer_ = nullptr; float *nc4hw4_out_ = nullptr; - TmpBufferAddress tmp_buffer_address_list_[4]; + TmpBufferAddress tmp_buffer_address_list_[5]; GEMM_FUNC_FP32 gemm_func_ = nullptr; }; void ProcessFilter(float *origin_weight, float *dst_weight, ConvParameter *conv_param, int oc_block, int oc_block_num); diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise.cc index df59dbf383..53ea4cf09f 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise.cc @@ -15,6 +15,7 @@ */ #include "src/runtime/kernel/arm/fp32/convolution_depthwise.h" +#include "src/runtime/kernel/arm/fp32/convolution_depthwise_slidewindow.h" #include "schema/model_generated.h" #include "src/kernel_registry.h" #include "include/errorcode.h" @@ -30,13 +31,13 @@ using mindspore::schema::PrimitiveType_DepthwiseConv2D; namespace mindspore::kernel { ConvolutionDepthwiseCPUKernel::~ConvolutionDepthwiseCPUKernel() { if (packed_weight_ != nullptr) { - delete packed_weight_; + free(packed_weight_); packed_weight_ = nullptr; } } int ConvolutionDepthwiseCPUKernel::InitWeightBias() { - // init weight: o, h, w, i; o == group, i == 1 + // init weight: k, h, w, c; k == group == output_channel, c == 1 auto weight_tensor = in_tensors_[kWeightIndex]; auto origin_weight = reinterpret_cast(weight_tensor->Data()); int channel = weight_tensor->Batch(); @@ -47,7 +48,7 @@ int ConvolutionDepthwiseCPUKernel::InitWeightBias() { MS_LOG(ERROR) << "Malloc buffer failed."; return RET_ERROR; } - PackNCHWToNHWCFp32(origin_weight, packed_weight_, 1, weight_tensor->Height() * weight_tensor->Width(), channel); + PackWeightKHWToHWKFp32(origin_weight, packed_weight_, weight_tensor->Height() * weight_tensor->Width(), channel); auto bias_tensor = in_tensors_[kBiasIndex]; bias_data_ = reinterpret_cast(malloc(channel * sizeof(float))); @@ -88,7 +89,7 @@ int ConvolutionDepthwiseCPUKernel::Execute(int task_id) { return RET_OK; } -int ConvDwRun(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +int ConvDwRun(void *cdata, int task_id) { auto conv_dw = reinterpret_cast(cdata); auto ret = conv_dw->Execute(task_id); if (ret != RET_OK) { @@ -99,22 +100,23 @@ int ConvDwRun(int task_id, LiteParallelGroupEnv *penv, void *cdata) { } int ConvolutionDepthwiseCPUKernel::Run() { + if (conv_param_->input_channel_ != conv_param_->output_channel_) { + MS_LOG(ERROR) << "Only support input channel equals output channel."; + return RET_ERROR; + } auto ret = Prepare(); if (ret != RET_OK) { MS_LOG(ERROR) << "Prepare failed."; return ret; } - if (conv_param_->input_channel_ != conv_param_->output_channel_) { - MS_LOG(ERROR) << "Only support input channel equals output channel."; - return RET_ERROR; - } + auto input_tensor = in_tensors_.at(kInputIndex); input_ptr_ = reinterpret_cast(input_tensor->Data()); auto output_tensor = out_tensors_.at(kOutputIndex); output_ptr_ = reinterpret_cast(output_tensor->Data()); - ret = LiteBackendParallelLaunch(ConvDwRun, this, conv_param_->thread_num_); + ret = ParallelLaunch(THREAD_POOL_DEFAULT, ConvDwRun, this, conv_param_->thread_num_); if (ret != RET_OK) { MS_LOG(ERROR) << "ConvDwRun error: error_code[" << ret << "]"; return RET_ERROR; @@ -129,9 +131,13 @@ kernel::LiteKernel *CpuConvDwFp32KernelCreator(const std::vector(opParameter); kernel::LiteKernel *kernel; - kernel = new (std::nothrow) kernel::ConvolutionDepthwiseCPUKernel(opParameter, inputs, outputs, ctx, 
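PackWeightKHWToHWKFp32, introduced above, reorders the depthwise filter so that all channels of one tap are contiguous, which is the order the per-row kernel walks. Judging by the name, it is a plain K,H,W -> H,W,K transpose (reference sketch, layout inferred):

    void PackKhwToHwkSketch(const float *src, float *dst, int plane, int channel) {
      for (int c = 0; c < channel; ++c) {
        for (int p = 0; p < plane; ++p) {
          dst[p * channel + c] = src[c * plane + p];  // tap-major destination
        }
      }
    }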
primitive); - + if (conv_param->input_channel_ < 32) { + kernel = new (std::nothrow) kernel::ConvolutionDepthwiseSWCPUKernel(opParameter, inputs, outputs, ctx, primitive); + } else { + kernel = new (std::nothrow) kernel::ConvolutionDepthwiseCPUKernel(opParameter, inputs, outputs, ctx, primitive); + } if (kernel == nullptr) { MS_LOG(ERROR) << "kernel is nullptr."; return nullptr; diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise_3x3.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise_3x3.cc deleted file mode 100644 index b56df7423c..0000000000 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise_3x3.cc +++ /dev/null @@ -1,218 +0,0 @@ -/** - * Copyright 2020 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "src/runtime/kernel/arm/fp32/convolution_depthwise_3x3.h" -#include "schema/model_generated.h" -#include "src/kernel_registry.h" -#include "include/errorcode.h" -#include "src/runtime/runtime_api.h" - -using mindspore::kernel::KERNEL_ARCH::kCPU; -using mindspore::lite::KernelRegistrar; -using mindspore::lite::RET_ERROR; -using mindspore::lite::RET_OK; -using mindspore::schema::PrimitiveType_DepthwiseConv2D; - -namespace mindspore::kernel { -ConvolutionDepthwise3x3CPUKernel::~ConvolutionDepthwise3x3CPUKernel() { - FreeTmpBufer(); - if (block_buffer_ != nullptr) { - free(block_buffer_); - block_buffer_ = nullptr; - } - if (packed_weight_ != nullptr) { - free(packed_weight_); - packed_weight_ = nullptr; - } -} - -void ConvolutionDepthwise3x3CPUKernel::FreeTmpBufer() { - if (need_align_) { - if (packed_input_ != nullptr) { - free(packed_input_); - packed_input_ = nullptr; - } - if (packed_output_ != nullptr) { - free(packed_output_); - packed_output_ = nullptr; - } - } - if (trans_buffer_ != nullptr) { - free(trans_buffer_); - trans_buffer_ = nullptr; - } -} - -int ConvolutionDepthwise3x3CPUKernel::InitWeightBias() { - // init weight: o, h, w, i; o == group, i == 1 - auto weight_tensor = in_tensors_[kWeightIndex]; - auto origin_weight = reinterpret_cast(weight_tensor->Data()); - // o h w 1 -> o/4 h w 1 4 - int OC4 = UP_DIV(weight_tensor->Batch(), C4NUM); - int weight_c4_size = OC4 * C4NUM * 9; - auto tmp_weight = reinterpret_cast(malloc(weight_c4_size * sizeof(float))); - if (tmp_weight == nullptr) { - MS_LOG(ERROR) << "Malloc buffer failed."; - return RET_ERROR; - } - memset(tmp_weight, 0, weight_c4_size * sizeof(float)); - PackNCHWToNC4HW4Fp32(origin_weight, tmp_weight, 1, weight_tensor->Height() * weight_tensor->Width(), - weight_tensor->Batch()); - - // weight transform - int packed_weight_size = OC4 * C4NUM * 16; - packed_weight_ = reinterpret_cast(malloc(packed_weight_size * sizeof(float))); - if (packed_weight_ == nullptr) { - MS_LOG(ERROR) << "Malloc buffer failed."; - return RET_ERROR; - } - memset(packed_weight_, 0, packed_weight_size * sizeof(float)); - ConvDw3x3Fp32FilterTrans(packed_weight_, tmp_weight, OC4); - - // init bias - bias_data_ = 
reinterpret_cast(malloc(C4NUM * OC4 * sizeof(float))); - if (bias_data_ == nullptr) { - MS_LOG(ERROR) << "Malloc buffer failed."; - return RET_ERROR; - } - memset(bias_data_, 0, C4NUM * OC4 * sizeof(float)); - if (in_tensors_.size() == kInputSize2) { - auto ori_bias = reinterpret_cast(in_tensors_.at(kBiasIndex)->Data()); - memcpy(bias_data_, ori_bias, in_tensors_.at(kBiasIndex)->ElementsNum() * sizeof(float)); - } - conv_param_->thread_num_ = MSMIN(thread_count_, OC4); - return RET_OK; -} - -int ConvolutionDepthwise3x3CPUKernel::InitBuffer() { - if (conv_param_->input_channel_ % C4NUM != 0) { - need_align_ = true; - int IC4 = UP_DIV(conv_param_->input_channel_, C4NUM); - int pack_input_size = conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * C4NUM * IC4; - packed_input_ = reinterpret_cast(malloc(pack_input_size * sizeof(float))); - if (packed_input_ == nullptr) { - MS_LOG(ERROR) << "Malloc buffer failed."; - return RET_ERROR; - } - memset(packed_input_, 0, pack_input_size * sizeof(float)); - - int OC4 = UP_DIV(conv_param_->output_channel_, C4NUM); - int pack_output_size = conv_param_->output_batch_ * conv_param_->output_h_ * conv_param_->output_w_ * C4NUM * OC4; - packed_output_ = reinterpret_cast(malloc(pack_output_size * sizeof(float))); - if (packed_output_ == nullptr) { - MS_LOG(ERROR) << "Malloc buffer failed."; - return RET_ERROR; - } - } - - // malloc transform buffer - trans_size_ = UP_DIV(conv_param_->output_w_, 2) * UP_DIV(conv_param_->output_h_, 2) * 16 * C4NUM; - size_t trans_buffer_size = thread_count_ * trans_size_ * sizeof(float); - trans_buffer_ = reinterpret_cast(malloc(trans_buffer_size)); - if (trans_buffer_ == nullptr) { - MS_LOG(ERROR) << "malloc trans buffer failed."; - return RET_ERROR; - } - return RET_OK; -} - -int ConvolutionDepthwise3x3CPUKernel::Init() { - // malloc one block buffer - block_buffer_ = reinterpret_cast(malloc(thread_count_ * 16 * C4NUM * sizeof(float))); - if (block_buffer_ == nullptr) { - MS_LOG(ERROR) << "malloc block buffer failed."; - return RET_ERROR; - } - auto ret = InitWeightBias(); - if (ret != RET_OK) { - MS_LOG(ERROR) << "Depthwise3x3 fp32 initWeightBias error!ret: " << ret; - return ret; - } - if (!InferShapeDone()) { - return RET_OK; - } - return ReSize(); -} - -int ConvolutionDepthwise3x3CPUKernel::ReSize() { - FreeTmpBufer(); - ConvolutionBaseCPUKernel::Init(); - - auto ret = InitBuffer(); - if (ret != RET_OK) { - MS_LOG(ERROR) << "Depthwise3x3 fp32 initBuffer error!ret: " << ret; - return ret; - } - return RET_OK; -} - -int ConvolutionDepthwise3x3CPUKernel::Execute(int task_id) { - auto trans_buf = trans_buffer_ + task_id * trans_size_; - auto block_buf = block_buffer_ + task_id * 16 * C4NUM; - ConvDw3x3Fp32(packed_output_, packed_input_, packed_weight_, reinterpret_cast(bias_data_), trans_buf, - block_buf, conv_param_, task_id); - return RET_OK; -} - -int ConvDw3x3Run(int task_id, LiteParallelGroupEnv *penv, void *cdata) { - auto conv_dw_3x3 = reinterpret_cast(cdata); - auto ret = conv_dw_3x3->Execute(task_id); - if (ret != RET_OK) { - MS_LOG(ERROR) << "ConvolutionDepthwise3x3Run error task_id[" << task_id << "] error_code[" << ret << "]"; - return RET_ERROR; - } - return RET_OK; -} - -int ConvolutionDepthwise3x3CPUKernel::Run() { - auto ret = Prepare(); - if (ret != RET_OK) { - MS_LOG(ERROR) << "Prepare failed."; - return ret; - } - if (conv_param_->input_channel_ != conv_param_->output_channel_) { - MS_LOG(ERROR) << "Only support input channel equals output channel."; - return RET_ERROR; - } - auto 
input_tensor = in_tensors_.at(kInputIndex); - auto input_addr = reinterpret_cast(input_tensor->Data()); - - // pack input: to nhwc4 - if (need_align_) { - PackNHWCToNHWC4Fp32(input_addr, packed_input_, conv_param_->input_batch_, - conv_param_->input_h_ * conv_param_->input_w_, conv_param_->input_channel_); - } else { - packed_input_ = input_addr; - } - - auto output_addr = reinterpret_cast(out_tensors_.at(kOutputIndex)->Data()); - if (!need_align_) { - packed_output_ = output_addr; - } - - ret = LiteBackendParallelLaunch(ConvDw3x3Run, this, conv_param_->thread_num_); - if (ret != RET_OK) { - MS_LOG(ERROR) << "ConvDw3x3Run error: error_code[" << ret << "]"; - return RET_ERROR; - } - - if (need_align_) { - PackNHWC4ToNHWCFp32(packed_output_, output_addr, conv_param_->output_batch_, - conv_param_->output_h_ * conv_param_->output_w_, conv_param_->output_channel_); - } - return RET_OK; -} -} // namespace mindspore::kernel diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise_slidewindow.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise_slidewindow.cc index 99aa130087..10ed18bb03 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise_slidewindow.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise_slidewindow.cc @@ -36,20 +36,6 @@ ConvolutionDepthwiseSWCPUKernel::~ConvolutionDepthwiseSWCPUKernel() { delete packed_weight_; packed_weight_ = nullptr; } - FreeTmpBuffer(); -} - -void ConvolutionDepthwiseSWCPUKernel::FreeTmpBuffer() { - if (need_align_) { - if (packed_input_ != nullptr) { - delete packed_input_; - packed_input_ = nullptr; - } - if (packed_output_ != nullptr) { - delete packed_output_; - packed_output_ = nullptr; - } - } } int ConvolutionDepthwiseSWCPUKernel::InitWeightBias() { @@ -89,7 +75,7 @@ int ConvolutionDepthwiseSWCPUKernel::InitBuffer() { need_align_ = true; int IC4 = UP_DIV(conv_param_->input_channel_, C4NUM); int pack_input_size = conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * C4NUM * IC4; - packed_input_ = reinterpret_cast(malloc(pack_input_size * sizeof(float))); + packed_input_ = reinterpret_cast(context_->allocator->Malloc(pack_input_size * sizeof(float))); if (packed_input_ == nullptr) { MS_LOG(ERROR) << "Malloc buffer failed."; return RET_ERROR; @@ -97,7 +83,7 @@ int ConvolutionDepthwiseSWCPUKernel::InitBuffer() { int OC4 = UP_DIV(conv_param_->output_channel_, C4NUM); int pack_output_size = conv_param_->output_batch_ * conv_param_->output_h_ * conv_param_->output_w_ * C4NUM * OC4; - packed_output_ = reinterpret_cast(malloc(pack_output_size * sizeof(float))); + packed_output_ = reinterpret_cast(context_->allocator->Malloc(pack_output_size * sizeof(float))); if (packed_output_ == nullptr) { MS_LOG(ERROR) << "Malloc buffer failed."; return RET_ERROR; @@ -125,16 +111,9 @@ int ConvolutionDepthwiseSWCPUKernel::Init() { } int ConvolutionDepthwiseSWCPUKernel::ReSize() { - FreeTmpBuffer(); ConvolutionBaseCPUKernel::Init(); InitSlidingParamConvDw(sliding_, conv_param_, C4NUM); conv_param_->thread_num_ = MSMIN(thread_count_, conv_param_->output_h_); - - auto ret = InitBuffer(); - if (ret != 0) { - MS_LOG(ERROR) << "Convolution depthwise fp32 InitBuffer failed."; - return RET_ERROR; - } return RET_OK; } @@ -144,7 +123,7 @@ int ConvolutionDepthwiseSWCPUKernel::Execute(int task_id) { return RET_OK; } -int ConvDwSWRun(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +int ConvDwSWRun(void *cdata, int task_id) { auto conv_dw = reinterpret_cast(cdata); auto ret = 
conv_dw->Execute(task_id); if (ret != RET_OK) { @@ -155,13 +134,20 @@ int ConvDwSWRun(int task_id, LiteParallelGroupEnv *penv, void *cdata) { } int ConvolutionDepthwiseSWCPUKernel::Run() { + if (conv_param_->input_channel_ != conv_param_->output_channel_) { + MS_LOG(ERROR) << "Only support input channel equals output channel."; + return RET_ERROR; + } + auto ret = Prepare(); if (ret != RET_OK) { MS_LOG(ERROR) << "Prepare failed."; return ret; } - if (conv_param_->input_channel_ != conv_param_->output_channel_) { - MS_LOG(ERROR) << "Only support input channel equals output channel."; + + ret = InitBuffer(); + if (ret != 0) { + MS_LOG(ERROR) << "Convolution depthwise fp32 InitBuffer failed."; return RET_ERROR; } auto input_tensor = in_tensors_.at(kInputIndex); @@ -181,7 +167,7 @@ int ConvolutionDepthwiseSWCPUKernel::Run() { packed_output_ = output_ptr; } - ret = LiteBackendParallelLaunch(ConvDwSWRun, this, conv_param_->thread_num_); + ret = ParallelLaunch(THREAD_POOL_DEFAULT, ConvDwSWRun, this, conv_param_->thread_num_); if (ret != RET_OK) { MS_LOG(ERROR) << "ConvDwSWRun error: error_code[" << ret << "]"; return RET_ERROR; @@ -190,7 +176,10 @@ int ConvolutionDepthwiseSWCPUKernel::Run() { if (need_align_) { PackNHWC4ToNHWCFp32(packed_output_, output_ptr, conv_param_->output_batch_, conv_param_->output_h_ * conv_param_->output_w_, conv_param_->output_channel_); + context_->allocator->Free(packed_input_); + context_->allocator->Free(packed_output_); } + return RET_OK; } } // namespace mindspore::kernel diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise_slidewindow.h b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise_slidewindow.h index 07fb16e62f..58e236efe8 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise_slidewindow.h +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise_slidewindow.h @@ -40,7 +40,6 @@ class ConvolutionDepthwiseSWCPUKernel : public ConvolutionBaseCPUKernel { int Execute(int task_id); private: - void FreeTmpBuffer(); SlidingWindowParam *sliding_ = nullptr; float *packed_weight_ = nullptr; float *packed_input_ = nullptr; diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_slidewindow.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_slidewindow.cc index 11c561133c..38a86cd63a 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_slidewindow.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_slidewindow.cc @@ -96,14 +96,14 @@ void ConvolutionSWCPUKernel::ConfigInputOutput() { } int ConvolutionSWCPUKernel::Init() { - if (!InferShapeDone()) { - return RET_OK; - } auto ret = InitWeightBias(); if (ret != RET_OK) { MS_LOG(ERROR) << "Init weight bias failed."; return RET_ERROR; } + if (!InferShapeDone()) { + return RET_OK; + } // config input output ConfigInputOutput(); return ReSize(); @@ -159,7 +159,7 @@ int ConvolutionSWCPUKernel::RunImpl(int task_id) { return RET_OK; } -int ConvolutionSWImpl(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +int ConvolutionSWImpl(void *cdata, int task_id) { auto conv = reinterpret_cast(cdata); auto error_code = conv->RunImpl(task_id); if (error_code != RET_OK) { @@ -187,7 +187,7 @@ int ConvolutionSWCPUKernel::Run() { PackNHWCToNHWC4Fp32(ori_input_data, nhwc4_input_, conv_param_->input_batch_, conv_param_->input_h_ * conv_param_->input_w_, conv_param_->input_channel_); - int error_code = LiteBackendParallelLaunch(ConvolutionSWImpl, this, thread_count_); + int error_code = 
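The reordering here repeats the lifecycle discipline applied across this patch's convolution kernels: constant data is packed in Init() before shape inference gates anything, shape work waits in ReSize(), and scratch buffers live only inside Run(). A compilable toy of the pattern (all names schematic):

    #include <vector>
    struct ToyKernel {
      bool shapes_known = false;            // stands in for InferShapeDone()
      std::vector<float> packed_weight;     // packed once, kept for the kernel's lifetime
      int Init(const std::vector<float> &w) {
        packed_weight = w;                  // InitWeightBias(): shape-independent
        if (!shapes_known) return 0;        // shapes arrive later; ReSize() will follow
        return ReSize();
      }
      int ReSize() { return 0; }            // shape-dependent parameters only
      int Run() {
        std::vector<float> scratch(256);    // ctx allocator Malloc in the real code
        // ... pack input, ParallelLaunch, unpack output ...
        return 0;                           // scratch released on exit (allocator Free)
      }
    };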
ParallelLaunch(THREAD_POOL_DEFAULT, ConvolutionSWImpl, this, thread_count_); if (error_code != RET_OK) { MS_LOG(ERROR) << "conv error error_code[" << error_code << "]"; FreeTmpBuffer(); diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd.cc index 29504aa36d..40dcf4dd4b 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd.cc @@ -76,7 +76,7 @@ int WinogradFilterTransform(const float *weight_data, Matrix *trans_weight, int int out_c_block = i / oc_block; int out_c_res = i % oc_block; int input_oz_offset = i * kernel_unit * kernel_unit * channel_in; - int output_oz_offset = out_c_block * strides[1] * input_unit * input_unit + out_c_res; + int output_oz_offset = out_c_block * strides[1] + out_c_res; for (int j = 0; j < channel_in; j++) { int ic4_block = j / C4NUM; int ic4_res = j % C4NUM; @@ -93,7 +93,7 @@ int WinogradFilterTransform(const float *weight_data, Matrix *trans_weight, int MatrixMultiply(tmp_data, matrix_gt_data, trans_out_data, input_unit, kernel_unit, input_unit, row); for (int z = 0; z < input_unit_square; z++) { - int output_xy_offset = output_iz_offset + z * strides[1]; + int output_xy_offset = output_iz_offset + z * strides[0]; *(trans_weight_data + output_xy_offset) = trans_out_data[z]; } } @@ -151,7 +151,7 @@ int ConvolutionWinogradCPUKernel::InitWeightBias() { int ConvolutionWinogradCPUKernel::MallocFilterMatrix(int oc_block, int oc_block_num) { int channel_in = conv_param_->input_channel_; - int ic4 = UP_DIV(channel_in, BLOCK); + int ic4 = UP_DIV(channel_in, C4NUM); // set data auto trans_matrix_data_size = input_unit_ * input_unit_ * ic4 * C4NUM * oc_block_num * oc_block * sizeof(float); @@ -196,10 +196,19 @@ int ConvolutionWinogradCPUKernel::InitTmpBuffer() { int output_h = conv_param_->output_h_; int output_w = conv_param_->output_w_; int oc4 = UP_DIV(channel_out, C4NUM); + int oc8 = UP_DIV(channel_out, C8NUM); + int ic4 = UP_DIV(conv_param_->input_channel_, C4NUM); MS_ASSERT(ctx_->allocator != nullptr); + size_t tile_buffer_size = thread_count_ * C12NUM * input_unit_ * input_unit_ * ic4 * C4NUM * sizeof(float); + trans_input_ = reinterpret_cast(ctx_->allocator->Malloc(tile_buffer_size)); + if (trans_input_ == nullptr) { + MS_LOG(ERROR) << "malloc trans_input_ failed."; + return RET_ERROR; + } + gemm_out_ = reinterpret_cast( - ctx_->allocator->Malloc(thread_count_ * TILE_NUM * input_unit_ * input_unit_ * oc4 * C4NUM * sizeof(float))); + ctx_->allocator->Malloc(thread_count_ * C12NUM * input_unit_ * input_unit_ * oc8 * C8NUM * sizeof(float))); if (gemm_out_ == nullptr) { MS_LOG(ERROR) << "malloc gemm_out_ failed."; return RET_ERROR; @@ -222,10 +231,18 @@ int ConvolutionWinogradCPUKernel::InitTmpBuffer() { return RET_ERROR; } + col_buffer_ = + reinterpret_cast(ctx_->allocator->Malloc(thread_count_ * C12NUM * ic4 * C4NUM * sizeof(float))); + if (col_buffer_ == nullptr) { + MS_LOG(ERROR) << "malloc col_buffer_ failed."; + return RET_ERROR; + } + tmp_buffer_address_list_[0] = trans_input_; tmp_buffer_address_list_[1] = gemm_out_; tmp_buffer_address_list_[2] = tmp_out_data_; tmp_buffer_address_list_[3] = tmp_data_; + tmp_buffer_address_list_[4] = col_buffer_; return RET_OK; } @@ -253,19 +270,18 @@ int ConvolutionWinogradCPUKernel::ConfigInputOutput() { } int ConvolutionWinogradCPUKernel::Init() { - if (!InferShapeDone()) { - return RET_OK; - } kernel_unit_ = conv_param_->kernel_h_; 
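The Winograd F(m, r) sizing computed just below: an m x m output tile from an r x r kernel needs an (m + r - 1) x (m + r - 1) input tile. For example:

    int kernel_unit = 3, output_unit = 4;             // F(4x4, 3x3)
    int input_unit = output_unit + kernel_unit - 1;   // = 6: transforms act on 6x6 patches
    int input_unit_square = input_unit * input_unit;  // = 36 points per tile in the GEMM domain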
input_unit_ = output_unit_ + kernel_unit_ - 1; conv_param_->input_unit_ = input_unit_; conv_param_->output_unit_ = output_unit_; - auto ret = InitWeightBias(); if (ret != RET_OK) { MS_LOG(ERROR) << "Init weight bias failed."; return RET_ERROR; } + if (!InferShapeDone()) { + return RET_OK; + } return ReSize(); } @@ -280,10 +296,6 @@ int ConvolutionWinogradCPUKernel::ReSize() { free(nhwc4_input_); nhwc4_input_ = nullptr; } - if (trans_input_ != nullptr) { - free(trans_input_); - trans_input_ = nullptr; - } ret = ConvolutionBaseCPUKernel::Init(); if (ret != RET_OK) { @@ -306,14 +318,6 @@ int ConvolutionWinogradCPUKernel::ReSize() { } memset(nhwc4_input_, 0, nhwc4_input_size); - size_t tile_buffer_size = thread_count_ * TILE_NUM * input_unit_ * input_unit_ * ic4 * C4NUM * sizeof(float); - trans_input_ = reinterpret_cast(malloc(tile_buffer_size)); - if (trans_input_ == nullptr) { - MS_LOG(ERROR) << "malloc trans_input_ failed."; - return RET_ERROR; - } - memset(trans_input_, 0, tile_buffer_size); - ret = ConfigInputOutput(); if (ret != RET_OK) { MS_LOG(ERROR) << "ConfigInputOutput failed."; @@ -333,7 +337,7 @@ int ConvolutionWinogradCPUKernel::RunImpl(int task_id) { return RET_OK; } -int ConvolutionWinogradImpl(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +int ConvolutionWinogradImpl(void *cdata, int task_id) { auto conv = reinterpret_cast(cdata); auto error_code = conv->RunImpl(task_id); if (error_code != RET_OK) { @@ -343,42 +347,59 @@ int ConvolutionWinogradImpl(int task_id, LiteParallelGroupEnv *penv, void *cdata return RET_OK; } +int ConvolutionWinogradCPUKernel::PostProcess() { + auto out_tensor = out_tensors_.front(); + auto out_data = reinterpret_cast(out_tensor->Data()); + auto act_type = conv_param_->act_type_; + switch (act_type) { + case ActType_No: + UnPackWinogradOutput(tmp_out_data_, out_data, conv_param_->output_batch_, conv_param_->output_h_, + conv_param_->output_w_, conv_param_->output_channel_, output_unit_); + break; + case ActType_Relu: + UnPackWinogradReluOutput(tmp_out_data_, out_data, conv_param_->output_batch_, conv_param_->output_h_, + conv_param_->output_w_, conv_param_->output_channel_, output_unit_); + break; + case ActType_Relu6: + UnPackWinogradRelu6Output(tmp_out_data_, out_data, conv_param_->output_batch_, conv_param_->output_h_, + conv_param_->output_w_, conv_param_->output_channel_, output_unit_); + break; + default: + MS_LOG(ERROR) << "Unsupport activation type."; + return RET_ERROR; + } + return RET_OK; +} + int ConvolutionWinogradCPUKernel::Run() { auto prepare_ret = Prepare(); if (prepare_ret != RET_OK) { MS_LOG(ERROR) << "Prepare fail!ret: " << prepare_ret; return prepare_ret; } - // malloc tmp buffer + auto ret = InitTmpBuffer(); if (ret != RET_OK) { MS_LOG(ERROR) << "Init tmp buffer failed."; return RET_ERROR; } + auto input_tensor = in_tensors_.at(kInputIndex); auto ori_input_data = input_tensor->Data(); PackNHWCToNHWC4Fp32(ori_input_data, nhwc4_input_, conv_param_->input_batch_, conv_param_->input_h_ * conv_param_->input_w_, conv_param_->input_channel_); - int error_code = LiteBackendParallelLaunch(ConvolutionWinogradImpl, this, thread_count_); + int error_code = ParallelLaunch(THREAD_POOL_DEFAULT, ConvolutionWinogradImpl, this, thread_count_); if (error_code != RET_OK) { MS_LOG(ERROR) << "conv winograd error error_code[" << error_code << "]"; FreeTmpBuffer(); return RET_ERROR; } - // get real output - auto out_tensor = out_tensors_.front(); - auto out_data = reinterpret_cast(out_tensor->Data()); - if (conv_param_->is_relu_) { - 
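The branch pair removed here is the old is_relu_/is_relu6_ idiom that conv_param_->act_type_ replaces throughout the patch; the mapping is one-to-one (sketch, with relu checked first to match the removed branch order):

    enum ActTypeSketch { kActNo, kActRelu, kActRelu6 };  // mirrors ActType_No/_Relu/_Relu6
    ActTypeSketch ToActType(bool is_relu, bool is_relu6) {
      if (is_relu) return kActRelu;
      if (is_relu6) return kActRelu6;
      return kActNo;
    }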
 int ConvolutionWinogradCPUKernel::Run() {
   auto prepare_ret = Prepare();
   if (prepare_ret != RET_OK) {
     MS_LOG(ERROR) << "Prepare fail!ret: " << prepare_ret;
     return prepare_ret;
   }
-  // malloc tmp buffer
+
   auto ret = InitTmpBuffer();
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "Init tmp buffer failed.";
     return RET_ERROR;
   }
+
   auto input_tensor = in_tensors_.at(kInputIndex);
   auto ori_input_data = input_tensor->Data();
   PackNHWCToNHWC4Fp32(ori_input_data, nhwc4_input_, conv_param_->input_batch_,
                       conv_param_->input_h_ * conv_param_->input_w_, conv_param_->input_channel_);
 
-  int error_code = LiteBackendParallelLaunch(ConvolutionWinogradImpl, this, thread_count_);
+  int error_code = ParallelLaunch(THREAD_POOL_DEFAULT, ConvolutionWinogradImpl, this, thread_count_);
   if (error_code != RET_OK) {
     MS_LOG(ERROR) << "conv winograd error error_code[" << error_code << "]";
     FreeTmpBuffer();
     return RET_ERROR;
   }
 
-  // get real output
-  auto out_tensor = out_tensors_.front();
-  auto out_data = reinterpret_cast<float *>(out_tensor->Data());
-  if (conv_param_->is_relu_) {
-    UnPackWinogradReluOutput(tmp_out_data_, out_data, conv_param_->output_batch_, conv_param_->output_h_,
-                             conv_param_->output_w_, conv_param_->output_channel_, output_unit_);
-  } else if (conv_param_->is_relu6_) {
-    UnPackWinogradRelu6Output(tmp_out_data_, out_data, conv_param_->output_batch_, conv_param_->output_h_,
-                              conv_param_->output_w_, conv_param_->output_channel_, output_unit_);
-  } else {
-    UnPackWinogradOutput(tmp_out_data_, out_data, conv_param_->output_batch_, conv_param_->output_h_,
-                         conv_param_->output_w_, conv_param_->output_channel_, output_unit_);
+  ret = PostProcess();
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "Post process failed.";
+    return ret;
   }
   FreeTmpBuffer();
   return RET_OK;
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd.h b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd.h
index 44ff046cb5..45e6f3f7b1 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd.h
@@ -38,10 +38,6 @@ class ConvolutionWinogradCPUKernel : public ConvolutionBaseCPUKernel {
       delete trans_weight_;
       trans_weight_ = nullptr;
     }
-    if (trans_input_ != nullptr) {
-      free(trans_input_);
-      trans_input_ = nullptr;
-    }
   };
   int Init() override;
   int ReSize() override;
@@ -51,9 +47,14 @@ class ConvolutionWinogradCPUKernel : public ConvolutionBaseCPUKernel {
   int MallocFilterMatrix(int oc_block, int oc_block_num);
   int InitTmpBuffer();
   int ConfigInputOutput();
+  int PostProcess();
 
  private:
   void FreeTmpBuffer() {
+    if (trans_input_ != nullptr) {
+      ctx_->allocator->Free(trans_input_);
+      trans_input_ = nullptr;
+    }
     if (tmp_data_ != nullptr) {
       ctx_->allocator->Free(tmp_data_);
       tmp_data_ = nullptr;
@@ -66,6 +67,10 @@ class ConvolutionWinogradCPUKernel : public ConvolutionBaseCPUKernel {
       ctx_->allocator->Free(tmp_out_data_);
       tmp_out_data_ = nullptr;
     }
+    if (col_buffer_ != nullptr) {
+      ctx_->allocator->Free(col_buffer_);
+      col_buffer_ = nullptr;
+    }
   }
   int kernel_unit_;
   int input_unit_;
@@ -74,6 +79,7 @@ class ConvolutionWinogradCPUKernel : public ConvolutionBaseCPUKernel {
   float *trans_input_ = nullptr;
   float *gemm_out_ = nullptr;
   float *tmp_out_data_ = nullptr;
+  float *col_buffer_ = nullptr;
   Matrix *trans_weight_ = nullptr;
   InputTransformUnitFunc input_trans_func_;
   OutputTransformUnitFunc output_trans_func_;
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/crop.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/crop.cc
index b8c4bca55f..711db31678 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/crop.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/crop.cc
@@ -30,13 +30,13 @@ using mindspore::schema::PrimitiveType_Crop;
 namespace mindspore::kernel {
 namespace {
-int CropLaunch(int thread_id, LiteParallelGroupEnv *penv, void *cdata) {
+int CropLaunch(void *cdata, int task_id) {
   if (cdata == nullptr) {
     MS_LOG(ERROR) << "Input cdata is nullptr!";
     return RET_NULL_PTR;
   }
   auto kernel = reinterpret_cast<CropCPUKernel *>(cdata);
-  return kernel->CropParallelRun(thread_id);
+  return kernel->CropParallelRun(task_id);
 }
 }  // namespace
@@ -68,7 +68,7 @@ int CropCPUKernel::Run() {
     return RET_OK;
   }
 
-  auto ret = LiteBackendParallelLaunch(CropLaunch, this, param->op_parameter_.thread_num_);
+  auto ret = ParallelLaunch(THREAD_POOL_DEFAULT, CropLaunch, this, param->op_parameter_.thread_num_);
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "Crop launch fail!ret: " << ret;
     return RET_ERROR;
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/deconvolution.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/deconvolution.cc
index 6233c633df..26870046df 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/deconvolution.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/deconvolution.cc
@@ -26,27 +26,17 @@ using mindspore::schema::PrimitiveType_DeConv2D;
 namespace mindspore::kernel {
 DeConvolutionCPUKernel::~DeConvolutionCPUKernel() {
-  FreeTmpBuffer();
   if (matmul_param_ != nullptr) {
     delete matmul_param_;
     matmul_param_ = nullptr;
   }
-}
-
-void DeConvolutionCPUKernel::FreeTmpBuffer() {
   if (weight_ptr_ != nullptr) {
     free(weight_ptr_);
     weight_ptr_ = nullptr;
   }
-  if (pack_input_ != nullptr) {
-    free(pack_input_);
-    pack_input_ = nullptr;
-  }
-  return;
 }
 
 int DeConvolutionCPUKernel::ReSize() {
-  FreeTmpBuffer();
   ConvolutionBaseCPUKernel::Init();
 
   int error_code = InitParam();
@@ -54,36 +44,35 @@ int DeConvolutionCPUKernel::ReSize() {
     MS_LOG(ERROR) << "deconv InitParam error!ret: " << error_code;
     return error_code;
   }
-
-  error_code = InitWeightBias();
-  if (error_code != RET_OK) {
-    MS_LOG(ERROR) << "deconv InitWeightBias error!ret: " << error_code;
-    return error_code;
-  }
   return RET_OK;
 }
 
 int DeConvolutionCPUKernel::InitWeightBias() {
-  bias_data_ = malloc(UP_ROUND(conv_param_->output_channel_, C4NUM) * sizeof(float));
+  auto weight_tensor = in_tensors_.at(kWeightIndex);
+  auto input_channel = weight_tensor->Batch();
+  auto output_channel = weight_tensor->Channel();
+  auto kernel_h_ = weight_tensor->Height();
+  auto kernel_w_ = weight_tensor->Width();
+
+  bias_data_ = malloc(UP_ROUND(output_channel, C4NUM) * sizeof(float));
   if (bias_data_ == nullptr) {
     MS_LOG(ERROR) << "deconv malloc bias_data_ error!";
     return RET_ERROR;
   }
-  memset(bias_data_, 0, UP_ROUND(conv_param_->output_channel_, C4NUM) * sizeof(float));
+  memset(bias_data_, 0, UP_ROUND(output_channel, C4NUM) * sizeof(float));
   if (in_tensors_.size() == 3) {
-    memcpy(bias_data_, in_tensors_[2]->Data(), conv_param_->output_channel_ * sizeof(float));
+    memcpy(bias_data_, in_tensors_[2]->Data(), output_channel * sizeof(float));
   }
 
-  size_t weight_pack_size = conv_param_->input_channel_ * conv_param_->kernel_w_ * conv_param_->kernel_h_ *
-                            UP_ROUND(conv_param_->output_channel_, C8NUM) * sizeof(float);
+  size_t weight_pack_size = input_channel * kernel_w_ * kernel_h_ * UP_ROUND(output_channel, C8NUM) * sizeof(float);
   weight_ptr_ = reinterpret_cast<float *>(malloc(weight_pack_size));
   if (weight_ptr_ == nullptr) {
     MS_LOG(ERROR) << "deconv malloc weight_ptr_ error!";
     return RET_ERROR;
   }
   memset(weight_ptr_, 0, weight_pack_size);
-  PackNHWCToC8HWN8Fp32(reinterpret_cast<float *>(in_tensors_[1]->Data()), weight_ptr_, conv_param_->input_channel_,
-                       kernel_plane_, conv_param_->output_channel_);
+  PackNHWCToC8HWN8Fp32(reinterpret_cast<float *>(in_tensors_[1]->Data()), weight_ptr_, input_channel,
+                       kernel_w_ * kernel_h_, output_channel);
   return RET_OK;
 }
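// Worked example of the weight_pack_size computation above, under the usual nnacl
// macro definitions (assumed here, not quoted from the patch):
// UP_DIV(x, y) = (x + y - 1) / y and UP_ROUND(x, y) = UP_DIV(x, y) * y.
#include <cstdio>

static inline int UpDiv(int x, int y) { return (x + y - 1) / y; }
static inline int UpRound(int x, int y) { return UpDiv(x, y) * y; }

int main() {
  const int C8NUM = 8;
  int input_channel = 3, output_channel = 21, kernel_h = 2, kernel_w = 2;
  // Output channels are rounded up to a multiple of 8 (21 -> 24) so the
  // C8-packed GEMM never touches a partial block.
  size_t bytes = sizeof(float) * input_channel * kernel_w * kernel_h * UpRound(output_channel, C8NUM);
  printf("padded oc = %d, packed weight bytes = %zu\n", UpRound(output_channel, C8NUM), bytes);
  return 0;  // prints: padded oc = 24, packed weight bytes = 1152
}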
@@ -95,21 +84,15 @@ int DeConvolutionCPUKernel::InitParam() {
   matmul_param_->row_ = input_plane_;
   matmul_param_->deep_ = conv_param_->input_channel_;
   matmul_param_->col_ = conv_param_->output_channel_ * kernel_plane_;
-  matmul_param_->row_8_ = UP_ROUND(matmul_param_->row_, C8NUM);
+  matmul_param_->row_12_ = UP_ROUND(matmul_param_->row_, C12NUM);
   matmul_param_->col_8_ = UP_ROUND(conv_param_->output_channel_, C8NUM) * kernel_plane_;
 
   thread_count_ = MSMIN(op_parameter_->thread_num_, UP_DIV(conv_param_->output_channel_, C8NUM));
   thread_stride_ = UP_DIV(UP_DIV(conv_param_->output_channel_, C8NUM), thread_count_);
-
-  pack_input_ = reinterpret_cast<float *>(malloc(matmul_param_->row_8_ *
-                                                 matmul_param_->deep_ * sizeof(float)));
-  if (pack_input_ == nullptr) {
-    MS_LOG(ERROR) << "deconv Malloc pack_input_ error!";
-    return RET_ERROR;
-  }
   return RET_OK;
 }
 
-int DeConvFp32Run(int task_id, LiteParallelGroupEnv *penv, void *cdata) {
+int DeConvFp32Run(void *cdata, int task_id) {
   auto deconv = reinterpret_cast<DeConvolutionCPUKernel *>(cdata);
   auto error_code = deconv->DoDeconv(task_id);
   if (error_code != RET_OK) {
@@ -126,18 +109,23 @@ int DeConvolutionCPUKernel::DoDeconv(int task_id) {
     return RET_OK;
   }
 
-  auto tmp_buffer = tmp_buffer_ + task_id * thread_stride_ * C8NUM * kernel_plane_ * matmul_param_->row_8_;
-  MatMul(pack_input_, weight_ptr_ + task_id * thread_stride_ * C8NUM * kernel_plane_ * matmul_param_->deep_, tmp_buffer,
-         nullptr, ActType_No, matmul_param_->deep_, matmul_param_->row_8_, oc * C8NUM * kernel_plane_,
-         matmul_param_->col_, false);
+  auto tmp_buffer = tmp_buffer_ + task_id * thread_stride_ * C8NUM * kernel_plane_ * matmul_param_->row_12_;
+  MatMulOpt(pack_input_, weight_ptr_ + task_id * thread_stride_ * C8NUM * kernel_plane_ * matmul_param_->deep_,
+            tmp_buffer, nullptr, ActType_No, matmul_param_->deep_, matmul_param_->row_12_, oc * C8NUM * kernel_plane_,
+            matmul_param_->col_, OutType_C8);
 
-  DeConvPostFp32C8x8(tmp_buffer, pack_output_ + task_id * thread_stride_ * C8NUM * output_plane_,
-                     reinterpret_cast<float *>(bias_data_) + thread_stride_ * task_id * C8NUM,
-                     output_ptr_ + task_id * thread_stride_ * C8NUM, oc_res, conv_param_);
+  DeConvPostFp32C12x8(tmp_buffer, pack_output_ + task_id * thread_stride_ * C8NUM * output_plane_,
+                      reinterpret_cast<float *>(bias_data_) + thread_stride_ * task_id * C8NUM,
+                      output_ptr_ + task_id * thread_stride_ * C8NUM, oc_res, conv_param_);
   return RET_OK;
 }
 
 int DeConvolutionCPUKernel::Init() {
+  int error_code = InitWeightBias();
+  if (error_code != RET_OK) {
+    MS_LOG(ERROR) << "deconv InitWeightBias error!ret: " << error_code;
+    return error_code;
+  }
   if (!InferShapeDone()) {
     return RET_OK;
   }
@@ -153,6 +141,10 @@ void DeConvolutionCPUKernel::FreeRunBuf() {
     ctx_->allocator->Free(tmp_buffer_);
     tmp_buffer_ = nullptr;
   }
+  if (pack_input_ != nullptr) {
+    ctx_->allocator->Free(pack_input_);
+    pack_input_ = nullptr;
+  }
   return;
 }
 
@@ -165,11 +157,18 @@ int DeConvolutionCPUKernel::InitRunBuf() {
   }
 
   tmp_buffer_ =
-    reinterpret_cast<float *>(ctx_->allocator->Malloc(matmul_param_->row_8_ * matmul_param_->col_8_ * sizeof(float)));
+    reinterpret_cast<float *>(ctx_->allocator->Malloc(matmul_param_->row_12_ * matmul_param_->col_8_ * sizeof(float)));
   if (tmp_buffer_ == nullptr) {
     MS_LOG(ERROR) << "Conv1x1 Malloc tmp_buffer_ error!";
     return RET_NULL_PTR;
   }
+
+  pack_input_ =
+    reinterpret_cast<float *>(ctx_->allocator->Malloc(matmul_param_->row_12_ * matmul_param_->deep_ * sizeof(float)));
+  if (pack_input_ == nullptr) {
+    MS_LOG(ERROR) << "deconv Malloc pack_input_ error!";
+    return RET_ERROR;
+  }
   return RET_OK;
 }
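// Why row_8_ becomes row_12_: the rewritten fp32 GEMM (MatMulOpt, backed by the new
// MatmulFp32Opt assembly) consumes the left-hand matrix in 12-row tiles, so the packed
// input and the per-thread C8 output tile are sized from UP_ROUND(row, 12). A hedged
// sketch of the buffer arithmetic only; all names here are local to the example.
#include <cstddef>

static inline int UpRoundTo(int x, int y) { return ((x + y - 1) / y) * y; }

void DeconvBufferElems(int row, int deep, int col_8, size_t *pack_input_elems, size_t *tmp_buffer_elems) {
  const int kRowTile = 12;  // C12NUM
  int row_12 = UpRoundTo(row, kRowTile);
  *pack_input_elems = static_cast<size_t>(row_12) * deep;   // destination of RowMajor2Col12Major
  *tmp_buffer_elems = static_cast<size_t>(row_12) * col_8;  // C8-tiled MatMulOpt output
}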
error_code[" << error_code << "]"; return error_code; diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/deconvolution.h b/mindspore/lite/src/runtime/kernel/arm/fp32/deconvolution.h index b0a57a2c6d..3cbfac3869 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/deconvolution.h +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/deconvolution.h @@ -49,7 +49,6 @@ class DeConvolutionCPUKernel : public ConvolutionBaseCPUKernel { void FreeRunBuf(); int InitParam(); int InitWeightBias(); - void FreeTmpBuffer(); private: MatMulParameter *matmul_param_ = nullptr; diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/deconvolution_depthwise.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/deconvolution_depthwise.cc index 7af1563963..10a097a047 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/deconvolution_depthwise.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/deconvolution_depthwise.cc @@ -36,20 +36,6 @@ DeconvolutionDepthwiseCPUKernel::~DeconvolutionDepthwiseCPUKernel() { delete packed_weight_; packed_weight_ = nullptr; } - FreeTmpBuffer(); -} - -void DeconvolutionDepthwiseCPUKernel::FreeTmpBuffer() { - if (need_align_) { - if (packed_input_ != nullptr) { - delete packed_input_; - packed_input_ = nullptr; - } - if (packed_output_ != nullptr) { - delete packed_output_; - packed_output_ = nullptr; - } - } } int DeconvolutionDepthwiseCPUKernel::InitSlideParam() { @@ -100,7 +86,7 @@ int DeconvolutionDepthwiseCPUKernel::InitBuffer() { need_align_ = true; int IC4 = UP_DIV(conv_param_->input_channel_, C4NUM); int pack_input_size = conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * C4NUM * IC4; - packed_input_ = reinterpret_cast(malloc(pack_input_size * sizeof(float))); + packed_input_ = reinterpret_cast(context_->allocator->Malloc(pack_input_size * sizeof(float))); if (packed_input_ == nullptr) { MS_LOG(ERROR) << "Malloc buffer failed."; return RET_ERROR; @@ -108,7 +94,7 @@ int DeconvolutionDepthwiseCPUKernel::InitBuffer() { int OC4 = UP_DIV(conv_param_->output_channel_, C4NUM); int pack_output_size = conv_param_->output_batch_ * conv_param_->output_h_ * conv_param_->output_w_ * C4NUM * OC4; - packed_output_ = reinterpret_cast(malloc(pack_output_size * sizeof(float))); + packed_output_ = reinterpret_cast(context_->allocator->Malloc(pack_output_size * sizeof(float))); if (packed_output_ == nullptr) { MS_LOG(ERROR) << "Malloc buffer failed."; return RET_ERROR; @@ -137,15 +123,8 @@ int DeconvolutionDepthwiseCPUKernel::Init() { } int DeconvolutionDepthwiseCPUKernel::ReSize() { - FreeTmpBuffer(); InitSlideParam(); ConvolutionBaseCPUKernel::Init(); - - auto ret = InitBuffer(); - if (ret != 0) { - MS_LOG(ERROR) << "Deconvolution depthwise fp32 InitBuffer failed.ret: " << ret; - return ret; - } return RET_OK; } @@ -155,7 +134,7 @@ int DeconvolutionDepthwiseCPUKernel::Execute(int task_id) { return RET_OK; } -int DeconvDwRun(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +int DeconvDwRun(void *cdata, int task_id) { auto deconv_dw = reinterpret_cast(cdata); auto ret = deconv_dw->Execute(task_id); if (ret != RET_OK) { @@ -166,15 +145,23 @@ int DeconvDwRun(int task_id, LiteParallelGroupEnv *penv, void *cdata) { } int DeconvolutionDepthwiseCPUKernel::Run() { - auto prepare_ret = Prepare(); - if (prepare_ret != RET_OK) { - MS_LOG(ERROR) << "Prepare fail!ret: " << prepare_ret; - return prepare_ret; - } if (conv_param_->input_channel_ != conv_param_->output_channel_) { MS_LOG(ERROR) << "Only support input channel equals output channel."; return RET_ERROR; } + + auto 
@@ -166,15 +145,23 @@ int DeconvDwRun(void *cdata, int task_id) {
 }
 
 int DeconvolutionDepthwiseCPUKernel::Run() {
-  auto prepare_ret = Prepare();
-  if (prepare_ret != RET_OK) {
-    MS_LOG(ERROR) << "Prepare fail!ret: " << prepare_ret;
-    return prepare_ret;
-  }
   if (conv_param_->input_channel_ != conv_param_->output_channel_) {
     MS_LOG(ERROR) << "Only support input channel equals output channel.";
     return RET_ERROR;
   }
+
+  auto ret = Prepare();
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "Prepare fail!ret: " << ret;
+    return ret;
+  }
+
+  ret = InitBuffer();
+  if (ret != 0) {
+    MS_LOG(ERROR) << "Deconvolution depthwise fp32 InitBuffer failed.ret: " << ret;
+    return ret;
+  }
+
   auto input_tensor = in_tensors_.at(kInputIndex);
   auto input_addr = reinterpret_cast<float *>(input_tensor->Data());
@@ -191,7 +178,7 @@ int DeconvolutionDepthwiseCPUKernel::Run() {
     packed_output_ = output_addr;
   }
 
-  auto ret = LiteBackendParallelLaunch(DeconvDwRun, this, conv_param_->thread_num_);
+  ret = ParallelLaunch(THREAD_POOL_DEFAULT, DeconvDwRun, this, conv_param_->thread_num_);
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "DeconvDwRun error: error_code[" << ret << "]";
     return RET_ERROR;
@@ -200,6 +187,8 @@ int DeconvolutionDepthwiseCPUKernel::Run() {
   if (need_align_) {
     PackNHWC4ToNHWCFp32(packed_output_, output_addr, conv_param_->output_batch_,
                         conv_param_->output_h_ * conv_param_->output_w_, conv_param_->output_channel_);
+    context_->allocator->Free(packed_input_);
+    context_->allocator->Free(packed_output_);
   }
   return RET_OK;
 }
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/deconvolution_depthwise.h b/mindspore/lite/src/runtime/kernel/arm/fp32/deconvolution_depthwise.h
index 17b513d796..b1e1ab9fca 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/deconvolution_depthwise.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/deconvolution_depthwise.h
@@ -41,7 +41,6 @@ class DeconvolutionDepthwiseCPUKernel : public ConvolutionBaseCPUKernel {
   int Execute(int task_id);
 
  private:
-  void FreeTmpBuffer();
   SlidingWindowParam *sliding_ = nullptr;
   float *packed_weight_ = nullptr;
   float *packed_input_ = nullptr;
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/elu.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/elu.cc
index 09636fccfb..bd54b2e2be 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/elu.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/elu.cc
@@ -46,7 +46,7 @@ int EluCPUKernel::DoExcute(int task_id) {
   return RET_OK;
 }
 
-int EluRun(int task_id, LiteParallelGroupEnv *penv, void *cdata) {
+int EluRun(void *cdata, int task_id) {
   auto EluData = reinterpret_cast<EluCPUKernel *>(cdata);
   auto ret = EluData->DoExcute(task_id);
   if (ret != RET_OK) {
@@ -65,7 +65,7 @@ int EluCPUKernel::Run() {
   input_addr = reinterpret_cast<float *>(in_tensors_.front()->Data());
   output_addr = reinterpret_cast<float *>(out_tensors_.front()->Data());
 
-  auto ret = LiteBackendParallelLaunch(EluRun, this, elu_parameter_->thread_num_);
+  auto ret = ParallelLaunch(THREAD_POOL_DEFAULT, EluRun, this, elu_parameter_->thread_num_);
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "Elu error: error_code[" << ret << "]";
     return RET_ERROR;
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/embedding_lookup.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/embedding_lookup.cc
index ee0e316035..ef832f6257 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/embedding_lookup.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/embedding_lookup.cc
@@ -61,7 +61,7 @@ int EmbeddingLookupCPUKernel::DoExcute(int task_id) {
   return RET_OK;
 }
 
-int EmbeddingLookupRun(int task_id, LiteParallelGroupEnv *penv, void *cdata) {
+int EmbeddingLookupRun(void *cdata, int task_id) {
   auto EmbeddingLookupData = reinterpret_cast<EmbeddingLookupCPUKernel *>(cdata);
   auto ret = EmbeddingLookupData->DoExcute(task_id);
   if (ret != RET_OK) {
@@ -102,7 +102,7 @@ int EmbeddingLookupCPUKernel::Run() {
   output_addr_ = reinterpret_cast<float *>(out_tensors_.front()->Data());
   ids_addr_ = reinterpret_cast<int *>(in_tensors_.back()->Data());
-  auto ret = LiteBackendParallelLaunch(EmbeddingLookupRun, this, embedding_lookup_parameter_->thread_num);
+  auto ret = ParallelLaunch(THREAD_POOL_DEFAULT, EmbeddingLookupRun, this, embedding_lookup_parameter_->thread_num);
   context_->allocator->Free(input_addr_);
   context_->allocator->Free(embedding_lookup_parameter_->is_regulated_);
   if (ret != RET_OK) {
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/expandDims.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/expandDims.cc
index d196bc0a51..3a49462bb2 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/expandDims.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/expandDims.cc
@@ -56,7 +56,7 @@ int ExpandDimsCPUKernel::DoExpandDims(int task_id) {
   return RET_OK;
 }
 
-int ExpandDimsRun(int task_id, LiteParallelGroupEnv *penv, void *cdata) {
+int ExpandDimsRun(void *cdata, int task_id) {
   auto g_kernel = reinterpret_cast<ExpandDimsCPUKernel *>(cdata);
   auto ret = g_kernel->DoExpandDims(task_id);
   if (ret != RET_OK) {
@@ -74,7 +74,7 @@ int ExpandDimsCPUKernel::Run() {
   }
   in_ptr_ = reinterpret_cast<float *>(in_tensors_.at(0)->Data());
   out_ptr_ = reinterpret_cast<float *>(out_tensors_.at(0)->Data());
-  auto ret = LiteBackendParallelLaunch(ExpandDimsRun, this, thread_sz_count_);
+  auto ret = ParallelLaunch(THREAD_POOL_DEFAULT, ExpandDimsRun, this, thread_sz_count_);
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "ExpandDimsRun error error_code[" << ret << "]";
     return ret;
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/expandDims.h b/mindspore/lite/src/runtime/kernel/arm/fp32/expandDims.h
index 85f1cddeb4..eb545601d4 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/expandDims.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/expandDims.h
@@ -32,7 +32,7 @@ class ExpandDimsCPUKernel : public LiteKernel {
   ExpandDimsCPUKernel(OpParameter *parameter, const std::vector<lite::tensor::Tensor *> &inputs,
                       const std::vector<lite::tensor::Tensor *> &outputs, const lite::Context *ctx,
                       const mindspore::lite::PrimitiveC *primitive)
-      : LiteKernel(parameter, inputs, outputs, ctx, primitive), ctx_(ctx), thread_count_(ctx->thread_num_) {}
+      : LiteKernel(parameter, inputs, outputs, ctx, primitive), thread_count_(ctx->thread_num_) {}
   ~ExpandDimsCPUKernel() override = default;
 
   int Init() override;
@@ -46,7 +46,6 @@ class ExpandDimsCPUKernel : public LiteKernel {
   size_t data_size_;
   float *in_ptr_;
   float *out_ptr_;
-  const Context *ctx_;
   int thread_count_;
 };
 }  // namespace mindspore::kernel
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/fill.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/fill.cc
index 359ecf7d2a..3ae36bf99d 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/fill.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/fill.cc
@@ -28,12 +28,6 @@ using mindspore::lite::RET_OK;
 using mindspore::schema::PrimitiveType_Fill;
 
 namespace mindspore::kernel {
-
-namespace {
-constexpr int kInputNum = 1;
-constexpr int kOutputNum = 1;
-}  // namespace
-
 int FillCPUKernel::Init() {
   if (!InferShapeDone()) {
     return RET_OK;
   }
@@ -62,7 +56,7 @@ int FillCPUKernel::DoFill(int task_id) {
   return RET_OK;
 }
 
-int FillRun(int task_id, LiteParallelGroupEnv *penv, void *cdata) {
+int FillRun(void *cdata, int task_id) {
   auto g_kernel = reinterpret_cast<FillCPUKernel *>(cdata);
   auto ret = g_kernel->DoFill(task_id);
   if (ret != RET_OK) {
@@ -83,7 +77,7 @@ int FillCPUKernel::Run() {
   auto fill_data = reinterpret_cast<float *>(fillData->Data());
   src_data_ = fill_data[0];
   out_ptr_ = reinterpret_cast<float *>(output->Data());
-  auto ret = LiteBackendParallelLaunch(FillRun, this, thread_sz_count_);
+  auto ret = ParallelLaunch(THREAD_POOL_DEFAULT, FillRun, this, thread_sz_count_);
   if (ret != RET_OK) {
MS_LOG(ERROR) << "FillRun error error_code[" << ret << "]"; return ret; diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/fill.h b/mindspore/lite/src/runtime/kernel/arm/fp32/fill.h index bc73bbbbbd..b92948a453 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/fill.h +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/fill.h @@ -30,7 +30,7 @@ class FillCPUKernel : public LiteKernel { FillCPUKernel(OpParameter *parameter, const std::vector &inputs, const std::vector &outputs, const lite::Context *ctx, const mindspore::lite::PrimitiveC *primitive) - : LiteKernel(parameter, inputs, outputs, ctx, primitive), ctx_(ctx), thread_count_(ctx->thread_num_) {} + : LiteKernel(parameter, inputs, outputs, ctx, primitive), thread_count_(ctx->thread_num_) {} ~FillCPUKernel() override = default; int Init() override; @@ -44,7 +44,6 @@ class FillCPUKernel : public LiteKernel { int data_size_; float src_data_; float *out_ptr_; - const Context *ctx_; int thread_count_; }; } // namespace mindspore::kernel diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/fullconnection.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/fullconnection.cc index be66970d26..226f609a98 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/fullconnection.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/fullconnection.cc @@ -27,18 +27,14 @@ FullconnectionCPUKernel::~FullconnectionCPUKernel() { } void FullconnectionCPUKernel::FreeBuf() { - if (a_c8_ptr_ != nullptr) { - free(a_c8_ptr_); - a_c8_ptr_ = nullptr; + if (a_c12_ptr_ != nullptr) { + free(a_c12_ptr_); + a_c12_ptr_ = nullptr; } if (b_r8_ptr_ != nullptr) { free(b_r8_ptr_); b_r8_ptr_ = nullptr; } - if (c_r8x8_ptr_ != nullptr) { - free(c_r8x8_ptr_); - c_r8x8_ptr_ = nullptr; - } if (bias_ptr_ != nullptr) { free(bias_ptr_); bias_ptr_ = nullptr; @@ -51,8 +47,8 @@ int FullconnectionCPUKernel::ReSize() { fc_param_->col_ = (in_tensors_[1]->shape())[0]; fc_param_->deep_ = (in_tensors_[1]->shape())[1]; - fc_param_->row_8_ = UP_ROUND(fc_param_->row_, 8); - fc_param_->col_8_ = UP_ROUND(fc_param_->col_, 8); + fc_param_->row_12_ = UP_ROUND(fc_param_->row_, C12NUM); + fc_param_->col_8_ = UP_ROUND(fc_param_->col_, C8NUM); thread_count_ = MSMIN(thread_count_, UP_DIV(fc_param_->col_8_, 8)); thread_stride_ = UP_DIV(UP_DIV(fc_param_->col_8_, 8), thread_count_); @@ -63,11 +59,11 @@ int FullconnectionCPUKernel::ReSize() { memcpy(bias_ptr_, in_tensors_[2]->Data(), fc_param_->col_ * sizeof(float)); } - a_c8_ptr_ = reinterpret_cast(malloc(fc_param_->row_8_ * fc_param_->deep_ * sizeof(float))); - if (a_c8_ptr_ == nullptr) { + a_c12_ptr_ = reinterpret_cast(malloc(fc_param_->row_12_ * fc_param_->deep_ * sizeof(float))); + if (a_c12_ptr_ == nullptr) { return RET_MEMORY_FAILED; } - memset(a_c8_ptr_, 0, fc_param_->row_8_ * fc_param_->deep_ * sizeof(float)); + memset(a_c12_ptr_, 0, fc_param_->row_12_ * fc_param_->deep_ * sizeof(float)); b_r8_ptr_ = reinterpret_cast(malloc(fc_param_->col_8_ * fc_param_->deep_ * sizeof(float))); if (b_r8_ptr_ == nullptr) { @@ -76,17 +72,10 @@ int FullconnectionCPUKernel::ReSize() { } memset(b_r8_ptr_, 0, fc_param_->col_8_ * fc_param_->deep_ * sizeof(float)); - c_r8x8_ptr_ = reinterpret_cast(malloc(fc_param_->row_8_ * fc_param_->col_8_ * sizeof(float))); - if (c_r8x8_ptr_ == nullptr) { - FreeBuf(); - return RET_MEMORY_FAILED; - } - memset(c_r8x8_ptr_, 0, fc_param_->row_8_ * fc_param_->col_8_ * sizeof(float)); - - fc_param_->a_const_ = false; - fc_param_->b_const_ = false; - InitMatrixA(reinterpret_cast(in_tensors_[0]->Data()), a_c8_ptr_); - 
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/fullconnection.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/fullconnection.cc
index be66970d26..226f609a98 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/fullconnection.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/fullconnection.cc
@@ -27,18 +27,14 @@ FullconnectionCPUKernel::~FullconnectionCPUKernel() {
 }
 
 void FullconnectionCPUKernel::FreeBuf() {
-  if (a_c8_ptr_ != nullptr) {
-    free(a_c8_ptr_);
-    a_c8_ptr_ = nullptr;
+  if (a_c12_ptr_ != nullptr) {
+    free(a_c12_ptr_);
+    a_c12_ptr_ = nullptr;
   }
   if (b_r8_ptr_ != nullptr) {
     free(b_r8_ptr_);
     b_r8_ptr_ = nullptr;
   }
-  if (c_r8x8_ptr_ != nullptr) {
-    free(c_r8x8_ptr_);
-    c_r8x8_ptr_ = nullptr;
-  }
   if (bias_ptr_ != nullptr) {
     free(bias_ptr_);
     bias_ptr_ = nullptr;
@@ -51,8 +47,8 @@ int FullconnectionCPUKernel::ReSize() {
   fc_param_->col_ = (in_tensors_[1]->shape())[0];
   fc_param_->deep_ = (in_tensors_[1]->shape())[1];
 
-  fc_param_->row_8_ = UP_ROUND(fc_param_->row_, 8);
-  fc_param_->col_8_ = UP_ROUND(fc_param_->col_, 8);
+  fc_param_->row_12_ = UP_ROUND(fc_param_->row_, C12NUM);
+  fc_param_->col_8_ = UP_ROUND(fc_param_->col_, C8NUM);
 
   thread_count_ = MSMIN(thread_count_, UP_DIV(fc_param_->col_8_, 8));
   thread_stride_ = UP_DIV(UP_DIV(fc_param_->col_8_, 8), thread_count_);
@@ -63,11 +59,11 @@ int FullconnectionCPUKernel::ReSize() {
     memcpy(bias_ptr_, in_tensors_[2]->Data(), fc_param_->col_ * sizeof(float));
   }
 
-  a_c8_ptr_ = reinterpret_cast<float *>(malloc(fc_param_->row_8_ * fc_param_->deep_ * sizeof(float)));
-  if (a_c8_ptr_ == nullptr) {
+  a_c12_ptr_ = reinterpret_cast<float *>(malloc(fc_param_->row_12_ * fc_param_->deep_ * sizeof(float)));
+  if (a_c12_ptr_ == nullptr) {
     return RET_MEMORY_FAILED;
   }
-  memset(a_c8_ptr_, 0, fc_param_->row_8_ * fc_param_->deep_ * sizeof(float));
+  memset(a_c12_ptr_, 0, fc_param_->row_12_ * fc_param_->deep_ * sizeof(float));
 
   b_r8_ptr_ = reinterpret_cast<float *>(malloc(fc_param_->col_8_ * fc_param_->deep_ * sizeof(float)));
   if (b_r8_ptr_ == nullptr) {
@@ -76,17 +72,10 @@ int FullconnectionCPUKernel::ReSize() {
   }
   memset(b_r8_ptr_, 0, fc_param_->col_8_ * fc_param_->deep_ * sizeof(float));
 
-  c_r8x8_ptr_ = reinterpret_cast<float *>(malloc(fc_param_->row_8_ * fc_param_->col_8_ * sizeof(float)));
-  if (c_r8x8_ptr_ == nullptr) {
-    FreeBuf();
-    return RET_MEMORY_FAILED;
-  }
-  memset(c_r8x8_ptr_, 0, fc_param_->row_8_ * fc_param_->col_8_ * sizeof(float));
-
-  fc_param_->a_const_ = false;
-  fc_param_->b_const_ = false;
-  InitMatrixA(reinterpret_cast<float *>(in_tensors_[0]->Data()), a_c8_ptr_);
-  InitMatrixB(reinterpret_cast<float *>(in_tensors_[1]->Data()), b_r8_ptr_);
+  fc_param_->a_const_ = (in_tensors_[0]->Data() != nullptr);
+  fc_param_->b_const_ = (in_tensors_[1]->Data() != nullptr);
+  if (fc_param_->a_const_) InitMatrixA(reinterpret_cast<float *>(in_tensors_[0]->Data()), a_c12_ptr_);
+  if (fc_param_->b_const_) InitMatrixB(reinterpret_cast<float *>(in_tensors_[1]->Data()), b_r8_ptr_);
 
   return RET_OK;
 }
@@ -98,30 +87,14 @@ int FullconnectionCPUKernel::Init() {
 }
 
 void FullconnectionCPUKernel::InitMatrixA(float *src_ptr, float *dst_ptr) {
-  if (fc_param_->a_const_ == true) {
-    return;
-  }
-  if (src_ptr == nullptr) {
-    return;
-  }
-  fc_param_->a_const_ = true;
-  RowMajor2Col8Major(src_ptr, a_c8_ptr_, fc_param_->row_, fc_param_->deep_);
-  return;
+  RowMajor2Col12Major(src_ptr, a_c12_ptr_, fc_param_->row_, fc_param_->deep_);
 }
 
 void FullconnectionCPUKernel::InitMatrixB(float *src_ptr, float *dst_ptr) {
-  if (fc_param_->b_const_ == true) {
-    return;
-  }
-  if (src_ptr == nullptr) {
-    return;
-  }
-  fc_param_->b_const_ = true;
   RowMajor2Col8Major(src_ptr, dst_ptr, fc_param_->col_, fc_param_->deep_);
-  return;
 }
 
-int FcFp32MatmulRun(int task_id, LiteParallelGroupEnv *penv, void *cdata) {
+int FcFp32MatmulRun(void *cdata, int task_id) {
   auto fc = reinterpret_cast<FullconnectionCPUKernel *>(cdata);
   auto error_code = fc->DoMatmul(task_id);
   if (error_code != RET_OK) {
@@ -132,15 +105,14 @@ int FcFp32MatmulRun(void *cdata, int task_id) {
 }
 
 int FullconnectionCPUKernel::DoMatmul(int task_id) {
-  int cur_oc = MSMIN(thread_stride_, UP_DIV(fc_param_->col_8_, 8) - task_id * thread_stride_);
+  int cur_oc = MSMIN(thread_stride_ * C8NUM, fc_param_->col_ - task_id * thread_stride_ * C8NUM);
   if (cur_oc <= 0) {
     return RET_OK;
   }
 
-  MatMul(a_c8_ptr_, b_r8_ptr_ + task_id * thread_stride_ * C8NUM * fc_param_->deep_,
-         c_r8x8_ptr_ + task_id * thread_stride_ * C8NUM * fc_param_->row_8_,
-         bias_ptr_ + task_id * thread_stride_ * C8NUM, fc_param_->act_type_, fc_param_->deep_, fc_param_->row_8_,
-         cur_oc * 8, 0, false);
+  MatMulOpt(a_c12_ptr_, b_r8_ptr_ + task_id * thread_stride_ * C8NUM * fc_param_->deep_,
+            c_r_ptr + task_id * thread_stride_ * C8NUM, bias_ptr_ + task_id * thread_stride_ * C8NUM,
+            fc_param_->act_type_, fc_param_->deep_, fc_param_->row_, cur_oc, fc_param_->col_, OutType_Nhwc);
   return RET_OK;
 }
 
@@ -152,14 +124,13 @@ int FullconnectionCPUKernel::Run() {
   }
   auto a_ptr = reinterpret_cast<float *>(in_tensors_.at(0)->Data());
   auto b_ptr = reinterpret_cast<float *>(in_tensors_.at(1)->Data());
-  auto output_ptr = reinterpret_cast<float *>(out_tensors_.at(0)->Data());
+  c_r_ptr = reinterpret_cast<float *>(out_tensors_.at(0)->Data());
 
-  InitMatrixA(a_ptr, a_c8_ptr_);
-  InitMatrixB(b_ptr, b_r8_ptr_);
+  if (!fc_param_->a_const_) InitMatrixA(a_ptr, a_c12_ptr_);
+  if (!fc_param_->b_const_) InitMatrixB(b_ptr, b_r8_ptr_);
 
-  LiteBackendParallelLaunch(FcFp32MatmulRun, this, thread_count_);
+  ParallelLaunch(THREAD_POOL_DEFAULT, FcFp32MatmulRun, this, thread_count_);
 
-  Row8x8Major2RowMajor(c_r8x8_ptr_, output_ptr, fc_param_->row_, fc_param_->col_, fc_param_->col_);
   return RET_OK;
 }
 }  // namespace mindspore::kernel
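// Sketch of the constant-operand caching the fullconnection kernel switches to:
// a tensor whose data is already resident at ReSize() time (typically weights) is
// packed once there; anything else is packed on each Run(). PackedOperand and the
// function-pointer plumbing are stand-ins for the a_const_/b_const_ flags above.
struct PackedOperand {
  bool is_const = false;
  float *packed = nullptr;  // assumed preallocated to the padded size
};

using PackFn = void (*)(const float *src, float *dst);

// ReSize() side: pack only when the source data already exists.
void PrepareOperand(PackedOperand *op, const float *data, PackFn pack) {
  op->is_const = (data != nullptr);
  if (op->is_const) pack(data, op->packed);
}

// Run() side: pack whatever was not cached earlier.
void EnsurePacked(PackedOperand *op, const float *data, PackFn pack) {
  if (!op->is_const) pack(data, op->packed);
}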
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/fullconnection.h b/mindspore/lite/src/runtime/kernel/arm/fp32/fullconnection.h
index 5ad3ac994d..50bb60277f 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/fullconnection.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/fullconnection.h
@@ -47,9 +47,9 @@ class FullconnectionCPUKernel : public FullconnectionBaseCPUKernel {
   void InitMatrixB(float *src_ptr, float *dst_ptr);
 
  private:
-  float *a_c8_ptr_ = nullptr;
+  float *a_c12_ptr_ = nullptr;
   float *b_r8_ptr_ = nullptr;
-  float *c_r8x8_ptr_ = nullptr;
+  float *c_r_ptr = nullptr;
   float *bias_ptr_ = nullptr;
 };
 }  // namespace mindspore::kernel
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/gather.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/gather.cc
index 66b862e6b4..52879bdc44 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/gather.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/gather.cc
@@ -13,9 +13,10 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
-#include <vector>
 #include "src/runtime/kernel/arm/fp32/gather.h"
+#include <vector>
+#include "nnacl/gather_parameter.h"
+#include "nnacl/fp32/gather.h"
 #include "schema/model_generated.h"
 #include "src/kernel_registry.h"
 #include "src/runtime/runtime_api.h"
@@ -30,14 +31,19 @@ using mindspore::schema::PrimitiveType_Gather;
 namespace mindspore::kernel {
 
 int GatherCPUKernel::Init() {
-  axis_ = (reinterpret_cast<GatherParameter *>(op_parameter_))->axis_;
-  batchDims_ = (reinterpret_cast<GatherParameter *>(op_parameter_))->batchDims_;
   if (!InferShapeDone()) {
     return RET_OK;
   }
   return ReSize();
 }
 
+GatherCPUKernel::~GatherCPUKernel() {
+  if (indices_data_ != nullptr) {
+    free(indices_data_);
+    indices_data_ = nullptr;
+  }
+}
+
 int GatherCPUKernel::ReSize() { return RET_OK; }
 
 int GatherCPUKernel::DoGather(int task_id) {
@@ -46,7 +52,6 @@ int GatherCPUKernel::DoGather(int task_id) {
   auto out_tensor = out_tensors_.at(0);
 
   auto input_ptr = reinterpret_cast<float *>(input_tensor->Data());
-  auto indices_ptr = reinterpret_cast<int32_t *>(indices_tensor->Data());
   auto output_ptr = reinterpret_cast<float *>(out_tensor->Data());
 
   auto input_int32 = reinterpret_cast<int32_t *>(input_tensor->Data());
@@ -55,26 +60,18 @@ int GatherCPUKernel::DoGather(int task_id) {
   auto in_shape = input_tensor->shape();
   int in_rank = in_shape.size();
   int indices_element_size = indices_tensor->ElementsNum();
+  auto axis = (reinterpret_cast<GatherParameter *>(op_parameter_))->axis_;
 
-  const int limit = in_shape[axis_];
-  for (int i = 0; i < indices_element_size; ++i) {
-    if (indices_ptr[i] >= limit) {
-      MS_LOG(ERROR) << " indice data: " << indices_ptr[i] << " is not in [ 0, " << limit - 1 << " ]";
-      return RET_ERROR;
-    }
-  }
+  const int limit = in_shape[axis];
 
-  int outer_size = 1;
-  for (int i = 0; i < axis_; ++i) {
+  int outer_size = 1, inner_size = 1;
+  for (int i = 0; i < axis; ++i) {
     outer_size *= in_shape[i];
   }
-
-  int inner_size = 1;
-  for (int i = axis_ + 1; i < in_rank; ++i) {
+  for (int i = axis + 1; i < in_rank; ++i) {
     inner_size *= in_shape[i];
   }
-
-  int stride = UP_DIV(outer_size, thread_count_);
+  int stride = UP_DIV(outer_size, op_parameter_->thread_num_);
   int count = MSMIN(stride, outer_size - stride * task_id);
   auto thread_stride = stride * task_id;
@@ -82,27 +79,22 @@ int GatherCPUKernel::DoGather(int task_id) {
   if (input_tensor->data_type() == kNumberTypeInt32) {
     input_int32 += thread_stride * limit;
     output_int32 += thread_stride * indices_element_size;
-    error_code = GatherInt32(input_int32, count, inner_size, limit, indices_ptr, indices_element_size, output_int32);
+    error_code = GatherInt32(input_int32, count, inner_size, limit, indices_data_, indices_element_size, output_int32);
   } else {
     input_ptr += thread_stride * limit;
     output_ptr += thread_stride * indices_element_size;
-    error_code = Gather(input_ptr, count, inner_size, limit, indices_ptr, indices_element_size, output_ptr);
+    error_code = Gather(input_ptr, count, inner_size, limit, indices_data_, indices_element_size, output_ptr);
   }
-
-  if (error_code != RET_OK) {
-    return RET_ERROR;
-  }
-  return RET_OK;
+  return error_code;
 }
 
-int GatherRun(int task_id, LiteParallelGroupEnv *penv, void *cdata) {
+int GatherRun(void *cdata, int task_id) {
   auto gather_kernel = reinterpret_cast<GatherCPUKernel *>(cdata);
   auto error_code = gather_kernel->DoGather(task_id);
   if (error_code != RET_OK) {
     MS_LOG(ERROR) << "GatherRun error task_id[" << task_id << "] error_code[" << error_code << "]";
-    return RET_ERROR;
   }
-  return RET_OK;
+  return error_code;
 }
 
 int GatherCPUKernel::Run() {
@@ -111,12 +103,30 @@ int GatherCPUKernel::Run() {
     MS_LOG(ERROR) << "Prepare fail!ret: " << prepare_ret;
     return prepare_ret;
   }
-  int error_code = LiteBackendParallelLaunch(GatherRun, this, thread_count_);
+
+  auto indices_tensor = in_tensors_.at(1);
+  indices_data_ = reinterpret_cast<int *>(malloc(indices_tensor->Size()));
+  if (indices_data_ == nullptr) {
+    MS_LOG(ERROR) << "Memory allocation failed";
+    return RET_ERROR;
+  }
+  auto in_shape = in_tensors_.at(0)->shape();
+  int indices_element_size = indices_tensor->ElementsNum();
+  auto axis = (reinterpret_cast<GatherParameter *>(op_parameter_))->axis_;;
+  auto indices_ptr = reinterpret_cast<int32_t *>(indices_tensor->Data());
+  const int limit = in_shape[axis];
+  for (int i = 0; i < indices_element_size; ++i) {
+    indices_data_[i] = static_cast<int>(indices_ptr[i]);
+    if (indices_data_[i] >= limit) {
+      MS_LOG(ERROR) << " indice data: " << indices_data_[i] << " is not in [ 0, " << limit - 1 << " ]";
+      return RET_ERROR;
+    }
+  }
+
+  int error_code = ParallelLaunch(THREAD_POOL_DEFAULT, GatherRun, this, op_parameter_->thread_num_);
   if (error_code != RET_OK) {
     MS_LOG(ERROR) << "Gather function error error_code[" << error_code << "]";
-    return RET_ERROR;
   }
-  return RET_OK;
+  return error_code;
 }
 
 kernel::LiteKernel *CpuGatherFp32KernelCreator(const std::vector<lite::tensor::Tensor *> &inputs,
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/gather.h b/mindspore/lite/src/runtime/kernel/arm/fp32/gather.h
index 65df4f510c..334d93a648 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/gather.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/gather.h
@@ -18,7 +18,7 @@
 #define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_GATHER_H_
 
 #include <vector>
-#include "nnacl/fp32/gather.h"
+#include "nnacl/gather_parameter.h"
 #include "src/lite_kernel.h"
 
 namespace mindspore::kernel {
@@ -27,8 +27,8 @@ class GatherCPUKernel : public LiteKernel {
   GatherCPUKernel(OpParameter *parameter, const std::vector<lite::tensor::Tensor *> &inputs,
                   const std::vector<lite::tensor::Tensor *> &outputs, const lite::Context *ctx,
                   const mindspore::lite::PrimitiveC *primitive)
-      : LiteKernel(parameter, inputs, outputs, ctx, primitive), thread_count_(ctx->thread_num_) {}
-  ~GatherCPUKernel() override = default;
+      : LiteKernel(parameter, inputs, outputs, ctx, primitive) {}
+  ~GatherCPUKernel() override;
 
   int Init() override;
   int ReSize() override;
@@ -36,9 +36,7 @@ class GatherCPUKernel : public LiteKernel {
   int DoGather(int task_id);
 
  private:
-  int thread_count_;
-  int batchDims_;
-  int axis_;
+  int *indices_data_ = nullptr;
 };
 }  // namespace mindspore::kernel
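// The gather kernel now copies the indices into its own int buffer once per Run()
// and bounds-checks them there, so the per-task DoGather no longer revalidates.
// A compact sketch of that step; unlike the diff, this version also rejects
// negative indices explicitly.
#include <cstdint>

bool ConvertAndCheckIndices(const int32_t *src, int *dst, int count, int limit) {
  for (int i = 0; i < count; ++i) {
    dst[i] = static_cast<int>(src[i]);
    if (dst[i] < 0 || dst[i] >= limit) {
      return false;  // the kernel logs the offending value and returns RET_ERROR
    }
  }
  return true;
}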
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/gatherNd.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/gatherNd.cc
index 518d74589e..961178e734 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/gatherNd.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/gatherNd.cc
@@ -105,7 +105,7 @@ int GatherNdCPUKernel::DoGatherNd(int task_id) {
   return RET_OK;
 }
 
-int GatherNdRun(int task_id, LiteParallelGroupEnv *penv, void *cdata) {
+int GatherNdRun(void *cdata, int task_id) {
   auto g_kernel = reinterpret_cast<GatherNdCPUKernel *>(cdata);
   auto ret = g_kernel->DoGatherNd(task_id);
   if (ret != RET_OK) {
@@ -123,7 +123,7 @@ int GatherNdCPUKernel::Run() {
   }
   in_ptr_ = reinterpret_cast<float *>(in_tensors_.front()->Data());
   out_ptr_ = reinterpret_cast<float *>(out_tensors_.front()->Data());
-  auto ret = LiteBackendParallelLaunch(GatherNdRun, this, thread_sz_count_);
+  auto ret = ParallelLaunch(THREAD_POOL_DEFAULT, GatherNdRun, this, thread_sz_count_);
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "gatherNd error error_code[" << ret << "]";
     return ret;
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/gatherNd.h b/mindspore/lite/src/runtime/kernel/arm/fp32/gatherNd.h
index 4b8f016d04..22261d1493 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/gatherNd.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/gatherNd.h
@@ -32,7 +32,7 @@ class GatherNdCPUKernel : public LiteKernel {
   GatherNdCPUKernel(OpParameter *parameter, const std::vector<lite::tensor::Tensor *> &inputs,
                     const std::vector<lite::tensor::Tensor *> &outputs, const Context *ctx,
                     const mindspore::lite::PrimitiveC *primitive)
-      : LiteKernel(parameter, inputs, outputs, ctx, primitive), ctx_(ctx), thread_count_(ctx->thread_num_) {}
+      : LiteKernel(parameter, inputs, outputs, ctx, primitive), thread_count_(ctx->thread_num_) {}
   ~GatherNdCPUKernel() override;
 
   int Init() override;
@@ -48,7 +48,6 @@ class GatherNdCPUKernel : public LiteKernel {
   int *in_offset_ = nullptr;
   float *in_ptr_;
   float *out_ptr_;
-  const Context *ctx_;
   int thread_count_;
 };
 }  // namespace mindspore::kernel
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/leaky_relu.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/leaky_relu.cc
index df623ff0e3..ab8b01b598 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/leaky_relu.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/leaky_relu.cc
@@ -26,11 +26,10 @@ using mindspore::kernel::KERNEL_ARCH::kCPU;
 using mindspore::lite::KernelRegistrar;
 using mindspore::lite::RET_ERROR;
 using mindspore::lite::RET_OK;
 using mindspore::schema::PrimitiveType_LeakyReLU;
-using mindspore::schema::PrimitiveType_Prelu;
 
 namespace mindspore::kernel {
 namespace {
-int LeakyReluRun(int task_id, LiteParallelGroupEnv *penv, void *cdata) {
+int LeakyReluRun(void *cdata, int task_id) {
   auto kernel_relu = reinterpret_cast<LeakyReluCPUKernel *>(cdata);
   auto ret = kernel_relu->DoExcute(task_id);
   if (ret != RET_OK) {
@@ -66,7 +65,7 @@ int LeakyReluCPUKernel::Run() {
   input_data = reinterpret_cast<float *>(input->Data());
   output_data = reinterpret_cast<float *>(out_tensors_.at(0)->Data());
 
-  auto ret = LiteBackendParallelLaunch(LeakyReluRun, this, context_->thread_num_);
+  auto ret = ParallelLaunch(THREAD_POOL_DEFAULT, LeakyReluRun, this, context_->thread_num_);
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "PReluDwRun error: error_code[" << ret << "]";
     return RET_ERROR;
@@ -100,5 +99,4 @@ kernel::LiteKernel *CpuLeakyReluFp32KernelCreator(const std::vector
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/local_response_norm.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/local_response_norm.cc
-int LocalResponseNormRun(int task_id, LiteParallelGroupEnv *penv, void *cdata) {
+int LocalResponseNormRun(void *cdata, int task_id) {
   auto lrn = reinterpret_cast<LocalResponseNormCPUKernel *>(cdata);
   auto error_code = lrn->DoLocalResponseNorm(task_id);
   if (error_code != RET_OK) {
@@ -79,7 +79,7 @@ int LocalResponseNormCPUKernel::Run() {
     MS_LOG(ERROR) << "Prepare fail!ret: " << prepare_ret;
     return prepare_ret;
   }
-  int error_code = LiteBackendParallelLaunch(LocalResponseNormRun, this, thread_count_);
+  int error_code = ParallelLaunch(THREAD_POOL_DEFAULT, LocalResponseNormRun, this, thread_count_);
   if (error_code != RET_OK) {
     MS_LOG(ERROR) << "LocalResponseNorm function error error_code[" << error_code << "]";
     return RET_ERROR;
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/matmul.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/matmul.cc
index dbec2fb6b3..61392a80a4 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/matmul.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/matmul.cc
@@ -28,20 +28,16 @@
 namespace mindspore::kernel {
 MatmulCPUKernel::~MatmulCPUKernel() { FreeTmpBuffer(); }
 
 void MatmulCPUKernel::FreeTmpBuffer() {
-  if (a_c8_ptr_ != nullptr) {
-    ctx_->allocator->Free(a_c8_ptr_);
-    a_c8_ptr_ = nullptr;
+  if (a_c12_ptr_ != nullptr) {
+    free(a_c12_ptr_);
+    a_c12_ptr_ = nullptr;
   }
   if (b_r8_ptr_ != nullptr) {
-    ctx_->allocator->Free(b_r8_ptr_);
+    free(b_r8_ptr_);
     b_r8_ptr_ = nullptr;
   }
-  if (c_r8x8_ptr_ != nullptr) {
-    ctx_->allocator->Free(c_r8x8_ptr_);
-    c_r8x8_ptr_ = nullptr;
-  }
   if (bias_ptr_ != nullptr) {
-    ctx_->allocator->Free(bias_ptr_);
+    free(bias_ptr_);
     bias_ptr_ = nullptr;
   }
 }
@@ -66,80 +62,69 @@ int MatmulCPUKernel::ReSize() {
   params_->row_ = c_shape[c_shape.size() - 2];
   params_->col_ = c_shape[c_shape.size() - 1];
   params_->deep_ = params_->a_transpose_ ? a_shape[a_shape.size() - 2] : a_shape[a_shape.size() - 1];
-  params_->row_8_ = UP_ROUND(params_->row_, 8);
+  params_->row_12_ = UP_ROUND(params_->row_, C12NUM);
   params_->col_8_ = UP_ROUND(params_->col_, 8);
 
   thread_count_ = MSMIN(thread_count_, UP_DIV(params_->col_8_, 8));
   thread_stride_ = UP_DIV(UP_DIV(params_->col_8_, 8), thread_count_);
 
-  a_c8_ptr_ = reinterpret_cast<float *>(ctx_->allocator->Malloc(params_->row_8_ * params_->deep_ * sizeof(float)));
-  if (a_c8_ptr_ == nullptr) {
+  a_c12_ptr_ = reinterpret_cast<float *>(malloc(params_->batch * params_->row_12_ * params_->deep_ * sizeof(float)));
+  if (a_c12_ptr_ == nullptr) {
     FreeTmpBuffer();
     return RET_MEMORY_FAILED;
   }
-  memset(a_c8_ptr_, 0, params_->row_8_ * params_->deep_ * sizeof(float));
-  b_r8_ptr_ = reinterpret_cast<float *>(ctx_->allocator->Malloc(params_->col_8_ * params_->deep_ * sizeof(float)));
+  memset(a_c12_ptr_, 0, params_->row_12_ * params_->deep_ * sizeof(float));
+
+  b_r8_ptr_ = reinterpret_cast<float *>(malloc(params_->batch * params_->col_8_ * params_->deep_ * sizeof(float)));
   if (b_r8_ptr_ == nullptr) {
     FreeTmpBuffer();
     return RET_MEMORY_FAILED;
   }
   memset(b_r8_ptr_, 0, params_->col_8_ * params_->deep_ * sizeof(float));
-  c_r8x8_ptr_ = reinterpret_cast<float *>(ctx_->allocator->Malloc(params_->row_8_ * params_->col_8_ * sizeof(float)));
-  if (c_r8x8_ptr_ == nullptr) {
+
+  params_->a_const_ = (in_tensors_[0]->Data() != nullptr);
+  params_->b_const_ = (in_tensors_[1]->Data() != nullptr);
+  if (params_->a_const_ == true) {
+    InitMatrixA(reinterpret_cast<float *>(in_tensors_[0]->Data()), a_c12_ptr_);
+  }
+  if (params_->b_const_ == true) {
+    InitMatrixB(reinterpret_cast<float *>(in_tensors_[1]->Data()), b_r8_ptr_);
+  }
+
+  bias_ptr_ = reinterpret_cast<float *>(malloc(params_->col_8_ * sizeof(float)));
+  if (bias_ptr_ == nullptr) {
     FreeTmpBuffer();
     return RET_MEMORY_FAILED;
   }
-  memset(c_r8x8_ptr_, 0, params_->row_8_ * params_->col_8_ * sizeof(float));
-
-  params_->a_const_ = false;
-  params_->b_const_ = false;
-  InitMatrixA(reinterpret_cast<float *>(in_tensors_[0]->Data()), a_c8_ptr_);
-  InitMatrixB(reinterpret_cast<float *>(in_tensors_[1]->Data()), b_r8_ptr_);
-
+  memset(bias_ptr_, 0, params_->col_8_ * sizeof(float));
   if (in_tensors_.size() == 3) {
-    bias_ptr_ = reinterpret_cast<float *>(malloc(params_->col_8_ * sizeof(float)));
-    if (bias_ptr_ == nullptr) {
-      FreeTmpBuffer();
-      return RET_MEMORY_FAILED;
-    }
-    memset(bias_ptr_, 0, params_->col_8_ * sizeof(float));
     memcpy(bias_ptr_, in_tensors_[2]->Data(), params_->col_ * sizeof(float));
-  } else {
-    bias_ptr_ = nullptr;
   }
   return RET_OK;
 }
 
 void MatmulCPUKernel::InitMatrixA(float *src_ptr, float *dst_ptr) {
-  if (params_->a_const_ == true) {
-    return;
-  }
-  if (src_ptr == nullptr) {
-    return;
-  }
-  params_->a_const_ = true;
-
-  if (params_->a_transpose_) {
-    RowMajor2Row8Major(src_ptr, dst_ptr, params_->deep_, params_->row_);
-  } else {
-    RowMajor2Col8Major(src_ptr, a_c8_ptr_, params_->row_, params_->deep_);
+  for (int i = 0; i < params_->batch; i++) {
+    float *src = src_ptr + i * params_->deep_ * params_->row_;
+    float *dst = dst_ptr + i * params_->deep_ * params_->row_12_;
+    if (params_->a_transpose_) {
+      RowMajor2Row12Major(src, dst, params_->deep_, params_->row_);
+    } else {
+      RowMajor2Col12Major(src, dst, params_->row_, params_->deep_);
+    }
   }
   return;
 }
 
 void MatmulCPUKernel::InitMatrixB(float *src_ptr, float *dst_ptr) {
-  if (params_->b_const_ == true) {
-    return;
-  }
-  if (src_ptr == nullptr) {
-    return;
-  }
-  params_->b_const_ = true;
-
-  if (params_->b_transpose_) {
-    RowMajor2Col8Major(src_ptr, dst_ptr, params_->col_, params_->deep_);
-  } else {
-    RowMajor2Row8Major(src_ptr, dst_ptr, params_->deep_, params_->col_);
+  for (int i = 0; i < params_->batch; i++) {
+    float *src = src_ptr + i * params_->deep_ * params_->col_;
+    float *dst = dst_ptr + i * params_->deep_ * params_->col_8_;
+    if (params_->b_transpose_) {
+      RowMajor2Col8Major(src, dst, params_->col_, params_->deep_);
+    } else {
+      RowMajor2Row8Major(src, dst, params_->deep_, params_->col_);
+    }
   }
   return;
 }
@@ -152,22 +137,17 @@ int MatmulCPUKernel::Init() {
 }
 
 int MatmulCPUKernel::RunImpl(int task_id) {
-  int cur_oc = MSMIN(thread_stride_, UP_DIV(params_->col_8_, 8) - task_id * thread_stride_);
+  int cur_oc = MSMIN(thread_stride_ * C8NUM, params_->col_ - task_id * thread_stride_ * C8NUM);
   if (cur_oc <= 0) {
     return RET_OK;
   }
-  auto cur_b = b_r8_ptr_ + task_id * thread_stride_ * C8NUM * params_->deep_;
-  auto cur_c = c_r8x8_ptr_ + task_id * thread_stride_ * C8NUM * params_->row_8_;
-  if (bias_ptr_) {
-    auto cur_bias = bias_ptr_ + task_id * thread_stride_ * C8NUM;
-    MatMul(a_c8_ptr_, cur_b, cur_c, cur_bias, ActType_No, params_->deep_, params_->row_8_, cur_oc * 8, 0, false);
-  } else {
-    MatMul(a_c8_ptr_, cur_b, cur_c, NULL, ActType_No, params_->deep_, params_->row_8_, cur_oc * 8, 0, false);
-  }
+  MatMulOpt(a_ptr_, b_ptr_ + task_id * thread_stride_ * C8NUM * params_->deep_,
+            c_ptr_ + task_id * thread_stride_ * C8NUM, bias_ptr_ + task_id * thread_stride_ * C8NUM, ActType_No,
+            params_->deep_, params_->row_, cur_oc, params_->col_, OutType_Nhwc);
   return RET_OK;
 }
 
-int MatmulFloatRun(int task_id, LiteParallelGroupEnv *penv, void *cdata) {
+int MatmulFloatRun(void *cdata, int task_id) {
   auto op = reinterpret_cast<MatmulCPUKernel *>(cdata);
   auto error_code = op->RunImpl(task_id);
   if (error_code != RET_OK) {
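// With OutType_Nhwc the GEMM writes each 8-column slice straight into the final
// row-major output (c_ptr_ + task_id * thread_stride_ * 8 above), so the old
// c_r8x8_ buffer and the Row8x8Major2RowMajor repacking pass disappear. A toy
// illustration of the direct addressing (plain C++, not the assembly-backed MatMulOpt):
void WriteColumnSlice(float *c, int row, int col, int col_begin, int col_count,
                      const float *slice /* row-major, row x col_count */) {
  for (int r = 0; r < row; ++r) {
    for (int k = 0; k < col_count; ++k) {
      c[r * col + col_begin + k] = slice[r * col_count + k];  // lands in place, no repack pass
    }
  }
}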
@@ -183,22 +163,22 @@ int MatmulCPUKernel::Run() {
     MS_LOG(ERROR) << "Prepare fail!ret: " << prepare_ret;
     return prepare_ret;
   }
-  auto a_ptr = reinterpret_cast<float *>(in_tensors_[0]->Data());
-  auto b_ptr = reinterpret_cast<float *>(in_tensors_[1]->Data());
-  auto c_ptr = reinterpret_cast<float *>(out_tensors_[0]->Data());
-  auto a_stride = params_->row_ * params_->deep_;
-  auto b_stride = params_->deep_ * params_->col_;
-  auto c_stride = params_->row_ * params_->col_;
-  for (int i = 0; i < params_->batch; ++i) {
-    auto cur_a_ptr = a_ptr + i * a_stride;
-    auto cur_b_ptr = b_ptr + i * b_stride;
-    auto cur_c_ptr = c_ptr + i * c_stride;
+  auto a_src = reinterpret_cast<float *>(in_tensors_[0]->Data());
+  auto b_src = reinterpret_cast<float *>(in_tensors_[1]->Data());
+  auto c_src = reinterpret_cast<float *>(out_tensors_[0]->Data());
 
-    InitMatrixA(cur_a_ptr, a_c8_ptr_);
-    InitMatrixB(cur_b_ptr, b_r8_ptr_);
+  if (params_->a_const_ == false) {
+    InitMatrixA(a_src, a_c12_ptr_);
+  }
+  if (params_->b_const_ == false) {
+    InitMatrixB(b_src, b_r8_ptr_);
+  }
 
-    LiteBackendParallelLaunch(MatmulFloatRun, this, thread_count_);
-    Row8x8Major2RowMajor(c_r8x8_ptr_, cur_c_ptr, params_->row_, params_->col_, params_->col_);
+  for (int i = 0; i < params_->batch; ++i) {
+    a_ptr_ = a_c12_ptr_ + i * params_->row_12_ * params_->deep_;
+    b_ptr_ = b_r8_ptr_ + i * params_->deep_ * params_->col_8_;
+    c_ptr_ = c_src + i * params_->row_ * params_->col_;
+    ParallelLaunch(THREAD_POOL_DEFAULT, MatmulFloatRun, this, thread_count_);
   }
   return RET_OK;
 }
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/matmul.h b/mindspore/lite/src/runtime/kernel/arm/fp32/matmul.h
index a0a32e82f4..e93e170e11 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/matmul.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/matmul.h
@@ -41,10 +41,12 @@ class MatmulCPUKernel : public MatmulBaseCPUKernel {
   void FreeTmpBuffer();
 
  private:
-  float *a_c8_ptr_ = nullptr;
+  float *a_c12_ptr_ = nullptr;
   float *b_r8_ptr_ = nullptr;
-  float *c_r8x8_ptr_ = nullptr;
   float *bias_ptr_ = nullptr;
+  float *a_ptr_ = nullptr;
+  float *b_ptr_ = nullptr;
+  float *c_ptr_ = nullptr;
 };
 }  // namespace mindspore::kernel
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/one_hot.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/one_hot.cc
index 757482f02a..c0bb5f87eb 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/one_hot.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/one_hot.cc
@@ -81,7 +81,7 @@ int OneHotCPUKernel::ReSize() {
   return RET_OK;
 }
 
-int RunOneHot(int task_id, LiteParallelGroupEnv *penv, void *cdata) {
+int RunOneHot(void *cdata, int task_id) {
   auto onehot_kernel = reinterpret_cast<OneHotCPUKernel *>(cdata);
   if (onehot_kernel == nullptr) {
     MS_LOG(ERROR) << "cast OneHotCPUKernel failed";
@@ -166,7 +166,7 @@ int OneHotCPUKernel::Run() {
     MS_LOG(ERROR) << "Prepare fail!ret: " << prepare_ret;
     return prepare_ret;
   }
-  int error_code = LiteBackendParallelLaunch(RunOneHot, this, context_->thread_num_);
+  int error_code = ParallelLaunch(THREAD_POOL_DEFAULT, RunOneHot, this, context_->thread_num_);
   if (error_code != RET_OK) {
     MS_LOG(ERROR) << "OneHot function error error_code[" << error_code << "]";
     return RET_ERROR;
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/pad.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/pad.cc
index 012f4ab4f4..51d9e9d1a1 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/pad.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/pad.cc
@@ -68,7 +68,7 @@ int PadCPUKernel::ReSize() {
   return RET_OK;
 }
 
-int PadImpl(int task_id, LiteParallelGroupEnv *penv, void *cdata) {
+int PadImpl(void *cdata, int task_id) {
   auto padKernel = reinterpret_cast<PadCPUKernel *>(cdata);
   int error_code = padKernel->RunImpl(task_id);
   if (error_code != NNACL_OK) {
@@ -102,7 +102,7 @@ int PadCPUKernel::Run() {
   auto output_data = reinterpret_cast<float *>(output->Data());
   memset(output_data, 0, output_size * sizeof(float));
 
-  int error_code = LiteBackendParallelLaunch(PadImpl, this, context_->thread_num_);
+  int error_code = ParallelLaunch(THREAD_POOL_DEFAULT, PadImpl, this, context_->thread_num_);
   if (error_code != RET_OK) {
     MS_LOG(ERROR) << "Pad run error, error_code[" << error_code << "]";
     return RET_ERROR;
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/pooling.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/pooling.cc
index 8ea453593f..2891ed0494 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/pooling.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/pooling.cc
@@ -52,15 +52,33 @@ int PoolingCPUKernel::ReSize() {
 
 int PoolingCPUKernel::RunImpl(int task_id) {
   auto input_ptr = reinterpret_cast<float *>(in_tensors_.at(kInputIndex)->Data());
   auto output_ptr = reinterpret_cast<float *>(out_tensors_.at(kOutputIndex)->Data());
-  if (pooling_param_->max_pooling_) {
-    MaxPooling(input_ptr, output_ptr, pooling_param_, task_id);
+  if (pooling_param_->pool_mode_ == PoolMode_MaxPool) {
+    switch (pooling_param_->act_type_) {
+      case ActType_Relu:
+        MaxPoolingRelu(input_ptr, output_ptr, pooling_param_, task_id);
+        break;
+      case ActType_Relu6:
+        MaxPoolingRelu6(input_ptr, output_ptr, pooling_param_, task_id);
+        break;
+      default:
+        MaxPooling(input_ptr, output_ptr, pooling_param_, task_id);
+    }
   } else {
-    AvgPooling(input_ptr, output_ptr, pooling_param_, task_id);
+    switch (pooling_param_->act_type_) {
+      case ActType_Relu:
+        AvgPoolingRelu(input_ptr, output_ptr, pooling_param_, task_id);
+        break;
+      case ActType_Relu6:
+        AvgPoolingRelu6(input_ptr, output_ptr, pooling_param_, task_id);
+        break;
+      default:
+        AvgPooling(input_ptr, output_ptr, pooling_param_, task_id);
+    }
   }
   return RET_OK;
 }
 
-int PoolingImpl(int task_id, LiteParallelGroupEnv *penv, void *cdata) {
+int PoolingImpl(void *cdata, int task_id) {
   auto pooling = reinterpret_cast<PoolingCPUKernel *>(cdata);
   auto error_code = pooling->RunImpl(task_id);
   if (error_code != RET_OK) {
@@ -76,7 +94,7 @@ int PoolingCPUKernel::Run() {
     MS_LOG(ERROR) << "Prepare fail!ret: " << prepare_ret;
     return prepare_ret;
   }
-  int error_code = LiteBackendParallelLaunch(PoolingImpl, this, thread_count_);
+  int error_code = ParallelLaunch(THREAD_POOL_DEFAULT, PoolingImpl, this, thread_count_);
   if (error_code != RET_OK) {
     MS_LOG(ERROR) << "pooling error error_code[" << error_code << "]";
     return RET_ERROR;
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/power.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/power.cc
index 61212e207d..4b1cef6fdb 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/power.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/power.cc
@@ -30,7 +30,7 @@ int PowerCPUKernel::Init() { return RET_OK; }
 
 int PowerCPUKernel::ReSize() { return RET_OK; }
 
-int PowerImpl(int task_id, LiteParallelGroupEnv *penv, void *cdata) {
+int PowerImpl(void *cdata, int task_id) {
   auto kernel = reinterpret_cast<PowerCPUKernel *>(cdata);
   auto ret = kernel->RunImpl(task_id);
   if (ret != RET_OK) {
@@ -46,7 +46,7 @@ int PowerCPUKernel::Run() {
     MS_LOG(ERROR) << "Prepare fail!ret: " << prepare_ret;
     return prepare_ret;
   }
-  auto ret = LiteBackendParallelLaunch(PowerImpl, this, thread_count_);
+  auto ret = ParallelLaunch(THREAD_POOL_DEFAULT, PowerImpl, this, thread_count_);
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "PowerCPUKernel error: " << ret;
     return RET_ERROR;
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/power.h b/mindspore/lite/src/runtime/kernel/arm/fp32/power.h
index c5bc0fde7c..c08a06d1bd 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/power.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/power.h
@@ -30,7 +30,6 @@ class PowerCPUKernel : public PowerBaseCPUKernel {
                  const std::vector<lite::tensor::Tensor *> &outputs, const lite::Context *ctx,
                  const mindspore::lite::PrimitiveC *primitive)
       : PowerBaseCPUKernel(param, inputs, outputs, ctx, primitive),
-        ctx_(ctx),
         thread_count_(ctx->thread_num_),
         power_(reinterpret_cast<PowerParameter *>(op_parameter_)->power_),
         scale_(reinterpret_cast<PowerParameter *>(op_parameter_)->scale_),
@@ -43,7 +42,6 @@ class PowerCPUKernel : public PowerBaseCPUKernel {
   int RunImpl(int task_id);
 
  private:
-  const lite::Context *ctx_;
   int thread_count_;
   float power_;
   float scale_;
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/prelu.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/prelu.cc
index 337f9edb6b..87835aa047 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/prelu.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/prelu.cc
@@ -24,11 +24,11 @@ using mindspore::kernel::KERNEL_ARCH::kCPU;
 using mindspore::lite::KernelRegistrar;
 using mindspore::lite::RET_ERROR;
 using mindspore::lite::RET_OK;
-using mindspore::schema::PrimitiveType_CaffePReLU;
+using mindspore::schema::PrimitiveType_PReLU;
 
 namespace mindspore::kernel {
 namespace {
-int PReluRun(int task_id, LiteParallelGroupEnv *penv, void *cdata) {
+int PReluRun(void *cdata, int task_id) {
   auto PRelu = reinterpret_cast<PReluCPUKernel *>(cdata);
   auto ret = PRelu->DoExcute(task_id);
   if (ret != RET_OK) {
@@ -135,7 +135,7 @@ int PReluCPUKernel::Run() {
   auto negative_slope_tensor = in_tensors_.at(1);
   prelu_param_->slope_ = reinterpret_cast<float *>(negative_slope_tensor->Data());
 
-  auto ret = LiteBackendParallelLaunch(PReluRun, this, prelu_param_->op_parameter_.thread_num_);
+  auto ret = ParallelLaunch(THREAD_POOL_DEFAULT, PReluRun, this, prelu_param_->op_parameter_.thread_num_);
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "PRelu Run error: error_code[" << ret << "]";
     context_->allocator->Free(input_data_);
@@ -155,7 +155,7 @@ kernel::LiteKernel *CpuPReluFp32KernelCreator(const std::vector
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/reduce.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/reduce.cc
-int ReduceImpl(int task_id, LiteParallelGroupEnv *penv, void *cdata) {
+int ReduceImpl(void *cdata, int task_id) {
   auto reduce = reinterpret_cast<ReduceCPUKernel *>(cdata);
   auto error_code = reduce->CallReduceUnit(task_id);
   if (error_code != RET_OK) {
@@ -125,7 +125,7 @@ int ReduceCPUKernel::Run() {
     inner_size_ *= tmp_shape_[k];
   }
   axis_size_ = tmp_shape_[axis];
-  auto error_code = LiteBackendParallelLaunch(ReduceImpl, this, context_->thread_num_);
+  auto error_code = ParallelLaunch(THREAD_POOL_DEFAULT, ReduceImpl, this, context_->thread_num_);
   if (error_code != RET_OK) {
     MS_LOG(ERROR) << "Reduce run error, error_code[" << error_code << "]";
     return RET_ERROR;
@@ -145,7 +145,7 @@ int ReduceCPUKernel::Run() {
   }
   axis_size_ = tmp_shape_[last_reduce_axis];
   dst_data_ = reinterpret_cast<float *>(out_tensors_.at(0)->Data());
-  auto error_code = LiteBackendParallelLaunch(ReduceImpl, this, context_->thread_num_);
+  auto error_code = ParallelLaunch(THREAD_POOL_DEFAULT, ReduceImpl, this, context_->thread_num_);
   if (error_code != RET_OK) {
     MS_LOG(ERROR) << "Reduce run error, error_code[" << error_code << "]";
     return RET_ERROR;
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/resize.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/resize.cc
index b4b3c360a1..b5bd381f3f 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/resize.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/resize.cc
@@ -14,6 +14,7 @@
  * limitations under the License.
*/ +#include #include "src/runtime/kernel/arm/fp32/resize.h" #include "schema/model_generated.h" #include "nnacl/fp32/resize.h" @@ -38,7 +39,92 @@ int ResizeCPUKernel::Init() { return ReSize(); } -int ResizeImpl(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +int ResizeCPUKernel::ReSize() { + int ret = RET_OK; + if (method_ == static_cast(schema::ResizeMethod_BILINEAR)) { + FreeTmpBuffer(); + ret = MallocTmpBuffer(); + if (ret != RET_OK) { + FreeTmpBuffer(); + return ret; + } + + auto input = in_tensors_.at(0); + auto input_shape = input->shape(); + ret = PrepareResizeBilinear(input_shape.data(), out_tensors_[0]->shape().data(), align_corners_, y_bottoms_, + y_tops_, x_lefts_, x_rights_, y_bottom_weights_, x_left_weights_); + if (ret != RET_OK) { + FreeTmpBuffer(); + } + } + return ret; +} + +int ResizeCPUKernel::MallocTmpBuffer() { + int h = new_height_; + int w = new_width_; + y_bottoms_ = reinterpret_cast(malloc(sizeof(int) * h)); + if (y_bottoms_ == nullptr) { + MS_LOG(ERROR) << "malloc data failed"; + return RET_NULL_PTR; + } + y_tops_ = reinterpret_cast(malloc(sizeof(int) * h)); + if (y_tops_ == nullptr) { + MS_LOG(ERROR) << "malloc data failed"; + return RET_NULL_PTR; + } + y_bottom_weights_ = reinterpret_cast(malloc(sizeof(float) * h)); + if (y_bottom_weights_ == nullptr) { + MS_LOG(ERROR) << "malloc data failed"; + return RET_NULL_PTR; + } + + x_lefts_ = reinterpret_cast(malloc(sizeof(int) * w)); + if (x_lefts_ == nullptr) { + MS_LOG(ERROR) << "malloc data failed"; + return RET_NULL_PTR; + } + x_rights_ = reinterpret_cast(malloc(sizeof(int) * w)); + if (x_rights_ == nullptr) { + MS_LOG(ERROR) << "malloc data failed"; + return RET_NULL_PTR; + } + x_left_weights_ = reinterpret_cast(malloc(sizeof(float) * w)); + if (x_left_weights_ == nullptr) { + MS_LOG(ERROR) << "malloc data failed"; + return RET_NULL_PTR; + } + return RET_OK; +} +void ResizeCPUKernel::FreeTmpBuffer() { + if (y_bottoms_ != nullptr) { + free(y_bottoms_); + y_bottoms_ = nullptr; + } + if (y_tops_ != nullptr) { + free(y_tops_); + y_tops_ = nullptr; + } + if (y_bottom_weights_ != nullptr) { + free(y_bottom_weights_); + y_bottom_weights_ = nullptr; + } + + if (x_lefts_ != nullptr) { + free(x_lefts_); + x_lefts_ = nullptr; + } + if (x_rights_ != nullptr) { + free(x_rights_); + x_rights_ = nullptr; + } + if (x_left_weights_ != nullptr) { + free(x_left_weights_); + x_left_weights_ = nullptr; + } +} + +int ResizeImpl(void *cdata, int task_id) { auto resize = reinterpret_cast(cdata); auto error_code = resize->RunImpl(task_id); if (error_code != RET_OK) { @@ -66,8 +152,16 @@ int ResizeCPUKernel::RunImpl(int task_id) { int ret = 0; switch (method_) { case static_cast(schema::ResizeMethod_BILINEAR): { - ret = ResizeBilinear(input_data, output_data, input_shape.data(), out_tensors_[0]->shape().data(), align_corners_, - task_id, context_->thread_num_); + int n_h_begin, n_h_end; + int n = out_tensors_.at(0)->shape()[0]; + int h = new_height_; + int unit = UP_DIV(n * h, context_->thread_num_); + n_h_begin = unit * task_id; + n_h_end = std::min(n_h_begin + unit, n * h); + + ret = ResizeBilinear(input_data, output_data, input_shape.data(), out_tensors_[0]->shape().data(), y_bottoms_, + y_tops_, x_lefts_, x_rights_, y_bottom_weights_, x_left_weights_, n_h_begin, n_h_end); + break; } case static_cast(schema::ResizeMethod_NEAREST_NEIGHBOR): { @@ -94,11 +188,13 @@ int ResizeCPUKernel::Run() { MS_LOG(ERROR) << "Prepare failed."; return RET_ERROR; } - int error_code = LiteBackendParallelLaunch(ResizeImpl, this, 
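// Bilinear resize now precomputes the x/y neighbor indices and weights once in
// ReSize() and splits Run() over batch*height output rows. A sketch of the slice
// arithmetic each task performs (RowRangeForTask is a hypothetical helper):
#include <algorithm>

void RowRangeForTask(int n, int h, int thread_num, int task_id, int *n_h_begin, int *n_h_end) {
  int unit = (n * h + thread_num - 1) / thread_num;  // UP_DIV(n * h, thread_num)
  *n_h_begin = unit * task_id;
  *n_h_end = std::min(*n_h_begin + unit, n * h);     // clamp the final slice
}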
- int error_code = LiteBackendParallelLaunch(ResizeImpl, this, context_->thread_num_); + int error_code = ParallelLaunch(THREAD_POOL_DEFAULT, ResizeImpl, this, context_->thread_num_); if (error_code != RET_OK) { MS_LOG(ERROR) << "Resize run error, error_code[" << error_code << "]"; + FreeTmpBuffer(); return RET_ERROR; } + return RET_OK; } } // namespace mindspore::kernel diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/resize.h b/mindspore/lite/src/runtime/kernel/arm/fp32/resize.h index 01e651ceee..abd90bf925 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/resize.h +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/resize.h @@ -31,12 +31,22 @@ class ResizeCPUKernel : public ResizeBaseCPUKernel { const mindspore::lite::PrimitiveC *primitive) : ResizeBaseCPUKernel(parameter, inputs, outputs, ctx, primitive) {} - ~ResizeCPUKernel() = default; + ~ResizeCPUKernel() { FreeTmpBuffer(); } int Init() override; - int ReSize() override { return 0; }; + int ReSize() override; int Run() override; int RunImpl(int task_id); + int MallocTmpBuffer(); + void FreeTmpBuffer(); + + private: + int *y_tops_ = nullptr; + int *y_bottoms_ = nullptr; + int *x_lefts_ = nullptr; + int *x_rights_ = nullptr; + float *y_bottom_weights_ = nullptr; + float *x_left_weights_ = nullptr; }; } // namespace mindspore::kernel diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/reverse.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/reverse.cc index e61ff43cb8..4eb82488cc 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/reverse.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/reverse.cc @@ -100,7 +100,7 @@ int ReverseCPUKernel::Init() { return ret; } -int ReverseRun(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +int ReverseRun(void *cdata, int task_id) { auto g_kernel = reinterpret_cast<ReverseCPUKernel *>(cdata); auto ret = g_kernel->DoReverse(task_id); if (ret != RET_OK) { @@ -132,7 +132,7 @@ int ReverseCPUKernel::Run() { } in_ptr_ = reinterpret_cast<float *>(in_tensors_[0]->Data()); out_ptr_ = reinterpret_cast<float *>(out_tensors_[0]->Data()); - ret = LiteBackendParallelLaunch(ReverseRun, this, thread_sz_count_); + ret = ParallelLaunch(THREAD_POOL_DEFAULT, ReverseRun, this, thread_sz_count_); if (ret != RET_OK) { MS_LOG(ERROR) << "Reverse run error error_code[" << ret << "]"; return ret; diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/reverse.h b/mindspore/lite/src/runtime/kernel/arm/fp32/reverse.h index 0051afde7b..c401619938 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/reverse.h +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/reverse.h @@ -31,7 +31,7 @@ class ReverseCPUKernel : public LiteKernel { ReverseCPUKernel(OpParameter *parameter, const std::vector<lite::tensor::Tensor *> &inputs, const std::vector<lite::tensor::Tensor *> &outputs, const lite::Context *ctx, const mindspore::lite::PrimitiveC *primitive) - : LiteKernel(parameter, inputs, outputs, ctx, primitive), ctx_(ctx), thread_count_(ctx->thread_num_) {} + : LiteKernel(parameter, inputs, outputs, ctx, primitive), thread_count_(ctx->thread_num_) {} ~ReverseCPUKernel() { if (tmp_ != nullptr) { free(tmp_); @@ -52,7 +52,6 @@ class ReverseCPUKernel : public LiteKernel { int strides_[REVERSE_STRIDE_MAX_SIZE]; int inCount_[REVERSE_STRIDE_MAX_SIZE]; int outCount_[REVERSE_STRIDE_MAX_SIZE]; - const Context *ctx_; int thread_count_; int *tmp_ = nullptr; float *in_ptr_; diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/roi_pooling.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/roi_pooling.cc index 21718c5553..9256ada127 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/roi_pooling.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/roi_pooling.cc @@ -72,7
+72,7 @@ int ROIPoolingCPUKernel::DoExecute(int task_id) { return RET_OK; } -int ROIPoolingRun(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +int ROIPoolingRun(void *cdata, int task_id) { auto Data = reinterpret_cast(cdata); auto ret = Data->DoExecute(task_id); if (ret != RET_OK) { @@ -91,7 +91,7 @@ int ROIPoolingCPUKernel::Run() { in_ptr_ = reinterpret_cast(in_tensors_.front()->Data()); out_ptr_ = reinterpret_cast(out_tensors_.front()->Data()); roi_ptr_ = reinterpret_cast(in_tensors_.at(1)->Data()); - ret = LiteBackendParallelLaunch(ROIPoolingRun, this, param_->thread_num_); + ret = ParallelLaunch(THREAD_POOL_DEFAULT, ROIPoolingRun, this, param_->thread_num_); if (ret != RET_OK) { MS_LOG(ERROR) << "ROIPooling error: error_code[" << ret << "]"; return ret; diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/scale.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/scale.cc index d9c7583224..aee1ff01ee 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/scale.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/scale.cc @@ -35,11 +35,9 @@ ScaleCPUKernel::~ScaleCPUKernel() { scale_ = nullptr; } } - if (scale_param_->has_offset_) { - if (offset_ != nullptr) { - free(offset_); - offset_ = nullptr; - } + if (offset_ != nullptr) { + free(offset_); + offset_ = nullptr; } } @@ -59,18 +57,15 @@ int ScaleCPUKernel::InitScaleOffset() { scale_ = nullptr; } + offset_ = reinterpret_cast(malloc(scale_param_->axis_size_ * sizeof(float))); + if (offset_ == nullptr) { + MS_LOG(ERROR) << "Malloc buffer failed."; + return RET_ERROR; + } + memset(offset_, 0, scale_param_->axis_size_ * sizeof(float)); if (in_tensors_.size() == 3) { auto offset_tensor = in_tensors_.at(2); - offset_ = reinterpret_cast(malloc(offset_tensor->ElementsNum() * sizeof(float))); - if (offset_ == nullptr) { - MS_LOG(ERROR) << "Malloc buffer failed."; - return RET_ERROR; - } memcpy(offset_, offset_tensor->Data(), offset_tensor->ElementsNum() * sizeof(float)); - scale_param_->has_offset_ = true; - } else { - offset_ = nullptr; - scale_param_->has_offset_ = false; } return RET_OK; } @@ -81,6 +76,9 @@ int ScaleCPUKernel::InitParameter() { auto scale_tensor = in_tensors_.at(1); auto scale_shape = scale_tensor->shape(); + if (scale_param_->axis_ < 0) { + scale_param_->axis_ = scale_param_->axis_ + in_shape.size(); + } if (scale_shape.size() + scale_param_->axis_ > in_shape.size()) { MS_LOG(ERROR) << "Scale tensor shape is incorrect."; return RET_ERROR; @@ -101,6 +99,7 @@ int ScaleCPUKernel::InitParameter() { for (size_t i = scale_param_->axis_ + scale_shape.size(); i < in_shape.size(); i++) { scale_param_->inner_size_ *= in_shape[i]; } + scale_param_->op_parameter_.thread_num_ = MSMIN(scale_param_->op_parameter_.thread_num_, scale_param_->outer_size_); return RET_OK; } @@ -114,6 +113,11 @@ int ScaleCPUKernel::Init() { return RET_OK; } + ReSize(); + return RET_OK; +} + +int ScaleCPUKernel::ReSize() { auto ret = InitParameter(); if (ret != RET_OK) { MS_LOG(ERROR) << "Scale fp32 InitParameter failed."; @@ -128,25 +132,12 @@ int ScaleCPUKernel::Init() { return RET_OK; } -int ScaleCPUKernel::ReSize() { - auto ret = InitParameter(); - if (ret != RET_OK) { - MS_LOG(ERROR) << "Scale fp32 InitParameter failed."; - return RET_ERROR; - } - return RET_OK; -} - int ScaleCPUKernel::Scale(int task_id) { - auto ret = DoScale(input_ptr_, output_ptr_, scale_, offset_, task_id, scale_param_); - if (ret != RET_OK) { - MS_LOG(ERROR) << "Scale error task_id[" << task_id << "] error_code[" << ret << "]"; - return RET_ERROR; - } + DoScale(input_ptr_, 
output_ptr_, scale_, offset_, task_id, scale_param_); return RET_OK; } -int ScaleRun(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +int ScaleRun(void *cdata, int task_id) { auto scale = reinterpret_cast(cdata); auto ret = scale->Scale(task_id); if (ret != RET_OK) { @@ -171,7 +162,7 @@ int ScaleCPUKernel::Run() { auto out_tensor = out_tensors_.front(); output_ptr_ = reinterpret_cast(out_tensor->Data()); - ret = LiteBackendParallelLaunch(ScaleRun, this, op_parameter_->thread_num_); + ret = ParallelLaunch(THREAD_POOL_DEFAULT, ScaleRun, this, op_parameter_->thread_num_); if (ret != RET_OK) { MS_LOG(ERROR) << "Scale error error_code[" << ret << "]"; return RET_ERROR; diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/scale.h b/mindspore/lite/src/runtime/kernel/arm/fp32/scale.h index 5125e17f1b..8071f16bdd 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/scale.h +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/scale.h @@ -19,7 +19,7 @@ #include #include "src/lite_kernel.h" -#include "nnacl/scale.h" +#include "nnacl/fp32/scale.h" namespace mindspore::kernel { diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/scatter_nd.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/scatter_nd.cc index f62acfe4af..04917fdcf2 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/scatter_nd.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/scatter_nd.cc @@ -30,8 +30,6 @@ using mindspore::schema::PrimitiveType_ScatterND; namespace mindspore::kernel { namespace { -constexpr int kScatterNDInputNum = 3; -constexpr int kScatterNDOutputNum = 1; constexpr int kScatterShapeIndex = 0; constexpr int kScatterIndicesIndex = 1; constexpr int kScatterUpdateIndex = 2; @@ -139,7 +137,7 @@ int ScatterNDCPUKernel::ScatterND(int task_id) { return RET_OK; } -int ScatterNDRun(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +int ScatterNDRun(void *cdata, int task_id) { auto g_kernel = reinterpret_cast(cdata); auto ret = g_kernel->ScatterND(task_id); if (ret != RET_OK) { @@ -155,7 +153,7 @@ int ScatterNDCPUKernel::Run() { MS_LOG(ERROR) << "Prepare fail!ret: " << ret; return ret; } - ret = LiteBackendParallelLaunch(ScatterNDRun, this, thread_n_num_); + ret = ParallelLaunch(THREAD_POOL_DEFAULT, ScatterNDRun, this, thread_n_num_); if (ret != RET_OK) { MS_LOG(ERROR) << "ScatterND error error_code[" << ret << "]"; return RET_ERROR; diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/shape.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/shape.cc index 266f0bd579..5c67b293b0 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/shape.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/shape.cc @@ -26,10 +26,6 @@ using mindspore::lite::RET_OK; using mindspore::schema::PrimitiveType_Shape; namespace mindspore::kernel { -namespace { -constexpr int kShapeInputNum = 1; -constexpr int kShapeOutputNum = 1; -} // namespace int ShapeCPUKernel::Init() { return RET_OK; } int ShapeCPUKernel::ReSize() { return RET_OK; } diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/slice.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/slice.cc index d81023a373..7e5fb0b28f 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/slice.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/slice.cc @@ -20,6 +20,7 @@ #include "nnacl/fp32/slice.h" #include "include/errorcode.h" #include "src/runtime/runtime_api.h" +#include "src/ops/slice.h" using mindspore::lite::KernelRegistrar; using mindspore::lite::RET_ERROR; @@ -29,18 +30,26 @@ using mindspore::schema::PrimitiveType_Slice; namespace mindspore::kernel { namespace { -int 
SliceLaunch(int thread_id, LiteParallelGroupEnv *penv, void *cdata) { +int SliceLaunch(void *cdata, int task_id) { if (cdata == nullptr) { MS_LOG(ERROR) << "Input cdata is nullptr!"; return RET_NULL_PTR; } auto kernel = reinterpret_cast(cdata); - return kernel->SliceParallelRun(thread_id); + return kernel->SliceParallelRun(task_id); } } // namespace int SliceCPUKernel::ReSize() { - auto *param = reinterpret_cast(op_parameter_); + auto primitive_slice = reinterpret_cast(primitive_); + auto begin = primitive_slice->GetPostProcessBegin(); + auto size = primitive_slice->GetPostProcessSize(); + auto param = reinterpret_cast(op_parameter_); + param->param_length_ = in_tensors_[0]->shape().size(); + for (int i = 0; i < param->param_length_; ++i) { + param->begin_[i] = begin[i]; + param->size_[i] = size[i]; + } auto input_shape = in_tensors_[0]->shape(); if (static_cast(input_shape.size()) != param->param_length_) { MS_LOG(ERROR) << "Input begin's lenth " << param->param_length_ << "is not equal to input shape size " @@ -97,7 +106,7 @@ int SliceCPUKernel::Run() { DoSliceNoParallel(input_data, output_data, param); return RET_OK; } - ret = LiteBackendParallelLaunch(SliceLaunch, this, param->op_parameter_.thread_num_); + ret = ParallelLaunch(THREAD_POOL_DEFAULT, SliceLaunch, this, param->op_parameter_.thread_num_); if (ret != RET_OK) { MS_LOG(ERROR) << "slice launch fail!ret: " << ret; return RET_ERROR; diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/space_to_batch.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/space_to_batch.cc index 81d6e9dee4..647372e6e3 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/space_to_batch.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/space_to_batch.cc @@ -29,8 +29,18 @@ using mindspore::lite::RET_FORMAT_ERR; using mindspore::lite::RET_OK; using mindspore::lite::RET_OP_EXECUTE_FAILURE; using mindspore::schema::PrimitiveType_SpaceToBatch; +using mindspore::schema::PrimitiveType_SpaceToBatchND; namespace mindspore::kernel { +namespace { +size_t EnumElement(int *shape, int n_dims) { + size_t total = 1; + for (int i = 0; i < n_dims; i++) { + total *= shape[i]; + } + return total; +} +} int SpaceToBatchCPUKernel::Init() { SpaceToBatchParameter *param = reinterpret_cast(this->op_parameter_); @@ -40,37 +50,26 @@ int SpaceToBatchCPUKernel::Init() { break; } } - param->n_dims_ = DIMENSION_4D; - param->n_space_dims_ = SPACE_TO_BATCH_BLOCK_SIZES_SIZE; + if (!InferShapeDone()) { return RET_OK; } return ReSize(); } -int SpaceToBatchCPUKernel::SpaceToBatchParallel(int task_id) { - int num_unit_thread = MSMIN(thread_h_stride_, num_unit_ - task_id * thread_h_stride_); - if (num_unit_thread <= 0) { - return RET_OK; +void SpaceToBatchCPUKernel::FreeTmpBuffer() { + if (pedding_h_data_ != nullptr) { + context_->allocator->Free(pedding_h_data_); + pedding_h_data_ = nullptr; } - int thread_offset = task_id * thread_h_stride_; - SpaceToBatchParameter *param = reinterpret_cast(this->op_parameter_); - auto ret = SpaceToBatch(input_ptr_, output_ptr_, *param, thread_offset, thread_offset + num_unit_thread); - if (ret != RET_OK) { - MS_LOG(ERROR) << "SpaceToDepth error task_id[" << task_id << "] error_code[" << ret << "]"; - return RET_ERROR; + if (pedding_w_data_ != nullptr) { + context_->allocator->Free(pedding_w_data_); + pedding_w_data_ = nullptr; } - return RET_OK; -} - -int SpaceToBatchRun(int task_id, LiteParallelGroupEnv *penv, void *cdata) { - auto g_kernel = reinterpret_cast(cdata); - auto ret = g_kernel->SpaceToBatchParallel(task_id); - if (ret != RET_OK) { - 
MS_LOG(ERROR) << "SpaceToBatchRun error task_id[" << task_id << "] error_code[" << ret << "]"; - return RET_OP_EXECUTE_FAILURE; + if (pedding_input_ != nullptr) { + context_->allocator->Free(pedding_input_); + pedding_input_ = nullptr; } - return RET_OK; } int SpaceToBatchCPUKernel::ReSize() { @@ -78,13 +77,39 @@ int SpaceToBatchCPUKernel::ReSize() { MS_LOG(ERROR) << "space_to_batch only support NHWC now!"; return RET_FORMAT_ERR; } + FreeTmpBuffer(); SpaceToBatchParameter *param = reinterpret_cast(this->op_parameter_); - param->num_elements_ = EnumElement(param->in_shape_, param->n_dims_); - param->num_elements_padded_ = EnumElement(param->padded_in_shape_, param->n_dims_); - num_unit_ = static_cast(in_tensors_[kInputIndex]->shape().at(kNHWC_H)); - num_unit_ /= param->block_sizes_[0]; - thread_h_num_ = MSMIN(thread_num_, num_unit_); - thread_h_stride_ = UP_DIV(num_unit_, thread_h_num_); + if (!param->need_paddings_) { + return RET_OK; + } + auto input = in_tensors_[0]; + auto in_shape = input->shape(); + padded_in_shape_ = in_shape; + padded_in_shape_[1] = in_shape[1] + param->paddings_[0] + param->paddings_[1]; + padded_in_shape_[2] = in_shape[2] + param->paddings_[2] + param->paddings_[3]; + auto num_elements_padded = EnumElement(padded_in_shape_.data(), in_shape.size()); + auto output_shape = out_tensors_[0]->shape(); + auto pedding_h_size = output_shape[2] * output_shape[3] * sizeof(float); + pedding_h_data_ = reinterpret_cast(context_->allocator->Malloc(pedding_h_size)); + if (pedding_h_data_ == nullptr) { + MS_LOG(ERROR) << "malloc pedding h data fail!"; + return RET_ERROR; + } + auto pedding_w_size = output_shape[3] * sizeof(float); + pedding_w_data_ = reinterpret_cast(context_->allocator->Malloc(pedding_w_size)); + if (pedding_w_data_ == nullptr) { + MS_LOG(ERROR) << "malloc pedding w data fail!"; + FreeTmpBuffer(); + return RET_ERROR; + } + pedding_input_ = + reinterpret_cast(context_->allocator->Malloc(num_elements_padded * sizeof(float))); + if (pedding_input_ == nullptr) { + MS_LOG(ERROR) << "malloc pedding buffer fail!"; + return RET_ERROR; + } + memset(pedding_h_data_, 0, pedding_h_size); + memset(pedding_w_data_, 0, pedding_w_size); return RET_OK; } @@ -96,54 +121,32 @@ int SpaceToBatchCPUKernel::Run() { } auto input = in_tensors_[0]; auto output = out_tensors_[0]; - input_ptr_ = reinterpret_cast(input->Data()); - output_ptr_ = reinterpret_cast(output->Data()); + const float *input_ptr_ = reinterpret_cast(input->Data()); + float *output_ptr_ = reinterpret_cast(output->Data()); SpaceToBatchParameter *param = reinterpret_cast(this->op_parameter_); - - float *tmp_space[3] = {nullptr, nullptr, nullptr}; + auto in_shape = input->shape(); + auto out_shape = output->shape(); if (param->need_paddings_) { - for (int i = 0; i < 3; ++i) { - tmp_space[i] = - reinterpret_cast(context_->allocator->Malloc(param->num_elements_padded_ * sizeof(float))); - (void)memset(tmp_space[i], 0, param->num_elements_padded_ * sizeof(float)); - if (tmp_space[i] == nullptr) { - MS_LOG(ERROR) << "malloc tmp buffer fail!"; - return RET_ERROR; - } - } - auto padded_input = tmp_space[0]; - DoPadding(input_ptr_, padded_input, *param, tmp_space + 1); - input_ptr_ = padded_input; - } - - if (input->GetFormat() == schema::Format_NHWC) { - ret = LiteBackendParallelLaunch(SpaceToBatchRun, this, thread_h_num_); - if (ret != RET_OK) { - MS_LOG(ERROR) << "SpaceToBatch error error_code[" << ret << "]"; - } + DoSpaceToBatchPaddingNHWC(input_ptr_, pedding_input_, in_shape.data(), param->paddings_, + 
padded_in_shape_.data(), pedding_h_data_, pedding_w_data_); + DoSpaceToBatchNHWC(pedding_input_, output_ptr_, param, padded_in_shape_.data(), out_shape.data()); + return RET_OK; } else { - MS_LOG(ERROR) << "Only support NHWC now!"; - ret = RET_FORMAT_ERR; - } - if (param->need_paddings_) { - for (int i = 0; i < 3; ++i) { - context_->allocator->Free(tmp_space[i]); - } + DoSpaceToBatchNHWC(input_ptr_, output_ptr_, param, in_shape.data(), out_shape.data()); + return RET_OK; } - - return ret; } // namespace mindspore::kernel kernel::LiteKernel *CpuSpaceToBatchFp32KernelCreator(const std::vector &inputs, const std::vector &outputs, - OpParameter *opParameter, const lite::Context *ctx, + OpParameter *param, const lite::Context *ctx, const kernel::KernelKey &desc, const mindspore::lite::PrimitiveC *primitive) { - if (opParameter == nullptr) { - MS_LOG(ERROR) << "Input opParameter is nullptr!"; + if (param == nullptr) { + MS_LOG(ERROR) << "Input param is nullptr!"; return nullptr; } - auto *kernel = new (std::nothrow) SpaceToBatchCPUKernel(opParameter, inputs, outputs, ctx, primitive); + auto *kernel = new (std::nothrow) SpaceToBatchCPUKernel(param, inputs, outputs, ctx, primitive); if (kernel == nullptr) { MS_LOG(ERROR) << "new SpaceToBatchCPUKernel fail!"; return nullptr; @@ -152,12 +155,13 @@ kernel::LiteKernel *CpuSpaceToBatchFp32KernelCreator(const std::vectorInit(); if (ret != RET_OK) { delete kernel; - MS_LOG(ERROR) << "Init kernel failed, name: " << opParameter->name_ << ", type: " - << schema::EnumNamePrimitiveType(static_cast(opParameter->type_)); + MS_LOG(ERROR) << "Init kernel failed, name: " << param->name_ << ", type: " + << schema::EnumNamePrimitiveType(static_cast(param->type_)); return nullptr; } return kernel; } REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_SpaceToBatch, CpuSpaceToBatchFp32KernelCreator) +REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_SpaceToBatchND, CpuSpaceToBatchFp32KernelCreator) } // namespace mindspore::kernel diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/space_to_batch.h b/mindspore/lite/src/runtime/kernel/arm/fp32/space_to_batch.h index f93de6cc84..2135d27c78 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/space_to_batch.h +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/space_to_batch.h @@ -25,22 +25,20 @@ class SpaceToBatchCPUKernel : public LiteKernel { SpaceToBatchCPUKernel(OpParameter *parameter, const std::vector &inputs, const std::vector &outputs, const lite::Context *ctx, const mindspore::lite::PrimitiveC *primitive) - : LiteKernel(parameter, inputs, outputs, ctx, primitive), thread_num_(ctx->thread_num_) {} + : LiteKernel(parameter, inputs, outputs, ctx, primitive) {} - ~SpaceToBatchCPUKernel() = default; + ~SpaceToBatchCPUKernel() { FreeTmpBuffer(); } int Init() override; int ReSize() override; int Run() override; - int SpaceToBatchParallel(int task_id); private: - int thread_num_; - int thread_h_stride_; - int thread_h_num_; - int num_unit_; - const float *input_ptr_; - float *output_ptr_; + void FreeTmpBuffer(); + float *pedding_input_ = nullptr; + float *pedding_h_data_ = nullptr; + float *pedding_w_data_ = nullptr; + std::vector padded_in_shape_; }; } // namespace mindspore::kernel diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/space_to_depth.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/space_to_depth.cc index 4e927e736b..e08f383894 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/space_to_depth.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/space_to_depth.cc @@ -74,7 +74,7 @@ int 
SpaceToDepthCPUKernel::SpaceToDepth(int task_id) { return RET_OK; } -int SpaceToDepthRun(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +int SpaceToDepthRun(void *cdata, int task_id) { auto g_kernel = reinterpret_cast(cdata); auto ret = g_kernel->SpaceToDepth(task_id); if (ret != RET_OK) { @@ -93,7 +93,7 @@ int SpaceToDepthCPUKernel::Run() { input_ptr_ = reinterpret_cast(in_tensors_[0]->Data()); output_ptr_ = reinterpret_cast(out_tensors_[0]->Data()); if (in_tensors_[0]->GetFormat() == schema::Format_NHWC) { - ret = LiteBackendParallelLaunch(SpaceToDepthRun, this, thread_h_num_); + ret = ParallelLaunch(THREAD_POOL_DEFAULT, SpaceToDepthRun, this, thread_h_num_); if (ret != RET_OK) { MS_LOG(ERROR) << "SpaceToDepth error error_code[" << ret << "]"; return ret; diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/sparse_to_dense.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/sparse_to_dense.cc index 16f89d1819..3e4d0b92ca 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/sparse_to_dense.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/sparse_to_dense.cc @@ -39,7 +39,7 @@ int SparseToDenseCPUKernel::DoExcute(int task_id) { return RET_OK; } -int SparseToDenseRun(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +int SparseToDenseRun(void *cdata, int task_id) { auto s2ddata = reinterpret_cast(cdata); auto ret = s2ddata->DoExcute(task_id); if (ret != RET_OK) { @@ -70,7 +70,7 @@ int SparseToDenseCPUKernel::Run() { std::vector temp_shape = output0->shape(); output_shape_ = reinterpret_cast(temp_shape.data()); - ret = LiteBackendParallelLaunch(SparseToDenseRun, this, s2d_param_->thread_num_); + ret = ParallelLaunch(THREAD_POOL_DEFAULT, SparseToDenseRun, this, s2d_param_->thread_num_); if (ret != RET_OK) { MS_LOG(ERROR) << "SparseToDenseRun error: error_code[" << ret << "]"; return RET_ERROR; diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/split.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/split.cc index 7fe0f68eae..cb56abae50 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/split.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/split.cc @@ -62,7 +62,7 @@ int SplitCPUKernel::Split(int task_id) { return RET_OK; } -int SplitRun(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +int SplitRun(void *cdata, int task_id) { auto g_kernel = reinterpret_cast(cdata); auto ret = g_kernel->Split(task_id); if (ret != RET_OK) { @@ -83,7 +83,7 @@ int SplitCPUKernel::Run() { for (int i = 0; i < param->num_split_; i++) { output_ptr_[i] = reinterpret_cast(out_tensors_.at(i)->Data()); } - ret = LiteBackendParallelLaunch(SplitRun, this, thread_n_num_); + ret = ParallelLaunch(THREAD_POOL_DEFAULT, SplitRun, this, thread_n_num_); if (ret != RET_OK) { MS_LOG(ERROR) << "Scale error error_code[" << ret << "]"; return RET_ERROR; diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/squeeze.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/squeeze.cc index b334b72366..f666a4db29 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/squeeze.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/squeeze.cc @@ -27,11 +27,6 @@ using mindspore::lite::RET_OK; using mindspore::schema::PrimitiveType_Squeeze; namespace mindspore::kernel { -namespace { -constexpr int kSqueezeInputNum = 1; -constexpr int kSqueezeOutputNum = 1; -} // namespace - int SqueezeCPUKernel::Init() { return RET_OK; } int SqueezeCPUKernel::ReSize() { return RET_OK; } diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/transpose.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/transpose.cc index 
a8fff82ddf..283906e3d3 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/transpose.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/transpose.cc @@ -29,10 +29,6 @@ using mindspore::lite::RET_OP_EXECUTE_FAILURE; using mindspore::schema::PrimitiveType_Transpose; namespace mindspore::kernel { -namespace { -constexpr int kTransposeInputNum = 1; -constexpr int kTransposeOutputNum = 1; -} // namespace int TransposeCPUKernel::Init() { TransposeParameter *param = reinterpret_cast(this->op_parameter_); num_unit_ = static_cast(in_tensors_[kInputIndex]->shape().at(param->perm_[kNHWC_H])); @@ -76,7 +72,7 @@ int TransposeCPUKernel::TransposeParallel(int task_id) { return RET_OK; } -int TransposeRun(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +int TransposeRun(void *cdata, int task_id) { auto g_kernel = reinterpret_cast(cdata); auto ret = g_kernel->TransposeParallel(task_id); if (ret != RET_OK) { @@ -105,7 +101,7 @@ int TransposeCPUKernel::Run() { in_shape_ = const_cast(in_tensor->shape().data()); out_shape_ = const_cast(out_tensor->shape().data()); - ret = LiteBackendParallelLaunch(TransposeRun, this, thread_h_num_); + ret = ParallelLaunch(THREAD_POOL_DEFAULT, TransposeRun, this, thread_h_num_); if (ret != RET_OK) { MS_LOG(ERROR) << "Tranpose error error_code[" << ret << "]"; return ret; diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/unsqueeze.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/unsqueeze.cc index 57d545e24c..496c8e3f8a 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/unsqueeze.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/unsqueeze.cc @@ -55,7 +55,7 @@ int UnsqueezeCPUKernel::DoUnsqueeze(int task_id) { return RET_OK; } -int UnsqueezeRun(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +int UnsqueezeRun(void *cdata, int task_id) { auto g_kernel = reinterpret_cast(cdata); auto ret = g_kernel->DoUnsqueeze(task_id); if (ret != RET_OK) { @@ -73,7 +73,7 @@ int UnsqueezeCPUKernel::Run() { } in_ptr_ = reinterpret_cast(in_tensors_.at(0)->Data()); out_ptr_ = reinterpret_cast(out_tensors_.at(0)->Data()); - ret = LiteBackendParallelLaunch(UnsqueezeRun, this, thread_sz_count_); + ret = ParallelLaunch(THREAD_POOL_DEFAULT, UnsqueezeRun, this, thread_sz_count_); if (ret != RET_OK) { MS_LOG(ERROR) << "UnsqueezeRun error error_code[" << ret << "]"; return ret; diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/where.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/where.cc index 4a853aa2c0..3a35179a20 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/where.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/where.cc @@ -38,7 +38,7 @@ int WhereCPUKernel::DoExcute(int task_id) { return RET_OK; } -int WhereRun(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +int WhereRun(void *cdata, int task_id) { auto wheredata = reinterpret_cast(cdata); auto ret = wheredata->DoExcute(task_id); if (ret != RET_OK) { @@ -79,7 +79,7 @@ int WhereCPUKernel::Run() { MS_LOG(ERROR) << "Error, inputs' length are zero !!!"; return RET_ERROR; } - ret = LiteBackendParallelLaunch(WhereRun, this, where_param_->thread_num_); + ret = ParallelLaunch(THREAD_POOL_DEFAULT, WhereRun, this, where_param_->thread_num_); if (ret != RET_OK) { MS_LOG(ERROR) << "WhereDwRun error: error_code[" << ret << "]"; return RET_ERROR; diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/zeroslike.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/zeroslike.cc index 9a2fe6a11f..a11f9ab864 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/zeroslike.cc +++ 
b/mindspore/lite/src/runtime/kernel/arm/fp32/zeroslike.cc @@ -27,9 +27,6 @@ using mindspore::lite::RET_OK; using mindspore::schema::PrimitiveType_ZerosLike; namespace mindspore::kernel { -constexpr int kInputNum = 1; -constexpr int kOutputNum = 1; - int ZerosLikeCPUKernel::Init() { return RET_OK; } int ZerosLikeCPUKernel::Run() { diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32_grad/activation_grad.cc b/mindspore/lite/src/runtime/kernel/arm/fp32_grad/activation_grad.cc index a49dcc663b..6cc4999499 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32_grad/activation_grad.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp32_grad/activation_grad.cc @@ -70,7 +70,7 @@ int ActivationGradCPUKernel::DoActivation(int task_id) { return RET_OK; } -int ActivationGradRun(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +int ActivationGradRun(void *cdata, int task_id) { auto activationGrad_kernel = reinterpret_cast(cdata); auto error_code = activationGrad_kernel->DoActivation(task_id); if (error_code != RET_OK) { @@ -81,7 +81,7 @@ int ActivationGradRun(int task_id, LiteParallelGroupEnv *penv, void *cdata) { } int ActivationGradCPUKernel::Run() { - int error_code = LiteBackendParallelLaunch(ActivationGradRun, this, thread_count_); + int error_code = ParallelLaunch(THREAD_POOL_DEFAULT, ActivationGradRun, this, thread_count_); if (error_code != RET_OK) { MS_LOG(ERROR) << "Activation function error error_code[" << error_code << "]"; return RET_ERROR; diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32_grad/arithmetic_grad.cc b/mindspore/lite/src/runtime/kernel/arm/fp32_grad/arithmetic_grad.cc index 83edebb78f..7dd8f64357 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32_grad/arithmetic_grad.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp32_grad/arithmetic_grad.cc @@ -277,7 +277,7 @@ kernel::LiteKernel *CpuArithmeticGradFp32KernelCreator(const std::vector(inputs_.at(0)->Data()); auto output_ptr = reinterpret_cast(outputs_.at(0)->Data()); - if (pool_param->max_pooling_) { + if (pool_param->pool_mode_ == PoolMode_MaxPool) { auto ind = reinterpret_cast(inputs_.at(1)->Data()); MaxPoolingGrad(input_ptr, ind, output_ptr, pool_param); } else { diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/add_int8.cc b/mindspore/lite/src/runtime/kernel/arm/int8/add_int8.cc index b4a8a341b1..f7eea6ccd1 100644 --- a/mindspore/lite/src/runtime/kernel/arm/int8/add_int8.cc +++ b/mindspore/lite/src/runtime/kernel/arm/int8/add_int8.cc @@ -92,7 +92,7 @@ int QuantizedAddCPUKernel::Run() { input0_data_ = static_cast(ctx_->allocator->Malloc(out_tensors_.at(0)->Size())); input1_data_ = static_cast(ctx_->allocator->Malloc(out_tensors_.at(0)->Size())); - ArithmeticParameter tile_para = {0}; + ArithmeticParameter tile_para; tile_para.ndim_ = out_tensors_.at(0)->shape().size(); for (size_t i = 0; i < tile_para.ndim_; i++) { tile_para.in_shape0_[i] = in_tensors_.at(0)->DimensionSize(i); @@ -102,17 +102,17 @@ int QuantizedAddCPUKernel::Run() { TileDimensionsUint8(static_cast(in_tensors_.at(0)->Data()), static_cast(in_tensors_.at(1)->Data()), reinterpret_cast(input0_data_), reinterpret_cast(input1_data_), &tile_para); - ret = LiteBackendParallelLaunch(AddInt8Run, this, thread_count_); + ret = ParallelLaunch(THREAD_POOL_DEFAULT, AddInt8Run, this, thread_count_); ctx_->allocator->Free(input0_data_); ctx_->allocator->Free(input1_data_); return ret; } - ret = LiteBackendParallelLaunch(AddInt8Run, this, thread_count_); + ret = ParallelLaunch(THREAD_POOL_DEFAULT, AddInt8Run, this, thread_count_); return ret; } -int 
AddInt8Run(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +int AddInt8Run(void *cdata, int task_id) { auto add = reinterpret_cast(cdata); add->DoExecute(task_id); return lite::RET_OK; diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/add_int8.h b/mindspore/lite/src/runtime/kernel/arm/int8/add_int8.h index be83375b0d..77d76fbc18 100644 --- a/mindspore/lite/src/runtime/kernel/arm/int8/add_int8.h +++ b/mindspore/lite/src/runtime/kernel/arm/int8/add_int8.h @@ -46,7 +46,7 @@ class QuantizedAddCPUKernel : public LiteKernel { int8_t *output_data_ = nullptr; }; -int AddInt8Run(int task_id, LiteParallelGroupEnv *penv, void *cdata); +int AddInt8Run(void *cdata, int task_id); } // namespace mindspore::kernel #endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_INT8_ADD_INT8_H_ diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/arithmetic_int8.cc b/mindspore/lite/src/runtime/kernel/arm/int8/arithmetic_int8.cc index c05aac9896..02fa869545 100644 --- a/mindspore/lite/src/runtime/kernel/arm/int8/arithmetic_int8.cc +++ b/mindspore/lite/src/runtime/kernel/arm/int8/arithmetic_int8.cc @@ -36,11 +36,11 @@ using mindspore::schema::PrimitiveType_NotEqual; namespace mindspore::kernel { namespace { -int ArithmeticsInt8Launch(int thread_id, LiteParallelGroupEnv *penv, void *cdata) { +int ArithmeticsInt8Launch(void *cdata, int task_id) { auto arithmetic_kernel = reinterpret_cast(cdata); - auto error_code = arithmetic_kernel->DoArithmetic(thread_id); + auto error_code = arithmetic_kernel->DoArithmetic(task_id); if (error_code != RET_OK) { - MS_LOG(ERROR) << "ArithmeticsRun error thread_id[" << thread_id << "] error_code[" << error_code << "]"; + MS_LOG(ERROR) << "ArithmeticsRun error thread_id[" << task_id << "] error_code[" << error_code << "]"; return error_code; } return RET_OK; @@ -151,7 +151,7 @@ int ArithmeticInt8CPUKernel::Run() { } TileDimensionsInt8(input_data0, input_data1, tile_data0_, tile_data1_, param); } - ret = LiteBackendParallelLaunch(ArithmeticsInt8Launch, this, op_parameter_->thread_num_); + ret = ParallelLaunch(THREAD_POOL_DEFAULT, ArithmeticsInt8Launch, this, op_parameter_->thread_num_); if (param->broadcasting_) { context_->allocator->Free(tile_data0_); context_->allocator->Free(tile_data1_); diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/arithmetic_self_int8.cc b/mindspore/lite/src/runtime/kernel/arm/int8/arithmetic_self_int8.cc index 56b1a0fc2c..43c3a36123 100644 --- a/mindspore/lite/src/runtime/kernel/arm/int8/arithmetic_self_int8.cc +++ b/mindspore/lite/src/runtime/kernel/arm/int8/arithmetic_self_int8.cc @@ -65,7 +65,7 @@ int ArithmeticSelfInt8CPUKernel::ReSize() { return RET_OK; } -int ArithmeticSelfInt8Runs(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +int ArithmeticSelfInt8Runs(void *cdata, int task_id) { auto g_kernel = reinterpret_cast(cdata); auto ret = g_kernel->DoArithmeticSelf(task_id); if (ret != RET_OK) { @@ -104,7 +104,7 @@ int ArithmeticSelfInt8CPUKernel::Run() { auto out_tensor = out_tensors_.at(0); in_ptr_ = reinterpret_cast(input_tensor->Data()); out_ptr_ = reinterpret_cast(out_tensor->Data()); - ret = LiteBackendParallelLaunch(ArithmeticSelfInt8Runs, this, thread_sz_count_); + ret = ParallelLaunch(THREAD_POOL_DEFAULT, ArithmeticSelfInt8Runs, this, thread_sz_count_); if (ret != RET_OK) { MS_LOG(ERROR) << "ArithmeticSelfRun error error_code[" << ret << "]"; return ret; diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/arithmetic_self_int8.h b/mindspore/lite/src/runtime/kernel/arm/int8/arithmetic_self_int8.h index 
d9e7ee7515..79e15be205 100644 --- a/mindspore/lite/src/runtime/kernel/arm/int8/arithmetic_self_int8.h +++ b/mindspore/lite/src/runtime/kernel/arm/int8/arithmetic_self_int8.h @@ -45,7 +45,7 @@ class ArithmeticSelfInt8CPUKernel : public LiteKernel { explicit ArithmeticSelfInt8CPUKernel(OpParameter *parameter, const std::vector &inputs, const std::vector &outputs, const Context *ctx, const mindspore::lite::PrimitiveC *primitive) - : LiteKernel(parameter, inputs, outputs, ctx, primitive), ctx_(ctx), thread_count_(ctx->thread_num_) { + : LiteKernel(parameter, inputs, outputs, ctx, primitive), thread_count_(ctx->thread_num_) { switch (parameter->type_) { case PrimitiveType_Round: arithmeticSelf_run_ = Int8ElementRound; @@ -98,7 +98,6 @@ class ArithmeticSelfInt8CPUKernel : public LiteKernel { size_t data_size_; ArithmeticSelfParameter *para_; ArithmeticSelfInt8Run arithmeticSelf_run_; - const Context *ctx_; int thread_count_; int8_t *in_ptr_; int8_t *out_ptr_; diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/batchnorm_int8.cc b/mindspore/lite/src/runtime/kernel/arm/int8/batchnorm_int8.cc index e702bcc27d..846f6f16b4 100644 --- a/mindspore/lite/src/runtime/kernel/arm/int8/batchnorm_int8.cc +++ b/mindspore/lite/src/runtime/kernel/arm/int8/batchnorm_int8.cc @@ -180,7 +180,7 @@ int BatchnormInt8CPUKernel::DoExecute(int task_id) { return RET_OK; } -int BatchNormInt8Run(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +int BatchNormInt8Run(void *cdata, int task_id) { auto g_kernel = reinterpret_cast(cdata); auto ret = g_kernel->DoExecute(task_id); if (ret != RET_OK) { @@ -199,7 +199,7 @@ int BatchnormInt8CPUKernel::Run() { in_addr_ = reinterpret_cast(in_tensors_.at(0)->Data()); out_addr_ = reinterpret_cast(out_tensors_.at(0)->Data()); - int ret = LiteBackendParallelLaunch(BatchNormInt8Run, this, batchnorm_param_->op_parameter_.thread_num_); + int ret = ParallelLaunch(THREAD_POOL_DEFAULT, BatchNormInt8Run, this, batchnorm_param_->op_parameter_.thread_num_); if (ret != RET_OK) { MS_LOG(ERROR) << "BatchnormRun error error_code[" << ret << "]"; return ret; diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/concat_int8.cc b/mindspore/lite/src/runtime/kernel/arm/int8/concat_int8.cc index d60bde5f3e..8aad2d5716 100644 --- a/mindspore/lite/src/runtime/kernel/arm/int8/concat_int8.cc +++ b/mindspore/lite/src/runtime/kernel/arm/int8/concat_int8.cc @@ -104,12 +104,12 @@ int ConcatInt8CPUKernel::Run() { } output_data_ = reinterpret_cast(out_tensors_.at(0)->Data()); - ret = LiteBackendParallelLaunch(ConcatInt8Run, this, thread_count_); + ret = ParallelLaunch(THREAD_POOL_DEFAULT, ConcatInt8Run, this, thread_count_); return ret; } -int ConcatInt8Run(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +int ConcatInt8Run(void *cdata, int task_id) { auto concat = reinterpret_cast(cdata); concat->DoExecute(task_id); return lite::RET_OK; diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/concat_int8.h b/mindspore/lite/src/runtime/kernel/arm/int8/concat_int8.h index 7a677034db..0f8780fd2f 100644 --- a/mindspore/lite/src/runtime/kernel/arm/int8/concat_int8.h +++ b/mindspore/lite/src/runtime/kernel/arm/int8/concat_int8.h @@ -56,7 +56,7 @@ class ConcatInt8CPUKernel : public ConcatBaseCPUKernel { int8_t *output_data_ = nullptr; }; -int ConcatInt8Run(int task_id, LiteParallelGroupEnv *penv, void *cdata); +int ConcatInt8Run(void *cdata, int task_id); } // namespace mindspore::kernel #endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_INT8_CONCAT_INT8_H_ diff --git 
a/mindspore/lite/src/runtime/kernel/arm/int8/convolution_1x1_int8.cc b/mindspore/lite/src/runtime/kernel/arm/int8/convolution_1x1_int8.cc new file mode 100644 index 0000000000..128c88f3e5 --- /dev/null +++ b/mindspore/lite/src/runtime/kernel/arm/int8/convolution_1x1_int8.cc @@ -0,0 +1,351 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "src/runtime/kernel/arm/int8/convolution_1x1_int8.h" +#include "src/runtime/runtime_api.h" + +using mindspore::lite::RET_ERROR; +using mindspore::lite::RET_MEMORY_FAILED; +using mindspore::lite::RET_OK; + +namespace mindspore::kernel { +int Convolution1x1Int8Pre(void *cdata, int task_id) { + auto conv = reinterpret_cast<Convolution1x1Int8CPUKernel *>(cdata); + auto error_code = conv->RunPre(task_id); + if (error_code != RET_OK) { + MS_LOG(ERROR) << "conv1x1 Int8 RunPre error task_id[" << task_id << "] error_code[" << error_code << "]"; + return RET_ERROR; + } + return RET_OK; +} + +Convolution1x1Int8CPUKernel::~Convolution1x1Int8CPUKernel() { + if (matmul_param_ != nullptr) { + delete matmul_param_; + matmul_param_ = nullptr; + } + if (packed_weight_ != nullptr) { + free(packed_weight_); /* allocated with malloc in InitWeightBias, so free(), not delete */ + packed_weight_ = nullptr; + } + FreeResizeBuf(); + FreeQuantParam(); +} + +void Convolution1x1Int8CPUKernel::FreeResizeBuf() { + if (pre_trans_input_ && input_ptr_ != nullptr) { + free(input_ptr_); + input_ptr_ = nullptr; + } + return; +} + +void Convolution1x1Int8CPUKernel::CheckSupportOptimize() { + support_optimize_ = true; + matmul_func_ = MatMulInt8_8x8_r; +#ifdef ENABLE_ARM64 + void *optimize_op_handler = OptimizeModule::GetInstance()->optimized_op_handler_; + if (optimize_op_handler != nullptr) { + dlerror(); + *(reinterpret_cast<void **>(&matmul_func_)) = dlsym(optimize_op_handler, "MatMulRInt8_optimize_handler"); + auto dlopen_error = dlerror(); + if (dlopen_error != nullptr) { + MS_LOG(ERROR) << "load matmul func failed! " << dlopen_error << "."; + support_optimize_ = false; + matmul_func_ = nullptr; + } else { + support_optimize_ = true; + } + } else { + support_optimize_ = false; + matmul_func_ = nullptr; + } +#endif + return; +} + +int Convolution1x1Int8CPUKernel::InitWeightBias() { + auto filter_tensor = in_tensors_.at(kWeightIndex); + auto input_channel = filter_tensor->Channel(); + auto output_channel = filter_tensor->Batch(); + + /* weight */ + size_t size = support_optimize_ ?
UP_ROUND(input_channel, C4NUM) * UP_ROUND(output_channel, C8NUM) * sizeof(int8_t) + : UP_ROUND(input_channel, C16NUM) * UP_ROUND(output_channel, C4NUM) * sizeof(int8_t); + packed_weight_ = reinterpret_cast(malloc(size)); + if (packed_weight_ == nullptr) { + MS_LOG(ERROR) << "Conv1x1 int8 Malloc weight error!"; + return RET_ERROR; + } + memset(packed_weight_, 0, size); + if (support_optimize_) { + RowMajor2Row8x4MajorInt8(reinterpret_cast(filter_tensor->Data()), packed_weight_, output_channel, + input_channel); + } else { + RowMajor2Row4x16MajorInt8(reinterpret_cast(filter_tensor->Data()), packed_weight_, output_channel, + input_channel); + } + + /* bias = bias - v2 x zp1 + zp1 x zp2 */ + int col4 = UP_ROUND(output_channel, C4NUM); + int col8 = UP_ROUND(output_channel, C8NUM); + size = support_optimize_ ? col8 * sizeof(int32_t) : col4 * sizeof(int32_t); + bias_data_ = malloc(size); + if (bias_data_ == nullptr) { + MS_LOG(ERROR) << "Conv1x1 int8 Malloc bias_ptr_ error!"; + return RET_ERROR; + } + memset(bias_data_, 0, size); + if (in_tensors_.size() == 3) { + memcpy(bias_data_, in_tensors_[kBiasIndex]->Data(), output_channel * sizeof(int32_t)); + } + + int32_t *bias_data = reinterpret_cast(bias_data_); + int8_t *weight = reinterpret_cast(filter_tensor->Data()); + int32_t input_zp = conv_param_->conv_quant_arg_.input_quant_args_[0].zp_; + for (int oc = 0; oc < output_channel; oc++) { + int32_t weight_sum_value = 0; + int32_t filter_zp = (conv_param_->conv_quant_arg_.filter_arg_num_ == 1) + ? conv_param_->conv_quant_arg_.filter_quant_args_[0].zp_ + : conv_param_->conv_quant_arg_.filter_quant_args_[oc].zp_; + for (int ic = 0; ic < input_channel; ic++) { + weight_sum_value += weight[oc * input_channel + ic]; + } + bias_data[oc] += filter_zp * input_zp * input_channel - weight_sum_value * input_zp; + } + return RET_OK; +} + +int Convolution1x1Int8CPUKernel::Init() { + matmul_param_ = new (std::nothrow) MatMulParameter(); + if (matmul_param_ == nullptr) { + MS_LOG(ERROR) << "Init matmul_param_ failed."; + return RET_ERROR; + } + + CheckSupportOptimize(); + + auto ret = SetQuantParam(); + if (ret != RET_OK) { + MS_LOG(ERROR) << "Set quant param failed."; + return ret; + } + + ret = InitWeightBias(); + if (ret != RET_OK) { + MS_LOG(ERROR) << "Init weight bias failed."; + return ret; + } + + if (!InferShapeDone()) { + return RET_OK; + } + return ReSize(); +} + +int Convolution1x1Int8CPUKernel::InitParam() { + pre_trans_input_ = (conv_param_->pad_u_ != 0 || conv_param_->pad_l_ != 0 || conv_param_->stride_h_ != 1 || + conv_param_->stride_w_ != 1); + + matmul_param_->row_ = conv_param_->output_h_ * conv_param_->output_w_; + matmul_param_->deep_ = conv_param_->input_channel_; + matmul_param_->col_ = conv_param_->output_channel_; + matmul_param_->col_4_ = UP_ROUND(matmul_param_->col_, C4NUM); + matmul_param_->col_8_ = UP_ROUND(matmul_param_->col_, C8NUM); + matmul_param_->row_4_ = UP_ROUND(matmul_param_->row_, C4NUM); + matmul_param_->row_8_ = UP_ROUND(matmul_param_->row_, C8NUM); + matmul_param_->deep_4_ = UP_ROUND(matmul_param_->deep_, C4NUM); + matmul_param_->deep_16_ = UP_ROUND(matmul_param_->deep_, C16NUM); + + /* init input sum size */ + if (support_optimize_) { + if (conv_quant_arg_->per_channel_ & FILTER_PER_CHANNEL) { + input_sum_size = UP_ROUND(conv_param_->output_channel_, C8NUM) * UP_ROUND(matmul_param_->row_, C8NUM); + } else { + input_sum_size = UP_ROUND(matmul_param_->row_, C8NUM); + } + } else { + if (conv_quant_arg_->per_channel_ & FILTER_PER_CHANNEL) { + input_sum_size = 
UP_ROUND(conv_param_->output_channel_, C4NUM) * UP_ROUND(matmul_param_->row_, C4NUM); + } else { + input_sum_size = UP_ROUND(matmul_param_->row_, C4NUM); + } + } + + if (support_optimize_) { + thread_count_ = MSMIN(op_parameter_->thread_num_, UP_DIV(matmul_param_->col_, C8NUM)); + thread_stride_ = UP_DIV(UP_DIV(matmul_param_->col_, C8NUM), thread_count_); + } else { + thread_count_ = MSMIN(op_parameter_->thread_num_, UP_DIV(matmul_param_->col_, C4NUM)); + thread_stride_ = UP_DIV(UP_DIV(matmul_param_->col_, C4NUM), thread_count_); + } + + if (support_optimize_) { + thread_count_hw_ = MSMIN(op_parameter_->thread_num_, UP_DIV(matmul_param_->row_, C8NUM)); + thread_stride_hw_ = UP_DIV(UP_DIV(matmul_param_->row_, C8NUM), thread_count_hw_); + } else { + thread_count_hw_ = MSMIN(op_parameter_->thread_num_, UP_DIV(matmul_param_->row_, C4NUM)); + thread_stride_hw_ = UP_DIV(UP_DIV(matmul_param_->row_, C4NUM), thread_count_hw_); + } + + if (pre_trans_input_) { + input_ptr_ = reinterpret_cast(malloc(matmul_param_->row_ * matmul_param_->deep_ * sizeof(int8_t))); + if (input_ptr_ == nullptr) { + MS_LOG(ERROR) << "Conv1x1 int8 Malloc input_ptr_ error!"; + return RET_MEMORY_FAILED; + } + memset(input_ptr_, 0, matmul_param_->row_ * matmul_param_->deep_ * sizeof(int8_t)); + } + return RET_OK; +} + +int Convolution1x1Int8CPUKernel::ReSize() { + FreeResizeBuf(); + + ConvolutionBaseCPUKernel::Init(); + + int error_code = InitParam(); + if (error_code != RET_OK) { + MS_LOG(ERROR) << "Convolution base init failed."; + return error_code; + } + return RET_OK; +} + +void Convolution1x1Int8CPUKernel::Pre1x1Trans(int8_t *src_input, int8_t *src_output) { + output_ptr_ = src_output; + if (pre_trans_input_) { + Conv1x1InputPack(src_input, input_ptr_, conv_param_, sizeof(int8_t)); + } else { + input_ptr_ = src_input; + } + + if (support_optimize_) { + ParallelLaunch(THREAD_POOL_DEFAULT, Convolution1x1Int8Pre, this, thread_count_hw_); + } else { + RowMajor2Row16x4MajorInt8(input_ptr_, packed_input_, matmul_param_->row_, matmul_param_->deep_); + PackInputSum16x4Int8(packed_input_, input_sum_, matmul_param_->deep_, matmul_param_->col_, matmul_param_->row_, + conv_param_); + } + + return; +} + +int Convolution1x1Int8CPUKernel::RunImpl(int task_id) { + if (support_optimize_) { + int cur_stride = thread_stride_ * C8NUM; + int res_stride = matmul_param_->col_ - task_id * thread_stride_ * C8NUM; + int cur_oc = MSMIN(cur_stride, res_stride); + if (cur_oc <= 0) { + return RET_OK; + } + Conv1x1Int8Opt(packed_input_, packed_weight_ + task_id * thread_stride_ * C8NUM * matmul_param_->deep_4_, + output_ptr_ + task_id * thread_stride_ * C8NUM, input_sum_, + reinterpret_cast(bias_data_) + task_id * thread_stride_ * C8NUM, matmul_param_->row_, + cur_oc, matmul_param_->deep_4_, conv_param_, matmul_func_); + } else { + int cur_stride = thread_stride_ * C4NUM; + int res_stride = matmul_param_->col_ - task_id * thread_stride_ * C4NUM; + int cur_oc = MSMIN(cur_stride, res_stride); + if (cur_oc <= 0) { + return RET_OK; + } + Conv1x1Int8(packed_input_, packed_weight_ + task_id * thread_stride_ * C4NUM * matmul_param_->deep_16_, + output_ptr_ + task_id * thread_stride_ * C4NUM, input_sum_, + reinterpret_cast(bias_data_) + task_id * thread_stride_ * C4NUM, matmul_param_->row_, cur_oc, + matmul_param_->deep_16_, conv_param_); + } + return RET_OK; +} + +int Convolution1x1Int8CPUKernel::RunPre(int task_id) { + int cur_hw = MSMIN(thread_stride_hw_ * C8NUM, matmul_param_->row_ - task_id * thread_stride_hw_ * C8NUM); + if (cur_hw <= 0) { + return 
RET_OK; + } + Conv1x1PreOpt(input_ptr_ + task_id * thread_stride_hw_ * C8NUM * matmul_param_->deep_, + packed_input_ + task_id * thread_stride_hw_ * C8NUM * matmul_param_->deep_4_, + input_sum_ + task_id * thread_stride_hw_ * C8NUM, matmul_param_->deep_, matmul_param_->col_, cur_hw, + conv_param_); + return RET_OK; +} + +int Convolution1x1Int8Impl(void *cdata, int task_id) { + auto conv = reinterpret_cast<Convolution1x1Int8CPUKernel *>(cdata); + auto error_code = conv->RunImpl(task_id); + if (error_code != RET_OK) { + MS_LOG(ERROR) << "conv1x1 Int8 Run error task_id[" << task_id << "] error_code[" << error_code << "]"; + return RET_ERROR; + } + return RET_OK; +} + +int Convolution1x1Int8CPUKernel::InitRunBuf() { + input_sum_ = reinterpret_cast<int32_t *>(ctx_->allocator->Malloc(input_sum_size * sizeof(int32_t))); /* FreeRunBuf releases this via ctx_->allocator->Free, so allocate from the same allocator */ + if (input_sum_ == nullptr) { + MS_LOG(ERROR) << "malloc input_sum_ failed."; + return RET_ERROR; + } + + size_t size = support_optimize_ ? UP_ROUND(matmul_param_->row_, C8NUM) * UP_ROUND(matmul_param_->deep_, C4NUM) + : UP_ROUND(matmul_param_->row_, C4NUM) * UP_ROUND(matmul_param_->deep_, C16NUM); + packed_input_ = reinterpret_cast<int8_t *>(ctx_->allocator->Malloc(size * sizeof(int8_t))); + if (packed_input_ == nullptr) { + MS_LOG(ERROR) << "conv1x1 int8 Malloc packed_input_ error!"; + return RET_ERROR; + } + return RET_OK; +} + +void Convolution1x1Int8CPUKernel::FreeRunBuf() { + if (packed_input_ != nullptr) { + ctx_->allocator->Free(packed_input_); + packed_input_ = nullptr; + } + if (input_sum_ != nullptr) { + ctx_->allocator->Free(input_sum_); + input_sum_ = nullptr; + } + return; +} + +int Convolution1x1Int8CPUKernel::Run() { + auto ret = Prepare(); + if (ret != RET_OK) { + MS_LOG(ERROR) << "Prepare failed."; + return RET_ERROR; + } + + int error_code = InitRunBuf(); + if (error_code != RET_OK) { + MS_LOG(ERROR) << "conv1x1 int8 InitRunBuf error_code[" << error_code << "]"; + return RET_ERROR; + } + + int8_t *src_in = reinterpret_cast<int8_t *>(in_tensors_[0]->Data()); + int8_t *src_out = reinterpret_cast<int8_t *>(out_tensors_[0]->Data()); + + for (int batch_index = 0; batch_index < conv_param_->input_batch_; batch_index++) { + Pre1x1Trans(src_in + batch_index * conv_param_->input_h_ * conv_param_->input_w_ * conv_param_->input_channel_, + src_out + batch_index * matmul_param_->row_ * matmul_param_->col_); + ParallelLaunch(THREAD_POOL_DEFAULT, Convolution1x1Int8Impl, this, thread_count_); + } + + FreeRunBuf(); + + return RET_OK; +} +} // namespace mindspore::kernel diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/convolution_1x1_int8.h b/mindspore/lite/src/runtime/kernel/arm/int8/convolution_1x1_int8.h new file mode 100644 index 0000000000..6ffd5aa4a7 --- /dev/null +++ b/mindspore/lite/src/runtime/kernel/arm/int8/convolution_1x1_int8.h @@ -0,0 +1,76 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
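// A sketch of the per-task output-channel striding RunImpl implements above: each task
// owns thread_stride_ blocks of C8NUM (optimized path) or C4NUM columns, and the final
// task is clamped to whatever columns remain. Names here are local to this sketch.
static int CurrentOcCount(int col, int thread_stride, int block, int task_id) {
  int cur_stride = thread_stride * block;       // columns assigned per task
  int res_stride = col - task_id * cur_stride;  // columns left from this task onward
  return res_stride < cur_stride ? res_stride : cur_stride;  // <= 0 means nothing to do
}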
+ */ + +#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_INT8_CONVOLUTION_1x1_INT8_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_INT8_CONVOLUTION_1x1_INT8_H_ + +#include <vector> +#include "src/lite_kernel.h" +#include "include/errorcode.h" +#include "schema/model_generated.h" +#include "src/runtime/kernel/arm/base/convolution_base.h" +#include "nnacl/int8/conv_int8.h" +#include "nnacl/int8/matmul_int8.h" +#include "nnacl/matmul_parameter.h" +#include "nnacl/optimized_kernel.h" + +namespace mindspore::kernel { +class Convolution1x1Int8CPUKernel : public ConvolutionBaseCPUKernel { + public: + Convolution1x1Int8CPUKernel(OpParameter *parameter, const std::vector<lite::tensor::Tensor *> &inputs, + const std::vector<lite::tensor::Tensor *> &outputs, const Context *ctx, + const mindspore::lite::PrimitiveC *primitive) + : ConvolutionBaseCPUKernel(parameter, inputs, outputs, ctx, primitive) {} + ~Convolution1x1Int8CPUKernel() override; + + int Init() override; + int ReSize() override; + int Run() override; + + private: + int InitRunBuf(); + void FreeRunBuf(); + + public: + int RunImpl(int task_id); + int RunPre(int task_id); + + private: + void FreeResizeBuf(); + int InitParam(); + int InitWeightBias(); + void Pre1x1Trans(int8_t *src_input, int8_t *src_output); + void CheckSupportOptimize(); + + private: + int32_t *input_sum_ = nullptr; /* per-channel: oc4 format */ + int8_t *packed_weight_ = nullptr; + int8_t *packed_input_ = nullptr; + int8_t *input_ptr_ = nullptr; + int8_t *output_ptr_ = nullptr; + size_t thread_count_ = 1; + size_t thread_stride_ = 0; + size_t thread_count_hw_ = 1; + size_t thread_stride_hw_ = 0; + bool pre_trans_input_ = false; + size_t input_sum_size = 0; + MatMulParameter *matmul_param_ = nullptr; + MATMUL_OPT_R_FUNC matmul_func_ = nullptr; + bool support_optimize_ = false; +}; +} // namespace mindspore::kernel + +#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_INT8_CONVOLUTION_1x1_INT8_H_ diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/convolution_3x3_int8.cc b/mindspore/lite/src/runtime/kernel/arm/int8/convolution_3x3_int8.cc index 5dbddbaba2..188ec44d62 100644 --- a/mindspore/lite/src/runtime/kernel/arm/int8/convolution_3x3_int8.cc +++ b/mindspore/lite/src/runtime/kernel/arm/int8/convolution_3x3_int8.cc @@ -44,6 +44,10 @@ void ProcessFilterUint8(int8_t *origin_weight, int16_t *dst_weight, ConvParamete } void Convolution3x3Int8CPUKernel::FreeTmpBuffer() { + if (tile_buffer_ != nullptr) { + ctx_->allocator->Free(tile_buffer_); + tile_buffer_ = nullptr; + } if (block_unit_buffer_ != nullptr) { ctx_->allocator->Free(block_unit_buffer_); block_unit_buffer_ = nullptr; @@ -67,10 +71,6 @@ Convolution3x3Int8CPUKernel::~Convolution3x3Int8CPUKernel() { free(input_data_); input_data_ = nullptr; } - if (tile_buffer_ != nullptr) { - free(tile_buffer_); - tile_buffer_ = nullptr; - } FreeQuantParam(); } @@ -115,8 +115,16 @@ int Convolution3x3Int8CPUKernel::InitTmpBuffer() { int output_batch = conv_param_->output_batch_; int output_w = conv_param_->output_w_; int output_h = conv_param_->output_h_; + int ic8 = UP_DIV(conv_param_->input_channel_, C8NUM); MS_ASSERT(ctx_->allocator != nullptr); + size_t tile_buffer_size = thread_count_ * TILE_NUM * C16NUM * ic8 * C8NUM * sizeof(int16_t); + tile_buffer_ = reinterpret_cast<int16_t *>(ctx_->allocator->Malloc(tile_buffer_size)); + if (tile_buffer_ == nullptr) { + MS_LOG(ERROR) << "malloc tile_buffer_ failed."; + return RET_ERROR; + } + size_t block_unit_buffer_size = thread_count_ * 4 * 4 * C8NUM * sizeof(int16_t); block_unit_buffer_ = reinterpret_cast<int16_t *>(ctx_->allocator->Malloc(block_unit_buffer_size));
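// Note on the hunks above: tile_buffer_ moves from a malloc held across ReSize to the
// context allocator, so it now follows the same lifecycle as the other scratch buffers,
// acquired in InitTmpBuffer() before the parallel launch and returned in FreeTmpBuffer()
// afterwards, including on error paths. A hedged outline of that lifecycle, with
// Alloc/Free standing in for ctx_->allocator->Malloc/Free:
//   buf = Alloc(thread_count * TILE_NUM * C16NUM * ic8 * C8NUM * sizeof(int16_t));
//   if (buf == nullptr) return RET_ERROR;  // never launch with a missing buffer
//   ParallelLaunch(...);                   // workers index buf by task_id
//   Free(buf); buf = nullptr;              // released every Run, also on failure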
if (block_unit_buffer_ == nullptr) { @@ -146,9 +154,8 @@ void Convolution3x3Int8CPUKernel::ConfigInputOutput() { } int Convolution3x3Int8CPUKernel::Init() { - if (!InferShapeDone()) { - return RET_OK; - } + // config input output + ConfigInputOutput(); auto ret = SetQuantParam(); if (ret != RET_OK) { MS_LOG(ERROR) << "Set quant param failed."; @@ -159,8 +166,9 @@ MS_LOG(ERROR) << "Init weight bias failed."; return RET_ERROR; } - // config input output - ConfigInputOutput(); + if (!InferShapeDone()) { + return RET_OK; + } return ReSize(); } @@ -175,10 +183,6 @@ free(input_data_); input_data_ = nullptr; } - if (tile_buffer_ != nullptr) { - free(tile_buffer_); - tile_buffer_ = nullptr; - } ret = ConvolutionBaseCPUKernel::Init(); if (ret != RET_OK) { @@ -196,13 +200,6 @@ } memset(input_data_, 0, c8_input_size); - size_t tile_buffer_size = thread_count_ * TILE_NUM * C16NUM * ic8 * C8NUM * sizeof(int16_t); - tile_buffer_ = reinterpret_cast<int16_t *>(malloc(tile_buffer_size)); - if (tile_buffer_ == nullptr) { - MS_LOG(ERROR) << "malloc tile_buffer_ failed."; - return RET_ERROR; - } - memset(tile_buffer_, 0, tile_buffer_size); return RET_OK; } @@ -213,7 +210,7 @@ int Convolution3x3Int8CPUKernel::RunImpl(int task_id) { return RET_OK; } -int Convolution3x3Int8Impl(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +int Convolution3x3Int8Impl(void *cdata, int task_id) { auto conv = reinterpret_cast<Convolution3x3Int8CPUKernel *>(cdata); auto error_code = conv->RunImpl(task_id); if (error_code != RET_OK) { @@ -238,7 +235,7 @@ int Convolution3x3Int8CPUKernel::Run() { auto input_addr = reinterpret_cast<int8_t *>(in_tensors_.at(kInputIndex)->Data()); PackInputToC8Int8(input_addr, input_data_, conv_param_); - int error_code = LiteBackendParallelLaunch(Convolution3x3Int8Impl, this, thread_count_); + int error_code = ParallelLaunch(THREAD_POOL_DEFAULT, Convolution3x3Int8Impl, this, thread_count_); if (error_code != RET_OK) { MS_LOG(ERROR) << "conv3x3 int8 error error_code[" << error_code << "]"; FreeTmpBuffer(); diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/convolution_depthwise_int8.cc b/mindspore/lite/src/runtime/kernel/arm/int8/convolution_depthwise_int8.cc index 33683f54cf..a1e47bd2a4 100644 --- a/mindspore/lite/src/runtime/kernel/arm/int8/convolution_depthwise_int8.cc +++ b/mindspore/lite/src/runtime/kernel/arm/int8/convolution_depthwise_int8.cc @@ -15,6 +15,7 @@ */ #include "src/runtime/kernel/arm/int8/convolution_depthwise_int8.h" +#include "src/runtime/kernel/arm/int8/convolution_depthwise_slidewindow_int8.h" #include "schema/model_generated.h" #include "src/kernel_registry.h" #include "include/errorcode.h" @@ -28,91 +29,54 @@ using mindspore::lite::RET_OK; using mindspore::schema::PrimitiveType_DepthwiseConv2D; namespace mindspore::kernel { -void ConvolutionDepthwiseInt8CPUKernel::FreeTmpBuffer() { - if (packed_input_ != nullptr) { - free(packed_input_); - packed_input_ = nullptr; - } - if (need_align_) { - if (packed_output_ != nullptr) { - free(packed_output_); - packed_output_ = nullptr; - } - } -} - ConvolutionDepthwiseInt8CPUKernel::~ConvolutionDepthwiseInt8CPUKernel() { - if (sliding != nullptr) { - delete sliding; - sliding = nullptr; - } if (packed_weight_ != nullptr) { free(packed_weight_); packed_weight_ = nullptr; } - FreeTmpBuffer(); FreeQuantParam(); } int ConvolutionDepthwiseInt8CPUKernel::InitWeightBias() { // init weight, int8 -> int16 - // o, h, w, i -> o/8, h, w, i, 8; o == group, i == 1
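// A compact sketch of the repack the following lines perform: the int8 depthwise weight
// is transposed from NCHW (channel-major) to NHWC and the filter zero point is folded in
// once, widening to int16 so the inner loop multiplies without re-subtracting zp.
// Assumes out-channel == group and in-channel == 1, as the surrounding code does.
static void RepackDwWeightInt16(const int8_t *src, int16_t *dst, int channel, int area, int zp) {
  for (int hw = 0; hw < area; hw++) {  // area = kernel_h * kernel_w
    for (int c = 0; c < channel; c++) {
      dst[hw * channel + c] = static_cast<int16_t>(src[c * area + hw] - zp);
    }
  }
}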
auto weight_tensor = in_tensors_[kWeightIndex]; auto origin_weight = reinterpret_cast(weight_tensor->Data()); - int OC4 = UP_DIV(weight_tensor->Batch(), C4NUM); - int pack_weight_size = C4NUM * OC4 * weight_tensor->Height() * weight_tensor->Width(); + int channel = weight_tensor->Batch(); + int pack_weight_size = channel * weight_tensor->Height() * weight_tensor->Width(); + auto tmp_weight = reinterpret_cast(malloc(pack_weight_size * sizeof(int8_t))); + if (tmp_weight == nullptr) { + MS_LOG(ERROR) << "Malloc buffer failed."; + return RET_ERROR; + } + PackNCHWToNHWCInt8(origin_weight, tmp_weight, 1, weight_tensor->Height() * weight_tensor->Width(), + weight_tensor->Batch()); + + int weight_zp = conv_param_->conv_quant_arg_.filter_quant_args_[0].zp_; packed_weight_ = reinterpret_cast(malloc(pack_weight_size * sizeof(int16_t))); if (packed_weight_ == nullptr) { MS_LOG(ERROR) << "Malloc buffer failed."; return RET_ERROR; } - PackDepthwiseInt8Weight(origin_weight, packed_weight_, weight_tensor->Height() * weight_tensor->Width(), - weight_tensor->Batch(), &(conv_param_->conv_quant_arg_)); + for (int i = 0; i < weight_tensor->ElementsNum(); i++) { + packed_weight_[i] = (int16_t)(tmp_weight[i] - weight_zp); + } - bias_data_ = reinterpret_cast(malloc(C4NUM * OC4 * sizeof(int32_t))); + bias_data_ = reinterpret_cast(malloc(channel * sizeof(int32_t))); if (bias_data_ == nullptr) { MS_LOG(ERROR) << "Malloc buffer failed."; return RET_ERROR; } - memset(bias_data_, 0, C4NUM * OC4 * sizeof(int32_t)); + memset(bias_data_, 0, channel * sizeof(int32_t)); if (in_tensors_.size() == kInputSize2) { auto bias_tensor = in_tensors_.at(kBiasIndex); auto ori_bias = reinterpret_cast(bias_tensor->Data()); memcpy(bias_data_, ori_bias, bias_tensor->ElementsNum() * sizeof(int32_t)); } - conv_param_->thread_num_ = MSMIN(thread_count_, OC4); - return RET_OK; -} - -int ConvolutionDepthwiseInt8CPUKernel::InitBuffer() { - int pack_input_size = conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * C4NUM * - UP_DIV(conv_param_->input_channel_, 4); - packed_input_ = reinterpret_cast(malloc(pack_input_size * sizeof(int16_t))); - if (packed_input_ == nullptr) { - MS_LOG(ERROR) << "Malloc buffer failed."; - return RET_ERROR; - } - - if (conv_param_->input_channel_ % C4NUM != 0) { - need_align_ = true; - int pack_output_size = conv_param_->output_batch_ * conv_param_->output_h_ * conv_param_->output_w_ * C4NUM * - UP_DIV(conv_param_->output_channel_, C4NUM); - packed_output_ = reinterpret_cast(malloc(pack_output_size * sizeof(int8_t))); - if (packed_input_ == nullptr) { - MS_LOG(ERROR) << "Malloc buffer failed."; - return RET_ERROR; - } - } return RET_OK; } int ConvolutionDepthwiseInt8CPUKernel::Init() { - sliding = new (std::nothrow) SlidingWindowParam; - if (sliding == nullptr) { - MS_LOG(ERROR) << "new sliding window param."; - return RET_ERROR; - } if (!InferShapeDone()) { return RET_OK; } @@ -120,35 +84,29 @@ int ConvolutionDepthwiseInt8CPUKernel::Init() { } int ConvolutionDepthwiseInt8CPUKernel::ReSize() { - FreeTmpBuffer(); ConvolutionBaseCPUKernel::Init(); - InitSlidingParamConvDw(sliding, conv_param_, C4NUM); - auto ret = ConvolutionBaseCPUKernel::SetQuantParam(); if (ret != RET_OK) { MS_LOG(ERROR) << "Set quant param failed."; return ret; } + conv_param_->thread_num_ = MSMIN(thread_count_, conv_param_->output_h_); ret = InitWeightBias(); if (ret != RET_OK) { MS_LOG(ERROR) << "Depthwise int8 InitWeightBias error!"; return ret; } - ret = InitBuffer(); - if (ret != RET_OK) { - MS_LOG(ERROR) << "Depthwise 
int8 ReSize error!"; - return ret; - } return RET_OK; } int ConvolutionDepthwiseInt8CPUKernel::Execute(int task_id) { - ConvDwInt8(packed_output_, packed_input_, packed_weight_, reinterpret_cast(bias_data_), conv_param_, - sliding, task_id); + auto buffer = row_buffer_ + conv_param_->output_w_ * conv_param_->output_channel_ * task_id; + ConvDwInt8(output_ptr_, buffer, input_ptr_, packed_weight_, reinterpret_cast(bias_data_), conv_param_, + task_id); return RET_OK; } -int ConvDwInt8Run(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +int ConvDwInt8Run(void *cdata, int task_id) { auto conv_dw_int8 = reinterpret_cast(cdata); auto ret = conv_dw_int8->Execute(task_id); if (ret != RET_OK) { @@ -158,36 +116,46 @@ int ConvDwInt8Run(int task_id, LiteParallelGroupEnv *penv, void *cdata) { return RET_OK; } +int ConvolutionDepthwiseInt8CPUKernel::InitBuffer() { + int output_row_size = conv_param_->thread_num_ * conv_param_->output_w_ * conv_param_->output_channel_; + row_buffer_ = reinterpret_cast(context_->allocator->Malloc(output_row_size * sizeof(float))); + if (row_buffer_ == nullptr) { + MS_LOG(ERROR) << "Malloc buffer failed."; + return RET_ERROR; + } + return RET_OK; +} + int ConvolutionDepthwiseInt8CPUKernel::Run() { + if (conv_param_->input_channel_ != conv_param_->output_channel_) { + MS_LOG(ERROR) << "Only support input channel equals output channel."; + return RET_ERROR; + } auto ret = Prepare(); if (ret != RET_OK) { MS_LOG(ERROR) << "Prepare failed."; return RET_ERROR; } - if (conv_param_->input_channel_ != conv_param_->output_channel_) { - MS_LOG(ERROR) << "Only support input channel equals output channel."; - return RET_ERROR; + + ret = InitBuffer(); + if (ret != RET_OK) { + MS_LOG(ERROR) << "Depthwise int8 ReSize error!"; + return ret; } auto input_tensor = in_tensors_.at(kInputIndex); - auto input_addr = reinterpret_cast(input_tensor->Data()); - PackDepthwiseInt8Input(input_addr, packed_input_, conv_param_); + input_ptr_ = reinterpret_cast(input_tensor->Data()); - auto output_addr = reinterpret_cast(out_tensors_.at(kOutputIndex)->Data()); - if (!need_align_) { - packed_output_ = output_addr; - } + auto output_tensor = out_tensors_.at(kOutputIndex); + output_ptr_ = reinterpret_cast(output_tensor->Data()); - ret = LiteBackendParallelLaunch(ConvDwInt8Run, this, conv_param_->thread_num_); + ret = ParallelLaunch(THREAD_POOL_DEFAULT, ConvDwInt8Run, this, conv_param_->thread_num_); if (ret != RET_OK) { MS_LOG(ERROR) << "ConvDwInt8Run error: error_code[" << ret << "]"; return RET_ERROR; } - if (need_align_) { - PackNHWC4ToNHWCInt8(packed_output_, output_addr, conv_param_->output_batch_, - conv_param_->output_h_ * conv_param_->output_w_, conv_param_->output_channel_); - } + context_->allocator->Free(row_buffer_); return RET_OK; } @@ -198,8 +166,14 @@ kernel::LiteKernel *CpuConvDwInt8KernelCreator(const std::vectorGetQuantParams().size(); + if (filter_quant_size == 1) { // per tensor + kernel = new (std::nothrow) kernel::ConvolutionDepthwiseInt8CPUKernel(opParameter, inputs, outputs, ctx, primitive); + } else { // per channel + kernel = + new (std::nothrow) kernel::ConvolutionDepthwiseSWInt8CPUKernel(opParameter, inputs, outputs, ctx, primitive); + } if (kernel == nullptr) { MS_LOG(ERROR) << "kernel is nullptr."; return nullptr; diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/convolution_depthwise_int8.h b/mindspore/lite/src/runtime/kernel/arm/int8/convolution_depthwise_int8.h index cef519df8d..b8661236bc 100644 --- 
a/mindspore/lite/src/runtime/kernel/arm/int8/convolution_depthwise_int8.h +++ b/mindspore/lite/src/runtime/kernel/arm/int8/convolution_depthwise_int8.h @@ -36,16 +36,14 @@ class ConvolutionDepthwiseInt8CPUKernel : public ConvolutionBaseCPUKernel { int Run() override; int InitWeightBias(); - int InitBuffer(); int Execute(int task_id); private: - void FreeTmpBuffer(); - SlidingWindowParam *sliding = nullptr; + int InitBuffer(); int16_t *packed_weight_ = nullptr; - int16_t *packed_input_ = nullptr; - int8_t *packed_output_ = nullptr; - bool need_align_ = false; + int8_t *input_ptr_ = nullptr; + int8_t *output_ptr_ = nullptr; + int32_t *row_buffer_ = nullptr; }; } // namespace mindspore::kernel diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/convolution_depthwise_slidewindow_int8.cc b/mindspore/lite/src/runtime/kernel/arm/int8/convolution_depthwise_slidewindow_int8.cc new file mode 100644 index 0000000000..3e4d3274b9 --- /dev/null +++ b/mindspore/lite/src/runtime/kernel/arm/int8/convolution_depthwise_slidewindow_int8.cc @@ -0,0 +1,182 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "src/runtime/kernel/arm/int8/convolution_depthwise_slidewindow_int8.h" +#include "schema/model_generated.h" +#include "src/kernel_registry.h" +#include "include/errorcode.h" +#include "nnacl/int8/conv_depthwise_int8.h" +#include "src/runtime/runtime_api.h" + +using mindspore::kernel::KERNEL_ARCH::kCPU; +using mindspore::lite::KernelRegistrar; +using mindspore::lite::RET_ERROR; +using mindspore::lite::RET_OK; +using mindspore::schema::PrimitiveType_DepthwiseConv2D; + +namespace mindspore::kernel { +ConvolutionDepthwiseSWInt8CPUKernel::~ConvolutionDepthwiseSWInt8CPUKernel() { + if (sliding != nullptr) { + delete sliding; + sliding = nullptr; + } + if (packed_weight_ != nullptr) { + free(packed_weight_); + packed_weight_ = nullptr; + } + FreeQuantParam(); +} + +int ConvolutionDepthwiseSWInt8CPUKernel::InitWeightBias() { + // init weight, int8 -> int16 + // o, h, w, i -> o/8, h, w, i, 8; o == group, i == 1 + auto weight_tensor = in_tensors_[kWeightIndex]; + auto origin_weight = reinterpret_cast(weight_tensor->Data()); + int OC4 = UP_DIV(weight_tensor->Batch(), C4NUM); + int pack_weight_size = C4NUM * OC4 * weight_tensor->Height() * weight_tensor->Width(); + packed_weight_ = reinterpret_cast(malloc(pack_weight_size * sizeof(int16_t))); + if (packed_weight_ == nullptr) { + MS_LOG(ERROR) << "Malloc buffer failed."; + return RET_ERROR; + } + PackDepthwiseInt8Weight(origin_weight, packed_weight_, weight_tensor->Height() * weight_tensor->Width(), + weight_tensor->Batch(), &(conv_param_->conv_quant_arg_)); + + bias_data_ = reinterpret_cast(malloc(C4NUM * OC4 * sizeof(int32_t))); + if (bias_data_ == nullptr) { + MS_LOG(ERROR) << "Malloc buffer failed."; + return RET_ERROR; + } + memset(bias_data_, 0, C4NUM * OC4 * sizeof(int32_t)); + if (in_tensors_.size() == kInputSize2) { + auto bias_tensor = in_tensors_.at(kBiasIndex); + auto ori_bias = 
reinterpret_cast<int32_t *>(bias_tensor->Data());
+    memcpy(bias_data_, ori_bias, bias_tensor->ElementsNum() * sizeof(int32_t));
+  }
+
+  conv_param_->thread_num_ = MSMIN(thread_count_, OC4);
+  return RET_OK;
+}
+
+int ConvolutionDepthwiseSWInt8CPUKernel::InitBuffer() {
+  int pack_input_size = conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * C4NUM *
+                        UP_DIV(conv_param_->input_channel_, 4);
+  packed_input_ = reinterpret_cast<int16_t *>(context_->allocator->Malloc(pack_input_size * sizeof(int16_t)));
+  if (packed_input_ == nullptr) {
+    MS_LOG(ERROR) << "Malloc buffer failed.";
+    return RET_ERROR;
+  }
+
+  if (conv_param_->input_channel_ % C4NUM != 0) {
+    need_align_ = true;
+    int pack_output_size = conv_param_->output_batch_ * conv_param_->output_h_ * conv_param_->output_w_ * C4NUM *
+                           UP_DIV(conv_param_->output_channel_, C4NUM);
+    packed_output_ = reinterpret_cast<int8_t *>(context_->allocator->Malloc(pack_output_size * sizeof(int8_t)));
+    if (packed_output_ == nullptr) {
+      MS_LOG(ERROR) << "Malloc buffer failed.";
+      return RET_ERROR;
+    }
+  }
+  return RET_OK;
+}
+
+int ConvolutionDepthwiseSWInt8CPUKernel::Init() {
+  sliding = new (std::nothrow) SlidingWindowParam;
+  if (sliding == nullptr) {
+    MS_LOG(ERROR) << "new sliding window param.";
+    return RET_ERROR;
+  }
+  if (!InferShapeDone()) {
+    return RET_OK;
+  }
+  return ReSize();
+}
+
+int ConvolutionDepthwiseSWInt8CPUKernel::ReSize() {
+  ConvolutionBaseCPUKernel::Init();
+  InitSlidingParamConvDw(sliding, conv_param_, C4NUM);
+
+  auto ret = ConvolutionBaseCPUKernel::SetQuantParam();
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "Set quant param failed.";
+    return ret;
+  }
+  ret = InitWeightBias();
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "Depthwise int8 InitWeightBias error!";
+    return ret;
+  }
+  return RET_OK;
+}
+
+int ConvolutionDepthwiseSWInt8CPUKernel::Execute(int task_id) {
+  ConvDwSWInt8(packed_output_, packed_input_, packed_weight_, reinterpret_cast<int32_t *>(bias_data_), conv_param_,
+               sliding, task_id);
+  return RET_OK;
+}
+
+int ConvDwSWInt8Run(void *cdata, int task_id) {
+  auto conv_dw_int8 = reinterpret_cast<ConvolutionDepthwiseSWInt8CPUKernel *>(cdata);
+  auto ret = conv_dw_int8->Execute(task_id);
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "ConvolutionDepthwiseSWInt8Run error task_id[" << task_id << "] error_code[" << ret << "]";
+    return RET_ERROR;
+  }
+  return RET_OK;
+}
+
+int ConvolutionDepthwiseSWInt8CPUKernel::Run() {
+  if (conv_param_->input_channel_ != conv_param_->output_channel_) {
+    MS_LOG(ERROR) << "Only support input channel equals output channel.";
+    return RET_ERROR;
+  }
+  auto ret = Prepare();
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "Prepare failed.";
+    return RET_ERROR;
+  }
+
+  ret = InitBuffer();
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "Depthwise int8 InitBuffer error!";
+    return ret;
+  }
+
+  auto input_tensor = in_tensors_.at(kInputIndex);
+  auto input_addr = reinterpret_cast<int8_t *>(input_tensor->Data());
+  PackDepthwiseInt8Input(input_addr, packed_input_, conv_param_);
+
+  auto output_addr = reinterpret_cast<int8_t *>(out_tensors_.at(kOutputIndex)->Data());
+  if (!need_align_) {
+    packed_output_ = output_addr;
+  }
+
+  ret = ParallelLaunch(THREAD_POOL_DEFAULT, ConvDwSWInt8Run, this, conv_param_->thread_num_);
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "ConvDwSWInt8Run error: error_code[" << ret << "]";
+    return RET_ERROR;
+  }
+
+  if (need_align_) {
+    PackNHWC4ToNHWCInt8(packed_output_, output_addr, conv_param_->output_batch_,
+                        conv_param_->output_h_ * conv_param_->output_w_, conv_param_->output_channel_);
+    context_->allocator->Free(packed_output_);
+  }
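+
+  // Editorial note (added for clarity, not part of the original patch): when
+  // need_align_ is false the kernel wrote straight into output_addr, so only
+  // the packed input scratch buffer is returned to the allocator here; the
+  // padded packed_output_ was already unpacked and freed in the branch above.
+ 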
context_->allocator->Free(packed_input_); + return RET_OK; +} + +} // namespace mindspore::kernel diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise_3x3.h b/mindspore/lite/src/runtime/kernel/arm/int8/convolution_depthwise_slidewindow_int8.h similarity index 55% rename from mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise_3x3.h rename to mindspore/lite/src/runtime/kernel/arm/int8/convolution_depthwise_slidewindow_int8.h index 728c00c4b3..4c373c2466 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise_3x3.h +++ b/mindspore/lite/src/runtime/kernel/arm/int8/convolution_depthwise_slidewindow_int8.h @@ -14,8 +14,8 @@ * limitations under the License. */ -#ifndef MINDSPORE_LITE_SRC_BACKEND_ARM_FP32_CONVOLUTION_DEPTHWISE_3X3_H_ -#define MINDSPORE_LITE_SRC_BACKEND_ARM_FP32_CONVOLUTION_DEPTHWISE_3X3_H_ +#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_INT8_CONVOLUTION_DEPTHWISE_SLIDEWINDOW_INT8_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_INT8_CONVOLUTION_DEPTHWISE_SLIDEWINDOW_INT8_H_ #include #include "src/lite_kernel.h" @@ -23,14 +23,13 @@ #include "nnacl/fp32/conv_depthwise.h" namespace mindspore::kernel { -class ConvolutionDepthwise3x3CPUKernel : public ConvolutionBaseCPUKernel { +class ConvolutionDepthwiseSWInt8CPUKernel : public ConvolutionBaseCPUKernel { public: - ConvolutionDepthwise3x3CPUKernel(OpParameter *parameter, const std::vector &inputs, - const std::vector &outputs, const Context *ctx, - const mindspore::lite::PrimitiveC *primitive) + ConvolutionDepthwiseSWInt8CPUKernel(OpParameter *parameter, const std::vector &inputs, + const std::vector &outputs, const Context *ctx, + const mindspore::lite::PrimitiveC *primitive) : ConvolutionBaseCPUKernel(parameter, inputs, outputs, ctx, primitive) {} - - ~ConvolutionDepthwise3x3CPUKernel() override; + ~ConvolutionDepthwiseSWInt8CPUKernel() override; int Init() override; int ReSize() override; @@ -41,15 +40,12 @@ class ConvolutionDepthwise3x3CPUKernel : public ConvolutionBaseCPUKernel { int Execute(int task_id); private: - void FreeTmpBufer(); - float *packed_weight_ = nullptr; - float *packed_input_ = nullptr; - float *packed_output_ = nullptr; - float *block_buffer_ = nullptr; - float *trans_buffer_ = nullptr; - int trans_size_; + SlidingWindowParam *sliding = nullptr; + int16_t *packed_weight_ = nullptr; + int16_t *packed_input_ = nullptr; + int8_t *packed_output_ = nullptr; bool need_align_ = false; }; } // namespace mindspore::kernel -#endif // MINDSPORE_LITE_SRC_BACKEND_ARM_FP32_CONVOLUTION_DEPTHWISE_3X3_H_ +#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_INT8_CONVOLUTION_DEPTHWISE_SLIDEWINDOW_INT8_H_ diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/convolution_int8.cc b/mindspore/lite/src/runtime/kernel/arm/int8/convolution_int8.cc index 36760454eb..1c069ce764 100644 --- a/mindspore/lite/src/runtime/kernel/arm/int8/convolution_int8.cc +++ b/mindspore/lite/src/runtime/kernel/arm/int8/convolution_int8.cc @@ -16,6 +16,7 @@ #include "src/runtime/kernel/arm/int8/convolution_int8.h" #include "src/runtime/kernel/arm/int8/convolution_3x3_int8.h" +#include "src/runtime/kernel/arm/int8/convolution_1x1_int8.h" #include "nnacl/int8/conv_int8.h" #include "src/runtime/kernel/arm/base/layout_transform.h" #include "schema/model_generated.h" @@ -246,9 +247,6 @@ void ConvolutionInt8CPUKernel::ConfigInputOutput() { } int ConvolutionInt8CPUKernel::Init() { - if (!InferShapeDone()) { - return RET_OK; - } // config input output ConfigInputOutput(); CheckSupportOptimize(); @@ -272,7 
+270,9 @@ int ConvolutionInt8CPUKernel::Init() {
       return RET_ERROR;
     }
   }
-
+  if (!InferShapeDone()) {
+    return RET_OK;
+  }
   return ReSize();
 }
 
@@ -337,7 +337,7 @@ int ConvolutionInt8CPUKernel::RunImpl(int task_id) {
   return RET_OK;
 }
 
-int ConvolutionInt8Impl(int task_id, LiteParallelGroupEnv *mpenv, void *cdata) {
+int ConvolutionInt8Impl(void *cdata, int task_id) {
   auto conv = reinterpret_cast<ConvolutionInt8CPUKernel *>(cdata);
   auto error_code = conv->RunImpl(task_id);
   if (error_code != RET_OK) {
@@ -373,7 +373,7 @@ int ConvolutionInt8CPUKernel::Run() {
   convert_func_(ori_input_data, nhwc4_input_, conv_param_->input_batch_,
                 conv_param_->input_h_ * conv_param_->input_w_, conv_param_->input_channel_);
-  int error_code = LiteBackendParallelLaunch(ConvolutionInt8Impl, this, thread_count_);
+  int error_code = ParallelLaunch(THREAD_POOL_DEFAULT, ConvolutionInt8Impl, this, thread_count_);
   if (error_code != RET_OK) {
     MS_LOG(ERROR) << "conv int8 error error_code[" << error_code << "]";
     FreeTmpBuffer();
@@ -398,8 +398,11 @@ kernel::LiteKernel *CpuConvInt8KernelCreator(const std::vector
   int dilation_h = conv_param->dilation_h_;
   int dilation_w = conv_param->dilation_w_;
   kernel::LiteKernel *kernel;
+  auto filter_quant_size = inputs[kWeightIndex]->GetQuantParams().size();
   if (kernel_h == 3 && kernel_w == 3 && stride_h == 1 && stride_w == 1 && dilation_h == 1 && dilation_w == 1) {
-    kernel = new (std::nothrow) kernel::Convolution3x3Int8CPUKernel(opParameter, inputs, outputs, ctx, primitive);
+    kernel = new (std::nothrow) kernel::ConvolutionInt8CPUKernel(opParameter, inputs, outputs, ctx, primitive);
+  } else if (kernel_h == 1 && kernel_w == 1 && filter_quant_size == 1) {
+    kernel = new (std::nothrow) kernel::Convolution1x1Int8CPUKernel(opParameter, inputs, outputs, ctx, primitive);
   } else {
     kernel = new (std::nothrow) kernel::ConvolutionInt8CPUKernel(opParameter, inputs, outputs, ctx, primitive);
   }
diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/crop_int8.cc b/mindspore/lite/src/runtime/kernel/arm/int8/crop_int8.cc
index 0feb6e6614..afc1c6545d 100644
--- a/mindspore/lite/src/runtime/kernel/arm/int8/crop_int8.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/int8/crop_int8.cc
@@ -67,7 +67,7 @@ int CropInt8CPUKernel::Run() {
     MS_LOG(ERROR) << "Prepare fail!ret: " << ret;
     return ret;
   }
-  ret = LiteBackendParallelLaunch(CropInt8Run, this, thread_count_);
+  ret = ParallelLaunch(THREAD_POOL_DEFAULT, CropInt8Run, this, thread_count_);
   return ret;
 }
 
@@ -91,7 +91,7 @@ void PadOffset(int input_dim, CropParameter *crop_para) {
   }
 }
 
-int CropInt8Run(int task_id, LiteParallelGroupEnv *penv, void *cdata) {
+int CropInt8Run(void *cdata, int task_id) {
   auto crop = reinterpret_cast<CropInt8CPUKernel *>(cdata);
   crop->DoExecute(task_id);
   return RET_OK;
diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/crop_int8.h b/mindspore/lite/src/runtime/kernel/arm/int8/crop_int8.h
index 46aabf4354..3cbcaba8eb 100644
--- a/mindspore/lite/src/runtime/kernel/arm/int8/crop_int8.h
+++ b/mindspore/lite/src/runtime/kernel/arm/int8/crop_int8.h
@@ -46,7 +46,7 @@ class CropInt8CPUKernel : public CropBaseCPUKernel {
   CropParameter *crop_para_;
 };
 
-int CropInt8Run(int task_id, LiteParallelGroupEnv *penv, void *cdata);
+int CropInt8Run(void *cdata, int task_id);
 void PadOffset(int input_dim, CropParameter *crop_para);
 
 } // namespace mindspore::kernel
diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/deconvolution_depthwise_int8.cc b/mindspore/lite/src/runtime/kernel/arm/int8/deconvolution_depthwise_int8.cc
index 4f51bdc6ef..ba4dca80fc 100644
---
a/mindspore/lite/src/runtime/kernel/arm/int8/deconvolution_depthwise_int8.cc +++ b/mindspore/lite/src/runtime/kernel/arm/int8/deconvolution_depthwise_int8.cc @@ -37,27 +37,9 @@ DeconvolutionDepthwiseInt8CPUKernel::~DeconvolutionDepthwiseInt8CPUKernel() { delete packed_weight_; packed_weight_ = nullptr; } - FreeTmpBuffer(); FreeQuantParam(); } -void DeconvolutionDepthwiseInt8CPUKernel::FreeTmpBuffer() { - if (packed_input_ != nullptr) { - delete packed_input_; - packed_input_ = nullptr; - } - if (need_align_) { - if (packed_output_ != nullptr) { - delete packed_output_; - packed_output_ = nullptr; - } - } - if (output_buffer_ != nullptr) { - delete output_buffer_; - output_buffer_ = nullptr; - } -} - int DeconvolutionDepthwiseInt8CPUKernel::InitWeightBias() { // init weight: int8 -> int16 // o, h, w, i -> o/8, h, w, i, 8; o == group, i == 1 @@ -111,7 +93,7 @@ int DeconvolutionDepthwiseInt8CPUKernel::InitSlideParam() { int DeconvolutionDepthwiseInt8CPUKernel::InitBuffer() { int pack_input_size = conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * C4NUM * UP_DIV(conv_param_->input_channel_, 4); - packed_input_ = reinterpret_cast(malloc(pack_input_size * sizeof(int16_t))); + packed_input_ = reinterpret_cast(context_->allocator->Malloc(pack_input_size * sizeof(int16_t))); if (packed_input_ == nullptr) { MS_LOG(ERROR) << "Malloc buffer failed."; return RET_ERROR; @@ -121,7 +103,7 @@ int DeconvolutionDepthwiseInt8CPUKernel::InitBuffer() { need_align_ = true; int pack_output_size = conv_param_->output_batch_ * conv_param_->output_h_ * conv_param_->output_w_ * C4NUM * UP_DIV(conv_param_->output_channel_, C4NUM); - packed_output_ = reinterpret_cast(malloc(pack_output_size * sizeof(int8_t))); + packed_output_ = reinterpret_cast(context_->allocator->Malloc(pack_output_size * sizeof(int8_t))); if (packed_output_ == nullptr) { MS_LOG(ERROR) << "Malloc buffer failed."; return RET_ERROR; @@ -129,8 +111,8 @@ int DeconvolutionDepthwiseInt8CPUKernel::InitBuffer() { memset(packed_output_, 0, pack_output_size * sizeof(int8_t)); } - output_buffer_ = - reinterpret_cast(malloc(conv_param_->output_h_ * conv_param_->output_w_ * C4NUM * sizeof(int32_t))); + output_buffer_ = reinterpret_cast( + context_->allocator->Malloc(conv_param_->output_h_ * conv_param_->output_w_ * C4NUM * sizeof(int32_t))); if (output_buffer_ == nullptr) { MS_LOG(ERROR) << "Malloc buffer failed."; return RET_ERROR; @@ -165,7 +147,6 @@ int DeconvolutionDepthwiseInt8CPUKernel::Init() { } int DeconvolutionDepthwiseInt8CPUKernel::ReSize() { - FreeTmpBuffer(); InitSlideParam(); ConvolutionBaseCPUKernel::Init(); @@ -183,7 +164,7 @@ int DeconvolutionDepthwiseInt8CPUKernel::Execute(int task_id) { return RET_OK; } -int DeconvDwInt8Run(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +int DeconvDwInt8Run(void *cdata, int task_id) { auto deconv_dw_int8 = reinterpret_cast(cdata); auto ret = deconv_dw_int8->Execute(task_id); if (ret != RET_OK) { @@ -215,7 +196,7 @@ int DeconvolutionDepthwiseInt8CPUKernel::Run() { packed_output_ = output_addr; } - ret = LiteBackendParallelLaunch(DeconvDwInt8Run, this, conv_param_->thread_num_); + ret = ParallelLaunch(THREAD_POOL_DEFAULT, DeconvDwInt8Run, this, conv_param_->thread_num_); if (ret != RET_OK) { MS_LOG(ERROR) << "DeconvDwInt8Run error: error_code[" << ret << "]"; return RET_ERROR; @@ -224,7 +205,10 @@ int DeconvolutionDepthwiseInt8CPUKernel::Run() { if (need_align_) { PackNHWC4ToNHWCInt8(packed_output_, output_addr, conv_param_->output_batch_, conv_param_->output_h_ * 
conv_param_->output_w_, conv_param_->output_channel_); + context_->allocator->Free(packed_output_); } + context_->allocator->Free(packed_input_); + context_->allocator->Free(output_buffer_); return RET_OK; } diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/deconvolution_depthwise_int8.h b/mindspore/lite/src/runtime/kernel/arm/int8/deconvolution_depthwise_int8.h index 4437a412d0..b6ad5245ae 100644 --- a/mindspore/lite/src/runtime/kernel/arm/int8/deconvolution_depthwise_int8.h +++ b/mindspore/lite/src/runtime/kernel/arm/int8/deconvolution_depthwise_int8.h @@ -41,7 +41,6 @@ class DeconvolutionDepthwiseInt8CPUKernel : public ConvolutionBaseCPUKernel { int Execute(int task_id); private: - void FreeTmpBuffer(); SlidingWindowParam *sliding = nullptr; int16_t *packed_weight_ = nullptr; int16_t *packed_input_ = nullptr; diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/deconvolution_int8.cc b/mindspore/lite/src/runtime/kernel/arm/int8/deconvolution_int8.cc index 55e8f60d50..8f4b06d55d 100644 --- a/mindspore/lite/src/runtime/kernel/arm/int8/deconvolution_int8.cc +++ b/mindspore/lite/src/runtime/kernel/arm/int8/deconvolution_int8.cc @@ -216,7 +216,7 @@ void DeConvInt8CPUKernel::FreeRunBuf() { return; } -int DeConvInt8Run(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +int DeConvInt8Run(void *cdata, int task_id) { auto deconv = reinterpret_cast(cdata); auto error_code = deconv->DoDeconv(task_id); if (error_code != RET_OK) { @@ -272,7 +272,7 @@ int DeConvInt8CPUKernel::Run() { DeConvPackInputSum(input_ptr_, input_sum_, conv_param_->conv_quant_arg_.filter_quant_args_[0].zp_, UP_ROUND(matmul_param_->row_, C4NUM), UP_ROUND(matmul_param_->deep_, C16NUM), support_optimize_); - error_code = LiteBackendParallelLaunch(DeConvInt8Run, this, thread_count_); + error_code = ParallelLaunch(THREAD_POOL_DEFAULT, DeConvInt8Run, this, thread_count_); if (error_code != RET_OK) { MS_LOG(ERROR) << "deconv int8 run error! 
error_code[" << error_code << "]"; return RET_ERROR; diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/div_int8.cc b/mindspore/lite/src/runtime/kernel/arm/int8/div_int8.cc index 1c8754b5b4..0249cccdf4 100644 --- a/mindspore/lite/src/runtime/kernel/arm/int8/div_int8.cc +++ b/mindspore/lite/src/runtime/kernel/arm/int8/div_int8.cc @@ -87,7 +87,7 @@ int DivInt8CPUKernel::DoExecute(int task_id) { return ret; } -int DivInt8Run(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +int DivInt8Run(void *cdata, int task_id) { auto div_kernel = reinterpret_cast(cdata); auto ret = div_kernel->DoExecute(task_id); if (ret != RET_OK) { @@ -104,7 +104,7 @@ int DivInt8CPUKernel::Run() { } if (broadcast_) { - ArithmeticParameter tile_para = {0}; + ArithmeticParameter tile_para; tile_para.ndim_ = out_tensors_.at(0)->shape().size(); for (size_t i = 0; i < tile_para.ndim_; i++) { tile_para.in_shape0_[i] = in_tensors_.at(0)->DimensionSize(i); @@ -123,7 +123,7 @@ int DivInt8CPUKernel::Run() { static_cast(in_tensors_.at(1)->Data()), reinterpret_cast(tile0_data_), reinterpret_cast(tile1_data_), &tile_para); } - ret = LiteBackendParallelLaunch(DivInt8Run, this, op_parameter_->thread_num_); + ret = ParallelLaunch(THREAD_POOL_DEFAULT, DivInt8Run, this, op_parameter_->thread_num_); if (broadcast_) { context_->allocator->Free(tile0_data_); context_->allocator->Free(tile1_data_); diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/fullconnection_int8.cc b/mindspore/lite/src/runtime/kernel/arm/int8/fullconnection_int8.cc index 9a092eb994..e0ebe39441 100644 --- a/mindspore/lite/src/runtime/kernel/arm/int8/fullconnection_int8.cc +++ b/mindspore/lite/src/runtime/kernel/arm/int8/fullconnection_int8.cc @@ -39,36 +39,32 @@ int FullconnectionInt8CPUKernel::ReSize() { fc_param_->row_8_ = UP_ROUND(fc_param_->row_, 8); fc_param_->col_8_ = UP_ROUND(fc_param_->col_, 8); - thread_count_ = MSMIN(thread_count_, UP_DIV(fc_param_->col_8_, 8)); - thread_stride_ = UP_DIV(UP_DIV(fc_param_->col_8_, 8), thread_count_); - - a_c8_ptr_ = - reinterpret_cast(ctx_->allocator->Malloc(fc_param_->row_8_ * fc_param_->deep_ * sizeof(int8_t))); - if (!a_c8_ptr_) { - return RET_MEMORY_FAILED; - } - memset(a_c8_ptr_, 0, fc_param_->row_8_ * fc_param_->deep_ * sizeof(int8_t)); - b_r8_ptr_ = - reinterpret_cast(ctx_->allocator->Malloc(fc_param_->col_8_ * fc_param_->deep_ * sizeof(int8_t))); - if (!b_r8_ptr_) { - return RET_MEMORY_FAILED; - } - memset(b_r8_ptr_, 0, fc_param_->col_8_ * fc_param_->deep_ * sizeof(int8_t)); + r4_ = UP_ROUND(fc_param_->row_, 4); + c4_ = UP_ROUND(fc_param_->col_, 4); + d16_ = UP_ROUND(fc_param_->deep_, 16); + thread_count_ = MSMIN(thread_count_, UP_DIV(c4_, 4)); + thread_stride_ = UP_DIV(UP_DIV(c4_, 4), thread_count_); + a_r4x16_ptr_ = reinterpret_cast(ctx_->allocator->Malloc(r4_ * d16_ * sizeof(int8_t))); + if (!a_r4x16_ptr_) return RET_MEMORY_FAILED; + memset(a_r4x16_ptr_, 0, r4_ * d16_ * sizeof(int8_t)); + b_c16x4_ptr_ = reinterpret_cast(ctx_->allocator->Malloc(c4_ * d16_ * sizeof(int8_t))); + if (!b_c16x4_ptr_) return RET_MEMORY_FAILED; + memset(b_c16x4_ptr_, 0, c4_ * d16_ * sizeof(int8_t)); + input_sums_ = reinterpret_cast(ctx_->allocator->Malloc(r4_ * sizeof(int))); + if (!input_sums_) return RET_MEMORY_FAILED; + memset(input_sums_, 0, r4_ * sizeof(int)); + weight_bias_sums_ = reinterpret_cast(ctx_->allocator->Malloc(c4_ * sizeof(int))); + if (!weight_bias_sums_) return RET_MEMORY_FAILED; + memset(weight_bias_sums_, 0, c4_ * sizeof(int)); auto weight_data = reinterpret_cast(in_tensors_[1]->Data()); - 
RowMajor2Col8MajorInt8(weight_data, b_r8_ptr_, fc_param_->col_, fc_param_->deep_); - c_r8x8_ptr_ = reinterpret_cast(ctx_->allocator->Malloc(fc_param_->row_8_ * fc_param_->col_8_ * sizeof(int))); - if (!c_r8x8_ptr_) { - return RET_MEMORY_FAILED; - } - memset(c_r8x8_ptr_, 0, fc_param_->row_8_ * fc_param_->col_8_ * sizeof(int)); - auto bias_len = fc_param_->col_8_ * sizeof(int); - bias_ptr_ = reinterpret_cast(ctx_->allocator->Malloc(bias_len)); - if (!bias_ptr_) { - return RET_MEMORY_FAILED; - } - memset(bias_ptr_, 0, bias_len); + RowMajor2Row4x16Major(weight_data, fc_param_->col_, fc_param_->deep_, b_c16x4_ptr_, d16_); if (in_tensors_.size() == 3) { + auto bias_len = fc_param_->col_8_ * sizeof(int); + bias_ptr_ = reinterpret_cast(ctx_->allocator->Malloc(bias_len)); + if (!bias_ptr_) return RET_MEMORY_FAILED; memcpy(bias_ptr_, in_tensors_[2]->Data(), bias_len); + } else { + bias_ptr_ = NULL; } auto input_tensor = in_tensors_[0]; @@ -93,22 +89,36 @@ int FullconnectionInt8CPUKernel::ReSize() { CalculateActivationRangeQuantized(fc_param_->act_type_ == ActType_Relu, fc_param_->act_type_ == ActType_Relu6, quant_params_.output.zp_, quant_params_.output.scale_, &quant_params_.out_act_min, &quant_params_.out_act_max); + CalcWeightBiasSums(weight_data, fc_param_->deep_, fc_param_->col_, quant_params_.input.zp_, quant_params_.weight.zp_, + bias_ptr_, weight_bias_sums_, ColMajor); return RET_OK; } int FullconnectionInt8CPUKernel::RunImpl(int task_id) { - int cur_oc = MSMIN(thread_stride_, UP_DIV(fc_param_->col_8_, 8) - task_id * thread_stride_); + int cur_oc = MSMIN(thread_stride_, UP_DIV(c4_, 4) - task_id * thread_stride_); if (cur_oc <= 0) { return RET_OK; } - auto &p = quant_params_; - auto cur_b = b_r8_ptr_ + task_id * thread_stride_ * C8NUM * fc_param_->deep_; - auto cur_c = c_r8x8_ptr_ + task_id * thread_stride_ * C8NUM * fc_param_->row_8_; - MatMulInt8(a_c8_ptr_, cur_b, cur_c, fc_param_->row_8_, cur_oc * 8, fc_param_->deep_, p.input.zp_, p.weight.zp_); + int cur_oc_res = MSMIN(thread_stride_ * C4NUM, fc_param_->col_ - task_id * thread_stride_ * C4NUM); + auto &q = quant_params_; + auto &p = fc_param_; + auto cur_b = b_c16x4_ptr_ + task_id * thread_stride_ * C4NUM * d16_; + auto cur_bias = weight_bias_sums_ + task_id * thread_stride_ * C4NUM; + auto output_ptr = reinterpret_cast(out_tensors_[0]->Data()); + auto cur_c = output_ptr + task_id * thread_stride_ * C4NUM; +#ifdef ENABLE_ARM64 + MatmulInt8Neon64(a_r4x16_ptr_, cur_b, cur_c, r4_, cur_oc * C4NUM, d16_, input_sums_, cur_bias, q.out_act_min, + q.out_act_max, q.output.zp_, q.quant_multiplier, q.left_shift, q.right_shift, p->row_, cur_oc_res, + p->col_ * sizeof(int8_t)); +#else + MatmulInt8(a_r4x16_ptr_, cur_b, cur_c, input_sums_, cur_bias, q.out_act_min, q.out_act_max, q.output.zp_, + q.quant_multiplier, q.left_shift, q.right_shift, p->row_, cur_oc_res, d16_, p->col_); +#endif + return RET_OK; } -int FcInt8Run(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +int FcInt8Run(void *cdata, int task_id) { auto fc = reinterpret_cast(cdata); auto ret = fc->RunImpl(task_id); if (ret != RET_OK) { @@ -124,13 +134,10 @@ int FullconnectionInt8CPUKernel::Run() { MS_LOG(ERROR) << "Prepare failed."; return RET_ERROR; } - auto a_ptr = reinterpret_cast(in_tensors_[0]->Data()); - auto output_ptr = reinterpret_cast(out_tensors_[0]->Data()); - auto &p = quant_params_; - RowMajor2Col8MajorInt8(a_ptr, a_c8_ptr_, fc_param_->row_, fc_param_->deep_); - LiteBackendParallelLaunch(FcInt8Run, this, thread_count_); - PostFuncInt8C8(c_r8x8_ptr_, bias_ptr_, output_ptr, 
fc_param_->col_, fc_param_->row_, p.quant_multiplier, p.left_shift,
-                 p.right_shift, p.output.zp_, p.out_act_min, p.out_act_max);
+  auto input_ptr = reinterpret_cast<int8_t *>(in_tensors_[0]->Data());
+  RowMajor2Row4x16Major(input_ptr, fc_param_->row_, fc_param_->deep_, a_r4x16_ptr_, d16_);
+  CalcInputSums(input_ptr, fc_param_->row_, fc_param_->deep_, quant_params_.weight.zp_, input_sums_, RowMajor);
+  ParallelLaunch(THREAD_POOL_DEFAULT, FcInt8Run, this, thread_count_);
   return RET_OK;
 }
diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/fullconnection_int8.h b/mindspore/lite/src/runtime/kernel/arm/int8/fullconnection_int8.h
index 9af8b850fa..9e2aca294c 100644
--- a/mindspore/lite/src/runtime/kernel/arm/int8/fullconnection_int8.h
+++ b/mindspore/lite/src/runtime/kernel/arm/int8/fullconnection_int8.h
@@ -41,28 +41,36 @@ class FullconnectionInt8CPUKernel : public FullconnectionBaseCPUKernel {
 
  private:
  void FreeTmpBuffer() {
-    if (a_c8_ptr_ != nullptr) {
-      ctx_->allocator->Free(a_c8_ptr_);
-      a_c8_ptr_ = nullptr;
+    if (a_r4x16_ptr_ != nullptr) {
+      ctx_->allocator->Free(a_r4x16_ptr_);
+      a_r4x16_ptr_ = nullptr;
     }
-    if (b_r8_ptr_ != nullptr) {
-      ctx_->allocator->Free(b_r8_ptr_);
-      b_r8_ptr_ = nullptr;
+    if (b_c16x4_ptr_ != nullptr) {
+      ctx_->allocator->Free(b_c16x4_ptr_);
+      b_c16x4_ptr_ = nullptr;
     }
-    if (c_r8x8_ptr_ != nullptr) {
-      ctx_->allocator->Free(c_r8x8_ptr_);
-      c_r8x8_ptr_ = nullptr;
+    if (input_sums_ != nullptr) {
+      ctx_->allocator->Free(input_sums_);
+      input_sums_ = nullptr;
+    }
+    if (weight_bias_sums_ != nullptr) {
+      ctx_->allocator->Free(weight_bias_sums_);
+      weight_bias_sums_ = nullptr;
     }
     if (bias_ptr_ != nullptr) {
       ctx_->allocator->Free(bias_ptr_);
       bias_ptr_ = nullptr;
     }
   }
   MatmulQuantArg quant_params_;
-  int8_t *a_c8_ptr_ = nullptr;
-  int8_t *b_r8_ptr_ = nullptr;
-  int *c_r8x8_ptr_ = nullptr;
+  int8_t *a_r4x16_ptr_ = nullptr;
+  int8_t *b_c16x4_ptr_ = nullptr;
+  int *input_sums_ = nullptr;
+  int *weight_bias_sums_ = nullptr;
   int *bias_ptr_ = nullptr;
+  int r4_;
+  int c4_;
+  int d16_;
 };
 } // namespace mindspore::kernel
diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/gatherNd_int8.cc b/mindspore/lite/src/runtime/kernel/arm/int8/gatherNd_int8.cc
new file mode 100644
index 0000000000..f5539b9195
--- /dev/null
+++ b/mindspore/lite/src/runtime/kernel/arm/int8/gatherNd_int8.cc
@@ -0,0 +1,166 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +#include "src/runtime/kernel/arm/int8/gatherNd_int8.h" +#include +#include +#include "schema/model_generated.h" +#include "include/errorcode.h" +#include "src/kernel_registry.h" +#include "src/runtime/runtime_api.h" +#include "nnacl/int8/gatherNd_int8.h" + +using mindspore::kernel::KERNEL_ARCH::kCPU; +using mindspore::lite::KernelRegistrar; +using mindspore::lite::RET_ERROR; +using mindspore::lite::RET_OK; +using mindspore::schema::PrimitiveType_GatherNd; + +namespace mindspore::kernel { + +GatherNdInt8CPUKernel::~GatherNdInt8CPUKernel() { + if (in_offset_ != nullptr) { + free(in_offset_); + in_offset_ = nullptr; + } +} + +int GatherNdInt8CPUKernel::Init() { + if (!InferShapeDone()) { + return RET_OK; + } + return ReSize(); +} + +int GatherNdInt8CPUKernel::ReSize() { + if (in_offset_ != nullptr) { + free(in_offset_); + in_offset_ = nullptr; + } + auto in_quant_args = in_tensors_.at(0)->GetQuantParams(); + auto ind_quant_args = in_tensors_.at(1)->GetQuantParams(); + auto out_quant_args = out_tensors_.at(0)->GetQuantParams(); + param_.alpha_ = in_quant_args.front().scale / out_quant_args.front().scale; + param_.zp_in_ = in_quant_args.front().zeroPoint; + param_.zp_out_ = out_quant_args.front().zeroPoint; + + auto indices_tensor = in_tensors_.at(1); + auto indices_shape = indices_tensor->shape(); + int indices_rank = indices_shape.size(); + count_ = 1; + for (int i = 0; i < indices_rank - 1; ++i) { + count_ *= indices_shape[i]; + } + + in_offset_ = reinterpret_cast(malloc(count_ * sizeof(int))); + if (in_offset_ == nullptr) { + MS_LOG(ERROR) << "GatherNdInt8 Malloc in_offset_ error!"; + return RET_ERROR; + } + (void)memset(in_offset_, 0, count_ * sizeof(int)); + + thread_sz_count_ = MSMIN(thread_count_, count_); + thread_sz_stride_ = UP_DIV(count_, thread_sz_count_); + + auto in_shape = in_tensors_.front()->shape(); + int in_rank = in_shape.size(); + int idx_lastshape = indices_shape[indices_rank - 1]; + auto indices_ptr = reinterpret_cast(indices_tensor->Data()); + area_ = 1; + for (int i = idx_lastshape; i < in_rank; ++i) { + area_ *= in_shape[i]; + } + std::vector in_stride(in_rank); + in_stride[in_rank - 1] = 1; + for (int i = in_rank - 2; i >= 0; --i) { + in_stride[i] = in_shape[i + 1] * in_stride[i + 1]; + } + + int idx_stride = idx_lastshape; + for (int j = 0; j < count_; ++j) { + for (int k = 0; k < idx_lastshape; ++k) { + int tmp = static_cast( + round((indices_ptr[j * idx_stride + k] - ind_quant_args.front().zeroPoint) * ind_quant_args.front().scale)); + in_offset_[j] += tmp * in_stride[k]; + } + } + return RET_OK; +} + +int GatherNdInt8CPUKernel::DoGatherNd(int task_id) { + int count = MSMIN(thread_sz_stride_, count_ - task_id * thread_sz_stride_); + if (count <= 0) { + return RET_OK; + } + int offset = task_id * thread_sz_stride_; + auto ret = GatherNdInt8(in_ptr_, out_ptr_ + offset * area_, in_offset_ + offset, area_, count, param_); + if (ret != RET_OK) { + MS_LOG(ERROR) << "GatherNdRun error task_id[" << task_id << "] error_code[" << ret << "]"; + return ret; + } + return RET_OK; +} + +int GatherNdInt8Run(void *cdata, int task_id) { + auto g_kernel = reinterpret_cast(cdata); + auto ret = g_kernel->DoGatherNd(task_id); + if (ret != RET_OK) { + MS_LOG(ERROR) << "GatherNdRun error task_id[" << task_id << "] error_code[" << ret << "]"; + return ret; + } + return RET_OK; +} + +int GatherNdInt8CPUKernel::Run() { + auto prepare_ret = Prepare(); + if (prepare_ret != RET_OK) { + MS_LOG(ERROR) << "Prepare fail!ret: " << prepare_ret; + return prepare_ret; + } + in_ptr_ = 
reinterpret_cast(in_tensors_.front()->Data()); + out_ptr_ = reinterpret_cast(out_tensors_.front()->Data()); + auto ret = ParallelLaunch(THREAD_POOL_DEFAULT, GatherNdInt8Run, this, thread_sz_count_); + if (ret != RET_OK) { + MS_LOG(ERROR) << "gatherNd error error_code[" << ret << "]"; + return ret; + } + return RET_OK; +} + +kernel::LiteKernel *CpuGatherNdInt8KernelCreator(const std::vector &inputs, + const std::vector &outputs, + OpParameter *opParameter, const lite::Context *ctx, + const kernel::KernelKey &desc, + const mindspore::lite::PrimitiveC *primitive) { + MS_ASSERT(opParameter != nullptr); + MS_ASSERT(desc.type == schema::PrimitiveType_GatherNd); + + auto *kernel = new (std::nothrow) GatherNdInt8CPUKernel(opParameter, inputs, outputs, ctx, primitive); + if (kernel == nullptr) { + return nullptr; + } + auto ret = kernel->Init(); + if (ret != RET_OK) { + MS_LOG(ERROR) << "Init kernel failed, name: " << opParameter->name_ << ", type: " + << schema::EnumNamePrimitiveType(static_cast(opParameter->type_)); + delete kernel; + return nullptr; + } + return kernel; +} + +REG_KERNEL(kCPU, kNumberTypeInt8, PrimitiveType_GatherNd, CpuGatherNdInt8KernelCreator) +} // namespace mindspore::kernel diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/gatherNd_int8.h b/mindspore/lite/src/runtime/kernel/arm/int8/gatherNd_int8.h new file mode 100644 index 0000000000..b4b03886cf --- /dev/null +++ b/mindspore/lite/src/runtime/kernel/arm/int8/gatherNd_int8.h @@ -0,0 +1,51 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_INT8_GATHERND_INT8_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_INT8_GATHERND_INT8_H_ + +#include +#include "nnacl/quantization/quantize.h" +#include "src/lite_kernel.h" + +namespace mindspore::kernel { +class GatherNdInt8CPUKernel : public LiteKernel { + public: + GatherNdInt8CPUKernel(OpParameter *parameter, const std::vector &inputs, + const std::vector &outputs, const lite::Context *ctx, + const mindspore::lite::PrimitiveC *primitive) + : LiteKernel(parameter, inputs, outputs, ctx, primitive), thread_count_(ctx->thread_num_) {} + ~GatherNdInt8CPUKernel() override; + + int Init() override; + int ReSize() override; + int Run() override; + int DoGatherNd(int task_id); + + private: + int thread_count_; + int thread_sz_count_; + int thread_sz_stride_; + int count_; + int area_; + int *in_offset_ = nullptr; + int8_t *in_ptr_; + int8_t *out_ptr_; + GatherQuantArg param_; +}; +} // namespace mindspore::kernel + +#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_INT8_GATHERND_INT8_H_ diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/gather_int8.cc b/mindspore/lite/src/runtime/kernel/arm/int8/gather_int8.cc new file mode 100644 index 0000000000..749123770a --- /dev/null +++ b/mindspore/lite/src/runtime/kernel/arm/int8/gather_int8.cc @@ -0,0 +1,163 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "src/runtime/kernel/arm/int8/gather_int8.h" +#include +#include "nnacl/gather_parameter.h" +#include "nnacl/int8/gather_int8.h" +#include "nnacl/quantization/quantize.h" +#include "schema/model_generated.h" +#include "src/kernel_registry.h" +#include "src/runtime/runtime_api.h" +#include "include/errorcode.h" + +using mindspore::kernel::KERNEL_ARCH::kCPU; +using mindspore::lite::KernelRegistrar; +using mindspore::lite::RET_ERROR; +using mindspore::lite::RET_OK; +using mindspore::schema::PrimitiveType_Gather; + +namespace mindspore::kernel { + +int GatherInt8CPUKernel::Init() { + axis_ = (reinterpret_cast(op_parameter_))->axis_; + batchDims_ = (reinterpret_cast(op_parameter_))->batchDims_; + auto in_quant_args = in_tensors_.at(0)->GetQuantParams(); + auto ind_quant_args = in_tensors_.at(1)->GetQuantParams(); + auto out_quant_args = out_tensors_.at(0)->GetQuantParams(); + param_.alpha_ = in_quant_args.front().scale / out_quant_args.front().scale; + param_.zp_in_ = in_quant_args.front().zeroPoint; + param_.zp_out_ = out_quant_args.front().zeroPoint; + + auto indices_ptr = reinterpret_cast(in_tensors_.at(1)->Data()); + if (indices_ != nullptr) { + free(indices_); + indices_ = nullptr; + } + int count = in_tensors_.at(1)->ElementsNum(); + indices_ = reinterpret_cast(malloc(count * sizeof(int))); + if (indices_ == nullptr) { + MS_LOG(ERROR) << "Gather Malloc indices_ error!"; + return RET_ERROR; + } + (void)memset(indices_, 0, count * sizeof(int)); + for (int i = 0; i < count; ++i) { + indices_[i] = + static_cast(round((indices_ptr[i] - ind_quant_args.front().zeroPoint) * ind_quant_args.front().scale)); + } + + if (!InferShapeDone()) { + return RET_OK; + } + return ReSize(); +} + +int GatherInt8CPUKernel::ReSize() { return RET_OK; } + +int GatherInt8CPUKernel::DoGather(int task_id) { + auto input_tensor = in_tensors_.at(0); + auto indices_tensor = in_tensors_.at(1); + auto out_tensor = out_tensors_.at(0); + + auto input_ptr = reinterpret_cast(input_tensor->Data()); + auto output_ptr = reinterpret_cast(out_tensor->Data()); + + auto in_shape = input_tensor->shape(); + int in_rank = in_shape.size(); + int indices_element_size = indices_tensor->ElementsNum(); + + const int limit = in_shape[axis_]; + for (int i = 0; i < indices_element_size; ++i) { + if (indices_[i] >= limit) { + MS_LOG(ERROR) << " indice data: " << indices_[i] << " is not in [ 0, " << limit - 1 << " ]"; + return RET_ERROR; + } + } + + int outer_size = 1; + for (int i = 0; i < axis_; ++i) { + outer_size *= in_shape[i]; + } + + int inner_size = 1; + for (int i = axis_ + 1; i < in_rank; ++i) { + inner_size *= in_shape[i]; + } + + int stride = UP_DIV(outer_size, thread_count_); + int count = MSMIN(stride, outer_size - stride * task_id); + auto thread_stride = stride * task_id; + + int error_code; + input_ptr += thread_stride * limit; + output_ptr += thread_stride * indices_element_size; + error_code = GatherInt8(input_ptr, output_ptr, count, inner_size, limit, indices_, indices_element_size, param_); + + if (error_code != RET_OK) { + return RET_ERROR; + } + return RET_OK; +} + +int GatherInt8Run(void *cdata, int task_id) { + auto gather_kernel = reinterpret_cast(cdata); + auto error_code = gather_kernel->DoGather(task_id); + if (error_code != RET_OK) { + MS_LOG(ERROR) << "GatherRun error task_id[" << task_id << "] error_code[" << error_code << "]"; + return RET_ERROR; + } + return RET_OK; +} + +int GatherInt8CPUKernel::Run() { + auto prepare_ret = Prepare(); + if (prepare_ret != RET_OK) { + MS_LOG(ERROR) << "Prepare 
fail!ret: " << prepare_ret; + return prepare_ret; + } + int error_code = ParallelLaunch(THREAD_POOL_DEFAULT, GatherInt8Run, this, thread_count_); + if (error_code != RET_OK) { + MS_LOG(ERROR) << "Gather function error error_code[" << error_code << "]"; + return RET_ERROR; + } + return RET_OK; +} + +kernel::LiteKernel *CpuGatherInt8KernelCreator(const std::vector &inputs, + const std::vector &outputs, + OpParameter *opParameter, const lite::Context *ctx, + const kernel::KernelKey &desc, + const mindspore::lite::PrimitiveC *primitive) { + MS_ASSERT(desc.type == schema::PrimitiveType_Gather); + if (opParameter == nullptr) { + MS_LOG(ERROR) << "input parameter is nullptr!"; + return nullptr; + } + auto *kernel = new (std::nothrow) GatherInt8CPUKernel(opParameter, inputs, outputs, ctx, primitive); + if (kernel == nullptr) { + return nullptr; + } + auto ret = kernel->Init(); + if (ret != RET_OK) { + MS_LOG(ERROR) << "Init kernel failed, name: " << opParameter->name_ << ", type: " + << schema::EnumNamePrimitiveType(static_cast(opParameter->type_)); + delete kernel; + return nullptr; + } + return kernel; +} + +REG_KERNEL(kCPU, kNumberTypeInt8, PrimitiveType_Gather, CpuGatherInt8KernelCreator) +} // namespace mindspore::kernel diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/gather_int8.h b/mindspore/lite/src/runtime/kernel/arm/int8/gather_int8.h new file mode 100644 index 0000000000..af00f6c085 --- /dev/null +++ b/mindspore/lite/src/runtime/kernel/arm/int8/gather_int8.h @@ -0,0 +1,51 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_INT8_GATHER_INT8_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_INT8_GATHER_INT8_H_ + +#include +#include "nnacl/gather_parameter.h" +#include "nnacl/quantization/quantize.h" +#include "src/lite_kernel.h" + +namespace mindspore::kernel { +class GatherInt8CPUKernel : public LiteKernel { + public: + GatherInt8CPUKernel(OpParameter *parameter, const std::vector &inputs, + const std::vector &outputs, const lite::Context *ctx, + const mindspore::lite::PrimitiveC *primitive) + : LiteKernel(parameter, inputs, outputs, ctx, primitive), thread_count_(ctx->thread_num_) {} + ~GatherInt8CPUKernel() { + free(indices_); + indices_ = nullptr; + } + + int Init() override; + int ReSize() override; + int Run() override; + int DoGather(int task_id); + + private: + int *indices_ = nullptr; + int thread_count_; + int batchDims_; + int axis_; + GatherQuantArg param_; +}; +} // namespace mindspore::kernel + +#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_INT8_GATHER_INT8_H_ diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/hswish_int8.cc b/mindspore/lite/src/runtime/kernel/arm/int8/hswish_int8.cc index 686e14cf61..8ece51bc81 100644 --- a/mindspore/lite/src/runtime/kernel/arm/int8/hswish_int8.cc +++ b/mindspore/lite/src/runtime/kernel/arm/int8/hswish_int8.cc @@ -78,7 +78,7 @@ int HswishInt8CPUKernel::DoActivation(int task_id) { return RET_OK; } -int HswishInt8Run(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +int HswishInt8Run(void *cdata, int task_id) { auto activation_kernel = reinterpret_cast(cdata); auto error_code = activation_kernel->DoActivation(task_id); if (error_code != RET_OK) { @@ -94,7 +94,7 @@ int HswishInt8CPUKernel::Run() { MS_LOG(ERROR) << "Prepare failed."; return RET_ERROR; } - int error_code = LiteBackendParallelLaunch(HswishInt8Run, this, thread_count_); + int error_code = ParallelLaunch(THREAD_POOL_DEFAULT, HswishInt8Run, this, thread_count_); if (error_code != RET_OK) { MS_LOG(ERROR) << "HswishInt8Run function error error_code[" << error_code << "]"; return RET_ERROR; diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/leaky_relu_int8.cc b/mindspore/lite/src/runtime/kernel/arm/int8/leaky_relu_int8.cc index 9bc770b1cb..af6b6fd94e 100644 --- a/mindspore/lite/src/runtime/kernel/arm/int8/leaky_relu_int8.cc +++ b/mindspore/lite/src/runtime/kernel/arm/int8/leaky_relu_int8.cc @@ -25,9 +25,20 @@ using mindspore::kernel::KERNEL_ARCH::kCPU; using mindspore::lite::KernelRegistrar; using mindspore::lite::RET_ERROR; using mindspore::lite::RET_OK; -using mindspore::schema::PrimitiveType_Prelu; namespace mindspore::kernel { +namespace { +int LeakyReluInt8Run(void *cdata, int task_id) { + if (cdata == nullptr) { + MS_LOG(ERROR) << "input cdata is nullptr!"; + return RET_ERROR; + } + auto relu = reinterpret_cast(cdata); + relu->DoExecute(task_id); + return RET_OK; +} +} // namespace + int LeakyReluInt8CPUKernel::Init() { LeakyReluBaseCPUKernel::Init(); LeakyReluParameter *param = reinterpret_cast(op_parameter_); @@ -82,17 +93,12 @@ int LeakyReluInt8CPUKernel::Run() { MS_LOG(ERROR) << "Prepare fail!ret: " << ret; return ret; } - ret = LiteBackendParallelLaunch(PreluInt8Run, this, op_parameter_->thread_num_); + ret = ParallelLaunch(THREAD_POOL_DEFAULT, LeakyReluInt8Run, this, op_parameter_->thread_num_); if (ret != RET_OK) { MS_LOG(ERROR) << "RunPreluParam failed. 
errorcode: "; } return RET_OK; } -int PreluInt8Run(int task_id, LiteParallelGroupEnv *penv, void *cdata) { - auto prelu = reinterpret_cast(cdata); - prelu->DoExecute(task_id); - return RET_OK; -} int LeakyReluInt8CPUKernel::DoExecute(int task_id) { auto input_tensor = in_tensors_.at(kInputIndex); diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/leaky_relu_int8.h b/mindspore/lite/src/runtime/kernel/arm/int8/leaky_relu_int8.h index 9df765079a..ba0282a096 100644 --- a/mindspore/lite/src/runtime/kernel/arm/int8/leaky_relu_int8.h +++ b/mindspore/lite/src/runtime/kernel/arm/int8/leaky_relu_int8.h @@ -41,7 +41,6 @@ class LeakyReluInt8CPUKernel : public LeakyReluBaseCPUKernel { private: LeakyReluQuantArg quant_prelu_parm_; }; -int PreluInt8Run(int task_id, LiteParallelGroupEnv *penv, void *cdata); } // namespace mindspore::kernel #endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_INT8_PRELU_INT8_H_ diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/matmul_int8.cc b/mindspore/lite/src/runtime/kernel/arm/int8/matmul_int8.cc index 7c7c798ee6..9c3bafe389 100644 --- a/mindspore/lite/src/runtime/kernel/arm/int8/matmul_int8.cc +++ b/mindspore/lite/src/runtime/kernel/arm/int8/matmul_int8.cc @@ -48,46 +48,23 @@ int MatmulInt8CPUKernel::ReSize() { params_->row_8_ = UP_ROUND(params_->row_, 8); params_->col_8_ = UP_ROUND(params_->col_, 8); -#ifdef ENABLE_ARM64 r4_ = UP_ROUND(params_->row_, 4); c4_ = UP_ROUND(params_->col_, 4); d16_ = UP_ROUND(params_->deep_, 16); - a_r4d16_ptr_ = reinterpret_cast(ctx_->allocator->Malloc(r4_ * d16_ * sizeof(int8_t))); - if (!a_r4d16_ptr_) return RET_MEMORY_FAILED; - memset(a_r4d16_ptr_, 0, r4_ * d16_ * sizeof(int8_t)); - b_c4d16_ptr_ = reinterpret_cast(ctx_->allocator->Malloc(c4_ * d16_ * sizeof(int8_t))); - if (!b_c4d16_ptr_) return RET_MEMORY_FAILED; - memset(b_c4d16_ptr_, 0, c4_ * d16_ * sizeof(int8_t)); - c_r4c4_ptr_ = reinterpret_cast(ctx_->allocator->Malloc(r4_ * c4_ * sizeof(int8_t))); - if (!c_r4c4_ptr_) return RET_MEMORY_FAILED; - memset(c_r4c4_ptr_, 0, r4_ * c4_ * sizeof(int8_t)); - a_sums_ = reinterpret_cast(ctx_->allocator->Malloc(r4_ * sizeof(int))); - if (!a_sums_) return RET_MEMORY_FAILED; - memset(a_sums_, 0, r4_ * sizeof(int)); - b_bias_ = reinterpret_cast(ctx_->allocator->Malloc(c4_ * sizeof(int))); - if (!b_bias_) return RET_MEMORY_FAILED; - memset(b_bias_, 0, c4_ * sizeof(int)); + a_r4x16_ptr_ = reinterpret_cast(ctx_->allocator->Malloc(r4_ * d16_ * sizeof(int8_t))); + if (!a_r4x16_ptr_) return RET_MEMORY_FAILED; + memset(a_r4x16_ptr_, 0, r4_ * d16_ * sizeof(int8_t)); + b_c16x4_ptr_ = reinterpret_cast(ctx_->allocator->Malloc(c4_ * d16_ * sizeof(int8_t))); + if (!b_c16x4_ptr_) return RET_MEMORY_FAILED; + memset(b_c16x4_ptr_, 0, c4_ * d16_ * sizeof(int8_t)); + input_sums_ = reinterpret_cast(ctx_->allocator->Malloc(r4_ * sizeof(int))); + if (!input_sums_) return RET_MEMORY_FAILED; + memset(input_sums_, 0, r4_ * sizeof(int)); + weight_bias_sums_ = reinterpret_cast(ctx_->allocator->Malloc(c4_ * sizeof(int))); + if (!weight_bias_sums_) return RET_MEMORY_FAILED; + memset(weight_bias_sums_, 0, c4_ * sizeof(int)); thread_count_ = MSMIN(thread_count_, UP_DIV(c4_, 4)); thread_stride_ = UP_DIV(UP_DIV(c4_, 4), thread_count_); -#else - a_c8_ptr_ = reinterpret_cast(ctx_->allocator->Malloc(params_->row_8_ * params_->deep_ * sizeof(int8_t))); - if (!a_c8_ptr_) { - return RET_MEMORY_FAILED; - } - memset(a_c8_ptr_, 0, params_->row_8_ * params_->deep_ * sizeof(int8_t)); - b_r8_ptr_ = reinterpret_cast(ctx_->allocator->Malloc(params_->col_8_ * params_->deep_ * 
sizeof(int8_t))); - if (!b_r8_ptr_) { - return RET_MEMORY_FAILED; - } - memset(b_r8_ptr_, 0, params_->col_8_ * params_->deep_ * sizeof(int8_t)); - c_r8x8_ptr_ = reinterpret_cast(ctx_->allocator->Malloc(params_->row_8_ * params_->col_8_ * sizeof(int))); - if (!c_r8x8_ptr_) { - return RET_MEMORY_FAILED; - } - memset(c_r8x8_ptr_, 0, params_->row_8_ * params_->col_8_ * sizeof(int)); - thread_count_ = MSMIN(thread_count_, UP_DIV(params_->col_8_, 8)); - thread_stride_ = UP_DIV(UP_DIV(params_->col_8_, 8), thread_count_); -#endif auto input_tensor = in_tensors_[0]; auto params = input_tensor->GetQuantParams(); @@ -112,31 +89,29 @@ int MatmulInt8CPUKernel::ReSize() { } int MatmulInt8CPUKernel::RunImpl(int task_id) { -#ifdef ENABLE_ARM64 int cur_oc = MSMIN(thread_stride_, UP_DIV(c4_, 4) - task_id * thread_stride_); if (cur_oc <= 0) { return RET_OK; } - auto cur_b = b_c4d16_ptr_ + task_id * thread_stride_ * 4 * d16_; - auto cur_c = c_r4c4_ptr_ + task_id * thread_stride_ * 4 * r4_; + int cur_oc_res = MSMIN(thread_stride_ * C4NUM, params_->col_ - task_id * thread_stride_ * C4NUM); + auto cur_b = b_c16x4_ptr_ + task_id * thread_stride_ * 4 * d16_; + auto cur_bias = weight_bias_sums_ + task_id * thread_stride_ * 4; + auto cur_c = c_ptr_ + task_id * thread_stride_ * 4; + auto &p = quant_params_; - MatmulInt8Neon64(a_r4d16_ptr_, cur_b, cur_c, r4_, c4_, d16_, a_sums_, b_bias_, INT_MIN, INT_MAX, p.output.zp_, - p.quant_multiplier, p.left_shift, p.right_shift); +#ifdef ENABLE_ARM64 + MatmulInt8Neon64(a_r4x16_ptr_, cur_b, cur_c, r4_, cur_oc * C4NUM, d16_, input_sums_, cur_bias, INT8_MIN, INT8_MAX, + p.output.zp_, p.quant_multiplier, p.left_shift, p.right_shift, params_->row_, cur_oc_res, + params_->col_ * sizeof(int8_t)); #else - int cur_oc = MSMIN(thread_stride_, UP_DIV(params_->col_8_, 8) - task_id * thread_stride_); - if (cur_oc <= 0) { - return RET_OK; - } - auto cur_b = b_r8_ptr_ + task_id * thread_stride_ * C8NUM * params_->deep_; - auto cur_c = c_r8x8_ptr_ + task_id * thread_stride_ * C8NUM * params_->row_8_; - - MatMulInt8(a_c8_ptr_, cur_b, cur_c, params_->row_8_, cur_oc * 8, params_->deep_, quant_params_.input.zp_, - quant_params_.weight.zp_); + MatmulInt8(a_r4x16_ptr_, cur_b, cur_c, input_sums_, cur_bias, INT8_MIN, INT8_MAX, p.output.zp_, p.quant_multiplier, + p.left_shift, p.right_shift, params_->row_, cur_oc_res, d16_, params_->col_); #endif + return RET_OK; } -int MatmulInt8Run(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +int MatmulInt8Run(void *cdata, int task_id) { auto op = reinterpret_cast(cdata); auto ret = op->RunImpl(task_id); if (ret != RET_OK) { @@ -162,43 +137,30 @@ int MatmulInt8CPUKernel::Run() { for (int i = 0; i < params_->batch; ++i) { auto cur_a_ptr = a_ptr + i * a_stride; auto cur_b_ptr = b_ptr + i * b_stride; - auto cur_c_ptr = c_ptr + i * c_stride; -#ifdef ENABLE_ARM64 if (params_->a_transpose_) { - RowMajor2Col16x4Major(cur_a_ptr, params_->deep_, params_->row_, a_r4d16_ptr_, d16_); + RowMajor2Col16x4Major(cur_a_ptr, params_->deep_, params_->row_, a_r4x16_ptr_, d16_); + CalcInputSums(cur_a_ptr, params_->row_, params_->deep_, quant_params_.weight.zp_, input_sums_, ColMajor); } else { - RowMajor2Row4x16Major(cur_a_ptr, params_->row_, params_->deep_, a_r4d16_ptr_, d16_); + RowMajor2Row4x16Major(cur_a_ptr, params_->row_, params_->deep_, a_r4x16_ptr_, d16_); + CalcInputSums(cur_a_ptr, params_->row_, params_->deep_, quant_params_.weight.zp_, input_sums_, RowMajor); } if (params_->b_transpose_) { - RowMajor2Row4x16Major(cur_b_ptr, params_->col_, params_->deep_, 
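The r4_/c4_/d16_ padding and the per-thread column partitioning in the ReSize hunk above all come down to the UP_DIV/UP_ROUND rounding helpers from nnacl/op_base.h. A minimal self-contained sketch of that arithmetic (values are illustrative; MSMIN is assumed to be the usual minimum macro):

```cpp
#include <algorithm>
#include <cstdio>

// Rounding helpers with the same semantics as the nnacl macros.
constexpr int UP_DIV(int x, int y) { return (x + y - 1) / y; }
constexpr int UP_ROUND(int x, int y) { return UP_DIV(x, y) * y; }

int main() {
  int row = 10, col = 7, deep = 20, thread_num = 4;
  int r4 = UP_ROUND(row, 4);     // 12: rows padded to a multiple of 4
  int c4 = UP_ROUND(col, 4);     // 8:  columns padded to a multiple of 4
  int d16 = UP_ROUND(deep, 16);  // 32: depth padded to a multiple of 16
  // Each task owns thread_stride blocks of 4 output columns.
  int thread_count = std::min(thread_num, UP_DIV(c4, 4));
  int thread_stride = UP_DIV(UP_DIV(c4, 4), thread_count);
  printf("r4=%d c4=%d d16=%d threads=%d stride=%d\n", r4, c4, d16, thread_count, thread_stride);
  return 0;
}
```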
b_c4d16_ptr_, d16_); - } else { - RowMajor2Col16x4Major(cur_b_ptr, params_->deep_, params_->col_, b_c4d16_ptr_, d16_); - } - auto &q = quant_params_; - RowMajor2Asums(cur_a_ptr, params_->row_, params_->deep_, q.weight.zp_, a_sums_); - RowMajor2Bbias(cur_b_ptr, params_->deep_, params_->col_, q.input.zp_, q.weight.zp_, NULL, b_bias_); - LiteBackendParallelLaunch(MatmulInt8Run, this, thread_count_); - Row4x4Major2RowMajor(c_r4c4_ptr_, r4_, cur_c_ptr, params_->row_, params_->col_); -#else - if (params_->a_transpose_) { - RowMajor2Row8MajorInt8(cur_a_ptr, a_c8_ptr_, params_->deep_, params_->row_); + RowMajor2Row4x16Major(cur_b_ptr, params_->col_, params_->deep_, b_c16x4_ptr_, d16_); + CalcWeightBiasSums(cur_b_ptr, params_->deep_, params_->col_, quant_params_.input.zp_, quant_params_.weight.zp_, + NULL, weight_bias_sums_, ColMajor); } else { - RowMajor2Col8MajorInt8(cur_a_ptr, a_c8_ptr_, params_->row_, params_->deep_); + RowMajor2Col16x4Major(cur_b_ptr, params_->deep_, params_->col_, b_c16x4_ptr_, d16_); + CalcWeightBiasSums(cur_b_ptr, params_->deep_, params_->col_, quant_params_.input.zp_, quant_params_.weight.zp_, + NULL, weight_bias_sums_, RowMajor); } - if (params_->b_transpose_) { - RowMajor2Col8MajorInt8(cur_b_ptr, b_r8_ptr_, params_->col_, params_->deep_); - } else { - RowMajor2Row8MajorInt8(cur_b_ptr, b_r8_ptr_, params_->deep_, params_->col_); + c_ptr_ = c_ptr + i * c_stride; + ret = ParallelLaunch(THREAD_POOL_DEFAULT, MatmulInt8Run, this, thread_count_); + if (ret != RET_OK) { + MS_LOG(ERROR) << "MatmulInt8Run error: [" << ret << "]"; + return ret; } - LiteBackendParallelLaunch(MatmulInt8Run, this, thread_count_); - auto &q = quant_params_; - SimplePostFuncInt8(c_r8x8_ptr_, cur_c_ptr, params_->col_, params_->row_, params_->row_8_, q.quant_multiplier, - q.left_shift, q.right_shift, q.output.zp_); -#endif } - return RET_OK; } - } // namespace mindspore::kernel diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/matmul_int8.h b/mindspore/lite/src/runtime/kernel/arm/int8/matmul_int8.h index 7e6d66f8fb..d728d2aecd 100644 --- a/mindspore/lite/src/runtime/kernel/arm/int8/matmul_int8.h +++ b/mindspore/lite/src/runtime/kernel/arm/int8/matmul_int8.h @@ -39,57 +39,32 @@ class MatmulInt8CPUKernel : public MatmulBaseCPUKernel { private: void FreeTmpBuffer() { -#ifdef ENABLE_ARM64 - if (a_r4d16_ptr_ != nullptr) { - ctx_->allocator->Free(a_r4d16_ptr_); - a_r4d16_ptr_ = nullptr; + if (a_r4x16_ptr_ != nullptr) { + ctx_->allocator->Free(a_r4x16_ptr_); + a_r4x16_ptr_ = nullptr; } - if (b_c4d16_ptr_ != nullptr) { - ctx_->allocator->Free(b_c4d16_ptr_); - b_c4d16_ptr_ = nullptr; + if (b_c16x4_ptr_ != nullptr) { + ctx_->allocator->Free(b_c16x4_ptr_); + b_c16x4_ptr_ = nullptr; } - if (c_r4c4_ptr_ != nullptr) { - ctx_->allocator->Free(c_r4c4_ptr_); - c_r4c4_ptr_ = nullptr; + if (input_sums_ != nullptr) { + ctx_->allocator->Free(input_sums_); + input_sums_ = nullptr; } - if (a_sums_ != nullptr) { - ctx_->allocator->Free(a_sums_); - a_sums_ = nullptr; + if (weight_bias_sums_ != nullptr) { + ctx_->allocator->Free(weight_bias_sums_); + weight_bias_sums_ = nullptr; } - if (b_bias_ != nullptr) { - ctx_->allocator->Free(b_bias_); - b_bias_ = nullptr; - } -#else - if (a_c8_ptr_ != nullptr) { - ctx_->allocator->Free(a_c8_ptr_); - a_c8_ptr_ = nullptr; - } - if (b_r8_ptr_ != nullptr) { - ctx_->allocator->Free(b_r8_ptr_); - b_r8_ptr_ = nullptr; - } - if (c_r8x8_ptr_ != nullptr) { - ctx_->allocator->Free(c_r8x8_ptr_); - c_r8x8_ptr_ = nullptr; - } -#endif } MatmulQuantArg quant_params_; -#ifdef ENABLE_ARM64 - int8_t 
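The CalcInputSums/CalcWeightBiasSums calls above hoist the zero-point correction terms of the quantized GEMM out of the inner loop: expanding sum_k (a - a_zp)(b - b_zp) gives sum_k a*b minus a term that depends only on the input row and one that depends only on the weight column (plus a constant). A sketch of the row-sum side, assuming a plain row-major int8 input (the exact packing used by the real CalcInputSums is not reproduced here):

```cpp
#include <cstdint>
#include <vector>

// Per-row input sums scaled by the weight zero point:
//   sum_k (a - a_zp)(b - b_zp)
//     = sum_k a*b - b_zp * sum_k a - a_zp * sum_k b + deep * a_zp * b_zp
// The "b_zp * sum_k a" term depends only on the input row, so it is
// precomputed once per row instead of inside the matmul inner loop.
std::vector<int> RowInputSums(const int8_t *a, int row, int deep, int weight_zp) {
  std::vector<int> sums(row, 0);
  for (int r = 0; r < row; ++r) {
    int s = 0;
    for (int d = 0; d < deep; ++d) s += a[r * deep + d];
    sums[r] = s * weight_zp;
  }
  return sums;
}
```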
*a_r4d16_ptr_ = nullptr; - int8_t *b_c4d16_ptr_ = nullptr; - int8_t *c_r4c4_ptr_ = nullptr; - int *a_sums_ = nullptr; - int *b_bias_ = nullptr; + int8_t *a_r4x16_ptr_ = nullptr; + int8_t *b_c16x4_ptr_ = nullptr; + int8_t *c_ptr_ = nullptr; + int *input_sums_ = nullptr; + int *weight_bias_sums_ = nullptr; int r4_; int c4_; int d16_; -#else - int8_t *a_c8_ptr_ = nullptr; - int8_t *b_r8_ptr_ = nullptr; - int *c_r8x8_ptr_ = nullptr; -#endif }; // namespace mindspore::kernel } // namespace mindspore::kernel diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/mul_int8.cc b/mindspore/lite/src/runtime/kernel/arm/int8/mul_int8.cc index e8c67c4c79..d4cad12b42 100644 --- a/mindspore/lite/src/runtime/kernel/arm/int8/mul_int8.cc +++ b/mindspore/lite/src/runtime/kernel/arm/int8/mul_int8.cc @@ -77,7 +77,7 @@ int MulInt8CPUKernel::Run() { input0_data_ = static_cast(ctx_->allocator->Malloc(out_tensors_.at(0)->Size())); input1_data_ = static_cast(ctx_->allocator->Malloc(out_tensors_.at(0)->Size())); - ArithmeticParameter tile_para = {0}; + ArithmeticParameter tile_para; tile_para.ndim_ = out_tensors_.at(0)->shape().size(); for (size_t i = 0; i < tile_para.ndim_; i++) { tile_para.in_shape0_[i] = in_tensors_.at(0)->DimensionSize(i); @@ -86,17 +86,17 @@ int MulInt8CPUKernel::Run() { } TileDimensionsInt8(static_cast(in_tensors_.at(0)->Data()), static_cast(in_tensors_.at(1)->Data()), input0_data_, input1_data_, &tile_para); - ret = LiteBackendParallelLaunch(MulInt8Run, this, thread_count_); + ret = ParallelLaunch(THREAD_POOL_DEFAULT, MulInt8Run, this, thread_count_); ctx_->allocator->Free(input0_data_); ctx_->allocator->Free(input1_data_); return ret; } - ret = LiteBackendParallelLaunch(MulInt8Run, this, thread_count_); + ret = ParallelLaunch(THREAD_POOL_DEFAULT, MulInt8Run, this, thread_count_); return ret; } -int MulInt8Run(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +int MulInt8Run(void *cdata, int task_id) { auto mul = reinterpret_cast(cdata); mul->DoExecute(task_id); return lite::RET_OK; diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/mul_int8.h b/mindspore/lite/src/runtime/kernel/arm/int8/mul_int8.h index 36d9984cac..9f00e2e8e1 100644 --- a/mindspore/lite/src/runtime/kernel/arm/int8/mul_int8.h +++ b/mindspore/lite/src/runtime/kernel/arm/int8/mul_int8.h @@ -46,7 +46,7 @@ class MulInt8CPUKernel : public LiteKernel { int8_t *output_data_ = nullptr; }; -int MulInt8Run(int task_id, LiteParallelGroupEnv *penv, void *cdata); +int MulInt8Run(void *cdata, int task_id); } // namespace mindspore::kernel #endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_INT8_MUL_INT8_H_ diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/pad_int8.cc b/mindspore/lite/src/runtime/kernel/arm/int8/pad_int8.cc index 0f67fa9d9d..f836cfa22a 100644 --- a/mindspore/lite/src/runtime/kernel/arm/int8/pad_int8.cc +++ b/mindspore/lite/src/runtime/kernel/arm/int8/pad_int8.cc @@ -112,7 +112,7 @@ int PadInt8CPUKernel::RunImpl(int task_id) { return PadConstant4D(in_data_, out_data_, in_dims_, out_dims_, pad_param_->paddings_, task_id, context_->thread_num_); } -int PadInt8Impl(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +int PadInt8Impl(void *cdata, int task_id) { auto resize = reinterpret_cast(cdata); auto error_code = resize->RunImpl(task_id); if (error_code != RET_OK) { @@ -132,7 +132,7 @@ int PadInt8CPUKernel::Run() { out_data_ = reinterpret_cast(out_tensors_[0]->Data()); memset(out_data_, pad_param_->pad_quant_arg_.constant_value_[0], out_tensors_[0]->ElementsNum() * sizeof(int8_t)); - int error_code = 
LiteBackendParallelLaunch(PadInt8Impl, this, context_->thread_num_); + int error_code = ParallelLaunch(THREAD_POOL_DEFAULT, PadInt8Impl, this, context_->thread_num_); if (error_code != RET_OK) { MS_LOG(ERROR) << "Resize run error, error_code[" << error_code << "]"; return RET_ERROR; diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/pooling_int8.cc b/mindspore/lite/src/runtime/kernel/arm/int8/pooling_int8.cc index f9e990bab4..db8ef66042 100644 --- a/mindspore/lite/src/runtime/kernel/arm/int8/pooling_int8.cc +++ b/mindspore/lite/src/runtime/kernel/arm/int8/pooling_int8.cc @@ -61,15 +61,19 @@ int PoolingInt8CPUKernel::ReSize() { int PoolingInt8CPUKernel::RunImpl(int task_id) { auto input_data = reinterpret_cast(in_tensors_.at(kInputIndex)->Data()); auto output_data = reinterpret_cast(out_tensors_.at(kOutputIndex)->Data()); - if (pooling_param_->max_pooling_) { - MaxPoolingInt8(input_data, output_data, pooling_param_, task_id); + if (pooling_param_->pool_mode_ == PoolMode_MaxPool) { + if (pooling_param_->quantize_) { + MaxPoolingWithQuantInt8(input_data, output_data, pooling_param_, task_id); + } else { + MaxPoolingOptInt8(input_data, output_data, pooling_param_, task_id); + } } else { - AvgPoolingInt8(input_data, output_data, pooling_param_, task_id); + AvgPoolingOptInt8(input_data, output_data, pooling_param_, task_id); } return RET_OK; } -int PoolingInt8Impl(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +int PoolingInt8Impl(void *cdata, int task_id) { auto pooling = reinterpret_cast(cdata); auto error_code = pooling->RunImpl(task_id); if (error_code != RET_OK) { @@ -85,7 +89,7 @@ int PoolingInt8CPUKernel::Run() { MS_LOG(ERROR) << "Prepare failed."; return RET_ERROR; } - int error_code = LiteBackendParallelLaunch(PoolingInt8Impl, this, thread_count_); + int error_code = ParallelLaunch(THREAD_POOL_DEFAULT, PoolingInt8Impl, this, thread_count_); if (error_code != RET_OK) { MS_LOG(ERROR) << "poolingInt8 error error_code[" << error_code << "]"; return RET_ERROR; diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/power_int8.cc b/mindspore/lite/src/runtime/kernel/arm/int8/power_int8.cc index 9e7aa10d62..5abe82c9b9 100644 --- a/mindspore/lite/src/runtime/kernel/arm/int8/power_int8.cc +++ b/mindspore/lite/src/runtime/kernel/arm/int8/power_int8.cc @@ -88,7 +88,7 @@ int PowerInt8CPUKernel::DoPower(int task_id) { return ret; } -int PowerInt8Run(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +int PowerInt8Run(void *cdata, int task_id) { auto power_kernel = reinterpret_cast(cdata); auto ret = power_kernel->DoPower(task_id); if (ret != RET_OK) { @@ -103,7 +103,7 @@ int PowerInt8CPUKernel::Run() { MS_LOG(ERROR) << "Prepare failed."; return ret; } - ret = LiteBackendParallelLaunch(PowerInt8Run, this, op_parameter_->thread_num_); + ret = ParallelLaunch(THREAD_POOL_DEFAULT, PowerInt8Run, this, op_parameter_->thread_num_); if (ret != RET_OK) { MS_LOG(ERROR) << "PowerInt8Run error, error_code[" << ret << "]"; } diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/reduce_int8.cc b/mindspore/lite/src/runtime/kernel/arm/int8/reduce_int8.cc index 90a2e5aad8..2e498bbcc8 100644 --- a/mindspore/lite/src/runtime/kernel/arm/int8/reduce_int8.cc +++ b/mindspore/lite/src/runtime/kernel/arm/int8/reduce_int8.cc @@ -235,7 +235,7 @@ int ReduceInt8CPUKernel::ReSize() { return ret; } -int ReduceInt8Impl(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +int ReduceInt8Impl(void *cdata, int task_id) { auto reduce = reinterpret_cast(cdata); auto error_code = reduce->CallReduceUnit(task_id); if 
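Every file in this patch follows the same migration: the old LiteBackendParallelLaunch callback with signature int(int task_id, LiteParallelGroupEnv *, void *) becomes int(void *cdata, int task_id) and is handed to ParallelLaunch together with a thread-pool id. A minimal sketch of the contract; ParallelLaunch, THREAD_POOL_DEFAULT and RET_OK/RET_ERROR are assumed from the runtime headers, so a serial stand-in is used to keep the snippet self-contained:

```cpp
#include <cstdio>

constexpr int RET_OK = 0;

// Stand-in with the same shape as the runtime's ParallelLaunch: invoke the
// callback once per task id and propagate the first failure.
using KernelCallback = int (*)(void *cdata, int task_id);
int ParallelLaunchStub(KernelCallback cb, void *cdata, int task_num) {
  for (int t = 0; t < task_num; ++t) {
    int ret = cb(cdata, t);
    if (ret != RET_OK) return ret;
  }
  return RET_OK;
}

struct DemoKernel {
  int DoActivation(int task_id) {
    printf("task %d\n", task_id);
    return RET_OK;
  }
};

// The per-kernel trampoline pattern used throughout the hunks above.
int DemoRun(void *cdata, int task_id) {
  auto *kernel = static_cast<DemoKernel *>(cdata);
  return kernel->DoActivation(task_id);
}

int main() {
  DemoKernel k;
  return ParallelLaunchStub(DemoRun, &k, 4);
}
```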
(error_code != RET_OK) { @@ -284,7 +284,7 @@ int ReduceInt8CPUKernel::Run() { inner_size_ *= tmp_shape_[k]; } axis_size_ = tmp_shape_[axis]; - auto error_code = LiteBackendParallelLaunch(ReduceInt8Impl, this, context_->thread_num_); + auto error_code = ParallelLaunch(THREAD_POOL_DEFAULT, ReduceInt8Impl, this, context_->thread_num_); if (error_code != RET_OK) { FreeTmpBuffer(); MS_LOG(ERROR) << "Reduce run error, error_code[" << error_code << "]"; @@ -321,7 +321,7 @@ int ReduceInt8CPUKernel::Run() { axis_size_ = tmp_shape_[last_reduce_axis]; last_dst_data_ = reinterpret_cast(out_tensors_.at(0)->Data()); is_last_axis_ = true; - auto error_code = LiteBackendParallelLaunch(ReduceInt8Impl, this, context_->thread_num_); + auto error_code = ParallelLaunch(THREAD_POOL_DEFAULT, ReduceInt8Impl, this, context_->thread_num_); if (error_code != RET_OK) { MS_LOG(ERROR) << "Reduce run error, error_code[" << error_code << "]"; FreeTmpBuffer(); diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/relux_int8.cc b/mindspore/lite/src/runtime/kernel/arm/int8/relux_int8.cc index 8292653610..8ec6f39d58 100644 --- a/mindspore/lite/src/runtime/kernel/arm/int8/relux_int8.cc +++ b/mindspore/lite/src/runtime/kernel/arm/int8/relux_int8.cc @@ -58,7 +58,7 @@ int ReluXInt8CPUKernel::DoActivation(int task_id) { return RET_OK; } -int ReluXInt8Run(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +int ReluXInt8Run(void *cdata, int task_id) { auto activation_kernel = reinterpret_cast(cdata); auto error_code = activation_kernel->DoActivation(task_id); if (error_code != RET_OK) { @@ -74,7 +74,7 @@ int ReluXInt8CPUKernel::Run() { MS_LOG(ERROR) << "Prepare fail!ret: " << ret; return ret; } - int error_code = LiteBackendParallelLaunch(ReluXInt8Run, this, op_parameter_->thread_num_); + int error_code = ParallelLaunch(THREAD_POOL_DEFAULT, ReluXInt8Run, this, op_parameter_->thread_num_); if (error_code != RET_OK) { MS_LOG(ERROR) << "ReluXInt8Run function error error_code[" << error_code << "]"; return RET_ERROR; diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/reshape_int8.cc b/mindspore/lite/src/runtime/kernel/arm/int8/reshape_int8.cc index 02149c3847..a730a61c48 100644 --- a/mindspore/lite/src/runtime/kernel/arm/int8/reshape_int8.cc +++ b/mindspore/lite/src/runtime/kernel/arm/int8/reshape_int8.cc @@ -60,11 +60,11 @@ int ReshapeInt8CPUKernel::Run() { elements_num_ = in_tensors_.at(kInputIndex)->ElementsNum(); count_unit_ = op_parameter_->thread_num_ > 1 ? 
UP_DIV(elements_num_, op_parameter_->thread_num_) : elements_num_; - ret = LiteBackendParallelLaunch(ReshapeInt8Run, this, op_parameter_->thread_num_); + ret = ParallelLaunch(THREAD_POOL_DEFAULT, ReshapeInt8Run, this, op_parameter_->thread_num_); return ret; } -int ReshapeInt8Run(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +int ReshapeInt8Run(void *cdata, int task_id) { auto reshape = reinterpret_cast(cdata); reshape->DoExecute(task_id); return lite::RET_OK; diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/reshape_int8.h b/mindspore/lite/src/runtime/kernel/arm/int8/reshape_int8.h index 13f2450342..61115acdd8 100644 --- a/mindspore/lite/src/runtime/kernel/arm/int8/reshape_int8.h +++ b/mindspore/lite/src/runtime/kernel/arm/int8/reshape_int8.h @@ -46,7 +46,7 @@ class ReshapeInt8CPUKernel : public ReshapeBaseCPUKernel { int8_t *output_data_ = nullptr; }; -int ReshapeInt8Run(int task_id, LiteParallelGroupEnv *penv, void *cdata); +int ReshapeInt8Run(void *cdata, int task_id); } // namespace mindspore::kernel #endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_INT8_RESHAPE_INT8_H_ diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/resize_int8.cc b/mindspore/lite/src/runtime/kernel/arm/int8/resize_int8.cc index fd1581b56c..aab798265a 100644 --- a/mindspore/lite/src/runtime/kernel/arm/int8/resize_int8.cc +++ b/mindspore/lite/src/runtime/kernel/arm/int8/resize_int8.cc @@ -30,20 +30,14 @@ using mindspore::lite::RET_NULL_PTR; using mindspore::lite::RET_OK; namespace mindspore::kernel { -namespace { -constexpr int kInputNum = 1; -constexpr int kOutputNum = 1; -constexpr size_t kRank = 4; -} // namespace - int ResizeInt8CPUKernel::Init() { auto ret = ResizeBaseCPUKernel::Init(); if (ret != RET_OK) { return ret; } - quant_in_ = new(std::nothrow) QuantArg; + quant_in_ = new (std::nothrow) QuantArg; MS_ASSERT(quant_in_); - quant_out_ = new(std::nothrow) QuantArg; + quant_out_ = new (std::nothrow) QuantArg; MS_ASSERT(quant_out_); auto input = in_tensors_.at(0); quant_in_->zp_ = input->GetQuantParams().front().zeroPoint; @@ -52,7 +46,7 @@ int ResizeInt8CPUKernel::Init() { quant_out_->zp_ = output->GetQuantParams().front().zeroPoint; quant_out_->scale_ = output->GetQuantParams().front().scale; - multiplier_ = new(std::nothrow) QuantMulArg; + multiplier_ = new (std::nothrow) QuantMulArg; MS_ASSERT(multiplier_); QuantizeRoundParameter(quant_in_->scale_ / quant_out_->scale_, &multiplier_->multiplier_, &multiplier_->left_shift_, &multiplier_->right_shift_); @@ -62,7 +56,7 @@ int ResizeInt8CPUKernel::Init() { return ReSize(); } -int ResizeInt8Impl(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +int ResizeInt8Impl(void *cdata, int task_id) { auto resize = reinterpret_cast(cdata); auto error_code = resize->RunImpl(task_id); if (error_code != RET_OK) { @@ -91,9 +85,14 @@ int ResizeInt8CPUKernel::RunImpl(int task_id) { int ret = 0; switch (method_) { case static_cast(schema::ResizeMethod_BILINEAR): { - ret = ResizeBilinearInt8(input_data, output_data, input_shape.data(), out_tensors_[0]->shape().data(), - align_corners_, quant_in_, quant_out_, multiplier_, task_id, context_->thread_num_); - + if (quant_in_->zp_ == 0) { + ret = ResizeBilinearInt8(input_data, output_data, input_shape.data(), out_tensors_[0]->shape().data(), + align_corners_, quant_in_, quant_out_, multiplier_, task_id, context_->thread_num_); + } else { + ret = ResizeBilinearInt8WithFloatWeight(input_data, output_data, input_shape.data(), + out_tensors_[0]->shape().data(), align_corners_, quant_in_, quant_out_, + 
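QuantizeRoundParameter in the resize Init above folds the float rescale factor quant_in_->scale_ / quant_out_->scale_ into an integer multiplier plus shifts so the kernel never touches floating point at run time. One common way to derive such a pair; this is a sketch of the idea, not the exact nnacl routine:

```cpp
#include <cmath>
#include <cstdint>
#include <cstdio>

// Decompose a positive float scale into a Q31 fixed-point multiplier and a
// power-of-two shift such that scale ~= multiplier * 2^-31 * 2^shift.
void DecomposeScale(double scale, int32_t *multiplier, int *shift) {
  int exp = 0;
  double q = std::frexp(scale, &exp);  // q in [0.5, 1)
  int64_t m = std::llround(q * (1LL << 31));
  if (m == (1LL << 31)) {  // rounding pushed q up to 1.0
    m /= 2;
    ++exp;
  }
  *multiplier = static_cast<int32_t>(m);
  *shift = exp;
}

int main() {
  int32_t mul = 0;
  int shift = 0;
  DecomposeScale(0.75 / 0.5, &mul, &shift);  // in_scale / out_scale
  printf("multiplier=%d shift=%d\n", mul, shift);
  return 0;
}
```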
multiplier_, task_id, context_->thread_num_); + } break; } case static_cast(schema::ResizeMethod_NEAREST_NEIGHBOR): { @@ -101,25 +100,12 @@ int ResizeInt8CPUKernel::RunImpl(int task_id) { bool same_scale = abs(quant_out_->scale_ - quant_in_->scale_) < 1e-6; if (same_zp && same_scale) { ret = - ResizeNearestNeighborInt8Simple(input_data, - output_data, - input_shape.data(), - out_tensors_[0]->shape().data(), - align_corners_, - task_id, - context_->thread_num_); + ResizeNearestNeighborInt8Simple(input_data, output_data, input_shape.data(), out_tensors_[0]->shape().data(), + align_corners_, task_id, context_->thread_num_); } else { ret = - ResizeNearestNeighborInt8(input_data, - output_data, - input_shape.data(), - out_tensors_[0]->shape().data(), - align_corners_, - multiplier_, - quant_in_, - quant_out_, - task_id, - context_->thread_num_); + ResizeNearestNeighborInt8(input_data, output_data, input_shape.data(), out_tensors_[0]->shape().data(), + align_corners_, multiplier_, quant_in_, quant_out_, task_id, context_->thread_num_); } break; } @@ -138,7 +124,7 @@ int ResizeInt8CPUKernel::Run() { MS_LOG(ERROR) << "Prepare failed."; return RET_ERROR; } - int error_code = LiteBackendParallelLaunch(ResizeInt8Impl, this, context_->thread_num_); + int error_code = ParallelLaunch(THREAD_POOL_DEFAULT, ResizeInt8Impl, this, context_->thread_num_); if (error_code != RET_OK) { MS_LOG(ERROR) << "Resize run error, error_code[" << error_code << "]"; return RET_ERROR; diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/sigmoid_int8.cc b/mindspore/lite/src/runtime/kernel/arm/int8/sigmoid_int8.cc index 750fdcea9f..2add85dd2e 100644 --- a/mindspore/lite/src/runtime/kernel/arm/int8/sigmoid_int8.cc +++ b/mindspore/lite/src/runtime/kernel/arm/int8/sigmoid_int8.cc @@ -79,7 +79,7 @@ int SigmoidInt8CPUKernel::DoActivation(int task_id) { return RET_OK; } -int SigmoidInt8Run(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +int SigmoidInt8Run(void *cdata, int task_id) { auto activation_kernel = reinterpret_cast(cdata); auto error_code = activation_kernel->DoActivation(task_id); if (error_code != RET_OK) { @@ -95,7 +95,7 @@ int SigmoidInt8CPUKernel::Run() { MS_LOG(ERROR) << "Prepare fail!ret: " << ret; return ret; } - int error_code = LiteBackendParallelLaunch(SigmoidInt8Run, this, op_parameter_->thread_num_); + int error_code = ParallelLaunch(THREAD_POOL_DEFAULT, SigmoidInt8Run, this, op_parameter_->thread_num_); if (error_code != RET_OK) { MS_LOG(ERROR) << "SigmoidInt8Run function error error_code[" << error_code << "]"; return RET_ERROR; diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/slice_int8.cc b/mindspore/lite/src/runtime/kernel/arm/int8/slice_int8.cc index 1ba2df8f25..631273ec16 100644 --- a/mindspore/lite/src/runtime/kernel/arm/int8/slice_int8.cc +++ b/mindspore/lite/src/runtime/kernel/arm/int8/slice_int8.cc @@ -68,7 +68,7 @@ int SliceInt8CPUKernel::DoSlice(int task_id) { return ret; } -int SliceInt8Run(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +int SliceInt8Run(void *cdata, int task_id) { auto slice_kernel = reinterpret_cast(cdata); auto ret = slice_kernel->DoSlice(task_id); if (ret != RET_OK) { @@ -90,7 +90,7 @@ int SliceInt8CPUKernel::Run() { if (param_->size_[1] < param_->op_parameter_.thread_num_) { ret = SliceInt8NoParallel(input_data, output_data, param_); } else { - ret = LiteBackendParallelLaunch(SliceInt8Run, this, op_parameter_->thread_num_); + ret = ParallelLaunch(THREAD_POOL_DEFAULT, SliceInt8Run, this, op_parameter_->thread_num_); } if (ret != RET_OK) { diff 
--git a/mindspore/lite/src/runtime/kernel/arm/int8/softmax_int8.cc b/mindspore/lite/src/runtime/kernel/arm/int8/softmax_int8.cc index 20f52429e7..1b16a00336 100644 --- a/mindspore/lite/src/runtime/kernel/arm/int8/softmax_int8.cc +++ b/mindspore/lite/src/runtime/kernel/arm/int8/softmax_int8.cc @@ -94,7 +94,7 @@ int SoftmaxInt8CPUKernel::DoSoftmax(int task_id) { return RET_OK; } -int SoftmaxRun(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +int SoftmaxRun(void *cdata, int task_id) { auto softmax_kernel = reinterpret_cast(cdata); auto error_code = softmax_kernel->DoSoftmax(task_id); if (error_code != RET_OK) { @@ -122,7 +122,7 @@ int SoftmaxInt8CPUKernel::Run() { context_->allocator->Free(sum_data_); return RET_ERROR; } - ret = LiteBackendParallelLaunch(SoftmaxRun, this, thread_count_); + ret = ParallelLaunch(THREAD_POOL_DEFAULT, SoftmaxRun, this, thread_count_); context_->allocator->Free(exp_data_); context_->allocator->Free(sum_data_); if (ret != RET_OK) { diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/split_int8.cc b/mindspore/lite/src/runtime/kernel/arm/int8/split_int8.cc index b69891f45d..676da57151 100644 --- a/mindspore/lite/src/runtime/kernel/arm/int8/split_int8.cc +++ b/mindspore/lite/src/runtime/kernel/arm/int8/split_int8.cc @@ -71,7 +71,7 @@ int SplitInt8CPUKernel::Split(int task_id) { return RET_OK; } -int SplitInt8Run(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +int SplitInt8Run(void *cdata, int task_id) { auto g_kernel = reinterpret_cast(cdata); auto ret = g_kernel->Split(task_id); if (ret != RET_OK) { @@ -94,7 +94,7 @@ int SplitInt8CPUKernel::Run() { output_ptr_.push_back(reinterpret_cast(out_tensors_.at(i)->Data())); } - ret = LiteBackendParallelLaunch(SplitInt8Run, this, thread_n_num_); + ret = ParallelLaunch(THREAD_POOL_DEFAULT, SplitInt8Run, this, thread_n_num_); if (ret != RET_OK) { MS_LOG(ERROR) << "Scale error error_code[" << ret << "]"; return RET_ERROR; diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/squeeze_int8.cc b/mindspore/lite/src/runtime/kernel/arm/int8/squeeze_int8.cc index 311a3e41f4..c1e4196b6e 100644 --- a/mindspore/lite/src/runtime/kernel/arm/int8/squeeze_int8.cc +++ b/mindspore/lite/src/runtime/kernel/arm/int8/squeeze_int8.cc @@ -157,7 +157,7 @@ int SqueezeInt8CPUKernel::Run() { free(*(inputs_array + i)); } - ret = LiteBackendParallelLaunch(SqueezeInt8Run, this, thread_count_); + ret = ParallelLaunch(THREAD_POOL_DEFAULT, SqueezeInt8Run, this, thread_count_); if (ret != RET_OK) { MS_LOG(ERROR) << "RunSqueezeParam failed. 
errorcode: "; } @@ -165,7 +165,7 @@ int SqueezeInt8CPUKernel::Run() { return ret; } -int SqueezeInt8Run(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +int SqueezeInt8Run(void *cdata, int task_id) { auto Squeeze = reinterpret_cast(cdata); Squeeze->DoExecute(task_id); return RET_OK; diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/squeeze_int8.h b/mindspore/lite/src/runtime/kernel/arm/int8/squeeze_int8.h index 128e32425e..6d205ce62b 100644 --- a/mindspore/lite/src/runtime/kernel/arm/int8/squeeze_int8.h +++ b/mindspore/lite/src/runtime/kernel/arm/int8/squeeze_int8.h @@ -44,7 +44,7 @@ class SqueezeInt8CPUKernel : public SqueezeBaseCPUKernel { SqueezeQuantArg *quant_Squeeze_parm_; }; -int SqueezeInt8Run(int task_id, LiteParallelGroupEnv *penv, void *cdata); +int SqueezeInt8Run(void *cdata, int task_id); } // namespace mindspore::kernel diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/sub_int8.cc b/mindspore/lite/src/runtime/kernel/arm/int8/sub_int8.cc index 4a68c8034f..9a4f705072 100644 --- a/mindspore/lite/src/runtime/kernel/arm/int8/sub_int8.cc +++ b/mindspore/lite/src/runtime/kernel/arm/int8/sub_int8.cc @@ -110,7 +110,7 @@ int SubInt8CPUKernel::DoExecute(int task_id) { return RET_OK; } -int SubInt8Run(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +int SubInt8Run(void *cdata, int task_id) { auto sub_kernel = reinterpret_cast(cdata); auto ret = sub_kernel->DoExecute(task_id); if (ret != RET_OK) { @@ -128,7 +128,7 @@ int SubInt8CPUKernel::Run() { } if (broadcast_) { - ArithmeticParameter tile_para = {0}; + ArithmeticParameter tile_para; tile_para.ndim_ = out_tensors_.at(0)->shape().size(); for (size_t i = 0; i < tile_para.ndim_; i++) { tile_para.in_shape0_[i] = in_tensors_.at(0)->DimensionSize(i); @@ -147,7 +147,7 @@ int SubInt8CPUKernel::Run() { static_cast(in_tensors_.at(1)->Data()), reinterpret_cast(tile0_data_), reinterpret_cast(tile1_data_), &tile_para); } - ret = LiteBackendParallelLaunch(SubInt8Run, this, op_parameter_->thread_num_); + ret = ParallelLaunch(THREAD_POOL_DEFAULT, SubInt8Run, this, op_parameter_->thread_num_); if (broadcast_) { context_->allocator->Free(tile0_data_); context_->allocator->Free(tile1_data_); diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/unsqueeze_int8.cc b/mindspore/lite/src/runtime/kernel/arm/int8/unsqueeze_int8.cc index 35c7200b56..98f3d9067a 100644 --- a/mindspore/lite/src/runtime/kernel/arm/int8/unsqueeze_int8.cc +++ b/mindspore/lite/src/runtime/kernel/arm/int8/unsqueeze_int8.cc @@ -70,7 +70,7 @@ int Unsqueezeint8CPUKernel::DoUnsqueeze(int task_id) { return RET_OK; } -int UnsqueezeIn8Run(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +int UnsqueezeIn8Run(void *cdata, int task_id) { auto g_kernel = reinterpret_cast(cdata); auto ret = g_kernel->DoUnsqueeze(task_id); if (ret != RET_OK) { @@ -88,7 +88,7 @@ int Unsqueezeint8CPUKernel::Run() { } in_ptr_ = reinterpret_cast(in_tensors_.at(0)->Data()); out_ptr_ = reinterpret_cast(out_tensors_.at(0)->Data()); - ret = LiteBackendParallelLaunch(UnsqueezeIn8Run, this, thread_sz_count_); + ret = ParallelLaunch(THREAD_POOL_DEFAULT, UnsqueezeIn8Run, this, thread_sz_count_); if (ret != RET_OK) { MS_LOG(ERROR) << "UnsqueezeRun error error_code[" << ret << "]"; return ret; diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/unsqueeze_int8.h b/mindspore/lite/src/runtime/kernel/arm/int8/unsqueeze_int8.h index 9e127f3d82..0e3a580cd7 100644 --- a/mindspore/lite/src/runtime/kernel/arm/int8/unsqueeze_int8.h +++ b/mindspore/lite/src/runtime/kernel/arm/int8/unsqueeze_int8.h @@ 
-30,7 +30,7 @@ class Unsqueezeint8CPUKernel : public LiteKernel { Unsqueezeint8CPUKernel(OpParameter *parameter, const std::vector &inputs, const std::vector &outputs, const Context *ctx, const mindspore::lite::PrimitiveC *primitive) - : LiteKernel(parameter, inputs, outputs, ctx, primitive), ctx_(ctx), thread_count_(ctx->thread_num_) { + : LiteKernel(parameter, inputs, outputs, ctx, primitive), thread_count_(ctx->thread_num_) { Unsq_para_ = reinterpret_cast(op_parameter_); Unsq_para_->thread_count_ = op_parameter_->thread_num_; } @@ -42,14 +42,12 @@ class Unsqueezeint8CPUKernel : public LiteKernel { int DoUnsqueeze(int task_id); private: - UnSqueezeQuantArg *quant_Unsqueeze_parm_; UnSqueezeParameter *Unsq_para_; int thread_sz_count_; int thread_sz_stride_; int data_size_; float *in_ptr_; float *out_ptr_; - const Context *ctx_; int thread_count_; }; } // namespace mindspore::kernel diff --git a/mindspore/lite/src/runtime/kernel/opencl/CMakeLists.txt b/mindspore/lite/src/runtime/kernel/opencl/CMakeLists.txt index 9d56a9333e..e870054f48 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/CMakeLists.txt +++ b/mindspore/lite/src/runtime/kernel/opencl/CMakeLists.txt @@ -1,16 +1,2 @@ -set(OPENCL_KERNEL_SRC - ${CMAKE_CURRENT_SOURCE_DIR}/subgraph_opencl_kernel.cc - ${CMAKE_CURRENT_SOURCE_DIR}/utils.cc - ${CMAKE_CURRENT_SOURCE_DIR}/kernel/arithmetic.cc - ${CMAKE_CURRENT_SOURCE_DIR}/kernel/convolution.cc - ${CMAKE_CURRENT_SOURCE_DIR}/kernel/depthwise_conv2d.cc - ${CMAKE_CURRENT_SOURCE_DIR}/kernel/pooling2d.cc - ${CMAKE_CURRENT_SOURCE_DIR}/kernel/matmul.cc - ${CMAKE_CURRENT_SOURCE_DIR}/kernel/softmax.cc - ${CMAKE_CURRENT_SOURCE_DIR}/kernel/concat.cc - ${CMAKE_CURRENT_SOURCE_DIR}/kernel/conv2d_transpose.cc - ${CMAKE_CURRENT_SOURCE_DIR}/kernel/transpose.cc - ${CMAKE_CURRENT_SOURCE_DIR}/kernel/reshape.cc - ${CMAKE_CURRENT_SOURCE_DIR}/kernel/activation.cc - ${CMAKE_CURRENT_SOURCE_DIR}/kernel/caffe_prelu.cc - ) +file(GLOB_RECURSE OPENCL_KERNEL_SRC ${CMAKE_CURRENT_SOURCE_DIR}/kernel/*.cc) +add_library(opencl_kernel_lib_ OBJECT ${KERNEL_SRC} ${OPENCL_KERNEL_SRC}) diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/batchnorm.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/batchnorm.cl index 554674da6a..c3cc89ad1f 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/cl/batchnorm.cl +++ b/mindspore/lite/src/runtime/kernel/opencl/cl/batchnorm.cl @@ -1,3 +1,4 @@ +#pragma OPENCL EXTENSION cl_khr_fp16 : enable #define INT4 int4 #define INT2 int2 __constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST; @@ -11,16 +12,16 @@ __kernel void batch_normalization(__read_only image2d_t input, __read_only image if (X >= input_shape.y || Y >= input_shape.z || Z >= input_shape.w) { return; } - FLT4 result = read_imagef(input, smp_none, (int2)((Y)*input_shape.w + Z, (X))); + FLT4 result = READ_IMAGE(input, smp_none, (int2)((Y)*input_shape.w + Z, (X))); - FLT4 result_mean = read_imagef(mean, smp_none, (int2)((Z), (0))); - FLT4 result_var = read_imagef(variance, smp_none, (int2)((Z), (0))); - FLT4 result_scale = read_imagef(scale, smp_none, (int2)((Z), (0))); - FLT4 result_offset = read_imagef(offset, smp_none, (int2)((Z), (0))); + FLT4 result_mean = READ_IMAGE(mean, smp_none, (int2)((Z), (0))); + FLT4 result_var = READ_IMAGE(variance, smp_none, (int2)((Z), (0))); + FLT4 result_scale = READ_IMAGE(scale, smp_none, (int2)((Z), (0))); + FLT4 result_offset = READ_IMAGE(offset, smp_none, (int2)((Z), (0))); result.x = result_scale.x * ((result.x - result_mean.x) / sqrt(result_var.x + 
epsilon)) + result_offset.x; result.y = result_scale.y * ((result.y - result_mean.y) / sqrt(result_var.y + epsilon)) + result_offset.y; result.z = result_scale.z * ((result.z - result_mean.z) / sqrt(result_var.z + epsilon)) + result_offset.z; result.w = result_scale.w * ((result.w - result_mean.w) / sqrt(result_var.w + epsilon)) + result_offset.w; - write_imagef(output, (int2)((Y)*input_shape.w + Z, (X)), result); + WRITE_IMAGE(output, (int2)((Y)*input_shape.w + Z, (X)), result); } diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/biasadd.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/biasadd.cl new file mode 100644 index 0000000000..e2e1f5b0bb --- /dev/null +++ b/mindspore/lite/src/runtime/kernel/opencl/cl/biasadd.cl @@ -0,0 +1,31 @@ +#pragma OPENCL EXTENSION cl_arm_printf : enable + +#define SLICES 4 +#define UP_DIV(x, y) (((x) + (y) - (1)) / (y)) +#define FLT4 float4 +#define READ_FLT4 read_imagef +#define WRITE_FLT4 write_imagef +__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + +__kernel void BiasAdd(__read_only image2d_t input, __write_only image2d_t output, const int4 input_shape, + __global float *alpha, const int dim) { + int C = input_shape.w; // channel size + + int Y = get_global_id(0); // height id + int X = get_global_id(1); // weight id + for (int num = 0; num < UP_DIV(C, SLICES); ++num) { + FLT4 in_c4 = READ_FLT4(input, smp_zero, (int2)(X * UP_DIV(C, SLICES) + num, Y)); // NHWC4: H WC + FLT4 tmp; + int index = 0; + if (dim == 2) { + index = X * 4; + } else { + index = num * 4; + } + tmp.x = in_c4.x + alpha[index]; + tmp.y = in_c4.y + alpha[index + 1]; + tmp.z = in_c4.z + alpha[index + 2]; + tmp.w = in_c4.w + alpha[index + 3]; + WRITE_FLT4(output, (int2)(X * UP_DIV(C, SLICES) + num, Y), tmp); // NHWC4: H WC + } +} diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/caffe_prelu.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/caffe_prelu.cl deleted file mode 100644 index bfaec1dfe1..0000000000 --- a/mindspore/lite/src/runtime/kernel/opencl/cl/caffe_prelu.cl +++ /dev/null @@ -1,23 +0,0 @@ -#pragma OPENCL EXTENSION cl_arm_printf : enable - -#define SLICES 4 -#define UP_DIV(x, y) (((x) + (y) - (1)) / (y)) -__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; - -__kernel void CaffePRelu(__read_only image2d_t input, __write_only image2d_t output, const int4 input_shape, - __global float *alpha) { - int C = input_shape.w; // channel size - - int Y = get_global_id(0); // height id - int X = get_global_id(1); // weight id - for (int num = 0; num < UP_DIV(C, SLICES); ++num) { - FLT4 in_c4 = READ_IMAGE(input, smp_zero, (int2)(X * UP_DIV(C, SLICES) + num, Y)); // NHWC4: H WC - FLT4 tmp; - int index = num * 4; - tmp.x = in_c4.x * alpha[index]; - tmp.y = in_c4.y * alpha[index + 1]; - tmp.z = in_c4.z * alpha[index + 2]; - tmp.w = in_c4.w * alpha[index + 3]; - WRITE_IMAGE(output, (int2)(X * UP_DIV(C, SLICES) + num, Y), tmp); // NHWC4: H WC - } -} diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/concat.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/concat.cl index c16daf9c81..c2ae7c9106 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/cl/concat.cl +++ b/mindspore/lite/src/runtime/kernel/opencl/cl/concat.cl @@ -1,4 +1,4 @@ -// #pragma OPENCL EXTENSION cl_khr_fp16 : enable +#pragma OPENCL EXTENSION cl_khr_fp16 : enable __constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST; __kernel void Concat(__read_only image2d_t 
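The read_imagef to READ_IMAGE (and float4 to FLT4) rewrites in these .cl files make precision a build-time choice: with cl_khr_fp16 enabled, the host can compile the very same kernel source against half-precision types, while biasadd.cl above still pins itself to float via its own defines. A plausible host-side sketch; the exact option strings used by the OpenCL runtime wrapper are an assumption:

```cpp
#include <string>

// Pick the macro set for a generic kernel source: float4/read_imagef for
// fp32 devices, half4/read_imageh where cl_khr_fp16 is available. The
// string is passed as the options argument of clBuildProgram.
std::string BuildOptions(bool enable_fp16) {
  if (enable_fp16) {
    return "-DFLT=half -DFLT4=half4 -DFLT16=half16 "
           "-DREAD_IMAGE=read_imageh -DWRITE_IMAGE=write_imageh "
           "-DTO_FLT4=convert_half4";
  }
  return "-DFLT=float -DFLT4=float4 -DFLT16=float16 "
         "-DREAD_IMAGE=read_imagef -DWRITE_IMAGE=write_imagef "
         "-DTO_FLT4=convert_float4";
}
```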
input0, __read_only image2d_t input1, __write_only image2d_t output, @@ -10,11 +10,11 @@ __kernel void Concat(__read_only image2d_t input0, __read_only image2d_t input1, return; } if (Z < input_channels.x) { - FLT4 result = read_imagef(input0, smp_none, (int2)((Y)*input_channels.x + Z, (X))); - write_imagef(output, (int2)((Y)*output_shape.w + Z, (X)), result); + FLT4 result = READ_IMAGE(input0, smp_none, (int2)((Y)*input_channels.x + Z, (X))); + WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result); } else { - FLT4 result = read_imagef(input1, smp_none, (int2)((Y)*input_channels.y + Z - input_channels.x, (X))); - write_imagef(output, (int2)((Y)*output_shape.w + Z, (X)), result); + FLT4 result = READ_IMAGE(input1, smp_none, (int2)((Y)*input_channels.y + Z - input_channels.x, (X))); + WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result); } } @@ -27,14 +27,14 @@ __kernel void Concat3input(__read_only image2d_t input0, __read_only image2d_t i return; } if (Z < input_channels.x) { - FLT4 result0 = read_imagef(input0, smp_none, (int2)((Y)*input_channels.x + Z, (X))); - write_imagef(output, (int2)((Y)*output_shape.w + Z, (X)), result0); + FLT4 result0 = READ_IMAGE(input0, smp_none, (int2)((Y)*input_channels.x + Z, (X))); + WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result0); } else if (Z < (input_channels.x + input_channels.y)) { - FLT4 result1 = read_imagef(input1, smp_none, (int2)((Y)*input_channels.y + Z - input_channels.x, (X))); - write_imagef(output, (int2)((Y)*output_shape.w + Z, (X)), result1); + FLT4 result1 = READ_IMAGE(input1, smp_none, (int2)((Y)*input_channels.y + Z - input_channels.x, (X))); + WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result1); } else { FLT4 result2 = - read_imagef(input2, smp_none, (int2)((Y)*input_channels.z + Z - input_channels.x - input_channels.y, (X))); - write_imagef(output, (int2)((Y)*output_shape.w + Z, (X)), result2); + READ_IMAGE(input2, smp_none, (int2)((Y)*input_channels.z + Z - input_channels.x - input_channels.y, (X))); + WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result2); } } diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/conv2d_transpose2x2.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/conv2d_transpose2x2.cl index 2014bb589d..eea2139ee2 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/cl/conv2d_transpose2x2.cl +++ b/mindspore/lite/src/runtime/kernel/opencl/cl/conv2d_transpose2x2.cl @@ -1,3 +1,4 @@ +#pragma OPENCL EXTENSION cl_khr_fp16 : enable __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; __kernel void conv2d_transpose2x2(__read_only image2d_t src_data, __global FLT16 *weight, __read_only image2d_t biases, __write_only image2d_t dst_data, int2 kernel_size, int2 stride, int2 padding, diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/depthwise_conv2d.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/depthwise_conv2d.cl index a6e1c322e1..0c1b83d444 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/cl/depthwise_conv2d.cl +++ b/mindspore/lite/src/runtime/kernel/opencl/cl/depthwise_conv2d.cl @@ -1,189 +1,190 @@ -__constant sampler_t sampler_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; +#pragma OPENCL EXTENSION cl_khr_fp16 : enable +__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; __kernel void DepthwiseConv2d_IMG_NC4HW4(__read_only image2d_t src_data, __global FLT4 *filter, __global FLT4 *bias, - float relu_clip1, 
__write_only image2d_t dst_data, int2 kernel_size, - int2 stride, int2 padding, int2 dilation, int4 src_size, int4 dst_size) { + __write_only image2d_t dst_data, int2 kernel_size, + int2 stride, int2 padding, int2 dilation, int4 src_size, int4 dst_size, + float relu_clip_min, float relu_clip_max) { int X = get_global_id(0); int Y = get_global_id(1); int Z = get_global_id(2); if (X >= dst_size.x || Y >= dst_size.y || Z >= dst_size.z) return; FLT4 r = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); - int x_offseted = X * stride.x + padding.x; - int y_offseted = Y * stride.y + padding.y; + int x_offset = X * stride.x + padding.x; + int y_offset = Y * stride.y + padding.y; int fx_c = Z * kernel_size.x * kernel_size.y; for (int ky = 0; ky < kernel_size.y; ++ky) { - int y_c = y_offseted + ky * dilation.y; + int y_c = y_offset + ky * dilation.y; bool outside_y = y_c < 0 || y_c >= src_size.y; for (int kx = 0; kx < kernel_size.x; ++kx) { - int x_c = x_offseted + kx * dilation.x; + int x_c = x_offset + kx * dilation.x; bool outside_x = x_c < 0 || x_c >= src_size.x; if (!outside_x && !outside_y) { - FLT4 f = filter[fx_c]; - // FLT4 src_final =src_data[(((Z) * src_size.y + (y_c)) * src_size.x + (x_c))]; - FLT4 src_final = read_imagef(src_data, sampler_zero, (int2)(x_c, (Z * src_size.y + y_c))); - r += TO_FLT4(src_final * f); + FLT4 flt_p = filter[fx_c]; + FLT4 src_p = READ_IMAGE(src_data, smp_zero, (int2)(x_c, (Z * src_size.y + y_c))); + r += TO_FLT4(src_p * flt_p); } fx_c++; } } - FLT4 bias_val = bias[Z]; - FLT4 res0 = TO_FLT4(r) + bias_val; - res0 = clamp(res0, (FLT)(0.0f), (FLT)(relu_clip1)); - // dst_data[(((Z) * dst_size.y + (Y)) * dst_size.x + (X))] = res0; - write_imagef(dst_data, (int2)(X, (Z * dst_size.y + Y)), res0); + FLT4 bias_p = bias[Z]; + FLT4 res = TO_FLT4(r) + bias_p; + res = clamp(res, (FLT)(relu_clip_min), (FLT)(relu_clip_max)); + WRITE_IMAGE(dst_data, (int2)(X, (Z * dst_size.y + Y)), res); } __kernel void DepthwiseConv2d_IMG_NHWC4(__read_only image2d_t src_data, __global FLT4 *filter, __global FLT4 *bias, - float relu_clip1, __write_only image2d_t dst_data, int2 kernel_size, - int2 stride, int2 padding, int2 dilation, int4 src_size, int4 dst_size) { + __write_only image2d_t dst_data, int2 kernel_size, + int2 stride, int2 padding, int2 dilation, int4 src_size, int4 dst_size, + float relu_clip_min, float relu_clip_max) { int X = get_global_id(0); int Y = get_global_id(1); int Z = get_global_id(2); if (X >= dst_size.x || Y >= dst_size.y || Z >= dst_size.z) return; FLT4 r = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); - int x_offseted = X * stride.x + padding.x; - int y_offseted = Y * stride.y + padding.y; + int x_offset = X * stride.x + padding.x; + int y_offset = Y * stride.y + padding.y; int fx_c = Z * kernel_size.x * kernel_size.y; for (int ky = 0; ky < kernel_size.y; ++ky) { - int y_c = y_offseted + ky * dilation.y; + int y_c = y_offset + ky * dilation.y; bool outside_y = y_c < 0 || y_c >= src_size.y; for (int kx = 0; kx < kernel_size.x; ++kx) { - int x_c = x_offseted + kx * dilation.x; + int x_c = x_offset + kx * dilation.x; bool outside_x = x_c < 0 || x_c >= src_size.x; if (!outside_x && !outside_y) { - FLT4 f = filter[fx_c]; - // FLT4 src_final =src_data[((y_c * src_size.x + x_c) * src_size.z + Z)]; - FLT4 src_final = read_imagef(src_data, sampler_zero, (int2)(Z + x_c * src_size.z, y_c)); - r += TO_FLT4(src_final * f); + FLT4 flt_p = filter[fx_c]; + FLT4 src_p = READ_IMAGE(src_data, smp_zero, (int2)(Z + x_c * src_size.z, y_c)); + r += TO_FLT4(src_p * flt_p); } fx_c++; } } - FLT4 bias_val = bias[Z]; - FLT4 
res0 = TO_FLT4(r) + bias_val; - res0 = clamp(res0, (FLT)(0.0f), (FLT)(relu_clip1)); - // dst_data[((Y * dst_size.x + X) * dst_size.z + Z)] = res0; - write_imagef(dst_data, (int2)(X * dst_size.z + Z, Y), res0); + FLT4 bias_p = bias[Z]; + FLT4 res = TO_FLT4(r) + bias_p; + res = clamp(res, (FLT)(relu_clip_min), (FLT)(relu_clip_max)); + WRITE_IMAGE(dst_data, (int2)(X * dst_size.z + Z, Y), res); } __kernel void DepthwiseConv2d_IMG_NHWC4_1x1(__read_only image2d_t src_data, __global FLT4 *filter, __global FLT4 *bias, - float relu_clip1, __write_only image2d_t dst_data, int2 kernel_size, - int2 stride, int2 padding, int2 dilation, int4 src_size, int4 dst_size) { + __write_only image2d_t dst_data, int2 kernel_size, int2 stride, + int2 padding, int2 dilation, int4 src_size, int4 dst_size, + float relu_clip_min, float relu_clip_max) { int X = get_global_id(0); int Y = get_global_id(1); int Z = get_global_id(2); if (X >= dst_size.x || Y >= dst_size.y || Z >= dst_size.z) return; FLT4 r = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); - int x_offseted = X * stride.x + padding.x; - int y_offseted = Y * stride.y + padding.y; + int x_offset = X * stride.x + padding.x; + int y_offset = Y * stride.y + padding.y; int fx_c = Z; { - int y_c = y_offseted; + int y_c = y_offset; bool outside_y = y_c < 0 || y_c >= src_size.y; { - int x_c = x_offseted; + int x_c = x_offset; bool outside_x = x_c < 0 || x_c >= src_size.x; if (!outside_x && !outside_y) { - FLT4 f = filter[fx_c]; - // FLT4 src_final =src_data[((y_c * src_size.x + x_c) * src_size.z + Z)]; - FLT4 src_final = read_imagef(src_data, sampler_zero, (int2)(Z, (y_c * src_size.x + x_c) * src_size.z)); - r += TO_FLT4(src_final * f); + FLT4 flt_p = filter[fx_c]; + FLT4 src_p = READ_IMAGE(src_data, smp_zero, (int2)(Z, (y_c * src_size.x + x_c) * src_size.z)); + r += TO_FLT4(src_p * flt_p); } } } - FLT4 bias_val = bias[Z]; - FLT4 res0 = TO_FLT4(r) + bias_val; - res0 = clamp(res0, (FLT)(0.0f), (FLT)(relu_clip1)); - // dst_data[((Y * dst_size.x + X) * dst_size.z + Z)] = res0; - write_imagef(dst_data, (int2)(Z, (Y * dst_size.x + X) * dst_size.z), res0); + FLT4 bias_p = bias[Z]; + FLT4 res = TO_FLT4(r) + bias_p; + res = clamp(res, (FLT)(relu_clip_min), (FLT)(relu_clip_max)); + WRITE_IMAGE(dst_data, (int2)(Z, (Y * dst_size.x + X) * dst_size.z), res); } __kernel void DepthwiseConv2d_BUF_NC4HW4(__global FLT4 *src_data, __global FLT4 *filter, __global FLT4 *bias, - float relu_clip1, __global FLT4 *dst_data, int2 kernel_size, int2 stride, - int2 padding, int2 dilation, int4 src_size, int4 dst_size) { + __global FLT4 *dst_data, int2 kernel_size, int2 stride, + int2 padding, int2 dilation, int4 src_size, int4 dst_size, + float relu_clip_min, float relu_clip_max) { int X = get_global_id(0); int Y = get_global_id(1); int Z = get_global_id(2); if (X >= dst_size.x || Y >= dst_size.y || Z >= dst_size.z) return; FLT4 r = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); - int x_offseted = X * stride.x + padding.x; - int y_offseted = Y * stride.y + padding.y; + int x_offset = X * stride.x + padding.x; + int y_offset = Y * stride.y + padding.y; int fx_c = Z * kernel_size.x * kernel_size.y; for (int ky = 0; ky < kernel_size.y; ++ky) { - int y_c = y_offseted + ky * dilation.y; + int y_c = y_offset + ky * dilation.y; bool outside_y = y_c < 0 || y_c >= src_size.y; for (int kx = 0; kx < kernel_size.x; ++kx) { - int x_c = x_offseted + kx * dilation.x; + int x_c = x_offset + kx * dilation.x; bool outside_x = x_c < 0 || x_c >= src_size.x; if (!outside_x && !outside_y) { - FLT4 f = filter[fx_c]; - FLT4 src_final = 
src_data[(((Z)*src_size.y + (y_c)) * src_size.x + (x_c))]; - r += TO_FLT4(src_final * f); + FLT4 flt_p = filter[fx_c]; + FLT4 src_p = src_data[(((Z)*src_size.y + (y_c)) * src_size.x + (x_c))]; + r += TO_FLT4(src_p * flt_p); } fx_c++; } } - FLT4 bias_val = bias[Z]; - FLT4 res0 = TO_FLT4(r) + bias_val; - res0 = clamp(res0, (FLT)(0.0f), (FLT)(relu_clip1)); - dst_data[(((Z)*dst_size.y + (Y)) * dst_size.x + (X))] = res0; + FLT4 bias_p = bias[Z]; + FLT4 res = TO_FLT4(r) + bias_p; + res = clamp(res, (FLT)(relu_clip_min), (FLT)(relu_clip_max)); + dst_data[(((Z)*dst_size.y + (Y)) * dst_size.x + (X))] = res; } __kernel void DepthwiseConv2d_BUF_NHWC4(__global FLT4 *src_data, __global FLT4 *filter, __global FLT4 *bias, - float relu_clip1, __global FLT4 *dst_data, int2 kernel_size, int2 stride, - int2 padding, int2 dilation, int4 src_size, int4 dst_size) { + __global FLT4 *dst_data, int2 kernel_size, int2 stride, + int2 padding, int2 dilation, int4 src_size, int4 dst_size, + float relu_clip_min, float relu_clip_max) { int X = get_global_id(0); int Y = get_global_id(1); int Z = get_global_id(2); if (X >= dst_size.x || Y >= dst_size.y || Z >= dst_size.z) return; FLT4 r = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); - int x_offseted = X * stride.x + padding.x; - int y_offseted = Y * stride.y + padding.y; + int x_offset = X * stride.x + padding.x; + int y_offset = Y * stride.y + padding.y; int fx_c = Z * kernel_size.x * kernel_size.y; for (int ky = 0; ky < kernel_size.y; ++ky) { - int y_c = y_offseted + ky * dilation.y; + int y_c = y_offset + ky * dilation.y; bool outside_y = y_c < 0 || y_c >= src_size.y; for (int kx = 0; kx < kernel_size.x; ++kx) { - int x_c = x_offseted + kx * dilation.x; + int x_c = x_offset + kx * dilation.x; bool outside_x = x_c < 0 || x_c >= src_size.x; if (!outside_x && !outside_y) { - FLT4 f = filter[fx_c]; - FLT4 src_final = src_data[((y_c * src_size.x + x_c) * src_size.z + Z)]; - r += TO_FLT4(src_final * f); + FLT4 flt_p = filter[fx_c]; + FLT4 src_p = src_data[((y_c * src_size.x + x_c) * src_size.z + Z)]; + r += TO_FLT4(src_p * flt_p); } fx_c++; } } - FLT4 bias_val = bias[Z]; - FLT4 res0 = TO_FLT4(r) + bias_val; - res0 = clamp(res0, (FLT)(0.0f), (FLT)(relu_clip1)); - dst_data[((Y * dst_size.x + X) * dst_size.z + Z)] = res0; + FLT4 bias_p = bias[Z]; + FLT4 res = TO_FLT4(r) + bias_p; + res = clamp(res, (FLT)(relu_clip_min), (FLT)(relu_clip_max)); + dst_data[((Y * dst_size.x + X) * dst_size.z + Z)] = res; } __kernel void DepthwiseConv2d_BUF_NHWC4_1x1(__global FLT4 *src_data, __global FLT4 *filter, __global FLT4 *bias, - float relu_clip1, __global FLT4 *dst_data, int2 kernel_size, int2 stride, - int2 padding, int2 dilation, int4 src_size, int4 dst_size) { + __global FLT4 *dst_data, int2 kernel_size, int2 stride, + int2 padding, int2 dilation, int4 src_size, int4 dst_size, + float relu_clip_min, float relu_clip_max) { int X = get_global_id(0); int Y = get_global_id(1); int Z = get_global_id(2); if (X >= dst_size.x || Y >= dst_size.y || Z >= dst_size.z) return; FLT4 r = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); - int x_offseted = X * stride.x + padding.x; - int y_offseted = Y * stride.y + padding.y; + int x_offset = X * stride.x + padding.x; + int y_offset = Y * stride.y + padding.y; int fx_c = Z; { - int y_c = y_offseted; + int y_c = y_offset; bool outside_y = y_c < 0 || y_c >= src_size.y; { - int x_c = x_offseted; + int x_c = x_offset; bool outside_x = x_c < 0 || x_c >= src_size.x; if (!outside_x && !outside_y) { - FLT4 f = filter[fx_c]; - FLT4 src_final = src_data[((y_c * src_size.x + x_c) * src_size.z + 
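Replacing the old fixed (0, relu_clip1) pair with relu_clip_min/relu_clip_max lets the one clamp in these depthwise kernels cover no activation, ReLU and ReLU6 alike; the kernel body never branches on the activation type. The host-side mapping is the interesting part. A sketch, with the ActType names assumed to match the schema:

```cpp
#include <cfloat>

enum ActType { ActType_No, ActType_Relu, ActType_Relu6 };

// Derive the clamp window handed to the kernels above: the device code
// only ever executes clamp(res, relu_clip_min, relu_clip_max).
void ActivationRange(ActType act, float *min, float *max) {
  switch (act) {
    case ActType_Relu:
      *min = 0.0f;
      *max = FLT_MAX;
      break;
    case ActType_Relu6:
      *min = 0.0f;
      *max = 6.0f;
      break;
    default:  // no activation: the clamp is a no-op
      *min = -FLT_MAX;
      *max = FLT_MAX;
      break;
  }
}
```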
Z)]; - r += TO_FLT4(src_final * f); + FLT4 flt_p = filter[fx_c]; + FLT4 src_p = src_data[((y_c * src_size.x + x_c) * src_size.z + Z)]; + r += TO_FLT4(src_p * flt_p); } } } - FLT4 bias_val = bias[Z]; - FLT4 res0 = TO_FLT4(r) + bias_val; - res0 = clamp(res0, (FLT)(0.0f), (FLT)(relu_clip1)); - dst_data[((Y * dst_size.x + X) * dst_size.z + Z)] = res0; + FLT4 bias_p = bias[Z]; + FLT4 res = TO_FLT4(r) + bias_p; + res = clamp(res, (FLT)(relu_clip_min), (FLT)(relu_clip_max)); + dst_data[((Y * dst_size.x + X) * dst_size.z + Z)] = res; } diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/matmul.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/matmul.cl index 480be10107..e67c73358e 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/cl/matmul.cl +++ b/mindspore/lite/src/runtime/kernel/opencl/cl/matmul.cl @@ -1,3 +1,4 @@ +#pragma OPENCL EXTENSION cl_khr_fp16 : enable __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; __kernel void MatMul(__read_only image2d_t input, __global FLT16 *weight, __read_only image2d_t bias, __write_only image2d_t output, int2 offset_ci, int2 offset_co, int has_bias) { diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/prelu.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/prelu.cl new file mode 100644 index 0000000000..40fbb4cfe3 --- /dev/null +++ b/mindspore/lite/src/runtime/kernel/opencl/cl/prelu.cl @@ -0,0 +1,30 @@ +#pragma OPENCL EXTENSION cl_arm_printf : enable + +#define SLICES 4 +#define UP_DIV(x, y) (((x) + (y) - (1)) / (y)) +__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + +__kernel void PRelu(__read_only image2d_t input, __write_only image2d_t output, const int4 input_shape, + __global float *alpha, const int dim) { + int C = input_shape.w; // channel size + + int Y = get_global_id(0); // height id + int X = get_global_id(1); // weight id + for (int num = 0; num < UP_DIV(C, SLICES); ++num) { + FLT4 in_c4 = READ_IMAGE(input, smp_zero, (int2)(X * UP_DIV(C, SLICES) + num, Y)); // NHWC4: H WC + FLT4 tmp; + if (dim == 1) { + tmp.x = in_c4.x >= 0 ? in_c4.x : in_c4.x * (*alpha); + tmp.y = in_c4.y >= 0 ? in_c4.y : in_c4.y * (*alpha); + tmp.z = in_c4.z >= 0 ? in_c4.z : in_c4.z * (*alpha); + tmp.w = in_c4.w >= 0 ? in_c4.w : in_c4.w * (*alpha); + } else { + int index = num * 4; + tmp.x = in_c4.x >= 0 ? in_c4.x : in_c4.x * alpha[index]; + tmp.y = in_c4.y >= 0 ? in_c4.y : in_c4.y * alpha[index + 1]; + tmp.z = in_c4.z >= 0 ? in_c4.z : in_c4.z * alpha[index + 2]; + tmp.w = in_c4.w >= 0 ? 
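The new prelu.cl above distinguishes a single shared slope (dim == 1, every lane multiplied by *alpha) from a per-channel slope (alpha indexed by channel). A scalar C++ reference of the same computation, assuming NHWC layout so the channel index is the flat index modulo the channel count:

```cpp
#include <cstddef>

// Reference for the PRelu kernel: y = x for x >= 0, y = alpha * x otherwise.
// shared_alpha corresponds to the kernel's dim == 1 case.
void PReluRef(const float *in, float *out, size_t size, const float *alpha,
              size_t channels, bool shared_alpha) {
  for (size_t i = 0; i < size; ++i) {
    float a = shared_alpha ? alpha[0] : alpha[i % channels];
    out[i] = in[i] >= 0.0f ? in[i] : in[i] * a;
  }
}
```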
in_c4.w : in_c4.w * alpha[index + 3]; + } + WRITE_IMAGE(output, (int2)(X * UP_DIV(C, SLICES) + num, Y), tmp); // NHWC4: H WC + } +} diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/reshape.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/reshape.cl index bb9892b575..b51c514856 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/cl/reshape.cl +++ b/mindspore/lite/src/runtime/kernel/opencl/cl/reshape.cl @@ -1,3 +1,4 @@ +#pragma OPENCL EXTENSION cl_khr_fp16 : enable __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; __kernel void reshape(__read_only image2d_t src_data, __write_only image2d_t dst_data, int4 size) { int X = get_global_id(0); diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/slice.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/slice.cl index 5fc704ae4c..72a20cd293 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/cl/slice.cl +++ b/mindspore/lite/src/runtime/kernel/opencl/cl/slice.cl @@ -1,6 +1,6 @@ +#pragma OPENCL EXTENSION cl_khr_fp16 : enable #define INT2 int2 #define INT4 int4 -#define FLT4 float4 __constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST; __kernel void slice(__read_only image2d_t input, __write_only image2d_t output, INT4 input_shape, INT4 out_shape, INT4 begin, INT2 sharedNoUpdiv) { @@ -12,46 +12,43 @@ __kernel void slice(__read_only image2d_t input, __write_only image2d_t output, FLT4 result; if (sharedNoUpdiv.x % 4 == 0) { for (int i = 0; i < out_shape.w; i++) { - result = read_imagef(input, smp_none, (INT2)((Y + begin.z) * input_shape.w + (i + begin.w), (X + begin.y))); - write_imagef(output, (INT2)((Y)*out_shape.w + i, (X)), result); + result = READ_IMAGE(input, smp_none, (INT2)((Y + begin.z) * input_shape.w + (i + begin.w), (X + begin.y))); + WRITE_IMAGE(output, (INT2)((Y)*out_shape.w + i, (X)), result); } } else { int begin_postion = sharedNoUpdiv.y % 4; - FLT4 first = read_imagef(input, smp_none, (INT2)((Y + begin.z) * input_shape.w + begin.w, (X + begin.y))); + FLT4 first = READ_IMAGE(input, smp_none, (INT2)((Y + begin.z) * input_shape.w + begin.w, (X + begin.y))); if (begin_postion == 1) { for (int i = 1; i <= out_shape.w; i++) { - FLT4 second = - read_imagef(input, smp_none, (INT2)((Y + begin.z) * input_shape.w + (begin.w + i), (X + begin.y))); + FLT4 second = READ_IMAGE(input, smp_none, (INT2)((Y + begin.z) * input_shape.w + (begin.w + i), (X + begin.y))); result.x = first.y; result.y = first.z; result.z = first.w; result.w = second.x; - write_imagef(output, (INT2)((Y)*out_shape.w + i - 1, (X)), result); + WRITE_IMAGE(output, (INT2)((Y)*out_shape.w + i - 1, (X)), result); first.y = second.y; first.z = second.z; first.w = second.w; } } else if (begin_postion == 2) { for (int i = 1; i <= out_shape.w; i++) { - FLT4 second = - read_imagef(input, smp_none, (INT2)((Y + begin.z) * input_shape.w + (begin.w + i), (X + begin.y))); + FLT4 second = READ_IMAGE(input, smp_none, (INT2)((Y + begin.z) * input_shape.w + (begin.w + i), (X + begin.y))); result.x = first.z; result.y = first.w; result.z = second.x; result.w = second.y; - write_imagef(output, (INT2)((Y)*out_shape.w + i - 1, (X)), result); + WRITE_IMAGE(output, (INT2)((Y)*out_shape.w + i - 1, (X)), result); first.z = second.z; first.w = second.w; } } else { for (int i = 1; i <= out_shape.w; i++) { - FLT4 second = - read_imagef(input, smp_none, (INT2)((Y + begin.z) * input_shape.w + (begin.w + i), (X + begin.y))); + FLT4 second = READ_IMAGE(input, smp_none, (INT2)((Y + begin.z) * 
input_shape.w + (begin.w + i), (X + begin.y))); result.x = first.w; result.y = second.x; result.z = second.y; result.w = second.z; - write_imagef(output, (INT2)((Y)*out_shape.w + i - 1, (X)), result); + WRITE_IMAGE(output, (INT2)((Y)*out_shape.w + i - 1, (X)), result); first.w = second.w; } } @@ -64,18 +61,18 @@ __kernel void slice(__read_only image2d_t input, __write_only image2d_t output, result_fill0.y = 0; result_fill0.z = 0; result_fill0.w = 0; - write_imagef(output, (INT2)((Y)*out_shape.w + out_shape.w - 1, (X)), result_fill0); + WRITE_IMAGE(output, (INT2)((Y)*out_shape.w + out_shape.w - 1, (X)), result_fill0); } else if (size == 2) { result_fill0.x = result.x; result_fill0.y = result.y; result_fill0.z = 0; result_fill0.w = 0; - write_imagef(output, (INT2)((Y)*out_shape.w + out_shape.w - 1, (X)), result_fill0); + WRITE_IMAGE(output, (INT2)((Y)*out_shape.w + out_shape.w - 1, (X)), result_fill0); } else if (size == 3) { result_fill0.x = result.x; result_fill0.y = result.y; result_fill0.z = result.z; result_fill0.w = 0; - write_imagef(output, (INT2)((Y)*out_shape.w + out_shape.w - 1, (X)), result_fill0); + WRITE_IMAGE(output, (INT2)((Y)*out_shape.w + out_shape.w - 1, (X)), result_fill0); } } diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/to_format.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/to_format.cl index 31b6e02b55..bc08a62c2b 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/cl/to_format.cl +++ b/mindspore/lite/src/runtime/kernel/opencl/cl/to_format.cl @@ -1,14 +1,5 @@ +#pragma OPENCL EXTENSION cl_khr_fp16 : enable __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; -__kernel void to_format_NCHW_to_NHWC4_IMG(__global FLT4 *src_data, __write_only image2d_t dst_data, int4 size, - int4 shape) { - int X = get_global_id(0); - int Y = get_global_id(1); - int Z = get_global_id(2); - if (X >= size.x || Y >= size.y || Z >= size.z) { - return; - } - // WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X))); -} __kernel void to_format_NHWC_to_NHWC4_IMG(__global FLT4 *src_data, __write_only image2d_t dst_data, int4 size, int4 shape) { int X = get_global_id(0); @@ -46,58 +37,17 @@ __kernel void to_format_NHWC4_to_NHWC4_IMG(__global FLT4 *src_data, __write_only } WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), src_data[(X * size.y + Y) * size.z + Z]); } -__kernel void to_format_NC4HW4_to_NHWC4_IMG(__global FLT4 *src_data, __write_only image2d_t dst_data, int4 size, - int4 shape) { - int X = get_global_id(0); - int Y = get_global_id(1); - int Z = get_global_id(2); - if (X >= size.x || Y >= size.y || Z >= size.z) { - return; - } - // WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X))); -} -__kernel void to_format_NCHW_to_NC4HW4_IMG(__global FLT4 *src_data, __write_only image2d_t dst_data, int4 size, - int4 shape) { - int X = get_global_id(0); - int Y = get_global_id(1); - int Z = get_global_id(2); - if (X >= size.x || Y >= size.y || Z >= size.z) { - return; - } - // WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X))); -} -__kernel void to_format_NHWC_to_NC4HW4_IMG(__global FLT4 *src_data, __write_only image2d_t dst_data, int4 size, - int4 shape) { - int X = get_global_id(0); - int Y = get_global_id(1); - int Z = get_global_id(2); - if (X >= size.x || Y >= size.y || Z >= size.z) { - return; - } - // WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), 
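The unaligned branches of slice.cl above implement a channel shift: when the slice begins k channels into a 4-pack (k = begin % 4, so 1 to 3), every output FLT4 is stitched from two neighbouring input FLT4s. A small sketch of that recombination, assuming plain 4-element arrays:

    #include <array>

    // Illustrative only: build one output 4-pack from two adjacent input
    // 4-packs when the slice starts k channels into a pack (k = 1..3),
    // mirroring the begin_postion branches of slice.cl.
    std::array<float, 4> StitchC4(const std::array<float, 4> &first,
                                  const std::array<float, 4> &second, int k) {
      std::array<float, 4> out{};
      for (int i = 0; i < 4; ++i) {
        int src = i + k;
        out[i] = src < 4 ? first[src] : second[src - 4];
      }
      return out;
    }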
READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X))); -} -__kernel void to_format_NHWC4_to_NC4HW4_IMG(__global FLT4 *src_data, __write_only image2d_t dst_data, int4 size, - int4 shape) { - int X = get_global_id(0); - int Y = get_global_id(1); - int Z = get_global_id(2); - if (X >= size.x || Y >= size.y || Z >= size.z) { - return; - } - // WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X))); -} __kernel void to_format_NC4HW4_to_NC4HW4_IMG(__global FLT4 *src_data, __write_only image2d_t dst_data, int4 size, int4 shape) { - int X = get_global_id(0); - int Y = get_global_id(1); - int Z = get_global_id(2); + // size(h, w, c4, 1), shape(n, c, h, w) + int X = get_global_id(0); // h + int Y = get_global_id(1); // w + int Z = get_global_id(2); // c4 if (X >= size.x || Y >= size.y || Z >= size.z) { return; } - // FLT4 src_final = src_data[(((Z)*src_size.y + (y_c)) * src_size.x + (x_c))]; - WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), src_data[(Y * size.z + Z) * size.x + X]); + WRITE_IMAGE(dst_data, (int2)(Y, Z * size.x + X), src_data[(Z * size.x + X) * size.y + Y]); } - __kernel void to_format_NCHW_to_NCHW_BUF(__read_only image2d_t src_data, __global FLT4 *dst_data, int4 size, int4 shape) { int X = get_global_id(0); @@ -108,56 +58,6 @@ __kernel void to_format_NCHW_to_NCHW_BUF(__read_only image2d_t src_data, __globa } dst_data[(Z * size.y + Y) * size.x + X] = READ_IMAGE(src_data, smp_zero, (int2)(Y * size.x + X, Z)); } -__kernel void to_format_NHWC_to_NCHW_BUF(__read_only image2d_t src_data, __global FLT4 *dst_data, int4 size, - int4 shape) { - int X = get_global_id(0); - int Y = get_global_id(1); - int Z = get_global_id(2); - if (X >= size.x || Y >= size.y || Z >= size.z) { - return; - } - // WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X))); -} -__kernel void to_format_NHWC4_to_NCHW_BUF(__read_only image2d_t src_data, __global FLT4 *dst_data, int4 size, - int4 shape) { - int X = get_global_id(0); - int Y = get_global_id(1); - int Z = get_global_id(2); - if (X >= size.x || Y >= size.y || Z >= size.z) { - return; - } - // WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X))); -} -__kernel void to_format_NC4HW4_to_NCHW_BUF(__read_only image2d_t src_data, __global FLT4 *dst_data, int4 size, - int4 shape) { - int X = get_global_id(0); - int Y = get_global_id(1); - int Z = get_global_id(2); - if (X >= size.x || Y >= size.y || Z >= size.z) { - return; - } - // WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X))); -} -__kernel void to_format_NCHW_to_NHWC_BUF(__read_only image2d_t src_data, __global FLT4 *dst_data, int4 size, - int4 shape) { - int X = get_global_id(0); - int Y = get_global_id(1); - int Z = get_global_id(2); - if (X >= size.x || Y >= size.y || Z >= size.z) { - return; - } - // WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X))); -} -__kernel void to_format_NHWC_to_NHWC_BUF(__read_only image2d_t src_data, __global FLT4 *dst_data, int4 size, - int4 shape) { - int X = get_global_id(0); - int Y = get_global_id(1); - int Z = get_global_id(2); - if (X >= size.x || Y >= size.y || Z >= size.z) { - return; - } - // WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X))); -} __kernel void to_format_NHWC4_to_NHWC_BUF(__read_only image2d_t src_data, __global FLT4 
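The corrected NC4HW4 kernels above all use the same addressing; a sketch of the buffer index and image coordinate for one C4 slice, with size = {H, W, C4} as in the kernel comment (helper names are made up):

    // Hypothetical helpers restating the NC4HW4 addressing fixed above:
    // element (c4, h, w) sits at buffer index ((c4 * H + h) * W + w) and at
    // image coordinate (x = w, y = c4 * H + h).
    struct Coord {
      int x;
      int y;
    };

    Coord Nc4hw4ImageCoord(int c4, int h, int w, int H) { return {w, c4 * H + h}; }

    int Nc4hw4BufferIndex(int c4, int h, int w, int H, int W) { return (c4 * H + h) * W + w; }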
*dst_data, int4 size, int4 shape) { int X = get_global_id(0); @@ -184,25 +84,16 @@ __kernel void to_format_NHWC4_to_NHWC_BUF(__read_only image2d_t src_data, __glob } } } -__kernel void to_format_NC4HW4_to_to_NHWC_BUF(__read_only image2d_t src_data, __global FLT4 *dst_data, int4 size, - int4 shape) { - int X = get_global_id(0); - int Y = get_global_id(1); - int Z = get_global_id(2); - if (X >= size.x || Y >= size.y || Z >= size.z) { - return; - } - // WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X))); -} __kernel void to_format_NC4HW4_to_NC4HW4_BUF(__read_only image2d_t src_data, __global FLT4 *dst_data, int4 size, int4 shape) { - int X = get_global_id(0); - int Y = get_global_id(1); - int Z = get_global_id(2); + // size(h, w, c, 1), shape(n, c, h, w) + int X = get_global_id(0); // h + int Y = get_global_id(1); // w + int Z = get_global_id(2); // c if (X >= size.x || Y >= size.y || Z >= size.z) { return; } - dst_data[(Y * size.z + Z) * size.x + X] = READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X)); + dst_data[(Z * size.x + X) * size.y + Y] = READ_IMAGE(src_data, smp_zero, (int2)(Y, Z * size.x + X)); } __kernel void to_format_NHWC4_to_NHWC4_BUF(__read_only image2d_t src_data, __global FLT4 *dst_data, int4 size, int4 shape) { diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/transpose.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/transpose.cl index 05f903602e..0076b5fdb5 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/cl/transpose.cl +++ b/mindspore/lite/src/runtime/kernel/opencl/cl/transpose.cl @@ -1,3 +1,4 @@ +#pragma OPENCL EXTENSION cl_khr_fp16 : enable __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; __kernel void transpose_IMG(__read_only image2d_t src_data, __write_only image2d_t dst_data, int2 HW, int2 C) { int X = get_global_id(0); @@ -75,8 +76,8 @@ __kernel void transpose_BUF(__read_only image2d_t src_data, global FLT4 *dst_dat result[3].z = x2.w; result[3].w = x3.w; - dst_data[4 * Y * HW.y + X] = result[0]; - dst_data[(4 * Y + 1) * HW.y + X] = result[1]; - dst_data[(4 * Y + 2) * HW.y + X] = result[2]; - dst_data[(4 * Y + 3) * HW.y + X] = result[3]; + if (4 * Y < C.x) dst_data[4 * Y * HW.y + X] = result[0]; + if (4 * Y + 1 < C.x) dst_data[(4 * Y + 1) * HW.y + X] = result[1]; + if (4 * Y + 2 < C.x) dst_data[(4 * Y + 2) * HW.y + X] = result[2]; + if (4 * Y + 3 < C.x) dst_data[(4 * Y + 3) * HW.y + X] = result[3]; } diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/activation.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/activation.cc index 80beb6ed54..b40cd5ea7c 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/activation.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/activation.cc @@ -15,6 +15,7 @@ */ #include +#include #include #include @@ -23,7 +24,6 @@ #include "src/kernel_registry.h" #include "src/runtime/runtime_api.h" #include "include/errorcode.h" - #include "src/runtime/kernel/opencl/cl/activation.cl.inc" using mindspore::kernel::KERNEL_ARCH::kGPU; @@ -39,61 +39,58 @@ using mindspore::schema::PrimitiveType_Activation; namespace mindspore::kernel { int ActivationOpenClKernel::Init() { - const int max_shape_dim = 4; - if (in_tensors_[0]->shape().size() != max_shape_dim) { - MS_LOG(ERROR) << "Activate fun only support dim=4, but your dim=" << in_tensors_[0]->shape().size(); + in_size_ = in_tensors_[0]->shape().size(); + out_size_ = out_tensors_[0]->shape().size(); + if (in_size_ != 2 && in_size_ != 4) { + 
MS_LOG(ERROR) << "Activate fun only support dim=4 or 2, but your dim=" << in_size_; return RET_ERROR; } - std::string program_name = ""; - std::string kernel_name = ""; - std::string source = activation_source; - if (type_ == ActivationType_RELU) { - program_name = "RELU"; - kernel_name = "Relu"; - } else if (type_ == ActivationType_RELU6) { - program_name = "RELU6"; - kernel_name = "Relu6"; - } else if (type_ == ActivationType_LEAKY_RELU) { - program_name = "LEAKY_RELU"; - kernel_name = "ReluScalar"; - } else if (type_ == ActivationType_SIGMOID) { - program_name = "SIGMOID"; - kernel_name = "Sigmoid"; - } else { - MS_LOG(ERROR) << "Activation type error"; + std::map> Program_Kernel{ + {ActivationType_LEAKY_RELU, std::vector{"LEAKY_RELU", "ReluScalar"}}, + {ActivationType_RELU, std::vector{"RELU", "Relu"}}, + {ActivationType_SIGMOID, std::vector{"SIGMOID", "Sigmoid"}}, + {ActivationType_RELU6, std::vector{"RELU6", "Relu6"}}}; + if (Program_Kernel.count(type_) == 0) { + MS_LOG(ERROR) << "schema::ActivationType:" << type_ << "not found"; return RET_ERROR; } + + std::string source = activation_source; std::set build_options; auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); - ocl_runtime->LoadSource(program_name, source); - ocl_runtime->BuildKernel(kernel_, program_name, kernel_name, build_options); + ocl_runtime->LoadSource(Program_Kernel[type_][0], source); + ocl_runtime->BuildKernel(kernel_, Program_Kernel[type_][0], Program_Kernel[type_][1], build_options); + + std::map format{{4, schema::Format_NHWC4}, {2, schema::Format_NC4}}; + if (format.count(out_size_) == 0) { + MS_LOG(ERROR) << "Not found output tensor format"; + return RET_ERROR; + } in_ori_format_ = in_tensors_[0]->GetFormat(); - in_tensors_[0]->SetFormat(schema::Format_NHWC4); out_ori_format_ = out_tensors_[0]->GetFormat(); - out_tensors_[0]->SetFormat(schema::Format_NHWC4); + in_tensors_[0]->SetFormat(format[in_size_]); + out_tensors_[0]->SetFormat(format[out_size_]); + if (in_size_ == 2) { + in_ori_format_ = schema::Format_NC4; + out_ori_format_ = schema::Format_NC4; + } MS_LOG(DEBUG) << op_parameter_->name_ << " init Done!"; return RET_OK; } int ActivationOpenClKernel::Run() { MS_LOG(DEBUG) << op_parameter_->name_ << " begin running!"; - int N = in_tensors_[0]->shape()[0]; - int H = in_tensors_[0]->shape()[1]; - int W = in_tensors_[0]->shape()[2]; - int C = in_tensors_[0]->shape()[3]; - cl_int4 input_shape = {N, H, W, C}; - + cl_int4 img2d_shape = GetImg2dShape(); auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); int arg_idx = 0; ocl_runtime->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->Data()); ocl_runtime->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->Data()); - ocl_runtime->SetKernelArg(kernel_, arg_idx++, input_shape); + ocl_runtime->SetKernelArg(kernel_, arg_idx++, img2d_shape); if (type_ == ActivationType_LEAKY_RELU) { ocl_runtime->SetKernelArg(kernel_, arg_idx++, alpha_); } std::vector local = {1, 1}; - std::vector global = {static_cast(H), static_cast(W)}; - std::cout << type_ << " " << std::endl; + std::vector global = {static_cast(img2d_shape.s[1]), static_cast(img2d_shape.s[2])}; auto ret = ocl_runtime->RunKernel(kernel_, global, local, nullptr); if (ret != RET_OK) { MS_LOG(ERROR) << "Run kernel:" << op_parameter_->name_ << " fail."; @@ -102,11 +99,21 @@ int ActivationOpenClKernel::Run() { return RET_OK; } -int ActivationOpenClKernel::GetImageSize(size_t idx, std::vector *img_size) { - int H = in_tensors_[0]->shape()[1]; - int W = in_tensors_[0]->shape()[2]; - int C = 
in_tensors_[0]->shape()[3]; +cl_int4 ActivationOpenClKernel::GetImg2dShape() { + cl_int4 img2d_shape = {0, 0, 0, 0}; + for (int i = 0; i < in_size_; ++i) { + img2d_shape.s[i + 4 - in_size_] = in_tensors_[0]->shape()[i]; + } + if (in_size_ == 2) { + img2d_shape.s[1] = img2d_shape.s[2]; + img2d_shape.s[2] = UP_DIV(img2d_shape.s[3], C4NUM); + img2d_shape.s[3] = C4NUM; + } + return img2d_shape; +} +int ActivationOpenClKernel::GetImageSize(size_t idx, std::vector *img_size) { + cl_int4 img_shape = GetImg2dShape(); #ifdef ENABLE_FP16 size_t img_dtype = CL_HALF_FLOAT; #else @@ -114,8 +121,8 @@ int ActivationOpenClKernel::GetImageSize(size_t idx, std::vector *img_si #endif img_size->clear(); - img_size->push_back(W * UP_DIV(C, C4NUM)); - img_size->push_back(H); + img_size->push_back(img_shape.s[2] * UP_DIV(img_shape.s[3], C4NUM)); + img_size->push_back(img_shape.s[1]); img_size->push_back(img_dtype); return RET_OK; } @@ -125,11 +132,11 @@ kernel::LiteKernel *OpenClActivationFp32KernelCreator(const std::vectorshape()[0] > 1) { + if (inputs[0]->shape().size() > 2 && inputs[0]->shape()[0] > 1) { MS_LOG(ERROR) << "Activation kernel:" << opParameter->name_ << " failed: Unsupported multi-batch."; return nullptr; } diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/activation.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/activation.h index c4ee6779b0..d31d744a63 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/activation.h +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/activation.h @@ -38,11 +38,14 @@ class ActivationOpenClKernel : public OpenCLKernel { int Init() override; int Run() override; int GetImageSize(size_t idx, std::vector *img_size) override; + cl_int4 GetImg2dShape(); private: cl::Kernel kernel_; int type_; float alpha_; + int in_size_; + int out_size_; }; } // namespace mindspore::kernel diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.cc index 7fb0a15a58..a5bcf0422a 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.cc @@ -103,10 +103,7 @@ int ArithmeticOpenCLKernel::Init() { lite::STATUS error_code = RET_OK; #ifdef PROGRAM_WITH_IL - bool ret = runtime_->CreateKernelFromIL(kernel_(), kernel_name); - if (!ret) { - error_code = RET_ERROR; - } + kernel_ = ocl_runtime->GetKernelFromBinary(kernel_name); #else if (out_mem_type_ == OpenCLMemType::IMG) { kernel_name += "_IMG"; @@ -134,7 +131,6 @@ int ArithmeticOpenCLKernel::Run() { MS_LOG(DEBUG) << this->name() << " Running!"; auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); - uint32_t element_num = out_tensors_[0]->ElementsC4Num(); int arg_idx = 0; ocl_runtime->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->Data()); diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/batchnorm.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/batchnorm.cc index c0036a160f..ee8eba8ad3 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/batchnorm.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/batchnorm.cc @@ -38,11 +38,12 @@ int BatchNormOpenCLKernel::GetImageSize(size_t idx, std::vector *img_siz im_dst_y = out_tensors_[0]->Height() * CO4; im_dst_x = out_tensors_[0]->Width(); } -#ifdef ENABLE_FP16 - size_t img_dtype = CL_HALF_FLOAT; -#else size_t img_dtype = CL_FLOAT; -#endif + auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); + auto enable_fp16_ = ocl_runtime->GetFp16Enable(); + if (enable_fp16_) { 
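GetImg2dShape above is what lets the 2-D case reuse the 4-D image plumbing: a {N, C} tensor is presented as an image with height N, width ceil(C / 4) and 4 packed channels. A sketch of the resulting shape vector (UpDiv stands in for the UP_DIV macro; the batch slot stays 0 exactly as in the code):

    #include <array>

    constexpr int UpDiv(int x, int y) { return (x + y - 1) / y; }

    // Sketch of GetImg2dShape for a 2-D {n, c} tensor: {unused, H, W, C}.
    std::array<int, 4> Img2dShape2D(int n, int c) {
      return {0, n, UpDiv(c, 4), 4};
    }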
+    img_dtype = CL_HALF_FLOAT;
+  }
   img_size->clear();
   std::vector<size_t> vec{im_dst_x, im_dst_y, img_dtype};
   *img_size = vec;
@@ -148,4 +149,5 @@ kernel::LiteKernel *OpenCLBatchnormKernelCreator(const std::vector
+#include 
+#include 
+#include 
+
+#include "src/kernel_registry.h"
+#include "include/errorcode.h"
+#include "src/runtime/kernel/opencl/kernel/biasadd.h"
+#include "src/runtime/opencl/opencl_runtime.h"
+#include "src/runtime/kernel/opencl/cl/biasadd.cl.inc"
+
+using mindspore::kernel::KERNEL_ARCH::kGPU;
+using mindspore::lite::KernelRegistrar;
+using mindspore::lite::RET_ERROR;
+using mindspore::lite::RET_OK;
+using mindspore::schema::PrimitiveType_BiasAdd;
+
+namespace mindspore::kernel {
+
+void BiasAddOpenCLKernel::InitBuffer() {
+  int C = in_tensors_[1]->shape()[0];
+  int div_ci = UP_DIV(C, C4NUM);
+  auto allocator = lite::opencl::OpenCLRuntime::GetInstance()->GetAllocator();
+  BiasAdd_ = reinterpret_cast<FLOAT_t *>(allocator->Malloc(div_ci * C4NUM * sizeof(FLOAT_t)));
+  BiasAdd_ = reinterpret_cast<FLOAT_t *>(allocator->MapBuffer(BiasAdd_, CL_MAP_WRITE, nullptr, true));
+  memset(BiasAdd_, 0x00, div_ci * C4NUM * sizeof(FLOAT_t));
+  auto origin_weight = reinterpret_cast<FLOAT_t *>(in_tensors_[1]->Data());
+  for (int i = 0; i < in_tensors_[1]->ElementsNum(); ++i) {
+    BiasAdd_[i] = origin_weight[i];
+  }
+  allocator->UnmapBuffer(BiasAdd_);
+}
+
+int BiasAddOpenCLKernel::Init() {
+  in_size_ = in_tensors_[0]->shape().size();
+  out_size_ = out_tensors_[0]->shape().size();
+  if (in_size_ != 4 && in_size_ != 2) {
+    MS_LOG(ERROR) << "BiasAdd only supports dim=4 or 2, but your dim=" << in_size_;
+    return RET_ERROR;
+  }
+  int C = in_tensors_[0]->shape()[3];
+  int Bias_Size = in_tensors_[1]->shape()[0];
+  if (UP_DIV(Bias_Size, C4NUM) != UP_DIV(C, C4NUM)) {
+    MS_LOG(ERROR) << "BiasAdd weight channel size:" << Bias_Size << " must be equal to in_tensors channel size:" << C;
+    return RET_ERROR;
+  }
+  InitBuffer();
+  std::set<std::string> build_options;
+  std::string source = biasadd_source;
+  std::string program_name = "BiasAdd";
+  std::string kernel_name = "BiasAdd";
+  auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
+  ocl_runtime->LoadSource(program_name, source);
+  ocl_runtime->BuildKernel(kernel_, program_name, kernel_name, build_options);
+
+  in_ori_format_ = in_tensors_[0]->GetFormat();
+  out_ori_format_ = out_tensors_[0]->GetFormat();
+  std::map<int, schema::Format> format{{4, schema::Format_NHWC4}, {2, schema::Format_NC4}};
+  if (format.count(out_size_) == 0) {
+    MS_LOG(ERROR) << "Output tensor format not found";
+    return RET_ERROR;
+  }
+  in_tensors_[0]->SetFormat(format[in_size_]);
+  out_tensors_[0]->SetFormat(format[out_size_]);
+  if (in_size_ == 2) {
+    in_ori_format_ = format[in_size_];
+    out_ori_format_ = format[out_size_];
+  }
+  MS_LOG(DEBUG) << program_name << " Init Done!";
+  return RET_OK;
+}
+
+int BiasAddOpenCLKernel::Run() {
+  cl_int4 input_shape = GetImg2dShape();
+  MS_LOG(DEBUG) << op_parameter_->name_ << " Running!";
+  auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
+  int arg_idx = 0;
+  ocl_runtime->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->Data());
+  ocl_runtime->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->Data());
+  ocl_runtime->SetKernelArg(kernel_, arg_idx++, input_shape);
+  ocl_runtime->SetKernelArg(kernel_, arg_idx++, BiasAdd_);
+  ocl_runtime->SetKernelArg(kernel_, arg_idx++, in_size_);
+  std::vector<size_t> local = {1, 1};
+  std::vector<size_t> global = {static_cast<size_t>(input_shape.s[1]), static_cast<size_t>(input_shape.s[2])};
+  auto ret = ocl_runtime->RunKernel(kernel_, global, local, nullptr);
+  if (ret != RET_OK) {
+ 
MS_LOG(ERROR) << "Run kernel " << op_parameter_->name_ << " error."; + return RET_ERROR; + } + return RET_OK; +} + +cl_int4 BiasAddOpenCLKernel::GetImg2dShape() { + cl_int4 img2d_shape = {0, 0, 0, 0}; + for (int i = 0; i < in_size_; ++i) { + img2d_shape.s[i + 4 - in_size_] = in_tensors_[0]->shape()[i]; + } + if (in_size_ == 2) { + img2d_shape.s[1] = img2d_shape.s[2]; + img2d_shape.s[2] = UP_DIV(img2d_shape.s[3], C4NUM); + img2d_shape.s[3] = C4NUM; + } + return img2d_shape; +} + +int BiasAddOpenCLKernel::GetImageSize(size_t idx, std::vector *img_size) { + cl_int4 img_shape = GetImg2dShape(); +#ifdef ENABLE_FP16 + size_t img_dtype = CL_HALF_FLOAT; +#else + size_t img_dtype = CL_FLOAT; +#endif + + img_size->clear(); + img_size->push_back(img_shape.s[2] * UP_DIV(img_shape.s[3], C4NUM)); + img_size->push_back(img_shape.s[1]); + img_size->push_back(img_dtype); + return RET_OK; +} + +kernel::LiteKernel *OpenCLBiasAddKernelCreator(const std::vector &inputs, + const std::vector &outputs, + OpParameter *opParameter, const lite::Context *ctx, + const kernel::KernelKey &desc, const lite::PrimitiveC *primitive) { + if (inputs.size() == 0) { + MS_LOG(ERROR) << "Input data size must be greater than 0, but your size is " << inputs.size(); + return nullptr; + } + if (inputs[0]->shape()[0] > 1) { + MS_LOG(ERROR) << "Input data size unsupported multi-batch."; + return nullptr; + } + auto *kernel = new (std::nothrow) BiasAddOpenCLKernel(reinterpret_cast(opParameter), inputs, outputs); + if (kernel == nullptr) { + MS_LOG(ERROR) << "Kernel " << opParameter->name_ << "is nullptr."; + return nullptr; + } + + auto ret = kernel->Init(); + if (ret != RET_OK) { + MS_LOG(ERROR) << "Init BiasAdd kernel failed!"; + delete kernel; + return nullptr; + } + return kernel; +} + +REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_BiasAdd, OpenCLBiasAddKernelCreator) +} // namespace mindspore::kernel diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/caffe_prelu.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/biasadd.h similarity index 65% rename from mindspore/lite/src/runtime/kernel/opencl/kernel/caffe_prelu.h rename to mindspore/lite/src/runtime/kernel/opencl/kernel/biasadd.h index 9a25aef4fa..56535afddb 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/caffe_prelu.h +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/biasadd.h @@ -14,8 +14,8 @@ * limitations under the License. 
*/ -#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_CAFFEPRELU_H_ -#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_CAFFEPRELU_H_ +#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_BIASADD_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_BIASADD_H_ #include #include @@ -27,23 +27,26 @@ namespace mindspore::kernel { -class CaffePReluOpenCLKernel : public OpenCLKernel { +class BiasAddOpenCLKernel : public OpenCLKernel { public: - explicit CaffePReluOpenCLKernel(OpParameter *parameter, const std::vector &inputs, - const std::vector &outputs) + explicit BiasAddOpenCLKernel(OpParameter *parameter, const std::vector &inputs, + const std::vector &outputs) : OpenCLKernel(parameter, inputs, outputs) {} - ~CaffePReluOpenCLKernel() override{}; + ~BiasAddOpenCLKernel() override{}; int Init() override; int Run() override; int GetImageSize(size_t idx, std::vector *img_size) override; - void CaffeWeight(); + void InitBuffer(); + cl_int4 GetImg2dShape(); private: cl::Kernel kernel_; - FLOAT_t *CaffeWeight_; + FLOAT_t *BiasAdd_; + int in_size_; + int out_size_; }; } // namespace mindspore::kernel -#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_CAFFEPRELU_H_ +#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_BIASADD_H_ diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/caffe_prelu.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/caffe_prelu.cc deleted file mode 100644 index 07a2d51754..0000000000 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/caffe_prelu.cc +++ /dev/null @@ -1,152 +0,0 @@ -/** - * Copyright 2020 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include -#include -#include - -#include "src/kernel_registry.h" -#include "include/errorcode.h" -#include "src/runtime/kernel/opencl/kernel/caffe_prelu.h" -#include "src/runtime/opencl/opencl_runtime.h" -#include "src/runtime/kernel/opencl/cl/caffe_prelu.cl.inc" - -using mindspore::kernel::KERNEL_ARCH::kGPU; -using mindspore::lite::KernelRegistrar; -using mindspore::lite::RET_ERROR; -using mindspore::lite::RET_OK; -using mindspore::schema::PrimitiveType_CaffePReLU; - -namespace mindspore::kernel { - -void CaffePReluOpenCLKernel::CaffeWeight() { - int C = in_tensors_[1]->shape()[0]; - int div_ci = UP_DIV(C, C4NUM); - std::cout << div_ci << std::endl; - auto allocator = lite::opencl::OpenCLRuntime::GetInstance()->GetAllocator(); - CaffeWeight_ = reinterpret_cast(allocator->Malloc(div_ci * C4NUM * sizeof(FLOAT_t))); - CaffeWeight_ = reinterpret_cast(allocator->MapBuffer(CaffeWeight_, CL_MAP_WRITE, nullptr, true)); - memset(CaffeWeight_, 0x00, div_ci * C4NUM * sizeof(FLOAT_t)); - auto origin_weight = reinterpret_cast(in_tensors_[1]->Data()); - for (int i = 0; i < in_tensors_[1]->ElementsNum(); ++i) { - CaffeWeight_[i] = origin_weight[i]; - } - allocator->UnmapBuffer(CaffeWeight_); -} - -int CaffePReluOpenCLKernel::Init() { - if (in_tensors_[0]->shape().size() != 4) { - MS_LOG(ERROR) << "Caffe PRelu only support dim=4, but your dim=" << in_tensors_[0]->shape().size(); - return RET_ERROR; - } - CaffeWeight(); - std::set build_options; - std::string source = caffe_prelu_source; - std::string program_name = "CaffePRelu"; - std::string kernel_name = "CaffePRelu"; - auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); - ocl_runtime->LoadSource(program_name, source); - ocl_runtime->BuildKernel(kernel_, program_name, kernel_name, build_options); - in_ori_format_ = in_tensors_[0]->GetFormat(); - in_tensors_[0]->SetFormat(schema::Format_NHWC4); - out_ori_format_ = out_tensors_[0]->GetFormat(); - out_tensors_[0]->SetFormat(schema::Format_NHWC4); - MS_LOG(DEBUG) << program_name << " Init Done!"; - return RET_OK; -} - -int CaffePReluOpenCLKernel::Run() { - int N = in_tensors_[0]->shape()[0]; - int H = in_tensors_[0]->shape()[1]; - int W = in_tensors_[0]->shape()[2]; - int C = in_tensors_[0]->shape()[3]; - - cl_int4 input_shape = {N, H, W, C}; - int C_Weight = in_tensors_[1]->shape()[0]; - if (UP_DIV(C_Weight, C4NUM) != UP_DIV(C, C4NUM)) { - MS_LOG(ERROR) << "CaffePRelu weight channel size:" << C_Weight - << " must be equal with in_teneors channel size:" << C; - return RET_ERROR; - } - - MS_LOG(DEBUG) << op_parameter_->name_ << " Running!"; - auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); - int arg_idx = 0; - ocl_runtime->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->Data()); - ocl_runtime->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->Data()); - ocl_runtime->SetKernelArg(kernel_, arg_idx++, input_shape); - ocl_runtime->SetKernelArg(kernel_, arg_idx++, CaffeWeight_); - std::vector local = {1, 1}; - std::vector global = {static_cast(H), static_cast(W)}; - auto ret = ocl_runtime->RunKernel(kernel_, global, local, nullptr); - if (ret != RET_OK) { - MS_LOG(ERROR) << "Run kernel " << op_parameter_->name_ << " error."; - return RET_ERROR; - } - return RET_OK; -} - -int CaffePReluOpenCLKernel::GetImageSize(size_t idx, std::vector *img_size) { - int H = in_tensors_[0]->shape()[1]; - int W = in_tensors_[0]->shape()[2]; - int C = in_tensors_[0]->shape()[3]; - -#ifdef ENABLE_FP16 - size_t img_dtype = CL_HALF_FLOAT; -#else - size_t img_dtype = CL_FLOAT; -#endif - - 
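Both the new BiasAdd InitBuffer and the deleted CaffeWeight above follow the same host-side pattern: allocate a C4-aligned buffer, map it, zero the padding, copy the C real values, unmap. A sketch of the alignment step alone, using a plain std::vector in place of the OpenCL allocator:

    #include <cstring>
    #include <vector>

    // Pad a C-element bias/slope vector to a multiple of 4, zero-filling the
    // tail; stands in for Malloc + MapBuffer + memset + copy + UnmapBuffer.
    std::vector<float> PadToC4(const float *data, int channels) {
      int aligned = ((channels + 3) / 4) * 4;  // UP_DIV(C, C4NUM) * C4NUM
      std::vector<float> padded(aligned, 0.0f);
      std::memcpy(padded.data(), data, channels * sizeof(float));
      return padded;
    }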
img_size->clear(); - img_size->push_back(W * UP_DIV(C, C4NUM)); - img_size->push_back(H); - img_size->push_back(img_dtype); - return RET_OK; -} - -kernel::LiteKernel *OpenCLCaffePReluKernelCreator(const std::vector &inputs, - const std::vector &outputs, - OpParameter *opParameter, const lite::Context *ctx, - const kernel::KernelKey &desc, - const mindspore::lite::PrimitiveC *primitive) { - if (inputs.size() == 0) { - MS_LOG(ERROR) << "Input data size must be greater than 0, but your size is " << inputs.size(); - return nullptr; - } - if (inputs[0]->shape()[0] > 1) { - MS_LOG(ERROR) << "Init CaffePRelu kernel failed: Unsupported multi-batch."; - return nullptr; - } - auto *kernel = - new (std::nothrow) CaffePReluOpenCLKernel(reinterpret_cast(opParameter), inputs, outputs); - if (kernel == nullptr) { - MS_LOG(ERROR) << "Kernel " << opParameter->name_ << "is nullptr."; - return nullptr; - } - - auto ret = kernel->Init(); - if (ret != RET_OK) { - MS_LOG(ERROR) << "Init CaffePRelu kernel failed!"; - delete kernel; - return nullptr; - } - return kernel; -} - -REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_CaffePReLU, OpenCLCaffePReluKernelCreator) -} // namespace mindspore::kernel diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/concat.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/concat.cc index 14e0662c99..2203508a97 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/concat.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/concat.cc @@ -38,11 +38,12 @@ int ConcatOpenCLKernel::GetImageSize(size_t idx, std::vector *img_size) im_dst_y = out_tensors_[0]->Height() * CO4; im_dst_x = out_tensors_[0]->Width(); } -#ifdef ENABLE_FP16 - size_t img_dtype = CL_HALF_FLOAT; -#else size_t img_dtype = CL_FLOAT; -#endif + auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); + auto enable_fp16_ = ocl_runtime->GetFp16Enable(); + if (enable_fp16_) { + img_dtype = CL_HALF_FLOAT; + } img_size->clear(); std::vector vec{im_dst_x, im_dst_y, img_dtype}; *img_size = vec; @@ -225,4 +226,5 @@ kernel::LiteKernel *OpenCLConcatKernelCreator(const std::vector #include +#include "nnacl/fp32/common_func.h" #include "src/kernel_registry.h" #include "src/runtime/opencl/opencl_runtime.h" #include "src/runtime/kernel/opencl/kernel/conv2d_transpose.h" @@ -35,14 +36,15 @@ int Conv2dTransposeOpenCLKernel::Init() { MS_LOG(ERROR) << "only support kh=kw=2 and stride_h=stride_w=2."; return RET_ERROR; } - if (param->pad_h_ != 0 || param->pad_w_ != 0) { + if (param->pad_u_ != 0 || param->pad_l_ != 0) { MS_LOG(ERROR) << "only support pad =0."; return RET_ERROR; } std::string kernel_name = "conv2d_transpose2x2"; auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); + enable_fp16_ = ocl_runtime->GetFp16Enable(); #ifdef PROGRAM_WITH_IL - ocl_runtime->CreateKernelFromIL(kernel_(), kernel_name); + kernel_ = ocl_runtime->GetKernelFromBinary(kernel_name); #else std::string source = conv2d_transpose2x2_source; std::set build_options; @@ -70,13 +72,15 @@ void Conv2dTransposeOpenCLKernel::PadWeight() { int div_ci = UP_DIV(ci, C4NUM); int div_co = UP_DIV(co, C4NUM); auto allocator = lite::opencl::OpenCLRuntime::GetInstance()->GetAllocator(); + auto data_size = enable_fp16_ ? 
sizeof(float16_t) : sizeof(float); // IHWO to OHWI4(I)4(O)(converter format is IHWO) // init padWeight_(buffer mem) - padWeight_ = - reinterpret_cast(allocator->Malloc(div_ci * div_co * C4NUM * C4NUM * kh * kw * sizeof(FLOAT_t))); - padWeight_ = reinterpret_cast(allocator->MapBuffer(padWeight_, CL_MAP_WRITE, nullptr, true)); - auto origin_weight = reinterpret_cast(in_tensors_.at(kWeightIndex)->Data()); + padWeight_ = allocator->Malloc(div_ci * div_co * C4NUM * C4NUM * kh * kw * data_size); + padWeight_ = allocator->MapBuffer(padWeight_, CL_MAP_WRITE, nullptr, true); + memset(padWeight_, 0x00, div_ci * div_co * C4NUM * C4NUM * kh * kw * data_size); + auto origin_weight = in_tensors_.at(kWeightIndex)->Data(); + auto weight_dtype = in_tensors_.at(kWeightIndex)->data_type(); int index = 0; for (int co_i = 0; co_i < div_co; co_i++) { for (int kh_i = 0; kh_i < kh; kh_i++) { @@ -87,10 +91,20 @@ void Conv2dTransposeOpenCLKernel::PadWeight() { int co_offset = co_i * C4NUM + co4_i; int ci_offset = ci_i * C4NUM + ci4_i; if (co_offset < co && ci_offset < ci) { - int ori_index = ((ci_offset * kh + kh_i) * kw + kw_i) * ci + co_offset; - padWeight_[index++] = origin_weight[ori_index]; + int ori_index = ((ci_offset * kh + kh_i) * kw + kw_i) * co + co_offset; + if (enable_fp16_) { + if (weight_dtype == kNumberTypeFloat32) { + reinterpret_cast(padWeight_)[index++] = + Float32ToShort(reinterpret_cast(origin_weight)[ori_index]); + } else { + reinterpret_cast(padWeight_)[index++] = + reinterpret_cast(origin_weight)[ori_index]; + } + } else { + reinterpret_cast(padWeight_)[index++] = reinterpret_cast(origin_weight)[ori_index]; + } } else { - padWeight_[index++] = 0.; + index++; } } } @@ -104,30 +118,36 @@ void Conv2dTransposeOpenCLKernel::PadWeight() { size_t im_dst_x, im_dst_y; im_dst_x = div_co; im_dst_y = 1; -#ifdef ENABLE_FP16 - size_t img_dtype = CL_HALF_FLOAT; -#else size_t img_dtype = CL_FLOAT; -#endif + if (enable_fp16_) { + img_dtype = CL_HALF_FLOAT; + } std::vector img_size{im_dst_x, im_dst_y, img_dtype}; - bias_ = reinterpret_cast(allocator->Malloc(im_dst_x * im_dst_y * C4NUM * sizeof(FLOAT_t), img_size)); - bias_ = reinterpret_cast(allocator->MapBuffer(bias_, CL_MAP_WRITE, nullptr, true)); - memset(bias_, 0x00, div_co * C4NUM * sizeof(FLOAT_t)); + bias_ = allocator->Malloc(im_dst_x * im_dst_y * C4NUM * data_size, img_size); + bias_ = allocator->MapBuffer(bias_, CL_MAP_WRITE, nullptr, true); + memset(bias_, 0x00, div_co * C4NUM * data_size); + auto bias_dtype = in_tensors_[2]->data_type(); if (in_tensors_.size() >= 3) { - memcpy(bias_, in_tensors_[2]->Data(), co * sizeof(FLOAT_t)); + if (bias_dtype == kNumberTypeFloat32 && enable_fp16_) { + auto fdata = reinterpret_cast(in_tensors_[2]->Data()); + for (int i = 0; i < co; i++) { + reinterpret_cast(bias_)[i] = Float32ToShort(fdata[i]); + } + } else { + memcpy(bias_, in_tensors_[2]->Data(), co * data_size); + } } allocator->UnmapBuffer(bias_); } int Conv2dTransposeOpenCLKernel::GetImageSize(size_t idx, std::vector *img_size) { size_t im_dst_x, im_dst_y; - im_dst_x = UP_DIV(out_tensors_[0]->Channel() * out_tensors_[0]->Width(), C4NUM); + im_dst_x = out_tensors_[0]->Width() * UP_DIV(out_tensors_[0]->Channel(), C4NUM); im_dst_y = out_tensors_[0]->Height(); -#ifdef ENABLE_FP16 - size_t img_dtype = CL_HALF_FLOAT; -#else size_t img_dtype = CL_FLOAT; -#endif + if (enable_fp16_) { + img_dtype = CL_HALF_FLOAT; + } img_size->clear(); std::vector vec{im_dst_x, im_dst_y, img_dtype}; *img_size = vec; @@ -145,9 +165,10 @@ int Conv2dTransposeOpenCLKernel::Run() { 
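PadWeight above repacks deconvolution weights from IHWO into the O/4 H W I/4 (I4)(O4) tiling the kernel consumes, zero-padding the channel tails; note the source index now advances by co, matching the IHWO layout. A float-only sketch of the index mapping (the real code additionally converts to fp16 when enabled):

    #include <vector>

    // Illustrative IHWO -> O/4 H W I/4 (I4)(O4) repack with zero padding.
    std::vector<float> RepackIhwoToTiled(const float *src, int ci, int co, int kh, int kw) {
      const int C4 = 4;
      int div_ci = (ci + 3) / 4, div_co = (co + 3) / 4;
      std::vector<float> dst(div_ci * div_co * C4 * C4 * kh * kw, 0.0f);
      int index = 0;
      for (int co_i = 0; co_i < div_co; co_i++)
        for (int kh_i = 0; kh_i < kh; kh_i++)
          for (int kw_i = 0; kw_i < kw; kw_i++)
            for (int ci_i = 0; ci_i < div_ci; ci_i++)
              for (int ci4 = 0; ci4 < C4; ci4++)
                for (int co4 = 0; co4 < C4; co4++, index++) {
                  int c_o = co_i * C4 + co4, c_i = ci_i * C4 + ci4;
                  if (c_o < co && c_i < ci)
                    dst[index] = src[((c_i * kh + kh_i) * kw + kw_i) * co + c_o];
                }
      return dst;
    }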
ConvParameter *param = reinterpret_cast(op_parameter_); int ci = in_tensors_[0]->Channel(); int co = out_tensors_[0]->Channel(); + int co4 = UP_DIV(co, C4NUM); int kh = param->kernel_h_; int kw = param->kernel_w_; - int pad = param->pad_h_; + int pad = param->pad_u_; int oh = out_tensors_[0]->Height(); int ow = out_tensors_[0]->Width(); int h = in_tensors_[0]->Height(); @@ -156,7 +177,7 @@ int Conv2dTransposeOpenCLKernel::Run() { // local size should less than MAX_GROUP_SIZE std::vector local = {16, 1, 16}; std::vector global = {UP_ROUND((size_t)UP_ROUND(oh / 2, 2), local[0]), - UP_ROUND((size_t)UP_ROUND(ow / 2, 2), local[1]), UP_ROUND((size_t)co / 4, local[2])}; + UP_ROUND((size_t)UP_ROUND(ow / 2, 2), local[1]), UP_ROUND(co4, local[2])}; cl_int2 kernel_size = {kh, kw}; cl_int2 stride = {2, 2}; @@ -165,8 +186,8 @@ int Conv2dTransposeOpenCLKernel::Run() { cl_int4 dst_size = {oh, ow, UP_DIV(co, C4NUM), 1}; int arg_cnt = 0; ocl_runtime->SetKernelArg(kernel_, arg_cnt++, in_tensors_[0]->Data()); - ocl_runtime->SetKernelArg(kernel_, arg_cnt++, padWeight_); - ocl_runtime->SetKernelArg(kernel_, arg_cnt++, bias_); + ocl_runtime->SetKernelArg(kernel_, arg_cnt++, padWeight_, lite::opencl::MemType::BUF); + ocl_runtime->SetKernelArg(kernel_, arg_cnt++, bias_, lite::opencl::MemType::BUF); ocl_runtime->SetKernelArg(kernel_, arg_cnt++, out_tensors_[0]->Data()); ocl_runtime->SetKernelArg(kernel_, arg_cnt++, kernel_size); ocl_runtime->SetKernelArg(kernel_, arg_cnt++, stride); @@ -197,4 +218,5 @@ kernel::LiteKernel *OpenCLConv2dTransposeKernelCreator(const std::vector *img_size) override; private: - ConvParameter *parameter_; cl::Kernel kernel_; - FLOAT_t *padWeight_; - FLOAT_t *bias_; + void *padWeight_; + void *bias_; + bool enable_fp16_{false}; }; } // namespace mindspore::kernel diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.cc index 29ef635c42..8200549b6d 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.cc @@ -17,9 +17,11 @@ #include #include #include +#include "src/common/utils.h" #include "src/runtime/kernel/opencl/kernel/convolution.h" #include "src/kernel_registry.h" #include "include/errorcode.h" +#include "nnacl/fp32/common_func.h" using mindspore::kernel::KERNEL_ARCH::kGPU; using mindspore::lite::KernelRegistrar; @@ -31,6 +33,7 @@ namespace mindspore::kernel { int ConvolutionOpenCLKernel::Init() { static int init_count = 0; auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); + use_fp16_ = ocl_runtime->GetFp16Enable(); auto allocator = ocl_runtime->GetAllocator(); std::set build_options; init_count++; @@ -73,19 +76,15 @@ int ConvolutionOpenCLKernel::Init() { // allocate winograd memory if (use_winograd_) { -#ifdef ENABLE_FP16 - size_t img_dtype = CL_HALF_FLOAT; - size_t sizeof_datatype = 2; -#else - size_t img_dtype = CL_FLOAT; - size_t sizeof_datatype = 4; -#endif - size_t size = TILES_XY * CI_SLICES * 36 * sizeof_datatype; + size_t img_dtype = use_fp16_ ? CL_HALF_FLOAT : CL_FLOAT; + size_t sizeof_FLT = use_fp16_ ? 
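The NDRange above rounds every global dimension up to a multiple of the local work-group size, which is why the kernels must bounds-check (as the transpose kernel now does). The rounding helper, restated as a self-contained sketch:

    #include <cstddef>

    // Round x up to the next multiple of y, as UP_ROUND does for the global
    // work sizes above; e.g. UpRound(33, 16) == 48.
    constexpr size_t UpRound(size_t x, size_t y) { return ((x + y - 1) / y) * y; }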
2 : 4; + + size_t size = TILES_XY * CI_SLICES * 36 * sizeof_FLT; size_t width = TILES_XY; size_t height = CI_SLICES * 36; winograd_mem0_ = allocator->Malloc(size, {width, height, img_dtype}); - size = TILES_XY * CO_SLICES * 36 * sizeof_datatype; + size = TILES_XY * CO_SLICES * 36 * sizeof_FLT; width = TILES_XY; height = CO_SLICES * 36; winograd_mem1_ = allocator->Malloc(size, {width, height, img_dtype}); @@ -103,6 +102,7 @@ int ConvolutionOpenCLKernel::Init() { int ConvolutionOpenCLKernel::InitBuffer() { auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); auto allocator = ocl_runtime->GetAllocator(); + size_t sizeof_FLT = use_fp16_ ? 2 : 4; auto param = reinterpret_cast(op_parameter_); size_t KH = param->kernel_h_; @@ -111,15 +111,18 @@ int ConvolutionOpenCLKernel::InitBuffer() { constexpr size_t CO_TILE = C4NUM; size_t packed_weight_size; if (use_winograd_) { - packed_weight_size = UP_DIV(CO, 8) * 6 * 6 * CI_SLICES * 2 * CI_TILE * CO_TILE * sizeof(float); + packed_weight_size = UP_DIV(CO, 8) * 6 * 6 * CI_SLICES * 2 * CI_TILE * CO_TILE * sizeof_FLT; } else { - packed_weight_size = CO_SLICES * KH * KW * CI_SLICES * CI_TILE * CO_TILE * sizeof(float); + packed_weight_size = CO_SLICES * KH * KW * CI_SLICES * CI_TILE * CO_TILE * sizeof_FLT; } - packed_weight_ = reinterpret_cast(allocator->Malloc(packed_weight_size)); + packed_weight_ = allocator->Malloc(packed_weight_size); + auto packed_weight_fp32 = reinterpret_cast(packed_weight_); + auto packed_weight_fp16 = reinterpret_cast(packed_weight_); allocator->MapBuffer(packed_weight_, CL_MAP_WRITE, nullptr, true); memset(packed_weight_, 0x00, packed_weight_size); auto weight_tensor = in_tensors_[1]; - auto origin_weight = reinterpret_cast(weight_tensor->Data()); + auto origin_weight_fp32 = reinterpret_cast(weight_tensor->Data()); + auto origin_weight_fp16 = reinterpret_cast(weight_tensor->Data()); if (use_winograd_) { // weight: OHWI -> O66I -> O/8 6 6 I/4 O2 I4 O4 @@ -141,7 +144,11 @@ int ConvolutionOpenCLKernel::InitBuffer() { for (int kh = 0; kh < 3; ++kh) { for (int kw = 0; kw < 3; ++kw) { const int f_index = ((co * 3 + kh) * 3 + kw) * CI + ci; - in_vals[kh * 3 + kw] = origin_weight[f_index]; + if (use_fp16_) { + in_vals[kh * 3 + kw] = ShortToFloat32(origin_weight_fp16[f_index]); + } else { + in_vals[kh * 3 + kw] = origin_weight_fp32[f_index]; + } } } @@ -169,14 +176,18 @@ int ConvolutionOpenCLKernel::InitBuffer() { (((((co_outer * 6 + kh) * 6 + kw) * CI_SLICES + ci_outer) * 2 + co_inner_group) * CI_TILE + ci_inner) * CO_TILE + co_inner; - packed_weight_[dst_idx] = encoded_weight[src_idx++]; + if (use_fp16_) { + packed_weight_fp16[dst_idx] = Float32ToShort(encoded_weight[src_idx++]); + } else { + packed_weight_fp32[dst_idx] = encoded_weight[src_idx++]; + } } } } } } else { // weight: OHWI -> O/4 H W I/4 I4 O4 - for (int co = 0; co < CO; ++co) { + for (int co = 0, src_idx = 0; co < CO; ++co) { for (int kh = 0; kh < KH; ++kh) { for (int kw = 0; kw < KW; ++kw) { for (int ci = 0; ci < CI; ++ci) { @@ -184,8 +195,13 @@ int ConvolutionOpenCLKernel::InitBuffer() { auto co_inner = co % CO_TILE; auto ci_outer = ci / CI_TILE; auto ci_inner = ci % CI_TILE; - packed_weight_[((((co_outer * KH + kh) * KW + kw) * CI_SLICES + ci_outer) * CI_TILE + ci_inner) * CO_TILE + - co_inner] = *(origin_weight++); + size_t dst_idx = + ((((co_outer * KH + kh) * KW + kw) * CI_SLICES + ci_outer) * CI_TILE + ci_inner) * CO_TILE + co_inner; + if (use_fp16_) { + packed_weight_fp16[dst_idx] = origin_weight_fp16[src_idx++]; + } else { + packed_weight_fp32[dst_idx] = 
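Float32ToShort above (from the nnacl common_func header included earlier in this hunk) converts fp32 weights to fp16 storage at pack time. A self-contained sketch of such a conversion, truncating the mantissa, for illustration only and not the library's exact implementation:

    #include <cstdint>
    #include <cstring>

    // Truncating fp32 -> fp16 bit conversion (sketch; denormals flush to
    // zero, overflow and NaN both collapse to infinity).
    uint16_t Fp32ToFp16Bits(float f) {
      uint32_t bits;
      std::memcpy(&bits, &f, sizeof(bits));
      uint16_t sign = (bits >> 16) & 0x8000;
      int32_t exp = static_cast<int32_t>((bits >> 23) & 0xff) - 127 + 15;
      if (exp <= 0) return sign;            // too small for a normal half
      if (exp >= 31) return sign | 0x7c00;  // overflow to infinity
      return sign | static_cast<uint16_t>(exp << 10) | ((bits >> 13) & 0x3ff);
    }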
origin_weight_fp32[src_idx++]; + } } } } @@ -195,14 +211,11 @@ int ConvolutionOpenCLKernel::InitBuffer() { // align bias from C to C4 auto bias_tensor = in_tensors_[2]; - size_t packed_bias_size = CO_SLICES * CO_TILE * sizeof(float); - packed_bias_ = reinterpret_cast(allocator->Malloc(packed_bias_size)); - packed_bias_ = reinterpret_cast(allocator->MapBuffer(packed_bias_, CL_MAP_WRITE, nullptr, true)); + size_t packed_bias_size = CO_SLICES * CO_TILE * sizeof_FLT; + packed_bias_ = allocator->Malloc(packed_bias_size); + allocator->MapBuffer(packed_bias_, CL_MAP_WRITE, nullptr, true); memset(packed_bias_, 0x00, packed_bias_size); - auto bias_data = reinterpret_cast(bias_tensor->Data()); - for (int co = 0; co < CO; ++co) { - packed_bias_[co] = bias_data[co]; - } + memcpy(packed_bias_, bias_tensor->Data(), CO * sizeof_FLT); allocator->UnmapBuffer(packed_bias_); return RET_OK; @@ -224,11 +237,7 @@ int ConvolutionOpenCLKernel::GetImageSize(size_t idx, std::vector *img_s im_dst_y = out_tensors_[0]->Height() * CO_SLICES; im_dst_x = out_tensors_[0]->Width(); } -#ifdef ENABLE_FP16 - size_t img_dtype = CL_HALF_FLOAT; -#else - size_t img_dtype = CL_FLOAT; -#endif + size_t img_dtype = use_fp16_ ? CL_HALF_FLOAT : CL_FLOAT; img_size->clear(); img_size->push_back(im_dst_x); img_size->push_back(im_dst_y); @@ -245,34 +254,34 @@ int ConvolutionOpenCLKernel::Run() { arg_cn = 0; cl_int4 _4x4to36_in_shape = {1, IH, IW, CI_SLICES}; cl_int4 _4x4to36_out_shape = {1, 36, TILES_XY, CI_SLICES}; - ocl_runtime->SetKernelArg(kernel_4x4to36, arg_cn++, in_tensors_[0]->Data()); - ocl_runtime->SetKernelArg(kernel_4x4to36, arg_cn++, winograd_mem0_); + ocl_runtime->SetKernelArg(kernel_4x4to36, arg_cn++, in_tensors_[0]->Data(), lite::opencl::MemType::IMG); + ocl_runtime->SetKernelArg(kernel_4x4to36, arg_cn++, winograd_mem0_, lite::opencl::MemType::IMG); ocl_runtime->SetKernelArg(kernel_4x4to36, arg_cn++, _4x4to36_in_shape); ocl_runtime->SetKernelArg(kernel_4x4to36, arg_cn++, _4x4to36_out_shape); arg_cn = 0; cl_int4 conv_in_shape = {1, 36, TILES_XY, CI_SLICES}; cl_int4 conv_out_shape = {1, 36, TILES_XY, CO_SLICES}; - ocl_runtime->SetKernelArg(kernel_conv, arg_cn++, winograd_mem0_); - ocl_runtime->SetKernelArg(kernel_conv, arg_cn++, winograd_mem1_); - ocl_runtime->SetKernelArg(kernel_conv, arg_cn++, packed_weight_); + ocl_runtime->SetKernelArg(kernel_conv, arg_cn++, winograd_mem0_, lite::opencl::MemType::IMG); + ocl_runtime->SetKernelArg(kernel_conv, arg_cn++, winograd_mem1_, lite::opencl::MemType::IMG); + ocl_runtime->SetKernelArg(kernel_conv, arg_cn++, packed_weight_, lite::opencl::MemType::BUF); ocl_runtime->SetKernelArg(kernel_conv, arg_cn++, conv_in_shape); ocl_runtime->SetKernelArg(kernel_conv, arg_cn++, conv_out_shape); arg_cn = 0; cl_int4 _36to4x4_in_shape = {1, 16, TILES_XY, CO_SLICES}; cl_int4 _36to4x4_out_shape = {1, OH, OW, CO_SLICES}; - ocl_runtime->SetKernelArg(kernel_36to4x4, arg_cn++, winograd_mem1_); - ocl_runtime->SetKernelArg(kernel_36to4x4, arg_cn++, out_tensors_[0]->Data()); - ocl_runtime->SetKernelArg(kernel_36to4x4, arg_cn++, packed_bias_); + ocl_runtime->SetKernelArg(kernel_36to4x4, arg_cn++, winograd_mem1_, lite::opencl::MemType::IMG); + ocl_runtime->SetKernelArg(kernel_36to4x4, arg_cn++, out_tensors_[0]->Data(), lite::opencl::MemType::IMG); + ocl_runtime->SetKernelArg(kernel_36to4x4, arg_cn++, packed_bias_, lite::opencl::MemType::BUF); ocl_runtime->SetKernelArg(kernel_36to4x4, arg_cn++, _36to4x4_in_shape); ocl_runtime->SetKernelArg(kernel_36to4x4, arg_cn++, _36to4x4_out_shape); } else { arg_cn = 0; - 
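The winograd path above dispatches three kernels in sequence; the tensor shapes handed between them chain as sketched below, where Shape4 mirrors the cl_int4 arguments {N, H, W, C4 slices} (names hypothetical):

    #include <array>

    using Shape4 = std::array<int, 4>;  // {N, H, W, C4 slices}

    // Shape flow of the three-stage pipeline: input -> 4x4to36 ->
    // WinogradConvolution -> 36to4x4 -> output.
    struct WinogradShapes {
      Shape4 input, transformed_in, transformed_out, output;
    };

    WinogradShapes MakeWinogradShapes(int ih, int iw, int oh, int ow,
                                      int ci_slices, int co_slices, int tiles_xy) {
      return {Shape4{1, ih, iw, ci_slices},
              Shape4{1, 36, tiles_xy, ci_slices},
              Shape4{1, 36, tiles_xy, co_slices},
              Shape4{1, oh, ow, co_slices}};
    }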
ocl_runtime->SetKernelArg(kernel_conv, arg_cn++, in_tensors_[0]->Data()); - ocl_runtime->SetKernelArg(kernel_conv, arg_cn++, out_tensors_[0]->Data()); - ocl_runtime->SetKernelArg(kernel_conv, arg_cn++, packed_weight_); - ocl_runtime->SetKernelArg(kernel_conv, arg_cn++, packed_bias_); + ocl_runtime->SetKernelArg(kernel_conv, arg_cn++, in_tensors_[0]->Data(), lite::opencl::MemType::IMG); + ocl_runtime->SetKernelArg(kernel_conv, arg_cn++, out_tensors_[0]->Data(), lite::opencl::MemType::IMG); + ocl_runtime->SetKernelArg(kernel_conv, arg_cn++, packed_weight_, lite::opencl::MemType::BUF); + ocl_runtime->SetKernelArg(kernel_conv, arg_cn++, packed_bias_, lite::opencl::MemType::BUF); } if (use_winograd_) { @@ -321,18 +330,9 @@ std::string ConvolutionOpenCLKernel::CodeGenConvolution() { code += "#define CI_SLICES " + std::to_string(CI_SLICES) + "\n"; code += "#define CO_SLICES " + std::to_string(CO_SLICES) + "\n\n"; -#ifdef ENABLE_FP16 - code += - "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n" - "#define FLT4 half4\n" - "#define READ_FLT4 read_imageh\n" - "#define WRITE_FLT4 write_imageh\n\n"; -#else - code += - "#define FLT4 float4\n" - "#define READ_FLT4 read_imagef\n" - "#define WRITE_FLT4 write_imagef\n\n"; -#endif + if (use_fp16_) { + code += "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n"; + } code += "__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;\n\n"; @@ -365,7 +365,7 @@ std::string ConvolutionOpenCLKernel::CodeGenConvolution() { " {\n" " for (int ci_slice = 0; ci_slice < CI_SLICES; ci_slice++)\n" " {\n"; - code += "FLT4 in_c4 = READ_FLT4(input, smp_zero, (int2)(iw * CI_SLICES + ci_slice, ih)); // NHWC4: H WC\n\n"; + code += "FLT4 in_c4 = READ_IMAGE(input, smp_zero, (int2)(iw * CI_SLICES + ci_slice, ih)); // NHWC4: H WC\n\n"; code += " out0_c4 += w0_ic1_oc4[0] * in_c4.x;\n" " out0_c4 += w0_ic1_oc4[1] * in_c4.y;\n" @@ -382,28 +382,29 @@ std::string ConvolutionOpenCLKernel::CodeGenConvolution() { " }\n\n"; code += " FLT4 out0_c4_bias = out0_c4 + bias[co_slice];\n"; - if (param->is_relu_) { + if (param->act_type_ == ActType_Relu) { code += " out0_c4_bias = max(out0_c4_bias, (FLT4)(0.0f));\n"; - } else if (param->is_relu6_) { + } else if (param->act_type_ == ActType_Relu6) { code += " out0_c4_bias = clamp(out0_c4_bias, (FLT4)(0.0f), (FLT4)(6.0f));\n"; } if (OW * CO_SLICES < 65536) { - code += " WRITE_FLT4(output, (int2)(ow * CO_SLICES + co_slice, oh), out0_c4_bias);// NHWC4: H WC\n}"; + code += " WRITE_IMAGE(output, (int2)(ow * CO_SLICES + co_slice, oh), out0_c4_bias);// NHWC4: H WC\n}"; } else { - code += " WRITE_FLT4(output, (int2)(oh * CO_SLICES + co_slice, ow), out0_c4_bias);// NHWC4: H WC\n}"; + code += " WRITE_IMAGE(output, (int2)(oh * CO_SLICES + co_slice, ow), out0_c4_bias);// NHWC4: H WC\n}"; } return code; } std::string ConvolutionOpenCLKernel::CodeGenWinograd4x4To36() { - return "#define UP_DIV(x, y) (((x) + (y) - (1)) / (y))\n" + return "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n" + "#define UP_DIV(x, y) (((x) + (y) - (1)) / (y))\n" "#define PAD 1\n" "\n" "__constant sampler_t\n" "smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;\n" "\n" - "constant float Bt[36] = {\n" + "constant FLT Bt[36] = {\n" " 1.0000000000f, 0.0000000000f, -2.5000004768f, -0.0000001192f, 1.0000001192f, 0.0000000000f,\n" " 0.0000000000f, 0.9428091049f, 1.3333333731f, -0.4714044929f, -0.6666667461f, 0.0000000000f,\n" " 0.0000000000f, -0.9428089857f, 1.3333334923f, 0.4714045525f, -0.6666667461f, 0.0000000000f,\n" @@ 
-433,52 +434,40 @@ std::string ConvolutionOpenCLKernel::CodeGenWinograd4x4To36() { " int tile_x = tile_xy % TILE_X;\n" " int tile_y = tile_xy / TILE_X;\n" "\n" - " constant float *Bt_row = Bt + row * 6;\n" - " float4 BtD_row[6] = {0};\n" + " constant FLT *Bt_row = Bt + row * 6;\n" + " FLT4 BtD_row[6] = {0};\n" " for (int y = 0; y < 6; y++)\n" " {\n" " int y_idx = tile_y * 4 - PAD + y;\n" " for (int x = 0; x < 6; x++)\n" " {\n" " int x_idx = (tile_x * 4 - PAD + x) * SLICES + slice;\n" - " BtD_row[x] += Bt_row[y] * read_imagef(input, smp_none, (int2)(x_idx, y_idx));\n" + " BtD_row[x] += Bt_row[y] * READ_IMAGE(input, smp_none, (int2)(x_idx, y_idx));\n" " }\n" " }\n" "\n" " for (int y = 0; y < 6; y++)\n" " {\n" - " float4 acc = (float4)(0.0f, 0.0f, 0.0f, 0.0f);\n" + " FLT4 acc = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);\n" " for (int x = 0; x < 6; x++)\n" " {\n" " acc += BtD_row[x] * Bt[y * 6 + x];\n" " }\n" - "// write_imagef(output, (int2)((row * 6 + y) * SLICES + slice, tile_xy), acc); // H WC W=36\n" - " write_imagef(output, (int2)(tile_xy, slice * 36 + (row * 6 + y)), acc); // CH W H=36\n" + " WRITE_IMAGE(output, (int2)(tile_xy, slice * 36 + (row * 6 + y)), acc); // CH W H=36\n" " }\n" "}"; } std::string ConvolutionOpenCLKernel::CodeGenWinogradConvolution() { - return "#define CI_TILE 4\n" + return "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n" + "#define CI_TILE 4\n" "#define H 36\n" - "//#define W 256\n" - "//#define CI 96\n" - "//#define CO 80s\n" - "//#define CI_SLICES 24\n" - "//#define CO_SLICES 20\n" - "\n" - "#define FLT4 float4\n" - "#define READ_FLT4 read_imagef\n" - "#define WRITE_FLT4 write_imagef\n" - "\n" - "//#define __global\n" - "\n" "__constant sampler_t\n" "smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;\n" "\n" "__kernel void WinogradConvolution(__read_only image2d_t input,\n" " __write_only image2d_t output,\n" - " __global float16 *weight,\n" + " __global FLT16 *weight,\n" " int4 input_shape, // N 36 H/4*W/4 CI_SLICES\n" " int4 output_shape) // N 36 H/4*W/4 CO_SLICES\n" "{\n" @@ -501,14 +490,14 @@ std::string ConvolutionOpenCLKernel::CodeGenWinogradConvolution() { " FLT4 out11 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);\n" "\n" " int y_idx = h;\n" - " __global float16 *weight_ptr = weight + (co_slice / 2 * 36 + h) * CI_SLICES * 2;\n" + " __global FLT16 *weight_ptr = weight + (co_slice / 2 * 36 + h) * CI_SLICES * 2;\n" " for (int ci_slice = 0; ci_slice < CI_SLICES; ci_slice++)\n" " {\n" - " FLT4 in0 = READ_FLT4(input, smp_none, (int2)(w + 0, y_idx));\n" - " FLT4 in1 = READ_FLT4(input, smp_none, (int2)(w + 1, y_idx));\n" + " FLT4 in0 = READ_IMAGE(input, smp_none, (int2)(w + 0, y_idx));\n" + " FLT4 in1 = READ_IMAGE(input, smp_none, (int2)(w + 1, y_idx));\n" " y_idx += 36;\n" "\n" - " float16 weight0 = weight_ptr[0], weight1 = weight_ptr[1];\n" + " FLT16 weight0 = weight_ptr[0], weight1 = weight_ptr[1];\n" " weight_ptr += 2;\n" "\n" "\n" @@ -533,18 +522,18 @@ std::string ConvolutionOpenCLKernel::CodeGenWinogradConvolution() { " out11 += in1.w * weight1.scdef;\n" " }\n" "\n" - " WRITE_FLT4(output, (int2)(w + 0, (co_slice + 0) * H + h), out00);\n" + " WRITE_IMAGE(output, (int2)(w + 0, (co_slice + 0) * H + h), out00);\n" " if (w + 1 < W)\n" " {\n" - " WRITE_FLT4(output, (int2)(w + 1, (co_slice + 0) * H + h), out01);\n" + " WRITE_IMAGE(output, (int2)(w + 1, (co_slice + 0) * H + h), out01);\n" " }\n" "\n" " if (co_slice + 1 < CO_SLICES)\n" " {\n" - " WRITE_FLT4(output, (int2)(w + 0, (co_slice + 1) * H + h), out10);\n" + " WRITE_IMAGE(output, (int2)(w + 0, 
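The generated 4x4to36 kernel above computes, for every 6x6 input tile d and channel slice, the winograd F(4x4, 3x3) input transform U = Bt * d * Bt^T, one row of Bt per work-item. A scalar restatement of that double product (illustrative; the Bt values live in the generated source):

    #include <array>

    using Mat6 = std::array<std::array<float, 6>, 6>;

    // U = Bt * d * Bt^T, as performed per tile by the kernel above.
    Mat6 WinogradInputTransform(const Mat6 &Bt, const Mat6 &d) {
      Mat6 tmp{}, u{};
      for (int i = 0; i < 6; ++i)  // tmp = Bt * d
        for (int j = 0; j < 6; ++j)
          for (int k = 0; k < 6; ++k) tmp[i][j] += Bt[i][k] * d[k][j];
      for (int i = 0; i < 6; ++i)  // u = tmp * Bt^T
        for (int j = 0; j < 6; ++j)
          for (int k = 0; k < 6; ++k) u[i][j] += tmp[i][k] * Bt[j][k];
      return u;
    }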
(co_slice + 1) * H + h), out10);\n" " if (w + 1 < W)\n" " {\n" - " WRITE_FLT4(output, (int2)(w + 1, (co_slice + 1) * H + h), out11);\n" + " WRITE_IMAGE(output, (int2)(w + 1, (co_slice + 1) * H + h), out11);\n" " }\n" " }\n" "}"; @@ -552,16 +541,11 @@ std::string ConvolutionOpenCLKernel::CodeGenWinogradConvolution() { std::string ConvolutionOpenCLKernel::CodeGenWinograd36To4x4() { std::string code = - "//#define TILE_XY 256\n" - "//#define SLICES 20\n" - "//#define OH 16\n" - "//#define OW 256\n" - "\n" - "//#define __global\n" + "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n" "__constant sampler_t\n" "smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;\n" "\n" - "constant float At[24] = {\n" + "constant FLT At[24] = {\n" " 1.0000000000f, 1.0000000000f, 1.0000000000f, 1.0000000000f, 1.0000000000f, 0.0000000000f,\n" " 0.0000000000f, 0.7071067691f, -0.7071067691f, 1.4142135382f, -1.4142135382f, 0.0000000000f,\n" " 0.0000000000f, 0.4999999702f, 0.4999999702f, 1.9999998808f, 1.9999998808f, 0.0000000000f,\n" @@ -570,7 +554,7 @@ std::string ConvolutionOpenCLKernel::CodeGenWinograd36To4x4() { "\n" "__kernel void Winograd36To4x4(__read_only image2d_t input,\n" " __write_only image2d_t output,\n" - " __global float4 *bias,\n" + " __global FLT4 *bias,\n" " int4 input_shape, // N 36 H/4*W/4 CO_SLICES\n" " int4 output_shape) // N H W CO_SLICES\n" "{\n" @@ -588,40 +572,38 @@ std::string ConvolutionOpenCLKernel::CodeGenWinograd36To4x4() { " return;\n" " }\n" "\n" - " constant float *At_row = At + row * 6;\n" - " float4 AtM_row[6] = {0};\n" + " constant FLT *At_row = At + row * 6;\n" + " FLT4 AtM_row[6] = {0};\n" " for (int y = 0; y < 6; y++)\n" " {\n" " for (int x = 0; x < 6; x++)\n" " {\n" - " AtM_row[x] += At_row[y] * read_imagef(input, smp_none, (int2)(tile_xy, slice * 36 + y * 6 + " - "x));\n" + " AtM_row[x] += At_row[y] * READ_IMAGE(input, smp_none, (int2)(tile_xy, slice * 36 + y * 6 + x));\n" " }\n" " }\n" "\n" " for (int x = 0; x < 4; x++)\n" " {\n" - " float4 acc = (float4)(0.0f, 0.0f, 0.0f, 0.0f);\n" + " FLT4 acc = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);\n" " for (int y = 0; y < 6; y++)\n" " {\n" " acc += AtM_row[y] * At[x * 6 + y];\n" " }\n" - " acc += bias[slice];\n"; + " acc += bias[slice];\n" + "\n"; auto param = reinterpret_cast(op_parameter_); - if (param->is_relu_) { - code += " acc = max(acc, (float4)(0.0f));\n"; - } else if (param->is_relu6_) { - code += " acc = clamp(acc, (float4)(0.0f), (float4)(6.0f));\n"; + if (param->act_type_ == ActType_Relu) { + code += " acc = max(acc, (FLT4)(0.0f));\n"; + } else if (param->act_type_ == ActType_Relu6) { + code += " acc = clamp(acc, (FLT4)(0.0f), (FLT4)(6.0f));\n"; } code += " int TILE_X = OW / 4;\n" " int tile_x = tile_xy % TILE_X * 4;\n" " int tile_y = tile_xy / TILE_X * 4;\n" - "// write_imagef(output, (int2)(tile_x + x, slice * OH + tile_y + row), acc); // height=CH width=W\n" - " write_imagef(output, (int2)((tile_x + x) * SLICES + slice, tile_y + row), acc); // height=H " - "width=WC\n" + " WRITE_IMAGE(output, (int2)((tile_x + x) * SLICES + slice, tile_y + row), acc); // height=H width=WC\n" " }\n" "}"; return code; diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.h index 4ad51a15a4..851fd09a13 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.h +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.h @@ -40,6 +40,8 @@ class ConvolutionOpenCLKernel : public OpenCLKernel { int GetImageSize(size_t idx, 
std::vector *img_size) override; private: + bool use_fp16_ = false; + int CI; int IH; int IW; @@ -48,8 +50,8 @@ class ConvolutionOpenCLKernel : public OpenCLKernel { int OW; int CI_SLICES; int CO_SLICES; - float *packed_weight_ = nullptr; - float *packed_bias_ = nullptr; + void *packed_weight_ = nullptr; + void *packed_bias_ = nullptr; bool use_winograd_ = false; int TILES_X; diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.cc index fbd146992c..57be8612e7 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.cc @@ -15,14 +15,17 @@ */ #include "src/runtime/kernel/opencl/kernel/depthwise_conv2d.h" +#include +#include #include #include #include #include "src/kernel_registry.h" #include "src/runtime/opencl/opencl_runtime.h" -#include "src/runtime/kernel/arm/fp32/convolution_depthwise.h" +#include "src/runtime/kernel/opencl/utils.h" +#include "nnacl/fp32/common_func.h" +#include "nnacl/op_base.h" #include "include/errorcode.h" -#include "nnacl/pack.h" #ifndef PROGRAM_WITH_IL @@ -64,7 +67,7 @@ int DepthwiseConv2dOpenCLKernel::Init() { kernel_name += "_1x1"; } #ifdef PROGRAM_WITH_IL - ocl_runtime->CreateKernelFromIL(kernel_(), kernel_name); + kernel_ = ocl_runtime->GetKernelFromBinary(kernel_name); #else std::string program_name = "DepthwiseConv2d"; std::set build_options; @@ -81,30 +84,50 @@ int DepthwiseConv2dOpenCLKernel::InitBuffer() { auto parameter = reinterpret_cast(op_parameter_); auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); auto allocator = ocl_runtime->GetAllocator(); + bool is_fp16 = ocl_runtime->GetFp16Enable(); // weight: o, h, w, i; o == group, i == 1 - auto origin_weight = reinterpret_cast(in_tensors_.at(kWeightIndex)->Data()); + void *origin_weight = in_tensors_.at(kWeightIndex)->Data(); int CO4 = UP_DIV(out_tensors_[0]->Channel(), C4NUM); int pack_weight_size = C4NUM * CO4 * parameter->kernel_h_ * parameter->kernel_w_; - packed_weight_ = reinterpret_cast(allocator->Malloc(pack_weight_size * sizeof(FLOAT_t))); - packed_weight_ = reinterpret_cast(allocator->MapBuffer(packed_weight_, CL_MAP_WRITE, nullptr, true)); int plane = parameter->kernel_h_ * parameter->kernel_w_; -#ifdef ENABLE_FP16 - PackNCHWToNC4HW4Fp16(origin_weight, packed_weight_, 1, plane, out_tensors_[0]->Channel()); -#else - PackNCHWToNC4HW4Fp32(origin_weight, packed_weight_, 1, plane, out_tensors_[0]->Channel()); -#endif + if (is_fp16) { + packed_weight_ = allocator->Malloc(pack_weight_size * sizeof(int16_t)); + packed_weight_ = allocator->MapBuffer(packed_weight_, CL_MAP_WRITE, nullptr, true); + if (in_tensors_.at(kWeightIndex)->data_type() == kNumberTypeFloat16) { + std::function to_dtype = [](int16_t x) -> int16_t { return x; }; + PackNCHWToNC4HW4(origin_weight, packed_weight_, 1, plane, out_tensors_[0]->Channel(), to_dtype); + } else if (in_tensors_.at(kWeightIndex)->data_type() == kNumberTypeFloat32) { + std::function to_dtype = Float32ToShort; + PackNCHWToNC4HW4(origin_weight, packed_weight_, 1, plane, out_tensors_[0]->Channel(), to_dtype); + } else { + MS_LOG(ERROR) << "Only support float16/float32, actual data type " << in_tensors_.at(kWeightIndex)->data_type(); + } + } else { + packed_weight_ = allocator->Malloc(pack_weight_size * sizeof(float)); + packed_weight_ = allocator->MapBuffer(packed_weight_, CL_MAP_WRITE, nullptr, true); + if (in_tensors_.at(kWeightIndex)->data_type() == 
kNumberTypeFloat32) { + std::function to_dtype = [](float x) -> float { return (float)x; }; + PackNCHWToNC4HW4(origin_weight, packed_weight_, 1, plane, out_tensors_[0]->Channel(), to_dtype); + } else { + MS_LOG(ERROR) << "Only support float16/float32, actual data type " << in_tensors_.at(kWeightIndex)->data_type(); + } + } allocator->UnmapBuffer(packed_weight_); if (in_tensors_.size() == kInputSize2) { - bias_data_ = reinterpret_cast(allocator->Malloc(C4NUM * CO4 * sizeof(FLOAT_t))); - bias_data_ = reinterpret_cast(allocator->MapBuffer(bias_data_, CL_MAP_WRITE, nullptr, true)); - size_t up_co_size = C4NUM * CO4 * sizeof(FLOAT_t); + size_t dtype_size = sizeof(float); + if (is_fp16 && in_tensors_.at(kBiasIndex)->data_type() == kNumberTypeFloat16) { + dtype_size = sizeof(int16_t); + } + bias_data_ = allocator->Malloc(C4NUM * CO4 * dtype_size); + bias_data_ = allocator->MapBuffer(bias_data_, CL_MAP_WRITE, nullptr, true); + size_t up_co_size = C4NUM * CO4 * dtype_size; memset(bias_data_, 0, up_co_size); - auto ori_bias = reinterpret_cast(in_tensors_.at(kBiasIndex)->Data()); - memcpy(bias_data_, ori_bias, out_tensors_[0]->Channel() * sizeof(FLOAT_t)); + auto ori_bias = in_tensors_.at(kBiasIndex)->Data(); + memcpy(bias_data_, ori_bias, out_tensors_[0]->Channel() * dtype_size); allocator->UnmapBuffer(bias_data_); } else { MS_ASSERT(in_tensors_.size() == kInputSize1); @@ -124,11 +147,10 @@ int DepthwiseConv2dOpenCLKernel::GetImageSize(size_t idx, std::vector *i im_dst_y = out_tensors_[0]->Height() * CO4; im_dst_x = out_tensors_[0]->Width(); } -#ifdef ENABLE_FP16 - size_t img_dtype = CL_HALF_FLOAT; -#else size_t img_dtype = CL_FLOAT; -#endif + if (lite::opencl::OpenCLRuntime::GetInstance()->GetFp16Enable()) { + img_dtype = CL_HALF_FLOAT; + } img_size->clear(); std::vector vec{im_dst_x, im_dst_y, img_dtype}; *img_size = vec; @@ -160,26 +182,29 @@ int DepthwiseConv2dOpenCLKernel::Run() { std::vector local; GetLocalSize(0, global, &local); - float relu_clip1 = 6.0; + std::map> relu_clips{ + {ActType_No, {FLT_MIN, FLT_MAX}}, {ActType_Relu, {0.0, FLT_MAX}}, {ActType_Relu6, {0, 6.0}}}; cl_int2 kernel_size = {parameter->kernel_h_, parameter->kernel_w_}; cl_int2 stride = {parameter->stride_h_, parameter->stride_w_}; - cl_int2 padding = {-parameter->pad_h_, -parameter->pad_w_}; + cl_int2 padding = {-parameter->pad_u_, -parameter->pad_l_}; cl_int2 dilation = {parameter->dilation_h_, parameter->dilation_w_}; cl_int4 src_size = {in_tensors_[0]->Width(), in_tensors_[0]->Height(), (cl_int)CI4, in_tensors_[0]->Batch()}; cl_int4 dst_size = {(cl_int)out_tensors_[0]->Width(), (cl_int)out_tensors_[0]->Height(), (cl_int)CO4, (cl_int)out_tensors_[0]->Batch()}; - ocl_runtime->SetKernelArg(kernel_, 1, packed_weight_); - ocl_runtime->SetKernelArg(kernel_, 2, bias_data_); - ocl_runtime->SetKernelArg(kernel_, 3, relu_clip1); - ocl_runtime->SetKernelArg(kernel_, 5, kernel_size); - ocl_runtime->SetKernelArg(kernel_, 6, stride); - ocl_runtime->SetKernelArg(kernel_, 7, padding); - ocl_runtime->SetKernelArg(kernel_, 8, dilation); - ocl_runtime->SetKernelArg(kernel_, 9, src_size); - ocl_runtime->SetKernelArg(kernel_, 10, dst_size); - ocl_runtime->SetKernelArg(kernel_, 0, in_tensors_[0]->Data()); - ocl_runtime->SetKernelArg(kernel_, 4, out_tensors_[0]->Data()); + int arg_cnt = 0; + ocl_runtime->SetKernelArg(kernel_, arg_cnt++, in_tensors_[0]->Data()); + ocl_runtime->SetKernelArg(kernel_, arg_cnt++, packed_weight_, lite::opencl::MemType::BUF); + ocl_runtime->SetKernelArg(kernel_, arg_cnt++, bias_data_, lite::opencl::MemType::BUF); + 
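Note: the InitBuffer() change above drops the compile-time ENABLE_FP16 branch and instead dispatches at runtime on both the fp16 flag and the weight tensor's data type, funnelling every case through one converter-driven pack. A minimal self-contained sketch of that pack, assuming C4NUM == 4; Float32ToHalfBitsStub is a named placeholder for the real Float32ToShort converter in nnacl/fp32/common_func.h, not an actual fp16 conversion:

#include <cstdint>
#include <functional>

constexpr int kC4Num = 4;  // channel tile width, stands in for C4NUM

template <typename Src, typename Dst>
void PackNCHWToNC4HW4Sketch(const Src *src, Dst *dst, int batch, int plane, int channel,
                            const std::function<Dst(Src)> &to_dtype) {
  const int c4 = (channel + kC4Num - 1) / kC4Num;  // UP_DIV(channel, C4NUM)
  for (int b = 0; b < batch; ++b) {
    const int src_offset = b * plane * channel;
    const int dst_offset = b * plane * c4 * kC4Num;
    for (int c = 0; c < channel; ++c) {
      const int block = c / kC4Num;
      const int rem = c % kC4Num;
      for (int k = 0; k < plane; ++k) {
        // channel-major source -> 4-channel-tiled destination, converted per element
        dst[dst_offset + block * plane * kC4Num + k * kC4Num + rem] =
            to_dtype(src[src_offset + c * plane + k]);
      }
    }
  }
}

// Placeholder converter: the real code produces the fp16 bit pattern of the float32 value.
inline int16_t Float32ToHalfBitsStub(float x) { return static_cast<int16_t>(x); }

void PackFp32WeightAsFp16(const float *w, int16_t *packed, int plane, int channel) {
  PackNCHWToNC4HW4Sketch<float, int16_t>(w, packed, 1, plane, channel, Float32ToHalfBitsStub);
}

The same index arithmetic appears in the PackNCHWToNC4HW4 template added to opencl/utils.h further down in this patch.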
ocl_runtime->SetKernelArg(kernel_, arg_cnt++, out_tensors_[0]->Data()); + ocl_runtime->SetKernelArg(kernel_, arg_cnt++, kernel_size); + ocl_runtime->SetKernelArg(kernel_, arg_cnt++, stride); + ocl_runtime->SetKernelArg(kernel_, arg_cnt++, padding); + ocl_runtime->SetKernelArg(kernel_, arg_cnt++, dilation); + ocl_runtime->SetKernelArg(kernel_, arg_cnt++, src_size); + ocl_runtime->SetKernelArg(kernel_, arg_cnt++, dst_size); + ocl_runtime->SetKernelArg(kernel_, arg_cnt++, relu_clips[parameter->act_type_].first); + ocl_runtime->SetKernelArg(kernel_, arg_cnt++, relu_clips[parameter->act_type_].second); ocl_runtime->RunKernel(kernel_, global, local, nullptr); return RET_OK; } @@ -204,5 +229,6 @@ kernel::LiteKernel *OpenCLDepthwiseConv2dKernelCreator(const std::vector #include "src/runtime/kernel/opencl/opencl_kernel.h" #include "nnacl/conv_parameter.h" - #include "src/runtime/opencl/opencl_runtime.h" namespace mindspore::kernel { @@ -46,8 +45,8 @@ class DepthwiseConv2dOpenCLKernel : public OpenCLKernel { int GetLocalSize(size_t idx, const std::vector &global_size, std::vector *local_size) override; private: - FLOAT_t *packed_weight_; - FLOAT_t *bias_data_; + void *packed_weight_; + void *bias_data_; cl::Kernel kernel_; }; } // namespace mindspore::kernel diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.cc index 9593cc419d..0356963ad3 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.cc @@ -16,6 +16,7 @@ #include #include +#include "nnacl/fp32/common_func.h" #include "src/kernel_registry.h" #include "src/runtime/opencl/opencl_runtime.h" #include "nnacl/fp32/matmul.h" @@ -34,9 +35,9 @@ namespace mindspore::kernel { int MatMulOpenCLKernel::Init() { std::string kernel_name = "MatMul"; auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); - + enable_fp16_ = ocl_runtime->GetFp16Enable(); #ifdef PROGRAM_WITH_IL - ocl_runtime->CreateKernelFromIL(kernel_(), kernel_name); + kernel_ = ocl_runtime->GetKernelFromBinary(kernel_name); #else std::set build_options; std::string source = matmul_source; @@ -74,11 +75,12 @@ int MatMulOpenCLKernel::ReSize() { return RET_OK; } void MatMulOpenCLKernel::PadWeight() { auto allocator = lite::opencl::OpenCLRuntime::GetInstance()->GetAllocator(); - padWeight_ = - reinterpret_cast(allocator->Malloc(sizeCI.s[1] * sizeCO.s[1] * C4NUM * C4NUM * sizeof(FLOAT_t))); - padWeight_ = reinterpret_cast(allocator->MapBuffer(padWeight_, CL_MAP_WRITE, nullptr, true)); - auto origin_weight = reinterpret_cast(in_tensors_.at(kWeightIndex)->Data()); + size_t dtype_size = enable_fp16_ ? 
sizeof(float16_t) : sizeof(float); + padWeight_ = allocator->Malloc(sizeCI.s[1] * sizeCO.s[1] * C4NUM * C4NUM * dtype_size); + padWeight_ = allocator->MapBuffer(padWeight_, CL_MAP_WRITE, nullptr, true); + memset(padWeight_, 0x00, sizeCI.s[1] * sizeCO.s[1] * C4NUM * C4NUM * dtype_size); + auto origin_weight = in_tensors_.at(kWeightIndex)->Data(); int divCI = sizeCI.s[1]; int divCO = sizeCO.s[1]; int co = sizeCO.s[0]; @@ -90,9 +92,25 @@ void MatMulOpenCLKernel::PadWeight() { int src_x = i * C4NUM + l; int src_y = j * C4NUM + k; if (src_x < sizeCI.s[0] && src_y < sizeCO.s[0]) { - padWeight_[index++] = origin_weight[src_y * sizeCI.s[0] + src_x]; + if (enable_fp16_) { + if (in_tensors_.at(kWeightIndex)->data_type() == kNumberTypeFloat32) { + reinterpret_cast(padWeight_)[index++] = + Float32ToShort(reinterpret_cast(origin_weight)[src_y * sizeCI.s[0] + src_x]); + } else { + reinterpret_cast(padWeight_)[index++] = + reinterpret_cast(origin_weight)[src_y * sizeCI.s[0] + src_x]; + } + } else { + if (in_tensors_.at(kWeightIndex)->data_type() == kNumberTypeFloat16) { + reinterpret_cast(padWeight_)[index++] = + ShortToFloat32(reinterpret_cast(origin_weight)[src_y * sizeCI.s[0] + src_x]); + } else { + reinterpret_cast(padWeight_)[index++] = + reinterpret_cast(origin_weight)[src_y * sizeCI.s[0] + src_x]; + } + } } else { - padWeight_[index++] = 0; + index++; } } } @@ -102,17 +120,23 @@ void MatMulOpenCLKernel::PadWeight() { size_t im_dst_x, im_dst_y; im_dst_x = divCO; im_dst_y = 1; -#ifdef ENABLE_FP16 - size_t img_dtype = CL_HALF_FLOAT; -#else size_t img_dtype = CL_FLOAT; -#endif + if (enable_fp16_) { + img_dtype = CL_HALF_FLOAT; + } std::vector img_size{im_dst_x, im_dst_y, img_dtype}; - bias_ = reinterpret_cast(allocator->Malloc(im_dst_x * im_dst_y * C4NUM * sizeof(FLOAT_t), img_size)); - bias_ = reinterpret_cast(allocator->MapBuffer(bias_, CL_MAP_WRITE, nullptr, true)); - memset(bias_, 0x00, divCO * C4NUM * sizeof(FLOAT_t)); + bias_ = allocator->Malloc(im_dst_x * im_dst_y * C4NUM * dtype_size, img_size); + bias_ = allocator->MapBuffer(bias_, CL_MAP_WRITE, nullptr, true); + memset(bias_, 0x00, divCO * C4NUM * dtype_size); if (in_tensors_.size() >= 3) { - memcpy(bias_, in_tensors_[2]->Data(), co * sizeof(FLOAT_t)); + if (in_tensors_[2]->data_type() == kNumberTypeFloat32 && enable_fp16_) { + auto fdata = reinterpret_cast(in_tensors_[2]->Data()); + for (int i = 0; i < co; i++) { + reinterpret_cast(bias_)[i] = Float32ToShort(fdata[i]); + } + } else { + memcpy(bias_, in_tensors_[2]->Data(), co * dtype_size); + } } allocator->UnmapBuffer(bias_); } @@ -121,11 +145,10 @@ int MatMulOpenCLKernel::GetImageSize(size_t idx, std::vector *img_size) size_t im_dst_x, im_dst_y; im_dst_x = sizeCO.s[1]; im_dst_y = 1; -#ifdef ENABLE_FP16 - size_t img_dtype = CL_HALF_FLOAT; -#else size_t img_dtype = CL_FLOAT; -#endif + if (enable_fp16_) { + img_dtype = CL_HALF_FLOAT; + } img_size->clear(); std::vector vec{im_dst_x, im_dst_y, img_dtype}; *img_size = vec; @@ -140,8 +163,8 @@ int MatMulOpenCLKernel::Run() { std::vector global = {UP_ROUND(sizeCO.s[1], local[0]), 4}; int arg_count = 0; ocl_runtime->SetKernelArg(kernel_, arg_count++, in_tensors_[0]->Data()); - ocl_runtime->SetKernelArg(kernel_, arg_count++, padWeight_); - ocl_runtime->SetKernelArg(kernel_, arg_count++, bias_); + ocl_runtime->SetKernelArg(kernel_, arg_count++, padWeight_, lite::opencl::MemType::BUF); + ocl_runtime->SetKernelArg(kernel_, arg_count++, bias_, lite::opencl::MemType::BUF); ocl_runtime->SetKernelArg(kernel_, arg_count++, out_tensors_[0]->Data()); 
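Note: PadWeight() above builds a zero-padded, 4x4-blocked copy of the CO x CI weight matrix, and since the destination is now memset up front, out-of-range slots just advance the write index rather than storing an explicit zero. A sketch of the layout, with the loop nesting assumed from the linear index++ writes in the hunk:

#include <cstddef>
#include <vector>

std::vector<float> PadWeightSketch(const float *w, int ci, int co) {
  const int div_ci = (ci + 3) / 4;  // UP_DIV(ci, 4)
  const int div_co = (co + 3) / 4;
  std::vector<float> packed(static_cast<size_t>(div_ci) * div_co * 16, 0.0f);  // pre-zeroed, like the memset
  size_t index = 0;
  for (int i = 0; i < div_ci; ++i) {
    for (int j = 0; j < div_co; ++j) {
      for (int k = 0; k < 4; ++k) {    // row inside the 4x4 tile (CO direction)
        for (int l = 0; l < 4; ++l) {  // column inside the tile (CI direction)
          const int src_x = i * 4 + l;  // CI coordinate
          const int src_y = j * 4 + k;  // CO coordinate
          if (src_x < ci && src_y < co) {
            packed[index] = w[src_y * ci + src_x];  // row-major CO x CI source
          }
          ++index;  // padding slots keep their zero fill
        }
      }
    }
  }
  return packed;
}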
ocl_runtime->SetKernelArg(kernel_, arg_count++, sizeCI); ocl_runtime->SetKernelArg(kernel_, arg_count++, sizeCO); @@ -175,4 +198,6 @@ kernel::LiteKernel *OpenCLMatMulKernelCreator(const std::vectormax_pooling_) { + if (parameter_->pool_mode_ == PoolMode_MaxPool) { kernel_name = "MaxPooling2d"; #ifndef PROGRAM_WITH_IL source = max_pool2d_source; program_name = "MaxPooling2d"; #endif - } else if (parameter_->avg_pooling_) { + } else if (parameter_->pool_mode_ == PoolMode_AvgPool) { kernel_name = "AvgPooling2d"; #ifndef PROGRAM_WITH_IL source = avg_pool2d_source; @@ -62,7 +62,7 @@ int PoolingOpenCLKernel::Init() { auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); #ifdef PROGRAM_WITH_IL - ocl_runtime->CreateKernelFromIL(kernel_(), kernel_name); + kernel_ = ocl_runtime->GetKernelFromBinary(kernel_name); #else if (out_mem_type_ == OpenCLMemType::BUF) { kernel_name += "_BUF"; diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/prelu.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/prelu.cc index 4f4e39a63a..78f7df0baf 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/prelu.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/prelu.cc @@ -23,26 +23,51 @@ #include "include/errorcode.h" #include "src/runtime/kernel/opencl/kernel/prelu.h" #include "src/runtime/opencl/opencl_runtime.h" -#include "src/runtime/kernel/opencl/cl/activation.cl.inc" -#include "nnacl/prelu_parameter.h" +#include "src/runtime/kernel/opencl/cl/prelu.cl.inc" using mindspore::kernel::KERNEL_ARCH::kGPU; using mindspore::lite::KernelRegistrar; using mindspore::lite::RET_ERROR; using mindspore::lite::RET_OK; -using mindspore::schema::PrimitiveType_Prelu; +using mindspore::schema::PrimitiveType_PReLU; namespace mindspore::kernel { +void PReluOpenCLKernel::InitBuffer() { + int C = in_tensors_[1]->shape()[0]; + int div_ci = UP_DIV(C, C4NUM); + std::cout << div_ci << std::endl; + auto allocator = lite::opencl::OpenCLRuntime::GetInstance()->GetAllocator(); + PReluWeight_ = reinterpret_cast(allocator->Malloc(div_ci * C4NUM * sizeof(FLOAT_t))); + PReluWeight_ = reinterpret_cast(allocator->MapBuffer(PReluWeight_, CL_MAP_WRITE, nullptr, true)); + memset(PReluWeight_, 0x00, div_ci * C4NUM * sizeof(FLOAT_t)); + auto origin_weight = reinterpret_cast(in_tensors_[1]->Data()); + for (int i = 0; i < in_tensors_[1]->ElementsNum(); ++i) { + PReluWeight_[i] = origin_weight[i]; + } + allocator->UnmapBuffer(PReluWeight_); +} + int PReluOpenCLKernel::Init() { if (in_tensors_[0]->shape().size() != 4) { MS_LOG(ERROR) << "PRelu only support dim=4, but your dim=" << in_tensors_[0]->shape().size(); return RET_ERROR; } + int C_Weight = in_tensors_[1]->shape()[0]; + int C = in_tensors_[0]->shape()[3]; + if (C_Weight != 1 && UP_DIV(C_Weight, C4NUM) != UP_DIV(C, C4NUM)) { + MS_LOG(ERROR) + << "PRelu weight channel size must be 1 or must be equal with in_teneors channel size, but your weight size is " + << C_Weight << " and your input channel size is " << C; + return RET_ERROR; + } + if (C_Weight != 1) { + InitBuffer(); + } std::set build_options; - std::string source = activation_source; + std::string source = prelu_source; std::string program_name = "PRelu"; - std::string kernel_name = "ReluScalar"; + std::string kernel_name = "PRelu"; auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); ocl_runtime->LoadSource(program_name, source); ocl_runtime->BuildKernel(kernel_, program_name, kernel_name, build_options); @@ -61,17 +86,18 @@ int PReluOpenCLKernel::Run() { int W = in_tensors_[0]->shape()[2]; int C = 
in_tensors_[0]->shape()[3]; cl_int4 input_shape = {N, H, W, C}; - if (in_tensors_[1]->ElementsNum() < 1) { - MS_LOG(ERROR) << "PRelu weight size must be greater than 1! But your weight size is " - << in_tensors_[1]->ElementsNum(); - return RET_ERROR; - } + auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); int arg_idx = 0; ocl_runtime->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->Data()); ocl_runtime->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->Data()); ocl_runtime->SetKernelArg(kernel_, arg_idx++, input_shape); - ocl_runtime->SetKernelArg(kernel_, arg_idx++, reinterpret_cast(in_tensors_[1]->Data())[0]); + if (in_tensors_[1]->shape()[0] == 1) { + ocl_runtime->SetKernelArg(kernel_, arg_idx++, reinterpret_cast(in_tensors_[1]->Data())); + } else { + ocl_runtime->SetKernelArg(kernel_, arg_idx++, PReluWeight_); + } + ocl_runtime->SetKernelArg(kernel_, arg_idx++, reinterpret_cast(in_tensors_[1]->shape()[0])); std::vector local = {1, 1}; std::vector global = {static_cast(H), static_cast(W)}; @@ -128,5 +154,5 @@ kernel::LiteKernel *OpenCLPReluKernelCreator(const std::vector *img_size) override; + void InitBuffer(); private: cl::Kernel kernel_; + FLOAT_t *PReluWeight_; }; } // namespace mindspore::kernel diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/reshape.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/reshape.cc index 6fc2f4ce31..3254d0758d 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/reshape.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/reshape.cc @@ -33,6 +33,7 @@ namespace mindspore::kernel { int ReshapeOpenCLKernel::Init() { std::string kernel_name = "reshape"; auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); + enable_fp16_ = ocl_runtime->GetFp16Enable(); in_ori_format_ = in_tensors_[0]->GetFormat(); out_ori_format_ = out_tensors_[0]->GetFormat(); if (in_ori_format_ != schema::Format_NHWC4 && in_ori_format_ != schema::Format_NHWC) { @@ -45,7 +46,7 @@ int ReshapeOpenCLKernel::Init() { return RET_ERROR; } #ifdef PROGRAM_WITH_IL - ocl_runtime->CreateKernelFromIL(kernel_(), kernel_name); + kernel_ = ocl_runtime->GetKernelFromBinary(kernel_name); #else std::set build_options; std::string source = reshape_source; @@ -73,11 +74,10 @@ int ReshapeOpenCLKernel::GetImageSize(size_t idx, std::vector *img_size) int c = shapex[3]; im_dst_x = w * UP_DIV(c, C4NUM); im_dst_y = h; -#ifdef ENABLE_FP16 - size_t img_dtype = CL_HALF_FLOAT; -#else size_t img_dtype = CL_FLOAT; -#endif + if (enable_fp16_) { + img_dtype = CL_HALF_FLOAT; + } img_size->clear(); std::vector vec{im_dst_x, im_dst_y, img_dtype}; *img_size = vec; @@ -121,4 +121,5 @@ kernel::LiteKernel *OpenCLReshapeKernelCreator(const std::vector *img_size) { im_dst_y = out_tensors_[0]->Height() * CO4; im_dst_x = out_tensors_[0]->Width(); } -#ifdef ENABLE_FP16 - size_t img_dtype = CL_HALF_FLOAT; -#else size_t img_dtype = CL_FLOAT; -#endif + auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); + auto enable_fp16_ = ocl_runtime->GetFp16Enable(); + if (enable_fp16_) { + img_dtype = CL_HALF_FLOAT; + } img_size->clear(); std::vector vec{im_dst_x, im_dst_y, img_dtype}; *img_size = vec; @@ -143,4 +144,6 @@ kernel::LiteKernel *OpenCLSliceKernelCreator(const std::vectoraxis_; } #ifdef PROGRAM_WITH_IL - runtime_->CreateKernelFromIL(kernel_(), kernel_name); + kernel_ = ocl_runtime->GetKernelFromBinary(kernel_name); #else if (!is_image_out_) { out_mem_type_ = OpenCLMemType::BUF; diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/to_format.cc 
b/mindspore/lite/src/runtime/kernel/opencl/kernel/to_format.cc index 92c2c03479..aeb3209fc1 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/to_format.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/to_format.cc @@ -50,7 +50,7 @@ int ToFormatOpenCLKernel::Init() { this->set_name(kernel_name); #ifdef PROGRAM_WITH_IL - ocl_runtime->CreateKernelFromIL(kernel_(), kernel_name); + kernel_ = ocl_runtime->GetKernelFromBinary(kernel_name); #else std::set build_options; std::string source = to_format_source; @@ -126,13 +126,14 @@ int ToFormatOpenCLKernel::GetImageSize(size_t idx, std::vector *img_size im_dst_y = h; } else { MS_LOG(ERROR) << "Unsupported format. " << out_tensors_[0]->GetFormat(); + return RET_ERROR; } img_size->clear(); -#ifdef ENABLE_FP16 - size_t img_dtype = CL_HALF_FLOAT; -#else + auto enable_fp16_ = lite::opencl::OpenCLRuntime::GetInstance()->GetFp16Enable(); size_t img_dtype = CL_FLOAT; -#endif + if (enable_fp16_) { + img_dtype = CL_HALF_FLOAT; + } std::vector vec{im_dst_x, im_dst_y, img_dtype}; *img_size = vec; return RET_OK; @@ -146,8 +147,10 @@ int ToFormatOpenCLKernel::Run() { cl_int4 shape{(cl_int)nhwc_shape_[0], (cl_int)nhwc_shape_[1], (cl_int)nhwc_shape_[2], (cl_int)nhwc_shape_[3]}; cl_int4 gsize{(cl_int)global[0], (cl_int)global[1], (cl_int)global[2], 1}; - ocl_runtime->SetKernelArg(kernel_, 0, in_tensors_[0]->Data()); - ocl_runtime->SetKernelArg(kernel_, 1, out_tensors_[0]->Data()); + auto src_mem_type = (out_mem_type_ == OpenCLMemType::IMG) ? lite::opencl::MemType::BUF : lite::opencl::MemType::IMG; + auto dst_mem_type = (out_mem_type_ == OpenCLMemType::IMG) ? lite::opencl::MemType::IMG : lite::opencl::MemType::BUF; + ocl_runtime->SetKernelArg(kernel_, 0, in_tensors_[0]->Data(), src_mem_type); + ocl_runtime->SetKernelArg(kernel_, 1, out_tensors_[0]->Data(), dst_mem_type); ocl_runtime->SetKernelArg(kernel_, 2, gsize); ocl_runtime->SetKernelArg(kernel_, 3, shape); ocl_runtime->RunKernel(kernel_, global, local, nullptr); @@ -172,5 +175,6 @@ kernel::LiteKernel *OpenCLToFormatKernelCreator(const std::vectorGetFp16Enable(); if (!is_image_out_) { kernel_name += "_BUF"; } else { kernel_name += "_IMG"; } #ifdef PROGRAM_WITH_IL - ocl_runtime->CreateKernelFromIL(kernel_(), kernel_name); + kernel_ = ocl_runtime->GetKernelFromBinary(kernel_name); #else std::set build_options; std::string source = transpose_source; @@ -70,11 +71,10 @@ int TransposeOpenCLKernel::GetImageSize(size_t idx, std::vector *img_siz size_t im_dst_x, im_dst_y; im_dst_x = UP_DIV(out_tensors_[0]->Height() * out_tensors_[0]->Width(), C4NUM); im_dst_y = out_tensors_[0]->Channel(); -#ifdef ENABLE_FP16 - size_t img_dtype = CL_HALF_FLOAT; -#else size_t img_dtype = CL_FLOAT; -#endif + if (enable_fp16_) { + img_dtype = CL_HALF_FLOAT; + } img_size->clear(); std::vector vec{im_dst_x, im_dst_y, img_dtype}; *img_size = vec; @@ -82,6 +82,7 @@ int TransposeOpenCLKernel::GetImageSize(size_t idx, std::vector *img_siz } int TransposeOpenCLKernel::Run() { + // notice: input image2d size = {c/4, h * w} MS_LOG(DEBUG) << this->name() << " Running!"; std::vector shapex = in_tensors_[0]->shape(); int h = shapex[1]; diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/transpose.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/transpose.h index 2efce26672..708acbfaae 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/transpose.h +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/transpose.h @@ -38,7 +38,8 @@ class TransposeOpenCLKernel : public OpenCLKernel { private: cl::Kernel kernel_; - 
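Note: one pattern repeats through the reshape/to_format/transpose hunks above: the image channel type is now picked per run from the fp16 flag instead of the old #ifdef ENABLE_FP16, and the {width, height, dtype} triple handed to the allocator is what later sizes each image row. Minimal sketch:

#include <CL/cl.h>
#include <cstddef>
#include <vector>

// Runtime dtype selection, replacing the compile-time switch.
std::vector<size_t> MakeImg2DSize(size_t width, size_t height, bool fp16_enabled) {
  const size_t img_dtype = fp16_enabled ? CL_HALF_FLOAT : CL_FLOAT;
  return {width, height, img_dtype};
}

// Mirrors the allocator hunk further below: slot [2] decides whether a pixel
// is a cl_float4 or a cl_half4 when row pitch and total size are computed.
size_t PixelBytes(const std::vector<size_t> &img_size) {
  return img_size[2] == CL_FLOAT ? sizeof(cl_float4) : sizeof(cl_half4);
}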
bool is_image_out_ = false; + bool is_image_out_{false}; + bool enable_fp16_{false}; }; } // namespace mindspore::kernel diff --git a/mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.cc b/mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.cc index 4b19d71dc7..20b316072e 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.cc @@ -15,6 +15,7 @@ */ #include "src/runtime/kernel/opencl/subgraph_opencl_kernel.h" +#include #include "src/runtime/opencl/opencl_executor.h" #include "src/runtime/opencl/opencl_runtime.h" #include "src/runtime/kernel/opencl/utils.h" @@ -93,11 +94,10 @@ int SubGraphOpenCLKernel::GenToFormatOp(const std::vectoremplace_back(new_tensor); -#ifdef ENABLE_FP16 - KernelKey desc{kGPU, kNumberTypeFloat16, schema::PrimitiveType_ToFormat}; -#else KernelKey desc{kGPU, kNumberTypeFloat32, schema::PrimitiveType_ToFormat}; -#endif + if (lite::opencl::OpenCLRuntime::GetInstance()->GetFp16Enable()) { + desc.data_type = kNumberTypeFloat16; + } OpenCLToFormatParameter *parameter = new (std::nothrow) OpenCLToFormatParameter; MS_ASSERT(parameter); if (parameter == nullptr) { @@ -207,7 +207,7 @@ int SubGraphOpenCLKernel::MallocTensorWithReuse() { output->set_allocator(allocator_); } for (auto input_kernel : kernel->in_kernels()) { - MS_EXCEPTION_IF_NULL(input_kernel); + MS_ASSERT(nullptr != input_kernel); auto ret = input_kernel->DecOutTensorRefCount(); if (0 != ret) { MS_LOG(WARNING) << "DecOutTensorRefCount for kernel" << kernel->name() << " failed"; @@ -215,21 +215,21 @@ int SubGraphOpenCLKernel::MallocTensorWithReuse() { } } for (auto kernel : out_kernels_) { - MS_EXCEPTION_IF_NULL(kernel); + MS_ASSERT(nullptr != kernel); auto ret = kernel->DecOutTensorRefCount(); if (0 != ret) { MS_LOG(WARNING) << "DecOutTensorRefCount for kernel" << kernel->name() << " failed"; } } for (auto kernel : in_convert_ops_) { - MS_EXCEPTION_IF_NULL(kernel); + MS_ASSERT(nullptr != kernel); auto ret = kernel->DecOutTensorRefCount(); if (0 != ret) { MS_LOG(WARNING) << "DecOutTensorRefCount for kernel" << kernel->name() << " failed"; } } for (auto kernel : out_convert_ops_) { - MS_EXCEPTION_IF_NULL(kernel); + MS_ASSERT(nullptr != kernel); auto ret = kernel->DecOutTensorRefCount(); if (0 != ret) { MS_LOG(WARNING) << "DecOutTensorRefCount for kernel" << kernel->name() << " failed"; @@ -262,26 +262,21 @@ int SubGraphOpenCLKernel::GetKernelFromToTensor(const std::vector> *out_kernels, bool is_from); private: - SubGraphOpenCLParameter *subgraph_ocl_parameter_; lite::opencl::OpenCLAllocator *allocator_; std::vector in_convert_tensors_; std::vector out_convert_tensors_; diff --git a/mindspore/lite/src/runtime/kernel/opencl/utils.h b/mindspore/lite/src/runtime/kernel/opencl/utils.h index 07a87547bb..c498b65c3f 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/utils.h +++ b/mindspore/lite/src/runtime/kernel/opencl/utils.h @@ -23,6 +23,7 @@ #include "utils/log_adapter.h" #include "nnacl/op_base.h" #include "src/lite_kernel.h" +#include "src/common//utils.h" namespace mindspore::lite { kernel::LiteKernel *GetOpenCLKernel(const std::vector &in_tensors, @@ -89,6 +90,73 @@ std::vector GetCommonLocalSize(const std::vector &global, int ma std::string CLErrorCode(cl_int error_code); +template +void PackNCHWToNC4HW4(void *src, void *dst, int batch, int plane, int channel, + const std::function &to_dtype) { + int c4 = UP_DIV(channel, C4NUM); + for (int b = 0; b < batch; b++) { + int src_offset = b * plane 
* channel; + int dst_offset = b * plane * c4 * C4NUM; + for (int c = 0; c < channel; c++) { + int c4_block_num = c / C4NUM; + int c4_block_rem = c % C4NUM; + int src_c_offset = src_offset + c * plane; + int dst_c_offset = dst_offset + c4_block_num * plane * C4NUM; + for (int k = 0; k < plane; k++) { + int src_kernel_offset = src_c_offset + k; + int dst_kernel_offset = dst_c_offset + C4NUM * k + c4_block_rem; + (static_cast(dst) + dst_kernel_offset)[0] = + to_dtype((static_cast(src) + src_kernel_offset)[0]); + } + } + } +} +template +void PackNHWCToNHWC4(void *src, void *dst, int batch, int plane, int channel, + const std::function &to_dtype) { + int c4 = UP_DIV(channel, C4NUM); + int nhwc4_batch_unit_offset = c4 * C4NUM * plane; + int ic_remainder_ = channel % C4NUM; + if (ic_remainder_ != 0) { + int nhwc4_batch_offset = 0; + for (int b = 0; b < batch; b++) { + int batch_offset = b * channel * plane; + for (int i = 0; i < plane; ++i) { + for (int c = 0; c < channel; ++c) { + (static_cast(dst) + nhwc4_batch_offset + i * c4 * C4NUM + c)[0] = + to_dtype((static_cast(src) + batch_offset + i * channel + c)[0]); + } + } + nhwc4_batch_offset += nhwc4_batch_unit_offset; + } + } else { + size_t ori_input_size = batch * plane * channel; + for (size_t n = 0; n < ori_input_size; ++n) { + (static_cast(dst) + n)[0] = to_dtype((static_cast(src) + n)[0]); + } + } +} +template +void PackNHWCToNC4HW4(void *src, void *dst, int batch, int plane, int channel, + const std::function &to_dtype) { + int c4 = UP_DIV(channel, C4NUM); + for (int b = 0; b < batch; b++) { + int src_oc_offset = b * plane * channel; + int dst_oc_offset = b * plane * c4 * C4NUM; + for (int k = 0; k < plane; k++) { + int src_kernel_offset = src_oc_offset + k * channel; + int dst_kernel_offset = dst_oc_offset + k * C4NUM; + for (int i = 0; i < channel; i++) { + int c4_block_num = i / C4NUM; + int c4_block_rem = i % C4NUM; + int src_ic_offset = src_kernel_offset + i; + int dst_ic_offset = dst_kernel_offset + c4_block_num * plane * C4NUM + c4_block_rem; + (static_cast(dst) + dst_ic_offset)[0] = to_dtype((static_cast(src) + src_ic_offset)[0]); + } + } + } +} + } // namespace mindspore::kernel #endif // MINDSPORE_LITE_SRC_BACKEND_OPENCL_UTILS_H_ diff --git a/mindspore/lite/src/runtime/opencl/opencl_allocator.cc b/mindspore/lite/src/runtime/opencl/opencl_allocator.cc index e39dcdf277..b9bae225dd 100644 --- a/mindspore/lite/src/runtime/opencl/opencl_allocator.cc +++ b/mindspore/lite/src/runtime/opencl/opencl_allocator.cc @@ -24,7 +24,7 @@ namespace mindspore::lite::opencl { OpenCLAllocator::OpenCLAllocator() {} -OpenCLAllocator::~OpenCLAllocator() {} +OpenCLAllocator::~OpenCLAllocator() { Clear(); } void OpenCLAllocator::SetContext(const AllocatorContext &ctx) { lock_flag_ = ctx.lockFlag; @@ -50,10 +50,12 @@ void *OpenCLAllocator::Malloc(size_t size, const std::vector &img_size) auto svm_capabilities = ocl_runtime->GetSVMCapabilities(); size_t img_pitch = 0; + size_t dtype_size = 1; if (!img_size.empty()) { + dtype_size = img_size[2] == CL_FLOAT ? 
sizeof(cl_float4) : sizeof(cl_half4); uint32_t image_alignment = ocl_runtime->GetImagePitchAlignment(); img_pitch = (img_size[0] + image_alignment - 1) / image_alignment * image_alignment; - size = img_pitch * img_size[1] * sizeof(cl_float4); + size = img_pitch * img_size[1] * dtype_size; } if (size > MAX_MALLOC_SIZE) { MS_LOG(ERROR) << "MallocData out of max_size, size: " << size; @@ -81,7 +83,7 @@ void *OpenCLAllocator::Malloc(size_t size, const std::vector &img_size) void *device_ptr = nullptr; void *image_ptr = nullptr; - if (svm_capabilities && svm_on_) { + if (svm_capabilities) { cl_svm_mem_flags flags = (svm_capabilities & CL_DEVICE_SVM_FINE_GRAIN_BUFFER) ? CL_MEM_SVM_FINE_GRAIN_BUFFER : 0; flags |= (svm_capabilities & CL_DEVICE_SVM_ATOMICS) ? CL_MEM_SVM_ATOMICS : 0; flags = flags | CL_MEM_READ_WRITE; @@ -107,7 +109,7 @@ void *OpenCLAllocator::Malloc(size_t size, const std::vector &img_size) if (!img_size.empty()) { cl::ImageFormat image_format(CL_RGBA, img_size[2]); cl::Image2D *image = new (std::nothrow) cl::Image2D(*ocl_runtime->Context(), image_format, *buffer, img_size[0], - img_size[1], img_pitch * sizeof(cl_float4), &ret); + img_size[1], img_pitch * dtype_size, &ret); if (image == nullptr || ret != CL_SUCCESS) { delete buffer; UnLock(); @@ -280,7 +282,7 @@ void OpenCLAllocator::Clear() { void *OpenCLAllocator::MapBuffer(void *host_ptr, int flags, void *command_queue, bool sync) { auto ocl_runtime = opencl::OpenCLRuntime::GetInstance(); auto svm_capabilities = ocl_runtime->GetSVMCapabilities(); - if (svm_capabilities && svm_on_) { + if (svm_capabilities) { if (!(svm_capabilities & CL_DEVICE_SVM_FINE_GRAIN_BUFFER)) { auto it = allocated_list_.find(host_ptr); if (it == allocated_list_.end()) { @@ -356,8 +358,8 @@ int OpenCLAllocator::UnmapBuffer(void *host_ptr, void *command_queue) { } } -MEM_TYPE OpenCLAllocator::GetMemType(void *host_ptr) { - MEM_TYPE mem_type{MEM_TYPE::BUF}; +MemType OpenCLAllocator::GetMemType(void *host_ptr) { + MemType mem_type{MemType::BUF}; Lock(); auto it = allocated_list_.find(host_ptr); if (it == allocated_list_.end()) { @@ -367,9 +369,9 @@ MEM_TYPE OpenCLAllocator::GetMemType(void *host_ptr) { } MemBuf *mem_buf = it->second; if (mem_buf->img_size.empty()) { - mem_type = MEM_TYPE::BUF; + mem_type = MemType::BUF; } else { - mem_type = MEM_TYPE::IMG; + mem_type = MemType::IMG; } UnLock(); return mem_type; diff --git a/mindspore/lite/src/runtime/opencl/opencl_allocator.h b/mindspore/lite/src/runtime/opencl/opencl_allocator.h index 87bf03cc5c..b582e83bc6 100644 --- a/mindspore/lite/src/runtime/opencl/opencl_allocator.h +++ b/mindspore/lite/src/runtime/opencl/opencl_allocator.h @@ -40,7 +40,7 @@ struct OpenclMemory { OpenCLMemoryType mem_type{MS_HOST_BUFFER | MS_CL_BUFFER}; }; -enum class MEM_TYPE : char { BUF, IMG }; +enum class MemType : char { SVM, BUF, IMG }; class OpenCLAllocator : public Allocator { public: @@ -58,7 +58,7 @@ class OpenCLAllocator : public Allocator { void *GetBuffer(void *host_ptr); void *MapBuffer(void *host_ptr, int flags, void *command_queue = nullptr, bool sync = true); int UnmapBuffer(void *host_ptr, void *command_queue = nullptr); - MEM_TYPE GetMemType(void *host_ptr); + MemType GetMemType(void *host_ptr); int GetImageSize(void *host_ptr, std::vector *img_size); void *Prepare(void *ptr) override { if (ptr != nullptr) { @@ -86,7 +86,6 @@ class OpenCLAllocator : public Allocator { // 6 is empirical value int shift_factor_ = 6; bool lock_flag_ = false; - bool svm_on_{false}; }; } // namespace mindspore::lite::opencl diff --git 
a/mindspore/lite/src/runtime/opencl/opencl_executor.cc b/mindspore/lite/src/runtime/opencl/opencl_executor.cc index 500f22f52a..170e937db0 100644 --- a/mindspore/lite/src/runtime/opencl/opencl_executor.cc +++ b/mindspore/lite/src/runtime/opencl/opencl_executor.cc @@ -65,7 +65,7 @@ int OpenCLExecutor::Run(std::vector &inputs, std::vectorin_kernels()) { - MS_EXCEPTION_IF_NULL(input_kernel); + MS_ASSERT(nullptr != input_kernel); ret = input_kernel->DecOutTensorRefCount(); if (0 != ret) { MS_LOG(WARNING) << "DecOutTensorRefCount for kernel" << kernel->name() << " failed"; diff --git a/mindspore/lite/src/runtime/opencl/opencl_runtime.cc b/mindspore/lite/src/runtime/opencl/opencl_runtime.cc index af6ed55904..2513d22873 100644 --- a/mindspore/lite/src/runtime/opencl/opencl_runtime.cc +++ b/mindspore/lite/src/runtime/opencl/opencl_runtime.cc @@ -35,38 +35,33 @@ using mindspore::kernel::CLErrorCode; namespace mindspore::lite::opencl { -std::map g_opencl_program_map; - +static std::map g_opencl_program_map; static std::mutex g_mtx; static std::mutex g_init_mtx; -// magic number -static std::map AdrenoSubGroup{ - {640, 128}, {630, 128}, {616, 128}, {612, 64}, {610, 64}, {540, 32}, {530, 32}, - {512, 32}, {510, 32}, {509, 32}, {506, 32}, {505, 32}, {405, 32}, {330, 16}, -}; - -#ifdef USE_OPENCL_WRAPPER -std::shared_ptr OpenCLWrapper::opencl_wrapper_singleton_ = nullptr; -#endif -std::shared_ptr OpenCLRuntime::opencl_runtime_singleton_ = nullptr; bool OpenCLRuntime::init_done_ = false; +OpenCLRuntime *OpenCLRuntime::ocl_runtime_instance_ = nullptr; +size_t OpenCLRuntime::instance_count_ = 0; OpenCLRuntime *OpenCLRuntime::GetInstance() { std::unique_lock lck(g_mtx); - if (opencl_runtime_singleton_.get() == nullptr) { - opencl_runtime_singleton_.reset(new OpenCLRuntime()); - opencl_runtime_singleton_->Init(); + static OpenCLRuntime ocl_runtime; + if (instance_count_ == 0) { + ocl_runtime_instance_ = &ocl_runtime; + ocl_runtime_instance_->Init(); } - return opencl_runtime_singleton_.get(); + instance_count_++; + return ocl_runtime_instance_; } void OpenCLRuntime::DeleteInstance() { std::unique_lock lck(g_mtx); - init_done_ = false; - if (opencl_runtime_singleton_ != nullptr) { - opencl_runtime_singleton_.reset(); - opencl_runtime_singleton_ = nullptr; + if (instance_count_ == 0) { + MS_LOG(ERROR) << "No OpenCLRuntime instance could delete!"; + } + instance_count_--; + if (instance_count_ == 0) { + ocl_runtime_instance_->Uninit(); } } @@ -88,7 +83,7 @@ int OpenCLRuntime::Init() { MS_LOG(INFO) << "CL_HPP_MINIMUM_OPENCL_VERSION " << CL_HPP_MINIMUM_OPENCL_VERSION; #ifdef USE_OPENCL_WRAPPER - if (false == OpenCLWrapper::GetInstance()->LoadOpenCLLibrary()) { + if (OpenCLWrapper::GetInstance()->LoadOpenCLLibrary() == false) { MS_LOG(ERROR) << "Load OpenCL symbols failed!"; return RET_ERROR; } @@ -123,7 +118,11 @@ int OpenCLRuntime::Init() { return RET_ERROR; } - device_ = std::make_shared(); + device_ = new (std::nothrow) cl::Device(); + if (device_ == nullptr) { + MS_LOG(ERROR) << "Create OpenCL device failed!"; + return RET_ERROR; + } *device_ = devices[0]; max_work_item_sizes_ = device_->getInfo(); const std::string device_name = device_->getInfo(); @@ -138,27 +137,28 @@ int OpenCLRuntime::Init() { << max_work_item_sizes_[2]; gpu_info_ = ParseGpuInfo(device_name, device_version); - cl_int err; + cl_int ret; #if defined(SHARING_MEM_WITH_OPENGL) && (CL_HPP_TARGET_OPENCL_VERSION >= 120) // create context from glcontext MS_LOG(INFO) << "Create special opencl context to share with OpenGL"; cl_context_properties 
context_prop[] = {CL_GL_CONTEXT_KHR, (cl_context_properties)eglGetCurrentContext(), CL_EGL_DISPLAY_KHR, (cl_context_properties)eglGetCurrentDisplay(), 0}; - context_ = std::make_shared(std::vector{*device_}, context_prop, nullptr, nullptr, &err); + context_ = new (std::nothrow) cl::Context(std::vector{*device_}, context_prop, nullptr, nullptr, &ret); - if (err != CL_SUCCESS) { - MS_LOG(ERROR) << "Create special OpenCL context falied, Create common OpenCL context then."; - context_ = std::make_shared(std::vector{*device_}, nullptr, nullptr, nullptr, &err); + if (ret != CL_SUCCESS || context_ == nullptr) { + MS_LOG(ERROR) << "Create special OpenCL context failed, Create common OpenCL context then."; + context_ = new (std::nothrow) cl::Context(std::vector{*device_}, nullptr, nullptr, nullptr, &ret); + if (context_ == nullptr) { + MS_LOG(ERROR) << "Create OpenCL context failed!"; + return RET_ERROR; + } } #else MS_LOG(INFO) << "Create common opencl context"; - // cl_context_properties context_prop[] = {CL_CONTEXT_PLATFORM, (cl_context_properties)platforms[0](), - // CL_PRINTF_CALLBACK_ARM, (cl_context_properties)printf_callback, 0}; - // context_ = std::make_shared(std::vector{*device_}, context_prop, nullptr, nullptr, &err); - context_ = std::make_shared(std::vector{*device_}, nullptr, nullptr, nullptr, &err); + context_ = new (std::nothrow) cl::Context(std::vector{*device_}, nullptr, nullptr, nullptr, &ret); #endif - if (err != CL_SUCCESS) { - MS_LOG(ERROR) << "Context create failed: " << CLErrorCode(err); + if (ret != CL_SUCCESS || context_ == nullptr) { + MS_LOG(ERROR) << "Context create failed: " << CLErrorCode(ret); return RET_ERROR; } @@ -170,9 +170,8 @@ int OpenCLRuntime::Init() { auto success = device_->getInfo(CL_DEVICE_HALF_FP_CONFIG, &fp_config); support_fp16_ = CL_SUCCESS == success && fp_config > 0; - err = device_->getInfo(CL_DEVICE_SVM_CAPABILITIES, &svm_capabilities_); - svm_capabilities_ = 0; - if (err != CL_SUCCESS || svm_capabilities_ == 0) { + ret = device_->getInfo(CL_DEVICE_SVM_CAPABILITIES, &svm_capabilities_); + if (ret != CL_SUCCESS || svm_capabilities_ == 0) { svm_capabilities_ = 0; MS_LOG(INFO) << "SVM capalibilties: " << "NONE"; @@ -204,16 +203,20 @@ int OpenCLRuntime::Init() { properties |= CL_QUEUE_PROFILING_ENABLE; #endif - default_command_queue_ = std::make_shared(*context_, *device_, properties, &err); - if (err != CL_SUCCESS) { - MS_LOG(ERROR) << "Command Queue create failed: " << CLErrorCode(err); + default_command_queue_ = new (std::nothrow) cl::CommandQueue(*context_, *device_, properties, &ret); + if (ret != CL_SUCCESS || default_command_queue_ == nullptr) { + MS_LOG(ERROR) << "Command Queue create failed: " << CLErrorCode(ret); return RET_ERROR; } - allocator_ = std::make_shared(); + allocator_ = new (std::nothrow) OpenCLAllocator(); + if (allocator_ == nullptr) { + MS_LOG(ERROR) << "Command OpenCL allocator failed!"; + return RET_ERROR; + } #ifdef PROGRAM_WITH_IL std::string flag = ""; - CreateProgramFromIL(g_program_binary, flag); + binary_program_ = CreateProgramFromIL(g_program_binary, flag); #endif init_done_ = true; MS_LOG(INFO) << "OpenCLRuntime init done!"; @@ -221,18 +224,28 @@ int OpenCLRuntime::Init() { return RET_OK; } -OpenCLRuntime::~OpenCLRuntime() { +int OpenCLRuntime::Uninit() { program_map_.clear(); - // allocator_->Clear(); - allocator_.reset(); - default_command_queue_.reset(); - context_.reset(); - device_.reset(); + delete allocator_; + delete default_command_queue_; + delete context_; + delete device_; + allocator_ = nullptr; + 
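Note: the GetInstance()/DeleteInstance() rework above swaps the shared_ptr singleton for a function-local static plus a user count: the first GetInstance() call runs Init(), only the last DeleteInstance() runs Uninit(), and the destructor also calls Uninit() so teardown stays safe at process exit. A stripped-down sketch of that lifetime scheme:

#include <cstddef>
#include <mutex>

class RuntimeSketch {
 public:
  static RuntimeSketch *GetInstance() {
    std::lock_guard<std::mutex> lock(mtx_);
    static RuntimeSketch instance;     // storage outlives every user
    if (count_ == 0) instance.Init();  // first user initializes
    ++count_;
    instance_ = &instance;
    return instance_;
  }
  static void DeleteInstance() {
    std::lock_guard<std::mutex> lock(mtx_);
    if (count_ == 0 || instance_ == nullptr) return;  // unbalanced release
    if (--count_ == 0) instance_->Uninit();           // last user frees the CL objects
  }

 private:
  ~RuntimeSketch() { Uninit(); }  // idempotent; also covers static destruction
  void Init() { /* create device, context, command queue, allocator */ }
  void Uninit() { /* delete them and null the pointers */ }
  static std::mutex mtx_;
  static RuntimeSketch *instance_;
  static std::size_t count_;
};

std::mutex RuntimeSketch::mtx_;
RuntimeSketch *RuntimeSketch::instance_ = nullptr;
std::size_t RuntimeSketch::count_ = 0;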
default_command_queue_ = nullptr; + context_ = nullptr; + device_ = nullptr; +#ifdef USE_OPENCL_WRAPPER + OpenCLWrapper::GetInstance()->UnLoadOpenCLLibrary(); +#endif + init_done_ = false; + return RET_OK; } -cl::Context *OpenCLRuntime::Context() { return context_.get(); } +OpenCLRuntime::~OpenCLRuntime() { Uninit(); } -cl::Device *OpenCLRuntime::Device() { return device_.get(); } +cl::Context *OpenCLRuntime::Context() { return context_; } + +cl::Device *OpenCLRuntime::Device() { return device_; } uint64_t OpenCLRuntime::DeviceGlobalMemoryCacheSize() const { return global_memery_cachesize_; } @@ -263,9 +276,7 @@ uint32_t OpenCLRuntime::GetSubGroupSize(const cl::Kernel &kernel, const cl::NDRa sub_group_size = 0; } #else - if (AdrenoSubGroup.find(gpu_info_.model_num) != AdrenoSubGroup.end()) { - sub_group_size = AdrenoSubGroup[gpu_info_.model_num]; - } + sub_group_size = 0; #endif } @@ -290,20 +301,23 @@ int OpenCLRuntime::BuildKernel(cl::Kernel &kernel, const std::string &program_na // fp16 enable, kernel will use half and read_imageh and write_imageh. build_options_str = "-DFLT=half -DFLT4=half4 -DFLT16=half16 " - "-DWRITE_IMAGE=write_imageh -DREAD_IMAGE=read_imageh -DTO_FLT4=convert_half4"; + "-DWRITE_IMAGE=write_imageh -DREAD_IMAGE=read_imageh -DTO_FLT4=convert_half4 "; } else { // fp16 not enable, kernel will use float and read_imagef and write_imagef. build_options_str = "-DFLT=float -DFLT4=float4 -DFLT16=float16 " - "-DWRITE_IMAGE=write_imagef -DREAD_IMAGE=read_imagef -DTO_FLT4=convert_float4"; + "-DWRITE_IMAGE=write_imagef -DREAD_IMAGE=read_imagef -DTO_FLT4=convert_float4 "; } - build_options_str = std::accumulate( - build_options.begin(), build_options.end(), build_options_str, - [](const std::string &options, const std::string &option) -> std::string { return options + " " + option; }); + auto build_options_ext = std::accumulate( + build_options.begin(), build_options.end(), std::string(""), + [](const std::string &options, const std::string &option) -> std::string { + auto res = options + " " + option; + return res; + }); build_options_str += default_build_opts_; // program identifier = program_name + build_options - std::string build_program_key = program_name + build_options_str; + std::string build_program_key = program_name + build_options_str + build_options_ext; auto build_program_it = program_map_.find(build_program_key); cl::Program program; @@ -317,7 +331,7 @@ int OpenCLRuntime::BuildKernel(cl::Kernel &kernel, const std::string &program_na MS_LOG(ERROR) << "load program (" << program_name << ") failed!"; return RET_ERROR; } - status = this->BuildProgram(build_options_str, &program); + status = this->BuildProgram(build_options_str, program); if (!status) { MS_LOG(ERROR) << program_name << " build failed!"; return RET_ERROR; @@ -325,50 +339,12 @@ int OpenCLRuntime::BuildKernel(cl::Kernel &kernel, const std::string &program_na program_map_.emplace(build_program_key, program); } - cl_int err; - kernel = cl::Kernel(program, kernel_name.c_str(), &err); - if (err != CL_SUCCESS) { - MS_LOG(ERROR) << kernel_name << " Kernel create failed:" << CLErrorCode(err); - return RET_ERROR; - } - return RET_OK; -} - -// Run Kernel with 1D, 2D, 3D group size, and local size can be empty. 
-int OpenCLRuntime::RunKernel(const cl_kernel &kernel, const std::vector &global, - const std::vector &local, cl::CommandQueue *command_queue) { - if (command_queue == nullptr) { - command_queue = default_command_queue_.get(); - } - MS_ASSERT(local.size() == 0 || local.size() == global.size()); - std::vector internal_global_ws = global; - for (size_t i = 0; i < local.size(); ++i) { - internal_global_ws[i] = ROUND_UP(global[i], local[i]); - } - - MS_LOG(DEBUG) << "global size: " << global.size() << ", local size: " << local.size(); - for (size_t i = 0; i < global.size(); i++) { - MS_LOG(DEBUG) << "global[" << i << "] = " << global[i]; - } - for (size_t i = 0; i < local.size(); i++) { - MS_LOG(DEBUG) << "local[" << i << "] = " << local[i]; - } - - cl::Event event; - cl_int error = CL_SUCCESS; - if (local.size() == 0) { - error = - clEnqueueNDRangeKernel((*command_queue)(), kernel, global.size(), 0, global.data(), nullptr, 0, nullptr, nullptr); - } else { - error = clEnqueueNDRangeKernel((*command_queue)(), kernel, global.size(), 0, global.data(), local.data(), 0, - nullptr, nullptr); - } - - if (error != CL_SUCCESS) { - MS_LOG(ERROR) << "Kernel execute failed:" << CLErrorCode(error); + cl_int ret; + kernel = cl::Kernel(program, kernel_name.c_str(), &ret); + if (ret != CL_SUCCESS) { + MS_LOG(ERROR) << kernel_name << " Kernel create failed:" << CLErrorCode(ret); return RET_ERROR; } - MS_LOG(DEBUG) << "RunKernel success!"; return RET_OK; } @@ -376,7 +352,7 @@ int OpenCLRuntime::RunKernel(const cl_kernel &kernel, const std::vector int OpenCLRuntime::RunKernel(const cl::Kernel &kernel, const std::vector &global, const std::vector &local, cl::CommandQueue *command_queue) { if (command_queue == nullptr) { - command_queue = default_command_queue_.get(); + command_queue = default_command_queue_; } MS_ASSERT(local.size() == 0 || local.size() == global.size()); std::vector internal_global_ws = global; @@ -392,9 +368,6 @@ int OpenCLRuntime::RunKernel(const cl::Kernel &kernel, const std::vector MS_LOG(DEBUG) << "local[" << i << "] = " << local[i]; } - cl::Event event; - cl_int err = CL_SUCCESS; - cl::NDRange global_range = cl::NullRange; cl::NDRange local_range = cl::NullRange; if (global.size() == 1) { @@ -417,10 +390,12 @@ int OpenCLRuntime::RunKernel(const cl::Kernel &kernel, const std::vector return RET_ERROR; } - err = command_queue->enqueueNDRangeKernel(kernel, cl::NullRange, global_range, local_range, nullptr, &event); + cl::Event event; + cl_int ret = CL_SUCCESS; + ret = command_queue->enqueueNDRangeKernel(kernel, cl::NullRange, global_range, local_range, nullptr, &event); - if (err != CL_SUCCESS) { - MS_LOG(ERROR) << "Kernel execute failed:" << CLErrorCode(err); + if (ret != CL_SUCCESS) { + MS_LOG(ERROR) << "Kernel execute failed:" << CLErrorCode(ret); return RET_ERROR; } MS_LOG(DEBUG) << "RunKernel success!"; @@ -463,9 +438,7 @@ GpuInfo OpenCLRuntime::ParseGpuInfo(std::string device_name, std::string device_ bool OpenCLRuntime::LoadSource(const std::string &program_name, const std::string &source) { auto it_source = g_opencl_program_map.find(program_name); - if (it_source != g_opencl_program_map.end()) { - it_source->second = source; - } else { + if (it_source == g_opencl_program_map.end()) { g_opencl_program_map.emplace(program_name, source); } return true; @@ -486,11 +459,11 @@ bool OpenCLRuntime::LoadProgram(const std::string &program_name, cl::Program *pr } // build program with build options -bool OpenCLRuntime::BuildProgram(const std::string &build_options, cl::Program *program) { - 
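Note: the cl_kernel overload of RunKernel() deleted above had its one behavioral quirk folded into the surviving cl::Kernel overload: an empty local size still means "let the driver choose the work-group size". The size_t-vector-to-NDRange mapping it relies on looks like this; the bindings header name is an assumption (cl2.hpp for the OpenCL 2.x C++ bindings, cl.hpp on older trees):

#include <CL/cl2.hpp>
#include <cstddef>
#include <vector>

cl::NDRange ToNDRange(const std::vector<size_t> &dims) {
  switch (dims.size()) {
    case 0:
      return cl::NullRange;  // driver's choice (empty local size)
    case 1:
      return cl::NDRange(dims[0]);
    case 2:
      return cl::NDRange(dims[0], dims[1]);
    case 3:
      return cl::NDRange(dims[0], dims[1], dims[2]);
    default:
      return cl::NullRange;  // NDRange kernels allow at most 3 dimensions
  }
}

With that helper, the enqueue in the kept overload is just command_queue->enqueueNDRangeKernel(kernel, cl::NullRange, ToNDRange(global), ToNDRange(local), nullptr, &event).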
cl_int ret = program->build({*device_}, build_options.c_str()); +bool OpenCLRuntime::BuildProgram(const std::string &build_options, const cl::Program &program) { + cl_int ret = program.build({*device_}, build_options.c_str()); if (ret != CL_SUCCESS) { - if (program->getBuildInfo(*device_) == CL_BUILD_ERROR) { - std::string build_log = program->getBuildInfo(*device_); + if (program.getBuildInfo(*device_) == CL_BUILD_ERROR) { + std::string build_log = program.getBuildInfo(*device_); MS_LOG(ERROR) << "Program build log: " << build_log; } MS_LOG(ERROR) << "Build program failed: " << CLErrorCode(ret); @@ -502,7 +475,7 @@ bool OpenCLRuntime::BuildProgram(const std::string &build_options, cl::Program * bool OpenCLRuntime::CopyDeviceMemToHost(void *dst, const void *src, size_t size, cl::CommandQueue *command_queue, bool sync) const { if (command_queue == nullptr) { - command_queue = default_command_queue_.get(); + command_queue = default_command_queue_; } cl_int cl_ret = CL_SUCCESS; const cl::Buffer *buffer = static_cast(src); @@ -515,7 +488,7 @@ bool OpenCLRuntime::CopyDeviceMemToHost(void *dst, const void *src, size_t size, bool OpenCLRuntime::CopyHostMemToDevice(const void *dst, const void *src, size_t size, cl::CommandQueue *command_queue, bool sync) const { if (command_queue == nullptr) { - command_queue = default_command_queue_.get(); + command_queue = default_command_queue_; } cl_int cl_ret = CL_SUCCESS; const cl::Buffer *buffer = static_cast(dst); @@ -525,28 +498,28 @@ bool OpenCLRuntime::CopyHostMemToDevice(const void *dst, const void *src, size_t return cl_ret == CL_SUCCESS; } -void *OpenCLRuntime::MapBuffer(const cl::Buffer buffer, int flags, size_t size, cl::CommandQueue *command_queue, +void *OpenCLRuntime::MapBuffer(const cl::Buffer &buffer, int flags, size_t size, cl::CommandQueue *command_queue, bool sync) const { if (command_queue == nullptr) { - command_queue = default_command_queue_.get(); + command_queue = default_command_queue_; } return command_queue->enqueueMapBuffer(buffer, sync, flags, 0, size); } int OpenCLRuntime::MapBuffer(void *host_ptr, int flags, size_t size, cl::CommandQueue *command_queue, bool sync) const { - if (svm_capabilities_ & CL_DEVICE_SVM_FINE_GRAIN_BUFFER) { + if (GetSVMCapabilities() & CL_DEVICE_SVM_FINE_GRAIN_BUFFER) { return RET_OK; } if (command_queue == nullptr) { - command_queue = default_command_queue_.get(); + command_queue = default_command_queue_; } return command_queue->enqueueMapSVM(host_ptr, sync, flags, size); } -void *OpenCLRuntime::MapBuffer(const cl::Image2D buffer, bool sync, int flags, const std::vector ®ion, +void *OpenCLRuntime::MapBuffer(const cl::Image2D &buffer, bool sync, int flags, const std::vector ®ion, cl::CommandQueue *command_queue) const { if (command_queue == nullptr) { - command_queue = default_command_queue_.get(); + command_queue = default_command_queue_; } cl::size_type row_pitch; cl::size_type slice_pitch; @@ -555,26 +528,26 @@ void *OpenCLRuntime::MapBuffer(const cl::Image2D buffer, bool sync, int flags, c return command_queue->enqueueMapImage(buffer, sync, flags, origin_, region_, &row_pitch, &slice_pitch); } -int OpenCLRuntime::UnmapBuffer(const cl::Memory buffer, void *host_ptr, cl::CommandQueue *command_queue) const { +int OpenCLRuntime::UnmapBuffer(const cl::Memory &buffer, void *host_ptr, cl::CommandQueue *command_queue) const { if (command_queue == nullptr) { - command_queue = default_command_queue_.get(); + command_queue = default_command_queue_; } return command_queue->enqueueUnmapMemObject(buffer, 
host_ptr); } int OpenCLRuntime::UnmapBuffer(void *host_ptr, cl::CommandQueue *command_queue) const { - if (svm_capabilities_ & CL_DEVICE_SVM_FINE_GRAIN_BUFFER) { + if (GetSVMCapabilities() & CL_DEVICE_SVM_FINE_GRAIN_BUFFER) { return RET_OK; } if (command_queue == nullptr) { - command_queue = default_command_queue_.get(); + command_queue = default_command_queue_; } return command_queue->enqueueUnmapSVM(host_ptr); } bool OpenCLRuntime::SyncCommandQueue(cl::CommandQueue *command_queue) { if (command_queue == nullptr) { - command_queue = default_command_queue_.get(); + command_queue = default_command_queue_; } cl_int ret = command_queue->finish(); if (ret != CL_SUCCESS) { @@ -586,43 +559,55 @@ bool OpenCLRuntime::SyncCommandQueue(cl::CommandQueue *command_queue) { int OpenCLRuntime::GetKernelMaxWorkGroupSize(cl_kernel kernel, cl_device_id device_id) { size_t max_work_group_size; - cl_int err = clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), + cl_int ret = clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &max_work_group_size, nullptr); - if (err != CL_SUCCESS) { - MS_LOG(ERROR) << "Failed to get info CL_KERNEL_WORK_GROUP_SIZE " << CLErrorCode(err); + if (ret != CL_SUCCESS) { + MS_LOG(ERROR) << "Failed to get info CL_KERNEL_WORK_GROUP_SIZE " << CLErrorCode(ret); } return static_cast(max_work_group_size); } -bool OpenCLRuntime::CreateKernelFromIL(cl_kernel &kernel, const std::string kernel_name) { +cl::Kernel OpenCLRuntime::GetKernelFromBinary(const std::string &kernel_name) { cl_int ret = CL_SUCCESS; - kernel = clCreateKernel(il_program_, kernel_name.c_str(), &ret); + cl::Kernel kernel = cl::Kernel(binary_program_, kernel_name.c_str(), &ret); if (ret != CL_SUCCESS) { - MS_LOG(ERROR) << "Create kernel with IL failed: " << CLErrorCode(ret); + MS_LOG(ERROR) << "Create kernel with binary program failed: " << CLErrorCode(ret); } - return ret == CL_SUCCESS; + return kernel; } // build program with IL -bool OpenCLRuntime::CreateProgramFromIL(const std::vector program_binary, const std::string flag) { +cl::Program OpenCLRuntime::CreateProgramFromIL(const std::vector &binary, const std::string &flag) { #if CL_HPP_TARGET_OPENCL_VERSION >= 210 - size_t program_length = program_binary.size(); - cl_int ret = CL_SUCCESS; - il_program_ = clCreateProgramWithIL((*context_)(), program_binary.data(), program_length, &ret); - if (ret != CL_SUCCESS) { - MS_LOG(ERROR) << "Create program with IL failed: " << CLErrorCode(ret); - return false; - } - - ret = clBuildProgram(il_program_, 1, &(*device_)(), flag.c_str(), NULL, NULL); - if (ret != CL_SUCCESS) { - MS_LOG(ERROR) << "Build program with IL failed: " << CLErrorCode(ret); + cl::Program program = cl::Program(*context_, binary); + bool status = BuildProgram(default_build_opts_, program); + if (!status) { + MS_LOG(ERROR) << "Build program with IL failed!"; } - return ret == CL_SUCCESS; + return program; #else MS_LOG(ERROR) << "Create program with IL failed! 
The compute capability of the device should be 2.1 or higher."; - return false; + return cl::Program(); #endif } +// build program with binary +cl::Program OpenCLRuntime::CreateProgramFromBinary(const std::vector<std::vector<unsigned char>> &binary, + const std::string &flag) { + cl::Program program = cl::Program(*context_, {*device_}, binary); + bool status = BuildProgram(default_build_opts_, program); + if (!status) { + MS_LOG(ERROR) << "Build program with binary failed!"; + } + return program; +} + +std::vector<std::vector<unsigned char>> OpenCLRuntime::GetProgramBinaries(const cl::Program &program) { + cl_int ret = CL_SUCCESS; + auto binary = program.getInfo<CL_PROGRAM_BINARIES>(&ret); + if (ret != CL_SUCCESS) { + MS_LOG(ERROR) << "Get program binary failed: " << CLErrorCode(ret); + } + return binary; +} } // namespace mindspore::lite::opencl diff --git a/mindspore/lite/src/runtime/opencl/opencl_runtime.h b/mindspore/lite/src/runtime/opencl/opencl_runtime.h index bcf066e15c..993c6f8368 100644 --- a/mindspore/lite/src/runtime/opencl/opencl_runtime.h +++ b/mindspore/lite/src/runtime/opencl/opencl_runtime.h @@ -20,7 +20,6 @@ * you may not use this file except in compliance with the License. #include #include #include -#include #include #include #include @@ -38,9 +37,6 @@ struct GpuInfo { float opencl_version = 0; }; -// Base GPU cache size used for computing local work group size. -const int32_t g_base_gpu_mem_cachesize = 16384; - class OpenCLRuntime { public: static OpenCLRuntime *GetInstance(); @@ -51,11 +47,12 @@ class OpenCLRuntime { OpenCLRuntime &operator=(const OpenCLRuntime &) = delete; int Init(); + int Uninit(); cl::Context *Context(); cl::Device *Device(); - OpenCLAllocator *GetAllocator() { return allocator_.get(); } - cl::CommandQueue *GetDefaultCommandQueue() { return default_command_queue_.get(); } + OpenCLAllocator *GetAllocator() { return allocator_; } + cl::CommandQueue *GetDefaultCommandQueue() { return default_command_queue_; } uint64_t DeviceGlobalMemoryCacheSize() const; int DeviceMaxWorkGroupSize() const; uint32_t DeviceComputeUnits() const; @@ -65,61 +62,68 @@ class OpenCLRuntime { GpuInfo GetGpuInfo(); bool GetFp16Enable() const; bool SetFp16Enable(bool enable); - const std::vector<size_t> &GetWorkItemSize() { return max_work_item_sizes_; } - uint32_t GetImagePitchAlignment() { return image_pitch_align_; } - cl_device_svm_capabilities GetSVMCapabilities() const { return svm_capabilities_; } + bool GetSVMEnable() const { return svm_enable_; } + void SetSVMEnable(bool enable) { svm_enable_ = enable; } + const std::vector<size_t> &GetWorkItemSize() const { return max_work_item_sizes_; } + uint32_t GetImagePitchAlignment() const { return image_pitch_align_; } + cl_device_svm_capabilities GetSVMCapabilities() const { return svm_enable_ ? svm_capabilities_ : 0; } template <typename T> - typename std::enable_if<std::is_pointer<T>::value, cl_int>::type SetKernelArg(cl_kernel &kernel, uint32_t index, - const T value) { - if (svm_capabilities_) { - MS_LOG(DEBUG) << "Set kernel arg[" << index << "] SVM pointer " << value; - return clSetKernelArgSVMPointer(kernel, index, value); - } else { - MEM_TYPE mem_type = allocator_->GetMemType(value); - if (mem_type == MEM_TYPE::BUF) { + typename std::enable_if<std::is_pointer<T>::value, cl_int>::type SetKernelArg(cl::Kernel &kernel, uint32_t index, + const T value, + const MemType mem_type = MemType::IMG) { + switch (mem_type) { + case MemType::SVM: { + MS_LOG(DEBUG) << "Set kernel arg[" << index << "] SVM pointer " << value; + return kernel.setArg(index, value); + } + case MemType::BUF: { cl::Buffer *buffer = reinterpret_cast<cl::Buffer *>(allocator_->GetBuffer(value)); MS_LOG(DEBUG) << "Set kernel arg[" << index << "] OpenCL Buffer " << buffer << ", host_ptr: " << value; - return clSetKernelArg(kernel, index, sizeof((*buffer)()), &(*buffer)()); - } else { + return kernel.setArg(index, *buffer); + } + case MemType::IMG: { cl::Image2D *image = reinterpret_cast<cl::Image2D *>(allocator_->GetImage(value)); + if (image == nullptr) { + MS_LOG(WARNING) << "Can't get Image2D, try to use Buffer. Please confirm the buffer type."; + cl::Buffer *buffer = reinterpret_cast<cl::Buffer *>(allocator_->GetBuffer(value)); + MS_LOG(DEBUG) << "Set kernel arg[" << index << "] OpenCL Buffer " << buffer << ", host_ptr: " << value; + return kernel.setArg(index, *buffer); + } MS_LOG(DEBUG) << "Set kernel arg[" << index << "] OpenCL Image2D " << image << ", host_ptr: " << value; - return clSetKernelArg(kernel, index, sizeof((*image)()), &(*image)()); + return kernel.setArg(index, *image); } + default: + MS_LOG(ERROR) << "Unsupported OpenCL memory type: " << static_cast<int>(mem_type); + return CL_INVALID_ARG_VALUE; } } template <typename T> - typename std::enable_if<!std::is_pointer<T>::value, cl_int>::type SetKernelArg(cl_kernel &kernel, uint32_t index, - const T value) { - return clSetKernelArg(kernel, index, sizeof(value), &value); - } - - template <typename T> - int SetKernelArg(cl::Kernel &kernel, uint32_t index, const T &value) { - return SetKernelArg(kernel(), index, value); + typename std::enable_if<!std::is_pointer<T>::value, cl_int>::type SetKernelArg( + cl::Kernel &kernel, uint32_t index, const T value, const MemType mem_type = MemType::IMG) { + return kernel.setArg(index, value); } - bool CreateProgramFromIL(const std::vector<char> program_binary, const std::string flag); - bool CreateKernelFromIL(cl_kernel &kernel, const std::string kernel_name); + cl::Program CreateProgramFromIL(const std::vector<char> &binary, const std::string &flag); + cl::Program CreateProgramFromBinary(const std::vector<std::vector<unsigned char>> &binary, const std::string &flag); + cl::Kernel GetKernelFromBinary(const std::string &kernel_name); + std::vector<std::vector<unsigned char>> GetProgramBinaries(const cl::Program &program); bool LoadSource(const std::string &program_name, const std::string &source); int BuildKernel(cl::Kernel &kernel, const std::string &program_name, const std::string &kernel_name, const std::set<std::string> &build_options); - int RunKernel(const cl_kernel &kernel, const std::vector<size_t> &global, const std::vector<size_t> &local, - cl::CommandQueue *command_queue); int RunKernel(const cl::Kernel &kernel, const std::vector<size_t> &global, const std::vector<size_t> &local, cl::CommandQueue *command_queue); bool CopyDeviceMemToHost(void *dst, const void *src, size_t size, cl::CommandQueue *command_queue = nullptr, bool sync = false) const; bool CopyHostMemToDevice(const void *dst, const void *src, size_t size, cl::CommandQueue *command_queue = nullptr, bool sync = false) const; - void *MapBuffer(const
cl::Buffer buffer, int map_flags, size_t size, cl::CommandQueue *command_queue = nullptr, + void *MapBuffer(const cl::Buffer &buffer, int map_flags, size_t size, cl::CommandQueue *command_queue = nullptr, bool sync = false) const; - void *MapBuffer(const cl::Image2D buffer, bool sync, int flags, const std::vector ®ion, + void *MapBuffer(const cl::Image2D &buffer, bool sync, int flags, const std::vector ®ion, cl::CommandQueue *command_queue = nullptr) const; int MapBuffer(void *host_ptr, int map_flags, size_t size, cl::CommandQueue *command_queue = nullptr, bool sync = false) const; - int UnmapBuffer(const cl::Memory buffer, void *host_ptr, cl::CommandQueue *command_queue = nullptr) const; + int UnmapBuffer(const cl::Memory &buffer, void *host_ptr, cl::CommandQueue *command_queue = nullptr) const; int UnmapBuffer(void *host_ptr, cl::CommandQueue *command_queue = nullptr) const; bool SyncCommandQueue(cl::CommandQueue *command_queue = nullptr); @@ -136,17 +140,18 @@ class OpenCLRuntime { GpuInfo ParseGpuInfo(std::string device_name, std::string device_version); bool LoadProgram(const std::string &program_name, cl::Program *program); - bool BuildProgram(const std::string &build_options, cl::Program *program); + bool BuildProgram(const std::string &build_options, const cl::Program &program); private: - static std::shared_ptr opencl_runtime_singleton_; static bool init_done_; - std::shared_ptr default_command_queue_{nullptr}; - std::shared_ptr context_{nullptr}; - std::shared_ptr device_{nullptr}; - std::shared_ptr allocator_{nullptr}; - std::map program_map_{}; - cl_program il_program_{0}; + static size_t instance_count_; + static OpenCLRuntime *ocl_runtime_instance_; + cl::CommandQueue *default_command_queue_{nullptr}; + cl::Context *context_{nullptr}; + cl::Device *device_{nullptr}; + OpenCLAllocator *allocator_{nullptr}; + std::map program_map_; + cl::Program binary_program_{0}; uint64_t global_memery_cachesize_{0}; int max_work_group_size; uint32_t compute_units_{0}; @@ -155,11 +160,11 @@ class OpenCLRuntime { GpuInfo gpu_info_; bool support_fp16_{false}; bool fp16_enable_{false}; + bool svm_enable_{false}; cl_device_svm_capabilities svm_capabilities_{0}; cl_uint image_pitch_align_{0}; std::vector max_work_item_sizes_; }; } // namespace mindspore::lite::opencl - #endif // MINDSPORE_LITE_SRC_OPENCL_RUNTIME_H_ diff --git a/mindspore/lite/src/runtime/opencl/opencl_wrapper.cc b/mindspore/lite/src/runtime/opencl/opencl_wrapper.cc index b97f0259ed..67393505db 100644 --- a/mindspore/lite/src/runtime/opencl/opencl_wrapper.cc +++ b/mindspore/lite/src/runtime/opencl/opencl_wrapper.cc @@ -66,19 +66,13 @@ static const std::vector g_opencl_library_paths = { }; OpenCLWrapper *OpenCLWrapper::GetInstance() { - static std::once_flag opencl_wrapper_once; - std::call_once(opencl_wrapper_once, - []() { opencl_wrapper_singleton_ = std::shared_ptr(new OpenCLWrapper()); }); - - return opencl_wrapper_singleton_.get(); + static OpenCLWrapper ocl_wrapper; + return &ocl_wrapper; } OpenCLWrapper::OpenCLWrapper() {} -OpenCLWrapper::~OpenCLWrapper() { - if (nullptr == opencl_wrapper_singleton_.get()) return; - opencl_wrapper_singleton_->UnLoadOpenCLLibrary(); -} +OpenCLWrapper::~OpenCLWrapper() {} // load default library path bool OpenCLWrapper::LoadOpenCLLibrary() { @@ -273,6 +267,15 @@ cl_program clCreateProgramWithSource(cl_context context, cl_uint count, const ch return func(context, count, strings, lengths, errcode_ret); } +// clCreateProgramWithBinary wrapper, use OpenCLWrapper function. 
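Taken together, the binary entry points added above (GetProgramBinaries, CreateProgramFromBinary, GetKernelFromBinary) enable a compile-once, cache-and-reload workflow for OpenCL programs. The following is a minimal sketch of that round trip, assuming an initialized OpenCLRuntime and an already-built cl::Program; the cached_bins variable and the "Convolution" kernel name are illustrative, not part of this patch:

#include <vector>
#include "src/runtime/opencl/opencl_runtime.h"

using mindspore::lite::opencl::OpenCLRuntime;

// Round trip: dump the device binary of an online-compiled program, then
// rebuild a program and a kernel from that binary on a later run.
void BinaryCacheRoundTrip(const cl::Program &compiled_program) {
  OpenCLRuntime *runtime = OpenCLRuntime::GetInstance();

  // One binary blob per device the program was built for.
  std::vector<std::vector<unsigned char>> cached_bins = runtime->GetProgramBinaries(compiled_program);

  // ... persist cached_bins to disk between runs ...

  // Rebuild without the CL source: the blobs are handed to the
  // cl::Program(context, devices, binaries) constructor and rebuilt.
  cl::Program restored = runtime->CreateProgramFromBinary(cached_bins, "");

  // Kernels can then be created straight from the restored program.
  cl_int err = CL_SUCCESS;
  cl::Kernel kernel(restored, "Convolution", &err);  // illustrative kernel name
}

GetKernelFromBinary plays the same role as the last step when the runtime holds the restored program itself (binary_program_), and the wrapper defined next exposes the clCreateProgramWithBinary symbol that this path ultimately resolves at runtime.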
+cl_program clCreateProgramWithBinary(cl_context context, cl_uint num_devices, const cl_device_id *devices_list, + const size_t *lengths, const unsigned char **binaries, cl_int *binary_status, + cl_int *errcode_ret) { + auto func = mindspore::lite::opencl::OpenCLWrapper::GetInstance()->clCreateProgramWithBinary; + MS_ASSERT(func != nullptr); + return func(context, num_devices, devices_list, lengths, binaries, binary_status, errcode_ret); +} + // clGetProgramInfo wrapper, use OpenCLWrapper function. cl_int clGetProgramInfo(cl_program program, cl_program_info param_name, size_t param_value_size, void *param_value, size_t *param_value_size_ret) { diff --git a/mindspore/lite/src/runtime/opencl/opencl_wrapper.h b/mindspore/lite/src/runtime/opencl/opencl_wrapper.h index fa56d86b43..3998b3b55b 100644 --- a/mindspore/lite/src/runtime/opencl/opencl_wrapper.h +++ b/mindspore/lite/src/runtime/opencl/opencl_wrapper.h @@ -230,8 +230,7 @@ class OpenCLWrapper { bool LoadLibraryFromPath(const std::string &path); private: - static std::shared_ptr opencl_wrapper_singleton_; - void *handle_ = nullptr; + void *handle_{nullptr}; }; } // namespace mindspore::lite::opencl diff --git a/mindspore/lite/src/runtime/parallel_executor.cc b/mindspore/lite/src/runtime/parallel_executor.cc index fc7e6eaa27..488e7ad736 100644 --- a/mindspore/lite/src/runtime/parallel_executor.cc +++ b/mindspore/lite/src/runtime/parallel_executor.cc @@ -14,37 +14,23 @@ * limitations under the License. */ +#include #include "src/runtime/parallel_executor.h" -using mindspore::predict::ThreadPool; -using mindspore::predict::TvmEnv; +#include "src/runtime/runtime_api.h" + #define MAX_THREAD_NUM 8 namespace mindspore::lite { -ParallelExecutor::~ParallelExecutor() { - delete pool; - pool = nullptr; -} +ParallelExecutor::~ParallelExecutor() {} int ParallelExecutor::Prepare(std::vector<mindspore::kernel::LiteKernel *> &kernels) { - pool = new ThreadPool(); - pool->ConfigThreadPool(NO_BIND, MAX_THREAD_NUM); - for (mindspore::kernel::LiteKernel *kernel : kernels) { - refCount[kernel] = kernel->out_kernels().size(); + int status = ConfigThreadPool(THREAD_POOL_DEFAULT, MAX_THREAD_NUM, NO_BIND_MODE); + if (status != 0) { + MS_LOG(ERROR) << "Config thread pool failed"; + return RET_ERROR; } return RET_OK; } -void ParallelExecutor::PrepareReadyKernels(const std::vector<mindspore::kernel::LiteKernel *> &kernels) { - for (auto iter = refCount.begin(); iter != refCount.end();) { - if (iter->second == 0) { - readyKernels.emplace_back(iter->first); - iter = refCount.erase(iter); - } else { - iter++; - } - } - results.resize(readyKernels.size()); -} - -static int RunKernel(int index, TvmEnv *env, void *data) { +static int RunKernel(void *data, int index) { ParallelExecutor *executor = reinterpret_cast<ParallelExecutor *>(data); auto kernel = executor->GetReadyKernel(index); auto ret = kernel->Run(); @@ -83,27 +69,49 @@ int ParallelExecutor::Run(std::vector &in_tensors, std::vector } kernel::LiteKernelUtil::InitTensorRefCount(kernels); - PrepareReadyKernels(kernels); + for (auto kernel : kernels) { + if (kernel->in_kernels().size() == 0) { + readyKernels.emplace_back(kernel); + continue; + } + refCount[kernel] = kernel->in_kernels().size(); + } + std::vector<mindspore::kernel::LiteKernel *> newReadyKernels; while (readyKernels.size() > 0) { - pool->LaunchWork(RunKernel, this, readyKernels.size()); + results.resize(readyKernels.size(), RET_OK); + ParallelLaunch(THREAD_POOL_DEFAULT, RunKernel, this, readyKernels.size()); if (std::find_if(results.begin(), results.end(), [](const int &ret) { return (ret != 0); }) != results.end()) { return RET_ERROR; } - for (auto
completedKernel : readyKernels) { - for (auto out : completedKernel->out_kernels()) { + newReadyKernels.clear(); + for (auto completed : readyKernels) { + for (auto out : completed->out_kernels()) { auto iter = refCount.find(out); if (iter == refCount.end()) { continue; } (iter->second)--; if (iter->second <= 0) { + newReadyKernels.emplace_back(iter->first); refCount.erase(iter); } } + + for (auto input_kernel : completed->in_kernels()) { + MS_ASSERT(input_kernel != nullptr); + if (input_kernel->is_model_output()) { + continue; + } + auto ret = input_kernel->DecOutTensorRefCount(); + if (0 != ret) { + MS_LOG(WARNING) << "DecOutTensorRefCount for kernel" << completed->name() << " failed"; + return -1; + } + } } readyKernels.clear(); - PrepareReadyKernels(kernels); + readyKernels = std::move(newReadyKernels); } return RET_OK; diff --git a/mindspore/lite/src/runtime/parallel_executor.h b/mindspore/lite/src/runtime/parallel_executor.h index fd47ca38d6..95dfbbd58f 100644 --- a/mindspore/lite/src/runtime/parallel_executor.h +++ b/mindspore/lite/src/runtime/parallel_executor.h @@ -23,7 +23,6 @@ #include "src/lite_kernel.h" #include "include/lite_session.h" #include "src/executor.h" -#include "src/runtime/thread_pool.h" namespace mindspore::lite { class ParallelExecutor : public Executor { @@ -40,10 +39,6 @@ class ParallelExecutor : public Executor { inline void SetResult(const int index, const int result) { results.at(index) = result; } private: - void PrepareReadyKernels(const std::vector &kernels); - - private: - predict::ThreadPool *pool; std::unordered_map refCount; std::vector readyKernels; std::vector results; diff --git a/mindspore/lite/src/runtime/runtime_api.cc b/mindspore/lite/src/runtime/runtime_api.cc index fa7170404f..374796cdaf 100644 --- a/mindspore/lite/src/runtime/runtime_api.cc +++ b/mindspore/lite/src/runtime/runtime_api.cc @@ -1,5 +1,5 @@ /** - * Copyright 2020 Huawei Technologies Co., Ltd + * Copyright 2019 Huawei Technologies Co., Ltd * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,26 +14,19 @@ * limitations under the License. 
*/ +#include "src/runtime/runtime_api.h" #include #include -#include "src/runtime/runtime_api.h" #include "src/runtime/workspace_pool.h" -#include "src/runtime/thread_pool.h" #include "utils/log_adapter.h" static std::mutex gWorkspaceMutex; #ifdef __cplusplus extern "C" { #endif -void LiteAPISetLastError(const char *msg) { - MS_LOG(ERROR) << "The lite api set last error is " << msg; -} +void LiteAPISetLastError(const char *msg) { MS_LOG(ERROR) << "The lite api set last error is " << msg; } -void *LiteBackendAllocWorkspace(int deviceType, - int deviceId, - uint64_t size, - int dtypeCode, - int dtypeBits) { +void *LiteBackendAllocWorkspace(int deviceType, int deviceId, uint64_t size, int dtypeCode, int dtypeBits) { std::lock_guard lock(gWorkspaceMutex); auto p = mindspore::predict::WorkspacePool::GetInstance(); if (p == nullptr) { @@ -52,54 +45,6 @@ int LiteBackendFreeWorkspace(int deviceType, int deviceId, void *ptr) { p->FreeWorkSpaceMem(ptr); return 0; } - -void SetMaxWokerNum(int num) { - auto p = mindspore::predict::GlobalThreadPool(); - if (p == nullptr) { - MS_LOG(ERROR) << "Get thread pool instance failed"; - return; - } - if (num < 0) { - LiteAPISetLastError("The number of work thread is less than 0"); - return; - } - p->ConfigMaxThreadNum(num); -} - -void ConfigThreadPool(int mode, int nthreads) { - auto p = mindspore::predict::GlobalThreadPool(); - if (p == nullptr) { - MS_LOG(ERROR) << "Get thread pool instance failed"; - return; - } - p->ConfigThreadPool(mode, nthreads); -} - -int LiteBackendParallelLaunch(FTVMParallelLambda flambda, void *cdata, int num_task) { - auto p = mindspore::predict::GlobalThreadPool(); - if (p == nullptr) { - MS_LOG(ERROR) << "Get thread pool instance failed"; - return -1; - } - if (!p->LaunchWork(flambda, cdata, num_task)) { - MS_LOG(ERROR) << "launch thread pool work failed"; - return -1; - } - return 0; -} - -void DoAllThreadBind(bool ifBind, int mode) { - auto p = mindspore::predict::GlobalThreadPool(); - if (p == nullptr) { - MS_LOG(ERROR) << "Get thread pool instance failed"; - return; - } - if (!p->BindAllThreads(ifBind, mode)) { - MS_LOG(ERROR) << "do thread cpu bind failed"; - } -} - #ifdef __cplusplus } #endif - diff --git a/mindspore/lite/src/runtime/runtime_api.h b/mindspore/lite/src/runtime/runtime_api.h index cd3942d79e..0755e9245f 100644 --- a/mindspore/lite/src/runtime/runtime_api.h +++ b/mindspore/lite/src/runtime/runtime_api.h @@ -1,5 +1,5 @@ /** - * Copyright 2020 Huawei Technologies Co., Ltd + * Copyright 2019 Huawei Technologies Co., Ltd * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -13,9 +13,8 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ - -#ifndef MINDSPORE_LITE_SRC_RUNTIME_RUNTIME_API_H_ -#define MINDSPORE_LITE_SRC_RUNTIME_RUNTIME_API_H_ +#ifndef PREDICT_SRC_RUNTIME_RUNTIME_API_H_ +#define PREDICT_SRC_RUNTIME_RUNTIME_API_H_ #include #ifndef INTERNAL_API_DLL @@ -32,26 +31,16 @@ #ifdef __cplusplus extern "C" { +#include "src/runtime/thread_pool.h" + #endif -typedef struct { - void *sync_handle; - int32_t num_task; -} LiteParallelGroupEnv; -typedef int (*FTVMParallelLambda)(int task_id, LiteParallelGroupEnv *penv, void *cdata); INTERNAL_API_DLL void LiteAPISetLastError(const char *msg); INTERNAL_API_DLL void *LiteBackendAllocWorkspace(int deviceType, int deviceId, uint64_t size, int dtypeCode, int dtypeBits); INTERNAL_API_DLL int LiteBackendFreeWorkspace(int deviceType, int deviceId, void *ptr); -INTERNAL_API_DLL void SetMaxWokerNum(int num); -INTERNAL_API_DLL void ConfigThreadPool(int mode, int nthreads); -INTERNAL_API_DLL inline void CfgThreadPool(int nthread) { ConfigThreadPool(-1, nthread); } -INTERNAL_API_DLL int LiteBackendParallelLaunch(FTVMParallelLambda flambda, void *cdata, int num_task); INTERNAL_API_DLL int LiteBackendRegisterSystemLibSymbol(const char *name, void *ptr); -INTERNAL_API_DLL void DoAllThreadBind(bool ifBind, int mode); - #ifdef __cplusplus } #endif -#endif // MINDSPORE_LITE_SRC_RUNTIME_RUNTIME_API_H_ - +#endif // PREDICT_SRC_RUNTIME_RUNTIME_API_H_ diff --git a/mindspore/lite/src/runtime/thread_pool.c b/mindspore/lite/src/runtime/thread_pool.c new file mode 100644 index 0000000000..6c98496ccf --- /dev/null +++ b/mindspore/lite/src/runtime/thread_pool.c @@ -0,0 +1,796 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "src/runtime/thread_pool.h" +#define _GNU_SOURCE +#include +#include +#include +#include +#include + +#ifdef __ANDROID__ +#define BIND_CORE +#include +#include +#endif + +#ifdef THREAD_POOL_DEBUG +#include +#define LOG_INFO(content, args...) \ + { printf("[INFO] %s|%d|%s: " #content "\r\n", __FILE__, __LINE__, __func__, ##args); } +#else +#define LOG_INFO(content, args...) 
+#endif + +#define RET_TP_OK (0) +#define RET_TP_ERROR (1) +#define RET_TP_SYSTEM_ERROR (-1) + +#define MAX_TASK_NUM (2) +#define MAX_THREAD_NUM (8) +#define MAX_THREAD_POOL_NUM (4) +#define DEFAULT_SPIN_COUNT (30000) + +typedef struct { + int (*func)(void *arg, int); + void *content; +} Task; + +typedef struct Thread { + int thread_pool_id; + int thread_id; + struct Thread *next; + pthread_t pthread; + Task *task_list[MAX_TASK_NUM]; + atomic_int task_size; + atomic_int head; + atomic_int tail; + atomic_bool activate; + atomic_bool is_running; + sem_t sem; +} Thread; + +typedef struct { + Thread *head; + Thread *tail; + pthread_mutex_t lock; + int size; +} ThreadList; + +typedef struct ThreadPool { + ThreadList *thread_list; + int thread_num; + BindMode mode; + atomic_bool is_alive; +} ThreadPool; + +static ThreadPool thread_pool_list[MAX_THREAD_POOL_NUM]; +static atomic_int thread_pool_refcount[MAX_THREAD_POOL_NUM] = {ATOMIC_VAR_INIT(0)}; +static atomic_bool thread_pool_is_created[MAX_THREAD_POOL_NUM] = {ATOMIC_VAR_INIT(false)}; + +ThreadPool *GetInstance(int thread_pool_id) { + if (thread_pool_id < 0 || thread_pool_id >= MAX_THREAD_POOL_NUM) { + LOG_INFO("invalid context id: %d", thread_pool_id); + // DestroyThreadPool(thread_pool_id); + return NULL; + } + return &thread_pool_list[thread_pool_id]; +} + +Thread *GetThread(int thread_pool_id, int thread_id) { + ThreadPool *thread_pool = GetInstance(thread_pool_id); + if (thread_pool == NULL) { + LOG_INFO("get thread pool instance failed, thread_pool_id: %d, thread_id: %d", thread_pool_id, thread_id); + return NULL; + } + ThreadList *thread_list = thread_pool->thread_list; + if (thread_list == NULL) { + LOG_INFO("thread list is null"); + return NULL; + } + if (thread_id >= thread_list->size) { + LOG_INFO("invalid thread id: %d, thread_pool_id: %d, thread size: %d", thread_id, thread_pool_id, + thread_list->size); + return NULL; + } + if (thread_id == 0) { + return thread_list->head; + } + Thread *thread = thread_list->head; + while (thread != NULL) { + if (thread->thread_id == thread_id) { + break; + } + thread = thread->next; + } + return thread; +} + +void FreeThread(ThreadList *thread_list, Thread *thread) { + if (thread_list == NULL) { + LOG_INFO("thread list is null"); + return; + } + if (thread == NULL) { + LOG_INFO("thread is nullptr"); + return; + } + // only support sequential release + thread_list->head = thread->next; + sem_post(&thread->sem); + while (thread != NULL && !thread->is_running) { + sem_destroy(&thread->sem); + free(thread); + thread = NULL; + } +} + +#ifdef BIND_CORE +#define MAX_CORE_NUM (16) +static int gCoreNum = 8; +static int gHigNum = 0; +static int gMidNum = 0; +static int cpu_cores[MAX_CORE_NUM]; +static bool run_once = true; + +#define MAX_CPU_ID (9) +#define MAX_PATH_SIZE (256) +typedef struct { + int core_id; + int max_freq; +} CpuInfo; + +int GetCpuCoreNum() { return (int)sysconf(_SC_NPROCESSORS_CONF); } + +static int ConcatCPUPath(int cpuID, const char *str1, const char *str2, char *str3) { + if (cpuID > MAX_CPU_ID || str1 == NULL || str2 == NULL) { + return RET_TP_ERROR; + } + memset(str3, 0, strlen(str3)); + char *tmp = str3; + char id = cpuID + '0'; + memcpy(tmp, str1, strlen(str1)); + tmp += strlen(str1); + memcpy(tmp, &id, 1); + tmp += 1; + memcpy(tmp, str2, strlen(str2)); + return RET_TP_OK; +} + +int GetMaxFrequence(int core_id) { + char path[MAX_PATH_SIZE] = ""; + int ret = ConcatCPUPath(core_id, "/sys/devices/system/cpu/cpufreq/stats/cpu", "/time_in_state", path); + if (ret != RET_TP_OK) {
LOG_INFO("parse cpuid from /sys/devices/system/cpu/cpufreq/stats/cpu/time_in_state failed!"); + return RET_TP_ERROR; + } + FILE *fp = fopen(path, "rb"); + if (fp == NULL) { + ret = ConcatCPUPath(core_id, "/sys/devices/system/cpu/cpufreq/stats/cpu", "/cpufreq/stats/time_in_state", path); + if (ret != RET_TP_OK) { + LOG_INFO("parse cpuid from /sys/devices/system/cpu/cpufreq/stats/cpu/cpufreq/stats/time_instate failed!"); + return RET_TP_ERROR; + } + fp = fopen(path, "rb"); + if (fp == NULL) { + ret = ConcatCPUPath(core_id, "/sys/devices/system/cpu/cpu", "/cpufreq/cpuinfo_max_freq", path); + if (ret != RET_TP_OK) { + LOG_INFO("parse cpuid from /sys/devices/system/cpu/cpufreq/cpuinfo_max_freq failed!"); + return RET_TP_ERROR; + } + fp = fopen(path, "rb"); + if (fp == NULL) { + LOG_INFO("GetCPUMaxFreq failed, cannot find cpuinfo_max_freq."); + return RET_TP_ERROR; + } + int maxFreq = -1; + int result __attribute__((unused)); + result = fscanf(fp, "%d", &maxFreq); + fclose(fp); + return maxFreq; + } + } + int maxFreq = -1; + while (feof(fp) == 0) { + int freq = 0; + int tmp = fscanf(fp, "%d", &freq); + if (tmp != 1) { + break; + } + if (freq > maxFreq) { + maxFreq = freq; + } + } + fclose(fp); + return maxFreq; +} + +int SortCpuProcessor() { + gCoreNum = GetCpuCoreNum(); + if (gCoreNum <= 0) { + LOG_INFO("invalid cpu count"); + return RET_TP_ERROR; + } + CpuInfo freq_set[gCoreNum]; + for (int i = 0; i < gCoreNum; ++i) { + int max_freq = GetMaxFrequence(i); + freq_set[i].core_id = i; + freq_set[i].max_freq = max_freq; + } + // sort core id by frequency + for (int i = 0; i < gCoreNum; ++i) { + for (int j = i + 1; j < gCoreNum; ++j) { + if (freq_set[i].max_freq <= freq_set[j].max_freq) { + CpuInfo temp = freq_set[i]; + freq_set[i] = freq_set[j]; + freq_set[j] = temp; + } + } + } + for (int i = 0; i < gCoreNum; ++i) { + cpu_cores[i] = freq_set[i].core_id; + LOG_INFO("sorted_order: %d, frequency: %d", freq_set[i].core_id, freq_set[i].max_freq); + } + gHigNum = 0; + gMidNum = 0; + int max_freq = freq_set[0].max_freq; + int min_freq = freq_set[gCoreNum - 1].max_freq; + int little = 0; + for (int i = 0; i < gCoreNum; ++i) { + if (freq_set[i].max_freq == max_freq) { + gHigNum++; + } + if (freq_set[i].max_freq == min_freq) { + little++; + } + } + gMidNum = gCoreNum - gHigNum - little; + if (gHigNum == gCoreNum || max_freq == min_freq) { + // fix MTK800 + gHigNum = 2; + gMidNum = 2; + LOG_INFO("core frequency may be wrong."); + } + LOG_INFO("gCoreNum: %d, gHigNum: %d, gMidNum: %d, gLitNum: %d", gCoreNum, gHigNum, gMidNum, little); + return RET_TP_OK; +} + +#ifndef CPU_SET +#define CPU_SETSIZE 1024 +#define __NCPUBITS (8 * sizeof(unsigned long)) +typedef struct { + unsigned long __bits[CPU_SETSIZE / __NCPUBITS]; +} cpu_set_t; +#define CPU_SET(cpu, cpusetp) ((cpusetp)->__bits[(cpu) / __NCPUBITS] |= (1UL << ((cpu) % __NCPUBITS))) +#define CPU_ZERO(cpusetp) memset((cpusetp), 0, sizeof(cpu_set_t)) +#endif // CPU_SET + +int SetAffinity(pthread_t thread_id, cpu_set_t *cpuSet) { +#ifdef __ANDROID__ +#if __ANDROID_API__ >= 21 + LOG_INFO("thread: %d, mask: %lu", pthread_gettid_np(thread_id), cpuSet->__bits[0]); + int ret = sched_setaffinity(pthread_gettid_np(thread_id), sizeof(cpu_set_t), cpuSet); + if (ret != RET_TP_OK) { + LOG_INFO("bind thread %d to cpu failed. 
ERROR %d", pthread_gettid_np(thread_id), ret); + return RET_TP_OK; + } +#endif +#else +#ifdef __APPLE__ + LOG_INFO("not bind thread to apple's cpu."); + return RET_TP_ERROR; +#else + int ret = pthread_setaffinity_np(thread_id, sizeof(cpu_set_t), cpuSet); + if (ret != RET_TP_OK) { + LOG_INFO("set thread: %lu to cpu failed", thread_id); + return RET_TP_SYSTEM_ERROR; + } +#endif // __APPLE__ +#endif + return RET_TP_OK; +} + +int BindMasterThread(int thread_pool_id, bool is_bind) { + ThreadPool *thread_pool = GetInstance(thread_pool_id); + if (thread_pool == NULL) { + LOG_INFO("get thread pool instane failed"); + return RET_TP_ERROR; + } + cpu_set_t mask; + CPU_ZERO(&mask); + if (is_bind) { + unsigned int attach_id; + if (thread_pool->mode == MID_MODE) { + attach_id = cpu_cores[gHigNum + gMidNum - 1]; + } else { + attach_id = cpu_cores[0]; + } + LOG_INFO("mode: %d, attach id: %u", thread_pool->mode, attach_id); + CPU_SET(attach_id, &mask); + } else { + for (int i = 0; i < gHigNum + gMidNum; ++i) { + CPU_SET(cpu_cores[i], &mask); + } + } + int ret = SetAffinity(pthread_self(), &mask); + if (ret != RET_TP_OK) { + LOG_INFO("set master thread affinity failed"); + return RET_TP_ERROR; + } + LOG_INFO("BindMasterThread success."); + return RET_TP_OK; +} + +int BindSalverThreads(int thread_pool_id, bool is_bind) { + ThreadPool *thread_pool = GetInstance(thread_pool_id); + if (thread_pool == NULL) { + LOG_INFO("get thread pool instane failed"); + return RET_TP_ERROR; + } + cpu_set_t mask; + if (is_bind && thread_pool->mode != NO_BIND_MODE) { + unsigned int attach_id; + for (int i = 0; i < thread_pool->thread_num - 1; ++i) { + if (thread_pool->mode == MID_MODE) { + int core_id = gHigNum + gMidNum - i - 2; + if (core_id >= 0) { + attach_id = cpu_cores[core_id]; + } else { + attach_id = cpu_cores[0]; + } + } else { + attach_id = cpu_cores[i + 1]; + } + LOG_INFO("mode: %d, attach id: %u", thread_pool->mode, attach_id); + CPU_ZERO(&mask); + CPU_SET(attach_id, &mask); + Thread *thread = GetThread(thread_pool_id, i); + if (thread == NULL) { + LOG_INFO("get thread failed, thread_pool_id: %d, thread_id: %d", thread_pool_id, i); + return false; + } + int ret = SetAffinity(thread->pthread, &mask); + if (ret != RET_TP_OK) { + LOG_INFO("set thread affinity failed"); + return RET_TP_ERROR; + } + } + } else { + CPU_ZERO(&mask); + for (int i = 0; i < gHigNum + gMidNum; ++i) { + CPU_SET(cpu_cores[i], &mask); + } + for (int i = 0; i < thread_pool->thread_num - 1; ++i) { + Thread *thread = GetThread(thread_pool_id, i); + if (thread == NULL) { + LOG_INFO("get thread failed, thread_pool_id: %d, thread_id: %d", thread_pool_id, i); + return false; + } + int ret = SetAffinity(thread->pthread, &mask); + if (ret != RET_TP_OK) { + LOG_INFO("set thread affinity failed"); + return RET_TP_ERROR; + } + } + } + LOG_INFO("BindSalverThreads success"); + return RET_TP_OK; +} +#endif + +int BindThreads(int thread_pool_id, bool is_bind, int mode) { +#ifdef BIND_CORE + if (mode == NO_BIND_MODE) { + return RET_TP_OK; + } + ThreadPool *thread_pool = GetInstance(thread_pool_id); + if (thread_pool == NULL) { + LOG_INFO("get thread pool instane failed"); + return RET_TP_ERROR; + } + thread_pool->mode = mode; + int ret = BindMasterThread(thread_pool_id, is_bind); + if (ret != RET_TP_OK) { + LOG_INFO("bind master thread failed."); + } + ret = BindSalverThreads(thread_pool_id, is_bind); + if (ret != RET_TP_OK) { + LOG_INFO("bind salver thread failed."); + } + return ret; +#else + return RET_TP_OK; +#endif +} + +bool PushTaskToQueue(int 
thread_pool_id, int thread_id, Task *task) { + Thread *thread = GetThread(thread_pool_id, thread_id); + if (thread == NULL) { + LOG_INFO("get thread failed, thread_pool_id: %d, thread_id: %d", thread_pool_id, thread_id); + return false; + } + const int tail_index = atomic_load_explicit(&thread->tail, memory_order_relaxed); + int next = (tail_index + 1) % MAX_TASK_NUM; + if (next == atomic_load_explicit(&thread->head, memory_order_acquire)) { + return false; + } + thread->task_list[tail_index] = task; + atomic_store_explicit(&thread->tail, next, memory_order_release); + atomic_fetch_add_explicit(&thread->task_size, 1, memory_order_relaxed); + // atomic_store_explicit(&thread->task_size, thread->task_size + 1, memory_order_relaxed); + sem_post(&thread->sem); + return true; +} + +bool PopTaskFromQueue(Thread *thread, Task **task) { + if (thread == NULL) { + LOG_INFO("thread is nullptr"); + return false; + } + if (thread->task_size == 0) { + return false; + } + const int head_index = atomic_load_explicit(&thread->head, memory_order_relaxed); + if (head_index == atomic_load_explicit(&thread->tail, memory_order_acquire)) { + return false; + } + *task = thread->task_list[head_index]; + atomic_store_explicit(&thread->head, (head_index + 1) % MAX_TASK_NUM, memory_order_release); + return true; +} + +void WaitAllThread(int thread_pool_id) { + ThreadPool *thread_pool = GetInstance(thread_pool_id); + if (thread_pool == NULL) { + LOG_INFO("get thread pool instane failed"); + return; + } + bool k_success_flag = false; + while (!k_success_flag) { + k_success_flag = true; + for (int i = 0; i < thread_pool->thread_num - 1; ++i) { + Thread *thread = GetThread(thread_pool_id, i); + if (thread == NULL) { + LOG_INFO("get thread failed, thread_pool_id: %d, thread_id: %d", thread_pool_id, i); + return; + } + if (thread->task_size != 0) { + k_success_flag = false; + break; + } + } + } +} + +int DistributeTask(int thread_pool_id, Task *task, int task_num) { + ThreadPool *thread_pool = GetInstance(thread_pool_id); + if (thread_pool == NULL) { + LOG_INFO("get thread pool instane failed"); + return RET_TP_ERROR; + } + if (task_num > thread_pool->thread_num || task_num <= 1) { + LOG_INFO("invalid task num: %d, thread num: %d", task_num, thread_pool->thread_num); + return RET_TP_ERROR; + } + bool k_success_flag = false; + int size = thread_pool->thread_num < task_num ? 
thread_pool->thread_num : task_num; + for (int i = 0; i < size - 1; ++i) { + do { + k_success_flag = true; + if (!PushTaskToQueue(thread_pool_id, i, task)) { + k_success_flag = false; + } + } while (!k_success_flag); + } + // master thread + task->func(task->content, size - 1); + // wait + WaitAllThread(thread_pool_id); + return RET_TP_OK; +} + +int AddTask(int thread_pool_id, int func(void *, int), void *content, int task_num) { + ThreadPool *thread_pool = GetInstance(thread_pool_id); + if (thread_pool == NULL) { + LOG_INFO("get thread pool instane failed"); + return RET_TP_ERROR; + } + // if single thread, run master thread + if (thread_pool->thread_num <= 1 || task_num <= 1) { + for (int i = 0; i < task_num; ++i) { + func(content, i); + } + return RET_TP_OK; + } + Task task; + task.func = func; + task.content = content; + return DistributeTask(thread_pool_id, &task, task_num); +} + +int ParallelLaunch(int thread_pool_id, int (*func)(void *, int), void *content, int task_num) { + return AddTask(thread_pool_id, func, content, task_num); +} + +void ThreadRun(Thread *thread) { + ThreadPool *thread_pool = GetInstance(thread->thread_pool_id); + if (thread_pool == NULL) { + LOG_INFO("get thread pool instane failed"); + return; + } + Task *task = NULL; + int thread_id = thread->thread_id; + int spin_count = 0; + thread->is_running = true; + while (thread_pool->is_alive) { + while (thread->activate) { + if (PopTaskFromQueue(thread, &task)) { + task->func(task->content, thread_id); + atomic_fetch_sub_explicit(&thread->task_size, 1, memory_order_relaxed); + // atomic_store_explicit(&thread->task_size, thread->task_size - 1, memory_order_relaxed); + spin_count = 0; + sem_trywait(&thread->sem); + } else { + sched_yield(); + spin_count++; + } + if (spin_count == DEFAULT_SPIN_COUNT) { + break; + } + } + sem_wait(&thread->sem); + } + thread->is_running = false; +} + +void PushThreadToList(int thread_pool_id, Thread *thread) { + ThreadPool *thread_pool = GetInstance(thread_pool_id); + if (thread_pool == NULL) { + LOG_INFO("get thread pool instane failed"); + return; + } + ThreadList *thread_list = thread_pool->thread_list; + if (thread_list == NULL) { + LOG_INFO("thread list is null"); + DestroyThreadPool(thread_pool_id); + return; + } + pthread_mutex_lock(&thread_list->lock); + if (thread_list->size == 0) { + thread_list->head = thread; + thread_list->tail = thread; + } else { + thread_list->tail->next = thread; + thread_list->tail = thread; + } + thread_list->size++; + pthread_mutex_unlock(&thread_list->lock); +} + +int CreateNewThread(int thread_pool_id, int thread_id) { + LOG_INFO("thread_pool_id: %d, create thread: %d", thread_pool_id, thread_id); + Thread *thread = (Thread *)malloc(sizeof(Thread)); + if (thread == NULL) { + LOG_INFO("create thread failed"); + DestroyThreadPool(thread_pool_id); + return RET_TP_ERROR; + } + thread->thread_pool_id = thread_pool_id; + thread->thread_id = thread_id; + thread->head = ATOMIC_VAR_INIT(0); + thread->tail = ATOMIC_VAR_INIT(0); + thread->task_size = ATOMIC_VAR_INIT(0); + thread->activate = ATOMIC_VAR_INIT(true); + thread->is_running = ATOMIC_VAR_INIT(false); + thread->next = NULL; + sem_init(&thread->sem, 0, 0); + PushThreadToList(thread_pool_id, thread); + pthread_create(&thread->pthread, NULL, (void *)ThreadRun, thread); + pthread_detach(thread->pthread); + return RET_TP_OK; +} + +int ReConfigThreadPool(int thread_pool_id, int thread_num, int mode) { + LOG_INFO("reconfig thread pool, thread_pool_id: %d, thread_num: %d, mode: %d", thread_pool_id, 
thread_num, mode); + if (thread_num <= 0 || thread_num > MAX_THREAD_NUM) { + LOG_INFO("invalid thread num: %d", thread_num); + return RET_TP_ERROR; + } + ThreadPool *thread_pool = GetInstance(thread_pool_id); + if (thread_pool == NULL) { + LOG_INFO("get thread pool instane failed"); + return RET_TP_ERROR; + } + if (thread_num <= thread_pool->thread_num) { + LOG_INFO("no need to add thread"); + return RET_TP_OK; + } + int curr_thread_num = thread_pool->thread_num; + thread_pool->thread_num = thread_num > MAX_THREAD_NUM ? MAX_THREAD_NUM : thread_num; + thread_pool->mode = mode; + if (thread_pool->thread_list == NULL) { + thread_pool->thread_list = (ThreadList *)malloc(sizeof(ThreadList)); + if (thread_pool->thread_list == NULL) { + LOG_INFO("create thread list failed"); + DestroyThreadPool(thread_pool_id); + return RET_TP_ERROR; + } + thread_pool->thread_list->head = NULL; + thread_pool->thread_list->tail = NULL; + thread_pool->thread_list->size = 0; + pthread_mutex_init(&thread_pool->thread_list->lock, NULL); + } + int add_thread_num = thread_pool->thread_num - curr_thread_num; + for (int i = curr_thread_num - 1, j = 0; j < add_thread_num; ++i, ++j) { + int ret = CreateNewThread(thread_pool_id, i); + if (ret != RET_TP_OK) { + LOG_INFO("create new thread failed"); + return RET_TP_ERROR; + } + } + return BindThreads(thread_pool_id, true, mode); +} + +int CreateThreadPool(int thread_pool_id, int thread_num, int mode) { + LOG_INFO("create thread pool, thread_pool_id: %d, thread_num: %d, mode: %d", thread_pool_id, thread_num, mode); + if (thread_num <= 0 || thread_num > MAX_THREAD_NUM) { + LOG_INFO("invalid thread num: %d", thread_num); + return RET_TP_ERROR; + } +#ifdef BIND_CORE + if (run_once) { + SortCpuProcessor(); + run_once = false; + } +#endif + ThreadPool *thread_pool = GetInstance(thread_pool_id); + if (thread_pool == NULL) { + LOG_INFO("get thread pool instane failed"); + return RET_TP_ERROR; + } + thread_pool->thread_num = thread_num > MAX_THREAD_NUM ? 
MAX_THREAD_NUM : thread_num; + thread_pool->is_alive = ATOMIC_VAR_INIT(true); + thread_pool->mode = mode; + thread_pool->thread_list = NULL; + if (thread_num > 1) { + thread_pool->thread_list = (ThreadList *)malloc(sizeof(ThreadList)); + if (thread_pool->thread_list == NULL) { + LOG_INFO("create thread list failed"); + DestroyThreadPool(thread_pool_id); + return RET_TP_ERROR; + } + thread_pool->thread_list->head = NULL; + thread_pool->thread_list->tail = NULL; + thread_pool->thread_list->size = 0; + pthread_mutex_init(&thread_pool->thread_list->lock, NULL); + } + for (int i = 0; i < thread_pool->thread_num - 1; ++i) { + int ret = CreateNewThread(thread_pool_id, i); + if (ret != RET_TP_OK) { + LOG_INFO("create thread %d failed", i); + DestroyThreadPool(thread_pool_id); + return RET_TP_ERROR; + } + } + return RET_TP_OK; +} + +int ConfigThreadPool(int thread_pool_id, int thread_num, int mode) { + LOG_INFO("config: thread_pool_id: %d, thread_num: %d, mode: %d, is_created: %d, refcount: %d", thread_pool_id, + thread_num, mode, thread_pool_is_created[thread_pool_id], thread_pool_refcount[thread_pool_id]); + if (thread_pool_id >= MAX_THREAD_POOL_NUM) { + LOG_INFO("invalid context id: %d", thread_pool_id); + return RET_TP_ERROR; + } + if (thread_num <= 0 || thread_num > MAX_THREAD_NUM) { + LOG_INFO("invalid thread num: %d", thread_num); + return RET_TP_ERROR; + } + thread_pool_refcount[thread_pool_id] += 1; + int ret; + if (thread_pool_is_created[thread_pool_id]) { + ret = ReConfigThreadPool(thread_pool_id, thread_num, mode); + if (ret != RET_TP_OK) { + LOG_INFO("reconfig thread pool failed, thread_pool_id: %d, thread_num: %d, mode: %d", thread_pool_id, thread_num, + mode); + } + } else { + thread_pool_is_created[thread_pool_id] = true; + ret = CreateThreadPool(thread_pool_id, thread_num, mode); + if (ret != RET_TP_OK) { + LOG_INFO("create thread pool failed, thread_pool_id: %d, thread_num: %d, mode: %d", thread_pool_id, thread_num, + mode); + } + } + return ret; +} + +void ActivateThreadPool(int thread_pool_id) { + ThreadPool *thread_pool = GetInstance(thread_pool_id); + if (thread_pool == NULL) { + LOG_INFO("get thread pool instane failed"); + return; + } + ThreadList *thread_list = thread_pool->thread_list; + if (thread_list == NULL) { + LOG_INFO("thread pool: %d list is null", thread_pool_id); + return; + } + Thread *thread = thread_list->head; + while (thread != NULL) { + sem_post(&thread->sem); + thread->activate = true; + thread = thread->next; + } +} + +void DeactivateThreadPool(int thread_pool_id) { + ThreadPool *thread_pool = GetInstance(thread_pool_id); + if (thread_pool == NULL) { + LOG_INFO("get thread pool instane failed"); + return; + } + ThreadList *thread_list = thread_pool->thread_list; + if (thread_list == NULL) { + LOG_INFO("thread pool: %d list is null", thread_pool_id); + return; + } + Thread *thread = thread_list->head; + while (thread != NULL) { + thread->activate = false; + thread = thread->next; + } +} + +void DestroyThreadPool(int thread_pool_id) { + thread_pool_refcount[thread_pool_id]--; + if (thread_pool_refcount[thread_pool_id] > 0) { + LOG_INFO("no need to free, thread_pool_id: %d, refcount: %d", thread_pool_id, thread_pool_refcount[thread_pool_id]); + return; + } + ThreadPool *thread_pool = GetInstance(thread_pool_id); + if (thread_pool == NULL) { + LOG_INFO("get thread pool instane failed"); + return; + } + if (thread_pool->thread_list == NULL) { + LOG_INFO("thread pool: %d list is null", thread_pool_id); + return; + } + DeactivateThreadPool(thread_pool_id); + 
thread_pool_is_created[thread_pool_id] = false; + thread_pool->is_alive = false; + for (int i = 0; i < thread_pool->thread_num - 1; ++i) { + Thread *thread = GetThread(thread_pool_id, i); + if (thread != NULL) { + FreeThread(thread_pool->thread_list, thread); + } + } + free(thread_pool->thread_list); + thread_pool->thread_list = NULL; + LOG_INFO("destroy thread pool success, thread_pool_id: %d, refcount: %d", thread_pool_id, + thread_pool_refcount[thread_pool_id]); +} + +int GetCurrentThreadNum(int thread_pool_id) { + ThreadPool *thread_pool = GetInstance(thread_pool_id); + if (thread_pool == NULL) { + LOG_INFO("get thread pool instane failed"); + return 0; + } + return thread_pool->thread_num; +} diff --git a/mindspore/lite/src/runtime/thread_pool.cc b/mindspore/lite/src/runtime/thread_pool.cc deleted file mode 100644 index ecbad2772f..0000000000 --- a/mindspore/lite/src/runtime/thread_pool.cc +++ /dev/null @@ -1,464 +0,0 @@ -/** - * Copyright 2020 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "src/runtime/thread_pool.h" -#include -#include "utils/log_adapter.h" -#ifdef MS_COMPILE_IOS -#include -#include -#include -#endif // MS_COMPILE_IOS - -namespace mindspore { -namespace predict { -constexpr int kDefaultBigCount = 2; -constexpr int kDefaultMidCount = 2; -constexpr uint32_t kDefaultSpinCount = 300000; -constexpr int kSmallCpuNum = 4; -constexpr int kBigMidCpuNum = 4; -constexpr int kDefaultThreadNum = 1; -static unsigned int kDefaultMaxThreadNums = 8; -static unsigned int localMaxThreadNums = 1; -static ThreadPool globalThreadPool; - -ThreadPool *GlobalThreadPool() { return &globalThreadPool; } - -bool LiteQueue::Enqueue(ThreadPoolTask *task) { - const int tailIndex = tail.load(std::memory_order_relaxed); - // queue full - auto next = (tailIndex + 1) % kSingleThreadMaxTask; - if (next == head.load(std::memory_order_acquire)) { - return false; - } - buffer[tailIndex] = task; - tail.store(next, std::memory_order_release); - ++taskSize; - return true; -} - -bool LiteQueue::Dequeue(ThreadPoolTask **out) { - if (taskSize == 0) { - return false; - } - // queue empty - const int headIndex = head.load(std::memory_order_relaxed); - if (headIndex == tail.load(std::memory_order_acquire)) { - return false; - } - *out = buffer[headIndex]; - head.store((headIndex + 1) % kSingleThreadMaxTask, std::memory_order_release); - return true; -} - -bool LiteThreadBind::Bind(bool ifBind, int numThreads, bool master) { - if (master) { - if (!BindMasterThread(ifBind, bindModel)) { - MS_LOG(ERROR) << "bind msater thread failed"; - return false; - } - MS_LOG(DEBUG) << "bind master thread successful"; - } - if (numThreads > static_cast(sortedCpuIds.size())) { - MS_LOG(ERROR) << "thread num " << numThreads << " is larger than cores " << static_cast(sortedCpuIds.size()) - << " in the system"; - return true; - } - - if (!BindThreads(ifBind)) { - MS_LOG(ERROR) << "action " << ifBind << " thread failed"; - return false; - } - MS_LOG(DEBUG) << "action " << ifBind 
<< " thread successful"; - return true; -} - -void LiteThreadBind::InitSortedCpuId() { - // mate10(970)|p20(970): 4big, 4small - // mate20(980)|p30(980)|mate30(990): 2big, 2mid, 4small - // note: p30's core 7 not allowed to be bind - int numCores = 0; -#ifdef MS_COMPILE_IOS - size_t len = sizeof(numCores); - sysctlbyname("hw.ncpu", &numCores, &len, NULL, 0); - numCores = numCores > 1 ? numCores : 1; -#else - numCores = static_cast(std::thread::hardware_concurrency()); -#endif // MS_COMPILE_IOS - if (numCores < 0) { - MS_LOG(ERROR) << "get numCores return invalid value: " << numCores; - sortedCpuIds.clear(); - return; - } - if (numCores < kBigMidCpuNum) { - bigCore = 0; - midCore = numCores; - } else { - bigCore = kDefaultBigCount; - midCore = kDefaultMidCount; - } - sortedCpuIds.clear(); - for (int i = numCores - 1; i >= 0; --i) { - sortedCpuIds.emplace_back(i); - } - if (sortedCpuIds.size() > kSmallCpuNum) { - sortedCpuIds.resize(bigCore + midCore); - } -} - -bool LiteThreadBind::BindMasterThread(bool bindFlag, int mode) { - std::vector cpu; - if (bindFlag) { - size_t cpuIndex; - if (mode == MID_CORE) { - cpuIndex = sortedCpuIds.size() - 1; - } else { - cpuIndex = 0; - } - cpu.emplace_back(sortedCpuIds[cpuIndex]); - } else { - // unbind master - cpu.assign(sortedCpuIds.begin(), sortedCpuIds.end()); - } - cpu_set_t cpuSet; -#ifndef CPU_SET - (void)memset(&cpuSet, 0, sizeof(cpu_set_t)); -#else - CPU_ZERO(&cpuSet); -#endif - for (auto coreId : cpu) { -#ifndef CPU_SET - CPU_SET_LOCAL(coreId, &cpuSet); -#else - CPU_SET(coreId, &cpuSet); -#endif - } - if (!SetCPUBind(pthread_self(), &cpuSet)) { - MS_LOG(ERROR) << "do master bind failed. mode: " << mode; - return false; - } - return true; -} - -bool LiteThreadBind::BindThreads(bool bindFlag) { - if (bindFlag && bindModel != NO_BIND) { - size_t bindNums = std::min(sortedCpuIds.size(), threadIdList.size()); - cpu_set_t cpuSet; - size_t coreIndex; - for (size_t i = 0; i < bindNums; ++i) { -#ifndef CPU_SET - (void)memset(&cpuSet, 0, sizeof(cpu_set_t)); -#else - CPU_ZERO(&cpuSet); -#endif - if (bindModel == MID_CORE) { - coreIndex = sortedCpuIds.size() - 2 - i; - } else { - coreIndex = i + 1; - } -#ifndef CPU_SET - CPU_SET_LOCAL(sortedCpuIds[coreIndex], &cpuSet); -#else - CPU_SET(sortedCpuIds[coreIndex], &cpuSet); -#endif - if (!SetCPUBind(threadIdList[i], &cpuSet)) { - MS_LOG(ERROR) << "do SetCPUBind failed"; - return false; - } - } - } else { - // unbind - size_t bindNums = std::min(sortedCpuIds.size(), threadIdList.size()); - cpu_set_t cpuSet; -#ifndef CPU_SET - (void)memset(&cpuSet, 0, sizeof(cpu_set_t)); -#else - CPU_ZERO(&cpuSet); -#endif - for (auto coreId : sortedCpuIds) { -#ifndef CPU_SET - CPU_SET_LOCAL(coreId, &cpuSet); -#else - CPU_SET(coreId, &cpuSet); -#endif - } - for (size_t i = 0; i < bindNums; ++i) { - if (!SetCPUBind(threadIdList[i], &cpuSet)) { - MS_LOG(ERROR) << "do SetCPUBind failed"; - return false; - } - } - } - return true; -} - -bool LiteThreadBind::SetCPUBind(pthread_t threadId, cpu_set_t *cpuSet) { -#if defined(__ANDROID__) -#if __ANDROID_API__ >= 21 - int ret = sched_setaffinity(pthread_gettid_np(threadId), sizeof(cpu_set_t), cpuSet); - if (ret != 0) { - MS_LOG(ERROR) << "bind thread " << threadId << "to cpu failed.ERROR " << ret; - } -#endif -#else -#ifdef __APPLE__ - MS_LOG(ERROR) << "not bind thread to apple's cpu."; - return false; -#else -#ifndef _WIN32 - int ret = pthread_setaffinity_np(threadId, sizeof(cpuSet), cpuSet); - if (ret != 0) { - MS_LOG(ERROR) << "bind thread " << threadId << " to cpu failed.ERROR " << 
ret; - return false; - } -#endif -#endif // __APPLE__ -#endif - return true; -} - -bool ThreadPool::SetThreadPool() { - std::lock_guard Lock(poolMutex); - if (configThreadNums <= 0) { - MS_LOG(WARNING) << "numThreads " << configThreadNums << ", must be greater than 0"; - configThreadNums = curThreadRunNums; - } - if (localMaxThreadNums == 0) { - localMaxThreadNums = 1; - } else if (localMaxThreadNums > kDefaultMaxThreadNums) { - localMaxThreadNums = kDefaultMaxThreadNums; - } - if (configThreadNums > static_cast(kDefaultMaxThreadNums)) { - configThreadNums = kDefaultMaxThreadNums; - } - int addNum = 0; - if (configThreadNums > static_cast(kDefaultMaxThreadNums)) { - addNum = configThreadNums - curThreadRunNums; - } else if (static_cast(localMaxThreadNums) > curThreadNums) { - addNum = localMaxThreadNums - curThreadNums; - } - AddNewThread(addNum); - if (curThreadRunNums > static_cast(localMaxThreadNums)) { - SubRunThread(localMaxThreadNums); - } else { - AddRunThread(localMaxThreadNums); - } - return true; -} - -void ThreadPool::AddNewThread(int newNums) { - for (int i = curThreadNums - 1, j = 0; j < newNums; ++i, ++j) { - auto active = new std::atomic_bool{true}; - auto queue = std::make_shared(); - threadList.emplace_back([this, i, active, queue]() { - ThreadPoolTask *task = nullptr; - uint32_t spin_count = 0; - while (!exitRun) { - while (*active) { - if (queue->Dequeue(&task)) { - auto ret = task->first(i + 1, task->second.tvmParam, task->second.cdata); - if (ret != 0) { - errorInfo.emplace_back(std::make_pair(i + 1, std::make_pair(false, ret))); - } - queue->taskSize--; - spin_count = 0; - } else { - ++spin_count; - } - if (spin_count == kDefaultSpinCount) { - *(activateList[i]) = false; - --curThreadRunNums; - spin_count = 0; - break; - } - std::this_thread::yield(); - } - std::unique_lock queueLock(tMutex); - queueReady.wait(queueLock, [active, this] { return exitRun || *active; }); - } - }); - activateList.emplace_back(active); - queueList.emplace_back(queue); - } - curThreadNums += newNums; - curThreadRunNums += newNums; -} - -bool ThreadPool::SetThreadCpuBind(bool ifBind, int mode, bool master) { - if (curThreadRunNums <= 0) { - MS_LOG(ERROR) << "no threads need to be bind, totalThreadNum : " << curThreadRunNums; - return false; - } - if (threadBind == nullptr) { - threadBind = std::unique_ptr(new LiteThreadBind()); - if (threadBind == nullptr) { - MS_LOG(ERROR) << "create threadBind failed"; - return false; - } - threadBind->threadIdList.resize(kDefaultMaxThreadNums); - threadBind->InitSortedCpuId(); - } - threadBind->threadIdList.clear(); - for (auto &it : threadList) { - threadBind->threadIdList.emplace_back(it.native_handle()); - } - threadBind->bindModel = static_cast(mode); - if (!threadBind->Bind(ifBind, curThreadRunNums, master)) { - MS_LOG(ERROR) << "bind failed"; - return false; - } - return true; -} - -bool ThreadPool::AddTask(WorkFun &&worker, void *cdata, int numTask) { - if (numTask <= 0) { - numTask = curThreadRunNums; - } - TvmEnv env{}; - env.num_task = numTask; - errorInfo.clear(); - // single task, run master thread - if (curThreadRunNums <= 1) { - for (int i = 0; i < numTask; ++i) { - int ret = worker(i, &env, cdata); - if (ret != 0) { - errorInfo.emplace_back(std::make_pair(0, std::make_pair(false, ret))); - } - } - return CheckResult(); - } - ThreadPoolTask task; - task.first = std::move(worker); - task.second.cdata = cdata; - task.second.tvmParam = &env; - return DistributeTask(&task, numTask); -} - -bool ThreadPool::DistributeTask(ThreadPoolTask *task, int 
numTask) { - auto taskOri = *task; - if (numTask > curThreadRunNums) { - task->first = [taskOri, numTask, this](int task_id, TvmEnv *penv, void *cdata) -> int { - for (int i = task_id; i < numTask; i += curThreadRunNums) { - int ret = taskOri.first(i, penv, cdata); - if (ret != 0) { - errorInfo.emplace_back(std::make_pair(i + 1, std::make_pair(false, ret))); - } - } - return 0; - }; - } - bool kSuccFlag; - auto size = std::min(curThreadRunNums, numTask); - for (int i = 0; i < size - 1; ++i) { - do { - kSuccFlag = true; - if (!queueList[i]->Enqueue(task)) { - std::this_thread::yield(); - kSuccFlag = false; - } - } while (!kSuccFlag); - } - // master thread - int ret = task->first(0, task->second.tvmParam, task->second.cdata); - if (ret != 0) { - errorInfo.emplace_back(std::make_pair(0, std::make_pair(false, ret))); - } - kSuccFlag = false; - while (!kSuccFlag) { - std::this_thread::yield(); - kSuccFlag = true; - for (int i = 0; i < curThreadRunNums - 1; ++i) { - if (queueList[i]->taskSize != 0) { - kSuccFlag = false; - break; - } - } - } - return CheckResult(); -} - -void ThreadPool::AddRunThread(int num) { - int activeNums = num - curThreadRunNums; - if (activeNums <= 0 || static_cast(activateList.size()) < activeNums) { - return; - } - for (int i = curThreadRunNums - 1, j = 0; j < activeNums; ++i, ++j) { - *activateList[i] = true; - } - std::lock_guard queueLock(tMutex); - queueReady.notify_all(); - curThreadRunNums = num; -} - -void ThreadPool::SubRunThread(int num) { - int deactiveNums = curThreadRunNums - num; - if (deactiveNums <= 0) { - return; - } - for (int i = num - 1, j = 0; j < deactiveNums; ++i, ++j) { - *activateList[i] = false; - } - curThreadRunNums = num; -} - -bool ThreadPool::CheckResult() { - bool kSuccFlag = true; - for (auto result : errorInfo) { - if (result.second.first) { - MS_LOG(ERROR) << "task " << result.first << " failed, error code is " << result.second.second; - kSuccFlag = false; - } - } - return kSuccFlag; -} - -bool ThreadPool::LaunchWork(WorkFun worker, void *cdata, int numTask) { - if (!SetThreadPool()) { - return false; - } - return AddTask(std::move(worker), cdata, numTask); -} - -bool ThreadPool::BindAllThreads(bool ifBind, int mode, bool master) { - if (!SetThreadPool()) { - return false; - } - return SetThreadCpuBind(ifBind, mode, master); -} - -void ThreadPool::ConfigThreadPool(int mode, int numThreads) { - configBindMode = mode; - configThreadNums = numThreads; -} - -void ThreadPool::ConfigMaxThreadNum(unsigned int num) { localMaxThreadNums = num; } - -ThreadPool::~ThreadPool() { - curThreadRunNums = static_cast(threadList.size() + 1); - exitRun = true; - SubRunThread(kDefaultThreadNum); - queueReady.notify_all(); - for (auto &it : threadList) { - if (it.joinable()) { - it.join(); - } - } - for (const auto &it : activateList) { - delete it; - } -} -} // namespace predict -} // namespace mindspore diff --git a/mindspore/lite/src/runtime/thread_pool.h b/mindspore/lite/src/runtime/thread_pool.h index 6670f7a932..d537aaaf02 100644 --- a/mindspore/lite/src/runtime/thread_pool.h +++ b/mindspore/lite/src/runtime/thread_pool.h @@ -17,111 +17,67 @@ #ifndef MINDSPORE_LITE_SRC_RUNTIME_THREAD_POOL_H_ #define MINDSPORE_LITE_SRC_RUNTIME_THREAD_POOL_H_ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "src/runtime/runtime_api.h" +#include -namespace mindspore { -namespace predict { -#ifndef CPU_SET -const int CPU_SETSIZE = 1024; -#define __NCPUBITS (8 * sizeof(uint64_t)) -typedef struct { - uint64_t 
__bits[CPU_SETSIZE / __NCPUBITS]; -} cpu_set_t; +/// \brief BindMode defined for holding bind cpu strategy argument. +typedef enum { + MID_MODE = -1, /**< bind middle cpu first */ + HIGHER_MODE = 1, /**< bind higher cpu first */ + NO_BIND_MODE = 0 /**< no bind */ +} BindMode; -#define CPU_SET_LOCAL(cpu, cpusetp) ((cpusetp)->__bits[(cpu) / __NCPUBITS] |= (1UL << ((cpu) % __NCPUBITS))) -#endif +/// \brief ThreadPoolId defined for specifying which thread pool to use. +typedef enum { + THREAD_POOL_DEFAULT = 0, /**< default thread pool id */ + THREAD_POOL_SECOND = 1, /**< the second thread pool id */ + THREAD_POOL_THIRD = 2, /**< the third thread pool id */ + THREAD_POOL_FOURTH = 3 /**< the fourth thread pool id */ +} ThreadPoolId; -constexpr int kSingleThreadMaxTask = 2; -using TvmEnv = LiteParallelGroupEnv; -using WorkFun = std::function; -using TaskParam = struct Param { - void *cdata; - TvmEnv *tvmParam; -}; -using ThreadPoolTask = std::pair; -enum AffinityMode : int { BIG_CORE = 1, MID_CORE = -1, NO_BIND = 0 }; - -class LiteQueue { - public: - LiteQueue() = default; - ~LiteQueue() = default; - bool Enqueue(ThreadPoolTask *task); - bool Dequeue(ThreadPoolTask **out); - std::atomic_int taskSize = {0}; - - private: - std::atomic_int head = {0}; - std::atomic_int tail = {0}; - ThreadPoolTask *buffer[kSingleThreadMaxTask]{}; -}; +/** + * create thread pool and init + * @param thread_num + * @param mode + */ +int ConfigThreadPool(int thread_pool_id, int thread_num, int mode); -class LiteThreadBind { - public: - LiteThreadBind() = default; - ~LiteThreadBind() = default; - void InitSortedCpuId(); - bool Bind(bool ifBind, int numThreads, bool master); - AffinityMode bindModel = MID_CORE; - std::vector threadIdList; +/** + * + * @param session_index, support multi session + * @param job + * @param content + * @param task_num + */ +int ParallelLaunch(int thread_pool_id, int (*job)(void *, int), void *content, int task_num); - private: - bool BindMasterThread(bool bindFlag, int mode); - bool BindThreads(bool bindFlag); - bool SetCPUBind(pthread_t threadId, cpu_set_t *cpuSet); - int bigCore = 0; - int midCore = 0; - std::vector sortedCpuIds{}; -}; +/** + * bind each thread to specified cpu core + * @param is_bind + * @param mode + */ +int BindThreads(int thread_pool_id, bool is_bind, int mode); -class ThreadPool { - public: - ThreadPool() = default; - ~ThreadPool(); - bool LaunchWork(WorkFun worker, void *cdata, int numTask); - void ConfigThreadPool(int mode, int numThreads); - void ConfigMaxThreadNum(unsigned int num); - bool BindAllThreads(bool ifBind, int mode, bool master = true); - ThreadPool(const ThreadPool &) = delete; - ThreadPool &operator=(const ThreadPool &) = delete; +/** + * activate the thread pool + * @param thread_pool_id + */ +void ActivateThreadPool(int thread_pool_id); - private: - bool SetThreadPool(); - void AddNewThread(int newNums); - bool SetThreadCpuBind(bool ifBind, int mode, bool master); - bool AddTask(WorkFun &&worker, void *cdata, int numTask); - bool DistributeTask(ThreadPoolTask *task, int numTask); - void AddRunThread(int num); - void SubRunThread(int num); - bool CheckResult(); +/** + * deactivate the thread pool + * @param thread_pool_id + */ +void DeactivateThreadPool(int thread_pool_id); - std::mutex poolMutex; - std::mutex tMutex; - std::condition_variable queueReady; - std::atomic_bool exitRun = {false}; - std::vector activateList{}; - int curThreadNums = 1; - int curThreadRunNums = 1; - int configThreadNums = 1; - int configBindMode = -1; - std::vector 
diff --git a/mindspore/lite/src/scheduler.cc b/mindspore/lite/src/scheduler.cc
index 6232875309..250ab9b610 100644
--- a/mindspore/lite/src/scheduler.cc
+++ b/mindspore/lite/src/scheduler.cc
@@ -16,6 +16,8 @@
 #include "src/scheduler.h"
 #include
+#include
+#include
 #include "include/errorcode.h"
 #include "src/kernel_registry.h"
 #include "src/common/graph_util.h"
@@ -77,10 +79,10 @@ int Scheduler::ReSizeKernels(const std::vector<kernel::LiteKernel *> &kernels) {
 }
 
 int Scheduler::InferShape(const lite::Model *model, std::vector<tensor::Tensor *> *tensors) {
-  MS_EXCEPTION_IF_NULL(model);
-  MS_EXCEPTION_IF_NULL(tensors);
+  MS_ASSERT(nullptr != model);
+  MS_ASSERT(nullptr != tensors);
   auto meta_graph = model->GetMetaGraph();
-  MS_EXCEPTION_IF_NULL(meta_graph);
+  MS_ASSERT(nullptr != meta_graph);
   bool infer_shape_interrupt = false;
   uint32_t kernelCount = meta_graph->nodes()->size();
   for (uint32_t i = 0; i < kernelCount; i++) {
@@ -121,10 +123,10 @@ int Scheduler::InferShape(const lite::Model *model, std::vector<tensor::Tensor *
 int Scheduler::InitOp2Kernel(const lite::Model *model, std::vector<tensor::Tensor *> *tensors,
                              std::vector<kernel::LiteKernel *> *kernels) {
-  MS_EXCEPTION_IF_NULL(model);
-  MS_EXCEPTION_IF_NULL(tensors);
+  MS_ASSERT(nullptr != model);
+  MS_ASSERT(nullptr != tensors);
   auto meta_graph = model->GetMetaGraph();
-  MS_EXCEPTION_IF_NULL(meta_graph);
+  MS_ASSERT(nullptr != meta_graph);
   uint32_t kernelCount = meta_graph->nodes()->size();
   auto graph_output_node_indexes = GetGraphOutputNodes(meta_graph);
   for (uint32_t i = 0; i < kernelCount; i++) {
@@ -140,7 +142,7 @@ int Scheduler::InitOp2Kernel(const lite::Model *model, std::vector<tensor::Tensor *
       outputs.emplace_back(tensors->at(size_t(outIndexes->GetAs(j))));
     }
     auto *primitive = model->GetOp(cNode->name()->str());
-    auto *kernel = this->ScheduleNode(inputs, outputs, primitive);
+    auto *kernel = this->ScheduleNode(inputs, outputs, primitive, cNode);
     if (nullptr == kernel) {
       MS_LOG(ERROR) << "ScheduleNode return nullptr, name: " << cNode->name()->str()
                     << ", type: " << schema::EnumNamePrimitiveType(cNode->primitive()->value_type());
@@ -176,22 +178,29 @@ void Scheduler::ConstructSubgraphs(std::vector<kernel::LiteKernel *> *kernels) {
   }
 
   std::vector<kernel::LiteKernel *> subgraph_kernels;
+  size_t sub_cnt{0};
   for (auto temp_kernels : sub_kernels_list) {
     kernel::KERNEL_ARCH arch = temp_kernels.front()->desc().arch;
     if (arch == kernel::KERNEL_ARCH::kCPU) {
       for (auto kernel : temp_kernels) {
         for (auto tensor : kernel->out_tensors()) {
           tensor->set_allocator(context_->allocator.get());
-          if (context_->float16_priority && tensor->data_type() == kNumberTypeFloat16) {
-            tensor->set_data_type(kNumberTypeFloat32);
-          }
+        }
+      }
+      std::vector<tensor::Tensor *> output_tensor = kernel::LiteKernelUtil::SubgraphOutputTensors(temp_kernels);
+      for (auto tensor : output_tensor) {
+        if (context_->float16_priority && tensor->data_type() == kNumberTypeFloat16) {
+          tensor->set_data_type(kNumberTypeFloat32);
         }
       }
       std::copy(temp_kernels.begin(), temp_kernels.end(), std::back_inserter(subgraph_kernels));
     } else {
       auto subgraph_kernel = CreateSubKernel(temp_kernels, arch);
       subgraph_kernels.emplace_back(subgraph_kernel);
+      std::string arch_name = (arch == kernel::KERNEL_ARCH::kGPU) ? "GPU" : "NPU";
+      MS_LOG(INFO) << arch_name << " subgraph id " << sub_cnt << " created.";
     }
+    ++sub_cnt;
   }
   kernels->clear();
   kernels->insert(kernels->begin(), subgraph_kernels.begin(), subgraph_kernels.end());
@@ -220,7 +229,7 @@ kernel::LiteKernel *Scheduler::CreateSubKernel(const std::vector<kernel::LiteKernel *> &kernels,
 kernel::LiteKernel *Scheduler::ScheduleNode(const std::vector<tensor::Tensor *> &in_tensors,
                                             const std::vector<tensor::Tensor *> &out_tensors,
-                                            const mindspore::lite::PrimitiveC *primitive) {
+                                            const mindspore::lite::PrimitiveC *primitive, const schema::CNode *cnode) {
   MS_ASSERT(nullptr != primitive);
   auto data_type = in_tensors.front()->data_type();
   kernel::KernelKey desc{kernel::KERNEL_ARCH::kCPU, data_type, static_cast<schema::PrimitiveType>(primitive->Type())};
@@ -230,6 +239,10 @@ kernel::LiteKernel *Scheduler::ScheduleNode(const std::vector<tensor::Tensor *
     if (nullptr != kernel) {
       kernel->set_desc(desc);
       return kernel;
+    } else {
+      MS_LOG(ERROR) << "Unsupported GPU Op "
+                    << schema::EnumNamePrimitiveType(static_cast<schema::PrimitiveType>(primitive->Type())) << " "
+                    << (cnode->name()->str());
     }
   }
 
diff --git a/mindspore/lite/src/scheduler.h b/mindspore/lite/src/scheduler.h
index 0815d9c6be..aa39383c07 100644
--- a/mindspore/lite/src/scheduler.h
+++ b/mindspore/lite/src/scheduler.h
@@ -35,7 +35,8 @@ class Scheduler {
  protected:
   kernel::LiteKernel *ScheduleNode(const std::vector<tensor::Tensor *> &in_tensors,
                                    const std::vector<tensor::Tensor *> &out_tensors,
-                                   const mindspore::lite::PrimitiveC *primitive);
+                                   const mindspore::lite::PrimitiveC *primitive,
+                                   const schema::CNode *cnode);
 
  private:
   int InitOp2Kernel(const lite::Model *model, std::vector<tensor::Tensor *> *tensors,
diff --git a/mindspore/lite/test/CMakeLists.txt b/mindspore/lite/test/CMakeLists.txt
index 7b1e91a602..29687a1f89 100644
--- a/mindspore/lite/test/CMakeLists.txt
+++ b/mindspore/lite/test/CMakeLists.txt
@@ -43,6 +43,7 @@ if(BUILD_CONVERTER)
         ${CMAKE_CURRENT_SOURCE_DIR}/../../core/ir/scope.cc
         ${CMAKE_CURRENT_SOURCE_DIR}/../../core/ir/value.cc
         ${CMAKE_CURRENT_SOURCE_DIR}/../../core/ir/value_extends.cc
+        ${CMAKE_CURRENT_SOURCE_DIR}/../../core/ir/dtype/ref.cc
         ${CMAKE_CURRENT_SOURCE_DIR}/../../core/ir/dtype/container.cc
         ${CMAKE_CURRENT_SOURCE_DIR}/../../core/ir/dtype/empty.cc
         ${CMAKE_CURRENT_SOURCE_DIR}/../../core/ir/dtype/number.cc
@@ -155,9 +156,9 @@ if (SUPPORT_GPU)
         ${LITE_DIR}/src/runtime/kernel/opencl/kernel/transpose.cc
         ${LITE_DIR}/src/runtime/kernel/opencl/kernel/reshape.cc
         ${LITE_DIR}/src/runtime/kernel/opencl/kernel/to_format.cc
-        ${LITE_DIR}/src/runtime/kernel/opencl/kernel/caffe_prelu.cc
         ${LITE_DIR}/src/runtime/kernel/opencl/kernel/prelu.cc
         ${LITE_DIR}/src/runtime/kernel/opencl/kernel/to_format.cc
+        ${LITE_DIR}/src/runtime/kernel/opencl/kernel/biasadd.cc
         )
 endif()
 ### minddata lite
@@ -177,7 +178,7 @@ set(TEST_LITE_SRC
     ${KERNEL_OP_SRC}
     ${LITE_DIR}/src/runtime/allocator.cc
     ${LITE_DIR}/src/runtime/runtime_api.cc
-    ${LITE_DIR}/src/runtime/thread_pool.cc
+    ${LITE_DIR}/src/runtime/thread_pool.c
     ${LITE_DIR}/src/runtime/workspace_pool.cc
     ${LITE_DIR}/src/runtime/parallel_executor.cc
     ${LITE_DIR}/src/ir/tensor.cc
@@ -228,6 +229,7 @@ if(BUILD_CONVERTER)
         ${TEST_LITE_SRC}
         ${TEST_CASE_TFLITE_PARSERS_SRC}
        ${TOP_DIR}/mindspore/core/utils/flags.cc
+        ${LITE_DIR}/tools/common/protobuf_utils.cc
         ${LITE_DIR}/tools/converter/optimizer.cc
         ${LITE_DIR}/tools/converter/anf_transform.cc
         ${LITE_DIR}/tools/converter/graphdef_transform.cc
@@ -244,6 +246,7 @@ if(BUILD_CONVERTER)
         ${LITE_DIR}/tools/optimizer/common/gllo_utils.cc
         ${LITE_DIR}/tools/optimizer/fusion/conv_biasadd_fusion.cc
         ${LITE_DIR}/tools/optimizer/fusion/conv_activation_fusion.cc
+        ${LITE_DIR}/tools/optimizer/fusion/conv_tuple_activation_fusion.cc
        ${LITE_DIR}/tools/optimizer/fusion/conv_transform_fusion.cc
         ${LITE_DIR}/tools/optimizer/fusion/conv_scale_fusion.cc
         ${LITE_DIR}/tools/optimizer/fusion/conv_bn_fusion.cc
@@ -335,9 +338,9 @@ if (SUPPORT_GPU)
         ${TEST_DIR}/ut/src/runtime/kernel/opencl/convolution_tests.cc
         ${TEST_DIR}/ut/src/runtime/kernel/opencl/activation_tests.cc
         ${TEST_DIR}/ut/src/runtime/kernel/opencl/to_format_tests.cc
-        ${TEST_DIR}/ut/src/runtime/kernel/opencl/caffe_prelu_tests.cc
         ${TEST_DIR}/ut/src/runtime/kernel/opencl/prelu_tests.cc
         ${TEST_DIR}/ut/src/runtime/kernel/opencl/reshape_tests.cc
+        ${TEST_DIR}/ut/src/runtime/kernel/opencl/biasadd_tests.cc
     )
 endif()
diff --git a/mindspore/lite/test/models_caffe.cfg b/mindspore/lite/test/models_caffe.cfg
index 44f0ca60c9..39da5005f6 100644
--- a/mindspore/lite/test/models_caffe.cfg
+++ b/mindspore/lite/test/models_caffe.cfg
@@ -16,8 +16,8 @@ tracking
 mtk_isface
 mtk_landmark
 mtk_pose_tuku
-# mtk_face_recognition_v1
-# mtk_2012_ATLANTA_10class_20190614_v41
+mtk_face_recognition_v1
+mtk_2012_ATLANTA_10class_20190614_v41
 mtk_detect-deeper-halfdeeper-mbv1-lastearlySSD-shortcut-400-400_nopostprocess_simplified
 detect-deeper-halfdeeper-mbv1-shortcut-400-400_nopostprocess_simplified
 hiai_face_detect_rfb
@@ -28,16 +28,16 @@ ml_hand_detection
 ml_ocr_cn
 ml_ocr_sfz_detect_0325
 ml_hardware_liveness
-# ml_liveness_detect_landmark
+ml_liveness_detect_landmark
 ml_face_contour
 2012_ATLANTA_1class_20190621_v4.x_nomean
 ml_handpose
 ml_ocr_sfz_add_final_0325
-# ml_hardware_pose
+ml_hardware_pose
 ml_bank_recog
 2012_ATLANTA_10class_20190131_v4.0
 mnet
-# recognition
+recognition
 ml_face_landmark
 model_hebing_3branch
 hiai_cv_focusShootOCRModel_07
@@ -48,9 +48,9 @@ hiai_cv_focusShootOCRModel_04
 hiai_cv_focusShootOCRModel_06
 hiai_cpu_face_hat
 hiai_video_seg
-# hiai_semantic_seg
+hiai_semantic_seg
 hiai_human_seg
-# hiai_face_recognition_1
+hiai_face_recognition_1
 hiai_cpu_face_detect
 hiai_cpu_face_attr
 hiai_face_attr1
diff --git a/mindspore/lite/test/models_fp16.cfg b/mindspore/lite/test/models_fp16.cfg
index e1abfab54c..f229177bcd 100644
--- a/mindspore/lite/test/models_fp16.cfg
+++ b/mindspore/lite/test/models_fp16.cfg
@@ -1,7 +1,7 @@
 detect-deeper-halfdeeper-mbv1-shortcut-400-400_nopostprocess_simplified.fp16
-model_emotions_0727_nosoftmax.fp16
+model_emotions_0727_nosoftmax.tflite.fp16
 mtk_isface.fp16
 mtk_landmark.fp16
 mtk_pose_tuku.fp16
-mtk_age_gender.fp16
-mtk_model_face_dress.fp16
+mtk_age_gender.tflite.fp16
+mtk_model_face_dress.tflite.fp16
diff --git a/mindspore/lite/test/models_tflite.cfg b/mindspore/lite/test/models_tflite.cfg
index 13f1db88dc..b8a8f7c58a 100644
--- a/mindspore/lite/test/models_tflite.cfg
+++ b/mindspore/lite/test/models_tflite.cfg
@@ -56,6 +56,7 @@ nasnet_large.tflite
 model_emotions_0727_nosoftmax.tflite
 inception_resnet_v2.tflite
 ml_ocr_latin.tflite
+hiai_PoseEstimation_Pcm.tflite
 hiai_ssd_mobilenetv2_object.tflite
 hiai_cv_focusShootOCRModel_02.tflite
 hiai_cv_poseEstimation.tflite
@@ -66,11 +67,20 @@ mtk_model_face_dress_fp16.tflite
 mtk_AADB_HADB_MBV2_model_f16.tflite
 mtk_AADB_HADB_MBV3_model_f16.tflite
 mtk_model_emotions_0725_fp16.tflite
+mtk_face_features_v1_fp16.tflite
+siteAI_digcom_AI_ECN.tflite
+siteAI_digcom_g2v_keras.tflite
+siteAI_trans_nonlinear.tflite
+siteAI_trans_tcpclassify.tflite
+siteAI_wireless_depress_w.tflite
+siteAI_wireless_restore_w.tflite
 magenta_arbitrary-image-stylization-v1-256_fp16_prediction_1.tflite
 ml_object_detect.tflite
 hiai_cpu_face_emotion.tflite
 hiai_cpu_face_gazing.tflite
 hiai_cpu_face_headpose.tflite
+hiai_humanDetection.tflite
+hiai_cv_focusShootOCRModel_08.tflite
 ml_face_openclose.tflite
 hiai_face_model_npu.tflite
 hiai_ctpn_feature_map.tflite
diff --git a/mindspore/lite/test/models_tflite_awaretraining.cfg b/mindspore/lite/test/models_tflite_awaretraining.cfg
index 6a72f203e4..66dacbda9a 100644
--- a/mindspore/lite/test/models_tflite_awaretraining.cfg
+++ b/mindspore/lite/test/models_tflite_awaretraining.cfg
@@ -1,2 +1,8 @@
 video_infer.tflite
+mobilenet_v1_1.0_224_quant.tflite
 mobilenet_v2_1.0_224_quant.tflite
+hiai_graph_8bit_combined.tflite
+inception_v1_224_quant.tflite
+inception_v3_quant.tflite
+inception_v4_299_quant.tflite
+graph_8bit_1021_combine.tflite
diff --git a/mindspore/lite/test/run_benchmark_nets.sh b/mindspore/lite/test/run_benchmark_nets.sh
index 389229ab30..ae5f15d651 100644
--- a/mindspore/lite/test/run_benchmark_nets.sh
+++ b/mindspore/lite/test/run_benchmark_nets.sh
@@ -8,16 +8,16 @@ function Run_x86() {
         if [[ $model_name == \#* ]]; then
             continue
         fi
-        echo ${model_name}
-        echo 'cd '${convertor_path}'/mindspore-lite-'${version}'-runtime-x86-'${process_unit_x86}
+        echo ${model_name} >> "${run_benchmark_log_file}"
+        echo 'cd '${convertor_path}'/mindspore-lite-'${version}'-runtime-x86-'${process_unit_x86} >> "${run_benchmark_log_file}"
         cd ${convertor_path}/mindspore-lite-${version}-runtime-x86-${process_unit_x86} || return 1
-        echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:./lib;./benchmark/benchmark --modelPath='${ms_models_path}'/'${model_name}'.ms --inDataPath=/home/workspace/mindspore_dataset/mslite/models/hiai/input_output/input/'${model_name}'.ms.bin --calibDataPath=/home/workspace/mindspore_dataset/mslite/models/hiai/input_output/output/'${model_name}'.ms.out --warmUpLoopCount=1 --loopCount=1' || return 1
-        export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:./lib;./benchmark/benchmark --modelPath=${ms_models_path}/${model_name}.ms --inDataPath=/home/workspace/mindspore_dataset/mslite/models/hiai/input_output/input/${model_name}.ms.bin --calibDataPath=/home/workspace/mindspore_dataset/mslite/models/hiai/input_output/output/${model_name}.ms.out --warmUpLoopCount=1 --loopCount=1
+        echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:./lib;./benchmark/benchmark --modelPath='${ms_models_path}'/'${model_name}'.ms --inDataPath=/home/workspace/mindspore_dataset/mslite/models/hiai/input_output/input/'${model_name}'.ms.bin --calibDataPath=/home/workspace/mindspore_dataset/mslite/models/hiai/input_output/output/'${model_name}'.ms.out --warmUpLoopCount=1 --loopCount=1' >> "${run_benchmark_log_file}"
+        export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:./lib;./benchmark/benchmark --modelPath=${ms_models_path}/${model_name}.ms --inDataPath=/home/workspace/mindspore_dataset/mslite/models/hiai/input_output/input/${model_name}.ms.bin --calibDataPath=/home/workspace/mindspore_dataset/mslite/models/hiai/input_output/output/${model_name}.ms.out --warmUpLoopCount=1 --loopCount=1 >> "${run_benchmark_log_file}"
        if [ $? = 0 ]; then
-            run_result='Run_x86: '${model_name}' pass'
+            run_result='x86: '${model_name}' pass'
             echo ${run_result} >> ${run_benchmark_result_file}
         else
-            run_result='Run_x86: '${model_name}' fail <<===========================this is the failed case'
+            run_result='x86: '${model_name}' failed'
             echo ${run_result} >> ${run_benchmark_result_file}
             return 1
         fi
@@ -29,16 +29,16 @@ function Run_x86() {
         if [[ $model_name == \#* ]]; then
             continue
         fi
-        echo ${model_name}
-        echo 'cd '${convertor_path}'/mindspore-lite-'${version}'-runtime-x86-'${process_unit_x86}
+        echo ${model_name} >> "${run_benchmark_log_file}"
+        echo 'cd '${convertor_path}'/mindspore-lite-'${version}'-runtime-x86-'${process_unit_x86} >> "${run_benchmark_log_file}"
         cd ${convertor_path}/mindspore-lite-${version}-runtime-x86-${process_unit_x86} || return 1
-        echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:./lib;./benchmark/benchmark --modelPath='${ms_models_path}'/'${model_name}'.ms --inDataPath=/home/workspace/mindspore_dataset/mslite/models/hiai/input_output/input/'${model_name}'.ms.bin --calibDataPath=/home/workspace/mindspore_dataset/mslite/models/hiai/input_output/output/'${model_name}'.ms.out --warmUpLoopCount=1 --loopCount=1' || return 1
-        export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:./lib;./benchmark/benchmark --modelPath=${ms_models_path}/${model_name}.ms --inDataPath=/home/workspace/mindspore_dataset/mslite/models/hiai/input_output/input/${model_name}.ms.bin --calibDataPath=/home/workspace/mindspore_dataset/mslite/models/hiai/input_output/output/${model_name}.ms.out --warmUpLoopCount=1 --loopCount=1
+        echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:./lib;./benchmark/benchmark --modelPath='${ms_models_path}'/'${model_name}'.ms --inDataPath=/home/workspace/mindspore_dataset/mslite/models/hiai/input_output/input/'${model_name}'.ms.bin --calibDataPath=/home/workspace/mindspore_dataset/mslite/models/hiai/input_output/output/'${model_name}'.ms.out --warmUpLoopCount=1 --loopCount=1' >> "${run_benchmark_log_file}"
+        export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:./lib;./benchmark/benchmark --modelPath=${ms_models_path}/${model_name}.ms --inDataPath=/home/workspace/mindspore_dataset/mslite/models/hiai/input_output/input/${model_name}.ms.bin --calibDataPath=/home/workspace/mindspore_dataset/mslite/models/hiai/input_output/output/${model_name}.ms.out --warmUpLoopCount=1 --loopCount=1 >> "${run_benchmark_log_file}"
        if [ $? = 0 ]; then
-            run_result='Run_x86: '${model_name}' pass'
+            run_result='x86: '${model_name}' pass'
             echo ${run_result} >> ${run_benchmark_result_file}
         else
-            run_result='Run_x86: '${model_name}' fail <<===========================this is the failed case'
+            run_result='x86: '${model_name}' failed'
             echo ${run_result} >> ${run_benchmark_result_file}
             return 1
         fi
@@ -50,16 +50,16 @@ function Run_x86() {
         if [[ $model_name == \#* ]]; then
             continue
         fi
-        echo ${model_name}
-        echo 'cd '${convertor_path}'/mindspore-lite-'${version}'-runtime-x86-'${process_unit_x86}
+        echo ${model_name} >> "${run_benchmark_log_file}"
+        echo 'cd '${convertor_path}'/mindspore-lite-'${version}'-runtime-x86-'${process_unit_x86} >> "${run_benchmark_log_file}"
         cd ${convertor_path}/mindspore-lite-${version}-runtime-x86-${process_unit_x86} || return 1
-        echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:./lib;./benchmark/benchmark --modelPath='${ms_models_path}'/'${model_name}'.ms --inDataPath=/home/workspace/mindspore_dataset/mslite/models/hiai/input_output/input/'${model_name}'.ms.bin --calibDataPath=/home/workspace/mindspore_dataset/mslite/models/hiai/input_output/output/'${model_name}'.ms.out --warmUpLoopCount=1 --loopCount=1' || return 1
-        export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:./lib;./benchmark/benchmark --modelPath=${ms_models_path}/${model_name}.ms --inDataPath=/home/workspace/mindspore_dataset/mslite/models/hiai/input_output/input/${model_name}.ms.bin --calibDataPath=/home/workspace/mindspore_dataset/mslite/models/hiai/input_output/output/${model_name}.ms.out --warmUpLoopCount=1 --loopCount=1
+        echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:./lib;./benchmark/benchmark --modelPath='${ms_models_path}'/'${model_name}'.ms --inDataPath=/home/workspace/mindspore_dataset/mslite/models/hiai/input_output/input/'${model_name}'.ms.bin --calibDataPath=/home/workspace/mindspore_dataset/mslite/models/hiai/input_output/output/'${model_name}'.ms.out --warmUpLoopCount=1 --loopCount=1' >> "${run_benchmark_log_file}"
+        export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:./lib;./benchmark/benchmark --modelPath=${ms_models_path}/${model_name}.ms --inDataPath=/home/workspace/mindspore_dataset/mslite/models/hiai/input_output/input/${model_name}.ms.bin --calibDataPath=/home/workspace/mindspore_dataset/mslite/models/hiai/input_output/output/${model_name}.ms.out --warmUpLoopCount=1 --loopCount=1 >> "${run_benchmark_log_file}"
        if [ $? = 0 ]; then
-            run_result='Run_x86: '${model_name}' pass'
+            run_result='x86: '${model_name}' pass'
             echo ${run_result} >> ${run_benchmark_result_file}
         else
-            run_result='Run_x86: '${model_name}' fail <<===========================this is the failed case'
+            run_result='x86: '${model_name}' failed'
             echo ${run_result} >> ${run_benchmark_result_file}
             return 1
         fi
@@ -71,16 +71,16 @@ function Run_x86() {
         if [[ $model_name == \#* ]]; then
             continue
         fi
-        echo ${model_name}
-        echo 'cd '${convertor_path}'/mindspore-lite-'${version}'-runtime-x86-'${process_unit_x86}
+        echo ${model_name} >> "${run_benchmark_log_file}"
+        echo 'cd '${convertor_path}'/mindspore-lite-'${version}'-runtime-x86-'${process_unit_x86} >> "${run_benchmark_log_file}"
         cd ${convertor_path}/mindspore-lite-${version}-runtime-x86-${process_unit_x86} || return 1
-        echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:./lib;./benchmark/benchmark --modelPath='${ms_models_path}'/'${model_name}'_posttraining.ms --inDataPath=/home/workspace/mindspore_dataset/mslite/quantTraining/mnist_calibration_data/00099.bin --calibDataPath=/home/workspace/mindspore_dataset/mslite/models/hiai/input_output/output/'${model_name}'_posttraining.ms.out --warmUpLoopCount=1 --loopCount=1' || return 1
-        export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:./lib;./benchmark/benchmark --modelPath=${ms_models_path}/${model_name}_posttraining.ms --inDataPath=/home/workspace/mindspore_dataset/mslite/quantTraining/mnist_calibration_data/00099.bin --calibDataPath=/home/workspace/mindspore_dataset/mslite/models/hiai/input_output/output/${model_name}_posttraining.ms.out --warmUpLoopCount=1 --loopCount=1
+        echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:./lib;./benchmark/benchmark --modelPath='${ms_models_path}'/'${model_name}'_posttraining.ms --inDataPath=/home/workspace/mindspore_dataset/mslite/quantTraining/mnist_calibration_data/00099.bin --calibDataPath=/home/workspace/mindspore_dataset/mslite/models/hiai/input_output/output/'${model_name}'_posttraining.ms.out --warmUpLoopCount=1 --loopCount=1' >> "${run_benchmark_log_file}"
+        export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:./lib;./benchmark/benchmark --modelPath=${ms_models_path}/${model_name}_posttraining.ms --inDataPath=/home/workspace/mindspore_dataset/mslite/quantTraining/mnist_calibration_data/00099.bin --calibDataPath=/home/workspace/mindspore_dataset/mslite/models/hiai/input_output/output/${model_name}_posttraining.ms.out --warmUpLoopCount=1 --loopCount=1 >> "${run_benchmark_log_file}"
        if [ $? = 0 ]; then
-            run_result='Run_x86: '${model_name}'_posttraining pass'
+            run_result='x86: '${model_name}'_posttraining pass'
             echo ${run_result} >> ${run_benchmark_result_file}
         else
-            run_result='Run_x86: '${model_name}'_posttraining fail <<===========================this is the failed case'
+            run_result='x86: '${model_name}'_posttraining failed'
             echo ${run_result} >> ${run_benchmark_result_file}
             return 1
         fi
@@ -92,16 +92,16 @@ function Run_x86() {
         if [[ $model_name == \#* ]]; then
             continue
         fi
-        echo ${model_name}
-        echo 'cd '${convertor_path}'/mindspore-lite-'${version}'-runtime-x86-'${process_unit_x86}
+        echo ${model_name} >> "${run_benchmark_log_file}"
+        echo 'cd '${convertor_path}'/mindspore-lite-'${version}'-runtime-x86-'${process_unit_x86} >> "${run_benchmark_log_file}"
         cd ${convertor_path}/mindspore-lite-${version}-runtime-x86-${process_unit_x86} || return 1
-        echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:./lib;./benchmark/benchmark --modelPath='${ms_models_path}'/'${model_name}'.ms --inDataPath=/home/workspace/mindspore_dataset/mslite/models/hiai/input_output/input/${model_name}.ms.bin --calibDataPath=/home/workspace/mindspore_dataset/mslite/models/hiai/input_output/output/'${model_name}'.ms.out --warmUpLoopCount=1 --loopCount=1 --numThreads=1' || return 1
-        export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:./lib;./benchmark/benchmark --modelPath=${ms_models_path}/${model_name}.ms --inDataPath=/home/workspace/mindspore_dataset/mslite/models/hiai/input_output/input/${model_name}.ms.bin --calibDataPath=/home/workspace/mindspore_dataset/mslite/models/hiai/input_output/output/${model_name}.ms.out --warmUpLoopCount=1 --loopCount=1 --numThreads=1
+        echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:./lib;./benchmark/benchmark --modelPath='${ms_models_path}'/'${model_name}'.ms --inDataPath=/home/workspace/mindspore_dataset/mslite/models/hiai/input_output/input/${model_name}.ms.bin --calibDataPath=/home/workspace/mindspore_dataset/mslite/models/hiai/input_output/output/'${model_name}'.ms.out --warmUpLoopCount=1 --loopCount=1 --numThreads=1' >> "${run_benchmark_log_file}"
+        export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:./lib;./benchmark/benchmark --modelPath=${ms_models_path}/${model_name}.ms --inDataPath=/home/workspace/mindspore_dataset/mslite/models/hiai/input_output/input/${model_name}.ms.bin --calibDataPath=/home/workspace/mindspore_dataset/mslite/models/hiai/input_output/output/${model_name}.ms.out --warmUpLoopCount=1 --loopCount=1 --numThreads=1 >> "${run_benchmark_log_file}"
        if [ $? = 0 ]; then
-            run_result='Run_x86: '${model_name}'_awaretraining pass'
+            run_result='x86: '${model_name}'_awaretraining pass'
             echo ${run_result} >> ${run_benchmark_result_file}
         else
-            run_result='Run_x86: '${model_name}'_awaretraining fail <<===========================this is the failed case'
+            run_result='x86: '${model_name}'_awaretraining failed'
             echo ${run_result} >> ${run_benchmark_result_file}
             return 1
         fi
@@ -113,16 +113,16 @@ function Run_x86() {
         if [[ $model_name == \#* ]]; then
             continue
         fi
-        echo ${model_name}
-        echo 'cd '${convertor_path}'/mindspore-lite-'${version}'-runtime-x86-'${process_unit_x86}
+        echo ${model_name} >> "${run_benchmark_log_file}"
+        echo 'cd '${convertor_path}'/mindspore-lite-'${version}'-runtime-x86-'${process_unit_x86} >> "${run_benchmark_log_file}"
         cd ${convertor_path}/mindspore-lite-${version}-runtime-x86-${process_unit_x86} || return 1
-        echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:./lib;./benchmark/benchmark --modelPath='${ms_models_path}'/'${model_name}'.ms --inDataPath=/home/workspace/mindspore_dataset/mslite/models/hiai/input_output/input/'${model_name}'.ms.bin --calibDataPath=/home/workspace/mindspore_dataset/mslite/models/hiai/input_output/output/'${model_name}'.ms.out --warmUpLoopCount=1 --loopCount=1' || return 1
-        export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:./lib;./benchmark/benchmark --modelPath=${ms_models_path}/${model_name}.ms --inDataPath=/home/workspace/mindspore_dataset/mslite/models/hiai/input_output/input/${model_name}.ms.bin --calibDataPath=/home/workspace/mindspore_dataset/mslite/models/hiai/input_output/output/${model_name}.ms.out --warmUpLoopCount=1 --loopCount=1 --accuracyThreshold=1.5
+        echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:./lib;./benchmark/benchmark --modelPath='${ms_models_path}'/'${model_name}'.ms --inDataPath=/home/workspace/mindspore_dataset/mslite/models/hiai/input_output/input/'${model_name}'.ms.bin --calibDataPath=/home/workspace/mindspore_dataset/mslite/models/hiai/input_output/output/'${model_name}'.ms.out --warmUpLoopCount=1 --loopCount=1' >> "${run_benchmark_log_file}"
+        export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:./lib;./benchmark/benchmark --modelPath=${ms_models_path}/${model_name}.ms --inDataPath=/home/workspace/mindspore_dataset/mslite/models/hiai/input_output/input/${model_name}.ms.bin --calibDataPath=/home/workspace/mindspore_dataset/mslite/models/hiai/input_output/output/${model_name}.ms.out --warmUpLoopCount=1 --loopCount=1 --accuracyThreshold=1.5 >> "${run_benchmark_log_file}"
        if [ $? = 0 ]; then
-            run_result='Run_x86: '${model_name}' pass'
+            run_result='x86: '${model_name}' pass'
             echo ${run_result} >> ${run_benchmark_result_file}
         else
-            run_result='Run_x86: '${model_name}' fail <<===========================this is the failed case'
+            run_result='x86: '${model_name}' failed'
             echo ${run_result} >> ${run_benchmark_result_file}
             return 1
         fi
@@ -137,30 +137,30 @@ function Run_arm64() {
         if [[ $model_name == \#* ]]; then
             continue
         fi
-        echo ${model_name}
+        echo ${model_name} >> "${run_benchmark_log_file}"
         echo 'cd /data/local/tmp/benchmark_test' > adb_run_cmd.txt
-        echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/data/local/tmp/benchmark_test;./benchmark --modelPath='${model_name}'.ms --inDataPath=/data/local/tmp/input_output/input/'${model_name}'.ms.bin --calibDataPath=/data/local/tmp/input_output/output/'${model_name}'.ms.out --warmUpLoopCount=1 --loopCount=1'
+        echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/data/local/tmp/benchmark_test;./benchmark --modelPath='${model_name}'.ms --inDataPath=/data/local/tmp/input_output/input/'${model_name}'.ms.bin --calibDataPath=/data/local/tmp/input_output/output/'${model_name}'.ms.out --warmUpLoopCount=1 --loopCount=1' >> "${run_benchmark_log_file}"
         echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/data/local/tmp/benchmark_test;./benchmark --modelPath='${model_name}'.ms --inDataPath=/data/local/tmp/input_output/input/'${model_name}'.ms.bin --calibDataPath=/data/local/tmp/input_output/output/'${model_name}'.ms.out --warmUpLoopCount=1 --loopCount=1' >> adb_run_cmd.txt
-        adb -s ${device_id} shell < adb_run_cmd.txt
+        adb -s ${device_id} shell < adb_run_cmd.txt >> "${run_benchmark_log_file}"
         if [ $? = 0 ]; then
-            run_result='Run_arm64: '${model_name}' pass'
+            run_result='arm64: '${model_name}' pass'
             echo ${run_result} >> ${run_benchmark_result_file}
         else
-            run_result='Run_arm64:'${model_name}' fail <<===========================this is the failed case'
+            run_result='arm64: '${model_name}' failed'
             echo ${run_result} >> ${run_benchmark_result_file}
             return 1
         fi
         # run benchmark test without calib data
-        echo ${model_name}
+        echo ${model_name} >> "${run_benchmark_log_file}"
         echo 'cd /data/local/tmp/benchmark_test' > adb_run_cmd.txt
-        echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/data/local/tmp/benchmark_test;./benchmark --modelPath='${model_name}'.ms --warmUpLoopCount=1 --loopCount=2'
+        echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/data/local/tmp/benchmark_test;./benchmark --modelPath='${model_name}'.ms --warmUpLoopCount=1 --loopCount=2' >> "${run_benchmark_log_file}"
         echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/data/local/tmp/benchmark_test;./benchmark --modelPath='${model_name}'.ms --warmUpLoopCount=1 --loopCount=2' >> adb_run_cmd.txt
-        adb -s ${device_id} shell < adb_run_cmd.txt
+        adb -s ${device_id} shell < adb_run_cmd.txt >> "${run_benchmark_log_file}"
        if [ $? = 0 ]; then
-            run_result='Run_arm64: '${model_name}' pass'
+            run_result='arm64: '${model_name}' pass'
             echo ${run_result} >> ${run_benchmark_result_file}
         else
-            run_result='Run_arm64:'${model_name}' fail <<===========================this is the failed case'
+            run_result='arm64: '${model_name}' failed'
             echo ${run_result} >> ${run_benchmark_result_file}
             return 1
         fi
@@ -173,30 +173,30 @@ function Run_arm64() {
         if [[ $model_name == \#* ]]; then
             continue
         fi
-        echo ${model_name}
+        echo ${model_name} >> "${run_benchmark_log_file}"
         echo 'cd /data/local/tmp/benchmark_test' > adb_run_cmd.txt
-        echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/data/local/tmp/benchmark_test;./benchmark --modelPath='${model_name}'.ms --inDataPath=/data/local/tmp/input_output/input/'${model_name}'.ms.bin --calibDataPath=/data/local/tmp/input_output/output/'${model_name}'.ms.out --warmUpLoopCount=1 --loopCount=1'
+        echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/data/local/tmp/benchmark_test;./benchmark --modelPath='${model_name}'.ms --inDataPath=/data/local/tmp/input_output/input/'${model_name}'.ms.bin --calibDataPath=/data/local/tmp/input_output/output/'${model_name}'.ms.out --warmUpLoopCount=1 --loopCount=1' >> "${run_benchmark_log_file}"
         echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/data/local/tmp/benchmark_test;./benchmark --modelPath='${model_name}'.ms --inDataPath=/data/local/tmp/input_output/input/'${model_name}'.ms.bin --calibDataPath=/data/local/tmp/input_output/output/'${model_name}'.ms.out --warmUpLoopCount=1 --loopCount=1' >> adb_run_cmd.txt
-        adb -s ${device_id} shell < adb_run_cmd.txt
+        adb -s ${device_id} shell < adb_run_cmd.txt >> "${run_benchmark_log_file}"
         if [ $? = 0 ]; then
-            run_result='Run_arm64:'${model_name}' pass'
+            run_result='arm64: '${model_name}' pass'
             echo ${run_result} >> ${run_benchmark_result_file}
         else
-            run_result='Run_arm64:'${model_name}' fail <<===========================this is the failed case'
+            run_result='arm64: '${model_name}' failed'
             echo ${run_result} >> ${run_benchmark_result_file}
             return 1
         fi
         # run benchmark test without calib data
-        echo ${model_name}
+        echo ${model_name} >> "${run_benchmark_log_file}"
         echo 'cd /data/local/tmp/benchmark_test' > adb_run_cmd.txt
-        echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/data/local/tmp/benchmark_test;./benchmark --modelPath='${model_name}'.ms --warmUpLoopCount=1 --loopCount=2'
+        echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/data/local/tmp/benchmark_test;./benchmark --modelPath='${model_name}'.ms --warmUpLoopCount=1 --loopCount=2' >> "${run_benchmark_log_file}"
         echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/data/local/tmp/benchmark_test;./benchmark --modelPath='${model_name}'.ms --warmUpLoopCount=1 --loopCount=2' >> adb_run_cmd.txt
-        adb -s ${device_id} shell < adb_run_cmd.txt
+        adb -s ${device_id} shell < adb_run_cmd.txt >> "${run_benchmark_log_file}"
        if [ $? = 0 ]; then
-            run_result='Run_arm64:'${model_name}' pass'
+            run_result='arm64: '${model_name}' pass'
             echo ${run_result} >> ${run_benchmark_result_file}
         else
-            run_result='Run_arm64:'${model_name}' fail <<===========================this is the failed case'
+            run_result='arm64: '${model_name}' failed'
             echo ${run_result} >> ${run_benchmark_result_file}
             return 1
         fi
@@ -209,30 +209,30 @@ function Run_arm64() {
         if [[ $model_name == \#* ]]; then
             continue
         fi
-        echo ${model_name}
+        echo ${model_name} >> "${run_benchmark_log_file}"
         echo 'cd /data/local/tmp/benchmark_test' > adb_run_cmd.txt
-        echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/data/local/tmp/benchmark_test;./benchmark --modelPath='${model_name}'.ms --inDataPath=/data/local/tmp/input_output/input/'${model_name}'.ms.bin --calibDataPath=/data/local/tmp/input_output/output/'${model_name}'.ms.out --warmUpLoopCount=1 --loopCount=1'
+        echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/data/local/tmp/benchmark_test;./benchmark --modelPath='${model_name}'.ms --inDataPath=/data/local/tmp/input_output/input/'${model_name}'.ms.bin --calibDataPath=/data/local/tmp/input_output/output/'${model_name}'.ms.out --warmUpLoopCount=1 --loopCount=1' >> "${run_benchmark_log_file}"
         echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/data/local/tmp/benchmark_test;./benchmark --modelPath='${model_name}'.ms --inDataPath=/data/local/tmp/input_output/input/'${model_name}'.ms.bin --calibDataPath=/data/local/tmp/input_output/output/'${model_name}'.ms.out --warmUpLoopCount=1 --loopCount=1' >> adb_run_cmd.txt
-        adb -s ${device_id} shell < adb_run_cmd.txt
+        adb -s ${device_id} shell < adb_run_cmd.txt >> "${run_benchmark_log_file}"
         if [ $? = 0 ]; then
-            run_result='Run_arm64:'${model_name}' pass'
+            run_result='arm64: '${model_name}' pass'
             echo ${run_result} >> ${run_benchmark_result_file}
         else
-            run_result='Run_arm64:'${model_name}' fail <<===========================this is the failed case'
+            run_result='arm64: '${model_name}' failed'
             echo ${run_result} >> ${run_benchmark_result_file}
             return 1
         fi
         # run benchmark test without calib data
-        echo ${model_name}
+        echo ${model_name} >> "${run_benchmark_log_file}"
         echo 'cd /data/local/tmp/benchmark_test' > adb_run_cmd.txt
-        echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/data/local/tmp/benchmark_test;./benchmark --modelPath='${model_name}'.ms --warmUpLoopCount=1 --loopCount=2'
+        echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/data/local/tmp/benchmark_test;./benchmark --modelPath='${model_name}'.ms --warmUpLoopCount=1 --loopCount=2' >> "${run_benchmark_log_file}"
         echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/data/local/tmp/benchmark_test;./benchmark --modelPath='${model_name}'.ms --warmUpLoopCount=1 --loopCount=2' >> adb_run_cmd.txt
-        adb -s ${device_id} shell < adb_run_cmd.txt
+        adb -s ${device_id} shell < adb_run_cmd.txt >> "${run_benchmark_log_file}"
        if [ $? = 0 ]; then
-            run_result='Run_arm64:'${model_name}' pass'
+            run_result='arm64: '${model_name}' pass'
             echo ${run_result} >> ${run_benchmark_result_file}
         else
-            run_result='Run_arm64:'${model_name}' fail <<===========================this is the failed case'
+            run_result='arm64: '${model_name}' failed'
             echo ${run_result} >> ${run_benchmark_result_file}
             return 1
         fi
@@ -245,48 +245,84 @@ function Run_arm64() {
         if [[ $model_name == \#* ]]; then
             continue
         fi
-        echo ${model_name}
+        echo ${model_name} >> "${run_benchmark_log_file}"
         echo 'cd /data/local/tmp/benchmark_test' > adb_run_cmd.txt
-        echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/data/local/tmp/benchmark_test;./benchmark --modelPath='${model_name}'.ms --inDataPath=/data/local/tmp/input_output/input/'${model_name}'.ms.bin --calibDataPath=/data/local/tmp/input_output/output/'${model_name}'.ms.out --warmUpLoopCount=1 --loopCount=1 --fp16Priority=true --accuracyThreshold=5'
+        echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/data/local/tmp/benchmark_test;./benchmark --modelPath='${model_name}'.ms --inDataPath=/data/local/tmp/input_output/input/'${model_name}'.ms.bin --calibDataPath=/data/local/tmp/input_output/output/'${model_name}'.ms.out --warmUpLoopCount=1 --loopCount=1 --fp16Priority=true --accuracyThreshold=5' >> "${run_benchmark_log_file}"
         echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/data/local/tmp/benchmark_test;./benchmark --modelPath='${model_name}'.ms --inDataPath=/data/local/tmp/input_output/input/'${model_name}'.ms.bin --calibDataPath=/data/local/tmp/input_output/output/'${model_name}'.ms.out --warmUpLoopCount=1 --loopCount=1 --fp16Priority=true --accuracyThreshold=5' >> adb_run_cmd.txt
-        adb -s ${device_id} shell < adb_run_cmd.txt
+        adb -s ${device_id} shell < adb_run_cmd.txt >> "${run_benchmark_log_file}"
         if [ $? = 0 ]; then
-            run_result='Run_arm64: '${model_name}' pass'
+            run_result='arm64: '${model_name}' pass'
             echo ${run_result} >> ${run_benchmark_result_file}
         else
-            run_result='Run_arm64:'${model_name}' fail <<===========================this is the failed case'
+            run_result='arm64: '${model_name}' failed'
             echo ${run_result} >> ${run_benchmark_result_file}
             return 1
         fi
         # run benchmark test without calib data
-        echo ${model_name}
+        echo ${model_name} >> "${run_benchmark_log_file}"
         echo 'cd /data/local/tmp/benchmark_test' > adb_run_cmd.txt
-        echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/data/local/tmp/benchmark_test;./benchmark --modelPath='${model_name}'.ms --warmUpLoopCount=1 --loopCount=2 --fp16Priority=true --accuracyThreshold=5'
+        echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/data/local/tmp/benchmark_test;./benchmark --modelPath='${model_name}'.ms --warmUpLoopCount=1 --loopCount=2 --fp16Priority=true --accuracyThreshold=5' >> "${run_benchmark_log_file}"
         echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/data/local/tmp/benchmark_test;./benchmark --modelPath='${model_name}'.ms --warmUpLoopCount=1 --loopCount=2 --fp16Priority=true --accuracyThreshold=5' >> adb_run_cmd.txt
-        adb -s ${device_id} shell < adb_run_cmd.txt
+        adb -s ${device_id} shell < adb_run_cmd.txt >> "${run_benchmark_log_file}"
        if [ $? = 0 ]; then
-            run_result='Run_arm64: '${model_name}' pass'
+            run_result='arm64: '${model_name}' pass'
             echo ${run_result} >> ${run_benchmark_result_file}
         else
-            run_result='Run_arm64:'${model_name}' fail <<===========================this is the failed case'
-            echo ${run_result} >> ${run_benchmark_result_file}
+            run_result='arm64: '${model_name}' failed'
+            echo ${run_result} >> ${run_benchmark_result_file}
             return 1
         fi
         #sleep 1
     done < ${models_fp16_config}
+
+    # Run tflite aware training quantization converted models:
+    while read line; do
+        model_name=${line}
+        if [[ $model_name == \#* ]]; then
+            continue
+        fi
+        echo ${model_name} >> "${run_benchmark_log_file}"
+        echo 'cd /data/local/tmp/benchmark_test' > adb_run_cmd.txt
+        echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/data/local/tmp/benchmark_test;./benchmark --modelPath='${model_name}'.ms --inDataPath=/data/local/tmp/input_output/input/'${model_name}'.ms.bin --calibDataPath=/data/local/tmp/input_output/output/'${model_name}'.ms.out --warmUpLoopCount=1 --loopCount=1 --fp16Priority=true --accuracyThreshold=5' >> "${run_benchmark_log_file}"
+        echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/data/local/tmp/benchmark_test;./benchmark --modelPath='${model_name}'.ms --inDataPath=/data/local/tmp/input_output/input/'${model_name}'.ms.bin --calibDataPath=/data/local/tmp/input_output/output/'${model_name}'.ms.out --warmUpLoopCount=1 --loopCount=1 --fp16Priority=true --accuracyThreshold=5' >> adb_run_cmd.txt
+        adb -s ${device_id} shell < adb_run_cmd.txt >> "${run_benchmark_log_file}"
+        if [ $? = 0 ]; then
+            run_result='arm64: '${model_name}' pass'
+            echo ${run_result} >> ${run_benchmark_result_file}
+        else
+            run_result='arm64: '${model_name}' failed'
+            echo ${run_result} >> ${run_benchmark_result_file}
+            return 1
+        fi
+        # run benchmark test without calib data
+        echo ${model_name} >> "${run_benchmark_log_file}"
+        echo 'cd /data/local/tmp/benchmark_test' > adb_run_cmd.txt
+        echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/data/local/tmp/benchmark_test;./benchmark --modelPath='${model_name}'.ms --warmUpLoopCount=1 --loopCount=2 --fp16Priority=true --accuracyThreshold=5' >> "${run_benchmark_log_file}"
+        echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/data/local/tmp/benchmark_test;./benchmark --modelPath='${model_name}'.ms --warmUpLoopCount=1 --loopCount=2 --fp16Priority=true --accuracyThreshold=5' >> adb_run_cmd.txt
+        adb -s ${device_id} shell < adb_run_cmd.txt >> "${run_benchmark_log_file}"
+        if [ $? = 0 ]; then
+            run_result='arm64: '${model_name}' pass'
+            echo ${run_result} >> ${run_benchmark_result_file}
+        else
+            run_result='arm64: '${model_name}' failed'
+            echo ${run_result} >> ${run_benchmark_result_file}
+            return 1
+        fi
+        #sleep 1
+    done < ${models_tflite_awaretraining_config}
 }
 
 # Print start msg before run testcase
 function MS_PRINT_TESTCASE_START_MSG() {
     echo ""
-    echo -e "----------------------------------------------------------------------------------------------------------------------------------------------"
-    echo -e "Testcase Result "
-    echo -e "-------- ------ "
+    echo -e "-----------------------------------------------------------------------------------------------------------------------------------"
+    echo -e "env Testcase Result "
+    echo -e "--- -------- ------ "
 }
 
 # Print end msg after run testcase
 function MS_PRINT_TESTCASE_END_MSG() {
-    echo -e "----------------------------------------------------------------------------------------------------------------------------------------------"
+    echo -e "-----------------------------------------------------------------------------------------------------------------------------------"
 }
@@ -332,16 +368,13 @@ process_unit_x86=${suffix[0]}
 
 # Unzip arm
 cd ${arm_path} || exit 1
-mkdir mindspore-lite-${version}-runtime-arm64-${process_unit_arm}
-tar -zxf mindspore-lite-${version}-runtime-arm64-${process_unit_arm}.tar.gz -C mindspore-lite-${version}-runtime-arm64-${process_unit_arm} --strip-components 1 || exit 1
+tar -zxf mindspore-lite-${version}-runtime-arm64-${process_unit_arm}.tar.gz || exit 1
 
 # Unzip x86 runtime and convertor
 cd ${convertor_path} || exit 1
-mkdir mindspore-lite-${version}-runtime-x86-${process_unit_x86}
-tar -zxf mindspore-lite-${version}-runtime-x86-${process_unit_x86}.tar.gz -C mindspore-lite-${version}-runtime-x86-${process_unit_x86} --strip-components 1 || exit 1
+tar -zxf mindspore-lite-${version}-runtime-x86-${process_unit_x86}.tar.gz || exit 1
 
-mkdir mindspore-lite-${version}-convert-ubuntu
-tar -zxf mindspore-lite-${version}-convert-ubuntu.tar.gz -C mindspore-lite-${version}-convert-ubuntu --strip-components 1 || exit 1
+tar -zxf mindspore-lite-${version}-convert-ubuntu.tar.gz || exit 1
 cd ${convertor_path}/mindspore-lite-${version}-convert-ubuntu || exit 1
 cp converter/converter_lite ./ || exit 1
 export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:./lib/:./third_party/protobuf/lib
@@ -349,6 +382,13 @@ export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:./lib/:./third_party/protobuf/lib
 # Convert the models
 cd ${convertor_path}/mindspore-lite-${version}-convert-ubuntu || exit 1
 
+# Write results to temp file
+run_benchmark_result_file=${basepath}/run_benchmark_result.txt
+echo ' ' > ${run_benchmark_result_file}
+
+run_benchmark_log_file=${basepath}/run_benchmark_log.txt
+echo 'run benchmark logs: ' > ${run_benchmark_log_file}
+
 # Set models config filepath
 models_tflite_config=${basepath}/models_tflite.cfg
 models_caffe_config=${basepath}/models_caffe.cfg
@@ -357,30 +397,21 @@ models_tflite_posttraining_config=${basepath}/models_tflite_posttraining.cfg
 models_onnx_config=${basepath}/models_onnx.cfg
 models_fp16_config=${basepath}/models_fp16.cfg
 models_mindspore_config=${basepath}/models_mindspore.cfg
+Convert_status=0
 
 rm -rf ${basepath}/ms_models
 mkdir -p ${basepath}/ms_models
 ms_models_path=${basepath}/ms_models
 
-# Copy fp16 ms models:
-while read line; do
-    model_name=${line}
-    if [[ $model_name == \#* ]]; then
-        continue
-    fi
-    echo 'cp '${models_path}'/'${model_name}'.ms' ${ms_models_path}'/'${model_name}'.ms'
-    cp $models_path/${model_name}.ms ${ms_models_path}/${model_name}.ms
-done < ${models_fp16_config}
-
 # Convert tflite models:
 while read line; do
     model_name=${line}
     if [[ $model_name == \#* ]]; then
         continue
     fi
-    echo ${model_name}
-    echo './converter_lite --fmk=TFLITE --modelFile='${models_path}'/'${model_name}' --outputFile='${ms_models_path}'/'${model_name}''
-    ./converter_lite --fmk=TFLITE --modelFile=$models_path/${model_name} --outputFile=${ms_models_path}/${model_name} || exit 1
+    echo ${model_name} >> "${run_benchmark_log_file}"
+    echo './converter_lite --fmk=TFLITE --modelFile='${models_path}'/'${model_name}' --outputFile='${ms_models_path}'/'${model_name}'' >> "${run_benchmark_log_file}"
+    ./converter_lite --fmk=TFLITE --modelFile=$models_path/${model_name} --outputFile=${ms_models_path}/${model_name} || Convert_status=$?
 done < ${models_tflite_config}
 
 # Convert caffe models:
@@ -389,10 +420,10 @@ while read line; do
     model_name=${line}
     if [[ $model_name == \#* ]]; then
         continue
     fi
-    echo ${model_name}
-    pwd
-    echo './converter_lite --fmk=CAFFE --modelFile='${models_path}'/'${model_name}'.prototxt --weightFile='${models_path}'/'${model_name}'.caffemodel --outputFile='${ms_models_path}'/'${model_name}''
-    ./converter_lite --fmk=CAFFE --modelFile=${models_path}/${model_name}.prototxt --weightFile=${models_path}/${model_name}.caffemodel --outputFile=${ms_models_path}/${model_name} || exit 1
+    echo ${model_name} >> "${run_benchmark_log_file}"
+    #pwd >> ${run_benchmark_log_file}
+    echo './converter_lite --fmk=CAFFE --modelFile='${models_path}'/'${model_name}'.prototxt --weightFile='${models_path}'/'${model_name}'.caffemodel --outputFile='${ms_models_path}'/'${model_name}'' >> "${run_benchmark_log_file}"
+    ./converter_lite --fmk=CAFFE --modelFile=${models_path}/${model_name}.prototxt --weightFile=${models_path}/${model_name}.caffemodel --outputFile=${ms_models_path}/${model_name} || Convert_status=$?
 done < ${models_caffe_config}
 
 # Convert onnx models:
@@ -401,10 +432,10 @@ while read line; do
     model_name=${line}
     if [[ $model_name == \#* ]]; then
         continue
     fi
-    echo ${model_name}
-    pwd
-    echo './converter_lite --fmk=ONNX --modelFile='${models_path}'/'${model_name}' --outputFile='${ms_models_path}'/'${model_name}''
-    ./converter_lite --fmk=ONNX --modelFile=${models_path}/${model_name} --outputFile=${ms_models_path}/${model_name} || exit 1
+    echo ${model_name} >> "${run_benchmark_log_file}"
+    #pwd >> ${run_benchmark_log_file}
+    echo './converter_lite --fmk=ONNX --modelFile='${models_path}'/'${model_name}' --outputFile='${ms_models_path}'/'${model_name}'' >> "${run_benchmark_log_file}"
+    ./converter_lite --fmk=ONNX --modelFile=${models_path}/${model_name} --outputFile=${ms_models_path}/${model_name} || Convert_status=$?
 done < ${models_onnx_config}
 
 # Convert mindspore models:
@@ -413,10 +444,10 @@ while read line; do
     model_name=${line}
     if [[ $model_name == \#* ]]; then
         continue
     fi
-    echo ${model_name}
-    pwd
-    echo './converter_lite --fmk=MS --modelFile='${models_path}'/'${model_name}' --outputFile='${ms_models_path}'/'${model_name}''
-    ./converter_lite --fmk=MS --modelFile=${models_path}/${model_name} --outputFile=${ms_models_path}/${model_name} || exit 1
+    echo ${model_name} >> "${run_benchmark_log_file}"
+    pwd >> "${run_benchmark_log_file}"
+    echo './converter_lite --fmk=MS --modelFile='${models_path}'/'${model_name}' --outputFile='${ms_models_path}'/'${model_name}'' >> "${run_benchmark_log_file}"
+    ./converter_lite --fmk=MS --modelFile=${models_path}/${model_name} --outputFile=${ms_models_path}/${model_name} || Convert_status=$?
 done < ${models_mindspore_config}
 
 # Convert TFLite PostTraining models:
@@ -425,9 +456,9 @@ while read line; do
     model_name=${line}
     if [[ $model_name == \#* ]]; then
         continue
     fi
-    echo ${model_name}
-    echo './converter_lite --fmk=TFLITE --modelFile='${models_path}'/'${model_name}' --outputFile='${ms_models_path}'/'${model_name}_posttraining' --quantType=PostTraining --config_file='${models_path}'/'${model_name}'_posttraining.config'
-    ./converter_lite --fmk=TFLITE --modelFile=$models_path/${model_name} --outputFile=${ms_models_path}/${model_name}_posttraining --quantType=PostTraining --config_file=${models_path}/${model_name}_posttraining.config || exit 1
+    echo ${model_name} >> "${run_benchmark_log_file}"
+    echo './converter_lite --fmk=TFLITE --modelFile='${models_path}'/'${model_name}' --outputFile='${ms_models_path}'/'${model_name}_posttraining' --quantType=PostTraining --config_file='${models_path}'/'${model_name}'_posttraining.config' >> "${run_benchmark_log_file}"
+    ./converter_lite --fmk=TFLITE --modelFile=$models_path/${model_name} --outputFile=${ms_models_path}/${model_name}_posttraining --quantType=PostTraining --config_file=${models_path}/${model_name}_posttraining.config || Convert_status=$?
 done < ${models_tflite_posttraining_config}
 
 # Convert TFLite AwareTraining models:
@@ -436,13 +467,33 @@ while read line; do
     model_name=${line}
     if [[ $model_name == \#* ]]; then
         continue
     fi
-    echo ${model_name}
-    echo './converter_lite --fmk=TFLITE --modelFile='${models_path}'/'${model_name}' --outputFile='${ms_models_path}'/'${model_name}' --quantType=AwareTraining'
-    ./converter_lite --fmk=TFLITE --modelFile=${models_path}/${model_name} --outputFile=${ms_models_path}/${model_name} --quantType=AwareTraining || exit 1
+    echo ${model_name} >> "${run_benchmark_log_file}"
+    echo './converter_lite --fmk=TFLITE --modelFile='${models_path}'/'${model_name}' --outputFile='${ms_models_path}'/'${model_name}' --quantType=AwareTraining' >> "${run_benchmark_log_file}"
+    ./converter_lite --fmk=TFLITE --modelFile=${models_path}/${model_name} --outputFile=${ms_models_path}/${model_name} --quantType=AwareTraining || Convert_status=$?
 done < ${models_tflite_awaretraining_config}
 
+# Copy fp16 ms models:
+while read line; do
+    model_name=${line%.*}
+    if [[ $model_name == \#* ]]; then
+        continue
+    fi
+    echo 'cp '${ms_models_path}'/'${model_name}'.ms' ${ms_models_path}'/'${model_name}'.fp16.ms'
+    cp ${ms_models_path}/${model_name}.ms ${ms_models_path}/${model_name}.fp16.ms
+done < ${models_fp16_config}
+
+# Check all results and return value
+if [[ ${Convert_status} = 0 ]];then
+    echo "convert finished"
+else
+    echo "convert failed"
+    cat ${run_benchmark_log_file}
+    exit 1
+fi
+
 # Push to the arm and run benchmark:
 # First: copy benchmark exe and .so files to the server connected to the phone
+echo "Push files to the arm and run benchmark"
 rm -rf ${basepath}/benchmark_test
 mkdir -p ${basepath}/benchmark_test
 benchmark_test_path=${basepath}/benchmark_test
@@ -455,7 +506,7 @@ cp -a ${arm_path}/mindspore-lite-${version}-runtime-arm64-${process_unit_arm}/be
 cp -a ${ms_models_path}/*.ms ${benchmark_test_path} || exit 1
 
 # Second: adb push all needed files to the phone
-adb -s ${device_id} push ${benchmark_test_path} /data/local/tmp/
+adb -s ${device_id} push ${benchmark_test_path} /data/local/tmp/ > adb_push_log.txt
 
 # Third: run adb, run session, check the result:
 echo 'cd /data/local/tmp/benchmark_test' > adb_cmd.txt
@@ -464,16 +515,14 @@ echo 'chmod 777 benchmark' >> adb_cmd.txt
 adb -s ${device_id} shell < adb_cmd.txt
 
-# Write results to temp file
-run_benchmark_result_file=${basepath}/run_benchmark_result.txt
-echo 'cases :' > ${run_benchmark_result_file}
-
 # Run on x86
+echo "start Run x86 ..."
 Run_x86 &
 Run_x86_PID=$!
 sleep 1
 
 # Run on arm64
+echo "start Run arm64 ..."
 Run_arm64 &
 Run_arm64_PID=$!
@@ -486,7 +535,8 @@ Run_arm64_status=$?
 
 # Print all results:
 MS_PRINT_TESTCASE_START_MSG
 while read line; do
-    echo ${line}
+    arr=(${line})
+    printf "%-10s %-110s %-7s\n" ${arr[0]} ${arr[1]} ${arr[2]}
 done < ${run_benchmark_result_file}
 MS_PRINT_TESTCASE_END_MSG
@@ -496,5 +546,15 @@ if [[ ${Run_x86_status} = 0 ]] && [[ ${Run_arm64_status} = 0 ]];then
     exit 0
 else
     echo "run failed"
+    cat ${run_benchmark_log_file}
+
+    #print the result table again:
+    MS_PRINT_TESTCASE_START_MSG
+    while read line; do
+        arr=(${line})
+        printf "%-10s %-110s %-7s\n" ${arr[0]} ${arr[1]} ${arr[2]}
+    done < ${run_benchmark_result_file}
+    MS_PRINT_TESTCASE_END_MSG
+
     exit 1
 fi
diff --git a/mindspore/lite/test/ut/src/infer_test.cc b/mindspore/lite/test/ut/src/infer_test.cc
index 8ad738f3ed..9de1a2dde1 100644
--- a/mindspore/lite/test/ut/src/infer_test.cc
+++ b/mindspore/lite/test/ut/src/infer_test.cc
@@ -130,7 +130,7 @@ TEST_F(InferTest, TestConvNode) {
   memcpy(data, input_data, input_size);
   ret = session->RunGraph();
   ASSERT_EQ(lite::RET_OK, ret);
-  auto outputs = session->GetOutputs();
+  auto outputs = session->GetOutputMapByNode();
   ASSERT_EQ(outputs.size(), 1);
   ASSERT_EQ(outputs.begin()->second.size(), 1);
   auto outTensor = outputs.begin()->second.front();
@@ -222,7 +222,7 @@ TEST_F(InferTest, TestAddNode) {
   (void)inTensor1->MutableData();
   ret = session->RunGraph();
   ASSERT_EQ(lite::RET_OK, ret);
-  auto outputs = session->GetOutputs();
+  auto outputs = session->GetOutputMapByNode();
   ASSERT_EQ(outputs.size(), 1);
   ASSERT_EQ(outputs.begin()->second.size(), 1);
   auto outTensor = outputs.begin()->second.front();
@@ -325,7 +325,7 @@ TEST_F(InferTest, TestParallelExecutor) {
   (void)inTensor1->MutableData();
   ret = session->RunGraph();
   ASSERT_EQ(lite::RET_OK, ret);
-  auto outputs = session->GetOutputs();
+  auto outputs = session->GetOutputMapByNode();
   ASSERT_EQ(outputs.size(), 1);
   ASSERT_EQ(outputs.begin()->second.size(), 1);
   auto outTensor = outputs.begin()->second.front();
@@ -362,7 +362,7 @@ TEST_F(InferTest, TestModel) {
   (void)inTensor->MutableData();
   ret = session->RunGraph();
   ASSERT_EQ(lite::RET_OK, ret);
-  auto outputs = session->GetOutputs();
+  auto outputs = session->GetOutputMapByNode();
   MS_LOG(INFO) << "Passed";
 }
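The infer_test.cc changes above track an API rename: session outputs are now fetched per node via GetOutputMapByNode(). A hedged C++ sketch of consuming that map; it assumes the map is keyed by node name with a vector of output tensors per node, which is what the tests' outputs.begin()->second usage suggests, and the include path and DumpOutputs name are illustrative:

    #include <cstdio>
    #include "include/lite_session.h"

    void DumpOutputs(mindspore::session::LiteSession *session) {
      auto outputs = session->GetOutputMapByNode();  // node name -> output tensors
      for (auto &kv : outputs) {
        for (auto *tensor : kv.second) {
          std::printf("%s: %d elements\n", kv.first.c_str(),
                      static_cast<int>(tensor->ElementsNum()));
        }
      }
    }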
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/common/pack_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/arm/common/pack_tests.cc
index 5ad46139ba..8220e31e47 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/common/pack_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/common/pack_tests.cc
@@ -47,8 +47,8 @@ void InitConvParamPack(ConvParameter *conv_param) {
   conv_param->dilation_h_ = 1;
   conv_param->dilation_w_ = 1;
 
-  conv_param->pad_h_ = 1;
-  conv_param->pad_w_ = 1;
+  conv_param->pad_u_ = 1;
+  conv_param->pad_l_ = 1;
 }
 
 TEST_F(TestPack, PackInputFp32) {
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/fp16/convolution_fp16_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp16/convolution_fp16_tests.cc
index 599e75739b..bad7728b9e 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/fp16/convolution_fp16_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp16/convolution_fp16_tests.cc
@@ -50,8 +50,8 @@ void InitConvParamGroup1Fp16(ConvParameter *conv_param) {
   conv_param->dilation_h_ = 1;
   conv_param->dilation_w_ = 1;
 
-  conv_param->pad_h_ = 1;
-  conv_param->pad_w_ = 1;
+  conv_param->pad_u_ = 1;
+  conv_param->pad_l_ = 1;
   conv_param->thread_num_ = 1;
 }
@@ -75,8 +75,8 @@ void InitConvParamGroup2Fp16(ConvParameter *conv_param) {
   conv_param->dilation_h_ = 1;
   conv_param->dilation_w_ = 1;
 
-  conv_param->pad_h_ = 1;
-  conv_param->pad_w_ = 1;
+  conv_param->pad_u_ = 1;
+  conv_param->pad_l_ = 1;
   conv_param->thread_num_ = 1;
 }
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/conv1x1_fp32_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/conv1x1_fp32_tests.cc
index 9ad190d410..4fad7b355f 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/conv1x1_fp32_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/conv1x1_fp32_tests.cc
@@ -19,9 +19,8 @@
 #include "utils/log_adapter.h"
 #include "common/common_test.h"
 #include "src/common/file_utils.h"
-#include "src/runtime/kernel/arm/fp32/convolution_1x1.h"
 #include "nnacl/matmul_parameter.h"
-#include "nnacl/strassen_matmul.h"
+#include "src/runtime/kernel/arm/fp32/convolution_1x1.h"
 
 namespace mindspore {
 using mindspore::lite::tensor::Tensor;
@@ -51,10 +50,10 @@ TEST_F(TestConv1x1Fp32, Input1x1PrePack1) {
   conv_param->output_h_ = 4;
   conv_param->output_w_ = 5;
   conv_param->stride_h_ = conv_param->stride_w_ = 4;
-  conv_param->pad_h_ = conv_param->pad_w_ = 2;
+  conv_param->pad_u_ = conv_param->pad_l_ = 2;
 
   float out[20] = {0};
-  Conv1x1InputPackFp32(in, out, conv_param);
+  Conv1x1InputPack(in, out, conv_param, sizeof(float));
   EXPECT_EQ(0, lite::CompareOutputData(out, correct, 20));
   delete conv_param;
 }
@@ -92,10 +91,10 @@ TEST_F(TestConv1x1Fp32, Input1x1PrePack2) {
   conv_param->output_h_ = 7;
   conv_param->output_w_ = 4;
   conv_param->stride_h_ = conv_param->stride_w_ = 3;
-  conv_param->pad_h_ = conv_param->pad_w_ = 0;
+  conv_param->pad_u_ = conv_param->pad_l_ = 0;
 
   float out[28] = {0};
-  Conv1x1InputPackFp32(in, out, conv_param);
+  Conv1x1InputPack(in, out, conv_param, sizeof(float));
   CompareOutputData(out, correct, 28, 0.0001);
   delete conv_param;
 }
@@ -106,7 +105,7 @@
 TEST_F(TestConv1x1Fp32, Input1x1PrePack3) {
   conv_param->input_h_ = conv_param->input_w_ = 3;
   conv_param->output_h_ = conv_param->output_w_ = 3;
   conv_param->stride_h_ = conv_param->stride_w_ = 2;
-  conv_param->pad_h_ = conv_param->pad_w_ = 1;
+  conv_param->pad_u_ = conv_param->pad_l_ = 1;
 
   float in[] = {1.6767339, 12.25904, 19.018835, 3.0790641, -9.252135, -8.685675, 3.6115494, 3.2282279, 17.025112,
                 -5.052577, 12.750252, 12.701241, -8.9477215, -9.080522, 19.03931, -6.501229, -4.122992, 9.540845};
@@ -114,7 +113,7 @@ TEST_F(TestConv1x1Fp32, Input1x1PrePack3) {
   float correct[] = {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 17.025112,
                      -5.052577, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
-  Conv1x1InputPackFp32(in, out, conv_param);
+  Conv1x1InputPack(in, out, conv_param, sizeof(float));
   EXPECT_EQ(0, lite::CompareOutputData(out, correct, 18));
   delete conv_param;
 }
@@ -125,7 +124,7 @@ TEST_F(TestConv1x1Fp32, Input1x1PrePack4) {
   conv_param->input_h_ = conv_param->input_w_ = 3;
   conv_param->output_h_ = conv_param->output_w_ = 3;
   conv_param->stride_h_ = conv_param->stride_w_ = 2;
-  conv_param->pad_h_ = conv_param->pad_w_ = 1;
+  conv_param->pad_u_ = conv_param->pad_l_ = 1;
   float in[] = {4.1795, 13.142, -3.593, 16.505, 19.899, 8.5562, 19.969, -6.235, -2.380, -9.027, 9.5542,
                 18.974, 23.622, 8.3608, 47.325, -14.36, 15.370, 4.3049, -0.784, 37.925, -0.081, 6.1298,
                 0.6721, -1.517, 37.998, 13.719, 11.029, 1.7127, -1.770, 41.903, 9.0560, 14.988, 3.1866,
@@ -136,7 +135,7 @@ TEST_F(TestConv1x1Fp32, Input1x1PrePack4) {
                     -1.770, 41.903, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
                     0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
   float out[54] = {0};
-  Conv1x1InputPackFp32(in, out, conv_param);
+  Conv1x1InputPack(in, out, conv_param, sizeof(float));
   EXPECT_EQ(0, lite::CompareOutputData(out, correct, 54));
   delete conv_param;
 }
@@ -282,8 +281,8 @@ int Conv1x1TestInit1(std::vector<lite::tensor::Tensor *> *inputs_, std::vector<lite::tensor::Tensor *> *outputs_,
   conv_param->kernel_h_ = conv_param->kernel_w_ = 1;
   conv_param->stride_h_ = conv_param->stride_w_ = 2;
   conv_param->dilation_h_ = conv_param->dilation_w_ = 1;
-  conv_param->pad_h_ = conv_param->pad_w_ = 1;
-  conv_param->is_relu_ = conv_param->is_relu6_ = false;
+  conv_param->pad_u_ = conv_param->pad_l_ = 1;
+  conv_param->act_type_ = ActType_No;
   return out_t->ElementsNum();
 }
@@ -349,9 +348,8 @@ int Conv1x1TestInit2(std::vector<lite::tensor::Tensor *> *inputs_, std::vector<lite::tensor::Tensor *> *outputs_,
   conv_param->kernel_h_ = conv_param->kernel_w_ = 1;
   conv_param->stride_h_ = conv_param->stride_w_ = 1;
   conv_param->dilation_h_ = conv_param->dilation_w_ = 1;
-  conv_param->pad_h_ = conv_param->pad_w_ = 0;
-  conv_param->is_relu_ = false;
-  conv_param->is_relu6_ = false;
+  conv_param->pad_u_ = conv_param->pad_l_ = 0;
+  conv_param->act_type_ = ActType_No;
   return out_t->ElementsNum();
 }
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/convolution_depthwise_fp32_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/convolution_depthwise_fp32_tests.cc
index 38c25d694b..becf047a69 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/convolution_depthwise_fp32_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/convolution_depthwise_fp32_tests.cc
@@ -47,8 +47,8 @@ void InitConvDwParam(ConvParameter *conv_param) {
   conv_param->dilation_h_ = 1;
   conv_param->dilation_w_ = 1;
 
-  conv_param->pad_h_ = 1;
-  conv_param->pad_w_ = 1;
+  conv_param->pad_u_ = 1;
+  conv_param->pad_l_ = 1;
 }
 
 void InitConvDwCreator(std::vector<lite::tensor::Tensor *> *inputs, std::vector<lite::tensor::Tensor *> *outputs,
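The test updates above move from Conv1x1InputPackFp32(in, out, conv_param) to a datatype-agnostic Conv1x1InputPack(in, out, conv_param, sizeof(float)), and from symmetric pad_h_/pad_w_ to per-edge pad_u_/pad_l_. A hedged sketch of what a byte-width-parameterized 1x1 pack does (a free-standing rewrite under stated assumptions, not the library routine; single batch, NHWC layout, Pack1x1Input is an illustrative name):

    #include <cstring>

    // Copy the input pixel hit by each 1x1-kernel output position; positions that
    // fall in the padding stay zero. elem_size lets one routine serve fp32/fp16/int8.
    void Pack1x1Input(const char *src, char *dst, int in_h, int in_w, int out_h, int out_w,
                      int stride_h, int stride_w, int pad_u, int pad_l, int channel,
                      size_t elem_size) {
      std::memset(dst, 0, static_cast<size_t>(out_h) * out_w * channel * elem_size);
      for (int oh = 0; oh < out_h; ++oh) {
        int ih = oh * stride_h - pad_u;
        if (ih < 0 || ih >= in_h) continue;
        for (int ow = 0; ow < out_w; ++ow) {
          int iw = ow * stride_w - pad_l;
          if (iw < 0 || iw >= in_w) continue;
          std::memcpy(dst + (static_cast<size_t>(oh) * out_w + ow) * channel * elem_size,
                      src + (static_cast<size_t>(ih) * in_w + iw) * channel * elem_size,
                      channel * elem_size);
        }
      }
    }

Checked against Input1x1PrePack3 above (3x3x2 input, stride 2, pad 1): only output position (1, 1) maps inside the input, which matches the single 17.025112/-5.052577 pair in the expected correct array.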
b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/deconvolution_fp32_tests.cc index 2cb2f522f6..4549f6d641 100644 --- a/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/deconvolution_fp32_tests.cc +++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/deconvolution_fp32_tests.cc @@ -468,7 +468,7 @@ int DeConvTestInit1(std::vector<lite::tensor::Tensor *> *inputs_, std::vector<lite::tensor::Tensor *> *outputs_,
conv_param->kernel_h_ = conv_param->kernel_w_ = 3; conv_param->stride_h_ = conv_param->stride_w_ = 2; conv_param->dilation_h_ = conv_param->dilation_w_ = 1; - conv_param->pad_h_ = conv_param->pad_w_ = 1; + conv_param->pad_u_ = conv_param->pad_l_ = 1; return out_t->ElementsNum(); }
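The hunks above and below are part of a patch-wide rename: the symmetric ConvParameter fields pad_h_/pad_w_ give way to per-edge fields named for the edge they pad (pad_u_ for rows above, pad_l_ for columns to the left). A minimal sketch of the idea, using a hypothetical mirror of the relevant fields rather than the real struct from nnacl/conv_parameter.h; the pad_d_/pad_r_ counterparts are assumed, not shown in this diff:

// Hypothetical mirror of the per-edge padding fields introduced by this patch.
struct PadInfo {
  int pad_u_;  // rows of zero padding above the input
  int pad_d_;  // rows of zero padding below the input (assumed counterpart)
  int pad_l_;  // columns of zero padding left of the input
  int pad_r_;  // columns of zero padding right of the input (assumed counterpart)
};

// Standard output-size arithmetic for a strided convolution under per-edge padding.
int ConvOutputSize(int in, int kernel, int stride, int dilation, int pad_begin, int pad_end) {
  int effective_kernel = dilation * (kernel - 1) + 1;
  return (in + pad_begin + pad_end - effective_kernel) / stride + 1;
}

With symmetric padding the two schemes coincide, which is why these tests only need the mechanical pad_h_ to pad_u_ and pad_w_ to pad_l_ substitution.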
@@ -537,7 +537,7 @@ int DeConvTestInit2(std::vector<lite::tensor::Tensor *> *inputs_, std::vector<lite::tensor::Tensor *> *outputs_, conv_param->kernel_h_ = conv_param->kernel_w_ = 3; conv_param->stride_h_ = conv_param->stride_w_ = 2; conv_param->dilation_h_ = conv_param->dilation_w_ = 1; - conv_param->pad_h_ = conv_param->pad_w_ = 1; + conv_param->pad_u_ = conv_param->pad_l_ = 1; return out_t->ElementsNum(); } @@ -548,14 +548,14 @@ TEST_F(TestDeConvolutionFp32, DeConvTest2) { float *correct; int total_size = DeConvTestInit2(&inputs_, &outputs_, deconv_param, &correct); lite::Context *ctx = new lite::Context; - ctx->thread_num_ = 4; + ctx->thread_num_ = 1; kernel::DeConvolutionCPUKernel *deconv = new kernel::DeConvolutionCPUKernel(reinterpret_cast<OpParameter *>(deconv_param), inputs_, outputs_, ctx, nullptr); deconv->Init(); deconv->Run(); - EXPECT_EQ(0, lite::CompareOutputData(reinterpret_cast<float *>(outputs_[0]->Data()), correct, total_size)); - delete deconv_param; + CompareOutputData(reinterpret_cast<float *>(outputs_[0]->Data()), correct, total_size, 0.0001); + delete deconv; for (auto t : inputs_) delete t; for (auto t : outputs_) delete t;
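DeConvTest2 above also swaps the exact element-wise EXPECT_EQ check for CompareOutputData with a 0.0001 tolerance, the right comparison once results come from float accumulation whose rounding can vary with kernel tiling and thread count. A self-contained sketch of such a tolerance check (illustrative only; CompareOutputData itself is the test framework's helper):

#include <cmath>
#include <cstdio>

// Returns true when every element of got is within atol of want.
bool AllClose(const float *got, const float *want, int size, float atol) {
  for (int i = 0; i < size; ++i) {
    if (std::fabs(got[i] - want[i]) > atol) {
      std::printf("mismatch at %d: got %f, want %f\n", i, got[i], want[i]);
      return false;
    }
  }
  return true;
}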
@@ -616,7 +616,7 @@ int DeConvTestInit3(std::vector<lite::tensor::Tensor *> *inputs_, std::vector<lite::tensor::Tensor *> *outputs_, conv_param->kernel_h_ = conv_param->kernel_w_ = 2; conv_param->stride_h_ = conv_param->stride_w_ = 3; conv_param->dilation_h_ = conv_param->dilation_w_ = 2; - conv_param->pad_h_ = conv_param->pad_w_ = 0; + conv_param->pad_u_ = conv_param->pad_l_ = 0; return out_t->ElementsNum(); } @@ -635,7 +635,6 @@ TEST_F(TestDeConvolutionFp32, DeConvTest3) { deconv->Run(); CompareOutputData(reinterpret_cast<float *>(outputs_[0]->Data()), correct, total_size, 0.0001); - delete deconv_param; delete deconv; for (auto t : inputs_) delete t; for (auto t : outputs_) delete t;
@@ -686,8 +685,8 @@ int DeConvTestInit4(std::vector<lite::tensor::Tensor *> *inputs_, std::vector<lite::tensor::Tensor *> *outputs_, conv_param->kernel_h_ = conv_param->kernel_w_ = 3; conv_param->stride_h_ = conv_param->stride_w_ = 1; conv_param->dilation_h_ = conv_param->dilation_w_ = 1; - conv_param->pad_h_ = conv_param->pad_w_ = 0; - conv_param->is_relu_ = conv_param->is_relu6_ = false; + conv_param->pad_u_ = conv_param->pad_l_ = 0; + conv_param->act_type_ = ActType_No; return out_t->ElementsNum(); } @@ -723,7 +722,6 @@ TEST_F(TestDeConvolutionFp32, DeConvTest4) { uint64_t time_avg = cost / loop_count; printf("deconv fp32 average time : %f ms\n", time_avg / 1000.0f); - delete deconv_param; delete deconv; for (auto t : inputs_) delete t; for (auto t : outputs_) delete t; diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/matmul_fp32_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/matmul_fp32_tests.cc index 8f968c443f..d9c2d011d6 100644 --- a/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/matmul_fp32_tests.cc +++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/matmul_fp32_tests.cc @@ -459,8 +459,6 @@ TEST_F(TestMatMulFp32, batch) { -17.63555145263672, -8.490625381469727, 5.317771911621094, -14.561882019042969, -7.251564025878906, -2.508212089538574, 5.86458683013916, -3.466249465942383, 8.869029998779297, 25.034008026123047}; - - float *output = reinterpret_cast<float *>(outputs_[0]->Data()); CompareOutputData(reinterpret_cast<float *>(outputs_[0]->Data()), correct, total_size, 0.0001); delete mm; for (auto t : inputs_) delete t; diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/space_to_batch_fp32_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/space_to_batch_fp32_tests.cc index 69f9441314..59eed62a15 100644 --- a/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/space_to_batch_fp32_tests.cc +++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/space_to_batch_fp32_tests.cc @@ -28,142 +28,175 @@ class SpaceToBatchTestFp32 : public mindspore::CommonTest { SpaceToBatchTestFp32() {} }; -void InitSpaceToBatchParameter(SpaceToBatchParameter *param) { - param->n_dims_ = 4; - param->n_space_dims_ = 2; - - param->block_sizes_[0] = 2; - param->block_sizes_[1] = 2; - - param->paddings_[0] = 2; - param->paddings_[1] = 0; - param->paddings_[2] = 2; - param->paddings_[3] = 2; - - param->in_shape_[0] = 1; - param->in_shape_[1] = 4; - param->in_shape_[2] = 4; - param->in_shape_[3] = 1; - - param->padded_in_shape_[0] = 1; - param->padded_in_shape_[1] = 6; - param->padded_in_shape_[2] = 8; - param->padded_in_shape_[3] = 1; - - param->num_elements_ = 16; - param->num_elements_padded_ = 48; - - param->need_paddings_ = true; +TEST_F(SpaceToBatchTestFp32, SpaceToBatchTest4) { + std::vector<float> input = {1, 2, 3, 4, 5, 6, 7, 8, + 9, 10, 11, 12, 13, 14, 15, 16}; + const size_t kOutSize = 16; + std::vector<float> expect_out = {1, 2, 3, 4, 9, 10, 11, 12, + 5, 6, 7, 8, 13, 14, 15, 16}; + float out[kOutSize]; + std::vector<int> in_shape = {1, 4, 4, 1}; + std::vector<int> out_shape = {2, 2, 4, 1}; + SpaceToBatchParameter param; + param.block_sizes_[0] = 2; + param.block_sizes_[1] = 1; + DoSpaceToBatchNHWC(input.data(), out, &param, in_shape.data(), out_shape.data()); + for (int i = 0; i < kOutSize; ++i) { + std::cout << out[i] << " "; + } + std::cout << "\n"; + CompareOutputData(out, expect_out.data(), kOutSize, 0.000001); }
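SpaceToBatchTest4 above (and Test5 through Test7 below) pin down the contract of the new DoSpaceToBatchNHWC: output batch ob = (shift_h * block_w + shift_w) * in_batch + b collects the input pixels whose row/column indices are congruent to (shift_h, shift_w) modulo the block sizes. A plain reference loop that reproduces the expected outputs of these tests (a sketch for illustration; the shipped kernel is the C implementation in nnacl/fp32/space_to_batch.c):

// Reference NHWC space-to-batch: gathers every (block_h, block_w)-strided
// sub-grid of the input into its own output batch.
void SpaceToBatchRefNHWC(const float *in, float *out, const int *in_shape,
                         int block_h, int block_w) {
  const int n = in_shape[0], h = in_shape[1], w = in_shape[2], c = in_shape[3];
  const int oh = h / block_h, ow = w / block_w;
  for (int sh = 0; sh < block_h; ++sh) {
    for (int sw = 0; sw < block_w; ++sw) {
      for (int b = 0; b < n; ++b) {
        const int ob = (sh * block_w + sw) * n + b;  // output batch index
        for (int y = 0; y < oh; ++y) {
          for (int x = 0; x < ow; ++x) {
            for (int ch = 0; ch < c; ++ch) {
              const int src = ((b * h + y * block_h + sh) * w + x * block_w + sw) * c + ch;
              const int dst = ((ob * oh + y) * ow + x) * c + ch;
              out[dst] = in[src];
            }
          }
        }
      }
    }
  }
}

On Test4's 1x4x4x1 input with blocks {2, 1}, this yields {1, 2, 3, 4, 9, 10, 11, 12, 5, 6, 7, 8, 13, 14, 15, 16}, matching expect_out above.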
-void InitSpaceToBatchParameter2(SpaceToBatchParameter *param) { - param->block_sizes_[0] = 2; - param->block_sizes_[1] = 2; - - param->paddings_[0] = 2; - param->paddings_[1] = 0; - param->paddings_[2] = 2; - param->paddings_[3] = 2; - - param->in_shape_[0] = 1; - param->in_shape_[1] = 4; - param->in_shape_[2] = 4; - param->in_shape_[3] = 1; - - param->padded_in_shape_[0] = 1; - param->padded_in_shape_[1] = 6; - param->padded_in_shape_[2] = 8; - param->padded_in_shape_[3] = 1; +TEST_F(SpaceToBatchTestFp32, SpaceToBatchTest5) { + std::vector<float> input = {1, 2, 3, 4, 5, 6, 7, 8, + 9, 10, 11, 12, 13, 14, 15, 16}; + const size_t kOutSize = 16; + std::vector<float> expect_out = {1, 3, 5, 7, 9, 11, 13, 15, + 2, 4, 6, 8, 10, 12, 14, 16}; + float out[kOutSize]; + std::vector<int> in_shape = {1, 4, 4, 1}; + std::vector<int> out_shape = {2, 4, 2, 1}; + SpaceToBatchParameter param; + param.block_sizes_[0] = 1; + param.block_sizes_[1] = 2; + DoSpaceToBatchNHWC(input.data(), out, &param, in_shape.data(), out_shape.data()); + for (int i = 0; i < kOutSize; ++i) { + std::cout << out[i] << " "; + } + std::cout << "\n"; + CompareOutputData(out, expect_out.data(), kOutSize, 0.000001); } -TEST_F(SpaceToBatchTestFp32, SpaceToBatchTest1) { - float input[16] = {1, 2, 5, 6, 10, 20, 3, 8, 18, 10, 3, 4, 11, 55, 15, 25}; - const int out_size = 16; - float expect_out[16] = {1, 5, 18, 3, 2, 6, 10, 4, 10, 3, 11, 15, 20, 8, 55, 25}; - - float output[16]; - int in_shape[4] = {1, 4, 4, 1}; - int out_shape[4] = {4, 2, 2, 1}; - int block_sizes[2] = {2, 2}; - SpaceToBatchForNHWC((const float *)input, output, in_shape, 4, block_sizes, 0, 4 / 2); - for (int i = 0; i < out_size; ++i) { - std::cout << output[i] << " "; +TEST_F(SpaceToBatchTestFp32, SpaceToBatchTest6) { + std::vector<float> input = {1, 2, 3, 4, 5, 6, 7, 8, + 9, 10, 11, 12, 13, 14, 15, 16}; + const size_t kOutSize = 16; + std::vector<float> expect_out = {1, 3, 9, 11, 2, 4, 10, 12, + 5, 7, 13, 15, 6, 8, 14, 16}; + float out[kOutSize]; + std::vector<int> in_shape = {1, 4, 4, 1}; + std::vector<int> out_shape = {4, 2, 2, 1}; + SpaceToBatchParameter param; + param.block_sizes_[0] = 2; + param.block_sizes_[1] = 2; + DoSpaceToBatchNHWC(input.data(), out, &param, in_shape.data(), out_shape.data()); + for (int i = 0; i < kOutSize; ++i) { + std::cout << out[i] << " "; } std::cout << "\n"; - CompareOutputData(output, expect_out, out_size, 0.000001); + CompareOutputData(out, expect_out.data(), kOutSize, 0.000001); } -TEST_F(SpaceToBatchTestFp32, SpaceToBatchTest2) { +TEST_F(SpaceToBatchTestFp32, SpaceToBatchTest7) { + std::vector<float> input = {1, 11, 2, 12, 3, 13, 4, 14, + 5, 15, 6, 16, 7, 17, 8, 18, + 9, 19, 10, 110, 11, 111, 12, 112, + 10, 11, 20, 12, 30, 13, 40, 14, + 50, 15, 60, 16, 70, 17, 80, 18, + 13, 113, 14, 114, 15, 115, 16, 116}; + const size_t kOutSize = 48; + std::vector<float> expect_out = {1, 11, 3, 13, 9, 19, 11, 111, + 50, 15, 70, 17, 2, 12, 4, 14, + 10, 110, 12, 112, 60, 16, 80, 18, + 5, 15, 7, 17, 10, 11, 30, 13, + 13, 113, 15, 115, 6, 16, 8, 18, + 20, 12, 40, 14, 14, 114, 16, 116}; + float out[kOutSize]; + std::vector<int> in_shape = {1, 6, 4, 2}; + std::vector<int> out_shape = {4, 3, 2, 2}; SpaceToBatchParameter param; - InitSpaceToBatchParameter(&param); - float input[16] = {1, 2, 5, 6, 10, 20, 3, 8, 18, 10, 3, 4, 11, 55, 15, 25}; - const int out_size = 48; - float expect_out[48] = {0, 0, 0, 0, 0, 1, 5, 0, 0, 18, 3, 0, 0, 0, 0, 0, 0, 2, 6, 0, 0, 10, 4, 0, - 0, 0, 0, 0, 0, 10, 3, 0, 0, 11, 15, 0, 0, 0, 0, 0, 0, 20, 8, 0, 0, 55, 25, 0}; - float output[48]; - int in_shape[4] = {1, 4, 4, 1}; - int out_shape[4] = {4, 3, 4, 1}; - int block_sizes[2] = {2, 2}; + param.block_sizes_[0] = 2; + param.block_sizes_[1] = 2; + DoSpaceToBatchNHWC(input.data(), out, &param, in_shape.data(), out_shape.data()); + for (int i = 0; i < kOutSize; ++i) { + std::cout << out[i] << " "; + } + std::cout << "\n"; + CompareOutputData(out, expect_out.data(), kOutSize, 0.000001); +}
- float padded_input[48]{}, tmp[48]{}, tmp_zero[48]{}; - float *tmp_space[3] = {padded_input, tmp, tmp_zero}; - // DoPadding - DoPadding(input, padded_input, param, tmp_space + 1); +TEST_F(SpaceToBatchTestFp32, SpaceToBatchTest8) { + std::vector<float> input = {1, -1, 2, -2, 3, -3, 4, -4, 5, -5, 6, -6, 7, -7, 8, -8, + 9, -9, 10, -10, 11, -11, 12, -12, 13, -13, 14, -14, 15, -15, 16, -16}; + std::vector<float> expect_out = {1, -1, 2, -2, 3, -3, 4, -4, 0, 0, 5, -5, 6, -6, 7, -7, 8, -8, 0, 0, + 9, -9, 10, -10, 11, -11, 12, -12, 0, 0, 13, -13, 14, -14, 15, -15, 16, -16, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + const size_t kOutSize = 50; + float out[kOutSize]; + std::vector<int> in_shape = {1, 4, 4, 2}; + std::vector<int> out_shape = {1, 5, 5, 2}; + std::vector<int> padding = {0, 1, 0, 1}; + std::vector<float> pedding_h(10, 0); + std::vector<float> pedding_w(2, 0); + DoSpaceToBatchPaddingNHWC(input.data(), out, in_shape.data(), padding.data(), out_shape.data(), pedding_h.data(), + pedding_w.data()); + for (int i = 0; i < kOutSize; ++i) { + std::cout << out[i] << " "; + } + std::cout << "\n"; + CompareOutputData(out, expect_out.data(), kOutSize, 0.000001); +} - auto ret = SpaceToBatch((const float *)padded_input, output, param, 0, 4 / 2); - std::cout << "return " << ret << std::endl; - for (int i = 0; i < out_size; ++i) { - std::cout << output[i] << " "; +TEST_F(SpaceToBatchTestFp32, SpaceToBatchTest9) { + std::vector<float> input = {1, -1, 2, -2, 3, -3, 4, -4, 5, -5, 6, -6, 7, -7, 8, -8, + 9, -9, 10, -10, 11, -11, 12, -12, 13, -13, 14, -14, 15, -15, 16, -16}; + std::vector<float> expect_out = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 1, -1, 2, -2, 3, -3, 4, -4, 0, 0, + 0, 0, 5, -5, 6, -6, 7, -7, 8, -8, 0, 0, + 0, 0, 9, -9, 10, -10, 11, -11, 12, -12, 0, 0, + 0, 0, 13, -13, 14, -14, 15, -15, 16, -16, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + const size_t kOutSize = 72; + float out[kOutSize]; + std::vector<int> in_shape = {1, 4, 4, 2}; + std::vector<int> out_shape = {1, 6, 6, 2}; + std::vector<int> padding = {1, 1, 1, 1}; + std::vector<float> pedding_h(12, 0); + std::vector<float> pedding_w(2, 0); + DoSpaceToBatchPaddingNHWC(input.data(), out, in_shape.data(), padding.data(), out_shape.data(), pedding_h.data(), + pedding_w.data()); + for (int i = 0; i < kOutSize; ++i) { + std::cout << out[i] << " "; } std::cout << "\n"; - CompareOutputData(output, expect_out, out_size, 0.000001); + CompareOutputData(out, expect_out.data(), kOutSize, 0.000001); }
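Test8 and Test9 above fix the layout DoSpaceToBatchPaddingNHWC expects: the padding vector is ordered {top, bottom, left, right}, and padded positions are zero-filled. A reference version consistent with those expected outputs (a sketch only; the real helper additionally takes the pre-zeroed row/column buffers pedding_h and pedding_w, which this loop does without):

// Reference NHWC zero padding with per-edge amounts {top, bottom, left, right}.
void PadRefNHWC(const float *in, float *out, const int *in_shape, const int *padding) {
  const int n = in_shape[0], h = in_shape[1], w = in_shape[2], c = in_shape[3];
  const int oh = h + padding[0] + padding[1];
  const int ow = w + padding[2] + padding[3];
  for (int b = 0; b < n; ++b) {
    for (int y = 0; y < oh; ++y) {
      for (int x = 0; x < ow; ++x) {
        const int sy = y - padding[0];
        const int sx = x - padding[2];
        const bool inside = sy >= 0 && sy < h && sx >= 0 && sx < w;
        for (int ch = 0; ch < c; ++ch) {
          out[((b * oh + y) * ow + x) * c + ch] =
              inside ? in[((b * h + sy) * w + sx) * c + ch] : 0.0f;
        }
      }
    }
  }
}

Test10 below then chains this padding step with the block rearrangement, which is the same pad-then-split pipeline the removed SpaceToBatch/DoPadding pair implemented monolithically.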
-TEST_F(SpaceToBatchTestFp32, SpaceToBatchTest3) { +TEST_F(SpaceToBatchTestFp32, SpaceToBatchTest10) { + std::vector<float> input = {1, -1, 2, -2, 3, -3, 4, -4, 5, -5, 6, -6, 7, -7, 8, -8, + 9, -9, 10, -10, 11, -11, 12, -12, 13, -13, 14, -14, 15, -15, 16, -16}; + std::vector<float> expect_out = {0, 0, 0, 0, 0, 0, + 0, 0, 6, -6, 8, -8, + 0, 0, 14, -14, 16, -16, + 0, 0, 0, 0, 0, 0, + 5, -5, 7, -7, 0, 0, + 13, -13, 15, -15, 0, 0, + 0, 0, 2, -2, 4, -4, + 0, 0, 10, -10, 12, -12, + 0, 0, 0, 0, 0, 0, + 1, -1, 3, -3, 0, 0, + 9, -9, 11, -11, 0, 0, + 0, 0, 0, 0, 0, 0}; + const size_t kOutSize = 72; + float out[kOutSize]; + float pedding_out[kOutSize]; + std::vector<int> in_shape = {1, 4, 4, 2}; + std::vector<int> pedding_out_shape = {1, 6, 6, 2}; + std::vector<int> out_shape = {4, 3, 3, 2}; + std::vector<int> padding = {1, 1, 1, 1}; + std::vector<float> pedding_h(12, 0); + std::vector<float> pedding_w(2, 0); + DoSpaceToBatchPaddingNHWC(input.data(), pedding_out, in_shape.data(), padding.data(), pedding_out_shape.data(), + pedding_h.data(), pedding_w.data()); SpaceToBatchParameter param; - InitSpaceToBatchParameter2(&param); - param.op_parameter_.type_ = schema::PrimitiveType_SpaceToBatch; - std::vector<float> input = {1, 2, 5, 6, 10, 20, 3, 8, 18, 10, 3, 4, 11, 55, 15, 25}; std::vector<int> in_shape = {1, 4, 4, 1}; lite::tensor::Tensor input_tensor; input_tensor.SetData(input.data()); input_tensor.set_shape(in_shape); input_tensor.SetFormat(schema::Format_NHWC); input_tensor.set_data_type(kNumberTypeFloat32); std::vector<lite::tensor::Tensor *> inputs_tensor; inputs_tensor.emplace_back(&input_tensor); - - const int out_size = 48; float expect_out[48] = {0, 0, 0, 0, 0, 1, 5, 0, 0, 18, 3, 0, 0, 0, 0, 0, 0, 2, 6, 0, 0, 10, 4, 0, - 0, 0, 0, 0, 0, 10, 3, 0, 0, 11, 15, 0, 0, 0, 0, 0, 0, 20, 8, 0, 0, 55, 25, 0}; std::vector<float> output(48); std::vector<int> out_shape = {4, 3, 4, 1}; lite::tensor::Tensor output_tensor; output_tensor.SetData(output.data()); output_tensor.set_shape(out_shape); output_tensor.SetFormat(schema::Format_NHWC); output_tensor.set_data_type(kNumberTypeFloat32); std::vector<lite::tensor::Tensor *> outputs_tensor; outputs_tensor.emplace_back(&output_tensor); - - lite::Context ctx; - ctx.thread_num_ = 2; - kernel::KernelKey desc = {kernel::KERNEL_ARCH::kCPU, kNumberTypeFloat32, schema::PrimitiveType_SpaceToBatch}; - auto creator = lite::KernelRegistry::GetInstance()->GetCreator(desc); - ASSERT_NE(creator, nullptr); - kernel::LiteKernel *kernel = - creator(inputs_tensor, outputs_tensor, reinterpret_cast<OpParameter *>(&param), &ctx, desc, nullptr); - ASSERT_NE(kernel, nullptr); - kernel->Run(); - - for (int i = 0; i < out_size; ++i) { - std::cout << output[i] << " "; + param.block_sizes_[0] = 2; + param.block_sizes_[1] = 2; + DoSpaceToBatchNHWC(pedding_out, out, &param, pedding_out_shape.data(), out_shape.data()); + for (int i = 0; i < kOutSize; ++i) { + std::cout << out[i] << " "; } std::cout << "\n"; - CompareOutputData(output.data(), expect_out, out_size, 0.000001); - input_tensor.SetData(nullptr); - output_tensor.SetData(nullptr); + CompareOutputData(out, expect_out.data(), kOutSize, 0.000001); } - } // namespace mindspore diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/strassen_fp32_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/strassen_fp32_tests.cc deleted file mode 100644 index f1c76c7676..0000000000 --- a/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/strassen_fp32_tests.cc +++ /dev/null @@ -1,369 +0,0 @@ -/** - * Copyright 2020 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License.
- */ -#include -#include -#include "utils/log_adapter.h" -#include "common/common_test.h" -#include "src/common/file_utils.h" -#include "mindspore/lite/nnacl/pack.h" -#include "mindspore/lite/nnacl/fp32/strassen_matmul.h" -#include "mindspore/lite/nnacl/conv_parameter.h" - -namespace mindspore { -class TestStrassenFp32 : public mindspore::CommonTest { - public: - TestStrassenFp32() {} -}; - -TEST_F(TestStrassenFp32, MatrixAdd1) { - float a[] = {0.06796285, 0.6176181, 0.33195993, 0.2752791, 0.36864007, 0.04605605, 0.33899087, 0.9820137, - 0.49804246, 0.8242412, 0.8458231, 0.6530539, 0.6336898, 0.8367749, 0.57166654, 0.25895607, - 0.90079665, 0.10585558, 0.8215811, 0.48977906, 0.7895138, 0.41816455, 0.18999523, 0.28736928, - 0.5882977, 0.44262612, 0.65245426, 0.7834421, 0.60903394, 0.82289135, 0.03855767, 0.30543327, - 0.37747085, 0, 0, 0, 0.590335, 0, 0, 0, - 0.7578682, 0, 0, 0, 0.81001425, 0, 0, 0, - 0.9487712, 0, 0, 0, 0.11742989, 0, 0, 0, - 0.60004807, 0, 0, 0, 0.05973052, 0, 0, 0}; - float b[] = {0.112120815, 0.6869974, 0.08290442, 0.43003577, 0.044390075, 0.23077105, 0.23964432, 0.4426781, - 0.6612115, 0.14988606, 0.84881437, 0.032587975, 0.35028255, 0.41838303, 0.12859282, 0.060378596, - 0.8272769, 0.6949804, 0.9120368, 0.12399232, 0.9292184, 0.7566025, 0.10235854, 0.015936268, - 0.20426726, 0.9926392, 0.54714125, 0.7022856, 0.58746314, 0.95714045, 0.26433542, 0.9030878, - 0.8596953, 0, 0, 0, 0.8341476, 0, 0, 0, - 0.72301114, 0, 0, 0, 0.40733734, 0, 0, 0, - 0.2873559, 0, 0, 0, 0.612321, 0, 0, 0, - 0.5008707, 0, 0, 0, 0.2586266, 0, 0, 0}; - float add[] = {0.18008366, 1.3046155, 0.41486436, 0.7053149, 0.41303015, 0.2768271, 0.5786352, 1.4246918, - 1.159254, 0.9741273, 1.6946375, 0.6856419, 0.9839724, 1.255158, 0.7002593, 0.3193347, - 1.7280736, 0.80083597, 1.7336179, 0.6137714, 1.7187322, 1.174767, 0.29235378, 0.30330554, - 0.792565, 1.4352653, 1.1995955, 1.4857277, 1.1964971, 1.7800318, 0.3028931, 1.2085211, - 1.2371662, 0, 0, 0, 1.4244826, 0, 0, 0, - 1.4808793, 0, 0, 0, 1.2173516, 0, 0, 0, - 1.2361271, 0, 0, 0, 0.72975093, 0, 0, 0, - 1.1009188, 0, 0, 0, 0.31835714, 0, 0, 0}; - float out[64] = {0}; - MatrixAdd(a, b, out, 32, 32, 32, 8, 2); - EXPECT_EQ(0, lite::CompareOutputData(out, add, 64)); -} - -TEST_F(TestStrassenFp32, MatrixAdd2) { - float a[] = {0.06796285, 0.6176181, 0.33195993, 0.2752791, 0.36864007, 0.04605605, 0.33899087, 0.9820137, - 0.49804246, 0.8242412, 0.8458231, 0.6530539, 0.6336898, 0.8367749, 0.57166654, 0.25895607, - 0.90079665, 0.10585558, 0.8215811, 0.48977906, 0.7895138, 0.41816455, 0.18999523, 0.28736928, - 0.5882977, 0.44262612, 0.65245426, 0.7834421, 0.60903394, 0.82289135, 0.03855767, 0.30543327, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0.37747085, 0, 0, 0, - 0.590335, 0, 0, 0, 0.7578682, 0, 0, 0, - 0.81001425, 0, 0, 0, 0.9487712, 0, 0, 0, - 0.11742989, 0, 0, 0, 0.60004807, 0, 0, 0, - 0.05973052, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0}; - float b[] = {0.112120815, 0.6869974, 0.08290442, 0.43003577, 0.044390075, 0.23077105, 0.23964432, 0.4426781, - 0.6612115, 0.14988606, 0.84881437, 0.032587975, 0.35028255, 0.41838303, 0.12859282, 0.060378596, - 0.8272769, 0.6949804, 0.9120368, 0.12399232, 0.9292184, 0.7566025, 0.10235854, 0.015936268, - 0.20426726, 0.9926392, 0.54714125, 0.7022856, 0.58746314, 0.95714045, 0.26433542, 0.9030878, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0.8596953, 0, 0, 0, 0.8341476, 0, 0, 0, - 0.72301114, 0, 0, 0, 0.40733734, 0, 0, 0, - 0.2873559, 0, 0, 0, 0.612321, 0, 0, 0, - 0.5008707, 0, 0, 0, 
0.2586266, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0}; - float add[] = {0.18008366, 1.3046155, 0.41486436, 0.7053149, 0.41303015, 0.2768271, 0.5786352, 1.4246918, - 1.159254, 0.9741273, 1.6946375, 0.6856419, 0.9839724, 1.255158, 0.7002593, 0.3193347, - 1.7280736, 0.80083597, 1.7336179, 0.6137714, 1.7187322, 1.174767, 0.29235378, 0.30330554, - 0.792565, 1.4352653, 1.1995955, 1.4857277, 1.1964971, 1.7800318, 0.3028931, 1.2085211, - 0, 0, 0, 0, 1.2371662, 0, 0, 0, - 1.4244826, 0, 0, 0, 1.4808793, 0, 0, 0, - 1.2173516, 0, 0, 0, 1.2361271, 0, 0, 0, - 0.72975093, 0, 0, 0, 1.1009188, 0, 0, 0, - 0.31835714, 0, 0, 0, 0, 0, 0, 0}; - float out[72] = {0}; - MatrixAdd(a, b, out, 44, 56, 36, 8, 2); - EXPECT_EQ(0, lite::CompareOutputData(out, add, 72)); -} - -TEST_F(TestStrassenFp32, MatrixSub1) { - float a[] = {0.4160896, 0.55011475, 0.60395557, 0.964036, 0.8010256, 0.908257, 0.60170764, 0.008877548, - 0.4973592, 0.6104505, 0.2957374, 0.39589414, 0.0151615525, 0.45663023, 0.3815148, 0.6419536, - 0.9118046, 0.5312479, 0.104496025, 0.5972911, 0.9671534, 0.7195669, 0.23360363, 0.22078007, - 0.31118092, 0.7438336, 0.5592656, 0.7212792, 0.97856164, 0.26012093, 0.18205991, 0.90656054, - 0.24593723, 0, 0, 0, 0.5024593, 0, 0, 0, - 0.42271087, 0, 0, 0, 0.48668534, 0, 0, 0, - 0.4374295, 0, 0, 0, 0.22822042, 0, 0, 0, - 0.88180095, 0, 0, 0, 0.7505223, 0, 0, 0}; - float b[] = {0.14911577, 0.63214976, 0.74834836, 0.36854064, 0.5801671, 0.24166176, 0.64528674, 0.04887214, - 0.23637155, 0.34321627, 0.69035923, 0.6114065, 0.73006815, 0.575073, 0.88130534, 0.72951907, - 0.17092401, 0.652334, 0.6288812, 0.62121505, 0.12793411, 0.16503152, 0.7564361, 0.51976234, - 0.19353953, 0.5795124, 0.6671185, 0.10646773, 0.13608798, 0.37959677, 0.24294423, 0.1790138, - 0.85054415, 0, 0, 0, 0.18541782, 0, 0, 0, - 0.72714496, 0, 0, 0, 0.43221787, 0, 0, 0, - 0.7200413, 0, 0, 0, 0.15780604, 0, 0, 0, - 0.30473796, 0, 0, 0, 0.37719592, 0, 0, 0}; - float s[] = {0.26697382, -0.082035, -0.14439279, 0.59549534, 0.22085851, 0.6665952, -0.0435791, -0.03999459, - 0.26098764, 0.26723424, -0.39462185, -0.21551237, -0.7149066, -0.11844277, -0.49979055, -0.08756548, - 0.7408806, -0.12108606, -0.5243852, -0.02392393, 0.8392193, 0.5545354, -0.5228325, -0.29898226, - 0.11764139, 0.16432118, -0.10785288, 0.6148115, 0.8424736, -0.11947584, -0.06088431, 0.72754675, - -0.6046069, 0., 0., 0., 0.31704146, 0., 0., 0., - -0.3044341, 0., 0., 0., 0.05446747, 0., 0., 0., - -0.2826118, 0., 0., 0., 0.07041438, 0., 0., 0., - 0.57706296, 0., 0., 0., 0.3733264, 0., 0., 0.}; - float out[64] = {0}; - MatrixSub(a, b, out, 32, 32, 32, 8, 2); - EXPECT_EQ(0, lite::CompareOutputData(out, s, 64)); -} - -TEST_F(TestStrassenFp32, MatrixSub2) { - float a[] = {0.4160896, 0.55011475, 0.60395557, 0.964036, 0.8010256, 0.908257, 0.60170764, 0.008877548, - 0.4973592, 0.6104505, 0.2957374, 0.39589414, 0.0151615525, 0.45663023, 0.3815148, 0.6419536, - 0.9118046, 0.5312479, 0.104496025, 0.5972911, 0.9671534, 0.7195669, 0.23360363, 0.22078007, - 0.31118092, 0.7438336, 0.5592656, 0.7212792, 0.97856164, 0.26012093, 0.18205991, 0.90656054, - 0.24593723, 0, 0, 0, 0.5024593, 0, 0, 0, - 0.42271087, 0, 0, 0, 0.48668534, 0, 0, 0, - 0.4374295, 0, 0, 0, 0.22822042, 0, 0, 0, - 0.88180095, 0, 0, 0, 0.7505223, 0, 0, 0}; - float b[] = {0.14911577, 0.63214976, 0.74834836, 0.36854064, 0.5801671, 0.24166176, 0.64528674, 0.04887214, - 0.23637155, 0.34321627, 0.69035923, 0.6114065, 0.73006815, 0.575073, 0.88130534, 0.72951907, - 0.17092401, 0.652334, 0.6288812, 
0.62121505, 0.12793411, 0.16503152, 0.7564361, 0.51976234, - 0.19353953, 0.5795124, 0.6671185, 0.10646773, 0.13608798, 0.37959677, 0.24294423, 0.1790138, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0.85054415, 0, 0, 0, - 0.18541782, 0, 0, 0, 0.72714496, 0, 0, 0, - 0.43221787, 0, 0, 0, 0.7200413, 0, 0, 0, - 0.15780604, 0, 0, 0, 0.30473796, 0, 0, 0, - 0.37719592, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0}; - float s[] = {0.26697382, -0.082035, -0.14439279, 0.59549534, 0.22085851, 0.6665952, -0.0435791, -0.03999459, - 0.26098764, 0.26723424, -0.39462185, -0.21551237, -0.7149066, -0.11844277, -0.49979055, -0.08756548, - 0.7408806, -0.12108606, -0.5243852, -0.02392393, 0.8392193, 0.5545354, -0.5228325, -0.29898226, - 0.11764139, 0.16432118, -0.10785288, 0.6148115, 0.8424736, -0.11947584, -0.06088431, 0.72754675, - 0, 0, 0, 0, -0.6046069, 0., 0., 0., - 0.31704146, 0., 0., 0., -0.3044341, 0., 0., 0., - 0.05446747, 0., 0., 0., -0.2826118, 0., 0., 0., - 0.07041438, 0., 0., 0., 0.57706296, 0., 0., 0., - 0.3733264, 0., 0., 0, 0, 0, 0, 0.}; - float out[72] = {0}; - MatrixSub(a, b, out, 32, 44, 36, 8, 2); - EXPECT_EQ(0, lite::CompareOutputData(out, s, 72)); -} - -TEST_F(TestStrassenFp32, MatrixPack1) { - float in[] = {4.1795, 13.142, -3.593, 16.505, 19.969, -6.235, -2.380, -9.027, 23.622, 8.3608, 47.325, -14.36, - -0.784, 37.925, -0.081, 6.1298, 37.998, 13.719, 11.029, 1.7127, 9.0560, 14.988, 3.1866, 0.0562, - 14.530, -14.10, -8.115, -8.071, 19.250, 17.923, 13.584, 3.3293, -1.514, -0.293, 18.686, 0.0873, - 19.899, 8.5562, 0.0, 0.0, 9.5542, 18.974, 0.0, 0.0, 15.370, 4.3049, 0.0, 0.0, - 0.6721, -1.517, 0.0, 0.0, -1.770, 41.903, 0.0, 0.0, 8.1381, 9.1391, 0.0, 0.0, - -8.158, 7.7566, 0.0, 0.0, 9.7341, 18.834, 0.0, 0.0, 4.2010, -2.253, 0.0, 0.0}; - float correct[] = {4.1795, 13.142, -3.593, 16.505, 19.969, -6.235, -2.380, -9.027, 23.622, 8.3608, 47.325, -14.36, - -0.784, 37.925, -0.081, 6.1298, 19.899, 8.5562, 0.0, 0.0, 9.5542, 18.974, 0.0, 0.0, - 15.370, 4.3049, 0.0, 0.0, 0.6721, -1.517, 0.0, 0.0, 37.998, 13.719, 11.029, 1.7127, - 9.0560, 14.988, 3.1866, 0.0562, 14.530, -14.10, -8.115, -8.071, -1.770, 41.903, 0.0, 0.0, - 8.1381, 9.1391, 0.0, 0.0, -8.158, 7.7566, 0.0, 0.0}; - float out[56] = {0}; - - MatrixPack(in, out, 7, 2, 36); - EXPECT_EQ(0, lite::CompareOutputData(out, correct, 56)); -} - -TEST_F(TestStrassenFp32, MatrixPack2) { - float in[] = {4.1795, 13.142, -3.593, 16.505, 19.969, -6.235, -2.380, -9.027, 23.622, 8.3608, 47.325, -14.36, - -0.784, 37.925, -0.081, 6.1298, 37.998, 13.719, 11.029, 1.7127, 9.0560, 14.988, 3.1866, 0.0562, - 14.530, -14.10, -8.115, -8.071, 19.250, 17.923, 13.584, 3.3293, -1.514, -0.293, 18.686, 0.0873, - 19.899, 8.5562, 0.0, 0.0, 9.5542, 18.974, 0.0, 0.0, 15.370, 4.3049, 0.0, 0.0, - 0.6721, -1.517, 0.0, 0.0, -1.770, 41.903, 0.0, 0.0, 8.1381, 9.1391, 0.0, 0.0, - -8.158, 7.7566, 0.0, 0.0, 9.7341, 18.834, 0.0, 0.0, 4.2010, -2.253, 0.0, 0.0}; - float correct[] = {4.1795, 13.142, -3.593, 16.505, 19.969, -6.235, -2.380, -9.027, 23.622, 8.3608, 47.325, -14.36, - -0.784, 37.925, -0.081, 6.1298, 19.899, 8.5562, 0.0, 0.0, 9.5542, 18.974, 0.0, 0.0, - 15.370, 4.3049, 0.0, 0.0, 0.6721, -1.517, 0.0, 0.0, 37.998, 13.719, 11.029, 1.7127, - 9.0560, 14.988, 3.1866, 0.0562, 14.530, -14.10, -8.115, -8.071, 19.250, 17.923, 13.584, 3.3293, - -1.770, 41.903, 0.0, 0.0, 8.1381, 9.1391, 0.0, 0.0, -8.158, 7.7566, 0.0, 0.0, - 9.7341, 18.834, 0.0, 0.0, -1.514, -0.293, 18.686, 0.0873, 4.2010, -2.253, 0.0, 0.0}; - float out[72] = {0}; - MatrixPack(in, out, 9, 2, 36); - EXPECT_EQ(0, 
lite::CompareOutputData(out, correct, 72)); -} - -TEST_F(TestStrassenFp32, CommonMatmul1) { - float a_ptr[] = {7.756654, 19.250782, 17.923292, 0, 13.584222, 3.3293908, 9.734102, 0, - 18.83455, -1.51425, -0.29382, 0, 18.686155, 0.0873076, 4.2010098, 0, - -2.2539594, 4.1795673, 13.14235, 0, -3.59393, 16.50578, 19.899279, 0, - 8.556229, 19.969376, -6.2355065, 0, -2.380469, -9.027744, 9.5542, 0}; - float b_ptr[] = {0.2674241, 0.089372, -0.081915, 2.0580146, -0.295045, 1.377944, 0.703658, 1.055378, - 1.204049, -0.256505, -0.309640, 0.560465, 0, 0, 0, 0, - 0.646906, 0, 0, 0, -0.168206, 0, 0, 0, - -0.95630, 0, 0, 0, 0, 0, 0, 0}; - float correct[] = {17.97499, 22.622334, 7.360805, 46.325558, 14.37076, 3.304931, -1.784072, 36.925926, - 5.129812, -0.3278886, -2.517368, 36.99899, 10.029593, 0.7127603, -2.77004, 40.90305, - 13.988123, 2.186689, -0.943787, 7.138184, 18.128653, 17.31859, 5.7472067, 21.176342, - -11.11159, 29.880829, 15.281498, 35.1893, 13.530734, -15.10318, -9.11581, -9.071925, - -15.36046, 0, 0, 0, -1.081104, 0, 0, 0, - 12.719885, 0, 0, 0, 8.056052, 0, 0, 0, - -14.72927, 0, 0, 0, -24.1311, 0, 0, 0, - 8.139168, 0, 0, 0, -9.158176, 0, 0, 0}; - StrassenMatMulParameter *matmul_param = new StrassenMatMulParameter(); - matmul_param->row_ = 8; - matmul_param->deep_ = 1; - matmul_param->col_ = 2; - matmul_param->a_stride_ = 32; - matmul_param->b_stride_ = 16; - matmul_param->c_stride_ = 32; - - float c_ptr[64] = {0}; - float tmp_ptr[32]; - CommonMatMul(a_ptr, b_ptr, c_ptr, matmul_param, tmp_ptr); - - EXPECT_EQ(0, lite::CompareOutputData(c_ptr, correct, 64)); - delete matmul_param; -} - -TEST_F(TestStrassenFp32, CommonMatmul2) { - StrassenMatMulParameter *matmul_param = new StrassenMatMulParameter(); - float a[] = {4.864725, 6.830073, 0.76780415, 8.922394, 5.096872, 2.4946148, 4.2148714, 1.7762588, 0.89195687, - 9.703938, 2.0654619, 9.048538, 2.358036, 5.643526, 2.5152204, 3.512572, 3.7913973, 3.7136157, - 8.820186, 1.5324963, 3.135459, 7.5792265, 7.1820426, 0.267987, 8.737802, 4.064117, 2.7232447, - 0.27355433, 0, 0, 0, 0, 0, 0, 0, 0, - 6.320409, 9.479354, 0, 0, 1.6220464, 0.57753897, 0, 0, 9.786372, - 6.0404425, 0, 0, 2.1067812, 4.8034563, 0, 0, 2.1140356, 8.204062, - 0, 0, 3.29985, 1.2034118, 0, 0, 7.6059656, 4.162436, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0}; - - float b[] = { - 4.4558744, 0.6383263, 0.05037839, 9.730914, 8.1542015, 4.3625517, 8.654026, 3.805875, 9.845131, 4.08051, - 9.667656, 7.73955, 9.283867, 8.465257, 2.292051, 9.853942, 0.13320169, 3.8789113, 9.460265, 4.2616735, - 0.23831692, 4.420147, 0.5355651, 7.829217, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 1.9866786, 0, 0, 0, 6.0188327, 0, - 0, 0, 6.6249146, 0, 0, 0, 3.5639563, 0, 0, 0, - 0.14810833, 0, 0, 0, 7.4168983, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0}; - float c[] = {170.86482, 177.98166, 152.0957, 268.3473, 101.39282, 55.216248, 82.31873, 120.65008, 190.18558, - 192.58974, 220.54767, 239.75931, 115.32386, 95.52758, 103.82857, 145.08948, 150.4757, 112.04814, - 145.50496, 207.63342, 149.6962, 84.76027, 167.65851, 141.06763, 103.42963, 84.63687, 136.74927, - 189.26935, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 158.90288, 0, 0, 
0, 63.917973, - 0, 0, 0, 152.3613, 0, 0, 0, 103.77265, 0, - 0, 0, 154.94044, 0, 0, 0, 109.79707, 0, 0, - 0, 92.83551, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0}; - - matmul_param->row_ = 7; - matmul_param->deep_ = 2; - matmul_param->col_ = 2; - matmul_param->a_stride_ = 36; - matmul_param->b_stride_ = 64; - matmul_param->c_stride_ = 40; - float out[80] = {0}; - float tmp_ptr[1000]; - CommonMatMul(a, b, out, matmul_param, tmp_ptr); - EXPECT_EQ(0, lite::CompareOutputData(out, c, 80)); - delete (matmul_param); -} - -TEST_F(TestStrassenFp32, RecMatmul1) { - StrassenMatMulParameter *matmul_param = new StrassenMatMulParameter(); - matmul_param->row_ = 4; - matmul_param->deep_ = 2; - matmul_param->col_ = 2; - matmul_param->a_stride_ = 16; - matmul_param->b_stride_ = 32; - matmul_param->c_stride_ = 16; - - float a[] = {9.02165, 8.657163, 0.56371903, 0.7272156, 1.6258951, 9.919627, 7.47593, 3.5311592, - 8.958062, 0.55338514, 9.611276, 7.429841, 8.23804, 3.7503464, 1.2829816, 6.4470887, - 4.303486, 6.282502, 0, 0, 9.4194765, 7.8199654, 0, 0, - 6.738705, 7.5398073, 0, 0, 0.47684374, 0.87746763, 0, 0}; - float b[] = {1.8100919, 6.016964, 5.733568, 5.768448, 2.2823029, 2.173359, 0.56861514, 7.134393, - 0.26377398, 3.9010656, 4.868408, 0.33401546, 1.7973539, 8.21896, 5.62239, 8.54786, - 0.97356945, 1.0714527, 6.447588, 6.161091, 3.332229, 2.8775468, 6.558747, 2.6986659, - 0, 0, 0, 0, 0, 0, 0, 0, - 1.9830805, 0, 0, 0, 8.44718, 0, 0, 0, - 9.360418, 0, 0, 0, 6.220693, 0, 0, 0, - 1.8369701, 0, 0, 0, 4.3965054, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0}; - float c[] = {62.668518, 103.9633, 132.43439, 163.67749, 69.12974, 122.12326, 183.23413, 191.96806, - 65.052124, 182.57918, 233.14148, 184.20694, 38.785316, 118.74806, 100.689575, 135.12036, - 136.34613, 0, 0, 0, 230.64507, 0, 0, 0, - 204.15103, 0, 0, 0, 104.86488, 0, 0, 0}; - float out[32] = {0}; - - float tmp_ptr[1000]; - RecursionMatmul(a, b, out, matmul_param, 1, 0, tmp_ptr); - EXPECT_EQ(0, lite::CompareOutputData(out, c, 32)); - delete (matmul_param); -} - -TEST_F(TestStrassenFp32, RecMatmul2) { - StrassenMatMulParameter *matmul_param = new StrassenMatMulParameter(); - matmul_param->row_ = 4; - matmul_param->deep_ = 2; - matmul_param->col_ = 2; - matmul_param->a_stride_ = 32; - matmul_param->b_stride_ = 64; - matmul_param->c_stride_ = 32; - - float a[] = {9.02165, 8.657163, 0.56371903, 0.7272156, 1.6258951, 9.919627, 7.47593, 3.5311592, - 8.958062, 0.55338514, 9.611276, 7.429841, 8.23804, 3.7503464, 1.2829816, 6.4470887, - 1, 2, 3, 4, 1, 2, 3, 4, - 3, 2, 3, 4, 4, 2, 3, 4, - 4.303486, 6.282502, 0, 0, 9.4194765, 7.8199654, 0, 0, - 6.738705, 7.5398073, 0, 0, 0.47684374, 0.87746763, 0, 0, - 1, 2, 3, 4, 1, 2, 3, 4, - 3, 2, 3, 4, 4, 2, 3, 4}; - float b[] = { - 1.8100919, 6.016964, 5.733568, 5.768448, 2.2823029, 2.173359, 0.56861514, 7.134393, 0.26377398, 3.9010656, - 4.868408, 0.33401546, 1.7973539, 8.21896, 5.62239, 8.54786, 0.97356945, 1.0714527, 6.447588, 6.161091, - 3.332229, 2.8775468, 6.558747, 2.6986659, 0, 0, 0, 0, 0, 0, - 0, 0, 11, 2, 3, 4, 22, 2, 3, 4, - 33, 3, 3, 4, 44, 2, 3, 4, 11, 2, - 3, 4, 22, 2, 3, 4, 33, 3, 3, 4, - 44, 2, 3, 4, 1.9830805, 0, 0, 0, 8.44718, 0, - 0, 0, 9.360418, 0, 0, 0, 6.220693, 0, 0, 0, - 1.8369701, 0, 0, 0, 4.3965054, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 11, 2, 3, 4, - 22, 2, 3, 4, 33, 3, 3, 4, 44, 2, - 3, 4, 11, 2, 3, 4, 22, 2, 3, 4, - 33, 3, 3, 4, 44, 2, 3, 4}; - float c[] = {62.668518, 103.9633, 132.43439, 163.67749, 69.12974, 122.12326, 183.23413, 191.96806, - 65.052124, 182.57918, 233.14148, 184.20694, 38.785316, 118.74806, 
100.689575, 135.12036, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 136.34613, 0, 0, 0, 230.64507, 0, 0, 0, - 204.15103, 0, 0, 0, 104.86488, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0}; - float out[64] = {0}; - - float tmp_ptr[1000]; - RecursionMatmul(a, b, out, matmul_param, 1, 0, tmp_ptr); - EXPECT_EQ(0, lite::CompareOutputData(out, c, 64)); - delete (matmul_param); -} -} // namespace mindspore diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/topk_fp32_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/topk_fp32_tests.cc index f7d2429858..61123741e5 100644 --- a/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/topk_fp32_tests.cc +++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/topk_fp32_tests.cc @@ -45,7 +45,8 @@ TEST_F(TestTopKFp32, TopK) { auto creator = lite::KernelRegistry::GetInstance()->GetCreator(desc); ASSERT_NE(creator, nullptr); - auto kernel = creator(inputs, outputs, reinterpret_cast(¶meter), nullptr, desc, nullptr); + auto ctx = std::make_shared(); + auto kernel = creator(inputs, outputs, reinterpret_cast(¶meter), ctx.get(), desc, nullptr); ASSERT_NE(kernel, nullptr); auto ret = kernel->Run(); diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/transpose_fp32_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/transpose_fp32_tests.cc index 9c6d3e9359..5af17aaa93 100644 --- a/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/transpose_fp32_tests.cc +++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/transpose_fp32_tests.cc @@ -65,7 +65,7 @@ TEST_F(TestTransposeFp32, TransposeFp32_axes4) { } auto ret = DoTranspose(in, out, input_shape, output_shape, param, 0, 3); - MS_ASSERT(ret == 0); + ASSERT_EQ(ret, 0); delete param; CompareOutputData(out, correct, 24, 0.000001); } @@ -105,7 +105,7 @@ TEST_F(TestTransposeFp32, TransposeFp32_axes3) { } auto ret = DoTranspose(in, out, input_shape, output_shape, param, 0, 3); - MS_ASSERT(ret == 0); + ASSERT_EQ(ret, 0); delete param; CompareOutputData(out, correct, 24, 0.000001); } @@ -146,7 +146,7 @@ TEST_F(TestTransposeFp32, TransposeFp32_axes2) { } auto ret = DoTranspose(in, out, input_shape, output_shape, param, 0, 6); - MS_ASSERT(ret == 0); + ASSERT_EQ(ret, 0); delete param; CompareOutputData(out, correct, 24, 0.000001); } diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32_grad/convolution_grad_fp32_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32_grad/convolution_grad_fp32_tests.cc index cb8b72cff1..76fe27358c 100644 --- a/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32_grad/convolution_grad_fp32_tests.cc +++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32_grad/convolution_grad_fp32_tests.cc @@ -52,12 +52,11 @@ void InitConvParamGroup1FP32(ConvParameter *conv_param) { conv_param->dilation_h_ = 1; conv_param->dilation_w_ = 1; - conv_param->pad_h_ = 1; - conv_param->pad_w_ = 1; + conv_param->pad_u_ = 1; + conv_param->pad_l_ = 1; conv_param->group_ = 1; - conv_param->is_relu_ = false; - conv_param->is_relu6_ = false; + conv_param->act_type_ = ActType_No; conv_param->thread_num_ = 1; } diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32_grad/pooling_grad_fp32_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32_grad/pooling_grad_fp32_tests.cc index e1a9053e3e..22aabd7757 100644 --- a/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32_grad/pooling_grad_fp32_tests.cc +++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32_grad/pooling_grad_fp32_tests.cc @@ -26,7 +26,7 @@ #include 
"nnacl/fp32_grad/pooling_grad.h" namespace mindspore { -class TestPoolingGradFp32 : public mindspore::CommonTest { +class TestPoolingGradFp32 : public mindspore::CommonTest { public: TestPoolingGradFp32() {} }; @@ -161,8 +161,7 @@ TEST_F(TestPoolingGradFp32, MaxPoolingGradFp32) { auto pooling_param = new PoolingParameter(); InitPoolingParamFP32(pooling_param); pooling_param->output_channel_ = 3; - pooling_param->avg_pooling_ = false; - pooling_param->max_pooling_ = true; + pooling_param->pool_mode_ = PoolMode_MaxPool; // runtime part printf("Calculating runtime cost...\n"); uint64_t time_avg = 0; @@ -215,8 +214,7 @@ TEST_F(TestPoolingGradFp32, MaxPoolingKernelGradFp32) { // prepare stage auto maxpool = new PoolingParameter(); InitPoolingParamFP32(maxpool); - maxpool->avg_pooling_ = false; - maxpool->max_pooling_ = true; + maxpool->pool_mode_ = PoolMode_MaxPool; maxpool->input_h_ = 30; maxpool->input_w_ = 30; maxpool->input_channel_ = 3; @@ -268,8 +266,7 @@ TEST_F(TestPoolingGradFp32, MaxPoolingKernelGradFp32) { auto pooling_param = new PoolingParameter(); InitPoolingParamFP32(pooling_param); - pooling_param->avg_pooling_ = false; - pooling_param->max_pooling_ = true; + pooling_param->pool_mode_ = PoolMode_MaxPool; pooling_param->input_h_ = 10; pooling_param->input_w_ = 10; pooling_param->input_channel_ = 3; diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/conv_1x1_int8_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/conv_1x1_int8_tests.cc new file mode 100644 index 0000000000..3fe09b3e93 --- /dev/null +++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/conv_1x1_int8_tests.cc @@ -0,0 +1,281 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "utils/log_adapter.h" +#include "common/common_test.h" +#include "mindspore/lite/src/lite_kernel.h" +#include "src/common/file_utils.h" +#include "nnacl/quantization/quantize.h" +#include "nnacl/common_func.h" +#include "mindspore/lite/src/runtime/kernel/arm/int8/convolution_1x1_int8.h" + +namespace mindspore { +using lite::tensor::Tensor; +class TestConv1x1Int8 : public mindspore::CommonTest { + public: + TestConv1x1Int8() {} +}; + +TEST_F(TestConv1x1Int8, Input1x1PrePack1) { + auto conv_param = new ConvParameter(); + conv_param->input_channel_ = 6; + conv_param->input_h_ = conv_param->input_w_ = 3; + conv_param->output_h_ = conv_param->output_w_ = 3; + conv_param->stride_h_ = conv_param->stride_w_ = 2; + conv_param->pad_u_ = conv_param->pad_l_ = 1; + int8_t in[] = {4, 13, -3, 16, 19, 8, 19, -6, -2, -9, 9, 18, 23, 8, 47, -14, 15, 4, + -0, 37, -0, 6, 0, -1, 37, 13, 11, 1, -1, 41, 9, 14, 3, 0, 8, 9, + 14, -14, -8, -8, -8, 7, 19, 17, 13, 3, 9, 18, -1, -0, 18, 0, 4, -2}; + int8_t correct[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 37, 13, 11, + 1, -1, 41, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + int8_t out[54] = {0}; + Conv1x1InputPack(in, out, conv_param, sizeof(int8_t)); + CompareOutputData(out, correct, 54, 0); + delete conv_param; +} + +TEST_F(TestConv1x1Int8, Input1x1PrePack2) { + auto conv_param = new ConvParameter(); + int8_t in[] = {-0, -0, -7, -0, -6, 4, 9, 9, 12, -0, 6, 2, 13, 15, 16, -7, 9, 1, 10, 13, 17, 17, 4, 13, + -6, 5, 7, -7, 15, 0, 1, -5, -7, 18, 15, 19, -7, 13, 7, -0, 16, -5, 16, -7, 6, 10, -5, 10, + 9, 12, -9, -8, -4, 18, -5, 0, 7, 12, 13, 16, -9, -4, 18, -0, 8, 6, 2, 10, 16, 1, -1, 2, + 9, 8, 9, 13, 7, -0, 15, -7, 0, -0, 17, 19, 9, 17, -6, -2, 7, -0, 10, -6, -6, 18, -0, 9, + 9, 6, 3, -1, -8, 10, 17, -9, 17, 6, -3, 7, -2, -0, -9, 1, -3, 15, 13, 4, 18}; + int8_t correct[] = {0, 0, 0, 0, 0, 0, 15, -7, -7, 0, 0, 0, 9, 7, 0, 0, 0, 0, 0, 0}; + + conv_param->input_h_ = 9; + conv_param->input_w_ = 13; + conv_param->input_channel_ = 1; + conv_param->output_h_ = 4; + conv_param->output_w_ = 5; + conv_param->stride_h_ = conv_param->stride_w_ = 4; + conv_param->pad_u_ = conv_param->pad_l_ = 2; + + int8_t out[20] = {0}; + Conv1x1InputPack(in, out, conv_param, sizeof(int8_t)); + CompareOutputData(out, correct, 20, 0); + delete conv_param; +} + +int Conv1x1Int8TestInit1_perchannel(std::vector *inputs_, + std::vector *outputs_, ConvParameter *conv_param, + int8_t **correct) { + Tensor *in_t = new Tensor(kNumberTypeInt8, {1, 2, 3, 4}, schema::Format_NHWC, static_cast(1)); + auto in_quant_arg = new mindspore::lite::tensor::QuantArg(); + in_quant_arg->zeroPoint = -42, in_quant_arg->scale = 0.117647; + in_t->AddQuantParam(*in_quant_arg); + in_t->MallocData(); + int8_t in[] = {62, -14, 88, 2, -35, 43, 83, -111, 75, 26, 14, -121, + -78, 56, 37, -31, 15, -75, -10, -115, -71, 74, -65, -15}; + memcpy(in_t->Data(), in, in_t->ElementsNum() * sizeof(int8_t)); + inputs_->push_back(in_t); + + Tensor *weight_t = new Tensor(kNumberTypeInt8, {3, 1, 1, 4}, schema::Format_NHWC, static_cast(1)); + weight_t->MallocData(); + auto weight_quant_arg1 = new mindspore::lite::tensor::QuantArg(); + weight_quant_arg1->zeroPoint = 66, weight_quant_arg1->scale = 0.96439215686275; + auto weight_quant_arg2 = new mindspore::lite::tensor::QuantArg(); + weight_quant_arg2->zeroPoint = 33, weight_quant_arg2->scale = 0.76439215686275; + auto weight_quant_arg3 = new mindspore::lite::tensor::QuantArg(); + weight_quant_arg3->zeroPoint = -20, 
weight_quant_arg3->scale = 0.99117647; + weight_t->AddQuantParam(*weight_quant_arg1); + weight_t->AddQuantParam(*weight_quant_arg2); + weight_t->AddQuantParam(*weight_quant_arg3); + int8_t weight[] = {65, 67, 65, 65, 32, 33, 34, 33, -19, -20, -19, -20}; + memcpy(weight_t->Data(), weight, weight_t->ElementsNum() * sizeof(int8_t)); + inputs_->push_back(weight_t); + + Tensor *out_t = new Tensor(kNumberTypeInt8, {1, 2, 3, 3}, schema::Format_NHWC, static_cast(1)); + out_t->MallocData(); + auto output_quant_arg = new mindspore::lite::tensor::QuantArg(); + output_quant_arg->zeroPoint = 7, output_quant_arg->scale = 0.294321233; + out_t->AddQuantParam(*output_quant_arg); + outputs_->push_back(out_t); + + *correct = reinterpret_cast(malloc(out_t->ElementsNum() * sizeof(int8_t))); + int8_t nchw_co[] = {-83, 34, 100, 10, 113, 55, 3, 16, 63, 6, 93, 20, 5, 6, 42, 35, 28, -24}; + memcpy(*correct, nchw_co, out_t->ElementsNum() * sizeof(int8_t)); + + conv_param->kernel_h_ = conv_param->kernel_w_ = 1; + conv_param->stride_h_ = conv_param->stride_w_ = 1; + conv_param->dilation_h_ = conv_param->dilation_w_ = 1; + conv_param->pad_u_ = conv_param->pad_l_ = 0; + conv_param->act_type_ = ActType_No; + return out_t->ElementsNum(); +} + +TEST_F(TestConv1x1Int8, Conv1x1TestPerChannel) { + std::vector inputs_; + std::vector outputs_; + auto conv_param = new ConvParameter(); + int8_t *correct; + auto ctx = new lite::Context; + ctx->thread_num_ = 1; + int total_size = Conv1x1Int8TestInit1_perchannel(&inputs_, &outputs_, conv_param, &correct); + kernel::Convolution1x1Int8CPUKernel *conv1x1 = new kernel::Convolution1x1Int8CPUKernel( + reinterpret_cast(conv_param), inputs_, outputs_, ctx, nullptr); + + conv1x1->Init(); + conv1x1->Run(); + CompareOutputData(reinterpret_cast(outputs_[0]->Data()), correct, total_size, 70); + + delete conv1x1; + for (auto t : inputs_) delete t; + for (auto t : outputs_) delete t; + free(correct); +} + +int Conv1x1Int8TestInit1(std::vector *inputs_, std::vector *outputs_, + ConvParameter *conv_param, int8_t **correct) { + Tensor *in_t = new Tensor(kNumberTypeInt8, {1, 2, 3, 4}, schema::Format_NHWC, static_cast(1)); + auto in_quant_arg = new mindspore::lite::tensor::QuantArg(); + in_quant_arg->zeroPoint = -42, in_quant_arg->scale = 0.117647; + in_t->AddQuantParam(*in_quant_arg); + in_t->MallocData(); + float in[] = {12.216284, 3.3466918, 15.327419, 5.234958, 0.804376, 9.952188, 14.727955, -8.080715, + 13.71383, 8.055829, 6.5845337, -9.25232, -4.24519, 11.550042, 9.262012, 1.2780352, + 6.7263746, -3.9301445, 3.764492, -8.602078, -3.3558068, 13.619035, -2.6694393, 3.2008505}; + Quantize(in, in_t->ElementsNum(), in_quant_arg->scale, in_quant_arg->zeroPoint, + reinterpret_cast(in_t->Data())); + inputs_->push_back(in_t); + + Tensor *weight_t = new Tensor(kNumberTypeInt8, {3, 1, 1, 4}, schema::Format_NHWC, static_cast(1)); + auto weight_quant_arg = new mindspore::lite::tensor::QuantArg(); + weight_quant_arg->zeroPoint = 66, weight_quant_arg->scale = 0.036439215686275; + weight_t->AddQuantParam(*weight_quant_arg); + weight_t->MallocData(); + float weight[] = {-0.7308652, 0.5257509, -0.87825793, -1.123181, -1.2206168, 0.562695, + 1.5382664, -0.5020635, 0.8591602, -0.26410004, 1.1262615, 0.073132955}; + Quantize(weight, weight_t->ElementsNum(), weight_quant_arg->scale, weight_quant_arg->zeroPoint, + reinterpret_cast(weight_t->Data())); + inputs_->push_back(weight_t); + + Tensor *out_t = new Tensor(kNumberTypeInt8, {1, 2, 3, 3}, schema::Format_NHWC, static_cast(1)); + out_t->MallocData(); + auto 
output_quant_arg = new mindspore::lite::tensor::QuantArg(); + output_quant_arg->zeroPoint = 7, output_quant_arg->scale = 0.234321233; + out_t->AddQuantParam(*output_quant_arg); + outputs_->push_back(out_t); + + *correct = reinterpret_cast(malloc(out_t->ElementsNum() * sizeof(int8_t))); + float nchw_co[] = {-26.51016327, 7.92113757, 27.25741343, 0.785643655, 31.3307619, 14.05927672, + -1.178490666, 2.5676252, 16.39408946, -0.394793726, 25.2866881, 3.827249175, + -0.626854507, -0.3122176, 10.42769169, 8.362184085, 6.04617807, -9.252362384}; + Quantize(nchw_co, out_t->ElementsNum(), output_quant_arg->scale, output_quant_arg->zeroPoint, *correct); + + conv_param->kernel_h_ = conv_param->kernel_w_ = 1; + conv_param->stride_h_ = conv_param->stride_w_ = 1; + conv_param->dilation_h_ = conv_param->dilation_w_ = 1; + conv_param->pad_u_ = conv_param->pad_l_ = 0; + conv_param->act_type_ = ActType_No; + return out_t->ElementsNum(); +} + +TEST_F(TestConv1x1Int8, Conv1x1Int8Test1) { + std::vector inputs_; + std::vector outputs_; + auto conv_param = new ConvParameter(); + int8_t *correct; + auto ctx = new lite::Context; + ctx->thread_num_ = 1; + int total_size = Conv1x1Int8TestInit1(&inputs_, &outputs_, conv_param, &correct); + kernel::Convolution1x1Int8CPUKernel *conv1x1 = new kernel::Convolution1x1Int8CPUKernel( + reinterpret_cast(conv_param), inputs_, outputs_, ctx, nullptr); + + conv1x1->Init(); + conv1x1->Run(); + CompareOutputData(reinterpret_cast(outputs_[0]->Data()), correct, total_size, 2); + + delete conv1x1; + for (auto t : inputs_) delete t; + for (auto t : outputs_) delete t; + free(correct); +} + +int Conv1x1Int8TestInit2(std::vector *inputs_, std::vector *outputs_, + ConvParameter *conv_param, int8_t **correct) { + size_t buffer_size; + Tensor *in_t = new Tensor(kNumberTypeInt8, {1, 2, 3, 4}, schema::Format_NHWC, static_cast(1)); + auto in_quant_arg = new mindspore::lite::tensor::QuantArg(); + in_quant_arg->zeroPoint = -42, in_quant_arg->scale = 0.117647; + in_t->AddQuantParam(*in_quant_arg); + in_t->MallocData(); + std::string input_path = "./input"; + auto input = mindspore::lite::ReadFile(input_path.c_str(), &buffer_size); + memcpy(in_t->Data(), input, buffer_size); + inputs_->push_back(in_t); + delete[] input; + + Tensor *weight_t = new Tensor(kNumberTypeInt8, {3, 1, 1, 4}, schema::Format_NHWC, static_cast(1)); + auto weight_quant_arg = new mindspore::lite::tensor::QuantArg(); + weight_quant_arg->zeroPoint = 66, weight_quant_arg->scale = 0.036439215686275; + weight_t->AddQuantParam(*weight_quant_arg); + weight_t->MallocData(); + std::string weight_path = "./weight"; + auto weight = mindspore::lite::ReadFile(weight_path.c_str(), &buffer_size); + memcpy(weight_t->Data(), weight, buffer_size); + inputs_->push_back(weight_t); + delete[] weight; + + Tensor *bias_t = new Tensor(kNumberTypeInt32, {4}, schema::Format_NHWC, static_cast(1)); + weight_t->MallocData(); + std::string bias_path = "./bias"; + auto bias = mindspore::lite::ReadFile(bias_path.c_str(), &buffer_size); + memcpy(bias_t->Data(), bias, buffer_size); + inputs_->push_back(bias_t); + delete[] bias; + + Tensor *out_t = new Tensor(kNumberTypeInt8, {1, 2, 3, 3}, schema::Format_NHWC, static_cast(1)); + out_t->MallocData(); + auto output_quant_arg = new mindspore::lite::tensor::QuantArg(); + output_quant_arg->zeroPoint = 7, output_quant_arg->scale = 0.234321233; + out_t->AddQuantParam(*output_quant_arg); + outputs_->push_back(out_t); + + *correct = reinterpret_cast(malloc(out_t->ElementsNum() * sizeof(int8_t))); + std::string 
output_path = "./output"; + auto output = mindspore::lite::ReadFile(output_path.c_str(), &buffer_size); + memcpy(*correct, output, buffer_size); + delete[] output; + + conv_param->kernel_h_ = conv_param->kernel_w_ = 1; + conv_param->stride_h_ = conv_param->stride_w_ = 1; + conv_param->dilation_h_ = conv_param->dilation_w_ = 1; + conv_param->pad_u_ = conv_param->pad_l_ = 0; + conv_param->act_type_ = ActType_No; + return out_t->ElementsNum(); +} + +TEST_F(TestConv1x1Int8, Conv1x1Int8Test2) { + std::vector inputs_; + std::vector outputs_; + auto conv_param = new ConvParameter(); + int8_t *correct; + auto ctx = new lite::Context; + ctx->thread_num_ = 1; + int total_size = Conv1x1Int8TestInit2(&inputs_, &outputs_, conv_param, &correct); + kernel::Convolution1x1Int8CPUKernel *conv1x1 = new kernel::Convolution1x1Int8CPUKernel( + reinterpret_cast(conv_param), inputs_, outputs_, ctx, nullptr); + + conv1x1->Init(); + conv1x1->Run(); + CompareOutputData(reinterpret_cast(outputs_[0]->Data()), correct, total_size, 2); + + delete conv1x1; + for (auto t : inputs_) delete t; + for (auto t : outputs_) delete t; + free(correct); +} +} // namespace mindspore diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/deconv_int8_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/deconv_int8_tests.cc index 0791019c47..9484e6e528 100644 --- a/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/deconv_int8_tests.cc +++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/deconv_int8_tests.cc @@ -134,54 +134,6 @@ TEST_F(TestDeconvInt8, PackInputTest1) { CompareOutputData(dst, co, 8 * 32, 1); } -TEST_F(TestDeconvInt8, MatMulTest1) { - int8_t a_row_major_10_12[] = { - -6, 76, 32, 80, -73, 8, -85, -3, 114, 80, 30, 42, -41, 117, 62, -76, -77, -111, 88, 105, - 68, 105, -74, 13, 51, 94, 31, -52, -92, -4, -35, -71, 101, -93, 46, -65, 57, -41, -51, 77, - 1, 9, 73, -19, -36, 57, 81, -24, 40, 103, 112, 109, -41, -68, 57, 61, 55, -20, 3, 2, - 17, -16, -31, 58, -4, 67, -4, -95, -5, -72, 81, 15, -7, -16, -47, 112, 114, -26, -98, 53, - 15, -49, 26, 19, 19, 8, -57, -35, -79, 118, 29, 21, 37, -48, 83, 7, 124, 113, -5, 15, - -8, 107, -65, -88, 50, -47, -80, -84, 3, -45, 92, 42, -20, -101, 106, -10, 89, 67, 55, 10}; - int32_t zp_a = 15; - int8_t a_col8_major[16 * 12] = {0}; - int8_t b_col_major_12_18[] = { - 92, 27, 22, 52, -112, -20, -57, -2, 89, 32, 93, -66, -25, -54, 94, -97, -119, -98, 101, -99, - 77, -83, 76, 95, 59, 97, 8, 40, -109, -20, 67, -107, 37, -6, -54, -20, -30, 36, -106, -103, - -3, -86, -82, 59, 4, -75, -50, -106, 55, 104, -117, -71, -20, -85, -77, 16, -25, -58, 4, 80, - -75, 94, 32, -68, 2, 40, 56, -103, 11, -98, -70, -69, 0, 57, -6, 82, 66, -112, -61, 33, - -77, -53, 95, -38, 87, -46, -3, 81, -47, 43, 21, 26, -45, -57, 50, -24, -82, -114, 61, 46, - -53, 78, -24, 31, -7, 37, 29, 38, 45, 106, 52, -42, 31, -6, -61, -87, 2, 79, -5, -42, - 43, -106, -104, 7, 91, -63, 58, 97, -15, 74, -96, 15, -23, -3, -47, -97, 100, -54, 26, -46, - 35, 26, 100, -80, 34, -25, 96, -67, -80, -27, 66, 41, 41, -43, -43, -38, -4, -64, 31, 7, - -8, 6, -2, 39, -119, 53, 75, -91, -44, 77, -62, 22, -44, 78, -67, -48, -115, -4, 43, 81, - 40, -20, -5, -89, 60, -62, -4, -48, 66, -64, -69, 62, 17, -89, 1, 87, 81, 32, -29, 51, - 40, 27, 66, 67, 11, -69, 85, -79, -106, 55, 22, -23, 62, 69, -74, 49}; - int32_t zp_b = -20; - int8_t b_row8_major[12 * 24] = {0}; - int32_t co_row_major_10_18[] = { - 32005, 3597, 16595, -3458, 6627, -6663, 818, -3910, 10228, 15079, -19205, -10203, -3178, -10046, - 10374, -6199, 5330, 12163, 1819, 20533, 
17382, 18283, 9778, 9185, -12623, -26234, -11987, 7904, - 8144, -1603, 27611, -10190, -20053, 4999, -28389, 21852, 24680, 25858, 23506, 17944, 11768, 24378, - -6102, -4675, -23460, 10434, -47579, 1986, 12018, -19418, -7248, 4938, -32613, -941, 8171, -4788, - 3325, -11310, -8351, -14786, 6909, 16401, 2017, -6456, 11242, 7393, -9119, 17312, 2646, -14402, - 7201, -9949, 23986, 17607, 27461, -1547, 2783, 7558, 19487, 11158, -2686, 6328, -8225, -11668, - 21858, -2079, -8671, -639, -1544, 1235, 1156, 6582, 2829, -10311, -2692, 5154, 1527, 10870, - 106, -8189, -24174, -1846, -15399, -3598, 14874, -5591, -619, -13667, -6053, -31103, -24499, 13008, - 9143, -17982, 28437, 2176, -2114, -11631, 10779, -1032, -24690, -3112, 2125, 432, 20270, -33859, - 8907, 10063, 1603, 3761, 4805, 4904, -15594, 10786, 4287, -13591, -18777, -1679, 2109, -2243, - 12051, -8504, -6558, 4209, 13606, -25803, 27922, 12092, 7140, 27142, -12267, 2339, -26224, 23674, - -26579, -11398, -1823, -18976, 3641, 4415, -24878, -2045, 15937, 41465, 12601, -14513, -17619, -5728, - 334, -424, 8147, -1369, 5984, 11000, 19016, 4456, -25920, 4506, 5930, 15458}; - int32_t c_row8x8_major[16 * 24] = {0}; - - int32_t out_row_major[180] = {0}; - RowMajor2Col8MajorInt8(a_row_major_10_12, a_col8_major, 10, 12); - RowMajor2Col8MajorInt8(b_col_major_12_18, b_row8_major, 18, 12); - MatMulInt8(a_col8_major, b_row8_major, c_row8x8_major, 16, 24, 12, zp_a, zp_b); - Row8x8Major2RowMajor(reinterpret_cast(c_row8x8_major), reinterpret_cast(out_row_major), 10, 18, 18); - CompareOutputData(out_row_major, co_row_major_10_18, 180, 1); -} - TEST_F(TestDeconvInt8, InputSumTest1) { int8_t packed_a[] = { -6, 76, 32, 80, -73, 8, -85, -3, 114, 80, 30, 42, 15, 15, 15, 15, -41, 117, 62, -76, -77, -111, @@ -391,7 +343,7 @@ int DeConvInt8TestInit1(std::vector *inputs_, std::vecto PackNCHWToNHWCInt8(co_nchw, *correct, out_t->Batch(), out_t->Width() * out_t->Height(), out_t->Channel()); conv_param->kernel_h_ = conv_param->kernel_w_ = 3; - conv_param->pad_h_ = conv_param->pad_w_ = 1; + conv_param->pad_u_ = conv_param->pad_l_ = 1; conv_param->stride_h_ = conv_param->stride_w_ = 2; conv_param->dilation_h_ = conv_param->dilation_w_ = 1; return out_t->ElementsNum(); diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/fullconnection_int8_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/fullconnection_int8_tests.cc index 60f1c2c5c7..9584d9fa33 100644 --- a/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/fullconnection_int8_tests.cc +++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/fullconnection_int8_tests.cc @@ -29,99 +29,128 @@ class TestFcInt8 : public mindspore::CommonTest { TestFcInt8() {} }; -int FcInt8TestInit(std::vector *inputs_, std::vector *outputs_, - MatMulParameter *matmal_param, float **correct, double *scale, int *zeropoint) { - float input_max = 20; - float input_min = -20; - float weight_max = 1; - float weight_min = -1; - float output_max = 20; - float output_min = -20; +struct TensorInfo { + float *data; + int *data_int; + float min; + float max; + int len; + std::vector *shape; +}; - double input_scale = - (input_max - input_min) / (std::numeric_limits::max() - std::numeric_limits::min()); - int input_zp = std::numeric_limits::max() - input_max / input_scale; - double weight_scale = - (weight_max - weight_min) / (std::numeric_limits::max() - std::numeric_limits::min()); - int weight_zp = std::numeric_limits::max() - weight_max / weight_scale; - double output_scale = - (output_max - output_min) / (std::numeric_limits::max() - 
std::numeric_limits::min()); - int output_zp = std::numeric_limits::max() - output_max / output_scale; - *scale = output_scale; - *zeropoint = output_zp; +extern void QuantProcess(float *input, int len, float min, float max, float *scale, int *zero_point, int8_t *output); +extern lite::tensor::Tensor *MakeQuantTensor(int8_t *data, int len, std::vector *shape, float scale, int zp); - Tensor *in_t = new Tensor(kNumberTypeInt8, {2, 2, 2, 2}, schema::Format_NHWC, static_cast(1)); - in_t->MallocData(); - float in[] = {-3.2366564, -4.7733846, -7.8329225, 16.146885, 5.060793, -6.1471, -1.7680453, -6.5721383, - 17.87506, -5.1192183, 10.742863, 1.4536934, 19.693445, 19.45783, 5.063163, 0.5234792}; - Quantize(in, in_t->ElementsNum(), input_scale, input_zp, reinterpret_cast(in_t->Data())); - auto in_quant_arg = new mindspore::lite::tensor::QuantArg(); - in_quant_arg->zeroPoint = input_zp; - in_quant_arg->scale = input_scale; - in_t->AddQuantParam(*in_quant_arg); - inputs_->push_back(in_t); +lite::tensor::Tensor *MakeIntTensor(int *data, int len, std::vector *shape) { + auto tensor = + new lite::tensor::Tensor(kNumberTypeInt32, *shape, schema::Format_NHWC, static_cast(1)); + tensor->MallocData(); + auto tensor_ptr = reinterpret_cast(tensor->Data()); + memcpy(tensor_ptr, data, len * sizeof(int)); + return tensor; +} - Tensor *weight_t = new Tensor(kNumberTypeInt8, {3, 8}, schema::Format_NHWC, static_cast(1)); - weight_t->MallocData(); - float weight[] = {-0.24438887, 0.06738146, -0.8169129, 0.21510671, -0.012470592, -0.053063435, - 0.6050155, 0.8656233, 0.12911413, -0.028635843, -0.034080597, -0.10622552, - -0.012254699, -0.01312836, 0.25241964, -0.4706142, 0.2451482, -0.9558459, - 0.4481974, 0.33251503, -0.011705584, -0.1720293, -0.39410214, -0.73637343}; - Quantize(weight, weight_t->ElementsNum(), weight_scale, weight_zp, reinterpret_cast(weight_t->Data())); - auto weight_quant_arg = new mindspore::lite::tensor::QuantArg(); - weight_quant_arg->zeroPoint = weight_zp; - weight_quant_arg->scale = weight_scale; - weight_t->AddQuantParam(*weight_quant_arg); - inputs_->push_back(weight_t); +void FcInt8TestInit(std::vector *inputs, std::vector *outputs, + TensorInfo *in, TensorInfo *weight, TensorInfo *bias, TensorInfo *out) { + float in_scale, weight_scale, out_scale; + int in_zp, weight_zp, out_zp; + int8_t *in_data = new int8_t[in->len]; + int8_t *weight_data = new int8_t[weight->len]; + QuantProcess(in->data, in->len, in->min, in->max, &in_scale, &in_zp, in_data); + auto in_tensor = MakeQuantTensor(in_data, in->len, in->shape, in_scale, in_zp); + inputs->push_back(in_tensor); + QuantProcess(weight->data, weight->len, weight->min, weight->max, &weight_scale, &weight_zp, weight_data); + auto weight_tensor = MakeQuantTensor(weight_data, weight->len, weight->shape, weight_scale, weight_zp); + inputs->push_back(weight_tensor); + auto bias_tensor = MakeIntTensor(bias->data_int, bias->len, bias->shape); + inputs->push_back(bias_tensor); + QuantProcess(out->data, out->len, out->min, out->max, &out_scale, &out_zp, nullptr); + auto out_tensor = MakeQuantTensor(nullptr, out->len, out->shape, out_scale, out_zp); + outputs->push_back(out_tensor); + delete[] in_data; + delete[] weight_data; +} - Tensor *bias_t = new Tensor(kNumberTypeInt32, {3}, schema::Format_NHWC, static_cast(1)); - bias_t->MallocData(); - memset(bias_t->Data(), 0, sizeof(int) * bias_t->ElementsNum()); - inputs_->push_back(bias_t); +TEST_F(TestFcInt8, fctest1) { + float in[] = {4.259103407444801, 5.992151035772917, -9.495343223733581, 
3.0509999931426215, -16.635707833991095, + -14.72005749234452, 2.8290916795754093, -15.827977973039049, -16.98208477063347, 2.8801101778935347, + -0.5905297521382735, 18.042746010536085, 3.913511213700396, 11.571264917136105, 19.084257392926148, + 8.571560238377568, 17.58868010598305, 12.433311533838427, 4.548078598583526, 15.609650071521138, + 6.663372887795717, 17.581323475674594, 1.453277207446778, -6.119351424589654, -16.87310296820285, + 11.906066592064796, -13.290100998834653, 19.627129875430548, 16.034262583959162, 10.255738135902781, + 12.134650347811792, -5.5882066903433305, 15.554050723026322, 15.288481461776783, 17.651080309797287, + -9.258779162183215, 4.218532791445092, -6.205309122668545, 1.2220458021156908, 1.6800736573947326}; + TensorInfo in_params; + in_params.data = in; + in_params.len = 40; + std::vector in_shape{5, 2, 2, 2}; + in_params.shape = &in_shape; + in_params.min = -20; + in_params.max = 20; - Tensor *out_t = new Tensor(kNumberTypeInt8, {2, 3}, schema::Format_NHWC, static_cast(1)); - out_t->MallocData(); - auto output_quant_arg = new mindspore::lite::tensor::QuantArg(); - output_quant_arg->zeroPoint = output_zp; - output_quant_arg->scale = output_scale; - out_t->AddQuantParam(*output_quant_arg); - outputs_->push_back(out_t); + float weight[] = { + -0.586269014312498, 0.10845796767603733, 0.8455159907124523, 0.20261291069007226, 0.7564258582027543, + 0.4505005038790615, -0.607259232240795, -0.6962171798923924, 0.7967573009922135, -0.46069496925353715, + -0.2967638879316592, -0.7025557337565955, -0.5313515272071268, 0.07584168670764102, -0.6860034691410029, + 0.9218806800279316, -0.07408538201953907, -0.7933652717840096, 0.6636691558029275, -0.30198695606477477, + 0.790225747868754, -0.9478140254555916, 0.4537316306461665, 0.1776848732022871, -0.7492316745474277, + -0.5825825240770948, 0.5680842804542614, -0.9255552309192772, 0.20866577718844725, 0.9570928647172854, + 0.18172570688854406, -0.26442830241827253, -0.24765169216720873, -0.19512285277145702, 0.1120696020054861, + 0.7558578199370625, -0.15032457481135109, -0.08485585411928809, 0.6343014796699504, 0.026380085222785787, + -0.40516674259120444, -0.7407588590646037, -0.28521396461492454, 0.2555841827858194, 0.023640857478332444, + -0.6540694390119834, 0.7439705499824205, -0.7579774562590929}; + TensorInfo weight_params; + weight_params.data = weight; + weight_params.len = 48; + std::vector weight_shape{6, 8}; + weight_params.shape = &weight_shape; + weight_params.min = -1; + weight_params.max = 1; - *correct = reinterpret_cast(malloc(out_t->ElementsNum() * sizeof(float))); - float nchw_co[] = {3.84586822, 0.93586633, 12.16212629, -10.93835061, 2.46887183, 8.61480108}; - memcpy(*correct, nchw_co, out_t->ElementsNum() * sizeof(float)); + int bias[6] = {0}; + TensorInfo bias_params; + bias_params.data_int = bias; + bias_params.len = 6; + std::vector bias_shape{6}; + bias_params.shape = &bias_shape; - matmal_param->b_transpose_ = true; - matmal_param->a_transpose_ = false; - matmal_param->has_bias_ = true; - matmal_param->act_type_ = ActType_No; - return out_t->ElementsNum(); -} + float correct[] = {-19.170732, -7.5019627, -13.015462, -27.760283, 4.1447954, 20.660276, 4.0412164, -33.750015, + -4.560128, 7.1035166, 27.976341, 9.75216, 14.383608, -12.87587, -24.688887, -12.185722, + 3.7933283, -19.266382, 17.193876, -49.99205, -15.480089, -3.1659412, 19.470417, 13.758459, + 4.0713396, 4.614437, 11.296907, -7.244551, -11.143417, -21.233654}; + TensorInfo out_params; + out_params.data = correct; + out_params.len = 
30;
+  std::vector<int> out_shape{5, 6};
+  out_params.shape = &out_shape;
+  out_params.min = -50;
+  out_params.max = 50;
-TEST_F(TestFcInt8, fcint8) {
-  std::vector<lite::tensor::Tensor *> inputs_;
-  std::vector<lite::tensor::Tensor *> outputs_;
-  auto matmul_param = new MatMulParameter();
-  float *correct;
-  double output_scale;
-  int output_zp;
-  int total_size = FcInt8TestInit(&inputs_, &outputs_, matmul_param, &correct, &output_scale, &output_zp);
-  lite::Context *ctx = new lite::Context;
+  auto fc_param = new MatMulParameter();
+  fc_param->a_transpose_ = false;
+  fc_param->b_transpose_ = true;
+  fc_param->has_bias_ = true;
+  fc_param->act_type_ = ActType_No;
+  std::vector<lite::tensor::Tensor *> inputs;
+  std::vector<lite::tensor::Tensor *> outputs;
+  FcInt8TestInit(&inputs, &outputs, &in_params, &weight_params, &bias_params, &out_params);
+  auto ctx = new lite::Context;
   ctx->thread_num_ = 2;
-  kernel::FullconnectionInt8CPUKernel *fc = new kernel::FullconnectionInt8CPUKernel(
-    reinterpret_cast<OpParameter *>(matmul_param), inputs_, outputs_, ctx, nullptr);
+
+  kernel::FullconnectionInt8CPUKernel *fc =
+    new kernel::FullconnectionInt8CPUKernel(reinterpret_cast<OpParameter *>(fc_param), inputs, outputs, ctx, nullptr);
 
   fc->Init();
   fc->Run();
-  float fout[6] = {0};
-  Dequantize(reinterpret_cast<int8_t *>(outputs_[0]->Data()), outputs_[0]->ElementsNum(), output_scale, output_zp,
-             fout);
-  CompareOutputData(fout, correct, 6, 0.2);
-  delete matmul_param;
+  float out_scale;
+  int out_zp;
+  QuantProcess(correct, out_params.len, out_params.min, out_params.max, &out_scale, &out_zp, nullptr);
+  float *out = new float[out_params.len];
+  Dequantize(reinterpret_cast<int8_t *>(outputs[0]->Data()), outputs[0]->ElementsNum(), out_scale, out_zp, out);
+  CompareOutputData(out, correct, 6, 0.3);
   delete fc;
-  for (auto t : inputs_) delete t;
-  for (auto t : outputs_) delete t;
-  free(correct);
+  for (auto t : inputs) delete t;
+  for (auto t : outputs) delete t;
+  delete[] out;
 }
-
 }  // namespace mindspore
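Aside on the quantization scheme these int8 tests share: FcInt8TestInit reuses QuantProcess and MakeQuantTensor from matmul_int8_tests.cc through the extern declarations above, so every float range is mapped onto int8 with scale = (max - min) / 255 and zero_point = 127 - max / scale. A minimal standalone sketch of that mapping (the round-to-nearest step is an assumption about Quantize(); the scale and zero-point formulas are the ones QuantProcess uses later in this patch):

#include <cmath>
#include <cstdint>
#include <cstdio>
#include <limits>

int main() {
  // Range used for the fc test input: [-20, 20].
  float min = -20.0f, max = 20.0f;
  float scale = (max - min) / (std::numeric_limits<int8_t>::max() - std::numeric_limits<int8_t>::min());
  int zero_point = std::numeric_limits<int8_t>::max() - static_cast<int>(max / scale);  // 127 - 127 = 0
  float x = 4.2591f;                                                    // first element of in[]
  int8_t q = static_cast<int8_t>(std::round(x / scale) + zero_point);   // quantize (rounding assumed)
  float x_back = (q - zero_point) * scale;                              // dequantize
  printf("scale=%.5f zp=%d q=%d x_back=%.3f\n", scale, zero_point, q, x_back);
  return 0;
}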
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/gatherNd_int8_test.cc b/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/gatherNd_int8_test.cc
new file mode 100644
index 0000000000..53ad94b4ea
--- /dev/null
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/gatherNd_int8_test.cc
@@ -0,0 +1,101 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <iostream>
+#include "mindspore/core/utils/log_adapter.h"
+#include "common/common_test.h"
+#include "mindspore/lite/nnacl/fp32/gatherNd.h"
+#include "mindspore/lite/nnacl/int8/gatherNd_int8.h"
+#include "mindspore/lite/src/kernel_registry.h"
+#include "mindspore/lite/src/lite_kernel.h"
+
+namespace mindspore {
+class TestGatherNdInt8 : public mindspore::CommonTest {
+ public:
+  TestGatherNdInt8() {}
+};
+
+TEST_F(TestGatherNdInt8, GatherNdTest) {
+  std::vector<int8_t> in_data = {3, 5, 7, 9, 11, 13, 15, 17, 19, 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 1};
+  std::vector<int32_t> in_data1 = {2, 4, 4, 2, 2, 4, 2, 4, 2};
+  // std::vector<int32_t> in_data1 = {2, 2, 2, 4};
+
+  std::vector<lite::tensor::Tensor *> inputs_tensor;
+  std::vector<lite::tensor::Tensor *> outputs_tensor;
+
+  GatherNdParameter op_param;
+  op_param.op_parameter_.type_ = schema::PrimitiveType_GatherNd;
+  op_param.batchDims_ = 1;
+  std::vector<int> shape = {1, 2, 2, 5};
+  std::vector<int> out_shape = {1, 3, 5};
+
+  lite::tensor::QuantArg input_quant_arg;
+  input_quant_arg.scale = 0.5;
+  input_quant_arg.zeroPoint = 1;
+  lite::tensor::QuantArg input_quant_arg_1;
+  input_quant_arg_1.scale = 0.5;
+  input_quant_arg_1.zeroPoint = 2;
+  lite::tensor::QuantArg output_quant_arg;
+  output_quant_arg.scale = 1;
+  output_quant_arg.zeroPoint = 0;
+
+  lite::tensor::Tensor input0_tensor;
+  lite::tensor::Tensor input1_tensor;
+
+  inputs_tensor.push_back(&input0_tensor);
+  inputs_tensor.push_back(&input1_tensor);
+
+  input0_tensor.SetData(in_data.data());
+  input1_tensor.SetData(in_data1.data());
+
+  input0_tensor.set_shape(shape);
+  input1_tensor.set_shape({3, 3});
+
+  input0_tensor.AddQuantParam(input_quant_arg);
+  input1_tensor.AddQuantParam(input_quant_arg_1);
+
+  std::vector<int8_t> output(15);
+  // std::vector<int8_t> corr_out = {1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0};
+  std::vector<int8_t> corr_out = {6, 7, 8, 9, 0, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5};
+  lite::tensor::Tensor output0_tensor;
+  outputs_tensor.push_back(&output0_tensor);
+  output0_tensor.SetData(output.data());
+  output0_tensor.set_shape(out_shape);
+  output0_tensor.AddQuantParam(output_quant_arg);
+
+  kernel::KernelKey desc = {kernel::KERNEL_ARCH::kCPU, kNumberTypeInt8, schema::PrimitiveType_GatherNd};
+  auto creator = lite::KernelRegistry::GetInstance()->GetCreator(desc);
+  ASSERT_NE(creator, nullptr);
+  lite::Context ctx;
+  ctx.thread_num_ = 3;
+  kernel::LiteKernel *kernel =
+    creator(inputs_tensor, outputs_tensor, reinterpret_cast<OpParameter *>(&op_param), &ctx, desc, nullptr);
+  ASSERT_NE(kernel, nullptr);
+  auto output_tensor_shape = output0_tensor.shape();
+  kernel->Run();
+
+  printf("==================output data=================\n");
+  for (int i = 0; i < output0_tensor.ElementsNum(); i++) {
+    printf("%d, ", output[i]);
+  }
+  std::cout << std::endl;
+  CompareOutputData(output.data(), corr_out.data(), output0_tensor.ElementsNum(), 0.001);
+
+  input0_tensor.SetData(nullptr);
+  input1_tensor.SetData(nullptr);
+  output0_tensor.SetData(nullptr);
+  MS_LOG(INFO) << "TestGatherNd accuracy passed";
+}
+}  // namespace mindspore
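The corr_out values above are just the gathered elements requantized: each int8 input value is dequantized with the input QuantArg (scale 0.5, zero point 1) and requantized with the output QuantArg (scale 1, zero point 0), so input element 13 becomes (13 - 1) * 0.5 = 6, the first expected value. A small illustrative sketch (Requant is a hypothetical helper for this walkthrough, not the nnacl kernel):

#include <cmath>
#include <cstdint>
#include <cstdio>

// Dequantize with the input QuantArg, requantize with the output QuantArg.
int8_t Requant(int8_t q, float in_scale, int in_zp, float out_scale, int out_zp) {
  float real = (q - in_zp) * in_scale;
  return static_cast<int8_t>(std::round(real / out_scale) + out_zp);
}

int main() {
  // QuantArgs from GatherNdTest: input {0.5, 1}, output {1, 0}.
  printf("%d\n", Requant(13, 0.5f, 1, 1.0f, 0));  // prints 6, matching corr_out[0]
  return 0;
}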
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/gather_int8_test.cc b/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/gather_int8_test.cc
new file mode 100644
index 0000000000..27f955153b
--- /dev/null
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/gather_int8_test.cc
@@ -0,0 +1,99 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <iostream>
+#include "mindspore/core/utils/log_adapter.h"
+#include "common/common_test.h"
+#include "mindspore/lite/nnacl/gather_parameter.h"
+#include "mindspore/lite/nnacl/int8/gather_int8.h"
+#include "mindspore/lite/src/kernel_registry.h"
+#include "mindspore/lite/src/lite_kernel.h"
+
+namespace mindspore {
+class TestGatherInt8 : public mindspore::CommonTest {
+ public:
+  TestGatherInt8() {}
+};
+
+TEST_F(TestGatherInt8, GatherTest) {
+  std::vector<int8_t> in_data = {11, 41, 21, 51, 31, 61, -11, -41, -21, -51, -31, -61};
+  std::vector<int32_t> in_data1 = {4, 2};
+  std::vector<lite::tensor::Tensor *> inputs_tensor;
+  std::vector<lite::tensor::Tensor *> outputs_tensor;
+
+  GatherParameter op_param;
+  op_param.op_parameter_.type_ = schema::PrimitiveType_Gather;
+  op_param.axis_ = 0;
+  op_param.batchDims_ = 1;
+  std::vector<int> shape = {2, 1, 3, 2};
+
+  lite::tensor::QuantArg input_quant_arg;
+  input_quant_arg.scale = 0.1;
+  input_quant_arg.zeroPoint = 1;
+  lite::tensor::QuantArg input_quant_arg_1;
+  input_quant_arg_1.scale = 0.5;
+  input_quant_arg_1.zeroPoint = 2;
+  lite::tensor::QuantArg output_quant_arg;
+  output_quant_arg.scale = 0.1;
+  output_quant_arg.zeroPoint = 1;
+
+  lite::tensor::Tensor input0_tensor;
+  lite::tensor::Tensor input1_tensor;
+
+  inputs_tensor.push_back(&input0_tensor);
+  inputs_tensor.push_back(&input1_tensor);
+
+  input0_tensor.SetData(in_data.data());
+  input1_tensor.SetData(in_data1.data());
+
+  input0_tensor.set_shape(shape);
+  input1_tensor.set_shape({2});
+
+  input0_tensor.AddQuantParam(input_quant_arg);
+  input1_tensor.AddQuantParam(input_quant_arg_1);
+
+  std::vector<int8_t> output(12);
+  // std::vector<int8_t> corr_out = {-18, -22, -16, -21, -14, -19, -22, -34, -24, -35, -26, -36};
+  std::vector<int8_t> corr_out = {-11, -41, -21, -51, -31, -61, 11, 41, 21, 51, 31, 61};
+  lite::tensor::Tensor output0_tensor;
+  outputs_tensor.push_back(&output0_tensor);
+  output0_tensor.SetData(output.data());
+  output0_tensor.set_shape(shape);
+  output0_tensor.AddQuantParam(output_quant_arg);
+
+  kernel::KernelKey desc = {kernel::KERNEL_ARCH::kCPU, kNumberTypeInt8, schema::PrimitiveType_Gather};
+  auto creator = lite::KernelRegistry::GetInstance()->GetCreator(desc);
+  ASSERT_NE(creator, nullptr);
+  lite::Context ctx;
+  ctx.thread_num_ = 3;
+  kernel::LiteKernel *kernel =
+    creator(inputs_tensor, outputs_tensor, reinterpret_cast<OpParameter *>(&op_param), &ctx, desc, nullptr);
+  ASSERT_NE(kernel, nullptr);
+  auto output_tensor_shape = output0_tensor.shape();
+  kernel->Run();
+
+  printf("==================output data=================\n");
+  for (int i = 0; i < output0_tensor.ElementsNum(); i++) {
+    printf("%d, ", output[i]);
+  }
+  std::cout << std::endl;
+  CompareOutputData(output.data(), corr_out.data(), output0_tensor.ElementsNum(), 0.001);
+
+  input0_tensor.SetData(nullptr);
+  input1_tensor.SetData(nullptr);
+  output0_tensor.SetData(nullptr);
+  MS_LOG(INFO) << "TestGather_int8 accuracy passed";
+}
+}  // namespace mindspore
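The matmul tests below feed the packed int8 kernels, which first round the row, column, and depth dimensions up to the tile sizes consumed by RowMajor2Row4x16Major / RowMajor2Col16x4Major and zero-pad the remainder. A quick sketch of that round-up arithmetic (UP_ROUND mirrors nnacl's integer round-up macro; its exact definition is assumed here):

#include <cstdio>

#define UP_ROUND(x, y) (((x) + (y)-1) / (y) * (y))

int main() {
  // Dimensions from TEST_F(TestMatmulInt8, simple): ROW=10, COL=15, DEPTH=10.
  printf("ROW4=%d COL4=%d DEPTH16=%d\n", UP_ROUND(10, 4), UP_ROUND(15, 4), UP_ROUND(10, 16));
  // prints ROW4=12 COL4=16 DEPTH16=16: the 4x16 tiles never read past the padded buffers.
  return 0;
}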
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/matmul_int8_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/matmul_int8_tests.cc
index 017a7657b7..49a93eafb0 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/matmul_int8_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/matmul_int8_tests.cc
@@ -18,8 +18,9 @@
 #include "utils/log_adapter.h"
 #include "common/common_test.h"
 #include "mindspore/lite/src/runtime/kernel/arm/int8/matmul_int8.h"
-#include "mindspore/lite/nnacl/quantization/quantize.h"
-#include "mindspore/lite/nnacl/common_func.h"
+#include "nnacl/quantization/quantize.h"
+#include "nnacl/common_func.h"
+#include "nnacl/int8/matmul_int8.h"
 #include "mindspore/lite/src/kernel_registry.h"
 #include "mindspore/lite/src/lite_kernel.h"
 
@@ -29,99 +30,283 @@ class TestMatmulInt8 : public mindspore::CommonTest {
   TestMatmulInt8() {}
 };
 
-int MMInt8TestInit(std::vector<lite::tensor::Tensor *> *inputs_, std::vector<lite::tensor::Tensor *> *outputs_,
-                   MatMulParameter *matmal_param, float **correct, double *scale, int *zeropoint) {
-  float input_max = 20;
-  float input_min = -20;
-  float weight_max = 1;
-  float weight_min = -1;
-  float output_max = 30;
-  float output_min = -30;
-
-  double input_scale =
-    (input_max - input_min) / (std::numeric_limits<int8_t>::max() - std::numeric_limits<int8_t>::min());
-  int input_zp = std::numeric_limits<int8_t>::max() - input_max / input_scale;
-  double weight_scale =
-    (weight_max - weight_min) / (std::numeric_limits<int8_t>::max() - std::numeric_limits<int8_t>::min());
-  int weight_zp = std::numeric_limits<int8_t>::max() - weight_max / weight_scale;
-  double output_scale =
-    (output_max - output_min) / (std::numeric_limits<int8_t>::max() - std::numeric_limits<int8_t>::min());
-  int output_zp = std::numeric_limits<int8_t>::max() - output_max / output_scale;
-  *scale = output_scale;
-  *zeropoint = output_zp;
-
-  auto in_t =
-    new lite::tensor::Tensor(kNumberTypeInt8, {1, 2, 8}, schema::Format_NHWC, static_cast<schema::NodeType>(1));
-  in_t->MallocData();
+struct TensorInfo {
+  float *data;
+  float min;
+  float max;
+  int len;
+  std::vector<int> *shape;
+};
+
+void QuantProcess(float *input, int len, float min, float max, float *scale, int *zero_point, int8_t *output) {
+  *scale = (max - min) / (std::numeric_limits<int8_t>::max() - std::numeric_limits<int8_t>::min());
+  *zero_point = std::numeric_limits<int8_t>::max() - max / (*scale);
+  if (output) {
+    Quantize(input, len, *scale, *zero_point, output);
+  }
+}
+
+lite::tensor::Tensor *MakeQuantTensor(int8_t *data, int len, std::vector<int> *shape, float scale, int zp) {
+  auto tensor =
+    new lite::tensor::Tensor(kNumberTypeInt8, *shape, schema::Format_NHWC, static_cast<schema::NodeType>(1));
+  tensor->MallocData();
+  if (data) {
+    auto tensor_ptr = reinterpret_cast<int8_t *>(tensor->Data());
+    memcpy(tensor_ptr, data, len * sizeof(int8_t));
+  }
+  auto quant_arg = new mindspore::lite::tensor::QuantArg();
+  quant_arg->zeroPoint = zp;
+  quant_arg->scale = scale;
+  tensor->AddQuantParam(*quant_arg);
+  return tensor;
+}
+
+void MMInt8TestInit(std::vector<lite::tensor::Tensor *> *inputs, std::vector<lite::tensor::Tensor *> *outputs,
+                    TensorInfo *in, TensorInfo *weight, TensorInfo *out) {
+  float in_scale, weight_scale, out_scale;
+  int in_zp, weight_zp, out_zp;
+  int8_t *in_data = new int8_t[in->len];
+  int8_t *weight_data = new int8_t[weight->len];
+  QuantProcess(in->data, in->len, in->min, in->max, &in_scale, &in_zp, in_data);
+  auto in_tensor = MakeQuantTensor(in_data, in->len, in->shape, in_scale, in_zp);
+  inputs->push_back(in_tensor);
+  QuantProcess(weight->data, weight->len, weight->min, weight->max, &weight_scale, &weight_zp, weight_data);
+  auto weight_tensor = MakeQuantTensor(weight_data, weight->len, weight->shape, weight_scale, weight_zp);
+  inputs->push_back(weight_tensor);
+  QuantProcess(out->data, out->len, out->min, out->max, &out_scale, &out_zp, nullptr);
+  auto out_tensor =
MakeQuantTensor(nullptr, out->len, out->shape, out_scale, out_zp); + outputs->push_back(out_tensor); + delete[] in_data; + delete[] weight_data; +} + +TEST_F(TestMatmulInt8, simple) { +#define ROW 10 +#define COL 15 +#define DEPTH 10 +#define ROW4 UP_ROUND(ROW, 4) +#define COL4 UP_ROUND(COL, 4) +#define DEPTH16 UP_ROUND(DEPTH, 16) + int8_t a[ROW * DEPTH] = {-3, -3, 0, -2, -4, -2, 1, 0, -1, 0, 5, 1, 3, 4, 4, -3, -5, 2, -2, 4, + 4, 5, 1, -1, 5, 5, 2, -1, 0, 4, -4, 2, 5, -2, 5, 3, -1, 2, -4, 5, + -5, 4, 5, 3, 5, 4, -2, 5, 5, -5, -5, -5, 2, -4, -3, 3, -3, -5, 5, 0, + 2, -4, 4, 2, -5, 3, -1, 3, -3, 2, -5, -4, 0, -5, 2, 4, 0, -5, -1, 4, + 3, 5, 5, 2, -5, -5, -4, -5, 3, 3, 3, 0, -2, 0, -2, -3, -2, 3, 5, -5}; + int8_t b[DEPTH * COL] = {1, 2, -2, -5, -4, 2, 3, 2, -5, 4, -5, 4, 1, -2, 1, 5, 5, 5, 2, 5, -3, -3, + -1, -3, -1, 0, -4, 0, 1, -2, -2, -3, -5, 1, 1, 0, 4, 5, -3, -1, 4, 3, 5, 4, + 2, 4, -3, -4, 1, 4, -4, 5, -1, -2, 3, 5, 5, 2, 1, -4, 1, 2, -3, 0, -2, 4, + -3, -3, 1, 3, 4, -1, 3, 1, -5, -1, 2, 0, 0, 5, -1, -5, 5, -5, 0, 3, -3, 4, + 3, 1, -3, -3, 2, -2, -3, -3, 3, 4, 2, -1, 2, 0, -2, 4, 5, 3, -1, -3, -2, -1, + 4, 3, -5, 1, 0, 0, -1, -4, -3, -2, 5, 3, 2, 1, -4, 1, 4, 5, -1, 2, -2, 2, + 1, -2, 5, 2, -4, -4, 1, 1, 2, -1, -5, -4, 4, 1, -3, 4, -1, -4}; + + int8_t correct[ROW * COL] = { + -36, -33, 11, 4, -12, -7, 11, 0, 37, -30, -13, -2, -30, -3, 29, 46, -13, -84, -8, 6, 39, 26, + -67, -48, 57, 12, 32, 44, -24, -85, 22, 32, -8, -8, 20, 10, -45, 12, -69, 36, 22, -37, 58, 27, + -24, -11, -22, -50, 26, 50, 28, -56, -42, -23, -1, 70, -58, 54, 35, -61, 54, 40, -11, 35, 43, 3, + 7, 30, -7, -13, 73, -3, 26, 26, -11, -37, 0, 19, 34, -4, 0, -22, 71, 8, -25, -6, -5, 31, + 8, 63, -25, -55, -62, -17, 23, 1, 36, 12, -38, 2, 11, 27, 18, 5, 4, -59, -17, 1, 25, 9, + 13, -77, 13, 9, -11, 26, -52, 42, 28, 6, 44, 4, 2, 26, 19, -31, 46, 23, -57, 15, -31, 39, + 40, -9, 8, 38, 40, 27, -19, -47, 14, 50, 14, 18, 0, -59, 39, -48, -47, 35}; + + int8_t output[ROW * COL] = {0}; + int8_t *a_r4x16 = new int8_t[ROW4 * DEPTH16]; + memset(a_r4x16, 0, ROW4 * DEPTH16); + int8_t *b_c16x4 = new int8_t[COL4 * DEPTH16]; + memset(b_c16x4, 0, COL4 * DEPTH16); + RowMajor2Row4x16Major(a, ROW, DEPTH, a_r4x16, DEPTH16); + RowMajor2Col16x4Major(b, DEPTH, COL, b_c16x4, DEPTH16); + int a_sums[ROW4] = {0}; + int bias[COL4] = {0}; + int multiplier, ls, rs; + QuantizeRoundParameter(1.0f, &multiplier, &ls, &rs); +#ifdef ENABLE_ARM64 + MatmulInt8Neon64(a_r4x16, b_c16x4, output, ROW4, COL4, DEPTH16, a_sums, bias, INT8_MIN, INT8_MAX, 0, multiplier, ls, + rs, ROW, COL, COL); +#else + MatmulInt8(a_r4x16, b_c16x4, output, a_sums, bias, INT8_MIN, INT8_MAX, 0, multiplier, ls, rs, ROW, COL, DEPTH16, COL); +#endif + CompareOutputData(output, correct, ROW * COL, 0.1); + delete[] a_r4x16; + delete[] b_c16x4; +} + +TEST_F(TestMatmulInt8, mmtest1) { float in[] = {6.583835634764597, 11.337275140963907, -4.125256949459629, 10.994337291530833, 19.086065139532636, 3.620842999158455, 13.167624585590346, -18.326739299407755, 14.877693740734841, -17.092677920571653, 19.24147072807235, -15.14805323833401, -18.075654829688737, -0.9164404591894204, -3.836646280336332, -10.870298671273918}; - Quantize(in, in_t->ElementsNum(), input_scale, input_zp, reinterpret_cast(in_t->Data())); - auto in_quant_arg = new mindspore::lite::tensor::QuantArg(); - in_quant_arg->zeroPoint = input_zp; - in_quant_arg->scale = input_scale; - in_t->AddQuantParam(*in_quant_arg); - inputs_->push_back(in_t); - - auto weight_t = - new lite::tensor::Tensor(kNumberTypeInt8, {1, 3, 8}, 
schema::Format_NHWC, static_cast(1)); - weight_t->MallocData(); + TensorInfo in_params; + in_params.data = in; + in_params.len = 16; + std::vector in_shape{1, 2, 8}; + in_params.shape = &in_shape; + in_params.min = -20; + in_params.max = 20; + float weight[] = {0.3651070698591563, -0.5856943921727129, -0.7472032663840145, 0.9489992871641959, -0.8179490270358738, -0.873058811259344, 0.39876672713807215, -0.1816769383004213, -0.13584645926733696, -0.7614673836659709, -0.2535825872616164, -0.05265760030895916, 0.28558728305658754, 0.15404213943520118, -0.1634824450738006, -0.5068199082730189, -0.026961256849111326, -0.1508441942453307, 0.9375335677537737, 0.3304690744194263, -0.5091563780251127, 0.029887336278646925, -0.39540496207319276, 0.46094065001445084}; - Quantize(weight, weight_t->ElementsNum(), weight_scale, weight_zp, reinterpret_cast(weight_t->Data())); - auto weight_quant_arg = new mindspore::lite::tensor::QuantArg(); - weight_quant_arg->zeroPoint = weight_zp; - weight_quant_arg->scale = weight_scale; - weight_t->AddQuantParam(*weight_quant_arg); - inputs_->push_back(weight_t); - - auto out_t = - new lite::tensor::Tensor(kNumberTypeInt8, {1, 2, 3}, schema::Format_NHWC, static_cast(1)); - out_t->MallocData(); - auto output_quant_arg = new mindspore::lite::tensor::QuantArg(); - output_quant_arg->zeroPoint = output_zp; - output_quant_arg->scale = output_scale; - out_t->AddQuantParam(*output_quant_arg); - outputs_->push_back(out_t); - - *correct = reinterpret_cast(malloc(out_t->ElementsNum() * sizeof(float))); - float nchw_co[] = {-0.912632942, 4.08398056, -25.385608673, 2.720281124, 7.745952606, 20.893184662}; - memcpy(*correct, nchw_co, out_t->ElementsNum() * sizeof(float)); - - matmal_param->b_transpose_ = true; - matmal_param->a_transpose_ = false; - matmal_param->has_bias_ = false; - return out_t->ElementsNum(); + TensorInfo weight_params; + weight_params.data = weight; + weight_params.len = 24; + std::vector weight_shape{1, 3, 8}; + weight_params.shape = &weight_shape; + weight_params.min = -1; + weight_params.max = 1; + + float correct[] = {-0.912632942, 4.08398056, -25.385608673, 2.720281124, 7.745952606, 20.893184662}; + TensorInfo out_params; + out_params.data = correct; + out_params.len = 6; + std::vector out_shape{1, 2, 3}; + out_params.shape = &out_shape; + out_params.min = -30; + out_params.max = 30; + + auto matmul_param = new MatMulParameter(); + matmul_param->a_transpose_ = false; + matmul_param->b_transpose_ = true; + matmul_param->has_bias_ = false; + std::vector inputs; + std::vector outputs; + MMInt8TestInit(&inputs, &outputs, &in_params, &weight_params, &out_params); + auto ctx = new lite::Context; + ctx->thread_num_ = 1; + kernel::MatmulInt8CPUKernel *mm = + new kernel::MatmulInt8CPUKernel(reinterpret_cast(matmul_param), inputs, outputs, ctx, nullptr); + + mm->Init(); + mm->Run(); + float out_scale; + int out_zp; + QuantProcess(correct, out_params.len, out_params.min, out_params.max, &out_scale, &out_zp, nullptr); + float *out = new float[out_params.len]; + Dequantize(reinterpret_cast(outputs[0]->Data()), outputs[0]->ElementsNum(), out_scale, out_zp, out); + CompareOutputData(out, correct, 6, 0.3); + delete mm; + for (auto t : inputs) delete t; + for (auto t : outputs) delete t; + delete[] out; } -TEST_F(TestMatmulInt8, mmint8) { - std::vector inputs_; - std::vector outputs_; +TEST_F(TestMatmulInt8, mmtest2) { + float in[] = { + -9.302902352910598, 16.65876088354537, -7.2801759810348265, -6.3246021711950995, 8.467234093555248, + -4.729482636552028, 
-3.747183865378627, -8.690477390174504, -2.7419930714530523, -3.9478573566319, + 7.399137633080947, -1.604450983941291, 0.3115665358682982, -16.864318496334278, 2.5447052588244112, + -13.428639671203255, 13.417832391771974, 10.37917002467671, 14.709787234172168, -16.347969268427146, + 4.652834783979106, 6.03601450738973, 2.5788179666401874, -9.236801653471375, -0.18997468903009462, + 19.977363387313744, 15.163337058447325, -12.602897730843484, -6.178507797555191, 13.457928661476004, + -10.65587824516124, -18.715557779424188, -9.758039647923935, 8.102044210643097, 19.66309736072973, + -13.368041407254193, 9.928467253978024, 4.9981961360698755, -4.2838547685981645, 1.5021547181513526, + -7.043468062239523, 11.964494917194845, -4.783071964346499, -17.646518743891008, -7.77810768119101, + 14.869414292570454, 8.333036603520906, 11.053769742928765, -1.768128637419725, -14.971400302494597, + -0.8653626097283293, -6.21101640878031, 14.83875267850518, 7.224097292538833, -16.747116419664213, + 15.310978507353724, -0.05593751363976551, 2.066880260042687, -3.7053788331264137, 9.788933831258937, + -13.614523856950811, -5.656865231633642, 4.720270075138178, -8.366650073458409, 7.197187069893303, + -18.78518907850054, 15.691652153539678, 7.914926057095165, 10.073559408864384, 10.437631177498353, + -3.0580194164595085, 17.36998905922836, 0.09998119223460122, 19.519199178417452, -11.121833210377702, + 19.655990774915622, -17.25682638091008, 11.701013896880006, -12.746728025401781, -9.370055221833699, + 18.720474512055908, 7.634198897927405, -15.521885320500694, -9.119267877304358, -1.5853789671841945, + 4.783147823043613, 14.6732610092525, -9.294170215010427, 9.835421489234331, 13.051159704193232, + -1.422599906517025, -1.5530696181467398, 19.51404609713284, -12.297429715833763, 6.8811248552401985, + 13.052476234003755, 18.66085390709462, -8.097735292301103, -6.868239274661935, -8.067142805841826, + 3.2707808734101533, 1.8239332220210827}; + TensorInfo in_params; + in_params.data = in; + in_params.len = 6 * 17; + std::vector in_shape{1, 6, 17}; + in_params.shape = &in_shape; + in_params.min = -20; + in_params.max = 20; + + float weight[] = { + -0.42740096214251677, 0.8557068789482212, 0.4560574664172552, -0.1317821769705021, 0.2845963675712846, + 0.8414603241768241, 0.24513271080109011, 0.16403708196683398, -0.09111601416189297, -0.714027790956111, + 0.12253431683185845, -0.4542459426686125, 0.7123202105555202, -0.3708573394849488, -0.4571735646072892, + -0.595627630450934, -0.5022671357384993, 0.2781065609468565, -0.07586181451887586, -0.2667701710291306, + 0.03141663091360791, -0.013304592900917456, -0.7507975439396768, 0.5886778622432618, -0.9056075431439199, + 0.9393767525356569, -0.2791312477047512, 0.7134531940450286, 0.3977932134993216, -0.027832574334469395, + 0.7222024948455503, -0.2084178952731608, -0.4869535410639745, -0.8255185994321805, 0.975443145421772, + 0.541914384763855, -0.8831162309708303, -0.3339354888475805, 0.3699271440691516, -0.26923635397292944, + -0.4975347179262828, 0.2440013185603882, 0.5553443771246633, 0.6111909921005778, -0.5968624036034165, + 0.8367593317557596, -0.843079440282104, -0.5651924211153698, 0.7169318662247579, 0.5116755837443465, + -0.9079299375502927, 0.025240632113315176, -0.5819662075810048, -0.37278414060319176, -0.172154755034845, + -0.7372352723583462, 0.2462103743741677, 0.11785417820789856, 0.6712183976911841, -0.7042964391243491, + -0.8215958062965967, -0.7304378130182314, 0.3991295415760667, -0.07226694075875573, 0.9329628273800614, + 
0.7866596674858193, 0.9410341281569592, 0.39672750454198225, -0.5217505454791054, 0.9538253510722774, + -0.6286845762774464, -0.773460418882959, 0.002296000778892804, 0.9763898918063998, 0.9648708739062339, + 0.9400037814137154, -0.6011085333221611, -0.5890262409238565, -0.8078857772627164, 0.233661306598278, + -0.6726381934018617, -0.08533323149874539, 0.19055766469859425, -0.7956482347958518, -0.17012651641579035, + 0.7181052528631318, 0.1285045774388125, -0.6997527417326721, -0.8436484573035989, 0.342855467305474, + 0.4085157503460306, -0.6199324510955382, -0.6883822276097309, 0.4186437018431113, 0.3030114883148305, + 0.0948227655828271, -0.002521771948760465, -0.34878560791422397, 0.08513437045281003, 0.3116035319055901, + -0.7177514192203747, 0.050531673446029046, -0.7399803440665007, -0.9353609485885221, -0.3899340891814298, + 0.40867084031625356, -0.17462484099335662, -0.6313167634279941, -0.8135597146296727, -0.9762553414099975, + -0.1040485487920626, -0.6517520252975368, 0.5877412140956126, 0.9433584450325512, 0.24701546283170672, + -0.3236849444311023, -0.12043548611719657, 0.5300129281052712, -0.1380138229226111, -0.8787455295545508, + -0.4361728423289617, 0.7331994894985936, 0.45492774136929826, -0.17836517403432972, 0.10896668585054625, + 0.6176507847785211, 0.21617962964770676, -0.6821928873814629, 0.021775035324277825, 0.15089571088539566, + -0.9923383126255942, -0.6034706970202426, 0.17729888871670285, 0.1278810065499425, -0.6575545415840387, + -0.022704865415375197, -0.7366071817901978, -0.9300211224192332, -0.153494127035938, 0.4836121912045357, + -0.3318483587414114, -0.9658468087620375, 0.8388464445207262, 0.45745949405796127, -0.3671803281863002, + -0.1543498074773253, 0.18955899788963748, -0.4452120359256351, -0.5338599486040962, -0.06979561022721281, + -0.45964195574917355, -0.4343754114042866, -0.4318308749403197, 0.748107130947133, -0.4703901010752156, + 0.6655596561650823, 0.9075215202451821, 0.2708741258104177, -0.6540233471632313, 0.7250124906689572, + 0.6674821078610087, 0.8464696566759315, -0.6106156844283976, 0.8675828337337224, 0.8517737949695063, + -0.8126381016475459, -0.6140987457462099, -0.2984524227549874, 0.2816320572339577, -0.8131479383469931}; + TensorInfo weight_params; + weight_params.data = weight; + weight_params.len = 170; + std::vector weight_shape{1, 17, 10}; + weight_params.shape = &weight_shape; + weight_params.min = -1; + weight_params.max = 1; + + float correct[] = {35.815605, 26.532362, 14.777507, -12.651591, -2.0373726, -47.020798, -18.53121, 2.7848654, + 16.19751, -30.754261, 25.830605, 47.635204, 10.247462, -33.260662, 34.145412, -6.1611304, + -18.56802, -24.669813, 20.314533, -5.887198, -14.757037, 24.78901, 20.512205, 17.985718, + 17.62954, 20.365099, -26.223736, 0.99702793, 12.752281, -35.30419, -22.09603, 8.2218, + 8.120908, 27.685753, -44.010464, -1.879332, -4.531702, 21.434296, 4.2146144, 22.721859, + 7.485317, 20.148363, -15.49375, -4.5062046, 37.77292, -0.23385821, -45.532917, -21.055403, + 46.854183, -13.595161, 2.8823144, -23.905682, 2.3569264, 26.975227, 32.806625, 9.185071, + -39.330578, -1.0041192, -6.8353715, -33.2658}; + TensorInfo out_params; + out_params.data = correct; + out_params.len = 60; + std::vector out_shape{1, 6, 10}; + out_params.shape = &out_shape; + out_params.min = -50; + out_params.max = 50; + auto matmul_param = new MatMulParameter(); - float *correct; - double output_scale; - int output_zp; - int total_size = MMInt8TestInit(&inputs_, &outputs_, matmul_param, &correct, &output_scale, &output_zp); + 
matmul_param->a_transpose_ = false; + matmul_param->b_transpose_ = false; + matmul_param->has_bias_ = false; + std::vector inputs; + std::vector outputs; + MMInt8TestInit(&inputs, &outputs, &in_params, &weight_params, &out_params); auto ctx = new lite::Context; ctx->thread_num_ = 2; kernel::MatmulInt8CPUKernel *mm = - new kernel::MatmulInt8CPUKernel(reinterpret_cast(matmul_param), inputs_, outputs_, ctx, nullptr); + new kernel::MatmulInt8CPUKernel(reinterpret_cast(matmul_param), inputs, outputs, ctx, nullptr); mm->Init(); mm->Run(); - float fout[6] = {0}; - Dequantize(reinterpret_cast(outputs_[0]->Data()), outputs_[0]->ElementsNum(), output_scale, output_zp, - fout); - CompareOutputData(fout, correct, 6, 0.3); + float out_scale; + int out_zp; + QuantProcess(correct, out_params.len, out_params.min, out_params.max, &out_scale, &out_zp, nullptr); + float *out = new float[out_params.len]; + Dequantize(reinterpret_cast(outputs[0]->Data()), outputs[0]->ElementsNum(), out_scale, out_zp, out); + CompareOutputData(out, correct, 6, 0.6); delete mm; - for (auto t : inputs_) delete t; - for (auto t : outputs_) delete t; - free(correct); + for (auto t : inputs) delete t; + for (auto t : outputs) delete t; + delete[] out; } } // namespace mindspore diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/prelu_int8_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/prelu_int8_tests.cc index 012a1643f4..adbf65265d 100644 --- a/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/prelu_int8_tests.cc +++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/prelu_int8_tests.cc @@ -65,14 +65,14 @@ TEST_F(TestPreluInt8, prelu_1) { outputs_tensor[0] = output0_tensor; LeakyReluQuantArg op_param; - op_param.op_parameter_.type_ = schema::PrimitiveType_Prelu; + op_param.op_parameter_.type_ = schema::PrimitiveType_LeakyReLU; op_param.slope_ = reinterpret_cast(malloc(sizeof(float))); op_param.slope_[0] = 0.25; lite::Context *ctx = new lite::Context; ctx->thread_num_ = 2; op_param.axis_ = 0.25; - kernel::KernelKey desc = {kernel::KERNEL_ARCH::kCPU, kNumberTypeInt8, schema::PrimitiveType_Prelu}; + kernel::KernelKey desc = {kernel::KERNEL_ARCH::kCPU, kNumberTypeInt8, schema::PrimitiveType_LeakyReLU}; auto creator = lite::KernelRegistry::GetInstance()->GetCreator(desc); ASSERT_NE(creator, nullptr); kernel::LiteKernel *kernel = diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/resize_bilinear_int8_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/resize_bilinear_int8_tests.cc index 87a128a4c1..c7d8a2aaba 100644 --- a/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/resize_bilinear_int8_tests.cc +++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/resize_bilinear_int8_tests.cc @@ -84,14 +84,13 @@ TEST_F(TestResizeBilinearInt8, Bilinear0) { int8_t output_data[16] = {0}; std::vector in_shape = {1, 2, 2, 1}; std::vector out_shape = {1, 4, 4, 1}; - const lite::tensor::QuantArg quant_in = {0.005f, 2}; - const lite::tensor::QuantArg quant_out = {0.008f, 5}; + const lite::tensor::QuantArg quant_in = {0.005f, 0}; + const lite::tensor::QuantArg quant_out = {0.008f, 0}; bool align_corners = false; int thread_num = 1; - int8_t expect[16] = {4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 6, 6, 5, 5, 6, 6}; + int8_t expect[16] = {0, 0, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 2, 2, 2}; Prepare(in_shape, out_shape, input_data, output_data, quant_in, quant_out, align_corners, thread_num); - kernel_->Init(); kernel_->Run(); CompareOutputInt8(output_data, expect, 16, err_percent_); @@ -104,20 +103,19 @@ 
TEST_F(TestResizeBilinearInt8, Bilinear1) { int8_t input_data[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39}; int8_t output_data[160] = {0}; - const lite::tensor::QuantArg quant_in = {0.005f, 2}; - const lite::tensor::QuantArg quant_out = {0.008f, 5}; + const lite::tensor::QuantArg quant_in = {0.005f, 0}; + const lite::tensor::QuantArg quant_out = {0.008f, 0}; int thread_num = 1; bool align_corners = false; - int8_t expect[160] = {4, 4, 5, 6, 6, 5, 6, 7, 7, 8, 7, 8, 8, 9, 9, 7, 8, 8, 9, 9, 7, 8, 8, - 9, 9, 8, 9, 10, 10, 11, 10, 11, 11, 12, 13, 10, 11, 11, 12, 13, 10, 11, 11, 12, 13, 12, - 12, 13, 13, 14, 13, 14, 14, 15, 16, 13, 14, 14, 15, 16, 10, 11, 11, 12, 13, 12, 12, 13, 13, - 14, 13, 14, 14, 15, 16, 13, 14, 14, 15, 16, 16, 17, 18, 18, 19, 18, 18, 19, 20, 20, 19, 20, - 21, 21, 22, 19, 20, 21, 21, 22, 19, 20, 21, 21, 22, 21, 22, 22, 23, 23, 23, 23, 24, 24, 25, - 23, 23, 24, 24, 25, 23, 23, 24, 24, 25, 24, 25, 25, 26, 27, 26, 26, 27, 28, 28, 26, 26, 27, - 28, 28, 23, 23, 24, 24, 25, 24, 25, 25, 26, 27, 26, 26, 27, 28, 28, 26, 26, 27, 28, 28}; + int8_t expect[160] = {0, 1, 1, 2, 2, 2, 2, 3, 3, 4, 3, 4, 4, 5, 6, 3, 4, 4, 5, 6, 3, 4, 4, + 5, 6, 5, 5, 6, 7, 7, 6, 7, 8, 8, 9, 6, 7, 8, 8, 9, 6, 7, 7, 8, 9, 8, + 8, 9, 10, 10, 9, 10, 11, 11, 12, 9, 10, 11, 11, 12, 6, 7, 7, 8, 9, 8, 8, 9, 10, + 10, 9, 10, 11, 11, 12, 9, 10, 11, 11, 12, 13, 13, 14, 14, 15, 14, 15, 15, 16, 17, 16, 16, + 17, 18, 18, 16, 16, 17, 18, 18, 16, 16, 17, 18, 18, 17, 18, 18, 19, 20, 19, 19, 20, 21, 21, + 19, 19, 20, 21, 21, 19, 19, 20, 21, 21, 20, 21, 22, 22, 23, 22, 23, 23, 24, 24, 22, 23, 23, + 24, 24, 19, 19, 20, 21, 21, 20, 21, 22, 22, 23, 22, 23, 23, 24, 24, 22, 23, 23, 24, 24}; Prepare(in_shape, out_shape, input_data, output_data, quant_in, quant_out, align_corners, thread_num); - kernel_->Init(); kernel_->Run(); CompareOutputInt8(output_data, expect, 160, err_percent_); @@ -131,22 +129,49 @@ TEST_F(TestResizeBilinearInt8, Bilinear2) { 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39}; int8_t output_data[160] = {0}; + const lite::tensor::QuantArg quant_in = {0.005f, 0}; + const lite::tensor::QuantArg quant_out = {0.008f, 0}; + int thread_num = 2; + bool align_corners = true; + int8_t expect[160] = {0, 1, 1, 2, 2, 1, 2, 2, 3, 4, 2, 3, 3, 4, 5, 3, 4, 4, 5, 6, 2, 3, 3, + 4, 5, 3, 4, 4, 5, 6, 4, 5, 5, 6, 7, 5, 6, 6, 7, 8, 4, 5, 5, 6, 7, 5, + 6, 6, 7, 8, 6, 7, 8, 8, 9, 7, 8, 9, 9, 10, 6, 7, 7, 8, 9, 7, 8, 9, 9, + 10, 8, 9, 10, 10, 11, 9, 10, 11, 11, 12, 13, 13, 14, 14, 15, 14, 14, 15, 15, 16, 15, 15, + 16, 16, 17, 16, 16, 17, 18, 18, 15, 15, 16, 16, 17, 16, 16, 17, 18, 18, 17, 17, 18, 19, 19, + 18, 18, 19, 20, 20, 17, 17, 18, 19, 19, 18, 18, 19, 20, 20, 19, 19, 20, 21, 21, 20, 20, 21, + 22, 22, 19, 19, 20, 21, 21, 20, 20, 21, 22, 22, 21, 21, 22, 23, 23, 22, 23, 23, 24, 24}; + + Prepare(in_shape, out_shape, input_data, output_data, quant_in, quant_out, align_corners, thread_num); + kernel_->Run(); + + CompareOutputInt8(output_data, expect, 160, err_percent_); +} + +// 2*2*2*5 -> 2*4*4*5 thread num 2, align corners zp -128 +TEST_F(TestResizeBilinearInt8, Bilinear3) { + std::vector in_shape = {2, 2, 2, 5}; + std::vector out_shape = {2, 4, 4, 5}; + int8_t input_data[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, + 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39}; + int8_t output_data[160] = {0}; + const lite::tensor::QuantArg 
quant_in = {0.005f, 2}; - const lite::tensor::QuantArg quant_out = {0.008f, 5}; + const lite::tensor::QuantArg quant_out = {0.005f, 2}; int thread_num = 2; bool align_corners = true; - int8_t expect[160] = {4, 4, 5, 6, 6, 5, 5, 6, 7, 7, 6, 6, 7, 8, 8, 7, 8, 8, 9, 9, 6, 6, 7, - 8, 8, 7, 8, 8, 9, 9, 8, 9, 9, 10, 10, 9, 10, 10, 11, 11, 8, 9, 9, 10, 10, 9, - 10, 10, 11, 11, 10, 11, 11, 12, 13, 11, 12, 12, 13, 14, 10, 11, 11, 12, 13, 11, 12, 12, 13, - 14, 12, 13, 13, 14, 15, 13, 14, 14, 15, 16, 16, 17, 18, 18, 19, 17, 18, 19, 19, 20, 18, 19, - 20, 20, 21, 19, 20, 21, 21, 22, 18, 19, 20, 20, 21, 19, 20, 21, 21, 22, 20, 21, 22, 22, 23, - 21, 22, 23, 23, 24, 20, 21, 22, 22, 23, 21, 22, 23, 23, 24, 23, 23, 24, 24, 25, 24, 24, 25, - 25, 26, 23, 23, 24, 24, 25, 24, 24, 25, 25, 26, 25, 25, 26, 26, 27, 26, 26, 27, 28, 28}; + int8_t expect[160] = {0, 1, 2, 3, 4, 2, 3, 4, 5, 6, 3, 4, 5, 6, 7, 5, 6, 7, 8, 9, 3, 4, 5, + 6, 7, 5, 6, 7, 8, 9, 7, 8, 9, 10, 11, 8, 9, 10, 11, 12, 7, 8, 9, 10, 11, 8, + 9, 10, 11, 12, 10, 11, 12, 13, 14, 12, 13, 14, 15, 16, 10, 11, 12, 13, 14, 12, 13, 14, 15, + 16, 13, 14, 15, 16, 17, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 22, 23, 24, 25, 26, 23, 24, + 25, 26, 27, 25, 26, 27, 28, 29, 23, 24, 25, 26, 27, 25, 26, 27, 28, 29, 27, 28, 29, 30, 31, + 28, 29, 30, 31, 32, 27, 28, 29, 30, 31, 28, 29, 30, 31, 32, 30, 31, 32, 33, 34, 32, 33, 34, + 35, 36, 30, 31, 32, 33, 34, 32, 33, 34, 35, 36, 33, 34, 35, 36, 37, 35, 36, 37, 38, 39}; Prepare(in_shape, out_shape, input_data, output_data, quant_in, quant_out, align_corners, thread_num); - kernel_->Init(); kernel_->Run(); + err_percent_ = 0.325f; CompareOutputInt8(output_data, expect, 160, err_percent_); } + } // namespace mindspore diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/activation_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/activation_tests.cc index c861a4378f..1298ec65ef 100644 --- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/activation_tests.cc +++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/activation_tests.cc @@ -73,91 +73,228 @@ void printf_tensor(mindspore::lite::tensor::Tensor *in_data) { MS_LOG(INFO) << "Print tensor done"; } -kernel::ActivationOpenClKernel *create_kernel(lite::opencl::OpenCLAllocator *allocator, - const std::vector &inputs, - const std::vector &outputs, std::string test_name, - int type, std::string in_file, float alpha = 0.2) { +TEST_F(TestActivationOpenCL, ReluFp32_dim4) { + std::string in_file = "/data/local/tmp/in_data.bin"; + std::string out_file = "/data/local/tmp/relu.bin"; + MS_LOG(INFO) << "Relu Begin test!"; + auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); + ocl_runtime->Init(); + auto allocator = ocl_runtime->GetAllocator(); + + MS_LOG(INFO) << "Init tensors."; + std::vector input_shape = {1, 9}; + auto data_type = kNumberTypeFloat32; + auto tensor_type = schema::NodeType_ValueNode; + auto *input_tensor = new (std::nothrow) lite::tensor::Tensor(data_type, input_shape, schema::Format_NC, tensor_type); + if (input_tensor == nullptr) { + MS_LOG(ERROR) << "new input tensor error!"; + return; + } + auto *output_tensor = new (std::nothrow) lite::tensor::Tensor(data_type, input_shape, schema::Format_NC, tensor_type); + if (output_tensor == nullptr) { + MS_LOG(ERROR) << "new output tensor error!"; + delete input_tensor; + return; + } + std::vector inputs{input_tensor}; + std::vector outputs{output_tensor}; + inputs[0]->MallocData(allocator); + MS_LOG(INFO) << "Initialize input data"; + LoadActivationData(inputs[0]->Data(), inputs[0]->Size(), 
in_file); + MS_LOG(INFO) << "==================input data================"; + printf_tensor(inputs[0]); + auto *param = new (std::nothrow) ActivationParameter(); if (param == nullptr) { MS_LOG(ERROR) << "New ActivationParameter fail."; - return nullptr; + delete input_tensor; + delete output_tensor; + return; } - memcpy(param->op_parameter_.name_, test_name.c_str(), test_name.size()); - param->alpha_ = alpha; - param->type_ = type; + param->type_ = ActivationType_RELU; auto *kernel = new (std::nothrow) kernel::ActivationOpenClKernel(reinterpret_cast(param), inputs, outputs); if (kernel == nullptr) { + MS_LOG(ERROR) << "Kernel:Relu create fail."; delete param; - MS_LOG(ERROR) << "Kernel:" << test_name << " create fail."; - delete param; - return nullptr; + delete input_tensor; + delete output_tensor; + return; } auto ret = kernel->Init(); if (ret != RET_OK) { delete param; delete kernel; - MS_LOG(ERROR) << "Init " << test_name << " fail."; + delete input_tensor; + delete output_tensor; + MS_LOG(ERROR) << "Init relu fail."; + return; + } + MS_LOG(INFO) << "Create kernel SubGraphOpenCLKernel."; + std::vector kernels{kernel}; + auto *sub_graph = new (std::nothrow) kernel::SubGraphOpenCLKernel(inputs, outputs, kernels, kernels, kernels); + if (sub_graph == nullptr) { + delete kernel; + delete param; + delete input_tensor; + delete output_tensor; + MS_LOG(ERROR) << "Kernel SubGraphOpenCLKernel create fail."; + return; + } + + MS_LOG(INFO) << "Initialize sub_graph."; + ret = sub_graph->Init(); + if (ret != RET_OK) { + MS_LOG(ERROR) << "Init sub_graph error."; + delete kernel; + delete param; + delete input_tensor; + delete output_tensor; + delete sub_graph; + return; + } + MS_LOG(INFO) << "Run SubGraphOpenCLKernel."; + ret = sub_graph->Run(); + if (ret != RET_OK) { delete kernel; delete param; - return nullptr; + delete input_tensor; + delete output_tensor; + delete sub_graph; + MS_LOG(ERROR) << "Run SubGraphOpenCLKernel error."; + return; } + + MS_LOG(INFO) << "==================output data================"; + printf_tensor(outputs[0]); + CompareRes(output_tensor, out_file); + delete kernel; + delete param; + delete input_tensor; + delete output_tensor; + delete sub_graph; +} + +TEST_F(TestActivationOpenCL, Relu6Fp32_dim4) { + std::string in_file = "/data/local/tmp/in_data.bin"; + std::string out_file = "/data/local/tmp/relu6.bin"; + MS_LOG(INFO) << "Relu6 Begin test!"; + auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); + ocl_runtime->Init(); + auto allocator = ocl_runtime->GetAllocator(); + + MS_LOG(INFO) << "Init tensors."; + std::vector input_shape = {1, 9}; + auto data_type = kNumberTypeFloat32; + auto tensor_type = schema::NodeType_ValueNode; + auto *input_tensor = new (std::nothrow) lite::tensor::Tensor(data_type, input_shape, schema::Format_NC, tensor_type); + if (input_tensor == nullptr) { + MS_LOG(ERROR) << "new input tensor error!"; + return; + } + auto *output_tensor = new (std::nothrow) lite::tensor::Tensor(data_type, input_shape, schema::Format_NC, tensor_type); + if (output_tensor == nullptr) { + MS_LOG(ERROR) << "new output tensor error!"; + delete input_tensor; + return; + } + std::vector inputs{input_tensor}; + std::vector outputs{output_tensor}; + inputs[0]->MallocData(allocator); MS_LOG(INFO) << "Initialize input data"; LoadActivationData(inputs[0]->Data(), inputs[0]->Size(), in_file); MS_LOG(INFO) << "==================input data================"; printf_tensor(inputs[0]); - return kernel; -} -int RunSubGraphOpenCLKernel(const std::vector &inputs, - const 
std::vector &outputs, - kernel::ActivationOpenClKernel *kernel) { + auto *param = new (std::nothrow) ActivationParameter(); + if (param == nullptr) { + MS_LOG(ERROR) << "New ActivationParameter fail."; + delete input_tensor; + delete output_tensor; + return; + } + param->type_ = ActivationType_RELU6; + auto *kernel = + new (std::nothrow) kernel::ActivationOpenClKernel(reinterpret_cast(param), inputs, outputs); + if (kernel == nullptr) { + MS_LOG(ERROR) << "Kernel:Relu6 create fail."; + delete param; + delete input_tensor; + delete output_tensor; + return; + } + auto ret = kernel->Init(); + if (ret != RET_OK) { + delete param; + delete kernel; + delete input_tensor; + delete output_tensor; + MS_LOG(ERROR) << "Init relu6 fail."; + return; + } MS_LOG(INFO) << "Create kernel SubGraphOpenCLKernel."; std::vector kernels{kernel}; auto *sub_graph = new (std::nothrow) kernel::SubGraphOpenCLKernel(inputs, outputs, kernels, kernels, kernels); if (sub_graph == nullptr) { delete kernel; + delete param; + delete input_tensor; + delete output_tensor; MS_LOG(ERROR) << "Kernel SubGraphOpenCLKernel create fail."; - return RET_ERROR; + return; } + MS_LOG(INFO) << "Initialize sub_graph."; - auto ret = sub_graph->Init(); + ret = sub_graph->Init(); if (ret != RET_OK) { + MS_LOG(ERROR) << "Init sub_graph error."; delete kernel; + delete param; + delete input_tensor; + delete output_tensor; delete sub_graph; - MS_LOG(ERROR) << "Init sub_graph error."; - return RET_ERROR; + return; } MS_LOG(INFO) << "Run SubGraphOpenCLKernel."; ret = sub_graph->Run(); if (ret != RET_OK) { + delete kernel; + delete param; + delete input_tensor; + delete output_tensor; delete sub_graph; MS_LOG(ERROR) << "Run SubGraphOpenCLKernel error."; - return RET_ERROR; + return; } + + MS_LOG(INFO) << "==================output data================"; + printf_tensor(outputs[0]); + CompareRes(output_tensor, out_file); + delete kernel; + delete param; + delete input_tensor; + delete output_tensor; delete sub_graph; - return RET_OK; } -TEST_F(TestActivationOpenCL, ActivationFp32_dim4) { - MS_LOG(INFO) << "Begin test!"; +TEST_F(TestActivationOpenCL, SigmoidFp32_dim4) { + std::string in_file = "/data/local/tmp/in_data.bin"; + std::string out_file = "/data/local/tmp/sigmoid.bin"; + MS_LOG(INFO) << "Sigmoid Begin test!"; auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); ocl_runtime->Init(); auto allocator = ocl_runtime->GetAllocator(); MS_LOG(INFO) << "Init tensors."; - std::vector input_shape = {1, 4, 3, 8}; - + std::vector input_shape = {1, 9}; auto data_type = kNumberTypeFloat32; auto tensor_type = schema::NodeType_ValueNode; - auto *input_tensor = - new (std::nothrow) lite::tensor::Tensor(data_type, input_shape, schema::Format_NHWC4, tensor_type); + auto *input_tensor = new (std::nothrow) lite::tensor::Tensor(data_type, input_shape, schema::Format_NC, tensor_type); if (input_tensor == nullptr) { MS_LOG(ERROR) << "new input tensor error!"; return; } - auto *output_tensor = - new (std::nothrow) lite::tensor::Tensor(data_type, input_shape, schema::Format_NHWC4, tensor_type); + auto *output_tensor = new (std::nothrow) lite::tensor::Tensor(data_type, input_shape, schema::Format_NC, tensor_type); if (output_tensor == nullptr) { MS_LOG(ERROR) << "new output tensor error!"; delete input_tensor; @@ -166,40 +303,184 @@ TEST_F(TestActivationOpenCL, ActivationFp32_dim4) { std::vector inputs{input_tensor}; std::vector outputs{output_tensor}; inputs[0]->MallocData(allocator); + MS_LOG(INFO) << "Initialize input data"; + 
LoadActivationData(inputs[0]->Data(), inputs[0]->Size(), in_file); + MS_LOG(INFO) << "==================input data================"; + printf_tensor(inputs[0]); + + auto *param = new (std::nothrow) ActivationParameter(); + if (param == nullptr) { + MS_LOG(ERROR) << "New ActivationParameter fail."; + delete input_tensor; + delete output_tensor; + return; + } + param->type_ = ActivationType_SIGMOID; + auto *kernel = + new (std::nothrow) kernel::ActivationOpenClKernel(reinterpret_cast(param), inputs, outputs); + if (kernel == nullptr) { + MS_LOG(ERROR) << "Kernel:Sigmoid create fail."; + delete param; + delete input_tensor; + delete output_tensor; + return; + } + auto ret = kernel->Init(); + if (ret != RET_OK) { + delete param; + delete kernel; + delete input_tensor; + delete output_tensor; + MS_LOG(ERROR) << "Init sigmoid fail."; + return; + } + MS_LOG(INFO) << "Create kernel SubGraphOpenCLKernel."; + std::vector kernels{kernel}; + auto *sub_graph = new (std::nothrow) kernel::SubGraphOpenCLKernel(inputs, outputs, kernels, kernels, kernels); + if (sub_graph == nullptr) { + delete kernel; + delete param; + delete input_tensor; + delete output_tensor; + MS_LOG(ERROR) << "Kernel SubGraphOpenCLKernel create fail."; + return; + } + + MS_LOG(INFO) << "Initialize sub_graph."; + ret = sub_graph->Init(); + if (ret != RET_OK) { + MS_LOG(ERROR) << "Init sub_graph error."; + delete kernel; + delete param; + delete input_tensor; + delete output_tensor; + delete sub_graph; + return; + } + MS_LOG(INFO) << "Run SubGraphOpenCLKernel."; + ret = sub_graph->Run(); + if (ret != RET_OK) { + delete kernel; + delete param; + delete input_tensor; + delete output_tensor; + delete sub_graph; + MS_LOG(ERROR) << "Run SubGraphOpenCLKernel error."; + return; + } - std::map Test_Activation_Type; - std::map Test_Res_File; - Test_Activation_Type["Relu"] = ActivationType_RELU; - Test_Activation_Type["Leaky_Relu"] = ActivationType_LEAKY_RELU; - Test_Activation_Type["Relu6"] = ActivationType_RELU6; - Test_Activation_Type["Sigmoid"] = ActivationType_SIGMOID; - Test_Res_File["Leaky_Relu"] = "/data/local/tmp/leaky_relu.bin"; - Test_Res_File["Relu"] = "/data/local/tmp/relu.bin"; - Test_Res_File["Relu6"] = "/data/local/tmp/relu6.bin"; - Test_Res_File["Sigmoid"] = "/data/local/tmp/sigmoid.bin"; + MS_LOG(INFO) << "==================output data================"; + printf_tensor(outputs[0]); + CompareRes(output_tensor, out_file); + delete kernel; + delete param; + delete input_tensor; + delete output_tensor; + delete sub_graph; + lite::opencl::OpenCLRuntime::DeleteInstance(); +} + +TEST_F(TestActivationOpenCL, LeakyReluFp32_dim4) { std::string in_file = "/data/local/tmp/in_data.bin"; + std::string out_file = "/data/local/tmp/leaky_relu.bin"; + MS_LOG(INFO) << "Leaky relu Begin test!"; + auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); + ocl_runtime->Init(); + auto allocator = ocl_runtime->GetAllocator(); - std::map::iterator it = Test_Activation_Type.begin(); - while (it != Test_Activation_Type.end()) { - auto kernel = create_kernel(allocator, inputs, outputs, it->first, it->second, in_file, 0.3); - if (kernel == nullptr) { - MS_LOG(ERROR) << "Create kernel:" << it->first << " error."; - return; - } + MS_LOG(INFO) << "Init tensors."; + std::vector input_shape = {1, 9}; + auto data_type = kNumberTypeFloat32; + auto tensor_type = schema::NodeType_ValueNode; + auto *input_tensor = new (std::nothrow) lite::tensor::Tensor(data_type, input_shape, schema::Format_NC, tensor_type); + if (input_tensor == nullptr) { + MS_LOG(ERROR) << 
"new input tensor error!"; + return; + } + auto *output_tensor = new (std::nothrow) lite::tensor::Tensor(data_type, input_shape, schema::Format_NC, tensor_type); + if (output_tensor == nullptr) { + MS_LOG(ERROR) << "new output tensor error!"; + delete input_tensor; + return; + } + std::vector inputs{input_tensor}; + std::vector outputs{output_tensor}; + inputs[0]->MallocData(allocator); + MS_LOG(INFO) << "Initialize input data"; + LoadActivationData(inputs[0]->Data(), inputs[0]->Size(), in_file); + MS_LOG(INFO) << "==================input data================"; + printf_tensor(inputs[0]); - auto ret = RunSubGraphOpenCLKernel(inputs, outputs, kernel); - if (ret != RET_OK) { - MS_LOG(ERROR) << it->first << " RunSubGraphOpenCLKernel error."; - return; - } - MS_LOG(INFO) << "==================output data================"; - printf_tensor(outputs[0]); - CompareRes(output_tensor, Test_Res_File[it->first]); - it++; + auto *param = new (std::nothrow) ActivationParameter(); + if (param == nullptr) { + MS_LOG(ERROR) << "New ActivationParameter fail."; + delete input_tensor; + delete output_tensor; + return; + } + param->alpha_ = 0.3; + param->type_ = ActivationType_LEAKY_RELU; + auto *kernel = + new (std::nothrow) kernel::ActivationOpenClKernel(reinterpret_cast(param), inputs, outputs); + if (kernel == nullptr) { + MS_LOG(ERROR) << "Kernel:leaky relu create fail."; + delete param; + delete input_tensor; + delete output_tensor; + return; + } + auto ret = kernel->Init(); + if (ret != RET_OK) { + delete param; + delete kernel; + delete input_tensor; + delete output_tensor; + MS_LOG(ERROR) << "Init leaky relu fail."; + return; + } + MS_LOG(INFO) << "Create kernel SubGraphOpenCLKernel."; + std::vector kernels{kernel}; + auto *sub_graph = new (std::nothrow) kernel::SubGraphOpenCLKernel(inputs, outputs, kernels, kernels, kernels); + if (sub_graph == nullptr) { + delete kernel; + delete param; + delete input_tensor; + delete output_tensor; + MS_LOG(ERROR) << "Kernel SubGraphOpenCLKernel create fail."; + return; + } + + MS_LOG(INFO) << "Initialize sub_graph."; + ret = sub_graph->Init(); + if (ret != RET_OK) { + MS_LOG(ERROR) << "Init sub_graph error."; + delete kernel; + delete param; + delete input_tensor; + delete output_tensor; + delete sub_graph; + return; + } + MS_LOG(INFO) << "Run SubGraphOpenCLKernel."; + ret = sub_graph->Run(); + if (ret != RET_OK) { + delete kernel; + delete param; + delete input_tensor; + delete output_tensor; + delete sub_graph; + MS_LOG(ERROR) << "Run SubGraphOpenCLKernel error."; + return; } + MS_LOG(INFO) << "==================output data================"; + printf_tensor(outputs[0]); + CompareRes(output_tensor, out_file); + delete kernel; + delete param; delete input_tensor; delete output_tensor; + delete sub_graph; lite::opencl::OpenCLRuntime::DeleteInstance(); } } // namespace mindspore diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/avg_pooling_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/avg_pooling_tests.cc index 1774875d1d..f74cd7cfda 100644 --- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/avg_pooling_tests.cc +++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/avg_pooling_tests.cc @@ -48,8 +48,7 @@ void InitAvgPoolingParam(PoolingParameter *param) { param->pad_l_ = 0; param->pad_r_ = 0; - param->max_pooling_ = false; - param->avg_pooling_ = true; + param->pool_mode_ = PoolMode_AvgPool; } TEST_F(TestAvgPoolingOpenCL, AvgPoolFp32) { @@ -129,7 +128,6 @@ TEST_F(TestAvgPoolingOpenCL, AvgPoolFp32) { printf("%.3f ", output_data[i]); } 
printf("\n"); - size_t output_size = tensor_out->Size(); float expect[4] = {2.0f, 3.0f, 4.0f, 5.0f}; for (int i = 0; i < tensor_out->ElementsNum(); ++i) diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/batchnorm_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/batchnorm_tests.cc index f8683bdc85..ab95c9e57e 100644 --- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/batchnorm_tests.cc +++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/batchnorm_tests.cc @@ -23,9 +23,13 @@ #include "mindspore/lite/src/runtime/kernel/opencl/kernel/batchnorm.h" namespace mindspore { -class TestBatchnormOpenCL : public mindspore::CommonTest { +class TestBatchnormOpenCLfp32 : public mindspore::CommonTest { public: - TestBatchnormOpenCL() {} + TestBatchnormOpenCLfp32() {} +}; +class TestBatchnormOpenCLfp16 : public mindspore::CommonTest { + public: + TestBatchnormOpenCLfp16() {} }; template @@ -35,30 +39,154 @@ void CompareOutputData1(T *output_data, T *correct_data, int size, float err_bou ASSERT_LE(abs, err_bound); } } +TEST_F(TestBatchnormOpenCLfp16, Batchnormfp16input_dim4) { + MS_LOG(INFO) << "begin test"; + auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); + ocl_runtime->SetFp16Enable(true); + ocl_runtime->Init(); + auto allocator = ocl_runtime->GetAllocator(); + + MS_LOG(INFO) << "Read tensors from .bin"; + std::vector input_shape = {1, 256, 256, 48}; + std::vector output_shape = {1, 256, 256, 48}; + auto data_type = kNumberTypeFloat32; + auto tensor_type = schema::NodeType_ValueNode; + + // get the input from .bin + size_t input_size, output_size; + std::string input_path = "./test_data/batchnorm_in_datafp16.bin"; + std::string mean_path = "./test_data/batchnorm_meanfp16.bin"; + std::string var_path = "./test_data/batchnorm_varfp16.bin"; + std::string offset_path = "./test_data/batchnorm_offsetfp16.bin"; + std::string scale_path = "./test_data/batchnorm_scalefp16.bin"; + std::string output_path = "./test_data/batchnorm_out_datafp16.bin"; + auto input_data = reinterpret_cast(mindspore::lite::ReadFile(input_path.c_str(), &input_size)); + auto correct_data = reinterpret_cast(mindspore::lite::ReadFile(output_path.c_str(), &output_size)); + size_t mean_size, var_size, scale_size, offset_size; + auto mean_data = reinterpret_cast(mindspore::lite::ReadFile(mean_path.c_str(), &mean_size)); + auto var_data = reinterpret_cast(mindspore::lite::ReadFile(var_path.c_str(), &var_size)); + auto scale_data = reinterpret_cast(mindspore::lite::ReadFile(scale_path.c_str(), &scale_size)); + auto offset_data = reinterpret_cast(mindspore::lite::ReadFile(offset_path.c_str(), &offset_size)); + + MS_LOG(INFO) << "construct tensors"; + lite::tensor::Tensor *tensor_data = + new (std::nothrow) lite::tensor::Tensor(data_type, input_shape, schema::Format_NHWC, tensor_type); + lite::tensor::Tensor *tensor_mean = + new (std::nothrow) lite::tensor::Tensor(data_type, {1, 1, 1, input_shape[3]}, schema::Format_NHWC, tensor_type); + lite::tensor::Tensor *tensor_var = + new (std::nothrow) lite::tensor::Tensor(data_type, {1, 1, 1, input_shape[3]}, schema::Format_NHWC, tensor_type); + lite::tensor::Tensor *tensor_scale = + new (std::nothrow) lite::tensor::Tensor(data_type, {1, 1, 1, input_shape[3]}, schema::Format_NHWC, tensor_type); + lite::tensor::Tensor *tensor_offset = + new (std::nothrow) lite::tensor::Tensor(data_type, {1, 1, 1, input_shape[3]}, schema::Format_NHWC, tensor_type); + if (tensor_data == nullptr || tensor_mean == nullptr || tensor_var == nullptr || tensor_scale == nullptr || + 
tensor_offset == nullptr) { + MS_LOG(INFO) << "init tensor failed"; + return; + } + auto *output_tensor = + new (std::nothrow) lite::tensor::Tensor(data_type, output_shape, schema::Format_NHWC4, tensor_type); + if (output_tensor == nullptr) { + MS_LOG(INFO) << "init tensor failed"; + delete tensor_data; + delete tensor_mean; + delete tensor_var; + delete tensor_scale; + delete tensor_offset; + return; + } + std::vector inputs = {tensor_data, tensor_scale, tensor_offset, tensor_mean, tensor_var}; + std::vector outputs{output_tensor}; -TEST_F(TestBatchnormOpenCL, Batchnorminput_dim4) { + MS_LOG(INFO) << "initialize tensors"; + auto param = new (std::nothrow) BatchNormParameter(); + if (param == nullptr) { + MS_LOG(INFO) << "new BatchNormParameter failed"; + for (auto tensor : outputs) { + delete tensor; + } + return; + } + param->epsilon_ = pow(10, -5); + auto *batchnorm_kernel = + new (std::nothrow) kernel::BatchNormOpenCLKernel(reinterpret_cast(param), inputs, outputs); + if (batchnorm_kernel == nullptr) { + MS_LOG(INFO) << "new kernel::BatchNorm_kernel failed"; + for (auto tensor : outputs) { + delete tensor; + } + delete param; + return; + } + batchnorm_kernel->Init(); + + // to do allocate memory for inputs and outputs + for (auto &input_tensor : inputs) { + input_tensor->MallocData(allocator); + } + + MS_LOG(INFO) << "initialize sub_graph"; + std::vector kernels{batchnorm_kernel}; + auto *sub_graph = new (std::nothrow) kernel::SubGraphOpenCLKernel(inputs, outputs, kernels, kernels, kernels); + if (sub_graph == nullptr) { + MS_LOG(INFO) << "new kernel::SubGraphOpenCLKernel failed"; + for (auto tensor : outputs) { + delete tensor; + } + delete param; + delete batchnorm_kernel; + return; + } + sub_graph->Init(); + MS_LOG(INFO) << "init tensors"; + memcpy(inputs[0]->Data(), input_data, input_size); + memcpy(inputs[1]->Data(), scale_data, scale_size); + memcpy(inputs[2]->Data(), offset_data, offset_size); + memcpy(inputs[3]->Data(), mean_data, mean_size); + memcpy(inputs[4]->Data(), var_data, var_size); + std::cout << "==================output data================" << std::endl; + sub_graph->Run(); + + auto *output_data_gpu = reinterpret_cast(output_tensor->Data()); + CompareOutputData1(output_data_gpu, correct_data, output_tensor->ElementsNum(), 0.0001); + for (auto tensor : inputs) { + delete tensor; + } + for (auto tensor : outputs) { + delete tensor; + } + delete param; + delete batchnorm_kernel; + delete sub_graph; + lite::opencl::OpenCLRuntime::DeleteInstance(); +} +TEST_F(TestBatchnormOpenCLfp32, Batchnormfp32input_dim4) { MS_LOG(INFO) << "begin test"; auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); ocl_runtime->Init(); auto allocator = ocl_runtime->GetAllocator(); MS_LOG(INFO) << "Read tensors from .bin"; - std::vector input_shape = {1, 256, 256, 16}; - std::vector output_shape = {1, 256, 256, 16}; + std::vector input_shape = {1, 256, 256, 47}; + std::vector output_shape = {1, 256, 256, 47}; auto data_type = kNumberTypeFloat32; auto tensor_type = schema::NodeType_ValueNode; // get the input from .bin size_t input_size, output_size; - std::string input_path = "./test_data/in_data.bin"; - std::string mean_path = "./test_data/mean.bin"; - std::string var_path = "./test_data/var.bin"; - std::string output_path = "./test_data/out_data.bin"; + std::string input_path = "./test_data/batchnorm_in_datafp32.bin"; + std::string mean_path = "./test_data/batchnorm_meanfp32.bin"; + std::string var_path = "./test_data/batchnorm_varfp32.bin"; + std::string offset_path = 
"./test_data/batchnorm_offsetfp32.bin"; + std::string scale_path = "./test_data/batchnorm_scalefp32.bin"; + std::string output_path = "./test_data/batchnorm_out_datafp32.bin"; auto input_data = reinterpret_cast(mindspore::lite::ReadFile(input_path.c_str(), &input_size)); auto correct_data = reinterpret_cast(mindspore::lite::ReadFile(output_path.c_str(), &output_size)); - size_t mean_size, var_size; + size_t mean_size, var_size, scale_size, offset_size; auto mean_data = reinterpret_cast(mindspore::lite::ReadFile(mean_path.c_str(), &mean_size)); auto var_data = reinterpret_cast(mindspore::lite::ReadFile(var_path.c_str(), &var_size)); + auto scale_data = reinterpret_cast(mindspore::lite::ReadFile(scale_path.c_str(), &scale_size)); + auto offset_data = reinterpret_cast(mindspore::lite::ReadFile(offset_path.c_str(), &offset_size)); MS_LOG(INFO) << "construct tensors"; lite::tensor::Tensor *tensor_data = @@ -131,14 +259,9 @@ TEST_F(TestBatchnormOpenCL, Batchnorminput_dim4) { } sub_graph->Init(); MS_LOG(INFO) << "init tensors"; - std::cout << "init tensors" << std::endl; memcpy(inputs[0]->Data(), input_data, input_size); - auto &temp = inputs[1]; - auto tensor_temp = reinterpret_cast(temp->Data()); - int UPDIV_tensor_scale = UP_DIV(tensor_scale->ElementsNum(), C4NUM) * 4; - for (int i = 0; i < UPDIV_tensor_scale; ++i) { - tensor_temp[i] = static_cast(1); - } + memcpy(inputs[1]->Data(), scale_data, scale_size); + memcpy(inputs[2]->Data(), offset_data, offset_size); memcpy(inputs[3]->Data(), mean_data, mean_size); memcpy(inputs[4]->Data(), var_data, var_size); std::cout << "==================output data================" << std::endl; diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/caffe_prelu_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/biasadd_tests.cc similarity index 63% rename from mindspore/lite/test/ut/src/runtime/kernel/opencl/caffe_prelu_tests.cc rename to mindspore/lite/test/ut/src/runtime/kernel/opencl/biasadd_tests.cc index ac6591be4e..57960e4094 100644 --- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/caffe_prelu_tests.cc +++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/biasadd_tests.cc @@ -18,22 +18,20 @@ #include "utils/log_adapter.h" #include "common/common_test.h" #include "mindspore/lite/src/common/file_utils.h" -#include "nnacl/pack.h" #include "mindspore/lite/src/runtime/opencl/opencl_runtime.h" #include "mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.h" -#include "mindspore/lite/src/runtime/kernel/opencl/kernel/caffe_prelu.h" -#include "mindspore/lite/nnacl/prelu_parameter.h" +#include "mindspore/lite/src/runtime/kernel/opencl/kernel/biasadd.h" -using mindspore::kernel::CaffePReluOpenCLKernel; +using mindspore::kernel::BiasAddOpenCLKernel; using mindspore::kernel::LiteKernel; using mindspore::kernel::SubGraphOpenCLKernel; using mindspore::lite::RET_ERROR; using mindspore::lite::RET_OK; namespace mindspore { -class TestCaffePReluOpenCL : public mindspore::CommonTest {}; +class TestBiasAddOpenCL : public mindspore::CommonTest {}; -void LoadDataCaffePRelu(void *dst, size_t dst_size, const std::string &file_path) { +void LoadDataBiasAdd(void *dst, size_t dst_size, const std::string &file_path) { if (file_path.empty()) { memset(dst, 0x00, dst_size); } else { @@ -42,12 +40,12 @@ void LoadDataCaffePRelu(void *dst, size_t dst_size, const std::string &file_path } } -void CompareOutCaffePRelu(lite::tensor::Tensor *output_tensor, const std::string &standard_answer_file) { +void CompareOutBiasAdd(lite::tensor::Tensor *output_tensor, 
const std::string &standard_answer_file) { auto *output_data = reinterpret_cast(output_tensor->Data()); - size_t output_size = output_tensor->ElementsC4Num(); + size_t output_size = output_tensor->ElementsNum(); auto expect_data = reinterpret_cast(mindspore::lite::ReadFile(standard_answer_file.c_str(), &output_size)); constexpr float atol = 0.0002; - for (int i = 0; i < output_tensor->ElementsC4Num(); ++i) { + for (int i = 0; i < output_tensor->ElementsNum(); ++i) { if (std::fabs(output_data[i] - expect_data[i]) > atol) { printf("error at idx[%d] expect=%.3f output=%.3f\n", i, expect_data[i], output_data[i]); printf("error at idx[%d] expect=%.3f output=%.3f\n", i, expect_data[i], output_data[i]); @@ -60,7 +58,7 @@ void CompareOutCaffePRelu(lite::tensor::Tensor *output_tensor, const std::string printf("compare success!\n\n\n"); } -void printf_tensor_caffeprelu(mindspore::lite::tensor::Tensor *in_data, int size) { +void printf_tensor_BiasAdd(mindspore::lite::tensor::Tensor *in_data, int size) { auto input_data = reinterpret_cast(in_data->Data()); for (int i = 0; i < size; ++i) { printf("%f ", input_data[i]); @@ -69,7 +67,7 @@ void printf_tensor_caffeprelu(mindspore::lite::tensor::Tensor *in_data, int size MS_LOG(INFO) << "Print tensor done"; } -void printf_float(float *data, int num = 0) { +void printf_float_BiasAdd(float *data, int num = 0) { float *temp = data; for (int i = 0; i < num; ++i) { std::cout << *temp << " "; @@ -78,126 +76,128 @@ void printf_float(float *data, int num = 0) { std::cout << std::endl; } -TEST_F(TestCaffePReluOpenCL, CaffePReluFp32_dim4) { +TEST_F(TestBiasAddOpenCL, BiasAddFp32_dim4) { std::string in_file = "/data/local/tmp/in_data.bin"; std::string weight_file = "/data/local/tmp/weight_data.bin"; - std::string standard_answer_file = "/data/local/tmp/caffeprelu.bin"; - MS_LOG(INFO) << "CaffePRelu Begin test:"; + std::string standard_answer_file = "/data/local/tmp/biasadd.bin"; + MS_LOG(INFO) << "BiasAdd Begin test:"; auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); ocl_runtime->Init(); auto allocator = ocl_runtime->GetAllocator(); - MS_LOG(INFO) << "CaffePRelu init tensors."; + MS_LOG(INFO) << "BiasAdd init tensors."; - std::vector input_shape = {1, 4, 3, 9}; - std::vector output_shape = {1, 4, 3, 9}; + std::vector input_shape = {1, 9}; + std::vector output_shape = {1, 9}; auto data_type = kNumberTypeFloat32; auto tensor_type = schema::NodeType_ValueNode; auto *input_tensor = - new (std::nothrow) lite::tensor::Tensor(data_type, input_shape, schema::Format_NHWC, tensor_type); + new (std::nothrow) lite::tensor::Tensor(data_type, input_shape, schema::Format_NC, tensor_type); if (input_tensor == nullptr) { - MS_LOG(ERROR) << "new input tensor error"; + MS_LOG(ERROR) << "new input tensor error!"; return; } - auto *output_tensor = new lite::tensor::Tensor(data_type, output_shape, schema::Format_NHWC4, tensor_type); + auto *output_tensor = + new (std::nothrow) lite::tensor::Tensor(data_type, output_shape, schema::Format_NC, tensor_type); if (output_tensor == nullptr) { - MS_LOG(ERROR) << "new output_tensor error"; + MS_LOG(ERROR) << "new output tensor error!"; delete input_tensor; return; } auto *weight_tensor = new (std::nothrow) - lite::tensor::Tensor(data_type, std::vector{input_shape[3]}, schema::Format_NHWC, tensor_type); + lite::tensor::Tensor(data_type, std::vector{input_shape[1]}, schema::Format_NHWC, tensor_type); if (weight_tensor == nullptr) { - MS_LOG(ERROR) << "new weight_tensor error"; - delete input_tensor; + MS_LOG(ERROR) << "new weight tensor 
error!"; delete output_tensor; + delete input_tensor; return; } - std::vector inputs{input_tensor, weight_tensor}; std::vector outputs{output_tensor}; inputs[0]->MallocData(allocator); inputs[1]->MallocData(allocator); - std::cout << input_tensor->Size() << std::endl; - LoadDataCaffePRelu(input_tensor->Data(), input_tensor->Size(), in_file); - MS_LOG(INFO) << "CaffePRelu==================input data================"; - printf_tensor_caffeprelu(inputs[0], input_tensor->ElementsNum()); - - LoadDataCaffePRelu(weight_tensor->Data(), weight_tensor->Size(), weight_file); - MS_LOG(INFO) << "CaffePRelu==================weight data================"; - printf_tensor_caffeprelu(inputs[1], weight_tensor->ElementsNum()); - - auto param = new (std::nothrow) PReluParameter(); + LoadDataBiasAdd(input_tensor->Data(), input_tensor->Size(), in_file); + MS_LOG(INFO) << "BiasAdd==================input data================"; + printf_tensor_BiasAdd(inputs[0], input_tensor->ElementsNum()); + LoadDataBiasAdd(weight_tensor->Data(), weight_tensor->Size(), weight_file); + MS_LOG(INFO) << "BiasAdd==================weight data================"; + printf_tensor_BiasAdd(inputs[1], weight_tensor->ElementsNum()); + + auto *param = new (std::nothrow) OpParameter(); if (param == nullptr) { - MS_LOG(ERROR) << "new param error!"; delete input_tensor; delete output_tensor; delete weight_tensor; + MS_LOG(ERROR) << "new OpParameter error!"; return; } - param->channel_num_ = input_shape[3]; - auto *caffeprelu_kernel = - new (std::nothrow) kernel::CaffePReluOpenCLKernel(reinterpret_cast(param), inputs, outputs); - if (caffeprelu_kernel == nullptr) { - delete param; + auto *biasadd_kernel = + new (std::nothrow) kernel::BiasAddOpenCLKernel(reinterpret_cast(param), inputs, outputs); + if (biasadd_kernel == nullptr) { + MS_LOG(ERROR) << "Create biasadd kernel error."; delete input_tensor; delete output_tensor; delete weight_tensor; - MS_LOG(ERROR) << "Create caffe prelu kernel error."; + delete param; return; } - auto ret = caffeprelu_kernel->Init(); + auto ret = biasadd_kernel->Init(); if (ret != RET_OK) { - delete param; + MS_LOG(ERROR) << "biasadd kernel init error."; delete input_tensor; delete output_tensor; delete weight_tensor; - delete caffeprelu_kernel; - MS_LOG(ERROR) << "caffeprelu_kernel init error."; + delete param; + delete biasadd_kernel; return; } MS_LOG(INFO) << "initialize sub_graph"; - std::vector kernels{caffeprelu_kernel}; + std::vector kernels{biasadd_kernel}; auto *sub_graph = new (std::nothrow) kernel::SubGraphOpenCLKernel({input_tensor}, outputs, kernels, kernels, kernels); if (sub_graph == nullptr) { - delete param; + MS_LOG(ERROR) << "Create sub_graph kernel error."; delete input_tensor; delete output_tensor; delete weight_tensor; - delete caffeprelu_kernel; - MS_LOG(ERROR) << "Create sub_graph kernel error."; + delete param; + delete biasadd_kernel; return; } ret = sub_graph->Init(); if (ret != RET_OK) { - delete param; + MS_LOG(ERROR) << "sub_graph init error."; delete input_tensor; delete output_tensor; delete weight_tensor; - delete caffeprelu_kernel; delete sub_graph; - MS_LOG(ERROR) << "sub_graph init error."; + delete param; + delete biasadd_kernel; return; } MS_LOG(INFO) << "Sub graph begin running!"; ret = sub_graph->Run(); if (ret != RET_OK) { + MS_LOG(ERROR) << "sub_graph run error."; delete input_tensor; delete output_tensor; delete weight_tensor; delete sub_graph; - MS_LOG(ERROR) << "sub_graph run error."; + delete param; + delete biasadd_kernel; return; } - MS_LOG(INFO) << 
"CaffePRelu==================output data================"; - printf_tensor_caffeprelu(outputs[0], output_tensor->ElementsC4Num()); - CompareOutCaffePRelu(output_tensor, standard_answer_file); + MS_LOG(INFO) << "BiasAdd==================output data================"; + printf_tensor_BiasAdd(outputs[0], output_tensor->ElementsNum()); + CompareOutBiasAdd(output_tensor, standard_answer_file); delete input_tensor; - delete output_tensor; delete weight_tensor; + delete output_tensor; delete sub_graph; + delete param; + delete biasadd_kernel; + lite::opencl::OpenCLRuntime::DeleteInstance(); } } // namespace mindspore diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/concat_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/concat_tests.cc index 693cccab2d..08f6223de2 100644 --- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/concat_tests.cc +++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/concat_tests.cc @@ -21,9 +21,10 @@ #include "mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.h" #include "mindspore/lite/src/runtime/kernel/opencl/kernel/concat.h" -void ConcatComputeByCPU_2input_dim4_axis3(const float *input0, const float *input1, float *output, - std::vector input_shape0, std::vector input_shape1, - std::vector output_shape, const int axis) { +template +void ConcatComputeByCPU_2input_dim4_axis3(const T *input0, const T *input1, T *output, std::vector input_shape0, + std::vector input_shape1, std::vector output_shape, + const int axis) { int postion, index0 = 0, index1 = 0; for (int i = 0; i < output_shape[0]; i++) { for (int j = 0; j < output_shape[1]; j++) { @@ -43,10 +44,10 @@ void ConcatComputeByCPU_2input_dim4_axis3(const float *input0, const float *inpu } } } -void ConcatComputeByCPU_3input_dim4_axis3(float *input0, float *input1, float *input2, float *output, - std::vector input_shape0, std::vector input_shape1, - std::vector input_shape2, std::vector output_shape, - const int axis) { +template +void ConcatComputeByCPU_3input_dim4_axis3(T *input0, T *input1, T *input2, T *output, std::vector input_shape0, + std::vector input_shape1, std::vector input_shape2, + std::vector output_shape, const int axis) { int postion, index0 = 0, index1 = 0, index2 = 0; for (int i = 0; i < output_shape[0]; i++) { for (int j = 0; j < output_shape[1]; j++) { @@ -82,9 +83,13 @@ void ConcatComputeByCPU_3input_dim4_axis3(float *input0, float *input1, float *i } namespace mindspore { -class TestConcatOpenCL : public mindspore::CommonTest { +class TestConcatOpenCLfp32 : public mindspore::CommonTest { + public: + TestConcatOpenCLfp32() {} +}; +class TestConcatOpenCLfp16 : public mindspore::CommonTest { public: - TestConcatOpenCL() {} + TestConcatOpenCLfp16() {} }; template @@ -94,18 +99,139 @@ void CompareOutputData1(T *output_data, T *correct_data, int size, float err_bou ASSERT_LE(abs, err_bound); } } +TEST_F(TestConcatOpenCLfp16, ConcatFp16_2input_dim4_axis3) { + MS_LOG(INFO) << "begin test"; + auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); + ocl_runtime->SetFp16Enable(true); + ocl_runtime->Init(); + auto allocator = ocl_runtime->GetAllocator(); + + MS_LOG(INFO) << "init tensors"; + constexpr int INPUT_NUM = 3; + std::array, INPUT_NUM> input_shapes = { + std::vector{1, 16, 256, 80}, std::vector{1, 16, 256, 80}, std::vector{1, 16, 256, 80}}; + std::vector output_shape = {1, 16, 256, 240}; + auto data_type = kNumberTypeFloat16; + auto tensor_type = schema::NodeType_ValueNode; + std::vector inputs; + for (auto &shape : input_shapes) { + auto input_temp = 
new (std::nothrow) lite::tensor::Tensor(data_type, shape, schema::Format_NHWC4, tensor_type); + inputs.push_back(input_temp); + if (input_temp == nullptr) { + MS_LOG(INFO) << "new input_tensor failed"; + return; + } + } + auto *output_tensor = + new (std::nothrow) lite::tensor::Tensor(data_type, output_shape, schema::Format_NHWC4, tensor_type); + if (output_tensor == nullptr) { + MS_LOG(INFO) << "new output_tensor failed"; + for (auto tensor : inputs) { + delete tensor; + } + return; + } + std::vector outputs{output_tensor}; + MS_LOG(INFO) << "input_shapes size=: " << input_shapes.size(); + + MS_LOG(INFO) << "initialize tensors"; + auto param = new (std::nothrow) ConcatParameter(); + if (param == nullptr) { + MS_LOG(INFO) << "new ConcatParameter failed"; + for (auto tensor : inputs) { + delete tensor; + } + for (auto tensor : outputs) { + delete tensor; + } + return; + } + param->axis_ = 3; + auto *concat_kernel = + new (std::nothrow) kernel::ConcatOpenCLKernel(reinterpret_cast(param), inputs, outputs); + if (concat_kernel == nullptr) { + MS_LOG(INFO) << "new kernel::ConcatOpenCLKernel failed"; + for (auto tensor : inputs) { + delete tensor; + } + for (auto tensor : outputs) { + delete tensor; + } + delete param; + return; + } + concat_kernel->Init(); + // to do allocate memory for inputs and outputs + for (auto &input_tensor : inputs) { + input_tensor->MallocData(allocator); + } + + MS_LOG(INFO) << "initialize sub_graph"; + std::vector kernels{concat_kernel}; + auto *sub_graph = new (std::nothrow) kernel::SubGraphOpenCLKernel(inputs, outputs, kernels, kernels, kernels); + if (sub_graph == nullptr) { + MS_LOG(INFO) << "new kernel::SubGraphOpenCLKernel failed"; + for (auto tensor : inputs) { + delete tensor; + } + for (auto tensor : outputs) { + delete tensor; + } + delete param; + delete concat_kernel; + return; + } + sub_graph->Init(); + unsigned int seed = 123; + MS_LOG(INFO) << "initialize input data"; + for (auto &input_tensor : inputs) { + auto input_data = reinterpret_cast(input_tensor->Data()); + for (int i = 0; i < input_tensor->ElementsNum(); ++i) { + input_data[i] = static_cast(rand_r(&seed) % 10 + 1); + } + } + + // compute the result for CPU + auto *input_data0 = reinterpret_cast(inputs[0]->Data()); + auto *input_data1 = reinterpret_cast(inputs[1]->Data()); + std::vector output_data_cpu(output_shape[0] * output_shape[1] * output_shape[2] * output_shape[3]); + if (inputs.size() == 2) { + ConcatComputeByCPU_2input_dim4_axis3(input_data0, input_data1, output_data_cpu.data(), input_shapes[0], + input_shapes[1], output_shape, param->axis_); + } + if (inputs.size() == 3) { + auto *input_data2 = reinterpret_cast(inputs[2]->Data()); + ConcatComputeByCPU_3input_dim4_axis3(input_data0, input_data1, input_data2, output_data_cpu.data(), input_shapes[0], + input_shapes[1], input_shapes[2], output_shape, param->axis_); + } + + std::cout << "==================output data================" << std::endl; + sub_graph->Run(); + auto *output_data_gpu = reinterpret_cast(output_tensor->Data()); + CompareOutputData1(output_data_gpu, output_data_cpu.data(), output_tensor->ElementsNum(), 0.00001); + for (auto tensor : inputs) { + delete tensor; + } + for (auto tensor : outputs) { + delete tensor; + } + delete param; + delete concat_kernel; + delete sub_graph; + lite::opencl::OpenCLRuntime::DeleteInstance(); +} -TEST_F(TestConcatOpenCL, ConcatFp32_2input_dim4_axis3) { +TEST_F(TestConcatOpenCLfp32, ConcatFp32_2input_dim4_axis3) { MS_LOG(INFO) << "begin test"; auto ocl_runtime = 
lite::opencl::OpenCLRuntime::GetInstance(); ocl_runtime->Init(); auto allocator = ocl_runtime->GetAllocator(); MS_LOG(INFO) << "init tensors"; - constexpr int INPUT_NUM = 2; - std::array, INPUT_NUM> input_shapes = {std::vector{1, 16, 256, 80}, - std::vector{1, 16, 256, 80}}; - std::vector output_shape = {1, 16, 256, 160}; + constexpr int INPUT_NUM = 3; + std::array, INPUT_NUM> input_shapes = { + std::vector{1, 16, 256, 80}, std::vector{1, 16, 256, 80}, std::vector{1, 16, 256, 80}}; + std::vector output_shape = {1, 16, 256, 240}; auto data_type = kNumberTypeFloat32; auto tensor_type = schema::NodeType_ValueNode; std::vector inputs; diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/conv2d_transpose_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/conv2d_transpose_tests.cc index 4ec2d16697..cf37a850e4 100644 --- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/conv2d_transpose_tests.cc +++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/conv2d_transpose_tests.cc @@ -22,6 +22,7 @@ #include "mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.h" #include "mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d_transpose.h" #include "mindspore/core/utils/log_adapter.h" +#include "mindspore/lite/test/ut/src/runtime/kernel/opencl/utils_tests.h" namespace mindspore { class TestConv2dTransposeOpenCL : public mindspore::CommonTest { @@ -29,46 +30,29 @@ class TestConv2dTransposeOpenCL : public mindspore::CommonTest { TestConv2dTransposeOpenCL() {} }; -TEST_F(TestConv2dTransposeOpenCL, Conv2dTransposeFp32) { +void RunTestCaseConv2dTranspose(const std::vector &shape, void *input_data, void *weight_data, void *bias_data, + void *output_data, bool enable_fp16) { auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); ocl_runtime->Init(); + size_t dtype_size = sizeof(float); + if (enable_fp16) { + ocl_runtime->SetFp16Enable(true); + dtype_size = sizeof(float16_t); + } auto allocator = ocl_runtime->GetAllocator(); - int pad = 0; - int n = 1; - int h = 240; - int w = 240; - int kh = 2; - int kw = 2; - int ci = 128; - int co = 128; + int pad = shape[0]; + int n = shape[1]; + int h = shape[2]; + int w = shape[3]; + int kh = shape[4]; + int kw = shape[5]; + int ci = shape[6]; + int co = shape[7]; int oh = 2 * h - 1 + 2 * (kh - 1 - pad) - kh + 1; int ow = 2 * w - 1 + 2 * (kw - 1 - pad) - kw + 1; - - size_t input_size; - std::string input_path = "./test_data/conv2d_transpose/conv2d_transpose_fp32_input.bin"; - auto input_data = reinterpret_cast(mindspore::lite::ReadFile(input_path.c_str(), &input_size)); - if (input_data == nullptr) { - MS_LOG(ERROR) << "input_data load error."; - return; - } - - size_t weight_size; - std::string weight_path = "./test_data/conv2d_transpose/conv2d_transpose_fp32_weight.bin"; - auto weight_data = reinterpret_cast(mindspore::lite::ReadFile(weight_path.c_str(), &weight_size)); - if (weight_data == nullptr) { - MS_LOG(ERROR) << "weight_data load error."; - return; - } - - size_t bias_size; - std::string bias_path = "./test_data/conv2d_transpose/conv2d_transpose_fp32_bias.bin"; - auto bias_data = reinterpret_cast(mindspore::lite::ReadFile(bias_path.c_str(), &bias_size)); - if (bias_data == nullptr) { - MS_LOG(ERROR) << "bias_data load error."; - return; - } std::vector input_shape = {n, h, w, ci}; - auto tensor_x_ptr = std::make_unique(TypeId(kNumberTypeFloat32), input_shape); + auto tensor_x_ptr = + std::make_unique(TypeId(enable_fp16 ? 
kNumberTypeFloat16 : kNumberTypeFloat32), input_shape); auto tensor_x = tensor_x_ptr.get(); if (tensor_x == nullptr) { MS_LOG(ERROR) << "tensor_x create error."; @@ -76,7 +60,8 @@ TEST_F(TestConv2dTransposeOpenCL, Conv2dTransposeFp32) { } std::vector weight_shape = {co, kh, kw, ci}; - auto tensor_w_ptr = std::make_unique(TypeId(kNumberTypeFloat32), weight_shape); + auto tensor_w_ptr = + std::make_unique(TypeId(enable_fp16 ? kNumberTypeFloat16 : kNumberTypeFloat32), weight_shape); auto tensor_w = tensor_w_ptr.get(); if (tensor_w == nullptr) { MS_LOG(ERROR) << "tensor_w create error."; @@ -85,7 +70,8 @@ TEST_F(TestConv2dTransposeOpenCL, Conv2dTransposeFp32) { tensor_w->SetData(weight_data); std::vector bias_shape = {co}; - auto tensor_bias_ptr = std::make_unique(TypeId(kNumberTypeFloat32), bias_shape); + auto tensor_bias_ptr = + std::make_unique(TypeId(enable_fp16 ? kNumberTypeFloat16 : kNumberTypeFloat32), bias_shape); auto tensor_bias = tensor_bias_ptr.get(); if (tensor_bias == nullptr) { MS_LOG(ERROR) << "tensor_bias create error."; @@ -94,7 +80,8 @@ TEST_F(TestConv2dTransposeOpenCL, Conv2dTransposeFp32) { tensor_bias->SetData(bias_data); std::vector out_shape = {1, oh, ow, co}; - auto tensor_out_ptr = std::make_unique(TypeId(kNumberTypeFloat32), out_shape); + auto tensor_out_ptr = + std::make_unique(TypeId(enable_fp16 ? kNumberTypeFloat16 : kNumberTypeFloat32), out_shape); auto tensor_out = tensor_out_ptr.get(); if (tensor_out == nullptr) { MS_LOG(ERROR) << "tensor_out create error."; @@ -112,21 +99,22 @@ TEST_F(TestConv2dTransposeOpenCL, Conv2dTransposeFp32) { opParameter->kernel_w_ = kw; opParameter->stride_h_ = 2; opParameter->stride_w_ = 2; - opParameter->pad_h_ = pad; - opParameter->pad_w_ = pad; + opParameter->pad_u_ = pad; + opParameter->pad_l_ = pad; opParameter->input_channel_ = ci; opParameter->output_channel_ = co; - auto arith_kernel_ptr = std::make_unique( + auto op_kernel_ptr = std::make_unique( reinterpret_cast(opParameter), inputs, outputs); - auto arith_kernel = arith_kernel_ptr.get(); - if (arith_kernel == nullptr) { - MS_LOG(ERROR) << "arith_kernel create error."; + auto op_kernel = op_kernel_ptr.get(); + if (op_kernel == nullptr) { + MS_LOG(ERROR) << "op_kernel create error."; return; } - arith_kernel->Init(); + op_kernel->set_name("DeConv"); + op_kernel->Init(); inputs[0]->MallocData(allocator); - std::vector kernels{arith_kernel}; + std::vector kernels{op_kernel}; std::vector inputs_g{tensor_x}; auto pGraph_ptr = std::make_unique(inputs_g, outputs, kernels, kernels, kernels); auto pGraph = pGraph_ptr.get(); @@ -136,34 +124,122 @@ TEST_F(TestConv2dTransposeOpenCL, Conv2dTransposeFp32) { } pGraph->Init(); - memcpy(inputs[0]->Data(), input_data, input_size); + memcpy(inputs[0]->Data(), input_data, n * h * w * ci * dtype_size); pGraph->Run(); - - std::cout << "==================output data=================" << std::endl; - float *output_data = reinterpret_cast(tensor_out->Data()); - std::cout << std::endl; - size_t output_size; - std::string output_path = "./test_data/conv2d_transpose/conv2d_transpose_fp32_output.bin"; - auto correct_data = reinterpret_cast(mindspore::lite::ReadFile(output_path.c_str(), &output_size)); - if (correct_data == nullptr) { - MS_LOG(ERROR) << "correct_data create error."; - return; + if (enable_fp16) { + CompareOutput(outputs[0]->Data(), output_data, n * oh * ow * co, static_cast(1e-3), 2e-2); + } else { + CompareOutput(outputs[0]->Data(), output_data, n * oh * ow * co, static_cast(1e-5)); } - int size_n = oh * ow * co; - size_n = size_n > 
100 ? 100 : size_n; - for (int i = 0; i < size_n; i++) { - std::cout << output_data[i] << ", "; - if ((i + 1) % co == 0) { - std::cout << std::endl; - } - } - std::cout << std::endl; - // compare - CompareOutputData(output_data, correct_data, oh * ow * co, 0.00001); inputs[0]->SetData(nullptr); outputs[0]->SetData(nullptr); - MS_LOG(INFO) << "Test Conv2dTransposeFp32 passed"; lite::opencl::OpenCLRuntime::DeleteInstance(); } + +void RunTestCaseConv2dTranspose(const std::vector shape, const std::vector file_path, + bool enable_fp16) { + size_t input_size; + std::string input_path = file_path[0]; + auto input_data = mindspore::lite::ReadFile(input_path.c_str(), &input_size); + if (input_data == nullptr) { + MS_LOG(ERROR) << "input_data load error."; + return; + } + + size_t weight_size; + std::string weight_path = file_path[1]; + auto weight_data = mindspore::lite::ReadFile(weight_path.c_str(), &weight_size); + if (weight_data == nullptr) { + MS_LOG(ERROR) << "weight_data load error."; + return; + } + + size_t bias_size; + std::string bias_path = file_path[2]; + auto bias_data = mindspore::lite::ReadFile(bias_path.c_str(), &bias_size); + if (bias_data == nullptr) { + MS_LOG(ERROR) << "bias_data load error."; + return; + } + size_t output_size; + std::string output_path = file_path[3]; + auto output_data = mindspore::lite::ReadFile(output_path.c_str(), &output_size); + if (output_data == nullptr) { + MS_LOG(ERROR) << "output_data load error."; + return; + } + RunTestCaseConv2dTranspose(shape, input_data, weight_data, bias_data, output_data, enable_fp16); +} + +TEST_F(TestConv2dTransposeOpenCL, Conv2dTransposeFp32) { + int pad = 0; + int n = 1; + int h = 240; + int w = 240; + int kh = 2; + int kw = 2; + int ci = 128; + int co = 128; + std::vector shape = {pad, n, h, w, kh, kw, ci, co}; + std::vector file_path = {"./test_data/conv2d_transpose/conv2d_transpose_fp32_input.bin", + "./test_data/conv2d_transpose/conv2d_transpose_fp32_weight.bin", + "./test_data/conv2d_transpose/conv2d_transpose_fp32_bias.bin", + "./test_data/conv2d_transpose/conv2d_transpose_fp32_output.bin"}; + RunTestCaseConv2dTranspose(shape, file_path, false); +} + +TEST_F(TestConv2dTransposeOpenCL, Conv2dTransposeFp16) { + int pad = 0; + int n = 1; + int h = 240; + int w = 240; + int kh = 2; + int kw = 2; + int ci = 128; + int co = 128; + std::vector shape = {pad, n, h, w, kh, kw, ci, co}; + std::vector file_path = {"./test_data/conv2d_transpose/conv2d_transpose_fp16_input.bin", + "./test_data/conv2d_transpose/conv2d_transpose_fp16_weight.bin", + "./test_data/conv2d_transpose/conv2d_transpose_fp16_bias.bin", + "./test_data/conv2d_transpose/conv2d_transpose_fp16_output.bin"}; + RunTestCaseConv2dTranspose(shape, file_path, true); +} + +TEST_F(TestConv2dTransposeOpenCL, Conv2dTransposeFp32_2) { + int pad = 0; + int n = 1; + int h = 2; + int w = 2; + int kh = 2; + int kw = 2; + int ci = 2; + int co = 1; + std::vector shape = {pad, n, h, w, kh, kw, ci, co}; + std::vector input_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f}; + std::vector weight_data = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f}; + std::vector bias_data = {0.5f}; + std::vector output_data = {5.5f, 6.5f, 17.5f, 22.5f, 7.5f, 8.5f, 27.5f, 32.5f, + 29.5f, 38.5f, 41.5f, 54.5f, 47.5f, 56.5f, 67.5f, 80.5f}; + RunTestCaseConv2dTranspose(shape, input_data.data(), weight_data.data(), bias_data.data(), output_data.data(), false); +} + +TEST_F(TestConv2dTransposeOpenCL, Conv2dTransposeFp16_2) { + int pad = 0; + int n = 1; + int h = 2; + int w = 2; + int kh = 2; + 
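  // NOTE (illustrative, not part of this change): this fp16 case reuses the hand-computed data of
  // the fp32 case above. With stride 2 and a 2x2 kernel, each output pixel receives exactly one
  // contribution: out(2*i + a, 2*j + b) = sum_c in(i, j, c) * w(c, a, b) + bias, where the taps w
  // are read in the order the kernel consumes the packed weight buffer (the expected values imply
  // per-channel 2x2 blocks {1,2,3,4} and {5,6,7,8}, not the declared {co, kh, kw, ci} order).
  // For example out(0, 0) = 0*1 + 1*5 + 0.5 = 5.5 and out(3, 3) = 6*4 + 7*8 + 0.5 = 80.5, matching
  // output_data[0] and output_data[15]; oh = ow = 2*2 - 1 + 2*(2 - 1 - 0) - 2 + 1 = 4.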
int kw = 2; + int ci = 2; + int co = 1; + std::vector shape = {pad, n, h, w, kh, kw, ci, co}; + std::vector input_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f}; + std::vector weight_data = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f}; + std::vector bias_data = {0.5f}; + std::vector output_data = {5.5f, 6.5f, 17.5f, 22.5f, 7.5f, 8.5f, 27.5f, 32.5f, + 29.5f, 38.5f, 41.5f, 54.5f, 47.5f, 56.5f, 67.5f, 80.5f}; + + RunTestCaseConv2dTranspose(shape, input_data.data(), weight_data.data(), bias_data.data(), output_data.data(), true); +} } // namespace mindspore diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/convolution_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/convolution_tests.cc index ecf7310701..457021b8a7 100644 --- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/convolution_tests.cc +++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/convolution_tests.cc @@ -18,9 +18,10 @@ #include "common/common_test.h" #include "mindspore/lite/src/common/file_utils.h" #include "mindspore/lite/src/runtime/opencl/opencl_runtime.h" -#include "nnacl/pack.h" #include "mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.h" #include "mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.h" +#include "nnacl/pack.h" +#include "nnacl/fp32/common_func.h" using mindspore::kernel::ConvolutionOpenCLKernel; using mindspore::kernel::LiteKernel; @@ -34,22 +35,37 @@ void LoadData(void *dst, size_t dst_size, const std::string &file_path) { if (file_path.empty()) { memset(dst, 0x00, dst_size); } else { - auto src_data = reinterpret_cast(mindspore::lite::ReadFile(file_path.c_str(), &dst_size)); + auto src_data = mindspore::lite::ReadFile(file_path.c_str(), &dst_size); memcpy(dst, src_data, dst_size); } } -void MyCompareOutput(lite::tensor::Tensor *output_tensor, const std::string &file_path) { - auto *output_data = reinterpret_cast(output_tensor->Data()); +void MyCompareOutput(lite::tensor::Tensor *output_tensor, const std::string &file_path, const TypeId data_type, + const float atol) { + size_t output_size = output_tensor->Size(); + auto output_data_ori = output_tensor->Data(); + auto expect_data_ori = mindspore::lite::ReadFile(file_path.c_str(), &output_size); + std::vector output_data_vec(output_tensor->ElementsC4Num()); + std::vector expect_data_vec(output_tensor->ElementsC4Num()); + float *output_data, *expect_data; + if (data_type == kNumberTypeFloat16) { + for (int i = 0; i < output_data_vec.size(); ++i) { + output_data_vec[i] = ShortToFloat32(reinterpret_cast(output_data_ori)[i]); + expect_data_vec[i] = ShortToFloat32(reinterpret_cast(expect_data_ori)[i]); + } + output_data = output_data_vec.data(); + expect_data = expect_data_vec.data(); + } else { + output_data = reinterpret_cast(output_data_ori); + expect_data = reinterpret_cast(expect_data_ori); + } + printf("\noutput[0:10]:"); for (int i = 0; i < 10; i++) { printf("%d:%.3f ", i, output_data[i]); } printf("\n"); - size_t output_size = output_tensor->Size(); - auto expect_data = reinterpret_cast(mindspore::lite::ReadFile(file_path.c_str(), &output_size)); - constexpr float atol = 0.5; for (int i = 0; i < output_tensor->ElementsNum(); ++i) { if (std::fabs(output_data[i] - expect_data[i]) > atol) { printf("error at idx[%d] expect=%.3f output=%.3f\n", i, expect_data[i], output_data[i]); @@ -61,8 +77,8 @@ void MyCompareOutput(lite::tensor::Tensor *output_tensor, const std::string &fil printf("COMPARE SUCCESS!\n\n\n"); } -void TEST_MAIN(schema::Format input_format, schema::Format output_format, const 
std::string &data_path, - std::string attr_str) { +void TEST_MAIN(schema::Format input_format, schema::Format output_format, const TypeId data_type, + const std::string &data_path, std::string attr_str) { auto param = new (std::nothrow) ConvParameter; if (param == nullptr) { return; @@ -87,6 +103,7 @@ void TEST_MAIN(schema::Format input_format, schema::Format output_format, const std::cout << "initialize OpenCLRuntime and OpenCLAllocator"; auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); ocl_runtime->Init(); + ocl_runtime->SetFp16Enable(data_type == kNumberTypeFloat16); auto allocator = ocl_runtime->GetAllocator(); std::cout << "create Tensors"; @@ -94,7 +111,6 @@ void TEST_MAIN(schema::Format input_format, schema::Format output_format, const std::vector weight_shape = {param->output_channel_, param->kernel_h_, param->kernel_w_, param->input_channel_}; std::vector bias_shape = {param->output_channel_}; std::vector output_shape = {param->output_batch_, param->output_h_, param->output_w_, param->output_channel_}; - auto data_type = kNumberTypeFloat32; auto tensor_type = schema::NodeType_ValueNode; auto input_tensor = lite::tensor::Tensor(data_type, input_shape, input_format, tensor_type); auto weight_tensor = lite::tensor::Tensor(data_type, weight_shape, schema::Format_KHWC, tensor_type); @@ -121,11 +137,17 @@ void TEST_MAIN(schema::Format input_format, schema::Format output_format, const input_tensor.MallocData(allocator); // before MapBuffer() sub_graph->Init(); LoadData(input_tensor.Data(), input_tensor.Size(), input_file); // after MapBuffer() - printf("input[0-2] =%.3f\n", reinterpret_cast(input_tensor.Data())[0]); - printf("weight[0-2]=%.3f\n", reinterpret_cast(weight_tensor.Data())[0]); - printf("bias[0-2] =%.3f\n", reinterpret_cast(bias_tensor.Data())[0]); + if (data_type == kNumberTypeFloat16) { + printf("input[0] =%.3f\n", ShortToFloat32(reinterpret_cast(input_tensor.Data())[0])); + printf("weight[0]=%.3f\n", ShortToFloat32(reinterpret_cast(weight_tensor.Data())[0])); + printf("bias[0] =%.3f\n", ShortToFloat32(reinterpret_cast(bias_tensor.Data())[0])); + } else { + printf("input[0] =%.3f\n", reinterpret_cast(input_tensor.Data())[0]); + printf("weight[0]=%.3f\n", reinterpret_cast(weight_tensor.Data())[0]); + printf("bias[0] =%.3f\n", reinterpret_cast(bias_tensor.Data())[0]); + } sub_graph->Run(); - MyCompareOutput(&output_tensor, expect_file); + MyCompareOutput(&output_tensor, expect_file, data_type, (data_type == kNumberTypeFloat16 ? 
0.7f : 0.1f)); std::cout << "release resources"; weight_tensor.FreeData(); @@ -139,72 +161,30 @@ void TEST_MAIN(schema::Format input_format, schema::Format output_format, const lite::opencl::OpenCLRuntime::DeleteInstance(); } -TEST_F(TestConvolutionOpenCL, in1x1x64x512_out1x1x64x7358_k11_s11_p0000) { - // change W/H +TEST_F(TestConvolutionOpenCL, in1x224x224x3_out1x112x112x32_k33_s22_p0101_fp32) { TEST_MAIN( - schema::Format_NHWC, schema::Format_NHWC4, "testcases/02_fp32/", - "inputNHWC_1x1x64x512_outputNHWC_1x1x64x7358_kernelHW_1x1_strideHW_1x1_padTopBottomLeftRight_0x0x0x0_dilationHW_" + schema::Format_NHWC, schema::Format_NHWC4, kNumberTypeFloat32, "testcases/mobilenetv2_fp32/", + "inputNHWC_1x224x224x3_outputNHWC_1x112x112x32_kernelHW_3x3_strideHW_2x2_padTopBottomLeftRight_0x1x0x1_dilationHW_" "1x1"); } -TEST_F(TestConvolutionOpenCL, winograd_02_other_inputNHWC_1x32x512x1_outputNHWC_1x32x512x50) { - // speed up - TEST_MAIN(schema::Format_NHWC, schema::Format_NHWC4, "testcases/test_fp32/", - "inputNHWC_1x32x512x1_outputNHWC_1x32x512x50_kernelHW_3x3_strideHW_1x1_padTopBottomLeftRight_1x1x1x1_" - "dilationHW_1x1"); -} - -TEST_F(TestConvolutionOpenCL, in1x224x224x3_out1x112x112x32_k33_s22_p0101) { +TEST_F(TestConvolutionOpenCL, in1x224x224x3_out1x112x112x32_k33_s22_p0101_fp16) { TEST_MAIN( - schema::Format_NHWC, schema::Format_NHWC4, "testcases/mobilenetv2_fp32/", + schema::Format_NHWC, schema::Format_NHWC4, kNumberTypeFloat16, "testcases/mobilenetv2_fp16/", "inputNHWC_1x224x224x3_outputNHWC_1x112x112x32_kernelHW_3x3_strideHW_2x2_padTopBottomLeftRight_0x1x0x1_dilationHW_" "1x1"); } -TEST_F(TestConvolutionOpenCL, winograd_02_origin_inputNHWC_1x16x256x96_outputNHWC_1x16x256x80) { - TEST_MAIN(schema::Format_NHWC, schema::Format_NHWC4, "testcases/test_fp32/", +TEST_F(TestConvolutionOpenCL, winograd_02_origin_inputNHWC_1x16x256x96_outputNHWC_1x16x256x80_fp32) { + TEST_MAIN(schema::Format_NHWC, schema::Format_NHWC4, kNumberTypeFloat32, "testcases/test_fp32/", "inputNHWC_1x16x256x96_outputNHWC_1x16x256x80_kernelHW_3x3_strideHW_1x1_padTopBottomLeftRight_1x1x1x1_" "dilationHW_1x1"); } -TEST_F(TestConvolutionOpenCL, winograd_02_origin_inputNHWC_1x16x256x100_outputNHWC_1x16x256x96) { - TEST_MAIN(schema::Format_NHWC, schema::Format_NHWC4, "testcases/test_fp32/", - "inputNHWC_1x16x256x100_outputNHWC_1x16x256x96_kernelHW_3x3_strideHW_1x1_padTopBottomLeftRight_1x1x1x1_" - "dilationHW_1x1"); -} - -TEST_F(TestConvolutionOpenCL, winograd_02_other_inputNHWC_1x32x512x50_outputNHWC_1x32x512x48) { - TEST_MAIN(schema::Format_NHWC, schema::Format_NHWC4, "testcases/02_fp32/", - "inputNHWC_1x32x512x50_outputNHWC_1x32x512x48_kernelHW_3x3_strideHW_1x1_padTopBottomLeftRight_1x1x1x1_" - "dilationHW_1x1"); -} -TEST_F(TestConvolutionOpenCL, winograd_02_other_inputNHWC_1x8x128x100_outputNHWC_1x8x128x250) { - TEST_MAIN(schema::Format_NHWC, schema::Format_NHWC4, "testcases/02_fp32/", - "inputNHWC_1x8x128x100_outputNHWC_1x8x128x250_kernelHW_3x3_strideHW_1x1_padTopBottomLeftRight_1x1x1x1_" - "dilationHW_1x1"); -} - -TEST_F(TestConvolutionOpenCL, winograd_02_other_inputNHWC_1x8x128x100_outputNHWC_1x8x128x300) { - TEST_MAIN(schema::Format_NHWC, schema::Format_NHWC4, "testcases/02_fp32/", - "inputNHWC_1x8x128x100_outputNHWC_1x8x128x300_kernelHW_3x3_strideHW_1x1_padTopBottomLeftRight_1x1x1x1_" - "dilationHW_1x1"); -} - -TEST_F(TestConvolutionOpenCL, winograd_02_other_inputNHWC_1x4x64x150_outputNHWC_1x4x64x350) { - TEST_MAIN(schema::Format_NHWC, schema::Format_NHWC4, "testcases/02_fp32/", - 
"inputNHWC_1x4x64x150_outputNHWC_1x4x64x350_kernelHW_3x3_strideHW_1x1_padTopBottomLeftRight_1x1x1x1_" - "dilationHW_1x1"); -} -TEST_F(TestConvolutionOpenCL, winograd_02_other_inputNHWC_1x4x64x150_outputNHWC_1x4x64x400) { - TEST_MAIN(schema::Format_NHWC, schema::Format_NHWC4, "testcases/02_fp32/", - "inputNHWC_1x4x64x150_outputNHWC_1x4x64x400_kernelHW_3x3_strideHW_1x1_padTopBottomLeftRight_1x1x1x1_" +TEST_F(TestConvolutionOpenCL, winograd_02_origin_inputNHWC_1x16x256x96_outputNHWC_1x16x256x80_fp16) { + TEST_MAIN(schema::Format_NHWC, schema::Format_NHWC4, kNumberTypeFloat16, "testcases/test_fp16/", + "inputNHWC_1x16x256x96_outputNHWC_1x16x256x80_kernelHW_3x3_strideHW_1x1_padTopBottomLeftRight_1x1x1x1_" "dilationHW_1x1"); } -TEST_F(TestConvolutionOpenCL, winograd_08_origin_inputNHWC_1x480x480x128_outputNHWC_1x480x480x128) { - TEST_MAIN(schema::Format_NHWC, schema::Format_NHWC4, "testcases/test_fp32/", - "inputNHWC_1x480x480x128_outputNHWC_1x480x480x128_kernelHW_3x3_strideHW_1x1_padTopBottomLeftRight_" - "1x1x1x1_dilationHW_1x1"); -} - } // namespace mindspore diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/depthwise_conv2d_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/depthwise_conv2d_tests.cc index a2747c3586..e1e6704a18 100644 --- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/depthwise_conv2d_tests.cc +++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/depthwise_conv2d_tests.cc @@ -19,20 +19,20 @@ #include "common/common_test.h" #include "mindspore/lite/src/common/file_utils.h" #include "nnacl/pack.h" +#include "src/runtime/kernel/opencl/utils.h" #include "mindspore/lite/src/runtime/opencl/opencl_runtime.h" #include "mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.h" #include "mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.h" -bool IMAGE2D_OPEN = true; - namespace mindspore { class TestConvolutionDwOpenCL : public mindspore::CommonTest { public: TestConvolutionDwOpenCL() {} }; -void DepthWiseTestMain(ConvParameter *conv_param, float_t *input_data, float_t *weight_data, float_t *gnd_data, - schema::Format format, bool is_compare = true) { +template +void DepthWiseTestMain(ConvParameter *conv_param, T2 *input_data, T1 *weight_data, T2 *gnd_data, schema::Format format, + TypeId dtype = kNumberTypeFloat32, bool is_compare = true, T2 err_max = 1e-5) { auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); ocl_runtime->Init(); auto allocator = ocl_runtime->GetAllocator(); @@ -40,45 +40,53 @@ void DepthWiseTestMain(ConvParameter *conv_param, float_t *input_data, float_t * // pack input int IC4 = UP_DIV(conv_param->input_channel_, C4NUM); int pack_input_size = C4NUM * IC4 * conv_param->input_h_ * conv_param->input_w_; - auto packed_input = std::make_unique(pack_input_size); - if (packed_input.get() == nullptr) { + auto packed_input = new (std::nothrow) T2[pack_input_size]; + if (packed_input == nullptr) { return; } - memset(packed_input.get(), 0, pack_input_size * sizeof(float)); + memset(packed_input, 0, pack_input_size * sizeof(T2)); int plane = conv_param->input_w_ * conv_param->input_h_; + std::function to_dtype = [](T2 x) -> T2 { return x; }; if (format == schema::Format_NHWC4) { - PackNHWCToNHWC4Fp32(input_data, packed_input.get(), 1, plane, conv_param->input_channel_); + kernel::PackNHWCToNHWC4(input_data, packed_input, 1, plane, conv_param->input_channel_, to_dtype); } else { - PackNHWCToNC4HW4Fp32(input_data, packed_input.get(), 1, plane, conv_param->input_channel_); + kernel::PackNHWCToNC4HW4(input_data, 
packed_input, 1, plane, conv_param->input_channel_, to_dtype);
   }
 
   // pack weight
-  int OC4 = UP_DIV(conv_param->output_channel_, C4NUM);
   int pack_weight_size = conv_param->output_channel_ * conv_param->kernel_h_ * conv_param->kernel_w_;
-  float *packed_weight = weight_data;
+  T1 *packed_weight = weight_data;
 
-  // float bias_data[] = {0.31856894, 0.6674104, 0.13179787, 0.7163272, 0.2894061, 0.0, 0.0, 0.0};
-  float bias_data[] = {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
+  // T1 bias_data[] = {0.31856894, 0.6674104, 0.13179787, 0.7163272, 0.2894061, 0.0, 0.0, 0.0};
+  T1 bias_data[] = {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
   size_t packed_output_size = conv_param->output_batch_ * C4NUM * UP_DIV(conv_param->output_channel_, C4NUM) *
                               conv_param->output_h_ * conv_param->output_w_;
-  std::vector<int> shape_in = {conv_param->input_batch_, conv_param->input_h_, conv_param->input_w_,
-                               conv_param->input_channel_};  // Note!!!actual is NHWC4
   std::vector<int> shape_filter = {1, conv_param->kernel_h_, conv_param->kernel_w_, conv_param->output_channel_};
   std::vector<int> shape_bias = {conv_param->output_channel_};
-  std::vector<int> shape_out = {conv_param->output_batch_, conv_param->output_h_, conv_param->output_w_,
-                                conv_param->output_channel_};
-  auto tensor_a =
-    std::make_unique<lite::tensor::Tensor>(TypeId(kNumberTypeFloat32), shape_in, format);  // Note!!!actual is NHWC4
-  auto tensor_b = std::make_unique<lite::tensor::Tensor>(TypeId(kNumberTypeFloat32), shape_filter, schema::Format_NHWC);
-  auto tensor_c = std::make_unique<lite::tensor::Tensor>(TypeId(kNumberTypeFloat32), shape_bias, schema::Format_NHWC);
-  auto tensor_d = std::make_unique<lite::tensor::Tensor>(TypeId(kNumberTypeFloat32), shape_out, format);
-  std::vector<lite::tensor::Tensor *> inputs{tensor_a.get(), tensor_b.get(), tensor_c.get()};
-  std::vector<lite::tensor::Tensor *> outputs{tensor_d.get()};
-  if (tensor_a.get() == nullptr || tensor_b.get() == nullptr || tensor_c.get() == nullptr ||
-      tensor_d.get() == nullptr) {
+  std::vector<int> shape_out;
+  std::vector<int> shape_in;
+  if (format == schema::Format_NHWC || format == schema::Format_NHWC4) {
+    shape_in = std::vector<int>(
+      {conv_param->input_batch_, conv_param->input_h_, conv_param->input_w_, conv_param->input_channel_});
+    shape_out = std::vector<int>(
+      {conv_param->output_batch_, conv_param->output_h_, conv_param->output_w_, conv_param->output_channel_});
+  } else if (format == schema::Format_NCHW || format == schema::Format_NC4HW4) {
+    shape_in = std::vector<int>(
+      {conv_param->input_batch_, conv_param->input_channel_, conv_param->input_h_, conv_param->input_w_});
+    shape_out = std::vector<int>(
+      {conv_param->output_batch_, conv_param->output_channel_, conv_param->output_h_, conv_param->output_w_});
+  } else {
+    MS_LOG(ERROR) << "Unsupported format: " << format;
+    delete[] packed_input;
     return;
   }
+  auto tensor_a = lite::tensor::Tensor(TypeId(dtype), shape_in, format);
+  auto tensor_b = lite::tensor::Tensor(TypeId(dtype), shape_filter, schema::Format_NHWC);
+  auto tensor_c = lite::tensor::Tensor(TypeId(dtype), shape_bias, schema::Format_NHWC);
+  auto tensor_d = lite::tensor::Tensor(TypeId(dtype), shape_out, format);
+  std::vector<lite::tensor::Tensor *> inputs{&tensor_a, &tensor_b, &tensor_c};
+  std::vector<lite::tensor::Tensor *> outputs{&tensor_d};
 
   // framework to do!!!
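  // NOTE (illustrative, not part of this change): NHWC4 pads the channel axis up to a multiple of
  // C4NUM (4) and zero-fills the tail, so for pixel p and channel c the packed index is
  // p * IC4 * C4NUM + c. A hedged spot-check sketch of the packing performed above:
  //   for (int p = 0; p < plane; ++p) {
  //     for (int c = 0; c < conv_param->input_channel_; ++c) {
  //       assert(packed_input[p * IC4 * C4NUM + c] == input_data[p * conv_param->input_channel_ + c]);
  //     }
  //   }
  // (This invariant holds for the NHWC4 branch; the NC4HW4 branch instead stores whole C4 slices
  // plane by plane.)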
   inputs[1]->SetData(packed_weight);
@@ -87,43 +95,47 @@ void DepthWiseTestMain(ConvParameter *conv_param, float_t *input_data, float_t *
   OpParameter *parameter = reinterpret_cast<OpParameter *>(conv_param);
   auto pKernel = std::make_unique<kernel::DepthwiseConv2dOpenCLKernel>(parameter, inputs, outputs);
   if (pKernel.get() == nullptr) {
+    delete[] packed_input;
     return;
   }
   pKernel->Init();
 
   std::vector<kernel::LiteKernel *> kernels{pKernel.get()};
-  std::vector<lite::tensor::Tensor *> inputs_{tensor_a.get()};
-  size_t C4 = UP_DIV(inputs[0]->Channel(), C4NUM);
-  inputs[0]->MallocData(allocator);
+  std::vector<lite::tensor::Tensor *> inputs_{&tensor_a};
   auto pGraph = std::make_unique<kernel::SubGraphOpenCLKernel>(inputs_, outputs, kernels, kernels, kernels);
-  if (pKernel.get() == nullptr) {
+  if (pGraph.get() == nullptr) {
+    delete[] packed_input;
     return;
   }
   pGraph->Init();
 
   // framework to do!!!
-  memcpy(inputs[0]->Data(), packed_input.get(), sizeof(float) * pack_input_size);
+  inputs[0]->MallocData(allocator);
+  memcpy(inputs[0]->Data(), packed_input, sizeof(T2) * pack_input_size);
   pGraph->Run();
 
   if (is_compare) {
-    float_t *packed_output = reinterpret_cast<float_t *>(outputs[0]->Data());
-    auto packed_correct_data = std::make_unique<float_t[]>(packed_output_size);
-    if (packed_correct_data) {
+    T2 *packed_output = reinterpret_cast<T2 *>(outputs[0]->Data());
+    auto packed_correct_data = std::make_unique<T2[]>(packed_output_size);
+    if (packed_correct_data.get() == nullptr) {
+      delete[] packed_input;
      return;
     }
-    memset(packed_correct_data.get(), 0, packed_output_size * sizeof(float_t));
+    memset(packed_correct_data.get(), 0, packed_output_size * sizeof(T2));
     if (format == schema::Format_NC4HW4) {
-      PackNHWCToNC4HW4Fp32(gnd_data, packed_correct_data.get(), conv_param->output_batch_,
-                           conv_param->output_h_ * conv_param->output_w_, conv_param->output_channel_);
+      kernel::PackNHWCToNC4HW4(gnd_data, packed_correct_data.get(), conv_param->output_batch_,
+                               conv_param->output_h_ * conv_param->output_w_, conv_param->output_channel_,
+                               to_dtype);
     } else {
-      PackNHWCToNHWC4Fp32(gnd_data, packed_correct_data.get(), conv_param->output_batch_,
-                          conv_param->output_h_ * conv_param->output_w_, conv_param->output_channel_);
+      kernel::PackNHWCToNHWC4(gnd_data, packed_correct_data.get(), conv_param->output_batch_,
+                              conv_param->output_h_ * conv_param->output_w_, conv_param->output_channel_,
+                              to_dtype);
     }
 
     printf("==================input_data=================\n");
     std::cout << std::endl;
     for (int i = 0; i < pack_input_size; i++) {
-      std::cout << packed_input.get()[i] << ", ";
+      std::cout << packed_input[i] << ", ";
     }
     std::cout << std::endl;
     printf("==================weight data=================\n");
@@ -134,7 +146,7 @@
     std::cout << std::endl;
     printf("==================output data=================\n");
     std::cout << std::endl;
-    for (int i = 0; i < 80 /*packed_output_size*/; i++) {
+    for (int i = 0; i < packed_output_size; i++) {
       std::cout << packed_output[i] << ", ";
     }
     std::cout << std::endl;
@@ -144,11 +156,13 @@
     }
     std::cout << std::endl;
     // compare
-    CommonTest::CompareOutputData(packed_output, packed_correct_data.get(), packed_output_size, 0.00001);
+    CommonTest::CompareOutputData(packed_output, packed_correct_data.get(), packed_output_size, err_max);
   }
 
   inputs[1]->SetData(nullptr);
   inputs[2]->SetData(nullptr);
+  delete[] packed_input;
+  lite::opencl::OpenCLRuntime::DeleteInstance();
   return;
 }
 
@@ -169,8 +183,8 @@ TEST_F(TestConvolutionDwOpenCL, NoPadNC4HW4Fp32) {
   conv_param->stride_w_ = 1;
   conv_param->dilation_h_ = 1;
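    // NOTE (illustrative, not part of this change): with the elided shape fields presumably
    // describing a 1x4x4x4 NHWC input (the 64-float input_data below), a 3x3 kernel, stride 1 and
    // the zero padding set just below, the usual conv arithmetic gives
    //   out_h = (in_h + pad_u + pad_d - kernel_h) / stride_h + 1 = (4 + 0 + 0 - 3) / 1 + 1 = 2,
    // which matches the 16-element (2x2x4) gnd_data these no-pad cases compare against.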
conv_param->dilation_w_ = 1; - conv_param->pad_h_ = 0; - conv_param->pad_w_ = 0; + conv_param->pad_u_ = 0; + conv_param->pad_l_ = 0; } // nhwc @@ -194,8 +208,7 @@ TEST_F(TestConvolutionDwOpenCL, NoPadNC4HW4Fp32) { float gnd_data[] = {3.3848767, 1.4446403, 1.8428744, 1.3194335, 2.5873442, 2.1384869, 2.04022, 1.1872686, 2.2294958, 1.6570128, 2.465089, 1.4294086, 2.7941442, 1.7871612, 2.188921, 1.0601988}; - DepthWiseTestMain(conv_param.get(), input_data, weight_data, gnd_data, schema::Format_NC4HW4); - lite::opencl::OpenCLRuntime::DeleteInstance(); + DepthWiseTestMain(conv_param.release(), input_data, weight_data, gnd_data, schema::Format_NC4HW4); } TEST_F(TestConvolutionDwOpenCL, PadNC4HW4Fp32) { @@ -215,8 +228,8 @@ TEST_F(TestConvolutionDwOpenCL, PadNC4HW4Fp32) { conv_param->stride_w_ = 1; conv_param->dilation_h_ = 1; conv_param->dilation_w_ = 1; - conv_param->pad_h_ = 1; - conv_param->pad_w_ = 1; + conv_param->pad_u_ = 1; + conv_param->pad_l_ = 1; } // nhwc @@ -267,8 +280,7 @@ TEST_F(TestConvolutionDwOpenCL, PadNC4HW4Fp32) { 0.8749627, 0.8953936, 0.5093431, 1.5496738, 0.54936385, 0.7683113, 1.165742, 1.3682933, 1.0517888, 0.59817517, 0.75649744, 1.2075498, 0.38804203}; - DepthWiseTestMain(conv_param.get(), input_data, weight_data, gnd_data, schema::Format_NC4HW4); - lite::opencl::OpenCLRuntime::DeleteInstance(); + DepthWiseTestMain(conv_param.release(), input_data, weight_data, gnd_data, schema::Format_NC4HW4); } TEST_F(TestConvolutionDwOpenCL, NoPadNHWC4Fp32) { @@ -288,8 +300,8 @@ TEST_F(TestConvolutionDwOpenCL, NoPadNHWC4Fp32) { conv_param->stride_w_ = 1; conv_param->dilation_h_ = 1; conv_param->dilation_w_ = 1; - conv_param->pad_h_ = 0; - conv_param->pad_w_ = 0; + conv_param->pad_u_ = 0; + conv_param->pad_l_ = 0; } // nhwc @@ -313,8 +325,8 @@ TEST_F(TestConvolutionDwOpenCL, NoPadNHWC4Fp32) { float gnd_data[] = {3.3848767, 1.4446403, 1.8428744, 1.3194335, 2.5873442, 2.1384869, 2.04022, 1.1872686, 2.2294958, 1.6570128, 2.465089, 1.4294086, 2.7941442, 1.7871612, 2.188921, 1.0601988}; - DepthWiseTestMain(conv_param.get(), input_data, weight_data, gnd_data, schema::Format_NHWC4); - lite::opencl::OpenCLRuntime::DeleteInstance(); + DepthWiseTestMain(conv_param.release(), input_data, weight_data, gnd_data, schema::Format_NHWC4); + // delete conv_param; } TEST_F(TestConvolutionDwOpenCL, PadNHWC4Fp32) { @@ -334,8 +346,8 @@ TEST_F(TestConvolutionDwOpenCL, PadNHWC4Fp32) { conv_param->stride_w_ = 1; conv_param->dilation_h_ = 1; conv_param->dilation_w_ = 1; - conv_param->pad_h_ = 1; - conv_param->pad_w_ = 1; + conv_param->pad_u_ = 1; + conv_param->pad_l_ = 1; } // nhwc @@ -386,13 +398,10 @@ TEST_F(TestConvolutionDwOpenCL, PadNHWC4Fp32) { 0.8749627, 0.8953936, 0.5093431, 1.5496738, 0.54936385, 0.7683113, 1.165742, 1.3682933, 1.0517888, 0.59817517, 0.75649744, 1.2075498, 0.38804203}; - DepthWiseTestMain(conv_param.get(), input_data, weight_data, gnd_data, schema::Format_NHWC4); - lite::opencl::OpenCLRuntime::DeleteInstance(); + DepthWiseTestMain(conv_param.release(), input_data, weight_data, gnd_data, schema::Format_NHWC4); } -TEST_F(TestConvolutionDwOpenCL, ConvDwNoPadFp32) { - auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); - ocl_runtime->Init(); +TEST_F(TestConvolutionDwOpenCL, NoPadNHWC4Fp16) { auto conv_param = std::make_unique(); { conv_param->input_batch_ = 1; @@ -409,115 +418,38 @@ TEST_F(TestConvolutionDwOpenCL, ConvDwNoPadFp32) { conv_param->stride_w_ = 1; conv_param->dilation_h_ = 1; conv_param->dilation_w_ = 1; - conv_param->pad_h_ = 0; - conv_param->pad_w_ = 0; + 
conv_param->pad_u_ = 0; + conv_param->pad_l_ = 0; } // nhwc - float input_data[] = {0.5488135, 0.0202184, 0.45615032, 0.31542835, 0.71518934, 0.83261985, 0.56843394, 0.36371076, - 0.60276335, 0.77815676, 0.0187898, 0.57019675, 0.5448832, 0.87001216, 0.6176355, 0.43860152, - 0.4236548, 0.9786183, 0.6120957, 0.9883738, 0.6458941, 0.7991586, 0.616934, 0.10204481, - 0.4375872, 0.46147937, 0.94374806, 0.20887676, 0.891773, 0.7805292, 0.6818203, 0.16130951, - 0.96366274, 0.11827443, 0.3595079, 0.6531083, 0.3834415, 0.639921, 0.43703195, 0.2532916, - 0.79172504, 0.14335328, 0.6976312, 0.46631077, 0.5288949, 0.9446689, 0.06022547, 0.2444256, - 0.56804454, 0.5218483, 0.6667667, 0.15896958, 0.92559665, 0.41466194, 0.67063785, 0.11037514, - 0.07103606, 0.2645556, 0.21038257, 0.6563296, 0.0871293, 0.7742337, 0.12892629, 0.13818295}; - - // pack input - int IC4 = UP_DIV(conv_param->input_channel_, C4NUM); - int pack_input_size = C4NUM * IC4 * conv_param->input_h_ * conv_param->input_w_; - float *packed_input = input_data; + float16_t input_data[] = { + 0.5488135, 0.0202184, 0.45615032, 0.31542835, 0.71518934, 0.83261985, 0.56843394, 0.36371076, + 0.60276335, 0.77815676, 0.0187898, 0.57019675, 0.5448832, 0.87001216, 0.6176355, 0.43860152, + 0.4236548, 0.9786183, 0.6120957, 0.9883738, 0.6458941, 0.7991586, 0.616934, 0.10204481, + 0.4375872, 0.46147937, 0.94374806, 0.20887676, 0.891773, 0.7805292, 0.6818203, 0.16130951, + 0.96366274, 0.11827443, 0.3595079, 0.6531083, 0.3834415, 0.639921, 0.43703195, 0.2532916, + 0.79172504, 0.14335328, 0.6976312, 0.46631077, 0.5288949, 0.9446689, 0.06022547, 0.2444256, + 0.56804454, 0.5218483, 0.6667667, 0.15896958, 0.92559665, 0.41466194, 0.67063785, 0.11037514, + 0.07103606, 0.2645556, 0.21038257, 0.6563296, 0.0871293, 0.7742337, 0.12892629, 0.13818295}; // co h w ci - float weight_data[] = {0.19658236, 0.36872518, 0.82099324, 0.09710128, 0.8379449, 0.09609841, 0.97645944, 0.4686512, - 0.9767611, 0.6048455, 0.7392636, 0.03918779, 0.28280696, 0.12019656, 0.2961402, 0.11872772, - 0.31798318, 0.41426298, 0.06414749, 0.6924721, 0.56660146, 0.2653895, 0.5232481, 0.09394051, - 0.5759465, 0.9292962, 0.31856894, 0.6674104, 0.13179787, 0.7163272, 0.2894061, 0.18319136, - 0.5865129, 0.02010755, 0.82894003, 0.00469548}; - - // pack weight - int OC4 = UP_DIV(conv_param->output_channel_, C4NUM); - int pack_weight_size = C4NUM * OC4 * conv_param->kernel_h_ * conv_param->kernel_w_; - float *packed_weight = weight_data; - - // float bias_data[] = {0.31856894, 0.6674104, 0.13179787, 0.7163272, 0.2894061, 0.0, 0.0, 0.0}; - float bias_data[] = {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}; - size_t packed_output_size = conv_param->output_batch_ * C4NUM * UP_DIV(conv_param->output_channel_, C4NUM) * - conv_param->output_h_ * conv_param->output_w_; - - std::vector shape_in = {conv_param->input_batch_, conv_param->input_h_, conv_param->input_w_, - IC4 * C4NUM}; // Note!!!actual is NHWC4 - std::vector shape_filter = {1, conv_param->kernel_h_, conv_param->kernel_w_, conv_param->output_channel_}; - std::vector shape_bias = {conv_param->output_channel_}; - std::vector shape_out = {conv_param->output_batch_, conv_param->output_h_, conv_param->output_w_, - conv_param->output_channel_}; - auto tensor_a = std::make_unique(TypeId(kNumberTypeFloat32), shape_in, - schema::Format_NC4HW4); // Note!!!actual is NHWC4 - auto tensor_b = std::make_unique(TypeId(kNumberTypeFloat32), shape_filter, schema::Format_NHWC); - auto tensor_c = std::make_unique(TypeId(kNumberTypeFloat32), shape_bias, schema::Format_NHWC); - 
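// Why err_max widens to 1e-2 for the fp16 tests below: float16 carries a
// 10-bit mantissa (roughly 3 decimal digits), and each 3x3 depthwise output
// accumulates 9 rounded products, so the fp32 bound of 1e-5 cannot hold.
// A hypothetical helper capturing the tolerances used in this file
// (illustration only, not part of the patch):
#include <utility>

std::pair<float, float> ToleranceFor(bool enable_fp16) {
  // {atol, rtol}: the matmul tests later in this patch use {1e-3, 2e-2} for
  // fp16, the depthwise tests pass a flat 1e-2 absolute bound; fp32 stays 1e-5.
  return enable_fp16 ? std::make_pair(1e-3f, 2e-2f) : std::make_pair(1e-5f, 0.0f);
}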
auto tensor_d = std::make_unique(TypeId(kNumberTypeFloat32), shape_out, schema::Format_NC4HW4); - std::vector inputs{tensor_a.get(), tensor_b.get(), tensor_c.get()}; - std::vector outputs{tensor_d.get()}; - - // freamework to do!!! - inputs[1]->SetData(packed_weight); - inputs[2]->SetData(bias_data); - - OpParameter *parameter = reinterpret_cast(conv_param.get()); - auto pKernel = std::make_unique(parameter, inputs, outputs); - pKernel->Init(); - - std::vector kernels{pKernel.get()}; - std::vector inputs_{tensor_a.get()}; - inputs[0]->MallocData(); - auto pGraph = std::make_unique(inputs_, outputs, kernels, kernels, kernels); - pGraph->Init(); - - // freamework to do!!! - memcpy(inputs[0]->Data(), packed_input, sizeof(float) * pack_input_size); - - pGraph->Run(); - float *packed_output = reinterpret_cast(outputs[0]->Data()); + float16_t weight_data[] = { + 0.19658236, 0.36872518, 0.82099324, 0.09710128, 0.8379449, 0.09609841, 0.97645944, 0.4686512, 0.9767611, + 0.6048455, 0.7392636, 0.03918779, 0.28280696, 0.12019656, 0.2961402, 0.11872772, 0.31798318, 0.41426298, + 0.06414749, 0.6924721, 0.56660146, 0.2653895, 0.5232481, 0.09394051, 0.5759465, 0.9292962, 0.31856894, + 0.6674104, 0.13179787, 0.7163272, 0.2894061, 0.18319136, 0.5865129, 0.02010755, 0.82894003, 0.00469548}; // pack correct data, nhwc - float packed_correct_data[] = {3.3848767, 1.4446403, 1.8428744, 1.3194335, 2.5873442, 2.1384869, 2.04022, 1.1872686, - 2.2294958, 1.6570128, 2.465089, 1.4294086, 2.7941442, 1.7871612, 2.188921, 1.0601988}; + float16_t gnd_data[] = {3.3848767, 1.4446403, 1.8428744, 1.3194335, 2.5873442, 2.1384869, 2.04022, 1.1872686, + 2.2294958, 1.6570128, 2.465089, 1.4294086, 2.7941442, 1.7871612, 2.188921, 1.0601988}; - printf("==================input_data=================\n"); - std::cout << std::endl; - for (int i = 0; i < pack_input_size; i++) { - std::cout << packed_input[i] << ", "; - } - std::cout << std::endl; - printf("==================packed_weight data=================\n"); - std::cout << std::endl; - for (int i = 0; i < pack_weight_size; i++) { - std::cout << packed_weight[i] << ", "; - } - std::cout << std::endl; - printf("==================output data=================\n"); - std::cout << std::endl; - for (int i = 0; i < packed_output_size; i++) { - std::cout << packed_output[i] << ", "; - } - std::cout << std::endl; - printf("==================expected output data=================\n"); - for (int i = 0; i < packed_output_size; i++) { - std::cout << packed_correct_data[i] << ", "; - } - std::cout << std::endl; - // compare - CommonTest::CompareOutputData(packed_output, packed_correct_data, packed_output_size, 0.00001); - - inputs[1]->SetData(nullptr); - inputs[2]->SetData(nullptr); - MS_LOG(INFO) << "TestConvolutionDwNoPadFp32 passed"; - lite::opencl::OpenCLRuntime::DeleteInstance(); + lite::opencl::OpenCLRuntime::GetInstance()->SetFp16Enable(true); + DepthWiseTestMain(conv_param.release(), input_data, weight_data, gnd_data, schema::Format_NHWC4, + kNumberTypeFloat16, true, 1e-2); } -TEST_F(TestConvolutionDwOpenCL, ConvDwPadFp32) { - auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); - ocl_runtime->Init(); +TEST_F(TestConvolutionDwOpenCL, PadNHWC4Fp16) { auto conv_param = std::make_unique(); { conv_param->input_batch_ = 1; @@ -534,18 +466,18 @@ TEST_F(TestConvolutionDwOpenCL, ConvDwPadFp32) { conv_param->stride_w_ = 1; conv_param->dilation_h_ = 1; conv_param->dilation_w_ = 1; - conv_param->pad_h_ = 1; - conv_param->pad_w_ = 1; + conv_param->pad_u_ = 1; + conv_param->pad_l_ = 1; } // 
nhwc - float input_data[] = {0.5488135, 0.3834415, 0.77815676, 0.9446689, 0.6120957, 0.71518934, 0.79172504, 0.87001216, - 0.5218483, 0.616934, 0.60276335, 0.5288949, 0.9786183, 0.41466194, 0.94374806, 0.5448832, - 0.56804454, 0.7991586, 0.2645556, 0.6818203, 0.4236548, 0.92559665, 0.46147937, 0.7742337, - 0.3595079, 0.6458941, 0.07103606, 0.7805292, 0.45615032, 0.43703195, 0.4375872, 0.0871293, - 0.11827443, 0.56843394, 0.6976312, 0.891773, 0.0202184, 0.639921, 0.0187898, 0.06022547, - 0.96366274, 0.83261985, 0.14335328, 0.6176355, 0.6667667}; - // float input_data[]={ + float16_t input_data[] = { + 0.5488135, 0.3834415, 0.77815676, 0.9446689, 0.6120957, 0.71518934, 0.79172504, 0.87001216, 0.5218483, + 0.616934, 0.60276335, 0.5288949, 0.9786183, 0.41466194, 0.94374806, 0.5448832, 0.56804454, 0.7991586, + 0.2645556, 0.6818203, 0.4236548, 0.92559665, 0.46147937, 0.7742337, 0.3595079, 0.6458941, 0.07103606, + 0.7805292, 0.45615032, 0.43703195, 0.4375872, 0.0871293, 0.11827443, 0.56843394, 0.6976312, 0.891773, + 0.0202184, 0.639921, 0.0187898, 0.06022547, 0.96366274, 0.83261985, 0.14335328, 0.6176355, 0.6667667}; + // float16_t input_data[]={ // 1 , 1 , 1 , 1 , 1 , // 1 , 1 , 1 , 1 , 1 , // 1 , 1 , 1 , 1 , 1 , @@ -555,23 +487,14 @@ TEST_F(TestConvolutionDwOpenCL, ConvDwPadFp32) { // 1 , 1 , 1 , 1 , 1 , // 1 , 1 , 1 , 1 , 1 , // 1 , 1 , 1 , 1 , 1 }; - - // pack input - int IC4 = UP_DIV(conv_param->input_channel_, C4NUM); - int pack_input_size = C4NUM * IC4 * conv_param->input_h_ * conv_param->input_w_; - auto packed_input = std::make_unique(pack_input_size); - memset(packed_input.get(), 0, pack_input_size * sizeof(float)); - int plane = conv_param->input_w_ * conv_param->input_h_; - PackNHWCToNC4HW4Fp32(input_data, packed_input.get(), 1, plane, conv_param->input_channel_); - // co h w ci - float weight_data[] = {0.67063785, 0.21038257, 0.12892629, 0.31542835, 0.36371076, 0.57019675, 0.43860152, 0.9883738, - 0.10204481, 0.20887676, 0.16130951, 0.6531083, 0.2532916, 0.46631077, 0.2444256, 0.15896958, - 0.11037514, 0.6563296, 0.13818295, 0.19658236, 0.36872518, 0.82099324, 0.09710128, 0.8379449, - 0.09609841, 0.97645944, 0.4686512, 0.9767611, 0.6048455, 0.7392636, 0.03918779, 0.28280696, - 0.12019656, 0.2961402, 0.11872772, 0.31798318, 0.41426298, 0.06414749, 0.6924721, 0.56660146, - 0.2653895, 0.5232481, 0.09394051, 0.5759465, 0.9292962}; - // float weight_data[]={ + float16_t weight_data[] = { + 0.67063785, 0.21038257, 0.12892629, 0.31542835, 0.36371076, 0.57019675, 0.43860152, 0.9883738, 0.10204481, + 0.20887676, 0.16130951, 0.6531083, 0.2532916, 0.46631077, 0.2444256, 0.15896958, 0.11037514, 0.6563296, + 0.13818295, 0.19658236, 0.36872518, 0.82099324, 0.09710128, 0.8379449, 0.09609841, 0.97645944, 0.4686512, + 0.9767611, 0.6048455, 0.7392636, 0.03918779, 0.28280696, 0.12019656, 0.2961402, 0.11872772, 0.31798318, + 0.41426298, 0.06414749, 0.6924721, 0.56660146, 0.2653895, 0.5232481, 0.09394051, 0.5759465, 0.9292962}; + // float16_t weight_data[]={ // 1 , 1 , 1 , // 1 , 1 , 1 , // 1 , 1 , 1 , @@ -587,96 +510,20 @@ TEST_F(TestConvolutionDwOpenCL, ConvDwPadFp32) { // 1 , 1 , 1 , // 1 , 1 , 1 , // 1 , 1 , 1 }; - - // pack weight - int OC4 = UP_DIV(conv_param->output_channel_, C4NUM); - int pack_weight_size = conv_param->output_channel_ * conv_param->kernel_h_ * conv_param->kernel_w_; - float *packed_weight = weight_data; - - // float bias_data[] = {0.31856894, 0.6674104, 0.13179787, 0.7163272, 0.2894061, 0.0, 0.0, 0.0}; - float bias_data[] = {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}; - size_t 
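// Sanity check for the padded case above: with pad_u_ = pad_l_ = 1, a 3x3
// kernel, stride 1 and dilation 1, the 3x3x5 input keeps its spatial size,
// which is why gnd_data again holds 3 * 3 * 5 = 45 values. The standard
// convolution output-extent formula (assuming symmetric padding, as here):
inline int ConvOutputDim(int in, int kernel, int stride, int dilation, int pad_before, int pad_after) {
  return (in + pad_before + pad_after - dilation * (kernel - 1) - 1) / stride + 1;
}
// e.g. ConvOutputDim(3, 3, 1, 1, 1, 1) == 3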
packed_output_size = conv_param->output_batch_ * C4NUM * UP_DIV(conv_param->output_channel_, C4NUM) * - conv_param->output_h_ * conv_param->output_w_; - - std::vector shape_in = {conv_param->input_batch_, conv_param->input_h_, conv_param->input_w_, - IC4 * C4NUM}; // Note!!!actual is NHWC4 - std::vector shape_filter = {1, conv_param->kernel_h_, conv_param->kernel_w_, conv_param->output_channel_}; - std::vector shape_bias = {conv_param->output_channel_}; - std::vector shape_out = {conv_param->output_batch_, conv_param->output_h_, conv_param->output_w_, - conv_param->output_channel_}; - auto tensor_a = std::make_unique(TypeId(kNumberTypeFloat32), shape_in, - schema::Format_NC4HW4); // Note!!!actual is NHWC4 - auto tensor_b = std::make_unique(TypeId(kNumberTypeFloat32), shape_filter, schema::Format_NHWC); - auto tensor_c = std::make_unique(TypeId(kNumberTypeFloat32), shape_bias, schema::Format_NHWC); - auto tensor_d = std::make_unique(TypeId(kNumberTypeFloat32), shape_out, schema::Format_NC4HW4); - std::vector inputs{tensor_a.get(), tensor_b.get(), tensor_c.get()}; - std::vector outputs{tensor_d.get()}; - - // freamework to do!!! - inputs[1]->SetData(packed_weight); - inputs[2]->SetData(bias_data); - - OpParameter *parameter = reinterpret_cast(conv_param.get()); - auto pKernel = std::make_unique(parameter, inputs, outputs); - pKernel->Init(); - - std::vector kernels{pKernel.get()}; - std::vector inputs_{tensor_a.get()}; - inputs[0]->MallocData(); - auto pGraph = std::make_unique(inputs_, outputs, kernels, kernels, kernels); - pGraph->Init(); - - // freamework to do!!! - memcpy(inputs[0]->Data(), packed_input.get(), sizeof(float) * pack_input_size); - - pGraph->Run(); - float *packed_output = reinterpret_cast(outputs[0]->Data()); - // pack correct data, nhwc - float correct_data[] = {1.189188, 1.0425153, 1.8012011, 0.6074867, 1.2120346, 1.5005531, 0.8346756, 2.4365785, + float16_t gnd_data[] = {1.189188, 1.0425153, 1.8012011, 0.6074867, 1.2120346, 1.5005531, 0.8346756, 2.4365785, 0.54975945, 1.6815965, 1.2690231, 0.60214907, 1.6158017, 0.42115876, 0.8854959, 1.1709145, 1.0929465, 1.3534508, 1.1985044, 1.2932993, 2.4621446, 1.7086457, 2.6977584, 2.1960166, 2.3769147, 2.3185873, 0.6133741, 0.9687358, 0.9987654, 1.0254729, 0.8368954, 0.74171704, 0.8749627, 0.8953936, 0.5093431, 1.5496738, 0.54936385, 0.7683113, 1.165742, 1.3682933, 1.0517888, 0.59817517, 0.75649744, 1.2075498, 0.38804203}; - auto packed_correct_data = std::make_unique(packed_output_size); - memset(packed_correct_data.get(), 0, packed_output_size * sizeof(float)); - PackNHWCToNC4HW4Fp32(correct_data, packed_correct_data.get(), conv_param->output_batch_, - conv_param->output_h_ * conv_param->output_w_, conv_param->output_channel_); - - printf("==================input_data=================\n"); - std::cout << std::endl; - for (int i = 0; i < pack_input_size; i++) { - std::cout << packed_input.get()[i] << ", "; - } - std::cout << std::endl; - printf("==================weight data=================\n"); - std::cout << std::endl; - for (int i = 0; i < pack_weight_size; i++) { - std::cout << packed_weight[i] << ", "; - } - std::cout << std::endl; - printf("==================output data=================\n"); - std::cout << std::endl; - for (int i = 0; i < packed_output_size; i++) { - std::cout << packed_output[i] << ", "; - } - std::cout << std::endl; - printf("==================expected output data=================\n"); - for (int i = 0; i < packed_output_size; i++) { - std::cout << packed_correct_data.get()[i] << ", "; - } - std::cout 
<< std::endl; - // compare - CommonTest::CompareOutputData(packed_output, packed_correct_data.get(), packed_output_size, 0.00001); - inputs[1]->SetData(nullptr); - inputs[2]->SetData(nullptr); - MS_LOG(INFO) << "TestConvolutionDwPadFp32 passed"; - lite::opencl::OpenCLRuntime::DeleteInstance(); + lite::opencl::OpenCLRuntime::GetInstance()->SetFp16Enable(true); + DepthWiseTestMain(conv_param.release(), input_data, weight_data, gnd_data, schema::Format_NHWC4, + kNumberTypeFloat16, true, 1e-2); } -TEST_F(TestConvolutionDwOpenCL, ProfilingMobilenetv2) { +TEST_F(TestConvolutionDwOpenCL, ProfilingMobilenetv2Fp32) { std::vector> src_shape{ {1, 32, 112, 112}, {1, 96, 112, 112}, {1, 144, 56, 56}, {1, 144, 56, 56}, {1, 192, 28, 28}, {1, 192, 28, 28}, {1, 384, 14, 14}, {1, 576, 14, 14}, {1, 576, 14, 14}, {1, 960, 7, 7}, @@ -691,103 +538,57 @@ TEST_F(TestConvolutionDwOpenCL, ProfilingMobilenetv2) { }; // nhwc - size_t in_size = 96 * 112 * 112; - auto input_data = std::make_unique(in_size); - memset(input_data.get(), 0, in_size); + const size_t in_size = 96 * 112 * 112; + float *input_data = new (std::nothrow) float[in_size]; + if (input_data == nullptr) { + return; + } + memset(input_data, 0, in_size * sizeof(float_t)); for (auto i = 0; i < in_size; ++i) { - input_data.get()[i] = 1; + input_data[i] = 1; } // co h w ci - size_t wt_size = 576 * 3 * 3; - auto weight_data = std::make_unique(wt_size); - memset(weight_data.get(), 0, wt_size); + const size_t wt_size = 576 * 3 * 3; + float *weight_data = new (std::nothrow) float[wt_size]; + if (weight_data == nullptr) { + delete [] input_data; + return; + } + memset(weight_data, 0, wt_size); for (auto i = 0; i < wt_size; ++i) { - weight_data.get()[i] = 1; + weight_data[i] = 1; } - size_t out_size = 96 * 112 * 112; - auto gnd_data = std::make_unique(out_size); - memset(gnd_data.get(), 0, out_size); - // for (auto i = 0; i < in_size; ++i) { - // gnd_data[i] = 1; - // } for (size_t i = 0; i < src_shape.size(); ++i) { const int MAX_RUN_TIMES = 1; for (int j = 0; j < MAX_RUN_TIMES; ++j) { printf("========profiling depthwise, in shape(%d,%d,%d,%d), out shape(%d,%d,%d,%d), iter%d========\n", src_shape[i][0], src_shape[i][1], src_shape[i][2], src_shape[i][3], dst_shape[i][0], dst_shape[i][1], dst_shape[i][2], dst_shape[i][3], j); - auto conv_param = std::make_unique(); + auto conv_param = ConvParameter(); { - conv_param->input_batch_ = 1; - conv_param->input_h_ = src_shape[i][2]; - conv_param->input_w_ = src_shape[i][3]; - conv_param->input_channel_ = src_shape[i][1]; - conv_param->output_batch_ = 1; - conv_param->output_h_ = dst_shape[i][2]; - conv_param->output_w_ = dst_shape[i][3]; - conv_param->output_channel_ = dst_shape[i][1]; - conv_param->kernel_h_ = filter_shape[i][1]; - conv_param->kernel_w_ = filter_shape[i][2]; - conv_param->stride_h_ = conv_param->output_h_ / conv_param->input_h_; - conv_param->stride_w_ = conv_param->output_w_ / conv_param->input_w_; - conv_param->pad_h_ = (conv_param->kernel_h_ - 1) / 2; - conv_param->pad_w_ = (conv_param->kernel_w_ - 1) / 2; - conv_param->dilation_h_ = 1; - conv_param->dilation_w_ = 1; + conv_param.input_batch_ = 1; + conv_param.input_h_ = src_shape[i][2]; + conv_param.input_w_ = src_shape[i][3]; + conv_param.input_channel_ = src_shape[i][1]; + conv_param.output_batch_ = 1; + conv_param.output_h_ = dst_shape[i][2]; + conv_param.output_w_ = dst_shape[i][3]; + conv_param.output_channel_ = dst_shape[i][1]; + conv_param.kernel_h_ = filter_shape[i][1]; + conv_param.kernel_w_ = filter_shape[i][2]; + conv_param.stride_h_ = 
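// The profiling test below switches to bare new (std::nothrow) / delete[]
// buffers, which is why every early return must free them by hand and why the
// weight memset is easy to get wrong (it clears wt_size bytes rather than
// wt_size * sizeof(float)). A std::vector-based alternative sketch (not what
// the patch does) removes both hazards:
#include <vector>

void ProfilingBuffersSketch() {
  const size_t in_size = 96 * 112 * 112;
  const size_t wt_size = 576 * 3 * 3;
  std::vector<float> input_data(in_size, 1.0f);   // value-initialised, no memset needed
  std::vector<float> weight_data(wt_size, 1.0f);
  // ... run the profiling loop with input_data.data() / weight_data.data() ...
}  // storage released automatically on every return path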
conv_param.output_h_ / conv_param.input_h_; + conv_param.stride_w_ = conv_param.output_w_ / conv_param.input_w_; + conv_param.pad_u_ = (conv_param.kernel_h_ - 1) / 2; + conv_param.pad_l_ = (conv_param.kernel_w_ - 1) / 2; + conv_param.dilation_h_ = 1; + conv_param.dilation_w_ = 1; } - // DepthWiseTestMain(conv_param, input_data, weight_data, gnd_data, schema::Format_NC4HW4, false); - DepthWiseTestMain(conv_param.get(), input_data.get(), weight_data.get(), nullptr, schema::Format_NHWC4, false); + DepthWiseTestMain(&conv_param, input_data, weight_data, nullptr, schema::Format_NHWC4, + kNumberTypeFloat32, false); } } - lite::opencl::OpenCLRuntime::DeleteInstance(); -} - -TEST_F(TestConvolutionDwOpenCL, Buffer2Image) { - std::vector src_shape{1, 96, 64, 64}; - std::vector dst_shape{1, 96, 32, 32}; - std::vector filter_shape{96, 3, 3, 1}; - - // nhwc - size_t in_size = 96 * 112 * 112; - auto input_data = std::make_unique(in_size); - memset(input_data.get(), 0, in_size); - for (auto i = 0; i < in_size; ++i) { - input_data.get()[i] = 1; - } - // co h w ci - size_t wt_size = 576 * 3 * 3; - auto weight_data = std::make_unique(wt_size); - memset(weight_data.get(), 0, wt_size); - for (auto i = 0; i < wt_size; ++i) { - weight_data.get()[i] = 1; - } - size_t out_size = 96 * 112 * 112; - auto gnd_data = std::make_unique(out_size); - memset(gnd_data.get(), 0, out_size); - // for (auto i = 0; i < in_size; ++i) { - // gnd_data[i] = 1; - // } - auto conv_param = std::make_unique(); - { - conv_param->input_batch_ = 1; - conv_param->input_h_ = src_shape[2]; - conv_param->input_w_ = src_shape[3]; - conv_param->input_channel_ = src_shape[1]; - conv_param->output_batch_ = 1; - conv_param->output_h_ = dst_shape[2]; - conv_param->output_w_ = dst_shape[3]; - conv_param->output_channel_ = dst_shape[1]; - conv_param->kernel_h_ = filter_shape[1]; - conv_param->kernel_w_ = filter_shape[2]; - conv_param->stride_h_ = conv_param->output_h_ / conv_param->input_h_; - conv_param->stride_w_ = conv_param->output_w_ / conv_param->input_w_; - conv_param->pad_h_ = (conv_param->kernel_h_ - 1) / 2; - conv_param->pad_w_ = (conv_param->kernel_w_ - 1) / 2; - conv_param->dilation_h_ = 1; - conv_param->dilation_w_ = 1; - } - // DepthWiseTestMain(conv_param, input_data, weight_data, gnd_data, schema::Format_NC4HW4, true); - DepthWiseTestMain(conv_param.get(), input_data.get(), weight_data.get(), gnd_data.get(), schema::Format_NHWC4, true); + delete [] input_data; + delete [] weight_data; lite::opencl::OpenCLRuntime::DeleteInstance(); } } // namespace mindspore diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/matmul_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/matmul_tests.cc index 3e10b5ce48..8b21e5f845 100644 --- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/matmul_tests.cc +++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/matmul_tests.cc @@ -21,6 +21,7 @@ #include "mindspore/lite/src/runtime/opencl/opencl_runtime.h" #include "mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.h" #include "mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.h" +#include "mindspore/lite/test/ut/src/runtime/kernel/opencl/utils_tests.h" namespace mindspore { class TestMatMulOpenCL : public mindspore::CommonTest { @@ -28,29 +29,21 @@ class TestMatMulOpenCL : public mindspore::CommonTest { TestMatMulOpenCL() {} }; -TEST_F(TestMatMulOpenCL, MatMulFp32) { +void RunTestCaseMatMul(const std::vector &shape, void *input_data, void *weight_data, void *output_data, + bool enable_fp16) { auto ocl_runtime = 
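// RunTestCaseMatMul drives a 1 x CI input against CO x CI weights (weights
// stored row-major as [co][ci]). The expected vectors in the small cases
// further below come straight from this reference computation (sketch):
#include <vector>

std::vector<float> MatMulRef(const std::vector<float> &in, const std::vector<float> &w, int ci, int co) {
  std::vector<float> out(co, 0.0f);
  for (int j = 0; j < co; ++j) {
    for (int i = 0; i < ci; ++i) {
      out[j] += in[i] * w[j * ci + i];
    }
  }
  return out;  // MatMulFp32_2: in = {0..4}, w = all ones -> {10, 10, 10}
}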
lite::opencl::OpenCLRuntime::GetInstance(); ocl_runtime->Init(); - auto allocator = ocl_runtime->GetAllocator(); - size_t input_size; - int ci = 1280; - int co = 1001; - std::string input_path = "./test_data/matmul/matmul_fp32_input.bin"; - auto input_data = reinterpret_cast(mindspore::lite::ReadFile(input_path.c_str(), &input_size)); - if (input_data == nullptr) { - MS_LOG(ERROR) << "input_data load error."; - return; - } - size_t weight_size; - std::string weight_path = "./test_data/matmul/matmul_fp32_weight.bin"; - auto weight_data = reinterpret_cast(mindspore::lite::ReadFile(weight_path.c_str(), &weight_size)); - if (weight_data == nullptr) { - MS_LOG(ERROR) << "weight_data load error."; - return; + size_t dtype_size = sizeof(float); + if (enable_fp16) { + ocl_runtime->SetFp16Enable(true); + dtype_size = sizeof(float16_t); } + auto allocator = ocl_runtime->GetAllocator(); + int ci = shape[0]; + int co = shape[1]; std::vector input_shape = {1, ci}; - auto tensor_x_ptr = - std::make_unique(TypeId(kNumberTypeFloat32), input_shape, schema::Format_NC); + auto tensor_x_ptr = std::make_unique( + TypeId(enable_fp16 ? kNumberTypeFloat16 : kNumberTypeFloat32), input_shape, schema::Format_NC); auto tensor_x = tensor_x_ptr.get(); if (tensor_x == nullptr) { MS_LOG(ERROR) << "tensor_x create error."; @@ -58,7 +51,8 @@ TEST_F(TestMatMulOpenCL, MatMulFp32) { } std::vector w_shape = {co, ci}; - auto tensor_w_ptr = std::make_unique(TypeId(kNumberTypeFloat32), w_shape); + auto tensor_w_ptr = + std::make_unique(TypeId(enable_fp16 ? kNumberTypeFloat16 : kNumberTypeFloat32), w_shape); auto tensor_w = tensor_w_ptr.get(); if (tensor_w == nullptr) { MS_LOG(ERROR) << "tensor_w create error."; @@ -67,8 +61,8 @@ TEST_F(TestMatMulOpenCL, MatMulFp32) { tensor_w->SetData(weight_data); std::vector out_shape = {1, co}; - auto tensor_out_ptr = - std::make_unique(TypeId(kNumberTypeFloat32), out_shape, schema::Format_NC); + auto tensor_out_ptr = std::make_unique( + TypeId(enable_fp16 ? 
kNumberTypeFloat16 : kNumberTypeFloat32), out_shape, schema::Format_NC); auto tensor_out = tensor_out_ptr.get(); if (tensor_out == nullptr) { MS_LOG(ERROR) << "tensor_out create error."; @@ -76,16 +70,16 @@ TEST_F(TestMatMulOpenCL, MatMulFp32) { } std::vector inputs{tensor_x, tensor_w}; std::vector outputs{tensor_out}; - auto arith_kernel_ptr = std::make_unique(nullptr, inputs, outputs, false); - auto arith_kernel = arith_kernel_ptr.get(); - if (arith_kernel == nullptr) { - MS_LOG(ERROR) << "arith_kernel create error."; + auto op_kernel_ptr = std::make_unique(nullptr, inputs, outputs, false); + auto op_kernel = op_kernel_ptr.get(); + if (op_kernel == nullptr) { + MS_LOG(ERROR) << "op_kernel create error."; return; } - arith_kernel->Init(); + op_kernel->Init(); inputs[0]->MallocData(allocator); - std::vector kernels{arith_kernel}; + std::vector kernels{op_kernel}; std::vector inputs_g{tensor_x}; auto pGraph_ptr = std::make_unique(inputs_g, outputs, kernels, kernels, kernels); @@ -95,27 +89,84 @@ TEST_F(TestMatMulOpenCL, MatMulFp32) { return; } pGraph->Init(); - memcpy(inputs[0]->Data(), input_data, input_size); + memcpy(inputs[0]->Data(), input_data, ci * dtype_size); pGraph->Run(); - - size_t output_size; - std::string output_path = "./test_data/matmul/matmul_fp32_output.bin"; - auto correct_data = reinterpret_cast(mindspore::lite::ReadFile(output_path.c_str(), &output_size)); - printf("==================output data=================\n"); - float *output_data = reinterpret_cast(tensor_out->Data()); - std::cout << std::endl; - int size_n = co; - size_n = size_n > 100 ? 100 : size_n; - for (int i = 0; i < size_n; i++) { - std::cout << output_data[i] << " "; + if (enable_fp16) { + CompareOutput(outputs[0]->Data(), output_data, co, static_cast(1e-3), 2e-2); + } else { + CompareOutput(outputs[0]->Data(), output_data, co, static_cast(1e-5)); } - std::cout << std::endl; - // compare - CompareOutputData(output_data, correct_data, co, 0.0001); tensor_x->SetData(nullptr); tensor_out->SetData(nullptr); - lite::opencl::OpenCLRuntime::DeleteInstance(); MS_LOG(INFO) << "TestMatMulFp32 passed"; + lite::opencl::OpenCLRuntime::DeleteInstance(); +} + +void RunTestCaseMatMul(const std::vector shape, const std::vector file_path, bool enable_fp16) { + size_t input_size; + std::string input_path = file_path[0]; + auto input_data = mindspore::lite::ReadFile(input_path.c_str(), &input_size); + if (input_data == nullptr) { + MS_LOG(ERROR) << "input_data load error."; + return; + } + size_t weight_size; + std::string weight_path = file_path[1]; + auto weight_data = mindspore::lite::ReadFile(weight_path.c_str(), &weight_size); + if (weight_data == nullptr) { + MS_LOG(ERROR) << "weight_data load error."; + return; + } + size_t output_size; + std::string output_path = file_path[2]; + auto output_data = mindspore::lite::ReadFile(output_path.c_str(), &output_size); + if (output_data == nullptr) { + MS_LOG(ERROR) << "output_data load error."; + return; + } + RunTestCaseMatMul(shape, input_data, weight_data, output_data, enable_fp16); +} + +TEST_F(TestMatMulOpenCL, MatMulFp32) { + int ci = 1280; + int co = 1001; + std::vector shape = {ci, co}; + std::vector file_path = {"./test_data/matmul/matmul_fp32_input.bin", + "./test_data/matmul/matmul_fp32_weight.bin", + "./test_data/matmul/matmul_fp32_output.bin"}; + RunTestCaseMatMul(shape, file_path, false); +} + +TEST_F(TestMatMulOpenCL, MatMulFp16) { + int ci = 1280; + int co = 1001; + std::vector shape = {ci, co}; + std::vector file_path = 
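// The file-based overload below feeds raw .bin blobs into the typed overload
// via lite::ReadFile. A minimal stand-in built on std::ifstream, for readers
// without the lite sources at hand (sketch, not the real helper):
#include <fstream>
#include <string>
#include <vector>

std::vector<char> ReadBinFile(const std::string &path) {
  std::ifstream in(path, std::ios::binary | std::ios::ate);
  if (!in) return {};                      // empty result signals failure
  std::streamsize size = in.tellg();
  in.seekg(0, std::ios::beg);
  std::vector<char> buf(static_cast<size_t>(size));
  if (!in.read(buf.data(), size)) return {};
  return buf;                              // buf.size() plays the role of the size out-param
}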
{"./test_data/matmul/matmul_fp16_input.bin", + "./test_data/matmul/matmul_fp16_weight.bin", + "./test_data/matmul/matmul_fp16_output.bin"}; + RunTestCaseMatMul(shape, file_path, true); +} + +TEST_F(TestMatMulOpenCL, MatMulFp32_2) { + int ci = 5; + int co = 3; + std::vector shape = {ci, co}; + std::vector input_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f}; + std::vector weight_data = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, + 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}; + std::vector output_data = {10.f, 10.f, 10.f}; + RunTestCaseMatMul(shape, input_data.data(), weight_data.data(), output_data.data(), false); +} + +TEST_F(TestMatMulOpenCL, MatMulFp16_2) { + int ci = 5; + int co = 3; + std::vector shape = {ci, co}; + std::vector input_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f}; + std::vector weight_data = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, + 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}; + std::vector output_data = {10.f, 10.f, 10.f}; + RunTestCaseMatMul(shape, input_data.data(), weight_data.data(), output_data.data(), true); } } // namespace mindspore diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/max_pooling_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/max_pooling_tests.cc index aa13e43a82..321b1ccd06 100644 --- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/max_pooling_tests.cc +++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/max_pooling_tests.cc @@ -35,8 +35,7 @@ void InitParameter(PoolingParameter *param) { param->pad_d_ = 0; param->pad_l_ = 0; param->pad_r_ = 0; - param->avg_pooling_ = false; - param->max_pooling_ = true; + param->pool_mode_ = PoolMode_MaxPool; } TEST_F(TestMaxPoolingOpenCL, MaxPool_1_32_512_96) { @@ -109,7 +108,7 @@ TEST_F(TestMaxPoolingOpenCL, MaxPool_1_32_512_96) { MS_LOG(INFO) << "compare result"; std::cout << "compare result" << std::endl; - CompareOutput(output_tensor, expect_file); + CompareOutput(output_tensor, expect_file, static_cast(1e-5)); for (auto tensor : inputs) { delete tensor; } diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/prelu_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/prelu_tests.cc index 91dc9c6559..64fc23c43c 100644 --- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/prelu_tests.cc +++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/prelu_tests.cc @@ -62,26 +62,27 @@ void CompareOutPRelu(lite::tensor::Tensor *output_tensor, const std::string &sta TEST_F(TestPReluOpenCL, PReluFp32_dim4) { std::string in_file = "/data/local/tmp/in_data.bin"; - std::string standard_answer_file = "/data/local/tmp/leaky_relu.bin"; + std::string weight_file = "/data/local/tmp/weight_data.bin"; + std::string standard_answer_file = "/data/local/tmp/caffe_prelu.bin"; MS_LOG(INFO) << "-------------------->> Begin test PRelu!"; auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); ocl_runtime->Init(); auto allocator = ocl_runtime->GetAllocator(); MS_LOG(INFO) << "Init tensors."; - std::vector input_shape = {1, 4, 3, 8}; + std::vector input_shape = {1, 4, 3, 9}; auto data_type = kNumberTypeFloat32; auto tensor_type = schema::NodeType_ValueNode; auto input_tensor = - new (std::nothrow) lite::tensor::Tensor(data_type, input_shape, schema::Format_NHWC4, tensor_type); + new (std::nothrow) lite::tensor::Tensor(data_type, input_shape, schema::Format_NHWC, tensor_type); if (input_tensor == nullptr) { MS_LOG(ERROR) << "new input_tensor error!"; return; } auto output_tensor = - new (std::nothrow) lite::tensor::Tensor(data_type, input_shape, schema::Format_NHWC4, tensor_type); + new 
(std::nothrow) lite::tensor::Tensor(data_type, input_shape, schema::Format_NHWC, tensor_type); if (output_tensor == nullptr) { MS_LOG(ERROR) << "new output_tensor error"; delete input_tensor; @@ -89,7 +90,7 @@ TEST_F(TestPReluOpenCL, PReluFp32_dim4) { } auto weight_tensor = - new (std::nothrow) lite::tensor::Tensor(data_type, std::vector{1}, schema::Format_NHWC, tensor_type); + new (std::nothrow) lite::tensor::Tensor(data_type, std::vector{9}, schema::Format_NHWC, tensor_type); if (weight_tensor == nullptr) { MS_LOG(ERROR) << "new weight_tensor error"; delete input_tensor; @@ -105,11 +106,13 @@ TEST_F(TestPReluOpenCL, PReluFp32_dim4) { MS_LOG(INFO) << "initialize input data"; LoadDataPRelu(input_tensor->Data(), input_tensor->Size(), in_file); + LoadDataPRelu(weight_tensor->Data(), weight_tensor->Size(), weight_file); auto weight_data = reinterpret_cast(weight_tensor->Data()); - weight_data[0] = 0.3; + PrintData("Weight data", weight_data, inputs[1]->ElementsNum()); auto *input_data = reinterpret_cast(inputs[0]->Data()); - PrintData("PRelu input data", input_data, inputs[0]->ElementsC4Num()); - + PrintData("PRelu input data", input_data, inputs[0]->ElementsNum()); + std::cout << inputs[0]->ElementsNum() << std::endl; + std::cout << "--------------------------------------------" << std::endl; auto param = new (std::nothrow) PReluParameter(); if (param == nullptr) { MS_LOG(ERROR) << "new PreluParameter error"; diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/reshape_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/reshape_tests.cc index bbed448f85..0172df1a31 100644 --- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/reshape_tests.cc +++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/reshape_tests.cc @@ -21,6 +21,7 @@ #include "mindspore/lite/src/runtime/opencl/opencl_runtime.h" #include "mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.h" #include "mindspore/lite/src/runtime/kernel/opencl/kernel/reshape.h" +#include "mindspore/lite/test/ut/src/runtime/kernel/opencl/utils_tests.h" namespace mindspore { class TestReshapeOpenCL : public mindspore::CommonTest { @@ -28,29 +29,27 @@ class TestReshapeOpenCL : public mindspore::CommonTest { TestReshapeOpenCL() {} }; -TEST_F(TestReshapeOpenCL, ReshapeFp32) { +void RunTestCaseReshape(const std::vector &shape, void *input_data, void *output_data, bool enable_fp16) { auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); ocl_runtime->Init(); - auto allocator = ocl_runtime->GetAllocator(); - int c = 63; - size_t input_size; - std::string input_path = "./test_data/reshape/reshape_fp32_input.bin"; - auto input_data = reinterpret_cast(mindspore::lite::ReadFile(input_path.c_str(), &input_size)); - if (input_data == nullptr) { - MS_LOG(ERROR) << "input_data load error."; - return; + size_t dtype_size = sizeof(float); + if (enable_fp16) { + ocl_runtime->SetFp16Enable(true); + dtype_size = sizeof(float16_t); } + auto allocator = ocl_runtime->GetAllocator(); + int c = shape[0]; std::vector input_shape = {1, 1, 1, c}; - auto tensor_x_ptr = - std::make_unique(TypeId(kNumberTypeFloat32), input_shape, schema::Format_NHWC); + auto tensor_x_ptr = std::make_unique( + TypeId(enable_fp16 ? 
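// The PRelu test now loads one slope per channel (weight shape {9}, matching
// the innermost input dimension) instead of the old scalar 0.3. Elementwise,
// per-channel PRelu over an NHWC tensor is (sketch):
void PReluRef(const float *in, const float *slope, float *out, int plane, int channel) {
  for (int p = 0; p < plane; ++p)
    for (int c = 0; c < channel; ++c) {
      float x = in[p * channel + c];
      out[p * channel + c] = x > 0.0f ? x : slope[c] * x;
    }
}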
kNumberTypeFloat16 : kNumberTypeFloat32), input_shape, schema::Format_NHWC); auto tensor_x = tensor_x_ptr.get(); if (tensor_x == nullptr) { MS_LOG(ERROR) << "tensor_x create error."; return; } std::vector out_shape = {1, c}; - auto tensor_out_ptr = - std::make_unique(TypeId(kNumberTypeFloat32), out_shape, schema::Format_NC); + auto tensor_out_ptr = std::make_unique( + TypeId(enable_fp16 ? kNumberTypeFloat16 : kNumberTypeFloat32), out_shape, schema::Format_NC); auto tensor_out = tensor_out_ptr.get(); if (tensor_out == nullptr) { MS_LOG(ERROR) << "tensor_out create error."; @@ -76,36 +75,36 @@ TEST_F(TestReshapeOpenCL, ReshapeFp32) { return; } pGraph->Init(); - memcpy(inputs[0]->Data(), input_data, input_size); + memcpy(inputs[0]->Data(), input_data, c * dtype_size); pGraph->Run(); - size_t output_size; - std::string output_path = "./test_data/reshape/reshape_fp32_output.bin"; - auto correct_data = reinterpret_cast(mindspore::lite::ReadFile(output_path.c_str(), &output_size)); - if (correct_data == nullptr) { - MS_LOG(ERROR) << "correct_data create error."; - return; + if (enable_fp16) { + CompareOutput(outputs[0]->Data(), output_data, c, static_cast(1e-3), 2e-2); + } else { + CompareOutput(outputs[0]->Data(), output_data, c, static_cast(1e-5)); } - printf("==================output data=================\n"); - float *output_data = reinterpret_cast(tensor_out->Data()); - std::cout << std::endl; - int size_n = c; - size_n = size_n > 100 ? 100 : size_n; - for (int i = 0; i < size_n; i++) { - std::cout << output_data[i] << " "; - if ((i + 1) % c == 0) { - std::cout << std::endl; - } - } - std::cout << std::endl; - - // compare - CompareOutputData(output_data, correct_data, c, 0.00001); - inputs[0]->SetData(nullptr); outputs[0]->SetData(nullptr); - lite::opencl::OpenCLRuntime::DeleteInstance(); MS_LOG(INFO) << "Test ReshapeFp32 passed"; + lite::opencl::OpenCLRuntime::DeleteInstance(); +} + +TEST_F(TestReshapeOpenCL, ReshapeFp32) { + int c = 7; + std::vector shape = {c}; + std::vector input_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f}; + std::vector output_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}; + + RunTestCaseReshape(shape, input_data.data(), output_data.data(), false); +} + +TEST_F(TestReshapeOpenCL, ReshapeFp16) { + int c = 7; + std::vector shape = {c}; + std::vector input_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f}; + std::vector output_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}; + + RunTestCaseReshape(shape, input_data.data(), output_data.data(), true); } } // namespace mindspore diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/slice_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/slice_tests.cc index e2cc3215ae..33ff054d32 100644 --- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/slice_tests.cc +++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/slice_tests.cc @@ -23,9 +23,13 @@ #include "mindspore/lite/src/runtime/kernel/opencl/kernel/slice.h" namespace mindspore { -class TestSliceOpenCL : public mindspore::CommonTest { +class TestSliceOpenCLfp32 : public mindspore::CommonTest { public: - TestSliceOpenCL() {} + TestSliceOpenCLfp32() {} +}; +class TestSliceOpenCLfp16 : public mindspore::CommonTest { + public: + TestSliceOpenCLfp16() {} }; template @@ -36,7 +40,7 @@ void CompareOutputData1(T *output_data, T *correct_data, int size, float err_bou } } -TEST_F(TestSliceOpenCL, Sliceinput_dim4) { +TEST_F(TestSliceOpenCLfp32, Slicefp32input_dim4) { MS_LOG(INFO) << "begin test"; auto ocl_runtime = 
lite::opencl::OpenCLRuntime::GetInstance(); ocl_runtime->Init(); @@ -52,8 +56,8 @@ TEST_F(TestSliceOpenCL, Sliceinput_dim4) { // get the input from .bin size_t input_size, output_size; - std::string input_path = "./test_data/in_data.bin"; - std::string output_path = "./test_data/out_data.bin"; + std::string input_path = "./test_data/in_datafp32.bin"; + std::string output_path = "./test_data/out_datafp32.bin"; auto input_data = reinterpret_cast(mindspore::lite::ReadFile(input_path.c_str(), &input_size)); auto correct_data = reinterpret_cast(mindspore::lite::ReadFile(output_path.c_str(), &output_size)); @@ -86,7 +90,7 @@ TEST_F(TestSliceOpenCL, Sliceinput_dim4) { MS_LOG(INFO) << "new SliceParameter failed"; return; } - for (int i = 0; i < 4; i++) { + for (int i = 0; i < input_shape.size(); i++) { param->begin_[i] = begin[i]; param->size_[i] = size[i]; } @@ -146,4 +150,115 @@ TEST_F(TestSliceOpenCL, Sliceinput_dim4) { delete sub_graph; lite::opencl::OpenCLRuntime::DeleteInstance(); } +TEST_F(TestSliceOpenCLfp16, Slicefp16input_dim4) { + MS_LOG(INFO) << "begin test"; + auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); + ocl_runtime->SetFp16Enable(true); + ocl_runtime->Init(); + auto allocator = ocl_runtime->GetAllocator(); + + MS_LOG(INFO) << "Read tensors from .bin"; + std::vector input_shape = {1, 256, 256, 48}; + std::vector output_shape = {1, 255, 255, 15}; + std::vector begin = {0, 1, 1, 7}; + std::vector size = {1, 255, 255, 15}; + auto data_type = kNumberTypeFloat16; + auto tensor_type = schema::NodeType_ValueNode; + + // get the input from .bin + size_t input_size, output_size; + std::string input_path = "./test_data/in_data.bin"; + std::string output_path = "./test_data/out_data.bin"; + auto input_data = reinterpret_cast(mindspore::lite::ReadFile(input_path.c_str(), &input_size)); + auto correct_data = reinterpret_cast(mindspore::lite::ReadFile(output_path.c_str(), &output_size)); + + MS_LOG(INFO) << "construct tensors"; + lite::tensor::Tensor *tensor_data = + new (std::nothrow) lite::tensor::Tensor(data_type, input_shape, schema::Format_NHWC, tensor_type); + if (tensor_data == nullptr) { + MS_LOG(INFO) << "init tensor failed"; + return; + } + auto *output_tensor = + new (std::nothrow) lite::tensor::Tensor(data_type, output_shape, schema::Format_NHWC4, tensor_type); + if (output_tensor == nullptr) { + delete tensor_data; + MS_LOG(INFO) << "init tensor failed"; + return; + } + std::vector inputs = {tensor_data}; + std::vector outputs = {output_tensor}; + + MS_LOG(INFO) << "setting SliceParameter"; + auto param = new (std::nothrow) SliceParameter(); + if (param == nullptr) { + for (auto tensor : inputs) { + delete tensor; + } + for (auto tensor : outputs) { + delete tensor; + } + MS_LOG(INFO) << "new SliceParameter failed"; + return; + } + for (int i = 0; i < 4; i++) { + param->begin_[i] = begin[i]; + param->size_[i] = size[i]; + } + + auto *slice_kernel = + new (std::nothrow) kernel::SliceOpenCLKernel(reinterpret_cast(param), inputs, outputs); + if (slice_kernel == nullptr) { + for (auto tensor : inputs) { + delete tensor; + } + for (auto tensor : outputs) { + delete tensor; + } + delete param; + MS_LOG(INFO) << "new kernel::slice_kernel failed"; + return; + } + slice_kernel->Init(); + + // to do allocate memory for inputs and outputs + for (auto &input_tensor : inputs) { + input_tensor->MallocData(allocator); + } + + MS_LOG(INFO) << "initialize sub_graph"; + std::vector kernels{slice_kernel}; + auto *sub_graph = new (std::nothrow) kernel::SubGraphOpenCLKernel(inputs, 
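// SliceParameter's begin_/size_ pairs cut a sub-box out of the 4-D input:
// with begin {0, 1, 1, 7} and size {1, 255, 255, 15}, the 1x256x256x48 tensor
// shrinks to 1x255x255x15. The reference extraction is four nested copies
// over NHWC indices (sketch):
void SliceRef(const float *src, float *dst, const int in[4], const int begin[4], const int size[4]) {
  for (int n = 0; n < size[0]; ++n)
    for (int h = 0; h < size[1]; ++h)
      for (int w = 0; w < size[2]; ++w)
        for (int c = 0; c < size[3]; ++c) {
          int si = (((n + begin[0]) * in[1] + (h + begin[1])) * in[2] + (w + begin[2])) * in[3] + (c + begin[3]);
          dst[((n * size[1] + h) * size[2] + w) * size[3] + c] = src[si];
        }
}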
outputs, kernels, kernels, kernels); + if (sub_graph == nullptr) { + for (auto tensor : inputs) { + delete tensor; + } + for (auto tensor : outputs) { + delete tensor; + } + delete param; + delete slice_kernel; + MS_LOG(INFO) << "new kernel::SubGraphOpenCLKernel failed"; + return; + } + sub_graph->Init(); + + MS_LOG(INFO) << "init tensors"; + memcpy(inputs[0]->Data(), input_data, input_size); + + std::cout << "==================output data================" << std::endl; + sub_graph->Run(); + + auto *output_data_gpu = reinterpret_cast(output_tensor->Data()); + CompareOutputData1(output_data_gpu, correct_data, output_tensor->ElementsNum(), 0.0001); + for (auto tensor : inputs) { + delete tensor; + } + for (auto tensor : outputs) { + delete tensor; + } + delete slice_kernel; + delete sub_graph; + lite::opencl::OpenCLRuntime::DeleteInstance(); +} } // namespace mindspore diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/softmax_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/softmax_tests.cc index 7a64cb2058..2913d322d9 100644 --- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/softmax_tests.cc +++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/softmax_tests.cc @@ -83,7 +83,7 @@ void RunTestCase(std::vector input_shape, std::vector output_shape, st pGraph->Run(); MS_LOG(INFO) << "compare result"; - CompareOutput(output_tensor, expect_file); + CompareOutput(output_tensor, expect_file, static_cast(1e-5)); for (auto tensor : inputs) { delete tensor; } diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/transpose_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/transpose_tests.cc index 03516ff33f..925b45de20 100644 --- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/transpose_tests.cc +++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/transpose_tests.cc @@ -21,6 +21,7 @@ #include "mindspore/lite/src/runtime/opencl/opencl_runtime.h" #include "mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.h" #include "mindspore/lite/src/runtime/kernel/opencl/kernel/transpose.h" +#include "mindspore/lite/test/ut/src/runtime/kernel/opencl/utils_tests.h" namespace mindspore { class TestTransposeOpenCL : public mindspore::CommonTest { @@ -28,31 +29,29 @@ class TestTransposeOpenCL : public mindspore::CommonTest { TestTransposeOpenCL() {} }; -TEST_F(TestTransposeOpenCL, TransposeFp32) { +void RunTestTranspose(const std::vector &shape, void *input_data, void *output_data, bool enable_fp16) { auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); ocl_runtime->Init(); - auto allocator = ocl_runtime->GetAllocator(); - int h = 64; - int w = 1; - int c = 7360; - size_t input_size; - std::string input_path = "./test_data/transpose/transpose_fp32_input.bin"; - auto input_data = reinterpret_cast(mindspore::lite::ReadFile(input_path.c_str(), &input_size)); - if (input_data == nullptr) { - MS_LOG(ERROR) << "input_data load error."; - return; + size_t dtype_size = sizeof(float); + if (enable_fp16) { + ocl_runtime->SetFp16Enable(true); + dtype_size = sizeof(float16_t); } + auto allocator = ocl_runtime->GetAllocator(); + int h = shape[0]; + int w = shape[1]; + int c = shape[2]; std::vector input_shape = {1, h, w, c}; - auto tensor_x_ptr = - std::make_unique(TypeId(kNumberTypeFloat32), input_shape, schema::Format_NHWC); + auto tensor_x_ptr = std::make_unique( + TypeId(enable_fp16 ? 
kNumberTypeFloat16 : kNumberTypeFloat32), input_shape, schema::Format_NHWC); auto tensor_x = tensor_x_ptr.get(); if (tensor_x == nullptr) { MS_LOG(ERROR) << "tensor_x create error."; return; } std::vector out_shape = {1, c, h, w}; - auto tensor_out_ptr = - std::make_unique(TypeId(kNumberTypeFloat32), out_shape, schema::Format_NCHW); + auto tensor_out_ptr = std::make_unique( + TypeId(enable_fp16 ? kNumberTypeFloat16 : kNumberTypeFloat32), out_shape, schema::Format_NCHW); auto tensor_out = tensor_out_ptr.get(); if (tensor_out == nullptr) { MS_LOG(ERROR) << "tensor_out create error."; @@ -78,9 +77,35 @@ TEST_F(TestTransposeOpenCL, TransposeFp32) { return; } pGraph->Init(); - memcpy(inputs[0]->Data(), input_data, input_size); + memcpy(inputs[0]->Data(), input_data, h * w * c * dtype_size); pGraph->Run(); + if (enable_fp16) { + CompareOutput(outputs[0]->Data(), output_data, h * w * c, static_cast(1e-3), 2e-2); + } else { + CompareOutput(outputs[0]->Data(), output_data, h * w * c, static_cast(1e-5)); + } + + inputs[0]->SetData(nullptr); + outputs[0]->SetData(nullptr); + + MS_LOG(INFO) << "Test TransposeFp32 passed"; + lite::opencl::OpenCLRuntime::DeleteInstance(); +} + +TEST_F(TestTransposeOpenCL, TransposeFp32) { + int h = 64; + int w = 1; + int c = 7360; + std::vector shape = {h, w, c}; + size_t input_size; + std::string input_path = "./test_data/transpose/transpose_fp32_input.bin"; + auto input_data = reinterpret_cast(mindspore::lite::ReadFile(input_path.c_str(), &input_size)); + if (input_data == nullptr) { + MS_LOG(ERROR) << "input_data load error."; + return; + } + size_t output_size; std::string output_path = "./test_data/transpose/transpose_fp32_output.bin"; auto correct_data = reinterpret_cast(mindspore::lite::ReadFile(output_path.c_str(), &output_size)); @@ -88,26 +113,17 @@ TEST_F(TestTransposeOpenCL, TransposeFp32) { MS_LOG(ERROR) << "correct_data create error."; return; } - printf("==================output data=================\n"); - float *output_data = reinterpret_cast(tensor_out->Data()); - std::cout << std::endl; - int size_n = h * w * c; - size_n = size_n > 100 ? 
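// RunTestTranspose checks an NHWC (1,h,w,c) -> NCHW (1,c,h,w) permutation.
// The TransposeFp16 expectation below ({0,3,6,9, 1,4,7,10, 2,5,8,11} for
// h=4, w=1, c=3) follows directly from this index map (sketch):
void TransposeNHWCToNCHWRef(const float *in, float *out, int h, int w, int c) {
  for (int ch = 0; ch < c; ++ch)
    for (int y = 0; y < h; ++y)
      for (int x = 0; x < w; ++x)
        out[(ch * h + y) * w + x] = in[(y * w + x) * c + ch];
}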
100 : size_n; - for (int i = 0; i < size_n; i++) { - std::cout << output_data[i] << " "; - if ((i + 1) % c == 0) { - std::cout << std::endl; - } - } - std::cout << std::endl; - - // compare - CompareOutputData(output_data, correct_data, h * w * c, 0.00001); + RunTestTranspose(shape, input_data, correct_data, false); +} - inputs[0]->SetData(nullptr); - outputs[0]->SetData(nullptr); - lite::opencl::OpenCLRuntime::DeleteInstance(); +TEST_F(TestTransposeOpenCL, TransposeFp16) { + int h = 4; + int w = 1; + int c = 3; + std::vector shape = {h, w, c}; + std::vector input_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f}; + std::vector output_data = {0.0f, 3.0f, 6.0f, 9.0f, 1.0f, 4.0f, 7.0f, 10.0f, 2.0f, 5.0f, 8.0f, 11.0f}; - MS_LOG(INFO) << "Test TransposeFp32 passed"; + RunTestTranspose(shape, input_data.data(), output_data.data(), true); } } // namespace mindspore diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/utils_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/utils_tests.cc index 5c02760ca0..57807c13c5 100644 --- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/utils_tests.cc +++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/utils_tests.cc @@ -35,34 +35,4 @@ void LoadTestData(void *dst, size_t dst_size, const std::string &file_path) { } } -void CompareOutput(lite::tensor::Tensor *output_tensor, const std::string &file_path) { - float *output_data = reinterpret_cast(output_tensor->Data()); - size_t output_size = output_tensor->Size(); - float *expect_data = reinterpret_cast(mindspore::lite::ReadFile(file_path.c_str(), &output_size)); - - printf("output[0:12]:"); - for (int i = 0; i < 12; i++) { - printf("[%d]:%.3f ", i, output_data[i]); - } - printf("\n"); - printf("expect[0:12]:"); - for (int i = 0; i < 12; i++) { - printf("[%d]:%.3f ", i, expect_data[i]); - } - printf("\n"); - - constexpr float atol = 1e-5; - for (int i = 0; i < output_tensor->ElementsNum(); ++i) { - if (std::fabs(output_data[i] - expect_data[i]) > atol) { - printf("error at idx[%d] expect=%.3f output=%.3f \n", i, expect_data[i], output_data[i]); - printf("error at idx[%d] expect=%.3f output=%.3f \n", i, expect_data[i], output_data[i]); - printf("error at idx[%d] expect=%.3f output=%.3f \n", i, expect_data[i], output_data[i]); - return; - } - } - printf("compare success!\n"); - printf("compare success!\n"); - printf("compare success!\n\n\n"); -} - } // namespace mindspore diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/utils_tests.h b/mindspore/lite/test/ut/src/runtime/kernel/opencl/utils_tests.h index 90038c2ab3..80c141c8ba 100644 --- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/utils_tests.h +++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/utils_tests.h @@ -29,7 +29,36 @@ namespace mindspore { void LoadTestData(void *dst, size_t dst_size, const std::string &file_path); -void CompareOutput(lite::tensor::Tensor *output_tensor, const std::string &file_path); +template +void CompareOutput(void *output, void *expect, size_t elem_num, T atol, float rtol = 1e-5) { + T *output_data = reinterpret_cast(output); + T *expect_data = reinterpret_cast(expect); + + printf("output[0:12]:"); + for (int i = 0; i < 12 && i < elem_num; i++) { + printf("[%d]:%.3f ", i, output_data[i]); + } + printf("\n"); + printf("expect[0:12]:"); + for (int i = 0; i < 12 && i < elem_num; i++) { + printf("[%d]:%.3f ", i, expect_data[i]); + } + printf("\n"); + for (int i = 0; i < elem_num; ++i) { + if (std::fabs(output_data[i] - expect_data[i]) > atol + rtol * 
std::fabs(expect_data[i])) { + printf("error at idx[%d] expect=%.3f output=%.3f \n", i, expect_data[i], output_data[i]); + return; + } + } + printf("compare success!\n"); +} + +template +void CompareOutput(lite::tensor::Tensor *output_tensor, const std::string &file_path, T atol, float rtol = 1e-5) { + size_t output_size; + auto expect_data = mindspore::lite::ReadFile(file_path.c_str(), &output_size); + CompareOutput(output_tensor->Data(), expect_data, output_tensor->ElementsNum(), atol, rtol); +} } // namespace mindspore diff --git a/mindspore/lite/test/ut/tools/converter/parser/tflite/tflite_activation_parser_test.cc b/mindspore/lite/test/ut/tools/converter/parser/tflite/tflite_activation_parser_test.cc index 84ec400efc..7505ff4c9a 100644 --- a/mindspore/lite/test/ut/tools/converter/parser/tflite/tflite_activation_parser_test.cc +++ b/mindspore/lite/test/ut/tools/converter/parser/tflite/tflite_activation_parser_test.cc @@ -119,15 +119,6 @@ TEST_F(TestTfliteParserPrelu, OpType) { ASSERT_NE(meta_graph, nullptr); ASSERT_GT(meta_graph->nodes.size(), 0); ASSERT_NE(meta_graph->nodes.front()->primitive.get(), nullptr); - ASSERT_EQ(meta_graph->nodes.front()->primitive->value.type, schema::PrimitiveType_Prelu) << "wrong Op Type"; -} - -TEST_F(TestTfliteParserPrelu, AttrValue) { - ASSERT_NE(meta_graph->nodes.front()->primitive->value.AsPrelu(), nullptr); - auto val = meta_graph->nodes.front()->primitive->value; - std::vector slope(20, 0); - ASSERT_EQ(val.AsPrelu()->slope, slope); - ASSERT_EQ(val.type, schema::PrimitiveType_Prelu); } class TestTfliteParserLeakyRelu : public TestTfliteParser { diff --git a/mindspore/lite/test/ut/tools/converter/parser/tflite/tflite_parsers_test_utils.cc b/mindspore/lite/test/ut/tools/converter/parser/tflite/tflite_parsers_test_utils.cc index 66ec6ad1ec..6f7bc265a4 100644 --- a/mindspore/lite/test/ut/tools/converter/parser/tflite/tflite_parsers_test_utils.cc +++ b/mindspore/lite/test/ut/tools/converter/parser/tflite/tflite_parsers_test_utils.cc @@ -23,7 +23,7 @@ namespace mindspore { schema::MetaGraphT *TestTfliteParser::LoadAndConvert(const string &model_path, const string &weight_path) { lite::TfliteModelParser parser; - meta_graph = parser.Parse(model_path, weight_path); + meta_graph = parser.ParseToFb(model_path, weight_path); if (meta_graph == nullptr) { MS_LOG(ERROR) << "Parse to metaGraph return nullptr"; return nullptr; diff --git a/mindspore/lite/tools/anf_exporter/anf_exporter.cc b/mindspore/lite/tools/anf_exporter/anf_exporter.cc index 6a7402688c..07f4178664 100644 --- a/mindspore/lite/tools/anf_exporter/anf_exporter.cc +++ b/mindspore/lite/tools/anf_exporter/anf_exporter.cc @@ -69,6 +69,10 @@ int AnfExporter::ConvertQuantParam(const std::unique_ptr &me // activation auto input_quant_params = primitive->GetInputQuantParams(); auto node_type = (schema::PrimitiveType)primitive->Type(); + if (input_quant_params.empty()) { + MS_LOG(ERROR) << "node: " << dst_node->name << " input quant params is empty"; + return RET_ERROR; + } for (size_t i = 0; i < input_quant_params.size(); i++) { if (i >= dst_node->inputIndex.size()) { MS_LOG(ERROR) << "node: " << dst_node->name << " input has " << input_quant_params.size() @@ -93,7 +97,10 @@ int AnfExporter::ConvertQuantParam(const std::unique_ptr &me auto tensor_output = meta_graph->allTensors[output_index].get(); auto output_quant_params = primitive->GetOutputQuantParams(); if (output_quant_params.empty()) { - MS_LOG(WARNING) << "node: " << dst_node->name << " output quant params is empty"; + if (node_type != 
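// The rewritten CompareOutput accepts |out - expect| <= atol + rtol * |expect|,
// so tiny outputs are judged by the absolute term and large ones by the
// relative term. A self-contained illustration of the criterion:
#include <cmath>
#include <cstdio>

bool AllClose(const float *a, const float *b, int n, float atol, float rtol) {
  for (int i = 0; i < n; ++i) {
    if (std::fabs(a[i] - b[i]) > atol + rtol * std::fabs(b[i])) return false;
  }
  return true;
}

int main() {
  float out[] = {1.001f, 100.5f}, expect[] = {1.0f, 100.0f};
  // With the fp16 settings used above (atol 1e-3, rtol 2e-2):
  // 0.001 <= 0.021 and 0.5 <= 2.001, so both elements pass.
  std::printf("%d\n", AllClose(out, expect, 2, 1e-3f, 2e-2f));
  return 0;
}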
schema::PrimitiveType_QuantDTypeCast) { + MS_LOG(ERROR) << "node: " << dst_node->name << " output quant params is empty"; + return RET_ERROR; + } } else { for (auto output_quant_param : output_quant_params[0]) { if (tensor_output->quantParams.empty()) { @@ -142,7 +149,10 @@ void AnfExporter::SetGraphoutputIndex(const CNodePtr &cnode, const std::unique_p MS_ASSERT(nullptr != return_node); for (size_t i = 1; i < cnode->inputs().size(); i++) { auto input_node = cnode->input(i); - if (input_node->isa()) { + if (input_node == nullptr) { + MS_LOG(ERROR) << "output node is nullptr"; + return; + } else if (input_node->isa()) { auto ret = ConvertInputCNode(input_node, return_node); if (ret != RET_OK) { MS_LOG(ERROR) << "obtain outputs failed"; @@ -158,24 +168,27 @@ void AnfExporter::SetGraphoutputIndex(const CNodePtr &cnode, const std::unique_p } } -schema::MetaGraphT *AnfExporter::Export(const FuncGraphPtr &func_graph) { +schema::MetaGraphT *AnfExporter::Export(const FuncGraphPtr &func_graph, bool keep_graph) { auto cnodes = func_graph->GetOrderedCnodes(); auto meta_graphT = std::make_unique(); for (const auto &cnode : cnodes) { - auto primitiveT_value = GetValueNode>(cnode->input(0)); - if (primitiveT_value == nullptr) { - MS_LOG(ERROR) << "PrimitiveT_value is nullptr"; + auto primitive_c = GetValueNode>(cnode->input(0)); + if (primitive_c == nullptr) { + MS_LOG(ERROR) << "primitive_c is nullptr"; return nullptr; } - auto primT = primitiveT_value->GetPrimitiveT(); - if (primitiveT_value->Type() == schema::PrimitiveType_TupleGetItem || - primitiveT_value->Type() == schema::PrimitiveType_MakeTuple) { + if (primitive_c->Type() == schema::PrimitiveType_TupleGetItem || + primitive_c->Type() == schema::PrimitiveType_MakeTuple) { continue; } RemoveIfMakeTuple(cnode); + auto primT = primitive_c->GetPrimitiveT(); auto node = std::make_unique(); - + if (node == nullptr) { + MS_LOG(ERROR) << "object failed to be constructed"; + return nullptr; + } if (primT->value.type == schema::PrimitiveType_Return) { node->name = "return_node"; SetGraphoutputIndex(cnode, meta_graphT, node.get()); @@ -190,12 +203,14 @@ schema::MetaGraphT *AnfExporter::Export(const FuncGraphPtr &func_graph) { return nullptr; } SetOpOutputNode(cnode, meta_graphT, node.get()); - ret = ConvertQuantParam(meta_graphT, primitiveT_value, node); + ret = ConvertQuantParam(meta_graphT, primitive_c, node); if (ret != RET_OK) { MS_LOG(ERROR) << "ConvertQuantParam failed"; return nullptr; } - + if (!keep_graph) { + primitive_c->ClearPrimitiveT(); + } meta_graphT->nodes.emplace_back(std::move(node)); } // set graph input tensors @@ -401,15 +416,15 @@ bool AnfExporter::IsPrimitiveCNode(const AnfNodePtr &node, schema::PrimitiveType return false; } - const auto &prim = GetValueNode>(cnode->input(0)); + auto prim = GetValueNode>(cnode->input(0)); if (prim == nullptr) { return false; } - return (schema::PrimitiveType)prim->Type() == type; + return (schema::PrimitiveType)(prim->Type()) == type; } -schema::MetaGraphT *Export(const FuncGraphPtr &func_graph) { +schema::MetaGraphT *Export(const FuncGraphPtr &func_graph, bool keep_graph) { AnfExporter anf_exporter; - return anf_exporter.Export(func_graph); + return anf_exporter.Export(func_graph, keep_graph); } } // namespace mindspore::lite diff --git a/mindspore/lite/tools/anf_exporter/anf_exporter.h b/mindspore/lite/tools/anf_exporter/anf_exporter.h index 607d35ce52..42dcec36a6 100644 --- a/mindspore/lite/tools/anf_exporter/anf_exporter.h +++ b/mindspore/lite/tools/anf_exporter/anf_exporter.h @@ -30,7 +30,7 @@ 
class AnfExporter { public: AnfExporter() = default; virtual ~AnfExporter() = default; - schema::MetaGraphT *Export(const FuncGraphPtr &func_graph); + schema::MetaGraphT *Export(const FuncGraphPtr &func_graph, bool keep_graph = false); void SetOpOutputNode(const CNodePtr &cnode, const std::unique_ptr &meta_graphT, schema::CNodeT *fb_node); int SetOpInputNode(const CNodePtr &cnode, const std::unique_ptr &meta_graphT, @@ -55,6 +55,6 @@ class AnfExporter { std::vector graph_input_nodes_; }; -schema::MetaGraphT *Export(const FuncGraphPtr &func_graph); +schema::MetaGraphT *Export(const FuncGraphPtr &func_graph, bool keep_graph = false); } // namespace mindspore::lite #endif // MINDSPORE_LITE_SRC_ANF_EXPORTER_ANF_EXPORTER_H_ diff --git a/mindspore/lite/tools/anf_importer/import_from_protobuf.cc b/mindspore/lite/tools/anf_importer/import_from_protobuf.cc index e033296d21..85f8465c49 100644 --- a/mindspore/lite/tools/anf_importer/import_from_protobuf.cc +++ b/mindspore/lite/tools/anf_importer/import_from_protobuf.cc @@ -27,7 +27,6 @@ #include #include "src/ops/primitive_c.h" #include "frontend/operator/ops.h" -#include "google/protobuf/io/zero_copy_stream_impl.h" #include "include/errorcode.h" #include "ir/anf.h" #include "ir/func_graph.h" @@ -37,6 +36,7 @@ #include "src/param_value_lite.h" #include "tools/converter/parser/onnx/onnx.pb.h" #include "utils/log_adapter.h" +#include "tools/common/protobuf_utils.h" using string = std::string; using int32 = int32_t; @@ -651,31 +651,11 @@ int AnfImporterFromProtobuf::Import(const schema::QuantType &quantType) { } onnx::ModelProto *AnfImporterFromProtobuf::ReadOnnxFromBinary(const std::string &model_path) { - std::unique_ptr onnx_file(new (std::nothrow) char[PATH_MAX]{0}); -#ifdef _WIN32 - if (_fullpath(onnx_file.get(), model_path.c_str(), 1024) == nullptr) { - MS_LOG(ERROR) << "open file failed."; - return nullptr; - } -#else - if (realpath(model_path.c_str(), onnx_file.get()) == nullptr) { - MS_LOG(ERROR) << "open file failed."; - return nullptr; - } -#endif - int fd = open(onnx_file.get(), O_RDONLY); - google::protobuf::io::FileInputStream input(fd); - google::protobuf::io::CodedInputStream code_input(&input); - code_input.SetTotalBytesLimit(INT_MAX, 536870912); auto onnx_model = new onnx::ModelProto; - bool ret = onnx_model->ParseFromCodedStream(&code_input); - if (!ret) { - MS_LOG(ERROR) << "load onnx file failed"; - delete onnx_model; + if (ReadProtoFromBinaryFile((const char *)model_path.c_str(), onnx_model) != RET_OK) { + MS_LOG(ERROR) << "Read onnx model file failed, model path: " << model_path; return nullptr; } - (void)close(fd); - MS_LOG(INFO) << "enter ReadProtoFromBinary success!" 
<< std::endl; return onnx_model; } diff --git a/mindspore/lite/tools/benchmark/CMakeLists.txt b/mindspore/lite/tools/benchmark/CMakeLists.txt index 8151e126ed..b369b7d1df 100644 --- a/mindspore/lite/tools/benchmark/CMakeLists.txt +++ b/mindspore/lite/tools/benchmark/CMakeLists.txt @@ -17,8 +17,8 @@ else() endif() if (PLATFORM_ARM32 OR PLATFORM_ARM64) install(TARGETS benchmark - RUNTIME DESTINATION ${MAIN_DIR}/benchmark COMPONENT ${COMPONENT_NAME}) + RUNTIME DESTINATION ${MAIN_DIR}-${COMPONENT_NAME}/benchmark COMPONENT ${COMPONENT_NAME}) else() install(TARGETS benchmark - RUNTIME DESTINATION ${MAIN_DIR}/benchmark COMPONENT ${RUN_X86_COMPONENT_NAME}) + RUNTIME DESTINATION ${MAIN_DIR}-${RUN_X86_COMPONENT_NAME}/benchmark COMPONENT ${RUN_X86_COMPONENT_NAME}) endif() diff --git a/mindspore/lite/tools/benchmark/benchmark.cc b/mindspore/lite/tools/benchmark/benchmark.cc index ada79393ef..a9a39de736 100644 --- a/mindspore/lite/tools/benchmark/benchmark.cc +++ b/mindspore/lite/tools/benchmark/benchmark.cc @@ -25,6 +25,7 @@ #include "src/common/common.h" #include "include/ms_tensor.h" #include "include/context.h" +#include "src/runtime/runtime_api.h" namespace mindspore { namespace lite { @@ -107,7 +108,7 @@ int Benchmark::ReadInputFile() { } auto inputData = cur_tensor->MutableData(); memcpy(inputData, binBuf, tensorDataSize); - delete binBuf; + delete[](binBuf); } } return RET_OK; @@ -190,7 +191,7 @@ float Benchmark::CompareData(const std::string &nodeName, std::vector msSha } oss << ") are different"; std::cerr << oss.str() << std::endl; - MS_LOG(ERROR) << "%s", oss.str().c_str(); + MS_LOG(ERROR) << oss.str().c_str(); return RET_ERROR; } size_t errorCount = 0; @@ -239,14 +240,16 @@ int Benchmark::CompareOutput() { bool hasError = false; for (const auto &calibTensor : calibData) { std::string nodeName = calibTensor.first; - auto tensors = session->GetOutputsByName(nodeName); + auto tensors = session->GetOutputsByNodeName(nodeName); if (tensors.empty()) { MS_LOG(ERROR) << "Cannot find output node: " << nodeName.c_str() << " , compare output data fail."; + std::cerr << "Cannot find output node: " << nodeName.c_str() << " , compare output data fail." << std::endl; return RET_ERROR; } // make sure tensor size is 1 if (tensors.size() != 1) { MS_LOG(ERROR) << "Only support 1 tensor with a name now."; + std::cerr << "Only support 1 tensor with a name now." << std::endl; return RET_ERROR; } auto &tensor = tensors.front(); @@ -274,13 +277,15 @@ int Benchmark::CompareOutput() { std::cout << "=======================================================" << std::endl << std::endl; if (meanBias > this->_flags->accuracyThreshold) { - MS_LOG(ERROR) << "Mean bias of all nodes is too big: " << meanBias << "%%"; + MS_LOG(ERROR) << "Mean bias of all nodes is too big: " << meanBias << "%"; + std::cerr << "Mean bias of all nodes is too big: " << meanBias << "%" << std::endl; return RET_ERROR; } else { return RET_OK; } } else { MS_LOG(ERROR) << "Error in CompareData"; + std::cerr << "Error in CompareData" << std::endl; std::cout << "=======================================================" << std::endl << std::endl; return RET_ERROR; } @@ -288,15 +293,18 @@ int Benchmark::CompareOutput() { int Benchmark::MarkPerformance() { MS_LOG(INFO) << "Running warm up loops..."; + std::cout << "Running warm up loops..." 
<< std::endl;
   for (int i = 0; i < _flags->warmUpLoopCount; i++) {
     auto status = session->RunGraph();
     if (status != 0) {
-      MS_LOG(ERROR) << "Inference error %d" << status;
+      MS_LOG(ERROR) << "Inference error " << status;
+      std::cerr << "Inference error " << status << std::endl;
       return status;
     }
   }

   MS_LOG(INFO) << "Running benchmark loops...";
+  std::cout << "Running benchmark loops..." << std::endl;
   uint64_t timeMin = 1000000;
   uint64_t timeMax = 0;
   uint64_t timeAvg = 0;
@@ -306,7 +314,8 @@ int Benchmark::MarkPerformance() {
     auto start = GetTimeUs();
     auto status = session->RunGraph();
     if (status != 0) {
-      MS_LOG(ERROR) << "Inference error %d" << status;
+      MS_LOG(ERROR) << "Inference error " << status;
+      std::cerr << "Inference error " << status << std::endl;
       return status;
     }

@@ -332,6 +341,7 @@ int Benchmark::MarkAccuracy() {
   MS_LOG(INFO) << "MarkAccuracy";
+  std::cout << "MarkAccuracy" << std::endl;
   for (size_t i = 0; i < msInputs.size(); i++) {
     MS_ASSERT(msInputs.at(i) != nullptr);
     MS_ASSERT(msInputs.at(i)->data_type() == TypeId::kNumberTypeFloat32);
@@ -345,18 +355,21 @@ int Benchmark::MarkAccuracy() {
   auto status = session->RunGraph();
   if (status != RET_OK) {
     MS_LOG(ERROR) << "Inference error " << status;
+    std::cerr << "Inference error " << status << std::endl;
     return status;
   }

   status = ReadCalibData();
   if (status != RET_OK) {
     MS_LOG(ERROR) << "Read calib data error " << status;
+    std::cerr << "Read calib data error " << status << std::endl;
     return status;
   }

   status = CompareOutput();
   if (status != RET_OK) {
     MS_LOG(ERROR) << "Compare output error " << status;
+    std::cerr << "Compare output error " << status << std::endl;
     return status;
   }
   return RET_OK;
@@ -368,22 +381,26 @@ int Benchmark::RunBenchmark(const std::string &deviceType) {
   std::string modelName = _flags->modelPath.substr(_flags->modelPath.find_last_of(DELIM_SLASH) + 1);

   MS_LOG(INFO) << "start reading model file";
+  std::cout << "start reading model file" << std::endl;
   size_t size = 0;
   char *graphBuf = ReadFile(_flags->modelPath.c_str(), &size);
   if (graphBuf == nullptr) {
-    MS_LOG(ERROR) << "Read model file failed while running %s", modelName.c_str();
+    MS_LOG(ERROR) << "Read model file failed while running " << modelName.c_str();
+    std::cerr << "Read model file failed while running " << modelName.c_str() << std::endl;
     return RET_ERROR;
   }
   auto model = lite::Model::Import(graphBuf, size);
   if (model == nullptr) {
-    MS_LOG(ERROR) << "Import model file failed while running %s", modelName.c_str();
+    MS_LOG(ERROR) << "Import model file failed while running " << modelName.c_str();
+    std::cerr << "Import model file failed while running " << modelName.c_str() << std::endl;
     delete[](graphBuf);
     return RET_ERROR;
   }
   delete[](graphBuf);
   auto context = new (std::nothrow) lite::Context;
   if (context == nullptr) {
-    MS_LOG(ERROR) << "New context failed while running %s", modelName.c_str();
+    MS_LOG(ERROR) << "New context failed while running " << modelName.c_str();
+    std::cerr << "New context failed while running " << modelName.c_str() << std::endl;
     return RET_ERROR;
   }
   if (_flags->device == "CPU") {
@@ -406,12 +423,14 @@ int Benchmark::RunBenchmark(const std::string &deviceType) {
   session = session::LiteSession::CreateSession(context);
   delete (context);
   if (session == nullptr) {
-    MS_LOG(ERROR) << "CreateSession failed while running %s", modelName.c_str();
+    MS_LOG(ERROR) << "CreateSession failed while running " << modelName.c_str();
+    std::cerr << "CreateSession failed while running " << modelName.c_str() << std::endl;
     return RET_ERROR;
   }
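// ---------------------------------------------------------------------------
// Illustrative sketch only, not part of the patch: the calls RunBenchmark
// strings together above are the standard MindSpore Lite load/compile/run
// sequence. Condensed into one helper with the same ownership rules the
// function follows (model buffer freed right after Import, context freed
// after CreateSession). Assumes benchmark.cc's namespaces and its ReadFile
// helper.
int RunModelOnce(const std::string &modelPath) {
  size_t size = 0;
  char *graphBuf = ReadFile(modelPath.c_str(), &size);
  if (graphBuf == nullptr) {
    return RET_ERROR;
  }
  auto model = lite::Model::Import(graphBuf, size);
  delete[](graphBuf);  // Import copies the flatbuffer, the raw buffer can go
  if (model == nullptr) {
    return RET_ERROR;
  }
  auto context = new (std::nothrow) lite::Context;
  if (context == nullptr) {
    delete (model);
    return RET_ERROR;
  }
  auto session = session::LiteSession::CreateSession(context);
  delete (context);  // the session keeps what it needs from the context
  if (session == nullptr) {
    delete (model);
    return RET_ERROR;
  }
  auto ret = session->CompileGraph(model);
  if (ret == RET_OK) {
    ret = session->RunGraph();
  }
  delete (session);
  delete (model);
  return ret;
}
// ---------------------------------------------------------------------------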
  auto ret = session->CompileGraph(model);
   if (ret != RET_OK) {
-    MS_LOG(ERROR) << "CompileGraph failed while running %s", modelName.c_str();
+    MS_LOG(ERROR) << "CompileGraph failed while running " << modelName.c_str();
+    std::cerr << "CompileGraph failed while running " << modelName.c_str() << std::endl;
     delete (session);
     delete (model);
     return ret;
@@ -437,8 +456,15 @@
   }
   if (!_flags->calibDataPath.empty()) {
     status = MarkAccuracy();
+    for (auto &data : calibData) {
+      data.second->shape.clear();
+      data.second->data.clear();
+      delete data.second;
+    }
+    calibData.clear();
     if (status != 0) {
-      MS_LOG(ERROR) << "Run MarkAccuracy error: %d" << status;
+      MS_LOG(ERROR) << "Run MarkAccuracy error: " << status;
+      std::cerr << "Run MarkAccuracy error: " << status << std::endl;
       delete (session);
       delete (model);
       return status;
@@ -446,22 +472,13 @@
   } else {
     status = MarkPerformance();
     if (status != 0) {
-      MS_LOG(ERROR) << "Run MarkPerformance error: %d" << status;
+      MS_LOG(ERROR) << "Run MarkPerformance error: " << status;
+      std::cerr << "Run MarkPerformance error: " << status << std::endl;
       delete (session);
       delete (model);
       return status;
     }
   }
-
-  if (cleanData) {
-    for (auto &data : calibData) {
-      data.second->shape.clear();
-      data.second->data.clear();
-      delete data.second;
-    }
-    calibData.clear();
-  }
-
   delete (session);
   delete (model);
   return RET_OK;
@@ -515,32 +532,45 @@ int Benchmark::Init() {
   if (this->_flags->loopCount < 1) {
     MS_LOG(ERROR) << "LoopCount:" << this->_flags->loopCount << " must be greater than 0";
+    std::cerr << "LoopCount:" << this->_flags->loopCount << " must be greater than 0" << std::endl;
     return RET_ERROR;
   }

   if (this->_flags->numThreads < 1) {
     MS_LOG(ERROR) << "numThreads:" << this->_flags->numThreads << " must be greater than 0";
+    std::cerr << "numThreads:" << this->_flags->numThreads << " must be greater than 0" << std::endl;
     return RET_ERROR;
   }

   if (this->_flags->cpuBindMode == -1) {
     MS_LOG(INFO) << "cpuBindMode = MID_CPU";
+    std::cout << "cpuBindMode = MID_CPU" << std::endl;
   } else if (this->_flags->cpuBindMode == 1) {
     MS_LOG(INFO) << "cpuBindMode = HIGHER_CPU";
+    std::cout << "cpuBindMode = HIGHER_CPU" << std::endl;
   } else {
     MS_LOG(INFO) << "cpuBindMode = NO_BIND";
+    std::cout << "cpuBindMode = NO_BIND" << std::endl;
   }

   this->_flags->inDataType = this->_flags->inDataTypeIn == "img" ? kImage : kBinary;

   if (_flags->modelPath.empty()) {
     MS_LOG(ERROR) << "modelPath is required";
+    std::cerr << "modelPath is required" << std::endl;
     return 1;
   }
   _flags->InitInputDataList();
   _flags->InitResizeDimsList();
   if (!_flags->resizeDims.empty() && _flags->resizeDims.size() != _flags->input_data_list.size()) {
     MS_LOG(ERROR) << "Size of input resizeDims should be equal to size of input inDataPath";
+    std::cerr << "Size of input resizeDims should be equal to size of input inDataPath" << std::endl;
+    return RET_ERROR;
+  }
+
+  if (_flags->device != "CPU" && _flags->device != "GPU") {
+    MS_LOG(ERROR) << "Device type:" << _flags->device << " is not supported.";
+    std::cerr << "Device type:" << _flags->device << " is not supported."
<< std::endl; return RET_ERROR; } @@ -573,23 +603,32 @@ int RunBenchmark(int argc, const char **argv) { auto status = mBenchmark.Init(); if (status != 0) { MS_LOG(ERROR) << "Benchmark init Error : " << status; + std::cerr << "Benchmark init Error : " << status << std::endl; return RET_ERROR; } - if (flags.device == "NPU") { - status = mBenchmark.RunBenchmark("NPU"); - } else { + if (flags.device == "GPU") { + status = mBenchmark.RunBenchmark("GPU"); + } else if (flags.device == "CPU") { status = mBenchmark.RunBenchmark("CPU"); + } else { + MS_LOG(ERROR) << "Device type" << flags.device << " not support."; + std::cerr << "Device type" << flags.device << " not support." << std::endl; + return RET_ERROR; } if (status != 0) { MS_LOG(ERROR) << "Run Benchmark " << flags.modelPath.substr(flags.modelPath.find_last_of(DELIM_SLASH) + 1).c_str() << " Failed : " << status; + std::cerr << "Run Benchmark " << flags.modelPath.substr(flags.modelPath.find_last_of(DELIM_SLASH) + 1).c_str() + << " Failed : " << status << std::endl; return RET_ERROR; } MS_LOG(INFO) << "Run Benchmark " << flags.modelPath.substr(flags.modelPath.find_last_of(DELIM_SLASH) + 1).c_str() << " Success."; + std::cout << "Run Benchmark " << flags.modelPath.substr(flags.modelPath.find_last_of(DELIM_SLASH) + 1).c_str() + << " Success." << std::endl; return RET_OK; } } // namespace lite diff --git a/mindspore/lite/tools/benchmark/benchmark.h b/mindspore/lite/tools/benchmark/benchmark.h index 7aeaee5598..0df9f5424b 100644 --- a/mindspore/lite/tools/benchmark/benchmark.h +++ b/mindspore/lite/tools/benchmark/benchmark.h @@ -56,7 +56,7 @@ class MS_API BenchmarkFlags : public virtual FlagParser { AddFlag(&BenchmarkFlags::inDataPath, "inDataPath", "Input data path, if not set, use random input", ""); AddFlag(&BenchmarkFlags::inDataTypeIn, "inDataType", "Input data type. 
img | bin", "bin"); AddFlag(&BenchmarkFlags::omModelPath, "omModelPath", "OM model path, only required when device is NPU", ""); - AddFlag(&BenchmarkFlags::device, "device", "CPU | NPU | GPU", "CPU"); + AddFlag(&BenchmarkFlags::device, "device", "CPU | GPU", "CPU"); AddFlag(&BenchmarkFlags::cpuBindMode, "cpuBindMode", "Input -1 for MID_CPU, 1 for HIGHER_CPU, 0 for NO_BIND, defalut value: 1", 1); // MarkPerformance @@ -138,7 +138,6 @@ class MS_API Benchmark { std::vector msInputs; std::unordered_map> msOutputs; std::unordered_map calibData; - bool cleanData = true; }; int MS_API RunBenchmark(int argc, const char **argv); diff --git a/mindspore/lite/tools/common/graph_util.cc b/mindspore/lite/tools/common/graph_util.cc index 71bd873582..1d2f13fda1 100644 --- a/mindspore/lite/tools/common/graph_util.cc +++ b/mindspore/lite/tools/common/graph_util.cc @@ -35,7 +35,7 @@ OpDefCopyer GetSimpleOpCopyer() { newCNode->quantType = inCNode->quantType; newCNode->primitive = std::make_unique(); newCNode->primitive->value.type = inCNode->primitive->value.type; - return std::move(newCNode); + return newCNode; }; } @@ -96,7 +96,7 @@ std::vector GetLinkedPreIdx(const schema::MetaGraphT &graphT, const size preNodeIdx.emplace_back(i); } } - return std::move(preNodeIdx); + return preNodeIdx; } std::vector GetLinkedPostIdx(const schema::MetaGraphT &graphT, const size_t &tensorIdx) { @@ -111,7 +111,7 @@ std::vector GetLinkedPostIdx(const schema::MetaGraphT &graphT, const siz postNodeIdx.emplace_back(i); } } - return std::move(postNodeIdx); + return postNodeIdx; } STATUS IsolateNode(schema::MetaGraphT *graphT, CNodeT *node) { @@ -587,79 +587,5 @@ std::string GetModelName(const std::string &modelFile) { return modelName; } - -OpGraphT *OpGraphT::Build(const schema::MetaGraphT *subGraphDef) { - if (subGraphDef == nullptr) { - MS_LOG(ERROR) << "subGraphDef is nullptr"; - return nullptr; - } - auto graph = std::unique_ptr(new OpGraphT()); - if (graph == nullptr) { - MS_LOG(ERROR) << "malloc opgraph failed"; - return nullptr; - } - - auto &opDefs = subGraphDef->nodes; - - for (auto &opDef : opDefs) { - auto ret = graph->AddEdge(opDef.get(), &opDefs); - if (ret != RET_OK) { - MS_LOG(ERROR) << opDef->name.c_str() << " add edge failed. 
ret: " << ret; - return nullptr; - } - } - - return graph.release(); -} - -int OpGraphT::AddEdge(const schema::CNodeT *srcNodeDef, const std::vector> *nodeDefs) { - MS_ASSERT(srcNodeDef != nullptr); - MS_ASSERT(nodeDefs != nullptr); - NODE_ID srcId = std::string(srcNodeDef->name); - // for single op condition - AddNode(srcId); - for (auto index : srcNodeDef->outputIndex) { - for (auto &dstNodeDef : *nodeDefs) { - bool find = false; - auto inputIndex = dstNodeDef->inputIndex; - if (std::any_of(inputIndex.begin(), inputIndex.end(), [&index](size_t i) { return i == index; })) { - find = true; - } - - if (!find) { - continue; - } - NODE_ID dstId = std::string(dstNodeDef->name.c_str()); - auto ret = AddEdge(srcId, dstId); - if (ret != RET_OK) { - return ret; - } - } - } - return RET_OK; -} - -int OpGraphT::AddEdge(NODE_ID srcId, NODE_ID dstId) { - auto srcNode = AddNode(srcId); - if (srcNode == nullptr) { - MS_LOG(ERROR) << "add srcNode failed"; - return RET_ERROR; - } - srcNode->AddOutEdge(dstId); - auto dstNode = AddNode(dstId); - if (dstNode == nullptr) { - MS_LOG(ERROR) << "add dstNode failed"; - return RET_ERROR; - } - dstNode->AddInEdge(srcId); - return RET_OK; -} - -OpGraphT::~OpGraphT() { - for (auto iter : nodes) { - delete iter.second; - } - nodes.clear(); -} } // namespace lite } // namespace mindspore diff --git a/mindspore/lite/tools/common/graph_util.h b/mindspore/lite/tools/common/graph_util.h index ce40fd1ac7..2aa53b75ff 100644 --- a/mindspore/lite/tools/common/graph_util.h +++ b/mindspore/lite/tools/common/graph_util.h @@ -88,17 +88,6 @@ NodeIter InsertNodeAfter(schema::MetaGraphT *graphT, NodeIter existNodeIter, siz STATUS ValidateFileStr(const std::string &modelFile, std::string fileType); std::string GetModelName(const std::string &modelFile); - -class OpGraphT : public OpGraph { - public: - OpGraphT() {} - ~OpGraphT(); - static OpGraphT *Build(const schema::MetaGraphT *subGraphDef); - - private: - int AddEdge(NODE_ID srcId, NODE_ID dstId); - int AddEdge(const schema::CNodeT *srcNodeDef, const std::vector> *nodeDefs); -}; } // namespace lite } // namespace mindspore diff --git a/mindspore/lite/tools/common/node_util.cc b/mindspore/lite/tools/common/node_util.cc index f4efe2dc5e..e112b30733 100644 --- a/mindspore/lite/tools/common/node_util.cc +++ b/mindspore/lite/tools/common/node_util.cc @@ -29,7 +29,7 @@ static const std::vector nhwcOpList = { schema::PrimitiveType_DepthwiseConv2D, schema::PrimitiveType_DeDepthwiseConv2D, schema::PrimitiveType_Pooling, schema::PrimitiveType_Resize, schema::PrimitiveType_BatchNorm, schema::PrimitiveType_FusedBatchNorm, - schema::PrimitiveType_CaffePReLU}; + schema::PrimitiveType_PReLU}; static const std::vector fp32FullOpList = { schema::PrimitiveType_Concat, schema::PrimitiveType_Add, diff --git a/mindspore/lite/tools/common/node_util.h b/mindspore/lite/tools/common/node_util.h index 7681405454..6f619b3a24 100644 --- a/mindspore/lite/tools/common/node_util.h +++ b/mindspore/lite/tools/common/node_util.h @@ -89,7 +89,7 @@ static STATUS TransFilterData(schema::TensorT *tensor, kTransFilterType type, in MS_LOG(ERROR) << "Dim size invalid"; return RET_ERROR; } - std::unique_ptr buf(new (std::nothrow) T[count]); + std::unique_ptr buf(new (std::nothrow) T[count]); if (buf == nullptr) { MS_LOG(ERROR) << "new buf failed"; return RET_ERROR; diff --git a/mindspore/lite/tools/converter/parser/caffe/caffe_parse_utils.cc b/mindspore/lite/tools/common/protobuf_utils.cc similarity index 88% rename from 
mindspore/lite/tools/converter/parser/caffe/caffe_parse_utils.cc rename to mindspore/lite/tools/common/protobuf_utils.cc index b1222376f6..4023845d8f 100644 --- a/mindspore/lite/tools/converter/parser/caffe/caffe_parse_utils.cc +++ b/mindspore/lite/tools/common/protobuf_utils.cc @@ -14,7 +14,7 @@ * limitations under the License. */ -#include "mindspore/lite/tools/converter/parser/caffe/caffe_parse_utils.h" +#include "tools/common/protobuf_utils.h" #include #include #include "google/protobuf/io/zero_copy_stream_impl.h" @@ -37,15 +37,14 @@ bool ReadProtoFromCodedInputStream(google::protobuf::io::CodedInputStream *coded return proto->ParseFromCodedStream(coded_stream); } -STATUS ReadProtoFromText(const char *file, - google::protobuf::Message *message) { +STATUS ReadProtoFromText(const char *file, google::protobuf::Message *message) { if (file == nullptr || message == nullptr) { return RET_ERROR; } std::string realPath = RealPath(file); if (realPath.empty()) { - MS_LOG(ERROR) << "Proto file path " << file <<" is not valid"; + MS_LOG(ERROR) << "Proto file path " << file << " is not valid"; return RET_ERROR; } @@ -67,8 +66,7 @@ STATUS ReadProtoFromText(const char *file, return RET_OK; } -STATUS ReadProtoFromBinaryFile(const char *file, - google::protobuf::Message *message) { +STATUS ReadProtoFromBinaryFile(const char *file, google::protobuf::Message *message) { if (file == nullptr || message == nullptr) { return RET_ERROR; } @@ -100,4 +98,3 @@ STATUS ReadProtoFromBinaryFile(const char *file, } } // namespace lite } // namespace mindspore - diff --git a/mindspore/lite/tools/converter/parser/caffe/caffe_parse_utils.h b/mindspore/lite/tools/common/protobuf_utils.h similarity index 85% rename from mindspore/lite/tools/converter/parser/caffe/caffe_parse_utils.h rename to mindspore/lite/tools/common/protobuf_utils.h index 4cb6ff6b3b..501a34a142 100644 --- a/mindspore/lite/tools/converter/parser/caffe/caffe_parse_utils.h +++ b/mindspore/lite/tools/common/protobuf_utils.h @@ -29,13 +29,10 @@ namespace lite { bool ReadProtoFromCodedInputStream(google::protobuf::io::CodedInputStream *coded_stream, google::protobuf::Message *proto); -STATUS ReadProtoFromText(const char *file, - google::protobuf::Message *message); +STATUS ReadProtoFromText(const char *file, google::protobuf::Message *message); -STATUS ReadProtoFromBinaryFile(const char *file, - google::protobuf::Message *message); +STATUS ReadProtoFromBinaryFile(const char *file, google::protobuf::Message *message); } // namespace lite } // namespace mindspore #endif // MINDSPORE_LITE_TOOLS_CONVERTER_PARSER_CAFFE_CAFFE_PARSE_UTILS_H_ - diff --git a/mindspore/lite/tools/common/tensor_util.cc b/mindspore/lite/tools/common/tensor_util.cc index f0dfa304ed..f9e5838a2b 100644 --- a/mindspore/lite/tools/common/tensor_util.cc +++ b/mindspore/lite/tools/common/tensor_util.cc @@ -24,7 +24,7 @@ std::unique_ptr GetTensorQuantParam(const std::unique_ptr MS_ASSERT(tensor != nullptr); auto &quantParams = tensor->quantParams; if (!quantParams.empty()) { - return std::move(CopyQuantParamT(quantParams.front())); + return CopyQuantParamT(quantParams.front()); } else { return nullptr; } @@ -39,7 +39,7 @@ std::unique_ptr CopyQuantParamT(const std::unique_ptrmax = srcQuantParam->max; dstQuantParam->narrowRange = srcQuantParam->narrowRange; dstQuantParam->numBits = srcQuantParam->numBits; - return std::move(dstQuantParam); + return dstQuantParam; } size_t GetElementSize(const TensorT &tensor) { return GetElementSize(TypeId(tensor.dataType)); } @@ -75,7 +75,7 @@ size_t 
GetShapeSize(const TensorT &tensor) { std::unique_ptr CopyTensorDefT(const std::unique_ptr &oldTensor) { auto newTensor = std::unique_ptr(new (std::nothrow) TensorT); if (newTensor == nullptr) { - // MS_LOG(ERROR)("new TensorT failed"); + MS_LOG(ERROR) << "new TensorT failed"; return nullptr; } newTensor->dims = oldTensor->dims; @@ -85,9 +85,9 @@ std::unique_ptr CopyTensorDefT(const std::unique_ptr &oldTenso newTensor->nodeType = oldTensor->nodeType; newTensor->data = oldTensor->data; if (!oldTensor->quantParams.empty()) { - newTensor->quantParams.emplace_back(std::move(GetTensorQuantParam(oldTensor))); + newTensor->quantParams.emplace_back(GetTensorQuantParam(oldTensor)); } - return std::move(newTensor); + return newTensor; } size_t GetRefCount(MetaGraphT *graphT, uint32_t tensorIdx) { diff --git a/mindspore/lite/tools/converter/CMakeLists.txt b/mindspore/lite/tools/converter/CMakeLists.txt index d98b37a6da..0f9d2b09ec 100644 --- a/mindspore/lite/tools/converter/CMakeLists.txt +++ b/mindspore/lite/tools/converter/CMakeLists.txt @@ -1,6 +1,6 @@ add_definitions(-DPRIMITIVE_WRITEABLE) -set(ANF_SRC - ${ANF_SRC} +set(CORE_SRC + ${CORE_SRC} #core / abstract ${CMAKE_CURRENT_SOURCE_DIR}/../../../core/abstract/abstract_function.cc ${CMAKE_CURRENT_SOURCE_DIR}/../../../core/abstract/analysis_context.cc @@ -29,6 +29,7 @@ set(ANF_SRC ${CMAKE_CURRENT_SOURCE_DIR}/../../../core/ir/scope.cc ${CMAKE_CURRENT_SOURCE_DIR}/../../../core/ir/value.cc ${CMAKE_CURRENT_SOURCE_DIR}/../../../core/ir/value_extends.cc + ${CMAKE_CURRENT_SOURCE_DIR}/../../../core/ir/dtype/ref.cc ${CMAKE_CURRENT_SOURCE_DIR}/../../../core/ir/dtype/container.cc ${CMAKE_CURRENT_SOURCE_DIR}/../../../core/ir/dtype/empty.cc ${CMAKE_CURRENT_SOURCE_DIR}/../../../core/ir/dtype/number.cc @@ -45,6 +46,8 @@ set(ANF_SRC ${CMAKE_CURRENT_SOURCE_DIR}/../../../core/utils/info.cc ${CMAKE_CURRENT_SOURCE_DIR}/../../../core/utils/profile.cc ${CMAKE_CURRENT_SOURCE_DIR}/../../../core/utils/ms_context.cc + ) +set(CCSRC_SRC ## ccsrc ${CCSRC_DIR}/debug/draw.cc ${CCSRC_DIR}/pybind_api/export_flags.cc @@ -53,8 +56,7 @@ set(ANF_SRC ${CCSRC_DIR}/backend/optimizer/common/pattern_engine.cc ${CCSRC_DIR}/backend/optimizer/common/visit.cc ${CCSRC_DIR}/backend/optimizer/common/optimizer.cc - ${CMAKE_CURRENT_SOURCE_DIR}/../../src/common/graph_utils_extends.cc - ) +) if (WIN32) set(LITE_SRC @@ -66,7 +68,7 @@ if (WIN32) ${CMAKE_CURRENT_SOURCE_DIR}/../../src/kernel_registry.cc ${CMAKE_CURRENT_SOURCE_DIR}/../../src/common/graph_util.cc ${CMAKE_CURRENT_SOURCE_DIR}/../../src/runtime/runtime_api.cc - ${CMAKE_CURRENT_SOURCE_DIR}/../../src/runtime/thread_pool.cc + ${CMAKE_CURRENT_SOURCE_DIR}/../../src/runtime/thread_pool.c ${CMAKE_CURRENT_SOURCE_DIR}/../../src/runtime/workspace_pool.cc ${CMAKE_CURRENT_SOURCE_DIR}/../../src/runtime/allocator.cc ${CMAKE_CURRENT_SOURCE_DIR}/../../src/executor.cc @@ -91,6 +93,7 @@ file(GLOB_RECURSE CONVERTER_SRC RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/../common/graph_util.cc ${CMAKE_CURRENT_SOURCE_DIR}/../common/node_util.cc ${CMAKE_CURRENT_SOURCE_DIR}/../common/tensor_util.cc + ${CMAKE_CURRENT_SOURCE_DIR}/../common/protobuf_utils.cc ${CMAKE_CURRENT_SOURCE_DIR}/../common/flag_parser.cc ${CMAKE_CURRENT_SOURCE_DIR}/../common/storage.cc ${CMAKE_CURRENT_SOURCE_DIR}/../../src/ir/primitive_t_value.cc @@ -100,6 +103,7 @@ file(GLOB_RECURSE CONVERTER_SRC RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} ../optimizer/common/gllo_utils.cc ../optimizer/fusion/conv_biasadd_fusion.cc ../optimizer/fusion/conv_activation_fusion.cc + 
../optimizer/fusion/conv_tuple_activation_fusion.cc ../optimizer/fusion/conv_transform_fusion.cc ../optimizer/fusion/conv_scale_fusion.cc ../optimizer/fusion/conv_bn_fusion.cc @@ -116,11 +120,13 @@ add_subdirectory(quantizer) set(SRC_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../../src) set(LITE_SRC + ${SRC_DIR}/common/graph_utils_extends.cc + ${SRC_DIR}/common/log_adapter.cc ${SRC_DIR}/common/graph_util.cc ${SRC_DIR}/common/ms_tensor_utils.cc ${SRC_DIR}/runtime/allocator.cc ${SRC_DIR}/runtime/runtime_api.cc - ${SRC_DIR}/runtime/thread_pool.cc + ${SRC_DIR}/runtime/thread_pool.c ${SRC_DIR}/runtime/workspace_pool.cc ${SRC_DIR}/ir/tensor.cc ${SRC_DIR}/context.cc @@ -161,13 +167,36 @@ endif() add_executable(converter_lite main.cc - ${ANF_SRC} + ${CORE_SRC} + ${CCSRC_SRC} ${CONVERTER_SRC} ${OPS_SRC} ${KERNEL_SRC} ${LITE_SRC} ) +if (WIN32) + add_library(converter_mid OBJECT + ${CORE_SRC} + ${CCSRC_SRC} + ${CONVERTER_SRC} + ${OPS_SRC} + ${KERNEL_SRC} + ${LITE_SRC} + ) + add_library(converter_parser STATIC + $ + $ + $ + $ + $ + $ + $ + $ + $ + ) +endif() + target_link_libraries(converter_lite PRIVATE tflite_parser_mid caffe_parser_mid @@ -187,5 +216,5 @@ target_link_libraries(converter_lite PRIVATE if (NOT PLATFORM_ARM64 AND NOT PLATFORM_ARM32) install(TARGETS converter_lite - RUNTIME DESTINATION ${MAIN_DIR}/converter COMPONENT ${COMPONENT_NAME}) + RUNTIME DESTINATION ${MAIN_DIR}-${COMPONENT_NAME}/converter COMPONENT ${COMPONENT_NAME}) endif () diff --git a/mindspore/lite/tools/converter/anf_transform.cc b/mindspore/lite/tools/converter/anf_transform.cc index 2501121890..7be4fb9a10 100644 --- a/mindspore/lite/tools/converter/anf_transform.cc +++ b/mindspore/lite/tools/converter/anf_transform.cc @@ -20,9 +20,12 @@ #include "utils/log_adapter.h" #include "tools/optimizer/fusion/conv_biasadd_fusion.h" #include "tools/optimizer/fusion/conv_activation_fusion.h" +#include "tools/optimizer/fusion/conv_tuple_activation_fusion.h" #include "tools/optimizer/fusion/conv_scale_fusion.h" #include "tools/optimizer/fusion/conv_bn_fusion.h" #include "tools/optimizer/fusion/constant_folding_fusion.h" +#include "tools/converter/quantizer/post_training_quantizer.h" +#include "tools/converter/quantizer/quant_cast.h" using std::string; namespace mindspore { @@ -31,10 +34,9 @@ AnfTransform::AnfTransform() = default; AnfTransform::~AnfTransform() = default; -void AnfTransform::SetGraphDef(schema::MetaGraphT *_dstDef) { graphDefT = _dstDef; } - -FuncGraphPtr AnfTransform::Transform(const FuncGraphPtr &old_graph) { - // return old_graph; +FuncGraphPtr AnfTransform::Transform(const FuncGraphPtr &old_graph, const converter::Flags *config) { + MS_ASSERT(nullptr != old_graph); + // fusion const_fold auto optimizer = std::make_shared(); auto pm = std::make_shared("anf fusion pass manager", false); pm->AddPass(std::make_shared()); @@ -44,9 +46,40 @@ FuncGraphPtr AnfTransform::Transform(const FuncGraphPtr &old_graph) { schema::ActivationType_RELU)); pm->AddPass(std::make_shared(true, "conv_relu6", schema::PrimitiveType_Activation, schema::ActivationType_RELU6)); + pm->AddPass(std::make_shared(true, "conv_tuple_relu", + schema::PrimitiveType_Activation, + schema::ActivationType_RELU)); + pm->AddPass(std::make_shared(true, "conv_tuple_relu6", + schema::PrimitiveType_Activation, + schema::ActivationType_RELU6)); pm->AddPass(std::make_shared()); optimizer->AddPassManager(pm); FuncGraphPtr new_graph = optimizer->Optimize(old_graph); + + // quant + if (config != nullptr && config->quantType == schema::QuantType_PostTraining) { + this->mQuantizer 
= std::make_unique<quant::PostTrainingQuantizer>(new_graph, config->configFile, 8);
+    if (mQuantizer == nullptr) {
+      MS_LOG(ERROR) << "New PostTrainingQuantizer failed";
+      return nullptr;
+    }
+  }
+  if (mQuantizer != nullptr) {
+    mQuantizer->flags = *config;
+    auto status = mQuantizer->DoQuantize(new_graph);
+    if (status != RET_OK) {
+      MS_LOG(ERROR) << "Quant failed " << status;
+      return nullptr;
+    }
+    quant::QuantCast quant_cast;
+    quant_cast.SetInputDataDType(kNumberTypeFloat32);
+    status = quant_cast.Run(new_graph);
+    if (status != RET_OK) {
+      MS_LOG(ERROR) << "add QuantCast failed";
+      return nullptr;
+    }
+  }
+
   return new_graph;
 }
 }  // namespace lite
diff --git a/mindspore/lite/tools/converter/anf_transform.h b/mindspore/lite/tools/converter/anf_transform.h
index 3b393a15bc..491450740d 100644
--- a/mindspore/lite/tools/converter/anf_transform.h
+++ b/mindspore/lite/tools/converter/anf_transform.h
@@ -17,11 +17,12 @@
 #ifndef MS_ANF_TRANSFORM_H
 #define MS_ANF_TRANSFORM_H

+#include <memory>
 #include "schema/inner/model_generated.h"
 #include "tools/common/storage.h"
 #include "tools/converter/converter_flags.h"
 #include "ir/anf.h"
-
+#include "tools/converter/quantizer/quantizer.h"
 namespace mindspore {
 namespace lite {

@@ -29,15 +30,12 @@ class AnfTransform {
  public:
   AnfTransform();
   virtual ~AnfTransform();
-  FuncGraphPtr Transform(const FuncGraphPtr &old_graph);
-  void SetGraphDef(schema::MetaGraphT *dstDef);
-  inline schema::MetaGraphT *GetOutput() { return graphDefT; }
+  FuncGraphPtr Transform(const FuncGraphPtr &old_graph, const converter::Flags *config = nullptr);

- protected:
-  schema::MetaGraphT *graphDefT = nullptr;
+ private:
+  std::unique_ptr<quant::Quantizer> mQuantizer = nullptr;
 };
 }  // namespace lite
 }  // namespace mindspore

 #endif
-
diff --git a/mindspore/lite/tools/converter/converter.cc b/mindspore/lite/tools/converter/converter.cc
index 5d329f1977..c0dae10a15 100644
--- a/mindspore/lite/tools/converter/converter.cc
+++ b/mindspore/lite/tools/converter/converter.cc
@@ -15,9 +15,9 @@
  */

 #include "tools/converter/converter.h"
+#include
 #include
 #include
-#include
 #include "tools/converter/converter_flags.h"
 #include "src/common/common.h"
 #include "src/common/file_utils.h"
@@ -58,29 +58,7 @@ class MindsporeImporter : public Converter {
   ~MindsporeImporter() override = default;
 };

-void Converter::FreeFuncGraph(const FuncGraphPtr &func_graph) {
-  MS_ASSERT(func_graph != nullptr);
-  auto cnodes = func_graph->GetOrderedCnodes();
-  for (auto &cnode : cnodes) {
-    auto primitiveT_value = GetValueNode<std::shared_ptr<PrimitiveTValue>>(cnode->input(0));
-    if (primitiveT_value == nullptr) {
-      MS_LOG(ERROR) << "PrimitiveT_value is nullptr";
-      return;
-    }
-    auto primT = primitiveT_value->GetPrimitiveT();
-    if (primT == nullptr) {
-      MS_LOG(ERROR) << "PrimitiveT is nullptr";
-      return;
-    }
-    if (primT->value.type == schema::PrimitiveType_TupleGetItem ||
-        primT->value.type == schema::PrimitiveType_MakeTuple ||
-        primT->value.type == schema::PrimitiveType_Return) {
-      delete primT;
-      primitiveT_value->SetPrimitiveT(nullptr);
-    }
-  }
-  return;
-}
+
 MetaGraphT *Converter::Convert(const converter::Flags *flag) {
   // parse the model and weight file to generate inference data structure
   FuncGraphPtr graph = nullptr;
@@ -92,41 +70,23 @@ MetaGraphT *Converter::Convert(const converter::Flags *flag) {
     MS_ASSERT(nullptr != modelParser);
     const std::string modelFile = flag->modelFile;
     const std::string weightFile = flag->weightFile;
-    auto meta_graph = modelParser->Parse(modelFile, weightFile, flag->quantType);
-    if (meta_graph == nullptr) {
-      MS_LOG(ERROR) << "Parse to metaGraph return nullptr";
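// ---------------------------------------------------------------------------
// Illustrative sketch only, not part of the patch: with quantization folded
// into AnfTransform, a converter-side caller just threads the parsed flags
// through; passing nullptr (the default) skips quantization entirely.
// Hypothetical driver code:
//
//   AnfTransform anf_transform;
//   FuncGraphPtr optimized = anf_transform.Transform(graph, flags);
//   if (optimized == nullptr) {
//     MS_LOG(ERROR) << "AnfTransform failed";
//     return RET_ERROR;
//   }
//
// Post-training quantization then runs only when
// flags->quantType == schema::QuantType_PostTraining, as implemented above.
// ---------------------------------------------------------------------------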
-      return nullptr;
-    }
-    graph = ModelParser::Fb2Anf(meta_graph);
+    graph = modelParser->Parse(modelFile, weightFile, flag->quantType);
   }
   if (graph == nullptr) {
     MS_LOG(ERROR) << "Parser/Import model return nullptr";
     return nullptr;
   }

-  graph = anfTransform->Transform(graph);
-
-  CreateQuantizer(graph, flag);
-  if (mQuantizer != nullptr) {
-    mQuantizer->flags = *flag;
-    auto status = mQuantizer->DoQuantize(graph);
-    if (status != RET_OK) {
-      MS_LOG(ERROR) << "Quant failed " << status;
-      return nullptr;
-    }
-    quant::QuantCast quant_cast;
-    quant_cast.SetInputDataDType(kNumberTypeFloat32);
-    status = quant_cast.Run(graph);
-    if (status != RET_OK) {
-      MS_LOG(ERROR) << "add QuantCast error";
-      return nullptr;
-    }
+  graph = anfTransform->Transform(graph, flag);
+  if (graph == nullptr) {
+    MS_LOG(ERROR) << "Transform anf graph returned nullptr";
+    return nullptr;
   }

   // anf -- fb
   auto meta_graph = Export(graph);
   if (meta_graph == nullptr) {
-    MS_LOG(ERROR) << "Export to meta_graph return nullptr";
+    MS_LOG(ERROR) << "Export to meta graph returned nullptr";
     return nullptr;
   }

@@ -135,41 +95,13 @@ MetaGraphT *Converter::Convert(const converter::Flags *flag) {
   transform->CreateQuantizer(flag);
   auto status = transform->Transform(*flag);
   if (status != 0) {
-    MS_LOG(ERROR) << "FBTransform model failed " << status;
+    MS_LOG(ERROR) << "Transform meta graph failed " << status;
     return nullptr;
   }
-  FreeFuncGraph(graph);
   return meta_graph;
 }

-void Converter::CreateQuantizer(FuncGraphPtr funcGraph, const converter::Flags *flags) {
-  auto type = flags->quantType;
-  switch (type) {
-    case mindspore::schema::QuantType_AwareTraining: {
-      // mQuantizer.reset(new AwareQuantizer(graphDefT, flags->inputInferenceTypeIn, flags->stdDev, flags->mean));
-      break;
-    }
-    // case mindspore::schema::QuantType_WeightQuant: {
-    //   MS_LOG(INFO) << "create WeightQuantizer!";
-    //   mQuantizer.reset(
-    //     new quant::WeightQuantizer(funcGraph, flags->quantSize, flags->convWeightQuantChannelThreshold,
-    //                                flags->bitNum));
-    //   break;
-    // }
-    case mindspore::schema::QuantType_PostTraining: {
-      MS_LOG(INFO) << "create PostTrainningQuantizer!";
-      mQuantizer.reset(new quant::PostTrainingQuantizer(funcGraph, flags->configFile, 8));
-      break;
-    }
-    case mindspore::schema::QuantType_QUANT_NONE:
-      MS_LOG(INFO) << "Not do quantization for model!";
-      break;
-    default:
-      MS_LOG(INFO) << "will support quntizer type " << flags->quantTypeIn.c_str() << " in the future!";
-      break;
-  }
-}

 int RunConverter(int argc, const char **argv) {
   std::unique_ptr<converter::Flags> flags(new (std::nothrow) converter::Flags);
   if (flags == nullptr) {
diff --git a/mindspore/lite/tools/converter/converter.h b/mindspore/lite/tools/converter/converter.h
index 1002946f70..7deb65c9b7 100644
--- a/mindspore/lite/tools/converter/converter.h
+++ b/mindspore/lite/tools/converter/converter.h
@@ -25,7 +25,6 @@
 #include "tools/anf_importer/anf_importer.h"
 #include "tools/converter/converter_flags.h"
 #include "tools/converter/anf_transform.h"
-#include "tools/converter/quantizer/quantizer.h"

 namespace mindspore {
 namespace lite {
@@ -34,15 +33,12 @@ class Converter {
   Converter();
   virtual ~Converter();
   virtual schema::MetaGraphT *Convert(const lite::converter::Flags *flags);
-  void CreateQuantizer(FuncGraphPtr funcGraph, const converter::Flags *flags);
-  void FreeFuncGraph(const FuncGraphPtr &func_graph);

  protected:
   ModelParser *modelParser = nullptr;
   AnfImporter *modelImporter = nullptr;
   GraphDefTransform *transform = nullptr;
   AnfTransform *anfTransform = nullptr;
-  std::unique_ptr<quant::Quantizer> mQuantizer = nullptr;
}; int RunConverter(int argc, const char **argv); diff --git a/mindspore/lite/tools/converter/converter_flags.cc b/mindspore/lite/tools/converter/converter_flags.cc index 4059ef12a4..b0f2ec605f 100644 --- a/mindspore/lite/tools/converter/converter_flags.cc +++ b/mindspore/lite/tools/converter/converter_flags.cc @@ -24,8 +24,8 @@ namespace lite { namespace converter { Flags::Flags() { AddFlag(&Flags::fmkIn, "fmk", "Input model framework type. TFLITE | CAFFE | MS", ""); - AddFlag(&Flags::modelFile, "modelFile", "Input model file path. TFLITE: *.tflite | CAFFE: *.prototxt | MS: *.mindir", - ""); + AddFlag(&Flags::modelFile, "modelFile", + "Input model file path. TFLITE: *.tflite | CAFFE: *.prototxt | MS: *.mindir | ONNX: *.onnx", ""); AddFlag(&Flags::outputFile, "outputFile", "Output model file path. Will add .ms automatically", ""); AddFlag(&Flags::weightFile, "weightFile", "Input model weight file path. Needed when fmk is CAFFE. CAFFE: *.caffemodel", ""); @@ -41,6 +41,10 @@ Flags::Flags() { } int Flags::Init(int argc, const char **argv) { + if (argc == 1) { + std::cout << this->Usage() << std::endl; + return RET_SUCCESS_EXIT; + } Option err = this->ParseFlags(argc, argv); if (err.IsSome()) { diff --git a/mindspore/lite/tools/converter/graphdef_transform.cc b/mindspore/lite/tools/converter/graphdef_transform.cc index 9d5d45c7cb..47dd18de33 100644 --- a/mindspore/lite/tools/converter/graphdef_transform.cc +++ b/mindspore/lite/tools/converter/graphdef_transform.cc @@ -15,19 +15,17 @@ */ #include "tools/converter/graphdef_transform.h" -#include #include #include #include "schema/model_generated.h" #include "utils/log_adapter.h" -#include "src/common/op_utils.h" #include "tools/converter/converter_flags.h" #include "tools/converter/legacy_optimizer/graph/dtype_trans_pass.h" -// #include "tools/converter/legacy_optimizer/fusion/matmul_biasadd_fusion_pass.h" #include "tools/converter/legacy_optimizer/fusion/format_trans_fusion_pass.h" #include "tools/converter/legacy_optimizer/fusion/format_trans_transpose_fusion_pass.h" #include "tools/converter/legacy_optimizer/fusion/quant_cast_fusion_pass.h" -#include "tools/converter/legacy_optimizer/fusion/batchnorm_convert_scale_pass.h" +#include "tools/converter/legacy_optimizer/fusion/mul_add_fusion_pass.h" +#include "tools/converter/legacy_optimizer/graph/batchnorm_convert_scale_pass.h" #include "tools/converter/legacy_optimizer/graph/weight_format_hardcode_pass.h" #include "tools/converter/legacy_optimizer/graph/weight_format_transform_pass.h" #include "tools/converter/legacy_optimizer/graph/format_trans_pass.h" @@ -36,7 +34,6 @@ #include "tools/converter/legacy_optimizer/graph/unused_node_remove_pass.h" #include "tools/converter/legacy_optimizer/graph/topological_sort_pass.h" #include "tools/converter/quantizer/aware_quantizer.h" -#include "tools/converter/converter.h" using std::string; namespace mindspore::lite { @@ -56,7 +53,7 @@ void GraphDefTransform::CreateQuantizer(const converter::Flags *flags) { break; } default: - // MS_LOGI("will support quantizer type %s in the future!", flags->quantTypeIn.c_str()); + MS_LOG(INFO) << "will support quantizer type " << flags->quantTypeIn << " in the future"; break; } } @@ -71,7 +68,6 @@ int GraphDefTransform::Transform(const converter::Flags &ctx) { weightHardCodePass->SetFmkType(ctx.fmk); weightFormatPass->SetQuantType(ctx.quantType); weightFormatPass->SetFmkType(ctx.fmk); -// weightFormatPass->SetDstFormat(Format_KHWC); weightFormatOptimizer.AddPass(weightHardCodePass); 
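// ---------------------------------------------------------------------------
// Illustrative note, not part of the patch: every legacy rewrite in
// Transform() uses this same idiom -- configure the passes, hand them to an
// Optimizer (which appears to take ownership of the raw pointers), then treat
// RET_NO_CHANGE from Run() as success. Sketch with a hypothetical pass name:
//
//   Optimizer optimizer;
//   optimizer.AddPass(new (std::nothrow) SomeGraphPass());
//   optimizer.AddPass(new (std::nothrow) IsolatedNodeRemovePass());
//   STATUS status = optimizer.Run(graphDefT);
//   if (status != RET_OK && status != RET_NO_CHANGE) {
//     return status;  // a pass genuinely failed
//   }
// ---------------------------------------------------------------------------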
weightFormatOptimizer.AddPass(weightFormatPass); status = weightFormatOptimizer.Run(graphDefT); @@ -152,9 +148,6 @@ int GraphDefTransform::Transform(const converter::Flags &ctx) { formatTransOptimizer.AddPass(new EltwiseFormatTransPass()); formatTransOptimizer.AddPass(new (std::nothrow) FormatTransFusionPass()); formatTransOptimizer.AddPass(new (std::nothrow) IsolatedNodeRemovePass()); - // if (ctx.quantType == QuantType_AwareTraining) { - // formatTransOptimizer.AddPass(new (std::nothrow) FormatTransNodeQuantParamFillPass()); - // } status = formatTransOptimizer.Run(graphDefT); if (status != RET_OK && status != RET_NO_CHANGE) { MS_LOG(ERROR) << "Run formatTransOptimizer graphPasses Failed"; @@ -172,6 +165,17 @@ int GraphDefTransform::Transform(const converter::Flags &ctx) { } } + { + Optimizer fusionOptimizer; + fusionOptimizer.AddPass(new (std::nothrow) MulAddFusionPass()); + fusionOptimizer.AddPass(new (std::nothrow) IsolatedNodeRemovePass()); + status = fusionOptimizer.Run(graphDefT); + if (status != RET_OK && status != RET_NO_CHANGE) { + MS_LOG(ERROR) << "Run fusionOptimizer graphPasses Failed"; + return status; + } + } + // do quantization if (fbQuantizer != nullptr) { status = fbQuantizer->DoQuantize(); diff --git a/mindspore/lite/tools/converter/legacy_optimizer/fusion/CMakeLists.txt b/mindspore/lite/tools/converter/legacy_optimizer/fusion/CMakeLists.txt index 238131a1a8..3bdeaead0d 100755 --- a/mindspore/lite/tools/converter/legacy_optimizer/fusion/CMakeLists.txt +++ b/mindspore/lite/tools/converter/legacy_optimizer/fusion/CMakeLists.txt @@ -2,11 +2,11 @@ add_library(fusion_mid OBJECT ${CMAKE_CURRENT_SOURCE_DIR}/fusion_pattern.cc ${CMAKE_CURRENT_SOURCE_DIR}/fusion_pass.cc ${CMAKE_CURRENT_SOURCE_DIR}/matmul_biasadd_fusion_pass.cc + ${CMAKE_CURRENT_SOURCE_DIR}/mul_add_fusion_pass.cc ${CMAKE_CURRENT_SOURCE_DIR}/quant_cast_fusion_pass.cc ${CMAKE_CURRENT_SOURCE_DIR}/batchnorm_fold_fusion_pass.cc ${CMAKE_CURRENT_SOURCE_DIR}/format_trans_fusion_pass.cc ${CMAKE_CURRENT_SOURCE_DIR}/format_trans_transpose_fusion_pass.cc - ${CMAKE_CURRENT_SOURCE_DIR}/batchnorm_convert_scale_pass.cc ) target_link_libraries(fusion_mid securec) diff --git a/mindspore/lite/tools/converter/legacy_optimizer/fusion/batchnorm_convert_scale_pass.h b/mindspore/lite/tools/converter/legacy_optimizer/fusion/batchnorm_convert_scale_pass.h deleted file mode 100644 index 06a6833706..0000000000 --- a/mindspore/lite/tools/converter/legacy_optimizer/fusion/batchnorm_convert_scale_pass.h +++ /dev/null @@ -1,100 +0,0 @@ -/** - * Copyright 2020 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef MINDSPORE_PREDICT_BATCHNORM_CONVERT_SCALE_PASS_H -#define MINDSPORE_PREDICT_BATCHNORM_CONVERT_SCALE_PASS_H - -#include -#include -#include -#include -#include "tools/converter/legacy_optimizer/fusion/fusion_pass.h" -#include "tools/common/graph_util.h" - -namespace mindspore { -namespace lite { -struct BNWeightTensors { - TensorT *meanTensor = nullptr; - TensorT *varianceTensor = nullptr; - TensorT *scaleTensor = nullptr; - TensorT *biasTensor = nullptr; -}; -class BatchNormConvertScalePass : public FusionPass { - public: - BatchNormConvertScalePass() = default; - - ~BatchNormConvertScalePass() override; - - STATUS DefinePattern() override; - - STATUS DoFusion(MetaGraphT *graph, const std::string &patternName, - std::unordered_map> &matchedPath) override; - - STATUS Run(MetaGraphT *graph) override; - - protected: - STATUS GetTransParam(MetaGraphT *graph, const std::shared_ptr &bnPath); - - // Get and check BNNode weight tensor - STATUS GetBnWeightTensors(MetaGraphT *graph, const std::shared_ptr &bnPath, BNWeightTensors* bnWeightTensors); - - STATUS GetBnEpsilon(MetaGraphT *graph); - - STATUS FindNodes(MetaGraphT *graph, const std::unordered_map> &matchedPath); - - STATUS GenNewScaleTensor(MetaGraphT *graph, const std::shared_ptr &bnPath); - - STATUS ConvertBNToScale(MetaGraphT *graph, const std::shared_ptr &bnPath); - - CNodeT *inputNode = nullptr; - CNodeT *bnNode = nullptr; - - std::string inputOpName = "Input"; - std::string bnOpName = "BatchNorm"; - std::string bnPatternName = "BnToScaleFusion"; - uint32_t bnChannel = 0; - float eps = 0; - TensorT *bnMeanTensor = nullptr; - float *transScale = nullptr; - float *transBias = nullptr; - std::unique_ptr newScaleWeightTensor = nullptr; - std::unique_ptr newScaleBiasTensor = nullptr; - - OpDefCopyer ScaleOpCopyer = [](CNodeT *inOpDef) -> std::unique_ptr { - std::unique_ptr newOpDef(new(std::nothrow) CNodeT); - if (newOpDef == nullptr) { - MS_LOG(ERROR) << "new OpDefT failed"; - return nullptr; - } - newOpDef->name = inOpDef->name; - newOpDef->quantType = inOpDef->quantType; - newOpDef->primitive = std::make_unique(); - newOpDef->primitive->value.type = schema::PrimitiveType_Scale; - auto scaleParam = new(std::nothrow) ScaleT; - if (scaleParam == nullptr) { - MS_LOG(ERROR) << "new scaleParam failed"; - return nullptr; - } - auto inParam = inOpDef->primitive->value.AsScale(); - MS_ASSERT(inParam != nullptr); - scaleParam->axis = inParam->axis; - newOpDef->primitive->value.value = scaleParam; - return std::move(newOpDef); - }; -}; -} // namespace lite -} // namespace mindspore -#endif // MINDSPORE_PREDICT_BATCHNORM_CONVERT_SCALE_PASS_H diff --git a/mindspore/lite/tools/converter/legacy_optimizer/fusion/matmul_biasadd_fusion_pass.h b/mindspore/lite/tools/converter/legacy_optimizer/fusion/matmul_biasadd_fusion_pass.h index 3f3a42df84..ff23fcd47f 100644 --- a/mindspore/lite/tools/converter/legacy_optimizer/fusion/matmul_biasadd_fusion_pass.h +++ b/mindspore/lite/tools/converter/legacy_optimizer/fusion/matmul_biasadd_fusion_pass.h @@ -74,7 +74,7 @@ class MatMulBiasAddFusionPass : public FusionPass { std::transform(inParam->perm.begin(), inParam->perm.end(), transposeParam->perm.begin(), [](const int32_t ele) { return ele; }); newOpDef->primitive->value.value = transposeParam; - return std::move(newOpDef); + return newOpDef; }; }; } // namespace lite diff --git a/mindspore/lite/tools/converter/legacy_optimizer/fusion/mul_add_fusion_pass.cc b/mindspore/lite/tools/converter/legacy_optimizer/fusion/mul_add_fusion_pass.cc new file mode 
100644
index 0000000000..a2193866b5
--- /dev/null
+++ b/mindspore/lite/tools/converter/legacy_optimizer/fusion/mul_add_fusion_pass.cc
@@ -0,0 +1,148 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include "tools/converter/legacy_optimizer/fusion/mul_add_fusion_pass.h"
+#include "utils/log_adapter.h"
+#include "securec/include/securec.h"
+#include "tools/common/graph_util.h"
+#include "include/errorcode.h"
+#include "schema/inner/model_generated.h"
+#include "src/common/op_utils.h"
+
+namespace mindspore {
+namespace lite {
+#define MUL_ADD_MATCH_PATH_LEN 2
+#define ADD_OP_BIAS_INDEX 1
+#define MUL_OP_BIAS_INDEX 1
+#define MUL_OP_INPUT_NUM 2
+#define ADD_OP_INPUT_NUM 2
+
+STATUS MulAddFusionPass::Run(MetaGraphT *graph) { return FusionPass::Run(graph); }
+
+STATUS MulAddFusionPass::DefinePattern() {
+  auto mulOp = std::make_shared<PatternOp>();
+  mulOp->id = MUL_NAME;
+  mulOp->types = {schema::PrimitiveType_Mul};
+  auto baOp = std::make_shared<PatternOp>();
+  baOp->id = ADD_NAME;
+  baOp->types = {schema::PrimitiveType_Add};
+  baOp->left = mulOp;
+
+  std::unique_ptr<FusionPattern> fusionPattern(new (std::nothrow) FusionPattern("MulAddFusion"));
+  if (fusionPattern == nullptr) {
+    MS_LOG(ERROR) << "new fusionPattern failed";
+    return RET_ERROR;
+  }
+  fusionPattern->AddPatternOp(mulOp);
+  fusionPattern->AddPatternOp(baOp);
+  fusionPattern->Finish();
+
+  this->patterns.emplace_back(fusionPattern.release());
+
+  return RET_OK;
+}
+
+STATUS MulAddFusionPass::DoFusion(MetaGraphT *graph, const std::string &patternName,
+                                  std::unordered_map<std::string, std::shared_ptr<Path>> &matchedPath) {
+  MS_ASSERT(graph != nullptr);
+  if (matchedPath.size() != MUL_ADD_MATCH_PATH_LEN) {
+    MS_LOG(ERROR) << "Mul-Add-Fusion should have two NodeIndex in matchedPair";
+    return RET_PARAM_INVALID;
+  }
+
+  auto mulPath = matchedPath[MUL_NAME];
+  auto addPath = matchedPath[ADD_NAME];
+  auto &mulNode = graph->nodes.at(mulPath->nodeIdx);
+  auto &addNode = graph->nodes.at(addPath->nodeIdx);
+  // shapes cannot be checked here because the converter has no shape inference yet
+  MS_ASSERT(mulNode != nullptr);
+  auto mulNodeInputIndex = mulNode->inputIndex;
+  MS_ASSERT(mulNodeInputIndex.size() == MUL_OP_INPUT_NUM);
+  MS_ASSERT(graph->allTensors.size() > mulNodeInputIndex.at(MUL_OP_BIAS_INDEX));
+  const auto &mulNodeBiasTensor = graph->allTensors.at(mulNodeInputIndex.at(MUL_OP_BIAS_INDEX));
+  MS_ASSERT(mulNodeBiasTensor != nullptr);
+  if (mulNodeBiasTensor->refCount != schema::NodeType_ValueNode) {
+    // the mul's second input is not a constant tensor, don't fuse
+    return RET_OK;
+  }
+  // likewise, if the add node's second input is not a constant tensor, don't fuse
+  auto addNodeInputIndex = addNode->inputIndex;
+  if (addNodeInputIndex.size() != ADD_OP_INPUT_NUM) {
+    MS_LOG(ERROR) << "add node " << addNode->name << " input tensors number is invalid";
+    return RET_ERROR;
+  }
+  MS_ASSERT(graph->allTensors.size() > addNodeInputIndex.at(ADD_OP_BIAS_INDEX));
+  const auto &addNodeBiasTensor = graph->allTensors.at(addNodeInputIndex.at(ADD_OP_BIAS_INDEX));
+  MS_ASSERT(addNodeBiasTensor != nullptr);
+  if (addNodeBiasTensor->refCount != schema::NodeType_ValueNode) {
+    return RET_OK;
+  }
+
+  // convert mul and add to scale
+  auto status = AddNewScaleNode(graph, mulNode, addNode.get(), addNodeInputIndex.at(ADD_OP_BIAS_INDEX));
+  if (RET_OK != status) {
+    MS_LOG(ERROR) << "AddNewScaleNode failed: " << status;
+    return status;
+  }
+
+  return RET_OK;
+}
+
+STATUS MulAddFusionPass::AddNewScaleNode(MetaGraphT *graph, const std::unique_ptr<CNodeT> &mulNode,
+                                         CNodeT *addNode, uint32_t addBiasIndex) {
+  MS_ASSERT(graph != nullptr);
+  MS_ASSERT(mulNode != nullptr);
+  MS_ASSERT(addNode != nullptr);
+  // turn the mul node into a scale node
+  mulNode->primitive->value.type = schema::PrimitiveType_Scale;
+  std::unique_ptr<ScaleT> scaleParam(new (std::nothrow) ScaleT());
+  if (scaleParam == nullptr) {
+    MS_LOG(ERROR) << "new ScaleT failed";
+    return RET_ERROR;
+  }
+  // NHWC
+  int shape_size = graph->allTensors.at(addBiasIndex)->dims.size();
+  scaleParam->axis = 0 - shape_size;
+  mulNode->primitive->value.value = scaleParam.release();
+  mulNode->inputIndex.push_back(addBiasIndex);
+  if (addNode->primitive->value.AsAdd()->activationType != ActivationType_NO_ACTIVATION) {
+    // the add carried an activation: turn the add node into that activation
+    std::unique_ptr<ActivationT> activationParam(new (std::nothrow) ActivationT());
+    if (activationParam == nullptr) {
+      MS_LOG(ERROR) << "new ActivationT failed";
+      return RET_ERROR;
+    }
+    activationParam->type = addNode->primitive->value.AsAdd()->activationType;
+    addNode->primitive->value.type = schema::PrimitiveType_Activation;
+    addNode->primitive->value.value = activationParam.release();
+    addNode->inputIndex.pop_back();
+    return RET_OK;
+  }
+  // no activation to keep: delete the add node
+  auto status = IsolateOneWayNode(graph, addNode);
+  if (status != RET_OK) {
+    MS_LOG(ERROR) << "IsolateOneWayNode " << addNode->name << " failed, error: " << status;
+    return status;
+  }
+  return RET_OK;
+}
+}  // namespace lite
+}  // namespace mindspore
diff --git a/mindspore/lite/tools/converter/legacy_optimizer/fusion/mul_add_fusion_pass.h b/mindspore/lite/tools/converter/legacy_optimizer/fusion/mul_add_fusion_pass.h
new file mode 100644
index 0000000000..2006b434a3
--- /dev/null
+++ b/mindspore/lite/tools/converter/legacy_optimizer/fusion/mul_add_fusion_pass.h
@@ -0,0 +1,52 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +#ifndef MINDSPORE_PREDICT_MUL_ADD_FUSION_PASS_H +#define MINDSPORE_PREDICT_MUL_ADD_FUSION_PASS_H + +#include +#include +#include +#include +#include +#include "tools/converter/legacy_optimizer/fusion/fusion_pass.h" +#include "tools/common/graph_util.h" + +namespace mindspore { +namespace lite { +constexpr const char *MUL_NAME = "MUL"; +constexpr const char *ADD_NAME = "ADD"; + +class MulAddFusionPass : public FusionPass { + public: + MulAddFusionPass() = default; + + ~MulAddFusionPass() = default; + + STATUS DefinePattern() override; + + STATUS DoFusion(MetaGraphT *graph, const std::string &patternName, + std::unordered_map> &matchedPath) override; + + STATUS Run(MetaGraphT *graph) override; + + protected: + static STATUS AddNewScaleNode(MetaGraphT *graph, const std::unique_ptr &mulNode, + CNodeT* addNode, uint32_t addBiasIndex); +}; +} // namespace lite +} // namespace mindspore +#endif // MINDSPORE_PREDICT_MUL_ADD_FUSION_PASS_H diff --git a/mindspore/lite/tools/converter/legacy_optimizer/graph/CMakeLists.txt b/mindspore/lite/tools/converter/legacy_optimizer/graph/CMakeLists.txt index cbc98adc13..eede1eb0d7 100755 --- a/mindspore/lite/tools/converter/legacy_optimizer/graph/CMakeLists.txt +++ b/mindspore/lite/tools/converter/legacy_optimizer/graph/CMakeLists.txt @@ -8,4 +8,5 @@ add_library(graph_pass_mid OBJECT ${CMAKE_CURRENT_SOURCE_DIR}/weight_format_transform_pass.cc ${CMAKE_CURRENT_SOURCE_DIR}/topological_sort_pass.cc ${CMAKE_CURRENT_SOURCE_DIR}/unused_node_remove_pass.cc + ${CMAKE_CURRENT_SOURCE_DIR}/batchnorm_convert_scale_pass.cc ) diff --git a/mindspore/lite/tools/converter/legacy_optimizer/fusion/batchnorm_convert_scale_pass.cc b/mindspore/lite/tools/converter/legacy_optimizer/graph/batchnorm_convert_scale_pass.cc similarity index 62% rename from mindspore/lite/tools/converter/legacy_optimizer/fusion/batchnorm_convert_scale_pass.cc rename to mindspore/lite/tools/converter/legacy_optimizer/graph/batchnorm_convert_scale_pass.cc index fbbd4adcba..ace35e9eaf 100644 --- a/mindspore/lite/tools/converter/legacy_optimizer/fusion/batchnorm_convert_scale_pass.cc +++ b/mindspore/lite/tools/converter/legacy_optimizer/graph/batchnorm_convert_scale_pass.cc @@ -14,7 +14,7 @@ * limitations under the License. 
*/
-#include "tools/converter/legacy_optimizer/fusion/batchnorm_convert_scale_pass.h"
+#include "tools/converter/legacy_optimizer/graph/batchnorm_convert_scale_pass.h"
 #include
 #include
 #include
@@ -40,121 +40,60 @@ namespace lite {
 #define TF_BATCHNORM_VARIANCE_INDEX 3
 namespace {
 constexpr const float EPS = 1e-8;
-constexpr const float EPS_DEFAULT_FLOAT = 1e-5;
+constexpr const float EPS_DEFAULT_FLOAT = 1e-8;
 constexpr const float POW_NUM = 0.5;
 constexpr const int32_t NCHW_DIM_C = 1;
 }
-STATUS BatchNormConvertScalePass::Run(MetaGraphT *graph) { return FusionPass::Run(graph); }
-STATUS BatchNormConvertScalePass::DefinePattern() {
-  // with preNode
-  {
-    auto inputOp = std::make_shared<PatternOp>();
-    inputOp->id = inputOpName;
-    inputOp->types = {schema::PrimitiveType_NONE};
-    inputOp->isPlaceHold = true;
-
-    auto bnOp = std::make_shared<PatternOp>();
-    bnOp->id = bnOpName;
-    bnOp->types = {schema::PrimitiveType_FusedBatchNorm, schema::PrimitiveType_BatchNorm};
-    bnOp->left = inputOp;
+STATUS BatchNormConvertScalePass::Run(MetaGraphT *graph) {
+  MS_ASSERT(graph != nullptr);

-    std::unique_ptr<FusionPattern> fusionPattern(new (std::nothrow) FusionPattern(bnPatternName));
-    if (fusionPattern == nullptr) {
-      MS_LOG(ERROR) << "new fusionPattern failed";
-      return RET_ERROR;
+  for (auto iter = graph->nodes.begin(); iter != graph->nodes.end(); iter++) {
+    auto &node = *iter;
+    auto type = node->primitive->value.type;
+    if (type != schema::PrimitiveType_FusedBatchNorm && type != schema::PrimitiveType_BatchNorm) {
+      continue;
     }
-    fusionPattern->AddPatternOp(inputOp);
-    fusionPattern->AddPatternOp(bnOp);
-    fusionPattern->Finish();
-    this->patterns.emplace_back(fusionPattern.release());
+    auto status = GenNewScaleTensor(graph, node);
+    if (status != RET_OK) {
+      MS_LOG(ERROR) << "GenNewScaleTensor failed: " << status;
+      return status;
+    }
+    status = ConvertBNToScale(graph, node);
+    if (status != RET_OK) {
+      MS_LOG(ERROR) << "ConvertBNToScale failed: " << status;
+      return status;
+    }
   }
-
   return RET_OK;
 }
-STATUS BatchNormConvertScalePass::DoFusion(MetaGraphT *graph, const std::string &patternName,
-                                           std::unordered_map<std::string, std::shared_ptr<Path>> &matchedPath) {
+STATUS BatchNormConvertScalePass::ConvertBNToScale(MetaGraphT *graph, const std::unique_ptr<CNodeT> &bnNode) {
   MS_ASSERT(graph != nullptr);
-  if (patternName != bnPatternName) {
-    MS_LOG(ERROR) << "BatchNormConvertScale-Fusion match failed";
-    return RET_PARAM_INVALID;
-  }
-  auto status = FindNodes(graph, matchedPath);
-  if (status != RET_OK) {
-    MS_LOG(ERROR) << "FindNodes failed: " << status;
-    return status;
-  }
-  auto type = bnNode->primitive->value.type;
-  if (type != schema::PrimitiveType_FusedBatchNorm && type != schema::PrimitiveType_BatchNorm) {
-    return RET_OK;
-  }
-  auto bnPath = matchedPath.at(bnOpName);
-  status = GetTransParam(graph, bnPath);
-  if (status != RET_OK) {
-    MS_LOG(ERROR) << "GetTransParam failed: " << status;
-    return status;
-  }
-
-  status = GenNewScaleTensor(graph, bnPath);
-  if (status != RET_OK) {
-    MS_LOG(ERROR) << "GenNewScaleTensor failed: " << status;
-    return status;
-  }
-
-  status = ConvertBNToScale(graph, bnPath);
-  if (status != RET_OK) {
-    MS_LOG(ERROR) << "GenNewScaleTensor failed: " << status;
-    return status;
-  }
-  return RET_OK;
-}
-STATUS BatchNormConvertScalePass::ConvertBNToScale(MetaGraphT *graph, const std::shared_ptr<Path> &bnPath) {
-  auto scaleNode = std::unique_ptr<CNodeT>(new (std::nothrow) CNodeT);
-  if (scaleNode == nullptr) {
-    MS_LOG(ERROR) << "new TransNode failed";
-    return RET_ERROR;
-  }
-  scaleNode->name = bnNode->name;
-  scaleNode->primitive = std::make_unique<schema::PrimitiveT>();
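// ---------------------------------------------------------------------------
// Illustrative sketch only, not part of the patch: the algebra behind this
// pass. A (Fused)BatchNorm output
//   y = gamma * (x - mean) / sqrt(var + eps) + beta
// is an affine map per channel, so it folds into Scale's weight and bias:
//   transScale[c] = gamma[c] / sqrt(var[c] + eps)
//   transBias[c]  = beta[c] - mean[c] * transScale[c]
// Per-channel loop, with hypothetical gamma/beta/mean/variance arrays (gamma
// and beta default to 1 and 0 when the BN node carries no scale/offset
// tensors):
//
//   for (uint32_t c = 0; c < bnChannel; c++) {
//     float invStd = 1.0f / powf(variance[c] + eps, POW_NUM);  // POW_NUM == 0.5
//     transScale[c] = gamma[c] * invStd;
//     transBias[c] = beta[c] - mean[c] * transScale[c];
//   }
// ---------------------------------------------------------------------------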
-  if (scaleNode->primitive == nullptr) {
-    MS_LOG(ERROR) << "op->primitive is null";
-    return RET_NULL_PTR;
-  }
-  scaleNode->primitive->value.type = schema::PrimitiveType_Scale;
+  MS_ASSERT(bnNode != nullptr);
+  bnNode->primitive->value.type = schema::PrimitiveType_Scale;
 
   std::unique_ptr<ScaleT> scaleParam(new (std::nothrow) ScaleT());
   if (scaleParam == nullptr) {
     MS_LOG(ERROR) << "new transposeParam failed";
     return RET_ERROR;
   }
   scaleParam->axis = NCHW_DIM_C;
-  scaleNode->primitive->value.value = scaleParam.release();
-  auto scaleIter = graph->nodes.begin() + bnPath->nodeIdx;
-  STATUS errorCode = RET_OK;
-  scaleIter =
-    InsertNode(graph, scaleIter, kBefore, 0, std::move(scaleNode), &errorCode, ScaleOpCopyer);
-  if (errorCode != RET_OK) {
-    MS_LOG(ERROR) << "InsertNode failed: %d";  // errorCode);
-    return errorCode;
-  }
-  auto &newScaleNode = *(scaleIter - 1);
+  bnNode->primitive->value.value = scaleParam.release();
+  auto input0 = bnNode->inputIndex.at(0);
+  bnNode->inputIndex.clear();
+  bnNode->inputIndex.push_back(input0);
 
   graph->allTensors.emplace_back(std::move(newScaleWeightTensor));
   auto weightTensorIdx = graph->allTensors.size() - 1;
   graph->allTensors.emplace_back(std::move(newScaleBiasTensor));
   auto biasTensorIdx = graph->allTensors.size() - 1;
-  newScaleNode->inputIndex.push_back(weightTensorIdx);
-  newScaleNode->inputIndex.push_back(biasTensorIdx);
-  // delete bn node
-  auto status = IsolateOneWayNode(graph, bnPath->nodeIdx + 1, true);
-  if (status != RET_OK) {
-    MS_LOG(ERROR) << "IsolateOneWayNode " << bnNode->name.c_str() << " failed, error: " << status;
-    return status;
-  }
+  bnNode->inputIndex.push_back(weightTensorIdx);
+  bnNode->inputIndex.push_back(biasTensorIdx);
   return RET_OK;
 }
-STATUS BatchNormConvertScalePass::GenNewScaleTensor(MetaGraphT *graph, const std::shared_ptr<Path> &bnPath) {
+STATUS BatchNormConvertScalePass::GenNewScaleTensor(MetaGraphT *graph, const std::unique_ptr<CNodeT> &bnNode) {
   MS_ASSERT(graph != nullptr);
-  GetTransParam(graph, bnPath);
+  MS_ASSERT(bnNode != nullptr);
+  GetTransParam(graph, bnNode);
   newScaleWeightTensor = std::unique_ptr<TensorT>(new (std::nothrow) TensorT);
   if (newScaleWeightTensor == nullptr) {
     MS_LOG(ERROR) << "new weightTensor failed";
@@ -169,8 +108,11 @@ STATUS BatchNormConvertScalePass::GenNewScaleTensor(MetaGraphT *graph, const std
   auto ret = memcpy_s(newScaleWeightTensor->data.data(), weightShapeSize * sizeof(float), transScale,
                       weightShapeSize * sizeof(float));
   if (ret != RET_OK) {
-    delete transScale;
     MS_LOG(ERROR) << "memcpy error: " << ret;
+    delete[] transScale;
+    delete[] transBias;
+    transScale = nullptr;
+    transBias = nullptr;
     return RET_ERROR;
   }
 
@@ -189,39 +131,25 @@ STATUS BatchNormConvertScalePass::GenNewScaleTensor(MetaGraphT *graph, const std
   ret = memcpy_s(newScaleBiasTensor->data.data(), weightShapeSize * sizeof(float), transBias,
                  weightShapeSize * sizeof(float));
   if (ret != RET_OK) {
-    delete transBias;
     MS_LOG(ERROR) << "memcpy error: " << ret;
+    delete[] transScale;
+    delete[] transBias;
+    transScale = nullptr;
+    transBias = nullptr;
     return RET_ERROR;
   }
+  delete[] transScale;
+  delete[] transBias;
+  transScale = nullptr;
+  transBias = nullptr;
   return RET_OK;
 }
-
-STATUS BatchNormConvertScalePass::FindNodes(MetaGraphT *graph,
-                                            const std::unordered_map<std::string, std::shared_ptr<Path>> &matchedPath) {
+STATUS BatchNormConvertScalePass::GetTransParam(MetaGraphT *graph, const std::unique_ptr<CNodeT> &bnNode) {
   MS_ASSERT(graph != nullptr);
-  auto inputPath = matchedPath.at(inputOpName);
-  auto bnPath = matchedPath.at(bnOpName);
-  MS_ASSERT(inputPath != nullptr);
-  MS_ASSERT(bnPath != nullptr);
-  if (inputPath->subGraphIdx != bnPath->subGraphIdx) {
-    MS_LOG(ERROR) << "matched nodes should from same subGraph";
-    return RET_ERROR;
-  }
-  MS_ASSERT(graph->nodes.size() > inputPath->nodeIdx);
-  MS_ASSERT(graph->nodes.size() > bnPath->nodeIdx);
-  inputNode = graph->nodes.at(inputPath->nodeIdx).get();
-  bnNode = graph->nodes.at(bnPath->nodeIdx).get();
-  MS_ASSERT(inputNode != nullptr);
   MS_ASSERT(bnNode != nullptr);
-  return RET_OK;
-}
-STATUS BatchNormConvertScalePass::GetTransParam(MetaGraphT *graph, const std::shared_ptr<Path> &bnPath) {
-  MS_ASSERT(graph != nullptr);
-  MS_ASSERT(bnPath != nullptr);
-
   BNWeightTensors bnWeightTensors;
-  auto status = GetBnWeightTensors(graph, bnPath, &bnWeightTensors);
+  auto status = GetBnWeightTensors(graph, &bnWeightTensors, bnNode);
   if (status != RET_OK) {
     MS_LOG(ERROR) << "GetBnWeightTensors error";
     return status;
@@ -235,7 +163,7 @@ STATUS BatchNormConvertScalePass::GetTransParam(MetaGraphT *graph, const std::sh
   auto *varianceData = reinterpret_cast<float *>(varianceTensor->data.data());
 
   eps = EPS_DEFAULT_FLOAT;
-  status = GetBnEpsilon(graph);
+  status = GetBnEpsilon(bnNode);
   if (status != RET_OK) {
     MS_LOG(ERROR) << "GetBnEpsilon failed";
     return status;
@@ -245,6 +173,10 @@ STATUS BatchNormConvertScalePass::GetTransParam(MetaGraphT *graph, const std::sh
   // cal transScale, tf : scale/sqrt(variance + eps); caffe : 1/sqrt(variance + eps)
   if (memcpy_s(transScale, bnChannel * sizeof(float), varianceData, bnChannel * sizeof(float)) != 0) {
     MS_LOG(ERROR) << "memcpy_s transScale error";
+    delete[] transScale;
+    delete[] transBias;
+    transScale = nullptr;
+    transBias = nullptr;
     return RET_ERROR;
   }
   // 1/sqrt(variance + eps)
@@ -288,12 +220,11 @@ STATUS BatchNormConvertScalePass::GetTransParam(MetaGraphT *graph, const std::sh
 // bias        --1
 // estimated_mean  --2
 // estimated_variance  --3
-STATUS BatchNormConvertScalePass::GetBnWeightTensors(MetaGraphT *graph, const std::shared_ptr<Path> &bnPath,
-                                                     BNWeightTensors *bnWeightTensors) {
-  if (graph == nullptr || bnPath == nullptr) {
-    MS_LOG(ERROR) << "null pointer dereferencing.";
-    return RET_NULL_PTR;
-  }
+STATUS BatchNormConvertScalePass::GetBnWeightTensors(MetaGraphT *graph, BNWeightTensors *bnWeightTensors,
+                                                     const std::unique_ptr<CNodeT> &bnNode) {
+  MS_ASSERT(graph != nullptr);
+  MS_ASSERT(bnNode != nullptr);
+  MS_ASSERT(bnWeightTensors != nullptr);
   MS_ASSERT(graph->allTensors.size() > bnNode->inputIndex.at(1));
   auto bnWeightTensorIdxes = bnNode->inputIndex;
   bnWeightTensorIdxes.erase(bnWeightTensorIdxes.begin());
@@ -347,15 +278,9 @@ STATUS BatchNormConvertScalePass::GetBnWeightTensors(MetaGraphT *graph, const st
   return RET_OK;
 }
 
-STATUS BatchNormConvertScalePass::GetBnEpsilon(MetaGraphT *graph) {
-  if (graph == nullptr) {
-    MS_LOG(ERROR) << "null pointer dereferencing.";
-    return RET_NULL_PTR;
-  }
-  if (bnNode == nullptr) {
-    MS_LOG(ERROR) << "null pointer dereferencing.";
-    return RET_NULL_PTR;
-  }
+STATUS BatchNormConvertScalePass::GetBnEpsilon(const std::unique_ptr<CNodeT> &bnNode) {
+  MS_ASSERT(bnNode != nullptr);
   if (bnNode->primitive->value.type == schema::PrimitiveType_FusedBatchNorm) {
     eps = bnNode->primitive->value.AsFusedBatchNorm()->epsilon;
   } else if (bnNode->primitive->value.type == schema::PrimitiveType_BatchNorm) {
@@ -370,14 +295,5 @@ STATUS BatchNormConvertScalePass::GetBnEpsilon(MetaGraphT *graph) {
   }
   return RET_OK;
 }
-
-BatchNormConvertScalePass::~BatchNormConvertScalePass() {
-  if (this->transScale != nullptr) {
-    delete (this->transScale);
-  }
-  if (this->transBias != nullptr) {
-    delete (this->transBias);
-  }
-}
 }  // namespace lite
 }  // namespace mindspore
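For reference, the fold this pass bakes into the new Scale weight and bias tensors is the standard batch-norm-to-scale rewrite named in the comment above: per channel c, transScale[c] = gamma[c] / sqrt(variance[c] + eps) (with gamma fixed to 1 in the Caffe case) and transBias[c] = beta[c] - mean[c] * transScale[c], so BN(x) = x * transScale + transBias is exactly a Scale op on the channel axis. A minimal standalone sketch of that arithmetic (function and parameter names are illustrative, not taken from the patch):

// Illustrative sketch of the per-channel BN -> Scale fold.
#include <cmath>
#include <vector>

void FoldBnIntoScale(const std::vector<float> &gamma, const std::vector<float> &beta,
                     const std::vector<float> &mean, const std::vector<float> &variance,
                     float eps, std::vector<float> *transScale, std::vector<float> *transBias) {
  const size_t channels = mean.size();
  transScale->resize(channels);
  transBias->resize(channels);
  for (size_t c = 0; c < channels; ++c) {
    // TF FusedBatchNorm: gamma / sqrt(variance + eps); Caffe BatchNorm behaves as gamma == 1.
    (*transScale)[c] = gamma[c] / std::sqrt(variance[c] + eps);
    (*transBias)[c] = beta[c] - mean[c] * (*transScale)[c];
  }
}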
diff --git a/mindspore/lite/tools/converter/legacy_optimizer/graph/batchnorm_convert_scale_pass.h b/mindspore/lite/tools/converter/legacy_optimizer/graph/batchnorm_convert_scale_pass.h
new file mode 100644
index 0000000000..b7a9eedee7
--- /dev/null
+++ b/mindspore/lite/tools/converter/legacy_optimizer/graph/batchnorm_convert_scale_pass.h
@@ -0,0 +1,66 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_PREDICT_BATCHNORM_CONVERT_SCALE_PASS_H
+#define MINDSPORE_PREDICT_BATCHNORM_CONVERT_SCALE_PASS_H
+
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+#include "tools/common/graph_util.h"
+#include "tools/converter/optimizer.h"
+
+using mindspore::schema::TensorT;
+namespace mindspore {
+namespace lite {
+struct BNWeightTensors {
+  TensorT *meanTensor = nullptr;
+  TensorT *varianceTensor = nullptr;
+  TensorT *scaleTensor = nullptr;
+  TensorT *biasTensor = nullptr;
+};
+class BatchNormConvertScalePass : public GraphPass {
+ public:
+  BatchNormConvertScalePass() = default;
+
+  ~BatchNormConvertScalePass() = default;
+
+  STATUS Run(MetaGraphT *graph) override;
+
+ protected:
+  STATUS GetTransParam(MetaGraphT *graph, const std::unique_ptr<CNodeT> &bnNode);
+
+  // Get and check BNNode weight tensor
+  STATUS GetBnWeightTensors(MetaGraphT *graph, BNWeightTensors *bnWeightTensors,
+                            const std::unique_ptr<CNodeT> &bnNode);
+
+  STATUS GetBnEpsilon(const std::unique_ptr<CNodeT> &bnNode);
+
+  STATUS GenNewScaleTensor(MetaGraphT *graph, const std::unique_ptr<CNodeT> &bnNode);
+
+  STATUS ConvertBNToScale(MetaGraphT *graph, const std::unique_ptr<CNodeT> &bnNode);
+
+  uint32_t bnChannel = 0;
+  float eps = 0;
+  TensorT *bnMeanTensor = nullptr;
+  float *transScale = nullptr;
+  float *transBias = nullptr;
+  std::unique_ptr<TensorT> newScaleWeightTensor = nullptr;
+  std::unique_ptr<TensorT> newScaleBiasTensor = nullptr;
+};
+}  // namespace lite
+}  // namespace mindspore
+#endif  // MINDSPORE_PREDICT_BATCHNORM_CONVERT_SCALE_PASS_H
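Note that transScale and transBias remain raw float* members, which is why every early-return path in GenNewScaleTensor and GetTransParam now has to repeat the delete[] cleanup. A possible simplification (our suggestion, not something the patch does) is to hold the fold buffers in std::vector so the cleanup is automatic:

// Hypothetical alternative: RAII buffers for the fold parameters.
#include <vector>

struct BnFoldBuffers {
  std::vector<float> transScale;  // resized to bnChannel once it is known
  std::vector<float> transBias;   // freed automatically on every return path
};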
diff --git a/mindspore/lite/tools/converter/legacy_optimizer/graph/dtype_trans_pass.cc b/mindspore/lite/tools/converter/legacy_optimizer/graph/dtype_trans_pass.cc
index 9c8da98aab..24e4ccec33 100644
--- a/mindspore/lite/tools/converter/legacy_optimizer/graph/dtype_trans_pass.cc
+++ b/mindspore/lite/tools/converter/legacy_optimizer/graph/dtype_trans_pass.cc
@@ -56,7 +56,7 @@ STATUS DTypeTransPass::DoModelInputDTypeTrans(schema::MetaGraphT *graph) {
   for (auto graphInIdx : graphInIdxes) {
     MS_ASSERT(graph->allTensors.size() > graphInIdx);
     auto &graphInTensor = graph->allTensors.at(graphInIdx);
-    graphInTensor->dataType = TypeId::kNumberTypeUInt8;
+    graphInTensor->dataType = TypeId::kNumberTypeInt8;
   }
 
   if (this->inputDataDType == TypeId::kNumberTypeInt8) {
diff --git a/mindspore/lite/tools/converter/legacy_optimizer/graph/dtype_trans_pass.h b/mindspore/lite/tools/converter/legacy_optimizer/graph/dtype_trans_pass.h
index 1c1c0a7284..2b1906b6fe 100644
--- a/mindspore/lite/tools/converter/legacy_optimizer/graph/dtype_trans_pass.h
+++ b/mindspore/lite/tools/converter/legacy_optimizer/graph/dtype_trans_pass.h
@@ -72,7 +72,7 @@ class DTypeTransPass : public GraphPass {
     QuantDTypeCastParam->srcT = oldQuantDTypeCastParam->srcT;
     QuantDTypeCastParam->dstT = oldQuantDTypeCastParam->dstT;
     newCNode->primitive->value.value = QuantDTypeCastParam;
-    return std::move(newCNode);
+    return newCNode;
   };
 };
 }  // namespace lite
diff --git a/mindspore/lite/tools/converter/legacy_optimizer/graph/eltwise_format_trans_pass.cc b/mindspore/lite/tools/converter/legacy_optimizer/graph/eltwise_format_trans_pass.cc
index 66b8481ea9..17e7bffbb8 100644
--- a/mindspore/lite/tools/converter/legacy_optimizer/graph/eltwise_format_trans_pass.cc
+++ b/mindspore/lite/tools/converter/legacy_optimizer/graph/eltwise_format_trans_pass.cc
@@ -121,7 +121,8 @@ STATUS EltwiseFormatTransPass::Run(schema::MetaGraphT *graph) {
   MS_ASSERT(graph != nullptr);
   for (auto iter = graph->nodes.begin(); iter != graph->nodes.end(); iter++) {
     auto &node = *iter;
-    if (node->primitive->value.type != PrimitiveType_Eltwise) {
+    auto type = node->primitive->value.type;
+    if (type != PrimitiveType_Eltwise && type != PrimitiveType_Activation) {
       continue;
     }
     auto node_name = node->name;
diff --git a/mindspore/lite/tools/converter/model_parser.h b/mindspore/lite/tools/converter/model_parser.h
index 64c3cfac52..f216699acb 100644
--- a/mindspore/lite/tools/converter/model_parser.h
+++ b/mindspore/lite/tools/converter/model_parser.h
@@ -32,23 +32,23 @@ class ModelParser {
 
   virtual ~ModelParser() {}
 
-  virtual FuncGraphPtr ParseToAnf(const std::string &modelFile, const std::string &weightFile) {
-    auto *meta_graph = Parse(modelFile, weightFile);
-    if (meta_graph == nullptr) {
-      MS_LOG(ERROR) << "Parse to metaGraph return nullptr";
-      return nullptr;
-    }
-    return Fb2Anf(Parse(modelFile, weightFile));
+  FuncGraphPtr Parse(const std::string &modelFile, const std::string &weightFile,
+                     const QuantType &quantType = QuantType_QUANT_NONE) {
+    auto *meta_graph = ParseToFb(modelFile, weightFile, quantType);
+    if (meta_graph == nullptr) {
+      MS_LOG(ERROR) << "ParseToFb return nullptr";
+      return nullptr;
+    }
+    auto func_graph = this->Fb2Anf(meta_graph);
+    delete (meta_graph);
+    return func_graph;
   }
 
-  virtual schema::MetaGraphT *Parse(const std::string &modelFile, const std::string &weightFile,
-                                    const QuantType &quantType = QuantType_QUANT_NONE) = 0;
+  virtual schema::MetaGraphT *ParseToFb(const std::string &modelFile, const std::string &weightFile,
+                                        const QuantType &quantType = QuantType_QUANT_NONE) = 0;
 
  public:
   static FuncGraphPtr Fb2Anf(schema::MetaGraphT *meta_graph) {
     MS_EXCEPTION_IF_NULL(meta_graph);
     auto func_graph = std::make_shared<FuncGraph>();
-    auto importer = new AnfImporterFromMetaGraphT(meta_graph, func_graph);
-    auto ret = importer->Import();
+    AnfImporterFromMetaGraphT importer(meta_graph, func_graph);
+    auto ret = importer.Import();
     if (RET_OK != ret) {
       MS_LOG(ERROR) << "Import anf_graph from meta_graphT failed, ret: " << ret;
       return nullptr;
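With this refactor, Parse() becomes a shared non-virtual entry point: each backend only overrides ParseToFb() to produce the flatbuffer MetaGraphT, and Fb2Anf() lifts that into an ANF FuncGraph. A hypothetical call site (illustrative file names) would look like:

// Hypothetical driver code exercising the new template-method entry point.
CaffeModelParser parser;
FuncGraphPtr func_graph = parser.Parse("lenet.prototxt", "lenet.caffemodel");
if (func_graph == nullptr) {
  MS_LOG(ERROR) << "convert caffe model failed";
}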
diff --git a/mindspore/lite/tools/converter/parser/caffe/CMakeLists.txt b/mindspore/lite/tools/converter/parser/caffe/CMakeLists.txt
index a0b8ee236e..bf27a111ba 100644
--- a/mindspore/lite/tools/converter/parser/caffe/CMakeLists.txt
+++ b/mindspore/lite/tools/converter/parser/caffe/CMakeLists.txt
@@ -15,7 +15,6 @@ add_library(caffe_parser_mid OBJECT
         ${CMAKE_CURRENT_SOURCE_DIR}/caffe_model_parser.cc
         ${CMAKE_CURRENT_SOURCE_DIR}/caffe_node_parser.cc
         ${CMAKE_CURRENT_SOURCE_DIR}/caffe_node_parser_registry.cc
-        ${CMAKE_CURRENT_SOURCE_DIR}/caffe_parse_utils.cc
         ${CMAKE_CURRENT_SOURCE_DIR}/caffe_pooling_parser.cc
         ${CMAKE_CURRENT_SOURCE_DIR}/caffe_power_parser.cc
         ${CMAKE_CURRENT_SOURCE_DIR}/caffe_prelu_parser.cc
@@ -28,4 +27,5 @@ add_library(caffe_parser_mid OBJECT
         ${CMAKE_CURRENT_SOURCE_DIR}/caffe_inspector.cc
         ${CMAKE_CURRENT_SOURCE_DIR}/caffe_interp_parser.cc
         ${CMAKE_CURRENT_SOURCE_DIR}/caffe_permute_parser.cc
-        ${CMAKE_CURRENT_SOURCE_DIR}/caffe_tile_parser.cc)
+        ${CMAKE_CURRENT_SOURCE_DIR}/caffe_tile_parser.cc
+        ${CMAKE_CURRENT_SOURCE_DIR}/caffe_tanh_parser.cc)
diff --git a/mindspore/lite/tools/converter/parser/caffe/caffe_converter.cc b/mindspore/lite/tools/converter/parser/caffe/caffe_converter.cc
index 16056fa39d..0a87989886 100644
--- a/mindspore/lite/tools/converter/parser/caffe/caffe_converter.cc
+++ b/mindspore/lite/tools/converter/parser/caffe/caffe_converter.cc
@@ -15,7 +15,6 @@
  */
 
 #include "mindspore/lite/tools/converter/parser/caffe/caffe_converter.h"
-#include "mindspore/lite/tools/converter/parser/caffe/caffe_parse_utils.h"
 
 namespace mindspore {
 namespace lite {
diff --git a/mindspore/lite/tools/converter/parser/caffe/caffe_convolution_parser.cc b/mindspore/lite/tools/converter/parser/caffe/caffe_convolution_parser.cc
index 52a9ef5279..96f546e44f 100644
--- a/mindspore/lite/tools/converter/parser/caffe/caffe_convolution_parser.cc
+++ b/mindspore/lite/tools/converter/parser/caffe/caffe_convolution_parser.cc
@@ -69,7 +69,11 @@ STATUS CaffeConvolutionParser::Parse(const caffe::LayerParameter &proto,
     return RET_NULL_PTR;
   }
 
-  std::unique_ptr<schema::Conv2DT> attr(new (std::nothrow) schema::Conv2DT());
+  auto attr = std::make_unique<schema::Conv2DT>();
+  if (attr == nullptr) {
+    MS_LOG(ERROR) << "new attr failed";
+    return RET_NULL_PTR;
+  }
 
   attr->format = schema::Format_NCHW;
 
@@ -135,9 +139,9 @@ STATUS CaffeConvolutionParser::Parse(const caffe::LayerParameter &proto,
 
   op->name = proto.name();
   op->primitive->value.type = schema::PrimitiveType_Conv2D;
-  op->primitive->value.value = attr.get();
+  op->primitive->value.value = attr.release();
 
-  status = ParseGroupConvolution(op, attr.release());
+  status = ParseGroupConvolution(op, static_cast<schema::Conv2DT *>(op->primitive->value.value));
   if (status != RET_OK) {
     MS_LOG(ERROR) << "Parse group convolution failed";
     return RET_ERROR;
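The attr.get()/attr.release() change above fixes a double-ownership hazard: once the attribute is released into the primitive's value union, primitive->value.value owns it, so the group-convolution path must read it back through that owning pointer rather than releasing the unique_ptr a second time. In sketch form (illustrative, mirroring the patch):

auto attr = std::make_unique<schema::Conv2DT>();
op->primitive->value.type = schema::PrimitiveType_Conv2D;
op->primitive->value.value = attr.release();  // ownership transferred exactly once
// borrow, don't re-own:
auto *conv_attr = static_cast<schema::Conv2DT *>(op->primitive->value.value);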
diff --git a/mindspore/lite/tools/converter/parser/caffe/caffe_model_parser.cc b/mindspore/lite/tools/converter/parser/caffe/caffe_model_parser.cc
index 051bb03ccb..5f75fc1ee4 100644
--- a/mindspore/lite/tools/converter/parser/caffe/caffe_model_parser.cc
+++ b/mindspore/lite/tools/converter/parser/caffe/caffe_model_parser.cc
@@ -14,14 +14,14 @@
  * limitations under the License.
  */
 
-#include "mindspore/lite/tools/converter/parser/caffe/caffe_model_parser.h"
+#include "tools/converter/parser/caffe/caffe_model_parser.h"
 #include <set>
 #include <string>
 #include <vector>
-#include "mindspore/lite/tools/converter/parser/caffe/caffe_node_parser_registry.h"
-#include "mindspore/lite/tools/converter/parser/caffe/caffe_parse_utils.h"
-#include "mindspore/lite/tools/converter/parser/caffe/caffe_inspector.h"
+#include "tools/converter/parser/caffe/caffe_node_parser_registry.h"
+#include "tools/converter/parser/caffe/caffe_inspector.h"
 #include "tools/common/graph_util.h"
+#include "tools/common/protobuf_utils.h"
 
 namespace mindspore {
 namespace lite {
@@ -31,9 +31,8 @@ CaffeModelParser::~CaffeModelParser() {}
 
 const std::set<std::string> CaffeModelParser::skipedLayerType = {"Dropout"};
 
-schema::MetaGraphT *CaffeModelParser::Parse(const std::string &modelFile,
-                                            const std::string &weightFile,
-                                            const QuantType &quantType) {
+schema::MetaGraphT *CaffeModelParser::ParseToFb(const std::string &modelFile, const std::string &weightFile,
+                                                const QuantType &quantType) {
   if (ValidateFileStr(modelFile, ".prototxt") != RET_OK) {
     MS_LOG(ERROR) << "INPUT ILLEGAL: modelFile must be *.prototxt";
     return nullptr;
@@ -49,7 +48,7 @@ schema::MetaGraphT *CaffeModelParser::ParseToFb(const std::string &modelFile,
     return nullptr;
   }
 
-  std::unique_ptr<schema::MetaGraphT> subGraphDef = std::make_unique<schema::MetaGraphT>();
+  auto metaGraph = std::make_unique<schema::MetaGraphT>();
   TensorCache tensorCache;
 
   caffe::NetParameter proto;
@@ -57,7 +56,7 @@ schema::MetaGraphT *CaffeModelParser::ParseToFb(const std::string &modelFile,
     MS_LOG(ERROR) << "Read prototxt file failed, model path: " << modelFile;
     return nullptr;
   }
-  subGraphDef->name = proto.name();
+  metaGraph->name = proto.name();
 
   caffe::NetParameter weight;
   if (ReadProtoFromBinaryFile((const char *)weightFile.c_str(), &weight) != RET_OK) {
@@ -71,26 +70,25 @@ schema::MetaGraphT *CaffeModelParser::ParseToFb(const std::string &modelFile,
     return nullptr;
   }
 
-  status = ParseLayer(proto, weight, &tensorCache, subGraphDef.get());
+  status = ParseLayer(proto, weight, &tensorCache, metaGraph.get());
   if (status != RET_OK) {
     MS_LOG(ERROR) << "ParseLayer failed " << status;
     return nullptr;
   }
 
-  status = SetGraphTensorIndex(proto, &tensorCache, subGraphDef.get());
+  status = SetGraphTensorIndex(proto, &tensorCache, metaGraph.get());
   if (status != RET_OK) {
     MS_LOG(ERROR) << "Set inputTensor index and outputTensor index for graph failed!";
     return nullptr;
   }
-  subGraphDef->name = GetModelName(modelFile);
+  metaGraph->name = GetModelName(modelFile);
 
-  SetAllTensors(tensorCache, subGraphDef.get());
+  SetAllTensors(tensorCache, metaGraph.get());
 
-  return subGraphDef.release();
+  return metaGraph.release();
 }
 
-STATUS CaffeModelParser::SetOpInputIdx(const caffe::LayerParameter &layer,
-                                       schema::CNodeT *op,
+STATUS CaffeModelParser::SetOpInputIdx(const caffe::LayerParameter &layer, schema::CNodeT *op,
                                        TensorCache *tensorCache) {
   for (int i = 0; i < layer.bottom_size(); i++) {
     int index = tensorCache->FindTensor(layer.bottom(i));
@@ -104,8 +102,7 @@ STATUS CaffeModelParser::SetOpInputIdx(const caffe::LayerParameter &layer,
   return RET_OK;
 }
 
-STATUS CaffeModelParser::SetOpOutputIdx(const caffe::LayerParameter &layer,
-                                        schema::CNodeT *op,
+STATUS CaffeModelParser::SetOpOutputIdx(const caffe::LayerParameter &layer, schema::CNodeT *op,
                                         TensorCache *tensorCache) {
   for (int i = 0; i < layer.top_size(); i++) {
     std::unique_ptr<schema::TensorT> msTensor = std::make_unique<schema::TensorT>();
@@ -114,8 +111,7 @@ STATUS CaffeModelParser::SetOpOutputIdx(const caffe::LayerParameter &layer,
   return RET_OK;
 }
-STATUS CaffeModelParser::SetWeightTensor(const std::vector<schema::TensorT *> &weightVec,
-                                         schema::CNodeT *op,
+STATUS CaffeModelParser::SetWeightTensor(const std::vector<schema::TensorT *> &weightVec, schema::CNodeT *op,
                                          TensorCache *tensorCache) {
   for (auto iter : weightVec) {
     op->inputIndex.emplace_back(tensorCache->AddTensor("Weight", iter, CONST));
@@ -123,8 +119,7 @@ STATUS CaffeModelParser::SetWeightTensor(const std::vector<schema::TensorT *> &w
   return RET_OK;
 }
 
-STATUS CaffeModelParser::SetAllTensors(const TensorCache &tensorCache,
-                                       schema::MetaGraphT *subGraphDef) {
+STATUS CaffeModelParser::SetAllTensors(const TensorCache &tensorCache, schema::MetaGraphT *subGraphDef) {
   std::vector<schema::TensorT *> tensors = tensorCache.GetCachedTensor();
   for (auto iter : tensors) {
     std::unique_ptr<schema::TensorT> temp(iter);
@@ -133,8 +128,7 @@ STATUS CaffeModelParser::SetAllTensors(const TensorCache &tensorCache,
   return RET_OK;
 }
 
-STATUS CaffeModelParser::SetGraphTensorIndex(const caffe::NetParameter &proto,
-                                             TensorCache *tensorCache,
+STATUS CaffeModelParser::SetGraphTensorIndex(const caffe::NetParameter &proto, TensorCache *tensorCache,
                                              schema::MetaGraphT *subGraphDef) {
   CaffeInspector caffeInspector;
   caffeInspector.InspectModel(proto);
@@ -160,10 +154,8 @@ STATUS CaffeModelParser::SetGraphTensorIndex(const caffe::NetParameter &proto,
   return RET_OK;
 }
 
-STATUS CaffeModelParser::ParseLayer(const caffe::NetParameter &proto,
-                                    const caffe::NetParameter &weight,
-                                    TensorCache *tensorCache,
-                                    schema::MetaGraphT *subGraphDef) {
+STATUS CaffeModelParser::ParseLayer(const caffe::NetParameter &proto, const caffe::NetParameter &weight,
+                                    TensorCache *tensorCache, schema::MetaGraphT *subGraphDef) {
   for (int i = 0; i < proto.layer_size(); i++) {
     auto layer = proto.layer(i);
 
@@ -235,8 +227,7 @@ STATUS CaffeModelParser::ParseLayer(const caffe::NetParameter &proto,
   return RET_OK;
 }
 
-STATUS CaffeModelParser::GetModelInput(const caffe::NetParameter &proto,
-                                       TensorCache *tensorCache) {
+STATUS CaffeModelParser::GetModelInput(const caffe::NetParameter &proto, TensorCache *tensorCache) {
   for (int i = 0; i < proto.input_size(); i++) {
     if (proto.input_dim_size() <= 0) {
       continue;
diff --git a/mindspore/lite/tools/converter/parser/caffe/caffe_model_parser.h b/mindspore/lite/tools/converter/parser/caffe/caffe_model_parser.h
index e475c06875..ffea8e6aaa 100644
--- a/mindspore/lite/tools/converter/parser/caffe/caffe_model_parser.h
+++ b/mindspore/lite/tools/converter/parser/caffe/caffe_model_parser.h
@@ -33,7 +33,7 @@ class CaffeModelParser : public ModelParser {
 
   virtual ~CaffeModelParser();
 
-  MetaGraphT *Parse(const std::string &modelFile, const std::string &weightFile,
+  schema::MetaGraphT *ParseToFb(const std::string &modelFile, const std::string &weightFile,
                     const QuantType &quantType = QuantType_QUANT_NONE) override;
 
  private:
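ParseLayer stitches the graph together purely by blob name: SetOpOutputIdx registers each top blob in the TensorCache, and SetOpInputIdx later resolves bottom blobs with FindTensor. A minimal analogue of that cache, for orientation only (the real TensorCache also tracks tensor category and ownership):

// Illustrative name -> tensor-index cache.
#include <map>
#include <string>
#include <vector>

class SimpleTensorCache {
 public:
  int AddTensor(const std::string &name) {
    names_.push_back(name);
    index_[name] = static_cast<int>(names_.size()) - 1;
    return index_[name];
  }
  // Returns -1 when the blob has not been produced yet.
  int FindTensor(const std::string &name) const {
    auto it = index_.find(name);
    return it == index_.end() ? -1 : it->second;
  }

 private:
  std::vector<std::string> names_;
  std::map<std::string, int> index_;
};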
diff --git a/mindspore/lite/tools/converter/parser/caffe/caffe_prelu_parser.cc b/mindspore/lite/tools/converter/parser/caffe/caffe_prelu_parser.cc
index 58e8c31749..2c1b11cc82 100644
--- a/mindspore/lite/tools/converter/parser/caffe/caffe_prelu_parser.cc
+++ b/mindspore/lite/tools/converter/parser/caffe/caffe_prelu_parser.cc
@@ -34,7 +34,7 @@ STATUS CaffePReluParser::Parse(const caffe::LayerParameter &proto,
     return RET_NULL_PTR;
   }
 
-  std::unique_ptr<schema::CaffePReLUT> attr = std::make_unique<schema::CaffePReLUT>();
+  std::unique_ptr<schema::PReLUT> attr = std::make_unique<schema::PReLUT>();
   if (attr == nullptr) {
     MS_LOG(ERROR) << "new op failed";
     return RET_NULL_PTR;
@@ -60,7 +60,7 @@ STATUS CaffePReluParser::Parse(const caffe::LayerParameter &proto,
   weightVec->push_back(slope);
 
   op->name = proto.name();
-  op->primitive->value.type = schema::PrimitiveType_CaffePReLU;
+  op->primitive->value.type = schema::PrimitiveType_PReLU;
   op->primitive->value.value = attr.release();
   return RET_OK;
 }
diff --git a/mindspore/lite/tools/converter/parser/caffe/caffe_relu6_parser.cc b/mindspore/lite/tools/converter/parser/caffe/caffe_relu6_parser.cc
index 207f7ec963..0ad8a4a194 100644
--- a/mindspore/lite/tools/converter/parser/caffe/caffe_relu6_parser.cc
+++ b/mindspore/lite/tools/converter/parser/caffe/caffe_relu6_parser.cc
@@ -14,14 +14,32 @@
  * limitations under the License.
  */
 
-#include <memory>
 #include "mindspore/lite/tools/converter/parser/caffe/caffe_relu6_parser.h"
+#include <memory>
 
 namespace mindspore {
 namespace lite {
-STATUS CaffeRelu6Parser::Parse(const caffe::LayerParameter &proto, const caffe::LayerParameter &weight,
-                               schema::CNodeT *op, std::vector<schema::TensorT *> *weightVec) {
+STATUS CaffeRelu6Parser::Parse(const caffe::LayerParameter &proto,
+                               const caffe::LayerParameter &weight,
+                               schema::CNodeT *op,
+                               std::vector<schema::TensorT *> *weightVec) {
+  MS_LOG(DEBUG) << "parse CaffeRelu6Parser";
+  if (op == nullptr) {
+    MS_LOG(ERROR) << "op is null";
+    return RET_NULL_PTR;
+  }
+  op->primitive = std::make_unique<schema::PrimitiveT>();
+  if (op->primitive == nullptr) {
+    MS_LOG(ERROR) << "op->primitive is null";
+    return RET_NULL_PTR;
+  }
+  std::unique_ptr<schema::ActivationT> attr(new (std::nothrow) schema::ActivationT());
+  if (attr == nullptr) {
+    MS_LOG(ERROR) << "new op failed";
+    return RET_NULL_PTR;
+  }
+  attr->type = schema::ActivationType_RELU6;
 
   // relu: negative_slope = 0, no parameter;
   // leakyrelu: negative_slope != 0;
@@ -32,9 +50,10 @@ STATUS CaffeRelu6Parser::Parse(const caffe::LayerParameter &proto,
       attr->alpha = negative_slope;
     }
   }
-  op->primitive = std::make_unique<schema::PrimitiveT>();
-  op->primitive->value.value = attr.release();
+
+  op->name = proto.name();
   op->primitive->value.type = schema::PrimitiveType_Activation;
+  op->primitive->value.value = attr.release();
   return RET_OK;
 }
diff --git a/mindspore/lite/tools/converter/parser/caffe/caffe_relu6_parser.h b/mindspore/lite/tools/converter/parser/caffe/caffe_relu6_parser.h
index 3c347c232e..bb6e948ffe 100644
--- a/mindspore/lite/tools/converter/parser/caffe/caffe_relu6_parser.h
+++ b/mindspore/lite/tools/converter/parser/caffe/caffe_relu6_parser.h
@@ -13,8 +13,8 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-#ifndef MINDSPORE_CCSRC_TOOLS_LITE_CONVERTER_PARSER_CAFFE_CAFFE_RELU6_PARSER_H_
-#define MINDSPORE_CCSRC_TOOLS_LITE_CONVERTER_PARSER_CAFFE_CAFFE_RELU6_PARSER_H_
+#ifndef MINDSPORE_LITE_TOOLS_CONVERTER_PARSER_CAFFE_CAFFE_RELU6_PARSER_H_
+#define MINDSPORE_LITE_TOOLS_CONVERTER_PARSER_CAFFE_CAFFE_RELU6_PARSER_H_
 
 #include <vector>
 #include "mindspore/lite/tools/converter/parser/caffe/caffe_node_parser.h"
@@ -26,10 +26,12 @@ class CaffeRelu6Parser : public CaffeNodeParser {
  public:
   CaffeRelu6Parser() : CaffeNodeParser("relu6") {}
 
-  STATUS Parse(const caffe::LayerParameter &proto, const caffe::LayerParameter &weight, schema::CNodeT *op,
+  STATUS Parse(const caffe::LayerParameter &proto,
+               const caffe::LayerParameter &weight,
+               schema::CNodeT *op,
                std::vector<schema::TensorT *> *weightVec) override;
 };
 }  // namespace lite
 }  // namespace mindspore
 
-#endif  // MINDSPORE_CCSRC_TOOLS_LITE_CONVERTER_PARSER_CAFFE_CAFFE_RELU_PARSER_H_
+#endif  // MINDSPORE_LITE_TOOLS_CONVERTER_PARSER_CAFFE_CAFFE_RELU6_PARSER_H_
diff --git a/mindspore/lite/tools/converter/parser/caffe/caffe_tanh_parser.cc b/mindspore/lite/tools/converter/parser/caffe/caffe_tanh_parser.cc
index 20aff695fd..2dc46593ff 100644
--- a/mindspore/lite/tools/converter/parser/caffe/caffe_tanh_parser.cc
+++ b/mindspore/lite/tools/converter/parser/caffe/caffe_tanh_parser.cc
@@ -24,15 +24,31 @@ STATUS CaffeTanhParser::Parse(const caffe::LayerParameter &proto,
                               const caffe::LayerParameter &weight,
                               schema::CNodeT *op,
                               std::vector<schema::TensorT *> *weightVec) {
+  MS_LOG(DEBUG) << "parse CaffeTanhParser";
+  if (op == nullptr) {
+    MS_LOG(ERROR) << "op is null";
+    return RET_NULL_PTR;
+  }
+  op->primitive = std::make_unique<schema::PrimitiveT>();
+  if (op->primitive == nullptr) {
+    MS_LOG(ERROR) << "op->primitive is null";
+    return RET_NULL_PTR;
+  }
+  std::unique_ptr<schema::ActivationT> attr(new (std::nothrow) schema::ActivationT());
+  if (attr == nullptr) {
+    MS_LOG(ERROR) << "new op failed";
+    return RET_NULL_PTR;
+  }
   attr->type = schema::ActivationType_TANH;
-  op->primitive = std::make_unique<schema::PrimitiveT>();
-  op->primitive->value.value = attr.release();
+
+  op->name = proto.name();
   op->primitive->value.type = schema::PrimitiveType_Activation;
+  op->primitive->value.value = attr.release();
   return RET_OK;
 }
 
-CaffeNodeRegistrar g_caffeTanhParser("Tanh", new CaffeTanhParser());
+CaffeNodeRegistrar g_caffeTanhParser("TanH", new CaffeTanhParser());
 }  // namespace lite
 }  // namespace mindspore
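The relu6 and tanh parsers now follow the same defensive skeleton; a condensed sketch of that shape (CaffeFooParser is a hypothetical name, not a class in the patch):

STATUS CaffeFooParser::Parse(const caffe::LayerParameter &proto,
                             const caffe::LayerParameter &weight,
                             schema::CNodeT *op,
                             std::vector<schema::TensorT *> *weightVec) {
  if (op == nullptr) {
    return RET_NULL_PTR;
  }
  op->primitive = std::make_unique<schema::PrimitiveT>();
  if (op->primitive == nullptr) {
    return RET_NULL_PTR;
  }
  auto attr = std::make_unique<schema::ActivationT>();
  attr->type = schema::ActivationType_TANH;  // op-specific attribute setup goes here
  op->name = proto.name();
  op->primitive->value.type = schema::PrimitiveType_Activation;
  op->primitive->value.value = attr.release();
  return RET_OK;
}

Note also that the registrar key has to match Caffe's layer type string exactly, which is why the Tanh registration above becomes "TanH".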
diff --git a/mindspore/lite/tools/converter/parser/caffe/caffe_tanh_parser.h b/mindspore/lite/tools/converter/parser/caffe/caffe_tanh_parser.h
index 7e5e0da6ab..bcd8366f89 100644
--- a/mindspore/lite/tools/converter/parser/caffe/caffe_tanh_parser.h
+++ b/mindspore/lite/tools/converter/parser/caffe/caffe_tanh_parser.h
@@ -14,8 +14,8 @@
  * limitations under the License.
  */
-#ifndef LITE_CAFFE_TANH_PARSER_H
-#define LITE_CAFFE_TANH_PARSER_H
+#ifndef MINDSPORE_LITE_TOOLS_CONVERTER_PARSER_CAFFE_CAFFE_TANH_PARSER_H
+#define MINDSPORE_LITE_TOOLS_CONVERTER_PARSER_CAFFE_CAFFE_TANH_PARSER_H
 
 #include <vector>
 #include "tools/converter/parser/caffe/caffe_node_parser.h"
@@ -27,10 +27,12 @@ class CaffeTanhParser : public CaffeNodeParser {
  public:
   CaffeTanhParser() : CaffeNodeParser("tanh") {}
 
-  STATUS Parse(const caffe::LayerParameter &proto, const caffe::LayerParameter &weight, schema::CNodeT *op,
+  STATUS Parse(const caffe::LayerParameter &proto,
+               const caffe::LayerParameter &weight,
+               schema::CNodeT *op,
                std::vector<schema::TensorT *> *weightVec) override;
 };
 }  // namespace lite
 }  // namespace mindspore
 
-#endif  // LITE_CAFFE_TANH_PARSER_H
+#endif  // MINDSPORE_LITE_TOOLS_CONVERTER_PARSER_CAFFE_CAFFE_TANH_PARSER_H
diff --git a/mindspore/lite/tools/converter/parser/onnx/onnx_model_parser.cc b/mindspore/lite/tools/converter/parser/onnx/onnx_model_parser.cc
old mode 100755
new mode 100644
index b6edf4ebd4..1327074f11
--- a/mindspore/lite/tools/converter/parser/onnx/onnx_model_parser.cc
+++ b/mindspore/lite/tools/converter/parser/onnx/onnx_model_parser.cc
@@ -21,6 +21,7 @@
 #include <algorithm>
 #include "tools/common/graph_util.h"
 #include "src/common/utils.h"
+#include "tools/common/protobuf_utils.h"
 
 namespace mindspore {
 namespace lite {
@@ -54,36 +55,7 @@ std::vector<int32_t> OnnxModelParser::GetDimsFromOnnxValue(const onnx::ValueInfo
   return dims;
 }
 
-STATUS OnnxModelParser::ReadOnnxModelFromBinary(const std::string &modelFile,
-                                                google::protobuf::Message *onnx_model) {
-  std::unique_ptr<char[]> onnx_file(new (std::nothrow) char[PATH_MAX]{0});
-#ifdef _WIN32
-  if (_fullpath(onnx_file.get(), modelFile.c_str(), 1024) == nullptr) {
-    MS_LOG(ERROR) << "get realpath " << modelFile << " fail";
-    return RET_ERROR;
-  }
-#else
-  if (realpath(modelFile.c_str(), onnx_file.get()) == nullptr) {
-    MS_LOG(ERROR) << "get realpath " << modelFile << " fail";
-    return RET_ERROR;
-  }
-#endif
-  int fd = open(onnx_file.get(), O_RDONLY);
-  google::protobuf::io::FileInputStream input(fd);
-  google::protobuf::io::CodedInputStream code_input(&input);
-  code_input.SetTotalBytesLimit(INT_MAX, 536870912);
-  bool ret = onnx_model->ParseFromCodedStream(&code_input);
-  if (!ret) {
-    MS_LOG(ERROR) << "load onnx file failed";
-    return RET_ERROR;
-  }
-  (void)close(fd);
-  onnx_file.release();
-  return RET_OK;
-}
-
-STATUS OnnxModelParser::SetGraphConstTensor(const onnx::GraphProto &onnx_graph,
-                                            TensorCache *tensor_cache) {
+STATUS OnnxModelParser::SetGraphConstTensor(const onnx::GraphProto &onnx_graph, TensorCache *tensor_cache) {
   MS_LOG(DEBUG) << "set onnx constant tensors";
   for (const auto &onnx_const_value : onnx_graph.initializer()) {
     int index;
@@ -119,11 +91,8 @@ STATUS OnnxModelParser::SetGraphConstTensor(const onnx::GraphProto &onnx_graph,
   return RET_OK;
 }
 
-STATUS OnnxModelParser::AddValueInfo(const onnx::ValueInfoProto &proto,
-                                     const std::string &name,
-                                     const TensorType &type,
-                                     TensorCache *tensor_cache,
-                                     int *index) {
+STATUS OnnxModelParser::AddValueInfo(const onnx::ValueInfoProto &proto, const std::string &name, const TensorType &type,
+                                     TensorCache *tensor_cache, int *index) {
   auto data_type = GetDataTypeFromOnnx(static_cast<onnx::TensorProto_DataType>(proto.type().tensor_type().elem_type()));
   if (data_type == kTypeUnknown) {
     MS_LOG(ERROR) << "not support onnx data type "
@@ -143,11 +112,8 @@ STATUS OnnxModelParser::AddValueInfo(const onnx::ValueInfoProto &proto,
   return RET_OK;
 }
-STATUS OnnxModelParser::AddTensorProto(const onnx::TensorProto &proto,
-                                       const std::string &name,
-                                       const TensorType &type,
-                                       TensorCache *tensor_cache,
-                                       int *index) {
+STATUS OnnxModelParser::AddTensorProto(const onnx::TensorProto &proto, const std::string &name, const TensorType &type,
+                                       TensorCache *tensor_cache, int *index) {
   auto data_type = GetDataTypeFromOnnx(static_cast<onnx::TensorProto_DataType>(proto.data_type()));
   if (data_type == kTypeUnknown) {
     MS_LOG(ERROR) << "not support onnx data type " << static_cast<int>(proto.data_type());
@@ -174,8 +140,7 @@ STATUS OnnxModelParser::AddTensorProto(const onnx::TensorProto &proto,
   return RET_OK;
 }
 
-STATUS OnnxModelParser::SetGraphInputTensor(const onnx::GraphProto &onnx_graph,
-                                            schema::MetaGraphT *graph,
+STATUS OnnxModelParser::SetGraphInputTensor(const onnx::GraphProto &onnx_graph, schema::MetaGraphT *graph,
                                             TensorCache *tensor_cache) {
   for (const auto &input_value : onnx_graph.input()) {
     auto ret = tensor_cache->FindTensor(input_value.name());
@@ -192,8 +157,7 @@ STATUS OnnxModelParser::SetGraphInputTensor(const onnx::GraphProto &onnx_graph,
   return RET_OK;
 }
 
-STATUS OnnxModelParser::SetGraphOutputTensor(const onnx::GraphProto &onnx_graph,
-                                             schema::MetaGraphT *graph,
+STATUS OnnxModelParser::SetGraphOutputTensor(const onnx::GraphProto &onnx_graph, schema::MetaGraphT *graph,
                                              TensorCache *tensor_cache) {
   for (const auto &output_value : onnx_graph.output()) {
     int index;
@@ -207,10 +171,8 @@ STATUS OnnxModelParser::SetGraphOutputTensor(const onnx::GraphProto &onnx_graph,
   return RET_OK;
 }
 
-void OnnxModelParser::ParseOnnxGemmNode(const onnx::GraphProto &onnx_graph,
-                                        const onnx::NodeProto &onnx_node,
-                                        schema::MetaGraphT *graph,
-                                        TensorCache *tensor_cache) {
+void OnnxModelParser::ParseOnnxGemmNode(const onnx::GraphProto &onnx_graph, const onnx::NodeProto &onnx_node,
+                                        schema::MetaGraphT *graph, TensorCache *tensor_cache) {
   std::unique_ptr<schema::CNodeT> dst_op_1 = std::make_unique<schema::CNodeT>();
   dst_op_1->name = "Gemm_MatMul_" + onnx_node.output(0);
   ParseOnnxNodeAttr(onnx_graph, onnx_node, "MatMul", dst_op_1.get());
@@ -231,8 +193,7 @@ void OnnxModelParser::ParseOnnxGemmNode(const onnx::GraphProto &onnx_graph,
   graph->nodes.emplace_back(std::move(dst_op_2));
 }
 
-STATUS OnnxModelParser::ParseOnnxGivenFillNode(const onnx::NodeProto &onnx_node,
-                                               TensorCache *tensor_cache) {
+STATUS OnnxModelParser::ParseOnnxGivenFillNode(const onnx::NodeProto &onnx_node, TensorCache *tensor_cache) {
   // convert GivenTensorFill node to a weight/bias tensor
   auto ret = tensor_cache->FindTensor(onnx_node.output(0));
   if (ret < 0) {
@@ -284,10 +245,8 @@ STATUS OnnxModelParser::ParseOnnxGivenFillNode(const onnx::NodeProto &onnx_node,
   return RET_OK;
 }
 
-STATUS OnnxModelParser::ParseOnnxNodeToDstOp(const onnx::GraphProto &onnx_graph,
-                                             const onnx::NodeProto &onnx_node,
-                                             schema::CNodeT *dst_op,
-                                             schema::TensorT *dst_tensor,
+STATUS OnnxModelParser::ParseOnnxNodeToDstOp(const onnx::GraphProto &onnx_graph, const onnx::NodeProto &onnx_node,
+                                             schema::CNodeT *dst_op, schema::TensorT *dst_tensor,
                                              TensorCache *tensor_cache) {
   // change op_type() to name(), that is unique
   dst_op->name = onnx_node.op_type() + "_" + onnx_node.output(0);
@@ -319,11 +278,8 @@ STATUS OnnxModelParser::ParseOnnxNodeToDstOp(const onnx::GraphProto &onnx_graph,
   return RET_OK;
 }
 
-void OnnxModelParser::SetOpQuantParams(const onnx::GraphProto &onnx_graph,
-                                       const onnx::NodeProto &onnx_node,
-                                       schema::CNodeT *dst_op,
-                                       schema::TensorT *dst_tensor,
-                                       TensorCache *tensor_cache) {
+void OnnxModelParser::SetOpQuantParams(const onnx::GraphProto &onnx_graph, const onnx::NodeProto &onnx_node,
+                                       schema::CNodeT *dst_op, schema::TensorT *dst_tensor,
+                                       TensorCache *tensor_cache) {
   MS_ASSERT(dst_op != nullptr);
   MS_ASSERT(tensor_cache != nullptr);
   std::vector<string> quant_node_name;
@@ -380,10 +336,8 @@ void OnnxModelParser::SetOpQuantParams(const onnx::GraphProto &onnx_graph,
     }
   }
 }
 
-STATUS OnnxModelParser::ParseOnnxNodeAttr(const onnx::GraphProto &onnx_graph,
-                                          const onnx::NodeProto &onnx_node,
-                                          const string &onnx_op_type,
-                                          schema::CNodeT *dst_op) {
+STATUS OnnxModelParser::ParseOnnxNodeAttr(const onnx::GraphProto &onnx_graph, const onnx::NodeProto &onnx_node,
+                                          const string &onnx_op_type, schema::CNodeT *dst_op) {
   auto node_parser = OnnxNodeParserRegistry::GetInstance()->GetNodeParser(onnx_op_type);
   if (node_parser == nullptr) {
     MS_LOG(EXCEPTION) << "not find " << onnx_op_type << ", node parser is nullptr";
@@ -392,10 +346,8 @@ STATUS OnnxModelParser::ParseOnnxNodeAttr(const onnx::GraphProto &onnx_graph,
   return node_parser->Parse(onnx_graph, onnx_node, dst_op);
 }
 
-STATUS OnnxModelParser::SetOpInputIndex(const std::vector<string> &node_inputs,
-                                        schema::CNodeT *dst_op,
-                                        const onnx::NodeProto &onnx_node,
-                                        TensorCache *tensor_cache) {
+STATUS OnnxModelParser::SetOpInputIndex(const std::vector<string> &node_inputs, schema::CNodeT *dst_op,
+                                        const onnx::NodeProto &onnx_node, TensorCache *tensor_cache) {
   for (const auto &onnx_node_input : node_inputs) {
     auto index = tensor_cache->FindTensor(onnx_node_input);
     if (index < 0) {
@@ -408,8 +360,7 @@ STATUS OnnxModelParser::SetOpInputIndex(const std::vector<string> &node_inputs,
   return RET_OK;
 }
 
-STATUS OnnxModelParser::SetOpOutputIndex(const std::vector<string> &node_outputs,
-                                         schema::CNodeT *dst_op,
+STATUS OnnxModelParser::SetOpOutputIndex(const std::vector<string> &node_outputs, schema::CNodeT *dst_op,
                                          TensorCache *tensor_cache) {
   for (const auto &onnx_node_output : node_outputs) {
     auto index = tensor_cache->FindTensor(onnx_node_output);
@@ -424,8 +375,7 @@ STATUS OnnxModelParser::SetOpOutputIndex(const std::vector<string> &node_outputs
   return RET_OK;
 }
 
-STATUS OnnxModelParser::CopyOnnxTensorData(const onnx::TensorProto &onnx_const_value,
-                                           schema::TensorT *tensor) {
+STATUS OnnxModelParser::CopyOnnxTensorData(const onnx::TensorProto &onnx_const_value, schema::TensorT *tensor) {
   size_t data_count = 1;
   std::for_each(tensor->dims.begin(), tensor->dims.end(), [&data_count](int dim) { data_count *= dim; });
   size_t data_size = 0;
@@ -484,8 +434,7 @@ STATUS OnnxModelParser::CopyOnnxTensorData(const onnx::TensorProto &onnx_const_v
   return RET_OK;
 }
 
-STATUS OnnxModelParser::SetAllTensors(const TensorCache &tensor_cache,
-                                      schema::MetaGraphT *graphDef) {
+STATUS OnnxModelParser::SetAllTensors(const TensorCache &tensor_cache, schema::MetaGraphT *graphDef) {
   std::vector<schema::TensorT *> tensors = tensor_cache.GetCachedTensor();
   for (auto iter : tensors) {
     std::unique_ptr<schema::TensorT> temp(iter);
@@ -507,17 +456,16 @@ void OnnxModelParser::FindGraphInputAndConst(const onnx::GraphProto &onnx_graph)
   }
 }
 
-MetaGraphT *OnnxModelParser::Parse(const std::string &modelFile,
-                                   const std::string &weightFile,
-                                   const QuantType &quantType) {
+schema::MetaGraphT *OnnxModelParser::ParseToFb(const std::string &modelFile, const std::string &weightFile,
+                                               const QuantType &quantType) {
   if (ValidateFileStr(modelFile, ".onnx") != RET_OK) {
     MS_LOG(ERROR) << "Input illegal: modelFile must be *.onnx";
     return nullptr;
   }
-  std::unique_ptr<schema::MetaGraphT> dst_graph = std::make_unique<schema::MetaGraphT>();
+
   onnx::ModelProto onnx_model;
-  if (ReadOnnxModelFromBinary(modelFile, &onnx_model) != RET_OK) {
-    MS_LOG(ERROR) << "read onnx model fail";
+  if (ReadProtoFromBinaryFile((const char *)modelFile.c_str(), &onnx_model) != RET_OK) {
<< "Read onnx model file failed, model path: " << modelFile; return nullptr; } const onnx::GraphProto &onnx_graph = onnx_model.graph(); @@ -531,6 +479,7 @@ MetaGraphT *OnnxModelParser::Parse(const std::string &modelFile, MS_LOG(ERROR) << "SetGraphConstTensor failed"; return nullptr; } + auto dst_graph = std::make_unique(); // init onnx model graph input tensor if (SetGraphInputTensor(onnx_graph, dst_graph.get(), &tensor_cache)) { MS_LOG(ERROR) << "SetGraphInputTensor failed"; diff --git a/mindspore/lite/tools/converter/parser/onnx/onnx_model_parser.h b/mindspore/lite/tools/converter/parser/onnx/onnx_model_parser.h index 838b82f727..e227dec4fc 100644 --- a/mindspore/lite/tools/converter/parser/onnx/onnx_model_parser.h +++ b/mindspore/lite/tools/converter/parser/onnx/onnx_model_parser.h @@ -40,79 +40,48 @@ class OnnxModelParser : public ModelParser { virtual ~OnnxModelParser(); - MetaGraphT *Parse(const std::string &modelFile, const std::string &weightFile, - const QuantType &quantType = QuantType_QUANT_NONE) override; + schema::MetaGraphT *ParseToFb(const std::string &modelFile, const std::string &weightFile, + const QuantType &quantType = QuantType_QUANT_NONE) override; private: TypeId GetDataTypeFromOnnx(onnx::TensorProto_DataType onnx_type); std::vector GetDimsFromOnnxValue(const onnx::ValueInfoProto &onnx_value); - STATUS ReadOnnxModelFromBinary(const std::string &modelFile, - google::protobuf::Message *model_proto); - - STATUS SetGraphConstTensor(const onnx::GraphProto &onnx_graph, - TensorCache *tensor_cache); - - STATUS SetGraphInputTensor(const onnx::GraphProto &onnx_graph, - schema::MetaGraphT *graph, - TensorCache *tensor_cache); - - STATUS SetGraphOutputTensor(const onnx::GraphProto &onnx_graph, - schema::MetaGraphT *graph, - TensorCache *tensor_cache); - - STATUS AddValueInfo(const onnx::ValueInfoProto &proto, - const std::string &name, - const TensorType &type, - TensorCache *tensor_cache, - int *index); - - STATUS AddTensorProto(const onnx::TensorProto &proto, - const std::string &name, - const TensorType &type, - TensorCache *tensor_cache, - int *index); - - STATUS ParseOnnxNodeToDstOp(const onnx::GraphProto &onnx_graph, - const onnx::NodeProto &onnx_node, - schema::CNodeT *dst_op, - schema::TensorT *dst_tensor, - TensorCache *tensor_cache); - - void ParseOnnxGemmNode(const onnx::GraphProto &onnx_graph, - const onnx::NodeProto &onnx_node, - schema::MetaGraphT *graph, - TensorCache *tensor_cache); - - STATUS ParseOnnxGivenFillNode(const onnx::NodeProto &onnx_node, - TensorCache *tensor_cache); - - STATUS ParseOnnxNodeAttr(const onnx::GraphProto &onnx_graph, - const onnx::NodeProto &onnx_node, - const string &onnx_op_type, - schema::CNodeT *dst_op); - - void SetOpQuantParams(const onnx::GraphProto &onnx_graph, - const onnx::NodeProto &onnx_node, - schema::CNodeT *dst_op, - schema::TensorT *dst_tensor, - TensorCache *tensor_cache); - - STATUS SetOpInputIndex(const std::vector &node_inputs, - schema::CNodeT *dst_op, - const onnx::NodeProto &onnx_node, - TensorCache *tensor_cache); - - STATUS SetOpOutputIndex(const std::vector &node_outputs, - schema::CNodeT *dst_op, - TensorCache *tensor_cache); - - STATUS CopyOnnxTensorData(const onnx::TensorProto &onnx_init_value, - schema::TensorT *tensor); - - STATUS SetAllTensors(const TensorCache &tensor_cache, - schema::MetaGraphT *graphDef); + STATUS SetGraphConstTensor(const onnx::GraphProto &onnx_graph, TensorCache *tensor_cache); + + STATUS SetGraphInputTensor(const onnx::GraphProto &onnx_graph, schema::MetaGraphT *graph, TensorCache 
+                             TensorCache *tensor_cache);
+
+  STATUS SetGraphOutputTensor(const onnx::GraphProto &onnx_graph, schema::MetaGraphT *graph,
+                              TensorCache *tensor_cache);
+
+  STATUS AddValueInfo(const onnx::ValueInfoProto &proto, const std::string &name, const TensorType &type,
+                      TensorCache *tensor_cache, int *index);
+
+  STATUS AddTensorProto(const onnx::TensorProto &proto, const std::string &name, const TensorType &type,
+                        TensorCache *tensor_cache, int *index);
+
+  STATUS ParseOnnxNodeToDstOp(const onnx::GraphProto &onnx_graph, const onnx::NodeProto &onnx_node,
+                              schema::CNodeT *dst_op, schema::TensorT *dst_tensor, TensorCache *tensor_cache);
+
+  void ParseOnnxGemmNode(const onnx::GraphProto &onnx_graph, const onnx::NodeProto &onnx_node,
+                         schema::MetaGraphT *graph, TensorCache *tensor_cache);
+
+  STATUS ParseOnnxGivenFillNode(const onnx::NodeProto &onnx_node, TensorCache *tensor_cache);
+
+  STATUS ParseOnnxNodeAttr(const onnx::GraphProto &onnx_graph, const onnx::NodeProto &onnx_node,
+                           const string &onnx_op_type, schema::CNodeT *dst_op);
+
+  void SetOpQuantParams(const onnx::GraphProto &onnx_graph, const onnx::NodeProto &onnx_node, schema::CNodeT *dst_op,
+                        schema::TensorT *dst_tensor, TensorCache *tensor_cache);
+
+  STATUS SetOpInputIndex(const std::vector<string> &node_inputs, schema::CNodeT *dst_op,
+                         const onnx::NodeProto &onnx_node, TensorCache *tensor_cache);
+
+  STATUS SetOpOutputIndex(const std::vector<string> &node_outputs, schema::CNodeT *dst_op, TensorCache *tensor_cache);
+
+  STATUS CopyOnnxTensorData(const onnx::TensorProto &onnx_init_value, schema::TensorT *tensor);
+
+  STATUS SetAllTensors(const TensorCache &tensor_cache, schema::MetaGraphT *graphDef);
 
   void FindGraphInputAndConst(const onnx::GraphProto &onnx_graph);
diff --git a/mindspore/lite/tools/converter/parser/onnx/onnx_relu_parser.cc b/mindspore/lite/tools/converter/parser/onnx/onnx_relu_parser.cc
index d14a5054a7..06f5d460f5 100644
--- a/mindspore/lite/tools/converter/parser/onnx/onnx_relu_parser.cc
+++ b/mindspore/lite/tools/converter/parser/onnx/onnx_relu_parser.cc
@@ -73,7 +73,7 @@ STATUS OnnxPReluParser::Parse(const onnx::GraphProto &onnx_graph, const onnx::No
     MS_LOG(ERROR) << "input num should be 2";
     return RET_ERROR;
   }
-  std::unique_ptr<schema::CaffePReLUT> attr = std::make_unique<schema::CaffePReLUT>();
+  std::unique_ptr<schema::PReLUT> attr = std::make_unique<schema::PReLUT>();
   std::vector<onnx::TensorProto> params;
   const auto &input_name = onnx_node.input(1);
   for (const auto &it : onnx_graph.initializer()) {
@@ -102,7 +102,7 @@ STATUS OnnxPReluParser::Parse(const onnx::GraphProto &onnx_graph, const onnx::No
     }
   }
 
-  op->primitive->value.type = schema::PrimitiveType_CaffePReLU;
+  op->primitive->value.type = schema::PrimitiveType_PReLU;
   op->primitive->value.value = attr.release();
   return RET_OK;
 }
diff --git a/mindspore/lite/tools/converter/parser/onnx/onnx_slice_parser.cc b/mindspore/lite/tools/converter/parser/onnx/onnx_slice_parser.cc
index c2bda2c3d8..d89c5dcb54 100644
--- a/mindspore/lite/tools/converter/parser/onnx/onnx_slice_parser.cc
+++ b/mindspore/lite/tools/converter/parser/onnx/onnx_slice_parser.cc
@@ -38,22 +38,38 @@ STATUS OnnxSliceParser::Parse(const onnx::GraphProto &onnx_graph, const onnx::No
     return RET_NULL_PTR;
   }
 
+  std::vector<int32_t> axes;
+  std::vector<int32_t> starts;
+  std::vector<int32_t> ends;
   for (const auto &onnx_node_attr : onnx_node.attribute()) {
     const auto &attribute_name = onnx_node_attr.name();
     if (attribute_name == "starts") {
-      const int size = onnx_node_attr.ints_size();
-      MS_LOG(ERROR) << "SLICE starts size " << size;
-      for (int i = 0; i < size; ++i) {
-        attr->begin.emplace_back(static_cast<int32_t>(onnx_node_attr.ints(i)));
+      const int num = onnx_node_attr.ints_size();
+      starts.clear();
+      for (int i = 0; i < num; ++i) {
+        starts.push_back(static_cast<int32_t>(onnx_node_attr.ints()[i]));
+      }
+    } else if (attribute_name == "axes") {
+      const int num = onnx_node_attr.ints_size();
+      axes.clear();
+      for (int i = 0; i < num; ++i) {
+        axes.push_back(static_cast<int32_t>(onnx_node_attr.ints()[i]));
       }
     } else if (attribute_name == "ends") {
-      const int size = onnx_node_attr.ints_size();
-      for (int i = 0; i < size; ++i) {
-        attr->size.emplace_back(static_cast<int32_t>(onnx_node_attr.ints(i)));
+      const int num = onnx_node_attr.ints_size();
+      ends.clear();
+      for (int i = 0; i < num; ++i) {
+        ends.push_back(static_cast<int32_t>(onnx_node_attr.ints()[i]));
       }
     }
   }
-
+  std::vector<int32_t> sizes(starts.size(), -1);
+  for (size_t i = 0; i < starts.size(); ++i) {
+    sizes[i] = (ends[i] < 0 ? ends[i] : ends[i] - starts[i]);
+  }
+  attr->axes = axes;
+  attr->begin = starts;
+  attr->size = sizes;
   op->primitive->value.type = schema::PrimitiveType_Slice;
   op->primitive->value.value = attr.release();
   return RET_OK;
 }
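ONNX Slice carries starts/ends (plus optional axes), while the schema's SliceT wants begin/size, so the loop after the attribute scan converts ends into sizes, leaving negative ends untouched for the runtime to resolve against the actual dimension. A standalone worked example of that conversion:

// Worked example: starts = {1, 0}, ends = {3, -1}  ->  size = {2, -1}.
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  std::vector<int32_t> starts = {1, 0};
  std::vector<int32_t> ends = {3, -1};
  std::vector<int32_t> sizes(starts.size(), -1);
  for (size_t i = 0; i < starts.size(); ++i) {
    sizes[i] = (ends[i] < 0 ? ends[i] : ends[i] - starts[i]);
  }
  std::printf("size = {%d, %d}\n", sizes[0], sizes[1]);  // prints: size = {2, -1}
  return 0;
}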
diff --git a/mindspore/lite/tools/converter/parser/tflite/tflite_activation_parser.cc b/mindspore/lite/tools/converter/parser/tflite/tflite_activation_parser.cc
index 2e58536bcd..5a4adeec4d 100644
--- a/mindspore/lite/tools/converter/parser/tflite/tflite_activation_parser.cc
+++ b/mindspore/lite/tools/converter/parser/tflite/tflite_activation_parser.cc
@@ -71,7 +71,7 @@ STATUS TfliteActivationParser::Parse(const std::unique_ptr<tflite::OperatorT> &t
       return RET_NULL_PTR;
     }
     attr->alpha = tflite_attr->alpha;
-    attr->type = schema::ActivationType_SIGMOID;
+    attr->type = schema::ActivationType_LEAKY_RELU;
   }
 
   op->primitive->value.type = schema::PrimitiveType_Activation;
@@ -84,52 +84,11 @@ STATUS TfliteActivationParser::Parse(const std::unique_ptr<tflite::OperatorT> &t
   return RET_OK;
 }
 
-STATUS TflitePreluParser::Parse(const std::unique_ptr<tflite::OperatorT> &tflite_op,
-                                const std::vector<std::unique_ptr<tflite::TensorT>> &tflite_tensors,
-                                const std::vector<std::unique_ptr<tflite::BufferT>> &tflite_model_buffer,
-                                schema::CNodeT *op,
-                                std::vector<int32_t> *tensors_id,
-                                std::vector<schema::Format> *tensors_format,
-                                std::map<int, int> *tensors_id_map) {
-  MS_LOG(DEBUG) << "parse TflitePreluParser";
-
-  if (op == nullptr) {
-    MS_LOG(ERROR) << "op is null";
-    return RET_NULL_PTR;
-  }
-  op->primitive = std::make_unique<schema::PrimitiveT>();
-  if (op->primitive == nullptr) {
-    MS_LOG(ERROR) << "op->primitive is null";
-    return RET_NULL_PTR;
-  }
-
-  std::unique_ptr<schema::PreluT> attr = std::make_unique<schema::PreluT>();
-  if (attr == nullptr) {
-    MS_LOG(ERROR) << "new op failed";
-    return RET_NULL_PTR;
-  }
-
-  if (GetTfliteData(tflite_op->inputs[1], tflite_tensors, tflite_model_buffer, attr->slope)) {
-    MS_LOG(ERROR) << "get pRelu -> slope failed";
-    return RET_ERROR;
-  }
-  op->primitive->value.type = schema::PrimitiveType_Prelu;
-  op->primitive->value.value = attr.release();
-
-  AddOpInput(op, tensors_id, tensors_format, tensors_id_map,
-             tflite_op->inputs[0], tensors_id->size(), tflite_tensors.size(), schema::Format_NHWC);
-  AddOpOutput(op, tensors_id, tensors_format, tensors_id_map,
-              tflite_op->outputs[0], tensors_id->size(), tflite_tensors.size(), schema::Format_NHWC);
-  return RET_OK;
-}
-
-
 TfliteNodeRegister g_TfliteReluParser("Relu", new TfliteReluParser());
 TfliteNodeRegister g_TfliteRelu6Parser("Relu6", new TfliteRelu6Parser());
 TfliteNodeRegister g_TfliteTanhParser("Tanh", new TfliteTanhParser());
 TfliteNodeRegister g_TfliteHardSwishParser("HardSwish", new TfliteHardSwishParser());
 TfliteNodeRegister g_tfliteLogisticParser("Logistic", new TfliteLogisticParser());
-TfliteNodeRegister g_tflitePreluParser("Prelu", new TflitePreluParser());
 TfliteNodeRegister g_TfliteLeakyReluParser("LeakyRelu", new TfliteLeakyReluParser());
 }  // namespace lite
 }  // namespace mindspore
diff --git a/mindspore/lite/tools/converter/parser/tflite/tflite_activation_parser.h b/mindspore/lite/tools/converter/parser/tflite/tflite_activation_parser.h
index a47b0e1180..b1509599d9 100644
--- a/mindspore/lite/tools/converter/parser/tflite/tflite_activation_parser.h
+++ b/mindspore/lite/tools/converter/parser/tflite/tflite_activation_parser.h
@@ -68,18 +68,6 @@ class TfliteLeakyReluParser : public TfliteActivationParser {
   TfliteLeakyReluParser() : TfliteActivationParser() {}
 };
 
-class TflitePreluParser : public TfliteNodeParser {
- public:
-  TflitePreluParser() : TfliteNodeParser("Prelu") {}
-
-  STATUS Parse(const std::unique_ptr<tflite::OperatorT> &tflite_op,
-               const std::vector<std::unique_ptr<tflite::TensorT>> &tflite_tensors,
-               const std::vector<std::unique_ptr<tflite::BufferT>> &tflite_model_buffer,
-               schema::CNodeT *op,
-               std::vector<int32_t> *tensors_id,
-               std::vector<schema::Format> *tensors_format,
-               std::map<int, int> *tensors_id_map) override;
-};
 }  // namespace lite
 }  // namespace mindspore
diff --git a/mindspore/lite/tools/converter/parser/tflite/tflite_arithmetic_parser.cc b/mindspore/lite/tools/converter/parser/tflite/tflite_arithmetic_parser.cc
index c615a795b2..8db54f0497 100644
--- a/mindspore/lite/tools/converter/parser/tflite/tflite_arithmetic_parser.cc
+++ b/mindspore/lite/tools/converter/parser/tflite/tflite_arithmetic_parser.cc
@@ -44,7 +44,7 @@ STATUS TfliteDoubleInputOpParser::Parse(const std::unique_ptr
   const char *node_name = node_name_str.data()->c_str();
   if (std::strcmp(node_name, "Add") == 0) {
     MS_LOG(DEBUG) << "parse TfliteAddParser";
-    std::unique_ptr<schema::AddT> attr = std::make_unique<schema::AddT>();
+    auto attr = std::make_unique<schema::AddT>();
     if (attr == nullptr) {
       MS_LOG(ERROR) << "new op failed";
       return RET_NULL_PTR;
@@ -59,7 +59,7 @@ STATUS TfliteDoubleInputOpParser::Parse(const std::unique_ptr
     op->primitive->value.value = attr.release();
   } else if (std::strcmp(node_name, "Sub") == 0) {
     MS_LOG(DEBUG) << "parse TfliteSubParser";
-    std::unique_ptr<schema::SubT> attr = std::make_unique<schema::SubT>();
+    auto attr = std::make_unique<schema::SubT>();
     if (attr == nullptr) {
       MS_LOG(ERROR) << "new op failed";
       return RET_NULL_PTR;
@@ -74,7 +74,7 @@ STATUS TfliteDoubleInputOpParser::Parse(const std::unique_ptr
     op->primitive->value.value = attr.release();
   } else if (std::strcmp(node_name, "Mul") == 0) {
     MS_LOG(DEBUG) << "parse TfliteMulParser";
-    std::unique_ptr<schema::MulT> attr = std::make_unique<schema::MulT>();
+    auto attr = std::make_unique<schema::MulT>();
     if (attr == nullptr) {
       MS_LOG(ERROR) << "new op failed";
       return RET_NULL_PTR;
@@ -89,7 +89,7 @@ STATUS TfliteDoubleInputOpParser::Parse(const std::unique_ptr
     op->primitive->value.value = attr.release();
   } else if (std::strcmp(node_name, "Div") == 0) {
     MS_LOG(DEBUG) << "parse TfliteDivParser";
-    std::unique_ptr<schema::DivT> attr = std::make_unique<schema::DivT>();
+    auto attr = std::make_unique<schema::DivT>();
     if (attr == nullptr) {
       MS_LOG(ERROR) << "new op failed";
       return RET_NULL_PTR;
@@ -113,7 +113,7 @@ STATUS TfliteDoubleInputOpParser::Parse(const std::unique_ptr
     op->primitive->value.value = attr.release();
   } else if (std::strcmp(node_name, "FloorMod") == 0) {
     MS_LOG(DEBUG) << "parse TfliteFloorModParser";
-    std::unique_ptr<schema::FloorModT> attr = std::make_unique<schema::FloorModT>();
+    auto attr = std::make_unique<schema::FloorModT>();
     if (attr == nullptr) {
       MS_LOG(ERROR) << "new op failed";
       return RET_NULL_PTR;
@@ -131,7 +131,7 @@ STATUS TfliteDoubleInputOpParser::Parse(const std::unique_ptr
     op->primitive->value.value = attr.release();
   } else if (std::strcmp(node_name, "SquaredDifference") == 0) {
     MS_LOG(DEBUG) << "parse TfliteSquaredDifferenceParser";
-    std::unique_ptr<schema::SquaredDifferenceT> attr = std::make_unique<schema::SquaredDifferenceT>();
+    auto attr
 = std::make_unique<schema::SquaredDifferenceT>();
     if (attr == nullptr) {
       MS_LOG(ERROR) << "new op failed";
       return RET_NULL_PTR;
@@ -140,7 +140,7 @@ STATUS TfliteDoubleInputOpParser::Parse(const std::unique_ptr
     op->primitive->value.value = attr.release();
   } else if (std::strcmp(node_name, "Pow") == 0) {
     MS_LOG(DEBUG) << "parse TflitePowParser";
-    std::unique_ptr<schema::PowT> attr = std::make_unique<schema::PowT>();
+    auto attr = std::make_unique<schema::PowT>();
     if (attr == nullptr) {
       MS_LOG(ERROR) << "new op failed";
       return RET_NULL_PTR;
@@ -152,7 +152,7 @@ STATUS TfliteDoubleInputOpParser::Parse(const std::unique_ptr
     op->primitive->value.value = attr.release();
   } else if (std::strcmp(node_name, "Maximum") == 0) {
     MS_LOG(DEBUG) << "parse TfliteMaximumParser";
-    std::unique_ptr<schema::MaximumT> attr = std::make_unique<schema::MaximumT>();
+    auto attr = std::make_unique<schema::MaximumT>();
     if (attr == nullptr) {
       MS_LOG(ERROR) << "new op failed";
       return RET_NULL_PTR;
@@ -161,7 +161,7 @@ STATUS TfliteDoubleInputOpParser::Parse(const std::unique_ptr
     op->primitive->value.value = attr.release();
   } else if (std::strcmp(node_name, "Minimum") == 0) {
     MS_LOG(DEBUG) << "parse TfliteMinimumParser";
-    std::unique_ptr<schema::MinimumT> attr = std::make_unique<schema::MinimumT>();
+    auto attr = std::make_unique<schema::MinimumT>();
     if (attr == nullptr) {
       MS_LOG(ERROR) << "new op failed";
       return RET_NULL_PTR;
@@ -202,7 +202,7 @@ STATUS TfliteSingleInputOpParser::Parse(const std::unique_ptr
   const char *node_name = node_name_str.data()->c_str();
   if (std::strcmp(node_name, "Abs") == 0) {
     MS_LOG(DEBUG) << "parse TfliteAbsParser";
-    std::unique_ptr<schema::AbsT> attr = std::make_unique<schema::AbsT>();
+    auto attr = std::make_unique<schema::AbsT>();
    if (attr == nullptr) {
       MS_LOG(ERROR) << "new op failed";
       return RET_NULL_PTR;
@@ -211,7 +211,7 @@ STATUS TfliteSingleInputOpParser::Parse(const std::unique_ptr
     op->primitive->value.value = attr.release();
   } else if (std::strcmp(node_name, "Exp") == 0) {
     MS_LOG(DEBUG) << "parse TfliteExpParser";
-    std::unique_ptr<schema::ExpT> attr = std::make_unique<schema::ExpT>();
+    auto attr = std::make_unique<schema::ExpT>();
     if (attr == nullptr) {
       MS_LOG(ERROR) << "new op failed";
       return RET_NULL_PTR;
@@ -220,7 +220,7 @@ STATUS TfliteSingleInputOpParser::Parse(const std::unique_ptr
     op->primitive->value.value = attr.release();
   } else if (std::strcmp(node_name, "Sqrt") == 0) {
     MS_LOG(DEBUG) << "parse TfliteSqrtParser";
-    std::unique_ptr<schema::SqrtT> attr = std::make_unique<schema::SqrtT>();
+    auto attr = std::make_unique<schema::SqrtT>();
     if (attr == nullptr) {
       MS_LOG(ERROR) << "new op failed";
       return RET_NULL_PTR;
@@ -229,7 +229,7 @@ STATUS TfliteSingleInputOpParser::Parse(const std::unique_ptr
     op->primitive->value.value = attr.release();
   } else if (std::strcmp(node_name, "Rsqrt") == 0) {
     MS_LOG(DEBUG) << "parse TfliteRsqrtParser";
-    std::unique_ptr<schema::RsqrtT> attr = std::make_unique<schema::RsqrtT>();
+    auto attr = std::make_unique<schema::RsqrtT>();
     if (attr == nullptr) {
       MS_LOG(ERROR) << "new op failed";
       return RET_NULL_PTR;
@@ -238,7 +238,7 @@ STATUS TfliteSingleInputOpParser::Parse(const std::unique_ptr
     op->primitive->value.value = attr.release();
   } else if (std::strcmp(node_name, "Square") == 0) {
     MS_LOG(DEBUG) << "parse TfliteSquareParser";
-    std::unique_ptr<schema::SquareT> attr = std::make_unique<schema::SquareT>();
+    auto attr = std::make_unique<schema::SquareT>();
     if (attr == nullptr) {
       MS_LOG(ERROR) << "new op failed";
       return RET_NULL_PTR;
@@ -247,7 +247,7 @@ STATUS TfliteSingleInputOpParser::Parse(const std::unique_ptr
     op->primitive->value.value = attr.release();
   } else if (std::strcmp(node_name, "Sin") == 0) {
     MS_LOG(DEBUG) << "parse TfliteSinParser";
-    std::unique_ptr<schema::SinT> attr = std::make_unique<schema::SinT>();
+    auto attr = std::make_unique<schema::SinT>();
     if (attr == nullptr) {
       MS_LOG(ERROR) << "new op failed";
       return RET_NULL_PTR;
@@ -265,7 +265,7 @@ STATUS TfliteSingleInputOpParser::Parse(const std::unique_ptr
TfliteSingleInputOpParser::Parse(const std::unique_ptr op->primitive->value.value = attr.release(); } else if (std::strcmp(node_name, "Log") == 0) { MS_LOG(DEBUG) << "parse TfliteLogParser"; - std::unique_ptr attr = std::make_unique(); + auto attr = std::make_unique(); if (attr == nullptr) { MS_LOG(ERROR) << "new op failed"; return RET_NULL_PTR; @@ -274,7 +274,7 @@ STATUS TfliteSingleInputOpParser::Parse(const std::unique_ptr op->primitive->value.value = attr.release(); } else if (std::strcmp(node_name, "Round") == 0) { MS_LOG(DEBUG) << "parse TfliteRoundParser"; - std::unique_ptr attr = std::make_unique(); + auto attr = std::make_unique(); if (attr == nullptr) { MS_LOG(ERROR) << "new op failed"; return RET_NULL_PTR; @@ -283,7 +283,7 @@ STATUS TfliteSingleInputOpParser::Parse(const std::unique_ptr op->primitive->value.value = attr.release(); } else if (std::strcmp(node_name, "Ceil") == 0) { MS_LOG(DEBUG) << "parse TfliteCeilParser"; - std::unique_ptr attr = std::make_unique(); + auto attr = std::make_unique(); if (attr == nullptr) { MS_LOG(ERROR) << "new op failed"; return RET_NULL_PTR; @@ -292,7 +292,7 @@ STATUS TfliteSingleInputOpParser::Parse(const std::unique_ptr op->primitive->value.value = attr.release(); } else if (std::strcmp(node_name, "flOOR") == 0) { MS_LOG(DEBUG) << "parse TfliteFloorParser"; - std::unique_ptr attr = std::make_unique(); + auto attr = std::make_unique(); if (attr == nullptr) { MS_LOG(ERROR) << "new op failed"; return RET_NULL_PTR; diff --git a/mindspore/lite/tools/converter/parser/tflite/tflite_model_parser.cc b/mindspore/lite/tools/converter/parser/tflite/tflite_model_parser.cc index 824ef52768..e4c2a5086f 100644 --- a/mindspore/lite/tools/converter/parser/tflite/tflite_model_parser.cc +++ b/mindspore/lite/tools/converter/parser/tflite/tflite_model_parser.cc @@ -28,26 +28,25 @@ namespace mindspore { namespace lite { TfliteModelParser::TfliteModelParser() = default; -TfliteModelParser::~TfliteModelParser() = default; +TfliteModelParser::~TfliteModelParser() { delete[](this->tfliteModelBuf); } std::unique_ptr TfliteModelParser::ReadTfliteModel(const char *model_path) { size_t size; - auto buf = ReadFile(model_path, &size); - if (buf == nullptr) { + tfliteModelBuf = ReadFile(model_path, &size); + if (tfliteModelBuf == nullptr) { MS_LOG(ERROR) << "the file buffer is nullptr"; return nullptr; } - flatbuffers::Verifier verify((const uint8_t *)buf, size); + flatbuffers::Verifier verify((const uint8_t *)tfliteModelBuf, size); if (!tflite::VerifyModelBuffer(verify)) { MS_LOG(ERROR) << "the buffer is invalid and fail to create graph"; return nullptr; } - return tflite::UnPackModel(buf); + return tflite::UnPackModel(tfliteModelBuf); } STATUS TfliteModelParser::CopyConstTensorData(const std::vector> &tflite_model_buffer, - const tflite::TensorT *tflite_tensor, - schema::TensorT *tensor) { + const tflite::TensorT *tflite_tensor, schema::TensorT *tensor) { auto count = 1; std::for_each(tflite_tensor->shape.begin(), tflite_tensor->shape.end(), [&](int32_t sha) { count *= sha; }); auto data_size = count * GetDataTypeSize(TypeId(tensor->dataType)); @@ -95,8 +94,7 @@ void TfliteModelParser::SetTensorQuantParam(const std::unique_ptr &tflite_model, const std::unique_ptr &tflite_subgraph, - const QuantType &quant_type, - schema::MetaGraphT *sub_graph) { + const QuantType &quant_type, schema::MetaGraphT *sub_graph) { int idx = 0; for (const auto &tflite_op : tflite_subgraph->operators) { auto tflite_op_type = (tflite_model->operator_codes[tflite_op->opcode_index])->builtin_code; @@ 
-107,7 +105,7 @@ STATUS TfliteModelParser::ConvertOp(const std::unique_ptr &tflit return RET_ERROR; } - std::unique_ptr op = std::make_unique(); + auto op = std::make_unique(); op->name = op_type + "-" + std::to_string(idx++); op->quantType = quant_type; MS_LOG(INFO) << "parse op: " << op->name.c_str(); @@ -227,7 +225,7 @@ STATUS TfliteModelParser::GetGraphInfo(const std::unique_ptr return RET_OK; } -STATUS TfliteModelParser::ConvertGroupDepthwiseOp(schema::MetaGraphT* sub_graph) { +STATUS TfliteModelParser::ConvertGroupDepthwiseOp(schema::MetaGraphT *sub_graph) { for (auto &op : sub_graph->nodes) { if (op->primitive->value.type == schema::PrimitiveType_DepthwiseConv2D) { auto attr = op->primitive->value.AsDepthwiseConv2D(); @@ -249,62 +247,66 @@ STATUS TfliteModelParser::ConvertGroupDepthwiseOp(schema::MetaGraphT* sub_graph) return RET_NULL_PTR; } auto data_shape = data_tensor->dims; - conv_attr->channelIn = data_shape[3]; - conv_attr->channelOut = conv_attr->channelIn * attr->channelMultiplier; - - // update attr - conv_attr->group = 0; - conv_attr->format = attr->format; - conv_attr->kernelH = attr->kernelH; - conv_attr->kernelW = attr->kernelW; - conv_attr->strideH = attr->strideH; - conv_attr->strideW = attr->strideW; - conv_attr->padMode = attr->padMode; - conv_attr->padUp = attr->padUp; - conv_attr->padDown = attr->padDown; - conv_attr->padLeft = attr->padLeft; - conv_attr->padRight = attr->padRight; - conv_attr->dilateH = attr->dilateH; - conv_attr->dilateW = attr->dilateW; - conv_attr->hasBias = attr->hasBias; - conv_attr->activationType = attr->activationType; - - op->primitive->value.type = schema::PrimitiveType_Conv2D; - op->primitive->value.value = conv_attr.release(); - - // update weight - auto weight_id = op->inputIndex[1]; - auto &weight_tensor = sub_graph->allTensors.at(weight_id); - if (weight_tensor->dataType == TypeId::kNumberTypeUInt8) { - auto status = TransFilterFormat(weight_tensor.get(), kKHWC2CHWK); - if (status != RET_OK) { - MS_LOG(ERROR) << "Trans depthwiseConv Filter Format failed."; - return RET_ERROR; - } - } - if (weight_tensor->dataType == kNumberTypeFloat32 || weight_tensor->dataType == kNumberTypeFloat) { - auto status = TransFilterFormat(weight_tensor.get(), kKHWC2CHWK); - if (status != RET_OK) { - MS_LOG(ERROR) << "Trans filter format failed."; + + if (data_shape[3] == 1) { + conv_attr->channelIn = data_shape[3]; + conv_attr->channelOut = conv_attr->channelIn * attr->channelMultiplier; + + // update attr + conv_attr->group = 1; + conv_attr->format = attr->format; + conv_attr->kernelH = attr->kernelH; + conv_attr->kernelW = attr->kernelW; + conv_attr->strideH = attr->strideH; + conv_attr->strideW = attr->strideW; + conv_attr->padMode = attr->padMode; + conv_attr->padUp = attr->padUp; + conv_attr->padDown = attr->padDown; + conv_attr->padLeft = attr->padLeft; + conv_attr->padRight = attr->padRight; + conv_attr->dilateH = attr->dilateH; + conv_attr->dilateW = attr->dilateW; + conv_attr->hasBias = attr->hasBias; + conv_attr->activationType = attr->activationType; + + op->primitive->value.type = schema::PrimitiveType_Conv2D; + op->primitive->value.value = conv_attr.release(); + + // update weight + auto weight_id = op->inputIndex[1]; + auto &weight_tensor = sub_graph->allTensors.at(weight_id); + if (weight_tensor->dataType == TypeId::kNumberTypeUInt8) { + auto status = TransFilterFormat(weight_tensor.get(), kKHWC2CHWK); + if (status != RET_OK) { + MS_LOG(ERROR) << "Trans depthwiseConv Filter Format failed."; + return RET_ERROR; + } + } else if 
(weight_tensor->dataType == kNumberTypeFloat32 || weight_tensor->dataType == kNumberTypeFloat) { + auto status = TransFilterFormat(weight_tensor.get(), kKHWC2CHWK); + if (status != RET_OK) { + MS_LOG(ERROR) << "Trans filter format failed."; + return RET_ERROR; + } + } else { + MS_LOG(ERROR) << "The dataType of weight tensor is unsupported."; + return RET_ERROR; } + weight_tensor->format = schema::Format_CHWK; } - weight_tensor->format = schema::Format_CHWK; } } } return RET_OK; } -MetaGraphT *TfliteModelParser::Parse(const std::string &model_file, - const std::string &weight_file, - const QuantType &quant_type) { - std::unique_ptr sub_graph = std::make_unique(); - sub_graph->name = "MS_model converted by TF-Lite"; - quantType = quant_type; - +schema::MetaGraphT *TfliteModelParser::ParseToFb(const std::string &model_file, const std::string &weight_file, + const QuantType &quant_type) { // load graph - std::unique_ptr tflite_model = ReadTfliteModel(model_file.c_str()); + auto tflite_model = ReadTfliteModel(model_file.c_str()); + if (tflite_model == nullptr) { + MS_LOG(ERROR) << "read tflite model failed"; + return nullptr; + } if (tflite_model->subgraphs.size() != 1) { MS_LOG(ERROR) << "read tflite model subgraphs failed"; @@ -312,31 +314,38 @@ MetaGraphT *TfliteModelParser::Parse(const std::string &model_file, } const auto &tflite_subgraph = tflite_model->subgraphs[0]; + auto meta_graph = std::make_unique(); + if (meta_graph == nullptr) { + MS_LOG(ERROR) << "new meta graph failed"; + return nullptr; + } + meta_graph->name = "MS_model converted by TF-Lite"; + quantType = quant_type; // convert op - if (ConvertOp(tflite_model, tflite_subgraph, quant_type, sub_graph.get()) != RET_OK) { + if (ConvertOp(tflite_model, tflite_subgraph, quant_type, meta_graph.get()) != RET_OK) { MS_LOG(ERROR) << "parse op failed."; return nullptr; } // convert tensor - if (ConvertTensor(tflite_subgraph, tflite_model->buffers, sub_graph.get()) != RET_OK) { + if (ConvertTensor(tflite_subgraph, tflite_model->buffers, meta_graph.get()) != RET_OK) { MS_LOG(ERROR) << "convert tensor failed"; return nullptr; } // set graph input/output - if (GetGraphInfo(tflite_subgraph, sub_graph.get()) != RET_OK) { + if (GetGraphInfo(tflite_subgraph, meta_graph.get()) != RET_OK) { MS_LOG(ERROR) << "convert tensors failed"; return nullptr; } // update for depthwiseConv - if (ConvertGroupDepthwiseOp(sub_graph.get()) != RET_OK) { + if (ConvertGroupDepthwiseOp(meta_graph.get()) != RET_OK) { MS_LOG(ERROR) << "convert group depthwise conv failed"; return nullptr; } - return sub_graph.release(); + return meta_graph.release(); } } // namespace lite } // namespace mindspore diff --git a/mindspore/lite/tools/converter/parser/tflite/tflite_model_parser.h b/mindspore/lite/tools/converter/parser/tflite/tflite_model_parser.h index 71e28c3c88..38dbe95592 100644 --- a/mindspore/lite/tools/converter/parser/tflite/tflite_model_parser.h +++ b/mindspore/lite/tools/converter/parser/tflite/tflite_model_parser.h @@ -41,7 +41,7 @@ class TfliteModelParser : public ModelParser { ~TfliteModelParser() override; - MetaGraphT *Parse(const std::string &model_file, + schema::MetaGraphT *ParseToFb(const std::string &model_file, const std::string &weight_file, const QuantType &quantType = QuantType_QUANT_NONE) override; @@ -78,6 +78,7 @@ class TfliteModelParser : public ModelParser { std::map opMap; std::map tfliteOpMap; QuantType quantType = QuantType_QUANT_NONE; + char *tfliteModelBuf = nullptr; }; } // namespace lite } // namespace mindspore
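
Editor's note, not part of the patch: the ConvertGroupDepthwiseOp hunks above are easier to follow as the rule they implement. A minimal sketch, using the names from the hunks, error handling elided, and only the data_shape[3] == 1 branch shown; the template argument of TransFilterFormat is an assumption mirroring the per-dataType branches above:

// A TFLite DepthwiseConv2D whose input has a single channel is an ordinary
// convolution: channelIn = 1, channelOut = channelMultiplier, group = 1.
if (data_shape[3] == 1) {
  conv_attr->channelIn = data_shape[3];  // == 1
  conv_attr->channelOut = conv_attr->channelIn * attr->channelMultiplier;
  conv_attr->group = 1;  // no longer a grouped convolution
  op->primitive->value.type = schema::PrimitiveType_Conv2D;
  op->primitive->value.value = conv_attr.release();
  // The weight layout must match Conv2D, so the filter is transposed KHWC -> CHWK.
  TransFilterFormat<uint8_t>(weight_tensor.get(), kKHWC2CHWK);  // illustrative; float weights handled analogously
  weight_tensor->format = schema::Format_CHWK;
}

diff --git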
a/mindspore/lite/tools/converter/parser/tflite/tflite_pooling_parser.cc b/mindspore/lite/tools/converter/parser/tflite/tflite_pooling_parser.cc index 8a1f2d4b38..96fb0681c8 100644 --- a/mindspore/lite/tools/converter/parser/tflite/tflite_pooling_parser.cc +++ b/mindspore/lite/tools/converter/parser/tflite/tflite_pooling_parser.cc @@ -70,6 +70,7 @@ STATUS TflitePoolingParser::Parse(const std::unique_ptr &tfli attr->global = false; attr->roundMode = schema::RoundMode_FLOOR; + attr->activationType = GetActivationFunctionType(tflite_attr->fused_activation_function); // calculate pad params auto data_index = tflite_op->inputs[0]; diff --git a/mindspore/lite/tools/converter/parser/tflite/tflite_slice_parser.cc b/mindspore/lite/tools/converter/parser/tflite/tflite_slice_parser.cc index 12e1afaea6..571f6ad4d1 100644 --- a/mindspore/lite/tools/converter/parser/tflite/tflite_slice_parser.cc +++ b/mindspore/lite/tools/converter/parser/tflite/tflite_slice_parser.cc @@ -55,7 +55,12 @@ STATUS TfliteSliceParser::Parse(const std::unique_ptr &tflite MS_LOG(ERROR) << "get slice -> size failed"; return RET_ERROR; } - + std::vector axes; + axes.clear(); + for (size_t i = 0; i < attr->begin.size(); ++i) { + axes.push_back(i); + } + attr->axes = axes; op->primitive->value.type = schema::PrimitiveType_Slice; op->primitive->value.value = attr.release(); diff --git a/mindspore/lite/tools/converter/parser/tflite/tflite_util.cc b/mindspore/lite/tools/converter/parser/tflite/tflite_util.cc index eb9b247a1e..d987b6e8f9 100644 --- a/mindspore/lite/tools/converter/parser/tflite/tflite_util.cc +++ b/mindspore/lite/tools/converter/parser/tflite/tflite_util.cc @@ -107,7 +107,6 @@ std::map tfMsOpTypeMap{ {tflite::BuiltinOperator_DEPTH_TO_SPACE, "DepthToSpace"}, {tflite::BuiltinOperator_SPACE_TO_BATCH_ND, "SpaceToBatchND"}, {tflite::BuiltinOperator_SPACE_TO_DEPTH, "SpaceToDepth"}, - {tflite::BuiltinOperator_PRELU, "Prelu"}, {tflite::BuiltinOperator_ROUND, "Round"}, {tflite::BuiltinOperator_WHERE, "Where"}, {tflite::BuiltinOperator_SPARSE_TO_DENSE, "SparseToDense"}, diff --git a/mindspore/lite/tools/converter/quantizer/aware_quantizer.cc b/mindspore/lite/tools/converter/quantizer/aware_quantizer.cc index 3c4ff4dc2c..2a55aedf93 100644 --- a/mindspore/lite/tools/converter/quantizer/aware_quantizer.cc +++ b/mindspore/lite/tools/converter/quantizer/aware_quantizer.cc @@ -226,7 +226,8 @@ STATUS AwareQuantizer::DoQuantize() { } STATUS status; if (GetCNodeTType(*node) == schema::PrimitiveType_Conv2D || - GetCNodeTType(*node) == schema::PrimitiveType_DepthwiseConv2D) { + GetCNodeTType(*node) == schema::PrimitiveType_DepthwiseConv2D || + GetCNodeTType(*node) == schema::PrimitiveType_FullConnection) { auto inputIndexes = node->inputIndex; if (inputIndexes.size() < 2) { MS_LOG(ERROR) << node->name.c_str() diff --git a/mindspore/lite/tools/converter/quantizer/post_training_quantizer.cc b/mindspore/lite/tools/converter/quantizer/post_training_quantizer.cc index e6f426a759..9d7a8b4478 100644 --- a/mindspore/lite/tools/converter/quantizer/post_training_quantizer.cc +++ b/mindspore/lite/tools/converter/quantizer/post_training_quantizer.cc @@ -41,201 +41,174 @@ using std::vector; namespace mindspore { namespace lite { namespace quant { -struct DivergInfo { - std::vector histogram; - CNodePtr cnode; - int bin_num; - float interval = 0; - float max; - float min; - float best_T = 0.0f; - size_t bit_num; - int quant_max = 255; - int quant_min = 0; - std::string method_x = kMethodKL; - - DivergInfo(CNodePtr cnode, int bins, size_t bits, int 
quant_max, int quant_min, const std::string &method_x) { - this->method_x = method_x; - this->cnode = cnode; - this->bin_num = bins; - this->bit_num = bits; - histogram.resize(bin_num); - max = -FLT_MAX; - min = FLT_MAX; - this->quant_max = quant_max; - this->quant_min = quant_min; - std::fill(histogram.begin(), histogram.end(), 1.0e-7); +STATUS DivergInfo::RecordMaxValue(const std::vector &datas) { + for (float data : datas) { + max = std::max(data, max); + min = std::min(data, min); } + return RET_OK; +} + +void DivergInfo::UpdateInterval() { + auto max_value = std::max(fabs(this->max), fabs(this->min)); + this->interval = max_value / static_cast(bin_num); +} - STATUS RecordMaxValue(const std::vector &datas) { - for (float data : datas) { - max = std::max(data, max); - min = std::min(data, min); +STATUS DivergInfo::UpdateHistogram(const std::vector &data) { + for (auto value : data) { + if (value == 0) { + continue; } - return RET_OK; + int bin_index = std::min(static_cast(std::fabs(value) / this->interval), bin_num - 1); + this->histogram[bin_index]++; } + return RET_OK; +} - void UpdateInterval() { - auto max_value = std::max(fabs(this->max), fabs(this->min)); - this->interval = max_value / static_cast(bin_num); +void DivergInfo::DumpHistogram() { + MS_LOG(INFO) << "Print node " << cnode->fullname_with_scope() << " histogram"; + for (float item : this->histogram) { + std::cout << item << " "; } + std::cout << std::endl; +} - STATUS UpdateHistogram(const std::vector &data, const std::vector &shape) { - for (auto value : data) { - if (value == 0) { - continue; - } - int bin_index = std::min(static_cast(std::fabs(value) / this->interval), bin_num - 1); - this->histogram[bin_index]++; - } +STATUS DivergInfo::ComputeThreshold() { + if (method_x == kMethodMaxMin) { + this->best_T = std::max(fabs(this->max), fabs(this->min)); + MS_LOG(DEBUG) << "using MAX_MIN, T: " << this->best_T; return RET_OK; } - void DumpHistogram() { - MS_LOG(INFO) << "Print node " << cnode->fullname_with_scope() << " histogram"; - for (float item : this->histogram) { - std::cout << item << " "; - } - std::cout << std::endl; - } - - STATUS ComputeThreshold() { - if (method_x == kMethodMaxMin) { - this->best_T = std::max(fabs(this->max), fabs(this->min)); - MS_LOG(DEBUG) << "using MAX_MIN, T: " << this->best_T; - return RET_OK; + constexpr int quant_bint_nums = 128; + int threshold = quant_bint_nums; + float min_kl = FLT_MAX; + float after_threshold_sum = std::accumulate(this->histogram.begin() + quant_bint_nums, this->histogram.end(), 0.0f); + + for (int i = quant_bint_nums; i < this->bin_num; ++i) { + std::vector quantized_histogram(quant_bint_nums, 0); + std::vector reference_histogram(this->histogram.begin(), this->histogram.begin() + i); + std::vector expanded_histogram(i, 0); + reference_histogram[i - 1] += after_threshold_sum; + after_threshold_sum -= this->histogram[i]; + + const float bin_interval = static_cast(i) / static_cast(quant_bint_nums); + + // merge i bins to target bins + for (int j = 0; j < quant_bint_nums; ++j) { + const float start = j * bin_interval; + const float end = start + bin_interval; + const int left_upper = static_cast(std::ceil(start)); + if (left_upper > start) { + const double left_scale = left_upper - start; + quantized_histogram[j] += left_scale * this->histogram[left_upper - 1]; + } + const int right_lower = static_cast(std::floor(end)); + if (right_lower < end) { + const double right_scale = end - right_lower; + quantized_histogram[j] += right_scale * this->histogram[right_lower]; + 
} + std::for_each(this->histogram.begin() + left_upper, this->histogram.begin() + right_lower, + [&quantized_histogram, j](float item) { quantized_histogram[j] += item; }); } - - constexpr int quant_bint_nums = 128; - int threshold = quant_bint_nums; - float min_kl = FLT_MAX; - float after_threshold_sum = std::accumulate(this->histogram.begin() + quant_bint_nums, this->histogram.end(), 0.0f); - - for (int i = quant_bint_nums; i < this->bin_num; ++i) { - std::vector quantized_histogram(quant_bint_nums, 0); - std::vector reference_histogram(this->histogram.begin(), this->histogram.begin() + i); - std::vector expanded_histogram(i, 0); - reference_histogram[i - 1] += after_threshold_sum; - after_threshold_sum -= this->histogram[i]; - - const float bin_interval = static_cast(i) / static_cast(quant_bint_nums); - - // merge i bins to target bins - for (int j = 0; j < quant_bint_nums; ++j) { - const float start = j * bin_interval; - const float end = start + bin_interval; - const int left_upper = static_cast(std::ceil(start)); - if (left_upper > start) { - const double left_scale = left_upper - start; - quantized_histogram[j] += left_scale * this->histogram[left_upper - 1]; - } - const int right_lower = static_cast(std::floor(end)); - if (right_lower < end) { - const double right_scale = end - right_lower; - quantized_histogram[j] += right_scale * this->histogram[right_lower]; + // expand target bins to i bins in order to calculate KL with reference_histogram + for (int j = 0; j < quant_bint_nums; ++j) { + const float start = j * bin_interval; + const float end = start + bin_interval; + float count = 0; + const int left_upper = static_cast(std::ceil(start)); + float left_scale = 0.0f; + if (left_upper > start) { + left_scale = left_upper - start; + if (this->histogram[left_upper - 1] != 0) { + count += left_scale; } - std::for_each(this->histogram.begin() + left_upper, this->histogram.begin() + right_lower, - [&quantized_histogram, j](float item) { quantized_histogram[j] += item; }); } - // expand target bins to i bins in order to calculate KL with reference_histogram - for (int j = 0; j < quant_bint_nums; ++j) { - const float start = j * bin_interval; - const float end = start + bin_interval; - float count = 0; - const int left_upper = static_cast(std::ceil(start)); - float left_scale = 0.0f; - if (left_upper > start) { - left_scale = left_upper - start; - if (this->histogram[left_upper - 1] != 0) { - count += left_scale; - } - } - const int right_lower = static_cast(std::floor(end)); - double right_scale = 0.0f; - if (right_lower < end) { - right_scale = end - right_lower; - if (this->histogram[right_lower] != 0) { - count += right_scale; - } + const int right_lower = static_cast(std::floor(end)); + double right_scale = 0.0f; + if (right_lower < end) { + right_scale = end - right_lower; + if (this->histogram[right_lower] != 0) { + count += right_scale; } - std::for_each(this->histogram.begin() + left_upper, this->histogram.begin() + right_lower, - [&count](float item) { - if (item != 0) { - count += 1; - } - }); - if (count == 0) { - continue; - } - const float average_num = quantized_histogram[j] / count; - if (left_upper > start && this->histogram[left_upper - 1] != 0) { - expanded_histogram[left_upper - 1] += average_num * left_scale; - } - if (right_lower < end && this->histogram[right_lower] != 0) { - expanded_histogram[right_lower] += average_num * right_scale; + } + std::for_each(this->histogram.begin() + left_upper, this->histogram.begin() + right_lower, [&count](float item) { + if (item 
!= 0) { + count += 1; } - for (int k = left_upper; k < right_lower; ++k) { - if (this->histogram[k] != 0) { - expanded_histogram[k] += average_num; - } + }); + if (count == 0) { + continue; + } + const float average_num = quantized_histogram[j] / count; + if (left_upper > start && this->histogram[left_upper - 1] != 0) { + expanded_histogram[left_upper - 1] += average_num * left_scale; + } + if (right_lower < end && this->histogram[right_lower] != 0) { + expanded_histogram[right_lower] += average_num * right_scale; + } + for (int k = left_upper; k < right_lower; ++k) { + if (this->histogram[k] != 0) { + expanded_histogram[k] += average_num; } } - auto KLDivergence = [](std::vector p, std::vector q) { - auto sum = 0.0f; - std::for_each(p.begin(), p.end(), [&sum](float item) { sum += item; }); - std::for_each(p.begin(), p.end(), [sum](float &item) { item /= sum; }); - sum = 0.0f; - std::for_each(q.begin(), q.end(), [&sum](float item) { sum += item; }); - std::for_each(q.begin(), q.end(), [sum](float &item) { item /= sum; }); - - float result = 0.0f; - const int size = p.size(); - for (int i = 0; i < size; ++i) { - if (p[i] != 0) { - if (q[i] == 0) { - result += 1.0f; - } else { - result += (p[i] * std::log((p[i]) / (q[i]))); - } + } + auto KLDivergence = [](std::vector p, std::vector q) { + auto sum = 0.0f; + std::for_each(p.begin(), p.end(), [&sum](float item) { sum += item; }); + std::for_each(p.begin(), p.end(), [sum](float &item) { item /= sum; }); + sum = 0.0f; + std::for_each(q.begin(), q.end(), [&sum](float item) { sum += item; }); + std::for_each(q.begin(), q.end(), [sum](float &item) { item /= sum; }); + + float result = 0.0f; + const int size = p.size(); + for (int i = 0; i < size; ++i) { + if (p[i] != 0) { + if (q[i] == 0) { + result += 1.0f; + } else { + result += (p[i] * std::log((p[i]) / (q[i]))); } } - return result; - }; - const float kl = KLDivergence(reference_histogram, expanded_histogram); - if (kl < min_kl) { - min_kl = kl; - threshold = i; } + return result; + }; + const float kl = KLDivergence(reference_histogram, expanded_histogram); + if (kl < min_kl) { + min_kl = kl; + threshold = i; } - this->best_T = (static_cast(threshold) + 0.5f) * this->interval; - MS_LOG(DEBUG) << cnode->fullname_with_scope() << " Best threshold bin index: " << threshold << " T: " << best_T - << " max: " << std::max(fabs(this->max), fabs(this->min)); - return RET_OK; } + this->best_T = (static_cast(threshold) + 0.5f) * this->interval; + MS_LOG(DEBUG) << cnode->fullname_with_scope() << " Best threshold bin index: " << threshold << " T: " << best_T + << " max: " << std::max(fabs(this->max), fabs(this->min)); + return RET_OK; +} - std::pair GetScale() { - float max_value = this->best_T; - float min_value = -max_value; +std::pair DivergInfo::GetScale() { + float max_value = this->best_T; + float min_value = -max_value; - MS_ASSERT(quant_max - quant_min != 0); - float scale = (max_value - min_value) / (quant_max - quant_min); - MS_ASSERT(scale != 0); - return std::make_pair(this->cnode, scale); - } + MS_ASSERT(quant_max - quant_min != 0); + float scale = (max_value - min_value) / (quant_max - quant_min); + MS_ASSERT(scale != 0); + return std::make_pair(this->cnode, scale); +} - std::pair GetZeropoint() { - int zero_point = 0; - if (quant_min == 0 && quant_max == 255) { - zero_point = 128; - } else if (quant_min == -127 && quant_max == 127) { - zero_point = 0; - } else { - MS_LOG(WARNING) << "unexpectd quant range, quant_min: " << quant_min << " quant_max: " << quant_max; - } - return 
std::make_pair(this->cnode, zero_point); +std::pair DivergInfo::GetZeropoint() { + int zero_point = 0; + if (quant_min == 0 && quant_max == 255) { + zero_point = 128; + } else if (quant_min == -127 && quant_max == 127) { + zero_point = 0; + } else { + MS_LOG(WARNING) << "unexpected quant range, quant_min: " << quant_min << " quant_max: " << quant_max; } -}; -std::unordered_map Calibrator::GetResult( + return std::make_pair(this->cnode, zero_point); +} + +std::unordered_map Calibrator::GetScale( std::unordered_map> *diverg_info) { std::unordered_map result; for (auto iter = diverg_info->begin(); iter != diverg_info->end(); iter++) { @@ -246,9 +219,9 @@ std::unordered_map Calibrator::GetResult( return result; } std::unordered_map Calibrator::GetZeropoint( - std::unordered_map> *mDivergInfo) { + std::unordered_map> *diverg_info) { std::unordered_map result; - for (auto iter = mDivergInfo->begin(); iter != mDivergInfo->end(); iter++) { + for (auto iter = diverg_info->begin(); iter != diverg_info->end(); iter++) { DivergInfo *info = iter->second.get(); auto zeropoint = info->GetZeropoint(); result.insert(zeropoint); @@ -257,9 +230,9 @@ std::unordered_map Calibrator::GetZeropoint( } std::map Calibrator::GetMinMax( - std::unordered_map> *mDivergInfo) { + std::unordered_map> *diverg_info) { std::map result; - for (auto iter = mDivergInfo->begin(); iter != mDivergInfo->end(); iter++) { + for (auto iter = diverg_info->begin(); iter != diverg_info->end(); iter++) { DivergInfo *info = iter->second.get(); mindspore::lite::quant::MaxMin input_maxmin{}; input_maxmin.min = info->min; @@ -284,10 +257,10 @@ std::unordered_map> *Calibrator::GetOut return &this->output_diverg_info_; } -STATUS Calibrator::RecordMaxValue(std::string opName, vector data, - std::unordered_map> *mDivergInfo) { - auto got = (*mDivergInfo).find(opName); - if (got != (*mDivergInfo).end()) { +STATUS Calibrator::RecordMaxValue(const std::string &op_name, const vector &data, + std::unordered_map> *diverg_info) { + auto got = (*diverg_info).find(op_name); + if (got != (*diverg_info).end()) { ((*got).second)->RecordMaxValue(data); } return RET_OK; @@ -332,11 +305,11 @@ STATUS Calibrator::UpdateDivergInverval(std::unordered_map data, vector shape, +STATUS Calibrator::UpdateDataFrequency(const std::string &op_name, const vector &data, std::unordered_map> *diverg_info) { auto got = (*diverg_info).find(op_name); if (got != (*diverg_info).end()) { - ((*got).second)->UpdateHistogram(data, shape); + ((*got).second)->UpdateHistogram(data); } return RET_OK; } @@ -347,10 +320,10 @@ STATUS Calibrator::AddQuantizedOp(CNodePtr node) { return RET_ERROR; } string node_name = node->fullname_with_scope(); - std::unique_ptr input_diverg = - std::unique_ptr(new DivergInfo(node, 2048, bit_num_, quant_max_, quant_min_, config_param_.method_x)); - std::unique_ptr output_diverg = - std::unique_ptr(new DivergInfo(node, 2048, bit_num_, quant_max_, quant_min_, config_param_.method_x)); + std::unique_ptr input_diverg = std::unique_ptr( + new DivergInfo(node, kDefaultBinNumber, bit_num_, quant_max_, quant_min_, config_param_.method_x)); + std::unique_ptr output_diverg = std::unique_ptr( + new DivergInfo(node, kDefaultBinNumber, bit_num_, quant_max_, quant_min_, config_param_.method_x)); input_diverg_info_.insert(std::make_pair(string(node_name), std::move(input_diverg))); output_diverg_info_.insert(std::make_pair(string(node_name), std::move(output_diverg)));
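
Editor's note, not part of the patch: the DivergInfo methods moved out of the header above implement KL-divergence calibration. A minimal sketch of the selection loop inside ComputeThreshold, where ClipAndMerge and QuantizeExpand are hypothetical helpers standing in for the two histogram passes shown in the hunks:

// Pick the clip threshold whose 128-bin quantization loses the least information,
// then convert the winning bin index back to a threshold in the value domain.
int best_bin = 128;
float min_kl = FLT_MAX;
for (int i = 128; i < bin_num; ++i) {
  std::vector<float> p = ClipAndMerge(histogram, i);    // reference: first i bins, tail folded into bin i-1
  std::vector<float> q = QuantizeExpand(histogram, i);  // merge i bins down to 128, expand back to i
  const float kl = KLDivergence(p, q);                  // same lambda as in the hunk above
  if (kl < min_kl) { min_kl = kl; best_bin = i; }
}
float best_T = (best_bin + 0.5f) * interval;            // matches the formula in the hunk

@@ -359,29 +332,33 @@ void Calibrator::AddImage(const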
string file) { auto exist = [](const string file) { - struct stat buf; + struct stat buf {}; return stat(file.c_str(), &buf) == 0; }; if (exist(file)) { MS_LOG(INFO) << "load image: " << file; this->images_.push_back(file); } else { - MS_LOG(WARNING) << "Invaild image file path: " << file; + MS_LOG(WARNING) << "invalid image file path: " << file; } } -STATUS Calibrator::GenerateInputData(const int index, mindspore::tensor::MSTensor *tensor) const { +STATUS Calibrator::GenerateInputData(int index, mindspore::tensor::MSTensor *tensor) const { string path = images_[index]; MS_LOG(INFO) << "read image: " << path; size_t size; - char *binBuf = ReadFile(path.c_str(), &size); + char *bin_buf = ReadFile(path.c_str(), &size); auto data = tensor->MutableData(); if (size != tensor->Size()) { MS_LOG(ERROR) << "the input data is not consistent with model input, file_size: " << size << " input tensor size: " << tensor->Size(); return RET_ERROR; } - memcpy(data, binBuf, size); + auto ret = memcpy_s(data, tensor->Size(), bin_buf, size); + if (ret != EOK) { + MS_LOG(ERROR) << "memcpy_s error: " << ret; + return RET_ERROR; + } return RET_OK; } @@ -467,7 +444,7 @@ STATUS Calibrator::ReadConfig() { } MS_LOG(DEBUG) << "image_path: " << config_param_.image_path << " " << "batch_count: " << config_param_.batch_count << " " - << "mothod_x: " << config_param_.method_x << " " + << "method_x: " << config_param_.method_x << " " << "thread_num: " << config_param_.thread_num; delete[] resolved_path; @@ -475,8 +452,8 @@ STATUS Calibrator::ReadConfig() { return RET_OK; } -Calibrator::Calibrator(string path, size_t bitNum, int quantMax, int quantMin) - : config_path_(path), bit_num_(bitNum), quant_max_(quantMax), quant_min_(quantMin) {} +Calibrator::Calibrator(string path, size_t bit_num, int quant_max, int quant_min) + : config_path_(path), bit_num_(bit_num), quant_max_(quant_max), quant_min_(quant_min) {} PostTrainingQuantizer::PostTrainingQuantizer(FuncGraphPtr graph, string path, int bit_num, TypeId target_type, bool per_channel) @@ -486,7 +463,7 @@ PostTrainingQuantizer::PostTrainingQuantizer(FuncGraphPtr graph, string path, in this->target_type_ = target_type; if (target_type == kNumberTypeInt8) { quant_max = (1 << (this->bit_num - 1)) - 1; // 127 - quant_min = -quant_max; // -127 + quant_min = -quant_max; // -127 } else if (target_type == kNumberTypeUInt8) { quant_max = (1 << this->bit_num) - 1; // 255 quant_min = 0; @@ -534,8 +511,8 @@ STATUS PostTrainingQuantizer::DoQuantOutput(double scale, int zeropoint, struct return RET_OK; } -STATUS PostTrainingQuantizer::DoWeightQuant(AnfNodePtr weight, std::shared_ptr primitiveT_value, - bool perchanel, bool depthwise) { +STATUS PostTrainingQuantizer::DoWeightQuant(AnfNodePtr weight, std::shared_ptr primitive_c, bool perchanel, + bool depthwise) { // const vector dims = filter->dims; // perlayer if (!weight->isa()) { @@ -552,8 +529,8 @@ STATUS PostTrainingQuantizer::DoWeightQuant(AnfNodePtr weight, std::shared_ptr

    fullname_with_scope() << " can not get value"; return RET_ERROR; } - auto status = QuantFilter(paramValue, primitiveT_value, QuantType_PostTraining, quant_max, quant_min, bit_num, - perchanel, depthwise); + auto status = + QuantFilter(paramValue, primitive_c, QuantType_PostTraining, quant_max, quant_min, bit_num, perchanel, depthwise); if (status != RET_OK) { MS_LOG(ERROR) << "QuantFilter failed: " << status; return status; @@ -573,8 +550,8 @@ STATUS PostTrainingQuantizer::DoWeightQuant(AnfNodePtr weight, std::shared_ptr

    primitiveT_value) { - if (primitiveT_value == nullptr || bias == nullptr) { +STATUS PostTrainingQuantizer::DoBiasQuant(AnfNodePtr bias, std::shared_ptr primitive_c) { + if (primitive_c == nullptr || bias == nullptr) { MS_LOG(ERROR) << "null pointer!"; return RET_NULL_PTR; } @@ -583,7 +560,7 @@ STATUS PostTrainingQuantizer::DoBiasQuant(AnfNodePtr bias, std::shared_ptrdefault_param(); auto bias_param = std::dynamic_pointer_cast(bias_default_param); - auto active_weight_quant_params = primitiveT_value->GetInputQuantParams(); + auto active_weight_quant_params = primitive_c->GetInputQuantParams(); if (active_weight_quant_params.size() != 2) { MS_LOG(ERROR) << "unexpected active_weight_quant_params size: " << active_weight_quant_params.size(); return RET_ERROR; @@ -627,7 +604,7 @@ STATUS PostTrainingQuantizer::DoBiasQuant(AnfNodePtr bias, std::shared_ptrAddInputQuantParam(quant_params); + primitive_c->AddInputQuantParam(quant_params); // quant bias data int32_t *quant_datas = new (std::nothrow) int32_t[shape_size]; if (quant_datas == nullptr) { @@ -669,11 +646,11 @@ STATUS PostTrainingQuantizer::DoBiasQuant(AnfNodePtr bias, std::shared_ptrcalibrator_->GetMinMax(this->calibrator_->GetInputDivergInfo()); - auto input_scale = this->calibrator_->GetResult(this->calibrator_->GetInputDivergInfo()); + auto input_scale = this->calibrator_->GetScale(this->calibrator_->GetInputDivergInfo()); auto input_zero_point = this->calibrator_->GetZeropoint(this->calibrator_->GetInputDivergInfo()); auto output_min_max = this->calibrator_->GetMinMax(this->calibrator_->GetOutputDivergInfo()); - auto output_scale = this->calibrator_->GetResult(this->calibrator_->GetOutputDivergInfo()); + auto output_scale = this->calibrator_->GetScale(this->calibrator_->GetOutputDivergInfo()); auto output_zeropoint = this->calibrator_->GetZeropoint(this->calibrator_->GetOutputDivergInfo()); auto cnodes = funcGraph->GetOrderedCnodes(); @@ -683,18 +660,18 @@ STATUS PostTrainingQuantizer::QuantNode() { MS_LOG(INFO) << cnode_name << " can not do quant"; continue; } - auto primitiveT_value = GetValueNode>(cnode->input(0)); - if (primitiveT_value == nullptr) { - MS_LOG(ERROR) << "PrimitiveT_value is nullptr"; + auto primitive_c = GetValueNode>(cnode->input(0)); + if (primitive_c == nullptr) { + MS_LOG(ERROR) << "primitive_c is nullptr"; continue; } if (input_scale.find(cnode) == input_scale.end()) { - primitiveT_value->SetQuantType(schema::QuantType_QUANT_NONE); + primitive_c->SetQuantType(schema::QuantType_QUANT_NONE); continue; } - primitiveT_value->ClearInputOutputQuantParam(); + primitive_c->ClearInputOutputQuantParam(); auto op_name = cnode->fullname_with_scope(); - auto op_type = (schema::PrimitiveType)primitiveT_value->Type(); + auto op_type = (schema::PrimitiveType)primitive_c->Type(); MS_LOG(INFO) << "OpName: " << op_name; if (op_type != PrimitiveType_Conv2D && op_type != PrimitiveType_DepthwiseConv2D && op_type != PrimitiveType_FullConnection) { @@ -715,35 +692,35 @@ STATUS PostTrainingQuantizer::QuantNode() { auto abstractTensor = utils::cast(abstractBase); if (abstractTensor->element()->GetTypeTrack()->type_id() == kNumberTypeFloat32) { MS_LOG(DEBUG) << "this parameter do quant"; - DoWeightQuant(input_node, primitiveT_value, false, false); + DoWeightQuant(input_node, primitive_c, false, false); } else { MS_LOG(DEBUG) << "this parameter no need to do quant"; } continue; } auto input_cnode = std::dynamic_pointer_cast(input_node); - auto input_cnode_primitiveT_value = GetValueNode>(input_cnode->input(0)); - if 
(input_cnode_primitiveT_value == nullptr) { + auto input_cnode_primitive_c = GetValueNode>(input_cnode->input(0)); + if (input_cnode_primitive_c == nullptr) { MS_LOG(DEBUG) << "input: " << i << " " << input_cnode->fullname_with_scope() << ": " << " PrimitiveC is null"; continue; } - if (!input_cnode_primitiveT_value->GetOutputQuantParams().empty()) { - for (auto &quant_param : input_cnode_primitiveT_value->GetOutputQuantParams()) { - primitiveT_value->AddInputQuantParam(quant_param); + if (!input_cnode_primitive_c->GetOutputQuantParams().empty()) { + for (auto &quant_param : input_cnode_primitive_c->GetOutputQuantParams()) { + primitive_c->AddInputQuantParam(quant_param); } } else { // do input quant double scale = input_scale[cnode]; int32_t zp = input_zero_point[cnode]; - DoQuantInput(scale, zp, &input_min_max[cnode], primitiveT_value); + DoQuantInput(scale, zp, &input_min_max[cnode], primitive_c); } } } else { // do input quant double scale = input_scale[cnode]; int32_t convInputzeropoint = input_zero_point[cnode]; - DoQuantInput(scale, convInputzeropoint, &input_min_max[cnode], primitiveT_value); + DoQuantInput(scale, convInputzeropoint, &input_min_max[cnode], primitive_c); // do weight quant auto weight = cnode->input(2); bool depthwise = op_type == PrimitiveType_DepthwiseConv2D; @@ -751,18 +728,18 @@ STATUS PostTrainingQuantizer::QuantNode() { if (op_type == PrimitiveType_FullConnection) { perchannel = false; } - DoWeightQuant(weight, primitiveT_value, perchannel, depthwise); + DoWeightQuant(weight, primitive_c, perchannel, depthwise); // do bias quant if (cnode->inputs().size() == 4) { auto bias = cnode->input(3); - DoBiasQuant(bias, primitiveT_value); + DoBiasQuant(bias, primitive_c); } } // do output quant double OutputScale = output_scale[cnode]; int32_t OutputZeropoint = output_zeropoint[cnode]; - DoQuantOutput(OutputScale, OutputZeropoint, &output_min_max[cnode], primitiveT_value); - primitiveT_value->SetQuantType(schema::QuantType_PostTraining); + DoQuantOutput(OutputScale, OutputZeropoint, &output_min_max[cnode], primitive_c); + primitive_c->SetQuantType(schema::QuantType_PostTraining); } return RET_OK; } @@ -803,7 +780,7 @@ STATUS PostTrainingQuantizer::PreProcess() { // from user input QuantStrategy strategy(10); auto cnodes = funcGraph->GetOrderedCnodes(); - for (auto cnode : cnodes) { + for (auto &cnode : cnodes) { AnfNodePtr anf = std::dynamic_pointer_cast(cnode); if (strategy.CanOpPostQuantized(anf)) { MS_LOG(INFO) << "node: " << cnode->fullname_with_scope() << " will be quantized"; @@ -813,16 +790,15 @@ STATUS PostTrainingQuantizer::PreProcess() { return RET_OK; } -STATUS PostTrainingQuantizer::CheckTensorVec(const std::string &nodeName, - const std::vector &tensorVec) const { - if (tensorVec.size() < 1) { - MS_LOG(ERROR) << "node: " << nodeName << " input tensors is 0"; +STATUS PostTrainingQuantizer::CheckTensorVec(const std::string &node_name, + const std::vector &tensor_vec) const { + if (tensor_vec.size() < 1) { + MS_LOG(ERROR) << "node: " << node_name << " input tensors is 0"; return RET_ERROR; } - auto *tensor = tensorVec[0]; + auto *tensor = tensor_vec[0]; if (tensor->data_type() != kNumberTypeFloat32) { - //&& tensor->RefCount() != MSCONST_WEIGHT_REFCOUNT - MS_LOG(DEBUG) << "node: " << nodeName << " will not quantize" + MS_LOG(DEBUG) << "node: " << node_name << " will not quantize" << " tensor data_type: " << tensor->data_type(); return RET_ERROR; } @@ -856,8 +832,8 @@ STATUS PostTrainingQuantizer::DoInference() { } auto tensor = beforeInputs[0]; const float 
*tData = static_cast(tensor->MutableData()); - size_t shapeSize = tensor->ElementsNum(); - vector data(tData, tData + shapeSize); + size_t elem_count = tensor->ElementsNum(); + vector data(tData, tData + elem_count); this->calibrator_->RecordMaxValue(callParam.name_callback_param, data, this->calibrator_->GetInputDivergInfo()); return true; }; @@ -871,8 +847,8 @@ STATUS PostTrainingQuantizer::DoInference() { } auto tensor = afterOutputs[0]; const float *tensor_data = static_cast(tensor->MutableData()); - size_t shape_size = tensor->ElementsNum(); - vector data(tensor_data, tensor_data + shape_size); + size_t elem_count = tensor->ElementsNum(); + vector data(tensor_data, tensor_data + elem_count); this->calibrator_->RecordMaxValue(callParam.name_callback_param, data, this->calibrator_->GetOutputDivergInfo()); return true; }; @@ -910,7 +886,7 @@ STATUS PostTrainingQuantizer::CollectDataFrequency() { const float *tensor_data = static_cast(tensor->MutableData()); size_t shape_size = tensor->ElementsNum(); vector data(tensor_data, tensor_data + shape_size); - this->calibrator_->UpdateDataFrequency(callParam.name_callback_param, data, tensor->shape(), + this->calibrator_->UpdateDataFrequency(callParam.name_callback_param, data, this->calibrator_->GetInputDivergInfo()); return true; }; @@ -926,7 +902,7 @@ STATUS PostTrainingQuantizer::CollectDataFrequency() { const float *tenosr_data = static_cast(tensor->MutableData()); size_t shape_size = tensor->ElementsNum(); vector data(tenosr_data, tenosr_data + shape_size); - this->calibrator_->UpdateDataFrequency(call_param.name_callback_param, data, tensor->shape(), + this->calibrator_->UpdateDataFrequency(call_param.name_callback_param, data, this->calibrator_->GetOutputDivergInfo()); return true; }; @@ -951,7 +927,7 @@ STATUS PostTrainingQuantizer::DoQuantize(FuncGraphPtr funcGraph) { } // anf -- fb - auto meta_graph = Export(funcGraph); + auto meta_graph = Export(funcGraph, true); if (meta_graph == nullptr) { MS_LOG(ERROR) << "Export to meta_graph return nullptr"; return RET_ERROR; diff --git a/mindspore/lite/tools/converter/quantizer/post_training_quantizer.h b/mindspore/lite/tools/converter/quantizer/post_training_quantizer.h index 1c47ee4dff..e8e6be2d6d 100644 --- a/mindspore/lite/tools/converter/quantizer/post_training_quantizer.h +++ b/mindspore/lite/tools/converter/quantizer/post_training_quantizer.h @@ -23,6 +23,7 @@ #include #include #include +#include #include "src/lite_session.h" #include "tools/converter/quantizer/quantizer.h" #include "tools/converter/converter.h" @@ -39,21 +40,15 @@ struct MaxMin { float max; }; -enum ImageFormat { - RGB = 0, - GRAY = 1, - BGR = 2, -}; - const char kMethodMaxMin[] = "MAX_MIN"; const char kMethodKL[] = "KL"; +constexpr int kDefaultBinNumber = 2048; struct ConfigParam { - // ImageFormat imageFormat; std::string image_path; uint32_t batch_count{100}; std::string method_x{kMethodKL}; - uint32_t thread_num; + uint32_t thread_num{1}; }; class PostTrainingQuantizer : public Quantizer { @@ -78,7 +73,8 @@ class PostTrainingQuantizer : public Quantizer { STATUS PreProcess(); - STATUS CheckTensorVec(const std::string &nodeName, const std::vector &tensorVec) const; + STATUS CheckTensorVec(const std::string &node_name, + const std::vector &tensor_vec) const; STATUS DoInference(); @@ -95,17 +91,55 @@ class PostTrainingQuantizer : public Quantizer { STATUS DoQuantInput(double scale, int32_t zeropoint, struct MaxMin *max_min, std::shared_ptr); STATUS DoQuantOutput(double scale, int32_t zeropoint, struct MaxMin *max_min, 
std::shared_ptr); - STATUS DoWeightQuant(AnfNodePtr weight, std::shared_ptr primitiveT_value, bool perchannel, - bool depthwise); + STATUS DoWeightQuant(AnfNodePtr weight, std::shared_ptr primitive_c, bool perchannel, bool depthwise); - STATUS DoBiasQuant(AnfNodePtr bias, std::shared_ptr primitiveT_value); + STATUS DoBiasQuant(AnfNodePtr bias, std::shared_ptr primitive_c); }; -struct DivergInfo; +struct DivergInfo { + std::vector histogram; + CNodePtr cnode; + int bin_num; + float interval = 0; + float max; + float min; + float best_T = 0.0f; + size_t bit_num; + int quant_max = 255; + int quant_min = 0; + std::string method_x = kMethodKL; + + DivergInfo(CNodePtr cnode, int bins, size_t bits, int quant_max, int quant_min, const std::string &method_x) { + this->method_x = method_x; + this->cnode = cnode; + this->bin_num = bins; + this->bit_num = bits; + histogram.resize(bin_num); + max = -FLT_MAX; + min = FLT_MAX; + this->quant_max = quant_max; + this->quant_min = quant_min; + std::fill(histogram.begin(), histogram.end(), 1.0e-7); + } + + STATUS RecordMaxValue(const std::vector &datas); + + void UpdateInterval(); + + STATUS UpdateHistogram(const std::vector &data); + + void DumpHistogram(); + + STATUS ComputeThreshold(); + + std::pair GetScale(); + + std::pair GetZeropoint(); +}; class Calibrator { public: - explicit Calibrator(std::string path, size_t quant_size, int quant_max, int quant_msin); + explicit Calibrator(std::string path, size_t bit_num, int quant_max, int quant_min); ~Calibrator() = default; @@ -123,18 +157,18 @@ class Calibrator { STATUS AddQuantizedOp(CNodePtr node); - STATUS RecordMaxValue(std::string opName, std::vector data, + STATUS RecordMaxValue(const std::string &op_name, const std::vector &data, std::unordered_map> *diverg_info); STATUS UpdateDivergInverval(std::unordered_map> *diverg_info); - STATUS UpdateDataFrequency(std::string op_name, std::vector data, std::vector shape, + STATUS UpdateDataFrequency(const std::string &op_name, const std::vector &data, std::unordered_map> *diverg_info); void Dump(); STATUS ComputeThreshold(); - std::unordered_map GetResult( + std::unordered_map GetScale( std::unordered_map> *diverg_info); std::unordered_map GetZeropoint( diff --git a/mindspore/lite/tools/converter/quantizer/quant_cast.cc b/mindspore/lite/tools/converter/quantizer/quant_cast.cc index 9e0c9d4cfe..c205452c26 100644 --- a/mindspore/lite/tools/converter/quantizer/quant_cast.cc +++ b/mindspore/lite/tools/converter/quantizer/quant_cast.cc @@ -44,17 +44,17 @@ STATUS QuantCast::Run(FuncGraphPtr graph) { bool first = true; for (auto &cnode : cnodes) { - auto primitiveT_value = GetValueNode>(cnode->input(0)); + auto primitive_c = GetValueNode>(cnode->input(0)); auto curnode_quant_type = schema::QuantType_QUANT_NONE; - if (primitiveT_value == nullptr) { - MS_LOG(WARNING) << "PrimitiveT_value is nullptr: " << cnode->fullname_with_scope(); + if (primitive_c == nullptr) { + MS_LOG(WARNING) << "primitive_c is nullptr: " << cnode->fullname_with_scope(); } else { - curnode_quant_type = primitiveT_value->GetQuantType(); + curnode_quant_type = primitive_c->GetQuantType(); } if (first) { if (curnode_quant_type == schema::QuantType_PostTraining && inputDataDType == kNumberTypeFloat32) { auto value_node = - NewQuantCastValueNode(kNumberTypeFloat32, kNumberTypeInt8, primitiveT_value->GetInputQuantParams().front()); + NewQuantCastValueNode(kNumberTypeFloat32, kNumberTypeInt8, primitive_c->GetInputQuantParams().front()); std::vector op_inputs = {value_node, cnode->input(1)}; auto 
quant_cast_cnode = graph->NewCNode(op_inputs); quant_cast_cnode->set_fullname_with_scope(cnode->fullname_with_scope() + "_quant_cast"); @@ -72,24 +72,24 @@ STATUS QuantCast::Run(FuncGraphPtr graph) { continue; } auto input_cnode = std::dynamic_pointer_cast(input_node); - auto input_cnode_primitiveT_value = GetValueNode>(input_cnode->input(0)); - if (input_cnode_primitiveT_value == nullptr) { + auto input_cnode_primitive_c = GetValueNode>(input_cnode->input(0)); + if (input_cnode_primitive_c == nullptr) { MS_LOG(DEBUG) << "input: " << i << " " << input_cnode->fullname_with_scope() << ": " << " PrimitiveC is null"; continue; } - auto input_cnode_quant_type = input_cnode_primitiveT_value->GetQuantType(); + auto input_cnode_quant_type = input_cnode_primitive_c->GetQuantType(); if (curnode_quant_type != input_cnode_quant_type) { ValueNodePtr value_node = nullptr; if (curnode_quant_type == schema::QuantType_PostTraining && input_cnode_quant_type == schema::QuantType_QUANT_NONE) { value_node = NewQuantCastValueNode(kNumberTypeFloat32, kNumberTypeInt8, - primitiveT_value->GetInputQuantParams().front()); + primitive_c->GetInputQuantParams().front()); } else if (curnode_quant_type == schema::QuantType_QUANT_NONE && input_cnode_quant_type == schema::QuantType_PostTraining) { value_node = NewQuantCastValueNode(kNumberTypeInt8, kNumberTypeFloat32, - input_cnode_primitiveT_value->GetInputQuantParams().front()); + input_cnode_primitive_c->GetInputQuantParams().front()); } if (value_node == nullptr) { MS_LOG(WARNING) << "value_node is null! " diff --git a/mindspore/lite/tools/converter/quantizer/quantize_util.cc b/mindspore/lite/tools/converter/quantizer/quantize_util.cc index 95f8a48810..6753973ac7 100644 --- a/mindspore/lite/tools/converter/quantizer/quantize_util.cc +++ b/mindspore/lite/tools/converter/quantizer/quantize_util.cc @@ -87,13 +87,13 @@ bool QuantStrategy::CanOpPostQuantized(AnfNodePtr &node) const { } auto cnode = std::dynamic_pointer_cast(node); - auto primitiveT_value = GetValueNode>(cnode->input(0)); - if (primitiveT_value == nullptr) { - MS_LOG(WARNING) << "PrimitiveT_value is nullptr: " << cnode->fullname_with_scope(); + auto primitive_c = GetValueNode>(cnode->input(0)); + if (primitive_c == nullptr) { + MS_LOG(WARNING) << "primitive_c is nullptr: " << cnode->fullname_with_scope(); return false; } - auto type = primitiveT_value->GetPrimitiveT()->value.type; + auto type = (schema::PrimitiveType)primitive_c->Type(); MS_LOG(INFO) << "Primitive type: " << type; static const std::vector uint8OpList = { schema::PrimitiveType_Nchw2Nhwc, schema::PrimitiveType_Nhwc2Nchw, @@ -168,11 +168,11 @@ STATUS CalQuantizationParams(schema::QuantParamT *quantParam, double mMin, doubl int quant_max, int quant_min, int num_bits) { MS_ASSERT(quantParam != nullptr); if (mMin > 0.0f) { - MS_LOG(ERROR) << "min " << mMin << " is bigger then 0, set to 0, this may course low precision"; + MS_LOG(DEBUG) << "min " << mMin << " is bigger than 0, set to 0, this may cause low precision"; mMin = 0.0f; } if (mMax < 0.0f) { - MS_LOG(ERROR) << "mMax " << mMax << " is smaller than 0, set to 0, this may course low precision"; + MS_LOG(DEBUG) << "mMax " << mMax << " is smaller than 0, set to 0, this may cause low precision"; mMax = 0.0f; } if (mMin > mMax) { @@ -279,7 +279,7 @@ STATUS CalQuantizationParams(schema::QuantParamT *quantParam, double mMin, doubl return RET_OK; }
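
Editor's note, not part of the patch: the MS_LOG guards above sit in front of the standard affine-quantization formulas computed by CalQuantizationParams. A worked example, assuming num_bits = 8, quant_min = 0, quant_max = 255, and a calibrated range mMin = -1.0, mMax = 3.0:

// scale     = (mMax - mMin) / (quant_max - quant_min) = 4.0 / 255 ~ 0.01569
// zeroPoint = round(quant_min - mMin / scale) = round(0 + 63.75) = 64
// a float x is then quantized as clamp(round(x / scale) + zeroPoint, quant_min, quant_max)

-STATUS QuantFilter(ParamValueLitePtr weight,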
std::shared_ptr primitive_c, QuantType quantType, int quant_max, int quant_min, size_t bitNum, bool per_channel, bool depth_wise) { auto dims = weight->tensor_shape(); if (per_channel) { @@ -349,16 +349,12 @@ STATUS QuantFilter(ParamValueLitePtr weight, std::shared_ptr primiti quant_datas[index] = quant_data; } } - auto ret = memcpy_s(const_cast(raw_datas), weight->tensor_size(), quant_datas.data(), + auto ret = memcpy_s(raw_datas, weight->tensor_size(), quant_datas.data(), elem_count * sizeof(int8_t)); if (ret != EOK) { MS_LOG(ERROR) << "memcpy error: " << ret; return RET_ERROR; } - if (quantType == QuantType_WeightQuant) { - PostBitPack(const_cast(raw_datas), elem_count, bitNum); - } - weight->set_tensor_size(elem_count * sizeof(int8_t)); } else { // channel at first @@ -407,9 +403,6 @@ STATUS QuantFilter(ParamValueLitePtr weight, std::shared_ptr primiti MS_LOG(ERROR) << "memcpy error: " << ret; return RET_ERROR; } - if (quantType == QuantType_WeightQuant) { - PostBitPack(const_cast(raw_datas), elem_count, bitNum); - } weight->set_tensor_size(elem_count * sizeof(int8_t)); } @@ -441,16 +434,13 @@ STATUS QuantFilter(ParamValueLitePtr weight, std::shared_ptr primiti MS_LOG(ERROR) << "memcpy error: " << ret; return RET_ERROR; } - if (quantType == QuantType_WeightQuant) { - PostBitPack(raw_datas, elem_count, bitNum); - } weight->set_tensor_size(elem_count * sizeof(int8_t)); } if (quant_params.empty()) { MS_LOG(ERROR) << "quant_params empty"; return RET_ERROR; } - primitiveT_value->AddInputQuantParam(quant_params); + primitive_c->AddInputQuantParam(quant_params); return RET_OK; } diff --git a/mindspore/lite/tools/converter/quantizer/quantize_util.h b/mindspore/lite/tools/converter/quantizer/quantize_util.h index 7a95fc90cc..352f969c10 100644 --- a/mindspore/lite/tools/converter/quantizer/quantize_util.h +++ b/mindspore/lite/tools/converter/quantizer/quantize_util.h @@ -118,7 +118,7 @@ T QuantizeData(float originData, const schema::QuantParamT &quantParam, int quan }(); } -STATUS QuantFilter(ParamValueLitePtr weight, std::shared_ptr primitiveT_value, QuantType quantType, +STATUS QuantFilter(ParamValueLitePtr weight, std::shared_ptr primitive_c, QuantType quantType, int quant_max, int quant_min, size_t bitNum = UINT8_QUANTIZATION, bool per_channel = false, bool depth_wise = false); diff --git a/mindspore/lite/tools/converter/quantizer/quantizer.h b/mindspore/lite/tools/converter/quantizer/quantizer.h index 963a963552..3fe37379b3 100644 --- a/mindspore/lite/tools/converter/quantizer/quantizer.h +++ b/mindspore/lite/tools/converter/quantizer/quantizer.h @@ -51,7 +51,7 @@ class Quantizer { virtual STATUS DetermineNodeQuantType(); - virtual STATUS DoQuantize(FuncGraphPtr funcGraph) = 0; + virtual STATUS DoQuantize(FuncGraphPtr func_graph) = 0; mindspore::lite::converter::Flags flags; protected: diff --git a/mindspore/lite/tools/optimizer/common/gllo_utils.cc b/mindspore/lite/tools/optimizer/common/gllo_utils.cc index c4c1aace5b..62f2613c32 100644 --- a/mindspore/lite/tools/optimizer/common/gllo_utils.cc +++ b/mindspore/lite/tools/optimizer/common/gllo_utils.cc @@ -170,7 +170,7 @@ bool AnfEqual(const BaseRef &a, const BaseRef &b) { if (a.m_ptr->isa() && b.m_ptr->isa()) { auto a_value_node_ptr = a.m_ptr->cast(); auto b_value_node_ptr = b.m_ptr->cast(); - return a_value_node_ptr->GetPrimitiveT()->value.type == b_value_node_ptr->GetPrimitiveT()->value.type; + return a_value_node_ptr->Type() == b_value_node_ptr->Type(); } return a == b; @@ -295,6 +295,9 @@ ParameterPtr AddNewBiasNode(float *bias_data, 
const FuncGraphPtr &func_graph, in MS_ASSERT(param_value != nullptr); param_value->set_tensor_addr(bias_data); param_value->set_tensor_size(kernel_num * sizeof(float) / sizeof(uint8_t)); + param_value->set_format(weight_tensor->format()); + param_value->set_tensor_type(weight_tensor->tensor_type()); + param_value->set_tensor_shape(shape); bias_parameter->set_default_param(param_value); return bias_parameter; } @@ -316,7 +319,7 @@ schema::PrimitiveType GetCNodeType(const BaseRef &n) { if (utils::isa(value)) { auto primitive = value->cast(); MS_ASSERT(primitive != nullptr); - return primitive->GetPrimitiveT()->value.type; + return (schema::PrimitiveType)primitive->Type(); } else if (utils::isa(value)) { auto primitive = value->cast(); MS_ASSERT(primitive != nullptr); @@ -346,6 +349,14 @@ bool IsConvNode(const BaseRef &n) { return false; } +bool IsPoolingNode(const BaseRef &n) { + if (utils::isa(n) || utils::isa(n)) { + auto type = opt::GetCNodeType(n); + return type == schema::PrimitiveType_Pooling; + } + return false; +} + bool CheckIsAllInputsParam(const AnfNodePtr &node) { if (utils::isa(node)) { auto cnode = node->cast(); diff --git a/mindspore/lite/tools/optimizer/common/gllo_utils.h b/mindspore/lite/tools/optimizer/common/gllo_utils.h index 2299779b39..066882caca 100644 --- a/mindspore/lite/tools/optimizer/common/gllo_utils.h +++ b/mindspore/lite/tools/optimizer/common/gllo_utils.h @@ -56,6 +56,8 @@ bool IsParamNode(const BaseRef &n); bool IsConvNode(const BaseRef &n); +bool IsPoolingNode(const BaseRef &n); + bool CheckIsAllInputsParam(const AnfNodePtr &node); size_t GetOutputTensorNum(const AnfNodePtr &node); diff --git a/mindspore/lite/tools/optimizer/fusion/constant_folding_fusion.cc b/mindspore/lite/tools/optimizer/fusion/constant_folding_fusion.cc index 5043df1c92..2ca1d8d4c6 100644 --- a/mindspore/lite/tools/optimizer/fusion/constant_folding_fusion.cc +++ b/mindspore/lite/tools/optimizer/fusion/constant_folding_fusion.cc @@ -13,18 +13,15 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ + #include "tools/optimizer/fusion/constant_folding_fusion.h" #include #include #include -#include -#include "schema/inner/model_generated.h" #include "tools/optimizer/common/gllo_utils.h" #include "tools/anf_exporter/anf_exporter.h" #include "src/kernel_registry.h" -#include "src/scheduler.h" #include "include/context.h" -#include "src/lite_session.h" #include "src/populate_parameter.h" #include "src/ops/primitive_c.h" @@ -33,7 +30,7 @@ using mindspore::lite::PrimitiveC; using mindspore::lite::tensor::Tensor; namespace mindspore::opt { namespace { -const std::vector GetCNodeInputTensors(const CNodePtr &CNode) { +std::vector GetCNodeInputTensors(const CNodePtr &CNode) { MS_ASSERT(CNode != nullptr); auto tmp_meta_graph = std::make_unique(); auto tmp_fb_node = std::make_unique(); @@ -51,11 +48,11 @@ const std::vector GetCNodeInputTensors(const CNodePtr &CNode) { } auto lite_tensor_size = tensorT->data.size() * sizeof(uint8_t); // when tensorT as graph input - if (lite_tensor_size == 0) { + if (lite_tensor_size <= 0) { delete lite_tensor; return input_tensors; } - auto tensor_data = new (std::nothrow) char[lite_tensor_size / sizeof(char)]; + auto tensor_data = reinterpret_cast(malloc(lite_tensor_size / sizeof(char))); if (tensor_data == nullptr) { MS_LOG(ERROR) << "tensor_data is nullptr"; delete lite_tensor; @@ -64,39 +61,18 @@ const std::vector GetCNodeInputTensors(const CNodePtr &CNode) { auto ret = memcpy_s(tensor_data, lite_tensor_size, tensorT->data.data(), lite_tensor_size); if (ret != EOK) { delete lite_tensor; - delete tensor_data; + free(tensor_data); MS_LOG(EXCEPTION) << "memcpy error: " << ret; - return input_tensors; } lite_tensor->SetData(tensor_data); input_tensors.emplace_back(lite_tensor); } return input_tensors; } -schema::Primitive *PackPrimitiveT(const CNodePtr &cnode) { - auto primitiveT_value = GetValueNode>(cnode->input(0)); - if (primitiveT_value == nullptr) { - MS_LOG(ERROR) << "PrimitiveT_value is nullptr"; - return nullptr; - } - auto *lite_primitive = primitiveT_value->GetPrimitiveT(); - if (lite_primitive == nullptr) { - MS_LOG(ERROR) << "Primitive in primitiveT_value is nullptr"; - return nullptr; - } - - flatbuffers::FlatBufferBuilder builder(1024); - auto offset = schema::Primitive::Pack(builder, lite_primitive); - builder.Finish(offset); - auto buf = builder.GetBufferPointer(); - auto primitive = flatbuffers::GetRoot(buf); - return const_cast(primitive); -} -const ParameterPtr CreateNewParamter(const FuncGraphPtr &func_graph, Tensor *tensor) { +ParameterPtr CreateNewParamter(const FuncGraphPtr &func_graph, Tensor *tensor) { auto parameter = func_graph->add_parameter(); - std::vector shape; - std::copy(tensor->shape().begin(), tensor->shape().end(), std::back_inserter(shape)); + std::vector shape(tensor->shape()); auto type_id = static_cast(tensor->data_type()); auto type_ptr = TypeIdToType(type_id); auto abstract_tensor = std::make_shared(type_ptr, shape); @@ -106,6 +82,7 @@ const ParameterPtr CreateNewParamter(const FuncGraphPtr &func_graph, Tensor *ten MS_ASSERT(param_value != nullptr); param_value->set_tensor_shape(shape); param_value->set_tensor_type(type_id); + param_value->set_format(tensor->GetFormat()); if (tensor->Data() != nullptr) { auto size = tensor->ElementsNum(); auto tensor_data = new (std::nothrow) float[size]; @@ -125,17 +102,12 @@ const ParameterPtr CreateNewParamter(const FuncGraphPtr &func_graph, Tensor *ten parameter->set_default_param(param_value); return parameter; } -kernel::LiteKernel *GetLiteKernel(std::vector inputs,
-                                  std::vector outputs,
+kernel::LiteKernel *GetLiteKernel(std::vector inputs, std::vector outputs,
+                                  OpParameter *parameter, mindspore::lite::PrimitiveC *primitive) {
   MS_ASSERT(nullptr != lite_primitive);
   auto data_type = inputs.front()->data_type();
   kernel::KernelKey desc{kernel::KERNEL_ARCH::kCPU, data_type, (schema::PrimitiveType)primitive->Type()};
   lite::Context context;
-  auto parameter = kernel::PopulateParameter(primitive);
-  if (parameter == nullptr) {
-    MS_LOG(ERROR) << "PopulateParameter return nullptr, type: " << (schema::PrimitiveType)primitive->Type();
-    return nullptr;
-  }
   auto creator = lite::KernelRegistry::GetInstance()->GetCreator(desc);
   if (creator != nullptr) {
     auto lite_kernel = creator(inputs, outputs, parameter, &context, desc, primitive);
@@ -144,17 +116,21 @@ kernel::LiteKernel *GetLiteKernel(std::vector inputs, std::vector
-void FreeInputTensor(std::vector *input_tensor) {
-  MS_ASSERT(input_tensor != nullptr);
-  for (size_t i = 0; i < input_tensor->size(); i++) {
-    if ((*input_tensor)[i] == nullptr) {
-      continue;
+void FreeTensors(std::vector *input_tensor, std::vector *output_tensor) {
+  if (input_tensor != nullptr) {
+    for (size_t i = 0; i < input_tensor->size(); i++) {
+      delete (*input_tensor)[i];
+      (*input_tensor)[i] = nullptr;
+    }
+  }
+  if (output_tensor != nullptr) {
+    for (size_t i = 0; i < output_tensor->size(); i++) {
+      delete (*output_tensor)[i];
+      (*output_tensor)[i] = nullptr;
     }
-    delete (*input_tensor)[i];
-    (*input_tensor)[i] = nullptr;
   }
-  return;
 }
+
 const AnfNodePtr ConstFoldPass::Process(const FuncGraphPtr &func_graph, const AnfNodePtr &node,
                                         const EquivPtr &) const {
   CheckIfFuncGraphIsNull(func_graph);
@@ -170,41 +146,56 @@ const AnfNodePtr ConstFoldPass::Process(const FuncGraphPtr &func_graph, const An
   auto input_cnode = input_node->cast();
   auto input_tensors = GetCNodeInputTensors(input_cnode);
   if (input_tensors.empty() || input_tensors.size() != input_cnode->inputs().size() - 1) {
-    FreeInputTensor(&input_tensors);
+    FreeTensors(&input_tensors, nullptr);
     continue;
   }
   MS_LOG(INFO) << "Begin fold node:" << input_node->fullname_with_scope();
   auto output_nums = GetOutputTensorNum(input_cnode);
   std::vector output_tensors{output_nums, new Tensor()};
-  auto scheam_primitive = PackPrimitiveT(input_cnode);
-  auto lite_primitive = mindspore::lite::PrimitiveC::UnPackFromSchemaPrimitive(scheam_primitive);
+  auto lite_primitive = GetValueNode>(input_cnode->input(0));
   if (lite_primitive == nullptr) {
-    MS_LOG(ERROR) << "constant_folding schedule node lite primitive nullptr";
-    FreeInputTensor(&input_tensors);
+    MS_LOG(ERROR) << "lite_primitive is nullptr";
+    FreeTensors(&input_tensors, &output_tensors);
    return nullptr;
   }
+  // here, input_tensor's format needs to be transposed to nhwc according to fmkType,
+  // but for the time being, we only transpose tensors with 0/1/2/3 dimensions.
+  // Others should be added in the future.
+  for (size_t j = 0; j < input_tensors.size(); ++j) {
+    input_tensors[j]->SetFormat(schema::Format_NHWC);
+    if (input_tensors[j]->shape().size() == 4) {
+      MS_LOG(INFO) << "init input_tensor format to nhwc";
+    }
+  }
   lite_primitive->InferShape(input_tensors, output_tensors);
-  auto lite_kernel = GetLiteKernel(input_tensors, output_tensors, lite_primitive);
+  auto parameter = kernel::PopulateParameter(lite_primitive.get());
+  if (parameter == nullptr) {
+    MS_LOG(ERROR) << "PopulateParameter return nullptr, type: "
+                  << schema::EnumNamePrimitiveType((schema::PrimitiveType)(lite_primitive->Type()));
+    FreeTensors(&input_tensors, &output_tensors);
+    return nullptr;
+  }
+  auto lite_kernel = GetLiteKernel(input_tensors, output_tensors, parameter, lite_primitive.get());
   if (lite_kernel == nullptr) {
     MS_LOG(ERROR) << "constant_folding schedule node lite kernel nullptr";
-    FreeInputTensor(&input_tensors);
+    FreeTensors(&input_tensors, &output_tensors);
     return nullptr;
   }
   auto ret = lite_kernel->Run();
   if (0 != ret) {
-    FreeInputTensor(&input_tensors);
+    FreeTensors(&input_tensors, &output_tensors);
     MS_LOG(ERROR) << "run kernel failed, name: " << lite_kernel->name();
     return nullptr;
   }
   auto new_parameter = CreateNewParamter(func_graph, output_tensors.front());
   if (new_parameter == nullptr) {
-    FreeInputTensor(&input_tensors);
+    FreeTensors(&input_tensors, &output_tensors);
     MS_LOG(ERROR) << "CreateNewParamter failed, name: " << lite_kernel->name();
     return nullptr;
   }
   new_parameter->set_name(input_node->fullname_with_scope());
   any_node->set_input(i, new_parameter);
-  FreeInputTensor(&input_tensors);
+  FreeTensors(&input_tensors, &output_tensors);
+  delete (lite_kernel);
 }
 }
 return any_node;
diff --git a/mindspore/lite/tools/optimizer/fusion/constant_folding_fusion.h b/mindspore/lite/tools/optimizer/fusion/constant_folding_fusion.h
index ad6a64ecf6..29fde221bf 100644
--- a/mindspore/lite/tools/optimizer/fusion/constant_folding_fusion.h
+++ b/mindspore/lite/tools/optimizer/fusion/constant_folding_fusion.h
@@ -17,6 +17,10 @@
 #ifndef MINDSPORE_LITE_SRC_PASS_FUSION_CONSTANT_FOLDING_FUSION_H_
 #define MINDSPORE_LITE_SRC_PASS_FUSION_CONSTANT_FOLDING_FUSION_H_
+#include "schema/inner/model_generated.h"
+#include "src/ir/tensor.h"
+#include "src/lite_kernel.h"
+#include "nnacl/op_base.h"
 #include "backend/optimizer/common/optimizer.h"
 namespace mindspore {
@@ -30,4 +34,3 @@ class ConstFoldPass : public PatternProcessPass {
 } // namespace opt
 } // namespace mindspore
 #endif // MINDSPORE_LITE_SRC_PASS_FUSION_CONSTANT_FOLDING_FUSION_H_
-
diff --git a/mindspore/lite/tools/optimizer/fusion/conv_activation_fusion.cc b/mindspore/lite/tools/optimizer/fusion/conv_activation_fusion.cc
index 99ac179907..534b3b863a 100644
--- a/mindspore/lite/tools/optimizer/fusion/conv_activation_fusion.cc
+++ b/mindspore/lite/tools/optimizer/fusion/conv_activation_fusion.cc
@@ -6,7 +6,7 @@
  * You may obtain a copy of the License at
  *
  * http://www.apache.org/licenses/LICENSE-2.0
- *conv_activation_fusion.h
+ *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
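Aside on the constant-folding hunk above: the in-code comment notes that input tensors should eventually be repacked to NHWC according to fmkType, while the pass currently only tags the format on the tensor. For reference, this is roughly what a real NCHW-to-NHWC repack of a 4-D tensor involves. The sketch below is standalone and illustrative only; the flat std::vector buffer and the NchwToNhwc name are assumptions for the example, not the MindSpore Lite tensor API.

    // Illustrative sketch: repack a 4-D tensor from NCHW to NHWC order.
    // Assumes a flat row-major buffer; not the MindSpore Lite tensor API.
    #include <cstdio>
    #include <vector>

    std::vector<float> NchwToNhwc(const std::vector<float> &src, int n, int c, int h, int w) {
      std::vector<float> dst(src.size());
      for (int in = 0; in < n; ++in) {
        for (int ic = 0; ic < c; ++ic) {
          for (int ih = 0; ih < h; ++ih) {
            for (int iw = 0; iw < w; ++iw) {
              // dst[n][h][w][c] = src[n][c][h][w]
              dst[((in * h + ih) * w + iw) * c + ic] = src[((in * c + ic) * h + ih) * w + iw];
            }
          }
        }
      }
      return dst;
    }

    int main() {
      std::vector<float> nchw = {0, 1, 2, 3, 4, 5, 6, 7};  // N=1, C=2, H=2, W=2
      auto nhwc = NchwToNhwc(nchw, 1, 2, 2, 2);
      for (float v : nhwc) printf("%g ", v);  // prints: 0 4 1 5 2 6 3 7
      printf("\n");
      return 0;
    }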
@@ -23,7 +23,6 @@
 #include "schema/inner/model_generated.h"
 #include "tools/optimizer/common/gllo_utils.h"
-
 namespace mindspore::opt {
 namespace {
 constexpr size_t kActivationInputsLength = 2;
@@ -52,34 +51,38 @@ const AnfNodePtr ConvActivationFusion::Process(const FuncGraphPtr &func_graph, c
   auto act_primitivec = utils::cast>(primitivec);
   MS_ASSERT(act_primitivec != nullptr);
   if (act_primitivec->GetType() != activation_type) {
-    return node;
+    return nullptr;
   }
   AnfNodePtr pre_node = act_node->input(1);
   CheckIfAnfNodeIsNull(pre_node);
   if (pre_node != nullptr && pre_node->isa()) {
     if (IsMultiOutputTensors(func_graph, pre_node)) {
-      return node;
+      return nullptr;
     }
     auto conv_node = pre_node->cast();
     auto node_type = GetCNodeType(conv_node);
-    auto primitiveT_value = GetValueNode>(conv_node->input(0));
-    MS_ASSERT(primitiveT_value);
+    auto primitive_c = GetValueNode>(conv_node->input(0));
+    MS_ASSERT(primitive_c);
     if (node_type == schema::PrimitiveType_Conv2D) {
-      MS_ASSERT(utils::isa>(primitiveT_value));
-      auto primc = utils::cast>(primitiveT_value);
+      MS_ASSERT(utils::isa>(primitive_c));
+      auto primc = utils::cast>(primitive_c);
       MS_ASSERT(primc != nullptr);
-      primc->SetActivationType(activation_type);
-      return pre_node;
+      if (primc->GetActivationType() == schema::ActivationType_NO_ACTIVATION) {
+        primc->SetActivationType(activation_type);
+        return pre_node;
+      }
     } else if (node_type == schema::PrimitiveType_DepthwiseConv2D) {
-      MS_ASSERT(utils::isa>(primitiveT_value));
-      auto primc = utils::cast>(primitiveT_value);
+      MS_ASSERT(utils::isa>(primitive_c));
+      auto primc = utils::cast>(primitive_c);
       MS_ASSERT(primc != nullptr);
-      primc->SetActivationType(activation_type);
-      return pre_node;
+      if (primc->GetActivationType() == schema::ActivationType_NO_ACTIVATION) {
+        primc->SetActivationType(activation_type);
+        return pre_node;
+      }
     } else {
-      MS_LOG(EXCEPTION) << "conv activation pass match only conv2d or depthwise_conv2d ";
+      MS_LOG(ERROR) << "conv activation pass match only conv2d or depthwise_conv2d ";
     }
   }
-  return node;
+  return nullptr;
 }
 } // namespace mindspore::opt
diff --git a/mindspore/lite/tools/optimizer/fusion/conv_activation_fusion.h b/mindspore/lite/tools/optimizer/fusion/conv_activation_fusion.h
index 9436240bf1..af7d900bec 100644
--- a/mindspore/lite/tools/optimizer/fusion/conv_activation_fusion.h
+++ b/mindspore/lite/tools/optimizer/fusion/conv_activation_fusion.h
@@ -6,7 +6,7 @@
  * You may obtain a copy of the License at
  *
  * http://www.apache.org/licenses/LICENSE-2.0
- *conv_activation_fusion.h
+ *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
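The conv_activation_fusion change above does two things: the pass now returns nullptr (meaning "no replacement") instead of echoing the matched node back, and it only folds the activation into the convolution when the convolution does not already carry one. A standalone analogue of that guard, with hypothetical names and none of the MindSpore pass machinery:

    // Illustrative analogue of the fusion guard: a second activation must not
    // silently overwrite one that an earlier fusion already attached.
    #include <cstdio>

    enum class ActType { None, Relu, Relu6 };
    struct Conv { ActType act = ActType::None; };

    // Returns true if the activation was absorbed into the conv (the pass
    // would then replace the pair with the conv); false means "decline",
    // which corresponds to the pass returning nullptr.
    bool TryFuseActivation(Conv *conv, ActType act) {
      if (conv->act != ActType::None) {
        return false;
      }
      conv->act = act;
      return true;
    }

    int main() {
      Conv conv;
      printf("%d\n", TryFuseActivation(&conv, ActType::Relu));   // 1: fused
      printf("%d\n", TryFuseActivation(&conv, ActType::Relu6));  // 0: declined
      return 0;
    }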
diff --git a/mindspore/lite/tools/optimizer/fusion/conv_biasadd_fusion.cc b/mindspore/lite/tools/optimizer/fusion/conv_biasadd_fusion.cc
index f746240552..6ab182aef1 100644
--- a/mindspore/lite/tools/optimizer/fusion/conv_biasadd_fusion.cc
+++ b/mindspore/lite/tools/optimizer/fusion/conv_biasadd_fusion.cc
@@ -160,26 +160,27 @@ const AnfNodePtr ConvBiasaddFusion::Process(const FuncGraphPtr &func_graph, cons
   auto conv_node = conv_node_anf->cast();
   CheckIfCNodeIsNull(conv_node);
   GenConvNewBias(func_graph, conv_node, add_node);
-  auto primitiveT_value = GetValueNode>(conv_node->input(0));
-  MS_ASSERT(primitiveT_value != nullptr);
-  auto type = primitiveT_value->Type();
+  auto primitive_c = GetValueNode>(conv_node->input(0));
+  MS_ASSERT(primitive_c != nullptr);
+  auto type = primitive_c->Type();
   if (type == schema::PrimitiveType_Conv2D) {
-    MS_ASSERT(utils::isa>(primitiveT_value));
-    auto primc = utils::cast>(primitiveT_value);
+    MS_ASSERT(utils::isa>(primitive_c));
+    auto primc = utils::cast>(primitive_c);
     MS_ASSERT(primc != nullptr);
     primc->SetHasBias(true);
   } else if (type == schema::PrimitiveType_DepthwiseConv2D) {
-    MS_ASSERT(utils::isa>(primitiveT_value));
-    auto primc = utils::cast>(primitiveT_value);
+    MS_ASSERT(utils::isa>(primitive_c));
+    auto primc = utils::cast>(primitive_c);
     MS_ASSERT(primc != nullptr);
     primc->SetHasBias(true);
   } else if (type == schema::PrimitiveType_DeConv2D) {
-    MS_ASSERT(utils::isa>(primitiveT_value));
-    auto primc = utils::cast>(primitiveT_value);
+    MS_ASSERT(utils::isa>(primitive_c));
+    auto primc = utils::cast>(primitive_c);
     MS_ASSERT(primc != nullptr);
     primc->SetHasBias(true);
   } else {
-    MS_LOG(EXCEPTION) << "Unsupported opType, " << type;
+    MS_LOG(ERROR) << "Unsupported opType, " << type;
+    return nullptr;
   }
   return conv_node;
 }
diff --git a/mindspore/lite/tools/optimizer/fusion/conv_bn_fusion.cc b/mindspore/lite/tools/optimizer/fusion/conv_bn_fusion.cc
index b02eccd3fe..e348691bcf 100644
--- a/mindspore/lite/tools/optimizer/fusion/conv_bn_fusion.cc
+++ b/mindspore/lite/tools/optimizer/fusion/conv_bn_fusion.cc
@@ -22,6 +22,8 @@
 #include "utils/utils.h"
 #include "tools/optimizer/common/gllo_utils.h"
 #include "securec/include/securec.h"
+#include "src/ops/batch_norm.h"
+#include "src/ops/fused_batchnorm.h"
 namespace mindspore::opt {
 namespace {
@@ -94,7 +96,7 @@ const BaseRef ConvBatchNormFusion::DefinePattern() const {
   auto bn_mean_var = std::make_shared(IsParamNode);
   auto bn_variable_var = std::make_shared(IsParamNode);
   auto bn_other_var = std::make_shared();
-  return VectorRef({bn_var, conv_var, bn_mean_var, bn_variable_var, bn_other_var});;
+  return VectorRef({bn_var, conv_var, bn_mean_var, bn_variable_var, bn_other_var});
 }
 // BatchNorm weight Tensor definition:
 // caffe
@@ -106,26 +108,32 @@ const BaseRef ConvBatchNormFusion::DefinePattern() const {
 // estimated_mean --2
 // estimated_variance --3
 const void ConvBatchNormFusion::InitTransParam(const CNodePtr &bn_node, int kernel_num, float *trans_scale,
-                                                float *trans_bias) const {
+                                               float *trans_bias) const {
   MS_ASSERT(bn_node != nullptr);
   AnfNodePtr bn_mean_node = nullptr;
   AnfNodePtr bn_variance_node = nullptr;
   AnfNodePtr bn_scale_node = nullptr;
   AnfNodePtr bn_bias_node = nullptr;
   float eps = 0;
-  auto primitiveT_value = GetValueNode>(bn_node->input(0));
+  auto primitive_c = GetValueNode>(bn_node->input(0));
   if (GetCNodeType(bn_node) == schema::PrimitiveType_BatchNorm) {
     bn_mean_node = bn_node->input(kCaffeBNMeanIndex);
     bn_variance_node = bn_node->input(kCaffeBNVarIndex);
   CheckIfNodeIsParam(bn_mean_node);
   CheckIfNodeIsParam(bn_variance_node);
-    eps = primitiveT_value->GetPrimitiveT()->value.AsBatchNorm()->epsilon;
+    MS_ASSERT(utils::isa>(primitive_c));
+    auto primc = utils::cast>(primitive_c);
+    MS_ASSERT(primc != nullptr);
+    eps = primc->GetEpsilon();
   } else if (GetCNodeType(bn_node) == schema::PrimitiveType_FusedBatchNorm) {
     bn_scale_node = bn_node->input(kTFBNScaleIndex);
     bn_bias_node = bn_node->input(kTFBNBiasIndex);
     bn_mean_node = bn_node->input(kTFBNMeanIndex);
     bn_variance_node = bn_node->input(kTFBNVarIndex);
-    eps = primitiveT_value->GetPrimitiveT()->value.AsFusedBatchNorm()->epsilon;
+    MS_ASSERT(utils::isa>(primitive_c));
+    auto primc = utils::cast>(primitive_c);
+    MS_ASSERT(primc != nullptr);
+    eps = primc->GetEpsilon();
   } else {
     MS_LOG(EXCEPTION) << "not caffe or tf batchnorm op.";
   }
diff --git a/mindspore/lite/tools/optimizer/fusion/conv_transform_fusion.cc b/mindspore/lite/tools/optimizer/fusion/conv_transform_fusion.cc
index 9256a60a63..6ecc60e5e4 100644
--- a/mindspore/lite/tools/optimizer/fusion/conv_transform_fusion.cc
+++ b/mindspore/lite/tools/optimizer/fusion/conv_transform_fusion.cc
@@ -85,43 +85,37 @@ const AnfNodePtr ConvTransformFusion::Process(const FuncGraphPtr &func_graph, co
   auto trans_scale = new (std::nothrow) float[kernel_nums];
   if (trans_scale == nullptr) {
     MS_LOG(ERROR) << "tensor_data is nullptr";
+    delete[] trans_scale;
     return nullptr;
   }
   auto trans_bias = new (std::nothrow) float[kernel_nums];
   if (trans_bias == nullptr) {
     MS_LOG(ERROR) << "tensor_data is nullptr";
-    delete trans_scale;
+    delete[] trans_scale;
     return nullptr;
   }
   GenTransParam(transform_node, kernel_nums, trans_scale, trans_bias);
   GenNewConvTensor(func_graph, conv_node, kernel_nums, trans_scale, trans_bias);
   delete[] trans_bias;
   delete[] trans_scale;
-  auto primitiveT_value = GetValueNode>(conv_node->input(0));
-  MS_ASSERT(primitiveT_value != nullptr);
-  auto type = primitiveT_value->Type();
+  auto primitive_c = GetValueNode>(conv_node->input(0));
+  MS_ASSERT(primitive_c != nullptr);
+  auto type = primitive_c->Type();
   if (type == schema::PrimitiveType_Conv2D) {
-    MS_ASSERT(utils::isa>(primitiveT_value));
-    auto primc = utils::cast>(primitiveT_value);
+    MS_ASSERT(utils::isa>(primitive_c));
+    auto primc = utils::cast>(primitive_c);
     MS_ASSERT(primc != nullptr);
     primc->SetHasBias(true);
   } else if (type == schema::PrimitiveType_DepthwiseConv2D) {
-    MS_ASSERT(utils::isa>(primitiveT_value));
-    auto primc = utils::cast>(primitiveT_value);
+    MS_ASSERT(utils::isa>(primitive_c));
+    auto primc = utils::cast>(primitive_c);
     MS_ASSERT(primc != nullptr);
     primc->SetHasBias(true);
   } else {
-    MS_LOG(EXCEPTION) << "Unsupported opType, " << type;
+    MS_LOG(ERROR) << "Unsupported opType, " << type;
+    return nullptr;
   }
   pre_node->set_abstract(abstr);
-  const auto &prim = GetValueNode>(transform_node->input(0));
-  if (prim != nullptr) {
-    auto *prim_t = prim->GetPrimitiveT();
-    if (prim_t != nullptr) {
-      delete prim_t;
-      prim->SetPrimitiveT(nullptr);
-    }
-  }
   return pre_node;
 }
@@ -187,6 +181,7 @@ const void ConvTransformFusion::GenNewConvTensor(const FuncGraphPtr &func_graph,
   bias_data = new (std::nothrow) float[kernel_num];
   if (bias_data == nullptr) {
     MS_LOG(ERROR) << "tensor_data is nullptr";
+    delete[] bias_data;
     return;
   }
 }
diff --git a/mindspore/lite/tools/optimizer/fusion/conv_tuple_activation_fusion.cc b/mindspore/lite/tools/optimizer/fusion/conv_tuple_activation_fusion.cc
new file mode 100644
index 0000000000..3964193a97
--- /dev/null
+++ b/mindspore/lite/tools/optimizer/fusion/conv_tuple_activation_fusion.cc
@@ -0,0 +1,99 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "tools/optimizer/fusion/conv_tuple_activation_fusion.h"
+#include
+#include "src/ops/primitive_c.h"
+#include "src/ops/conv2d.h"
+#include "src/ops/depthwise_conv2d.h"
+#include "src/ops/activation.h"
+#include "schema/inner/model_generated.h"
+#include "tools/optimizer/common/gllo_utils.h"
+
+namespace mindspore::opt {
+namespace {
+constexpr size_t kActivationInputsLength = 2;
+}
+const BaseRef ConvTupleActivationFusion::DefinePattern() const {
+  auto conv_var = std::make_shared(IsConvNode);
+  auto tuple_index = std::make_shared();
+  auto tuple_prim = new schema::PrimitiveT();
+  tuple_prim->value.type = schema::PrimitiveType_TupleGetItem;
+  auto tuple_value = std::make_shared(tuple_prim);
+  VectorRef tuple_get_item = VectorRef({tuple_value, conv_var, tuple_index});
+
+  auto act_prim = new schema::PrimitiveT();
+  act_prim->value.type = primitive_type;
+  auto act_value = std::make_shared(act_prim);
+
+  return VectorRef({act_value, tuple_get_item});
+}
+
+const AnfNodePtr ConvTupleActivationFusion::Process(const FuncGraphPtr &func_graph, const AnfNodePtr &node,
+                                                    const EquivPtr &) const {
+  MS_LOG(DEBUG) << "conv tuple activation pass process:" << schema::EnumNamesPrimitiveType()[primitive_type];
+  CheckIfFuncGraphIsNull(func_graph);
+
+  CheckIfAnfNodeIsNull(node);
+  auto act_node = node->cast();
+  CheckIfCNodeIsNull(act_node);
+  CheckInputSize(act_node, kActivationInputsLength);
+
+  auto primitivec = GetValueNode>(act_node->input(0));
+  MS_ASSERT(utils::isa>(primitivec));
+  auto act_primitivec = utils::cast>(primitivec);
+  MS_ASSERT(act_primitivec != nullptr);
+  if (act_primitivec->GetType() != activation_type) {
+    return nullptr;
+  }
+  AnfNodePtr tuple_node = act_node->input(1);
+  MS_ASSERT(tuple_node != nullptr);
+  auto tuple_cnode = tuple_node->cast();
+  auto conv_node = tuple_cnode->input(1);
+  CheckIfAnfNodeIsNull(conv_node);
+  if (conv_node != nullptr && conv_node->isa()) {
+    if (IsMultiOutputTensors(func_graph, conv_node)) {
+      return nullptr;
+    }
+    auto conv_cnode = conv_node->cast();
+    auto node_type = GetCNodeType(conv_cnode);
+    auto primitive_c = GetValueNode>(conv_cnode->input(0));
+    MS_ASSERT(primitive_c);
+    if (node_type == schema::PrimitiveType_Conv2D) {
+      MS_ASSERT(utils::isa>(primitive_c));
+      auto primc = utils::cast>(primitive_c);
+      MS_ASSERT(primc != nullptr);
+      if (primc->GetActivationType() == schema::ActivationType_NO_ACTIVATION) {
+        primc->SetActivationType(activation_type);
+        conv_node->set_abstract(act_node->abstract());
+        return conv_node;
+      }
+    } else if (node_type == schema::PrimitiveType_DepthwiseConv2D) {
+      MS_ASSERT(utils::isa>(primitive_c));
+      auto primc = utils::cast>(primitive_c);
+      MS_ASSERT(primc != nullptr);
+      if (primc->GetActivationType() == schema::ActivationType_NO_ACTIVATION) {
+        primc->SetActivationType(activation_type);
+        conv_node->set_abstract(act_node->abstract());
+        return conv_node;
+      }
+    } else {
+      MS_LOG(ERROR) << "conv tuple activation pass match only conv2d or depthwise_conv2d ";
+    }
+  }
+  return nullptr;
+}
+}  // namespace mindspore::opt
diff --git a/mindspore/lite/tools/optimizer/fusion/conv_tuple_activation_fusion.h b/mindspore/lite/tools/optimizer/fusion/conv_tuple_activation_fusion.h
new file mode 100644
index 0000000000..cc9344615a
--- /dev/null
+++ b/mindspore/lite/tools/optimizer/fusion/conv_tuple_activation_fusion.h
@@ -0,0 +1,40 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_LITE_SRC_PASS_FUSION_CONV_TUPLE_ACTIVATION_FUSION_H_
+#define MINDSPORE_LITE_SRC_PASS_FUSION_CONV_TUPLE_ACTIVATION_FUSION_H_
+
+#include
+#include "backend/optimizer/common/optimizer.h"
+#include "schema/inner/model_generated.h"
+
+namespace mindspore {
+namespace opt {
+class ConvTupleActivationFusion : public PatternProcessPass {
+ public:
+  explicit ConvTupleActivationFusion(bool multigraph = true, const std::string &name = "conv_tuple_activation_fusion",
+                                     schema::PrimitiveType primitive = schema::PrimitiveType_LeakyReLU,
+                                     schema::ActivationType activation = schema::ActivationType_LEAKY_RELU)
+      : PatternProcessPass(name, multigraph), primitive_type(primitive), activation_type(activation) {}
+  ~ConvTupleActivationFusion() override = default;
+  const BaseRef DefinePattern() const override;
+  const AnfNodePtr Process(const FuncGraphPtr &, const AnfNodePtr &, const EquivPtr &) const override;
+  schema::PrimitiveType primitive_type;
+  schema::ActivationType activation_type;
+};
+}  // namespace opt
+}  // namespace mindspore
+#endif  // MINDSPORE_LITE_SRC_PASS_FUSION_CONV_TUPLE_ACTIVATION_FUSION_H_
diff --git a/mindspore/lite/tools/optimizer/fusion/pooling_activation_fusion.cc b/mindspore/lite/tools/optimizer/fusion/pooling_activation_fusion.cc
new file mode 100644
index 0000000000..acd3579ea2
--- /dev/null
+++ b/mindspore/lite/tools/optimizer/fusion/pooling_activation_fusion.cc
@@ -0,0 +1,79 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "tools/optimizer/fusion/pooling_activation_fusion.h"
+#include
+#include "src/ops/primitive_c.h"
+#include "src/ops/pooling.h"
+#include "src/ops/activation.h"
+#include "schema/inner/model_generated.h"
+#include "tools/optimizer/common/gllo_utils.h"
+
+namespace mindspore::opt {
+namespace {
+constexpr size_t kActivationInputsLength = 2;
+}
+const BaseRef PoolingActivationFusion::DefinePattern() const {
+  auto pooling_var = std::make_shared(IsPoolingNode);
+  auto prim = new schema::PrimitiveT();
+  if (prim == nullptr) {
+    MS_LOG(ERROR) << "new primitiveT failed";
+    return nullptr;
+  }
+  prim->value.type = primitive_type;
+  auto prim_value = std::make_shared(prim);
+
+  return VectorRef({prim_value, pooling_var});
+}
+
+const AnfNodePtr PoolingActivationFusion::Process(const FuncGraphPtr &func_graph, const AnfNodePtr &node,
+                                                  const EquivPtr &) const {
+  MS_LOG(DEBUG) << "pooling activation pass process:" << schema::EnumNamesPrimitiveType()[primitive_type];
+  CheckIfFuncGraphIsNull(func_graph);
+
+  CheckIfAnfNodeIsNull(node);
+  auto act_node = node->cast();
+  CheckIfCNodeIsNull(act_node);
+  CheckInputSize(act_node, kActivationInputsLength);
+
+  auto primitivec = GetValueNode>(act_node->input(0));
+  MS_ASSERT(utils::isa>(primitivec));
+  auto act_primitivec = utils::cast>(primitivec);
+  MS_ASSERT(act_primitivec != nullptr);
+  if (act_primitivec->GetType() != activation_type) {
+    return node;
+  }
+  AnfNodePtr pre_node = act_node->input(1);
+  CheckIfAnfNodeIsNull(pre_node);
+  if (pre_node != nullptr && pre_node->isa()) {
+    if (IsMultiOutputTensors(func_graph, pre_node)) {
+      return node;
+    }
+    auto pooling_node = pre_node->cast();
+    auto primitive_c = GetValueNode>(pooling_node->input(0));
+    MS_ASSERT(primitive_c);
+
+    MS_ASSERT(utils::isa>(primitive_c));
+    auto primc = utils::cast>(primitive_c);
+    MS_ASSERT(primc != nullptr);
+    if (primc->GetActivationType() == schema::ActivationType_NO_ACTIVATION) {
+      primc->SetActivationType(activation_type);
+      return pre_node;
+    }
+  }
+  return node;
+}
+}  // namespace mindspore::opt
diff --git a/mindspore/lite/tools/optimizer/fusion/pooling_activation_fusion.h b/mindspore/lite/tools/optimizer/fusion/pooling_activation_fusion.h
new file mode 100644
index 0000000000..b01206ea82
--- /dev/null
+++ b/mindspore/lite/tools/optimizer/fusion/pooling_activation_fusion.h
@@ -0,0 +1,40 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_LITE_SRC_PASS_FUSION_POOLING_ACTIVATION_FUSION_H_
+#define MINDSPORE_LITE_SRC_PASS_FUSION_POOLING_ACTIVATION_FUSION_H_
+
+#include
+#include "backend/optimizer/common/optimizer.h"
+#include "schema/inner/model_generated.h"
+
+namespace mindspore {
+namespace opt {
+class PoolingActivationFusion : public PatternProcessPass {
+ public:
+  explicit PoolingActivationFusion(bool multigraph = true, const std::string &name = "pooling_activation_fusion",
+                                   schema::PrimitiveType primitive = schema::PrimitiveType_LeakyReLU,
+                                   schema::ActivationType activation = schema::ActivationType_LEAKY_RELU)
+      : PatternProcessPass(name, multigraph), primitive_type(primitive), activation_type(activation) {}
+  ~PoolingActivationFusion() override = default;
+  const BaseRef DefinePattern() const override;
+  const AnfNodePtr Process(const FuncGraphPtr &, const AnfNodePtr &, const EquivPtr &) const override;
+  schema::PrimitiveType primitive_type;
+  schema::ActivationType activation_type;
+};
+}  // namespace opt
+}  // namespace mindspore
+#endif  // MINDSPORE_LITE_SRC_PASS_FUSION_POOLING_ACTIVATION_FUSION_H_
diff --git a/mindspore/lite/tools/time_profile/CMakeLists.txt b/mindspore/lite/tools/time_profile/CMakeLists.txt
index fce3655d3a..af96c660f8 100644
--- a/mindspore/lite/tools/time_profile/CMakeLists.txt
+++ b/mindspore/lite/tools/time_profile/CMakeLists.txt
@@ -17,5 +17,10 @@ else()
 target_link_libraries(timeprofile mindspore-lite pthread)
 endif()
-install(TARGETS timeprofile
-        RUNTIME DESTINATION ${MAIN_DIR}/time_profile COMPONENT ${COMPONENT_NAME})
\ No newline at end of file
+if (PLATFORM_ARM32 OR PLATFORM_ARM64)
+    install(TARGETS timeprofile
+            RUNTIME DESTINATION ${MAIN_DIR}-${COMPONENT_NAME}/time_profile COMPONENT ${COMPONENT_NAME})
+else()
+    install(TARGETS timeprofile
+            RUNTIME DESTINATION ${MAIN_DIR}-${RUN_X86_COMPONENT_NAME}/time_profile COMPONENT ${RUN_X86_COMPONENT_NAME})
+endif()
diff --git a/mindspore/lite/tools/time_profile/time_profile.cc b/mindspore/lite/tools/time_profile/time_profile.cc
index 09bec1f340..7ed8e5c2b3 100644
--- a/mindspore/lite/tools/time_profile/time_profile.cc
+++ b/mindspore/lite/tools/time_profile/time_profile.cc
@@ -42,6 +42,7 @@ int TimeProfile::GenerateInputData() {
   auto input_data = tensor->MutableData();
   if (input_data == nullptr) {
     MS_LOG(ERROR) << "MallocData for inTensor failed";
+    std::cerr << "MallocData for inTensor failed" << std::endl;
     return RET_ERROR;
   }
   MS_ASSERT(tensor->GetData() != nullptr);
@@ -49,6 +50,7 @@ int TimeProfile::GenerateInputData() {
   auto status = GenerateRandomData(tensor_byte_size, input_data);
   if (status != RET_OK) {
     MS_LOG(ERROR) << "Generate RandomData for inTensor failed " << status;
+    std::cerr << "Generate RandomData for inTensor failed " << status << std::endl;
     return RET_ERROR;
   }
 }
@@ -66,12 +68,14 @@ int TimeProfile::ReadInputFile() {
   size_t size;
   char *bin_buf = ReadFile(_flags->in_data_path_.c_str(), &size);
   if (bin_buf == nullptr) {
-    MS_LOG(ERROR) << "Input data file error, required: ";
+    MS_LOG(ERROR) << "Read input data failed.";
+    std::cerr << "Read input data failed." << std::endl;
     return RET_ERROR;
   }
   auto tensor_data_size = inTensor->Size();
   if (size != tensor_data_size) {
     MS_LOG(ERROR) << "Input binary file size error, required: " << tensor_data_size << " in fact: " << size;
+    std::cerr << "Input binary file size error, required: " << tensor_data_size << " in fact: " << size << std::endl;
     return RET_ERROR;
   }
   auto input_data = inTensor->MutableData();
@@ -85,12 +89,14 @@ int TimeProfile::LoadInput() {
   auto status = GenerateInputData();
   if (status != RET_OK) {
     MS_LOG(ERROR) << "Generate input data error " << status;
+    std::cerr << "Generate input data error " << status << std::endl;
     return RET_ERROR;
   }
 } else {
   auto status = ReadInputFile();
   if (status != RET_OK) {
     MS_LOG(ERROR) << "ReadInputFile error " << status;
+    std::cerr << "ReadInputFile error " << status << std::endl;
     return RET_ERROR;
   }
 }
@@ -102,6 +108,7 @@ int TimeProfile::InitSession() {
   char *graph_buf = ReadFile(_flags->model_path_.c_str(), &size);
   if (graph_buf == nullptr) {
     MS_LOG(ERROR) << "Load graph failed, path " << _flags->model_path_;
+    std::cerr << "Load graph failed, path " << _flags->model_path_ << std::endl;
     return RET_ERROR;
   }
@@ -113,6 +120,7 @@ int TimeProfile::InitSession() {
   session_ = session::LiteSession::CreateSession(ctx);
   if (session_ == nullptr) {
     MS_LOG(ERROR) << "New session failed while running.";
+    std::cerr << "New session failed while running." << std::endl;
     return RET_ERROR;
   }
@@ -179,11 +187,13 @@ int TimeProfile::Init() {
   if (_flags->num_threads_ < 1) {
     MS_LOG(ERROR) << "NumThreads: " << _flags->num_threads_ << " must greater than or equal 1";
+    std::cerr << "NumThreads: " << _flags->num_threads_ << " must greater than or equal 1" << std::endl;
     return RET_ERROR;
   }
   if (_flags->loop_count_ < 1) {
     MS_LOG(ERROR) << "LoopCount: " << _flags->loop_count_ << " must greater than or equal 1";
+    std::cerr << "LoopCount: " << _flags->loop_count_ << " must greater than or equal 1" << std::endl;
     return RET_ERROR;
   }
@@ -200,24 +210,28 @@ int TimeProfile::Init() {
   if (_flags->model_path_.empty()) {
     MS_LOG(ERROR) << "modelPath is required";
+    std::cerr << "modelPath is required" << std::endl;
     return RET_ERROR;
   }
   auto status = InitSession();
   if (status != RET_OK) {
     MS_LOG(ERROR) << "Init session failed.";
+    std::cerr << "Init session failed." << std::endl;
     return RET_ERROR;
   }
   status = this->LoadInput();
   if (status != RET_OK) {
     MS_LOG(ERROR) << "Load input failed.";
+    std::cerr << "Load input failed." << std::endl;
     return RET_ERROR;
   }
   status = InitCallbackParameter();
   if (status != RET_OK) {
     MS_LOG(ERROR) << "Init callback Parameter failed.";
+    std::cerr << "Init callback Parameter failed." << std::endl;
     return RET_ERROR;
   }
@@ -299,6 +313,7 @@ int TimeProfile::RunTimeProfile() {
   char *graphBuf = ReadFile(_flags->model_path_.c_str(), &size);
   if (graphBuf == nullptr) {
     MS_LOG(ERROR) << "Load graph failed while running " << modelName.c_str();
+    std::cerr << "Load graph failed while running " << modelName.c_str() << std::endl;
     delete graphBuf;
     delete session_;
     return RET_ERROR;
@@ -307,6 +322,7 @@ int TimeProfile::RunTimeProfile() {
   delete graphBuf;
   if (model == nullptr) {
     MS_LOG(ERROR) << "Import model file failed while running " << modelName.c_str();
+    std::cerr << "Import model file failed while running " << modelName.c_str() << std::endl;
     delete session_;
     delete model;
     return RET_ERROR;
@@ -314,6 +330,7 @@ int TimeProfile::RunTimeProfile() {
   auto ret = session_->CompileGraph(model);
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "Compile graph failed.";
+    std::cerr << "Compile graph failed." << std::endl;
     delete session_;
     delete model;
     return RET_ERROR;
@@ -324,6 +341,7 @@ int TimeProfile::RunTimeProfile() {
   auto status = LoadInput();
   if (status != RET_OK) {
     MS_LOG(ERROR) << "Generate input data error";
+    std::cerr << "Generate input data error" << std::endl;
     delete session_;
     delete model;
     return status;
@@ -337,11 +355,12 @@ int TimeProfile::RunTimeProfile() {
   ret = session_->RunGraph(before_call_back_, after_call_back_);
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "Run graph failed.";
+    std::cerr << "Run graph failed." << std::endl;
     delete session_;
     delete model;
     return RET_ERROR;
   }
-  auto outputs = session_->GetOutputs();
+  auto outputs = session_->GetOutputMapByNode();
   uint64_t run_end = GetTimeUs();
   uint64_t time = run_end - run_begin;
@@ -384,12 +403,14 @@ int RunTimeProfile(int argc, const char **argv) {
   auto ret = time_profile.Init();
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "Init TimeProfile failed.";
+    std::cerr << "Init TimeProfile failed." << std::endl;
     return RET_ERROR;
   }
   ret = time_profile.RunTimeProfile();
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "Run TimeProfile failed.";
+    std::cerr << "Run TimeProfile failed." << std::endl;
     return RET_ERROR;
   }
diff --git a/model_zoo/official/lite/.gitignore b/model_zoo/official/lite/.gitignore
new file mode 100644
index 0000000000..59c69550a3
--- /dev/null
+++ b/model_zoo/official/lite/.gitignore
@@ -0,0 +1,81 @@
+# MindSpore
+build/
+mindspore/lib
+output
+*.ir
+mindspore/ccsrc/schema/inner/*
+
+# Cmake files
+CMakeFiles/
+cmake_install.cmake
+CMakeCache.txt
+Makefile
+cmake-build-debug
+
+# Dynamic libraries
+*.so
+*.so.*
+*.dylib
+
+# Static libraries
+*.la
+*.lai
+*.a
+*.lib
+
+# Protocol buffers
+*_pb2.py
+*.pb.h
+*.pb.cc
+
+# Object files
+*.o
+
+# Editor
+.vscode
+.idea/
+
+# Cquery
+.cquery_cached_index/
+compile_commands.json
+
+# Ctags and cscope
+tags
+TAGS
+CTAGS
+GTAGS
+GRTAGS
+GSYMS
+GPATH
+cscope.*
+
+# Python files
+*__pycache__*
+.pytest_cache
+
+# Mac files
+*.DS_Store
+
+# Test results
+test_temp_summary_event_file/
+*.dot
+*.dat
+*.svg
+*.perf
+*.info
+*.ckpt
+*.shp
+*.pkl
+.clangd
+mindspore/version.py
+mindspore/default_config.py
+mindspore/.commit_id
+onnx.proto
+mindspore/ccsrc/onnx.proto
+
+# Android
+local.properties
+.gradle
+sdk/build
+sdk/.cxx
+app/.cxx
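The time_profile changes above follow one convention throughout: every MS_LOG(ERROR) is mirrored by a std::cerr statement so failures stay visible on the console even when the log sink goes elsewhere. A minimal standalone sketch of that dual-channel pattern; LogError and REPORT_ERROR are illustrative stand-ins, not MindSpore APIs (the patch itself simply writes both statements inline):

    // Illustrative sketch: report every error to the logger and to stderr.
    #include <iostream>
    #include <sstream>
    #include <string>

    void LogError(const std::string &msg) {
      // Stand-in for MS_LOG(ERROR); in the real tool this goes to the log sink.
      (void)msg;
    }

    #define REPORT_ERROR(streamed)              \
      do {                                      \
        std::ostringstream oss_;                \
        oss_ << streamed;                       \
        LogError(oss_.str());                   \
        std::cerr << oss_.str() << std::endl;   \
      } while (0)

    int main() {
      int loop_count = 0;
      if (loop_count < 1) {
        REPORT_ERROR("LoopCount: " << loop_count << " must be greater than or equal to 1");
        return 1;
      }
      return 0;
    }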