parent fa4c19f938
commit 4faf97f6bd
@@ -0,0 +1,36 @@
/*
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "wrapper/base/common_wrapper.h"
#ifdef __ANDROID__
#include <sys/auxv.h>
#include <asm/hwcap.h>
#endif

bool GetSupportOptFlag() {
  bool status = false;
#ifdef ENABLE_ARM64
  // AT_HWCAP (16): query the kernel's hardware-capability bitmask via getauxval.
  int hwcap_type = 16;
  uint32_t hwcap = getauxval(hwcap_type);
  // HWCAP_ASIMDDP is set when the CPU implements the ARMv8.2 dot-product (SDOT/UDOT) instructions.
  if (hwcap & HWCAP_ASIMDDP) {
    status = true;
  } else {
    status = false;
  }
#endif
  return status;
}
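A minimal sketch (not part of the commit) of how GetSupportOptFlag() might be queried by a caller; it assumes the header added above is on the include path, and the example file itself is hypothetical:

// check_dotprod.c -- hypothetical caller, for illustration only
#include <stdio.h>
#include "wrapper/base/common_wrapper.h"

int main(void) {
  // On builds without ENABLE_ARM64 the function always reports false.
  if (GetSupportOptFlag()) {
    printf("ARMv8.2 dot-product (SDOT/UDOT) instructions available: use the optimized int8 kernels\n");
  } else {
    printf("dot-product instructions not reported: fall back to the generic int8 kernels\n");
  }
  return 0;
}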
@@ -0,0 +1,24 @@
/*
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef MINDSPORE_LITE_MICRO_CODER_OPERATOR_LIBRARY_COMMON_WRAPPER_H_
#define MINDSPORE_LITE_MICRO_CODER_OPERATOR_LIBRARY_COMMON_WRAPPER_H_

#include "nnacl/op_base.h"

bool GetSupportOptFlag();

#endif  // MINDSPORE_LITE_MICRO_CODER_OPERATOR_LIBRARY_COMMON_WRAPPER_H_
@@ -0,0 +1,49 @@
/*
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "wrapper/base/optimize_handler_wrapper.h"

extern void MatMulOptR4Int8Neon64(const int8_t *a, const int8_t *b, int *dst, int row4, int col4, int deep16,
                                  const int *input_sum, const int *bias);
extern void MatmulInt8DpNeon64(const int8_t *a, const int8_t *b, int8_t *dst, int row8, int col8, int deep4,
                               const int *a_sums, const int *bias, int act_min, int act_max, int out_zp,
                               int *multiplier, int *left_shift, int *right_shift, int row, int col, int stride,
                               size_t peroc);
extern void MatmulInt8DpOpt(const int8_t *a, const int8_t *b, int8_t *dst, size_t row8, size_t col8, size_t deep4,
                            const int *a_sums, const int *bias, int act_min, int act_max, int out_zp, int *multiplier,
                            int *left_shift, int *right_shift, size_t stride, size_t peroc, int *filter_zp);

#ifdef ENABLE_ARM64
void MatMulR4Int8_optimize_handler(const int8_t *a, const int8_t *b, int *dst, int row4, int col4, int deep16,
                                   const int *input_sum, const int *bias) {
  return MatMulOptR4Int8Neon64(a, b, dst, row4, col4, deep16, input_sum, bias);
}

void MatMulRInt8_optimize_handler(const int8_t *a, const int8_t *b, int8_t *dst, size_t row, size_t col, size_t deep_4,
                                  size_t stride, const int32_t *input_sum, const int32_t *bias, int32_t *left_shift,
                                  int32_t *right_shift, int32_t *multiplier, int32_t output_zp, int32_t mini,
                                  int32_t maxi, size_t per_channel) {
  return MatmulInt8DpNeon64(a, b, dst, UP_ROUND(row, C8NUM), UP_ROUND(col, C8NUM), deep_4, input_sum, bias, mini, maxi,
                            output_zp, multiplier, left_shift, right_shift, row, col, stride, per_channel);
}

void MatMulDpInt8_optimize_handler(const int8_t *a, const int8_t *b, int8_t *dst, size_t row, size_t col, size_t deep_4,
                                   size_t stride, const int32_t *input_sum, const int32_t *bias, int32_t *left_shift,
                                   int32_t *right_shift, int32_t *multiplier, int32_t output_zp, int32_t mini,
                                   int32_t maxi, size_t per_channel, int32_t *filter_zp) {
  return MatmulInt8DpOpt(a, b, dst, row, col, deep_4, input_sum, bias, mini, maxi, output_zp, multiplier, left_shift,
                         right_shift, stride, per_channel, filter_zp);
}
#endif
@@ -0,0 +1,41 @@
/*
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef MINDSPORE_LITE_MICRO_CODER_OPERATOR_LIBRARY_OPTIMIZE_HANDLER_WRAPPER_H_
#define MINDSPORE_LITE_MICRO_CODER_OPERATOR_LIBRARY_OPTIMIZE_HANDLER_WRAPPER_H_

#include "nnacl/op_base.h"

#ifdef ENABLE_ARM64
void IndirectGemmInt8_optimize_handler(int8_t *dst, const int8_t *src, const int8_t *weight, const int32_t *bias,
                                       size_t ksize, size_t ic4, size_t output_channel, size_t offset,
                                       const int32_t *input_sum, size_t act_min, size_t act_max, size_t out_zp,
                                       int32_t *out_multiplier, int32_t *shift_before, int32_t *shift_after,
                                       size_t asymmetric, size_t per_channel, size_t per_channel_offset);
void MatMulR4Int8_optimize_handler(const int8_t *a, const int8_t *b, int *dst, int row4, int col4, int deep16,
                                   const int *input_sum, const int *bias);

void MatMulRInt8_optimize_handler(const int8_t *a, const int8_t *b, int8_t *dst, size_t row, size_t col, size_t deep_4,
                                  size_t stride, const int32_t *input_sum, const int32_t *bias, int32_t *left_shift,
                                  int32_t *right_shift, int32_t *multiplier, int32_t output_zp, int32_t mini,
                                  int32_t maxi, size_t per_channel);
void MatMulDpInt8_optimize_handler(const int8_t *a, const int8_t *b, int8_t *dst, size_t row, size_t col, size_t deep_4,
                                   size_t stride, const int32_t *input_sum, const int32_t *bias, int32_t *left_shift,
                                   int32_t *right_shift, int32_t *multiplier, int32_t output_zp, int32_t mini,
                                   int32_t maxi, size_t per_channel, int32_t *filter_zp);
#endif

#endif  // MINDSPORE_LITE_MICRO_CODER_OPERATOR_LIBRARY_OPTIMIZE_HANDLER_WRAPPER_H_
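A minimal sketch (not part of the commit) of how the two pieces could be combined: pick the SDOT-backed handler only when GetSupportOptFlag() reports support at runtime, otherwise fall back to a generic kernel. GenericMatMulR4Int8 is a hypothetical placeholder for whatever non-dot-product kernel the caller already has; only the wrapper functions declared above are taken from the commit.

#include <stdint.h>
#include "wrapper/base/common_wrapper.h"
#include "wrapper/base/optimize_handler_wrapper.h"

typedef void (*MatMulR4Int8Fn)(const int8_t *a, const int8_t *b, int *dst, int row4, int col4, int deep16,
                               const int *input_sum, const int *bias);

// Hypothetical fallback kernel with the same signature, provided elsewhere by the caller.
extern void GenericMatMulR4Int8(const int8_t *a, const int8_t *b, int *dst, int row4, int col4, int deep16,
                                const int *input_sum, const int *bias);

static MatMulR4Int8Fn SelectMatMulR4Int8(void) {
#ifdef ENABLE_ARM64
  // The optimized handler wraps an SDOT/UDOT assembly kernel, so gate it on the runtime HWCAP check.
  if (GetSupportOptFlag()) {
    return MatMulR4Int8_optimize_handler;
  }
#endif
  return GenericMatMulR4Int8;
}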