diff --git a/cmake/package_lite.cmake b/cmake/package_lite.cmake index ecdd2b1668..0a06c0d4a3 100644 --- a/cmake/package_lite.cmake +++ b/cmake/package_lite.cmake @@ -136,6 +136,8 @@ if(PLATFORM_ARM64) COMPONENT ${RUNTIME_COMPONENT_NAME}) install(DIRECTORY ${TOP_DIR}/include/api/ DESTINATION ${RUNTIME_INC_DIR}/api COMPONENT ${RUNTIME_COMPONENT_NAME} FILES_MATCHING PATTERN "*.h" PATTERN "ascend* ops*" EXCLUDE) + install(DIRECTORY ${TOP_DIR}/mindspore/lite/build/operator_library DESTINATION ${CODEGEN_PKG_NAME} + COMPONENT ${CODEGEN_COMPONENT_NAME}) if(ENABLE_TOOLS) install(TARGETS benchmark RUNTIME DESTINATION ${RUNTIME_PKG_NAME}/benchmark COMPONENT ${RUNTIME_COMPONENT_NAME}) endif() @@ -157,6 +159,8 @@ elseif(PLATFORM_ARM32) COMPONENT ${RUNTIME_COMPONENT_NAME}) install(DIRECTORY ${TOP_DIR}/include/api/ DESTINATION ${RUNTIME_INC_DIR}/api COMPONENT ${RUNTIME_COMPONENT_NAME} FILES_MATCHING PATTERN "*.h" PATTERN "ascend*" EXCLUDE) + install(DIRECTORY ${TOP_DIR}/mindspore/lite/build/operator_library DESTINATION ${CODEGEN_PKG_NAME} + COMPONENT ${CODEGEN_COMPONENT_NAME}) if(ENABLE_TOOLS) install(TARGETS benchmark RUNTIME DESTINATION ${RUNTIME_PKG_NAME}/benchmark COMPONENT ${RUNTIME_COMPONENT_NAME}) endif() @@ -231,6 +235,8 @@ else() install(FILES ${glog_LIBPATH}/libglog.so.0.4.0 DESTINATION ${CONVERTER_PKG_NAME}/third_party/glog/lib RENAME libglog.so.0 COMPONENT ${CONVERTER_COMPONENT_NAME}) + install(DIRECTORY ${TOP_DIR}/mindspore/lite/build/operator_library DESTINATION ${CODEGEN_PKG_NAME} + COMPONENT ${CODEGEN_COMPONENT_NAME}) install(TARGETS codegen RUNTIME DESTINATION ${CODEGEN_PKG_NAME}/ COMPONENT ${CODEGEN_COMPONENT_NAME}) endif() @@ -249,7 +255,7 @@ else() endif() set(CPACK_ARCHIVE_COMPONENT_INSTALL ON) if(PLATFORM_ARM64 OR PLATFORM_ARM32) - set(CPACK_COMPONENTS_ALL ${RUNTIME_COMPONENT_NAME}) + set(CPACK_COMPONENTS_ALL ${RUNTIME_COMPONENT_NAME} ${CODEGEN_COMPONENT_NAME}) else() set(CPACK_COMPONENTS_ALL ${RUNTIME_COMPONENT_NAME} ${CONVERTER_COMPONENT_NAME} 
${CODEGEN_COMPONENT_NAME}) endif() diff --git a/mindspore/core/utils/log_adapter.h b/mindspore/core/utils/log_adapter.h index 63329fda0f..a70764073e 100644 --- a/mindspore/core/utils/log_adapter.h +++ b/mindspore/core/utils/log_adapter.h @@ -34,7 +34,7 @@ #define LOG_HDR_FILE_REL_PATH "mindspore/core/utils/log_adapter.h" // Get start index of file relative path in __FILE__ -static constexpr int GetRelPathPos() noexcept { +static constexpr size_t GetRelPathPos() noexcept { return sizeof(__FILE__) > sizeof(LOG_HDR_FILE_REL_PATH) ? sizeof(__FILE__) - sizeof(LOG_HDR_FILE_REL_PATH) : 0; } diff --git a/mindspore/lite/CMakeLists.txt b/mindspore/lite/CMakeLists.txt index 65128f7fd3..8180cdd2f3 100644 --- a/mindspore/lite/CMakeLists.txt +++ b/mindspore/lite/CMakeLists.txt @@ -89,8 +89,10 @@ if(SUPPORT_TRAIN) else() if(PLATFORM_ARM64) set(RUNTIME_COMPONENT_NAME inference-android-aarch64) + set(CODEGEN_COMPONENT_NAME codegen-android-aarch64) elseif(PLATFORM_ARM32) set(RUNTIME_COMPONENT_NAME inference-android-aarch32) + set(CODEGEN_COMPONENT_NAME codegen-android-aarch32) elseif(WIN32) if("${X86_64_SIMD}" STREQUAL "off") set(RUNTIME_COMPONENT_NAME inference-win-x64) @@ -218,7 +220,6 @@ if(ENABLE_CONVERTER) include(${TOP_DIR}/cmake/external_libs/eigen.cmake) include(${TOP_DIR}/cmake/external_libs/protobuf.cmake) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/tools/converter) - add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/micro/coder) endif() if(ENABLE_MINDRT) @@ -272,6 +273,7 @@ endif() add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/src) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/nnacl) +add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/micro/coder) if(ENABLE_TOOLS) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/tools/benchmark) if(SUPPORT_TRAIN) diff --git a/mindspore/lite/micro/cmake/file_list.cmake b/mindspore/lite/micro/cmake/file_list.cmake index e72274f2a5..70d2818dee 100644 --- a/mindspore/lite/micro/cmake/file_list.cmake +++ b/mindspore/lite/micro/cmake/file_list.cmake @@ -301,6 
+301,30 @@ set(LITE_KERNEL_SRC ${LITE_DIR}/nnacl/infer/splice_infer.c ) +#### sse +if("${X86_64_SIMD}" STREQUAL "sse") + set(SSE_SRC + ${LITE_DIR}/nnacl/intrinsics/sse/sse_common.c + ${LITE_DIR}/nnacl/intrinsics/sse/PackNHWCToNCHWFp32.c + ${LITE_DIR}/nnacl/intrinsics/sse/MatMul_Sse.c + ) + set_property(SOURCE ${SSE_SRC} PROPERTY LANGUAGE C) +endif() + +#### avx +if("${X86_64_SIMD}" STREQUAL "avx") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse4.1 -mavx -mavx2") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -msse4.1 -mavx -mavx2") + set(AVX_SRC + ${LITE_DIR}/nnacl/intrinsics/avx/common_utils.c + ${LITE_DIR}/nnacl/intrinsics/sse/sse_common.c + ${LITE_DIR}/nnacl/intrinsics/sse/MatMul_Sse.c + ${LITE_DIR}/nnacl/intrinsics/sse/PackNHWCToNCHWFp32.c + ${LITE_DIR}/nnacl/assembly/avx/MatmulAvx.S + ) + set_property(SOURCE ${AVX_SRC} PROPERTY LANGUAGE C) +endif() + list(APPEND FILE_SET ${CODER_SRC} ${CODER_OPCODERS_SRC} ${CODER_GENERATOR_SRC} - ${CODER_ALLOCATOR_SRC} ${LITE_SRC} ${LITE_KERNEL_SRC} ${MINDSPORE_CORE}) + ${CODER_ALLOCATOR_SRC} ${LITE_SRC} ${LITE_KERNEL_SRC} ${MINDSPORE_CORE} ${SSE_SRC} ${AVX_SRC}) diff --git a/mindspore/lite/micro/coder/CMakeLists.txt b/mindspore/lite/micro/coder/CMakeLists.txt index 97a05fe2e1..70ef0b23b6 100644 --- a/mindspore/lite/micro/coder/CMakeLists.txt +++ b/mindspore/lite/micro/coder/CMakeLists.txt @@ -25,10 +25,12 @@ include(${MICRO_DIR}/cmake/file_list.cmake) include(${MICRO_DIR}/cmake/package_wrapper.cmake) add_subdirectory(operator_library) -add_executable(codegen main.cc ${FILE_SET}) -add_dependencies(codegen fbs_src) -add_dependencies(codegen fbs_inner_src) -target_link_libraries(codegen PRIVATE ${SECUREC_LIBRARY} mindspore::glog) -if(NOT WIN32 AND "${CMAKE_BUILD_TYPE}" STREQUAL "Release") - add_custom_command(TARGET codegen POST_BUILD COMMAND strip ${CODEGEN_PATH}) +if(NOT PLATFORM_ARM32 AND NOT PLATFORM_ARM64) + add_executable(codegen main.cc ${FILE_SET}) + add_dependencies(codegen fbs_src) + add_dependencies(codegen fbs_inner_src) + 
target_link_libraries(codegen PRIVATE ${SECUREC_LIBRARY} mindspore::glog) + if(NOT WIN32 AND "${CMAKE_BUILD_TYPE}" STREQUAL "Release") + add_custom_command(TARGET codegen POST_BUILD COMMAND strip ${CODEGEN_PATH}) + endif() endif() diff --git a/mindspore/lite/micro/coder/allocator/allocator.h b/mindspore/lite/micro/coder/allocator/allocator.h index 660d48e121..0ca0c859c6 100644 --- a/mindspore/lite/micro/coder/allocator/allocator.h +++ b/mindspore/lite/micro/coder/allocator/allocator.h @@ -92,19 +92,17 @@ class MemoryAllocator { * including tensor, workspace */ template - std::string GetRuntimeAddr(T t, bool is_const = false) { + std::string GetRuntimeAddr(T t, bool immutable = false) { if (!t) { return ""; } - std::string type_info = is_const ? "const " : ""; std::string type_name; if (std::type_index(typeid(T)) == std::type_index(typeid(Tensor *))) { type_name = GetTensorDataType(reinterpret_cast(t)->data_type()) + "*"; } else { type_name = GetVariableTypeName(); } - type_info = wrap(type_info + type_name); - + std::string type_info = wrap(type_name); void *variable = reinterpret_cast(t); auto item = inputs_addr_.find(variable); if (item != inputs_addr_.end()) { @@ -133,6 +131,9 @@ class MemoryAllocator { [&variable](const std::pair &a) { return variable == a.first; }); if (iter != origin_weights_addr_.end()) { saved_weights_addr_.insert(std::make_pair(iter->second, reinterpret_cast(variable))); + if (immutable) { + malloc_weights_addr_.insert({reinterpret_cast(variable), iter->second}); + } return iter->second; } MS_LOG(ERROR) << "uninitialized memory"; diff --git a/mindspore/lite/micro/coder/generator/component/benchmark_component.cc b/mindspore/lite/micro/coder/generator/component/benchmark_component.cc index 1b24efc4ed..3c6e482862 100644 --- a/mindspore/lite/micro/coder/generator/component/benchmark_component.cc +++ b/mindspore/lite/micro/coder/generator/component/benchmark_component.cc @@ -134,7 +134,7 @@ void CodeBenchmarkInference(std::ofstream &ofs, const 
std::string &module_name) << " uint64_t timeAvg = 0;\n" << " int loop_count = atoi(argv[3]);\n" << " printf(\"======Inference Start======\\n\");\n" - << " printf(\"cycles: %d\", loop_count);\n" + << " printf(\"cycles: %d\\n\", loop_count);\n" << " for (int i = 0; i < loop_count; i++) {\n" << " uint64_t runBegin = GetTimeUs();\n" << " " << module_name << "_Inference();\n" diff --git a/mindspore/lite/micro/coder/generator/component/cmake_component.cc b/mindspore/lite/micro/coder/generator/component/cmake_component.cc index 9096f3872e..a115fe425b 100644 --- a/mindspore/lite/micro/coder/generator/component/cmake_component.cc +++ b/mindspore/lite/micro/coder/generator/component/cmake_component.cc @@ -48,7 +48,7 @@ void CodeCMakeNetLibrary(std::ofstream &ofs, const std::string &module_name, con } ofs << "file(GLOB NET_SRC ${CMAKE_CURRENT_SOURCE_DIR}/*.c)\n" - << "add_library(${PROJ_NAME} STATIC ${NET_SRC})\n"; + << "add_library(net STATIC ${NET_SRC})\n"; } } // namespace mindspore::lite::micro diff --git a/mindspore/lite/micro/coder/generator/component/const_blocks/cmake_lists.h b/mindspore/lite/micro/coder/generator/component/const_blocks/cmake_lists.h index 3e0166c6f1..d9a07fffdc 100644 --- a/mindspore/lite/micro/coder/generator/component/const_blocks/cmake_lists.h +++ b/mindspore/lite/micro/coder/generator/component/const_blocks/cmake_lists.h @@ -19,9 +19,8 @@ const char *bench_cmake_lists_txt = "cmake_minimum_required(VERSION 3.14)\n" - "project(${PROJ_NAME})\n" + "project(benchmark)\n" "\n" - "message(\"project name: ${PROJ_NAME}\")\n" "message(\"project name: ${MODEL_LIB_PATH}\")\n" "message(\"architecture cmake file path: ${ARCH_CMAKE_PATH}\")\n" "\n" @@ -54,14 +53,13 @@ const char *bench_cmake_lists_txt = "endif ()\n" "link_directories(${MODEL_LIB_PATH})\n" "include(benchmark.cmake)\n" - "add_executable(${PROJ_NAME}_bench ${SRC_FILES})\n" - "target_link_libraries(${PROJ_NAME}_bench ${MODEL_LIB_NAME} -lm -pthread)\n"; + "add_executable(benchmark ${SRC_FILES})\n" + 
"target_link_libraries(benchmark ${MODEL_LIB_NAME} -lm -pthread)\n"; const char *src_cmake_lists_txt = "cmake_minimum_required(VERSION 3.14)\n" - "project(${PROJ_NAME})\n" + "project(net)\n" "\n" - "message(\"project name: ${PROJ_NAME}\")\n" "message(\"architecture cmake file path: ${ARCH_CMAKE_PATH}\")\n" "message(\"operator lib path: ${OP_LIB}\")\n" "message(\"operator header path: ${OP_HEADER_PATH}\")\n" @@ -83,10 +81,11 @@ const char *src_cmake_lists_txt = "else()\n" " set(CMAKE_C_FLAGS \"-fPIC -fPIE -O3 -Werror -fstack-protector-strong -fomit-frame-pointer ${CMAKE_C_FLAGS}\")\n" " set(CMAKE_C_FLAGS_Release \"${CMAKE_C_FLAGS_Release} -O3 -ffunction-sections -Werror -fdata-sections\")\n" + " string(REPLACE \"-g\" \"\" CMAKE_C_FLAGS \"${CMAKE_C_FLAGS}\")\n" "endif()\n" "\n" "function(create_library)\n" - " add_custom_command(TARGET ${PROJ_NAME}\n" + " add_custom_command(TARGET net\n" " POST_BUILD\n" " COMMAND rm -rf tmp\n" " COMMAND mkdir tmp\n" @@ -97,9 +96,9 @@ const char *src_cmake_lists_txt = " COMMENT \"unzip raw static library ${library_name}\"\n" " )\n" " foreach (object_file ${OP_SRC})\n" - " add_custom_command(TARGET ${PROJ_NAME} POST_BUILD COMMAND mv ./tmp/${object_file} .)\n" + " add_custom_command(TARGET net POST_BUILD COMMAND mv ./tmp/${object_file} .)\n" " endforeach ()\n" - " add_custom_command(TARGET ${PROJ_NAME}\n" + " add_custom_command(TARGET net\n" " POST_BUILD\n" " COMMAND ar cr ${library_name} *.o\n" " COMMAND ranlib ${library_name}\n" @@ -109,7 +108,7 @@ const char *src_cmake_lists_txt = " COMMENT \"generate specified static library ${library_name}\"\n" " )\n" "endfunction(create_library)\n" - "string(CONCAT library_name \"lib\" ${PROJ_NAME} \".a\")\n" + "string(CONCAT library_name \"lib\" net \".a\")\n" "create_library()\n"; #endif // MINDSPORE_LITE_MICRO_CODER_GENERATOR_CONST_BLOCKS_CMAKE_LISTS_CODE_H_ diff --git a/mindspore/lite/micro/coder/generator/component/parallel_component.cc 
b/mindspore/lite/micro/coder/generator/component/parallel_component.cc index a4083498ab..f01322f61a 100644 --- a/mindspore/lite/micro/coder/generator/component/parallel_component.cc +++ b/mindspore/lite/micro/coder/generator/component/parallel_component.cc @@ -36,7 +36,7 @@ void CodeCreateThreadPool(std::ofstream &ofs, const std::string &module_name) { " MICRO_ERROR(\"set global thread pool failed\");\n" " return RET_ERROR;\n" " }\n" - " MICRO_INFO(\"config: ThreadNum: %d, BindMode: %d\", thread_num, bind_mode);\n"; + " printf(\"config: ThreadNum: %d, BindMode: %d\\n\", thread_num, bind_mode);\n"; } void CodeDestroyThreadPool(std::ofstream &ofs) { ofs << " DestroyThreadPool(thread_pool);\n"; } diff --git a/mindspore/lite/micro/coder/generator/component/weight_component.cc b/mindspore/lite/micro/coder/generator/component/weight_component.cc index 28d2929c5b..3a8406a93c 100644 --- a/mindspore/lite/micro/coder/generator/component/weight_component.cc +++ b/mindspore/lite/micro/coder/generator/component/weight_component.cc @@ -17,9 +17,9 @@ #include "coder/generator/component/weight_component.h" #include #include -#include #include "coder/generator/component/const_blocks/license.h" #include "coder/utils/coder_utils.h" +#include "coder/opcoders/parallel.h" namespace mindspore::lite::micro { void CodeWeightFileHeader(std::ofstream &ofs, const std::unique_ptr &ctx) { @@ -89,7 +89,7 @@ void CodeWeightInitFunc(std::ofstream &ofs, const std::string &module_name, cons << " if (weight_buffer == NULL) {\n" << " return RET_ERROR;\n" << " }\n"; - + ofs << " int " << gThreadNum << " = 1;\n\n"; ofs << " struct ModelParameter {\n" << " void *addr;\n" << " size_t size;\n" diff --git a/mindspore/lite/micro/coder/opcoders/base/detection_post_process_base_coder.cc b/mindspore/lite/micro/coder/opcoders/base/detection_post_process_base_coder.cc index cd0684d2d9..c6646d4b64 100644 --- a/mindspore/lite/micro/coder/opcoders/base/detection_post_process_base_coder.cc +++ 
b/mindspore/lite/micro/coder/opcoders/base/detection_post_process_base_coder.cc @@ -82,9 +82,9 @@ int DetectionPostProcessBaseCoder::AllocateBuffer() { MS_CHECK_PTR(params_->decoded_boxes_); params_->nms_candidate_ = allocator_->Malloc(kNumberTypeUInt8, num_boxes_ * sizeof(uint8_t), kWorkspace); MS_CHECK_PTR(params_->nms_candidate_); - params_->selected_ = allocator_->Malloc(kNumberTypeInt, num_boxes_ * sizeof(int), kWorkspace); + params_->selected_ = allocator_->Malloc(kNumberTypeInt32, num_boxes_ * sizeof(int), kWorkspace); MS_CHECK_PTR(params_->selected_); - params_->single_class_indexes_ = allocator_->Malloc(kNumberTypeInt, num_boxes_ * sizeof(int), kWorkspace); + params_->single_class_indexes_ = allocator_->Malloc(kNumberTypeInt32, num_boxes_ * sizeof(int), kWorkspace); MS_CHECK_PTR(params_->single_class_indexes_); if (params_->use_regular_nms_) { @@ -92,13 +92,13 @@ int DetectionPostProcessBaseCoder::AllocateBuffer() { allocator_->Malloc(kNumberTypeFloat, (num_boxes_ + params_->max_detections_) * sizeof(float), kWorkspace); MS_CHECK_PTR(params_->scores_); params_->indexes_ = - allocator_->Malloc(kNumberTypeInt, (num_boxes_ + params_->max_detections_) * sizeof(int), kWorkspace); + allocator_->Malloc(kNumberTypeInt32, (num_boxes_ + params_->max_detections_) * sizeof(int), kWorkspace); MS_CHECK_PTR(params_->indexes_); params_->all_class_scores_ = allocator_->Malloc(kNumberTypeFloat, (num_boxes_ + params_->max_detections_) * sizeof(float), kWorkspace); MS_CHECK_PTR(params_->all_class_scores_); params_->all_class_indexes_ = - allocator_->Malloc(kNumberTypeInt, (num_boxes_ + params_->max_detections_) * sizeof(int), kWorkspace); + allocator_->Malloc(kNumberTypeInt32, (num_boxes_ + params_->max_detections_) * sizeof(int), kWorkspace); MS_CHECK_PTR(params_->all_class_indexes_); } else { params_->scores_ = allocator_->Malloc(kNumberTypeFloat, num_boxes_ * sizeof(float), kWorkspace); diff --git a/mindspore/lite/micro/coder/opcoders/nnacl/fp32/biasadd_fp32_coder.cc 
b/mindspore/lite/micro/coder/opcoders/nnacl/fp32/biasadd_fp32_coder.cc index 5ff71219e6..5914d31e4a 100644 --- a/mindspore/lite/micro/coder/opcoders/nnacl/fp32/biasadd_fp32_coder.cc +++ b/mindspore/lite/micro/coder/opcoders/nnacl/fp32/biasadd_fp32_coder.cc @@ -36,7 +36,7 @@ int BiasAddFP32Coder::DoCode(CoderContext *ctx) { return RET_ERROR; } size_t data_size = input_tensor_->ElementsNum(); - std::string bias_str = allocator_->GetRuntimeAddr(input_tensors_.at(kWeightIndex)); + std::string bias_str = allocator_->GetRuntimeAddr(input_tensors_.at(kWeightIndex), true); Collect(ctx, {"nnacl/arithmetic.h", "nnacl/nnacl_utils.h", "nnacl/nnacl_common.h", "nnacl/base/arithmetic_base.h", "nnacl/fp32/add_fp32.h", "nnacl/fp32/arithmetic_fp32.h"}, diff --git a/mindspore/lite/micro/coder/opcoders/nnacl/int8/conv2d_int8_coder.cc b/mindspore/lite/micro/coder/opcoders/nnacl/int8/conv2d_int8_coder.cc index 0c75fff1cb..6f37ec18fe 100644 --- a/mindspore/lite/micro/coder/opcoders/nnacl/int8/conv2d_int8_coder.cc +++ b/mindspore/lite/micro/coder/opcoders/nnacl/int8/conv2d_int8_coder.cc @@ -183,13 +183,15 @@ int Conv2DINT8Coder::Resize() { int Conv2DINT8Coder::DoCode(CoderContext *const context) { std::vector asm_files; if (target_ == kARM32A) { - asm_files = {"PreSum4x16Int8Peroc.S", "PreSum4x16Int8Pert.S", "MatmulInt8Neon32.S"}; + asm_files = {"PreSum4x16Int8Peroc.S", "PreSum4x16Int8Pert.S", "MatmulInt8.S"}; } else if (target_ == kARM64) { - asm_files = {"PreSum4x16Int8Peroc.S", "PreSum4x16Int8Pert.S", "MatmulInt8Neon64.S"}; + asm_files = {"PreSum4x16Int8Peroc.S", "PreSum4x16Int8Pert.S", "MatmulInt8.S", "MatmulDpInt8.S"}; } - Collect(context, {"nnacl/int8/conv_int8.h", "nnacl/common_func.h", "wrapper/int8/convolution_int8_wrapper.h"}, + Collect(context, + {"nnacl/int8/conv_int8.h", "nnacl/common_func.h", "wrapper/int8/convolution_int8_wrapper.h", + "wrapper/base/common_wrapper.h", "wrapper/base/optimize_handler_wrapper.h"}, {"common_func.c", "pack_int8.c", "conv_int8.c", 
"winograd_transform.c", "matmul_int8.c", "fixed_point.c", - "convolution_int8_wrapper.c", "conv_init_int8_wrapper.c", "thread_pool.c"}, + "convolution_int8_wrapper.c", "conv_init_int8_wrapper.c", "common_wrapper.c", "optimize_handler_wrapper.c"}, asm_files); // call the op function nnacl::NNaclInt8Serializer code; @@ -202,7 +204,6 @@ int Conv2DINT8Coder::DoCode(CoderContext *const context) { code.CodeBaseStruct("ConvolutionInt8Args", kRunArgs, input_tensor_, packed_input_, matmul_packed_input_, packed_weight_, bias_data_, output_tensor_, filter_zp_ptr_, input_sum_, "(ConvParameter *)&conv_param", matmul_func_, support_optimize_); - code.CodeFunction("CheckSupportOptimize", kRunArgsAddr); if (support_parallel_) { code.CodeFunction(kParallelLaunch, gThreadPool, "ConvolutionInt8Run", kRunArgsAddr, gThreadNum); } else { diff --git a/mindspore/lite/micro/coder/opcoders/nnacl/int8/conv2d_int8_coder.h b/mindspore/lite/micro/coder/opcoders/nnacl/int8/conv2d_int8_coder.h index 6f3b0e233d..c34ccf670b 100644 --- a/mindspore/lite/micro/coder/opcoders/nnacl/int8/conv2d_int8_coder.h +++ b/mindspore/lite/micro/coder/opcoders/nnacl/int8/conv2d_int8_coder.h @@ -44,10 +44,8 @@ class Conv2DINT8Coder final : public Conv2DBaseCoder { } private: - int InitWeightBias(CoderContext *ctx); - void CheckSupportOptimize(); - + int InitWeightBias(CoderContext *ctx); int InitTmpBuffer(CoderContext *ctx); int Resize(); @@ -70,7 +68,7 @@ class Conv2DINT8Coder final : public Conv2DBaseCoder { int32_t *input_sum_{nullptr}; int8_t *matmul_packed_input_{nullptr}; - std::string matmul_func_; + std::string matmul_func_{"NULL"}; std::function pack_weight_init_{nullptr}; }; diff --git a/mindspore/lite/micro/coder/opcoders/serializers/serializer.h b/mindspore/lite/micro/coder/opcoders/serializers/serializer.h index 6a5401c1d2..9938a82387 100644 --- a/mindspore/lite/micro/coder/opcoders/serializers/serializer.h +++ b/mindspore/lite/micro/coder/opcoders/serializers/serializer.h @@ -168,9 +168,13 @@ class 
Serializer { * "int pointer_gen[4] = {1 ,3, 2, 42};\n * const Foo foo_gen = {{1, 2, 3}, pointer_gen, 4};\n" */ - template + template void CodeBaseStruct(const std::string &type, const std::string &name, PARAMETERS... parameters) { - code << "const " << type << " " << name << " = {"; + if constexpr (immutable) { + code << "const " << type << " " << name << " = {"; + } else { + code << type << " " << name << " = {"; + } GenCode(parameters...); code << "};\n"; } diff --git a/mindspore/lite/micro/coder/operator_library/CMakeLists.txt b/mindspore/lite/micro/coder/operator_library/CMakeLists.txt index c86d5f7fd3..fe7cb4b7a2 100644 --- a/mindspore/lite/micro/coder/operator_library/CMakeLists.txt +++ b/mindspore/lite/micro/coder/operator_library/CMakeLists.txt @@ -22,7 +22,6 @@ endif() set(MICRO_CMAKE_PATH ${MICRO_DIR}/cmake) set(OPERATOR_LIBRARY_PATH ${CMAKE_BINARY_DIR}/operator_library) set(HEADER_PATH "${OPERATOR_LIBRARY_PATH}/include") -set(LIB_PATH "${OPERATOR_LIBRARY_PATH}/lib/x86") message("===========>start to pack operators' head file") file(REMOVE_RECURSE ${OPERATOR_LIBRARY_PATH}) @@ -36,14 +35,31 @@ file(REMOVE_RECURSE ${HEADER_PATH}/nnacl/assembly) file(REMOVE_RECURSE ${HEADER_PATH}/nnacl/fp16) file(REMOVE_RECURSE ${HEADER_PATH}/nnacl/fp16_grad) file(REMOVE_RECURSE ${HEADER_PATH}/nnacl/fp32_grad) -file(REMOVE_RECURSE ${HEADER_PATH}/nnacl/intrinsics) file(REMOVE_RECURSE ${HEADER_PATH}/nnacl/optimize) +if(PLATFORM_ARM64) + set(MICRO_BUILD_ARM64 ON) +endif() +if(PLATFORM_ARM32) + set(MICRO_BUILD_ARM32A ON) +endif() + include(${MICRO_CMAKE_PATH}/package_android.cmake) include(${MICRO_CMAKE_PATH}/package_nnacl.cmake) include(${MICRO_CMAKE_PATH}/package_cmsis.cmake) include(${MICRO_CMAKE_PATH}/package_wrapper.cmake) +list(APPEND OP_FILES ${NNACL_OPS} ${WRAPPER_SRC} ${RUNTIME_SRC}) + +if(PLATFORM_ARM64) + set(LIB_PATH "${OPERATOR_LIBRARY_PATH}/lib/arm64") +elseif(PLATFORM_ARM32) + set(LIB_PATH "${OPERATOR_LIBRARY_PATH}/lib/arm32a") +else() + set(LIB_PATH 
"${OPERATOR_LIBRARY_PATH}/lib/x86") + list(APPEND OP_FILES ${CMSIS_OPS}) +endif() + # generate static library -add_library(ops STATIC ${NNACL_OPS} ${CMSIS_OPS} ${WRAPPER_SRC} ${RUNTIME_SRC}) +add_library(ops STATIC ${OP_FILES}) install(TARGETS ops ARCHIVE DESTINATION ${LIB_PATH}) diff --git a/mindspore/lite/micro/coder/operator_library/wrapper/base/common_wrapper.c b/mindspore/lite/micro/coder/operator_library/wrapper/base/common_wrapper.c new file mode 100644 index 0000000000..014a80d2ca --- /dev/null +++ b/mindspore/lite/micro/coder/operator_library/wrapper/base/common_wrapper.c @@ -0,0 +1,36 @@ +/* + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "wrapper/base/common_wrapper.h" +#ifdef __ANDROID__ +#include <sys/auxv.h> +#include <asm/hwcap.h> +#endif + +bool GetSupportOptFlag() { + bool status = false; +#ifdef ENABLE_ARM64 + int hwcap_type = 16; + // getHwCap + uint32_t hwcap = getauxval(hwcap_type); + if (hwcap & HWCAP_ASIMDDP) { + status = true; + } else { + status = false; + } +#endif + return status; +} diff --git a/mindspore/lite/micro/coder/operator_library/wrapper/base/common_wrapper.h b/mindspore/lite/micro/coder/operator_library/wrapper/base/common_wrapper.h new file mode 100644 index 0000000000..a68fdf8240 --- /dev/null +++ b/mindspore/lite/micro/coder/operator_library/wrapper/base/common_wrapper.h @@ -0,0 +1,24 @@ +/* + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef MINDSPORE_LITE_MICRO_CODER_OPERATOR_LIBRARY_COMMON_WRAPPER_H_ +#define MINDSPORE_LITE_MICRO_CODER_OPERATOR_LIBRARY_COMMON_WRAPPER_H_ + +#include "nnacl/op_base.h" + +bool GetSupportOptFlag(); + +#endif // MINDSPORE_LITE_MICRO_CODER_OPERATOR_LIBRARY_COMMON_WRAPPER_H_ diff --git a/mindspore/lite/micro/coder/operator_library/wrapper/base/optimize_handler_wrapper.c b/mindspore/lite/micro/coder/operator_library/wrapper/base/optimize_handler_wrapper.c new file mode 100644 index 0000000000..bee2c6e35e --- /dev/null +++ b/mindspore/lite/micro/coder/operator_library/wrapper/base/optimize_handler_wrapper.c @@ -0,0 +1,49 @@ +/* + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "wrapper/base/optimize_handler_wrapper.h" + +extern void MatMulOptR4Int8Neon64(const int8_t *a, const int8_t *b, int *dst, int row4, int col4, int deep16, + const int *input_sum, const int *bias); +extern void MatmulInt8DpNeon64(const int8_t *a, const int8_t *b, int8_t *dst, int row8, int col8, int deep4, + const int *a_sums, const int *bias, int act_min, int act_max, int out_zp, + int *multiplier, int *left_shift, int *right_shift, int row, int col, int stride, + size_t peroc); +extern void MatmulInt8DpOpt(const int8_t *a, const int8_t *b, int8_t *dst, size_t row8, size_t col8, size_t deep4, + const int *a_sums, const int *bias, int act_min, int act_max, int out_zp, int *multiplier, + int *left_shift, int *right_shift, size_t stride, size_t peroc, int *filter_zp); + +#ifdef ENABLE_ARM64 +void MatMulR4Int8_optimize_handler(const int8_t *a, const int8_t *b, int *dst, int row4, int col4, int deep16, + const int *input_sum, const int *bias) { + return MatMulOptR4Int8Neon64(a, b, dst, row4, col4, deep16, input_sum, bias); +} + +void MatMulRInt8_optimize_handler(const int8_t *a, const int8_t *b, int8_t *dst, size_t row, size_t col, size_t deep_4, + size_t stride, const int32_t *input_sum, const int32_t *bias, int32_t *left_shift, + int32_t *right_shift, int32_t *multiplier, int32_t output_zp, int32_t mini, + int32_t maxi, size_t per_channel) { + return MatmulInt8DpNeon64(a, b, dst, UP_ROUND(row, C8NUM), UP_ROUND(col, C8NUM), deep_4, input_sum, bias, mini, maxi, + output_zp, multiplier, left_shift, right_shift, row, col, stride, per_channel); +} +void MatMulDpInt8_optimize_handler(const int8_t *a, const int8_t *b, int8_t *dst, size_t row, size_t col, size_t deep_4, + size_t stride, const int32_t *input_sum, const int32_t *bias, int32_t *left_shift, + int32_t *right_shift, int32_t *multiplier, int32_t output_zp, int32_t mini, + int32_t maxi, size_t per_channel, int32_t *filter_zp) { + return MatmulInt8DpOpt(a, b, dst, row, col, deep_4, input_sum, bias, 
mini, maxi, output_zp, multiplier, left_shift, + right_shift, stride, per_channel, filter_zp); +} +#endif diff --git a/mindspore/lite/micro/coder/operator_library/wrapper/base/optimize_handler_wrapper.h b/mindspore/lite/micro/coder/operator_library/wrapper/base/optimize_handler_wrapper.h new file mode 100644 index 0000000000..40e82acbab --- /dev/null +++ b/mindspore/lite/micro/coder/operator_library/wrapper/base/optimize_handler_wrapper.h @@ -0,0 +1,41 @@ +/* + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef MINDSPORE_LITE_MICRO_CODER_OPERATOR_LIBRARY_OPTIMIZE_HANDLER_WRAPPER_H_ +#define MINDSPORE_LITE_MICRO_CODER_OPERATOR_LIBRARY_OPTIMIZE_HANDLER_WRAPPER_H_ + +#include "nnacl/op_base.h" + +#ifdef ENABLE_ARM64 +void IndirectGemmInt8_optimize_handler(int8_t *dst, const int8_t *src, const int8_t *weight, const int32_t *bias, + size_t ksize, size_t ic4, size_t output_channel, size_t offset, + const int32_t *input_sum, size_t act_min, size_t act_max, size_t out_zp, + int32_t *out_multiplier, int32_t *shift_before, int32_t *shift_after, + size_t asymmetric, size_t per_channel, size_t per_channel_offset); +void MatMulR4Int8_optimize_handler(const int8_t *a, const int8_t *b, int *dst, int row4, int col4, int deep16, + const int *input_sum, const int *bias); + +void MatMulRInt8_optimize_handler(const int8_t *a, const int8_t *b, int8_t *dst, size_t row, size_t col, size_t deep_4, + size_t stride, const int32_t *input_sum, const int32_t *bias, int32_t *left_shift, + int32_t *right_shift, int32_t *multiplier, int32_t output_zp, int32_t mini, + int32_t maxi, size_t per_channel); +void MatMulDpInt8_optimize_handler(const int8_t *a, const int8_t *b, int8_t *dst, size_t row, size_t col, size_t deep_4, + size_t stride, const int32_t *input_sum, const int32_t *bias, int32_t *left_shift, + int32_t *right_shift, int32_t *multiplier, int32_t output_zp, int32_t mini, + int32_t maxi, size_t per_channel, int32_t *filter_zp); +#endif + +#endif // MINDSPORE_LITE_MICRO_CODER_OPERATOR_LIBRARY_OPTIMIZE_HANDLER_WRAPPER_H_ diff --git a/mindspore/lite/micro/coder/operator_library/wrapper/fp32/matmul_fp32_wrapper.c b/mindspore/lite/micro/coder/operator_library/wrapper/fp32/matmul_fp32_wrapper.c index db505679c0..50dbab12dd 100644 --- a/mindspore/lite/micro/coder/operator_library/wrapper/fp32/matmul_fp32_wrapper.c +++ b/mindspore/lite/micro/coder/operator_library/wrapper/fp32/matmul_fp32_wrapper.c @@ -22,21 +22,12 @@ void InitMatrixA(const float *src_ptr, float *dst_ptr, const 
MatMulParameter *pa } for (int i = 0; i < params_->batch; i++) { const float *src = src_ptr + i * params_->deep_ * params_->row_; -#ifdef ENABLE_ARM32 - float *dst = dst_ptr + i * params_->deep_ * params_->row_4_; - if (params_->a_transpose_) { - RowMajor2Row4Major(src, dst, params_->deep_, params_->row_); - } else { - RowMajor2Col4Major(src, dst, params_->row_, params_->deep_); - } -#else - float *dst = dst_ptr + i * params_->deep_ * params_->row_12_; + float *dst = dst_ptr + i * params_->deep_ * params_->row_align_; if (params_->a_transpose_) { RowMajor2Row12Major(src, dst, params_->deep_, params_->row_); } else { RowMajor2Col12Major(src, dst, params_->row_, params_->deep_); } -#endif } } @@ -55,11 +46,19 @@ void InitMatrixB(const float *src_ptr, float *dst_ptr, const MatMulParameter *pa } for (int i = 0; i < params_->batch; i++) { const float *src = src_ptr + i * params_->deep_ * params_->col_; - float *dst = dst_ptr + i * params_->deep_ * params_->col_8_; + float *dst = dst_ptr + i * params_->deep_ * params_->col_align_; +#ifdef ENABLE_ARM32 + if (params_->b_transpose_) { + RowMajor2Col4Major(src, dst, params_->col_, params_->deep_); + } else { + RowMajor2Row4Major(src, dst, params_->deep_, params_->col_); + } +#else if (params_->b_transpose_) { RowMajor2Col8Major(src, dst, params_->col_, params_->deep_); } else { RowMajor2Row8Major(src, dst, params_->deep_, params_->col_); } +#endif } } diff --git a/mindspore/lite/micro/coder/operator_library/wrapper/int8/convolution_int8_wrapper.c b/mindspore/lite/micro/coder/operator_library/wrapper/int8/convolution_int8_wrapper.c index 3f916829bf..48f5ccf778 100644 --- a/mindspore/lite/micro/coder/operator_library/wrapper/int8/convolution_int8_wrapper.c +++ b/mindspore/lite/micro/coder/operator_library/wrapper/int8/convolution_int8_wrapper.c @@ -16,24 +16,6 @@ #include "wrapper/int8/convolution_int8_wrapper.h" -void CheckSupportOptimize(const ConvolutionInt8Args *args) { - int tile_num = 8; -#ifdef ENABLE_ARM32 - tile_num = 
4; - args->is_optimize_ = false; -#endif -#ifdef ENABLE_ARM64 - if (mindspore::lite::IsSupportSDot()) { - matmul_func_ = MatMulRInt8_optimize_handler; - args->is_optimize_ = true; - } else { - tile_num = 4; - args->is_optimize_ = false; - } -#endif - args->conv_param_->tile_num_ = tile_num; -} - int ConvolutionInt8Run(void *cdata, int task_id) { ConvolutionInt8Args *args = (ConvolutionInt8Args *)cdata; ConvInt8(args->input_data_, args->packed_input_, args->matmul_input_, args->packed_weight_, args->bias_data_, diff --git a/mindspore/lite/micro/coder/operator_library/wrapper/int8/convolution_int8_wrapper.h b/mindspore/lite/micro/coder/operator_library/wrapper/int8/convolution_int8_wrapper.h index ec19d41aa1..082ccf4156 100644 --- a/mindspore/lite/micro/coder/operator_library/wrapper/int8/convolution_int8_wrapper.h +++ b/mindspore/lite/micro/coder/operator_library/wrapper/int8/convolution_int8_wrapper.h @@ -36,8 +36,6 @@ typedef struct { bool is_optimize_; } ConvolutionInt8Args; -void CheckSupportOptimize(const ConvolutionInt8Args *args); - int ConvolutionInt8Run(void *cdata, int task_id); #endif // MINDSPORE_LITE_MICRO_INT8_CONVOLUTION_WRAPPER_INT8_WRAPPER_H_