diff --git a/mindspore/lite/include/lite_session.h b/mindspore/lite/include/lite_session.h
index 865cbb0857..28222de1e1 100644
--- a/mindspore/lite/include/lite_session.h
+++ b/mindspore/lite/include/lite_session.h
@@ -113,7 +113,8 @@ class MS_API LiteSession {
   /// \brief Resize inputs shape.
   ///
-  /// \param[in] inputs Define the new inputs shape.
+  /// \param[in] inputs Define the inputs of the model.
+  /// \param[in] dims Define the new shape of the inputs.
   ///
   /// \return STATUS as an error code of resize inputs, STATUS is defined in errorcode.h.
   virtual int Resize(const std::vector<tensor::MSTensor *> &inputs, const std::vector<std::vector<int>> &dims) = 0;
 
diff --git a/mindspore/lite/internal/CMakeLists.txt b/mindspore/lite/internal/CMakeLists.txt
index 77706f721e..174c6fa4d5 100644
--- a/mindspore/lite/internal/CMakeLists.txt
+++ b/mindspore/lite/internal/CMakeLists.txt
@@ -4,18 +4,22 @@ set(TOP_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../)
 include_directories(${TOP_DIR})
 
-file(GLOB_RECURSE C_SRC ${CMAKE_CURRENT_SOURCE_DIR}/*.cc)
 file(GLOB KERNEL_SRC
-        ${CMAKE_CURRENT_SOURCE_DIR}/../nnacl/*.c
-        ${CMAKE_CURRENT_SOURCE_DIR}/../nnacl/fp32/*.c
-        ${CMAKE_CURRENT_SOURCE_DIR}/../nnacl/fp32_grad/*.c
-        ${CMAKE_CURRENT_SOURCE_DIR}/../nnacl/int8/*.c
-        ${CMAKE_CURRENT_SOURCE_DIR}/../nnacl/quantization/*.c
+        ${CMAKE_CURRENT_SOURCE_DIR}/../nnacl/arithmetic_common.c
+        ${CMAKE_CURRENT_SOURCE_DIR}/../nnacl/fp32/activation.c
+        ${CMAKE_CURRENT_SOURCE_DIR}/../nnacl/fp32/arithmetic_self.c
+        ${CMAKE_CURRENT_SOURCE_DIR}/../nnacl/fp32/arithmetic.c
+        ${CMAKE_CURRENT_SOURCE_DIR}/../nnacl/fp32/matmul.c
+        ${CMAKE_CURRENT_SOURCE_DIR}/../nnacl/fp32_grad/activation_grad.c
         ${CMAKE_CURRENT_SOURCE_DIR}/src/kernel/fp32/*.cc
+        ${CMAKE_CURRENT_SOURCE_DIR}/src/kernel/fp32_grad/*.cc
+        ${CMAKE_CURRENT_SOURCE_DIR}/src/kernel/common/*.cc
         )
 list(REMOVE_ITEM KERNEL_SRC ${CMAKE_CURRENT_SOURCE_DIR}/../nnacl/opt_op_handler.c)
 
 set(CCSRC
+        ${CMAKE_CURRENT_SOURCE_DIR}/src/lite_session.cc
+        ${CMAKE_CURRENT_SOURCE_DIR}/src/ms_tensor.cc
         ${TOP_DIR}/src/common/log_adapter.cc
         ${TOP_DIR}/src/runtime/allocator.cc
         ${CMAKE_CURRENT_SOURCE_DIR}/../../core/gvar/logging_level.cc
@@ -23,11 +27,12 @@ set(CCSRC
 
 if (PLATFORM_ARM64)
     # assembly
-    file(GLOB ASSEMBLY_SRC ${CMAKE_CURRENT_SOURCE_DIR}/../nnacl/assembly/arm64/*.s
-            ${CMAKE_CURRENT_SOURCE_DIR}/../nnacl/assembly/arm64/*.S)
+    file(GLOB ASSEMBLY_SRC
+            ${CMAKE_CURRENT_SOURCE_DIR}/../nnacl/assembly/arm64/MatmulFp32OptRemain.S
+            ${CMAKE_CURRENT_SOURCE_DIR}/../nnacl/assembly/arm64/MatmulFp32Opt.S)
     set_property(SOURCE ${ASSEMBLY_SRC} PROPERTY LANGUAGE C)
     set(KERNEL_SRC ${KERNEL_SRC} ${ASSEMBLY_SRC})
-    add_library(mslite_internal SHARED ${C_SRC} ${CCSRC} ${KERNEL_SRC})
+    add_library(mslite_internal SHARED ${CCSRC} ${KERNEL_SRC})
     target_link_libraries(mslite_internal log)
 endif()
diff --git a/mindspore/lite/internal/include/lite_session.h b/mindspore/lite/internal/include/lite_session.h
index 55d2095a27..c1bf5242df 100644
--- a/mindspore/lite/internal/include/lite_session.h
+++ b/mindspore/lite/internal/include/lite_session.h
@@ -84,7 +84,7 @@ typedef struct LiteSession {
   /// \param[in] inputs Define the new inputs shape.
   ///
   /// \return STATUS as an error code of resize inputs, STATUS is defined in errorcode.h.
-  int Resize(const TensorPtrVector &inputs, Int32VectorVector dims);
+  int Resize(const TensorPtrVector &inputs, const Int32VectorVector &dims);
 } LiteSession;
 
 #endif  // MINDSPORE_LITE_INCLUDE_LITE_SESSION_H
diff --git a/mindspore/lite/internal/include/lite_utils.h b/mindspore/lite/internal/include/lite_utils.h
index 5661b75f0c..a9236f04ea 100644
--- a/mindspore/lite/internal/include/lite_utils.h
+++ b/mindspore/lite/internal/include/lite_utils.h
@@ -21,12 +21,13 @@
 struct MSTensor;
 struct Node;
+using TensorPtr = MSTensor *;
 using TensorPtrVector = std::vector<MSTensor *>;
 using Uint32Vector = std::vector<uint32_t>;
 using String = std::string;
 using StringVector = std::vector<String>;
 using ShapeVector = std::vector<int>;
 using NodePtrVector = std::vector<struct Node *>;
-using Int32Vector = std::vector<int32_t>;
+using Int32Vector = std::vector<int>;
 using Int32VectorVector = std::vector<Int32Vector>;
 
 #endif  // MINDSPORE_LITE_INCLUDE_LITE_UTILS_H_
diff --git a/mindspore/lite/internal/include/model.h b/mindspore/lite/internal/include/model.h
index eafbda9e77..d0cbe315dc 100644
--- a/mindspore/lite/internal/include/model.h
+++ b/mindspore/lite/internal/include/model.h
@@ -182,6 +182,7 @@ enum KernelType {
   NegGrad,
   LogGrad,
   BatchToSpaceND,
+  END,
 };
 
 enum ActivationType {
diff --git a/mindspore/lite/internal/src/kernel/common/common_infershape.cc b/mindspore/lite/internal/src/kernel/common/common_infershape.cc
new file mode 100644
index 0000000000..c88754cd91
--- /dev/null
+++ b/mindspore/lite/internal/src/kernel/common/common_infershape.cc
@@ -0,0 +1,31 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "internal/src/kernel/common/common_infershape.h"
+#include "internal/include/errorcode.h"
+#include "internal/include/ms_tensor.h"
+#include "utils/log_adapter.h"
+
+int DoCommonInferShape(const TensorPtrVector &in_tensors, const TensorPtrVector &out_tensors) {
+  TensorPtr input = in_tensors.at(0);
+  MS_ASSERT(input != nullptr);
+  TensorPtr output = out_tensors.at(0);
+  MS_ASSERT(output != nullptr);
+  output->format_ = input->format_;
+  output->data_type_ = input->data_type_;
+  output->shape_ = input->shape_;
+  return RET_OK;
+}
diff --git a/mindspore/lite/internal/src/kernel/common/common_infershape.h b/mindspore/lite/internal/src/kernel/common/common_infershape.h
new file mode 100644
index 0000000000..20cc3549e2
--- /dev/null
+++ b/mindspore/lite/internal/src/kernel/common/common_infershape.h
@@ -0,0 +1,24 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_LITE_INTERNAL_SRC_KERNEL_COMMON_INFERSHAPE_H_
+#define MINDSPORE_LITE_INTERNAL_SRC_KERNEL_COMMON_INFERSHAPE_H_
+
+#include "internal/include/model.h"
+
+int DoCommonInferShape(const TensorPtrVector &in_tensors, const TensorPtrVector &out_tensors);
+
+#endif  // MINDSPORE_LITE_INTERNAL_SRC_KERNEL_COMMON_INFERSHAPE_H_
diff --git a/mindspore/lite/internal/src/kernel/fp32/activation.cc b/mindspore/lite/internal/src/kernel/fp32/activation.cc
index d2eb03175f..d89d90200d 100644
--- a/mindspore/lite/internal/src/kernel/fp32/activation.cc
+++ b/mindspore/lite/internal/src/kernel/fp32/activation.cc
@@ -15,13 +15,18 @@
  */
 
 #include "internal/src/kernel/fp32/activation.h"
+#include "internal/src/kernel/common/common_infershape.h"
 #include "internal/include/errorcode.h"
 #include "internal/include/ms_tensor.h"
 #include "nnacl/fp32/activation.h"
 #include "utils/log_adapter.h"
 #include "nnacl/errorcode.h"
 
-int DoActivation(TensorPtrVector in_tensors, TensorPtrVector out_tensors, Node *node,
+int DoActivationInferShape(const TensorPtrVector &in_tensors, const TensorPtrVector &out_tensors, OpParameter *param) {
+  return DoCommonInferShape(in_tensors, out_tensors);
+}
+
+int DoActivation(const TensorPtrVector &in_tensors, const TensorPtrVector &out_tensors, Node *node,
                  mindspore::lite::Allocator *allocator) {
   ActivationParameter *param = (ActivationParameter *)node->primitive_;
   int ret = RET_OK;
diff --git a/mindspore/lite/internal/src/kernel/fp32/activation.h b/mindspore/lite/internal/src/kernel/fp32/activation.h
index 50f28f26e1..dbb5c1b79e 100644
--- a/mindspore/lite/internal/src/kernel/fp32/activation.h
+++ b/mindspore/lite/internal/src/kernel/fp32/activation.h
@@ -20,7 +20,8 @@
 #include "internal/include/model.h"
 #include "src/runtime/allocator.h"
 
-int DoActivation(TensorPtrVector in_tensors, TensorPtrVector out_tensors, Node *node,
+int DoActivationInferShape(const TensorPtrVector &in_tensors, const TensorPtrVector &out_tensors, OpParameter *param);
+int DoActivation(const TensorPtrVector &in_tensors, const TensorPtrVector &out_tensors, Node *node,
                  mindspore::lite::Allocator *allocator);
 
 #endif  // MINDSPORE_LITE_INTERNAL_SRC_KERNEL_FP32_ACTIVATION_H_
diff --git a/mindspore/lite/internal/src/kernel/fp32/arithmetic_self.cc b/mindspore/lite/internal/src/kernel/fp32/arithmetic_self.cc
index 9ca58508fa..58334dad88 100644
--- a/mindspore/lite/internal/src/kernel/fp32/arithmetic_self.cc
+++ b/mindspore/lite/internal/src/kernel/fp32/arithmetic_self.cc
@@ -15,12 +15,18 @@
  */
 
 #include "internal/src/kernel/fp32/arithmetic_self.h"
+#include "internal/src/kernel/common/common_infershape.h"
 #include "internal/include/errorcode.h"
 #include "internal/include/ms_tensor.h"
 #include "utils/log_adapter.h"
 #include "nnacl/fp32/arithmetic_self.h"
 
-int DoArithmeticSelf(TensorPtrVector in_tensors, TensorPtrVector out_tensors, Node *node,
+int DoArithmeticSelfInferShape(const TensorPtrVector &in_tensors, const TensorPtrVector &out_tensors,
+                               OpParameter *param) {
+  return DoCommonInferShape(in_tensors, out_tensors);
+}
+
+int DoArithmeticSelf(const TensorPtrVector &in_tensors, const TensorPtrVector &out_tensors, Node *node,
                      mindspore::lite::Allocator *allocator) {
   size_t data_size = in_tensors[0]->ElementsNum();
   OpParameter *param = node->primitive_;
diff --git a/mindspore/lite/internal/src/kernel/fp32/arithmetic_self.h b/mindspore/lite/internal/src/kernel/fp32/arithmetic_self.h
index 3d4c285a6d..37b23c81fc 100644
--- a/mindspore/lite/internal/src/kernel/fp32/arithmetic_self.h
+++ b/mindspore/lite/internal/src/kernel/fp32/arithmetic_self.h
@@ -20,7 +20,9 @@
 #include "internal/include/model.h"
 #include "src/runtime/allocator.h"
 
-int DoArithmeticSelf(TensorPtrVector in_tensors, TensorPtrVector out_tensors, Node *node,
+int DoArithmeticSelfInferShape(const TensorPtrVector &in_tensors, const TensorPtrVector &out_tensors,
+                               OpParameter *param);
+int DoArithmeticSelf(const TensorPtrVector &in_tensors, const TensorPtrVector &out_tensors, Node *node,
                      mindspore::lite::Allocator *allocator);
 
 #endif  // MINDSPORE_LITE_INTERNAL_SRC_KERNEL_FP32_ARITHMETIC_SELF_H_
diff --git a/mindspore/lite/internal/src/kernel/fp32/matmul.cc b/mindspore/lite/internal/src/kernel/fp32/matmul.cc
index d525359bc4..2b106346d0 100644
--- a/mindspore/lite/internal/src/kernel/fp32/matmul.cc
+++ b/mindspore/lite/internal/src/kernel/fp32/matmul.cc
@@ -71,7 +71,50 @@ void FreeMatMulKernelData(MatMulCPUKernelData *kernel_data, mindspore::lite::All
   free(kernel_data);
 }
 
-int DoMatMul(TensorPtrVector in_tensors, TensorPtrVector out_tensors, Node *node,
+static void SwapDims(Int32Vector *dims, int index1, int index2) {
+  int tmp = dims->at(index1);
+  dims->at(index1) = dims->at(index2);
+  dims->at(index2) = tmp;
+}
+
+int DoMatMulInferShape(const TensorPtrVector &in_tensors, const TensorPtrVector &out_tensors, OpParameter *param) {
+  MS_ASSERT(param != nullptr);
+  TensorPtr input0 = in_tensors.at(0);
+  MS_ASSERT(input0 != nullptr);
+  TensorPtr input1 = in_tensors.at(1);
+  MS_ASSERT(input1 != nullptr);
+  TensorPtr output = out_tensors.at(0);
+  MS_ASSERT(output != nullptr);
+
+  output->data_type_ = input0->data_type_;
+  output->format_ = input0->format_;
+
+  Int32Vector a_shape = input0->shape_;
+  Int32Vector b_shape = input1->shape_;
+  if (a_shape.size() < 2 || b_shape.size() < 2) {
+    MS_LOG(ERROR) << "inputs shape is invalid";
+    return RET_INPUT_TENSOR_ERROR;
+  }
+  for (size_t i = 0; i < a_shape.size() - 2; ++i) {
+    if (a_shape[i] != b_shape[i]) {
+      MS_LOG(ERROR) << "Op MatMul's batch dimensions must be equal";
+      return RET_INPUT_TENSOR_ERROR;
+    }
+  }
+
+  MatMulParameter *matmul_param = (MatMulParameter *)param;
+  if (matmul_param->a_transpose_) {
+    SwapDims(&a_shape, a_shape.size() - 1, a_shape.size() - 2);
+  }
+  if (matmul_param->b_transpose_) {
+    SwapDims(&b_shape, b_shape.size() - 1, b_shape.size() - 2);
+  }
+  output->shape_ = a_shape;
+  output->shape_.at(a_shape.size() - 1) = b_shape.at(b_shape.size() - 1);
+  return RET_OK;
+}
+
+int DoMatMul(const TensorPtrVector &in_tensors, const TensorPtrVector &out_tensors, Node *node,
             mindspore::lite::Allocator *allocator) {
   if (in_tensors[0]->data_ == NULL || in_tensors[1]->data_ ==NULL) {
     MS_LOG(ERROR) << "input data is NULL!";
diff --git a/mindspore/lite/internal/src/kernel/fp32/matmul.h b/mindspore/lite/internal/src/kernel/fp32/matmul.h
index 3d9a701392..a0b98dacc7 100644
--- a/mindspore/lite/internal/src/kernel/fp32/matmul.h
+++ b/mindspore/lite/internal/src/kernel/fp32/matmul.h
@@ -20,7 +20,8 @@
 #include "internal/include/model.h"
 #include "src/runtime/allocator.h"
 
-int DoMatMul(TensorPtrVector in_tensors, TensorPtrVector out_tensors, Node *node,
+int DoMatMulInferShape(const TensorPtrVector &in_tensors, const TensorPtrVector &out_tensors, OpParameter *param);
+int DoMatMul(const TensorPtrVector &in_tensors, const TensorPtrVector &out_tensors, Node *node,
             mindspore::lite::Allocator *allocator);
 
 #endif  // MINDSPORE_LITE_INTERNAL_SRC_KERNEL_FP32_MATMUL_H_
diff --git a/mindspore/lite/internal/src/kernel/fp32_grad/activation_grad.cc b/mindspore/lite/internal/src/kernel/fp32_grad/activation_grad.cc
index 0d9d60caa8..639a2c4034 100644
--- a/mindspore/lite/internal/src/kernel/fp32_grad/activation_grad.cc
+++ b/mindspore/lite/internal/src/kernel/fp32_grad/activation_grad.cc
@@ -15,13 +15,19 @@
  */
 
 #include "internal/src/kernel/fp32_grad/activation_grad.h"
+#include "internal/src/kernel/common/common_infershape.h"
 #include "internal/include/errorcode.h"
 #include "internal/include/ms_tensor.h"
 #include "nnacl/fp32_grad/activation_grad.h"
 #include "utils/log_adapter.h"
 #include "nnacl/errorcode.h"
 
-int DoActivationGrad(TensorPtrVector in_tensors, TensorPtrVector out_tensors, Node *node,
+int DoActivationGradInferShape(const TensorPtrVector &in_tensors, const TensorPtrVector &out_tensors,
+                               OpParameter *param) {
+  return DoCommonInferShape(in_tensors, out_tensors);
+}
+
+int DoActivationGrad(const TensorPtrVector &in_tensors, const TensorPtrVector &out_tensors, Node *node,
                      mindspore::lite::Allocator *allocator) {
   ActivationGradParameter *param = (ActivationGradParameter *)node->primitive_;
   int ret = RET_OK;
diff --git a/mindspore/lite/internal/src/kernel/fp32_grad/activation_grad.h b/mindspore/lite/internal/src/kernel/fp32_grad/activation_grad.h
index fee6b5ec49..8e3eddc3b3 100644
--- a/mindspore/lite/internal/src/kernel/fp32_grad/activation_grad.h
+++ b/mindspore/lite/internal/src/kernel/fp32_grad/activation_grad.h
@@ -20,7 +20,9 @@
 #include "internal/include/model.h"
 #include "src/runtime/allocator.h"
 
-int DoActivationGrad(TensorPtrVector in_tensors, TensorPtrVector out_tensors, Node *node,
+int DoActivationGradInferShape(const TensorPtrVector &in_tensors, const TensorPtrVector &out_tensors,
+                               OpParameter *param);
+int DoActivationGrad(const TensorPtrVector &in_tensors, const TensorPtrVector &out_tensors, Node *node,
                      mindspore::lite::Allocator *allocator);
 
 #endif  // MINDSPORE_LITE_INTERNAL_SRC_KERNEL_FP32_GRAD_ACTIVATION_GRAD_H_
diff --git a/mindspore/lite/internal/src/kernel/fp32_grad/arithmetic_self_grad.cc b/mindspore/lite/internal/src/kernel/fp32_grad/arithmetic_self_grad.cc
index 80356b4400..375bc976ad 100644
--- a/mindspore/lite/internal/src/kernel/fp32_grad/arithmetic_self_grad.cc
+++ b/mindspore/lite/internal/src/kernel/fp32_grad/arithmetic_self_grad.cc
@@ -15,13 +15,19 @@
  */
 
 #include "internal/src/kernel/fp32_grad/arithmetic_self_grad.h"
+#include "internal/src/kernel/common/common_infershape.h"
 #include "internal/include/errorcode.h"
 #include "internal/include/ms_tensor.h"
 #include "utils/log_adapter.h"
 #include "nnacl/fp32/arithmetic_self.h"
 #include "nnacl/fp32/arithmetic.h"
 
-int DoArithmeticGradSelf(TensorPtrVector in_tensors, TensorPtrVector out_tensors, Node *node,
+int DoArithmeticSelfGradInferShape(const TensorPtrVector &in_tensors, const TensorPtrVector &out_tensors,
+                                   OpParameter *param) {
+  return DoCommonInferShape(in_tensors, out_tensors);
+}
+
+int DoArithmeticSelfGrad(const TensorPtrVector &in_tensors, const TensorPtrVector &out_tensors, Node *node,
                          mindspore::lite::Allocator *allocator) {
   size_t data_size = in_tensors[0]->ElementsNum();
   OpParameter *param = node->primitive_;
diff --git a/mindspore/lite/internal/src/kernel/fp32_grad/arithmetic_self_grad.h b/mindspore/lite/internal/src/kernel/fp32_grad/arithmetic_self_grad.h
index 952ab2bc7b..8f1d06aae2 100644
--- a/mindspore/lite/internal/src/kernel/fp32_grad/arithmetic_self_grad.h
+++ b/mindspore/lite/internal/src/kernel/fp32_grad/arithmetic_self_grad.h
@@ -20,7 +20,9 @@
"internal/include/model.h" #include "src/runtime/allocator.h" -int DoArithmeticGradSelf(TensorPtrVector in_tensors, TensorPtrVector out_tensors, Node *node, +int DoArithmeticSelfGradInferShape(const TensorPtrVector &in_tensors, const TensorPtrVector &out_tensors, + OpParameter *param); +int DoArithmeticSelfGrad(const TensorPtrVector &in_tensors, const TensorPtrVector &out_tensors, Node *node, mindspore::lite::Allocator *allocator); #endif // MINDSPORE_LITE_INTERNAL_SRC_KERNEL_FP32_GRAD_ARITHMETIC_SELF_GRAD_H_ diff --git a/mindspore/lite/internal/src/lite_session.cc b/mindspore/lite/internal/src/lite_session.cc index d791529971..e691103bb7 100644 --- a/mindspore/lite/internal/src/lite_session.cc +++ b/mindspore/lite/internal/src/lite_session.cc @@ -25,29 +25,97 @@ #include "internal/src/kernel/fp32_grad/arithmetic_self_grad.h" #include "internal/src/kernel/fp32_grad/activation_grad.h" -static Context *g_Ctx; -static Model *g_Model; -static LiteSession g_Session; -static mindspore::lite::DefaultAllocator allocator; +static Context *g_ctx; +static Model *g_model; +static LiteSession g_session; +static mindspore::lite::DefaultAllocator g_allocator; +static bool g_infershape_interrupt = false; +static bool g_first_load = true; +typedef int (*InferShape)(const TensorPtrVector &in_tensors, const TensorPtrVector &out_tensors, OpParameter *param); +typedef int (*RunKernel)(const TensorPtrVector &in_tensors, const TensorPtrVector &out_tensors, Node *node, + mindspore::lite::Allocator *allocator); +static InferShape g_infershape_funcs[KernelType::END]; +static RunKernel g_runkernel_funcs[KernelType::END]; + +static int ModelInferShape() { + NodePtrVector nodes = g_model->nodes_; + size_t nodes_size = nodes.size(); + for (size_t i = 0; i < nodes_size; ++i) { + auto node = nodes[i]; + if (node->primitive_ == nullptr) { + MS_LOG(ERROR) << "node's primitive is NULL!"; + return RET_ERROR; + } + TensorPtrVector in_tensors; + for (size_t j = 0; j < node->input_indices_.size(); ++j) { + in_tensors.push_back(g_model->all_tensors_[node->input_indices_[j]]); + } + TensorPtrVector out_tensors; + for (size_t j = 0; j < node->output_indices_.size(); ++j) { + out_tensors.push_back(g_model->all_tensors_[node->output_indices_[j]]); + } + int type = node->primitive_->type_; + InferShape infershape = g_infershape_funcs[type]; + if (infershape == NULL) { + MS_LOG(ERROR) << "Unsupport kernel type: " << type; + return RET_PARAM_INVALID; + } + int ret = (*infershape)(in_tensors, out_tensors, node->primitive_); + if (ret == RET_INFER_INVALID) { + g_infershape_interrupt = true; + MS_LOG(INFO) << node->name_ << "inferShape shouldn't be done before runtime, inferShape interrupt!"; + } + if (ret != RET_OK) { + MS_LOG(ERROR) << "Infer shape fail!ret: " << ret; + return ret; + } + } + return RET_OK; +} + +static void InitFuncs() { + if (g_first_load) { + g_infershape_funcs[KernelType::MatMul] = DoMatMulInferShape; + g_infershape_funcs[KernelType::Activation] = DoActivationInferShape; + g_infershape_funcs[KernelType::Log] = DoArithmeticSelfInferShape; + g_infershape_funcs[KernelType::Neg] = DoArithmeticSelfInferShape; + g_infershape_funcs[KernelType::ActivationGrad] = DoActivationGradInferShape; + + g_runkernel_funcs[KernelType::MatMul] = DoMatMul; + g_runkernel_funcs[KernelType::Activation] = DoActivation; + g_runkernel_funcs[KernelType::Log] = DoArithmeticSelf; + g_runkernel_funcs[KernelType::LogGrad] = DoArithmeticSelfGrad; + g_runkernel_funcs[KernelType::Neg] = DoArithmeticSelf; + g_runkernel_funcs[KernelType::NegGrad] = 
+    g_runkernel_funcs[KernelType::ActivationGrad] = DoActivationGrad;
+    g_first_load = false;
+  }
+}
 
 LiteSession *LiteSession::CreateSession(Context *context) {
-  g_Ctx = context;
-  return &g_Session;
+  g_ctx = context;
+  return &g_session;
 }
 
 int LiteSession::CompileGraph(Model *model) {
-  g_Model = model;
-  for (auto in : g_Model->input_indices_) {
-    g_Model->all_tensors_[in]->data_ = allocator.Malloc(g_Model->all_tensors_[in]->Size());
+  InitFuncs();
+  g_model = model;
+  for (auto in : g_model->input_indices_) {
+    g_model->all_tensors_[in]->data_ = g_allocator.Malloc(g_model->all_tensors_[in]->Size());
+  }
+  g_infershape_interrupt = false;
+  int ret = ModelInferShape();
+  if (ret != RET_OK && ret != RET_INFER_INVALID) {
+    return ret;
   }
-  return 0;
+  return RET_OK;
 }
 
 TensorPtrVector LiteSession::GetInputs() const {
-  TensorPtrVector in(g_Model->input_indices_.size());
-  // for(auto index : g_Model->input_indices_){
-  //   in.emplace_back(g_Model->all_tensors_[index]);
-  // }
+  TensorPtrVector in(g_model->input_indices_.size());
+  for (size_t i = 0; i < g_model->input_indices_.size(); ++i) {
+    in.at(i) = g_model->all_tensors_[g_model->input_indices_[i]];
+  }
   return in;
 }
 
@@ -56,16 +124,15 @@ TensorPtrVector LiteSession::GetInputsByName(const String &node_name) const { re
 TensorPtrVector LiteSession::GetOutputsByNodeName(const String &node_name) const { return TensorPtrVector(); }
 
 TensorPtrVector LiteSession::GetOutputs() const {
-  TensorPtrVector out(g_Model->output_indices_.size());
-  // for(auto index : g_Model->output_indices_){
-  //   out.emplace_back(g_Model->all_tensors_[index]);
-  // }
+  TensorPtrVector out(g_model->output_indices_.size());
+  for (size_t i = 0; i < g_model->output_indices_.size(); ++i) {
+    out.at(i) = g_model->all_tensors_[g_model->output_indices_[i]];
+  }
   return out;
 }
 
 int LiteSession::RunGraph() {
-  // invoke nnacl kernel
-  NodePtrVector nodes = g_Model->nodes_;
+  NodePtrVector nodes = g_model->nodes_;
   size_t nodes_size = nodes.size();
   for (size_t i = 0; i < nodes_size; ++i) {
     auto node = nodes[i];
@@ -75,41 +142,45 @@ int LiteSession::RunGraph() {
     }
     TensorPtrVector in_tensors;
     for (size_t j = 0; j < node->input_indices_.size(); ++j) {
-      in_tensors.push_back(g_Model->all_tensors_[node->input_indices_[j]]);
+      in_tensors.push_back(g_model->all_tensors_[node->input_indices_[j]]);
     }
     TensorPtrVector out_tensors;
    for (size_t j = 0; j < node->output_indices_.size(); ++j) {
-      out_tensors.push_back(g_Model->all_tensors_[node->output_indices_[j]]);
+      out_tensors.push_back(g_model->all_tensors_[node->output_indices_[j]]);
     }
     int type = node->primitive_->type_;
-    int ret = RET_ERROR;
-    switch (type) {
-      case KernelType::MatMul:
-        ret = DoMatMul(in_tensors, out_tensors, node, &allocator);
-        break;
-      case KernelType::Activation:
-        ret = DoActivation(in_tensors, out_tensors, node, &allocator);
-        break;
-      case KernelType::Log:
-      case KernelType::Neg:
-        ret = DoArithmeticSelf(in_tensors, out_tensors, node, &allocator);
-        break;
-      case KernelType::LogGrad:
-      case KernelType::NegGrad:
-        ret = DoArithmeticGradSelf(in_tensors, out_tensors, node, &allocator);
-        break;
-      case KernelType::ActivationGrad:
-        ret = DoActivationGrad(in_tensors, out_tensors, node, &allocator);
-        break;
-      default:
+    if (g_infershape_interrupt) {
+      InferShape infershape = g_infershape_funcs[type];
+      if (infershape == NULL) {
         MS_LOG(ERROR) << "Unsupport kernel type: " << type;
         return RET_PARAM_INVALID;
+      }
+      int ret = (*infershape)(in_tensors, out_tensors, node->primitive_);
+      if (ret != RET_OK) {
+        MS_LOG(ERROR) << "InferShape fail! ret: " << ret;
+        return ret;
+      }
+    }
+    for (size_t j = 0; j < out_tensors.size(); ++j) {
+      out_tensors[j]->data_ = g_allocator.Malloc(out_tensors[j]->Size());
+      if (out_tensors[j]->data_ == NULL) {
+        MS_LOG(ERROR) << "Malloc data for out tensor fail!";
+        return RET_NULL_PTR;
+      }
     }
+    RunKernel run_kernel = g_runkernel_funcs[type];
+    if (run_kernel == NULL) {
+      MS_LOG(ERROR) << "Unsupport kernel type: " << type;
+      return RET_PARAM_INVALID;
+    }
+
+    int ret = (*run_kernel)(in_tensors, out_tensors, node, &g_allocator);
     if (ret != RET_OK) {
       MS_LOG(ERROR) << "run kernel fail!ret: " << ret;
       return ret;
     }
   }
+  g_infershape_interrupt = false;
   return RET_OK;
 }
 
@@ -117,4 +188,4 @@ StringVector LiteSession::GetOutputTensorNames() const { return StringVector();
 MSTensor *LiteSession::GetOutputByTensorName(const String &tensor_name) const { return NULL; }
 
-int LiteSession::Resize(const TensorPtrVector &inputs, Int32VectorVector dims) { return 0; }
+int LiteSession::Resize(const TensorPtrVector &inputs, const Int32VectorVector &dims) { return 0; }
diff --git a/mindspore/lite/internal/src/ms_tensor.cc b/mindspore/lite/internal/src/ms_tensor.cc
index cdff8119f5..6c88322ba6 100644
--- a/mindspore/lite/internal/src/ms_tensor.cc
+++ b/mindspore/lite/internal/src/ms_tensor.cc
@@ -17,15 +17,24 @@
 #include <stdlib.h>
 #include <string.h>
 #include <vector>
-#include <numeric>
 #include "internal/include/ms_tensor.h"
 
 MSTensor *CreateTensor(TypeId data_type, const ShapeVector &shape) {
-  MSTensor *tensor = new MSTensor();
+  MSTensor *tensor = (MSTensor *)malloc(sizeof(MSTensor));
+  if (tensor == NULL) {
+    return NULL;
+  }
   tensor->shape_ = shape;
   tensor->data_type_ = data_type;
   return tensor;
 }
 
-int MSTensor::ElementsNum() const { return std::accumulate(shape_.begin(), shape_.end(), 1LL, std::multiplies<int64_t>()); }
+
+int MSTensor::ElementsNum() const {
+  int result = 1;
+  for (size_t i = 0; i < shape_.size(); ++i) {
+    result *= shape_.at(i);
+  }
+  return result;
+}
 
 size_t MSTensor::Size() const {
   size_t size = 0;
diff --git a/mindspore/lite/test/ut/internal/CMakeLists.txt b/mindspore/lite/test/ut/internal/CMakeLists.txt
new file mode 100644
index 0000000000..848d63cc74
--- /dev/null
+++ b/mindspore/lite/test/ut/internal/CMakeLists.txt
@@ -0,0 +1,73 @@
+set(TOP_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../../../../..)
+set(TEST_DIR ${TOP_DIR}/mindspore/lite/test)
+set(LITE_DIR ${TOP_DIR}/mindspore/lite)
+
+include_directories(${TOP_DIR})
+include_directories(${TEST_DIR})
+
+string(REPLACE " -Werror " " " CMAKE_C_FLAGS "${CMAKE_C_FLAGS}")
+string(REPLACE " -Werror " " " CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
+string(REPLACE " -fvisibility=hidden " " -fvisibility=default " CMAKE_C_FLAGS "${CMAKE_C_FLAGS}")
+string(REPLACE " -fvisibility=hidden " " -fvisibility=default " CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
+
+### cpu kernel
+file(GLOB KERNEL_OP_SRC
+        ${LITE_DIR}/internal/src/kernel/*.cc
+        ${LITE_DIR}/internal/src/kernel/common/*.cc
+        ${LITE_DIR}/internal/src/kernel/fp32/*.cc
+        ${LITE_DIR}/internal/src/kernel/fp32_grad/*.cc
+        ${LITE_DIR}/nnacl/*.c
+        ${LITE_DIR}/nnacl/fp32/*.c
+        ${LITE_DIR}/nnacl/fp32_grad/*.c
+        ${LITE_DIR}/nnacl/int8/*.c
+        ${LITE_DIR}/nnacl/quantization/*.c
+        )
+
+if (PLATFORM_ARM64)
+    # assembly
+    file(GLOB TEST_ASSEMBLY_SRC ${LITE_DIR}/nnacl/assembly/arm64/*.s
+            ${LITE_DIR}/nnacl/assembly/arm64/*.S)
+
+    set_property(SOURCE ${TEST_ASSEMBLY_SRC} PROPERTY LANGUAGE C)
+    set(KERNEL_OP_SRC
+            ${KERNEL_OP_SRC}
+            ${TEST_ASSEMBLY_SRC}
+            )
+endif()
+
+### runtime framework
+set(TEST_LITE_SRC
+        ${LITE_DIR}/internal/src/lite_session.cc
+        ${LITE_DIR}/src/runtime/allocator.cc
+        ${LITE_DIR}/internal/src/ms_tensor.cc
+        ${TOP_DIR}/mindspore/core/utils/log_adapter.cc
+        ${TOP_DIR}/mindspore/core/gvar/logging_level.cc
+        )
+
+### test src
+file(GLOB_RECURSE TEST_CASE_KERNEL_SRC
+        ${TEST_DIR}/ut/internal/*.cc
+        )
+
+file(GLOB_RECURSE TEST_CASE_KERNEL_TRAIN_SRC
+        ${TEST_DIR}/ut/src/runtime/kernel/arm/fp32_grad/*.cc
+        )
+
+set(TEST_SRC
+        ${TEST_LITE_SRC}
+        ${TEST_CASE_KERNEL_SRC}
+        ${KERNEL_OP_SRC}
+        ${TEST_DIR}/common/common_test.cc
+        ${TEST_DIR}/main.cc
+        )
+
+add_executable(lite-test-internal ${TEST_SRC})
+
+target_link_libraries(lite-test-internal dl ${GTEST_LIBRARY})
+if (PLATFORM_ARM64)
+    target_link_libraries(lite-test-internal mslite_internal)
+endif()
+
+if (PLATFORM_ARM32 OR PLATFORM_ARM64)
+    target_link_libraries(lite-test-internal log)
+endif()
diff --git a/mindspore/lite/test/ut/internal/infer_test.cc b/mindspore/lite/test/ut/internal/infer_test.cc
index f7f7ae121b..78e8c81cdd 100644
--- a/mindspore/lite/test/ut/internal/infer_test.cc
+++ b/mindspore/lite/test/ut/internal/infer_test.cc
@@ -22,7 +22,7 @@
 #include "internal/include/context.h"
 #include "internal/include/errorcode.h"
 #include "internal/include/ms_tensor.h"
-#include "nnacl/conv_parameter.h"
+#include "nnacl/op_base.h"
 
 namespace mindspore {
 class InferTest : public mindspore::CommonTest {
@@ -31,33 +31,42 @@
 };
 
 TEST_F(InferTest, TestSession) {
-//  Model model;
-//  Node *node = (Node *)malloc(sizeof(Node));
-//  node->name_ = "conv2d";
-//  uint32_t index = model.all_tensors_.size();
-//  node->input_indices_ = {index};
-//  MSTensor *in = CreateTensor(kNumberTypeFloat32, {3, 3, 24, 24});
-//  model.all_tensors_.emplace_back(in);
-//
-//  index = model.all_tensors_.size();
-//  node->output_indices_ = {index};
-//  MSTensor *out = CreateTensor(kNumberTypeFloat32, {3, 3, 24, 24});
-//  model.all_tensors_.emplace_back(out);
-//
-//  ConvParameter *param = (ConvParameter *)malloc(sizeof(ConvParameter));
-//  param->kernel_w_ = 3;
-//  // todo: fill other param fields
-//  node->primitive_ = (PrimitiveC *)param;
-//  model.nodes_.push_back(node);
-//
-//  LiteSession session;
-//  session.CompileGraph(&model);
-//  TensorPtrVector invec = session.GetInputs();
-//  ASSERT_EQ(invec.size(), 1);
-//  // todo: fill inputs data
-//  session.RunGraph();
-//  TensorPtrVector outvec = session.GetOutputs();
-//  ASSERT_EQ(outvec.size(), 1);
+  Model model;
+  Node *node = reinterpret_cast<Node *>(malloc(sizeof(Node)));
+
+  node->name_ = "Neg";
+  node->node_type_ = NodeType::NodeType_CNode;
+  PrimitiveC *prim = reinterpret_cast<PrimitiveC *>(malloc(sizeof(PrimitiveC)));
+  prim->type_ = KernelType::Neg;
+  node->primitive_ = prim;
+  node->input_indices_.push_back(0);
+  node->output_indices_.push_back(1);
+  model.nodes_.push_back(node);
+
+  MSTensor *in = CreateTensor(kNumberTypeFloat32, {1, 1, 1, 10});
+  model.all_tensors_.push_back(in);
+  model.input_indices_.push_back(0);
+
+  MSTensor *out = CreateTensor(kNumberTypeFloat32, {1, 1, 1, 10});
+  model.all_tensors_.emplace_back(out);
+  model.output_indices_.push_back(1);
+
+  LiteSession session;
+  session.CompileGraph(&model);
+  TensorPtrVector invec = session.GetInputs();
+  ASSERT_EQ(invec.size(), 1);
+  constexpr int kOutSize = 10;
+  float expect_out[kOutSize];
+  for (int i = 0; i < kOutSize; ++i) {
+    *(reinterpret_cast<float *>(in->data_) + i) = i + 1;
+    expect_out[i] = -(i + 1);
+  }
+  session.RunGraph();
+  TensorPtrVector outvec = session.GetOutputs();
+  ASSERT_EQ(outvec.size(), 1);
+  for (int i = 0; i < kOutSize; ++i) {
+    std::cout << *(reinterpret_cast<float *>(outvec.at(0)->data_) + i) << " ";
+  }
+  std::cout << "\n";
+  CompareOutputData(reinterpret_cast<float *>(outvec.at(0)->data_), expect_out, kOutSize, 0.000001);
 }
 }  // namespace mindspore
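Reviewer note: the heart of this change is in lite_session.cc. The per-type switch in RunGraph is replaced by two function-pointer tables indexed by KernelType and sized by the new END sentinel, and CompileGraph now runs a shape-inference pass that can be interrupted (RET_INFER_INVALID) and re-run lazily inside RunGraph. The sketch below is a minimal, self-contained illustration of that dispatch pattern, not the actual MindSpore Lite API; the Tensor struct, the enum values, and the function names are simplified stand-ins:

```cpp
#include <cstdio>
#include <vector>

// Stand-in for the KernelType enum; the END sentinel sizes the dispatch tables.
enum KernelType { MatMul, Activation, Neg, END };

struct Tensor {
  std::vector<int> shape_;
  std::vector<float> data_;
};

using InferShapeFn = int (*)(const Tensor &in, Tensor *out);
using RunKernelFn = int (*)(const Tensor &in, Tensor *out);

// One slot per kernel type; a null slot means the type is unsupported.
static InferShapeFn g_infershape_funcs[KernelType::END];
static RunKernelFn g_runkernel_funcs[KernelType::END];

// Mirrors DoCommonInferShape: the output simply inherits the input's shape.
static int CommonInferShape(const Tensor &in, Tensor *out) {
  out->shape_ = in.shape_;
  return 0;
}

static int RunNeg(const Tensor &in, Tensor *out) {
  out->data_.resize(in.data_.size());
  for (size_t i = 0; i < in.data_.size(); ++i) out->data_[i] = -in.data_[i];
  return 0;
}

// Mirrors InitFuncs: registration happens once, before the first graph run.
static void InitFuncs() {
  g_infershape_funcs[KernelType::Neg] = CommonInferShape;
  g_runkernel_funcs[KernelType::Neg] = RunNeg;
}

int main() {
  InitFuncs();
  Tensor in{{1, 10}, {1, 2, 3}};
  Tensor out;
  int type = KernelType::Neg;
  // Table lookup replaces the old switch; a null entry means "unsupported type".
  if (g_infershape_funcs[type] == nullptr || g_runkernel_funcs[type] == nullptr) {
    std::printf("unsupported kernel type: %d\n", type);
    return 1;
  }
  g_infershape_funcs[type](in, &out);
  g_runkernel_funcs[type](in, &out);
  std::printf("out[0] = %g\n", out.data_[0]);  // prints -1
  return 0;
}
```

A sentinel-sized array keeps both lookups O(1) with no map or registry object; the trade-off is that every new kernel enum value must be added above END, which is why END is appended to KernelType in model.h.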