From ccd6b9a41569c212722ca48e214b2eda46dcdc52 Mon Sep 17 00:00:00 2001
From: zhanyuan
Date: Mon, 3 Aug 2020 17:49:05 +0800
Subject: [PATCH] Add fp32 & int8 ops of Matmul(Batchmatmul)

---
 mindspore/lite/src/ops/matmul.cc              |  29 +--
 mindspore/lite/src/ops/ops.cc                 |   2 +
 .../runtime/kernel/arm/base/matmul_base.cc    |  72 ++++++++
 .../src/runtime/kernel/arm/base/matmul_base.h |  49 +++++
 .../src/runtime/kernel/arm/fp32/matmul.cc     | 108 ++++++++---
 .../lite/src/runtime/kernel/arm/fp32/matmul.h |  19 +-
 .../kernel/arm/int8/fullconnection_int8.h     |   2 +-
 .../runtime/kernel/arm/int8/matmul_int8.cc    | 142 +++++++++++++++
 .../src/runtime/kernel/arm/int8/matmul_int8.h |  47 +++++
 .../runtime/kernel/arm/opclib/common_func.cc  |  17 ++
 .../runtime/kernel/arm/opclib/common_func.h   |   2 +
 .../runtime/kernel/arm/opclib/fp32/matmul.cc  |   4 +-
 .../runtime/kernel/arm/opclib/int8/matmul.cc  |  13 +-
 .../runtime/kernel/arm/opclib/int8/matmul.h   |   2 +-
 .../src/runtime/kernel/arm/opclib/matmul.h    |   1 +
 .../kernel/arm/opclib/quantization/quantize.h |  21 ++-
 .../kernel/arm/fp32/matmul_fp32_tests.cc      | 169 ++++++++++++++++++
 .../arm/int8/fullconnection_int8_tests.cc     |  19 +-
 .../kernel/arm/int8/matmul_int8_tests.cc      | 126 +++++++++++++
 19 files changed, 769 insertions(+), 75 deletions(-)
 create mode 100644 mindspore/lite/src/runtime/kernel/arm/base/matmul_base.cc
 create mode 100644 mindspore/lite/src/runtime/kernel/arm/base/matmul_base.h
 create mode 100644 mindspore/lite/src/runtime/kernel/arm/int8/matmul_int8.cc
 create mode 100644 mindspore/lite/src/runtime/kernel/arm/int8/matmul_int8.h
 create mode 100644 mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/matmul_fp32_tests.cc
 create mode 100644 mindspore/lite/test/ut/src/runtime/kernel/arm/int8/matmul_int8_tests.cc

diff --git a/mindspore/lite/src/ops/matmul.cc b/mindspore/lite/src/ops/matmul.cc
index 2d031378bf..619fcade8c 100644
--- a/mindspore/lite/src/ops/matmul.cc
+++ b/mindspore/lite/src/ops/matmul.cc
@@ -33,29 +33,30 @@ int MatMul::InferShape(std::vector<tensor::Tensor *> inputs_, std::vector<tenso
-  std::vector<int> x_shape = input0->shape();
-  std::vector<int> w_shape = input1->shape();
-  if (x_shape.size() < 2 || w_shape.size() < 2) {
+  std::vector<int> a_shape = input0->shape();
+  std::vector<int> b_shape = input1->shape();
+  if (a_shape.size() < 3 || b_shape.size() < 3) {
     MS_LOG(ERROR) << "inputs shape is invalid";
     return RET_INPUT_TENSOR_ERROR;
   }
+  for (int i = 0; i < a_shape.size() - 2; ++i) {
+    if (a_shape[i] != b_shape[i]) {
+      MS_LOG(ERROR) << "Op MatMul's batch dimensions must be equal";
+      return RET_INPUT_TENSOR_ERROR;
+    }
+  }
+
   auto matmul_prim = this->primitive->value_as_MatMul();
   if (matmul_prim->transposeA()) {
-    int tmp = x_shape.back();
-    x_shape[x_shape.size() - 1] = x_shape[x_shape.size() - 2];
-    x_shape[x_shape.size() - 2] = tmp;
+    std::swap(a_shape[a_shape.size() - 1], a_shape[a_shape.size() - 2]);
   }
   if (matmul_prim->transposeB()) {
-    int tmp = w_shape.back();
-    w_shape[w_shape.size() - 1] = w_shape[w_shape.size() - 2];
-    w_shape[w_shape.size() - 2] = tmp;
+    std::swap(b_shape[b_shape.size() - 1], b_shape[b_shape.size() - 2]);
   }
-  auto y_shape_size = std::max(x_shape.size(), w_shape.size());
-  std::vector<int> y_shape(y_shape_size);
-  y_shape = x_shape;
-  y_shape[y_shape_size - 1] = w_shape[w_shape.size() - 1];
-  output->set_shape(y_shape);
+  std::vector<int> c_shape(a_shape);
+  c_shape[c_shape.size() - 1] = b_shape[b_shape.size() - 1];
+  output->set_shape(c_shape);
   output->set_data_type(input0->data_type());
   output->SetFormat(input0->GetFormat());
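The new InferShape treats MatMul as a batched op: both inputs must carry the same leading batch dimensions, the optional transposes swap only the two trailing dimensions, and the output copies A's shape with the last dimension taken from B. A minimal standalone sketch of that rule, in plain C++ rather than the MindSpore API (function name hypothetical):

  #include <cassert>
  #include <utility>
  #include <vector>

  // (..., m, k) x (..., k, n) -> (..., m, n), with optional transposes.
  std::vector<int> InferMatMulShape(std::vector<int> a, std::vector<int> b, bool ta, bool tb) {
    assert(a.size() >= 3 && a.size() == b.size());
    for (size_t i = 0; i + 2 < a.size(); ++i) assert(a[i] == b[i]);  // batch dims must match
    if (ta) std::swap(a[a.size() - 1], a[a.size() - 2]);
    if (tb) std::swap(b[b.size() - 1], b[b.size() - 2]);
    std::vector<int> c(a);
    c[c.size() - 1] = b[b.size() - 1];
    return c;
  }

For example, a = (3, 2, 8) and b = (3, 8, 5) give c = (3, 2, 5); with tb = true the same result comes from b supplied as (3, 5, 8).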
diff --git a/mindspore/lite/src/ops/ops.cc b/mindspore/lite/src/ops/ops.cc
index a90f1a75d9..85c20267ee 100644
--- a/mindspore/lite/src/ops/ops.cc
+++ b/mindspore/lite/src/ops/ops.cc
@@ -139,6 +139,8 @@ Primitive *Primitive::CreatePrimitive(schema::Primitive *primitive) {
       return new lite::SpaceToBatch(const_cast<schema::Primitive *>(primitive));
     case schema::PrimitiveType_QuantDTypeCast:
       return new lite::QuantDTypeCast(const_cast<schema::Primitive *>(primitive));
+    case schema::PrimitiveType_MatMul:
+      return new lite::MatMul(const_cast<schema::Primitive *>(primitive));
     default:
       break;
   }
diff --git a/mindspore/lite/src/runtime/kernel/arm/base/matmul_base.cc b/mindspore/lite/src/runtime/kernel/arm/base/matmul_base.cc
new file mode 100644
index 0000000000..eb88cfb4b3
--- /dev/null
+++ b/mindspore/lite/src/runtime/kernel/arm/base/matmul_base.cc
@@ -0,0 +1,72 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "src/runtime/kernel/arm/base/matmul_base.h"
+#include "src/runtime/kernel/arm/fp32/matmul.h"
+#include "src/runtime/kernel/arm/int8/matmul_int8.h"
+#include "src/kernel_factory.h"
+#include "include/errorcode.h"
+#include "include/context.h"
+
+using mindspore::lite::KernelRegistrar;
+using mindspore::lite::RET_ERROR;
+using mindspore::lite::RET_OK;
+using mindspore::schema::PrimitiveType_MatMul;
+
+namespace mindspore::kernel {
+kernel::LiteKernel *CpuMatmulKernelCreator(const std::vector<lite::tensor::Tensor *> &inputs,
+                                           const std::vector<lite::tensor::Tensor *> &outputs,
+                                           OpParameter *opParameter, const lite::Context *ctx,
+                                           const kernel::KernelKey &desc) {
+  MS_ASSERT(opParameter != nullptr);
+  MS_ASSERT(desc.type == schema::PrimitiveType_MatMul);
+  auto input_tensor = inputs.at(kInputIndex);
+  auto data_type = input_tensor->data_type();
+  kernel::LiteKernel *kernel = nullptr;
+  switch (data_type) {
+    case kNumberTypeInt8:
+    case kNumberTypeUInt8: {
+      kernel = new (std::nothrow) MatmulInt8CPUKernel(opParameter, inputs, outputs, ctx);
+      if (!kernel) {
+        MS_LOG(ERROR) << "kernel is nullptr.";
+        return nullptr;
+      }
+      break;
+    }
+    case kNumberTypeFloat32: {
+      kernel = new (std::nothrow) MatmulCPUKernel(opParameter, inputs, outputs, ctx);
+      if (!kernel) {
+        MS_LOG(ERROR) << "kernel is nullptr.";
+        return nullptr;
+      }
+      break;
+    }
+    default:
+      MS_LOG(ERROR) << "unsupported matmul data type: " << data_type;
+      return nullptr;
+  }
+
+  auto ret = kernel->Init();
+  if (ret != RET_OK) {
+    delete kernel;
+    MS_LOG(ERROR) << "Init kernel failed, name: " << opParameter->name_ << ", type: "
+                  << schema::EnumNamePrimitiveType(static_cast<schema::PrimitiveType>(opParameter->type_));
+    return nullptr;
+  }
+  return kernel;
+}
+
+REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_MatMul, CpuMatmulKernelCreator)
+}  // namespace mindspore::kernel
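Note that the creator dispatches on the input dtype (int8/uint8 as well as float32), yet only the float32 entry is registered above. If the int8 kernel is meant to be reachable through the registry too, a matching registration would presumably be needed, e.g. (hypothetical, not part of this patch):

  REG_KERNEL(kCPU, kNumberTypeInt8, PrimitiveType_MatMul, CpuMatmulKernelCreator)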
diff --git a/mindspore/lite/src/runtime/kernel/arm/base/matmul_base.h b/mindspore/lite/src/runtime/kernel/arm/base/matmul_base.h
new file mode 100644
index 0000000000..92c8dd07e3
--- /dev/null
+++ b/mindspore/lite/src/runtime/kernel/arm/base/matmul_base.h
@@ -0,0 +1,49 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_BASE_MATMUL_BASE_H_
+#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_BASE_MATMUL_BASE_H_
+
+#include <vector>
+#include "src/lite_kernel.h"
+#include "include/context.h"
+#include "src/runtime/kernel/arm/opclib/matmul.h"
+
+using mindspore::lite::Context;
+
+namespace mindspore::kernel {
+class MatmulBaseCPUKernel : public LiteKernel {
+ public:
+  MatmulBaseCPUKernel(OpParameter *parameter, const std::vector<lite::tensor::Tensor *> &inputs,
+                      const std::vector<lite::tensor::Tensor *> &outputs, const Context *ctx)
+      : LiteKernel(parameter, inputs, outputs), ctx_(ctx), thread_count_(ctx->threadNum) {
+    params_ = reinterpret_cast<MatMulParameter *>(opParameter);
+  }
+  ~MatmulBaseCPUKernel() = default;
+
+  int Init() override { return 0; }
+  int ReSize() override { return 0; }
+  int Run() override { return 0; }
+
+ protected:
+  MatMulParameter *params_;
+  int thread_count_;
+  int thread_stride_;
+  const Context *ctx_;
+};
+}  // namespace mindspore::kernel
+
+#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_BASE_MATMUL_BASE_H_
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/matmul.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/matmul.cc
index 79f7f65b58..ee6606d66c 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/matmul.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/matmul.cc
@@ -15,44 +15,102 @@
  */
 
 #include "src/runtime/kernel/arm/fp32/matmul.h"
-#include <vector>
-#include "schema/model_generated.h"
-#include "src/kernel_registry.h"
+#include "src/runtime/kernel/arm/opclib/fp32/matmul.h"
+#include "src/runtime/runtime_api.h"
 #include "include/errorcode.h"
 
-using mindspore::kernel::KERNEL_ARCH::kCPU;
-using mindspore::lite::KernelRegistrar;
 using mindspore::lite::RET_ERROR;
+using mindspore::lite::RET_MEMORY_FAILED;
 using mindspore::lite::RET_OK;
-using mindspore::schema::PrimitiveType_MatMul;
 
 namespace mindspore::kernel {
+MatmulCPUKernel::~MatmulCPUKernel() {
+  ctx_->allocator->Free(a_c8_ptr_);
+  ctx_->allocator->Free(b_r8_ptr_);
+  ctx_->allocator->Free(c_r8x8_ptr_);
+}
+
 int MatmulCPUKernel::ReSize() { return RET_OK; }
 
-int MatmulCPUKernel::Run() { return RET_OK; }
-
-int MatmulCPUKernel::Init() { return RET_OK; }
+int MatmulCPUKernel::Init() {
+  int batch = 1;
+  auto x_shape = inputs_[0]->shape();
+  auto o_shape = outputs_[0]->shape();
+  for (int i = 0; i < x_shape.size() - 2; ++i) {
+    batch *= x_shape[i];
+  }
+  params_->batch = batch;
+  params_->row_ = o_shape[o_shape.size() - 2];
+  params_->col_ = o_shape[o_shape.size() - 1];
+  params_->deep_ = params_->a_transpose_ ? x_shape[x_shape.size() - 2] : x_shape[x_shape.size() - 1];
+  params_->row_8_ = UP_ROUND(params_->row_, 8);
+  params_->col_8_ = UP_ROUND(params_->col_, 8);
+  thread_count_ = MSMIN(thread_count_, UP_DIV(params_->col_8_, 8));
+  thread_stride_ = UP_DIV(UP_DIV(params_->col_8_, 8), thread_count_);
+
+  a_c8_ptr_ = reinterpret_cast<float *>(ctx_->allocator->Malloc(params_->row_8_ * params_->deep_ * sizeof(float)));
+  if (!a_c8_ptr_) {
+    return RET_MEMORY_FAILED;
+  }
+  memset(a_c8_ptr_, 0, params_->row_8_ * params_->deep_ * sizeof(float));
+  b_r8_ptr_ = reinterpret_cast<float *>(ctx_->allocator->Malloc(params_->col_8_ * params_->deep_ * sizeof(float)));
+  if (!b_r8_ptr_) {
+    return RET_MEMORY_FAILED;
+  }
+  memset(b_r8_ptr_, 0, params_->col_8_ * params_->deep_ * sizeof(float));
+  c_r8x8_ptr_ = reinterpret_cast<float *>(ctx_->allocator->Malloc(params_->row_8_ * params_->col_8_ * sizeof(float)));
+  if (!c_r8x8_ptr_) {
+    return RET_MEMORY_FAILED;
+  }
+  memset(c_r8x8_ptr_, 0, params_->row_8_ * params_->col_8_ * sizeof(float));
+  return RET_OK;
+}
 
-kernel::LiteKernel *CpuMatmulFp32KernelCreator(const std::vector<lite::tensor::Tensor *> &inputs,
-                                               const std::vector<lite::tensor::Tensor *> &outputs,
-                                               OpParameter *opParameter, const lite::Context *ctx,
-                                               const kernel::KernelKey &desc) {
-  MS_ASSERT(desc.type == schema::PrimitiveType_MatMul);
-  auto *kernel = new (std::nothrow) MatmulCPUKernel(opParameter, inputs, outputs);
-  if (kernel == nullptr) {
-    MS_LOG(ERROR) << "new MatmulCPUKernel fail!";
-    return nullptr;
+int MatmulCPUKernel::RunImpl(int task_id) {
+  int cur_oc = MSMIN(thread_stride_, UP_DIV(params_->col_8_, 8) - task_id * thread_stride_);
+  if (cur_oc <= 0) {
+    return RET_OK;
   }
-  auto ret = kernel->Init();
-  if (ret != RET_OK) {
-    delete kernel;
-    MS_LOG(ERROR) << "Init kernel failed, name: " << opParameter->name_ << ", type: "
-                  << schema::EnumNamePrimitiveType(static_cast<schema::PrimitiveType>(opParameter->type_));
-    return nullptr;
+  auto cur_b = b_r8_ptr_ + task_id * thread_stride_ * C8NUM * params_->deep_;
+  auto cur_c = c_r8x8_ptr_ + task_id * thread_stride_ * C8NUM * params_->row_8_;
+  MatMul(a_c8_ptr_, cur_b, cur_c, nullptr, ActType_No, params_->deep_, params_->row_8_, cur_oc * 8);
+  return RET_OK;
+}
+
+int MatmulFloatRun(int task_id, LiteParallelGroupEnv *penv, void *cdata) {
+  auto op = reinterpret_cast<MatmulCPUKernel *>(cdata);
+  auto error_code = op->RunImpl(task_id);
+  if (error_code != RET_OK) {
+    MS_LOG(ERROR) << "MatmulFloatRun error task_id[" << task_id << "] error_code[" << error_code << "]";
+    return RET_ERROR;
   }
-  return kernel;
+  return RET_OK;
 }
 
-REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_MatMul, CpuMatmulFp32KernelCreator)
+int MatmulCPUKernel::Run() {
+  auto a_ptr = reinterpret_cast<float *>(inputs_[0]->Data());
+  auto b_ptr = reinterpret_cast<float *>(inputs_[1]->Data());
+  auto c_ptr = reinterpret_cast<float *>(outputs_[0]->Data());
+  auto a_stride = params_->row_ * params_->deep_;
+  auto b_stride = params_->deep_ * params_->col_;
+  auto c_stride = params_->row_ * params_->col_;
+  for (int i = 0; i < params_->batch; ++i) {
+    auto cur_a_ptr = a_ptr + i * a_stride;
+    auto cur_b_ptr = b_ptr + i * b_stride;
+    auto cur_c_ptr = c_ptr + i * c_stride;
+    if (params_->a_transpose_) {
+      RowMajor2Row8Major(cur_a_ptr, a_c8_ptr_, params_->deep_, params_->row_);
+    } else {
+      RowMajor2Col8Major(cur_a_ptr, a_c8_ptr_, params_->row_, params_->deep_);
+    }
+    if (params_->b_transpose_) {
+      RowMajor2Col8Major(cur_b_ptr, b_r8_ptr_, params_->col_, params_->deep_);
+    } else {
+      RowMajor2Row8Major(cur_b_ptr, b_r8_ptr_, params_->deep_, params_->col_);
+    }
+    LiteBackendParallelLaunch(MatmulFloatRun, this, thread_count_);
+    Row8x8Major2RowMajor(c_r8x8_ptr_, cur_c_ptr, params_->row_, params_->col_);
+  }
+  return RET_OK;
+}
 }  // namespace mindspore::kernel
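Run() repacks A into 8-row blocks (8-column blocks when transposed) and B symmetrically, so the 8x8 micro-kernel reads both operands contiguously. The fp32 packing routines themselves are not in this diff; as a reference for the layout, here is a sketch mirroring the int8 RowMajor2Col8MajorInt8 added later in this patch (sketch only, not the shipped code):

  // Pack a row-major (row x col) matrix into blocks of 8 rows:
  // element (r, c) lands in block r/8, column c, lane r%8.
  void RowMajor2Col8MajorSketch(const float *src, float *dst, int row, int col) {
    for (int r = 0; r < row; r++) {
      int rd8 = r / 8, rm8 = r % 8;
      for (int c = 0; c < col; c++) {
        dst[rd8 * col * 8 + c * 8 + rm8] = src[r * col + c];
      }
    }
  }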
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/matmul.h b/mindspore/lite/src/runtime/kernel/arm/fp32/matmul.h
index 3dfc4521eb..14594dc3f0 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/matmul.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/matmul.h
@@ -19,27 +19,26 @@
 
 #include <vector>
 #include "src/lite_kernel.h"
-
 #include "src/runtime/kernel/arm/opclib/matmul.h"
+#include "src/runtime/kernel/arm/base/matmul_base.h"
 
 namespace mindspore::kernel {
-class MatmulCPUKernel : public LiteKernel {
+class MatmulCPUKernel : public MatmulBaseCPUKernel {
  public:
   explicit MatmulCPUKernel(OpParameter *parameter, const std::vector<lite::tensor::Tensor *> &inputs,
-                           const std::vector<lite::tensor::Tensor *> &outputs)
-      : LiteKernel(parameter, inputs, outputs) {
-    matmul_param_ = reinterpret_cast<MatMulParameter *>(parameter);
-  }
-  ~MatmulCPUKernel() override = default;
-
+                           const std::vector<lite::tensor::Tensor *> &outputs, const Context *ctx)
+      : MatmulBaseCPUKernel(parameter, inputs, outputs, ctx) {}
+  ~MatmulCPUKernel() override;
   int Init() override;
   int ReSize() override;
   int Run() override;
+  int RunImpl(int task_id);
 
  private:
-  MatMulParameter *matmul_param_;
+  float *a_c8_ptr_;
+  float *b_r8_ptr_;
+  float *c_r8x8_ptr_;
 };
 }  // namespace mindspore::kernel
 
 #endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_MATMUL_H_
-
diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/fullconnection_int8.h b/mindspore/lite/src/runtime/kernel/arm/int8/fullconnection_int8.h
index 4bc62f1b08..08c9673b71 100644
--- a/mindspore/lite/src/runtime/kernel/arm/int8/fullconnection_int8.h
+++ b/mindspore/lite/src/runtime/kernel/arm/int8/fullconnection_int8.h
@@ -42,7 +42,7 @@ class FullconnectionInt8CPUKernel : public FullconnectionBaseCPUKernel {
   int RunImpl(int task_id);
 
  private:
-  FcQuantArg quant_params_;
+  MatmulQuantArg quant_params_;
   int8_t *a_c8_ptr_;
   int8_t *b_r8_ptr_;
   int *c_r8x8_ptr_;
diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/matmul_int8.cc b/mindspore/lite/src/runtime/kernel/arm/int8/matmul_int8.cc
new file mode 100644
index 0000000000..926d282679
--- /dev/null
+++ b/mindspore/lite/src/runtime/kernel/arm/int8/matmul_int8.cc
@@ -0,0 +1,142 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "src/runtime/kernel/arm/int8/matmul_int8.h"
+#include "src/runtime/kernel/arm/opclib/int8/matmul.h"
+#include "src/runtime/kernel/arm/opclib/common_func.h"
+#include "src/runtime/runtime_api.h"
+#include "include/errorcode.h"
+
+using mindspore::lite::RET_MEMORY_FAILED;
+using mindspore::lite::RET_OK;
+
+namespace mindspore::kernel {
+MatmulInt8CPUKernel::~MatmulInt8CPUKernel() {
+  ctx_->allocator->Free(a_c8_ptr_);
+  ctx_->allocator->Free(b_r8_ptr_);
+  ctx_->allocator->Free(c_r8x8_ptr_);
+}
+
+int MatmulInt8CPUKernel::Init() {
+  int batch = 1;
+  auto x_shape = inputs_[0]->shape();
+  auto o_shape = outputs_[0]->shape();
+  for (int i = 0; i < x_shape.size() - 2; ++i) {
+    batch *= x_shape[i];
+  }
+  params_->batch = batch;
+  params_->row_ = o_shape[o_shape.size() - 2];
+  params_->col_ = o_shape[o_shape.size() - 1];
+  params_->deep_ = params_->a_transpose_ ? x_shape[x_shape.size() - 2] : x_shape[x_shape.size() - 1];
+  params_->row_8_ = UP_ROUND(params_->row_, 8);
+  params_->col_8_ = UP_ROUND(params_->col_, 8);
+  thread_count_ = MSMIN(thread_count_, UP_DIV(params_->col_8_, 8));
+  thread_stride_ = UP_DIV(UP_DIV(params_->col_8_, 8), thread_count_);
+
+  a_c8_ptr_ = reinterpret_cast<int8_t *>(ctx_->allocator->Malloc(params_->row_8_ * params_->deep_ * sizeof(int8_t)));
+  if (!a_c8_ptr_) {
+    return RET_MEMORY_FAILED;
+  }
+  memset(a_c8_ptr_, 0, params_->row_8_ * params_->deep_ * sizeof(int8_t));
+  b_r8_ptr_ = reinterpret_cast<int8_t *>(ctx_->allocator->Malloc(params_->col_8_ * params_->deep_ * sizeof(int8_t)));
+  if (!b_r8_ptr_) {
+    return RET_MEMORY_FAILED;
+  }
+  memset(b_r8_ptr_, 0, params_->col_8_ * params_->deep_ * sizeof(int8_t));
+  c_r8x8_ptr_ = reinterpret_cast<int *>(ctx_->allocator->Malloc(params_->row_8_ * params_->col_8_ * sizeof(int)));
+  if (!c_r8x8_ptr_) {
+    return RET_MEMORY_FAILED;
+  }
+  memset(c_r8x8_ptr_, 0, params_->row_8_ * params_->col_8_ * sizeof(int));
+
+  auto input_tensor = inputs_[0];
+  auto params = input_tensor->GetQuantParams();
+  MS_ASSERT(params.size() == 1);
+  quant_params_.input.zp_ = params.front().zeroPoint;
+  quant_params_.input.scale_ = params.front().scale;
+  auto weight_tensor = inputs_[1];
+  params = weight_tensor->GetQuantParams();
+  MS_ASSERT(params.size() == 1);
+  quant_params_.weight.zp_ = params.front().zeroPoint;
+  quant_params_.weight.scale_ = params.front().scale;
+  auto output_tensor = outputs_[0];
+  params = output_tensor->GetQuantParams();
+  MS_ASSERT(params.size() == 1);
+  quant_params_.output.zp_ = params.front().zeroPoint;
+  quant_params_.output.scale_ = params.front().scale;
+
+  double real_multiplier = quant_params_.input.scale_ * quant_params_.weight.scale_ / quant_params_.output.scale_;
+  QuantizeRoundParameter(real_multiplier, &quant_params_.quant_multiplier, &quant_params_.left_shift,
+                         &quant_params_.right_shift);
+  return RET_OK;
+}
+
+int MatmulInt8CPUKernel::ReSize() { return RET_OK; }
+
+int MatmulInt8CPUKernel::RunImpl(int task_id) {
+  int cur_oc = MSMIN(thread_stride_, UP_DIV(params_->col_8_, 8) - task_id * thread_stride_);
+  if (cur_oc <= 0) {
+    return RET_OK;
+  }
+  auto cur_b = b_r8_ptr_ + task_id * thread_stride_ * C8NUM * params_->deep_;
+  auto cur_c = c_r8x8_ptr_ + task_id * thread_stride_ * C8NUM * params_->row_8_;
+  MatMulInt8(a_c8_ptr_, cur_b, cur_c, params_->row_8_, cur_oc * 8, params_->deep_, quant_params_.input.zp_,
+             quant_params_.weight.zp_);
+  return RET_OK;
+}
+
+int MatmulInt8Run(int task_id, LiteParallelGroupEnv *penv, void *cdata) {
+  auto op = reinterpret_cast<MatmulInt8CPUKernel *>(cdata);
+  auto ret = op->RunImpl(task_id);
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "MatmulInt8Run error task_id[" << task_id << "] error_code[" << ret << "]";
+    return ret;
+  }
+  return RET_OK;
+}
+
+int MatmulInt8CPUKernel::Run() {
+  auto a_ptr = reinterpret_cast<int8_t *>(inputs_[0]->Data());
+  auto b_ptr = reinterpret_cast<int8_t *>(inputs_[1]->Data());
+  auto c_ptr = reinterpret_cast<int8_t *>(outputs_[0]->Data());
+  auto a_stride = params_->row_ * params_->deep_;
+  auto b_stride = params_->deep_ * params_->col_;
+  auto c_stride = params_->row_ * params_->col_;
+
+  for (int i = 0; i < params_->batch; ++i) {
+    auto cur_a_ptr = a_ptr + i * a_stride;
+    auto cur_b_ptr = b_ptr + i * b_stride;
+    auto cur_c_ptr = c_ptr + i * c_stride;
+    if (params_->a_transpose_) {
+      RowMajor2Row8MajorInt8(cur_a_ptr, a_c8_ptr_, params_->deep_, params_->row_);
+    } else {
+      RowMajor2Col8MajorInt8(cur_a_ptr, a_c8_ptr_, params_->row_, params_->deep_);
+    }
+    if (params_->b_transpose_) {
+      RowMajor2Col8MajorInt8(cur_b_ptr, b_r8_ptr_, params_->col_, params_->deep_);
+    } else {
+      RowMajor2Row8MajorInt8(cur_b_ptr, b_r8_ptr_, params_->deep_, params_->col_);
+    }
+    LiteBackendParallelLaunch(MatmulInt8Run, this, thread_count_);
+    auto &q = quant_params_;
+    SimplePostFuncInt8(c_r8x8_ptr_, cur_c_ptr, params_->col_, params_->row_, params_->row_8_, q.quant_multiplier,
+                       q.left_shift, q.right_shift, q.output.zp_);
+  }
+
+  return RET_OK;
+}
+}  // namespace mindspore::kernel
diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/matmul_int8.h b/mindspore/lite/src/runtime/kernel/arm/int8/matmul_int8.h
new file mode 100644
index 0000000000..9081babe85
--- /dev/null
+++ b/mindspore/lite/src/runtime/kernel/arm/int8/matmul_int8.h
@@ -0,0 +1,47 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_INT8_MATMUL_INT8_H_
+#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_INT8_MATMUL_INT8_H_
+
+#include <vector>
+#include "include/context.h"
+#include "src/runtime/kernel/arm/opclib/quantization/quantize.h"
+#include "src/runtime/kernel/arm/base/matmul_base.h"
+
+using mindspore::lite::Context;
+
+namespace mindspore::kernel {
+class MatmulInt8CPUKernel : public MatmulBaseCPUKernel {
+ public:
+  MatmulInt8CPUKernel(OpParameter *parameter, const std::vector<lite::tensor::Tensor *> &inputs,
+                      const std::vector<lite::tensor::Tensor *> &outputs, const Context *ctx)
+      : MatmulBaseCPUKernel(parameter, inputs, outputs, ctx) {}
+  ~MatmulInt8CPUKernel() override;
+  int Init() override;
+  int ReSize() override;
+  int Run() override;
+  int RunImpl(int task_id);
+
+ private:
+  MatmulQuantArg quant_params_;
+  int8_t *a_c8_ptr_;
+  int8_t *b_r8_ptr_;
+  int *c_r8x8_ptr_;
+};
+}  // namespace mindspore::kernel
+
+#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_INT8_MATMUL_INT8_H_
diff --git a/mindspore/lite/src/runtime/kernel/arm/opclib/common_func.cc b/mindspore/lite/src/runtime/kernel/arm/opclib/common_func.cc
index ae2eaf554f..d2f7929fd4 100644
--- a/mindspore/lite/src/runtime/kernel/arm/opclib/common_func.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/opclib/common_func.cc
@@ -236,3 +236,20 @@ void PostFuncInt8(const int *in, const int *bias, int8_t *out, int oc, int plane
   }
   return;
 }
+
+void SimplePostFuncInt8(const int *in, int8_t *out, int oc, int plane, int plane8, int32_t multiplier,
+                        int32_t left_shift, int32_t right_shift, int32_t zp) {
+  /* (int32_t)row8x8-major * multiplier => (int8_t)row-major */
+  for (int r = 0; r < plane; r++) {
+    for (int c = 0; c < oc; c++) {
+      int c8div = c / 8, c8mod = c % 8;
+      int src_index = c8div * plane8 * 8 + r * 8 + c8mod;
+      int dst_index = r * oc + c;
+      int32_t value = in[src_index];
+      value = MultiplyByQuantizedMultiplier(value, multiplier, left_shift, right_shift) + zp;
+      value = MSMIN(CHAR_MAX, value);
+      value = MSMAX(CHAR_MIN, value);
+      out[dst_index] = (int8_t)value;
+    }
+  }
+}
diff --git a/mindspore/lite/src/runtime/kernel/arm/opclib/common_func.h b/mindspore/lite/src/runtime/kernel/arm/opclib/common_func.h
index e8c3f587f0..f35d158dfb 100644
--- a/mindspore/lite/src/runtime/kernel/arm/opclib/common_func.h
+++ b/mindspore/lite/src/runtime/kernel/arm/opclib/common_func.h
@@ -33,6 +33,8 @@ void ReluFp32(float *data, int ele_num);
 void Relu6Fp32(float *data, int ele_num);
 void PostFuncInt8(const int *in, const int *bias, int8_t *out, int oc, int plane, int plane8, int32_t multiplier,
                   int32_t left_shift, int32_t right_shift, int32_t zp, int8_t mini, int8_t maxi);
+void SimplePostFuncInt8(const int *in, int8_t *out, int oc, int plane, int plane8, int32_t multiplier,
+                        int32_t left_shift, int32_t right_shift, int32_t zp);
 void IndirectGemmFp32_8x8(float *output, const float *input, const float *weight, const float *bias, size_t step,
                           size_t ic4, size_t output_channel, size_t offset, size_t mode, size_t writeC4, size_t relu,
                           size_t relu6);
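SimplePostFuncInt8 walks the row8x8-major int32 accumulators tile by tile and requantizes each one back to row-major int8. The fixed-point multiply in MultiplyByQuantizedMultiplier (from fixed_point.h) is meant to have the same net effect as scaling by the real multiplier input_scale * weight_scale / output_scale computed in MatmulInt8CPUKernel::Init. A floating-point reference of that intended behavior (sketch with a hypothetical name, not the shipped fixed-point code):

  #include <algorithm>
  #include <cmath>
  #include <cstdint>

  // Scale an int32 accumulator by the real multiplier, add the output
  // zero point, and saturate to the int8 range.
  int8_t RequantizeSketch(int32_t acc, double real_multiplier, int32_t zp) {
    int32_t v = static_cast<int32_t>(std::lround(acc * real_multiplier)) + zp;
    return static_cast<int8_t>(std::min(127, std::max(-128, v)));
  }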
diff --git a/mindspore/lite/src/runtime/kernel/arm/opclib/fp32/matmul.cc b/mindspore/lite/src/runtime/kernel/arm/opclib/fp32/matmul.cc
index 8f396cd44a..8c64a4a66f 100644
--- a/mindspore/lite/src/runtime/kernel/arm/opclib/fp32/matmul.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/opclib/fp32/matmul.cc
@@ -65,9 +65,7 @@ void MatMul8x8(const float *a, const float *b, float *c, const float *bias, ActT
       size_t bi = c8div * deep * 8 + d * 8 + c8mod;
       value = value + a[ai] * b[bi];
     }
-    if (bias != nullptr) {
-      value += bias[col];
-    }
+    if (bias != nullptr) value += bias[col];
     if (act_type == ActType_Relu6) value = MSMIN(6.0f, value);
     if (act_type != ActType_No) value = MSMAX(0.0f, value);
     c[ci] = value;
diff --git a/mindspore/lite/src/runtime/kernel/arm/opclib/int8/matmul.cc b/mindspore/lite/src/runtime/kernel/arm/opclib/int8/matmul.cc
index 0517f8b5fd..67dfb40f4b 100644
--- a/mindspore/lite/src/runtime/kernel/arm/opclib/int8/matmul.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/opclib/int8/matmul.cc
@@ -18,6 +18,17 @@
 #include <limits.h>
 #include "src/runtime/kernel/arm/opclib/quantization/fixed_point.h"
 
+void RowMajor2Row8MajorInt8(int8_t *src_ptr, int8_t *dst_ptr, int row, int col) {
+  for (int r = 0; r < row; r++) {
+    int8_t *src = src_ptr + r * col;
+    for (int c = 0; c < col; c++) {
+      int cd8 = c / 8;
+      int cm8 = c % 8;
+      dst_ptr[cd8 * 8 * row + r * 8 + cm8] = src[c];
+    }
+  }
+}
+
 void RowMajor2Col8MajorInt8(int8_t *src_ptr, int8_t *dst_ptr, int row, int col) {
   for (int r = 0; r < row; r++) {
     int rd8 = r / 8;
@@ -26,7 +37,6 @@ void RowMajor2Col8MajorInt8(int8_t *src_ptr, int8_t *dst_ptr, int row, int col)
       dst_ptr[rd8 * col * 8 + c * 8 + rm8] = src_ptr[r * col + c];
     }
   }
-  return;
 }
 
 void MatMulInt8(const int8_t *a, const int8_t *b, int32_t *c, const int row8, const int col8, const int deep,
@@ -46,5 +56,4 @@ void MatMulInt8(const int8_t *a, const int8_t *b, int32_t *c, const int row8, co
       c[ci] = value;
     }
   }
-  return;
 }
diff --git a/mindspore/lite/src/runtime/kernel/arm/opclib/int8/matmul.h b/mindspore/lite/src/runtime/kernel/arm/opclib/int8/matmul.h
index 6fc2166461..d51b783932 100644
--- a/mindspore/lite/src/runtime/kernel/arm/opclib/int8/matmul.h
+++ b/mindspore/lite/src/runtime/kernel/arm/opclib/int8/matmul.h
@@ -22,7 +22,7 @@
 void MatMulInt8(const int8_t *a, const int8_t *b, int32_t *c, const int row8, const int col8, const int deep,
                 const int32_t a_zp, const int32_t b_zp);
+void RowMajor2Row8MajorInt8(int8_t *src_ptr, int8_t *dst_ptr, int row, int col);
 void RowMajor2Col8MajorInt8(int8_t *src_ptr, int8_t *dst_ptr, int row, int col);
 
 #endif  // MINDSPORE_LITE_SRC_BACKEND_ARM_OPCLIB_INT8_MATMUL_H_
-
diff --git a/mindspore/lite/src/runtime/kernel/arm/opclib/matmul.h b/mindspore/lite/src/runtime/kernel/arm/opclib/matmul.h
index ad105d8d25..2851bd55ae 100644
--- a/mindspore/lite/src/runtime/kernel/arm/opclib/matmul.h
+++ b/mindspore/lite/src/runtime/kernel/arm/opclib/matmul.h
@@ -29,6 +29,7 @@ struct MatMulParameter {
   int col_8_;
   int deep_;
   bool has_bias_;
+  int batch;
   bool a_transpose_;  /* false : row-major  */
   bool b_transpose_;  /* true  : col-major  */
   ActType act_type_;
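A note on the zero-point handling in MatMulInt8: with real_a = input_scale * (qa - input_zp) and real_b = weight_scale * (qb - weight_zp), each real product is input_scale * weight_scale * (qa - input_zp) * (qb - weight_zp). Accumulating (qa - input_zp) * (qb - weight_zp) in int32 therefore yields the real dot product up to the constant factor input_scale * weight_scale, and that factor (divided by output_scale, plus the output zero point) is exactly the rescale SimplePostFuncInt8 applies above.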
diff --git a/mindspore/lite/src/runtime/kernel/arm/opclib/quantization/quantize.h b/mindspore/lite/src/runtime/kernel/arm/opclib/quantization/quantize.h
index b0bf995535..aafa2b6883 100644
--- a/mindspore/lite/src/runtime/kernel/arm/opclib/quantization/quantize.h
+++ b/mindspore/lite/src/runtime/kernel/arm/opclib/quantization/quantize.h
@@ -22,6 +22,7 @@
 #include <stdint.h>
 #include <limits.h>
 #include <math.h>
+#include "src/runtime/kernel/arm/opclib/op_base.h"
 
 struct QuantArg {
   double scale_;
@@ -49,7 +50,7 @@ struct ConcatQuantArg {
   QuantArg out_quant_args_;
 };
 
-struct FcQuantArg {
+struct MatmulQuantArg {
   QuantArg input;
   QuantArg weight;
   QuantArg output;
@@ -130,4 +131,22 @@ inline void CalculateActivationRangeQuantized(bool is_relu, bool is_relu6, int32
   *mini = min;
   *maxi = max;
 }
+
+// quantize from float to int8
+inline void Quantize(float *input_data, int length, float scale, int zero_point, int8_t *output_data) {
+  for (int i = 0; i < length; ++i) {
+    int r = (int)round(input_data[i] / scale + zero_point);
+    r = r > CHAR_MAX ? CHAR_MAX : r;
+    r = r < CHAR_MIN ? CHAR_MIN : r;
+    output_data[i] = (int8_t)r;
+  }
+}
+
+// dequantize from int8 to float
+inline void Dequantize(int8_t *input_data, int length, float scale, int zero_point, float *output_data) {
+  for (int i = 0; i < length; ++i) {
+    output_data[i] = scale * (input_data[i] - zero_point);
+  }
+}
+
 #endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_QUANTIZATION_QUANTIZE_H_
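Quantize and Dequantize now live here so the fullconnection and matmul int8 tests below can share them. A round trip loses at most about half a quantization step; for instance (illustrative values only):

  float in[] = {0.0f, 1.0f, -2.53f};
  int8_t q[3];
  float out[3];
  Quantize(in, 3, /*scale=*/0.1f, /*zero_point=*/0, q);  // q = {0, 10, -25}
  Dequantize(q, 3, 0.1f, 0, out);                        // out = {0.0, 1.0, -2.5}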
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/matmul_fp32_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/matmul_fp32_tests.cc
new file mode 100644
index 0000000000..0f22149ed4
--- /dev/null
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/matmul_fp32_tests.cc
@@ -0,0 +1,169 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <iostream>
+#include "mindspore/core/utils/log_adapter.h"
+#include "common/common_test.h"
+#include "mindspore/lite/src/runtime/kernel/arm/fp32/matmul.h"
+#include "src/kernel_registry.h"
+#include "src/lite_kernel.h"
+
+namespace mindspore {
+class TestMatMulFp32 : public mindspore::Common {
+ public:
+  TestMatMulFp32() {}
+};
+
+int MMTestInit(std::vector<lite::tensor::Tensor *> *inputs_, std::vector<lite::tensor::Tensor *> *outputs_,
+               float *a_ptr, float *b_ptr, std::vector<int> a_shape, std::vector<int> b_shape,
+               std::vector<int> c_shape) {
+  auto in_t =
+    new lite::tensor::Tensor(kNumberTypeFloat, a_shape, schema::Format_NHWC, static_cast<schema::NodeType>(1));
+  in_t->MallocData();
+  memcpy(in_t->Data(), a_ptr, sizeof(float) * in_t->ElementsNum());
+  inputs_->push_back(in_t);
+
+  auto weight_t =
+    new lite::tensor::Tensor(kNumberTypeFloat, b_shape, schema::Format_NHWC, static_cast<schema::NodeType>(1));
+  weight_t->MallocData();
+  memcpy(weight_t->Data(), b_ptr, sizeof(float) * weight_t->ElementsNum());
+  inputs_->push_back(weight_t);
+
+  auto out_t =
+    new lite::tensor::Tensor(kNumberTypeFloat, c_shape, schema::Format_NHWC, static_cast<schema::NodeType>(1));
+  out_t->MallocData();
+  outputs_->push_back(out_t);
+
+  return out_t->ElementsNum();
+}
+
+TEST_F(TestMatMulFp32, simple) {
+  std::vector<lite::tensor::Tensor *> inputs_;
+  std::vector<lite::tensor::Tensor *> outputs_;
+  auto matmul_param = new MatMulParameter();
+  matmul_param->a_transpose_ = false;
+  matmul_param->b_transpose_ = false;
+  matmul_param->has_bias_ = false;
+  float a[] = {-3.2366564, -4.7733846, -7.8329225, 16.146885, 5.060793, -6.1471, -1.7680453, -6.5721383,
+               17.87506, -5.1192183, 10.742863, 1.4536934, 19.693445, 19.45783, 5.063163, 0.5234792};
+  float b[] = {-0.0024438887, 0.0006738146, -0.008169129, 0.0021510671, -0.012470592, -0.0053063435,
+               0.006050155, 0.008656233, 0.012911413, -0.0028635843, -0.00034080597, -0.0010622552,
+               -0.012254699, -0.01312836, 0.0025241964, -0.004706142, 0.002451482, -0.009558459,
+               0.004481974, 0.0033251503, -0.011705584, -0.001720293, -0.0039410214, -0.0073637343};
+  std::vector<int> a_shape = {1, 2, 8};
+  std::vector<int> b_shape = {1, 8, 3};
+  std::vector<int> c_shape = {1, 2, 3};
+  int total_size = MMTestInit(&inputs_, &outputs_, a, b, a_shape, b_shape, c_shape);
+  auto ctx = new lite::Context;
+  ctx->threadNum = 2;
+  auto mm = new kernel::MatmulCPUKernel(reinterpret_cast<OpParameter *>(matmul_param), inputs_, outputs_, ctx);
+  mm->Init();
+  mm->Run();
+  float correct[] = {-0.1256939023733139, -0.07744802534580231, 0.07410638779401779,
+                     -0.3049793541431427, -0.027687929570674896, -0.18109679222106934};
+  CompareOutputData(reinterpret_cast<float *>(outputs_[0]->Data()), correct, total_size, 0.0001);
+  delete matmul_param;
+  delete mm;
+  for (auto t : inputs_) delete t;
+  for (auto t : outputs_) delete t;
+}
+
+TEST_F(TestMatMulFp32, simple_transb) {
+  std::vector<lite::tensor::Tensor *> inputs_;
+  std::vector<lite::tensor::Tensor *> outputs_;
+  auto matmul_param = new MatMulParameter();
+  matmul_param->a_transpose_ = false;
+  matmul_param->b_transpose_ = true;
+  matmul_param->has_bias_ = false;
+  float a[] = {-3.2366564, -4.7733846, -7.8329225, 16.146885, 5.060793, -6.1471, -1.7680453, -6.5721383,
+               17.87506, -5.1192183, 10.742863, 1.4536934, 19.693445, 19.45783, 5.063163, 0.5234792};
+  float b[] = {-0.0024438887, 0.0006738146, -0.008169129, 0.0021510671, -0.012470592, -0.0053063435,
+               0.006050155, 0.008656233, 0.012911413, -0.0028635843, -0.00034080597, -0.0010622552,
+               -0.012254699, -0.01312836, 0.0025241964, -0.004706142, 0.002451482, -0.009558459,
+               0.004481974, 0.0033251503, -0.011705584, -0.001720293, -0.0039410214, -0.0073637343};
+  std::vector<int> a_shape = {1, 2, 8};
+  std::vector<int> b_shape = {1, 3, 8};
+  std::vector<int> c_shape = {1, 2, 3};
+  int total_size = MMTestInit(&inputs_, &outputs_, a, b, a_shape, b_shape, c_shape);
+  auto ctx = new lite::Context;
+  ctx->threadNum = 2;
+  auto mm = new kernel::MatmulCPUKernel(reinterpret_cast<OpParameter *>(matmul_param), inputs_, outputs_, ctx);
+  mm->Init();
+  mm->Run();
+  float correct[] = {0.00533547, 0.002545945, 0.062974121, -0.445441471, -0.246223617, -0.142070031};
+  CompareOutputData(reinterpret_cast<float *>(outputs_[0]->Data()), correct, total_size, 0.0001);
+  delete matmul_param;
+  delete mm;
+  for (auto t : inputs_) delete t;
+  for (auto t : outputs_) delete t;
+}
+
+TEST_F(TestMatMulFp32, batch) {
+  std::vector<lite::tensor::Tensor *> inputs_;
+  std::vector<lite::tensor::Tensor *> outputs_;
+  auto matmul_param = new MatMulParameter();
+  matmul_param->a_transpose_ = false;
+  matmul_param->b_transpose_ = true;
+  matmul_param->has_bias_ = false;
+  float a[] = {-4.946672525326248, 11.154420027909701, -7.831129637356922, 17.309845099949953, -10.46177877610444,
+               2.5412751480833897, 2.700113860276929, -12.616715572097341, -15.513316568881574, -9.513294738065516,
+               17.931148376418896, -10.83801964632579, -14.023733862948017, -14.50805001403956, 0.7952221556310306,
+               6.619720423569035, -19.277904230909357, -13.450479287024839, 19.914652156692625, 16.542571697048878,
+               -2.9715041389268926, 4.949555349889412, -1.9408110276290103, -15.062828261031868, 0.20012569643335,
+               8.260383531209776, 3.1092344458607357, 16.742272486091487, 17.31277252415167, -16.60303202099434,
+               -8.980314693173042, -11.735087989358268, -14.918976184088514, -11.347592686892733, 11.808756029220604,
+               -18.76179414554809, 7.579758962360987, 3.13240880962163, 6.528181981442103, -16.802624652419794,
+               -14.323146919914901, -16.197579076296144, 9.738053920125779, -12.245780062949866, 8.817905278096319,
+               0.5261391331275007, -18.26152522535471, -2.400461208771226};
+  float b[] = {
+    -0.895183867395529, -0.8146900207660068, -0.27931593219652817, 0.783554361201179, -0.05080215007779798,
+    -0.9879631271568501, 0.07710949009001333, -0.9562579726211344, 0.29505553318356825, -0.26651960351085124,
+    -0.12755456259718279, -0.8221417897250098, -0.5094334041431876, -0.9117373380256013, 0.991501784215064,
+    0.20131976450979394, 0.07889260559412059, -0.8138407752750305, -0.047622075866657454, -0.2778043115153188,
+    -0.6269973420163957, -0.44345812666611617, -0.8571568605933642, 0.020192166011526735, 0.4860054298402434,
+    0.41525925469513614, -0.40270506445219967, -0.8716538067535347, 0.5276448387223114, 0.6064500154192936,
+    -0.9553204135772526, 0.3253219646257437, -0.7237956595774822, 0.3271284879679077, -0.534543967339336,
+    -0.4076498484281894, 0.01574797075171963, -0.37322004720586244, 0.16425071396119928, -0.5328652244800547,
+    0.7389336170615435, -0.6552069958923377, -0.042305872596973604, -0.6714941466767734, -0.9281411415119043,
+    -0.7748558258281224, -0.6209799945964443, 0.02526428593887675, -0.44984776800225856, 0.6281401952319337,
+    0.9907258228680276, 0.6288646615999687, -0.82076880150175, 0.3065944740797497, -0.29201038744043584,
+    -0.025685501802048982, -0.07273175145419652, 0.9370449239208709, -0.8233807408078093, -0.4195634619023012,
+    0.9799555630257346, -0.23461882935715228, -0.8884793313829993, -0.4760267734754635, -0.2874539543614072,
+    -0.8795685985480997, -0.08099698251915255, -0.1626521023321741, -0.9337167240793414, 0.40924842916829207,
+    -0.7375713045221615, -0.0065659291539015285};
+  std::vector<int> a_shape = {3, 2, 8};
+  std::vector<int> b_shape = {3, 3, 8};
+  std::vector<int> c_shape = {3, 2, 3};
+  int total_size = MMTestInit(&inputs_, &outputs_, a, b, a_shape, b_shape, c_shape);
+  auto ctx = new lite::Context;
+  ctx->threadNum = 1;
+  auto mm = new kernel::MatmulCPUKernel(reinterpret_cast<OpParameter *>(matmul_param), inputs_, outputs_, ctx);
+  mm->Init();
+  mm->Run();
+  float correct[] = {21.38518524169922, -14.514888763427734, -11.040614128112793, 16.91403579711914,
+                     27.07421112060547, 23.35394287109375, -39.006141662597656, -2.021998405456543,
+                     -17.63555145263672, -8.490625381469727, 5.317771911621094, -14.561882019042969,
+                     -7.251564025878906, -2.508212089538574, 5.86458683013916, -3.466249465942383,
+                     8.869029998779297, 25.034008026123047};
+  CompareOutputData(reinterpret_cast<float *>(outputs_[0]->Data()), correct, total_size, 0.0001);
+  delete matmul_param;
+  delete mm;
+  for (auto t : inputs_) delete t;
+  for (auto t : outputs_) delete t;
+}
+}  // namespace mindspore
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/fullconnection_int8_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/fullconnection_int8_tests.cc
index 4c789e63fd..e4739ff921 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/fullconnection_int8_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/fullconnection_int8_tests.cc
@@ -13,13 +13,11 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-#include <limits>
-#include <cmath>
 #include "utils/log_adapter.h"
 #include "common/common_test.h"
 #include "mindspore/lite/src/runtime/kernel/arm/int8/fullconnection_int8.h"
-#include "mindspore/lite/src/runtime/kernel/arm/opclib/int8/matmul.h"
 #include "mindspore/lite/src/runtime/kernel/arm/opclib/common_func.h"
+#include "mindspore/lite/src/runtime/kernel/arm/opclib/quantization/quantize.h"
 #include "mindspore/lite/src/kernel_registry.h"
 #include "mindspore/lite/src/lite_kernel.h"
 
@@ -30,21 +28,6 @@ class TestFcInt8 : public mindspore::Common {
   TestFcInt8() {}
 };
 
-void Quantize(float *input_data, int length, float scale, int zero_point, int8_t *output_data) {
-  for (int i = 0; i < length; ++i) {
-    int8_t q = static_cast<int8_t>(std::max<double>(
-      std::numeric_limits<int8_t>::min(),
-      std::min<double>(std::numeric_limits<int8_t>::max(), std::round(zero_point + (input_data[i] / scale)))));
-    output_data[i] = q;
-  }
-}
-
-void Dequantize(int8_t *input_data, int length, float scale, int zero_point, float *output_data) {
-  for (int i = 0; i < length; ++i) {
-    output_data[i] = scale * (input_data[i] - zero_point);
-  }
-}
-
 int FcInt8TestInit(std::vector<lite::tensor::Tensor *> *inputs_, std::vector<lite::tensor::Tensor *> *outputs_,
                    MatMulParameter *matmal_param, float **correct, double *scale, int *zeropoint) {
   float input_max = 20;
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/matmul_int8_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/matmul_int8_tests.cc
new file mode 100644
index 0000000000..db4ea4054f
--- /dev/null
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/matmul_int8_tests.cc
@@ -0,0 +1,126 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "utils/log_adapter.h"
+#include "common/common_test.h"
+#include "mindspore/lite/src/runtime/kernel/arm/int8/matmul_int8.h"
+#include "mindspore/lite/src/runtime/kernel/arm/opclib/quantization/quantize.h"
+#include "mindspore/lite/src/runtime/kernel/arm/opclib/common_func.h"
+#include "mindspore/lite/src/kernel_registry.h"
+#include "mindspore/lite/src/lite_kernel.h"
+
+namespace mindspore {
+class TestMatmulInt8 : public mindspore::Common {
+ public:
+  TestMatmulInt8() {}
+};
+
+int MMInt8TestInit(std::vector<lite::tensor::Tensor *> *inputs_, std::vector<lite::tensor::Tensor *> *outputs_,
+                   MatMulParameter *matmul_param, float **correct, double *scale, int *zeropoint) {
+  float input_max = 20;
+  float input_min = -20;
+  float weight_max = 1;
+  float weight_min = -1;
+  float output_max = 30;
+  float output_min = -30;
+
+  double input_scale =
+    (input_max - input_min) / (std::numeric_limits<int8_t>::max() - std::numeric_limits<int8_t>::min());
+  int input_zp = std::numeric_limits<int8_t>::max() - input_max / input_scale;
+  double weight_scale =
+    (weight_max - weight_min) / (std::numeric_limits<int8_t>::max() - std::numeric_limits<int8_t>::min());
+  int weight_zp = std::numeric_limits<int8_t>::max() - weight_max / weight_scale;
+  double output_scale =
+    (output_max - output_min) / (std::numeric_limits<int8_t>::max() - std::numeric_limits<int8_t>::min());
+  int output_zp = std::numeric_limits<int8_t>::max() - output_max / output_scale;
+  *scale = output_scale;
+  *zeropoint = output_zp;
+
+  auto in_t =
+    new lite::tensor::Tensor(kNumberTypeInt8, {1, 2, 8}, schema::Format_NHWC, static_cast<schema::NodeType>(1));
+  in_t->MallocData();
+  float in[] = {6.583835634764597, 11.337275140963907, -4.125256949459629, 10.994337291530833,
+                19.086065139532636, 3.620842999158455, 13.167624585590346, -18.326739299407755,
+                14.877693740734841, -17.092677920571653, 19.24147072807235, -15.14805323833401,
+                -18.075654829688737, -0.9164404591894204, -3.836646280336332, -10.870298671273918};
+  Quantize(in, in_t->ElementsNum(), input_scale, input_zp, reinterpret_cast<int8_t *>(in_t->Data()));
+  auto in_quant_arg = new mindspore::lite::tensor::QuantArg();
+  in_quant_arg->zeroPoint = input_zp;
+  in_quant_arg->scale = input_scale;
+  in_t->AddQuantParam(*in_quant_arg);
+  inputs_->push_back(in_t);
+
+  auto weight_t =
+    new lite::tensor::Tensor(kNumberTypeInt8, {1, 3, 8}, schema::Format_NHWC, static_cast<schema::NodeType>(1));
+  weight_t->MallocData();
+  float weight[] = {0.3651070698591563, -0.5856943921727129, -0.7472032663840145, 0.9489992871641959,
+                    -0.8179490270358738, -0.873058811259344, 0.39876672713807215, -0.1816769383004213,
+                    -0.13584645926733696, -0.7614673836659709, -0.2535825872616164, -0.05265760030895916,
+                    0.28558728305658754, 0.15404213943520118, -0.1634824450738006, -0.5068199082730189,
+                    -0.026961256849111326, -0.1508441942453307, 0.9375335677537737, 0.3304690744194263,
+                    -0.5091563780251127, 0.029887336278646925, -0.39540496207319276, 0.46094065001445084};
+  Quantize(weight, weight_t->ElementsNum(), weight_scale, weight_zp, reinterpret_cast<int8_t *>(weight_t->Data()));
+  auto weight_quant_arg = new mindspore::lite::tensor::QuantArg();
+  weight_quant_arg->zeroPoint = weight_zp;
+  weight_quant_arg->scale = weight_scale;
+  weight_t->AddQuantParam(*weight_quant_arg);
+  inputs_->push_back(weight_t);
+
+  auto out_t =
+    new lite::tensor::Tensor(kNumberTypeInt8, {1, 2, 3}, schema::Format_NHWC, static_cast<schema::NodeType>(1));
+  out_t->MallocData();
+  auto output_quant_arg = new mindspore::lite::tensor::QuantArg();
+  output_quant_arg->zeroPoint = output_zp;
+  output_quant_arg->scale = output_scale;
+  out_t->AddQuantParam(*output_quant_arg);
+  outputs_->push_back(out_t);
+
+  *correct = reinterpret_cast<float *>(malloc(out_t->ElementsNum() * sizeof(float)));
+  float nchw_co[] = {-0.912632942, 4.08398056, -25.385608673, 2.720281124, 7.745952606, 20.893184662};
+  memcpy(*correct, nchw_co, out_t->ElementsNum() * sizeof(float));
+
+  matmul_param->b_transpose_ = true;
+  matmul_param->a_transpose_ = false;
+  matmul_param->has_bias_ = false;
+  return out_t->ElementsNum();
+}
+
+TEST_F(TestMatmulInt8, mmint8) {
+  std::vector<lite::tensor::Tensor *> inputs_;
+  std::vector<lite::tensor::Tensor *> outputs_;
+  auto matmul_param = new MatMulParameter();
+  float *correct;
+  double output_scale;
+  int output_zp;
+  int total_size = MMInt8TestInit(&inputs_, &outputs_, matmul_param, &correct, &output_scale, &output_zp);
+  auto ctx = new lite::Context;
+  ctx->threadNum = 2;
+  kernel::MatmulInt8CPUKernel *mm =
+    new kernel::MatmulInt8CPUKernel(reinterpret_cast<OpParameter *>(matmul_param), inputs_, outputs_, ctx);
+
+  mm->Init();
+  mm->Run();
+  float fout[6] = {0};
+  Dequantize(reinterpret_cast<int8_t *>(outputs_[0]->Data()), outputs_[0]->ElementsNum(), output_scale, output_zp,
+             fout);
+  CompareOutputData(fout, correct, total_size, 0.3);
+  delete matmul_param;
+  delete mm;
+  for (auto t : inputs_) delete t;
+  for (auto t : outputs_) delete t;
+  free(correct);
+}
+
+}  // namespace mindspore
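A closing remark on the int8 test's loose tolerance: CompareOutputData is called with 0.3, which is roughly one output quantization step. The output range [-30, 30] is spread over the 255 representable int8 levels, so output_scale = 60 / 255 ≈ 0.235, and a correctly requantized result can legitimately land up to about one step away from the float32 reference.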