1. Change pack && rewrite winograd

2. Rewrite fp16 winograd

3. Remove useless code
pull/5887/head
fuzhiye 5 years ago
parent 5247fe7315
commit 6a9eead482

@ -540,7 +540,7 @@ void UnPack3x3Relu6OutputFp16(const float16_t *src, float16_t *dst, int batch, i
// fp16 convolution winograd
void ConvWinogardFp16(float16_t *input_data, float16_t *trans_weight, const float16_t *bias_data,
TmpBufferAddressFp16 *buffer_list, int task_id, ConvParameter *conv_param,
InputTransformUnitFp16Func input_trans_func, OutputTransformUnitFp16Func output_trans_func) {
MatricesFp16 *matrices) {
int thread_num = conv_param->thread_num_;
int input_unit = conv_param->input_unit_;
int in_batch = conv_param->input_batch_;
@ -575,14 +575,14 @@ void ConvWinogardFp16(float16_t *input_data, float16_t *trans_weight, const floa
cal_num = cal_num > tile_num ? tile_num : cal_num;
WinogradInputTransformFp16(input_data + in_batch_offset, trans_input + task_id * trans_input_offset,
tmp_data + task_id * tmp_data_offset, cal_num, out_tile_index, out_w_block, conv_param,
input_trans_func);
matrices[2], matrices[3]);
// step 3 : gemm
IndirectGemmFp16_16x8(gemm_out + task_id * gemm_out_offset, trans_input + task_id * trans_input_offset,
trans_weight, NULL, input_unit_square, ic8 * 2, oc8 * C8NUM, output_offset, 1, 1, 0, 0);
// step 4 : output transform
WinogradOutputTransformFp16(gemm_out + task_id * gemm_out_offset, tmp_out_data + tmp_out_batch_offset, bias_data,
cal_num, out_tile_index, out_w_block, conv_param, output_trans_func);
cal_num, out_tile_index, out_w_block, conv_param, matrices[0], matrices[1]);
}
}
}
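The hunks above and below replace the per-unit transform function pointers (InputTransformUnitFp16Func / OutputTransformUnitFp16Func) with a plain array of transform matrices. A minimal sketch of the new call shape, assuming the four matrices have already been generated and converted to float16_t; the {A, AT, B, BT} ordering follows the matrices[0..3] uses in the body above, and the include path and helper name are illustrative only:

#include "nnacl/fp16/conv_fp16.h"  /* assumed path; use whichever header declares ConvWinogardFp16 */

void RunWinogradFp16(float16_t *input, float16_t *trans_weight, const float16_t *bias,
                     TmpBufferAddressFp16 *buffers, ConvParameter *conv_param,
                     float16_t *matrix_a, float16_t *matrix_at,
                     float16_t *matrix_b, float16_t *matrix_bt) {
  MatricesFp16 matrices[4];
  matrices[0] = matrix_a;   /* output transform, A   */
  matrices[1] = matrix_at;  /* output transform, A^T */
  matrices[2] = matrix_b;   /* input transform,  B   */
  matrices[3] = matrix_bt;  /* input transform,  B^T */
  ConvWinogardFp16(input, trans_weight, bias, buffers, /*task_id=*/0, conv_param, matrices);
}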

@ -22,6 +22,7 @@
#include "nnacl/fp16/winograd_transform_fp16.h"
typedef float16_t *TmpBufferAddressFp16;
typedef float16_t *MatricesFp16;
#ifndef ENABLE_NEON
void IndirectGemmFp16_16x8(float16_t *output, float16_t *input, float16_t *weight, float16_t *bias, size_t step,
@ -69,7 +70,7 @@ void UnPack3x3Relu6OutputFp16(const float16_t *src, float16_t *dst, int batch, i
// fp16 convolution winograd
void ConvWinogardFp16(float16_t *input_data, float16_t *trans_weight, const float16_t *bias_data,
TmpBufferAddressFp16 *buffer_list, int task_id, ConvParameter *conv_param,
InputTransformUnitFp16Func input_trans_func, OutputTransformUnitFp16Func output_trans_func);
MatricesFp16 *matrices);
void UnPackWinogradOutputFp16(const float16_t *src, float16_t *dst, int batch, int height, int width, int channel,
int output_unit);

@ -0,0 +1,65 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "nnacl/fp16/matrix_fp16.h"
void MatrixMultiplyFp16(const float16_t *matrix_a, const float16_t *matrix_b, float16_t *matrix_c, int m, int k,
int n) {
int count = 0;
for (int h = 0; h < m; h++) {
int h_offset = h * k;
for (int w = 0; w < n; w++) {
float16_t res = 0;
for (int i = 0; i < k; i++) {
res += *(matrix_a + h_offset + i) * *(matrix_b + w + i * n);
}
*(matrix_c + count) = res;
count++;
}
}
}
void MatrixMultiplyVecFp16(const float16x8_t *matrix_a, const float16x8_t *matrix_b, float16x8_t *matrix_c,
const float16_t *bias, int m, int k, int n) {
if (bias == NULL) {
int count = 0;
for (int h = 0; h < m; h++) {
int h_offset = h * k;
for (int w = 0; w < n; w++) {
float16x8_t res = vmovq_n_f16(0);
for (int i = 0; i < k; i++) {
res = vaddq_f16(res, vmulq_f16(matrix_a[h_offset + i], matrix_b[w + i * n]));
}
matrix_c[count] = res;
count++;
}
}
} else {
int count = 0;
float16x8_t bias_ptr = vld1q_f16(bias);
for (int h = 0; h < m; h++) {
int h_offset = h * k;
for (int w = 0; w < n; w++) {
float16x8_t res = vmovq_n_f16(0);
for (int i = 0; i < k; i++) {
res = vaddq_f16(res, vmulq_f16(matrix_a[h_offset + i], matrix_b[w + i * n]));
}
matrix_c[count] = vaddq_f16(res, bias_ptr);
count++;
}
}
}
}
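For reference, a small driver for the scalar fp16 multiply above (row-major C = A x B). This is only a sketch and assumes an ARM toolchain where float16_t is available:

#include <stdio.h>
#include "nnacl/fp16/matrix_fp16.h"

int main(void) {
  float16_t a[] = {1, 2, 3, 4, 5, 6};  /* A: 2x3, row-major */
  float16_t b[] = {1, 2, 3, 4, 5, 6};  /* B: 3x2, row-major */
  float16_t c[4] = {0};
  MatrixMultiplyFp16(a, b, c, 2, 3, 2);
  /* Expected C = A*B = {22, 28, 49, 64} */
  for (int i = 0; i < 4; ++i) printf("%.0f ", (float)c[i]);
  printf("\n");
  return 0;
}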

@ -14,14 +14,20 @@
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_MATRIX_FP16_H_
#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_MATRIX_FP16_H_
#ifndef MINDSPORE_LITE_NNACL_FP16_MATRIX_FP16_H_
#define MINDSPORE_LITE_NNACL_FP16_MATRIX_FP16_H_
#include "src/runtime/kernel/arm/base/matrix.h"
#include <arm_neon.h>
namespace mindspore::kernel {
void MatrixMultiplyFp16(const float16_t *matrix_a, const float16_t *matrix_b, float16_t *matrix_c, int m, int k, int n,
bool row);
#ifdef __cplusplus
extern "C" {
#endif
void MatrixMultiplyFp16(const float16_t *matrix_a, const float16_t *matrix_b, float16_t *matrix_c, int m, int k, int n);
void MatrixMultiplyVecFp16(const float16x8_t *matrix_a, const float16x8_t *matrix_b, float16x8_t *matrix_c,
const float16_t *bias, int m, int k, int n);
#ifdef __cplusplus
}
#endif
#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_MATRIX_FP16_H_
#endif // MINDSPORE_LITE_NNACL_FP16_MATRIX_FP16_H_

@ -569,8 +569,8 @@ void Conv3x3Fp16OutputTransform(const float16_t *gemm_out, float16_t *out_data,
// fp16 common winograd
void WinogradInputTransformFp16(const float16_t *input_data, float16_t *trans_input, float16_t *tmp_data, int cal_num,
int out_tile_index, int out_w_block_num, ConvParameter *conv_param,
InputTransformUnitFp16Func input_trans_func) {
int out_tile_index, int out_w_block_num, ConvParameter *conv_param, float16_t *matrix_b,
float16_t *matrix_bt) {
const int tile_num = 16;
int input_unit = conv_param->input_unit_;
int output_unit = conv_param->output_unit_;
@ -622,7 +622,7 @@ void WinogradInputTransformFp16(const float16_t *input_data, float16_t *trans_in
int dst_ic8_offset = dst_plane_offset + ic * tile_num * C8NUM;
size_t dst_step = ic8 * C8NUM * tile_num;
float16_t *trans_input_ptr = trans_input + dst_ic8_offset;
input_trans_func(tmp_data, trans_input_ptr, C8NUM, dst_step);
GeneralInputTransformUnitFp16(tmp_data, trans_input_ptr, matrix_b, matrix_bt, C8NUM, dst_step, input_unit);
}
out_tile_index++;
} // cal_tile_num loop
@ -630,7 +630,7 @@ void WinogradInputTransformFp16(const float16_t *input_data, float16_t *trans_in
void WinogradOutputTransformFp16(const float16_t *gemm_out, float16_t *tmp_out_data, const float16_t *bias_data,
int cal_num, int out_tile_index, int output_unit_num, ConvParameter *conv_param,
OutputTransformUnitFp16Func output_trans_func) {
float16_t *matrix_a, float16_t *matrix_at) {
int output_unit = conv_param->output_unit_;
int output_w = conv_param->output_w_;
int output_h = conv_param->output_h_;
@ -655,7 +655,8 @@ void WinogradOutputTransformFp16(const float16_t *gemm_out, float16_t *tmp_out_d
const float16_t *src_ptr = gemm_out + src_oc8_offset;
const float16_t *bias_ptr = bias_data + j * C8NUM;
float16_t *dst_ptr = tmp_out_data + dst_oc8_offset;
output_trans_func(src_ptr, dst_ptr, bias_ptr, C8NUM, output_w_unit_block * output_unit);
GeneralOutputTransformUnitFp16(src_ptr, dst_ptr, bias_ptr, matrix_a, matrix_at, C8NUM,
output_w_unit_block * output_unit, input_unit, output_unit);
}
out_tile_index++;
}
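For orientation: these general transform units implement the standard Winograd factorization of a convolution tile. With an input tile d, filter g, and the generated matrices,

Y = A^T [ (G g G^T) (.) (B^T d B) ] A,   where (.) is the element-wise product.

GeneralInputTransformUnitFp16 produces B^T d B per channel block, the GEMM accumulates (G g G^T) (.) (B^T d B) over input channels using the pre-transformed weights, and GeneralOutputTransformUnitFp16 applies A^T (...) A and adds the bias.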

@ -43,12 +43,12 @@ void Conv3x3Fp16OutputTransform(const float16_t *gemm_out, float16_t *out_data,
// fp16 common winograd
void WinogradInputTransformFp16(const float16_t *input_data, float16_t *trans_input, float16_t *tmp_data, int cal_num,
int out_tile_index, int out_w_block_num, ConvParameter *conv_param,
InputTransformUnitFp16Func input_trans_func);
int out_tile_index, int out_w_block_num, ConvParameter *conv_param, float16_t *matrix_b,
float16_t *matrix_bt);
void WinogradOutputTransformFp16(const float16_t *gemm_out, float16_t *tmp_out_data, const float16_t *bias_data,
int cal_num, int out_tile_index, int output_unit_num, ConvParameter *conv_param,
OutputTransformUnitFp16Func output_trans_func);
float16_t *matrix_a, float16_t *matrix_at);
#ifdef __cplusplus
}
#endif

File diff suppressed because it is too large.

@ -21,45 +21,17 @@
#include "nnacl/conv_parameter.h"
#include "nnacl/op_base.h"
typedef void (*InputTransformUnitFp16Func)(const float16_t *src_data, float16_t *dst_data, int src_step, int dst_step);
typedef void (*OutputTransformUnitFp16Func)(const float16_t *src_data, float16_t *dst_data, const float16_t *bias_data,
int src_step, int dst_step);
#define MAX_LEN 256
#ifdef __cplusplus
extern "C" {
#endif
void InputTransform4x4UnitFp16(const float16_t *src_data, float16_t *dst_data, int src_step, int dst_step);
void InputTransform8x8UnitFp16(const float16_t *src_data, float16_t *dst_data, int src_step, int dst_step);
void OutputTransform4x2UnitFp16(const float16_t *src_data, float16_t *dst_data, const float16_t *bias_data,
int src_step, int dst_step);
void OutputTransform4x3UnitFp16(const float16_t *src_data, float16_t *dst_data, const float16_t *bias_data,
int src_step, int dst_step);
void OutputTransform8x2UnitFp16(const float16_t *src_data, float16_t *dst_data, const float16_t *bias_data,
int src_step, int dst_step);
void OutputTransform8x3UnitFp16(const float16_t *src_data, float16_t *dst_data, const float16_t *bias_data,
int src_step, int dst_step);
void OutputTransform8x4UnitFp16(const float16_t *src_data, float16_t *dst_data, const float16_t *bias_data,
int src_step, int dst_step);
void OutputTransform8x5UnitFp16(const float16_t *src_data, float16_t *dst_data, const float16_t *bias_data,
int src_step, int dst_step);
void OutputTransform8x6UnitFp16(const float16_t *src_data, float16_t *dst_data, const float16_t *bias_data,
int src_step, int dst_step);
void OutputTransform8x7UnitFp16(const float16_t *src_data, float16_t *dst_data, const float16_t *bias_data,
int src_step, int dst_step);
InputTransformUnitFp16Func GetInputTransFuncFp16(int input_unit);
OutputTransformUnitFp16Func GetOutputTransFuncFp16(int input_unit, int output_unit);
void GeneralInputTransformUnitFp16(const float16_t *src_data, float16_t *dst_data, float16_t *matrix_b,
float16_t *matrix_bt, int src_step, int dst_step, int in_unit);
void GeneralOutputTransformUnitFp16(const float16_t *src_data, float16_t *dst_data, const float16_t *bias_data,
float16_t *matrix_a, float16_t *matrix_at, int src_step, int dst_step, int in_unit,
int out_unit);
#ifdef __cplusplus
}
#endif

@ -259,8 +259,8 @@ void ConvFp32(float *input_data, float *packed_input, float *packed_weight, cons
// fp32 conv winograd
void ConvWinogardFp32(float *input_data, float *trans_weight, const float *bias_data, TmpBufferAddress *buffer_list,
int task_id, ConvParameter *conv_param, InputTransformUnitFunc input_trans_func,
OutputTransformUnitFunc output_trans_func, GEMM_FUNC_FP32 gemm_func) {
int task_id, ConvParameter *conv_param, InputTransFunc in_func, OutputTransFunc out_func,
GEMM_FUNC_FP32 gemm_func) {
int thread_num = conv_param->thread_num_;
int input_unit = conv_param->input_unit_;
int in_batch = conv_param->input_batch_;
@ -296,7 +296,7 @@ void ConvWinogardFp32(float *input_data, float *trans_weight, const float *bias_
cal_num = cal_num > C12NUM ? C12NUM : cal_num;
WinogradInputTransform(input_data + in_batch_offset, trans_input + task_id * trans_input_offset,
tmp_data + task_id * tmp_data_offset, cal_num, out_tile_index, out_w_block, conv_param,
input_trans_func);
in_func);
// step 3 : gemm
float *src_ptr = trans_input + task_id * trans_input_offset;
float *dst_ptr = gemm_out + task_id * gemm_out_offset;
@ -309,7 +309,7 @@ void ConvWinogardFp32(float *input_data, float *trans_weight, const float *bias_
// step 4 : output transform
WinogradOutputTransform(dst_ptr, tmp_out_data + tmp_out_batch_offset, bias_data, cal_num, out_tile_index,
out_w_block, conv_param, output_trans_func);
out_w_block, conv_param, out_func);
}
}
}

@ -28,6 +28,7 @@
#include "nnacl/fp32/conv_depthwise.h"
typedef float *TmpBufferAddress;
typedef float *Matrices;
typedef void (*GEMM_FUNC_FP32)(float *output, const float *input, const float *weight, const float *bias, size_t step,
size_t ic4, size_t output_channel, size_t offset, size_t mode, size_t writeC4,
size_t relu, size_t relu6);
@ -53,8 +54,8 @@ void ConvFp32(float *input_data, float *packed_input, float *packed_weight, cons
// fp32 convolution winograd
void ConvWinogardFp32(float *input_data, float *trans_weight, const float *bias_data, TmpBufferAddress *buffer_list,
int task_id, ConvParameter *conv_param, InputTransformUnitFunc input_trans_func,
OutputTransformUnitFunc output_trans_func, GEMM_FUNC_FP32 gemm_func);
int task_id, ConvParameter *conv_param, InputTransFunc in_func, OutputTransFunc out_func,
GEMM_FUNC_FP32 gemm_func);
void UnPackWinogradOutput(const float *src, float *dst, int batch, int height, int width, int channel, int output_unit);

File diff suppressed because it is too large.

@ -1,54 +0,0 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_NNACL_MATRIX_TABLE_H_
#define MINDSPORE_LITE_NNACL_MATRIX_TABLE_H_
#ifdef __cplusplus
extern "C" {
#endif
void MatrixG4x2(float *matrix_data);
void MatrixGT2x4(float *matrix_data);
void MatrixG8x2(float *matrix_data);
void MatrixGT2x8(float *matrix_data);
void MatrixG8x3(float *matrix_data);
void MatrixGT3x8(float *matrix_data);
void MatrixG8x4(float *matrix_data);
void MatrixGT4x8(float *matrix_data);
void MatrixG8x5(float *matrix_data);
void MatrixGT5x8(float *matrix_data);
void MatrixG8x6(float *matrix_data);
void MatrixGT6x8(float *matrix_data);
void MatrixG8x7(float *matrix_data);
void MatrixGT7x8(float *matrix_data);
#ifdef __cplusplus
}
#endif
#endif // MINDSPORE_LITE_NNACL_MATRIX_TABLE_H_

@ -0,0 +1,233 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "nnacl/minimal_filtering_generator.h"
#include <string.h>
#include <math.h>
#include <stdlib.h>
void Polynomial(float *interval, float *m, int degree) {
for (int i = 0; i < degree; ++i) {
float mul = 1;
for (int j = 0; j < degree; ++j) {
if (i == j) continue;
mul *= (interval[i] - interval[j]);
}
m[i] = mul;
}
}
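As a quick sanity check of Polynomial(): it fills m[i] = prod_{j != i} (interval[i] - interval[j]), i.e. the Lagrange-basis denominators at the chosen interpolation points. For interval = {0, 0.5, -0.5} (degree 3):

m[0] = (0 - 0.5)  * (0 - (-0.5))   = -0.25
m[1] = (0.5 - 0)  * (0.5 - (-0.5)) =  0.5
m[2] = (-0.5 - 0) * (-0.5 - 0.5)   =  0.5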
void DiagonalPlusMatrix(float *matrix, float *diagonal_matrix, int degree) {
int data_num = (degree + 1) * (degree + 1);
memset(diagonal_matrix, 0, data_num * sizeof(float));
for (int i = 0; i < (degree + 1); ++i) {
for (int j = 0; j < (degree + 1); ++j) {
if (j == i) diagonal_matrix[i * (degree + 1) + j] = matrix[i];
}
}
diagonal_matrix[data_num - 1] = 1;
}
void ResidueMatrix(float *interval, float *b, int row, int col) {
// row : input unit, col : output_unit
// result : matrix b
int len = row * col;
memset(b, 0, len * sizeof(float));
for (int i = 0; i < row - 1; ++i) {
for (int j = 0; j < col; ++j) {
b[i * col + j] = pow(interval[i], j);
}
}
b[len - 1] = 1;
}
void LT(float *poly_array, float *matrix_lt, int n) {
float *coefficient_array = (float *)malloc(n * sizeof(float));
float *poly = (float *)malloc(n * sizeof(float));
Polynomial(poly_array, poly, n);
for (int i = 0; i < n; ++i) {
// get coefficient
int index = 1;
memset(coefficient_array, 0, n * sizeof(float));
coefficient_array[0] = 1;
for (int j = 0; j < n; ++j) {
if (j == i) continue;
float poly_coe = poly_array[j] == 0 ? 0 : -poly_array[j];
coefficient_array[index] = 1;
for (int k = index - 1; k > 0; --k) {
coefficient_array[k] = coefficient_array[k] * poly_coe + coefficient_array[k - 1];
}
coefficient_array[0] *= poly_coe;
index++;
}
// lx[i, 0].nth(j) / f[i]
int setp = i * n;
for (int l = 0; l < n; ++l) {
matrix_lt[setp + l] = coefficient_array[l] / poly[i];
}
} // matrix L row loop
free(coefficient_array);
free(poly);
}
void T(float *poly_array, float *matrix_t, int n) {
memset(matrix_t, 0, n * (n + 1) * sizeof(float));
for (int i = 0; i < n; ++i) {
for (int j = 0; j < n + 1; ++j) {
if (j == i) matrix_t[i * (n + 1) + j] = 1;
if (j == n) {
if (poly_array[i] == 0) {
matrix_t[i * (n + 1) + j] = 0;
} else {
matrix_t[i * (n + 1) + j] = -pow(poly_array[i], n);
}
}
}
}
}
void B(float *poly_array, float *matrix_b, int in_unit) {
memset(matrix_b, 0, in_unit * in_unit * sizeof(float));
int n = in_unit - 1;
float *matrix_l = (float *)malloc(n * n * sizeof(float));
float *matrix_lt = (float *)malloc(n * n * sizeof(float));
float *matrix_t = (float *)malloc(n * in_unit * sizeof(float));
T(poly_array, matrix_t, n);
LT(poly_array, matrix_lt, n);
MatrixTranspose(matrix_lt, matrix_l, n, n);
MatrixMultiply(matrix_l, matrix_t, matrix_b, n, n, in_unit);
matrix_b[in_unit * in_unit - 1] = 1;
free(matrix_l);
free(matrix_lt);
free(matrix_t);
}
void GenerateIntervalArray(float *array, float interval, int degree) {
array[0] = 0;
for (int i = 1; i < degree; ++i) {
int coefficient = pow(-1, i - 1);
array[i] = array[i - 1] + interval * i * coefficient;
}
}
void MatrixTranspose(float *matrix, float *trans_matrix, int row, int col) {
for (int i = 0; i < col; ++i) {
for (int j = 0; j < row; ++j) {
trans_matrix[i * row + j] = matrix[j * col + i];
}
}
}
void MatrixMultiply(const float *matrix_a, const float *matrix_b, float *matrix_c, int m, int k, int n) {
int count = 0;
for (int h = 0; h < m; h++) {
int h_offset = h * k;
for (int w = 0; w < n; w++) {
float res = 0;
for (int i = 0; i < k; i++) {
res += *(matrix_a + h_offset + i) * *(matrix_b + w + i * n);
}
*(matrix_c + count) = res;
count++;
}
}
}
void CookToomFilter(float *matrix_a, float *matrix_at, float *matrix_b, float *matrix_bt, float *matrix_g,
float *matrix_gt, float coefficient, int out_unit, int filter_size) {
int in_unit = out_unit + filter_size - 1;
int degree = in_unit - 1;
float *polynomial_m = malloc(degree * sizeof(float));
float *diagonal_matrix = malloc(in_unit * in_unit * sizeof(float));
float *inverse_diagonal_matrix = malloc(in_unit * in_unit * sizeof(float));
// get diagonal matrix
float *interval = malloc(degree * sizeof(float));
GenerateIntervalArray(interval, coefficient, degree);
Polynomial(interval, polynomial_m, degree);
DiagonalPlusMatrix(polynomial_m, diagonal_matrix, degree);
if (diagonal_matrix[0] < 0) {
for (int i = 0; i < in_unit; ++i) {
if (diagonal_matrix[i] != 0) diagonal_matrix[i] *= -1;
}
}
// inverse diagonal matrix
for (int j = 0; j < in_unit * in_unit; ++j) {
if (diagonal_matrix[j] != 0) {
inverse_diagonal_matrix[j] = 1.0 / diagonal_matrix[j];
} else {
inverse_diagonal_matrix[j] = 0;
}
}
// get matrix A && AT
ResidueMatrix(interval, matrix_a, in_unit, out_unit);
MatrixTranspose(matrix_a, matrix_at, in_unit, out_unit);
// get matrix B
B(interval, matrix_bt, in_unit);
MatrixTranspose(matrix_bt, matrix_b, in_unit, in_unit);
MatrixMultiply(diagonal_matrix, matrix_b, matrix_bt, in_unit, in_unit, in_unit);
MatrixTranspose(matrix_bt, matrix_b, in_unit, in_unit);
// get matrix G && GT
float *tmp_g = malloc(in_unit * filter_size * sizeof(float));
ResidueMatrix(interval, matrix_g, in_unit, filter_size);
MatrixTranspose(matrix_g, tmp_g, in_unit, filter_size);
MatrixMultiply(tmp_g, inverse_diagonal_matrix, matrix_gt, filter_size, in_unit, in_unit);
MatrixTranspose(matrix_gt, matrix_g, filter_size, in_unit);
free(interval);
free(polynomial_m);
free(diagonal_matrix);
free(inverse_diagonal_matrix);
free(tmp_g);
}
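A usage sketch of CookToomFilter for F(4x4, 3x3): out_unit = 4, filter_size = 3, so in_unit = 6 and GenerateIntervalArray with step 0.5 yields the interpolation points {0, 0.5, -0.5, 1.0, -1.0}. The 0.5f step is illustrative; the value actually used by the convolution kernels is not visible in the hunks shown here:

#include "nnacl/minimal_filtering_generator.h"

void GenerateWinogradMatricesF4x4_3x3(void) {
  float matrix_a[6 * 4], matrix_at[4 * 6];  /* A: in_unit x out_unit, and A^T */
  float matrix_b[6 * 6], matrix_bt[6 * 6];  /* B and B^T: in_unit x in_unit   */
  float matrix_g[6 * 3], matrix_gt[3 * 6];  /* G: in_unit x filter_size, G^T  */
  CookToomFilter(matrix_a, matrix_at, matrix_b, matrix_bt, matrix_g, matrix_gt,
                 0.5f, /*out_unit=*/4, /*filter_size=*/3);
  /* G / G^T transform the 3x3 filter once at init time; B / B^T feed the input
   * transform and A / A^T the output transform of every tile. */
}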
#ifdef ENABLE_ARM
void MatrixMultiplyVec(const float32x4_t *matrix_a, const float32x4_t *matrix_b, float32x4_t *matrix_c,
const float *bias, int m, int k, int n) {
if (bias == NULL) {
int count = 0;
for (int h = 0; h < m; h++) {
int h_offset = h * k;
for (int w = 0; w < n; w++) {
float32x4_t res = vmovq_n_f32(0);
for (int i = 0; i < k; i++) {
res = vmlaq_f32(res, matrix_a[h_offset + i], matrix_b[w + i * n]);
}
matrix_c[count] = res;
count++;
}
}
} else {
int count = 0;
float32x4_t bias_ptr = vld1q_f32(bias);
for (int h = 0; h < m; h++) {
int h_offset = h * k;
for (int w = 0; w < n; w++) {
float32x4_t res = vmovq_n_f32(0);
for (int i = 0; i < k; i++) {
res = vmlaq_f32(res, matrix_a[h_offset + i], matrix_b[w + i * n]);
}
matrix_c[count] = vaddq_f32(res, bias_ptr);
count++;
}
}
}
}
#endif
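The vectorized variant treats every matrix element as a float32x4_t, i.e. four channels are multiplied in lockstep. A tiny hedged example (only meaningful when built with ENABLE_ARM and NEON; values chosen so the result is easy to check):

#include <stddef.h>
#include <arm_neon.h>
#include "nnacl/minimal_filtering_generator.h"

void MatrixMultiplyVecExample(void) {
  float32x4_t a[2 * 2], b[2 * 2], c[2 * 2];
  for (int i = 0; i < 4; ++i) {
    a[i] = vdupq_n_f32((float)(i + 1));  /* A = [[1, 2], [3, 4]], broadcast to all 4 lanes */
    b[i] = vdupq_n_f32(1.0f);            /* B = [[1, 1], [1, 1]]                           */
  }
  MatrixMultiplyVec(a, b, c, NULL, 2, 2, 2);
  /* Every lane of c: c[0] = c[1] = 3, c[2] = c[3] = 7, i.e. C = A * B */
}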

@ -0,0 +1,56 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_NNACL_MINIMAL_FILTERING_GENERATOR_H_
#define MINDSPORE_LITE_NNACL_MINIMAL_FILTERING_GENERATOR_H_
#ifdef ENABLE_ARM
#include <arm_neon.h>
#endif
#ifdef __cplusplus
extern "C" {
#endif
void Polynomial(float *interval, float *m, int degree);
void DiagonalPlusMatrix(float *matrix, float *diagonal_matrix, int degree);
void ResidueMatrix(float *interval, float *b, int row, int col);
void LT(float *poly_array, float *matrix_lt, int n);
void T(float *poly_array, float *matrix_t, int n);
void B(float *poly_array, float *matrix_b, int in_unit);
void GenerateIntervalArray(float *array, float interval, int degree);
void MatrixTranspose(float *matrix, float *trans_matrix, int row, int col);
void MatrixMultiply(const float *matrix_a, const float *matrix_b, float *matrix_c, int m, int k, int n);
void CookToomFilter(float *matrix_a, float *matrix_at, float *matrix_b, float *matrix_bt, float *matrix_g,
float *matrix_gt, float coefficient, int out_unit, int filter_size);
#ifdef ENABLE_ARM
void MatrixMultiplyVec(const float32x4_t *matrix_a, const float32x4_t *matrix_b, float32x4_t *matrix_c,
const float *bias, int m, int k, int n);
#endif
#ifdef __cplusplus
}
#endif
#endif // MINDSPORE_LITE_NNACL_MINIMAL_FILTERING_GENERATOR_H_

@ -101,10 +101,6 @@ void PackWeightInt8(int8_t *weight_data, ConvParameter *conv_param, int8_t *pack
int8_t *origin_data_ptr = weight_data + kernel_block_stride + k * kernel_plane * in_channel;
int8_t *packed_data_ptr = packed_weight + packed_kernel_block_size + k * C4NUM * C4NUM;
*packed_data_ptr = origin_data_ptr[0];
// value of weight must between [-127, 127]
if (packed_data_ptr[0] == -128) {
packed_data_ptr[0] = -127;
}
weight_sum[j * C4NUM + k] += (int32_t)packed_data_ptr[0];
}
} // kernel block loop
@ -146,9 +142,6 @@ void PackWeightInt8Opt(int8_t *weight_data, ConvParameter *conv_param, int8_t *p
int8_t *origin_data_ptr = weight_data + kernel_block_stride + k * kernel_plane * in_channel;
int8_t *packed_data_ptr = packed_weight + packed_kernel_block_size + k * C4NUM;
*packed_data_ptr = origin_data_ptr[0];
if (packed_data_ptr[0] == -128) {
packed_data_ptr[0] = -127;
}
weight_sum[j * C4NUM + k] += (int32_t)(packed_data_ptr[0]);
}
} // kernel block loop

@ -18,8 +18,7 @@
// fp32 conv winograd
void WinogradInputTransform(const float *input_data, float *trans_input, float *tmp_data, int cal_num,
int out_tile_index, int out_w_block_num, ConvParameter *conv_param,
InputTransformUnitFunc input_trans_func) {
int out_tile_index, int out_w_block_num, ConvParameter *conv_param, InputTransFunc func) {
int input_unit = conv_param->input_unit_;
int output_unit = conv_param->output_unit_;
int in_channel = conv_param->input_channel_;
@ -31,6 +30,7 @@ void WinogradInputTransform(const float *input_data, float *trans_input, float *
if (out_w_block_num == 0) {
return;
}
for (int c = 0; c < cal_num; c++) { // actual tiled number
int src_x_s = (out_tile_index % out_w_block_num) * output_unit - pad_w;
int src_y_s = (out_tile_index / out_w_block_num) * output_unit - pad_h;
@ -70,15 +70,15 @@ void WinogradInputTransform(const float *input_data, float *trans_input, float *
int dst_ic4_offset = dst_plane_offset + ic * C4NUM;
size_t dst_step = C12NUM * ic4 * C4NUM;
float *trans_input_ptr = trans_input + dst_ic4_offset;
input_trans_func(tmp_data, trans_input_ptr, C4NUM, dst_step);
func(tmp_data, trans_input_ptr, C4NUM, dst_step);
// GeneralInputTransformUnit(tmp_data, trans_input_ptr, matrix_b, matrix_bt, C4NUM, dst_step, input_unit);
}
out_tile_index++;
} // cal_tile_num loop
}
void WinogradOutputTransform(const float *gemm_out, float *tmp_out_data, const float *bias_data, int cal_num,
int out_tile_index, int output_unit_num, ConvParameter *conv_param,
OutputTransformUnitFunc output_trans_func) {
int out_tile_index, int output_unit_num, ConvParameter *conv_param, OutputTransFunc func) {
int output_unit = conv_param->output_unit_;
int output_w = conv_param->output_w_;
int output_h = conv_param->output_h_;
@ -106,7 +106,9 @@ void WinogradOutputTransform(const float *gemm_out, float *tmp_out_data, const f
const float *src_ptr = gemm_out + src_oc4_offset;
const float *bias_ptr = bias_data + j * C4NUM;
float *dst_ptr = tmp_out_data + dst_oc4_offset;
output_trans_func(src_ptr, dst_ptr, bias_ptr, C8NUM, output_w_unit_block * output_unit);
func(src_ptr, dst_ptr, bias_ptr, C8NUM, output_w_unit_block * output_unit);
// GeneralOutputTransformUnit(src_ptr, dst_ptr, bias_ptr, matrix_a, matrix_at, C8NUM,
// output_w_unit_block * output_unit, input_unit, output_unit);
}
out_tile_index++;
}
@ -865,7 +867,7 @@ void Conv3x3Int8InputUnit(int16_t *tmp_data, int16_t *trans_input_data, size_t s
}
void Conv3x3Int8InputTransform(const int16_t *input_data, int16_t *trans_input, int16_t *tmp_data, int start_index,
int real_cal_num, int out_w_block, ConvParameter *conv_param) {
int real_cal_num, int out_w_block, ConvParameter *conv_param) {
// input data format : nhwc
int input_channel = conv_param->input_channel_;
int input_width = conv_param->input_w_;
@ -1176,7 +1178,7 @@ void Conv3x3Int8FilterTransform(const int16_t *weight_data, int16_t *trans_weigh
}
void Conv3x3Int8OutputUnit(const int32_t *gemm_out, const int32_t *bias_data, int8_t *output_data, bool h_not_bound,
bool w_not_bound, int output_w, int real_num, int oc_start, ConvParameter *conv_param) {
bool w_not_bound, int output_w, int real_num, int oc_start, ConvParameter *conv_param) {
int32_t *left_shift = conv_param->conv_quant_arg_.left_shift_;
int32_t *right_shift = conv_param->conv_quant_arg_.right_shift_;
int32_t *quant_multiplier = conv_param->conv_quant_arg_.quant_multiplier_;
@ -1457,7 +1459,7 @@ void Conv3x3Int8OutputUnit(const int32_t *gemm_out, const int32_t *bias_data, in
}
void Conv3x3Int8OutputTransform(const int32_t *gemm_out, int8_t *out_data, const int32_t *bias_data, int start_index,
int real_cal_num, int out_w_block, ConvParameter *conv_param) {
int real_cal_num, int out_w_block, ConvParameter *conv_param) {
int output_channel = conv_param->output_channel_;
int output_w = conv_param->output_w_;
int output_h = conv_param->output_h_;
@ -1484,7 +1486,7 @@ void Conv3x3Int8OutputTransform(const int32_t *gemm_out, int8_t *out_data, const
bool w_not_bound = out_w_index * OUPUT_UNIT + 1 < output_w;
bool h_not_bound = out_h_index * OUPUT_UNIT + 1 < output_h;
Conv3x3Int8OutputUnit(src_ptr, bias_ptr, dst_ptr, h_not_bound, w_not_bound, output_w, real_num, j * C4NUM,
conv_param);
conv_param);
}
}
}

@ -33,12 +33,10 @@ extern "C" {
#endif
// for fp32 winograd input/output transform
void WinogradInputTransform(const float *input_data, float *trans_input, float *tmp_data, int cal_num,
int out_tile_index, int out_w_block_num, ConvParameter *conv_param,
InputTransformUnitFunc input_trans_func);
int out_tile_index, int out_w_block_num, ConvParameter *conv_param, InputTransFunc func);
void WinogradOutputTransform(const float *gemm_out, float *tmp_out_data, const float *bias_data, int cal_num,
int out_tile_index, int output_unit_num, ConvParameter *conv_param,
OutputTransformUnitFunc output_trans_func);
int out_tile_index, int output_unit_num, ConvParameter *conv_param, OutputTransFunc func);
// for fp32 convolution 3x3 filter/input/output transform
void Conv3x3Fp32InputUnit(const float *tmp_data, float *trans_input_data, size_t step);

File diff suppressed because it is too large.

File diff suppressed because it is too large.

@ -1,86 +0,0 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "src/runtime/kernel/arm/base/matrix.h"
#include "utils/log_adapter.h"
namespace mindspore::kernel {
Matrix *TransformMatrixGenerator(int m, int k) {
auto matrix = new (std::nothrow) Matrix;
if (matrix == nullptr) {
MS_LOG(ERROR) << "matrix is nullptr.";
return nullptr;
}
auto data = malloc(m * k * sizeof(float));
if (data == nullptr) {
MS_LOG(ERROR) << "Malloc matrix data failed.";
return nullptr;
}
matrix->SetData(data);
matrix->SetNum(m, k);
return matrix;
}
void ChooseMatrixG(Matrix *matrix_g, Matrix *matrix_gt) {
int m = matrix_g->GetM();
int k = matrix_g->GetK();
auto matrix_g_data = reinterpret_cast<float *>(matrix_g->GetData());
auto matrix_gt_data = reinterpret_cast<float *>(matrix_gt->GetData());
// m represents input unit, only 4 or 8 can be accepted for input unit.
// k represents kernel unit, varies from 2 to 7.
if (m == 4 && k == 2) {
MatrixG4x2(matrix_g_data);
MatrixGT2x4(matrix_gt_data);
} else if (m == 8 && k == 2) {
MatrixG8x2(matrix_g_data);
MatrixGT2x8(matrix_gt_data);
} else if (m == 8 && k == 3) {
MatrixG8x3(matrix_g_data);
MatrixGT3x8(matrix_gt_data);
} else if (m == 8 && k == 4) {
MatrixG8x4(matrix_g_data);
MatrixGT4x8(matrix_gt_data);
} else if (m == 8 && k == 5) {
MatrixG8x5(matrix_g_data);
MatrixGT5x8(matrix_gt_data);
} else if (m == 8 && k == 6) {
MatrixG8x6(matrix_g_data);
MatrixGT6x8(matrix_gt_data);
} else if (m == 8 && k == 7) {
MatrixG8x7(matrix_g_data);
MatrixGT7x8(matrix_gt_data);
} else {
MS_LOG(ERROR) << "Unsupported input unit or kernel unit.";
return;
}
}
void MatrixMultiply(const float *matrix_a, const float *matrix_b, float *matrix_c, int m, int k, int n, bool row) {
// row-major implementation
int count = 0;
for (int h = 0; h < m; h++) {
int h_offset = h * k;
for (int w = 0; w < n; w++) {
float res = 0;
for (int i = 0; i < k; i++) {
res += *(matrix_a + h_offset + i) * *(matrix_b + w + i * n);
}
*(matrix_c + count) = res;
count++;
}
}
}
} // namespace mindspore::kernel

@ -1,77 +0,0 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_BASE_MATRIX_H_
#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_BASE_MATRIX_H_
#include <stdlib.h>
#include <vector>
#include "nnacl/winograd_utils.h"
namespace mindspore::kernel {
class Matrix {
public:
Matrix() = default;
~Matrix() {
if (data_ != nullptr) {
free(data_);
}
}
void SetData(void *data) { this->data_ = data; }
void *GetData() { return this->data_; }
void SetNDim(int dim) { this->n_dim_ = dim; }
int GetNDim() { return this->n_dim_; }
void SetShape(std::vector<int> shape) { this->shape_ = shape; }
std::vector<int> GetShape() { return this->shape_; }
void SetStride(std::vector<int> stride) { this->stride_ = stride; }
std::vector<int> GetStride() { return this->stride_; }
void SetNum(int m, int k) {
this->m_ = m;
this->k_ = k;
}
int GetM() { return this->m_; }
int GetK() { return this->k_; }
protected:
void *data_ = nullptr;
std::vector<int> shape_;
std::vector<int> stride_;
int m_;
int k_;
int n_dim_;
bool row_major_;
};
Matrix *TransformMatrixGenerator(int m, int k);
// Chinese Remainder Theorem interp: 0.5
void ChooseMatrixG(Matrix *matrix_g, Matrix *matrix_gt);
void MatrixMultiply(const float *matrix_a, const float *matrix_b, float *matrix_c, int m, int k, int n, bool row);
} // namespace mindspore::kernel
#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_BASE_MATRIX_H_

@ -238,9 +238,7 @@ kernel::LiteKernel *CpuConvFp16KernelCreator(const std::vector<lite::Tensor *> &
} else {
bool use_winograd = false;
int out_unit;
InputTransformUnitFunc input_trans_func = nullptr;
OutputTransformUnitFunc output_trans_func = nullptr;
CheckIfUseWinograd(&use_winograd, &out_unit, conv_param, input_trans_func, output_trans_func);
CheckIfUseWinograd(&use_winograd, &out_unit, conv_param);
if (use_winograd) {
kernel = new (std::nothrow)
kernel::ConvolutionWinogradFP16CPUKernel(opParameter, inputs, outputs, ctx, primitive, out_unit);

@ -22,9 +22,9 @@
#include "src/lite_kernel.h"
#include "src/runtime/kernel/arm/fp16/convolution_base_fp16.h"
#include "nnacl/fp16/conv_fp16.h"
#include "src/runtime/kernel/arm/fp16/matrix_fp16.h"
#include "nnacl/fp16/winograd_utils_fp16.h"
#include "nnacl/optimized_kernel.h"
#include "nnacl/minimal_filtering_generator.h"
namespace mindspore::kernel {
class ConvolutionWinogradFP16CPUKernel : public ConvolutionBaseFP16CPUKernel {
@ -39,9 +39,10 @@ class ConvolutionWinogradFP16CPUKernel : public ConvolutionBaseFP16CPUKernel {
fp16_weight_ = nullptr;
}
if (trans_weight_ != nullptr) {
delete trans_weight_;
free(trans_weight_);
trans_weight_ = nullptr;
}
FreeTransformMatrices();
}
int Init() override;
@ -49,10 +50,12 @@ class ConvolutionWinogradFP16CPUKernel : public ConvolutionBaseFP16CPUKernel {
int Run() override;
int RunImpl(int task_id);
int InitWeightBias();
int MallocFilterMatrix(int oc_block, int oc_block_num);
int MallocTransformMatrices();
void FreeTransformMatrices();
int InitTmpBuffer();
int ConfigInputOutput();
int PostProcess();
int WinogradFilterTransformFp16(const float16_t *weight_data, float *matrix_g, float *matrix_gt, int oc_block);
private:
void FreeTmpBuffer() {
@ -80,13 +83,14 @@ class ConvolutionWinogradFP16CPUKernel : public ConvolutionBaseFP16CPUKernel {
float16_t *trans_input_ = nullptr;
float16_t *gemm_out_ = nullptr;
float16_t *tmp_out_data_ = nullptr;
Matrix *trans_weight_ = nullptr;
InputTransformUnitFp16Func input_trans_func_;
OutputTransformUnitFp16Func output_trans_func_;
float16_t *matrix_a_ = nullptr;
float16_t *matrix_at_ = nullptr;
float16_t *matrix_b_ = nullptr;
float16_t *matrix_bt_ = nullptr;
float16_t *trans_weight_ = nullptr;
TmpBufferAddressFp16 tmp_buffer_address_list_[4];
MatricesFp16 matrices_[4];
};
int WinogradFilterTransformFp16(const float16_t *weight_data, Matrix *trans_weight, int kernel_unit, int input_unit,
ConvParameter *conv_param, int oc_block);
} // namespace mindspore::kernel
#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_CONVOLUTION_WINOGRAD_FP16_H_

@ -1,36 +0,0 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "src/runtime/kernel/arm/fp16/matrix_fp16.h"
namespace mindspore::kernel {
void MatrixMultiplyFp16(const float16_t *matrix_a, const float16_t *matrix_b, float16_t *matrix_c, int m, int k, int n,
bool row) {
// row-major implementation
int count = 0;
for (int h = 0; h < m; h++) {
int h_offset = h * k;
for (int w = 0; w < n; w++) {
float16_t res = 0;
for (int i = 0; i < k; i++) {
res += *(matrix_a + h_offset + i) * *(matrix_b + w + i * n);
}
*(matrix_c + count) = res;
count++;
}
}
}
} // namespace mindspore::kernel

Some files were not shown because too many files have changed in this diff.
