tod: new ops, performance improvements, and bug fixes

pull/11624/head
yoni 4 years ago
parent 3bfcf8947c
commit ae389ae1f9

@@ -50,3 +50,35 @@ void backwardAll(const float *restrict in, const float *restrict yt, const float
}
}
}
void backwardP1(const float *restrict in, const float *restrict yt, const float *restrict mean,
const float *restrict invar, const float *restrict scale, int size, int ch, float *restrict dxhat_sum,
float *restrict dxhathat_sum, float *restrict dbias, float *restrict dscale) {
for (int i = 0; i < size; i++) {
for (int c = 0; c < ch; c++) {
int ix = i * ch + c;
dbias[c] += yt[ix];
// dscale
float x_hat = (in[ix] - mean[c]) * invar[c];
dscale[c] += (yt[ix] * x_hat);
// dx_1
float dx_hat = yt[ix] * scale[c];
dxhat_sum[c] += dx_hat;
dxhathat_sum[c] += dx_hat * x_hat;
}
}
}
void backwardP2(const float *restrict in, const float *restrict yt, const float *restrict mean,
const float *restrict invar, const float *restrict scale, int size, int total_size, int ch,
const float *dxhat_sum, const float *dxhathat_sum, float *restrict dx) {
float N = (float)total_size;
for (int i = 0; i < size; i++) {
for (int c = 0; c < ch; c++) {
// dx_2
int ix = i * ch + c;
float x_hat = (in[ix] - mean[c]) * invar[c];
float dx_hat = yt[ix] * scale[c];
dx[ix] = 1.0f / N * (invar[c]) * (N * dx_hat - dxhat_sum[c] - x_hat * dxhathat_sum[c]);
}
}
}
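
A note on this split: backwardP1 must see every row because it reduces yt and x_hat into the per-channel sums (dbias, dscale, dxhat_sum, dxhathat_sum), while backwardP2 only reads those sums and writes disjoint rows of dx, so P2 can fan out across threads. Below is a minimal driver sketch of the equivalence with the fused backwardAll; bn_backward_split is a hypothetical name, and the four accumulators are assumed zero-filled by the caller, as the BN grad kernel later in this commit does:

/* Hypothetical illustration: P1 once, then P2 over independent row chunks. */
static void bn_backward_split(const float *x, const float *yt, const float *mean,
                              const float *invar, const float *scale, int total, int ch,
                              float *dxhat_sum, float *dxhathat_sum,
                              float *dbias, float *dscale, float *dx, int chunks) {
  backwardP1(x, yt, mean, invar, scale, total, ch, dxhat_sum, dxhathat_sum, dbias, dscale);
  int step = (total + chunks - 1) / chunks; /* rows per worker; last chunk may be short */
  for (int t = 0; t < chunks; t++) {        /* chunks touch disjoint slices of dx */
    int start = t * step;
    int count = (total - start < step) ? (total - start) : step;
    if (count <= 0) {
      break;
    }
    backwardP2(x + start * ch, yt + start * ch, mean, invar, scale, count, total, ch,
               dxhat_sum, dxhathat_sum, dx + start * ch);
  }
}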

@@ -32,6 +32,10 @@ extern "C" {
void var2Invar(float *save_var, int size, float eps);
void backwardAll(const float *in, const float *yt, const float *mean, const float *invar, const float *scale, int size,
int ch, float *dxhat_sum, float *dxhathat_sum, float *dbias, float *dscale, float *dx);
void backwardP1(const float *in, const float *yt, const float *mean, const float *invar, const float *scale, int size,
int ch, float *dxhat_sum, float *dxhathat_sum, float *dbias, float *dscale);
void backwardP2(const float *in, const float *yt, const float *mean, const float *invar, const float *scale, int size,
int total_size, int ch, const float *dxhat_sum, const float *dxhathat_sum, float *dx);
#ifdef __cplusplus
}
#endif

File diff suppressed because it is too large

@@ -0,0 +1,32 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_NNACL_FP32_GRAD_CONVOLUTION_GRAD_FILTER_H_
#define MINDSPORE_LITE_NNACL_FP32_GRAD_CONVOLUTION_GRAD_FILTER_H_
#include <stddef.h>
#include "nnacl/conv_parameter.h"
#ifdef __cplusplus
extern "C" {
#endif
int ConvDwFilterGrad(const float *x, const float *dy, float *dw, int start, int count, const ConvParameter *conv_param);
#ifdef __cplusplus
}
#endif
#endif // MINDSPORE_LITE_NNACL_FP32_GRAD_CONVOLUTION_GRAD_FILTER_H_

@@ -18,6 +18,56 @@
#include "nnacl/fp32_grad/pack_ext.h"
#include "nnacl/pack.h"
void RollingIm2ColPackDwUnitFp32(const float *in_data, const ConvParameter *conv_param, float *data_col_orig,
int real_cal_num, int start) {
const int pad_left = conv_param->pad_l_;
const int pad_up = conv_param->pad_u_;
const int stride_h = conv_param->stride_h_;
const int stride_w = conv_param->stride_w_;
const int dilation_h = conv_param->dilation_h_;
const int dilation_w = conv_param->dilation_w_;
const int kernel_h = conv_param->kernel_h_;
const int kernel_w = conv_param->kernel_w_;
const int in_height = conv_param->input_h_;
const int in_width = conv_param->input_w_;
const int output_w = conv_param->output_w_;
const int channels = conv_param->input_channel_;
const int stride = kernel_h * kernel_w;
int kernel_row, kernel_col;
for (int i = 0; i < real_cal_num; i++) {
int block_start = start + i;
int input_h = block_start / output_w * stride_h;
int input_w = block_start % output_w * stride_w;
float *data_col = data_col_orig + i * channels * stride;
for (kernel_row = 0; kernel_row < kernel_h; kernel_row++) {
int input_row = -pad_up + kernel_row * dilation_h + input_h;
for (kernel_col = 0; kernel_col < kernel_w; kernel_col++) {
int input_col = -pad_left + kernel_col * dilation_w + input_w;
if (((unsigned)(input_row) < (unsigned)(in_height)) && ((unsigned)(input_col) < (unsigned)(in_width))) {
const int offset = (input_row * in_width + input_col) * channels;
for (int c = 0; c < channels; c++) {
data_col[c * stride] = in_data[offset + c];
}
data_col++;
} else {
for (int c = 0; c < channels; c++) {
data_col[c * stride] = 0;
}
data_col++;
}
}
}
}
}
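
The packing layout implied by the indexing above (inferred from the loops, so treat it as a sketch): with k = kernel_h * kernel_w, channels are the outer index and taps the inner one, so each channel's k taps are contiguous. For a depthwise convolution (group_ == input_channel_), group j's columns then start at offset j * k inside a row of leading dimension channels * k, which matches the mat_a + j * kernel_spatial and lda = k * groups arguments of the depthwise GEMM later in this commit.

/* Index model of the pack (illustrative, mirrors the loops above). */
static inline int dw_pack_index(int i, int c, int kernel_row, int kernel_col,
                                int channels, int kernel_h, int kernel_w) {
  int k = kernel_h * kernel_w; /* taps per channel */
  return i * channels * k + c * k + (kernel_row * kernel_w + kernel_col);
}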
void rolling_im2col_hwc(const float *in_data, float *data_col, const ConvParameter *conv_param, int real_cal_num,
int start) {
const int pad_left = conv_param->pad_l_;
@@ -90,85 +140,6 @@ void RollingIm2ColPackUnitFp32(const float *input_data, const ConvParameter *con
rolling_im2col_hwc(input_data, packed_input, conv_param, real_cal_num, block_index);
}
void im2row_hwc(const float *in_data, float *data_row, const ConvParameter *conv_param, bool transpose) {
const int pad_left = conv_param->pad_l_;
const int pad_up = conv_param->pad_u_;
const int stride_h = conv_param->stride_h_;
const int stride_w = conv_param->stride_w_;
const int dilation_h = conv_param->dilation_h_;
const int dilation_w = conv_param->dilation_w_;
const int kernel_h = conv_param->kernel_h_;
const int kernel_w = conv_param->kernel_w_;
const int in_height = (transpose) ? conv_param->output_h_ : conv_param->input_h_;
const int in_width = (transpose) ? conv_param->output_w_ : conv_param->input_w_;
const int output_h = (transpose) ? conv_param->input_h_ : conv_param->output_h_;
const int output_w = (transpose) ? conv_param->input_w_ : conv_param->output_w_;
const int tot_channels = (transpose) ? conv_param->output_channel_ : conv_param->input_channel_;
const int channels = tot_channels / conv_param->group_;
int channel, kernel_row, kernel_col, output_rows, output_col;
if (transpose) {
for (channel = 0; channel < channels; channel++) {
for (kernel_row = 0; kernel_row < kernel_h; kernel_row++) {
for (kernel_col = 0; kernel_col < kernel_w; kernel_col++) {
int input_row = -pad_up + kernel_row * dilation_h;
for (output_rows = output_h; output_rows; output_rows--) {
if (!((unsigned)(input_row) < (unsigned)(in_height))) {
for (output_col = output_w; output_col; output_col--) {
*(data_row++) = 0;
}
} else {
int input_col = -pad_left + kernel_col * dilation_w;
for (output_col = output_w; output_col; output_col--) {
if (((unsigned)(input_col) < (unsigned)(in_width))) {
const int offset = (input_row * in_width + input_col) * tot_channels + channel;
*(data_row++) = in_data[offset];
} else {
*(data_row++) = 0;
}
input_col += stride_w;
}
}
input_row += stride_h;
}
}
}
}
} else {
for (kernel_row = 0; kernel_row < kernel_h; kernel_row++) {
for (kernel_col = 0; kernel_col < kernel_w; kernel_col++) {
for (channel = 0; channel < channels; channel++) {
int input_row = -pad_up + kernel_row * dilation_h;
for (output_rows = output_h; output_rows; output_rows--) {
if (!((unsigned)(input_row) < (unsigned)(in_height))) {
for (output_col = output_w; output_col; output_col--) {
*(data_row++) = 0;
}
} else {
int input_col = -pad_left + kernel_col * dilation_w;
for (output_col = output_w; output_col; output_col--) {
if (((unsigned)(input_col) < (unsigned)(in_width))) {
const int offset = (input_row * in_width + input_col) * tot_channels + channel;
*(data_row++) = in_data[offset];
} else {
*(data_row++) = 0;
}
input_col += stride_w;
}
}
input_row += stride_h;
}
}
}
}
}
}
void rolling_im2row_hwc(const float *in_data, float *data_row, const ConvParameter *conv_param, int rows, int start) {
const int pad_left = conv_param->pad_l_;
const int pad_up = conv_param->pad_u_;

@@ -26,6 +26,9 @@ extern "C" {
void RollingIm2ColPackUnitFp32(const float *input_data, const ConvParameter *conv_param, float *packed_input,
int real_cal_num, int block_index);
void RollingIm2ColPackDwUnitFp32(const float *input_data, const ConvParameter *conv_param, float *packed_input,
int real_cal_num, int block_index);
void rolling_im2col_hwc(const float *in_data, float *data_col, const ConvParameter *conv_param, int rows, int start);
void rolling_im2row_hwc(const float *in_data, float *data_row, const ConvParameter *conv_param, int rows, int start);
void rolling_col2im_hwc(const float *data_col, float *data_im, const ConvParameter *conv_param, int rows, int start);

@@ -18,7 +18,7 @@
#include <float.h>
#include "nnacl/fp32_grad/pooling_grad.h"
void AvgPoolingGrad(const float *input_ptr, float *output_ptr, PoolingParameter *pooling_param, int task_id) {
void AvgPoolingGrad(const float *input_ptr, float *output_ptr, int count, PoolingParameter *pooling_param) {
int stride_w = pooling_param->stride_w_;
int stride_h = pooling_param->stride_h_;
int pad_w = pooling_param->pad_l_;
@@ -30,29 +30,58 @@ void AvgPoolingGrad(const float *input_ptr, float *output_ptr, PoolingParameter
int in_h = pooling_param->input_h_;
int output_w = pooling_param->output_w_;
int output_h = pooling_param->output_h_;
int output_batch = pooling_param->output_batch_;
memset(output_ptr, 0, in_h * in_w * channel * output_batch * sizeof(float));
float kk = (float)(win_h * win_w);
for (int ib = 0; ib < output_batch; ib++) {
const float kk = 1.0f / (float)(win_h * win_w);
#ifdef ENABLE_ARM
const float32x4_t factor = vdupq_n_f32(kk);
#endif
for (int ib = 0; ib < count; ib++) {
float *out = &output_ptr[(ib * in_h * in_w * channel)];
const float *inPtr = &input_ptr[(ib * output_h * output_w * channel)];
// iterate over yt
for (int yh = 0; yh < output_h; yh++) {
int over_h = pad_h - yh * stride_h;
int kh_s = MSMAX(0, over_h);
int kh_e = MSMIN(win_h, in_h + over_h);
for (int yw = 0; yw < output_w; yw++) {
for (int ic = 0; ic < channel; ic++) {
int over_w = pad_w - yw * stride_w;
int kw_s = MSMAX(0, over_w);
int kw_e = MSMIN(win_w, in_w + over_w);
int ic = 0;
for (; ic < channel - 4; ic += 4) {
int idx = (yw + yh * output_w) * channel + ic;
float delta = inPtr[idx] / kk;
for (int kh = 0; kh < win_h; kh++) {
#ifdef ENABLE_ARM
float32x4_t in = vld1q_f32(inPtr + idx);
float32x4_t delta = vmulq_f32(in, factor);
#else
float delta[4] = {inPtr[idx], inPtr[idx + 1], inPtr[idx + 2], inPtr[idx + 3]};
for (int i = 0; i < 4; i++) delta[i] *= kk;
#endif
for (int kh = kh_s; kh < kh_e; kh++) {
int xh = yh * stride_h + kh - pad_h;
if ((xh < 0) || (xh >= in_h)) {
continue;
}
for (int kw = 0; kw < win_w; kw++) {
for (int kw = kw_s; kw < kw_e; kw++) {
int xw = yw * stride_w + kw - pad_w;
if ((xw < 0) || (xw >= in_w)) {
continue;
#ifdef ENABLE_ARM
float *out_vec = out + (xw + in_w * xh) * channel + ic;
float32x4_t outr = vld1q_f32(out_vec);
float32x4_t outs = vaddq_f32(outr, delta);
vst1q_f32(out_vec, outs);
#else
for (int i = 0; i < 4; i++) {
out[(xw + in_w * xh) * channel + ic + i] += delta[i];
}
#endif
}
}
}
for (; ic < channel; ic++) {
int idx = (yw + yh * output_w) * channel + ic;
float delta = inPtr[idx] * kk;
for (int kh = kh_s; kh < kh_e; kh++) {
int xh = yh * stride_h + kh - pad_h;
for (int kw = kw_s; kw < kw_e; kw++) {
int xw = yw * stride_w + kw - pad_w;
out[(xw + in_w * xh) * channel + ic] += delta;
}
}
@@ -62,8 +91,17 @@ void AvgPoolingGrad(const float *input_ptr, float *output_ptr, PoolingParameter
}
}
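
Two things changed in AvgPoolingGrad: the per-window scale is now a hoisted reciprocal (multiply by kk instead of divide inside the loop), and the kernel ranges are clamped once per output coordinate (kh_s..kh_e, kw_s..kw_e) so the inner loops no longer bounds-test xh/xw. A self-contained check of the clamping identity, with plain C stand-ins for MSMAX/MSMIN (illustrative only):

#include <assert.h>
/* xh = y * s + k - p lies in [0, in_dim) exactly when k lies in [k_s, k_e). */
static void check_window_clamp(int y, int s, int p, int win, int in_dim) {
  int over = p - y * s;
  int k_s = over > 0 ? over : 0;                       /* MSMAX(0, over) */
  int k_e = win < in_dim + over ? win : in_dim + over; /* MSMIN(win, in_dim + over) */
  for (int k = 0; k < win; k++) {
    int x = y * s + k - p;
    assert((x >= 0 && x < in_dim) == (k >= k_s && k < k_e));
  }
}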
void MaxPoolingGrad(const float *input_ptr, const float *dx_ptr, const float *dy_ptr, float *output_ptr,
PoolingParameter *pooling_param, int task_id) {
#ifdef ENABLE_ARM
static uint32x4_t MaxIndex(float32x4_t in, float32x4_t *max, uint32x4_t index, uint32x4_t prev_index) {
uint32x4_t res = vcgtq_f32(in, *max);
uint32x4_t m_index = vbslq_u32(res, index, prev_index);
*max = vbslq_f32(res, in, *max);
return m_index;
}
#endif
void MaxPoolingGrad(const float *input_ptr, const float *dy_ptr, float *output_ptr, int output_batch,
PoolingParameter *pooling_param) {
int stride_w = pooling_param->stride_w_;
int stride_h = pooling_param->stride_h_;
int pad_w = pooling_param->pad_l_;
@@ -75,36 +113,71 @@ void MaxPoolingGrad(const float *input_ptr, const float *dx_ptr, const float *dy
int in_h = pooling_param->input_h_;
int output_w = pooling_param->output_w_;
int output_h = pooling_param->output_h_;
int output_batch = pooling_param->output_batch_;
memset(output_ptr, 0, in_h * in_w * channel * output_batch * sizeof(float));
for (int ib = 0; ib < output_batch; ib++) {
float *out = &output_ptr[(ib * in_h * in_w * channel)];
const float *inPtr = (const float *)(&input_ptr[(ib * in_h * in_w * channel)]);
const float *dyPtr = (const float *)(&dy_ptr[(ib * output_h * output_w * channel)]);
const float *inPtr = &input_ptr[(ib * in_h * in_w * channel)];
const float *dyPtr = &dy_ptr[(ib * output_h * output_w * channel)];
for (int yh = 0; yh < output_h; yh++) {
int over_h = pad_h - yh * stride_h;
int kh_s = MSMAX(0, over_h);
int kh_e = MSMIN(win_h, in_h + over_h);
for (int yw = 0; yw < output_w; yw++) {
for (int ic = 0; ic < channel; ic++) {
int over_w = pad_w - yw * stride_w;
int kw_s = MSMAX(0, over_w);
int kw_e = MSMIN(win_w, in_w + over_w);
int ic = 0;
for (; ic < channel - 4; ic += 4) {
int idx = (yw + yh * output_w) * channel + ic;
float delta = dyPtr[idx];
float max_val = -FLT_MAX;
int max_idx = 0;
for (int kh = 0; kh < win_h; kh++) {
#ifdef ENABLE_ARM
uint32x4_t max_idx = vdupq_n_u32(0);
float32x4_t max_val = vdupq_n_f32(-FLT_MAX);
float32x4_t delta = vld1q_f32(dyPtr + idx);
#else
float delta[4] = {dyPtr[idx], dyPtr[idx + 1], dyPtr[idx + 2], dyPtr[idx + 3]};
float max_val[4] = {-FLT_MAX, -FLT_MAX, -FLT_MAX, -FLT_MAX};
int max_idx[4] = {0};
#endif
for (int kh = kh_s; kh < kh_e; kh++) {
int xh = yh * stride_h + kh - pad_h;
if ((xh < 0) || (xh >= in_h)) {
continue;
}
for (int kw = 0; kw < win_w; kw++) {
for (int kw = kw_s; kw < kw_e; kw++) {
int xw = yw * stride_w + kw - pad_w;
if ((xw < 0) || (xw >= in_w)) {
continue;
int val_idx = (xw + in_w * xh) * channel + ic;
#ifdef ENABLE_ARM
unsigned int val_idx_vec[] = {val_idx, val_idx + 1, val_idx + 2, val_idx + 3};
uint32x4_t index = vld1q_u32(val_idx_vec);
float32x4_t in = vld1q_f32(inPtr + val_idx);
max_idx = MaxIndex(in, &max_val, index, max_idx);
#else
float val[4] = {inPtr[val_idx], inPtr[val_idx + 1], inPtr[val_idx + 2], inPtr[val_idx + 3]};
for (int i = 0; i < 4; i++) {
if (val[i] > max_val[i]) {
max_val[i] = val[i];
max_idx[i] = val_idx + i;
}
}
if (inPtr[(xw + in_w * xh) * channel + ic] > max_val) {
max_val = inPtr[(xw + in_w * xh) * channel + ic];
max_idx = (xw + in_w * xh) * channel + ic;
#endif
}
}
for (int i = 0; i < 4; i++) {
out[((int *)&max_idx)[i]] += ((float *)&delta)[i];
}
}
for (; ic < channel; ic++) {
float max_val = -FLT_MAX;
int max_idx = 0;
int idx = (yw + yh * output_w) * channel + ic;
float delta = dyPtr[idx];
for (int kh = kh_s; kh < kh_e; kh++) {
int xh = yh * stride_h + kh - pad_h;
int loop = kw_e - kw_s;
for (int kw = 0; kw < loop; kw++) {
int xw = yw * stride_w + kw + kw_s - pad_w;
int val_idx = (xw + in_w * xh) * channel + ic;
float val = inPtr[val_idx];
if (val > max_val) {
max_val = val;
max_idx = val_idx;
}
}
}
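
On the NEON path, MaxIndex carries the argmax per lane: vcgtq_f32 builds a greater-than mask, the two blends update the running maximum and the flat index it came from in lock-step, and the final loop scatters delta to the four winning indices. A scalar model of that lane-wise update, with illustrative names:

/* Scalar model of the MaxIndex helper: one iteration per vector lane. */
static void max_index_scalar(const float in[4], float max[4],
                             const unsigned index[4], unsigned prev_index[4]) {
  for (int i = 0; i < 4; i++) {
    int gt = in[i] > max[i];                       /* vcgtq_f32: per-lane mask */
    prev_index[i] = gt ? index[i] : prev_index[i]; /* vbslq_u32: blend indices */
    max[i] = gt ? in[i] : max[i];                  /* vbslq_f32: blend values  */
  }
}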

@@ -22,9 +22,9 @@
#ifdef __cplusplus
extern "C" {
#endif
void AvgPoolingGrad(const float *input_ptr, float *output_ptr, PoolingParameter *pooling_param, int task_id);
void MaxPoolingGrad(const float *input_ptr, const float *dx_ptr, const float *dy_ptr, float *output_ptr,
PoolingParameter *pooling_param, int task_id);
void AvgPoolingGrad(const float *input_ptr, float *output_ptr, int count, PoolingParameter *pooling_param);
void MaxPoolingGrad(const float *input_ptr, const float *dy_ptr, float *output_ptr, int output_batch,
PoolingParameter *pooling_param);
#ifdef __cplusplus
}
#endif

@@ -0,0 +1,61 @@
/**
* Copyright 2019 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "nnacl/fp32_grad/strided_slice_grad.h"
#include "nnacl/errorcode.h"
static size_t CalcIndex(const int *shape, size_t size, int i, size_t pos) {
size_t res = 1;
for (size_t j = 0; j < size; j++) {
res *= shape[(i + 1) + j];
}
return (pos / res % shape[i]);
}
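
CalcIndex recovers one coordinate of a flat offset: divide pos by the product of the size trailing dimensions, then reduce modulo the dimension itself. A quick sanity check against an assumed shape {2, 3, 4} and pos = 17, whose coordinates are (1, 1, 1) since 1*12 + 1*4 + 1 = 17:

#include <assert.h>
/* Illustrative check only; the shape and position are made-up values. */
static void calc_index_check(void) {
  const int shape[3] = {2, 3, 4};          /* trailing products: 12, 4, 1 */
  assert(CalcIndex(shape, 2, 0, 17) == 1); /* 17 / 12 % 2 */
  assert(CalcIndex(shape, 1, 1, 17) == 1); /* 17 / 4  % 3 */
  assert(CalcIndex(shape, 0, 2, 17) == 1); /* 17 / 1  % 4 */
}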
int DoStridedSliceGrad(const float *inputs, float *output, const int *dx_shape, StridedSliceParameter *param) {
if (inputs == NULL || output == NULL || param == NULL) {
return NNACL_NULL_PTR;
}
if (param->num_axes_ > DIMENSION_7D) {
return NNACL_PARAM_INVALID;
}
size_t size = 1;
int *s = param->strides_;
int *b = param->begins_;
for (int i = 0; i < DIMENSION_7D; i++) {
size *= param->in_shape_[i];
}
for (size_t pos = 0; pos < size; pos++) {
size_t i = CalcIndex(param->in_shape_, 6, 0, pos);
size_t j = CalcIndex(param->in_shape_, 5, 1, pos);
size_t k = CalcIndex(param->in_shape_, 4, 2, pos);
size_t l = CalcIndex(param->in_shape_, 3, 3, pos);
size_t m = CalcIndex(param->in_shape_, 2, 4, pos);
size_t n = CalcIndex(param->in_shape_, 1, 5, pos);
size_t o = CalcIndex(param->in_shape_, 0, 6, pos);
size_t input_idx =
(i * s[0] + b[0]) * dx_shape[1] * dx_shape[2] * dx_shape[3] * dx_shape[4] * dx_shape[5] * dx_shape[6] +
(j * s[1] + b[1]) * dx_shape[2] * dx_shape[3] * dx_shape[4] * dx_shape[5] * dx_shape[6] +
(k * s[2] + b[2]) * dx_shape[3] * dx_shape[4] * dx_shape[5] * dx_shape[6] +
(l * s[3] + b[3]) * dx_shape[4] * dx_shape[5] * dx_shape[6] + (m * s[4] + b[4]) * dx_shape[5] * dx_shape[6] +
(n * s[5] + b[5]) * dx_shape[6] + (o * s[6] + b[6]);
output[input_idx] = inputs[pos];
}
return NNACL_OK;
}
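
DoStridedSliceGrad is the scatter inverse of the forward slice: each element of the dense gradient inputs is written back to the position the slice originally read, at begin + i * stride along every axis, and positions the slice skipped keep a zero gradient. A minimal 1-D sketch of the same idea (hypothetical helper, not part of the commit):

#include <string.h>
/* With begin = 1 and stride = 2 over a length-7 input, the forward slice reads
 * indices 1, 3, 5, so the scatter writes dx[1] = dy[0], dx[3] = dy[1], dx[5] = dy[2]. */
static void strided_slice_grad_1d(const float *dy, int dy_len, float *dx, int dx_len,
                                  int begin, int stride) {
  memset(dx, 0, (size_t)dx_len * sizeof(float)); /* skipped positions stay zero */
  for (int i = 0; i < dy_len; i++) {
    dx[begin + i * stride] = dy[i];
  }
}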

@@ -0,0 +1,30 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_NNACL_FP32_GRAD_STRIDED_SLICE_GRAD_H_
#define MINDSPORE_LITE_NNACL_FP32_GRAD_STRIDED_SLICE_GRAD_H_
#include "nnacl/op_base.h"
#include "nnacl/strided_slice_parameter.h"
#ifdef __cplusplus
extern "C" {
#endif
int DoStridedSliceGrad(const float *inputs, float *output, const int *dx_shape, StridedSliceParameter *param);
#ifdef __cplusplus
}
#endif
#endif // MINDSPORE_LITE_NNACL_FP32_GRAD_STRIDED_SLICE_GRAD_H_

@@ -53,6 +53,7 @@
#define DIMENSION_4D 4
#define DIMENSION_6D 6
#define DIMENSION_7D 7
#define kInputIndex 0
#define kWeightIndex 1
#define kBiasIndex 2

@@ -273,6 +273,7 @@ union PrimitiveType {
RandomStandardNormal,
CropAndResize,
Erf,
StridedSliceGrad
}
enum QuantType: int {

@@ -1259,6 +1259,18 @@ table RandomStandardNormal {
table CropAndResize {
method : ResizeMethod;
extrapolation_value : float;
}
table StridedSliceGrad {
beginMask: int;
endMask: int;
ellipsisMask: int;
newAxisMask: int;
shrinkAxisMask: int;
begin: [int];
end: [int];
stride: [int];
isScale: [int];
}
table Erf {

@@ -31,7 +31,7 @@ int FlattenGrad::InferShape(std::vector<Tensor *> inputs_, std::vector<Tensor *>
MS_LOG(ERROR) << "FlattenGrad input or output is null!";
return RET_ERROR;
}
if (inputs_.size() != kSingleNum || outputs_.size() != kSingleNum) {
if (inputs_.size() != kDoubleNum || outputs_.size() != kSingleNum) {
MS_LOG(ERROR) << "input size: " << inputs_.size() << ", output size: " << outputs_.size();
return RET_INPUT_TENSOR_ERROR;
}
@@ -42,16 +42,15 @@ int FlattenGrad::InferShape(std::vector<Tensor *> inputs_, std::vector<Tensor *>
return RET_INFER_INVALID;
}
auto input_shape = input->shape();
std::vector<int> output_shape(2);
output_shape.at(0) = input_shape.at(0);
output_shape.at(1) = 1;
for (size_t i = 1; i < input_shape.size(); i++) {
output_shape.at(1) *= input_shape.at(i);
auto output_size = inputs_.at(1)->shape().at(0);
std::vector<int> output_shape(output_size);
for (int i = 0; i < output_size; i++) {
output_shape.at(i) = static_cast<int *>(inputs_.at(1)->data_c())[i];
}
output->set_shape(output_shape);
return RET_OK;
}
#ifdef PRIMITIVE_WRITEABLE
int FlattenGrad::UnPackAttr(const Primitive &prim, const std::vector<AnfNodePtr> &inputs) {
if (this->primitive_ == nullptr) {

@@ -91,6 +91,8 @@ int PoolingGrad::UnPackAttr(const Primitive &prim, const std::vector<AnfNodePtr>
attr->poolingMode = schema::PoolMode_MEAN_POOLING;
} else if (prim.instance_name() == "AvgPoolGradGpu") {
attr->poolingMode = schema::PoolMode_MEAN_POOLING;
} else if (prim.instance_name() == "AvgPoolGradCpu") {
attr->poolingMode = schema::PoolMode_MEAN_POOLING;
} else {
attr->poolingMode = schema::PoolMode_MAX_POOLING;
}

@@ -202,6 +202,7 @@
#include "src/ops/smooth_l1_loss_grad.h"
#include "src/ops/sigmoid_cross_entropy_with_logits.h"
#include "src/ops/sigmoid_cross_entropy_with_logits_grad.h"
#include "src/ops/strided_slice_grad.h"
#endif
#endif
namespace mindspore {
@@ -724,6 +725,8 @@ std::shared_ptr<PrimitiveC> PrimitiveC::Create(const Primitive &prim, const std:
return NewPrimitiveC<SigmoidCrossEntropyWithLogitsGrad>(prim, inputs, quantType);
} else if (op_type == "Pad") {
return NewPrimitiveC<Pad>(prim, inputs, quantType);
} else if (op_type == "StridedSliceGrad") {
return NewPrimitiveC<StridedSliceGrad>(prim, inputs, quantType);
#else
} else if (op_type == "Conv2DBackpropInput") {
return NewPrimitiveC<DeConv2D>(prim, inputs, quantType);
@@ -1102,6 +1105,8 @@ PrimitiveC *PrimitiveC::Create(mindspore::schema::PrimitiveT *primitive) {
return new (std::nothrow) SigmoidCrossEntropyWithLogits(primitive);
case schema::PrimitiveType_SigmoidCrossEntropyWithLogitsGrad:
return new (std::nothrow) SigmoidCrossEntropyWithLogitsGrad(primitive);
case schema::PrimitiveType_StridedSliceGrad:
return new (std::nothrow) StridedSliceGrad(primitive);
#endif
default:
MS_LOG(ERROR) << "Unsupported primitive type in Create : " << schema::EnumNamePrimitiveType(op_type);

File diff suppressed because it is too large

@@ -0,0 +1,64 @@
/**
* Copyright 2019-2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_SRC_OPS_STRIDED_SLICE_GRAD_H_
#define MINDSPORE_LITE_SRC_OPS_STRIDED_SLICE_GRAD_H_
#include <vector>
#include <set>
#include <cmath>
#include <memory>
#include "src/ops/strided_slice.h"
namespace mindspore {
namespace lite {
class StridedSliceGrad : public StridedSlice {
public:
StridedSliceGrad() = default;
~StridedSliceGrad() = default;
#ifdef PRIMITIVE_WRITEABLE
MS_DECLARE_PARENT(StridedSliceGrad, StridedSlice);
explicit StridedSliceGrad(schema::PrimitiveT *primitive) : StridedSlice(primitive) {}
void SetBeginMask(int begin_mask);
void SetEndMask(int end_mask);
void SetEllipsisMask(int ellipsis_mask);
void SetNewAxisMask(int new_axis_mask);
void SetShrinkAxisMask(int shrink_axis_mask);
void SetBegin(const std::vector<int> &begin);
void SetEnd(const std::vector<int> &end);
void SetStride(const std::vector<int> &stride);
void SetIsScale(const std::vector<int> &is_scale);
int UnPackAttr(const Primitive &prim, const std::vector<AnfNodePtr> &inputs);
#else
int UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) override;
#endif
int InferShape(std::vector<lite::Tensor *> inputs_, std::vector<lite::Tensor *> outputs_) override;
// bool CheckInputs(std::vector<lite::Tensor *> inputs_);
int GetBeginMask() const;
int GetEndMask() const;
int GetEllipsisMask() const;
int GetNewAxisMask() const;
int GetShrinkAxisMask() const;
std::vector<int> GetBegin() const;
std::vector<int> GetEnd() const;
std::vector<int> GetStride() const;
std::vector<int> GetIsScale() const;
};
} // namespace lite
} // namespace mindspore
#endif // MINDSPORE_LITE_SRC_OPS_STRIDED_SLICE_GRAD_H_

@@ -91,10 +91,12 @@ int ConvolutionWinogradCPUKernel::InitWeightBias() {
// init bias
size_t new_bias_size = oc4 * C4NUM * sizeof(float);
bias_data_ = reinterpret_cast<float *>(malloc(new_bias_size));
if (bias_data_ == nullptr) {
MS_LOG(ERROR) << "malloc bias_data_ failed.";
return RET_MEMORY_FAILED;
bias_data_ = reinterpret_cast<float *>(malloc(new_bias_size));
if (bias_data_ == nullptr) {
MS_LOG(ERROR) << "malloc bias_data_ failed.";
return RET_MEMORY_FAILED;
}
}
memset(bias_data_, 0, new_bias_size);
if (in_tensors_.size() == kInputSize2) {

@@ -91,10 +91,6 @@ int FusedBatchnormCPUKernel::Run() {
memcpy(scale_, scale, in_tensors_[1]->Size());
memcpy(offset_, offset, in_tensors_[2]->Size());
// save for next iteration
memcpy(in_tensors_[3]->MutableData(), save_mean, in_tensors_[3]->Size());
memcpy(in_tensors_[4]->MutableData(), save_variance, in_tensors_[4]->Size());
trained_ = true; // trained at least once
}
auto ret = ParallelLaunch(this->context_->thread_pool_, BatchNormRun, this, op_parameter_->thread_num_);

@@ -40,17 +40,16 @@ int ApplyMomentumCPUKernel::Execute(int task_id) {
size_t stride = UP_DIV(length, thread_count_);
size_t count = MSMIN(stride, length - stride * task_id);
size_t start = stride * task_id;
size_t end = start + count;
if (apply_momentum_param_->use_nesterov_) {
for (size_t i = start; i < end; ++i) {
for (size_t i = start; i < end; i++) {
accumulate[i] = accumulate[i] * moment + gradient[i];
weight[i] -= (accumulate[i] * moment + gradient[i]) * learning_rate;
}
} else {
for (size_t i = start; i < end; ++i) {
for (size_t i = start; i < end; i++) {
accumulate[i] = accumulate[i] * moment + gradient[i];
weight[i] -= accumulate[i] * learning_rate;
}

@@ -18,6 +18,10 @@
#include <math.h>
#include <algorithm>
#include <vector>
#include <thread>
#include <fstream>
#include "schema/model_generated.h"
#include "src/kernel_registry.h"
#include "nnacl/fp32_grad/batch_norm.h"
@@ -34,7 +38,8 @@ namespace mindspore::kernel {
int BNGradCPUKernel::ReSize() {
auto *input_x = in_tensors_.at(1);
int channels = input_x->shape().at(kNHWC_C);
set_workspace_size(2 * channels * sizeof(float));
ws_size_ = 2 * channels;
set_workspace_size(ws_size_ * sizeof(float));
return RET_OK;
}
@@ -46,7 +51,9 @@ int BNGradCPUKernel::Execute(int task_id) {
auto *input_scale = in_tensors_.at(2);
auto *input_mean = in_tensors_.at(3);
auto *input_var = in_tensors_.at(4);
auto bn_param = reinterpret_cast<BNGradParameter *>(op_parameter_);
int stage = stage_;
int thread_num = thread_num_;
float *save_mean = reinterpret_cast<float *>(input_mean->MutableData());
float *save_var = reinterpret_cast<float *>(input_var->MutableData());
@@ -58,26 +65,57 @@ int BNGradCPUKernel::Execute(int task_id) {
int32_t spatial = input_x->Height() * input_x->Width();
float *workspace_temp = static_cast<float *>(workspace());
std::fill(workspace_temp, workspace_temp + workspace_size() / sizeof(*workspace_temp), 0.f);
float *dxhat_sum = workspace_temp;
float *dxhathat_sum = dxhat_sum + channels;
float *x = reinterpret_cast<float *>(input_x->MutableData());
float *yt = reinterpret_cast<float *>(input_yt->MutableData());
float *scale = reinterpret_cast<float *>(input_scale->MutableData());
float *dx = reinterpret_cast<float *>(output_dx->MutableData());
float *dbias = reinterpret_cast<float *>(output_bias->MutableData());
float *dscale = reinterpret_cast<float *>(output_scale->MutableData());
std::fill(dbias, dbias + channels, 0.f);
std::fill(dscale, dscale + channels, 0.f);
backwardAll(x, yt, save_mean, save_var, scale, batch * spatial, channels, dxhat_sum, dxhathat_sum, dbias, dscale, dx);
int total = spatial * batch;
int stride = UP_DIV(total, thread_num);
int count = MSMIN(stride, total - stride * task_id);
switch (stage) {
case 0: {
for (int job = task_id; job < 4; job += thread_num) {
switch (job) {
case 0:
var2Invar(save_var, input_var->ElementsNum(), bn_param->epsilon_);
break;
case 1:
std::fill(workspace_temp, workspace_temp + ws_size_, 0.f);
break;
case 2:
std::fill(dbias, dbias + channels, 0.f);
break;
case 3:
std::fill(dscale, dscale + channels, 0.f);
break;
}
}
if (thread_num == 1) {
backwardAll(x, yt, save_mean, save_var, scale, total, channels, dxhat_sum, dxhathat_sum, dbias, dscale, dx);
}
break;
}
case 1: {
backwardP1(x, yt, save_mean, save_var, scale, total, channels, dxhat_sum, dxhathat_sum, dbias, dscale);
break;
}
case 2: {
backwardP2(x + task_id * stride * channels, yt + task_id * stride * channels, save_mean, save_var, scale, count,
total, channels, dxhat_sum, dxhathat_sum, dx + task_id * stride * channels);
break;
}
}
return RET_OK;
}
int BNGradRun(void *cdata, int task_id) {
MS_ASSERT(cdata != nullptr);
auto bn_kernel = reinterpret_cast<BNGradCPUKernel *>(cdata);
auto error_code = bn_kernel->Execute(task_id);
if (error_code != RET_OK) {
MS_LOG(ERROR) << "BNGradRun error task_id[" << task_id << "] error_code[" << error_code << "]";
@@ -87,15 +125,24 @@ int BNGradRun(void *cdata, int task_id) {
}
int BNGradCPUKernel::Run() {
auto *input_var = in_tensors_.at(4);
float *save_var = reinterpret_cast<float *>(input_var->MutableData());
auto bn_param = reinterpret_cast<BNGradParameter *>(op_parameter_);
float eps = bn_param->epsilon_;
var2Invar(save_var, input_var->ElementsNum(), eps);
int error_code = ParallelLaunch(this->context_->thread_pool_, BNGradRun, this, 1);
if (error_code != RET_OK) {
MS_LOG(ERROR) << "BN function error error_code[" << error_code << "]";
return RET_ERROR;
stage_ = 0;
thread_num_ = context_->thread_num_;
if (thread_num_ == 1) {
int error_code = ParallelLaunch(this->context_->thread_pool_, BNGradRun, this, thread_num_);
if (error_code != RET_OK) {
MS_LOG(ERROR) << "BN function error error_code[" << error_code << "]";
return RET_ERROR;
}
} else {
const std::vector<int> threads = {thread_num_, 1, thread_num_};
for (size_t stage = 0; stage < threads.size(); stage++) {
stage_ = static_cast<int>(stage);
int error_code = ParallelLaunch(this->context_->thread_pool_, BNGradRun, this, threads.at(stage));
if (error_code != RET_OK) {
MS_LOG(ERROR) << "BN function error error_code[" << error_code << "]";
return RET_ERROR;
}
}
}
return RET_OK;
}

@@ -33,6 +33,11 @@ class BNGradCPUKernel : public LiteKernel {
int ReSize() override;
int Run() override;
int Execute(int task_id);
private:
int thread_num_ = 1;
int stage_ = 0;
size_t ws_size_ = 0;
};
} // namespace mindspore::kernel
#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_GRAD_BN_GRAD_H_

@@ -54,9 +54,6 @@ int ConvolutionTrainCPUKernel::ReSize() {
conv_param_->group_ = (conv_param_->group_ == 0) ? conv_param_->input_channel_ : conv_param_->group_;
const int n = conv_param_->output_channel_ * conv_param_->group_;
const int k = conv_param_->kernel_h_ * conv_param_->kernel_w_ * conv_param_->input_channel_ / conv_param_->group_;
ws_size_ = chunk_ * k;
int mat_alloc = MatSizeTotal(chunk_, n, k, 0);
set_workspace_size((ws_size_ + mat_alloc) * sizeof(float));
do_img2col_ = (conv_param_->kernel_h_ == 1) && (conv_param_->kernel_w_ == 1) && (conv_param_->pad_d_ == 0) &&
(conv_param_->pad_u_ == 0) && (conv_param_->pad_l_ == 0) && (conv_param_->pad_r_ == 0) &&
@@ -64,6 +61,16 @@ int ConvolutionTrainCPUKernel::ReSize() {
(conv_param_->stride_h_ == 1) && (conv_param_->stride_w_ == 1) && (conv_param_->group_ == 1)
? false
: true;
do_dw_ = (conv_param_->output_channel_ == conv_param_->group_) &&
(conv_param_->input_channel_ == conv_param_->output_channel_) && (conv_param_->dilation_h_ == 1) &&
(conv_param_->dilation_w_ == 1)
? true
: false;
ws_size_ = chunk_ * conv_param_->kernel_h_ * conv_param_->kernel_w_ * conv_param_->input_channel_;
ws_size_ = do_dw_ ? ws_size_ : ws_size_ / conv_param_->group_;
int mat_alloc = MatSizeTotal(chunk_, n, k, 0);
set_workspace_size((ws_size_ + mat_alloc) * sizeof(float));
return RET_OK;
}
@@ -97,7 +104,25 @@ int ConvolutionTrainCPUKernel::Execute(int task_id) {
float *workspace_temp = static_cast<float *>(workspace());
float *mat_workspace = workspace_temp + ws_size_;
if (do_img2col_) {
if (do_dw_) {
const int kernel_spatial = k_h * k_w;
for (int i = 0; i < batch; ++i) {
for (int ci = 0; ci < m; ci += chunk_) {
int real_chunk = MSMIN(m - ci, chunk_);
float *mat_a = workspace_temp;
float *im = x_addr + (i * in_ch * in_h * in_w);
RollingIm2ColPackDwUnitFp32(im, conv_param_, mat_a, real_chunk, ci);
for (int j = 0; j < groups; ++j) {
const float *mat_b = w_addr + j * nweights / groups;
float *mat_c = y_addr + (i * groups) * n * m + j * (out_ch / groups) + ci * out_ch;
// float *im = x_addr + i * in_ch * in_h * in_w + j * (in_ch / groups);
// RollingIm2ColPackUnitFp32(im, conv_param_, mat_a, real_chunk, ci);
GemmMatmul(0, 1, real_chunk, n, k, 1, mat_a + (j * kernel_spatial), k * groups, mat_b, k, 0, mat_c, out_ch,
mat_workspace);
}
}
}
} else if (do_img2col_) {
for (int i = 0; i < batch; ++i) {
for (int j = 0; j < groups; ++j) {
for (int ci = 0; ci < m; ci += chunk_) {

@@ -37,6 +37,7 @@ class ConvolutionTrainCPUKernel : public LiteKernel {
private:
int ws_size_ = 0;
bool do_img2col_ = true;
bool do_dw_ = false;
#ifdef ENABLE_ARM32
const int chunk_ = C4NUM * 2;
#else

Some files were not shown because too many files have changed in this diff
