!5202 [MS][LITE][Develop]fp16 pooling optimize

Merge pull request !5202 from ling/sr
5 years ago · 572c8fa22f
parent 25d5423640 36ebe58819
commit 572c8fa22f
8 changed files with 123 additions and 823 deletions
--- a/mindspore/lite/nnacl/assembly/arm64/matrix_add.S
+++ b/mindspore/lite/nnacl/assembly/arm64/matrix_add.S
@ -1,103 +0,0 @@
-
-#ifdef __aarch64__
-    .text
-    .align 5
-    //.p2align 5,,15
-    .global MatrixAdd
-#ifndef __APPLE__
-    .type MatrixAdd, %function
-#endif
-
-
-
-//void MatrixAdd(const float* matDataA, const float* matDataB, float* matDataC,
-// size_t aStride, size_t bStride, size_t cStride, size_t width, size_t height)
-
-//Auto: x0: matDataA, x1:matDataB, x2:matDatac,
-//x3:aStride, x4:bStride, x5:cStride, x6:width, x7:height
-
-MatrixAdd:
-mov x12, #4 //sizeof(float)
-mul x3, x12, x3
-mul x4, x12, x4
-mul x5, x12, x5
-
-loopH:
-mov x8, x0
-mov x9, x1
-mov x10, x2
-
-mov x11, x6
-
-loop16LineIn:
-cmp x11, #4
-blt L8
-sub x11, x11, #4
-ld1 {v0.4s, v1.4s}, [x0], #32
-ld1 {v2.4s, v3.4s}, [x1], #32
-
-fadd v4.4s, v0.4s, v2.4s
-fadd v5.4s, v1.4s, v3.4s
-
-ld1 {v6.4s, v7.4s}, [x0], #32
-ld1 {v8.4s, v9.4s}, [x1], #32
-
-cmp x11, #4
-blt loop16LineOut
-
-loop16:
-st1 {v4.4s, v5.4s}, [x2], #32
-fadd v10.4s, v6.4s, v8.4s
-fadd v11.4s, v7.4s, v9.4s
-ld1 {v0.4s, v1.4s}, [x0], #32
-ld1 {v2.4s, v3.4s}, [x1], #32
-
-st1 {v10.4s, v11.4s}, [x2], #32
-fadd v4.4s, v0.4s, v2.4s
-fadd v5.4s, v1.4s, v3.4s
-ld1 {v6.4s, v7.4s}, [x0], #32
-ld1 {v8.4s, v9.4s}, [x1], #32
-
-sub x11, x11, #4
-cmp x11, #4
-bge loop16
-
-loop16LineOut:
-st1 {v4.4s, v5.4s}, [x2], #32
-fadd v10.4s, v6.4s, v8.4s
-fadd v11.4s, v7.4s, v9.4s
-st1 {v10.4s, v11.4s}, [x2], #32
-
-
-L8:
-cmp x11, #2
-blt L4
-ld1 {v0.4s, v1.4s}, [x0], #32
-ld1 {v2.4s, v3.4s}, [x1], #32
-fadd v4.4s, v0.4s, v2.4s
-fadd v5.4s, v1.4s, v3.4s
-sub x11, x11, #2
-st1 {v4.4s, v5.4s}, [x2], #32
-
-
-cmp x11, #0
-beq loop16EndLine
-
-L4:
-ld1 {v0.4s}, [x0], #16
-ld1 {v1.4s}, [x1], #16
-fadd v0.4s, v0.4s, v1.4s
-sub x11, x11, #1
-st1 {v0.4s}, [x2], #16
-//bne L4
-
-loop16EndLine:
-add x0, x8, x3
-add x1, x9, x4
-add x2, x10, x5
-
-subs x7, x7, #1
-bne loopH
-
-ret
-#endif
--- a/mindspore/lite/nnacl/assembly/arm64/matrix_sub.S
+++ b/mindspore/lite/nnacl/assembly/arm64/matrix_sub.S
@ -1,105 +0,0 @@
-
-#ifdef __aarch64__
-    .text
-    .align 5
-    //.p2align 5,,15
-    .global MatrixSub
-#ifndef __APPLE__
-    .type MatrixSub, %function
-#endif
-
-
-
-//void MatrixSub(const float* matDataA, const float* matDataB, float* matDataC,
-// size_t aStride, size_t bStride, size_t cStride, size_t width, size_t height)
-
-//Auto: x0: matDataA, x1:matDataB, x2:matDatac,
-//x3:aStride, x4:bStride, x5:cStride, x6:width, x7:height
-
-MatrixSub:
-mov x12, #4 //sizeof(float)
-mul x3, x12, x3
-mul x4, x12, x4
-mul x5, x12, x5
-
-loopH:
-mov x8, x0
-mov x9, x1
-mov x10, x2
-
-mov x11, x6
-
-loop16LineIn:
-cmp x11, #4
-blt L8
-sub x11, x11, #4
-ld1 {v0.4s, v1.4s}, [x0], #32
-ld1 {v2.4s, v3.4s}, [x1], #32
-
-fsub v4.4s, v0.4s, v2.4s
-fsub v5.4s, v1.4s, v3.4s
-
-ld1 {v6.4s, v7.4s}, [x0], #32
-ld1 {v8.4s, v9.4s}, [x1], #32
-
-cmp x11, #4
-blt loop16LineOut
-
-loop16:
-st1 {v4.4s, v5.4s}, [x2], #32
-fsub v10.4s, v6.4s, v8.4s
-fsub v11.4s, v7.4s, v9.4s
-ld1 {v0.4s, v1.4s}, [x0], #32
-ld1 {v2.4s, v3.4s}, [x1], #32
-
-st1 {v10.4s, v11.4s}, [x2], #32
-fsub v4.4s, v0.4s, v2.4s
-fsub v5.4s, v1.4s, v3.4s
-ld1 {v6.4s, v7.4s}, [x0], #32
-ld1 {v8.4s, v9.4s}, [x1], #32
-
-sub x11, x11, #4
-cmp x11, #4
-bge loop16
-
-loop16LineOut:
-st1 {v4.4s, v5.4s}, [x2], #32
-fsub v10.4s, v6.4s, v8.4s
-fsub v11.4s, v7.4s, v9.4s
-st1 {v10.4s, v11.4s}, [x2], #32
-
-L8:
-cmp x11, #2
-blt L4
-
-ld1 {v0.4s, v1.4s}, [x0], #32
-ld1 {v2.4s, v3.4s}, [x1], #32
-
-fsub v4.4s, v0.4s, v2.4s
-fsub v5.4s, v1.4s, v3.4s
-
-sub x11, x11, #2
-st1 {v4.4s, v5.4s}, [x2], #32
-
-
-cmp x11, #0
-beq loop16EndLine
-
-L4:
-ld1 {v0.4s}, [x0], #16
-ld1 {v1.4s}, [x1], #16
-fsub v0.4s, v0.4s, v1.4s
-sub x11, x11, #1
-st1 {v0.4s}, [x2], #16
-
-
-loop16EndLine:
-add x0, x8, x3
-add x1, x9, x4
-add x2, x10, x5
-
-subs x7, x7, #1
-bne loopH
-
-ret
-#endif
--- a/mindspore/lite/nnacl/fp16/pooling_fp16.c
+++ b/mindspore/lite/nnacl/fp16/pooling_fp16.c
@ -51,6 +51,12 @@ void AvgPoolingFp16(const float16_t *input_ptr, float16_t *output_ptr, PoolingPa
        int in_w_index = out_w_index * stride_w - pad_w;
        int in_h_index = out_h_index * stride_h - pad_h;
        int out_plane_offset = out_batch_offset + index * channel;
+
+        int real_win_h_start = MSMAX(0, -in_h_index);
+        int real_win_h_end = MSMIN(win_h, in_h - in_h_index);
+        int resl_win_w_start = MSMAX(0, -in_w_index);
+        int real_win_w_end = MSMIN(win_w, in_w - in_w_index);
+
        for (int j = 0; j < c8; j++) {
          int in_channel_offset = in_batch_offset + j * C8NUM;
          int out_channel_offset = out_plane_offset + j * C8NUM;
@ -60,22 +66,17 @@ void AvgPoolingFp16(const float16_t *input_ptr, float16_t *output_ptr, PoolingPa
          float16_t tmp_avg[8]{0};
 #endif
          int real_count = 0;
-          for (int h = 0; h < win_h; h++) {
-            for (int w = 0; w < win_w; w++) {
-              if ((in_h_index + h) < 0 || (in_h_index + h) >= in_h || (in_w_index + w) < 0 ||
-                  (in_w_index + w) >= in_w) {
-                continue;
-              } else {
-                int in_offset = in_channel_offset + ((in_h_index + h) * in_w + in_w_index + w) * channel;
+          for (int h = real_win_h_start; h < real_win_h_end; h++) {
+            for (int w = resl_win_w_start; w < real_win_w_end; w++) {
+              int in_offset = in_channel_offset + ((in_h_index + h) * in_w + in_w_index + w) * channel;
 #ifdef ENABLE_NEON
-                tmp_avg = vaddq_f16(tmp_avg, vld1q_f16(input_ptr + in_offset));
+              tmp_avg = vaddq_f16(tmp_avg, vld1q_f16(input_ptr + in_offset));
 #else
-                for (int t = 0; t < 8; t++) {
-                  tmp_avg[t] += *(input_ptr + in_offset + t);
-                }
-#endif
-                ++real_count;
+              for (int t = 0; t < 8; t++) {
+                tmp_avg[t] += *(input_ptr + in_offset + t);
              }
+#endif
+              ++real_count;
            }  // win_w loop
          }    // win_h loop
 #ifdef ENABLE_NEON
@ -97,22 +98,17 @@ void AvgPoolingFp16(const float16_t *input_ptr, float16_t *output_ptr, PoolingPa
          float16_t tmp_avg[4]{0};
 #endif
          int real_count = 0;
-          for (int h = 0; h < win_h; h++) {
-            for (int w = 0; w < win_w; w++) {
-              if ((in_h_index + h) < 0 || (in_h_index + h) >= in_h || (in_w_index + w) < 0 ||
-                  (in_w_index + w) >= in_w) {
-                continue;
-              } else {
-                int in_offset = in_channel_offset + ((in_h_index + h) * in_w + in_w_index + w) * channel;
+          for (int h = real_win_h_start; h < real_win_h_end; h++) {
+            for (int w = resl_win_w_start; w < real_win_w_end; w++) {
+              int in_offset = in_channel_offset + ((in_h_index + h) * in_w + in_w_index + w) * channel;
 #ifdef ENABLE_NEON
-                tmp_avg = vadd_f16(tmp_avg, vld1_f16(input_ptr + in_offset));
+              tmp_avg = vadd_f16(tmp_avg, vld1_f16(input_ptr + in_offset));
 #else
-                for (int j = 0; j < C4NUM; ++j) {
-                  tmp_avg[j] += *(input_ptr + in_offset);
-                }
-#endif
-                ++real_count;
+              for (int j = 0; j < C4NUM; ++j) {
+                tmp_avg[j] += *(input_ptr + in_offset);
              }
+#endif
+              ++real_count;
            }  // win_w loop
          }    // win_h loop
 #ifdef ENABLE_NEON
@ -130,16 +126,11 @@ void AvgPoolingFp16(const float16_t *input_ptr, float16_t *output_ptr, PoolingPa
          int out_channel_offset = out_plane_offset + k;
          float16_t tmp_avg = 0;
          int real_count = 0;
-          for (int h = 0; h < win_h; h++) {
-            for (int w = 0; w < win_w; w++) {
-              if ((in_h_index + h) < 0 || (in_h_index + h) >= in_h || (in_w_index + w) < 0 ||
-                  (in_w_index + w) >= in_w) {
-                continue;
-              } else {
-                int in_offset = in_channel_offset + ((in_h_index + h) * in_w + in_w_index + w) * channel;
-                tmp_avg += *(input_ptr + in_offset);
-                ++real_count;
-              }
+          for (int h = real_win_h_start; h < real_win_h_end; h++) {
+            for (int w = resl_win_w_start; w < real_win_w_end; w++) {
+              int in_offset = in_channel_offset + ((in_h_index + h) * in_w + in_w_index + w) * channel;
+              tmp_avg += *(input_ptr + in_offset);
+              ++real_count;
            }  // win_w loop
          }    // win_h loop
          *(output_ptr + out_channel_offset) = tmp_avg / (float16_t)real_count;
@ -148,7 +139,6 @@ void AvgPoolingFp16(const float16_t *input_ptr, float16_t *output_ptr, PoolingPa
    }      // out_plane loop
  }        // out_batch loop
 }
-
 void MaxPoolingFp16(const float16_t *input_ptr, float16_t *output_ptr, PoolingParameter *pooling_param, int task_id) {
  int stride_w = pooling_param->stride_w_;
  int stride_h = pooling_param->stride_h_;
@ -183,6 +173,12 @@ void MaxPoolingFp16(const float16_t *input_ptr, float16_t *output_ptr, PoolingPa
        int in_w_index = out_w_index * stride_w - pad_w;
        int in_h_index = out_h_index * stride_h - pad_h;
        int out_plane_offset = out_batch_offset + index * channel;
+
+        int real_win_h_start = MSMAX(0, -in_h_index);
+        int real_win_h_end = MSMIN(win_h, in_h - in_h_index);
+        int resl_win_w_start = MSMAX(0, -in_w_index);
+        int real_win_w_end = MSMIN(win_w, in_w - in_w_index);
+
        for (int j = 0; j < c8; j++) {
          int in_channel_offset = in_batch_offset + j * C8NUM;
          int out_channel_offset = out_plane_offset + j * C8NUM;
@ -191,21 +187,16 @@ void MaxPoolingFp16(const float16_t *input_ptr, float16_t *output_ptr, PoolingPa
 #else
          float16_t tmp_max[8]{-FLT_MAX};
 #endif
-          for (int h = 0; h < win_h; h++) {
-            for (int w = 0; w < win_w; w++) {
-              if ((in_h_index + h) < 0 || (in_h_index + h) >= in_h || (in_w_index + w) < 0 ||
-                  (in_w_index + w) >= in_w) {
-                continue;
-              } else {
-                int in_offset = in_channel_offset + ((in_h_index + h) * in_w + in_w_index + w) * channel;
+          for (int h = real_win_h_start; h < real_win_h_end; h++) {
+            for (int w = resl_win_w_start; w < real_win_w_end; w++) {
+              int in_offset = in_channel_offset + ((in_h_index + h) * in_w + in_w_index + w) * channel;
 #ifdef ENABLE_NEON
-                tmp_max = vmaxq_f16(tmp_max, vld1q_f16(input_ptr + in_offset));
+              tmp_max = vmaxq_f16(tmp_max, vld1q_f16(input_ptr + in_offset));
 #else
-                for (int k = 0; k < C8NUM; k++) {
-                  tmp_max[k] = fmax(tmp_max[k], *(input_ptr + in_offset + k));
-                }
-#endif
+              for (int k = 0; k < C8NUM; k++) {
+                tmp_max[k] = fmax(tmp_max[k], *(input_ptr + in_offset + k));
              }
+#endif
            }  // win_w loop
          }    // win_h loop
 #ifdef ENABLE_NEON
@ -226,21 +217,16 @@ void MaxPoolingFp16(const float16_t *input_ptr, float16_t *output_ptr, PoolingPa
 #else
          float16_t tmp_max[4]{-FLT_MAX};
 #endif
-          for (int h = 0; h < win_h; h++) {
-            for (int w = 0; w < win_w; w++) {
-              if ((in_h_index + h) < 0 || (in_h_index + h) >= in_h || (in_w_index + w) < 0 ||
-                  (in_w_index + w) >= in_w) {
-                continue;
-              } else {
-                int in_offset = in_channel_offset + ((in_h_index + h) * in_w + in_w_index + w) * channel;
+          for (int h = real_win_h_start; h < real_win_h_end; h++) {
+            for (int w = resl_win_w_start; w < real_win_w_end; w++) {
+              int in_offset = in_channel_offset + ((in_h_index + h) * in_w + in_w_index + w) * channel;
 #ifdef ENABLE_NEON
-                tmp_max = vmax_f16(tmp_max, vld1_f16(input_ptr + in_offset));
+              tmp_max = vmax_f16(tmp_max, vld1_f16(input_ptr + in_offset));
 #else
-                for (int k = 0; k < C4NUM; k++) {
-                  tmp_max[k] = fmax(tmp_max[k], *(input_ptr + in_offset + k));
-                }
-#endif
+              for (int k = 0; k < C4NUM; k++) {
+                tmp_max[k] = fmax(tmp_max[k], *(input_ptr + in_offset + k));
              }
+#endif
            }  // win_w loop
          }    // win_h loop
 #ifdef ENABLE_NEON
@ -257,15 +243,10 @@ void MaxPoolingFp16(const float16_t *input_ptr, float16_t *output_ptr, PoolingPa
          int in_channel_offset = in_batch_offset + k;
          int out_channel_offset = out_plane_offset + k;
          float16_t tmp_max = -FLT_MAX;
-          for (int h = 0; h < win_h; h++) {
-            for (int w = 0; w < win_w; w++) {
-              if ((in_h_index + h) < 0 || (in_h_index + h) >= in_h || (in_w_index + w) < 0 ||
-                  (in_w_index + w) >= in_w) {
-                continue;
-              } else {
-                int in_offset = in_channel_offset + ((in_h_index + h) * in_w + in_w_index + w) * channel;
-                tmp_max = fmax(tmp_max, *(input_ptr + in_offset));
-              }
+          for (int h = real_win_h_start; h < real_win_h_end; h++) {
+            for (int w = resl_win_w_start; w < real_win_w_end; w++) {
+              int in_offset = in_channel_offset + ((in_h_index + h) * in_w + in_w_index + w) * channel;
+              tmp_max = fmax(tmp_max, *(input_ptr + in_offset));
            }  // win_w loop
          }    // win_h loop
          *(output_ptr + out_channel_offset) = tmp_max;
--- a/mindspore/lite/nnacl/fp32/common_func.c
+++ b/mindspore/lite/nnacl/fp32/common_func.c
@ -15,54 +15,6 @@
 */

 #include "nnacl/fp32/common_func.h"
-
-#ifndef ENABLE_ARM64
-void MatrixAdd(const float *a_ptr, const float *b_ptr, float *dst, size_t a_stride, size_t b_stride, size_t c_stride,
-               size_t row, size_t col) {
-  for (int r = 0; r < row; r++) {
-    for (int c = 0; c < col; c++) {
-      int a_index = c * a_stride + r * C4NUM;
-      int b_index = c * b_stride + r * C4NUM;
-      int c_index = c * c_stride + r * C4NUM;
-      for (int i = 0; i < C4NUM; i++) {
-        dst[c_index + i] = a_ptr[a_index + i] + b_ptr[b_index + i];
-      }
-    }
-  }
-  return;
-}
-
-void MatrixSub(const float *a_ptr, const float *b_ptr, float *dst, size_t a_stride, size_t b_stride, size_t c_stride,
-               size_t row, size_t col) {
-  for (int r = 0; r < row; r++) {
-    for (int c = 0; c < col; c++) {
-      int a_index = c * a_stride + r * C4NUM;
-      int b_index = c * b_stride + r * C4NUM;
-      int c_index = c * c_stride + r * C4NUM;
-      for (int i = 0; i < C4NUM; i++) {
-        dst[c_index + i] = a_ptr[a_index + i] - b_ptr[b_index + i];
-      }
-    }
-  }
-  return;
-}
-#endif
-
-void MatrixMultiAdd(float *c11, float *c12, float *c21, float *c22, float *x_ptr, size_t row, size_t col,
-                    size_t c_stride, size_t x_stride) {
-  /* U2 = P1 + P6 */
-  MatrixAdd(x_ptr, c12, c12, x_stride, c_stride, c_stride, row, col);
-  /* U3 = U2 + P7 */
-  MatrixAdd(c12, c21, c21, c_stride, c_stride, c_stride, row, col);
-  /* U4 = U2 + P5 */
-  MatrixAdd(c12, c22, c12, c_stride, c_stride, c_stride, row, col);
-  /* U7 = U3 + P5 */
-  MatrixAdd(c21, c22, c22, c_stride, c_stride, c_stride, row, col);
-  /* U5 = U4 + P3 */
-  MatrixAdd(c12, c11, c12, c_stride, c_stride, c_stride, row, col);
-  return;
-}
-
 void PostConvFuncComm(const float *src_ptr_, float *out_ptr, const float *bias_ptr, size_t output_channel,
                      size_t plane_size, size_t stride, bool is_relu, bool is_relu6, int size) {
  for (int oc = 0; oc < output_channel; oc++) {
--- a/mindspore/lite/nnacl/fp32/common_func.h
+++ b/mindspore/lite/nnacl/fp32/common_func.h
@ -31,12 +31,6 @@ void PostConvFuncFp32C4(const float *c4_out_ptr, float *out_ptr, const float *bi
                        size_t plane_size, size_t stride, bool is_relu, bool is_relu6);
 void PostConvFuncFp32C8(const float *c8_out_ptr, float *out_ptr, const float *bias_ptr, size_t output_channel,
                        size_t plane_size, size_t stride, bool is_relu, bool is_relu6);
-void MatrixAdd(const float *a_ptr, const float *b_ptr, float *dst, size_t a_stride, size_t b_stride, size_t c_stride,
-               size_t row, size_t col);
-void MatrixSub(const float *a_ptr, const float *b_ptr, float *dst, size_t a_stride, size_t b_stride, size_t c_stride,
-               size_t row, size_t col);
-void MatrixMultiAdd(float *c11, float *c12, float *c21, float *c22, float *x_ptr, size_t row, size_t col,
-                    size_t c_stride, size_t x_stride);
 float ShortToFloat32(uint16_t srcValue);

 uint16_t Float32ToShort(float srcValue);
--- a/mindspore/lite/nnacl/fp32/pooling.c
+++ b/mindspore/lite/nnacl/fp32/pooling.c
--- a/mindspore/lite/nnacl/fp32/pooling.h
+++ b/mindspore/lite/nnacl/fp32/pooling.h
@ -27,17 +27,10 @@
 #ifdef __cplusplus
 extern "C" {
 #endif
-void AvgPooling(const float *input_ptr, float *output_ptr, PoolingParameter *pooling_param, int task_id);
-
-void MaxPooling(const float *input_ptr, float *output_ptr, PoolingParameter *pooling_param, int task_id);
-
-void AvgPoolingRelu(const float *input_ptr, float *output_ptr, PoolingParameter *pooling_param, int task_id);
-
-void MaxPoolingRelu(const float *input_ptr, float *output_ptr, PoolingParameter *pooling_param, int task_id);
-
-void AvgPoolingRelu6(const float *input_ptr, float *output_ptr, PoolingParameter *pooling_param, int task_id);
-
-void MaxPoolingRelu6(const float *input_ptr, float *output_ptr, PoolingParameter *pooling_param, int task_id);
+void AvgPooling(const float *input_ptr, float *output_ptr, PoolingParameter *pooling_param, int task_id, float minf,
+                float maxf);
+void MaxPooling(const float *input_ptr, float *output_ptr, PoolingParameter *pooling_param, int task_id, float minf,
+                float maxf);
 #ifdef __cplusplus
 }
 #endif
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/pooling.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/pooling.cc
@ -15,6 +15,7 @@
 */

 #include "src/runtime/kernel/arm/fp32/pooling.h"
+#include <float.h>
 #include "nnacl/fp32/pooling.h"
 #include "src/kernel_registry.h"
 #include "src/runtime/runtime_api.h"
@ -52,28 +53,18 @@ int PoolingCPUKernel::ReSize() {
 int PoolingCPUKernel::RunImpl(int task_id) {
  auto input_ptr = reinterpret_cast<float *>(in_tensors_.at(kInputIndex)->Data());
  auto output_ptr = reinterpret_cast<float *>(out_tensors_.at(kOutputIndex)->Data());
+  float minf = -FLT_MAX;
+  float maxf = FLT_MAX;
+  if (pooling_param_->act_type_ == ActType_Relu) {
+    minf = 0.f;
+  } else if (pooling_param_->act_type_ == ActType_Relu6) {
+    minf = 0.f;
+    maxf = 6.f;
+  }
  if (pooling_param_->pool_mode_ == PoolMode_MaxPool) {
-    switch (pooling_param_->act_type_) {
-      case ActType_Relu:
-        MaxPoolingRelu(input_ptr, output_ptr, pooling_param_, task_id);
-        break;
-      case ActType_Relu6:
-        MaxPoolingRelu6(input_ptr, output_ptr, pooling_param_, task_id);
-        break;
-      default:
-        MaxPooling(input_ptr, output_ptr, pooling_param_, task_id);
-    }
+    MaxPooling(input_ptr, output_ptr, pooling_param_, task_id, minf, maxf);
  } else {
-    switch (pooling_param_->act_type_) {
-      case ActType_Relu:
-        AvgPoolingRelu(input_ptr, output_ptr, pooling_param_, task_id);
-        break;
-      case ActType_Relu6:
-        AvgPoolingRelu6(input_ptr, output_ptr, pooling_param_, task_id);
-        break;
-      default:
-        AvgPooling(input_ptr, output_ptr, pooling_param_, task_id);
-    }
+    AvgPooling(input_ptr, output_ptr, pooling_param_, task_id, minf, maxf);
  }
  return RET_OK;
 }