diff --git a/mindspore/lite/nnacl/assembly/arm64/C4BiasAdd.S b/mindspore/lite/nnacl/assembly/arm64/C4BiasAdd.S deleted file mode 100644 index d59b0239f1..0000000000 --- a/mindspore/lite/nnacl/assembly/arm64/C4BiasAdd.S +++ /dev/null @@ -1,131 +0,0 @@ -#ifdef __aarch64__ - - .text - .align 5 - //.p2align 5,,15 - .global C4BiasAdd -#ifndef __APPLE__ - .type C4BiasAdd, %function -#endif - -//void C4BiasAdd(float *dst, const float *input, const float* bias, size_t oc, size_t plane_size, size_t stride) -//x0: dst, x1: input, x2: bias, x3: oc, x4: plane_size, x5: stride - -C4BiasAdd: - - LoopOc: - ld1 {v4.4s}, [x2], #16 - mov x6, x4 - mov x7, x0 - cmp x6, #4 - blt Loop1 - - Loop4: - ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x1], #64 - fadd v0.4s, v0.4s, v4.4s - fadd v1.4s, v1.4s, v4.4s - fadd v2.4s, v2.4s, v4.4s - fadd v3.4s, v3.4s, v4.4s - - cmp x3, #4 - bge Write4x4 - cmp x3, #3 - beq Write3x4 - cmp x3, #2 - beq Write2x4 - - Write1x4: - str s0, [x7] - add x7, x7, x5 - str s1, [x7] - add x7, x7, x5 - str s2, [x7] - add x7, x7, x5 - str s3, [x7] - add x7, x7, x5 - b WriteEndx4 - Write2x4: - dup s16, v0.s[1] - stp s0, s16, [x7] - add x7, x7, x5 - dup s17, v1.s[1] - stp s1, s17, [x7] - add x7, x7, x5 - dup s18, v2.s[1] - stp s2, s18, [x7] - add x7, x7, x5 - dup s19, v3.s[1] - stp s3, s19, [x7] - add x7, x7, x5 - b WriteEndx4 - Write3x4: - add x8, x7, #8 - dup s16, v0.s[1] - stp s0, s16, [x7] - add x7, x7, x5 - st1 {v0.s}[2], [x8], x5 - dup s17, v1.s[1] - stp s1, s17, [x7] - add x7, x7, x5 - st1 {v1.s}[2], [x8], x5 - dup s18, v2.s[1] - stp s2, s18, [x7] - add x7, x7, x5 - st1 {v2.s}[2], [x8], x5 - dup s19, v3.s[1] - stp s3, s19, [x7] - add x7, x7, x5 - st1 {v3.s}[2], [x8], x5 - b WriteEndx4 - Write4x4: - st1 {v0.4s}, [x7], x5 - st1 {v1.4s}, [x7], x5 - st1 {v2.4s}, [x7], x5 - st1 {v3.4s}, [x7], x5 - - WriteEndx4: - subs x6, x6, #4 - beq LoopOcEnd - cmp x6, #4 - blt Loop1 - b Loop4 - - Loop1: - ld1 {v0.4s}, [x1], #16 - fadd v0.4s, v0.4s, v4.4s - - cmp x3, #4 - bge Write4 - cmp x3, #3 - beq Write3 - cmp x3, #2 - beq Write2 - - Write1: - str s0, [x7] - add x7, x7, x5 - b WriteEnd - Write2: - dup s16, v0.s[1] - stp s0, s16, [x7] - add x7, x7, x5 - b WriteEnd - Write3: - add x8, x7, #8 - dup s16, v0.s[1] - stp s0, s16, [x7] - add x7, x7, x5 - st1 {v0.s}[2], [x8], x5 - b WriteEnd - Write4: - st1 {v0.4s}, [x7], x5 - WriteEnd: - subs x6, x6, #1 - bne Loop1 - LoopOcEnd: - subs x3, x3, #4 - add x0, x0, #16 - bgt LoopOc - -ret -#endif diff --git a/mindspore/lite/nnacl/assembly/arm64/C4BiasAddRelu.S b/mindspore/lite/nnacl/assembly/arm64/C4BiasAddRelu.S deleted file mode 100644 index 6a464b400b..0000000000 --- a/mindspore/lite/nnacl/assembly/arm64/C4BiasAddRelu.S +++ /dev/null @@ -1,137 +0,0 @@ -#ifdef __aarch64__ - - .text - .align 5 - //.p2align 5,,15 - .global C4BiasAddRelu -#ifndef __APPLE__ - .type C4BiasAddRelu, %function -#endif - -//void C4BiasAddRelu(float *dst, const float *input, const float* bias, size_t oc, size_t plane_size, size_t stride) -//x0: dst, x1: input, x2: bias, x3: oc, x4: plane_size, x5: stride - -C4BiasAddRelu: - dup v5.4s, wzr - LoopOc: - ld1 {v4.4s}, [x2], #16 - mov x6, x4 - mov x7, x0 - cmp x6, #4 - blt Loop1 - - Loop4: - ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x1], #64 - fadd v0.4s, v0.4s, v4.4s - fadd v1.4s, v1.4s, v4.4s - fadd v2.4s, v2.4s, v4.4s - fadd v3.4s, v3.4s, v4.4s - - fmax v0.4s, v0.4s, v5.4s - fmax v1.4s, v1.4s, v5.4s - fmax v2.4s, v2.4s, v5.4s - fmax v3.4s, v3.4s, v5.4s - - cmp x3, #4 - bge Write4x4 - cmp x3, #3 - beq Write3x4 - cmp x3, #2 - beq Write2x4 - - Write1x4: - str s0, [x7] - add x7, x7, x5 - str s1, [x7] - add x7, x7, x5 - str s2, [x7] - add x7, x7, x5 - str s3, [x7] - add x7, x7, x5 - b WriteEndx4 - Write2x4: - dup s16, v0.s[1] - stp s0, s16, [x7] - add x7, x7, x5 - dup s17, v1.s[1] - stp s1, s17, [x7] - add x7, x7, x5 - dup s18, v2.s[1] - stp s2, s18, [x7] - add x7, x7, x5 - dup s19, v3.s[1] - stp s3, s19, [x7] - add x7, x7, x5 - b WriteEndx4 - Write3x4: - add x8, x7, #8 - dup s16, v0.s[1] - stp s0, s16, [x7] - add x7, x7, x5 - st1 {v0.s}[2], [x8], x5 - dup s17, v1.s[1] - stp s1, s17, [x7] - add x7, x7, x5 - st1 {v1.s}[2], [x8], x5 - dup s18, v2.s[1] - stp s2, s18, [x7] - add x7, x7, x5 - st1 {v2.s}[2], [x8], x5 - dup s19, v3.s[1] - stp s3, s19, [x7] - add x7, x7, x5 - st1 {v3.s}[2], [x8], x5 - b WriteEndx4 - Write4x4: - st1 {v0.4s}, [x7], x5 - st1 {v1.4s}, [x7], x5 - st1 {v2.4s}, [x7], x5 - st1 {v3.4s}, [x7], x5 - - WriteEndx4: - subs x6, x6, #4 - beq LoopOcEnd - cmp x6, #4 - blt Loop1 - b Loop4 - - Loop1: - ld1 {v0.4s}, [x1], #16 - fadd v0.4s, v0.4s, v4.4s - fmax v0.4s, v0.4s, v5.4s - - cmp x3, #4 - bge Write4 - cmp x3, #3 - beq Write3 - cmp x3, #2 - beq Write2 - - Write1: - str s0, [x7] - add x7, x7, x5 - b WriteEnd - Write2: - dup s16, v0.s[1] - stp s0, s16, [x7] - add x7, x7, x5 - b WriteEnd - Write3: - add x8, x7, #8 - dup s16, v0.s[1] - stp s0, s16, [x7] - add x7, x7, x5 - st1 {v0.s}[2], [x8], x5 - b WriteEnd - Write4: - st1 {v0.4s}, [x7], x5 - WriteEnd: - subs x6, x6, #1 - bne Loop1 - LoopOcEnd: - subs x3, x3, #4 - add x0, x0, #16 - bgt LoopOc - - ret -#endif diff --git a/mindspore/lite/nnacl/assembly/arm64/C4BiasAddRelu6.S b/mindspore/lite/nnacl/assembly/arm64/C4BiasAddRelu6.S deleted file mode 100644 index b8c8b84842..0000000000 --- a/mindspore/lite/nnacl/assembly/arm64/C4BiasAddRelu6.S +++ /dev/null @@ -1,146 +0,0 @@ -#ifdef __aarch64__ - - .text - .align 5 - //.p2align 5,,15 - .global C4BiasAddRelu6 -#ifndef __APPLE__ - .type C4BiasAddRelu6, %function -#endif - -//void C4BiC4BiasAddRelu6asAdd(float *dst, const float *input, const float* bias, size_t oc, size_t plane_size, size_t stride) -//x0: dst, x1: input, x2: bias, x3: oc, x4: plane_size, x5: stride - -C4BiasAddRelu6: - dup v5.4s, wzr - movi v6.4s, #6 - scvtf v6.4s, v6.4s - - LoopOc: - ld1 {v4.4s}, [x2], #16 - mov x6, x4 - mov x7, x0 - cmp x6, #4 - blt Loop1 - - Loop4: - ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x1], #64 - fadd v0.4s, v0.4s, v4.4s - fadd v1.4s, v1.4s, v4.4s - fadd v2.4s, v2.4s, v4.4s - fadd v3.4s, v3.4s, v4.4s - - fmax v0.4s, v0.4s, v5.4s - fmax v1.4s, v1.4s, v5.4s - fmax v2.4s, v2.4s, v5.4s - fmax v3.4s, v3.4s, v5.4s - - fmin v0.4s, v0.4s, v6.4s - fmin v1.4s, v1.4s, v6.4s - fmin v2.4s, v2.4s, v6.4s - fmin v3.4s, v3.4s, v6.4s - - cmp x3, #4 - bge Write4x4 - cmp x3, #3 - beq Write3x4 - cmp x3, #2 - beq Write2x4 - - Write1x4: - str s0, [x7] - add x7, x7, x5 - str s1, [x7] - add x7, x7, x5 - str s2, [x7] - add x7, x7, x5 - str s3, [x7] - add x7, x7, x5 - b WriteEndx4 - Write2x4: - dup s16, v0.s[1] - stp s0, s16, [x7] - add x7, x7, x5 - dup s17, v1.s[1] - stp s1, s17, [x7] - add x7, x7, x5 - dup s18, v2.s[1] - stp s2, s18, [x7] - add x7, x7, x5 - dup s19, v3.s[1] - stp s3, s19, [x7] - add x7, x7, x5 - b WriteEndx4 - Write3x4: - add x8, x7, #8 - dup s16, v0.s[1] - stp s0, s16, [x7] - add x7, x7, x5 - st1 {v0.s}[2], [x8], x5 - dup s17, v1.s[1] - stp s1, s17, [x7] - add x7, x7, x5 - st1 {v1.s}[2], [x8], x5 - dup s18, v2.s[1] - stp s2, s18, [x7] - add x7, x7, x5 - st1 {v2.s}[2], [x8], x5 - dup s19, v3.s[1] - stp s3, s19, [x7] - add x7, x7, x5 - st1 {v3.s}[2], [x8], x5 - b WriteEndx4 - Write4x4: - st1 {v0.4s}, [x7], x5 - st1 {v1.4s}, [x7], x5 - st1 {v2.4s}, [x7], x5 - st1 {v3.4s}, [x7], x5 - - WriteEndx4: - subs x6, x6, #4 - beq LoopOcEnd - cmp x6, #4 - blt Loop1 - b Loop4 - - Loop1: - ld1 {v0.4s}, [x1], #16 - fadd v0.4s, v0.4s, v4.4s - fmax v0.4s, v0.4s, v5.4s - fmin v0.4s, v0.4s, v6.4s - - cmp x3, #4 - bge Write4 - cmp x3, #3 - beq Write3 - cmp x3, #2 - beq Write2 - - Write1: - str s0, [x7] - add x7, x7, x5 - b WriteEnd - Write2: - dup s16, v0.s[1] - stp s0, s16, [x7] - add x7, x7, x5 - b WriteEnd - Write3: - add x8, x7, #8 - dup s16, v0.s[1] - stp s0, s16, [x7] - add x7, x7, x5 - st1 {v0.s}[2], [x8], x5 - b WriteEnd - Write4: - st1 {v0.4s}, [x7], x5 - WriteEnd: - subs x6, x6, #1 - bne Loop1 - LoopOcEnd: - subs x3, x3, #4 - add x0, x0, #16 - bgt LoopOc - - ret -#endif diff --git a/mindspore/lite/nnacl/assembly/arm64/C4Relu.S b/mindspore/lite/nnacl/assembly/arm64/C4Relu.S deleted file mode 100644 index 70ebb2f295..0000000000 --- a/mindspore/lite/nnacl/assembly/arm64/C4Relu.S +++ /dev/null @@ -1,132 +0,0 @@ -#ifdef __aarch64__ - - .text - .align 5 - //.p2align 5,,15 - .global C4Relu -#ifndef __APPLE__ - .type C4Relu, %function -#endif - -//void C4Relu(float *dst, const float *input, size_t oc, size_t plane_size, size_t stride) -//x0: dst, x1: input, x2: oc, x3: plane_size, x4: stride - -C4Relu: - dup v5.4s, wzr - LoopOc: - mov x6, x3 - mov x7, x0 - cmp x6, #4 - blt Loop1 - - Loop4: - ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x1], #64 - - fmax v0.4s, v0.4s, v5.4s - fmax v1.4s, v1.4s, v5.4s - fmax v2.4s, v2.4s, v5.4s - fmax v3.4s, v3.4s, v5.4s - - cmp x2, #4 - bge Write4x4 - cmp x2, #3 - beq Write3x4 - cmp x2, #2 - beq Write2x4 - - Write1x4: - str s0, [x7] - add x7, x7, x4 - str s1, [x7] - add x7, x7, x4 - str s2, [x7] - add x7, x7, x4 - str s3, [x7] - add x7, x7, x4 - b WriteEndx4 - Write2x4: - dup s16, v0.s[1] - stp s0, s16, [x7] - add x7, x7, x4 - dup s17, v1.s[1] - stp s1, s17, [x7] - add x7, x7, x4 - dup s18, v2.s[1] - stp s2, s18, [x7] - add x7, x7, x4 - dup s19, v3.s[1] - stp s3, s19, [x7] - add x7, x7, x4 - b WriteEndx4 - Write3x4: - add x8, x7, #8 - dup s16, v0.s[1] - stp s0, s16, [x7] - add x7, x7, x4 - st1 {v0.s}[2], [x8], x4 - dup s17, v1.s[1] - stp s1, s17, [x7] - add x7, x7, x4 - st1 {v1.s}[2], [x8], x4 - dup s18, v2.s[1] - stp s2, s18, [x7] - add x7, x7, x4 - st1 {v2.s}[2], [x8], x4 - dup s19, v3.s[1] - stp s3, s19, [x7] - add x7, x7, x4 - st1 {v3.s}[2], [x8], x4 - b WriteEndx4 - Write4x4: - st1 {v0.4s}, [x7], x4 - st1 {v1.4s}, [x7], x4 - st1 {v2.4s}, [x7], x4 - st1 {v3.4s}, [x7], x4 - - WriteEndx4: - subs x6, x6, #4 - beq LoopOcEnd - cmp x6, #4 - blt Loop1 - b Loop4 - - Loop1: - ld1 {v0.4s}, [x1], #16 - fadd v0.4s, v0.4s, v4.4s - fmax v0.4s, v0.4s, v5.4s - - cmp x2, #4 - bge Write4 - cmp x2, #3 - beq Write3 - cmp x2, #2 - beq Write2 - - Write1: - str s0, [x7] - add x7, x7, x4 - b WriteEnd - Write2: - dup s16, v0.s[1] - stp s0, s16, [x7] - add x7, x7, x4 - b WriteEnd - Write3: - add x8, x7, #8 - dup s16, v0.s[1] - stp s0, s16, [x7] - add x7, x7, x4 - st1 {v0.s}[2], [x8], x4 - b WriteEnd - Write4: - st1 {v0.4s}, [x7], x4 - WriteEnd: - subs x6, x6, #1 - bne Loop1 - LoopOcEnd: - subs x2, x2, #4 - add x0, x0, #16 - bgt LoopOc - - ret -#endif diff --git a/mindspore/lite/nnacl/assembly/arm64/C4Relu6.S b/mindspore/lite/nnacl/assembly/arm64/C4Relu6.S deleted file mode 100644 index d19c22a6d3..0000000000 --- a/mindspore/lite/nnacl/assembly/arm64/C4Relu6.S +++ /dev/null @@ -1,140 +0,0 @@ -#ifdef __aarch64__ - - .text - .align 5 - //.p2align 5,,15 - .global C4Relu6 -#ifndef __APPLE__ - .type C4Relu6, %function -#endif - -//void C4Relu6(float *dst, const float *input, const float* bias, size_t oc, size_t plane_size, size_t stride) -//x0: dst, x1: input, x2: oc, x2: plane_size, x3: stride - -C4Relu6: - dup v5.4s, wzr - movi v6.4s, #6 - scvtf v6.4s, v6.4s - - LoopOc: - mov x6, x3 - mov x7, x0 - cmp x6, #4 - blt Loop1 - - Loop4: - ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x1], #64 - fmax v0.4s, v0.4s, v5.4s - fmax v1.4s, v1.4s, v5.4s - fmax v2.4s, v2.4s, v5.4s - fmax v3.4s, v3.4s, v5.4s - - fmin v0.4s, v0.4s, v6.4s - fmin v1.4s, v1.4s, v6.4s - fmin v2.4s, v2.4s, v6.4s - fmin v3.4s, v3.4s, v6.4s - - cmp x2, #4 - bge Write4x4 - cmp x2, #3 - beq Write3x4 - cmp x2, #2 - beq Write2x4 - - Write1x4: - str s0, [x7] - add x7, x7, x4 - str s1, [x7] - add x7, x7, x4 - str s2, [x7] - add x7, x7, x4 - str s3, [x7] - add x7, x7, x4 - b WriteEndx4 - Write2x4: - dup s16, v0.s[1] - stp s0, s16, [x7] - add x7, x7, x4 - dup s17, v1.s[1] - stp s1, s17, [x7] - add x7, x7, x4 - dup s18, v2.s[1] - stp s2, s18, [x7] - add x7, x7, x4 - dup s19, v3.s[1] - stp s3, s19, [x7] - add x7, x7, x4 - b WriteEndx4 - Write3x4: - add x8, x7, #8 - dup s16, v0.s[1] - stp s0, s16, [x7] - add x7, x7, x4 - st1 {v0.s}[2], [x8], x4 - dup s17, v1.s[1] - stp s1, s17, [x7] - add x7, x7, x4 - st1 {v1.s}[2], [x8], x4 - dup s18, v2.s[1] - stp s2, s18, [x7] - add x7, x7, x4 - st1 {v2.s}[2], [x8], x4 - dup s19, v3.s[1] - stp s3, s19, [x7] - add x7, x7, x4 - st1 {v3.s}[2], [x8], x4 - b WriteEndx4 - Write4x4: - st1 {v0.4s}, [x7], x4 - st1 {v1.4s}, [x7], x4 - st1 {v2.4s}, [x7], x4 - st1 {v3.4s}, [x7], x4 - - WriteEndx4: - subs x6, x6, #4 - beq LoopOcEnd - cmp x6, #4 - blt Loop1 - b Loop4 - - Loop1: - ld1 {v0.4s}, [x1], #16 - fadd v0.4s, v0.4s, v4.4s - fmax v0.4s, v0.4s, v5.4s - fmin v0.4s, v0.4s, v6.4s - - cmp x2, #4 - bge Write4 - cmp x2, #3 - beq Write3 - cmp x2, #2 - beq Write2 - - Write1: - str s0, [x7] - add x7, x7, x4 - b WriteEnd - Write2: - dup s16, v0.s[1] - stp s0, s16, [x7] - add x7, x7, x4 - b WriteEnd - Write3: - add x8, x7, #8 - dup s16, v0.s[1] - stp s0, s16, [x7] - add x7, x7, x4 - st1 {v0.s}[2], [x8], x4 - b WriteEnd - Write4: - st1 {v0.4s}, [x7], x4 - WriteEnd: - subs x6, x6, #1 - bne Loop1 - LoopOcEnd: - subs x2, x2, #4 - add x0, x0, #16 - bgt LoopOc - - ret -#endif diff --git a/mindspore/lite/nnacl/fp32/common_func.c b/mindspore/lite/nnacl/fp32/common_func.c index 4110547294..bb3148f104 100644 --- a/mindspore/lite/nnacl/fp32/common_func.c +++ b/mindspore/lite/nnacl/fp32/common_func.c @@ -40,32 +40,6 @@ void PostConvFuncComm(const float *src_ptr_, float *out_ptr, const float *bias_p return; } -void PostConvFuncFp32C4(const float *c4_out_ptr, float *out_ptr, const float *bias_ptr, size_t output_channel, - size_t plane_size, size_t stride, bool is_relu, bool is_relu6) { -#ifndef ENABLE_ARM64 - PostConvFuncComm(c4_out_ptr, out_ptr, bias_ptr, output_channel, plane_size, stride, is_relu, is_relu6, C4NUM); -#else - if (bias_ptr != NULL) { - if (is_relu) { - C4BiasAddRelu(out_ptr, c4_out_ptr, bias_ptr, output_channel, plane_size, stride * sizeof(float)); - } else if (is_relu6) { - C4BiasAddRelu6(out_ptr, c4_out_ptr, bias_ptr, output_channel, plane_size, stride * sizeof(float)); - } else { - C4BiasAdd(out_ptr, c4_out_ptr, bias_ptr, output_channel, plane_size, stride * sizeof(float)); - } - } else { - if (is_relu) { - C4Relu(out_ptr, c4_out_ptr, output_channel, plane_size, stride * sizeof(float)); - } else if (is_relu6) { - C4Relu6(out_ptr, c4_out_ptr, output_channel, plane_size, stride * sizeof(float)); - } else { - // do nothing - } - } -#endif - return; -} - void PostConvFuncFp32C8(const float *c8_out_ptr, float *out_ptr, const float *bias_ptr, size_t output_channel, size_t plane_size, size_t stride, bool is_relu, bool is_relu6) { #ifndef ENABLE_ARM64 diff --git a/mindspore/lite/nnacl/fp32/common_func.h b/mindspore/lite/nnacl/fp32/common_func.h index 300149c492..b92c942ecd 100644 --- a/mindspore/lite/nnacl/fp32/common_func.h +++ b/mindspore/lite/nnacl/fp32/common_func.h @@ -27,8 +27,6 @@ extern "C" { #endif -void PostConvFuncFp32C4(const float *c4_out_ptr, float *out_ptr, const float *bias_ptr, size_t output_channel, - size_t plane_size, size_t stride, bool is_relu, bool is_relu6); void PostConvFuncFp32C8(const float *c8_out_ptr, float *out_ptr, const float *bias_ptr, size_t output_channel, size_t plane_size, size_t stride, bool is_relu, bool is_relu6); float ShortToFloat32(uint16_t src_value); @@ -50,11 +48,6 @@ void BiasAddRelu6(const float *bias, float *data, size_t oc4, size_t plan_size); void BiasAddRelu(const float *bias, float *data, size_t oc4, size_t plan_size); void Relu6(float *data, size_t element4); void Relu(float *data, size_t element4); -void C4BiasAdd(float *dst, const float *input, const float *bias, size_t oc, size_t plane_size, size_t stride); -void C4BiasAddRelu(float *dst, const float *input, const float *bias, size_t oc, size_t plane_size, size_t stride); -void C4BiasAddRelu6(float *dst, const float *input, const float *bias, size_t oc, size_t plane_size, size_t stride); -void C4Relu(float *dst, const float *input, size_t oc, size_t plane_size, size_t stride); -void C4Relu6(float *dst, const float *input, size_t oc, size_t plane_size, size_t stride); void ConvDwFp32Row(float *output_ptr, const float *input_ptr, const float *weight_ptr, size_t num_pixels, size_t output_channel, size_t input_step); diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution.cc index ba2dfc8a7c..9a2ede760d 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution.cc @@ -45,8 +45,13 @@ int ConvolutionCPUKernel::InitWeightBias() { int ic4 = UP_DIV(in_channel, C4NUM); int kernel_plane = kernel_h * kernel_w; int oc_block, oc_block_num; +#ifdef ENABLE_ARM32 + oc_block = C4NUM; + oc_block_num = UP_DIV(out_channel, C4NUM); +#else oc_block = C8NUM; oc_block_num = UP_DIV(out_channel, C8NUM); +#endif int pack_weight_size = oc_block_num * oc_block * ic4 * C4NUM * kernel_plane; auto origin_weight = reinterpret_cast(filter_tensor->MutableData()); @@ -113,11 +118,11 @@ void ConvolutionCPUKernel::ConfigInputOutput() { auto output_tensor = out_tensors_.at(kOutputIndex); output_tensor->SetFormat(schema::Format::Format_NHWC); - // #ifdef ENABLE_ARM32 - // gemm_func_ = IndirectGemmFp32_8x4; - // #else +#ifdef ENABLE_ARM32 + gemm_func_ = IndirectGemmFp32_8x4; +#else gemm_func_ = IndirectGemmFp32_8x8; - // #endif +#endif } int ConvolutionCPUKernel::Init() { diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/conv1x1_fp32_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/conv1x1_fp32_tests.cc index 0ce67aed3a..d469b57fd8 100644 --- a/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/conv1x1_fp32_tests.cc +++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/conv1x1_fp32_tests.cc @@ -170,79 +170,6 @@ TEST_F(TestConv1x1Fp32, Conv1x1WeightTest1) { delete conv_param; } -TEST_F(TestConv1x1Fp32, PostConvFuncC4Test1) { - float in[] = {-9.389655, -5.83877, 7.5724425, -1.4675674, -5.456284, 0.7406984, 16.965645, 10.888806, - -0.8614793, -4.404605, 10.917422, 0.11158327, -5.2733865, -0.96367484, -4.731118, -7.576815, - -6.1621623, -0.6315082, -9.140878, 9.266748, 13.644127, 8.206812, 7.091153, -0.50162584, - 2.0889723, 6.6916203, -5.3981733, 11.997365, -9.254076, -5.5964484, -5.981469, -0.51114964, - -2.6300175, 0, 0, 0, -7.2690716, 0, 0, 0, - 11.1863365, 0, 0, 0, -3.4595785, 0, 0, 0, - -8.344107, 0, 0, 0, -3.792715, 0, 0, 0, - -7.0394287, 0, 0, 0, -2.7693212, 0, 0, 0}; - float bias[] = {0.7429814, 0.4863214, 0.9888875, 0.19727881, 0.009881007, 0, 0, 0}; - float out[40] = {0}; - - float no[] = {-8.646674, -5.3524485, 8.56133, -1.2702886, -2.6201365, -4.7133026, 1.2270198, 17.954533, - 11.086085, -7.2591906, -0.11849791, -3.9182835, 11.90631, 0.3088621, 11.196218, -4.530405, - -0.47735345, -3.7422307, -7.379536, -3.4496975, -5.419181, -0.14518678, -8.15199, 9.464027, - -8.334226, 14.387108, 8.693133, 8.080041, -0.30434704, -3.782834, 2.8319538, 7.177942, - -4.409286, 12.194644, -7.0295477, -8.511095, -5.110127, -4.992582, -0.31387085, -2.7594402}; - PostConvFuncFp32C4(in, out, bias, 5, 8, 5, false, false); - CompareOutputData(out, no, 40, 0.0001); - - float relu[] = {0, 0, 8.56133, 0, 0, 0, 1.2270198, 17.954533, 11.086085, 0, - 0, 0, 11.90631, 0.3088621, 11.196218, 0, 0, 0, 0, 0, - 0, 0, 0, 9.464027, 0, 14.387108, 8.693133, 8.080041, 0, 0, - 2.8319538, 7.177942, 0, 12.194644, 0, 0, 0, 0, 0, 0}; - PostConvFuncFp32C4(in, out, bias, 5, 8, 5, true, false); - CompareOutputData(out, relu, 40, 0.0001); - - float corr_relu6[] = {0, 0, 6, 0, 0, 0, 1.2270198, 6, 6, 0, 0, 0, 6, 0.3088621, 6, 0, 0, 0, 0, 0, - 0, 0, 0, 6, 0, 6, 6, 6, 0, 0, 2.8319538, 6, 0, 6, 0, 0, 0, 0, 0, 0}; - PostConvFuncFp32C4(in, out, bias, 5, 8, 5, false, true); - CompareOutputData(out, corr_relu6, 40, 0.0001); - - float nob_relu[] = {0, 0, 7.5724425, 0, 0, 0, 0.7406984, 16.965645, - 10.888806, 0, 0, 0, 10.917422, 0.11158327, 11.1863365, 0, - 0, 0, 0, 0, 0, 0, 0, 9.266748, - 0, 13.644127, 8.206812, 7.091153, 0, 0, 2.0889723, 6.6916203, - 0, 11.997365, 0, 0, 0, 0, 0, 0}; - PostConvFuncFp32C4(in, out, nullptr, 5, 8, 5, true, false); - CompareOutputData(out, nob_relu, 40, 0.0001); -} - -TEST_F(TestConv1x1Fp32, PostConvFuncC4Test2) { - float in[] = {-9.389655, -5.83877, 7.5724425, -1.4675674, -5.456284, 0.7406984, 16.965645, 10.888806, - -0.8614793, -4.404605, 10.917422, 0.11158327, -5.2733865, -0.96367484, -4.731118, -7.576815, - -6.1621623, -0.6315082, -9.140878, 9.266748, 13.644127, 8.206812, 7.091153, -0.50162584, - 2.0889723, 6.6916203, -5.3981733, 11.997365, -9.254076, -5.5964484, -5.981469, -0.51114964, - -2.6300175, 0, 0, 0, -7.2690716, 0, 0, 0, - 11.1863365, 0, 0, 0, -3.4595785, 0, 0, 0, - -8.344107, 0, 0, 0, -3.792715, 0, 0, 0, - -7.0394287, 0, 0, 0, -2.7693212, 0, 0, 0}; - float bias[] = {0.7429814, 0.4863214, 0.9888875, 0.19727881, 0.009881007, 0, 0, 0}; - float corr[] = {-8.646674, -5.3524485, 8.56133, -1.2702886, -2.6201365, -4.7133026, 1.2270198, 17.954533, - 11.086085, -7.2591906, -0.11849791, -3.9182835, 11.90631, 0.3088621, 11.196218, -4.530405, - -0.47735345, -3.7422307, -7.379536, -3.4496975, -5.419181, -0.14518678, -8.15199, 9.464027, - -8.334226, 14.387108, 8.693133, 8.080041, -0.30434704, -3.782834, 2.8319538, 7.177942, - -4.409286, 12.194644, -7.0295477, -8.511095, -5.110127, -4.992582, -0.31387085, -2.7594402}; - float out[40] = {0}; - - int thread_count_ = 2; - int thread_oc4_stride_ = 1; - int output_channel = 5; - int plane_size = 8; - - for (int i = 0; i < thread_count_; i++) { - int cur_oc = MSMIN(thread_oc4_stride_ * 4, output_channel - i * thread_oc4_stride_ * 4); - if (cur_oc <= 0) break; - PostConvFuncFp32C4(in + thread_oc4_stride_ * i * 8 * 4, out + i * i * thread_oc4_stride_ * 4, - bias + i * thread_oc4_stride_ * 4, cur_oc, plane_size, output_channel, false, false); - } - - CompareOutputData(out, corr, 40, 0.0001); -} - int Conv1x1TestInit1(std::vector *inputs_, std::vector *outputs_, ConvParameter *conv_param, float **correct) { lite::Tensor *in_t = new lite::Tensor(kNumberTypeFloat, {1, 2, 3, 4}, schema::Format_NHWC,