!6166 [MSLITE][Develop] arm cpu op: remove useless function and files

Merge pull request !6166 from yangruoqi713/lite
pull/6166/MERGE
mindspore-ci-bot 4 years ago committed by Gitee
commit d49f8d8195

@ -1,131 +0,0 @@
#ifdef __aarch64__
.text
.align 5
//.p2align 5,,15
.global C4BiasAdd
#ifndef __APPLE__
.type C4BiasAdd, %function
#endif
//void C4BiasAdd(float *dst, const float *input, const float* bias, size_t oc, size_t plane_size, size_t stride)
//x0: dst, x1: input, x2: bias, x3: oc, x4: plane_size, x5: stride
//
// Adds a per-channel bias to C4-packed convolution output and writes the
// result to dst with a per-pixel stride of x5 BYTES (caller passes
// stride * sizeof(float)).  Channels are processed 4 at a time; for the
// final group only the remaining valid channels (oc % 4) are stored via
// the Write1/2/3 paths.  Pixels are processed 4 at a time with a scalar
// tail loop (Loop1).
// Register roles:
//   x3 = channels remaining, x6 = pixels remaining in current plane,
//   x7 = dst write cursor, x8 = secondary cursor for the 3rd channel,
//   v4 = bias for the current 4-channel group.
C4BiasAdd:
LoopOc:                                 // one iteration per 4-channel group
ld1 {v4.4s}, [x2], #16                  // v4 = next 4 bias values
mov x6, x4                              // x6 = plane_size (pixels to do)
mov x7, x0                              // x7 = dst cursor for this group
cmp x6, #4
blt Loop1                               // fewer than 4 pixels -> tail loop
Loop4:                                  // main loop: 4 pixels / iteration
ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x1], #64
fadd v0.4s, v0.4s, v4.4s                // add bias to each pixel's 4 channels
fadd v1.4s, v1.4s, v4.4s
fadd v2.4s, v2.4s, v4.4s
fadd v3.4s, v3.4s, v4.4s
cmp x3, #4                              // dispatch on valid channel count
bge Write4x4
cmp x3, #3
beq Write3x4
cmp x3, #2
beq Write2x4
Write1x4:                               // 1 valid channel, 4 pixels
str s0, [x7]
add x7, x7, x5
str s1, [x7]
add x7, x7, x5
str s2, [x7]
add x7, x7, x5
str s3, [x7]
add x7, x7, x5
b WriteEndx4
Write2x4:                               // 2 valid channels, 4 pixels
dup s16, v0.s[1]                        // extract lane 1 so stp can pair it
stp s0, s16, [x7]
add x7, x7, x5
dup s17, v1.s[1]
stp s1, s17, [x7]
add x7, x7, x5
dup s18, v2.s[1]
stp s2, s18, [x7]
add x7, x7, x5
dup s19, v3.s[1]
stp s3, s19, [x7]
add x7, x7, x5
b WriteEndx4
Write3x4:                               // 3 valid channels, 4 pixels
add x8, x7, #8                          // x8 writes the 3rd channel (+8 bytes)
dup s16, v0.s[1]
stp s0, s16, [x7]
add x7, x7, x5
st1 {v0.s}[2], [x8], x5
dup s17, v1.s[1]
stp s1, s17, [x7]
add x7, x7, x5
st1 {v1.s}[2], [x8], x5
dup s18, v2.s[1]
stp s2, s18, [x7]
add x7, x7, x5
st1 {v2.s}[2], [x8], x5
dup s19, v3.s[1]
stp s3, s19, [x7]
add x7, x7, x5
st1 {v3.s}[2], [x8], x5
b WriteEndx4
Write4x4:                               // all 4 channels valid, 4 pixels
st1 {v0.4s}, [x7], x5
st1 {v1.4s}, [x7], x5
st1 {v2.4s}, [x7], x5
st1 {v3.4s}, [x7], x5
WriteEndx4:
subs x6, x6, #4
beq LoopOcEnd
cmp x6, #4
blt Loop1                               // <4 pixels left -> tail loop
b Loop4
Loop1:                                  // tail loop: 1 pixel / iteration
ld1 {v0.4s}, [x1], #16
fadd v0.4s, v0.4s, v4.4s
cmp x3, #4                              // same channel-count dispatch
bge Write4
cmp x3, #3
beq Write3
cmp x3, #2
beq Write2
Write1:
str s0, [x7]
add x7, x7, x5
b WriteEnd
Write2:
dup s16, v0.s[1]
stp s0, s16, [x7]
add x7, x7, x5
b WriteEnd
Write3:
add x8, x7, #8
dup s16, v0.s[1]
stp s0, s16, [x7]
add x7, x7, x5
st1 {v0.s}[2], [x8], x5
b WriteEnd
Write4:
st1 {v0.4s}, [x7], x5
WriteEnd:
subs x6, x6, #1
bne Loop1
LoopOcEnd:
subs x3, x3, #4                         // next 4-channel group
add x0, x0, #16                         // dst base advances 4 floats (16 B)
bgt LoopOc
ret
#endif

@ -1,137 +0,0 @@
#ifdef __aarch64__
.text
.align 5
//.p2align 5,,15
.global C4BiasAddRelu
#ifndef __APPLE__
.type C4BiasAddRelu, %function
#endif
//void C4BiasAddRelu(float *dst, const float *input, const float* bias, size_t oc, size_t plane_size, size_t stride)
//x0: dst, x1: input, x2: bias, x3: oc, x4: plane_size, x5: stride
//
// Same as C4BiasAdd, but additionally applies ReLU (max with 0) after the
// bias add.  stride (x5) is in BYTES.  Channels are handled 4 at a time
// with Write1/2/3 paths for the ragged final group; pixels 4 at a time
// with a scalar tail loop (Loop1).
// Register roles:
//   x3 = channels remaining, x6 = pixels remaining, x7 = dst cursor,
//   x8 = cursor for the 3rd channel, v4 = bias group, v5 = 0.0f splat.
C4BiasAddRelu:
dup v5.4s, wzr                          // v5 = {0,0,0,0} for the ReLU clamp
LoopOc:                                 // one iteration per 4-channel group
ld1 {v4.4s}, [x2], #16                  // v4 = next 4 bias values
mov x6, x4                              // x6 = plane_size (pixels to do)
mov x7, x0                              // x7 = dst cursor for this group
cmp x6, #4
blt Loop1                               // fewer than 4 pixels -> tail loop
Loop4:                                  // main loop: 4 pixels / iteration
ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x1], #64
fadd v0.4s, v0.4s, v4.4s                // bias add
fadd v1.4s, v1.4s, v4.4s
fadd v2.4s, v2.4s, v4.4s
fadd v3.4s, v3.4s, v4.4s
fmax v0.4s, v0.4s, v5.4s                // ReLU: clamp below at 0
fmax v1.4s, v1.4s, v5.4s
fmax v2.4s, v2.4s, v5.4s
fmax v3.4s, v3.4s, v5.4s
cmp x3, #4                              // dispatch on valid channel count
bge Write4x4
cmp x3, #3
beq Write3x4
cmp x3, #2
beq Write2x4
Write1x4:                               // 1 valid channel, 4 pixels
str s0, [x7]
add x7, x7, x5
str s1, [x7]
add x7, x7, x5
str s2, [x7]
add x7, x7, x5
str s3, [x7]
add x7, x7, x5
b WriteEndx4
Write2x4:                               // 2 valid channels, 4 pixels
dup s16, v0.s[1]                        // extract lane 1 so stp can pair it
stp s0, s16, [x7]
add x7, x7, x5
dup s17, v1.s[1]
stp s1, s17, [x7]
add x7, x7, x5
dup s18, v2.s[1]
stp s2, s18, [x7]
add x7, x7, x5
dup s19, v3.s[1]
stp s3, s19, [x7]
add x7, x7, x5
b WriteEndx4
Write3x4:                               // 3 valid channels, 4 pixels
add x8, x7, #8                          // x8 writes the 3rd channel (+8 bytes)
dup s16, v0.s[1]
stp s0, s16, [x7]
add x7, x7, x5
st1 {v0.s}[2], [x8], x5
dup s17, v1.s[1]
stp s1, s17, [x7]
add x7, x7, x5
st1 {v1.s}[2], [x8], x5
dup s18, v2.s[1]
stp s2, s18, [x7]
add x7, x7, x5
st1 {v2.s}[2], [x8], x5
dup s19, v3.s[1]
stp s3, s19, [x7]
add x7, x7, x5
st1 {v3.s}[2], [x8], x5
b WriteEndx4
Write4x4:                               // all 4 channels valid, 4 pixels
st1 {v0.4s}, [x7], x5
st1 {v1.4s}, [x7], x5
st1 {v2.4s}, [x7], x5
st1 {v3.4s}, [x7], x5
WriteEndx4:
subs x6, x6, #4
beq LoopOcEnd
cmp x6, #4
blt Loop1                               // <4 pixels left -> tail loop
b Loop4
Loop1:                                  // tail loop: 1 pixel / iteration
ld1 {v0.4s}, [x1], #16
fadd v0.4s, v0.4s, v4.4s                // bias add
fmax v0.4s, v0.4s, v5.4s                // ReLU
cmp x3, #4
bge Write4
cmp x3, #3
beq Write3
cmp x3, #2
beq Write2
Write1:
str s0, [x7]
add x7, x7, x5
b WriteEnd
Write2:
dup s16, v0.s[1]
stp s0, s16, [x7]
add x7, x7, x5
b WriteEnd
Write3:
add x8, x7, #8
dup s16, v0.s[1]
stp s0, s16, [x7]
add x7, x7, x5
st1 {v0.s}[2], [x8], x5
b WriteEnd
Write4:
st1 {v0.4s}, [x7], x5
WriteEnd:
subs x6, x6, #1
bne Loop1
LoopOcEnd:
subs x3, x3, #4                         // next 4-channel group
add x0, x0, #16                         // dst base advances 4 floats (16 B)
bgt LoopOc
ret
#endif

@ -1,146 +0,0 @@
#ifdef __aarch64__
.text
.align 5
//.p2align 5,,15
.global C4BiasAddRelu6
#ifndef __APPLE__
.type C4BiasAddRelu6, %function
#endif
//void C4BiasAddRelu6(float *dst, const float *input, const float* bias, size_t oc, size_t plane_size, size_t stride)
//x0: dst, x1: input, x2: bias, x3: oc, x4: plane_size, x5: stride
//
// Same as C4BiasAdd, but additionally applies ReLU6 (clamp to [0, 6])
// after the bias add.  stride (x5) is in BYTES.  Channels are handled 4
// at a time with Write1/2/3 paths for the ragged final group; pixels 4 at
// a time with a scalar tail loop (Loop1).
// Register roles:
//   x3 = channels remaining, x6 = pixels remaining, x7 = dst cursor,
//   x8 = cursor for the 3rd channel, v4 = bias group, v5 = 0.0f, v6 = 6.0f.
C4BiasAddRelu6:
dup v5.4s, wzr                          // v5 = {0,0,0,0}: lower clamp
movi v6.4s, #6                          // integer 6 ...
scvtf v6.4s, v6.4s                      // ... converted to 6.0f: upper clamp
LoopOc:                                 // one iteration per 4-channel group
ld1 {v4.4s}, [x2], #16                  // v4 = next 4 bias values
mov x6, x4                              // x6 = plane_size (pixels to do)
mov x7, x0                              // x7 = dst cursor for this group
cmp x6, #4
blt Loop1                               // fewer than 4 pixels -> tail loop
Loop4:                                  // main loop: 4 pixels / iteration
ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x1], #64
fadd v0.4s, v0.4s, v4.4s                // bias add
fadd v1.4s, v1.4s, v4.4s
fadd v2.4s, v2.4s, v4.4s
fadd v3.4s, v3.4s, v4.4s
fmax v0.4s, v0.4s, v5.4s                // clamp below at 0
fmax v1.4s, v1.4s, v5.4s
fmax v2.4s, v2.4s, v5.4s
fmax v3.4s, v3.4s, v5.4s
fmin v0.4s, v0.4s, v6.4s                // clamp above at 6
fmin v1.4s, v1.4s, v6.4s
fmin v2.4s, v2.4s, v6.4s
fmin v3.4s, v3.4s, v6.4s
cmp x3, #4                              // dispatch on valid channel count
bge Write4x4
cmp x3, #3
beq Write3x4
cmp x3, #2
beq Write2x4
Write1x4:                               // 1 valid channel, 4 pixels
str s0, [x7]
add x7, x7, x5
str s1, [x7]
add x7, x7, x5
str s2, [x7]
add x7, x7, x5
str s3, [x7]
add x7, x7, x5
b WriteEndx4
Write2x4:                               // 2 valid channels, 4 pixels
dup s16, v0.s[1]                        // extract lane 1 so stp can pair it
stp s0, s16, [x7]
add x7, x7, x5
dup s17, v1.s[1]
stp s1, s17, [x7]
add x7, x7, x5
dup s18, v2.s[1]
stp s2, s18, [x7]
add x7, x7, x5
dup s19, v3.s[1]
stp s3, s19, [x7]
add x7, x7, x5
b WriteEndx4
Write3x4:                               // 3 valid channels, 4 pixels
add x8, x7, #8                          // x8 writes the 3rd channel (+8 bytes)
dup s16, v0.s[1]
stp s0, s16, [x7]
add x7, x7, x5
st1 {v0.s}[2], [x8], x5
dup s17, v1.s[1]
stp s1, s17, [x7]
add x7, x7, x5
st1 {v1.s}[2], [x8], x5
dup s18, v2.s[1]
stp s2, s18, [x7]
add x7, x7, x5
st1 {v2.s}[2], [x8], x5
dup s19, v3.s[1]
stp s3, s19, [x7]
add x7, x7, x5
st1 {v3.s}[2], [x8], x5
b WriteEndx4
Write4x4:                               // all 4 channels valid, 4 pixels
st1 {v0.4s}, [x7], x5
st1 {v1.4s}, [x7], x5
st1 {v2.4s}, [x7], x5
st1 {v3.4s}, [x7], x5
WriteEndx4:
subs x6, x6, #4
beq LoopOcEnd
cmp x6, #4
blt Loop1                               // <4 pixels left -> tail loop
b Loop4
Loop1:                                  // tail loop: 1 pixel / iteration
ld1 {v0.4s}, [x1], #16
fadd v0.4s, v0.4s, v4.4s                // bias add
fmax v0.4s, v0.4s, v5.4s                // clamp below at 0
fmin v0.4s, v0.4s, v6.4s                // clamp above at 6
cmp x3, #4
bge Write4
cmp x3, #3
beq Write3
cmp x3, #2
beq Write2
Write1:
str s0, [x7]
add x7, x7, x5
b WriteEnd
Write2:
dup s16, v0.s[1]
stp s0, s16, [x7]
add x7, x7, x5
b WriteEnd
Write3:
add x8, x7, #8
dup s16, v0.s[1]
stp s0, s16, [x7]
add x7, x7, x5
st1 {v0.s}[2], [x8], x5
b WriteEnd
Write4:
st1 {v0.4s}, [x7], x5
WriteEnd:
subs x6, x6, #1
bne Loop1
LoopOcEnd:
subs x3, x3, #4                         // next 4-channel group
add x0, x0, #16                         // dst base advances 4 floats (16 B)
bgt LoopOc
ret
#endif

@ -1,132 +0,0 @@
#ifdef __aarch64__
.text
.align 5
//.p2align 5,,15
.global C4Relu
#ifndef __APPLE__
.type C4Relu, %function
#endif
//void C4Relu(float *dst, const float *input, size_t oc, size_t plane_size, size_t stride)
//x0: dst, x1: input, x2: oc, x3: plane_size, x4: stride
//
// Applies ReLU (max with 0) to C4-packed input — no bias — and writes the
// result to dst with a per-pixel stride of x4 BYTES.  Channels are handled
// 4 at a time with Write1/2/3 paths for the ragged final group; pixels 4
// at a time with a scalar tail loop (Loop1).
// Register roles:
//   x2 = channels remaining, x6 = pixels remaining, x7 = dst cursor,
//   x8 = cursor for the 3rd channel, v5 = 0.0f splat.
//
// BUGFIX: the original tail loop (Loop1) executed
//     fadd v0.4s, v0.4s, v4.4s
// before the fmax, but this function never initializes v4 (there is no
// bias load here, unlike C4BiasAdd) — it added caller garbage to the tail
// pixels.  The stray fadd is removed.
C4Relu:
dup v5.4s, wzr                          // v5 = {0,0,0,0} for the ReLU clamp
LoopOc:                                 // one iteration per 4-channel group
mov x6, x3                              // x6 = plane_size (pixels to do)
mov x7, x0                              // x7 = dst cursor for this group
cmp x6, #4
blt Loop1                               // fewer than 4 pixels -> tail loop
Loop4:                                  // main loop: 4 pixels / iteration
ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x1], #64
fmax v0.4s, v0.4s, v5.4s                // ReLU: clamp below at 0
fmax v1.4s, v1.4s, v5.4s
fmax v2.4s, v2.4s, v5.4s
fmax v3.4s, v3.4s, v5.4s
cmp x2, #4                              // dispatch on valid channel count
bge Write4x4
cmp x2, #3
beq Write3x4
cmp x2, #2
beq Write2x4
Write1x4:                               // 1 valid channel, 4 pixels
str s0, [x7]
add x7, x7, x4
str s1, [x7]
add x7, x7, x4
str s2, [x7]
add x7, x7, x4
str s3, [x7]
add x7, x7, x4
b WriteEndx4
Write2x4:                               // 2 valid channels, 4 pixels
dup s16, v0.s[1]                        // extract lane 1 so stp can pair it
stp s0, s16, [x7]
add x7, x7, x4
dup s17, v1.s[1]
stp s1, s17, [x7]
add x7, x7, x4
dup s18, v2.s[1]
stp s2, s18, [x7]
add x7, x7, x4
dup s19, v3.s[1]
stp s3, s19, [x7]
add x7, x7, x4
b WriteEndx4
Write3x4:                               // 3 valid channels, 4 pixels
add x8, x7, #8                          // x8 writes the 3rd channel (+8 bytes)
dup s16, v0.s[1]
stp s0, s16, [x7]
add x7, x7, x4
st1 {v0.s}[2], [x8], x4
dup s17, v1.s[1]
stp s1, s17, [x7]
add x7, x7, x4
st1 {v1.s}[2], [x8], x4
dup s18, v2.s[1]
stp s2, s18, [x7]
add x7, x7, x4
st1 {v2.s}[2], [x8], x4
dup s19, v3.s[1]
stp s3, s19, [x7]
add x7, x7, x4
st1 {v3.s}[2], [x8], x4
b WriteEndx4
Write4x4:                               // all 4 channels valid, 4 pixels
st1 {v0.4s}, [x7], x4
st1 {v1.4s}, [x7], x4
st1 {v2.4s}, [x7], x4
st1 {v3.4s}, [x7], x4
WriteEndx4:
subs x6, x6, #4
beq LoopOcEnd
cmp x6, #4
blt Loop1                               // <4 pixels left -> tail loop
b Loop4
Loop1:                                  // tail loop: 1 pixel / iteration
ld1 {v0.4s}, [x1], #16
fmax v0.4s, v0.4s, v5.4s                // ReLU (stray uninitialized-v4 fadd removed)
cmp x2, #4
bge Write4
cmp x2, #3
beq Write3
cmp x2, #2
beq Write2
Write1:
str s0, [x7]
add x7, x7, x4
b WriteEnd
Write2:
dup s16, v0.s[1]
stp s0, s16, [x7]
add x7, x7, x4
b WriteEnd
Write3:
add x8, x7, #8
dup s16, v0.s[1]
stp s0, s16, [x7]
add x7, x7, x4
st1 {v0.s}[2], [x8], x4
b WriteEnd
Write4:
st1 {v0.4s}, [x7], x4
WriteEnd:
subs x6, x6, #1
bne Loop1
LoopOcEnd:
subs x2, x2, #4                         // next 4-channel group
add x0, x0, #16                         // dst base advances 4 floats (16 B)
bgt LoopOc
ret
#endif

@ -1,140 +0,0 @@
#ifdef __aarch64__
.text
.align 5
//.p2align 5,,15
.global C4Relu6
#ifndef __APPLE__
.type C4Relu6, %function
#endif
//void C4Relu6(float *dst, const float *input, size_t oc, size_t plane_size, size_t stride)
//x0: dst, x1: input, x2: oc, x3: plane_size, x4: stride
//
// Applies ReLU6 (clamp to [0, 6]) to C4-packed input — no bias — and
// writes the result to dst with a per-pixel stride of x4 BYTES.  Channels
// are handled 4 at a time with Write1/2/3 paths for the ragged final
// group; pixels 4 at a time with a scalar tail loop (Loop1).
// Register roles:
//   x2 = channels remaining, x6 = pixels remaining, x7 = dst cursor,
//   x8 = cursor for the 3rd channel, v5 = 0.0f, v6 = 6.0f.
//
// FIXES vs. original:
//  1. The prototype comment listed a `const float* bias` argument and
//     mapped "x2: oc, x2: plane_size, x3: stride"; the code takes no bias
//     and actually uses x3 = plane_size, x4 = stride (see `mov x6, x3`
//     and the x4-strided stores).
//  2. The tail loop (Loop1) executed `fadd v0.4s, v0.4s, v4.4s`, but v4
//     is never initialized in this function — it added caller garbage to
//     the tail pixels.  The stray fadd is removed.
C4Relu6:
dup v5.4s, wzr                          // v5 = {0,0,0,0}: lower clamp
movi v6.4s, #6                          // integer 6 ...
scvtf v6.4s, v6.4s                      // ... converted to 6.0f: upper clamp
LoopOc:                                 // one iteration per 4-channel group
mov x6, x3                              // x6 = plane_size (pixels to do)
mov x7, x0                              // x7 = dst cursor for this group
cmp x6, #4
blt Loop1                               // fewer than 4 pixels -> tail loop
Loop4:                                  // main loop: 4 pixels / iteration
ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x1], #64
fmax v0.4s, v0.4s, v5.4s                // clamp below at 0
fmax v1.4s, v1.4s, v5.4s
fmax v2.4s, v2.4s, v5.4s
fmax v3.4s, v3.4s, v5.4s
fmin v0.4s, v0.4s, v6.4s                // clamp above at 6
fmin v1.4s, v1.4s, v6.4s
fmin v2.4s, v2.4s, v6.4s
fmin v3.4s, v3.4s, v6.4s
cmp x2, #4                              // dispatch on valid channel count
bge Write4x4
cmp x2, #3
beq Write3x4
cmp x2, #2
beq Write2x4
Write1x4:                               // 1 valid channel, 4 pixels
str s0, [x7]
add x7, x7, x4
str s1, [x7]
add x7, x7, x4
str s2, [x7]
add x7, x7, x4
str s3, [x7]
add x7, x7, x4
b WriteEndx4
Write2x4:                               // 2 valid channels, 4 pixels
dup s16, v0.s[1]                        // extract lane 1 so stp can pair it
stp s0, s16, [x7]
add x7, x7, x4
dup s17, v1.s[1]
stp s1, s17, [x7]
add x7, x7, x4
dup s18, v2.s[1]
stp s2, s18, [x7]
add x7, x7, x4
dup s19, v3.s[1]
stp s3, s19, [x7]
add x7, x7, x4
b WriteEndx4
Write3x4:                               // 3 valid channels, 4 pixels
add x8, x7, #8                          // x8 writes the 3rd channel (+8 bytes)
dup s16, v0.s[1]
stp s0, s16, [x7]
add x7, x7, x4
st1 {v0.s}[2], [x8], x4
dup s17, v1.s[1]
stp s1, s17, [x7]
add x7, x7, x4
st1 {v1.s}[2], [x8], x4
dup s18, v2.s[1]
stp s2, s18, [x7]
add x7, x7, x4
st1 {v2.s}[2], [x8], x4
dup s19, v3.s[1]
stp s3, s19, [x7]
add x7, x7, x4
st1 {v3.s}[2], [x8], x4
b WriteEndx4
Write4x4:                               // all 4 channels valid, 4 pixels
st1 {v0.4s}, [x7], x4
st1 {v1.4s}, [x7], x4
st1 {v2.4s}, [x7], x4
st1 {v3.4s}, [x7], x4
WriteEndx4:
subs x6, x6, #4
beq LoopOcEnd
cmp x6, #4
blt Loop1                               // <4 pixels left -> tail loop
b Loop4
Loop1:                                  // tail loop: 1 pixel / iteration
ld1 {v0.4s}, [x1], #16
fmax v0.4s, v0.4s, v5.4s                // clamp below at 0 (stray v4 fadd removed)
fmin v0.4s, v0.4s, v6.4s                // clamp above at 6
cmp x2, #4
bge Write4
cmp x2, #3
beq Write3
cmp x2, #2
beq Write2
Write1:
str s0, [x7]
add x7, x7, x4
b WriteEnd
Write2:
dup s16, v0.s[1]
stp s0, s16, [x7]
add x7, x7, x4
b WriteEnd
Write3:
add x8, x7, #8
dup s16, v0.s[1]
stp s0, s16, [x7]
add x7, x7, x4
st1 {v0.s}[2], [x8], x4
b WriteEnd
Write4:
st1 {v0.4s}, [x7], x4
WriteEnd:
subs x6, x6, #1
bne Loop1
LoopOcEnd:
subs x2, x2, #4                         // next 4-channel group
add x0, x0, #16                         // dst base advances 4 floats (16 B)
bgt LoopOc
ret
#endif

@ -40,32 +40,6 @@ void PostConvFuncComm(const float *src_ptr_, float *out_ptr, const float *bias_p
return;
}
void PostConvFuncFp32C4(const float *c4_out_ptr, float *out_ptr, const float *bias_ptr, size_t output_channel,
                        size_t plane_size, size_t stride, bool is_relu, bool is_relu6) {
#ifndef ENABLE_ARM64
  // Portable C fallback: bias add + activation handled generically for C4 blocks.
  PostConvFuncComm(c4_out_ptr, out_ptr, bias_ptr, output_channel, plane_size, stride, is_relu, is_relu6, C4NUM);
#else
  // Dispatch to the AArch64 assembly kernels; they take the stride in bytes.
  const size_t stride_bytes = stride * sizeof(float);
  if (bias_ptr == NULL) {
    // Without a bias there is only an activation to apply (or nothing at all).
    if (is_relu) {
      C4Relu(out_ptr, c4_out_ptr, output_channel, plane_size, stride_bytes);
    } else if (is_relu6) {
      C4Relu6(out_ptr, c4_out_ptr, output_channel, plane_size, stride_bytes);
    }
    // No bias and no activation: leave the output untouched.
  } else if (is_relu) {
    C4BiasAddRelu(out_ptr, c4_out_ptr, bias_ptr, output_channel, plane_size, stride_bytes);
  } else if (is_relu6) {
    C4BiasAddRelu6(out_ptr, c4_out_ptr, bias_ptr, output_channel, plane_size, stride_bytes);
  } else {
    C4BiasAdd(out_ptr, c4_out_ptr, bias_ptr, output_channel, plane_size, stride_bytes);
  }
#endif
  return;
}
void PostConvFuncFp32C8(const float *c8_out_ptr, float *out_ptr, const float *bias_ptr, size_t output_channel,
size_t plane_size, size_t stride, bool is_relu, bool is_relu6) {
#ifndef ENABLE_ARM64

@ -27,8 +27,6 @@
extern "C" {
#endif
void PostConvFuncFp32C4(const float *c4_out_ptr, float *out_ptr, const float *bias_ptr, size_t output_channel,
size_t plane_size, size_t stride, bool is_relu, bool is_relu6);
void PostConvFuncFp32C8(const float *c8_out_ptr, float *out_ptr, const float *bias_ptr, size_t output_channel,
size_t plane_size, size_t stride, bool is_relu, bool is_relu6);
float ShortToFloat32(uint16_t src_value);
@ -50,11 +48,6 @@ void BiasAddRelu6(const float *bias, float *data, size_t oc4, size_t plan_size);
void BiasAddRelu(const float *bias, float *data, size_t oc4, size_t plan_size);
void Relu6(float *data, size_t element4);
void Relu(float *data, size_t element4);
void C4BiasAdd(float *dst, const float *input, const float *bias, size_t oc, size_t plane_size, size_t stride);
void C4BiasAddRelu(float *dst, const float *input, const float *bias, size_t oc, size_t plane_size, size_t stride);
void C4BiasAddRelu6(float *dst, const float *input, const float *bias, size_t oc, size_t plane_size, size_t stride);
void C4Relu(float *dst, const float *input, size_t oc, size_t plane_size, size_t stride);
void C4Relu6(float *dst, const float *input, size_t oc, size_t plane_size, size_t stride);
void ConvDwFp32Row(float *output_ptr, const float *input_ptr, const float *weight_ptr, size_t num_pixels,
size_t output_channel, size_t input_step);

@ -45,8 +45,13 @@ int ConvolutionCPUKernel::InitWeightBias() {
int ic4 = UP_DIV(in_channel, C4NUM);
int kernel_plane = kernel_h * kernel_w;
int oc_block, oc_block_num;
#ifdef ENABLE_ARM32
oc_block = C4NUM;
oc_block_num = UP_DIV(out_channel, C4NUM);
#else
oc_block = C8NUM;
oc_block_num = UP_DIV(out_channel, C8NUM);
#endif
int pack_weight_size = oc_block_num * oc_block * ic4 * C4NUM * kernel_plane;
auto origin_weight = reinterpret_cast<float *>(filter_tensor->MutableData());
@ -113,11 +118,11 @@ void ConvolutionCPUKernel::ConfigInputOutput() {
auto output_tensor = out_tensors_.at(kOutputIndex);
output_tensor->SetFormat(schema::Format::Format_NHWC);
// #ifdef ENABLE_ARM32
// gemm_func_ = IndirectGemmFp32_8x4;
// #else
#ifdef ENABLE_ARM32
gemm_func_ = IndirectGemmFp32_8x4;
#else
gemm_func_ = IndirectGemmFp32_8x8;
// #endif
#endif
}
int ConvolutionCPUKernel::Init() {

@ -170,79 +170,6 @@ TEST_F(TestConv1x1Fp32, Conv1x1WeightTest1) {
delete conv_param;
}
// Exercises PostConvFuncFp32C4 with oc = 5 (one full 4-channel group plus a
// ragged group of 1), plane_size = 8, stride = 5 floats, covering all four
// bias/activation combinations against precomputed expected outputs.
TEST_F(TestConv1x1Fp32, PostConvFuncC4Test1) {
// C4-packed input: second channel group carries only 1 valid channel, the
// remaining 3 lanes are zero padding.
float in[] = {-9.389655, -5.83877, 7.5724425, -1.4675674, -5.456284, 0.7406984, 16.965645, 10.888806,
-0.8614793, -4.404605, 10.917422, 0.11158327, -5.2733865, -0.96367484, -4.731118, -7.576815,
-6.1621623, -0.6315082, -9.140878, 9.266748, 13.644127, 8.206812, 7.091153, -0.50162584,
2.0889723, 6.6916203, -5.3981733, 11.997365, -9.254076, -5.5964484, -5.981469, -0.51114964,
-2.6300175, 0, 0, 0, -7.2690716, 0, 0, 0,
11.1863365, 0, 0, 0, -3.4595785, 0, 0, 0,
-8.344107, 0, 0, 0, -3.792715, 0, 0, 0,
-7.0394287, 0, 0, 0, -2.7693212, 0, 0, 0};
// 5 valid bias values, padded to 8 (2 groups of 4).
float bias[] = {0.7429814, 0.4863214, 0.9888875, 0.19727881, 0.009881007, 0, 0, 0};
float out[40] = {0};
// Case 1: bias add only (no activation).
float no[] = {-8.646674, -5.3524485, 8.56133, -1.2702886, -2.6201365, -4.7133026, 1.2270198, 17.954533,
11.086085, -7.2591906, -0.11849791, -3.9182835, 11.90631, 0.3088621, 11.196218, -4.530405,
-0.47735345, -3.7422307, -7.379536, -3.4496975, -5.419181, -0.14518678, -8.15199, 9.464027,
-8.334226, 14.387108, 8.693133, 8.080041, -0.30434704, -3.782834, 2.8319538, 7.177942,
-4.409286, 12.194644, -7.0295477, -8.511095, -5.110127, -4.992582, -0.31387085, -2.7594402};
PostConvFuncFp32C4(in, out, bias, 5, 8, 5, false, false);
CompareOutputData(out, no, 40, 0.0001);
// Case 2: bias add + ReLU (negatives clamped to 0).
float relu[] = {0, 0, 8.56133, 0, 0, 0, 1.2270198, 17.954533, 11.086085, 0,
0, 0, 11.90631, 0.3088621, 11.196218, 0, 0, 0, 0, 0,
0, 0, 0, 9.464027, 0, 14.387108, 8.693133, 8.080041, 0, 0,
2.8319538, 7.177942, 0, 12.194644, 0, 0, 0, 0, 0, 0};
PostConvFuncFp32C4(in, out, bias, 5, 8, 5, true, false);
CompareOutputData(out, relu, 40, 0.0001);
// Case 3: bias add + ReLU6 (clamped to [0, 6]).
float corr_relu6[] = {0, 0, 6, 0, 0, 0, 1.2270198, 6, 6, 0, 0, 0, 6, 0.3088621, 6, 0, 0, 0, 0, 0,
0, 0, 0, 6, 0, 6, 6, 6, 0, 0, 2.8319538, 6, 0, 6, 0, 0, 0, 0, 0, 0};
PostConvFuncFp32C4(in, out, bias, 5, 8, 5, false, true);
CompareOutputData(out, corr_relu6, 40, 0.0001);
// Case 4: no bias (nullptr) + ReLU.
float nob_relu[] = {0, 0, 7.5724425, 0, 0, 0, 0.7406984, 16.965645,
10.888806, 0, 0, 0, 10.917422, 0.11158327, 11.1863365, 0,
0, 0, 0, 0, 0, 0, 0, 9.266748,
0, 13.644127, 8.206812, 7.091153, 0, 0, 2.0889723, 6.6916203,
0, 11.997365, 0, 0, 0, 0, 0, 0};
PostConvFuncFp32C4(in, out, nullptr, 5, 8, 5, true, false);
CompareOutputData(out, nob_relu, 40, 0.0001);
}
// Exercises PostConvFuncFp32C4 when the channel dimension is split across
// "threads": two sequential calls, each handling one 4-channel chunk, must
// together reproduce the single-call result of Test1's bias-only case.
TEST_F(TestConv1x1Fp32, PostConvFuncC4Test2) {
// Same C4-packed input and bias as PostConvFuncC4Test1.
float in[] = {-9.389655, -5.83877, 7.5724425, -1.4675674, -5.456284, 0.7406984, 16.965645, 10.888806,
-0.8614793, -4.404605, 10.917422, 0.11158327, -5.2733865, -0.96367484, -4.731118, -7.576815,
-6.1621623, -0.6315082, -9.140878, 9.266748, 13.644127, 8.206812, 7.091153, -0.50162584,
2.0889723, 6.6916203, -5.3981733, 11.997365, -9.254076, -5.5964484, -5.981469, -0.51114964,
-2.6300175, 0, 0, 0, -7.2690716, 0, 0, 0,
11.1863365, 0, 0, 0, -3.4595785, 0, 0, 0,
-8.344107, 0, 0, 0, -3.792715, 0, 0, 0,
-7.0394287, 0, 0, 0, -2.7693212, 0, 0, 0};
float bias[] = {0.7429814, 0.4863214, 0.9888875, 0.19727881, 0.009881007, 0, 0, 0};
// Expected: identical to the bias-only result from Test1.
float corr[] = {-8.646674, -5.3524485, 8.56133, -1.2702886, -2.6201365, -4.7133026, 1.2270198, 17.954533,
11.086085, -7.2591906, -0.11849791, -3.9182835, 11.90631, 0.3088621, 11.196218, -4.530405,
-0.47735345, -3.7422307, -7.379536, -3.4496975, -5.419181, -0.14518678, -8.15199, 9.464027,
-8.334226, 14.387108, 8.693133, 8.080041, -0.30434704, -3.782834, 2.8319538, 7.177942,
-4.409286, 12.194644, -7.0295477, -8.511095, -5.110127, -4.992582, -0.31387085, -2.7594402};
float out[40] = {0};
int thread_count_ = 2;
int thread_oc4_stride_ = 1;  // each "thread" handles 1 group of 4 channels
int output_channel = 5;
int plane_size = 8;
for (int i = 0; i < thread_count_; i++) {
// cur_oc: valid channels in this chunk (second chunk gets the ragged 1).
int cur_oc = MSMIN(thread_oc4_stride_ * 4, output_channel - i * thread_oc4_stride_ * 4);
if (cur_oc <= 0) break;
// NOTE(review): "i * i" in the output offset looks like a typo for "i";
// with i in {0, 1} the two expressions coincide (0*0==0, 1*1==1), so the
// test still passes — confirm intent before reusing with thread_count_ > 2.
PostConvFuncFp32C4(in + thread_oc4_stride_ * i * 8 * 4, out + i * i * thread_oc4_stride_ * 4,
bias + i * thread_oc4_stride_ * 4, cur_oc, plane_size, output_channel, false, false);
}
CompareOutputData(out, corr, 40, 0.0001);
}
int Conv1x1TestInit1(std::vector<lite::Tensor *> *inputs_, std::vector<lite::Tensor *> *outputs_,
ConvParameter *conv_param, float **correct) {
lite::Tensor *in_t = new lite::Tensor(kNumberTypeFloat, {1, 2, 3, 4}, schema::Format_NHWC,

Loading…
Cancel
Save