parent afc27a3bb4
commit 3bb937e718

@@ -0,0 +1,102 @@
#ifdef __aarch64__

.text
.align 5
.global ConvDw3x3Corner
#ifndef __APPLE__
.type ConvDw3x3Corner, %function
#endif

// void ConvDw3x3Corner(float *dst, const float *src, const float *weight, const float *bias, int in_kh_step,
//                      int in_kw_step, int channel, size_t relu, size_t relu6)

// x0: dst, x1: src, x2: weight, x3: bias, x4: in_kh_step, x5: in_kw_step, x6: channel, x7: relu, x8: relu6

ConvDw3x3Corner:
    // Registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to
    // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
    // x19 ~ x29 should also be preserved,
    // whereas our coding style does not permit such a large number of parameters.
    ldr x8, [sp]

    mov x9, #4
    mul x13, x6, x9  // channel * 4
    mul x4, x4, x9   // in_kh_step * 4
    mul x5, x5, x9   // in_kw_step * 4
    mov x9, #3
    mul x14, x13, x9 // channel * 3 * 4

    movi v26.4s, #6
    scvtf v26.4s, v26.4s
    dup v27.4s, wzr

    ld1 {v23.4s}, [x3], #16
    mov x9, x1
    mov x10, x2

    ld1 {v0.4s}, [x9], x5
    add x11, x1, x4
    ld1 {v4.4s}, [x10], x13 // weight
    add x12, x2, x14
    ld1 {v1.4s}, [x9], x5
    ld1 {v5.4s}, [x10], x13
    ld1 {v2.4s}, [x11], x5
    ld1 {v6.4s}, [x12], x13
    ld1 {v3.4s}, [x11], x5
    ld1 {v7.4s}, [x12], x13

    cmp x6, #4
    ble LoopC4Post

LoopC4:
    add x1, x1, #16
    add x2, x2, #16
    fmla v23.4s, v0.4s, v4.4s
    mov x9, x1
    mov x10, x2
    ld1 {v0.4s}, [x9], x5
    ld1 {v4.4s}, [x10], x13
    add x11, x1, x4
    fmla v23.4s, v1.4s, v5.4s
    add x12, x2, x14
    ld1 {v1.4s}, [x9], x5
    fmla v23.4s, v2.4s, v6.4s
    ld1 {v5.4s}, [x10], x13
    ld1 {v2.4s}, [x11], x5
    fmla v23.4s, v3.4s, v7.4s
    ld1 {v6.4s}, [x12], x13
    ld1 {v3.4s}, [x11], x5
    ld1 {v7.4s}, [x12], x13

    cbnz x8, C4_RELU6
    cbnz x7, C4_RELU
    b C4_WRITE
C4_RELU6:
    fmin v23.4s, v23.4s, v26.4s
C4_RELU:
    fmax v23.4s, v23.4s, v27.4s
C4_WRITE:
    st1 {v23.4s}, [x0], #16
    ld1 {v23.4s}, [x3], #16

    sub x6, x6, #4
    cmp x6, #4
    bgt LoopC4

LoopC4Post:
    fmla v23.4s, v0.4s, v4.4s
    fmla v23.4s, v1.4s, v5.4s
    fmla v23.4s, v2.4s, v6.4s
    fmla v23.4s, v3.4s, v7.4s

    cbnz x8, RELU6
    cbnz x7, RELU
    b WRITE
RELU6:
    fmin v23.4s, v23.4s, v26.4s
RELU:
    fmax v23.4s, v23.4s, v27.4s
WRITE:
    st1 {v23.4s}, [x0], #16
    ret
#endif
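The nine-parameter signature above has one argument more than AAPCS64 can pass in the x0–x7 argument registers, so the last parameter (relu6) arrives on the stack; that is what the leading "ldr x8, [sp]" fetches. A minimal, hypothetical C++ caller, not taken from the source tree, with purely illustrative stride and flag values:

#include <cstddef>

extern "C" void ConvDw3x3Corner(float *dst, const float *src, const float *weight, const float *bias,
                                int in_kh_step, int in_kw_step, int channel, size_t relu, size_t relu6);

// Hypothetical wrapper: strides are given in floats; the routine scales them to bytes itself.
void CornerExample(float *dst, const float *src, const float *weight, const float *bias, int channel,
                   int in_kh_step, int in_kw_step) {
  // relu = 0, relu6 = 1 requests clamping to [0, 6]; the ninth argument is spilled to the stack by the compiler.
  ConvDw3x3Corner(dst, src, weight, bias, in_kh_step, in_kw_step, channel, 0, 1);
}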

@@ -0,0 +1,118 @@
#ifdef __aarch64__

.text
.align 5
.global ConvDw3x3Horizontal
#ifndef __APPLE__
.type ConvDw3x3Horizontal, %function
#endif

// void ConvDw3x3Horizontal(float *dst, const float *src, const float *weight, const float *bias, int in_kh_step,
//                          int in_kw_step, int channel, size_t relu, size_t relu6)

// x0: dst, x1: src, x2: weight, x3: bias, x4: in_kh_step, x5: in_kw_step, x6: channel, x7: relu, x8: relu6

ConvDw3x3Horizontal:
    // Registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to
    // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
    // x19 ~ x29 should also be preserved,
    // whereas our coding style does not permit such a large number of parameters.
    ldr x8, [sp]

    mov x9, #4
    mul x13, x6, x9  // channel * 4
    mul x4, x4, x9   // in_kh_step * 4
    mul x5, x5, x9   // in_kw_step * 4
    mov x9, #3
    mul x14, x13, x9 // channel * 3 * 4

    movi v26.4s, #6
    scvtf v26.4s, v26.4s
    dup v27.4s, wzr

    ld1 {v23.4s}, [x3], #16
    mov x9, x1
    mov x10, x2

    ld1 {v0.4s}, [x9], x5
    add x11, x1, x4
    ld1 {v4.4s}, [x10], x13
    add x12, x2, x14
    ld1 {v1.4s}, [x9], x5
    ld1 {v5.4s}, [x10], x13
    add x15, x11, x4
    ld1 {v2.4s}, [x11], x5
    add x16, x12, x14
    ld1 {v6.4s}, [x12], x13
    ld1 {v3.4s}, [x11], x5
    ld1 {v7.4s}, [x12], x13
    ld1 {v16.4s}, [x15], x5
    ld1 {v18.4s}, [x16], x13
    ld1 {v17.4s}, [x15], x5
    ld1 {v19.4s}, [x16], x13

    cmp x6, #4
    ble LoopC4Post

LoopC4:
    add x1, x1, #16
    add x2, x2, #16
    fmla v23.4s, v0.4s, v4.4s
    mov x9, x1
    mov x10, x2
    ld1 {v0.4s}, [x9], x5
    ld1 {v4.4s}, [x10], x13
    add x11, x1, x4
    fmla v23.4s, v1.4s, v5.4s
    add x12, x2, x14
    ld1 {v1.4s}, [x9], x5
    fmla v23.4s, v2.4s, v6.4s
    add x15, x11, x4
    ld1 {v5.4s}, [x10], x13
    ld1 {v2.4s}, [x11], x5
    fmla v23.4s, v3.4s, v7.4s
    add x16, x12, x14
    ld1 {v6.4s}, [x12], x13
    ld1 {v3.4s}, [x11], x5
    fmla v23.4s, v16.4s, v18.4s
    ld1 {v7.4s}, [x12], x13
    ld1 {v16.4s}, [x15], x5
    fmla v23.4s, v17.4s, v19.4s
    ld1 {v18.4s}, [x16], x13
    ld1 {v17.4s}, [x15], x5
    ld1 {v19.4s}, [x16], x13

    cbnz x8, C4_RELU6
    cbnz x7, C4_RELU
    b C4_WRITE
C4_RELU6:
    fmin v23.4s, v23.4s, v26.4s
C4_RELU:
    fmax v23.4s, v23.4s, v27.4s
C4_WRITE:
    st1 {v23.4s}, [x0], #16
    ld1 {v23.4s}, [x3], #16

    sub x6, x6, #4
    cmp x6, #4
    bgt LoopC4

LoopC4Post:
    fmla v23.4s, v0.4s, v4.4s
    fmla v23.4s, v1.4s, v5.4s
    fmla v23.4s, v2.4s, v6.4s
    fmla v23.4s, v3.4s, v7.4s
    fmla v23.4s, v16.4s, v18.4s
    fmla v23.4s, v17.4s, v19.4s

    cbnz x8, RELU6
    cbnz x7, RELU
    b WRITE
RELU6:
    fmin v23.4s, v23.4s, v26.4s
RELU:
    fmax v23.4s, v23.4s, v27.4s
WRITE:
    st1 {v23.4s}, [x0], #16
    ret
#endif

@@ -0,0 +1,199 @@
#ifdef __aarch64__

.text
.align 5
.global ConvDw3x3Stride1
#ifndef __APPLE__
.type ConvDw3x3Stride1, %function
#endif


// void ConvDw3x3Stride1(float *output, const float *buffer, const float *weight, const float *bias, int col_size,
//                       int row_size, int channel, int output_h, int output_w, size_t relu, size_t relu6)
//
// x0: output
// x1: input
// x2: weight
// x3: bias
// w4: col_size
// w5: row_size
// w6: channel
// w7: output_h
// w8: output_w
// w9: relu
// w10: relu6

ConvDw3x3Stride1:
    sub sp, sp, #128
    st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
    st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64

    ldr w8, [sp]       // output_w
    ldr w9, [sp, #8]   // relu
    ldr w10, [sp, #16] // relu6

    mov w11, #4
    mul w15, w4, w11 // col_size * 4
    mul w16, w6, w11 // channel * 4
    mul w17, w5, w11 // row_size * 4
    mov w11, #2
    mul w14, w11, w15 // col_size * 2 * 4

    movi v23.4s, #6
    scvtf v23.4s, v23.4s
    dup v24.4s, wzr

    // Load weights
    ld1 {v0.4s}, [x2], x16
    ld1 {v1.4s}, [x2], x16
    ld1 {v2.4s}, [x2], x16
    ld1 {v3.4s}, [x2], x16
    ld1 {v4.4s}, [x2], x16
    ld1 {v5.4s}, [x2], x16
    ld1 {v6.4s}, [x2], x16
    ld1 {v7.4s}, [x2], x16
    ld1 {v8.4s}, [x2], x16

    mov x11, x1
    add x12, x11, x17
    add x13, x12, x17
    ld1 {v9.4s}, [x11], x15
    ld1 {v10.4s}, [x11], x15
    ld1 {v11.4s}, [x11], x15
    ld1 {v13.4s}, [x12], x15
    ld1 {v14.4s}, [x12], x15
    ld1 {v15.4s}, [x12], x15
    ld1 {v17.4s}, [x13], x15
    ld1 {v18.4s}, [x13], x15
    ld1 {v19.4s}, [x13], x15

    ld1 {v21.4s}, [x3]
    ld1 {v22.4s}, [x3]

    cmp w8, #2
    beq WIDTH2_LEFT
    cmp w8, #1
    beq WIDTH1_LEFT

WIDTH2_LOOP:
    fmla v21.4s, v0.4s, v9.4s
    ld1 {v12.4s}, [x11]
    ld1 {v16.4s}, [x12]
    fmla v22.4s, v0.4s, v10.4s
    ld1 {v20.4s}, [x13]
    add x1, x1, x14
    fmla v21.4s, v1.4s, v10.4s
    mov x11, x1
    add x12, x11, x17
    add x13, x12, x17
    ld1 {v9.4s}, [x11], x15
    fmla v22.4s, v1.4s, v11.4s
    ld1 {v10.4s}, [x11], x15
    fmla v21.4s, v2.4s, v11.4s
    fmla v22.4s, v2.4s, v12.4s
    fmla v21.4s, v3.4s, v13.4s
    ld1 {v11.4s}, [x11], x15
    fmla v22.4s, v3.4s, v14.4s
    fmla v21.4s, v4.4s, v14.4s
    ld1 {v13.4s}, [x12], x15
    fmla v22.4s, v4.4s, v15.4s
    fmla v21.4s, v5.4s, v15.4s
    ld1 {v14.4s}, [x12], x15
    fmla v22.4s, v5.4s, v16.4s
    fmla v21.4s, v6.4s, v17.4s
    ld1 {v15.4s}, [x12], x15
    fmla v22.4s, v6.4s, v18.4s
    fmla v21.4s, v7.4s, v18.4s
    ld1 {v17.4s}, [x13], x15
    fmla v22.4s, v7.4s, v19.4s
    fmla v21.4s, v8.4s, v19.4s
    ld1 {v18.4s}, [x13], x15
    fmla v22.4s, v8.4s, v20.4s
    ld1 {v19.4s}, [x13], x15

    cbnz x10, WIDTH2_RELU6
    cbnz x9, WIDTH2_RELU
    b WIDTH2_WRITE
WIDTH2_RELU6:
    fmin v21.4s, v21.4s, v23.4s
    fmin v22.4s, v22.4s, v23.4s
WIDTH2_RELU:
    fmax v21.4s, v21.4s, v24.4s
    fmax v22.4s, v22.4s, v24.4s
WIDTH2_WRITE:
    st1 {v21.4s}, [x0], x16
    ld1 {v21.4s}, [x3]
    st1 {v22.4s}, [x0], x16
    ld1 {v22.4s}, [x3]

    sub w8, w8, #2
    cmp w8, #2
    bgt WIDTH2_LOOP

    cmp w8, #2
    blt WIDTH1_LEFT

WIDTH2_LEFT:
    fmla v21.4s, v0.4s, v9.4s
    ld1 {v12.4s}, [x11]
    fmla v22.4s, v0.4s, v10.4s
    fmla v21.4s, v1.4s, v10.4s
    ld1 {v16.4s}, [x12]
    fmla v22.4s, v1.4s, v11.4s
    fmla v21.4s, v2.4s, v11.4s
    ld1 {v20.4s}, [x13]
    fmla v22.4s, v2.4s, v12.4s
    fmla v21.4s, v3.4s, v13.4s
    fmla v22.4s, v3.4s, v14.4s
    fmla v21.4s, v4.4s, v14.4s
    fmla v22.4s, v4.4s, v15.4s
    fmla v21.4s, v5.4s, v15.4s
    fmla v22.4s, v5.4s, v16.4s
    fmla v21.4s, v6.4s, v17.4s
    fmla v22.4s, v6.4s, v18.4s
    fmla v21.4s, v7.4s, v18.4s
    fmla v22.4s, v7.4s, v19.4s
    fmla v21.4s, v8.4s, v19.4s
    fmla v22.4s, v8.4s, v20.4s

    cbnz x10, WIDTH2_LEFT_RELU6
    cbnz x9, WIDTH2_LEFT_RELU
    b WIDTH2_LEFT_WRITE
WIDTH2_LEFT_RELU6:
    fmin v21.4s, v21.4s, v23.4s
    fmin v22.4s, v22.4s, v23.4s
WIDTH2_LEFT_RELU:
    fmax v21.4s, v21.4s, v24.4s
    fmax v22.4s, v22.4s, v24.4s
WIDTH2_LEFT_WRITE:
    st1 {v21.4s}, [x0], x16
    st1 {v22.4s}, [x0], x16
    b End

WIDTH1_LEFT:
    fmla v21.4s, v0.4s, v9.4s
    fmla v21.4s, v1.4s, v10.4s
    fmla v21.4s, v2.4s, v11.4s
    fmla v21.4s, v3.4s, v13.4s
    fmla v21.4s, v4.4s, v14.4s
    fmla v21.4s, v5.4s, v15.4s
    fmla v21.4s, v6.4s, v17.4s
    fmla v21.4s, v7.4s, v18.4s
    fmla v21.4s, v8.4s, v19.4s

    cbnz x10, WIDTH1_RELU6
    cbnz x9, WIDTH1_RELU
    b WIDTH1_WRITE
WIDTH1_RELU6:
    fmin v21.4s, v21.4s, v23.4s
WIDTH1_RELU:
    fmax v21.4s, v21.4s, v24.4s
WIDTH1_WRITE:
    st1 {v21.4s}, [x0]

End:
    sub sp, sp, #128
    ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
    ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
    ret
#endif
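For reference, WIDTH2_LOOP above evaluates two adjacent output columns of a 3x3, stride-1 depthwise convolution for one block of four channels. A hypothetical scalar equivalent, not part of the source; strides are in floats (matching the col_size/row_size byte strides computed above) and the relu/relu6 clamping is omitted:

// Hypothetical scalar equivalent of ConvDw3x3Stride1 for one 4-channel block.
// Buffer rows are row_size floats apart, columns col_size apart; the packed
// 3x3 weights are stored tap-major with a stride of channel floats per tap.
void ConvDw3x3Stride1Ref(float *output, const float *buffer, const float *weight,
                         const float *bias, int col_size, int row_size, int channel,
                         int output_w) {
  for (int ow = 0; ow < output_w; ++ow) {
    for (int c = 0; c < 4; ++c) {
      float acc = bias[c];
      for (int kh = 0; kh < 3; ++kh) {
        for (int kw = 0; kw < 3; ++kw) {
          acc += weight[(kh * 3 + kw) * channel + c] *
                 buffer[kh * row_size + (ow + kw) * col_size + c];
        }
      }
      output[ow * channel + c] = acc;  // relu/relu6 clamping omitted for brevity
    }
  }
}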

@@ -0,0 +1,201 @@
#ifdef __aarch64__

.text
.align 5
.global ConvDw3x3Stride2
#ifndef __APPLE__
.type ConvDw3x3Stride2, %function
#endif


// void ConvDw3x3Stride2(float *output, const float *buffer, const float *weight, const float *bias, int col_size,
//                       int row_size, int channel, int output_h, int output_w, size_t relu, size_t relu6)
//
// x0: output
// x1: input
// x2: weight
// x3: bias
// w4: col_size
// w5: row_size
// w6: channel
// w7: output_h
// w8: output_w
// w9: relu
// w10: relu6

ConvDw3x3Stride2:
    sub sp, sp, #128
    st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
    st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64

    ldr w8, [sp]       // output_w
    ldr w9, [sp, #8]   // relu
    ldr w10, [sp, #16] // relu6

    mov w11, #4
    mul w15, w4, w11 // col_size * 4
    mul w16, w6, w11 // channel * 4
    mul w17, w5, w11 // row_size * 4
    mov w11, #2
    mul w14, w11, w15 // col_size * 2 * 4

    movi v26.4s, #6
    scvtf v26.4s, v26.4s
    dup v27.4s, wzr

    // Load weights
    ld1 {v0.4s}, [x2], x16
    ld1 {v1.4s}, [x2], x16
    ld1 {v2.4s}, [x2], x16
    ld1 {v3.4s}, [x2], x16
    ld1 {v4.4s}, [x2], x16
    ld1 {v5.4s}, [x2], x16
    ld1 {v6.4s}, [x2], x16
    ld1 {v7.4s}, [x2], x16
    ld1 {v8.4s}, [x2], x16

    mov x11, x1
    add x12, x11, x17
    add x13, x12, x17
    ld1 {v9.4s}, [x11], x15
    ld1 {v10.4s}, [x11], x15
    ld1 {v11.4s}, [x11], x15
    ld1 {v14.4s}, [x12], x15
    ld1 {v15.4s}, [x12], x15
    ld1 {v16.4s}, [x12], x15
    ld1 {v19.4s}, [x13], x15
    ld1 {v20.4s}, [x13], x15
    ld1 {v21.4s}, [x13], x15

    ld1 {v24.4s}, [x3]
    ld1 {v25.4s}, [x3]

    cmp w8, #2
    beq WIDTH2_LEFT
    cmp w8, #1
    beq WIDTH1_LEFT

WIDTH2_LOOP:
    fmla v24.4s, v0.4s, v9.4s
    ld1 {v12.4s}, [x11], x15
    fmla v25.4s, v0.4s, v11.4s
    ld1 {v17.4s}, [x12], x15
    fmla v24.4s, v1.4s, v10.4s
    ld1 {v22.4s}, [x13], x15
    fmla v25.4s, v1.4s, v12.4s
    ld1 {v13.4s}, [x11], x15
    fmla v24.4s, v2.4s, v11.4s
    ld1 {v18.4s}, [x12], x15
    fmla v25.4s, v2.4s, v13.4s
    ld1 {v23.4s}, [x13], x15
    fmla v24.4s, v3.4s, v14.4s
    mov v9.16b, v13.16b
    fmla v25.4s, v3.4s, v16.4s
    mov v14.16b, v18.16b
    fmla v24.4s, v4.4s, v15.4s
    fmla v25.4s, v4.4s, v17.4s
    ld1 {v10.4s}, [x11], x15
    fmla v24.4s, v5.4s, v16.4s
    ld1 {v11.4s}, [x11], x15
    fmla v25.4s, v5.4s, v18.4s
    ld1 {v15.4s}, [x12], x15
    fmla v24.4s, v6.4s, v19.4s
    ld1 {v16.4s}, [x12], x15
    fmla v25.4s, v6.4s, v21.4s
    mov v19.16b, v23.16b
    fmla v24.4s, v7.4s, v20.4s
    fmla v25.4s, v7.4s, v22.4s
    ld1 {v20.4s}, [x13], x15
    fmla v24.4s, v8.4s, v21.4s
    fmla v25.4s, v8.4s, v23.4s
    ld1 {v21.4s}, [x13], x15

    cbnz x10, WIDTH2_RELU6
    cbnz x9, WIDTH2_RELU
    b WIDTH2_WRITE
WIDTH2_RELU6:
    fmin v24.4s, v24.4s, v26.4s
    fmin v25.4s, v25.4s, v26.4s
WIDTH2_RELU:
    fmax v24.4s, v24.4s, v27.4s
    fmax v25.4s, v25.4s, v27.4s
WIDTH2_WRITE:
    st1 {v24.4s}, [x0], x16
    ld1 {v24.4s}, [x3]
    st1 {v25.4s}, [x0], x16
    ld1 {v25.4s}, [x3]

    sub w8, w8, #2
    cmp w8, #2
    bgt WIDTH2_LOOP

    cmp w8, #2
    blt WIDTH1_LEFT

WIDTH2_LEFT:
    fmla v24.4s, v0.4s, v9.4s
    ld1 {v12.4s}, [x11], x15
    fmla v25.4s, v0.4s, v11.4s
    ld1 {v17.4s}, [x12], x15
    fmla v24.4s, v1.4s, v10.4s
    ld1 {v22.4s}, [x13], x15
    fmla v25.4s, v1.4s, v12.4s
    ld1 {v13.4s}, [x11], x15
    fmla v24.4s, v2.4s, v11.4s
    ld1 {v18.4s}, [x12], x15
    fmla v25.4s, v2.4s, v13.4s
    ld1 {v23.4s}, [x13], x15
    fmla v24.4s, v3.4s, v14.4s
    fmla v25.4s, v3.4s, v16.4s
    fmla v24.4s, v4.4s, v15.4s
    fmla v25.4s, v4.4s, v17.4s
    fmla v24.4s, v5.4s, v16.4s
    fmla v25.4s, v5.4s, v18.4s
    fmla v24.4s, v6.4s, v19.4s
    fmla v25.4s, v6.4s, v21.4s
    fmla v24.4s, v7.4s, v20.4s
    fmla v25.4s, v7.4s, v22.4s
    fmla v24.4s, v8.4s, v21.4s
    fmla v25.4s, v8.4s, v23.4s

    cbnz x10, WIDTH2_LEFT_RELU6
    cbnz x9, WIDTH2_LEFT_RELU
    b WIDTH2_LEFT_WRITE
WIDTH2_LEFT_RELU6:
    fmin v24.4s, v24.4s, v26.4s
    fmin v25.4s, v25.4s, v26.4s
WIDTH2_LEFT_RELU:
    fmax v24.4s, v24.4s, v27.4s
    fmax v25.4s, v25.4s, v27.4s
WIDTH2_LEFT_WRITE:
    st1 {v24.4s}, [x0], x16
    st1 {v25.4s}, [x0], x16
    b End

WIDTH1_LEFT:
    fmla v24.4s, v0.4s, v9.4s
    fmla v24.4s, v1.4s, v10.4s
    fmla v24.4s, v2.4s, v11.4s
    fmla v24.4s, v3.4s, v14.4s
    fmla v24.4s, v4.4s, v15.4s
    fmla v24.4s, v5.4s, v16.4s
    fmla v24.4s, v6.4s, v19.4s
    fmla v24.4s, v7.4s, v20.4s
    fmla v24.4s, v8.4s, v21.4s

    cbnz x10, WIDTH1_RELU6
    cbnz x9, WIDTH1_RELU
    b WIDTH1_WRITE
WIDTH1_RELU6:
    fmin v24.4s, v24.4s, v26.4s
WIDTH1_RELU:
    fmax v24.4s, v24.4s, v27.4s
WIDTH1_WRITE:
    st1 {v24.4s}, [x0]

End:
    sub sp, sp, #128
    ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
    ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
    ret
#endif
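ConvDw3x3Stride2 follows the same structure as the stride-1 routine; the essential difference is that output column ow reads buffer columns 2*ow + kw, which is why the second accumulator (v25) starts from the third loaded input column. A hypothetical scalar equivalent, again with the clamping omitted and not taken from the source:

// Hypothetical scalar equivalent of ConvDw3x3Stride2 for one 4-channel block;
// identical to the stride-1 sketch except for the (2 * ow + kw) column index.
void ConvDw3x3Stride2Ref(float *output, const float *buffer, const float *weight,
                         const float *bias, int col_size, int row_size, int channel,
                         int output_w) {
  for (int ow = 0; ow < output_w; ++ow) {
    for (int c = 0; c < 4; ++c) {
      float acc = bias[c];
      for (int kh = 0; kh < 3; ++kh) {
        for (int kw = 0; kw < 3; ++kw) {
          acc += weight[(kh * 3 + kw) * channel + c] *
                 buffer[kh * row_size + (2 * ow + kw) * col_size + c];
        }
      }
      output[ow * channel + c] = acc;
    }
  }
}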

@@ -0,0 +1,114 @@
#ifdef __aarch64__

.text
.align 5
.global ConvDw3x3Vertical
#ifndef __APPLE__
.type ConvDw3x3Vertical, %function
#endif

// void ConvDw3x3Vertical(float *dst, const float *src, const float *weight, const float *bias, int in_kh_step,
//                        int in_kw_step, int channel, size_t relu, size_t relu6)

// x0: dst, x1: src, x2: weight, x3: bias, x4: in_kh_step, x5: in_kw_step, x6: channel, x7: relu, x8: relu6

ConvDw3x3Vertical:
    // Registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to
    // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
    // x19 ~ x29 should also be preserved,
    // whereas our coding style does not permit such a large number of parameters.
    ldr x8, [sp]

    mov x9, #4
    mul x13, x6, x9  // channel * 4
    mul x4, x4, x9   // in_kh_step * 4
    mul x5, x5, x9   // in_kw_step * 4
    mov x9, #3
    mul x14, x13, x9 // channel * 3 * 4

    movi v26.4s, #6
    scvtf v26.4s, v26.4s
    dup v27.4s, wzr

    ld1 {v23.4s}, [x3], #16
    mov x9, x1
    mov x10, x2

    ld1 {v0.4s}, [x9], x5
    add x11, x1, x4
    ld1 {v4.4s}, [x10], x13
    ld1 {v1.4s}, [x9], x5
    add x12, x2, x14
    ld1 {v5.4s}, [x10], x13
    ld1 {v2.4s}, [x11], x5
    ld1 {v6.4s}, [x12], x13
    ld1 {v3.4s}, [x11], x5
    ld1 {v7.4s}, [x12], x13
    ld1 {v16.4s}, [x9], x5
    ld1 {v18.4s}, [x10], x13
    ld1 {v17.4s}, [x11], x5
    ld1 {v19.4s}, [x12], x13

    cmp x6, #4
    ble LoopC4Post

LoopC4:
    add x1, x1, #16
    add x2, x2, #16
    fmla v23.4s, v0.4s, v4.4s
    mov x9, x1
    mov x10, x2
    ld1 {v0.4s}, [x9], x5
    ld1 {v4.4s}, [x10], x13
    add x11, x1, x4
    fmla v23.4s, v1.4s, v5.4s
    add x12, x2, x14
    ld1 {v1.4s}, [x9], x5
    fmla v23.4s, v2.4s, v6.4s
    ld1 {v5.4s}, [x10], x13
    ld1 {v2.4s}, [x11], x5
    fmla v23.4s, v3.4s, v7.4s
    ld1 {v6.4s}, [x12], x13
    ld1 {v3.4s}, [x11], x5
    fmla v23.4s, v16.4s, v18.4s
    ld1 {v7.4s}, [x12], x13
    ld1 {v16.4s}, [x9], x5
    fmla v23.4s, v17.4s, v19.4s
    ld1 {v18.4s}, [x10], x13
    ld1 {v17.4s}, [x11], x5
    ld1 {v19.4s}, [x12], x13

    cbnz x8, C4_RELU6
    cbnz x7, C4_RELU
    b C4_WRITE
C4_RELU6:
    fmin v23.4s, v23.4s, v26.4s
C4_RELU:
    fmax v23.4s, v23.4s, v27.4s
C4_WRITE:
    st1 {v23.4s}, [x0], #16
    ld1 {v23.4s}, [x3], #16

    sub x6, x6, #4
    cmp x6, #4
    bgt LoopC4

LoopC4Post:
    fmla v23.4s, v0.4s, v4.4s
    fmla v23.4s, v1.4s, v5.4s
    fmla v23.4s, v2.4s, v6.4s
    fmla v23.4s, v3.4s, v7.4s
    fmla v23.4s, v16.4s, v18.4s
    fmla v23.4s, v17.4s, v19.4s

    cbnz x8, RELU6
    cbnz x7, RELU
    b WRITE
RELU6:
    fmin v23.4s, v23.4s, v26.4s
RELU:
    fmax v23.4s, v23.4s, v27.4s
WRITE:
    st1 {v23.4s}, [x0], #16
    ret
#endif

File diff suppressed because it is too large

@@ -0,0 +1,149 @@
/**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "src/runtime/kernel/arm/fp32/convolution_depthwise_3x3_fp32.h"
#include "schema/model_generated.h"
#include "src/kernel_registry.h"
#include "include/errorcode.h"
#include "src/runtime/runtime_api.h"

using mindspore::kernel::KERNEL_ARCH::kCPU;
using mindspore::lite::KernelRegistrar;
using mindspore::lite::RET_ERROR;
using mindspore::lite::RET_INFER_INVALID;
using mindspore::lite::RET_OK;
using mindspore::schema::PrimitiveType_DepthwiseConv2D;

namespace mindspore::kernel {
ConvolutionDepthwise3x3CPUKernel::~ConvolutionDepthwise3x3CPUKernel() {
  if (packed_weight_ != nullptr) {
    free(packed_weight_);
    packed_weight_ = nullptr;
  }
  if (sliding_ != nullptr) {
    delete sliding_;
    sliding_ = nullptr;
  }
}

int ConvolutionDepthwise3x3CPUKernel::InitWeightBias() {
  // init weight: k, h, w, c; k == group == output_channel, c == 1
  auto weight_tensor = in_tensors_[kWeightIndex];
  auto origin_weight = reinterpret_cast<float *>(weight_tensor->MutableData());
  int channel = weight_tensor->Batch();
  int pack_weight_size = weight_tensor->Batch() * weight_tensor->Height() * weight_tensor->Width();

  packed_weight_ = reinterpret_cast<float *>(malloc(pack_weight_size * sizeof(float)));
  if (packed_weight_ == nullptr) {
    MS_LOG(ERROR) << "Malloc buffer failed.";
    return RET_ERROR;
  }
  PackWeightKHWToHWKFp32(origin_weight, packed_weight_, weight_tensor->Height() * weight_tensor->Width(), channel);

  bias_data_ = reinterpret_cast<float *>(malloc(channel * sizeof(float)));
  if (bias_data_ == nullptr) {
    MS_LOG(ERROR) << "Malloc buffer failed.";
    return RET_ERROR;
  }

  memset(bias_data_, 0, channel * sizeof(float));
  if (in_tensors_.size() == kInputSize2) {
    auto bias_tensor = in_tensors_[kBiasIndex];
    auto ori_bias = reinterpret_cast<float *>(bias_tensor->MutableData());
    memcpy(bias_data_, ori_bias, bias_tensor->ElementsNum() * sizeof(float));
  }

  return RET_OK;
}

int ConvolutionDepthwise3x3CPUKernel::Init() {
  sliding_ = new (std::nothrow) SlidingWindowParam;
  if (sliding_ == nullptr) {
    MS_LOG(ERROR) << "new sliding window param failed.";
    return RET_ERROR;
  }
  auto ret = InitWeightBias();
  if (ret != 0) {
    MS_LOG(ERROR) << "Convolution depthwise 3x3 fp32 InitWeightBias failed.";
    return RET_ERROR;
  }
  if (!InferShapeDone()) {
    return RET_OK;
  }
  return ReSize();
}

int ConvolutionDepthwise3x3CPUKernel::ReSize() {
  ConvolutionBaseCPUKernel::Init();
  InitSlidingParamConvDw(sliding_, conv_param_, conv_param_->input_channel_);
  conv_param_->thread_num_ = MSMIN(thread_count_, conv_param_->output_h_);
  return RET_OK;
}

int ConvolutionDepthwise3x3CPUKernel::Execute(int task_id) {
  auto buffer = buffer_ + 64 * 10 * 10 * task_id;
  ConvDw3x3(output_ptr_, buffer, input_ptr_, packed_weight_, reinterpret_cast<float *>(bias_data_), conv_param_,
            sliding_, task_id);
  return RET_OK;
}

int ConvDw3x3Run(void *cdata, int task_id) {
  auto conv_dw = reinterpret_cast<ConvolutionDepthwise3x3CPUKernel *>(cdata);
  auto ret = conv_dw->Execute(task_id);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "ConvolutionDepthwise3x3Run error task_id[" << task_id << "] error_code[" << ret << "]";
    return RET_ERROR;
  }
  return RET_OK;
}

int ConvolutionDepthwise3x3CPUKernel::InitBuffer() {
  int buffer_size = 64 * 10 * 10 * conv_param_->thread_num_;
  buffer_ = reinterpret_cast<float *>(context_->allocator->Malloc(buffer_size * sizeof(float)));
  if (buffer_ == nullptr) {
    MS_LOG(ERROR) << "Malloc buffer failed.";
    return RET_ERROR;
  }
  return RET_OK;
}

int ConvolutionDepthwise3x3CPUKernel::Run() {
  auto ret = InitBuffer();
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "Convolution depthwise 3x3 fp32 InitBuffer failed.";
    return ret;
  }

  auto input_tensor = in_tensors_.at(kInputIndex);
  input_ptr_ = reinterpret_cast<float *>(input_tensor->data_c());

  auto output_tensor = out_tensors_.at(kOutputIndex);
  output_ptr_ = reinterpret_cast<float *>(output_tensor->data_c());

  if (sliding_->top_ > 0 || sliding_->bottom_ < conv_param_->output_h_ || sliding_->left_ > 0 ||
      sliding_->right_ < conv_param_->output_w_) {
    ConvDw3x3Pad(output_ptr_, input_ptr_, packed_weight_, reinterpret_cast<float *>(bias_data_), conv_param_, sliding_);
  }
  ret = ParallelLaunch(this->context_->thread_pool_, ConvDw3x3Run, this, conv_param_->thread_num_);
  if (ret != RET_OK) {
    context_->allocator->Free(buffer_);
    MS_LOG(ERROR) << "ConvDw3x3Run error: error_code[" << ret << "]";
    return RET_ERROR;
  }
  context_->allocator->Free(buffer_);
  return RET_OK;
}
}  // namespace mindspore::kernel
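InitWeightBias above repacks the 3x3 depthwise weights from the weight tensor's channel-major layout into the tap-major layout that the assembly kernels read with a stride of channel floats between taps. The real transform is PackWeightKHWToHWKFp32 from nnacl; the standalone sketch below is an assumption based on that name and on the kernels' access pattern, not the actual implementation:

// Hypothetical repacking sketch: src is [channel][plane] (KHW, plane_size = 9 for a
// 3x3 kernel), dst is [plane][channel] (HWK), i.e. a simple transposition.
void PackKHWToHWKSketch(const float *src, float *dst, int plane_size, int channel) {
  for (int c = 0; c < channel; ++c) {
    for (int p = 0; p < plane_size; ++p) {
      dst[p * channel + c] = src[c * plane_size + p];
    }
  }
}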

@@ -0,0 +1,51 @@
/**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_CONVOLUTION_DEPTHWISE_3X3_H_
#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_CONVOLUTION_DEPTHWISE_3X3_H_

#include <vector>
#include "src/lite_kernel.h"
#include "src/runtime/kernel/arm/base/convolution_base.h"
#include "nnacl/fp32/conv_depthwise.h"

namespace mindspore::kernel {
class ConvolutionDepthwise3x3CPUKernel : public ConvolutionBaseCPUKernel {
 public:
  ConvolutionDepthwise3x3CPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
                                   const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx,
                                   const mindspore::lite::PrimitiveC *primitive)
      : ConvolutionBaseCPUKernel(parameter, inputs, outputs, ctx, primitive) {}
  ~ConvolutionDepthwise3x3CPUKernel() override;

  int Init() override;
  int ReSize() override;
  int Run() override;

  int InitWeightBias();
  int Execute(int task_id);

 private:
  int InitBuffer();
  SlidingWindowParam *sliding_ = nullptr;
  float *packed_weight_ = nullptr;
  float *input_ptr_ = nullptr;
  float *output_ptr_ = nullptr;
  float *buffer_ = nullptr;
};
}  // namespace mindspore::kernel

#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_CONVOLUTION_DEPTHWISE_3X3_H_