parent
b6726e4a69
commit
e52aa2b12f
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,131 @@
|
||||
#ifdef __aarch64__

    .text
    .align 5
    //.p2align 5,,15
    .global C4BiasAdd
#ifndef __APPLE__
    .type C4BiasAdd, %function
#endif

// void C4BiasAdd(float *dst, const float *input, const float *bias, size_t oc, size_t plane_size, size_t stride)
// x0: dst, x1: input, x2: bias, x3: oc, x4: plane_size, x5: stride
//
// Adds a per-channel bias to 4-float groups read from input and stores them
// to dst.
//   - input is read sequentially, 16 bytes per plane element, one block of
//     4 channels after another.
//   - bias is read sequentially, 16 bytes per channel block.
//   - For each channel block the write cursor starts at x0 (x0 grows by
//     16 bytes per block) and moves by x5 after every plane element. x5 is
//     added to the address unchanged, i.e. it is used as a byte count.
//   - When fewer than 4 channels remain, only 1, 2 or 3 floats are stored
//     per plane element.
// Returns without touching memory when oc or plane_size is zero.
// Clobbers: x0-x3, x6-x8, v0-v4, flags.

C4BiasAdd:
    // Every loop below tests its counter at the bottom, so a zero count has
    // to be rejected here; otherwise the counter would wrap around.
    cbz x3, End
    cbz x4, End

LoopOc:
    ld1 {v4.4s}, [x2], #16              // v4 = bias of the current 4 channels
    mov x6, x4                          // x6 = plane elements left in this block
    mov x7, x0                          // x7 = write cursor
    cmp x6, #4
    blt Loop1

    // Main path: four plane elements per iteration.
Loop4:
    ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x1], #64
    fadd v0.4s, v0.4s, v4.4s
    fadd v1.4s, v1.4s, v4.4s
    fadd v2.4s, v2.4s, v4.4s
    fadd v3.4s, v3.4s, v4.4s

    // x3 = channels still to be written; pick the store width.
    cmp x3, #4
    bge Write4x4
    cmp x3, #3
    beq Write3x4
    cmp x3, #2
    beq Write2x4

Write1x4:
    st1 {v0.s}[0], [x7], x5
    st1 {v1.s}[0], [x7], x5
    st1 {v2.s}[0], [x7], x5
    st1 {v3.s}[0], [x7], x5
    b WriteEndx4

Write2x4:
    st1 {v0.2s}, [x7], x5
    st1 {v1.2s}, [x7], x5
    st1 {v2.2s}, [x7], x5
    st1 {v3.2s}, [x7], x5
    b WriteEndx4

Write3x4:
    add x8, x7, #8                      // x8 follows x7, pointing at lane 2
    st1 {v0.2s}, [x7], x5
    st1 {v0.s}[2], [x8], x5
    st1 {v1.2s}, [x7], x5
    st1 {v1.s}[2], [x8], x5
    st1 {v2.2s}, [x7], x5
    st1 {v2.s}[2], [x8], x5
    st1 {v3.2s}, [x7], x5
    st1 {v3.s}[2], [x8], x5
    b WriteEndx4

Write4x4:
    st1 {v0.4s}, [x7], x5
    st1 {v1.4s}, [x7], x5
    st1 {v2.4s}, [x7], x5
    st1 {v3.4s}, [x7], x5

WriteEndx4:
    subs x6, x6, #4
    beq LoopOcEnd
    cmp x6, #4
    bge Loop4

    // Tail path: one plane element per iteration (x6 is 1..3 here, or the
    // whole plane when plane_size < 4).
Loop1:
    ld1 {v0.4s}, [x1], #16
    fadd v0.4s, v0.4s, v4.4s

    cmp x3, #4
    bge Write4
    cmp x3, #3
    beq Write3
    cmp x3, #2
    beq Write2

Write1:
    st1 {v0.s}[0], [x7], x5
    b WriteEnd

Write2:
    st1 {v0.2s}, [x7], x5
    b WriteEnd

Write3:
    add x8, x7, #8
    st1 {v0.2s}, [x7], x5
    st1 {v0.s}[2], [x8]
    b WriteEnd

Write4:
    st1 {v0.4s}, [x7], x5

WriteEnd:
    subs x6, x6, #1
    bne Loop1

LoopOcEnd:
    subs x3, x3, #4                     // flags consumed by bgt below
    add x0, x0, #16                     // next channel block starts 4 floats further
    bgt LoopOc

End:
    ret
#endif
|
@ -0,0 +1,137 @@
|
||||
#ifdef __aarch64__

    .text
    .align 5
    //.p2align 5,,15
    .global C4BiasAddRelu
#ifndef __APPLE__
    .type C4BiasAddRelu, %function
#endif

// void C4BiasAddRelu(float *dst, const float *input, const float *bias, size_t oc, size_t plane_size, size_t stride)
// x0: dst, x1: input, x2: bias, x3: oc, x4: plane_size, x5: stride
//
// Adds a per-channel bias to 4-float groups read from input, clamps the
// result below at 0.0 (fmax against a zero vector) and stores it to dst.
//   - input is read sequentially, 16 bytes per plane element, one block of
//     4 channels after another.
//   - bias is read sequentially, 16 bytes per channel block.
//   - For each channel block the write cursor starts at x0 (x0 grows by
//     16 bytes per block) and moves by x5 after every plane element. x5 is
//     added to the address unchanged, i.e. it is used as a byte count.
//   - When fewer than 4 channels remain, only 1, 2 or 3 floats are stored
//     per plane element.
// Returns without touching memory when oc or plane_size is zero.
// Clobbers: x0-x3, x6-x8, v0-v5, flags.

C4BiasAddRelu:
    // Every loop below tests its counter at the bottom, so a zero count has
    // to be rejected here; otherwise the counter would wrap around.
    cbz x3, End
    cbz x4, End

    dup v5.4s, wzr                      // v5 = {0, 0, 0, 0}, lower clamp bound

LoopOc:
    ld1 {v4.4s}, [x2], #16              // v4 = bias of the current 4 channels
    mov x6, x4                          // x6 = plane elements left in this block
    mov x7, x0                          // x7 = write cursor
    cmp x6, #4
    blt Loop1

    // Main path: four plane elements per iteration.
Loop4:
    ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x1], #64
    fadd v0.4s, v0.4s, v4.4s
    fadd v1.4s, v1.4s, v4.4s
    fadd v2.4s, v2.4s, v4.4s
    fadd v3.4s, v3.4s, v4.4s

    fmax v0.4s, v0.4s, v5.4s
    fmax v1.4s, v1.4s, v5.4s
    fmax v2.4s, v2.4s, v5.4s
    fmax v3.4s, v3.4s, v5.4s

    // x3 = channels still to be written; pick the store width.
    cmp x3, #4
    bge Write4x4
    cmp x3, #3
    beq Write3x4
    cmp x3, #2
    beq Write2x4

Write1x4:
    st1 {v0.s}[0], [x7], x5
    st1 {v1.s}[0], [x7], x5
    st1 {v2.s}[0], [x7], x5
    st1 {v3.s}[0], [x7], x5
    b WriteEndx4

Write2x4:
    st1 {v0.2s}, [x7], x5
    st1 {v1.2s}, [x7], x5
    st1 {v2.2s}, [x7], x5
    st1 {v3.2s}, [x7], x5
    b WriteEndx4

Write3x4:
    add x8, x7, #8                      // x8 follows x7, pointing at lane 2
    st1 {v0.2s}, [x7], x5
    st1 {v0.s}[2], [x8], x5
    st1 {v1.2s}, [x7], x5
    st1 {v1.s}[2], [x8], x5
    st1 {v2.2s}, [x7], x5
    st1 {v2.s}[2], [x8], x5
    st1 {v3.2s}, [x7], x5
    st1 {v3.s}[2], [x8], x5
    b WriteEndx4

Write4x4:
    st1 {v0.4s}, [x7], x5
    st1 {v1.4s}, [x7], x5
    st1 {v2.4s}, [x7], x5
    st1 {v3.4s}, [x7], x5

WriteEndx4:
    subs x6, x6, #4
    beq LoopOcEnd
    cmp x6, #4
    bge Loop4

    // Tail path: one plane element per iteration.
Loop1:
    ld1 {v0.4s}, [x1], #16
    fadd v0.4s, v0.4s, v4.4s
    fmax v0.4s, v0.4s, v5.4s

    cmp x3, #4
    bge Write4
    cmp x3, #3
    beq Write3
    cmp x3, #2
    beq Write2

Write1:
    st1 {v0.s}[0], [x7], x5
    b WriteEnd

Write2:
    st1 {v0.2s}, [x7], x5
    b WriteEnd

Write3:
    add x8, x7, #8
    st1 {v0.2s}, [x7], x5
    st1 {v0.s}[2], [x8]
    b WriteEnd

Write4:
    st1 {v0.4s}, [x7], x5

WriteEnd:
    subs x6, x6, #1
    bne Loop1

LoopOcEnd:
    subs x3, x3, #4                     // flags consumed by bgt below
    add x0, x0, #16                     // next channel block starts 4 floats further
    bgt LoopOc

End:
    ret
#endif
|
@ -0,0 +1,146 @@
|
||||
#ifdef __aarch64__

    .text
    .align 5
    //.p2align 5,,15
    .global C4BiasAddRelu6
#ifndef __APPLE__
    .type C4BiasAddRelu6, %function
#endif

// void C4BiasAddRelu6(float *dst, const float *input, const float *bias, size_t oc, size_t plane_size, size_t stride)
// x0: dst, x1: input, x2: bias, x3: oc, x4: plane_size, x5: stride
//
// Adds a per-channel bias to 4-float groups read from input, clamps the
// result to the range [0.0, 6.0] (fmax against 0, then fmin against 6) and
// stores it to dst.
//   - input is read sequentially, 16 bytes per plane element, one block of
//     4 channels after another.
//   - bias is read sequentially, 16 bytes per channel block.
//   - For each channel block the write cursor starts at x0 (x0 grows by
//     16 bytes per block) and moves by x5 after every plane element. x5 is
//     added to the address unchanged, i.e. it is used as a byte count.
//   - When fewer than 4 channels remain, only 1, 2 or 3 floats are stored
//     per plane element.
// Returns without touching memory when oc or plane_size is zero.
// Clobbers: x0-x3, x6-x8, v0-v6, flags.

C4BiasAddRelu6:
    // Every loop below tests its counter at the bottom, so a zero count has
    // to be rejected here; otherwise the counter would wrap around.
    cbz x3, End
    cbz x4, End

    dup v5.4s, wzr                      // v5 = {0, 0, 0, 0}, lower clamp bound
    movi v6.4s, #6                      // integer 6 in every lane ...
    scvtf v6.4s, v6.4s                  // ... converted to 6.0f, upper clamp bound

LoopOc:
    ld1 {v4.4s}, [x2], #16              // v4 = bias of the current 4 channels
    mov x6, x4                          // x6 = plane elements left in this block
    mov x7, x0                          // x7 = write cursor
    cmp x6, #4
    blt Loop1

    // Main path: four plane elements per iteration.
Loop4:
    ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x1], #64
    fadd v0.4s, v0.4s, v4.4s
    fadd v1.4s, v1.4s, v4.4s
    fadd v2.4s, v2.4s, v4.4s
    fadd v3.4s, v3.4s, v4.4s

    fmax v0.4s, v0.4s, v5.4s
    fmax v1.4s, v1.4s, v5.4s
    fmax v2.4s, v2.4s, v5.4s
    fmax v3.4s, v3.4s, v5.4s

    fmin v0.4s, v0.4s, v6.4s
    fmin v1.4s, v1.4s, v6.4s
    fmin v2.4s, v2.4s, v6.4s
    fmin v3.4s, v3.4s, v6.4s

    // x3 = channels still to be written; pick the store width.
    cmp x3, #4
    bge Write4x4
    cmp x3, #3
    beq Write3x4
    cmp x3, #2
    beq Write2x4

Write1x4:
    st1 {v0.s}[0], [x7], x5
    st1 {v1.s}[0], [x7], x5
    st1 {v2.s}[0], [x7], x5
    st1 {v3.s}[0], [x7], x5
    b WriteEndx4

Write2x4:
    st1 {v0.2s}, [x7], x5
    st1 {v1.2s}, [x7], x5
    st1 {v2.2s}, [x7], x5
    st1 {v3.2s}, [x7], x5
    b WriteEndx4

Write3x4:
    add x8, x7, #8                      // x8 follows x7, pointing at lane 2
    st1 {v0.2s}, [x7], x5
    st1 {v0.s}[2], [x8], x5
    st1 {v1.2s}, [x7], x5
    st1 {v1.s}[2], [x8], x5
    st1 {v2.2s}, [x7], x5
    st1 {v2.s}[2], [x8], x5
    st1 {v3.2s}, [x7], x5
    st1 {v3.s}[2], [x8], x5
    b WriteEndx4

Write4x4:
    st1 {v0.4s}, [x7], x5
    st1 {v1.4s}, [x7], x5
    st1 {v2.4s}, [x7], x5
    st1 {v3.4s}, [x7], x5

WriteEndx4:
    subs x6, x6, #4
    beq LoopOcEnd
    cmp x6, #4
    bge Loop4

    // Tail path: one plane element per iteration.
Loop1:
    ld1 {v0.4s}, [x1], #16
    fadd v0.4s, v0.4s, v4.4s
    fmax v0.4s, v0.4s, v5.4s
    fmin v0.4s, v0.4s, v6.4s

    cmp x3, #4
    bge Write4
    cmp x3, #3
    beq Write3
    cmp x3, #2
    beq Write2

Write1:
    st1 {v0.s}[0], [x7], x5
    b WriteEnd

Write2:
    st1 {v0.2s}, [x7], x5
    b WriteEnd

Write3:
    add x8, x7, #8
    st1 {v0.2s}, [x7], x5
    st1 {v0.s}[2], [x8]
    b WriteEnd

Write4:
    st1 {v0.4s}, [x7], x5

WriteEnd:
    subs x6, x6, #1
    bne Loop1

LoopOcEnd:
    subs x3, x3, #4                     // flags consumed by bgt below
    add x0, x0, #16                     // next channel block starts 4 floats further
    bgt LoopOc

End:
    ret
#endif
|
@ -0,0 +1,132 @@
|
||||
#ifdef __aarch64__

    .text
    .align 5
    //.p2align 5,,15
    .global C4Relu
#ifndef __APPLE__
    .type C4Relu, %function
#endif

// void C4Relu(float *dst, const float *input, size_t oc, size_t plane_size, size_t stride)
// x0: dst, x1: input, x2: oc, x3: plane_size, x4: stride
//
// Clamps 4-float groups read from input below at 0.0 (fmax against a zero
// vector) and stores them to dst. There is no bias in this variant.
//   - input is read sequentially, 16 bytes per plane element, one block of
//     4 channels after another.
//   - For each channel block the write cursor starts at x0 (x0 grows by
//     16 bytes per block) and moves by x4 after every plane element. x4 is
//     added to the address unchanged, i.e. it is used as a byte count.
//   - When fewer than 4 channels remain, only 1, 2 or 3 floats are stored
//     per plane element.
// Returns without touching memory when oc or plane_size is zero.
// Clobbers: x0-x2, x6-x8, v0-v3, v5, flags.

C4Relu:
    // Every loop below tests its counter at the bottom, so a zero count has
    // to be rejected here; otherwise the counter would wrap around.
    cbz x2, End
    cbz x3, End

    dup v5.4s, wzr                      // v5 = {0, 0, 0, 0}, lower clamp bound

LoopOc:
    mov x6, x3                          // x6 = plane elements left in this block
    mov x7, x0                          // x7 = write cursor
    cmp x6, #4
    blt Loop1

    // Main path: four plane elements per iteration.
Loop4:
    ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x1], #64

    fmax v0.4s, v0.4s, v5.4s
    fmax v1.4s, v1.4s, v5.4s
    fmax v2.4s, v2.4s, v5.4s
    fmax v3.4s, v3.4s, v5.4s

    // x2 = channels still to be written; pick the store width.
    cmp x2, #4
    bge Write4x4
    cmp x2, #3
    beq Write3x4
    cmp x2, #2
    beq Write2x4

Write1x4:
    st1 {v0.s}[0], [x7], x4
    st1 {v1.s}[0], [x7], x4
    st1 {v2.s}[0], [x7], x4
    st1 {v3.s}[0], [x7], x4
    b WriteEndx4

Write2x4:
    st1 {v0.2s}, [x7], x4
    st1 {v1.2s}, [x7], x4
    st1 {v2.2s}, [x7], x4
    st1 {v3.2s}, [x7], x4
    b WriteEndx4

Write3x4:
    add x8, x7, #8                      // x8 follows x7, pointing at lane 2
    st1 {v0.2s}, [x7], x4
    st1 {v0.s}[2], [x8], x4
    st1 {v1.2s}, [x7], x4
    st1 {v1.s}[2], [x8], x4
    st1 {v2.2s}, [x7], x4
    st1 {v2.s}[2], [x8], x4
    st1 {v3.2s}, [x7], x4
    st1 {v3.s}[2], [x8], x4
    b WriteEndx4

Write4x4:
    st1 {v0.4s}, [x7], x4
    st1 {v1.4s}, [x7], x4
    st1 {v2.4s}, [x7], x4
    st1 {v3.4s}, [x7], x4

WriteEndx4:
    subs x6, x6, #4
    beq LoopOcEnd
    cmp x6, #4
    bge Loop4

    // Tail path: one plane element per iteration. It must apply exactly the
    // same operation as Loop4: only the clamp. v4 is never initialised in
    // this function, so nothing may be added from it here.
Loop1:
    ld1 {v0.4s}, [x1], #16
    fmax v0.4s, v0.4s, v5.4s

    cmp x2, #4
    bge Write4
    cmp x2, #3
    beq Write3
    cmp x2, #2
    beq Write2

Write1:
    st1 {v0.s}[0], [x7], x4
    b WriteEnd

Write2:
    st1 {v0.2s}, [x7], x4
    b WriteEnd

Write3:
    add x8, x7, #8
    st1 {v0.2s}, [x7], x4
    st1 {v0.s}[2], [x8]
    b WriteEnd

Write4:
    st1 {v0.4s}, [x7], x4

WriteEnd:
    subs x6, x6, #1
    bne Loop1

LoopOcEnd:
    subs x2, x2, #4                     // flags consumed by bgt below
    add x0, x0, #16                     // next channel block starts 4 floats further
    bgt LoopOc

End:
    ret
#endif
|
@ -0,0 +1,140 @@
|
||||
#ifdef __aarch64__

    .text
    .align 5
    //.p2align 5,,15
    .global C4Relu6
#ifndef __APPLE__
    .type C4Relu6, %function
#endif

// void C4Relu6(float *dst, const float *input, size_t oc, size_t plane_size, size_t stride)
// x0: dst, x1: input, x2: oc, x3: plane_size, x4: stride
//
// Clamps 4-float groups read from input to the range [0.0, 6.0] (fmax
// against 0, then fmin against 6) and stores them to dst. There is no bias
// in this variant.
//   - input is read sequentially, 16 bytes per plane element, one block of
//     4 channels after another.
//   - For each channel block the write cursor starts at x0 (x0 grows by
//     16 bytes per block) and moves by x4 after every plane element. x4 is
//     added to the address unchanged, i.e. it is used as a byte count.
//   - When fewer than 4 channels remain, only 1, 2 or 3 floats are stored
//     per plane element.
// Returns without touching memory when oc or plane_size is zero.
// Clobbers: x0-x2, x6-x8, v0-v3, v5, v6, flags.

C4Relu6:
    // Every loop below tests its counter at the bottom, so a zero count has
    // to be rejected here; otherwise the counter would wrap around.
    cbz x2, End
    cbz x3, End

    dup v5.4s, wzr                      // v5 = {0, 0, 0, 0}, lower clamp bound
    movi v6.4s, #6                      // integer 6 in every lane ...
    scvtf v6.4s, v6.4s                  // ... converted to 6.0f, upper clamp bound

LoopOc:
    mov x6, x3                          // x6 = plane elements left in this block
    mov x7, x0                          // x7 = write cursor
    cmp x6, #4
    blt Loop1

    // Main path: four plane elements per iteration.
Loop4:
    ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x1], #64
    fmax v0.4s, v0.4s, v5.4s
    fmax v1.4s, v1.4s, v5.4s
    fmax v2.4s, v2.4s, v5.4s
    fmax v3.4s, v3.4s, v5.4s

    fmin v0.4s, v0.4s, v6.4s
    fmin v1.4s, v1.4s, v6.4s
    fmin v2.4s, v2.4s, v6.4s
    fmin v3.4s, v3.4s, v6.4s

    // x2 = channels still to be written; pick the store width.
    cmp x2, #4
    bge Write4x4
    cmp x2, #3
    beq Write3x4
    cmp x2, #2
    beq Write2x4

Write1x4:
    st1 {v0.s}[0], [x7], x4
    st1 {v1.s}[0], [x7], x4
    st1 {v2.s}[0], [x7], x4
    st1 {v3.s}[0], [x7], x4
    b WriteEndx4

Write2x4:
    st1 {v0.2s}, [x7], x4
    st1 {v1.2s}, [x7], x4
    st1 {v2.2s}, [x7], x4
    st1 {v3.2s}, [x7], x4
    b WriteEndx4

Write3x4:
    add x8, x7, #8                      // x8 follows x7, pointing at lane 2
    st1 {v0.2s}, [x7], x4
    st1 {v0.s}[2], [x8], x4
    st1 {v1.2s}, [x7], x4
    st1 {v1.s}[2], [x8], x4
    st1 {v2.2s}, [x7], x4
    st1 {v2.s}[2], [x8], x4
    st1 {v3.2s}, [x7], x4
    st1 {v3.s}[2], [x8], x4
    b WriteEndx4

Write4x4:
    st1 {v0.4s}, [x7], x4
    st1 {v1.4s}, [x7], x4
    st1 {v2.4s}, [x7], x4
    st1 {v3.4s}, [x7], x4

WriteEndx4:
    subs x6, x6, #4
    beq LoopOcEnd
    cmp x6, #4
    bge Loop4

    // Tail path: one plane element per iteration. It must apply exactly the
    // same operation as Loop4: only the two clamps. v4 is never initialised
    // in this function, so nothing may be added from it here.
Loop1:
    ld1 {v0.4s}, [x1], #16
    fmax v0.4s, v0.4s, v5.4s
    fmin v0.4s, v0.4s, v6.4s

    cmp x2, #4
    bge Write4
    cmp x2, #3
    beq Write3
    cmp x2, #2
    beq Write2

Write1:
    st1 {v0.s}[0], [x7], x4
    b WriteEnd

Write2:
    st1 {v0.2s}, [x7], x4
    b WriteEnd

Write3:
    add x8, x7, #8
    st1 {v0.2s}, [x7], x4
    st1 {v0.s}[2], [x8]
    b WriteEnd

Write4:
    st1 {v0.4s}, [x7], x4

WriteEnd:
    subs x6, x6, #1
    bne Loop1

LoopOcEnd:
    subs x2, x2, #4                     // flags consumed by bgt below
    add x0, x0, #16                     // next channel block starts 4 floats further
    bgt LoopOc

End:
    ret
#endif
|
@ -0,0 +1,96 @@
|
||||
#ifdef __aarch64__

    .text
    .align 5
    .global ConvDwFp32Center
#ifndef __APPLE__
    .type ConvDwFp32Center, %function
#endif

// void ConvDwFp32Center(float *dst, const float *src, const float *weight, const float *bias, size_t height, size_t width,
//                       size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel, size_t in_sh_step, size_t in_sw_step,
//                       size_t in_kh_step, size_t in_kw_step, size_t relu, size_t relu6);
// x0: dst, x1: src, x2: weight, x3: bias, x4: height, x5: width, x6: kernel_h, x7: kernel_w,
// stack (loaded into) x8: out_h_step, x9: block_channel, x10: in_sh_step, x11: in_sw_step,
//                     x12: in_kh_step, x13: in_kw_step, x14: relu, x15: relu6
//
// For every output position (height rows, width columns) this routine
//   1. loads the 4 floats currently stored at the output position and adds
//      the 4 bias floats to them,
//   2. accumulates src * weight over the kernel_h x kernel_w window,
//   3. clamps the result with fmin(6.0) and fmax(0.0) when relu6 is non-zero,
//      or with fmax(0.0) only when relu is non-zero,
//   4. stores the 4 floats back.
// The six step arguments are multiplied by 4 before being used as address
// increments, i.e. they are taken as counts of floats. The weight pointer
// advances 16 bytes per kernel column and kernel_w * 16 bytes per kernel row.
// height, width, kernel_h and kernel_w must be non-zero: all loops test their
// counter at the bottom.
// Clobbers: x0, x1, x3, x4, x8-x17, v0-v5, flags. x19-x25 are saved/restored.

ConvDwFp32Center:
    // x19-x25 are callee-saved (AAPCS64) and are used as loop state below.
    // They are kept in a real frame: sp stays below the saved values for the
    // whole function, so nothing that runs on this stack can overwrite them.
    // x18 is deliberately not used; it is the reserved platform register on
    // some targets.
    sub sp, sp, #64
    stp x19, x20, [sp]
    stp x21, x22, [sp, #16]
    stp x23, x24, [sp, #32]
    str x25, [sp, #48]

    // Arguments 9..16 were passed on the stack; they sit right above the
    // 64-byte frame, 8 bytes each.
    ldp x8, x9, [sp, #64]
    ldp x10, x11, [sp, #80]
    ldp x12, x13, [sp, #96]
    ldp x14, x15, [sp, #112]

    // Convert the float-count steps to byte offsets.
    lsl x8, x8, #2
    lsl x9, x9, #2
    lsl x10, x10, #2
    lsl x11, x11, #2
    lsl x12, x12, #2
    lsl x13, x13, #2
    lsl x19, x7, #4                     // x19 = bytes of weight per kernel row

    ld1 {v5.4s}, [x3]                   // v5 = bias; x3 is reused as dst cursor below

    // Clamp bounds are loop-invariant, so build them once.
    dup v3.4s, wzr                      // v3 = 0.0 in every lane
    movi v4.4s, #6                      // integer 6 in every lane ...
    scvtf v4.4s, v4.4s                  // ... converted to 6.0f

LoopH:
    mov x23, x1                         // x23 = src at the start of this row
    mov x24, x5                         // x24 = columns left
    mov x3, x0                          // x3  = dst cursor within this row
LoopW:
    mov x16, x23                        // x16 = src at kernel row 0
    mov x17, x2                         // x17 = weight at kernel row 0
    mov x20, x6                         // x20 = kernel rows left
    ld1 {v0.4s}, [x3]                   // accumulator starts from current dst ...
    fadd v0.4s, v0.4s, v5.4s            // ... plus bias
LoopKh:
    mov x25, x7                         // x25 = kernel columns left
    mov x21, x17                        // x21 = weight cursor
    mov x22, x16                        // x22 = src cursor
LoopKw:
    ld1 {v1.4s}, [x22], x13
    ld1 {v2.4s}, [x21], #16
    fmla v0.4s, v1.4s, v2.4s
    subs x25, x25, #1
    bne LoopKw
    add x16, x16, x12
    add x17, x17, x19
    subs x20, x20, #1
    bne LoopKh

    cbnz x15, Relu6
    cbnz x14, Relu
    b Write
Relu6:
    fmin v0.4s, v0.4s, v4.4s            // upper bound, then fall through to lower bound
Relu:
    fmax v0.4s, v0.4s, v3.4s
Write:
    st1 {v0.4s}, [x3], x9
    add x23, x23, x11
    subs x24, x24, #1
    bne LoopW
    add x0, x0, x8
    add x1, x1, x10
    subs x4, x4, #1
    bne LoopH

    ldp x19, x20, [sp]
    ldp x21, x22, [sp, #16]
    ldp x23, x24, [sp, #32]
    ldr x25, [sp, #48]
    add sp, sp, #64
    ret
#endif
|
@ -0,0 +1,77 @@
|
||||
#ifdef __aarch64__

    .text
    .align 5
    .global DeconvDwFp32Center
#ifndef __APPLE__
    .type DeconvDwFp32Center, %function
#endif

// void DeconvDwFp32Center(float *dst, const float *src, const float *weight, size_t height, size_t width,
//                         size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel, size_t in_sh_step, size_t in_sw_step,
//                         size_t in_kh_step, size_t in_kw_step);
// x0: dst, x1: src, x2: weight, x3: height, x4: width, x5: kernel_h, x6: kernel_w, x7: out_h_step
// stack (loaded into) x8: block_channel, x9: in_sh_step, x10: in_sw_step, x11: in_kh_step, x12: in_kw_step
//
// For every source position (height rows, width columns) this routine walks
// the kernel_h x kernel_w window in dst and, for each window cell, does
// dst[0..3] += src[0..3] * weight[0..3] (load, fmla, store back).
// Address stepping, all in bytes after the step arguments are multiplied by 4:
//   src: + block_channel per column, + out_h_step per row
//   dst: + in_sw_step per column, + in_sh_step per row,
//        + in_kw_step per kernel column, + in_kh_step per kernel row
//   weight: + 16 per kernel column, + kernel_w * 16 per kernel row
// height, width, kernel_h and kernel_w must be non-zero: all loops test their
// counter at the bottom.
// Clobbers: x0, x1, x3, x7-x17, v0-v2, flags. x19-x23 are saved/restored.

DeconvDwFp32Center:
    // x19-x23 are callee-saved (AAPCS64) and are used as loop state below.
    // They are kept in a real frame: sp stays below the saved values for the
    // whole function, so nothing that runs on this stack can overwrite them.
    // x18 is deliberately not used; it is the reserved platform register on
    // some targets.
    sub sp, sp, #48
    stp x19, x20, [sp]
    stp x21, x22, [sp, #16]
    str x23, [sp, #32]

    // Arguments 9..13 were passed on the stack; they sit right above the
    // 48-byte frame, 8 bytes each.
    ldp x8, x9, [sp, #48]
    ldp x10, x11, [sp, #64]
    ldr x12, [sp, #80]

    // Convert the float-count steps to byte offsets.
    lsl x7, x7, #2
    lsl x8, x8, #2
    lsl x9, x9, #2
    lsl x10, x10, #2
    lsl x11, x11, #2
    lsl x12, x12, #2
    lsl x14, x6, #4                     // x14 = bytes of weight per kernel row

LoopH:
    mov x15, x0                         // x15 = dst window origin for this row
    mov x16, x1                         // x16 = src cursor within this row
    mov x17, x4                         // x17 = columns left
LoopW:
    mov x23, x15                        // x23 = dst at kernel row 0
    mov x19, x2                         // x19 = weight at kernel row 0
    mov x20, x5                         // x20 = kernel rows left
LoopKh:
    mov x21, x23                        // x21 = dst cursor
    mov x22, x19                        // x22 = weight cursor
    mov x13, x6                         // x13 = kernel columns left
LoopKw:
    ld1 {v0.4s}, [x21]
    ld1 {v1.4s}, [x16]
    ld1 {v2.4s}, [x22], #16
    fmla v0.4s, v1.4s, v2.4s
    st1 {v0.4s}, [x21], x12
    subs x13, x13, #1
    bne LoopKw
    add x23, x23, x11
    add x19, x19, x14
    subs x20, x20, #1
    bne LoopKh
    add x15, x15, x10
    add x16, x16, x8
    subs x17, x17, #1
    bne LoopW
    add x0, x0, x9
    add x1, x1, x7
    subs x3, x3, #1
    bne LoopH

    ldp x19, x20, [sp]
    ldp x21, x22, [sp, #16]
    ldr x23, [sp, #32]
    add sp, sp, #48
    ret
#endif
|
Loading…
Reference in new issue