|
|
@ -36,33 +36,29 @@ ConvDwInt8PostAlign4:
|
|
|
|
ld1 {v2.4s}, [x1], #16
|
|
|
|
ld1 {v2.4s}, [x1], #16
|
|
|
|
ld1 {v3.4s}, [x1], #16
|
|
|
|
ld1 {v3.4s}, [x1], #16
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
cbz w5, RightShiftDepth16
|
|
|
|
sqshl v0.4s, v0.4s, v26.4s
|
|
|
|
sqshl v0.4s, v0.4s, v26.4s
|
|
|
|
sqshl v1.4s, v1.4s, v26.4s
|
|
|
|
sqshl v1.4s, v1.4s, v26.4s
|
|
|
|
sqshl v2.4s, v2.4s, v26.4s
|
|
|
|
sqshl v2.4s, v2.4s, v26.4s
|
|
|
|
sqshl v3.4s, v3.4s, v26.4s
|
|
|
|
sqshl v3.4s, v3.4s, v26.4s
|
|
|
|
|
|
|
|
sqrdmulh v0.4s, v0.4s, v27.4s
|
|
|
|
|
|
|
|
sqrdmulh v1.4s, v1.4s, v27.4s
|
|
|
|
|
|
|
|
sqrdmulh v2.4s, v2.4s, v27.4s
|
|
|
|
|
|
|
|
sqrdmulh v3.4s, v3.4s, v27.4s
|
|
|
|
|
|
|
|
b AddZpDepth16
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
RightShiftDepth16:
|
|
|
|
sqrdmulh v0.4s, v0.4s, v27.4s
|
|
|
|
sqrdmulh v0.4s, v0.4s, v27.4s
|
|
|
|
sqrdmulh v1.4s, v1.4s, v27.4s
|
|
|
|
sqrdmulh v1.4s, v1.4s, v27.4s
|
|
|
|
sqrdmulh v2.4s, v2.4s, v27.4s
|
|
|
|
sqrdmulh v2.4s, v2.4s, v27.4s
|
|
|
|
sqrdmulh v3.4s, v3.4s, v27.4s
|
|
|
|
sqrdmulh v3.4s, v3.4s, v27.4s
|
|
|
|
|
|
|
|
|
|
|
|
and v16.16b, v28.16b, v0.16b
|
|
|
|
sqrshl v0.4s, v0.4s, v28.4s
|
|
|
|
sshr v16.4s, v16.4s, #31
|
|
|
|
sqrshl v1.4s, v1.4s, v28.4s
|
|
|
|
sqadd v0.4s, v0.4s, v16.4s
|
|
|
|
sqrshl v2.4s, v2.4s, v28.4s
|
|
|
|
srshl v0.4s, v0.4s, v28.4s
|
|
|
|
sqrshl v3.4s, v3.4s, v28.4s
|
|
|
|
and v17.16b, v28.16b, v1.16b
|
|
|
|
|
|
|
|
sshr v17.4s, v17.4s, #31
|
|
|
|
|
|
|
|
sqadd v1.4s, v1.4s, v17.4s
|
|
|
|
|
|
|
|
srshl v1.4s, v1.4s, v28.4s
|
|
|
|
|
|
|
|
and v18.16b, v28.16b, v2.16b
|
|
|
|
|
|
|
|
sshr v18.4s, v18.4s, #31
|
|
|
|
|
|
|
|
sqadd v2.4s, v2.4s, v18.4s
|
|
|
|
|
|
|
|
srshl v2.4s, v2.4s, v28.4s
|
|
|
|
|
|
|
|
and v19.16b, v28.16b, v3.16b
|
|
|
|
|
|
|
|
sshr v19.4s, v19.4s, #31
|
|
|
|
|
|
|
|
sqadd v3.4s, v3.4s, v19.4s
|
|
|
|
|
|
|
|
srshl v3.4s, v3.4s, v28.4s
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
AddZpDepth16:
|
|
|
|
add v0.4s, v0.4s, v29.4s
|
|
|
|
add v0.4s, v0.4s, v29.4s
|
|
|
|
add v1.4s, v1.4s, v29.4s
|
|
|
|
add v1.4s, v1.4s, v29.4s
|
|
|
|
add v2.4s, v2.4s, v29.4s
|
|
|
|
add v2.4s, v2.4s, v29.4s
|
|
|
@ -103,27 +99,24 @@ ConvDwInt8PostAlign4:
|
|
|
|
ld1 {v0.4s}, [x1], #16
|
|
|
|
ld1 {v0.4s}, [x1], #16
|
|
|
|
ld1 {v1.4s}, [x1], #16
|
|
|
|
ld1 {v1.4s}, [x1], #16
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
cbz w5, RightShiftDepth8
|
|
|
|
sqshl v0.4s, v0.4s, v26.4s
|
|
|
|
sqshl v0.4s, v0.4s, v26.4s
|
|
|
|
sqshl v1.4s, v1.4s, v26.4s
|
|
|
|
sqshl v1.4s, v1.4s, v26.4s
|
|
|
|
|
|
|
|
|
|
|
|
sqrdmulh v0.4s, v0.4s, v27.4s
|
|
|
|
sqrdmulh v0.4s, v0.4s, v27.4s
|
|
|
|
sqrdmulh v1.4s, v1.4s, v27.4s
|
|
|
|
sqrdmulh v1.4s, v1.4s, v27.4s
|
|
|
|
|
|
|
|
b AddZpDepth8
|
|
|
|
|
|
|
|
|
|
|
|
and v16.16b, v28.16b, v0.16b
|
|
|
|
RightShiftDepth8:
|
|
|
|
sshr v16.4s, v16.4s, #31
|
|
|
|
sqrdmulh v0.4s, v0.4s, v27.4s
|
|
|
|
sqadd v0.4s, v0.4s, v16.4s
|
|
|
|
sqrdmulh v1.4s, v1.4s, v27.4s
|
|
|
|
srshl v0.4s, v0.4s, v28.4s
|
|
|
|
sqrshl v0.4s, v0.4s, v28.4s
|
|
|
|
and v17.16b, v28.16b, v1.16b
|
|
|
|
sqrshl v1.4s, v1.4s, v28.4s
|
|
|
|
sshr v17.4s, v17.4s, #31
|
|
|
|
|
|
|
|
sqadd v1.4s, v1.4s, v17.4s
|
|
|
|
|
|
|
|
srshl v1.4s, v1.4s, v28.4s
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
AddZpDepth8:
|
|
|
|
add v0.4s, v0.4s, v29.4s
|
|
|
|
add v0.4s, v0.4s, v29.4s
|
|
|
|
add v1.4s, v1.4s, v29.4s
|
|
|
|
add v1.4s, v1.4s, v29.4s
|
|
|
|
|
|
|
|
|
|
|
|
smax v0.4s, v0.4s, v30.4s
|
|
|
|
smax v0.4s, v0.4s, v30.4s
|
|
|
|
smax v1.4s, v1.4s, v30.4s
|
|
|
|
smax v1.4s, v1.4s, v30.4s
|
|
|
|
|
|
|
|
|
|
|
|
smin v0.4s, v0.4s, v31.4s
|
|
|
|
smin v0.4s, v0.4s, v31.4s
|
|
|
|
smin v1.4s, v1.4s, v31.4s
|
|
|
|
smin v1.4s, v1.4s, v31.4s
|
|
|
|
|
|
|
|
|
|
|
@ -147,11 +140,7 @@ ConvDwInt8PostAlign4:
|
|
|
|
|
|
|
|
|
|
|
|
sqshl v0.4s, v0.4s, v26.4s
|
|
|
|
sqshl v0.4s, v0.4s, v26.4s
|
|
|
|
sqrdmulh v0.4s, v0.4s, v27.4s
|
|
|
|
sqrdmulh v0.4s, v0.4s, v27.4s
|
|
|
|
|
|
|
|
sqrshl v0.4s, v0.4s, v28.4s
|
|
|
|
and v16.16b, v28.16b, v0.16b
|
|
|
|
|
|
|
|
sshr v16.4s, v16.4s, #31
|
|
|
|
|
|
|
|
sqadd v0.4s, v0.4s, v16.4s
|
|
|
|
|
|
|
|
srshl v0.4s, v0.4s, v28.4s
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
add v0.4s, v0.4s, v29.4s
|
|
|
|
add v0.4s, v0.4s, v29.4s
|
|
|
|
smax v0.4s, v0.4s, v30.4s
|
|
|
|
smax v0.4s, v0.4s, v30.4s
|
|
|
|