[MSLITE][Develop] Fix bug in ARM CPU int8 3x3 depthwise convolution

pull/9020/head
yangruoqi713 4 years ago
parent 14a51ef727
commit 8e8c0ed73f

@ -119,12 +119,15 @@ ConvDw3x3Int8Corner:
b AddZpLoop b AddZpLoop
PerChannelPostLoop: PerChannelPostLoop:
sqshl v23.4s, v23.4s, v28.4s sqshl v23.4s, v23.4s, v28.4s
sqshl v24.4s, v24.4s, v28.4s
ld1 {v28.4s}, [x10], #16 ld1 {v28.4s}, [x10], #16
sqrdmulh v23.4s, v23.4s, v27.4s sqrdmulh v23.4s, v23.4s, v27.4s
sqrdmulh v24.4s, v24.4s, v27.4s
ld1 {v27.4s}, [x9], #16 ld1 {v27.4s}, [x9], #16
sqrshl v23.4s, v23.4s, v29.4s sqrshl v23.4s, v23.4s, v29.4s
ld1 {v29.4s}, [x11], #16
sqshl v24.4s, v24.4s, v28.4s
ld1 {v28.4s}, [x10], #16
sqrdmulh v24.4s, v24.4s, v27.4s
ld1 {v27.4s}, [x9], #16
sqrshl v24.4s, v24.4s, v29.4s sqrshl v24.4s, v24.4s, v29.4s
ld1 {v29.4s}, [x11], #16 ld1 {v29.4s}, [x11], #16
@ -145,11 +148,6 @@ ConvDw3x3Int8Corner:
st1 {v24.s}[0], [x0], #4 st1 {v24.s}[0], [x0], #4
ld1 {v23.4s}, [x3], #16 ld1 {v23.4s}, [x3], #16
ld1 {v24.4s}, [x3], #16 ld1 {v24.4s}, [x3], #16
cbz x14, NEXT_LOOP
ld1 {v27.4s}, [x9], #16
ld1 {v28.4s}, [x10], #16
ld1 {v29.4s}, [x11], #16
NEXT_LOOP:
sub x6, x6, #8 sub x6, x6, #8
cmp x6, #8 cmp x6, #8
bgt LoopC8 bgt LoopC8
@ -181,14 +179,14 @@ ConvDw3x3Int8Corner:
b AddZp b AddZp
PerChannelPost: PerChannelPost:
sqshl v23.4s, v23.4s, v28.4s sqshl v23.4s, v23.4s, v28.4s
sqshl v24.4s, v24.4s, v28.4s
ld1 {v28.4s}, [x10], #16 ld1 {v28.4s}, [x10], #16
sqrdmulh v23.4s, v23.4s, v27.4s sqrdmulh v23.4s, v23.4s, v27.4s
sqrdmulh v24.4s, v24.4s, v27.4s
ld1 {v27.4s}, [x9], #16 ld1 {v27.4s}, [x9], #16
sqrshl v23.4s, v23.4s, v29.4s sqrshl v23.4s, v23.4s, v29.4s
sqrshl v24.4s, v24.4s, v29.4s
ld1 {v29.4s}, [x11], #16 ld1 {v29.4s}, [x11], #16
sqshl v24.4s, v24.4s, v28.4s
sqrdmulh v24.4s, v24.4s, v27.4s
sqrshl v24.4s, v24.4s, v29.4s
AddZp: AddZp:
add v23.4s, v23.4s, v26.4s add v23.4s, v23.4s, v26.4s

@ -148,12 +148,15 @@ ConvDw3x3Int8Horizontal:
b AddZpLoop b AddZpLoop
PerChannelPostLoop: PerChannelPostLoop:
sqshl v23.4s, v23.4s, v28.4s sqshl v23.4s, v23.4s, v28.4s
sqshl v24.4s, v24.4s, v28.4s
ld1 {v28.4s}, [x10], #16 ld1 {v28.4s}, [x10], #16
sqrdmulh v23.4s, v23.4s, v27.4s sqrdmulh v23.4s, v23.4s, v27.4s
sqrdmulh v24.4s, v24.4s, v27.4s
ld1 {v27.4s}, [x9], #16 ld1 {v27.4s}, [x9], #16
sqrshl v23.4s, v23.4s, v29.4s sqrshl v23.4s, v23.4s, v29.4s
ld1 {v29.4s}, [x11], #16
sqshl v24.4s, v24.4s, v28.4s
ld1 {v28.4s}, [x10], #16
sqrdmulh v24.4s, v24.4s, v27.4s
ld1 {v27.4s}, [x9], #16
sqrshl v24.4s, v24.4s, v29.4s sqrshl v24.4s, v24.4s, v29.4s
ld1 {v29.4s}, [x11], #16 ld1 {v29.4s}, [x11], #16
@ -209,14 +212,14 @@ ConvDw3x3Int8Horizontal:
b AddZp b AddZp
PerChannelPost: PerChannelPost:
sqshl v23.4s, v23.4s, v28.4s sqshl v23.4s, v23.4s, v28.4s
sqshl v24.4s, v24.4s, v28.4s
ld1 {v28.4s}, [x10], #16 ld1 {v28.4s}, [x10], #16
sqrdmulh v23.4s, v23.4s, v27.4s sqrdmulh v23.4s, v23.4s, v27.4s
sqrdmulh v24.4s, v24.4s, v27.4s
ld1 {v27.4s}, [x9], #16 ld1 {v27.4s}, [x9], #16
sqrshl v23.4s, v23.4s, v29.4s sqrshl v23.4s, v23.4s, v29.4s
sqrshl v24.4s, v24.4s, v29.4s
ld1 {v29.4s}, [x11], #16 ld1 {v29.4s}, [x11], #16
sqshl v24.4s, v24.4s, v28.4s
sqrdmulh v24.4s, v24.4s, v27.4s
sqrshl v24.4s, v24.4s, v29.4s
AddZp: AddZp:
add v23.4s, v23.4s, v26.4s add v23.4s, v23.4s, v26.4s

@ -139,12 +139,15 @@ ConvDw3x3Int8Vertical:
b AddZpLoop b AddZpLoop
PerChannelPostLoop: PerChannelPostLoop:
sqshl v23.4s, v23.4s, v28.4s sqshl v23.4s, v23.4s, v28.4s
sqshl v24.4s, v24.4s, v28.4s
ld1 {v28.4s}, [x10], #16 ld1 {v28.4s}, [x10], #16
sqrdmulh v23.4s, v23.4s, v27.4s sqrdmulh v23.4s, v23.4s, v27.4s
sqrdmulh v24.4s, v24.4s, v27.4s
ld1 {v27.4s}, [x9], #16 ld1 {v27.4s}, [x9], #16
sqrshl v23.4s, v23.4s, v29.4s sqrshl v23.4s, v23.4s, v29.4s
ld1 {v29.4s}, [x11], #16
sqshl v24.4s, v24.4s, v28.4s
ld1 {v28.4s}, [x10], #16
sqrdmulh v24.4s, v24.4s, v27.4s
ld1 {v27.4s}, [x9], #16
sqrshl v24.4s, v24.4s, v29.4s sqrshl v24.4s, v24.4s, v29.4s
ld1 {v29.4s}, [x11], #16 ld1 {v29.4s}, [x11], #16
@ -165,11 +168,6 @@ ConvDw3x3Int8Vertical:
st1 {v24.s}[0], [x0], #4 st1 {v24.s}[0], [x0], #4
ld1 {v23.4s}, [x3], #16 ld1 {v23.4s}, [x3], #16
ld1 {v24.4s}, [x3], #16 ld1 {v24.4s}, [x3], #16
cbz x14, NEXT_LOOP
ld1 {v27.4s}, [x9], #16
ld1 {v28.4s}, [x10], #16
ld1 {v29.4s}, [x11], #16
NEXT_LOOP:
sub x6, x6, #8 sub x6, x6, #8
cmp x6, #8 cmp x6, #8
bgt LoopC8 bgt LoopC8
@ -205,14 +203,14 @@ ConvDw3x3Int8Vertical:
b AddZp b AddZp
PerChannelPost: PerChannelPost:
sqshl v23.4s, v23.4s, v28.4s sqshl v23.4s, v23.4s, v28.4s
sqshl v24.4s, v24.4s, v28.4s
ld1 {v28.4s}, [x10], #16 ld1 {v28.4s}, [x10], #16
sqrdmulh v23.4s, v23.4s, v27.4s sqrdmulh v23.4s, v23.4s, v27.4s
sqrdmulh v24.4s, v24.4s, v27.4s
ld1 {v27.4s}, [x9], #16 ld1 {v27.4s}, [x9], #16
sqrshl v23.4s, v23.4s, v29.4s sqrshl v23.4s, v23.4s, v29.4s
sqrshl v24.4s, v24.4s, v29.4s
ld1 {v29.4s}, [x11], #16 ld1 {v29.4s}, [x11], #16
sqshl v24.4s, v24.4s, v28.4s
sqrdmulh v24.4s, v24.4s, v27.4s
sqrshl v24.4s, v24.4s, v29.4s
AddZp: AddZp:
add v23.4s, v23.4s, v26.4s add v23.4s, v23.4s, v26.4s

Loading…
Cancel
Save